Example #1
File: main.py Project: haukurb/Reynir
def news():
    """ Handler for a page with a top news list """
    topic = request.args.get("topic")
    try:
        offset = max(0, int(request.args.get("offset", 0)))
        limit = max(0, int(request.args.get("limit", _TOP_NEWS_LENGTH)))
    except (TypeError, ValueError):
        offset = 0
        limit = _TOP_NEWS_LENGTH

    limit = min(limit, 100)  # Cap at max 100 results per page
    articles = top_news(topic=topic, offset=offset, limit=limit)

    # If all articles in the list are timestamped within 24 hours of now,
    # we display their times in HH:MM format. Otherwise, we display date.
    display_time = True
    if articles and (datetime.utcnow() - articles[-1].timestamp).days >= 1:
        display_time = False

    # Fetch the topics
    with SessionContext(commit=True) as session:
        q = session.query(Topic.identifier,
                          Topic.name).order_by(Topic.name).all()
        d = {t[0]: t[1] for t in q}
        topics = dict(id=topic, name=d.get(topic, ""), topic_list=q)
    return render_template(
        "news.html",
        articles=articles,
        topics=topics,
        display_time=display_time,
        offset=offset,
        limit=limit,
    )
Example #2
    def fetch_url_html(cls, url, enclosing_session=None):
        """ Fetch a URL using the scraping mechanism, returning
            a tuple (html, metadata, helper) or None if error """

        with SessionContext(enclosing_session) as session:

            helper = cls.helper_for(session, url)

            if helper is None or not hasattr(helper, "fetch_url"):
                # Do a straight HTTP fetch
                html_doc = cls._fetch_url(url)
            else:
                # Hand off to the helper
                html_doc = helper.fetch_url(url)

            if not html_doc:
                return (None, None, None)

            # Parse the HTML
            soup = Fetcher.make_soup(html_doc, helper)
            if soup is None:
                print("Fetcher.fetch_url_html(): No soup")
                return (None, None, None)

            # Obtain the metadata from the resulting soup
            metadata = helper.get_metadata(soup) if helper else None
            return (html_doc, metadata, helper)
Example #3
File: article.py Project: haukurb/Reynir
    def token_stream(limit=None, skip_errors=True):
        """ Generator of a token stream consisting of `limit` sentences (or less) from the
            most recently parsed articles. After each sentence, None is yielded. """
        with SessionContext(commit=True, read_only=True) as session:

            q = (session.query(
                ArticleRow.url, ArticleRow.parsed,
                ArticleRow.tokens).filter(ArticleRow.tokens != None).order_by(
                    desc(ArticleRow.parsed)).yield_per(200))

            count = 0
            for a in q:
                doc = json.loads(a.tokens)
                for pg in doc:
                    for sent in pg:
                        if not sent:
                            continue
                        if skip_errors and any("err" in t for t in sent):
                            # Skip error sentences
                            continue
                        for t in sent:
                            # Yield the tokens
                            yield t
                        yield None  # End-of-sentence marker
                        # Are we done?
                        count += 1
                        if limit is not None and count >= limit:
                            return
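The None markers let a consumer rebuild sentence boundaries without any extra bookkeeping. A minimal consumption sketch (illustrative only; the enclosing class is not shown above, so the generator is called unqualified here):

    # Hypothetical usage sketch: group the None-delimited stream back into sentences
    sentences = []
    current = []
    for tok in token_stream(limit=100):
        if tok is None:
            # End-of-sentence marker: close the current sentence
            sentences.append(current)
            current = []
        else:
            current.append(tok)
    print("Collected {0} sentences".format(len(sentences)))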
Example #4
File: article.py Project: haukurb/Reynir
    def sentence_stream(limit=None, skip=None, skip_errors=True):
        """ Generator of a sentence stream consisting of `limit` sentences (or less) from the
            most recently parsed articles. Each sentence is a list of token dicts. """
        with SessionContext(commit=True, read_only=True) as session:

            q = (session.query(
                ArticleRow.url, ArticleRow.parsed,
                ArticleRow.tokens).filter(ArticleRow.tokens != None).order_by(
                    desc(ArticleRow.parsed)).yield_per(200))

            count = 0
            skipped = 0
            for a in q:
                doc = json.loads(a.tokens)
                for pg in doc:
                    for sent in pg:
                        if not sent:
                            continue
                        if skip_errors and any("err" in t for t in sent):
                            # Skip error sentences
                            continue
                        if skip is not None and skipped < skip:
                            # If requested, skip sentences from the front (useful for test set)
                            skipped += 1
                            continue
                        # Yield the sentence as a fresh token list
                        yield [t for t in sent]
                        # Are we done?
                        count += 1
                        if limit is not None and count >= limit:
                            return
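As the comment above notes, the skip parameter is handy for carving off a held-out set without materializing the whole stream. A hedged usage sketch (the split sizes are arbitrary and the call is again written unqualified):

    # Hypothetical train/test split over the most recently parsed sentences
    test_set = list(sentence_stream(limit=1000))
    training_set = list(sentence_stream(limit=10000, skip=1000))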
Example #5
 def __iter__(self):
     """ Iterate through articles (documents) """
     print("Starting iteration through corpus from words table")
     if self._dictionary is not None:
         xform = lambda x: self._dictionary.doc2bow(x)
     else:
         xform = lambda x: x
     with SessionContext(commit=True) as session:
         # Fetch bags of words sorted by articles
         q = session.query(Word.article_id, Word.stem, Word.cat, Word.cnt) \
             .order_by(Word.article_id).yield_per(2000)
         bag = []
         last_uuid = None
         for uuid, stem, cat, cnt in q:
             if uuid != last_uuid:
                 if bag:
                     # Finishing the last article: yield its bag
                     # print("Yielding bag of {0} words".format(len(bag)))
                     yield xform(bag)
                     bag = []
                 # Beginning a new article with an empty bag
                 last_uuid = uuid
             # Convert stem to lowercase and replace spaces with underscores
             w = w_from_stem(stem, cat)
             if cnt == 1:
                 bag.append(w)
             else:
                 bag.extend([w] * cnt)
         if last_uuid is not None:
             # print("Yielding bag of {0} words".format(len(bag)))
             yield xform(bag)
     print("Finished iteration through corpus from words table")
Example #6
def dump_tokens(limit):
    """ Iterate through parsed articles and print a list
        of tokens and their matched terminals """

    dtd = dict()
    with closing(BIN_Db.get_db()) as db:
        with SessionContext(commit = True) as session:
            # Iterate through the articles
            q = session.query(Article) \
                .filter(Article.tree != None) \
                .order_by(Article.timestamp)
            if limit is None:
                q = q.all()
            else:
                q = q[0:limit]
            for a in q:
                print("\nARTICLE\nHeading: '{0.heading}'\nURL: {0.url}\nTimestamp: {0.timestamp}".format(a))
                tree = TreeTokenList()
                tree.load(a.tree)
                for ix, toklist in tree.sentences():
                    print("\nSentence {0}:".format(ix))
                    at_start = True
                    for t in toklist:
                        if t.tokentype == "WORD":
                            wrd = t.token[1:-1]
                            td = dtd.get(t.terminal)
                            if td is None:
                                td = TerminalDescriptor(t.terminal)
                                dtd[t.terminal] = td
                            stem = td.stem(db, wrd, at_start)
                            at_start = False
                            print("    {0} {1} {2}".format(wrd, stem, t.terminal))
                        else:
                            print("    {0.token} {0.cat} {0.terminal}".format(t))
Example #7
File: simserver.py Project: haukurb/Reynir
 def refresh_topics(self):
     """ Load any new article topics into the _atopics dict """
     with self._lock:
         with SessionContext(commit=True, read_only=True) as session:
             # Do the next refresh from this time point
             ts = datetime.utcnow()
             q = session.query(Article).join(Root).filter(Root.visible) \
                 .filter(Article.indexed >= self._timestamp) \
                 .with_entities(Article.id, Article.topic_vector)
             self._timestamp = ts
             count = 0
             for a in q.yield_per(100):
                 if a.topic_vector:
                     # Load the topic vector into a numpy array
                     vec = json.loads(a.topic_vector)
                     if isinstance(
                             vec,
                             list) and len(vec) == self._corpus.dimensions:
                         self._atopics[a.id] = np.array(vec)
                         count += 1
                     else:
                         print(
                             "Warning: faulty topic vector for article {0}".
                             format(a.id))
             print("Completed refresh_topics, {0} article vectors added".
                   format(count))
Example #8
File: main.py Project: haukurb/Reynir
def reparse_api(version=1):
    """ Reparse an already parsed and stored article with a given UUID """
    if not (1 <= version <= 1):
        return better_jsonify(valid="False", reason="Unsupported version")

    uuid = request.form.get("id", "").strip()[0:_MAX_UUID_LENGTH]
    tokens = None
    register = {}
    stats = {}

    with SessionContext(commit=True) as session:
        # Load the article
        a = ArticleProxy.load_from_uuid(uuid, session)
        if a is not None:
            # Found: Parse it (with a fresh parser) and store the updated version
            a.parse(session, verbose=True, reload_parser=True)
            # Save the tokens
            tokens = a.tokens
            # Build register of person names
            register = a.create_register(session)
            stats = dict(
                num_tokens=a.num_tokens,
                num_sentences=a.num_sentences,
                num_parsed=a.num_parsed,
                ambiguity=a.ambiguity,
            )

    # Return the tokens as a JSON structure to the client,
    # along with a name register and article statistics
    return better_jsonify(valid=True,
                          result=tokens,
                          register=register,
                          stats=stats)
Example #9
File: main.py Project: busla/Reynir
def news():
    """ Handler for a page with a top news list """
    topic = request.args.get("topic")
    start = request.args.get("start")
    if start is not None:
        try:
            if '.' in start:
                # Assume full timestamp with microseconds
                start = datetime.strptime(start, "%Y-%m-%dT%H:%M:%S.%f")
            else:
                # Compact timestamp
                start = datetime.strptime(start, "%Y-%m-%dT%H:%M:%S")
        except ValueError:
            start = None
    articles = top_news(topic=topic, start=start)
    now = datetime.utcnow()
    # If all articles in the list are timestamped within 24 hours of now,
    # we display their times in HH:MM format. Otherwise, we display their
    # dates in YYYY-MM-DD format.
    display_time = True
    if articles and (now - articles[-1].timestamp).days >= 1:
        display_time = False
    # Fetch the topics
    with SessionContext(commit=True) as session:
        q = session.query(Topic.identifier,
                          Topic.name).order_by(Topic.name).all()
        d = {t[0]: t[1] for t in q}
        topics = dict(identifier=topic, name=d.get(topic, ""), topic_list=q)
    return render_template("news.html",
                           articles=articles,
                           topics=topics,
                           display_time=display_time)
Example #10
File: simserver.py Project: haukurb/Reynir
    def _load_topics(self):
        """ Load all article topics into the self._atopics dictionary """
        self._atopics = {}
        with SessionContext(commit=True, read_only=True) as session:
            print("Starting load of all article topic vectors")
            t0 = time.time()
            # Do the next refresh from this time point
            self._timestamp = datetime.utcnow()
            q = session.query(Article).join(Root).filter(Root.visible) \
                .with_entities(Article.id, Article.topic_vector)

            for a in q.yield_per(2000):
                if a.topic_vector:
                    # Load the topic vector into a numpy array
                    vec = json.loads(a.topic_vector)
                    if isinstance(
                            vec, list) and len(vec) == self._corpus.dimensions:
                        self._atopics[a.id] = np.array(vec)
                    else:
                        print("Warning: faulty topic vector for article {0}".
                              format(a.id))

            t1 = time.time()
            print("Loading of {0} topic vectors completed in {1:.2f} seconds".
                  format(len(self._atopics), t1 - t0))
Example #11
File: main.py Project: busla/Reynir
def query():
    """ Respond to a query string """

    q = request.form.get("q", "").strip()[0:_MAX_QUERY_LENGTH]
    # Auto-uppercasing can be turned off by sending autouppercase: false in the query JSON
    auto_uppercase = get_json_bool(request, "autouppercase", True)
    result = dict()

    with SessionContext(commit=True) as session:

        toklist = list(
            tokenize(q,
                     enclosing_session=session,
                     auto_uppercase=q.islower() if auto_uppercase else False))
        actual_q = correct_spaces(" ".join(t.txt or "" for t in toklist))

        if Settings.DEBUG:
            # Log the query string as seen by the parser
            print("Query is: '{0}'".format(actual_q))

        # Try to parse and process as a query
        is_query = process_query(session, toklist, result)

    result["is_query"] = is_query
    result["q"] = actual_q

    return jsonify(result=result)
Example #12
File: main.py Project: busla/Reynir
def top_persons(limit=_TOP_PERSONS_LENGTH):
    """ Return a list of names and titles appearing recently in the news """
    toplist = dict()
    bindb = BIN_Db.get_db()

    with SessionContext(commit=True) as session:

        q = session.query(Person.name, Person.title, Person.article_url, Article.id) \
            .join(Article).join(Root) \
            .filter(Root.visible) \
            .order_by(desc(Article.timestamp))[0:limit * 2] # Go through up to 2 * N records

        for p in q:
            # Insert the name into the list if it's not already there,
            # or if the new title is longer than the previous one
            if p.name not in toplist or len(p.title) > len(toplist[p.name][0]):
                toplist[p.name] = (correct_spaces(p.title), p.article_url,
                                   p.id, bindb.lookup_name_gender(p.name))
                if len(toplist) >= limit:
                    # We now have as many names as we initially wanted: terminate the loop
                    break

    with changedlocale() as strxfrm:
        # Convert the dictionary to a sorted list of dicts
        return sorted([
            dict(name=name, title=tu[0], gender=tu[3], url=tu[1], uuid=tu[2])
            for name, tu in toplist.items()
        ],
                      key=lambda x: strxfrm(x["name"]))
Example #13
def _get_cached_entry(name, url, enclosing_session=None):
    """ Fetch cached entry by key and url """
    with SessionContext(commit=True, session=enclosing_session) as session:
        # TODO: content column should be converted to jsonb
        # from varchar to query faster & more intelligently
        return (session.query(Link).filter(Link.key == name).filter(
            Link.content.like("%" + url + "%")).one_or_none())
Example #14
    def fetch_article(cls, url, enclosing_session=None):
        """ Fetch a previously scraped article, returning
            a tuple (article, metadata, content) or None if error """

        with SessionContext(enclosing_session) as session:

            article = cls.find_article(url, session)
            if article is None:
                return (None, None, None)

            html_doc = article.html
            if not html_doc:
                return (None, None, None)

            helper = cls.helper_for(session, url)
            # Parse the HTML
            soup = Fetcher.make_soup(html_doc, helper)
            if soup is None:
                print("Fetcher.fetch_article(): No soup")
                return (None, None, None)

            # Obtain the metadata and the content from the resulting soup
            metadata = helper.get_metadata(soup) if helper else None
            content = helper.get_content(soup) if helper else soup.html.body
            return (article, metadata, content)
Example #15
File: fetcher.py Project: haukurb/Reynir
    def fetch_url(cls, url, enclosing_session=None):
        """ Fetch a URL using the scraping mechanism, returning
            a tuple (metadata, content) or None if error """

        with SessionContext(enclosing_session) as session:

            helper = cls.helper_for(session, url)

            if helper is None or not hasattr(helper, "fetch_url"):
                # Do a straight HTTP fetch
                html_doc = cls.raw_fetch_url(url)
            else:
                # Hand off to the helper
                html_doc = helper.fetch_url(url)

            if not html_doc:
                return None

            # Parse the HTML
            soup = Fetcher.make_soup(html_doc, helper)
            if soup is None:
                print("Fetcher.fetch_url({0}): No soup or no soup.html".format(
                    url))
                return None

            # Obtain the metadata and the content from the resulting soup
            metadata = helper.get_metadata(soup) if helper else None
            content = helper.get_content(soup) if helper else soup.html.body
            return (metadata, content)
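The helper object is duck-typed: Fetcher.fetch_url() only checks hasattr(helper, "fetch_url") and then calls get_metadata() and get_content() on it. A skeleton of such a helper, sketched here for illustration and not taken from the project, needs just those methods:

    class ExampleSiteHelper:
        """ Hypothetical scraping helper exposing only the methods
            that Fetcher.fetch_url() actually uses """

        def fetch_url(self, url):
            # Optional: return the raw HTML document for the URL, or None on error;
            # if absent, the Fetcher falls back to a straight HTTP fetch
            raise NotImplementedError

        def get_metadata(self, soup):
            # Extract site-specific metadata from the parsed soup
            raise NotImplementedError

        def get_content(self, soup):
            # Return the soup subtree containing the article body
            raise NotImplementedError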
Example #16
File: main.py Project: haukurb/Reynir
def parse_api(version=1):
    """ API to parse text and return POS tagged tokens in JSON format """
    if not (1 <= version <= 1):
        # Unsupported version
        return better_jsonify(valid=False, reason="Unsupported version")

    try:
        text = text_from_request(request)
    except Exception:
        return better_jsonify(valid=False, reason="Invalid request")

    with SessionContext(commit=True) as session:
        pgs, stats, register = TreeUtility.parse_text(session,
                                                      text,
                                                      all_names=True)
        # In this case, we should always get a single paragraph back
        if pgs:
            # Only process the first paragraph, if there are many of them
            if len(pgs) == 1:
                pgs = pgs[0]
            else:
                # More than one paragraph: gotta concatenate 'em all
                pa = []
                for pg in pgs:
                    pa.extend(pg)
                pgs = pa

    # Return the tokens as a JSON structure to the client
    return better_jsonify(valid=True,
                          result=pgs,
                          stats=stats,
                          register=register)
Example #17
def _blacklisted_urls_for_key(key, enclosing_session=None):
    """ Fetch blacklisted urls for a given key """
    with SessionContext(commit=True, session=enclosing_session) as session:
        q = (session.query(BlacklistedLink.url).filter(
            BlacklistedLink.link_type == "image").filter(
                BlacklistedLink.key == key).all())
        return [r for (r, ) in q]
Example #18
 def find_article(cls, url, enclosing_session=None):
     """ Return a scraped article object, if found, else None """
     article = None
     with SessionContext(enclosing_session, commit=True) as session:
         article = session.query(ArticleRow).filter_by(url = url) \
             .filter(ArticleRow.scraped != None).one_or_none()
     return article
Example #19
def reparse():
    """ Reparse an already parsed and stored article with a given UUID """

    uuid = request.form.get("id", "").strip()[0:_MAX_UUID_LENGTH]
    tokens = None
    register = {}
    stats = {}

    with SessionContext(commit=True) as session:
        # Load the article
        a = ArticleProxy.load_from_uuid(uuid, session)
        if a is not None:
            # Found: Parse it (with a fresh parser) and store the updated version
            a.parse(session, verbose=True, reload_parser=True)
            # Save the tokens
            tokens = a.tokens
            # Build register of person names
            for name in a.person_names():
                add_name_to_register(name, register, session)
            # Add register of entity names
            for name in a.entity_names():
                add_entity_to_register(name, register, session)
            stats = dict(num_tokens=a.num_tokens,
                         num_sentences=a.num_sentences,
                         num_parsed=a.num_parsed,
                         ambiguity=a.ambiguity)

    # Return the tokens as a JSON structure to the client,
    # along with a name register and article statistics
    return jsonify(result=tokens, register=register, stats=stats)
Example #20
def _purge_single(key, ctype=None, enclosing_session=None):
    """ Remove cache entry """
    with SessionContext(commit=True, session=enclosing_session) as session:
        filters = [Link.key == key]
        if ctype:
            filters.append(Link.ctype == ctype)

        session.query(Link).filter(*filters).delete()
Example #21
 def assign_article_topics(self, article_id, heading):
     """ Assign the appropriate topics to the given article in the database """
     if self._dictionary is None:
         self.load_dictionary()
     if self._tfidf is None:
         self.load_tfidf_model()
     if self._model is None:
         self.load_lda_model()
     if self._topics is None:
         self.load_topics()
     with SessionContext(commit=True) as session:
         q = session.query(Word.stem, Word.cat, Word.cnt) \
             .filter(Word.article_id == article_id).all()
         wlist = []
         for stem, cat, cnt in q:
             # Convert stem to lowercase and replace spaces with underscores
             w = stem.lower().replace(" ", "_") + "/" + cat
             if cnt == 1:
                 wlist.append(w)
             else:
                 wlist.extend([w] * cnt)
         topics = []
         if self._topics and wlist:
             bag = self._dictionary.doc2bow(wlist)
             tfidf = self._tfidf[bag]
             article_vector = self._model[tfidf]
             topic_names = []
             if self._verbose:
                 print("{0} : {1}".format(article_id, heading))
             for topic_id, topic_info in self._topics.items():
                 topic_name = topic_info["name"]
                 topic_vector = topic_info["vector"]
                 topic_threshold = topic_info["threshold"]
                  # Calculate the cosine similarity between the article and the topic
                 similarity = matutils.cossim(article_vector, topic_vector)
                 if self._verbose:
                     print("   Similarity to topic {0} is {1:.3f}".format(
                         topic_name, similarity))
                 if similarity >= topic_threshold:
                     # Similar enough: this is a topic of the article
                     topics.append(topic_id)
                     topic_names.append((topic_name, similarity))
             if topic_names:
                 print("Article '{0}': topics {1}".format(
                     heading, topic_names))
         # Topics found (if any): delete previous ones (if any)
         session.execute(ArticleTopic.table().delete().where(
             ArticleTopic.article_id == article_id))
         # ...and add the new ones
         for topic_id in topics:
             session.add(
                 ArticleTopic(article_id=article_id, topic_id=topic_id))
         # Update the indexed timestamp
         a = session.query(Article).filter(
             Article.id == article_id).one_or_none()
         if a:
             a.indexed = datetime.utcnow()
Example #22
 def load_from_url(cls, url, enclosing_session=None):
     """ Load or scrape an article, given its URL """
     with SessionContext(enclosing_session) as session:
         ar = session.query(ArticleRow).filter(
             ArticleRow.url == url).one_or_none()
         if ar is not None:
             return cls._init_from_row(ar)
         # Not found in database: attempt to fetch
         return cls._init_from_scrape(url, session)
Example #23
File: builder.py Project: busla/Reynir
 def load_topics(self):
     """ Load the topics into a dict of topic vectors by topic id """
     self._topics = { }
     with SessionContext(commit = True) as session:
         for topic in session.query(Topic).all():
             if topic.vector:
                 topic_vector = json.loads(topic.vector)[self._model_name]
                 if topic_vector:
                     self._topics[topic.id] = dict(name = topic.name,
                         vector = topic_vector, threshold = topic.threshold)
Example #24
 def load_from_uuid(cls, uuid, enclosing_session=None):
     """ Load an article, given its UUID """
     with SessionContext(enclosing_session) as session:
         try:
             ar = session.query(ArticleRow).filter(
                 ArticleRow.id == uuid).one_or_none()
         except DataError:
             # Probably wrong UUID format
             ar = None
         return None if ar is None else cls._init_from_row(ar)
Example #25
 def scrape_from_url(cls, url, enclosing_session=None):
     """ Force fetch of an article, given its URL """
     with SessionContext(enclosing_session) as session:
         ar = session.query(ArticleRow).filter(
             ArticleRow.url == url).one_or_none()
         a = cls._init_from_scrape(url, session)
         if a is not None and ar is not None:
             # This article already existed in the database, so note its UUID
             a._uuid = ar.id
         return a
Example #26
File: main.py Project: haukurb/Reynir
def top_persons(limit=_TOP_PERSONS_LENGTH):
    """ Return a list of names and titles appearing recently in the news """
    toplist = dict()
    MAX_TITLE_LENGTH = 64

    with SessionContext(commit=True) as session:

        q = (
            session.query(Person.name, Person.title, Person.article_url,
                          Article.id).join(Article).join(Root).filter(
                              Root.visible).order_by(desc(Article.timestamp))
            [0:limit * 2]  # Go through up to 2 * N records
        )

        def is_better_title(new_title, old_title):
            len_new = len(new_title)
            len_old = len(old_title)
            if len_old >= MAX_TITLE_LENGTH:
                # Too long: we want a shorter one
                return len_new < len_old
            if len_new >= MAX_TITLE_LENGTH:
                # This one is too long: we don't want it
                return False
            # Otherwise, longer is better
            return len_new > len_old

        with BIN_Db.get_db() as bindb:
            for p in q:
                # Insert the name into the list if it's not already there,
                # or if the new title is judged better than the previous one
                if p.name not in toplist or is_better_title(
                        p.title, toplist[p.name][0]):
                    toplist[p.name] = (
                        correct_spaces(p.title),
                        p.article_url,
                        p.id,
                        bindb.lookup_name_gender(p.name),
                    )
                    if len(toplist) >= limit:
                        # We now have as many names as we initially wanted: terminate the loop
                        break

    with changedlocale() as strxfrm:
        # Convert the dictionary to a sorted list of dicts
        return sorted(
            [
                dict(name=name,
                     title=tu[0],
                     gender=tu[3],
                     url=tu[1],
                     uuid=tu[2]) for name, tu in toplist.items()
            ],
            key=lambda x: strxfrm(x["name"]),
        )
Example #27
def page():
    """ Handler for a page displaying the parse of an arbitrary web page by URL
        or an already scraped article by UUID """
    url = request.args.get("url", None)
    uuid = request.args.get("id", None)
    if url:
        url = url.strip()[0:_MAX_URL_LENGTH]
    if uuid:
        uuid = uuid.strip()[0:_MAX_UUID_LENGTH]
    if url:
        # URL has priority, if both are specified
        uuid = None
    if not url and not uuid:
        # !!! TODO: Separate error page
        return redirect(url_for('main'))

    with SessionContext(commit=True) as session:

        if uuid:
            a = ArticleProxy.load_from_uuid(uuid, session)
        elif url.startswith("http:") or url.startswith("https:"):
            # a = ArticleProxy.load_from_url(url, session)
            a = ArticleProxy.scrape_from_url(url,
                                             session)  # Forces a new scrape
        else:
            a = None

        if a is None:
            # !!! TODO: Separate error page
            return redirect(url_for('main'))

        # Prepare the article for display (may cause it to be parsed and stored)
        a.prepare(session, verbose=True, reload_parser=True)
        register = {}
        # Build register of person names
        for name in a.person_names():
            add_name_to_register(name, register, session)
        # Add register of entity names
        for name in a.entity_names():
            add_entity_to_register(name, register, session)
        # Fetch names of article topics, if any
        topics = session.query(ArticleTopic) \
            .filter(ArticleTopic.article_id == a.uuid).all()
        topics = [
            dict(name=t.topic.name, identifier=t.topic.identifier)
            for t in topics
        ]

        return render_template("page.html",
                               article=a,
                               register=register,
                               topics=topics)
Example #28
def make_trigrams(limit):
    """ Iterate through parsed articles and extract trigrams from
        successfully parsed sentences """

    with SessionContext(commit=True) as session:

        # Delete existing trigrams
        Trigram.delete_all(session)
        # Iterate through the articles
        q = session.query(Article.url, Article.timestamp, Article.tree) \
            .filter(Article.tree != None) \
            .order_by(Article.timestamp)
        if limit is None:
            q = q.yield_per(200)
        else:
            q = q[0:limit]

        def tokens(q):
            """ Generator for token stream """
            for a in q:
                print(
                    "Processing article from {0.timestamp}: {0.url}".format(a))
                tree = TreeTokenList()
                tree.load(a.tree)
                for ix, toklist in tree.sentences():
                    if toklist:
                        # For each sentence, start and end with empty strings
                        yield ""
                        yield ""
                        for t in toklist:
                            yield t.token[1:-1]
                        yield ""
                        yield ""

        def trigrams(iterable):
            return zip(*((islice(seq, i, None)
                          for i, seq in enumerate(tee(iterable, 3)))))

        FLUSH_THRESHOLD = 0  # 200 # Flush once every 200 records
        cnt = 0
        for tg in trigrams(tokens(q)):
            # print("{0}".format(tg))
            if any(w for w in tg):
                try:
                    Trigram.upsert(session, *tg)
                    cnt += 1
                    if cnt == FLUSH_THRESHOLD:
                        session.flush()
                        cnt = 0
                except DatabaseError as ex:
                    print("*** Exception {0} on trigram {1}, skipped".format(
                        ex, tg))
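The trigrams() helper above is the standard tee/islice sliding-window idiom: three copies of the token stream, offset by 0, 1 and 2 positions, zipped back together so every run of three consecutive tokens comes out as one tuple. A small standalone check (restated here so it runs on its own) behaves like this:

    from itertools import islice, tee

    def trigrams(iterable):
        # Same sliding-window construction as in make_trigrams() above
        return zip(*(islice(seq, i, None)
                     for i, seq in enumerate(tee(iterable, 3))))

    print(list(trigrams(["", "", "a", "b", "c", "", ""])))
    # -> [('', '', 'a'), ('', 'a', 'b'), ('a', 'b', 'c'), ('b', 'c', ''), ('c', '', '')]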
Example #29
 def parse(self,
           enclosing_session=None,
           verbose=False,
           reload_parser=False):
     """ Force a parse of the article """
     with SessionContext(enclosing_session, commit=True) as session:
         if reload_parser:
             # We need a parse: Make sure we're using the newest grammar
             self.reload_parser()
         self._parse(session, verbose=verbose)
         if self._tree is not None or self._tokens is not None:
             # Store the updated article in the database
             self.store(session)
Example #30
File: main.py Project: haukurb/Reynir
def page():
    """ Handler for a page displaying the parse of an arbitrary web page by URL
        or an already scraped article by UUID """
    url = request.args.get("url", None)
    uuid = request.args.get("id", None)
    if url:
        url = url.strip()[0:_MAX_URL_LENGTH]
    if uuid:
        uuid = uuid.strip()[0:_MAX_UUID_LENGTH]
    if url:
        # URL has priority, if both are specified
        uuid = None
    if not url and not uuid:
        # !!! TODO: Separate error page
        return redirect(url_for("main"))

    with SessionContext(commit=True) as session:

        if uuid:
            a = ArticleProxy.load_from_uuid(uuid, session)
        elif url.startswith("http:") or url.startswith("https:"):
            # a = ArticleProxy.load_from_url(url, session)
            a = ArticleProxy.scrape_from_url(url,
                                             session)  # Forces a new scrape
        else:
            a = None

        if a is None:
            # !!! TODO: Separate error page
            return redirect(url_for("main"))

        # Prepare the article for display (may cause it to be parsed and stored)
        a.prepare(session, verbose=True, reload_parser=True)
        register = a.create_register(session)

        # Fetch names of article topics, if any
        topics = (session.query(ArticleTopic).filter(
            ArticleTopic.article_id == a.uuid).all())
        topics = [
            dict(name=t.topic.name, id=t.topic.identifier) for t in topics
        ]

        # Fetch similar (related) articles, if any
        DISPLAY = 10  # Display at most 10 matches
        similar = Search.list_similar_to_article(session, a.uuid, n=DISPLAY)

        return render_template("page.html",
                               article=a,
                               register=register,
                               topics=topics,
                               similar=similar)