Example #1
    def all_matches(
        cls,
        criteria: Mapping[str, Any],
        pattern: str,
        enclosing_session: Optional[Session] = None,
    ) -> Iterator[Tuple["Article", int, SimpleTree]]:
        """ Generator of SimpleTree objects (see matcher.py) from
            articles matching the given criteria and the pattern """

        with SessionContext(commit=True,
                            read_only=True,
                            session=enclosing_session) as session:

            # t0 = time.time()
            mcnt = acnt = tcnt = 0
            # print("Starting article loop")
            for a in cls.articles(criteria, enclosing_session=session):
                if a.tree is None:
                    continue
                acnt += 1
                tree = Tree(url=a.url or "", authority=a.authority)
                tree.load(a.tree)
                for ix, simple_tree in tree.simple_trees():
                    tcnt += 1
                    for match in simple_tree.all_matches(pattern):
                        yield (a, ix, match)
                        mcnt += 1
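The example above illustrates a convention that recurs throughout these snippets: the function takes an optional enclosing_session, and SessionContext either joins that session or opens (and, with commit=True, commits and closes) its own. Below is a minimal sketch of the same pattern; the helper name count_parsed_articles is hypothetical, and only the SessionContext keywords and the Article.tree column seen in the other examples are assumed.

def count_parsed_articles(enclosing_session: Optional[Session] = None) -> int:
    """ Hypothetical helper: count the articles that have a stored parse
        tree, reusing the caller's session when one is provided """
    with SessionContext(commit=True,
                        read_only=True,
                        session=enclosing_session) as session:
        # Article.tree is NULL until the article has been parsed
        return session.query(Article).filter(Article.tree != None).count()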
Example #2
File: article.py  Project: Loknar/Greynir
    def token_stream(limit=None, skip_errors=True):
        """ Generator of a token stream consisting of `limit` sentences
            (or less) from the most recently parsed articles. After
            each sentence, None is yielded. """
        with SessionContext(commit=True, read_only=True) as session:

            q = (session.query(
                ArticleRow.url, ArticleRow.parsed,
                ArticleRow.tokens).filter(ArticleRow.tokens != None).order_by(
                    desc(ArticleRow.parsed)).yield_per(200))

            count = 0
            for a in q:
                doc = json.loads(a.tokens)
                for pg in doc:
                    for sent in pg:
                        if not sent:
                            continue
                        if skip_errors and any("err" in t for t in sent):
                            # Skip error sentences
                            continue
                        for t in sent:
                            # Yield the tokens
                            yield t
                        yield None  # End-of-sentence marker
                        # Are we done?
                        count += 1
                        if limit is not None and count >= limit:
                            return
Example #3
    def fetch_url_html(cls, url, enclosing_session=None):
        """ Fetch a URL using the scraping mechanism, returning
            a tuple (html, metadata, helper) or None if error """

        with SessionContext(enclosing_session) as session:

            helper = cls.helper_for(session, url)

            if helper is None or not hasattr(helper, "fetch_url"):
                # Do a straight HTTP fetch
                html_doc = cls.raw_fetch_url(url)
            else:
                # Hand off to the helper
                html_doc = helper.fetch_url(url)

            if not html_doc:
                return (None, None, None)

            # Parse the HTML
            soup = Fetcher.make_soup(html_doc, helper)
            if soup is None:
                logging.warning(
                    "Fetcher.fetch_url_html({0}): No soup".format(url))
                return (None, None, None)

            # Obtain the metadata from the resulting soup
            metadata = helper.get_metadata(soup) if helper else None
            return (html_doc, metadata, helper)
Example #4
def main():

    try:
        # Read configuration file
        Settings.read(os.path.join(basepath, "config", "GreynirSimple.conf"))
    except ConfigError as e:
        print("Configuration error: {0}".format(e))
        quit()

    with SessionContext(commit=True) as session:

        # Zero sentences
        print("Deleting all articles with zero sentences")
        res = session.execute(
            ArticleModel.table().delete().where(ArticleModel.num_sentences == 0)
        )
        print(str(res.rowcount) + " articles deleted")

        # Non-Icelandic
        # TODO: Implement me!

        # Duplicates
        # For each https article, check whether there is a corresponding
        # article URL with http URI scheme
        dupl = 0
        q = session.query(ArticleModel.url).filter(ArticleModel.url.like("https://%"))
        for r in q.all():
            url = re.sub(r"^https://", r"http://", r.url)
            # c = session.query(ArticleModel.url).filter(ArticleModel.url == url).count()
            res = session.execute(
                ArticleModel.table().delete().where(ArticleModel.url == url)
            )
            dupl += res.rowcount
        print("{0} duplicate URLs w. HTTP scheme removed".format(dupl))
Example #5
File: fetcher.py  Project: Loknar/Greynir
    def fetch_url(cls, url, enclosing_session=None):
        """ Fetch a URL using the scraping mechanism, returning
            a tuple (metadata, content) or None if error """

        with SessionContext(enclosing_session) as session:

            helper = cls.helper_for(session, url)

            if helper is None or not hasattr(helper, "fetch_url"):
                # Do a straight HTTP fetch
                html_doc = cls.raw_fetch_url(url)
            else:
                # Hand off to the helper
                html_doc = helper.fetch_url(url)

            if not html_doc:
                return None

            # Parse the HTML
            soup = Fetcher.make_soup(html_doc, helper)
            if soup is None:
                print("Fetcher.fetch_url({0}): No soup or no soup.html".format(
                    url))
                return None

            # Obtain the metadata and the content from the resulting soup
            metadata = helper.get_metadata(soup) if helper else None
            content = helper.get_content(soup) if helper else soup.html.body
            return (metadata, content)
Example #6
    def _load_topics(self):
        """ Load all article topics into the self._atopics dictionary """
        self._atopics = {}
        with SessionContext(commit=True, read_only=True) as session:
            print("Starting load of all article topic vectors")
            t0 = time.time()
            # Do the next refresh from this time point
            self._timestamp = datetime.utcnow()
            q = (session.query(Article).join(Root).filter(
                Root.visible).with_entities(Article.id, Article.topic_vector))

            for a in q.yield_per(2000):
                if a.topic_vector:
                    # Load the topic vector into a numpy array
                    vec = json.loads(a.topic_vector)
                    if isinstance(
                            vec, list) and len(vec) == self._corpus.dimensions:
                        self._atopics[a.id] = np.array(vec)
                    else:
                        print("Warning: faulty topic vector for article {0}".
                              format(a.id))

            t1 = time.time()
            print("Loading of {0} topic vectors completed in {1:.2f} seconds".
                  format(len(self._atopics), t1 - t0))
Example #7
File: query.py  Project: sultur/Greynir
def store_query_data(client_id: str, key: str,
                     data: ClientDataDict) -> bool:
    """ Save client query data in the database, under the given key """
    if not client_id or not key:
        return False
    now = datetime.utcnow()
    try:
        with SessionContext(commit=True) as session:
            row = (
                session.query(QueryData)
                .filter(QueryData.key == key)
                .filter(QueryData.client_id == client_id)
            ).one_or_none()
            if row is None:
                # Not already present: insert
                row = QueryData(
                    client_id=client_id,
                    key=key,
                    created=now,
                    modified=now,
                    data=data,
                )
                session.add(row)
            else:
                # Already present: update
                row.data = data  # type: ignore
                row.modified = now  # type: ignore
        # The session is auto-committed upon exit from the context manager
        return True
    except Exception as e:
        logging.error("Error storing query data in db: {0}".format(e))
    return False
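A hedged usage sketch for store_query_data above: the caller passes a client id, a key, and a JSON-serializable dict, and only sees a boolean result, while the insert-or-update decision and the commit happen inside the SessionContext block. The id, key, and payload below are made up.

ok = store_query_data("some-client-id", "userdata", {"language": "is"})
if not ok:
    logging.warning("Could not store query data for client")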
Example #8
File: stats.py  Project: Loknar/Greynir
def _gen_most_freq_queries_answer(q):
    """ Answer question concerning most frequent queries. """
    with SessionContext(read_only=True) as session:
        now = datetime.utcnow()
        start = now - timedelta(days=_QUERIES_PERIOD)
        end = now
        qr = QueryTypesQuery.period(start=start,
                                    end=end,
                                    enclosing_session=session)

        if qr:
            top_qtype = qr[0][1]
            desc = _QTYPE_TO_DESC.get(
                top_qtype) or "óskilgreindum fyrirspurnum"
            answer = "Undanfarið hef ég mest svarað {0}.".format(desc)
        else:
            answer = "Ég hef ekki svarað neinum fyrirspurnum upp á síðkastið."

        response = dict(answer=answer)
        voice = answer

        q.set_expires(now + timedelta(hours=1))
        q.set_answer(response, answer, voice)
        q.set_qtype(_STATS_QTYPE)
        q.set_key("FreqQuery")

        return True
Example #9
    def refresh_topics(self):
        """ Load any new article topics into the _atopics dict """
        with self._lock:
            with SessionContext(commit=True, read_only=True) as session:
                # Do the next refresh from this time point
                ts = datetime.utcnow()
                q = (
                    session.query(Article)
                    .join(Root)
                    .filter(Root.visible)
                    .filter(Article.indexed >= self._timestamp)
                    .with_entities(Article.id, Article.topic_vector)
                )
                self._timestamp = ts
                count = 0
                for a in q.yield_per(100):
                    if a.topic_vector:
                        # Load the topic vector into a numpy array
                        vec = json.loads(a.topic_vector)
                        if (isinstance(vec, list)
                                and len(vec) == self._corpus.dimensions):
                            self._atopics[a.id] = np.array(vec)
                            count += 1
                        else:
                            print(
                                "Warning: faulty topic vector for article {0}".format(a.id)
                            )
                print(
                    "Completed refresh_topics, {0} article vectors added".format(count)
                )
Example #10
    def fetch_article(cls, url, enclosing_session=None):
        """ Fetch a previously scraped article, returning
            a tuple (article, metadata, content) or None if error """

        with SessionContext(enclosing_session) as session:

            article = cls.find_article(url, session)
            if article is None:
                return (None, None, None)

            html_doc = article.html
            if not html_doc:
                return (None, None, None)

            helper = cls.helper_for(session, url)
            # Parse the HTML
            soup = Fetcher.make_soup(html_doc, helper)
            if soup is None:
                logging.warning(
                    "Fetcher.fetch_article({0}): No soup".format(url))
                return (None, None, None)

            # Obtain the metadata and the content from the resulting soup
            metadata = helper.get_metadata(soup) if helper else None
            content = helper.get_content(soup) if helper else soup.html.body
            return (article, metadata, content)
Example #11
def postag_api(version=1):
    """ API to parse text and return POS tagged tokens in a verbose JSON format """
    if not (1 <= version <= 1):
        # Unsupported version
        return better_jsonify(valid=False, reason="Unsupported version")

    try:
        text = text_from_request(request)
    except Exception:
        return better_jsonify(valid=False, reason="Invalid request")

    with SessionContext(commit=True) as session:
        pgs, stats, register = TreeUtility.tag_text(session, text, all_names=True)
        # Amalgamate the result into a single list of sentences
        pa: List[List[TokenDict]] = []
        if pgs:
            # Only process the first paragraph, if there are many of them
            if len(pgs) == 1:
                pa = pgs[0]
            else:
                # More than one paragraph: gotta concatenate 'em all
                for pg in pgs:
                    pa.extend(pg)
        for sent in pa:
            # Transform the token representation into a
            # nice canonical form for outside consumption
            # err = any("err" in t for t in sent)
            for t in sent:
                canonicalize_token(t)

    # Return the tokens as a JSON structure to the client
    return better_jsonify(valid=True, result=pa, stats=stats, register=register)
Example #12
def test_del_query_history(client):
    """ Test query history deletion API. """

    with SessionContext(commit=False) as session:
        # If the database contains the logged query "GREYNIR_TESTING", we know that
        # the tests are running on the dummy data in tests/test_files/test_queries.csv.
        cnt = session.query(Query).filter(Query.question == "GREYNIR_TESTING").count()
        if cnt != 1:
            return

        # Num queries in dummy test data
        TEST_EXPECTED_NUM_QUERIES = 6

        # We expect one query with this client ID
        TEST_CLIENT_ID = "123456789"

        # Number of queries prior to API call
        pre_numq = session.query(Query).count()
        assert pre_numq == TEST_EXPECTED_NUM_QUERIES, "Malformed dummy test data"

        qstr = urlencode(
            {"action": "clear", "client_type": "some_type", "client_id": TEST_CLIENT_ID}
        )

        _ = client.get("/query_history.api?" + qstr)

        post_numq = session.query(Query).count()

        assert post_numq == pre_numq - 1
Example #13
def _get_cached_entry(name, url, enclosing_session=None):
    """ Fetch cached entry by key and url """
    with SessionContext(commit=True, session=enclosing_session) as session:
        # TODO: content column should be converted to jsonb
        # from varchar to query faster & more intelligently
        return (session.query(Link).filter(Link.key == name).filter(
            Link.content.like("%" + url + "%")).one_or_none())
Example #14
def _blacklisted_urls_for_key(key, enclosing_session=None):
    """ Fetch blacklisted urls for a given key """
    with SessionContext(commit=True, session=enclosing_session) as session:
        q = (session.query(BlacklistedLink.url).filter(
            BlacklistedLink.link_type == "image").filter(
                BlacklistedLink.key == key).all())
        return [r for (r, ) in q]
Example #15
    def _init_from_scrape(cls,
                          url: Optional[str],
                          enclosing_session: Optional[Session] = None):
        """ Scrape an article from its URL """
        if url is None:
            return None
        a = cls(url=url)
        with SessionContext(enclosing_session) as session:
            # Obtain a helper corresponding to the URL
            html, metadata, helper = Fetcher.fetch_url_html(url, session)
            if html is None:
                return a
            a._html = html
            if metadata is not None:
                a._heading = metadata.heading
                a._author = metadata.author
                a._timestamp = metadata.timestamp
                a._authority = metadata.authority
            a._scraped = datetime.utcnow()
            if helper is not None:
                helper = cast(Any, helper)
                a._scr_module = helper.scr_module
                a._scr_class = helper.scr_class
                a._scr_version = helper.scr_version
                a._root_id = helper.root_id
                a._root_domain = helper.domain
            return a
Example #16
def reparse_api(version=1):
    """ Reparse an already parsed and stored article with a given UUID """
    if not (1 <= version <= 1):
        return better_jsonify(valid="False", reason="Unsupported version")

    uuid = request.form.get("id", "").strip()[0:_MAX_UUID_LENGTH]
    tokens = None
    register = {}
    stats = {}

    with SessionContext(commit=True) as session:
        # Load the article
        a = ArticleProxy.load_from_uuid(uuid, session)
        if a is not None:
            # Found: Parse it (with a fresh parser) and store the updated version
            a.parse(session, verbose=True, reload_parser=True)
            # Save the tokens
            tokens = a.tokens
            # Build register of person names
            register = a.create_register(session)
            stats = dict(
                num_tokens=a.num_tokens,
                num_sentences=a.num_sentences,
                num_parsed=a.num_parsed,
                ambiguity=a.ambiguity,
            )

    # Return the tokens as a JSON structure to the client,
    # along with a name register and article statistics
    return better_jsonify(valid=True,
                          result=tokens,
                          register=register,
                          stats=stats)
Example #17
def feedback_api(version=1):
    """ Endpoint to accept submitted feedback forms. """

    if not (1 <= version <= 1):
        return better_jsonify(valid=False, reason="Unsupported version")

    name = request.values.get("name")
    email = request.values.get("email")
    comment = request.values.get("comment")
    topic = request.values.get("topic")

    if comment:
        with SessionContext(commit=True) as session:
            try:
                qrow = Feedback(
                    timestamp=datetime.utcnow(),
                    topic=topic,
                    name=name,
                    email=email,
                    comment=comment,
                )
                session.add(qrow)
                return better_jsonify(valid=True)
            except Exception as e:
                logging.error("Error saving feedback to db: {0}".format(e))

    return better_jsonify(valid=False)
Example #18
def parse_api(version=1):
    """ API to parse text and return POS tagged tokens in JSON format """
    if not (1 <= version <= 1):
        # Unsupported version
        return better_jsonify(valid=False, reason="Unsupported version")

    try:
        text = text_from_request(request)
    except Exception:
        return better_jsonify(valid=False, reason="Invalid request")

    with SessionContext(commit=True) as session:
        pgs, stats, register = TreeUtility.parse_text(session,
                                                      text,
                                                      all_names=True)
        # In this case, we should always get a single paragraph back
        if pgs:
            # Only process the first paragraph, if there are many of them
            if len(pgs) == 1:
                pgs = pgs[0]
            else:
                # More than one paragraph: gotta concatenate 'em all
                pa = []
                for pg in pgs:
                    pa.extend(pg)
                pgs = pa

    # Return the tokens as a JSON structure to the client
    return better_jsonify(valid=True,
                          result=pgs,
                          stats=stats,
                          register=register)
Example #19
def dump_tokens(limit):
    """ Iterate through parsed articles and print a list
        of tokens and their matched terminals """

    dtd = dict()
    with BIN_Db.get_db() as db, SessionContext(commit=True) as session:
        # Iterate through the articles
        q = (session.query(Article).filter(Article.tree != None).order_by(
            Article.timestamp))
        if limit is None:
            q = q.all()
        else:
            q = q[0:limit]
        for a in q:
            print(
                "\nARTICLE\nHeading: '{0.heading}'\nURL: {0.url}\nTimestamp: {0.timestamp}"
                .format(a))
            tree = TreeTokenList()
            tree.load(a.tree)
            for ix, toklist in tree.token_lists():
                print("\nSentence {0}:".format(ix))
                at_start = True
                for t in toklist:
                    if t.tokentype == "WORD":
                        wrd = t.token[1:-1]
                        td = dtd.get(t.terminal)
                        if td is None:
                            td = TerminalDescriptor(t.terminal)
                            dtd[t.terminal] = td
                        stem = td.stem(db, wrd, at_start)
                        at_start = False
                        print("    {0} {1} {2}".format(wrd, stem, t.terminal))
                    else:
                        print("    {0.token} {0.cat} {0.terminal}".format(t))
Example #20
File: article.py  Project: Loknar/Greynir
    def sentence_stream(limit=None, skip=None, skip_errors=True):
        """ Generator of a sentence stream consisting of `limit`
            sentences (or less) from the most recently parsed articles.
            Each sentence is a list of token dicts. """
        with SessionContext(commit=True, read_only=True) as session:

            q = (session.query(
                ArticleRow.url, ArticleRow.parsed,
                ArticleRow.tokens).filter(ArticleRow.tokens != None).order_by(
                    desc(ArticleRow.parsed)).yield_per(200))

            count = 0
            skipped = 0
            for a in q:
                doc = json.loads(a.tokens)
                for pg in doc:
                    for sent in pg:
                        if not sent:
                            continue
                        if skip_errors and any("err" in t for t in sent):
                            # Skip error sentences
                            continue
                        if skip is not None and skipped < skip:
                            # If requested, skip sentences from the front
                            # (useful for test set)
                            skipped += 1
                            continue
                        # Yield the sentence as a fresh token list
                        yield [t for t in sent]
                        # Are we done?
                        count += 1
                        if limit is not None and count >= limit:
                            return
Example #21
    def token_stream(
            limit: Optional[int] = None,
            skip_errors: bool = True) -> Iterator[Optional[TokenDict]]:
        """ Generator of a token stream consisting of `limit` sentences
            (or less) from the most recently parsed articles. After
            each sentence, None is yielded. """
        with SessionContext(commit=True, read_only=True) as session:

            q: SqlQuery[ArticleRow] = (session.query(
                ArticleRow.url, ArticleRow.parsed,
                ArticleRow.tokens).filter(ArticleRow.tokens != None).order_by(
                    desc(cast(Column, ArticleRow.parsed))).yield_per(200))

            count = 0
            for a in q:
                assert a is not None
                if not a.tokens:
                    continue
                doc = cast(PgsList, json.loads(a.tokens))
                for pg in doc:
                    for sent in pg:
                        if not sent:
                            continue
                        if skip_errors and any("err" in t for t in sent):
                            # Skip error sentences
                            continue
                        for t in sent:
                            # Yield the tokens
                            yield t
                        yield None  # End-of-sentence marker
                        # Are we done?
                        count += 1
                        if limit is not None and count >= limit:
                            return
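Because the generator yields None as an end-of-sentence marker, a consumer can rebuild sentences by splitting the stream on those markers. A small sketch under the assumption that token_stream is exposed as a static method of the article class it is defined in (Article in article.py); the limit value is arbitrary.

sentences: List[List[TokenDict]] = []
current: List[TokenDict] = []
for tok in Article.token_stream(limit=100):
    if tok is None:
        # End-of-sentence marker: store the accumulated sentence
        if current:
            sentences.append(current)
            current = []
    else:
        current.append(tok)
print("Collected {0} sentences".format(len(sentences)))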
Example #22
    def __iter__(self):
        """ Iterate through articles (documents) """
        print("Starting iteration through corpus from words table")
        if self._dictionary is not None:
            xform = lambda x: self._dictionary.doc2bow(x)
        else:
            xform = lambda x: x
        with SessionContext(commit=True) as session:
            # Fetch bags of words sorted by articles
            q = (
                session.query(Word.article_id, Word.stem, Word.cat, Word.cnt)
                .order_by(Word.article_id)
                .yield_per(2000)
            )
            bag = []
            last_uuid = None
            for uuid, stem, cat, cnt in q:
                if uuid != last_uuid:
                    if bag:
                        # Finishing the last article: yield its bag
                        # print("Yielding bag of {0} words".format(len(bag)))
                        yield xform(bag)
                        bag = []
                    # Beginning a new article with an empty bag
                    last_uuid = uuid
                # Convert stem to lowercase and replace spaces with underscores
                w = w_from_stem(stem, cat)
                if cnt == 1:
                    bag.append(w)
                else:
                    bag.extend([w] * cnt)
            if (last_uuid is not None) and bag:
                # print("Yielding bag of {0} words".format(len(bag)))
                yield xform(bag)
        print("Finished iteration through corpus from words table")
Example #23
def article_api(version=1):
    """ Obtain information about an article, given its URL or id """

    if not (1 <= version <= 1):
        return better_jsonify(valid=False, reason="Unsupported version")

    url = request.values.get("url")
    uuid = request.values.get("id")

    if url:
        url = url.strip()[0:_MAX_URL_LENGTH]
    if uuid:
        uuid = uuid.strip()[0:_MAX_UUID_LENGTH]
    if url:
        # URL has priority, if both are specified
        uuid = None
    if not url and not uuid:
        return better_jsonify(valid=False,
                              reason="No url or id specified in query")

    with SessionContext(commit=True) as session:

        if uuid:
            a = ArticleProxy.load_from_uuid(uuid, session)
        elif url.startswith("http:") or url.startswith("https:"):
            a = ArticleProxy.load_from_url(url, session)
        else:
            a = None

        if a is None:
            return better_jsonify(valid=False, reason="Article not found")

        if a.html is None:
            return better_jsonify(valid=False,
                                  reason="Unable to fetch article")

        # Prepare the article for display
        a.prepare(session)
        register = a.create_register(session, all_names=True)
        # Fetch names of article topics, if any
        topics = (session.query(ArticleTopic).filter(
            ArticleTopic.article_id == a.uuid).all())
        topics = [
            dict(name=t.topic.name, id=t.topic.identifier) for t in topics
        ]

    return better_jsonify(
        valid=True,
        url=a.url,
        id=a.uuid,
        heading=a.heading,
        author=a.author,
        ts=a.timestamp.isoformat()[0:19],
        num_sentences=a.num_sentences,
        num_parsed=a.num_parsed,
        ambiguity=a.ambiguity,
        register=register,
        topics=topics,
    )
Example #24
def _purge_single(key, ctype=None, enclosing_session=None):
    """ Remove cache entry """
    with SessionContext(commit=True, session=enclosing_session) as session:
        filters = [Link.key == key]
        if ctype:
            filters.append(Link.ctype == ctype)

        session.query(Link).filter(*filters).delete()
Example #25
    def load_from_url(cls, url, enclosing_session=None):
        """ Load or scrape an article, given its URL """
        with SessionContext(enclosing_session) as session:
            ar = session.query(ArticleRow).filter(ArticleRow.url == url).one_or_none()
            if ar is not None:
                return cls._init_from_row(ar)
            # Not found in database: attempt to fetch
            return cls._init_from_scrape(url, session)
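The enclosing_session parameter makes it easy to process several URLs within one database session: open a single SessionContext and pass the session to each call, as the article_api example above does with ArticleProxy.load_from_url. A hypothetical driver loop (the URLs are made up):

with SessionContext(commit=True) as session:
    for u in ["http://example.com/frett-1", "http://example.com/frett-2"]:
        a = ArticleProxy.load_from_url(u, session)
        if a is None or a.html is None:
            # Mirrors the checks made in the article_api example above
            logging.warning("Could not load or scrape {0}".format(u))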
Example #26
def top_locations(limit=_TOP_LOC_LENGTH, kind=None, days=_TOP_LOC_PERIOD):
    """ Return a list of recent locations along with the list of
        articles in which they are mentioned """

    with SessionContext(read_only=True) as session:
        q = (session.query(
            Location.name,
            Location.kind,
            Location.country,
            Location.article_url,
            Location.latitude,
            Location.longitude,
            Article.id,
            Article.heading,
            Root.domain,
        ).join(Article, Article.url == Location.article_url).filter(
            Article.timestamp > datetime.utcnow() -
            timedelta(days=days)).join(Root).filter(Root.visible))

        # Filter by kind
        if kind:
            q = q.filter(Location.kind == kind)

        q = q.order_by(desc(Article.timestamp))

        # Group articles by unique location
        locs = defaultdict(list)
        for r in q.all():
            article = {
                "url": r.article_url,
                "id": r.id,
                "heading": r.heading,
                "domain": r.domain,
            }
            k = (r.name, r.kind, r.country, r.latitude, r.longitude)
            locs[k].append(article)

        # Create top locations list sorted by article count
        loclist = []
        for k, v in locs.items():
            (name, kind, country, lat, lon) = k  # Unpack tuple key
            # Google map links currently use the placename instead of
            # coordinates. This works well for most Icelandic and
            # international placenames, but fails on some.
            map_url = GMAPS_PLACE_URL.format(name)
            # if lat and lon:
            #     map_url = GMAPS_COORD_URL.format(lat, lon, "7z")

            loclist.append({
                "name": name,
                "kind": kind,
                "country": country,
                "map_url": map_url,
                "articles": v,
            })
        loclist.sort(key=lambda x: len(x["articles"]), reverse=True)

        return loclist[:limit]
Example #27
def graph_data(num_persons=_DEFAULT_NUM_PERSONS_GRAPH):
    """ Get and prepare data for people graph """
    with SessionContext(read_only=True) as session:
        # Find all persons mentioned in articles that
        # have at least two names (i.e. match whitespace)
        q = (session.query(Word.stem, Word.article_id, Word.cat).filter(
            Word.cat.like("person_%")).filter(Word.stem.like("% %")))
        res = q.all()

        # Count number of occurrences of each name
        cnt = Counter()
        for name, _, _ in res:
            cnt[name] += 1

        # Get most common names
        names = [name for name, freq in cnt.most_common(num_persons)]

        # Generate dict mapping article ids to a set of top names mentioned
        articles = defaultdict(set)
        for name, art_id, _ in res:
            if name in names:
                articles[art_id].add(name)

        # Find all links
        nlinks = defaultdict(int)
        for a_id, persons in articles.items():
            if len(persons) < 2:
                # We need at least two names to establish a link
                continue

            # Find all permutations of people mentioned in article
            perm = list(permutations(persons, 2))
            for a, b in perm:
                # We use a sorted tuple as hashable dict key when
                # counting number of connections between any two names
                k = tuple(sorted([names.index(a), names.index(b)]))
                nlinks[k] += 1

        # Create final link and node data structures
        links = [{
            "source": k[0],
            "target": k[1],
            "weight": v
        } for k, v in nlinks.items()]
        nodes = []
        for idx, n in enumerate(names):
            # print(cnt[n])
            # TODO: Normalize influence
            nodes.append({
                "name": n,
                "id": idx,
                "influence": cnt[n] / 7,
                "zone": 0
            })

        dataset = {"nodes": nodes, "links": links}

        return dataset
Example #28
def news():
    """ Handler for a page with a list of articles + pagination """
    topic = request.args.get("topic")
    root = request.args.get("root")
    author = request.args.get("author")

    try:
        offset = max(0, int(request.args.get("offset", 0)))
        limit = max(0, int(request.args.get("limit", _DEFAULT_NUM_ARTICLES)))
    except Exception:
        offset = 0
        limit = _DEFAULT_NUM_ARTICLES

    limit = min(limit, _MAX_NUM_ARTICLES)  # Cap at max 100 results per page

    with SessionContext(read_only=True) as session:
        # Fetch articles
        articles = fetch_articles(
            topic=topic,
            offset=offset,
            limit=limit,
            root=root,
            author=author,
            enclosing_session=session,
        )

        # If all articles in the list are timestamped within 24 hours of now,
        # we display their times in HH:MM format. Otherwise, we display full date.
        display_time = True
        if articles and (datetime.utcnow() - articles[-1].timestamp).days >= 1:
            display_time = False

        # Fetch lists of article topics
        q = session.query(Topic.identifier, Topic.name).order_by(Topic.name).all()
        d = {t[0]: t[1] for t in q}
        topics = dict(id=topic, name=d.get(topic, ""), topic_list=q)

        # Fetch list of article sources (roots)
        q = (
            session.query(Root.domain, Root.description)
            .filter(Root.visible == True)
            .order_by(Root.description)
        )
        roots = dict(q.all())

    return render_template(
        "news.html",
        title="Fréttir",
        articles=articles,
        topics=topics,
        display_time=display_time,
        offset=offset,
        limit=limit,
        selected_root=root,
        roots=roots,
        author=author,
    )
Example #29
def stats():
    """ Render a page containing various statistics from the Greynir database. """
    days = _DEFAULT_STATS_PERIOD
    try:
        days = min(_MAX_STATS_PERIOD,
                   int(request.args.get("days", _DEFAULT_STATS_PERIOD)))
    except Exception:
        pass

    chart_data: Dict[str, Any] = dict()

    with SessionContext(read_only=True) as session:

        # Article stats
        sq = StatsQuery()
        result = sq.execute(session)
        total = dict(art=Decimal(), sent=Decimal(), parsed=Decimal())
        for r in result:
            total["art"] += r.art
            total["sent"] += r.sent
            total["parsed"] += r.parsed

        # Gender stats
        gq = GenderQuery()
        gresult = gq.execute(session)

        gtotal = dict(kvk=Decimal(),
                      kk=Decimal(),
                      hk=Decimal(),
                      total=Decimal())
        for r in gresult:
            gtotal["kvk"] += r.kvk
            gtotal["kk"] += r.kk
            gtotal["hk"] += r.hk
            gtotal["total"] += r.kvk + r.kk + r.hk

        # Author stats
        authresult = top_authors(session=session)

        # Chart stats
        chart_data = chart_stats(session=session, num_days=days)

    return render_template(
        "stats.html",
        title="Tölfræði",
        result=result,
        total=total,
        gresult=gresult,
        gtotal=gtotal,
        authresult=authresult,
        scraped_chart_data=json.dumps(chart_data["scraped"]),
        parsed_chart_data=json.dumps(chart_data["parsed"]),
        queries_chart_data=json.dumps(chart_data["queries"]),
        scraped_avg=int(round(chart_data["scraped"]["avg"])),
        parsed_avg=round(chart_data["parsed"]["avg"], 1),
        queries_avg=round(chart_data["queries"]["avg"], 1),
    )
Example #30
    def scrape_from_url(cls, url, enclosing_session=None):
        """ Force fetch of an article, given its URL """
        with SessionContext(enclosing_session) as session:
            ar = session.query(ArticleRow).filter(ArticleRow.url == url).one_or_none()
            a = cls._init_from_scrape(url, session)
            if a is not None and ar is not None:
                # This article already existed in the database, so note its UUID
                a._uuid = ar.id
            return a
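A hedged sketch contrasting the two class methods shown above, assuming both live on the class imported as ArticleProxy in the route examples: load_from_url returns the stored article when the URL is already in the database, while scrape_from_url always re-fetches, keeping the existing UUID if the article was already stored. The URL is made up.

with SessionContext(commit=True) as session:
    url = "http://example.com/some-article"
    cached = ArticleProxy.load_from_url(url, session)       # scrapes only if missing
    refreshed = ArticleProxy.scrape_from_url(url, session)  # always re-fetches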
Example #31
def main(argv=None):
    """ Guido van Rossum's pattern for a Python main function """

    if argv is None:
        argv = sys.argv
    try:
        try:
            opts, args = getopt.getopt(
                argv[1:], "hirl:u:", ["help", "init", "reparse", "limit=", "urls="]
            )
        except getopt.error as msg:
            raise Usage(msg)
        init = False
        # !!! DEBUG default limit on number of articles to parse, unless otherwise specified
        limit = 10
        reparse = False
        urls = None

        # Process options
        for o, a in opts:
            if o in ("-h", "--help"):
                print(__doc__)
                sys.exit(0)
            elif o in ("-i", "--init"):
                init = True
            elif o in ("-r", "--reparse"):
                reparse = True
            elif o in ("-l", "--limit"):
                # Maximum number of articles to parse
                try:
                    limit = int(a)
                except ValueError:
                    pass
            elif o in ("-u", "--urls"):
                urls = a  # Text file with list of URLs

        # Process arguments
        for _ in args:
            pass

        # Set logging format
        logging.basicConfig(
            format="%(asctime)s %(levelname)s:%(message)s", level=logging.INFO
        )

        # Read the configuration settings file
        try:
            Settings.read("config/Reynir.conf")
            # Don't run the scraper in debug mode
            Settings.DEBUG = False
        except ConfigError as e:
            print("Configuration error: {0}".format(e), file=sys.stderr)
            return 2

        if init:
            # Initialize the scraper database
            init_roots()
        else:
            # Run the scraper
            scrape_articles(reparse=reparse, limit=limit, urls=urls)

    except Usage as err:
        print(err.msg, file=sys.stderr)
        print("For help use --help", file=sys.stderr)
        return 2

    finally:
        SessionContext.cleanup()
        Article.cleanup()

    # Completed with no error
    return 0