Example #1
    def sentence_stream(limit=None, skip=None, skip_errors=True):
        """ Generator of a sentence stream consisting of `limit` sentences (or less) from the
            most recently parsed articles. Each sentence is a list of token dicts. """
        with SessionContext(commit=True, read_only=True) as session:

            q = (
                session.query(ArticleRow.url, ArticleRow.parsed, ArticleRow.tokens)
                .filter(ArticleRow.tokens != None)
                .order_by(desc(ArticleRow.parsed))
                .yield_per(200)
            )

            count = 0
            skipped = 0
            for a in q:
                doc = json.loads(a.tokens)
                for pg in doc:
                    for sent in pg:
                        if not sent:
                            continue
                        if skip_errors and any("err" in t for t in sent):
                            # Skip error sentences
                            continue
                        if skip is not None and skipped < skip:
                            # If requested, skip sentences from the front (useful for test set)
                            skipped += 1
                            continue
                        # Yield the sentence as a fresh token list
                        yield [t for t in sent]
                        # Are we done?
                        count += 1
                        if limit is not None and count >= limit:
                            return
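
The generator above is defined inside a class (note the indentation), so callers reach it
through that class. A minimal, hypothetical consumption sketch; the enclosing class name
`Article` and the "x" key for a token's surface text are assumptions about the surrounding
codebase:

    # Hypothetical usage: print the first 100 error-free sentences.
    # `Article` as the enclosing class and the "x" text key are assumptions.
    for sent in Article.sentence_stream(limit=100, skip_errors=True):
        print(" ".join(t.get("x", "") for t in sent))
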
Example #2
    def token_stream(limit=None, skip_errors=True):
        """ Generator of a token stream consisting of `limit` sentences (or less) from the
            most recently parsed articles. After each sentence, None is yielded. """
        with SessionContext(commit=True, read_only=True) as session:

            q = (
                session.query(ArticleRow.url, ArticleRow.parsed, ArticleRow.tokens)
                .filter(ArticleRow.tokens != None)
                .order_by(desc(ArticleRow.parsed))
                .yield_per(200)
            )

            count = 0
            for a in q:
                doc = json.loads(a.tokens)
                for pg in doc:
                    for sent in pg:
                        if not sent:
                            continue
                        if skip_errors and any("err" in t for t in sent):
                            # Skip error sentences
                            continue
                        for t in sent:
                            # Yield the tokens
                            yield t
                        yield None  # End-of-sentence marker
                        # Are we done?
                        count += 1
                        if limit is not None and count >= limit:
                            return
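
The None sentinel yielded after each sentence lets a consumer rebuild sentence boundaries
from the flat token stream. A hypothetical sketch, under the same assumptions as above:

    # Hypothetical usage: count tokens per sentence using the None sentinel
    tokens = []
    for t in Article.token_stream(limit=10):
        if t is None:
            # End of sentence: process the accumulated tokens, then reset
            print(len(tokens), "tokens")
            tokens = []
        else:
            tokens.append(t)
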
Example #3
def top_persons(limit=_TOP_PERSONS_LENGTH):
    """ Return a list of names and titles appearing recently in the news """
    toplist = dict()
    bindb = BIN_Db.get_db()

    with SessionContext(commit=True) as session:

        q = session.query(Person.name, Person.title, Person.article_url, Article.id) \
            .join(Article).join(Root) \
            .filter(Root.visible) \
            .order_by(desc(Article.timestamp))[0:limit * 2] # Go through up to 2 * N records

        for p in q:
            # Insert the name into the list if it's not already there,
            # or if the new title is longer than the previous one
            if p.name not in toplist or len(p.title) > len(toplist[p.name][0]):
                toplist[p.name] = (correct_spaces(p.title), p.article_url,
                                   p.id, bindb.lookup_name_gender(p.name))
                if len(toplist) >= limit:
                    # We now have as many names as we initially wanted: terminate the loop
                    break

    with changedlocale() as strxfrm:
        # Convert the dictionary to a sorted list of dicts
        return sorted(
            [
                dict(name=name, title=tu[0], gender=tu[3], url=tu[1], uuid=tu[2])
                for name, tu in toplist.items()
            ],
            key=lambda x: strxfrm(x["name"]),
        )
Example #4
def top_persons(limit=_TOP_PERSONS_LENGTH):
    """ Return a list of names and titles appearing recently in the news """
    toplist = dict()
    MAX_TITLE_LENGTH = 64

    with SessionContext(commit=True) as session:

        q = (
            session.query(Person.name, Person.title, Person.article_url, Article.id)
            .join(Article)
            .join(Root)
            .filter(Root.visible)
            .order_by(desc(Article.timestamp))[0:limit * 2]  # Go through up to 2 * N records
        )

        def is_better_title(new_title, old_title):
            len_new = len(new_title)
            len_old = len(old_title)
            if len_old >= MAX_TITLE_LENGTH:
                # Too long: we want a shorter one
                return len_new < len_old
            if len_new >= MAX_TITLE_LENGTH:
                # This one is too long: we don't want it
                return False
            # Otherwise, longer is better
            return len_new > len_old

        with BIN_Db.get_db() as bindb:
            for p in q:
                # Insert the name into the list if it's not already there,
                # or if the new title is better (per is_better_title) than the previous one
                if p.name not in toplist or is_better_title(
                        p.title, toplist[p.name][0]):
                    toplist[p.name] = (
                        correct_spaces(p.title),
                        p.article_url,
                        p.id,
                        bindb.lookup_name_gender(p.name),
                    )
                    if len(toplist) >= limit:
                        # We now have as many names as we initially wanted: terminate the loop
                        break

    with changedlocale() as strxfrm:
        # Convert the dictionary to a sorted list of dicts
        return sorted(
            [
                dict(name=name,
                     title=tu[0],
                     gender=tu[3],
                     url=tu[1],
                     uuid=tu[2]) for name, tu in toplist.items()
            ],
            key=lambda x: strxfrm(x["name"]),
        )
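
Both variants of top_persons() return a list of dicts, sorted by name under the changed
locale. A hypothetical rendering sketch; the dict keys follow directly from the code above:

    # Hypothetical usage: print a simple leaderboard of recently mentioned persons
    for p in top_persons(limit=10):
        print("{name} ({gender}): {title}".format(**p))
        # p["url"] and p["uuid"] link back to the source article
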
Example #5
def suggest(limit=10):
    """ Return suggestions for query field autocompletion """
    limit = request.args.get("limit", limit)
    txt = request.args.get("q", "").strip()

    suggestions = list()
    whois_prefix = "hver er "
    whatis_prefix = "hvað er "

    prefix = None
    if txt.lower().startswith(whois_prefix):
        prefix = whois_prefix
    elif txt.lower().startswith(whatis_prefix):
        prefix = whatis_prefix

    if not txt or not prefix:
        return better_jsonify(suggestions=suggestions)

    with SessionContext(commit=False) as session:
        name = txt[len(prefix):].strip()
        model_col = None

        # Hver er Jón Jónsson ?
        if prefix is whois_prefix and name[0].isupper():
            model_col = Person.name
        # Hver er seðlabankastjóri?
        elif prefix is whois_prefix:
            model_col = Person.title
        # Hvað er UNESCO?
        elif prefix is whatis_prefix:
            model_col = Entity.name

        q = (
            session.query(model_col, dbfunc.count(Article.id).label("total"))
            .filter(model_col.ilike(name + "%"))
            .join(Article)
            .group_by(model_col)
            .order_by(desc("total"))
            .limit(limit)
            .all()
        )

        prefix = prefix[:1].upper() + prefix[1:].lower()
        suggestions = [{"value": (prefix + p[0] + "?"), "data": ""} for p in q]

    return better_jsonify(suggestions=suggestions)
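
Since suggest() reads its parameters from the Flask request object, it is exercised via an
HTTP request rather than a direct call. A hypothetical sketch, assuming the view is
registered under a /suggest route on a Flask application object named `app` (both are
assumptions):

    # Hypothetical test-client call; the "/suggest" route and `app` are assumptions
    with app.test_client() as client:
        resp = client.get("/suggest", query_string={"q": "hver er s"})
        for s in resp.get_json().get("suggestions", []):
            print(s["value"])  # e.g. "Hver er ...?"
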
Example #6
    def articles(cls, criteria, enclosing_session=None):
        """ Generator of Article objects from the database that meet the given criteria """
        # The criteria are currently "timestamp", "author" and "domain",
        # as well as "order_by_parse" which if True indicates that the result
        # should be ordered with the most recently parsed articles first.
        with SessionContext(commit=True,
                            read_only=True,
                            session=enclosing_session) as session:

            # Only fetch articles that have a parse tree
            q = session.query(ArticleRow).filter(ArticleRow.tree != None)

            # timestamp is assumed to contain a tuple: (from, to)
            if criteria and "timestamp" in criteria:
                ts = criteria["timestamp"]
                q = (q.filter(ArticleRow.timestamp >= ts[0]).filter(
                    ArticleRow.timestamp < ts[1]))

            if criteria and "author" in criteria:
                author = criteria["author"]
                q = q.filter(ArticleRow.author == author)

            if criteria and ("visible" in criteria or "domain" in criteria):
                # Need a join with Root for these criteria
                q = q.join(Root)
                if "visible" in criteria:
                    # Return only articles from roots with the specified visibility
                    visible = criteria["visible"]
                    assert isinstance(visible, bool)
                    q = q.filter(Root.visible == visible)
                if "domain" in criteria:
                    # Return only articles from the specified domain
                    domain = criteria["domain"]
                    assert isinstance(domain, str)
                    q = q.filter(Root.domain == domain)

            if criteria and criteria.get("order_by_parse"):
                # Order with newest parses first
                q = q.order_by(desc(ArticleRow.parsed))

            for arow in q.yield_per(500):
                yield cls._init_from_row(arow)
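
A hypothetical sketch of driving the generator with a criteria dict, using the keys
documented in the comment above; the enclosing class name `Article` and the one-week
window are illustrative:

    # Hypothetical usage: iterate over visible articles parsed in the last week,
    # newest parses first
    from datetime import datetime, timedelta

    now = datetime.utcnow()
    criteria = {
        "timestamp": (now - timedelta(days=7), now),
        "visible": True,
        "order_by_parse": True,
    }
    for article in Article.articles(criteria):
        ...  # process each Article instance
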
Example #7
def top_news(topic=None, offset=0, limit=_TOP_NEWS_LENGTH):
    """ Return a list of articles (with a particular topic) in
        chronologically reversed order. """
    toplist = []

    with SessionContext(commit=True) as session:

        q = (
            session.query(Article)
            .join(Root)
            .filter(Article.tree != None)
            .filter(Article.timestamp != None)
            .filter(Article.timestamp <= datetime.utcnow())
            .filter(Article.heading > "")
            .filter(Article.num_sentences > 0)
            .filter(Root.visible == True)
        )

        if topic is not None:
            # Filter by topic identifier
            q = q.join(ArticleTopic).join(Topic).filter(
                Topic.identifier == topic)

        q = q.order_by(desc(Article.timestamp)).offset(offset).limit(limit)

        class ArticleDisplay:
            """ Utility class to carry information about an article to the web template """
            def __init__(
                self,
                heading,
                timestamp,
                url,
                uuid,
                num_sentences,
                num_parsed,
                icon,
                localized_date,
                source,
            ):
                self.heading = heading
                self.timestamp = timestamp
                self.url = url
                self.uuid = uuid
                self.num_sentences = num_sentences
                self.num_parsed = num_parsed
                self.icon = icon
                self.localized_date = localized_date
                self.source = source

            @property
            def width(self):
                """ The ratio of parsed sentences to the total number of sentences,
                    expressed as a percentage string """
                if self.num_sentences == 0:
                    return "0%"
                return "{0}%".format(
                    (100 * self.num_parsed) // self.num_sentences)

            @property
            def time(self):
                return self.timestamp.isoformat()[11:16]

            @property
            def date(self):
                if datetime.today().year == self.timestamp.year:
                    return self.localized_date
                return self.fulldate

            @property
            def fulldate(self):
                return self.localized_date + self.timestamp.strftime(" %Y")

        with changedlocale(category="LC_TIME"):
            for a in q:
                # Instantiate article objects from results
                source = a.root.domain
                icon = source + ".png"
                locdate = a.timestamp.strftime("%-d. %b")

                d = ArticleDisplay(
                    heading=a.heading,
                    timestamp=a.timestamp,
                    url=a.url,
                    uuid=a.id,
                    num_sentences=a.num_sentences,
                    num_parsed=a.num_parsed,
                    icon=icon,
                    localized_date=locdate,
                    source=source,
                )
                toplist.append(d)

    return toplist
Example #8
def top_news(topic=None, start=None, limit=_TOP_NEWS_LENGTH):
    """ Return a list of top recent news, of a particular topic,
        up to a particular start time, having a specified length """
    toplist = []
    topdict = dict()
    if start is None:
        start = datetime.utcnow()
    MARGIN = 10  # Get more articles than requested in case there are duplicates

    with SessionContext(commit=True) as session:

        q = session.query(Article).join(Root) \
            .filter(Article.tree != None) \
            .filter(Article.timestamp != None) \
            .filter(Article.timestamp < start) \
            .filter(Article.heading > "") \
            .filter(Root.visible == True)

        if topic is not None:
            # Filter by topic identifier
            q = q.join(ArticleTopic).join(Topic).filter(
                Topic.identifier == topic)

        q = q.order_by(desc(Article.timestamp))[0:limit + MARGIN]

        class ArticleDisplay:
            """ Utility class to carry information about an article to the web template """
            def __init__(self, heading, timestamp, url, uuid, num_sentences,
                         num_parsed, icon):
                self.heading = heading
                self.timestamp = timestamp
                self.url = url
                self.uuid = uuid
                self.num_sentences = num_sentences
                self.num_parsed = num_parsed
                self.icon = icon

            @property
            def width(self):
                """ The ratio of parsed sentences to the total number of sentences,
                    expressed as a percentage string """
                if self.num_sentences == 0:
                    return "0%"
                return "{0}%".format(
                    (100 * self.num_parsed) // self.num_sentences)

            @property
            def time(self):
                return self.timestamp.isoformat()[11:16]

            @property
            def date(self):
                return self.timestamp.isoformat()[0:10]

        for a in q:
            # Build a display object for each article and deduplicate by (domain, heading)
            icon = a.root.domain + ".ico"

            d = ArticleDisplay(heading=a.heading,
                               timestamp=a.timestamp,
                               url=a.url,
                               uuid=a.id,
                               num_sentences=a.num_sentences,
                               num_parsed=a.num_parsed,
                               icon=icon)

            # Have we seen the same heading on the same domain?
            t = (a.root.domain, a.heading)
            if t in topdict:
                # Same domain+heading already in the list
                i = topdict[t]
                if d.timestamp > toplist[i].timestamp:
                    # The new entry is newer: replace the old one
                    toplist[i] = d
                # Otherwise, ignore the new entry and continue
            else:
                # New heading: note its index in the list
                llist = len(toplist)
                topdict[t] = llist
                toplist.append(d)
                if llist + 1 >= limit:
                    break

    return toplist[0:limit]
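
A hypothetical sketch of rendering the result of the second top_news() variant; the
attribute names come straight from ArticleDisplay above, and the topic identifier
"sport" is purely illustrative:

    # Hypothetical usage: list the latest headlines for one topic
    for art in top_news(topic="sport", limit=20):
        print("{0} {1}  {2} ({3} parsed)".format(art.date, art.time, art.heading, art.width))
        # art.url and art.uuid identify the underlying article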