def sentence_stream(limit=None, skip=None, skip_errors=True):
    """ Generator of a sentence stream consisting of `limit` sentences
        (or less) from the most recently parsed articles.
        Each sentence is a list of token dicts.

        limit: maximum number of sentences to yield, or None for no limit
        skip: number of sentences to skip from the front
            (useful for carving out a test set)
        skip_errors: if True, sentences containing a token with an
            "err" key are omitted
    """
    with SessionContext(commit=True, read_only=True) as session:
        # Only articles that have a token list; newest parses first
        q = (
            session.query(ArticleRow.url, ArticleRow.parsed, ArticleRow.tokens)
            .filter(ArticleRow.tokens != None)
            .order_by(desc(ArticleRow.parsed))
            .yield_per(200)
        )
        count = 0
        skipped = 0
        for a in q:
            doc = json.loads(a.tokens)
            for pg in doc:
                for sent in pg:
                    if not sent:
                        continue
                    if skip_errors and any("err" in t for t in sent):
                        # Skip error sentences
                        continue
                    if skip is not None and skipped < skip:
                        # If requested, skip sentences from the front
                        # (useful for test set)
                        skipped += 1
                        continue
                    # Yield the sentence as a fresh token list
                    # (list() instead of a copying comprehension)
                    yield list(sent)
                    # Are we done?
                    count += 1
                    if limit is not None and count >= limit:
                        return
def token_stream(limit=None, skip_errors=True):
    """ Generator of a token stream consisting of `limit` sentences
        (or less) from the most recently parsed articles.
        After each sentence, None is yielded as an end-of-sentence marker.

        limit: maximum number of sentences to stream, or None for no limit
        skip_errors: if True, sentences containing a token with an
            "err" key are omitted
    """
    with SessionContext(commit=True, read_only=True) as session:
        # Only articles that have a token list; newest parses first
        q = (
            session.query(ArticleRow.url, ArticleRow.parsed, ArticleRow.tokens)
            .filter(ArticleRow.tokens != None)
            .order_by(desc(ArticleRow.parsed))
            .yield_per(200)
        )
        count = 0
        for a in q:
            doc = json.loads(a.tokens)
            for pg in doc:
                for sent in pg:
                    if not sent:
                        continue
                    if skip_errors and any("err" in t for t in sent):
                        # Skip error sentences
                        continue
                    # Yield the tokens (yield from instead of a manual loop)
                    yield from sent
                    yield None  # End-of-sentence marker
                    # Are we done?
                    count += 1
                    if limit is not None and count >= limit:
                        return
def top_persons(limit=_TOP_PERSONS_LENGTH):
    """ Return a list of names and titles appearing recently in the news """
    best = {}
    bindb = BIN_Db.get_db()

    with SessionContext(commit=True) as session:
        # Scan up to 2 * limit of the most recent person mentions
        # from visible roots
        recent = (
            session.query(Person.name, Person.title, Person.article_url, Article.id)
            .join(Article)
            .join(Root)
            .filter(Root.visible)
            .order_by(desc(Article.timestamp))
        )[0:limit * 2]

        for person in recent:
            known = best.get(person.name)
            # Insert the name if not already present, or if the new
            # title is longer than the one we have stored
            if known is None or len(person.title) > len(known[0]):
                best[person.name] = (
                    correct_spaces(person.title),
                    person.article_url,
                    person.id,
                    bindb.lookup_name_gender(person.name),
                )
            if len(best) >= limit:
                # Collected as many distinct names as requested
                break

    with changedlocale() as strxfrm:
        # Convert the dictionary to a locale-sorted list of dicts
        entries = [
            dict(name=nm, title=info[0], gender=info[3], url=info[1], uuid=info[2])
            for nm, info in best.items()
        ]
        return sorted(entries, key=lambda e: strxfrm(e["name"]))
def top_persons(limit=_TOP_PERSONS_LENGTH):
    """ Return a list of names and titles appearing recently in the news """
    best = {}
    MAX_TITLE_LENGTH = 64

    with SessionContext(commit=True) as session:
        # Go through up to 2 * limit records, newest first
        recent = (
            session.query(Person.name, Person.title, Person.article_url, Article.id)
            .join(Article)
            .join(Root)
            .filter(Root.visible)
            .order_by(desc(Article.timestamp))
        )[0:limit * 2]

        def prefer(new_title, old_title):
            """ Return True if new_title should replace old_title """
            len_new = len(new_title)
            len_old = len(old_title)
            if len_old >= MAX_TITLE_LENGTH:
                # Stored title is too long: a shorter candidate wins
                return len_new < len_old
            if len_new >= MAX_TITLE_LENGTH:
                # Candidate is too long: reject it
                return False
            # Both within bounds: the longer title wins
            return len_new > len_old

        with BIN_Db.get_db() as bindb:
            for person in recent:
                current = best.get(person.name)
                # Insert the name if it's new, or if the new title
                # is preferable to the stored one
                if current is None or prefer(person.title, current[0]):
                    best[person.name] = (
                        correct_spaces(person.title),
                        person.article_url,
                        person.id,
                        bindb.lookup_name_gender(person.name),
                    )
                if len(best) >= limit:
                    # We have as many names as initially wanted
                    break

    with changedlocale() as strxfrm:
        # Convert the dictionary to a locale-sorted list of dicts
        entries = [
            dict(name=nm, title=tu[0], gender=tu[3], url=tu[1], uuid=tu[2])
            for nm, tu in best.items()
        ]
        return sorted(entries, key=lambda e: strxfrm(e["name"]))
def suggest(limit=10):
    """ Return suggestions for query field autocompletion """
    # request.args values are strings: coerce the limit to int,
    # falling back to the default on a malformed value
    try:
        limit = int(request.args.get("limit", limit))
    except (TypeError, ValueError):
        limit = 10
    txt = request.args.get("q", "").strip()
    suggestions = []
    whois_prefix = "hver er "
    whatis_prefix = "hvað er "
    prefix = None
    if txt.lower().startswith(whois_prefix):
        prefix = whois_prefix
    elif txt.lower().startswith(whatis_prefix):
        prefix = whatis_prefix
    if not txt or not prefix:
        return better_jsonify(suggestions=suggestions)
    name = txt[len(prefix):].strip()
    if not name:
        # The query is just the prefix (e.g. "hver er "): nothing to
        # match, and name[0] below would raise IndexError
        return better_jsonify(suggestions=suggestions)
    with SessionContext(commit=False) as session:
        model_col = None
        # Hver er Jón Jónsson? -> match person names
        if prefix == whois_prefix and name[0].isupper():
            model_col = Person.name
        # Hver er seðlabankastjóri? -> match person titles
        elif prefix == whois_prefix:
            model_col = Person.title
        # Hvað er UNESCO? -> match entity names
        elif prefix == whatis_prefix:
            model_col = Entity.name
        # Prefix-match the name, ordered by article count
        q = (
            session.query(model_col, dbfunc.count(Article.id).label("total"))
            .filter(model_col.ilike(name + "%"))
            .join(Article)
            .group_by(model_col)
            .order_by(desc("total"))
            .limit(limit)
            .all()
        )
        # Capitalize the prefix for display
        prefix = prefix[:1].upper() + prefix[1:].lower()
        suggestions = [{"value": (prefix + p[0] + "?"), "data": ""} for p in q]
    return better_jsonify(suggestions=suggestions)
def articles(cls, criteria, enclosing_session=None):
    """ Generator of Article objects from the database that meet the given criteria """
    # Supported criteria: "timestamp" (a (from, to) tuple), "author",
    # "visible", "domain", and "order_by_parse" which, if truthy, orders
    # the result with the most recently parsed articles first
    crit = criteria or {}
    with SessionContext(
        commit=True, read_only=True, session=enclosing_session
    ) as session:
        # Only fetch articles that have a parse tree
        q = session.query(ArticleRow).filter(ArticleRow.tree != None)
        if "timestamp" in crit:
            # Restrict to the half-open interval [from, to)
            ts = crit["timestamp"]
            q = q.filter(ArticleRow.timestamp >= ts[0]).filter(
                ArticleRow.timestamp < ts[1]
            )
        if "author" in crit:
            q = q.filter(ArticleRow.author == crit["author"])
        if "visible" in crit or "domain" in crit:
            # These criteria require a join with the Root table
            q = q.join(Root)
            if "visible" in crit:
                # Only articles from roots with the specified visibility
                visible = crit["visible"]
                assert isinstance(visible, bool)
                q = q.filter(Root.visible == visible)
            if "domain" in crit:
                # Only articles from the specified domain
                domain = crit["domain"]
                assert isinstance(domain, str)
                q = q.filter(Root.domain == domain)
        if crit.get("order_by_parse"):
            # Newest parses first
            q = q.order_by(desc(ArticleRow.parsed))
        for arow in q.yield_per(500):
            yield cls._init_from_row(arow)
def top_news(topic=None, offset=0, limit=_TOP_NEWS_LENGTH):
    """ Return a list of articles (with a particular topic) in
        chronologically reversed order.

        topic: optional topic identifier to filter by
        offset: number of leading articles to skip (for paging)
        limit: maximum number of articles to return
    """
    toplist = []

    with SessionContext(commit=True) as session:
        # Only parsed, visible, non-empty articles with a timestamp
        # no later than the present
        q = (
            session.query(Article)
            .join(Root)
            .filter(Article.tree != None)
            .filter(Article.timestamp != None)
            .filter(Article.timestamp <= datetime.utcnow())
            .filter(Article.heading > "")
            .filter(Article.num_sentences > 0)
            .filter(Root.visible == True)
        )
        if topic is not None:
            # Filter by topic identifier
            q = q.join(ArticleTopic).join(Topic).filter(Topic.identifier == topic)
        q = q.order_by(desc(Article.timestamp)).offset(offset).limit(limit)

        class ArticleDisplay:

            """ Utility class to carry information about an article
                to the web template """

            def __init__(
                self,
                heading,
                timestamp,
                url,
                uuid,
                num_sentences,
                num_parsed,
                icon,
                localized_date,
                source,
            ):
                self.heading = heading
                self.timestamp = timestamp
                self.url = url
                self.uuid = uuid
                self.num_sentences = num_sentences
                self.num_parsed = num_parsed
                self.icon = icon
                self.localized_date = localized_date
                self.source = source

            @property
            def width(self):
                """ The ratio of parsed sentences to the total number of
                    sentences, expressed as a percentage string """
                if self.num_sentences == 0:
                    return "0%"
                return "{0}%".format((100 * self.num_parsed) // self.num_sentences)

            @property
            def time(self):
                # The HH:MM part of the ISO timestamp
                return self.timestamp.isoformat()[11:16]

            @property
            def date(self):
                # Omit the year for articles from the current year
                if datetime.today().year == self.timestamp.year:
                    return self.localized_date
                return self.fulldate

            @property
            def fulldate(self):
                return self.localized_date + self.timestamp.strftime(" %Y")

        with changedlocale(category="LC_TIME"):
            for a in q:
                # Instantiate article objects from results
                source = a.root.domain
                icon = source + ".png"
                # NOTE: %-d (day without leading zero) is a glibc
                # extension; not portable to all strftime implementations
                locdate = a.timestamp.strftime("%-d. %b")
                d = ArticleDisplay(
                    heading=a.heading,
                    timestamp=a.timestamp,
                    url=a.url,
                    uuid=a.id,
                    num_sentences=a.num_sentences,
                    num_parsed=a.num_parsed,
                    icon=icon,
                    localized_date=locdate,
                    source=source,
                )
                toplist.append(d)

    return toplist
def top_news(topic=None, start=None, limit=_TOP_NEWS_LENGTH):
    """ Return a list of top recent news, of a particular topic,
        up to a particular start time, having a specified length """
    result = []
    seen = {}  # Maps (domain, heading) to an index within result
    if start is None:
        start = datetime.utcnow()
    # Fetch more articles than requested in case there are duplicates
    MARGIN = 10

    with SessionContext(commit=True) as session:
        q = (
            session.query(Article)
            .join(Root)
            .filter(Article.tree != None)
            .filter(Article.timestamp != None)
            .filter(Article.timestamp < start)
            .filter(Article.heading > "")
            .filter(Root.visible == True)
        )
        if topic is not None:
            # Filter by topic identifier
            q = q.join(ArticleTopic).join(Topic).filter(Topic.identifier == topic)
        q = q.order_by(desc(Article.timestamp))[0:limit + MARGIN]

        class ArticleDisplay:

            """ Utility class to carry information about an article
                to the web template """

            def __init__(self, heading, timestamp, url, uuid,
                         num_sentences, num_parsed, icon):
                self.heading = heading
                self.timestamp = timestamp
                self.url = url
                self.uuid = uuid
                self.num_sentences = num_sentences
                self.num_parsed = num_parsed
                self.icon = icon

            @property
            def width(self):
                """ The ratio of parsed sentences to the total number of
                    sentences, expressed as a percentage string """
                if self.num_sentences == 0:
                    return "0%"
                return "{0}%".format((100 * self.num_parsed) // self.num_sentences)

            @property
            def time(self):
                # The HH:MM part of the ISO timestamp
                return self.timestamp.isoformat()[11:16]

            @property
            def date(self):
                # The YYYY-MM-DD part of the ISO timestamp
                return self.timestamp.isoformat()[0:10]

        for art in q:
            display = ArticleDisplay(
                heading=art.heading,
                timestamp=art.timestamp,
                url=art.url,
                uuid=art.id,
                num_sentences=art.num_sentences,
                num_parsed=art.num_parsed,
                icon=art.root.domain + ".ico",
            )
            key = (art.root.domain, art.heading)
            if key in seen:
                # Same domain+heading already in the list:
                # keep whichever entry is newer
                ix = seen[key]
                if display.timestamp > result[ix].timestamp:
                    result[ix] = display
                # Otherwise, ignore the duplicate and continue
            else:
                # New heading: remember its index in the list
                seen[key] = len(result)
                result.append(display)
                if len(result) >= limit:
                    break

    return result[0:limit]