class Source(db.Model):
    """A publisher of articles (e.g. a news outlet or site)."""
    id = db.Column(db.Integer, primary_key=True)
    name = db.Column(db.String(255), unique=True)

    # Articles are kept on the Source so that if an article's feed
    # dies, we still know where the Article came from.
    articles = db.relationship('Article', backref='source', lazy='dynamic')
    feeds = db.relationship('Feed', backref='source')

    def __init__(self, name):
        self.name = name
class Story(Keywordable):
    """A cluster of related Events, tracked as they develop over time."""
    story_id = db.Column('id', db.Integer, db.ForeignKey('keywordable.id'),
                         primary_key=True)
    events = db.relationship('Event', backref='story', lazy='dynamic',
                             foreign_keys=[Event.story_id])
    # NOTE(review): `updated_at` is only set on creation; `update()` never
    # refreshes it — confirm whether that is intentional.
    created_at = db.Column(db.DateTime, default=datetime.utcnow)
    updated_at = db.Column(db.DateTime, default=datetime.utcnow)

    def __init__(self, event):
        self.events = [event]
        self.update()

    def add(self, event):
        self.events.append(event)

    @property
    def vecs(self):
        """Vertically stacked vectors of every member event."""
        return vstack([ev.vec for ev in self.events])

    @property
    def age(self):
        """Time elapsed since this story was created."""
        return datetime.utcnow() - self.created_at

    def update(self):
        """Recompute derived attributes (currently just keywords)."""
        #self.summary = multisummarize(self.articles)
        #self.title = title(self.articles)
        self.keywords = [Keyword.find_or_create(name=kw)
                         for kw, _score in keywords(self.events)]

    @classmethod
    def candidates(cls, event):
        """search stories to find candidates for the event"""
        # TODO this could be made more efficient
        scores = defaultdict(float)
        for kw in event.keywords:
            for subject in kw.subjects:
                # Keywords are shared across keywordable types; only
                # stories are candidates here.
                if isinstance(subject, cls):
                    scores[subject] += global_term_idf[kw.name]
        return sorted(scores.items(), key=lambda pair: pair[1], reverse=True)

    def as_dict(self):
        """Serializable representation including nested events/keywords."""
        data = {attr: getattr(self, attr)
                for attr in ('id', 'created_at', 'updated_at')}
        data['events'] = [ev.as_dict() for ev in self.events]
        data['keywords'] = [kw.as_dict() for kw in self.keywords]
        return data
class Keyword(db.Model):
    """A single keyword term, shareable across keywordable subjects."""
    id = db.Column(db.Integer, primary_key=True)
    name = db.Column(db.Unicode)

    def __init__(self, name):
        self.name = name

    @classmethod
    def find_or_create(cls, **kwargs):
        """Return the row matching `kwargs`, creating and committing it if absent."""
        existing = cls.query.filter_by(**kwargs).first()
        if existing is not None:
            return existing
        created = cls(**kwargs)
        db.session.add(created)
        db.session.commit()
        return created

    def as_dict(self):
        return {'id': self.id, 'name': self.name}
class Keywordable(db.Model):
    """Polymorphic base for models that carry keywords.

    Uses joined-table inheritance: each subclass gets its own table
    (named after the lowercased class name) keyed back to this one,
    with `type` as the discriminator column.
    """
    id = db.Column(db.Integer, primary_key=True)
    type = db.Column('type', db.String(50))

    @declared_attr
    def keywords(cls):
        # Many-to-many link; the backref lets a Keyword enumerate every
        # keywordable object ("subject") it is attached to.
        return db.relationship('Keyword', secondary=keywordables_keywords,
                               backref=db.backref('subjects', lazy='dynamic'))

    @declared_attr
    def __mapper_args__(cls):
        # Only the base class declares the discriminator; subclasses
        # just register their identity under it.
        if cls.__name__ == 'Keywordable':
            return {'polymorphic_on': cls.type,
                    'polymorphic_identity': 'Keywordable'}
        return {'polymorphic_identity': cls.__name__}

    @declared_attr
    def __tablename__(cls):
        return cls.__name__.lower()
class Feed(db.Model):
    """An RSS/Atom feed belonging to a Source."""
    id = db.Column(db.Integer, primary_key=True)
    url = db.Column(db.Unicode)
    errors = db.Column(db.Integer, default=0)
    created_at = db.Column(db.DateTime, default=datetime.utcnow)
    updated_at = db.Column(db.DateTime, default=datetime.utcnow)
    articles = db.relationship('Article', backref='feed', lazy='dynamic')
    source_id = db.Column(db.Integer, db.ForeignKey('source.id'))

    def __init__(self, url, source):
        self.url = url
        self.source = source
        self.errors = 0

    def get_articles(self):
        """Fetch the feed and yield Article objects for new entries.

        Skips entries that are already stored (matched by url, or by
        source+title), entries whose page fetch fails, and articles of
        150 words or fewer (which are often 404/error pages).

        Raises:
            The underlying feedparser bozo exception on a fatal parse
            or connection error.
        """
        data = feedparser.parse(self.url)

        # If the `bozo` value is anything but 0, there was an error
        # parsing (or connecting to) the feed. Some errors are ok:
        # the feed content is still usable despite them.
        if data.bozo and not isinstance(
                data.bozo_exception,
                (feedparser.CharacterEncodingOverride,
                 feedparser.NonXMLContentType)):
            raise data.bozo_exception

        for entry in data.entries:
            url = entry['links'][0]['href']

            # Check for an existing Article. If one exists, skip.
            if Article.query.filter_by(url=url).count() \
                    or Article.query.filter_by(source=self.source,
                                               title=entry['title']).count():
                continue

            a_data = fetch(url)
            if a_data is None:
                continue
            a_data['feed'] = self

            # Although `newspaper` can extract published datetimes using
            # metadata, generally the published datetime included with the
            # RSS entry will be more precise (and sometimes `newspaper` does
            # not successfully extract a published datetime).
            # (see https://github.com/codelucas/newspaper/blob/41b930b467979577710b86ecb93c2a952e5c9a0d/newspaper/extractors.py#L166)
            if 'published' in entry:
                a_data['published'] = parser.parse(entry['published'])

            # Skip empty or short articles (which may be 404 pages).
            # (A second `a_data is None` check lived here originally; it
            # was unreachable since None is already skipped above.)
            if len(word_tokenize(a_data['text'])) <= 150:
                continue

            yield Article(**a_data)
class Article(Keywordable):
    """A single fetched news article, attached to a feed and source."""
    article_id = db.Column('id', db.Integer, db.ForeignKey('keywordable.id'),
                           primary_key=True)
    url = db.Column(db.Unicode)
    title = db.Column(db.Unicode)
    text = db.Column(db.UnicodeText)
    html = db.Column(db.UnicodeText)
    image = db.Column(db.String)
    score = db.Column(db.Float, default=0.0)
    published = db.Column(db.DateTime)
    source_id = db.Column(db.Integer, db.ForeignKey('source.id'))
    feed_id = db.Column(db.Integer, db.ForeignKey('feed.id'))
    event_id = db.Column(db.Integer, db.ForeignKey('event.id'))
    authors = db.relationship('Author', secondary=articles_authors,
                              backref=db.backref('articles', lazy='dynamic'))

    def __init__(self, url, title, text, html, image, published, authors,
                 keywords, feed):
        self.url = url
        self.text = text
        self.html = html
        self.title = title
        self.image = image
        self.published = published
        self.authors = [Author.find_or_create(name=name) for name in authors]
        self.keywords = [Keyword.find_or_create(name=kw)
                         for kw in set(keywords)]
        # The source is derived from the feed the article arrived on.
        self.feed = feed
        self.source = feed.source

    @property
    def vec(self):
        """Vector representation of the cleaned title + body text."""
        # TODO for now, not storing vec - need to setup pytables or something similar
        cleaned = clean('\n'.join([self.title, self.text]))
        return vectorizer.vectorize([cleaned])[0]

    def as_dict(self):
        """Serializable representation including authors and keywords."""
        fields = ('id', 'title', 'url', 'text', 'score', 'published',
                  'feed_id', 'source_id', 'event_id')
        data = {attr: getattr(self, attr) for attr in fields}
        data['authors'] = [author.as_dict() for author in self.authors]
        data['keywords'] = [kw.as_dict() for kw in self.keywords]
        return data
class Event(Keywordable):
    """A group of articles covering the same occurrence, summarized as one."""
    event_id = db.Column('id', db.Integer, db.ForeignKey('keywordable.id'),
                         primary_key=True)
    articles = db.relationship('Article', backref='event', lazy='dynamic',
                               foreign_keys=[Article.event_id])
    created_at = db.Column(db.DateTime, default=datetime.utcnow)
    updated_at = db.Column(db.DateTime, default=datetime.utcnow)
    story_id = db.Column(db.Integer, db.ForeignKey('story.id'))
    title = db.Column(db.Unicode)
    summary = db.Column(db.UnicodeText)

    def __init__(self, article):
        self.articles = [article]
        self.created_at = article.published
        self.update()

    def add(self, article):
        self.articles.append(article)

    @property
    def vecs(self):
        """Vertically stacked vectors of every member article."""
        return vstack([art.vec for art in self.articles])

    @property
    def age(self):
        """Time elapsed since this event's (earliest) publish date."""
        return datetime.utcnow() - self.created_at

    @property
    def summary_pts(self):
        """The stored summary split into individual points (lines)."""
        return self.summary.split('\n')

    @property
    def text(self):
        """All member article texts joined with newlines."""
        return '\n'.join([art.text for art in self.articles])

    def update(self):
        """Recompute summary, title, keywords, and the event date."""
        self.summary = '\n'.join(multisummarize(self.articles))
        self.title = title(self.articles)
        self.keywords = [Keyword.find_or_create(name=kw)
                         for kw, _score in keywords(self.articles)]
        # Set oldest published date as this event's date
        self.created_at = min(art.published for art in self.articles)

    @classmethod
    def candidates(cls, dt):
        """return "active" events - those that are not too old given a datetime `dt`"""
        return cls.query.filter(
            dt - Event.created_at < timedelta(hours=36)).all()

    def as_dict(self):
        """Serializable representation including nested articles/keywords."""
        data = {attr: getattr(self, attr)
                for attr in ('id', 'title', 'created_at', 'updated_at',
                             'story_id')}
        data['summary'] = self.summary.split('\n')
        data['articles'] = [art.as_dict() for art in self.articles]
        data['keywords'] = [kw.as_dict() for kw in self.keywords]
        return data