def news_item_from_feed_item(feed_item: FeedItem, feed: Feed,
                             user: User) -> NewsItem:
    """Build the NewsItem that presents ``feed_item`` to ``user``.

    Identifiers, title, description and link are copied from the feed item,
    the feed title comes from ``feed`` and the owner from ``user``. When the
    feed item carries no publication time, the current UTC time is used.
    """
    # Same evaluation order as the keyword arguments they end up in.
    published_on = feed_item.published or now_in_utc()
    favicon_link = determine_favicon_link(feed_item)
    created = now_in_utc()

    news_item_fields = {
        "feed_id": feed_item.feed_id,
        "user_id": user.user_id,
        "feed_item_id": feed_item.feed_item_id,
        "feed_title": feed.title,
        "title": feed_item.title,
        "description": feed_item.description,
        "link": feed_item.link,
        "published": published_on,
        "favicon": favicon_link,
        "created_on": created,
    }
    return NewsItem(**news_item_fields)
def _published_from_article(article) -> datetime:
    """Return the publication time given by the ``<time>`` tag of an article.

    Falls back to the current UTC time when the article has no ``<time>`` tag,
    when the tag has no ``datetime`` attribute, or when the attribute value is
    not an ISO formatted timestamp.
    """
    time_tag = article.find("time")  # looked up once instead of twice
    raw_timestamp = time_tag.get("datetime") if time_tag is not None else None
    if not raw_timestamp:
        return now_in_utc()
    try:
        return datetime.fromisoformat(raw_timestamp)
    except ValueError:
        # One malformed timestamp should not abort parsing of the whole page;
        # treat it like a missing timestamp.
        return now_in_utc()


def gemeente_groningen_parser(feed: Feed, html_source: str) -> List[FeedItem]:
    """Create a FeedItem for every ``<article>`` element in ``html_source``.

    :param feed: the feed the items belong to; only its ``feed_id`` is used.
    :param html_source: raw HTML of the page to scrape.
    :return: one FeedItem per ``<article>`` element, in document order.
    """
    soup = BeautifulSoup(html_source, features="html.parser")
    articles = soup.find_all("article")
    return [
        FeedItem(
            feed_id=feed.feed_id,
            title=_title(article),
            description=_description(article),
            link=_link(article),
            last_seen=now_in_utc(),
            published=_published_from_article(article),
            created_on=now_in_utc(),
        ) for article in articles
    ]
class NewsItem(BaseModel):  # pylint: disable=too-few-public-methods
    """A feed item as presented to a single user, including read/saved state."""

    # Generated with uuid4_str unless supplied; exposed under the alias "_id".
    news_item_id: PyObjectId = Field(default_factory=uuid4_str, alias="_id")
    feed_id: PyObjectId
    user_id: PyObjectId
    feed_item_id: PyObjectId
    feed_title: str
    title: str
    description: str
    link: str
    published: datetime
    # Parallel lists, extended together by append_alternate().
    alternate_links: List[str] = Field(default_factory=list)
    alternate_title_links: List[str] = Field(default_factory=list)
    alternate_favicons: List[str] = Field(default_factory=list)
    favicon: Optional[str]
    # default_factory so the timestamp is taken per instance; a plain
    # ``= now_in_utc()`` default is evaluated only once, when the class body
    # is executed.
    created_on: datetime = Field(default_factory=now_in_utc)
    is_read: bool = False
    is_saved: bool = False
    is_read_on: Optional[datetime] = None
    saved_news_item_id: Optional[PyObjectId] = None

    def append_alternate(self, link: str, title: str, icon_link: str) -> None:
        """Append an alternate source for the news. Only appended if not yet present.

        The first alternate that is added also prefixes the title with
        "[Updated] "; later alternates leave the title untouched.

        :param link: link of the alternate source; used for the presence check.
        :param title: title of the alternate source.
        :param icon_link: favicon link of the alternate source.
        """
        if link in self.alternate_links:
            return

        if not self.title.startswith("[Updated] "):
            self.title = f"[Updated] {self.title}"
        self.alternate_title_links.append(title)
        self.alternate_links.append(link)
        self.alternate_favicons.append(icon_link)
def rss_document_to_feed_items(feed: Feed,
                               tree: ElementBase) -> List[FeedItem]:
    """Turn every ``channel/item`` element of an RSS xml tree into a FeedItem.

    The items are returned in document order and all carry the id of ``feed``.
    """
    feed_items: List[FeedItem] = []
    for rss_item in tree.findall("channel/item"):
        feed_items.append(
            FeedItem(
                feed_id=feed.feed_id,
                title=rss_item.findtext("title"),
                link=sanitize_link(rss_item.findtext("link")),
                description=parse_description(
                    rss_item.findtext("description")),
                last_seen=now_in_utc(),
                published=_parse_optional_rss_datetime(
                    rss_item.findtext("pubDate")),
                created_on=now_in_utc(),
            ))
    return feed_items
def atom_document_to_feed_items(feed: Feed,
                                tree: ElementBase) -> List[FeedItem]:
    """Turn every Atom ``entry`` element of an xml tree into a FeedItem.

    The items are returned in document order and all carry the id of ``feed``.
    An entry without ``content`` gets an empty description.
    """
    # Fully qualified (namespaced) tag names used for the lookups below.
    entry_tag = "{http://www.w3.org/2005/Atom}entry"
    title_tag = "{http://www.w3.org/2005/Atom}title"
    link_tag = "{http://www.w3.org/2005/Atom}link"
    content_tag = "{http://www.w3.org/2005/Atom}content"
    published_tag = "{http://www.w3.org/2005/Atom}published"

    feed_items: List[FeedItem] = []
    for entry in tree.findall(entry_tag):
        feed_items.append(
            FeedItem(
                feed_id=feed.feed_id,
                title=entry.findtext(title_tag),
                link=_parse_optional_link_for_href(entry.find(link_tag)),
                description=entry.findtext(content_tag) or "",
                last_seen=now_in_utc(),
                published=_parse_optional_datetime(
                    entry.findtext(published_tag)),
                created_on=now_in_utc(),
            ))
    return feed_items
class SavedNewsItem(BaseModel):
    """A news item that has been saved; mirrors the fields of NewsItem."""

    # Generated with uuid4_str unless supplied; exposed under the alias "_id".
    saved_news_item_id: PyObjectId = Field(default_factory=uuid4_str,
                                           alias="_id")
    feed_id: PyObjectId
    user_id: PyObjectId
    feed_item_id: PyObjectId
    news_item_id: PyObjectId
    feed_title: str
    title: str
    description: str
    link: str
    published: datetime
    alternate_links: List[str] = Field(default_factory=list)
    alternate_title_links: List[str] = Field(default_factory=list)
    alternate_favicons: List[str] = Field(default_factory=list)
    favicon: Optional[str]
    created_on: datetime
    # default_factory so the timestamp is taken per instance; a plain
    # ``= now_in_utc()`` default is evaluated only once, when the class body
    # is executed.
    saved_on: datetime = Field(default_factory=now_in_utc)
async def upsert_new_items_for_feed(
        feed: Feed, updated_feed: Feed,
        feed_items_from_rss: List[FeedItem]) -> UpdateResult:
    """
    Upload new items as feed item and news item for users.

    - Upload all the feed-items if feed item did not exist yet.
    - If feed-item exists, tick the last_seen timestamp.
    - For all subscribed users, make news items for the new feed items. When
      an unread news item of the user already has a similar title, the new
      feed item is attached to that news item as an alternate instead.
    - Set number_of_items, last_fetched and mutable details for the feed itself.

    The split between new and already known feed items is made once, before
    the users are visited. Making it inside the per-user loop marks the new
    items as known after the first user, so later subscribers would receive no
    news items and the known items would be queued for upsert once per user.

    feed: the stored feed that is being refreshed; it is mutated and upserted.
    updated_feed: freshly fetched feed details; title and description are
        copied onto ``feed``.
    feed_items_from_rss: the feed items parsed from the fetched document.

    returns: UpdateResult holding, per subscribed user, the number of new
        NewsItems created.
    """
    current_feed_items = await repositories(
    ).feed_item_repository.fetch_all_for_feed(feed)
    subscribed_users = await repositories(
    ).user_repository.fetch_subscribed_to(feed)

    updated_feed_items: List[FeedItem] = []  # known feed_items, last_seen ticked.
    new_feed_items: List[FeedItem] = []  # new feed_items that will be inserted.
    new_news_items: List[NewsItem] = []  # news items that will be inserted.
    updated_news_items: List[NewsItem] = []  # news items that are updated.
    update_result = UpdateResult()

    # Step 1: split the fetched items into new and already known ones. This
    # does not depend on the user, so it is done exactly once.
    for fetched_feed_item in feed_items_from_rss:
        feed_items_with_same_link = [
            item for item in current_feed_items
            if item.link == fetched_feed_item.link
        ]
        if feed_items_with_same_link:
            # We have seen this item already, update last seen.
            for feed_item in feed_items_with_same_link:
                feed_item.last_seen = now_in_utc()
            updated_feed_items.extend(feed_items_with_same_link)
        else:
            new_feed_items.append(fetched_feed_item)
            # Remember it, so a second item with the same link in this same
            # fetch is recognised as already seen.
            current_feed_items.append(fetched_feed_item)

    # Step 2: fan the new feed items out to every subscribed user.
    for user in subscribed_users:
        number_of_new_items = 0
        current_news_items = await repositories(
        ).news_item_repository.fetch_all_non_read_for_feed(feed, user)

        for new_feed_item in new_feed_items:
            # Check if there is already a similar news item to flag alternates.
            news_items_similar_titles = [
                news_item for news_item in current_news_items
                if are_titles_similar(title_1=news_item.title,
                                      title_2=new_feed_item.title)
            ]

            if not news_items_similar_titles:
                new_news_item = news_item_from_feed_item(
                    new_feed_item, feed, user)
                new_news_items.append(new_news_item)
                number_of_new_items += 1
                # Later items of this fetch are compared against it as well.
                current_news_items.append(new_news_item)
            else:
                for existing_news_item in news_items_similar_titles:
                    existing_news_item.append_alternate(
                        new_feed_item.link, new_feed_item.title,
                        determine_favicon_link(new_feed_item))
                    existing_news_item.published = (new_feed_item.published
                                                    or now_in_utc())
                    updated_news_items.append(existing_news_item)

        update_result.add(user.user_id, number_of_new_items)

    # Upsert the new and updated feed_items and news_items.
    await repositories().feed_item_repository.upsert_many(new_feed_items)
    await repositories().feed_item_repository.upsert_many(updated_feed_items)
    await repositories().news_item_repository.upsert_many(new_news_items)
    await repositories().news_item_repository.upsert_many(updated_news_items)

    # Update information in feed with latest information from the url.
    # NOTE(review): every other timestamp here comes from now_in_utc(), while
    # this one uses datetime.utcnow() (naive); confirm which one readers of
    # last_fetched expect before aligning them.
    feed.last_fetched = datetime.utcnow()
    feed.description = updated_feed.description
    feed.title = updated_feed.title
    feed.number_of_items += len(new_feed_items)
    await repositories().feed_repository.upsert(feed)
    return update_result