def fetch_sites_metadata(
    self, opml_filename: str
) -> List[Tuple[str, Optional[str], Optional[str]]]:
    opml_filepath = os.path.join(self.settings.base_output_path, opml_filename)

    def site_outline(element: Any) -> bool:
        return all([
            element.name == "outline",
            element.has_attr("xmlUrl"),
            element.has_attr("type"),
            element.get("type", "") == "rss",
        ])

    def site_category(element: Any) -> Optional[str]:
        parent = element.findParent()
        if parent.name == "outline" and parent.has_attr("title"):
            return parent.get("title")
        else:
            return None

    if not os.path.exists(opml_filepath):
        Log.error_and_exit("OPML file '{}' not found".format(opml_filepath))

    with open(opml_filepath, encoding="utf-8") as opml_file_handle:
        xml_contents = opml_file_handle.read()

    soup = BeautifulSoup(xml_contents, "xml")
    sites = soup.opml.body.findAll(site_outline)

    return [
        (site["xmlUrl"], site.get("title"), site_category(site))
        for site in sites
        if not any(site["xmlUrl"].startswith(skip_url) for skip_url in self.settings.skip_urls)
    ]
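# Illustrative only (not part of the original code): a minimal OPML snippet of the shape that
# the site_outline/site_category helpers above expect. The category comes from the parent
# <outline> element's "title" attribute, and only outlines with type="rss" and an xmlUrl are
# picked up. Assumes BeautifulSoup with an XML parser (lxml) is installed; the sample URL is
# made up.
from bs4 import BeautifulSoup

SAMPLE_OPML = """<?xml version="1.0" encoding="UTF-8"?>
<opml version="1.0">
  <body>
    <outline title="Tech" text="Tech">
      <outline type="rss" title="Example Blog" text="Example Blog"
               xmlUrl="https://example.com/feed.xml" htmlUrl="https://example.com"/>
    </outline>
  </body>
</opml>"""

if __name__ == "__main__":
    soup = BeautifulSoup(SAMPLE_OPML, "xml")
    outline = soup.opml.body.find("outline", attrs={"type": "rss"})
    # prints: https://example.com/feed.xml Example Blog Tech
    print(outline["xmlUrl"], outline.get("title"), outline.findParent().get("title"))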
def load(self) -> None:
    file_path = os.path.join(self.base_output_path, SETTINGS_FILENAME)

    if os.path.exists(file_path):
        with open(file_path, "r") as file_handle:
            data = json.load(file_handle)

        last_fetch = data.get(KEY_LAST_FETCH, None)
        if last_fetch:
            self.last_fetch_mark = datetime.utcfromtimestamp(last_fetch)
            Log.info("> Previous fetch mark was: {}".format(self.last_fetch_mark))

        self.skip_urls = data[KEY_SKIP_URLS]
        Log.info("> Skip urls list: {}".format(self.skip_urls))
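# Illustrative only: a plausible shape for the JSON settings file that load() reads and save()
# writes. The real key names live in the KEY_LAST_FETCH / KEY_SKIP_URLS constants (not shown
# here), so the literal keys below are assumptions:
#
# {
#   "last_fetch": 1618496425.0,
#   "skip_urls": ["https://example.com/feed.xml"]
# }
#
# last_fetch holds the epoch timestamp of the previous run, and feeds whose URL starts with any
# entry in skip_urls are excluded when reading the OPML subscriptions.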
def _log_and_error_if_proceeds(url: str, title: Optional[str], source_site: Any) -> None:
    # just warn, don't skip
    if "bozo" in source_site.keys() and source_site["bozo"] == 1 and source_site.status != 200:
        Log.info("{title} ({url}) bozo=1 http_status:{status}".format(
            title=title, url=url, status=source_site.status))

    # should always skip by raising error
    if (not source_site.feed.keys() or "link" not in source_site.feed.keys()
            or source_site.status in [401, 403, 404]):
        Log.warn_and_raise_error(
            "{title} ({url}) skipped, feed malformed or not retrieved. HTTP Status: {status} Headers: {headers}".format(
                title=title,
                url=url,
                status=source_site.status,
                headers=",".join([
                    "{}={}".format(key, source_site.headers[key])
                    for key in source_site.headers.keys()
                ]),
            ))

    if source_site.status in [301]:
        Log.warn("{title} ({url}) has moved ({status}) Check new URL".format(
            title=title, url=url, status=source_site.status))

    if source_site.status in [410]:
        Log.warn_and_raise_error(
            "{title} ({url}) skipped, received http_status:{status} Url gone".format(
                title=title, url=url, status=source_site.status))
def _log_and_error_if_proceeds(url: str, title: Optional[str], source_site: Any,
                               response_status_code: int) -> None:
    # just warn, don't skip
    if "bozo" in source_site.keys() and source_site["bozo"] == 1 and response_status_code != 200:
        Log.info("{title} ({url}) bozo=1 http_status:{status}".format(
            title=title, url=url, status=response_status_code))

    # should always skip by raising error
    if (not source_site.feed.keys() or "link" not in source_site.feed.keys()
            or response_status_code in [401, 403, 404]):
        Log.warn_and_raise_error(
            "{title} ({url}) skipped, feed malformed or not retrieved. HTTP Status: {status}".format(
                title=title, url=url, status=response_status_code))

    if response_status_code in [301]:
        Log.warn("{title} ({url}) has moved ({status}) Check new URL".format(
            title=title, url=url, status=response_status_code))

    if response_status_code in [410]:
        Log.warn_and_raise_error(
            "{title} ({url}) skipped, received http_status:{status} Url gone".format(
                title=title, url=url, status=response_status_code))
def save(self) -> None:
    fetch_mark = datetime.now()
    file_path = os.path.join(self.base_output_path, SETTINGS_FILENAME)

    if not os.path.exists(self.base_output_path):
        Log.error_and_exit("Output path '{}' not found".format(self.base_output_path))

    data = {
        KEY_LAST_FETCH: fetch_mark.timestamp(),
        KEY_SKIP_URLS: self.skip_urls,
    }

    with open(file_path, "w") as file_handle:
        json.dump(data, file_handle, indent=None)

    Log.info("> Fetch mark set to: {}".format(fetch_mark))
def fetch_site(
    self, url: str, title: Optional[str], category: Optional[str]
) -> Dict[str, Union[ParsedFeedSite, List[ParsedFeedItem]]]:
    try:
        source_site = feedparser.parse(
            url,
            agent="pbrr/1.0 (https://github.com/Kartones/pbrr)",
            modified=self.settings.last_fetch_mark)
    except Exception as e:
        # else would need to directly catch urllib errors
        if "Name or service not known" in str(e):
            Log.warn_and_raise_error(
                "{title} ({url}) skipped, error fetching url".format(title=title, url=url))
        else:
            # the parse call failed, so there is no response (and no headers) to log here
            Log.warn("{title} ({url}) skipped. Error: {error}".format(
                title=title, url=url, error=e))
        raise e

    # don't override, leave content as it is
    if source_site.status == 304:
        return self._not_modified_site(title, category)

    self._log_and_error_if_proceeds(url=url, title=title, source_site=source_site)

    parsed_site = self._parse_site(feed=source_site.feed, provided_title=title, category=category)

    parsed_entries = []  # type: List[ParsedFeedItem]
    for entry in source_site.entries:
        parsed_entries.append(self._parse_entry(entry=entry, parsed_site=parsed_site))

    # reorder by most recent first (some feeds arrive in inverse order)
    parsed_entries = sorted(parsed_entries, key=lambda s: s.published, reverse=True)

    # correct site last update time with the latest entry (some sites report it incorrectly or not at all)
    parsed_site.last_updated = parsed_entries[0].published

    Log.info("> Fetched: {title}".format(title=title))

    return {self.KEY_SITE: parsed_site, self.KEY_ENTRIES: parsed_entries}
def fetch_site(
    self, url: str, title: Optional[str], category: Optional[str]
) -> Dict[str, Union[ParsedFeedSite, List[ParsedFeedItem]]]:
    try:
        feed_response = requests.get(
            url,
            headers={
                "User-Agent": "pbrr/1.0 (https://github.com/Kartones/pbrr)",
                "If-Modified-Since": self._format_if_modified_header(self.settings.last_fetch_mark),
            },
            timeout=15,
        )
        source_site = feedparser.parse(feed_response.text)
    except Exception as e:
        # else would need to directly catch urllib errors
        if "Name or service not known" in str(e):
            Log.warn_and_raise_error(
                "{title} ({url}) skipped, error fetching url".format(title=title, url=url))
        else:
            Log.warn("{title} ({url}) skipped. Error: {error}".format(
                title=title, url=url, error=e))
        raise ValueError(str(e))

    # don't override, leave content as it is
    if feed_response.status_code == 304:
        return self._not_modified_site(title, category)

    self._log_and_error_if_proceeds(
        url=url, title=title, source_site=source_site,
        response_status_code=feed_response.status_code)

    parsed_site = self._parse_site(feed=source_site.feed, provided_title=title, category=category)

    entries_count = len(source_site.entries) - 1
    parsed_entries = [
        self._parse_entry(entry=entry, parsed_site=parsed_site,
                          entry_reverse_index=(entries_count - index))
        for index, entry in enumerate(source_site.entries)
    ]

    if parsed_entries:
        parsed_entries = self._filter_entries(parsed_entries)

    # reorder by most recent first (some feeds arrive in inverse order)
    parsed_entries = sorted(parsed_entries, key=lambda s: s.published, reverse=True)
    # and cut to a reasonable limit (some feeds seem to include a full dump of their content)
    parsed_entries = parsed_entries[:15]

    # correct site last update time with the latest entry (some sites report it incorrectly or not at all)
    if parsed_entries:
        parsed_site.last_updated = parsed_entries[0].published

    Log.info("> Fetched: {title} ({last_updated})".format(
        title=title, last_updated=parsed_site.last_updated))

    return {self.KEY_SITE: parsed_site, self.KEY_ENTRIES: parsed_entries}
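# _format_if_modified_header() is called above but not shown; this is a minimal sketch of what
# it could look like, assuming last_fetch_mark is a naive UTC datetime (as produced by
# datetime.utcfromtimestamp() in load()). The If-Modified-Since header expects an RFC 7231
# HTTP-date such as "Wed, 21 Oct 2015 07:28:00 GMT".
from datetime import datetime, timezone
from email.utils import format_datetime
from typing import Optional


def _format_if_modified_header(last_fetch_mark: Optional[datetime]) -> str:
    if not last_fetch_mark:
        # no previous fetch mark yet: send the epoch so servers always return fresh content
        return "Thu, 01 Jan 1970 00:00:00 GMT"
    return format_datetime(last_fetch_mark.replace(tzinfo=timezone.utc), usegmt=True)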