Example #1
    def fetch_sites_metadata(
            self, opml_filename: str
    ) -> List[Tuple[str, Optional[str], Optional[str]]]:
        opml_filepath = os.path.join(self.settings.base_output_path,
                                     opml_filename)

        def site_outline(element: Any) -> bool:
            # an RSS feed node: <outline type="rss" xmlUrl="...">
            return all([
                element.name == "outline",
                element.has_attr("xmlUrl"),
                element.has_attr("type"),
                element.get("type", "") == "rss",
            ])

        def site_category(element: Any) -> Optional[str]:
            # the category comes from the enclosing <outline title="..."> group, if any
            parent = element.find_parent()
            if parent.name == "outline" and parent.has_attr("title"):
                return parent.get("title")
            return None

        if not os.path.exists(opml_filepath):
            Log.error_and_exit(
                "OPML file '{}' not found".format(opml_filepath))

        with open(opml_filepath, encoding="utf-8") as opml_file_handle:
            xml_contents = opml_file_handle.read()
        soup = BeautifulSoup(xml_contents, "xml")
        sites = soup.opml.body.find_all(site_outline)

        # keep only feeds whose URL does not match any configured skip prefix
        return [(site["xmlUrl"], site.get("title"), site_category(site))
                for site in sites
                if not any(site["xmlUrl"].startswith(skip_url)
                           for skip_url in self.settings.skip_urls)]
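
For context, a minimal standalone sketch (not part of the project) of the OPML shape this parser expects, using the same kind of outline filter; it assumes beautifulsoup4 and an XML parser such as lxml are installed:

from typing import Any, Optional

from bs4 import BeautifulSoup

# Hypothetical, minimal OPML: one feed inside a category group, one ungrouped
# feed, and a folder-only outline that the filter should ignore.
SAMPLE_OPML = """<?xml version="1.0" encoding="UTF-8"?>
<opml version="1.0">
  <body>
    <outline title="Tech">
      <outline type="rss" title="Some Blog" xmlUrl="https://example.com/feed.xml"/>
    </outline>
    <outline type="rss" title="Ungrouped Blog" xmlUrl="https://example.org/rss"/>
    <outline title="Just a folder"/>
  </body>
</opml>"""


def site_outline(element: Any) -> bool:
    # same criteria as above: an <outline> with an xmlUrl and type="rss"
    return (element.name == "outline" and element.has_attr("xmlUrl")
            and element.get("type", "") == "rss")


def site_category(element: Any) -> Optional[str]:
    parent = element.find_parent()
    if parent.name == "outline" and parent.has_attr("title"):
        return parent.get("title")
    return None


soup = BeautifulSoup(SAMPLE_OPML, "xml")
for site in soup.opml.body.find_all(site_outline):
    print(site["xmlUrl"], site.get("title"), site_category(site))
# https://example.com/feed.xml Some Blog Tech
# https://example.org/rss Ungrouped Blog None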
Example #2
    def load(self) -> None:
        file_path = os.path.join(self.base_output_path, SETTINGS_FILENAME)
        # on a first run there is no settings file yet; keep the in-memory defaults
        if os.path.exists(file_path):
            with open(file_path, "r") as file_handle:
                data = json.load(file_handle)

            last_fetch = data.get(KEY_LAST_FETCH, None)
            if last_fetch:
                self.last_fetch_mark = datetime.utcfromtimestamp(last_fetch)
                Log.info("> Previous fetch mark was: {}".format(self.last_fetch_mark))

            self.skip_urls = data[KEY_SKIP_URLS]
            Log.info("> Skip urls list: {}".format(self.skip_urls))
Example #3
    @staticmethod
    def _log_and_error_if_proceeds(url: str, title: Optional[str],
                                   source_site: Any) -> None:
        # just warn, don't skip
        if source_site.get("bozo") == 1 and source_site.status != 200:
            Log.info("{title} ({url}) bozo=1 http_status:{status}".format(
                title=title, url=url, status=source_site.status))

        # should always skip by raising error
        if (not source_site.feed.keys()
                or "link" not in source_site.feed.keys()
                or source_site.status in [401, 403, 404]):
            Log.warn_and_raise_error(
                "{title} ({url}) skipped, feed malformed or not retrieved. HTTP Status: {status} Headers: {headers}"
                .format(
                    title=title,
                    url=url,
                    status=source_site.status,
                    headers=",".join([
                        "{}={}".format(key, source_site.headers[key])
                        for key in source_site.headers.keys()
                    ]),
                ))

        if source_site.status == 301:
            Log.warn(
                "{title} ({url}) has moved ({status}) Check new URL".format(
                    title=title, url=url, status=source_site.status))

        if source_site.status == 410:
            Log.warn_and_raise_error(
                "{title} ({url}) skipped, received http_status:{status} Url gone"
                .format(title=title, url=url, status=source_site.status))
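
A side note on the bozo check above: feedparser sets a truthy bozo flag (plus bozo_exception) when the feed was not well-formed, while still returning whatever it could parse. A tiny sketch, assuming feedparser is installed:

import feedparser

# Deliberately malformed XML: the strict parse fails, so feedparser flags it
# as "bozo" but still attempts a lenient parse.
result = feedparser.parse("<rss><channel><title>Broken feed")
print(result.get("bozo"))            # truthy (1 / True)
print(result.get("bozo_exception"))  # the underlying parse error, if any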
Example #4
    @staticmethod
    def _log_and_error_if_proceeds(url: str, title: Optional[str],
                                   source_site: Any,
                                   response_status_code: int) -> None:
        # just warn, don't skip
        if source_site.get("bozo") == 1 and response_status_code != 200:
            Log.info("{title} ({url}) bozo=1 http_status:{status}".format(
                title=title, url=url, status=response_status_code))

        # should always skip by raising error
        if (not source_site.feed.keys()
                or "link" not in source_site.feed.keys()
                or response_status_code in [401, 403, 404]):
            Log.warn_and_raise_error(
                "{title} ({url}) skipped, feed malformed/not retrieved. HTTPStatus: {status}"
                .format(
                    title=title,
                    url=url,
                    status=response_status_code,
                ))

        if response_status_code == 301:
            Log.warn(
                "{title} ({url}) has moved ({status}) Check new URL".format(
                    title=title, url=url, status=response_status_code))

        if response_status_code == 410:
            Log.warn_and_raise_error(
                "{title} ({url}) skipped, received http_status:{status} Url gone"
                .format(title=title, url=url, status=response_status_code))
Example #5
    def save(self) -> None:
        fetch_mark = datetime.now()
        file_path = os.path.join(self.base_output_path, SETTINGS_FILENAME)

        if not os.path.exists(self.base_output_path):
            Log.error_and_exit("Output path '{}' not found".format(self.base_output_path))

        data = {
            KEY_LAST_FETCH: fetch_mark.timestamp(),
            KEY_SKIP_URLS: self.skip_urls,
        }

        with open(file_path, "w") as file_handle:
            json.dump(data, file_handle)

        Log.info("> Fetch mark set to: {}".format(fetch_mark))
Example #6
    def fetch_site(
        self, url: str, title: Optional[str], category: Optional[str]
    ) -> Dict[str, Union[ParsedFeedSite, List[ParsedFeedItem]]]:
        try:
            source_site = feedparser.parse(
                url,
                agent="pbrr/1.0 (https://github.com/Kartones/pbrr)",
                modified=self.settings.last_fetch_mark)
        except Exception as e:
            # else need to directly catch urllib errors
            if "Name or service not known" in str(e):
                Log.warn_and_raise_error(
                    "{title} ({url}) skipped, error fetching url".format(
                        title=title, url=url))
            else:
                # source_site is unbound here (feedparser.parse() raised),
                # so only the exception itself can be logged
                Log.warn("{title} ({url}) skipped. Error: {error}".format(
                    title=title, url=url, error=e))
                raise e

        # don't override, leave content as it is
        if source_site.status == 304:
            return self._not_modified_site(title, category)

        self._log_and_error_if_proceeds(url=url,
                                        title=title,
                                        source_site=source_site)

        parsed_site = self._parse_site(feed=source_site.feed,
                                       provided_title=title,
                                       category=category)

        parsed_entries: List[ParsedFeedItem] = []
        for entry in source_site.entries:
            parsed_entries.append(
                self._parse_entry(entry=entry, parsed_site=parsed_site))

        if parsed_entries:
            # reorder by most recent first (some feeds list entries oldest-first)
            parsed_entries = sorted(parsed_entries,
                                    key=lambda s: s.published,
                                    reverse=True)
            # correct the site's last update time with the latest entry
            # (some sites report it incorrectly or don't report it at all)
            parsed_site.last_updated = parsed_entries[0].published

        Log.info("> Fetched: {title}".format(title=title))

        return {self.KEY_SITE: parsed_site, self.KEY_ENTRIES: parsed_entries}
Example #7
    def fetch_site(
        self, url: str, title: Optional[str], category: Optional[str]
    ) -> Dict[str, Union[ParsedFeedSite, List[ParsedFeedItem]]]:
        try:
            feed_response = requests.get(
                url,
                headers={
                    "User-Agent": "pbrr/1.0 (https://github.com/Kartones/pbrr)",
                    "If-Modified-Since": self._format_if_modified_header(
                        self.settings.last_fetch_mark),
                },
                timeout=15,
            )
            source_site = feedparser.parse(feed_response.text)
        except Exception as e:
            # else need to directly catch urllib errors
            if "Name or service not known" in str(e):
                Log.warn_and_raise_error(
                    "{title} ({url}) skipped, error fetching url".format(
                        title=title, url=url))
            else:
                Log.warn("{title} ({url}) skipped. Error: {error}".format(
                    title=title, url=url, error=e))
                raise ValueError(str(e))

        # don't override, leave content as it is
        if feed_response.status_code == 304:
            return self._not_modified_site(title, category)

        self._log_and_error_if_proceeds(
            url=url,
            title=title,
            source_site=source_site,
            response_status_code=feed_response.status_code)

        parsed_site = self._parse_site(feed=source_site.feed,
                                       provided_title=title,
                                       category=category)

        # highest zero-based index, used to compute each entry's reverse position in the feed
        entries_count = len(source_site.entries) - 1

        parsed_entries = [
            self._parse_entry(entry=entry,
                              parsed_site=parsed_site,
                              entry_reverse_index=(entries_count - index))
            for index, entry in enumerate(source_site.entries)
        ]

        if parsed_entries:
            parsed_entries = self._filter_entries(parsed_entries)

            # reorder by most recent first (seen inverse order)
            parsed_entries = sorted(parsed_entries,
                                    key=lambda s: (s.published),
                                    reverse=True)
            # and cut to a reasonable limit (seen also feeds with full dumps maybe? of content)
            parsed_entries = parsed_entries[:15]

            # correct site last update time with latest entry (some sites report incorrectly or not even have)
            parsed_site.last_updated = parsed_entries[0].published

        Log.info("> Fetched: {title} ({last_updated})".format(
            title=title, last_updated=parsed_site.last_updated))

        return {self.KEY_SITE: parsed_site, self.KEY_ENTRIES: parsed_entries}
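
Example #7 relies on a _format_if_modified_header helper that is not shown in these snippets. A minimal sketch of what such a helper could look like, assuming the fetch mark is a naive UTC datetime (as produced by load() in Example #2); HTTP If-Modified-Since values use the RFC 7231 date format:

from datetime import datetime, timezone
from email.utils import format_datetime
from typing import Optional


def format_if_modified_header(last_fetch_mark: Optional[datetime]) -> str:
    # Hypothetical helper: renders e.g. "Tue, 21 Nov 2023 10:00:00 GMT".
    if last_fetch_mark is None:
        return ""
    # format_datetime(usegmt=True) requires an aware UTC datetime.
    return format_datetime(last_fetch_mark.replace(tzinfo=timezone.utc), usegmt=True)


print(format_if_modified_header(datetime(2023, 11, 21, 10, 0, 0)))
# Tue, 21 Nov 2023 10:00:00 GMT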