def __init__(self, entry: FeedParserDict):
    """Wrap a single parsed RSS/Atom entry, extracting the fields used here.

    :param entry: one entry from a feedparser result
    """
    self.entry = entry
    self.title = entry.title
    self.url = entry.link
    self.summary = entry.get('summary', '')
    self.published_parsed = entry.get('published_parsed')
    # BUG FIX: feedparser exposes the update timestamp under the key
    # 'updated_parsed' (the key parse_date() elsewhere in this project
    # reads); 'update_parsed' never exists, so this attribute was always
    # None.  The attribute name is kept for backward compatibility, and
    # the old key is still consulted as a fallback just in case.
    self.update_parsed = entry.get('updated_parsed',
                                   entry.get('update_parsed'))
    self.published_date = self._define_published_date(
        self.published_parsed, self.update_parsed)
def check_parsed(parsed: feedparser.FeedParserDict, req_keys: list) -> bool:
    """Verify that a parsed feed (or entry) carries every required key.

    :param parsed: valid, utf-8 feedparsed RSS
    :param req_keys: keys that `parsed` must contain
    :return: True when every required key maps to a non-None value
    """
    for key in req_keys:
        if parsed.get(key) is None:
            return False
    return True
def _update_feed_data(self, feed_data_obj: feedparser.FeedParserDict) -> None:
    """Refresh this feed's title and, when available, its site URL.

    :param feed_data_obj: a "feedparser.FeedParserDict.feed" object
    :return: None
    """
    self.title = feed_data_obj['title']
    # Only overwrite the stored site URL when the feed actually supplies one.
    link = feed_data_obj.get('link')
    if link:
        self.site_url = link
def load_rss_info(self, parsed: feedparser.FeedParserDict) -> None:
    """Load some RSS subscription elements into this feed state.

    :param parsed: a feedparser result whose entries will be summarized
    :return: None
    """
    self.entries = []
    # BUG FIX: parsed.get("entries") returns None when the key is absent,
    # which made the original loop raise TypeError; default to an empty list.
    for entry in parsed.get("entries", []):
        new_entry = {
            "title": entry["title"],
            # Enclosure hrefs are the entry's downloadable media URLs.
            "urls": [enclosure["href"] for enclosure in entry["enclosures"]],
            "metadata": {},
        }
        self.entries.append(new_entry)
def parse_content(item: FeedDict) -> str:
    """Build a plain-text teaser for a feed item.

    Extracts the text of the item's summary, trims it to 300 characters,
    and strips Markdown-significant characters so the result is safe to
    embed in formatted output.
    """
    plain = get_text_from_html(item.get('summary') or '')
    teaser = textwrap.shorten(plain, width=300, placeholder="...")
    # Remove every Markdown control character.
    for markdown_char in '_*`[]()':
        teaser = teaser.replace(markdown_char, '')
    return teaser
def check_source(parsed: feedparser.FeedParserDict) -> bool:
    """Check the parsed feed for encoding and bozo flags.

    :param parsed: potentially invalid RSS feed
    :return: whether the RSS feed is actually valid or not
    """
    well_formed = parsed.bozo != 1
    encoding = parsed.get('encoding')
    # Valid only when not bozo, an encoding is declared, and it is UTF-8
    # (case-insensitive).
    return bool(well_formed and encoding and encoding.upper() == 'UTF-8')
def make_validate_dict(item: feedparser.FeedParserDict) -> dict:
    """Build a dict of an RSS item's fields, ready for saving.

    Extracts as much information as possible from the element; if a
    required field is missing an empty dict is returned.

    :param item: a single parsed RSS entry
    :return: dict with title/description/link/published_at, or {} on failure
    """
    # Fall back to "now" when the feed carries no publication timestamp.
    published_struct = item.get('published_parsed', None)
    if published_struct:
        published_at = datetime.fromtimestamp(mktime(published_struct))
    else:
        published_at = datetime.now()
    try:
        result = {
            'title': item.title,
            'description': item.summary,
            'link': item.link,
            'published_at': published_at,
        }
    except (AttributeError, KeyError):
        # FeedParserDict raises AttributeError/KeyError for absent fields;
        # narrowed from a bare `except Exception` so unrelated bugs still
        # surface instead of being silently swallowed.
        result = {}
    return result
def parse_vacancies(data: feedparser.FeedParserDict) -> Iterator[Vacancy]:
    """Yield a Vacancy for every parseable entry in the feed.

    Entries that cannot be parsed are logged and skipped.  Rendered text
    is clipped so the final message stays within MESSAGE_LIMIT.
    """
    for entry in data.get('entries', []):
        try:
            # published_parsed is a time.struct_time; its first six fields
            # are exactly the datetime() positional arguments.
            posted_at = datetime(*entry.published_parsed[:6])
            body = prepare_text(entry.description)
            post_url = entry.link
        except Exception as err:
            app.logger.exception(
                msg='Exception during parsing job post',
                exc_info=err,
            )
            continue
        clean_title = remove_markdown_symbols(entry.title)
        body = f'*{clean_title}*\n\n' + body
        link_block = f'*Посилання*\n[{clean_title}]({entry.link})'
        message = body + link_block
        if len(message) > MESSAGE_LIMIT:
            # Trim the body, leaving room for the ellipsis and the link.
            cutoff = MESSAGE_LIMIT - len(link_block) - 10
            message = body[:cutoff] + '...\n\n' + link_block
        yield Vacancy(url=post_url, title=entry.title, text=message,
                      date=posted_at)
def __init__(self, feed_link: feedparser.FeedParserDict):
    """Wrap one feed <link> element, exposing its interesting attributes.

    :param feed_link: a single link dict from a feedparser result
    """
    self.href = feed_link.href
    self.title = feed_link.get('title')
    self.rel = feed_link.get('rel')
    # BUG FIX (probable): feedparser stores a link's MIME type under the
    # key 'type', not 'content_type', so the original lookup was always
    # None.  Keep the original key first for backward compatibility, then
    # fall back to feedparser's standard key.
    self.content_type = feed_link.get('content_type') or feed_link.get('type')
def _handle_http_codes(
        self, parsed: feedparser.FeedParserDict) -> "UpdateResult":
    """Given feedparser parse result, determine if parse succeeded,
    and what to do about that.

    :param parsed: the feedparser result of the last fetch attempt
    :return: an UpdateResult telling the caller how to proceed
    """
    # feedparser gives no status if you feedparse a local file.
    if "status" not in parsed:
        # BUG FIX: the old message claimed "Saw status 200" even though no
        # status was present at all; log what actually happened.
        LOG.debug("No status in parse result (local file?) - treating as success.")
        return UpdateResult.SUCCESS

    status = parsed.get("status", 200)
    result = UpdateResult.SUCCESS
    if status == requests.codes["NOT_FOUND"]:
        LOG.error(
            f"Saw status {status}, unable to retrieve feed text for "
            f"{self.metadata['name']}."
            # BUG FIX: added the missing space so the message no longer
            # reads "...preservedand checked again...".
            f"\nStored URL {self.url} for {self.metadata['name']} will be preserved "
            f"and checked again on next attempt.")
        result = UpdateResult.FAILURE

    elif status in [
            requests.codes["UNAUTHORIZED"], requests.codes["GONE"]
    ]:
        LOG.error(
            f"Saw status {status}, unable to retrieve feed text for "
            f"{self.metadata['name']}."
            f"\nClearing stored URL {self.url} for {self.metadata['name']}."
            f"\nPlease provide new URL and authorization for subscription "
            f"{self.metadata['name']}.")
        self.url = ""
        result = UpdateResult.FAILURE

    # handle redirecting errors
    elif status in [
            requests.codes["MOVED_PERMANENTLY"],
            requests.codes["PERMANENT_REDIRECT"]
    ]:
        LOG.warning(
            f"Saw status {status} indicating permanent URL change."
            f"\nChanging stored URL {self.url} for {self.metadata['name']} to "
            f"{parsed.get('href')} and attempting get with new URL.")
        self.url = parsed.get("href")
        result = UpdateResult.ATTEMPT_AGAIN

    elif status in [
            requests.codes["FOUND"], requests.codes["SEE_OTHER"],
            requests.codes["TEMPORARY_REDIRECT"]
    ]:
        LOG.warning(
            f"Saw status {status} indicating temporary URL change."
            f"\nAttempting with new URL {parsed.get('href')}."
            f"\nStored URL {self.url} for {self.metadata['name']} will be unchanged."
        )
        # Remember the permanent URL so it can be restored after this fetch.
        self.temp_url = self.url
        self.url = parsed.get("href")
        result = UpdateResult.ATTEMPT_AGAIN

    elif status != 200:
        LOG.warning(
            f"Saw '{status}'. Retrying retrieve for {self.metadata['name']} "
            f"at {self.url}.")
        result = UpdateResult.ATTEMPT_AGAIN

    else:
        LOG.debug("Saw status 200. Success!")

    return result
def parse_date(item: FeedDict) -> datetime:
    """Return the item's publication time, falling back to its update time."""
    timestamp = item.get('published_parsed')
    if not timestamp:
        timestamp = item.get('updated_parsed')
    return time_struct_to_datetime(timestamp)