def feed():
    """Return a canned FeedParserDict fixture: a podcast feed with two episodes."""
    first_episode = FeedParserDict(
        title='Episode1',
        subtitle='episode of week 1',
        published=str(datetime.datetime(2021, 1, 1)),
        links=[
            {'href': 'https://somesite.com/episode1.mp3', 'type': 'audio/mpeg'},
            {'href': 'https://somesite.com/episode1', 'type': 'text'},
        ])
    second_episode = FeedParserDict(
        title='Episode2',
        subtitle='episode of week 2',
        published=str(datetime.datetime(2021, 1, 7)),
        links=[
            {'href': 'https://somesite.com/episode2', 'type': 'text'},
            {'href': 'https://somesite.com/episode2.mp3', 'type': 'audio/mpeg'},
        ])
    channel = FeedParserDict(
        link='https://sample.com',
        subtitle='This is the best podcast show in the entire universe',
        title='PodcastTitle')
    return FeedParserDict(
        feed=channel,
        entries=[first_episode, second_episode],
        etag='5f77c6d7-45f1e',
        href='https://sample.podcast.tv/test.xml',
        updated=str(datetime.datetime(2021, 1, 1)),
        status=200)
def get_url(item: FeedParserDict) -> str:
    """Return the entry's link when it passes URL validation, else ''."""
    link = item.get("link", "")
    if link:
        try:
            validate_url(link)
        except ValidationError:
            # Invalid URL is treated the same as a missing one.
            return ""
    return link
def __init__(self, entry: FeedParserDict):
    """Wrap a feedparser entry and cache its commonly used fields.

    Keeps the raw entry plus title/url/summary, the parsed timestamps,
    and a single published date derived from them.
    """
    self.entry = entry
    self.title = entry.title
    self.url = entry.link
    # Summary is optional in feeds; default to an empty string.
    self.summary = entry.get('summary', '')
    self.published_parsed = entry.get('published_parsed')
    # NOTE(review): feedparser normally exposes 'updated_parsed';
    # 'update_parsed' may always come back None here — confirm the key.
    self.update_parsed = entry.get('update_parsed')
    self.published_date = self._define_published_date(
        self.published_parsed, self.update_parsed)
def parse_single_entry(entry: FeedParserDict) -> Article:
    """Map a single feedparser entry onto a fresh Article instance."""
    article = Article()
    article.title = entry.get('title')
    article.link = entry.get('link')
    published = entry.get('published_parsed')
    article.pub_date = BlogAggregatorService.parse_pub_date(published)
    article.desc = entry.get('summary')
    return article
def rss_feed_missing_feed_title():
    """An invalid rss feed for tests"""
    # NOTE(review): despite the name, the feed dict below does carry a
    # "title" (the missing key is "link"), and the second item reuses the
    # title "test1" — confirm both against the tests using this fixture.
    entries = [
        FeedParserDict({"title": "test1", "link": "https://test.com/item1"}),
        FeedParserDict({"title": "test1", "description": "foo"}),
        FeedParserDict({"title": "test3", "summary": "bar"}),
    ]
    channel = FeedParserDict({"title": "test", "description": "testing"})
    return FeedParserDict({"feed": channel, "entries": entries})
def make_fake_feedparser_dict(feed_url, n_items=30):
    """Creates a fake but valid FeedParserDict object.

    Args:
        feed_url (str): Fake URL for fake Feed
        n_items (int): number of fake entries to generate
    """
    fake_channel = FeedParserDict(
        title='Sample Feed',
        description='This is a sample feed',
        link=feed_url)
    fake_entries = make_feed_entries_list(n_items=n_items, feed_url=feed_url)
    return FeedParserDict(
        feed=fake_channel,
        entries=fake_entries,
        version='rss20',
        bozo=0,
    )
def test_feeds(mock_feedparse, patch_try_shorten):
    """feeds.rss(): missing feed, titled feed, and untitled feed."""
    from plugins import feeds

    # No entries at all -> error message.
    mock_feedparse.return_value = FeedParserDict(entries=[])
    assert feeds.rss('xkcd') == "Feed not found."
    mock_feedparse.assert_called_with('http://xkcd.com/rss.xml')
    mock_feedparse.reset_mock()

    # Feed with a title -> title is prefixed (bold via \x02 control chars).
    mock_feedparse.return_value = FeedParserDict(
        entries=[FeedParserDict(title='foo1', link='http://example.com')],
        feed=FeedParserDict(title='test'),
    )
    expected = "\x02test\x02: foo1 (http://example.com)"
    assert feeds.rss('http://rss.example.com/feed.xml') == expected
    mock_feedparse.assert_called_with('http://rss.example.com/feed.xml')
    mock_feedparse.reset_mock()

    # Feed without a title -> bare entry formatting.
    mock_feedparse.return_value = FeedParserDict(
        entries=[FeedParserDict(title='foo1', link='http://example.com')],
        feed=FeedParserDict(),
    )
    expected = "foo1 (http://example.com)"
    assert feeds.rss('http://rss.example.com/feed.xml') == expected
    mock_feedparse.assert_called_with('http://rss.example.com/feed.xml')
    mock_feedparse.reset_mock()
def entry_to_page(self, entry: FeedParserDict) -> Info:
    """Convert a feedparser entry into an Info page record for this source."""
    published = entry.published_parsed
    summary = entry.get('summary', '')
    return Info(
        action=1,
        type='rss',
        source=netloc_to_source(self.netloc),
        source_url=self.netloc,
        title=entry.title,
        guid=self.get_guid(entry.guid),
        link=entry.link,
        description=summary,
        preview=summary,
        subjects=[tag.term.lower() for tag in entry.get('tags', [])],
        time=struct_time_to_datetime(published),
        zone=TimeZone(
            offset=3,  # hardcoded MSK; time.tm_gmtoff / 1800 would derive it
            name='MSK'  # time.tm_zone would derive it
        ))
def check_parsed(parsed: feedparser.FeedParserDict, req_keys: list) -> bool:
    """
    Checks either the feed or entry of a parsed RSS feed

    :param parsed: valid, utf-8 feedparsed RSS
    :req_keys: keys that the `parsed` must contain
    :return: whether the parsed has the needed elements
    """
    # Generator (not a list) lets all() short-circuit on the first miss.
    return all(parsed.get(key) is not None for key in req_keys)
def test_fetch_feed_unknown_issue(mocked_parse, hacks_feed):
    """If a feed encounters an unknown issue, it is disabled."""
    # bozo without an exception object -> "unknown" failure path.
    mocked_parse.return_value = FeedParserDict(bozo=1)

    assert fetch_feed(hacks_feed) is None

    feed = Feed.objects.get()
    assert not feed.enabled
    assert feed.disabled_reason == "Error while reading the feed: 500 __ "
def test_fetch_feed_exception(mocked_parse, hacks_feed):
    """If a feed encounters an exception, it is disabled."""
    mocked_parse.return_value = FeedParserDict(
        bozo=1,
        bozo_exception=Exception("I am grumpy today."))

    assert fetch_feed(hacks_feed) is None

    feed = Feed.objects.get()
    assert not feed.enabled
    assert feed.disabled_reason == (
        "Error while reading the feed: 500 __ I am grumpy today.")
def _update_feed_data(self, feed_data_obj: feedparser.FeedParserDict) -> None: """ Updates feed data given a "feedparser.FeedParserDict.feed" object :param feed_data_obj: :return: """ self.title = feed_data_obj['title'] site_url = feed_data_obj.get('link', None) if site_url: self.site_url = site_url
def test_parse_rss_error(mocker):
    """ Test exceptions are handled for error while parsing rss """
    # A bozo feed must raise ParseContentError.
    bozo_feed = FeedParserDict({
        "bozo": True,
        "bozo_exception": "details",
    })
    mocker.patch("apps.feeds.feed_parser.feedparser.parse",
                 return_value=bozo_feed)
    with pytest.raises(ParseContentError):
        parse("foo")

    # A clean feed missing required fields must also raise.
    clean_feed = FeedParserDict({"bozo": False, "feed": "test"})
    mocker.patch("apps.feeds.feed_parser.feedparser.parse",
                 return_value=clean_feed)
    mocker.patch("apps.feeds.feed_parser.has_required_fields",
                 return_value=False)
    with pytest.raises(ParseContentError):
        parse("foo")
def getFeed(url, request_headers=None, handlers=None):
    """Fetch and parse an RSS feed; return an empty FeedParserDict on failure."""
    try:
        normalized = sickrage.srCore.srWebSession.normalize_url(url)
        return feedparser.parse(
            normalized,
            agent=random.choice(USER_AGENTS),
            etag=False,
            modified=False,
            request_headers=request_headers,
            handlers=handlers)
    except Exception:
        # Best-effort fetch: callers always get a FeedParserDict back.
        return FeedParserDict()
def test_fetch_feed_timeout(mocked_parse, hacks_feed, settings):
    """If a feed times out, it is disabled."""
    settings.FEEDER_TIMEOUT = 10
    timeout_error = URLError(reason=socket.timeout("timed out"))
    mocked_parse.return_value = FeedParserDict(
        bozo=1, bozo_exception=timeout_error)

    assert fetch_feed(hacks_feed) is None

    feed = Feed.objects.get()
    assert feed.etag == ""
    assert not feed.enabled
    assert feed.disabled_reason == "This feed didn't respond after 10 seconds"
def rss_feed():
    """ Create a rss FeedParserDict obj for tests """
    entries = [
        FeedParserDict({"title": "test1", "link": "https://test.com/item1"}),
        FeedParserDict({"title": "test2", "description": "foo"}),
        FeedParserDict({"title": "test3", "summary": "bar"}),
    ]
    channel = FeedParserDict({
        "title": "test",
        "link": "https://test.com",
        "description": "testing",
    })
    return FeedParserDict({"feed": channel, "entries": entries, "bozo": 0})
def _from_feed_entry(entry: feedparser.FeedParserDict) -> 'Result':
    """
    Converts a feedparser entry for an arXiv search result feed into a
    Result object.
    """
    authors = [Result.Author._from_feed_author(a) for a in entry.authors]
    links = [Result.Link._from_feed_link(link) for link in entry.links]
    categories = [tag.get('term') for tag in entry.tags]
    return Result(
        entry_id=entry.id,
        updated=Result._to_datetime(entry.updated_parsed),
        published=Result._to_datetime(entry.published_parsed),
        # Collapse any internal whitespace runs in the title.
        title=re.sub(r'\s+', ' ', entry.title),
        authors=authors,
        summary=entry.summary,
        comment=entry.get('comment'),
        journal_ref=entry.get('arxiv_journal_ref'),
        doi=entry.get('arxiv_doi'),
        primary_category=entry.arxiv_primary_category.get('term'),
        categories=categories,
        links=links,
        _raw=entry)
def prepare_link_title(
        item: feedparser.FeedParserDict) -> feedparser.FeedParserDict:
    """
    Normalize an RSS item's link and title in place.

    Strips the Google redirect prefix and tracking suffix from the link
    and removes <b> markup from the title.

    :param item: RSS item (must carry .title and .link)
    :return: the same item, or None when item is falsy
    """
    if not item:
        return None
    assert item.title, 'Not found title in item'
    assert item.link, 'Not found link in item'
    cleaned_link = item.link.replace(
        'https://www.google.com/url?rct=j&sa=t&url=', '')
    tracking_pos = cleaned_link.find('&ct=ga')
    if tracking_pos > -1:
        cleaned_link = cleaned_link[0:tracking_pos]
    item.link = cleaned_link
    item.title = item.title.replace('<b>', '').replace('</b>', '')
    return item
def parse_content(item: FeedDict) -> str:
    """Build a plain-text teaser (max 300 chars) from the item summary,
    with markdown-sensitive characters stripped out."""
    raw = get_text_from_html(item.get('summary') or '')
    teaser = textwrap.shorten(raw, width=300, placeholder="...")
    # Delete characters that a markdown renderer would treat as markup.
    return teaser.translate(str.maketrans('', '', '_*`[]()'))
def load_rss_info(self, parsed: feedparser.FeedParserDict) -> None:
    """ Load some RSS subscription elements into this feed state.

    Rebuilds self.entries from the parsed feed: each entry keeps its
    title plus the href of every enclosure, with empty metadata.
    """
    self.entries = []
    # Default to [] so a parse result without an "entries" key doesn't
    # raise TypeError (plain .get("entries") would return None).
    for entry in parsed.get("entries", []):
        new_entry = {}
        new_entry["title"] = entry["title"]
        new_entry["urls"] = []
        new_entry["metadata"] = {}
        for enclosure in entry["enclosures"]:
            new_entry["urls"].append(enclosure["href"])
        self.entries.append(new_entry)
def getFeed(url, params=None, request_headers=None, handlers=None):
    """Fetch a URL and parse its body as an RSS feed.

    Returns an empty FeedParserDict when the request fails, the response
    is not OK, or any exception is raised (logged at debug level).
    """
    try:
        resp = sickrage.app.wsession.get(url, params=params)
        if resp.ok:
            return feedparser.parse(resp.text,
                                    agent=sickrage.app.user_agent,
                                    etag=False,
                                    modified=False,
                                    request_headers=request_headers,
                                    handlers=handlers)
    except Exception as e:
        # Fix: `e.message` does not exist on Python 3 exceptions and would
        # raise AttributeError here; format the exception object itself.
        sickrage.app.log.debug("RSS Error: {}".format(e))
    return FeedParserDict()
def getFeed(url, params=None, request_headers=None, handlers=None):
    """Fetch a URL and parse its body as an RSS feed.

    Returns an empty FeedParserDict when the request fails, the response
    is not OK, or any exception is raised (logged at debug level).
    """
    try:
        resp = sickrage.srCore.srWebSession.get(url, params=params)
        if resp.ok:
            return feedparser.parse(resp.text,
                                    agent=random.choice(USER_AGENTS),
                                    etag=False,
                                    modified=False,
                                    request_headers=request_headers,
                                    handlers=handlers)
    except Exception as e:
        # Fix: `e.message` does not exist on Python 3 exceptions and would
        # raise AttributeError here; format the exception object itself.
        sickrage.srCore.srLogger.debug("RSS Error: {}".format(e))
    return FeedParserDict()
def check_source(parsed: feedparser.FeedParserDict) -> bool:
    """
    Checks the parsed feed for encoding et bozo

    :param parsed: potentially invalid RSS feed
    :return: whether the RSS feed is actually valid or not
    """
    # Valid only when feedparser raised no bozo flag and the declared
    # encoding is (case-insensitively) utf-8.
    return (parsed.bozo != 1
            and bool(parsed.get('encoding'))
            and parsed.encoding.upper() == 'UTF-8')
def getFeed(url, request_headers=None, handlers=None):
    """Fetch and parse an RSS feed, returning an empty FeedParserDict on failure.

    Errors are swallowed deliberately (best-effort fetch); an RSS error
    block, if present, is logged at debug level.
    """
    feed = FeedParserDict()
    try:
        try:
            feed = feedparser.parse(normalize_url(url),
                                    False,
                                    False,
                                    request_headers,
                                    handlers=handlers)
        except AttributeError:
            # NOTE(review): at this point `feed` is still the empty dict
            # created above, so these lookups themselves may raise and be
            # swallowed by the outer handler — confirm intent.
            sickrage.LOGGER.debug('RSS ERROR:[{}] CODE:[{}]'.format(
                feed.feed[b'error'][b'description'],
                feed.feed[b'error'][b'code']))
    except Exception:
        # Fix: was a bare `except:`, which also swallowed KeyboardInterrupt
        # and SystemExit; narrowed while keeping the best-effort fall-through.
        pass
    return feed
def article(self) -> Article:
    """Return the article for the current entry, downloading it when needed.

    Already-processed entries yield an empty placeholder instead.
    """
    if self.processed:
        # Placeholder exposing the fields callers read (authors/text).
        return FeedParserDict(authors='', text='')
    result = self.article_supplier(self.input_entry.link)
    if "content" in self.input_entry:
        # Inline feed content replaces the download step.
        result.set_html(self.input_entry.content[0].value)
        logging.info("Using inline content")
    else:
        logging.info(f"Getting content from: {self.input_entry.link}")
        result.download()
    result.parse()
    logging.debug("Just retrieved the following article: ")
    logging.debug(result)
    return result
def make_feed_entries_list(n_items=10, feed_url=''):
    """Generates a list of feed entries.

    Args:
        n_items (int): how many feed entries to make
        feed_url (str): base URL
    """
    timezones = ['+0800', 'GMT']
    date_fmt = '%a, %d %b %Y %H:%M:%S'
    now = datetime.now()
    minute = timedelta(minutes=1)

    def fake_entry(index):
        # Random publish time within the past 180 minutes, random zone.
        stamp = (now - minute * random.randint(1, 180)).strftime(date_fmt)
        return FeedParserDict(
            link=urljoin(feed_url, f'story-{index + 1:05d}.html'),
            published=f'{stamp} {random.choice(timezones)}',
            author=f'Author {index + 1}',
            summary=f'Summary {index + 1}',
            title=f'Title {index + 1}')

    return [fake_entry(i) for i in range(n_items)]
def parse_vacancies(data: feedparser.FeedParserDict) -> Iterator[Vacancy]:
    """Yield Vacancy objects built from the feed entries in `data`.

    Entries that fail to parse are logged and skipped; the message text is
    trimmed so that text plus link stays within MESSAGE_LIMIT.
    """
    for entry in data.get('entries', []):
        try:
            year, month, day, hour, minutes, seconds, *_ = entry.published_parsed
            date = datetime(year, month, day, hour, minutes, seconds)
            text = prepare_text(entry.description)
            url = entry.link
        except Exception as exception:
            app.logger.exception(
                msg='Exception during parsing job post',
                exc_info=exception,
            )
            continue
        title = remove_markdown_symbols(entry.title)
        body = f'*{title}*\n\n' + text
        link = f'*Посилання*\n[{title}]({entry.link})'
        message = body + link
        if len(message) > MESSAGE_LIMIT:
            # Leave room for the link plus the "...\n\n" separator.
            cut = MESSAGE_LIMIT - len(link) - 10
            message = body[:cut] + '...\n\n' + link
        yield Vacancy(url=url, title=entry.title, text=message, date=date)
def make_validate_dict(item: feedparser.FeedParserDict) -> dict:
    """
    Build a dict suitable for saving from an RSS item, extracting as much
    information as possible.

    Falls back to now() when the item carries no parsed publish time, and
    to an empty dict when mandatory fields are missing.

    :param item:
    :return:
    """
    published_struct = item.get('published_parsed', None)
    if published_struct:
        published_at = datetime.fromtimestamp(mktime(published_struct))
    else:
        published_at = datetime.now()
    try:
        return {
            'title': item.title,
            'description': item.summary,
            'link': item.link,
            'published_at': published_at,
        }
    except Exception:
        # Any missing mandatory field invalidates the whole item.
        return {}
def get_tags(item: FeedParserDict) -> str:
    """Return the item's tag terms, sorted and comma-joined.

    Tags without a truthy "term" value are skipped; an item with no
    tags yields ''.
    """
    # Feed sorted() a generator instead of building an intermediate list,
    # and look up each tag's "term" only once (the original called
    # tag.get("term") twice per tag).
    terms = (tag.get("term") for tag in item.get("tags", []))
    return ", ".join(sorted(term for term in terms if term))
def get_summary(item: FeedParserDict) -> str:
    """Return the item's summary with HTML entities decoded ('' when absent)."""
    summary = item.get("summary", "")
    return unescape(summary)