def test_is_feed(self):
    """`is_feed` accepts `regular_feed_page` and rejects `regular_html_template`."""
    feed_finder = feed_seeker.FeedSeeker(
        self.base_url,
        html=self.regular_feed_page,
    )
    assert feed_finder.is_feed()

    page_finder = feed_seeker.FeedSeeker(
        self.base_url,
        html=self.regular_html_template,
    )
    assert not page_finder.is_feed()
def test_generate_feed_urls_not_a_page(self):
    """No feed URLs are generated for the `/what_is_this_even` path."""
    # Only the side effects of `generate_responses` matter here; the
    # returned values are unpacked (as a pair) but not used.
    _feeds, _ = self.generate_responses()

    # presumably this path is not registered by `generate_responses` — confirm
    missing_url = self.base_url + '/what_is_this_even'
    seeker = feed_seeker.FeedSeeker(missing_url)

    discovered = list(seeker.generate_feed_urls())
    assert discovered == []
def test_generate_feed_urls_on_feed(self):
    """Exactly one feed URL is generated when seeking from the first feed path."""
    feeds, _ = self.generate_responses()

    # Point the seeker at the first feed path returned by the helper.
    feed_url = self.base_url + feeds[0]
    seeker = feed_seeker.FeedSeeker(feed_url)

    discovered = list(seeker.generate_feed_urls())
    assert len(discovered) == 1
def test_guess_feed_links(self):
    """Guessed feed links are non-empty and every one contains the base URL."""
    seeker = feed_seeker.FeedSeeker(
        self.base_url,
        html=self.regular_html_template,
    )

    # Guessing is expected to produce candidates even for an empty page.
    candidates = list(seeker.guess_feed_links())
    assert candidates

    # Checked one by one so a failure reports the offending link.
    for candidate in candidates:
        assert self.base_url in candidate
def test_html_property(self):
    """The `html` property equals the body registered for the base URL."""
    # NOTE(review): assumes `responses` mocking is active for this test
    # (no decorator/fixture is visible from here) — confirm.
    responses.add(
        responses.GET,
        self.base_url,
        body=self.regular_html_template,
        status=200,
    )

    # No `html` is supplied, so the property has to provide the content.
    seeker = feed_seeker.FeedSeeker(self.base_url)
    assert seeker.html == self.regular_html_template
def test_generate_feed_urls_max_links(self):
    """`max_links` caps the number of generated feed URLs."""
    feeds, _ = self.generate_responses()
    seeker = feed_seeker.FeedSeeker(self.base_url)

    max_links = 2
    discovered = list(seeker.generate_feed_urls(max_links=max_links))

    # Something is found, but never more than the requested cap.
    assert discovered
    assert len(discovered) <= max_links
    # Sanity check: the cap is below the number of feeds the helper
    # returned, so the limit is actually being exercised.
    assert max_links < len(feeds)
def test_find_link_feeds(self):
    """Feed entries placed in the page head are found by `find_link_feeds` only."""
    num_feeds = 4

    # The same entry is repeated on purpose: duplicates are NOT
    # eliminated at this level, so each copy must be reported.
    head_entries = [
        self.rss_feed_template.format('http://whatever.com')
        for _ in range(num_feeds)
    ]
    page = self.regular_html_template.format(
        head='\n'.join(head_entries),
        body='',
    )

    seeker = feed_seeker.FeedSeeker(self.base_url, html=page)
    link_feeds = list(seeker.find_link_feeds())
    assert len(link_feeds) == num_feeds

    # The body is empty, so there are no anchor feeds to find.
    anchor_feeds = list(seeker.find_anchor_feeds())
    assert len(anchor_feeds) == 0
def test_find_anchor_feeds(self):
    """Feed-like anchors in the body are found by `find_anchor_feeds` only."""
    num_feeds = 4

    # These four `.rss` anchors should all be found...
    anchors = [
        '<a href="http://{}.rss"></a>'.format(index)
        for index in range(num_feeds)
    ]
    # ...but this one should not be flagged, since it does not look like a feed.
    anchors.append('<a href="https://not_an_example.com"></a>')

    page = self.regular_html_template.format(
        head='',
        body='\n'.join(anchors),
    )
    seeker = feed_seeker.FeedSeeker(self.base_url, html=page)

    # The head is empty, so there are no link feeds to find.
    link_feeds = list(seeker.find_link_feeds())
    assert len(link_feeds) == 0

    # Compared as a set, so only distinct anchor feeds are counted.
    distinct_anchor_feeds = set(seeker.find_anchor_feeds())
    assert len(distinct_anchor_feeds) == num_feeds
def test_empty_page(self):
    """`find_feed_url` returns None for the bare HTML template."""
    seeker = feed_seeker.FeedSeeker(
        self.base_url,
        html=self.regular_html_template,
    )

    # The page has no links, so the search should come back empty-handed.
    result = seeker.find_feed_url()
    assert result is None
def test_find_internal_links(self):
    """Exactly one internal link is found after `generate_responses` runs."""
    self.generate_responses()

    # `html=None` is passed explicitly, as in the original test.
    seeker = feed_seeker.FeedSeeker(self.base_url, html=None)

    # The single expected link comes from `self.generate_responses`.
    assert len(seeker.find_internal_links()) == 1