class TextBlockParserTestCase(BaseOEmbedTestCase):
    """Exercise TextBlockParser: embed substitution, urlization and URL extraction."""

    def setUp(self):
        self.parser = TextBlockParser()
        super(TextBlockParserTestCase, self).setUp()

    def test_basic_handling(self):
        # A bare URL on its own becomes its embed markup.
        self.assertEqual(self.parser.parse(self.category_url),
                         self.category_embed)

    def test_inline_link_handling(self):
        result = self.parser.parse('Testing %s' % self.category_url)
        self.assertEqual(result, 'Testing %s' % self.category_embed)

    def test_block_handling(self):
        source = 'Testing %(url)s\n%(url)s' % {'url': self.category_url}
        expected = 'Testing %(embed)s\n%(embed)s' % {'embed': self.category_embed}
        self.assertEqual(self.parser.parse(source), expected)

    def test_urlization(self):
        text = 'Testing http://www.google.com'
        # Plain links are left untouched unless urlize_all_links is requested.
        self.assertEqual(self.parser.parse(text, urlize_all_links=False), text)
        self.assertEqual(
            self.parser.parse(text, urlize_all_links=True),
            'Testing <a href="http://www.google.com">http://www.google.com</a>')

    def test_extraction(self):
        found = self.parser.extract_urls('Testing %s wha?' % self.category_url)
        self.assertEqual(found, set([self.category_url]))
def extract_urls(self, text):
    """Collect the set of embeddable URLs found in *text*'s text nodes.

    Text nodes already inside an <a> tag are skipped so existing links
    are not re-embedded. Returns a set (unordered, deduplicated).
    """
    parser = TextBlockParser()
    soup = BeautifulSoup(text)
    pattern = re.compile(URL_RE)
    found = set()
    for node in soup.findAll(text=pattern):
        # Skip URLs that are already wrapped in an anchor element.
        if self.inside_a(node):
            continue
        found |= parser.extract_urls(unicode(node))
    return found
class TextBlockParserTestCase(BaseOEmbedTestCase):
    """Exercise TextBlockParser: embed substitution, urlization and ordered extraction."""

    def setUp(self):
        self.parser = TextBlockParser()
        super(TextBlockParserTestCase, self).setUp()

    def test_basic_handling(self):
        # A bare URL on its own becomes its embed markup.
        self.assertEqual(self.parser.parse(self.category_url),
                         self.category_embed)

    def test_inline_link_handling(self):
        result = self.parser.parse('Testing %s' % self.category_url)
        self.assertEqual(result, 'Testing %s' % self.category_embed)

    def test_block_handling(self):
        source = 'Testing %(url)s\n%(url)s' % {'url': self.category_url}
        expected = 'Testing %(embed)s\n%(embed)s' % {'embed': self.category_embed}
        self.assertEqual(self.parser.parse(source), expected)

    def test_urlization(self):
        text = 'Testing http://www.google.com'
        # Plain links are left untouched unless urlize_all_links is requested.
        self.assertEqual(self.parser.parse(text, urlize_all_links=False), text)
        self.assertEqual(
            self.parser.parse(text, urlize_all_links=True),
            'Testing <a href="http://www.google.com">http://www.google.com</a>')

    def test_extraction(self):
        found = self.parser.extract_urls('Testing %s wha?' % self.category_url)
        self.assertEqual(found, [self.category_url])

    def test_extraction_ordering(self):
        # Duplicates are dropped; first-seen order is preserved.
        found = self.parser.extract_urls(''' %s %s %s %s ''' % (
            self.category_url, self.blog_url, self.category_url, self.rich_url))
        self.assertEqual(found, [
            self.category_url,
            self.blog_url,
            self.rich_url,
        ])
def extract_urls(self, text):
    """Collect embeddable URLs from *text*'s text nodes in first-seen order.

    Text nodes already inside an <a> tag are skipped so existing links
    are not re-embedded. Returns a deduplicated list preserving the
    order in which URLs first appear.
    """
    parser = TextBlockParser()
    soup = BeautifulSoup(text)
    seen = set()      # membership test for O(1) dedup
    ordered = []      # result, in first-seen order
    for node in soup.findAll(text=re.compile(URL_RE)):
        # Skip URLs that are already wrapped in an anchor element.
        if self.inside_a(node):
            continue
        for url in parser.extract_urls(unicode(node)):
            if url in seen:
                continue
            seen.add(url)
            ordered.append(url)
    return ordered