def parse_data(self, text, maxwidth, maxheight, template_dir, context, urlize_all_links):
    """Parse *text* as an HTML fragment and replace embeddable URLs with
    their oEmbed markup.

    Elements that are not ``<a>`` tags and whose text contains
    ``http://`` are handed to a ``TextBlockParser``; any replacement
    markup it produces is re-parsed and spliced back in as child
    elements.  Returns the serialized tree, or *text* unchanged if it
    cannot be parsed.
    """
    block_parser = TextBlockParser()
    # NOTE(review): text_parser is created but never used in this method.
    text_parser = TextParser()
    try:
        # Wrap the fragment in a synthetic <div> so mixed content parses.
        parse_tree = lxml.html.fragment_fromstring(text, create_parent='div')
    except lxml.etree.XMLSyntaxError:
        # Unparseable input: return it untouched rather than failing.
        return text
    if not parse_tree.getchildren():
        # Bare text with no child elements — process the wrapper itself.
        elements = [parse_tree]
    else:
        # Candidate elements: not anchors, and text mentions http://.
        # NOTE(review): https:// URLs would not match this xpath — confirm intended.
        elements = parse_tree.xpath('.//*[not(self::a) and contains(text(), "http://")]')
    for element in elements:
        replacement = block_parser.parse(
            element.text, maxwidth, maxheight, template_dir, context, urlize_all_links
        )
        if replacement != element.text:
            # The parser produced markup: clear the raw text and insert the
            # re-parsed replacement fragments as children, preserving order.
            element.text = ''
            new_elements = lxml.html.fragments_fromstring(replacement)
            for (i, e) in enumerate(new_elements):
                element.insert(i, e)
    return lxml.html.tostring(parse_tree)
def parse_data(self, text, maxwidth, maxheight, template_dir, context, urlize_all_links):
    """Replace embeddable URLs in *text* with oEmbed markup.

    Text nodes matching URL_RE that are not inside an ``<a>`` tag are
    handed to a ``TextBlockParser``.  Standalone URLs use the caller's
    *template_dir*; URLs embedded in surrounding text use the 'inline'
    templates.  Returns the re-serialized document as unicode.
    """
    block_parser = TextBlockParser()
    original_template_dir = template_dir
    soup = BeautifulSoup(text)
    for user_url in soup.findAll(text=re.compile(URL_RE)):
        # Skip URLs that are already link text — they are handled markup.
        if not self.inside_a(user_url):
            if self.is_standalone(user_url):
                template_dir = original_template_dir
            else:
                template_dir = 'inline'
            # Fix: str(user_url) raises UnicodeEncodeError under Python 2
            # when the node contains non-ASCII characters; unicode() is
            # lossless and matches the extract_urls() implementations.
            replacement = block_parser.parse(
                unicode(user_url),
                maxwidth,
                maxheight,
                template_dir,
                context,
                urlize_all_links
            )
            user_url.replaceWith(replacement)
    return unicode(soup)
def extract_urls(self, text):
    """Return the set of embeddable URLs found in *text*.

    Text nodes that sit inside an ``<a>`` tag are ignored, since those
    URLs are already part of explicit link markup.
    """
    parser = TextBlockParser()
    document = BeautifulSoup(text)
    found = set()
    for node in document.findAll(text=re.compile(URL_RE)):
        if self.inside_a(node):
            continue
        found.update(parser.extract_urls(unicode(node)))
    return found
def extract_urls(self, text):
    """Return embeddable URLs from *text* in first-seen order.

    Duplicates are dropped, and text nodes inside ``<a>`` tags are
    skipped since those URLs already belong to explicit link markup.
    """
    parser = TextBlockParser()
    document = BeautifulSoup(text)
    seen = set()        # membership test for O(1) de-duplication
    ordered = []        # preserves first-seen order of unique URLs
    for node in document.findAll(text=re.compile(URL_RE)):
        if self.inside_a(node):
            continue
        for url in parser.extract_urls(unicode(node)):
            if url in seen:
                continue
            seen.add(url)
            ordered.append(url)
    return ordered
def parse_data(self, text, maxwidth, maxheight, template_dir, context, urlize_all_links):
    """Replace embeddable URLs in *text* with oEmbed markup.

    Like the plain-replacement variant, but the replacement markup is
    re-parsed with BeautifulSoup before being spliced in, so it becomes
    real tree nodes rather than escaped text.  Returns the document as
    unicode.
    """
    block_parser = TextBlockParser()
    original_template_dir = template_dir
    soup = BeautifulSoup(text)
    for user_url in soup.findAll(text=re.compile(URL_RE)):
        # URLs already inside <a> tags are left alone.
        if not self.inside_a(user_url):
            if self.is_standalone(user_url):
                template_dir = original_template_dir
            else:
                template_dir = 'inline'
            # Fix: str(user_url) raises UnicodeEncodeError under Python 2
            # when the node contains non-ASCII characters; unicode() is
            # lossless and matches the extract_urls() implementations.
            replacement = block_parser.parse(
                unicode(user_url), maxwidth, maxheight, template_dir, context, urlize_all_links
            )
            # Re-parse so the markup is inserted as nodes, not escaped text.
            user_url.replaceWith(BeautifulSoup(replacement))
    return unicode(soup)
class TextBlockParserTestCase(BaseOEmbedTestCase):
    """Exercises TextBlockParser embedding, urlization and URL extraction."""

    def setUp(self):
        self.parser = TextBlockParser()
        super(TextBlockParserTestCase, self).setUp()

    def test_basic_handling(self):
        # A bare URL on its own is replaced by its embed markup.
        result = self.parser.parse(self.category_url)
        self.assertEqual(result, self.category_embed)

    def test_inline_link_handling(self):
        # A URL with surrounding text is still embedded in place.
        source = 'Testing %s' % self.category_url
        expected = 'Testing %s' % self.category_embed
        self.assertEqual(self.parser.parse(source), expected)

    def test_block_handling(self):
        # The same URL appearing twice across lines is embedded both times.
        source = 'Testing %(url)s\n%(url)s' % ({'url': self.category_url})
        expected = 'Testing %(embed)s\n%(embed)s' % ({'embed': self.category_embed})
        self.assertEqual(self.parser.parse(source), expected)

    def test_urlization(self):
        test_string = 'Testing http://www.google.com'
        # With urlization off, unembeddable URLs pass through untouched.
        self.assertEqual(
            self.parser.parse(test_string, urlize_all_links=False),
            test_string)
        # With urlization on, they are wrapped in an anchor tag.
        self.assertEqual(
            self.parser.parse(test_string, urlize_all_links=True),
            'Testing <a href="http://www.google.com">http://www.google.com</a>')

    def test_extraction(self):
        # extract_urls returns the set of embeddable URLs found.
        found = self.parser.extract_urls('Testing %s wha?' % self.category_url)
        self.assertEqual(found, set([self.category_url]))
class TextBlockParserTestCase(BaseOEmbedTestCase):
    """Exercises TextBlockParser embedding, urlization and ordered URL extraction."""

    def setUp(self):
        self.parser = TextBlockParser()
        super(TextBlockParserTestCase, self).setUp()

    def test_basic_handling(self):
        # A bare URL on its own is replaced by its embed markup.
        result = self.parser.parse(self.category_url)
        self.assertEqual(result, self.category_embed)

    def test_inline_link_handling(self):
        # A URL with surrounding text is still embedded in place.
        source = 'Testing %s' % self.category_url
        expected = 'Testing %s' % self.category_embed
        self.assertEqual(self.parser.parse(source), expected)

    def test_block_handling(self):
        # The same URL appearing twice across lines is embedded both times.
        source = 'Testing %(url)s\n%(url)s' % ({'url': self.category_url})
        expected = 'Testing %(embed)s\n%(embed)s' % ({'embed': self.category_embed})
        self.assertEqual(self.parser.parse(source), expected)

    def test_urlization(self):
        test_string = 'Testing http://www.google.com'
        # With urlization off, unembeddable URLs pass through untouched.
        self.assertEqual(
            self.parser.parse(test_string, urlize_all_links=False),
            test_string)
        # With urlization on, they are wrapped in an anchor tag.
        self.assertEqual(
            self.parser.parse(test_string, urlize_all_links=True),
            'Testing <a href="http://www.google.com">http://www.google.com</a>')

    def test_extraction(self):
        # extract_urls returns a list of the embeddable URLs found.
        found = self.parser.extract_urls('Testing %s wha?' % self.category_url)
        self.assertEqual(found, [self.category_url])

    def test_extraction_ordering(self):
        # URLs come back in first-seen order with duplicates removed.
        source = ''' %s %s %s %s ''' % (
            self.category_url, self.blog_url, self.category_url, self.rich_url)
        self.assertEqual(self.parser.extract_urls(source), [
            self.category_url,
            self.blog_url,
            self.rich_url,
        ])
def setUp(self):
    """Create a fresh TextBlockParser for each test, then run the
    base-class fixture setup."""
    self.parser = TextBlockParser()
    # NOTE(review): the base setUp runs after the parser is created —
    # presumably they are independent; confirm before reordering.
    super(TextBlockParserTestCase, self).setUp()