Example #1
0
 def parse_data(self, text, maxwidth, maxheight, template_dir, context,
                urlize_all_links):
     """Parse *text* as an HTML fragment and embed the URLs found in it.

     Each element that is not itself an ``<a>`` tag and whose text contains
     ``"http://"`` is handed to ``TextBlockParser.parse()``; when the parser
     produces different markup, the element's text is replaced by the parsed
     fragments.  Returns the serialized tree, or *text* unchanged when lxml
     cannot parse it.
     """
     block_parser = TextBlockParser()
     # NOTE(review): the original also created an unused TextParser() here;
     # the instance was never referenced, so it has been removed.

     try:
         parse_tree = lxml.html.fragment_fromstring(text, create_parent='div')
     except lxml.etree.XMLSyntaxError:
         # Unparseable input: pass it through untouched instead of raising.
         return text

     if not parse_tree.getchildren():
         # Plain text with no child elements -- process the wrapper div itself.
         elements = [parse_tree]
     else:
         # Elements (other than anchors) whose direct text contains a URL.
         elements = parse_tree.xpath('.//*[not(self::a) and contains(text(), "http://")]')

     for element in elements:
         replacement = block_parser.parse(
             element.text,
             maxwidth,
             maxheight,
             template_dir,
             context,
             urlize_all_links
         )
         if replacement != element.text:
             # Swap the plain text for the rendered markup, inserting it as
             # real element nodes so it is not escaped on serialization.
             element.text = ''
             new_elements = lxml.html.fragments_fromstring(replacement)
             for (i, e) in enumerate(new_elements):
                 element.insert(i, e)

     return lxml.html.tostring(parse_tree)
Example #2
0
 def parse_data(self, text, maxwidth, maxheight, template_dir, context,
                urlize_all_links):
     """Replace embeddable URLs in *text* with rendered oEmbed markup.

     Text nodes outside anchor tags that match URL_RE are run through
     TextBlockParser.parse(); standalone URLs use the caller's template
     directory, URLs embedded in surrounding prose use 'inline' templates.
     """
     parser = TextBlockParser()
     default_template_dir = template_dir

     soup = BeautifulSoup(text)

     for node in soup.findAll(text=re.compile(URL_RE)):
         # Skip URLs already wrapped in an <a> tag.
         if self.inside_a(node):
             continue

         chosen_dir = default_template_dir if self.is_standalone(node) else 'inline'

         rendered = parser.parse(
             str(node),
             maxwidth,
             maxheight,
             chosen_dir,
             context,
             urlize_all_links
         )
         node.replaceWith(rendered)

     return unicode(soup)
Example #3
0
 def extract_urls(self, text):
     """Return the set of URLs found in *text* outside of anchor tags."""
     found = set()
     parser = TextBlockParser()
     soup = BeautifulSoup(text)

     for node in soup.findAll(text=re.compile(URL_RE)):
         # Only collect URLs that are not already inside an <a> tag.
         if not self.inside_a(node):
             found.update(parser.extract_urls(unicode(node)))

     return found
Example #4
0
    def extract_urls(self, text):
        """Collect every URL appearing in *text* outside anchor tags."""
        block_parser = TextBlockParser()
        soup = BeautifulSoup(text)
        candidates = soup.findAll(text=re.compile(URL_RE))

        result = set()
        for fragment in candidates:
            # URLs already wrapped in <a> tags are left to the markup.
            if self.inside_a(fragment):
                continue
            result |= block_parser.extract_urls(unicode(fragment))

        return result
Example #5
0
    def extract_urls(self, text):
        """Return URLs found outside anchor tags, deduplicated, in
        first-seen order (as a list)."""
        parser = TextBlockParser()
        soup = BeautifulSoup(text)
        seen = set()        # membership test for O(1) dedup
        ordered = []        # preserves first-appearance order

        for fragment in soup.findAll(text=re.compile(URL_RE)):
            if self.inside_a(fragment):
                continue

            for url in parser.extract_urls(unicode(fragment)):
                if url in seen:
                    continue
                seen.add(url)
                ordered.append(url)

        return ordered
Example #6
0
    def extract_urls(self, text):
        """List the URLs in *text* that sit outside anchor tags.

        Duplicates are dropped; the order of first appearance is kept.
        """
        block_parser = TextBlockParser()
        soup = BeautifulSoup(text)
        known = set()
        collected = []

        for text_node in soup.findAll(text=re.compile(URL_RE)):
            if not self.inside_a(text_node):
                for candidate in block_parser.extract_urls(unicode(text_node)):
                    # Record each URL only the first time it is seen.
                    if candidate not in known:
                        known.add(candidate)
                        collected.append(candidate)

        return collected
Example #7
0
    def parse_data(self, text, maxwidth, maxheight, template_dir, context,
                   urlize_all_links):
        """Render embeddable URLs in *text* into oEmbed markup.

        Standalone URLs use the caller-supplied template directory; URLs
        embedded in surrounding prose use the 'inline' templates instead.
        """
        block_parser = TextBlockParser()
        standalone_dir = template_dir

        soup = BeautifulSoup(text)

        for match in soup.findAll(text=re.compile(URL_RE)):
            # Leave URLs that are already inside an <a> tag alone.
            if self.inside_a(match):
                continue

            active_dir = standalone_dir if self.is_standalone(match) else 'inline'

            embed_html = block_parser.parse(str(match), maxwidth, maxheight,
                                            active_dir, context,
                                            urlize_all_links)
            # Re-parse the rendered markup so it is spliced in as real
            # nodes rather than escaped text.
            match.replaceWith(BeautifulSoup(embed_html))

        return unicode(soup)
Example #8
0
class TextBlockParserTestCase(BaseOEmbedTestCase):
    """Exercise TextBlockParser over plain-text inputs."""

    def setUp(self):
        self.parser = TextBlockParser()
        super(TextBlockParserTestCase, self).setUp()

    def test_basic_handling(self):
        # A URL on its own is replaced by its embed markup.
        result = self.parser.parse(self.category_url)
        self.assertEqual(self.category_embed, result)

    def test_inline_link_handling(self):
        source = 'Testing %s' % self.category_url
        expected = 'Testing %s' % self.category_embed
        self.assertEqual(expected, self.parser.parse(source))

    def test_block_handling(self):
        source = 'Testing %(url)s\n%(url)s' % ({'url': self.category_url})
        expected = 'Testing %(embed)s\n%(embed)s' % ({'embed': self.category_embed})
        self.assertEqual(expected, self.parser.parse(source))

    def test_urlization(self):
        test_string = 'Testing http://www.google.com'
        # Without urlization the text passes through unchanged.
        self.assertEqual(test_string,
                         self.parser.parse(test_string, urlize_all_links=False))
        # With urlization the bare URL becomes an anchor tag.
        self.assertEqual(
            'Testing <a href="http://www.google.com">http://www.google.com</a>',
            self.parser.parse(test_string, urlize_all_links=True))

    def test_extraction(self):
        found = self.parser.extract_urls('Testing %s wha?' % self.category_url)
        self.assertEqual(set([self.category_url]), found)
Example #9
0
class TextBlockParserTestCase(BaseOEmbedTestCase):
    """Exercise TextBlockParser, including ordered URL extraction."""

    def setUp(self):
        self.parser = TextBlockParser()
        super(TextBlockParserTestCase, self).setUp()

    def test_basic_handling(self):
        self.assertEqual(self.parser.parse(self.category_url),
                         self.category_embed)

    def test_inline_link_handling(self):
        self.assertEqual(self.parser.parse('Testing %s' % self.category_url),
                         'Testing %s' % self.category_embed)

    def test_block_handling(self):
        rendered = self.parser.parse(
            'Testing %(url)s\n%(url)s' % ({'url': self.category_url}))
        self.assertEqual(
            rendered,
            'Testing %(embed)s\n%(embed)s' % ({'embed': self.category_embed}))

    def test_urlization(self):
        test_string = 'Testing http://www.google.com'
        # urlize_all_links=False leaves non-embeddable URLs untouched.
        self.assertEqual(self.parser.parse(test_string, urlize_all_links=False),
                         test_string)
        # urlize_all_links=True wraps the bare URL in an anchor.
        self.assertEqual(
            self.parser.parse(test_string, urlize_all_links=True),
            'Testing <a href="http://www.google.com">http://www.google.com</a>')

    def test_extraction(self):
        # Here extract_urls() returns a list -- ordering matters.
        self.assertEqual(
            self.parser.extract_urls('Testing %s wha?' % self.category_url),
            [self.category_url])

    def test_extraction_ordering(self):
        # Duplicates collapse; first-appearance order is preserved.
        sample = '''
            %s %s %s
            %s
        ''' % (self.category_url, self.blog_url, self.category_url,
               self.rich_url)

        self.assertEqual(self.parser.extract_urls(sample), [
            self.category_url,
            self.blog_url,
            self.rich_url,
        ])
Example #10
0
 def setUp(self):
     """Create a fresh TextBlockParser before each test."""
     self.parser = TextBlockParser()
     # Defer to the base class afterwards -- presumably it sets up shared
     # oEmbed fixtures; defined elsewhere, so confirm there.
     super(TextBlockParserTestCase, self).setUp()