Example #1
    def parse(self, url=None):
        url = url or self.url
        # Read the raw response body so the encoding detector (presumably
        # chardet.detect) receives bytes rather than a file-like object.
        html = self.opener.open(url).read()

        detected = detect(html)
        if detected['confidence'] > 0.5:
            encoding = detected['encoding']
            if encoding not in ('ascii', 'utf-8'):
                # Normalise everything to UTF-8 before extraction.
                html = html.decode(encoding).encode('utf-8')

        base_url = host_for_url(url)
        if base_url is not None:
            base_url = 'http://%s' % base_url
        extractor = Extractor(html, base_url=base_url)

        title = extractor.title()
        # Collect the href of every anchor in the extracted main content.
        links = [
            node['href']
            for node in extractor.content().find_all('a', href=True)
        ]

        if self.store:
            if self.extract:
                # Store only the extracted article body, not the full page.
                html = extractor.extract()

            try:
                # Refresh the stored document for this URL if it already
                # exists, otherwise create a new one below.
                doc = GenericDocument.objects.get(url=url)
                doc.title = title
                doc.content = html
                doc.save()
            except DoesNotExist:
                doc = GenericDocument(title=title, content=html, url=url)
                doc.save()

        return links
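
For context, here is a minimal sketch of a harness that could drive the parse() method above. The class name GenericCrawler, its constructor, and the urllib.request opener are assumptions for illustration; they are not part of the original example.

import urllib.request

class GenericCrawler(object):
    """Hypothetical owner of parse(); attribute names match what parse() expects."""

    def __init__(self, url, store=False, extract=False):
        self.url = url                               # default URL for parse()
        self.opener = urllib.request.build_opener()  # provides .open(url)
        self.store = store      # persist a GenericDocument when True
        self.extract = extract  # store only the extracted article body

    # the parse() method from Example #1 would be defined here

crawler = GenericCrawler('http://example.com/article')
# links = crawler.parse()  # would return the hrefs found in the extracted content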
Example #2
    def testExtractor(self):
        extractor = Extractor(self.html, self.base_url)
        content = extractor.extract()
        self.assertGreater(len(content), 0)
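
A minimal sketch of the fixture this test would need, assuming a standard unittest.TestCase whose setUp provides self.html and self.base_url. The fixture values are hypothetical, and Extractor is the same (unspecified) extraction class used in Example #1.

import unittest

class ExtractorTest(unittest.TestCase):
    def setUp(self):
        # Hypothetical fixture values; a real test would likely load a
        # saved HTML page from disk.
        self.base_url = 'http://example.com'
        self.html = '<html><body><p>Some article text.</p></body></html>'

    def testExtractor(self):
        extractor = Extractor(self.html, self.base_url)
        content = extractor.extract()
        self.assertGreater(len(content), 0)

if __name__ == '__main__':
    unittest.main()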