def parse(self, url=None):
    """Fetch *url* (defaulting to ``self.url``), extract title and main
    content, and return the list of ``href`` values of all anchors found
    in the extracted content.

    When ``self.store`` is truthy the page is persisted as a
    ``GenericDocument`` keyed by URL (existing documents are updated,
    otherwise a new one is saved). When ``self.extract`` is also truthy,
    only the extracted main content is stored instead of the full page.
    """
    url = url or self.url
    html = self.opener.open(url)
    # Normalize confidently-detected legacy encodings to UTF-8 so the
    # extractor always sees consistent input. This mirrors the charset
    # handling of the sibling parse() implementations in this module;
    # detect() is presumably chardet's detector — it returns a dict with
    # 'encoding' and 'confidence' keys.
    detecting = detect(html)
    if detecting['confidence'] > 0.5:
        encoding = detecting['encoding']
        # encoding can be None and detector names are mixed-case, so
        # guard and compare case-insensitively.
        if encoding and encoding.lower() not in ('ascii', 'utf-8'):
            # errors='replace' keeps a best-effort scrape alive when the
            # detector guessed the wrong charset.
            html = html.decode(encoding, 'replace').encode('utf-8')
    base_url = host_for_url(url)
    if base_url is not None:
        base_url = "http://%s" % base_url
    extractor = Extractor(html, base_url=base_url)
    title = extractor.title()
    links = [node["href"] for node in extractor.content().find_all("a", href=True)]
    if self.store:
        if self.extract:
            html = extractor.extract()
        try:
            doc = GenericDocument.objects.get(url=url)
            doc.title = title
            doc.content = html
            # NOTE(review): update(upsert=True) with no field kwargs may
            # not flush the attribute assignments above in some ODMs
            # (e.g. mongoengine expects update(set__title=...)) — confirm.
            doc.update(upsert=True)
        except DoesNotExist:
            doc = GenericDocument(title=title, content=html, url=url)
            doc.save()
    return links
def parse(self, url=None):
    """Fetch *url* (defaulting to ``self.url``), normalize its encoding
    to UTF-8, extract title and main content, and return the list of
    ``href`` values of all anchors in the extracted content.

    When ``self.store`` is truthy the page is persisted as a
    ``GenericDocument`` keyed by URL (updated if it exists, otherwise
    newly saved). When ``self.extract`` is also truthy, only the
    extracted main content is stored instead of the full page.
    """
    url = url or self.url
    html = self.opener.open(url)
    # detect() (presumably chardet) returns {'encoding': ..., 'confidence': ...}.
    detecting = detect(html)
    if detecting['confidence'] > 0.5:
        encoding = detecting['encoding']
        # Guard against a None encoding and compare case-insensitively —
        # detectors report mixed-case names such as 'Windows-1252'.
        if encoding and encoding.lower() not in ('ascii', 'utf-8'):
            # errors='replace' keeps a best-effort scrape alive when the
            # detector guessed the wrong charset.
            html = html.decode(encoding, 'replace').encode('utf-8')
    base_url = host_for_url(url)
    if base_url is not None:
        base_url = 'http://%s' % base_url
    extractor = Extractor(html, base_url=base_url)
    title = extractor.title()
    links = [node['href'] for node in extractor.content().find_all('a', href=True)]
    if self.store:
        if self.extract:
            html = extractor.extract()
        try:
            doc = GenericDocument.objects.get(url=url)
            doc.title = title
            doc.content = html
            # NOTE(review): update(upsert=True) with no field kwargs may
            # not flush the attribute assignments above in some ODMs —
            # confirm against the document layer in use.
            doc.update(upsert=True)
        except DoesNotExist:
            doc = GenericDocument(title=title, content=html, url=url)
            doc.save()
    return links
def parse(self, url=None):
    """Parse the page at *url* (default ``self.url``) and return the
    hrefs of all anchors found in its extracted main content.

    The raw page bytes are transcoded to UTF-8 when a non-UTF-8
    charset is detected with sufficient confidence. With ``self.store``
    set, the result is upserted as a ``GenericDocument`` by URL; with
    ``self.extract`` also set, only the extracted content is stored.
    """
    url = url or self.url
    html = self.opener.open(url)
    guess = detect(html)  # presumably chardet: {'encoding', 'confidence'}
    if guess['confidence'] > 0.5:
        enc = guess['encoding']
        # enc may be None, and detector names are mixed-case
        # ('Windows-1252', 'UTF-8-SIG'), so guard and lowercase first.
        if enc and enc.lower() not in ('ascii', 'utf-8'):
            # 'replace' avoids aborting the scrape on a wrong guess.
            html = html.decode(enc, 'replace').encode('utf-8')
    base_url = host_for_url(url)
    if base_url is not None:
        base_url = 'http://%s' % base_url
    extractor = Extractor(html, base_url=base_url)
    title = extractor.title()
    links = [
        node['href']
        for node in extractor.content().find_all('a', href=True)
    ]
    if self.store:
        if self.extract:
            html = extractor.extract()
        try:
            doc = GenericDocument.objects.get(url=url)
            doc.title = title
            doc.content = html
            # NOTE(review): update(upsert=True) without field kwargs may
            # not persist the assignments above in some ODMs — confirm.
            doc.update(upsert=True)
        except DoesNotExist:
            doc = GenericDocument(title=title, content=html, url=url)
            doc.save()
    return links
def testExtractor(self):
    """Smoke test: Extractor.extract() yields non-empty content for the fixture HTML."""
    extracted = Extractor(self.html, self.base_url).extract()
    self.assertGreater(len(extracted), 0)