def test_replace_wrong_encoding(self):
    """Invalid byte sequences must be replaced, never break surrounding text."""
    encoding, body_unicode = html_to_unicode(ct('utf-8'), 'PREFIX\xe3\xabSUFFIX')
    # XXX: Policy for replacing invalid chars may suffer minor variations
    # but it should always contain the unicode replacement char (u'\ufffd')
    for fragment in (u'\ufffd', u'PREFIX', u'SUFFIX'):
        assert fragment in body_unicode, repr(body_unicode)

    # Do not destroy html tags due to encoding bugs
    encoding, body_unicode = html_to_unicode(ct('utf-8'), '\xf0<span>value</span>')
    assert u'<span>value</span>' in body_unicode, repr(body_unicode)
def test_gunzip_illegal_eof(self):
    """A gzip stream with an unexpected EOF still decodes to the known-good text."""
    # Load the expected output first; the two reads are independent.
    with open(join(SAMPLEDIR, "unexpected-eof-output.txt"), "rb") as expected_file:
        expected_text = expected_file.read().decode("utf-8")
    with open(join(SAMPLEDIR, "unexpected-eof.gz"), "rb") as gz_file:
        text = html_to_unicode("charset=cp1252", gunzip(gz_file.read()))[1]
    # Length check first gives a clearer failure on truncation.
    self.assertEqual(len(text), len(expected_text))
    self.assertEqual(text, expected_text)
def _assert_encoding(self, content_type, body, expected_encoding, expected_unicode):
    """Decode *body* and assert both the detected encoding and the decoded text."""
    detected, text = html_to_unicode(ct(content_type), body)
    self.assertTrue(isinstance(text, unicode))
    self.assertEqual(norm_encoding(detected), norm_encoding(expected_encoding))
    self.assertEqual(text, expected_unicode)
def test_unicode_body(self):
    """cp1251-encoded bytes round-trip back to the original unicode text."""
    cyrillic_text = u'\u043a\u0438\u0440\u0438\u043b\u043b\u0438\u0447\u0435\u0441\u043a\u0438\u0439 \u0442\u0435\u043a\u0441\u0442'
    encoded_body = cyrillic_text.encode('cp1251')
    detected, decoded = html_to_unicode(ct('cp1251'), encoded_body)
    # check body_as_unicode
    self.assertTrue(isinstance(decoded, unicode))
    self.assertEqual(decoded, cyrillic_text)
def extract(self, html='', **kwargs): """ extract data field from raw html or from a url. """ if not html and 'url' in kwargs: info = urlopen(kwargs.pop('url')) _, html = html_to_unicode(info.headers.get('content_type'), info.read()) builder = DomTreeBuilder(html) root = builder.build() region_finder = MiningDataRegion(root, self.k, self.threshold) regions = region_finder.find_regions(root) record_finder = MiningDataRecord(self.threshold) field_finder = MiningDataField() for region in regions: records = record_finder.find_records(region) items, _ = field_finder.align_records(records) region.items = items if 'verbose' in kwargs: print region for record in records: print '\t', record return regions
def url_to_page(url, encoding=None, default_encoding='utf-8'):
    """Fetch a URL, using python urllib2, and return an HtmlPage object.

    The `url` may be a string, or a `urllib2.Request` object. The `encoding`
    argument can be used to force the interpretation of the page encoding.

    Redirects are followed, and the `url` property of the returned HtmlPage object
    is the url of the final page redirected to.

    If the encoding of the page is known, it can be passed as a keyword argument. If
    unspecified, the encoding is guessed using `w3lib.encoding.html_to_unicode`.
    `default_encoding` is used if the encoding cannot be determined.
    """
    fh = urlopen(url)
    info = fh.info()
    body_str = fh.read()

    if encoding is not None:
        # Caller forced a specific interpretation of the bytes.
        return HtmlPage(fh.geturl(), headers=dict(info.items()),
                        body=body_str.decode(encoding), encoding=encoding)

    # Guess the content encoding from the headers and the body itself.
    try:
        # Python 3.x
        content_type_header = fh.getheader("content-type")
    except AttributeError:
        # Python 2.x
        content_type_header = info.getheader("content-type")
    encoding, body = html_to_unicode(content_type_header, body_str,
                                     default_encoding=default_encoding)
    return HtmlPage(fh.geturl(), headers=dict(info.items()),
                    body=body, encoding=encoding)
def factory(self, data, parser_cls, url):
    """Decode *data* (assumed utf-8) and parse it leniently with *parser_cls*."""
    _, text = html_to_unicode('charset=utf-8', data)
    # Fall back to a minimal document when the decoded body is empty.
    body = text.encode('utf8') or '<html/>'
    lenient_parser = parser_cls(recover=True, encoding='utf8')
    return etree.fromstring(body, parser=lenient_parser, base_url=url)
def body_as_unicode(self):
    """Return body as unicode."""
    # Touch self.encoding before _cached_ubody: reading it may trigger
    # _body_inferred_encoding, which can populate the cache.
    benc = self.encoding
    if self._cached_ubody is None:
        self._cached_ubody = html_to_unicode('charset=%s' % benc, self.body)[1]
    return self._cached_ubody
def text(self):
    """Body as unicode."""
    # Read self.encoding first so _body_inferred_encoding runs and may
    # populate _cached_ubody as a side effect.
    benc = self.encoding
    if self._cached_ubody is None:
        self._cached_ubody = html_to_unicode('charset=%s' % benc, self.body)[1]
    return self._cached_ubody
def _body_inferred_encoding(self):
    """Detect the body encoding once, caching both it and the decoded body."""
    if self._cached_benc is None:
        content_type = to_native_str(self.headers.get(b'Content-Type', b''))
        self._cached_benc, self._cached_ubody = html_to_unicode(
            content_type,
            self.body,
            auto_detect_fun=self._auto_detect_fun,
            default_encoding=self._DEFAULT_ENCODING,
        )
    return self._cached_benc
def body_as_unicode(self):
    """Return body as unicode, decoding once and caching the result.

    Fixes: the string literal was placed after the import statement, so it
    was never a docstring; three of the imported names were unused; and
    `_cached_ubody` was assigned but never consulted, so the body was
    re-decoded on every call.
    """
    from w3lib.encoding import html_to_unicode

    # Access self.encoding before _cached_ubody: reading it may trigger
    # _body_inferred_encoding, which can fill the cache.
    benc = self.encoding
    if self._cached_ubody is None:
        self._cached_ubody = html_to_unicode('charset=%s' % benc, self.content)[1]
    return self._cached_ubody
def response2unicode(resp):
    """
    Convert requests.Response body to unicode.
    Unlike ``response.text`` it handles <meta> tags in response content.
    """
    _, body = html_to_unicode(
        content_type_header=resp.headers.get("Content-Type"),
        html_body_str=resp.content,
        auto_detect_fun=_autodetect_encoding,
    )
    return body
def encoding(self) -> str:
    """The encoding string to be used, extracted from the HTML and
    :class:`HTMLResponse <HTMLResponse>` headers.
    """
    if self._encoding:
        return self._encoding

    # Scan meta tags for charset.
    if self._html:
        self._encoding = html_to_unicode(self.default_encoding, self._html)[0]

    return self._encoding or self.default_encoding
def infer(self, html='', **kwargs):
    """Extract data with seed region and the data you expect to scrape from there.

    Keyword arguments:
        url -- fetch the page from this url instead of using `html`
    """
    if 'url' in kwargs:
        info = urlopen(kwargs.pop('url'))
        # BUGFIX: HTTP header names use a hyphen. 'content_type' never
        # matches, so the server charset hint was silently ignored.
        _, html = html_to_unicode(info.headers.get('content-type'), info.read())
    builder = DomTreeBuilder(html)
    doc = builder.build()
    page = HtmlPage(body=tostring(doc, encoding=unicode, method='html'))
    return self.scraper.scrape_page(page)
def _assert_encoding(self, content_type, body, expected_encoding, expected_unicode):
    """Decode *body*; check detected encoding and that the text matches.

    *expected_unicode* may be a single string or a collection of
    acceptable decodings (any one of which passes).
    """
    detected, text = html_to_unicode(ct(content_type), body)
    self.assertTrue(isinstance(text, unicode))
    self.assertEqual(norm_encoding(detected), norm_encoding(expected_encoding))
    if isinstance(expected_unicode, basestring):
        self.assertEqual(text, expected_unicode)
    else:
        self.assertTrue(
            text in expected_unicode,
            "%s is not in %s" % (text, expected_unicode),
        )
def extract(self, html="", **kwargs): """ extract data regions from raw html or from a url. """ if "url" in kwargs: info = urlopen(kwargs.pop("url")) _, html = encoding.html_to_unicode(info.headers.get("content_type"), info.read()) builder = DomTreeBuilder(html) root = builder.build() mining_region = MiningDataRegion(root, self.k, self.threshold) regions = mining_region.find_regions(root) mining_record = MiningDataRecord() mining_field = MiningDataField() region_records = {} all_items = [] for i, region in enumerate(regions): records = mining_record.find_records(region) items, _ = mining_field.align_records(records) all_items.extend(items) assert len(items) == len(records) region_records.update({region: records}) if "verbose" in kwargs: print region for record in records: print "\t", record # always annotate at last to avoid modify the DOM tree if "annotate" in kwargs: for i, region in enumerate(regions): for j, record in enumerate(region_records.get(region)): self.annotate(i, j, record.elements) with open(kwargs.pop("annotate"), "w") as f: print >> f, tostring(root, pretty_print=True) return all_items
def _decode_bytes(body, content_type='', default_encoding='utf-8'):
    """Decode *body* to unicode; return a ``(encoding, unicode_string)`` pair."""
    # html_to_unicode already returns the (encoding, text) tuple we need.
    return html_to_unicode(
        content_type_header=content_type,
        html_body_str=body,
        default_encoding=default_encoding,
        auto_detect_fun=_detect_encoding,
    )
with open(kwargs.pop("annotate"), "w") as f: print >> f, tostring(root, pretty_print=True) return all_items def annotate(self, region, record, elements): """ annotate the HTML elements with PyQuery. """ colors = ["#ffff42", "#ff0000", "#00ff00", "#ff00ff"] p = pq(elements[0]) div = p.wrap( '<div class="mdr_region" region_id={} record_id={} style="color:{}; border:solid 5px"></div>'.format( region, record, choice(colors) ) ) for e in elements[1:]: div.append(e) if __name__ == "__main__": import sys info = urlopen(sys.argv[1]) _, html = encoding.html_to_unicode(info.headers.get("content_type"), info.read()) depta = Depta() items = depta.extract(html, annotate="output.html", verbose=True) for i, item in enumerate(items): print i, " | ".join(map(lambda x: x.text, item.fields))
def _assert_encoding_detected(self, content_type, expected_encoding, body, **kwargs):
    """Assert only the detected encoding; the decoded text is not compared."""
    assert not isinstance(body, six.text_type)
    detected, text = html_to_unicode(ct(content_type), body, **kwargs)
    self.assertTrue(isinstance(text, six.text_type))
    self.assertEqual(norm_encoding(detected), norm_encoding(expected_encoding))
def _assert_encoding_detected(self, content_type, expected_encoding, body, **kwargs):
    """Assert only the detected encoding; the decoded text is not compared."""
    detected, text = html_to_unicode(ct(content_type), body, **kwargs)
    self.assertTrue(isinstance(text, unicode))
    self.assertEqual(norm_encoding(detected), norm_encoding(expected_encoding))
for j, record in enumerate(region_records.get(region)): self.annotate(i, j, record.elements) with open(kwargs.pop('annotate'), 'w') as f: print >> f, tostring(root, pretty_print=True) return all_items def annotate(self, region, record, elements): """ annotate the HTML elements with PyQuery. """ colors = ['#ffff42', '#ff0000', '#00ff00', '#ff00ff'] p = pq(elements[0]) div = p.wrap( '<div class="mdr_region" region_id={} record_id={} style="color:{}; border:solid 5px"></div>' .format(region, record, choice(colors))) for e in elements[1:]: div.append(e) if __name__ == '__main__': import sys info = urlopen(sys.argv[1]) _, html = encoding.html_to_unicode(info.headers.get('content_type'), info.read()) depta = Depta() items = depta.extract(html, annotate='output.html', verbose=True) for i, item in enumerate(items): print i, ' | '.join(map(lambda x: x.text, item.fields))
safe_attrs_only=False ) parser = HTMLParser(encoding=encoding) html = lxml.html.document_fromstring(html, parser=parser) doc = cleaner.clean_html(html) return lxml.etree.tounicode(doc) def mkdir(path): try: os.makedirs(path) except OSError: pass if __name__ == '__main__': args = docopt(__doc__) mkdir(args['--out']) for in_name in args['<input>']: path, fname = os.path.split(in_name) out_name = os.path.join(args['--out'], fname) with open(in_name, 'rb') as f: encoding, html = html_to_unicode(None, f.read()) cleaned = clean_html(html.encode(encoding), encoding) with codecs.open(out_name, 'w', encoding='utf8') as out: out.write(cleaned)
meta=False, safe_attrs_only=False) parser = HTMLParser(encoding=encoding) html = lxml.html.document_fromstring(html, parser=parser) doc = cleaner.clean_html(html) return lxml.etree.tounicode(doc) def mkdir(path): try: os.makedirs(path) except OSError: pass if __name__ == '__main__': args = docopt(__doc__) mkdir(args['--out']) for in_name in args['<input>']: path, fname = os.path.split(in_name) out_name = os.path.join(args['--out'], fname) with open(in_name, 'rb') as f: encoding, html = html_to_unicode(None, f.read()) cleaned = clean_html(html.encode(encoding), encoding) with codecs.open(out_name, 'w', encoding='utf8') as out: out.write(cleaned)
def _decoding(self):
    """Decode ``self.body`` using the current encoding hint, storing the
    decoded text and the (possibly corrected) encoding reported back."""
    detected, decoded = html_to_unicode(f'charset={self._encoding}', self.body)
    self._encoding = detected
    self._text = decoded