def __init__(self, html):
    """Create a parse tree from the given HTML."""
    def really_parse_fragment(parser, html):
        """Parse a possibly multi-rooted HTML fragment, wrapping it in a
        <div> to make it easy to query later.

        As far as I can tell, this is what parseFragment is supposed to do
        (but doesn't). See
        http://code.google.com/p/html5lib/issues/detail?id=161.
        """
        top_level_elements = parser.parseFragment(html)
        container = Element(self.CONTAINER_TAG)

        # Why lxml couldn't just have text nodes, I'll never understand.
        # Text nodes that come other than first are automatically stuffed
        # into the tail attrs of the preceding elements by html5lib.
        if top_level_elements and isinstance(top_level_elements[0], basestring):
            container.text = top_level_elements.pop(0)

        container.extend(top_level_elements)
        return container

    p = HTMLParser(tree=getTreeBuilder(self.TREEBUILDER))
    self._root = really_parse_fragment(p, html)
def wiki_string_to_tiddlers(content):
    """
    Turn a string that is a TiddlyWiki into individual tiddlers.
    """
    parser = HTMLParser(tree=treebuilders.getTreeBuilder('dom'))
    doc = parser.parse(content)
    # minidom will not provide working getElementById without
    # first having a valid document, which means some very specific
    # doctype hooey. So we traverse.
    body = doc.getElementsByTagName('body')[0]
    body_divs = body.getElementsByTagName('div')
    is_wiki = False
    for div in body_divs:
        if div.hasAttribute('id') and div.getAttribute('id') == 'storeArea':
            divs = div.getElementsByTagName('div')
            is_wiki = True
            break

    if is_wiki:
        tiddlers = []
        for tiddler_div in divs:
            tiddlers.append(_get_tiddler_from_div(tiddler_div))
        return tiddlers
    else:
        raise ValueError('content not a tiddlywiki 2.x')
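# Usage sketch for wiki_string_to_tiddlers (hypothetical input, not from the
# original module). A TiddlyWiki 2.x file keeps one <div> per tiddler inside
# <div id="storeArea">; anything else raises ValueError. The shape of the
# returned items depends on _get_tiddler_from_div, which is defined elsewhere.
#
#   WIKI = """<html><body>
#   <div id="storeArea">
#     <div title="HelloWorld" modifier="someone"><pre>Hi there</pre></div>
#   </div>
#   </body></html>"""
#
#   tiddlers = wiki_string_to_tiddlers(WIKI)   # one tiddler per inner div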
def extract_text_from_html(html: str, skip_tags: Optional[List[str]] = None) -> str:
    """Extract plain text content from the elements inside an HTML string."""

    def extract_text(element: Element, skip_tags: List[str]) -> Iterator[str]:
        """Extract text recursively from elements, optionally skipping some tags.

        This function is Python's xml.etree.ElementTree.Element.itertext() but
        with the added ability to skip over particular tags and not include the
        text from inside them or any of their children.
        """
        if not isinstance(element.tag, str) and element.tag is not None:
            return
        if element.tag in skip_tags:
            return
        if element.text:
            yield element.text
        for subelement in element:
            yield from extract_text(subelement, skip_tags)
            if subelement.tail:
                yield subelement.tail

    skip_tags = skip_tags or []
    html_tree = HTMLParser(namespaceHTMLElements=False).parseFragment(html)
    # extract the text from all of the HTML elements
    extracted_text = "".join(extract_text(html_tree, skip_tags))
    # sanitize unicode, remove leading/trailing whitespace, etc.
    return simplify_string(extracted_text)
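# Usage sketch for extract_text_from_html (assumes the Optional/List/Iterator
# imports, html5lib's HTMLParser, and the simplify_string helper used above are
# available in this module):
#
#   text = extract_text_from_html(
#       "<p>Hello <script>var x = 1;</script><b>world</b></p>",
#       skip_tags=["script"],
#   )
#   # expected to yield roughly "Hello world"; the script contents are skipped
#   # and simplify_string normalizes whitespace.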
def html_parser(html):
    try:
        soup = BeautifulSoup(html)
    except:
        parser = HTMLParser(tree=treebuilders.getTreeBuilder("beautifulsoup"))
        soup = parser.parse(html)
    return soup
def summary_scrape(urn):
    print " - summary"
    url = "http://www.edubase.gov.uk/establishment/summary.xhtml?urn=" + urn
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("lxml"))
    page = parser.parse(scrape(url))
    keyvaluepairs = table_extract(page)

    raw_address = [x.strip() for x in keyvaluepairs.pop("").split(" / ")]
    if postcode.match(raw_address[-1]):
        keyvaluepairs["Postcode"] = raw_address[-1]
        raw_address = raw_address[:-1]
    keyvaluepairs["Address"] = " / ".join(raw_address)

    for t in page.findall(path(["body", "div", "div", "div", "div", "table",
                                "tbody", "tr", "td", "h1"], pre)):
        x = t.text.split(": ")
        keyvaluepairs[x[0]] = x[1]

    for t in page.findall(path(["body", "div", "div", "div", "div", "table",
                                "tbody", "tr", "td", "div", "p", "b"], pre)):
        keyvaluepairs[t.text.strip().strip(":")] = (t.tail or "").strip()

    return keyvaluepairs
def clean_html(input, sanitize=False):
    """
    Takes an HTML fragment and processes it using html5lib to ensure that the HTML
    is well-formed.

    :param sanitize: Remove unwanted HTML tags and attributes.

    >>> clean_html("<p>Foo<b>bar</b></p>")
    u'<p>Foo<b>bar</b></p>'
    >>> clean_html("<p>Foo<b>bar</b><i>Ooops!</p>")
    u'<p>Foo<b>bar</b><i>Ooops!</i></p>'
    >>> clean_html('<p>Foo<b>bar</b>& oops<a href="#foo&bar">This is a <>link</a></p>')
    u'<p>Foo<b>bar</b>& oops<a href=#foo&bar>This is a <>link</a></p>'
    """
    parser_kwargs = {}
    serializer_kwargs = {}
    if sanitize:
        if HTMLSanitizer is None:
            # new syntax as of 0.99999999/1.0b9 (Released on July 14, 2016)
            serializer_kwargs['sanitize'] = True
        else:
            parser_kwargs['tokenizer'] = HTMLSanitizer

    p = HTMLParser(tree=treebuilders.getTreeBuilder("dom"), **parser_kwargs)
    dom_tree = p.parseFragment(input)
    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)

    s = HTMLSerializer(omit_optional_tags=False, **serializer_kwargs)
    return "".join(s.serialize(stream))
def runParserEncodingTest(data, encoding):
    p = HTMLParser()
    assert p.documentEncoding is None
    p.parse(data, useChardet=False)
    encoding = encoding.lower().decode('ascii')

    assert encoding == p.documentEncoding, errorMessage(data, encoding, p.documentEncoding)
def runParserEncodingTest(data, encoding):
    p = HTMLParser()
    p.parse(data, useChardet=False)
    encoding = encoding.lower().decode("ascii")

    assert encoding == p.tokenizer.stream.charEncoding[0], errorMessage(
        data, encoding, p.tokenizer.stream.charEncoding[0])
def read_hcard(url):
    try:
        f = urlopen(url)
        content_type = f.info().getheader('content-type', 'text/html')
        value, params = cgi.parse_header(content_type)
        charset = params.get('charset', 'utf-8').replace("'", '')
        dom = HTMLParser().parse(urlopen(url).read(512 * 1024).decode(charset, 'ignore'))
    except IOError:
        return

    def _find(node, class_name):
        for child in (c for c in node if c.type == 5):
            if re.search(r'\b%s\b' % class_name, child.attributes.get('class', '')):
                return child

    vcard = _find(dom, 'vcard')
    if vcard is None:
        return

    def _parse_property(class_name):
        el = _find(vcard, class_name)
        if el is None:
            return
        if el.name == 'abbr' and 'title' in el.attributes:
            result = el.attributes['title']
        else:
            result = u''.join(s.value for s in el if s.type == 4)
        return result.replace(u'\n', u' ').strip()

    return {
        'nickname': _parse_property('nickname') or _parse_property('fn') or '',
    }
def extract_html_urls(self, html):
    """
    Take all ``<img src="..">`` from the HTML
    """
    p = HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
    dom = p.parse(html)
    urls = []

    for img in dom.getElementsByTagName("img"):
        src = img.getAttribute("src")
        if src:
            urls.append(unquote_utf8(src))

        srcset = img.getAttribute("srcset")
        if srcset:
            urls += self.extract_srcset(srcset)

    for source in dom.getElementsByTagName("source"):
        srcset = source.getAttribute("srcset")
        if srcset:
            urls += self.extract_srcset(srcset)

    for source in dom.getElementsByTagName("a"):
        href = source.getAttribute("href")
        if href:
            urls.append(unquote_utf8(href))

    return urls
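# The extract_srcset helper called above is defined elsewhere on the same class.
# A minimal standalone sketch of what such a helper might do (an assumption, not
# the original implementation; reuses the unquote_utf8 helper from above): split
# the srcset value into candidates and keep only the URL part of each
# "URL [descriptor]" pair.
def _extract_srcset_sketch(srcset):
    urls = []
    for candidate in srcset.split(","):
        candidate = candidate.strip()
        if candidate:
            # e.g. "thumb-2x.png 2x" -> "thumb-2x.png"
            urls.append(unquote_utf8(candidate.split()[0]))
    return urls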
def test_parser_encoding(data, encoding):
    p = HTMLParser()
    assert p.documentEncoding is None
    p.parse(data, useChardet=False)
    encoding = encoding.lower().decode("ascii")

    assert encoding == p.documentEncoding, errorMessage(data, encoding, p.documentEncoding)
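# Hypothetical invocation of the encoding test above (the real parameters come
# from html5lib's encoding test fixtures; both arguments are bytes, and the
# expected encoding is compared after .lower().decode("ascii")):
#
#   test_parser_encoding(b"<!DOCTYPE html><meta charset=utf-8>x", b"UTF-8")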
def do_year(y, url):
    pagetext = urllib2.urlopen(url)
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("lxml"),
                        tokenizer=sanitizer.HTMLSanitizer)
    page = parser.parse(pagetext)
    for section in page.findall("body/div/div/div/div/div/div/div/div/table[@class='fixture']"):
        matchtype = section.find("caption").text
        for match in section.findall("tbody/tr"):
            l = list(match.getchildren())
            d = {}
            d["Match type"] = matchtype
            d["Match number"] = l[0].text
            d["Date"] = make_date(l[1].text, y)
            d["Team 1"] = flatten_refs(l[3])
            d["Team 2"] = flatten_refs(l[5])
            a = l[4].find("a")
            d["Score"] = a.text
            d["Report"] = "http://www.fifa.com" + a.get("href")
            print "%d (%s) %s - %s" % (y, d["Match type"], d["Team 1"], d["Team 2"])
            datastore.save(unique_keys=["Date", "Team 1", "Team 2"], data=d)
def get_html_parse_tree(url, data=None, headers={}, treetype='beautifulsoup'):
    "Request a URL, parse with html5lib, and return a parse tree from it"

    req = urllib2.Request(iri_to_uri(url), data, headers)
    f = urllib2.urlopen(req)

    if f.info().gettype() not in ('text/html', 'application/xhtml+xml'):
        f.close()
        raise ContentTypeException("Content type isn't HTML, but " + f.info().gettype())

    data = f.read()
    f.close()

    encoding = None
    contentType = f.headers.get('content-type')
    if contentType:
        (mediaType, params) = cgi.parse_header(contentType)
        encoding = params.get('charset')

    compression = f.headers.get('content-encoding')
    if compression:
        if compression.lower() == "deflate":
            try:
                data = zlib.decompress(data)
            except zlib.error:
                data = zlib.decompress(data, -zlib.MAX_WBITS)
        elif compression.lower() == "gzip":
            compressedstream = StringIO(data)
            gzipper = GzipFile(fileobj=compressedstream)
            data = gzipper.read()

    if treetype == "beautifulsoup":
        return BeautifulSoup(data, convertEntities=BeautifulSoup.HTML_ENTITIES)
    elif treetype == "etree":
        kwargs = {'tree': treebuilders.getTreeBuilder('etree', ElementTree)}
        # http://code.google.com/p/html5lib/issues/detail?id=138
        if 'namespaceHTMLElements' in inspect.getargspec(HTMLParser.__init__)[0]:
            kwargs['namespaceHTMLElements'] = False
        parser = HTMLParser(**kwargs)
    else:
        if treetype == "html5lib-beautifulsoup":
            treetype = "beautifulsoup"
        parser = HTMLParser(tree=treebuilders.getTreeBuilder(treetype))

    return parser.parse(data, encoding=encoding)
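# Usage sketch (hypothetical URL): fetch a page and get either a BeautifulSoup
# object or an ElementTree-based parse tree, depending on the treetype argument.
#
#   soup = get_html_parse_tree("http://example.com/", treetype="beautifulsoup")
#   tree = get_html_parse_tree("http://example.com/", treetype="etree")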
def parse(fname):
    if fname in etree_cache:
        return etree_cache[fname]
    with (fname).open('rb') as fp:
        etree = HTMLParser(namespaceHTMLElements=False).parse(fp)
        etree_cache.clear()
        etree_cache[fname] = etree
        return etree
def test_parser_args(expected, data, kwargs):
    stream = _inputstream.HTMLBinaryInputStream(data, useChardet=False, **kwargs)
    assert expected == stream.charEncoding[0].name
    p = HTMLParser()
    p.parse(data, useChardet=False, **kwargs)
    assert expected == p.documentEncoding
def test_productionlist(app, status, warning):
    app.builder.build_all()

    warnings = warning.getvalue().split("\n")
    assert len(warnings) == 2
    assert warnings[-1] == ''
    assert "Dup2.rst:4: WARNING: duplicate token description of Dup, other instance in Dup1" in warnings[0]

    with (app.outdir / 'index.html').open('rb') as f:
        etree = HTMLParser(namespaceHTMLElements=False).parse(f)
    ul = list(etree.iter('ul'))[1]
    cases = []
    for li in list(ul):
        assert len(list(li)) == 1
        p = list(li)[0]
        assert p.tag == 'p'
        text = str(p.text).strip(' :')
        assert len(list(p)) == 1
        a = list(p)[0]
        assert a.tag == 'a'
        link = a.get('href')
        assert len(list(a)) == 1
        code = list(a)[0]
        assert code.tag == 'code'
        assert len(list(code)) == 1
        span = list(code)[0]
        assert span.tag == 'span'
        linkText = span.text.strip()
        cases.append((text, link, linkText))
    assert cases == [
        ('A', 'Bare.html#grammar-token-A', 'A'),
        ('B', 'Bare.html#grammar-token-B', 'B'),
        ('P1:A', 'P1.html#grammar-token-P1-A', 'P1:A'),
        ('P1:B', 'P1.html#grammar-token-P1-B', 'P1:B'),
        ('P2:A', 'P1.html#grammar-token-P1-A', 'P1:A'),
        ('P2:B', 'P2.html#grammar-token-P2-B', 'P2:B'),
        ('Explicit title A, plain', 'Bare.html#grammar-token-A', 'MyTitle'),
        ('Explicit title A, colon', 'Bare.html#grammar-token-A', 'My:Title'),
        ('Explicit title P1:A, plain', 'P1.html#grammar-token-P1-A', 'MyTitle'),
        ('Explicit title P1:A, colon', 'P1.html#grammar-token-P1-A', 'My:Title'),
        ('Tilde A', 'Bare.html#grammar-token-A', 'A'),
        ('Tilde P1:A', 'P1.html#grammar-token-P1-A', 'A'),
        ('Tilde explicit title P1:A', 'P1.html#grammar-token-P1-A', '~MyTitle'),
        ('Tilde, explicit title P1:A', 'P1.html#grammar-token-P1-A', 'MyTitle'),
        ('Dup', 'Dup2.html#grammar-token-Dup', 'Dup'),
        ('FirstLine', 'firstLineRule.html#grammar-token-FirstLine', 'FirstLine'),
        ('SecondLine', 'firstLineRule.html#grammar-token-SecondLine', 'SecondLine'),
    ]

    text = (app.outdir / 'LineContinuation.html').read_text()
    assert "A</strong> ::= B C D E F G" in text
def body_html(self):
    body_html = self.get_part_content(self.mail_pyzmail.html_part)
    if not body_html and self.body_text:
        body_html = self.body_text.replace('\n', '<br />')
    parser = HTMLParser(tokenizer=HTMLSanitizer)
    parser.parse(body_html)
    return body_html
def scrape_pct(link, pct_name):
    """
    Scrapes the data associated with the PCT, and calls functions to
    scrape data associated with the services.
    """
    print
    print
    print pct_name
    print "-" * len(pct_name)
    url = "http://www.nhs.uk" + link
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("lxml"))
    page = parser.parse(scrape(url))
    root = page.getroot()
    d = {}

    # basic contact details
    d["PCT"] = pct_name
    d["type"] = "main"
    d["name"] = pct_name
    address = root.find("body/div/form/div/div/p").text
    d["address"] = address
    postcode = geo.extract_gb_postcode(address)
    d["postcode"] = postcode
    d["latlng"] = geo.gb_postcode_to_latlng(postcode)
    d["info HTML"] = url

    # quality
    for t in root.findall("body/div/form/div/div/div/div/div/div/div[@class='service-feedback clear']"):
        k = t.find("div/h4").text.strip()
        v = t.find("div/img").attrib["alt"]
        d[k] = v

    # head honcho
    for t in root.findall("body/div/form/div/div/div/div/div/div/div/p[@class='profiles-picture-caption']"):
        d["Boss"] = t.text.replace("<br />", ", ")

    # boring text
    for t in root.findall("body/div/form/div/div/div/div/div/div/p"):
        if t.text:
            if t.attrib.get("class", False) == "intro":
                d["intro text"] = t.text
            else:
                d["boilerplate"] = d.get("boilerplate", "") + "\n" + t.text

    datastore.save(unique_keys=["PCT", "type", "name", "address"], data=d,
                   latlng=d.get("latlng"))
    scrape_facilities(pct_name, root)
    scrape_others(pct_name, url)
def scrape_others(pct_name, url):
    types = ["doctor", "dentist", "pharmacy", "optician"]
    for facility_type, i in zip(types, range(2, 6)):
        parser = HTMLParser(tree=treebuilders.getTreeBuilder("lxml"))
        page = parser.parse(scrape(url + "&v=%d" % i))
        root = page.getroot()
        s = root.find("body/div/form/div/div/div/div/div/dl")
        extract_table_data(pct_name, s, facility_type)
def _try_process_source(stream, options):
    """
    Tries to parse input as xhtml, xml (e.g. svg) or html(5),
    modifying options while figuring out the input.

    Returns a DOM tree.
    """
    parse = xml.dom.minidom.parse
    try:
        dom = parse(stream)
        # Try to second-guess the input type.
        # This is _not_ really kosher, but the minidom is not really namespace aware...
        # In practice the goal is to have the system recognize svg content automatically.
        # First see if there is a default namespace defined for the document:
        top = dom.documentElement
        if top.hasAttribute("xmlns"):
            key = (top.getAttribute("xmlns"), top.nodeName)
            if key in _HOST_LANG:
                options.host_language = _HOST_LANG[key]
    except:
        # XML Parsing error in the input
        type, value, traceback = sys.exc_info()
        if options.host_language == GENERIC_XML or options.lax == False:
            raise RDFaError('Parsing error in input file: "%s"' % value)

        msg = "XHTML Parsing error in input file: %s. Falling back on the HTML5 parser" % value
        if options != None and options.warnings:
            options.comment_graph.add_warning(msg)

        # in Ivan's original code he reopened the stream if it was from urllib
        if isinstance(stream, urllib.addinfourl):
            stream = urllib.urlopen(stream.url)

        # Now try to see if an HTML5 parser is an alternative...
        try:
            from html5lib import HTMLParser, treebuilders
        except ImportError:
            # no alternative to the XHTML error, because HTML5 parser not available...
            msg2 = 'XHTML Parsing error in input file: %s. Though parsing is lax, HTML5 parser not available. Try installing html5lib <http://code.google.com/p/html5lib>' % value
            raise RDFaError(msg2)

        parser = HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
        parse = parser.parse
        try:
            dom = parse(stream)
            # The host language has changed
            options.host_language = HTML5_RDFA
        except:
            # Well, even the HTML5 parser could not do anything with this...
            (type, value, traceback) = sys.exc_info()
            msg2 = 'Parsing error in input file as HTML5: "%s"' % value
            msg3 = msg + '\n' + msg2
            raise RDFaError, msg3

    return dom
def get_highest_id(floor=0):
    rssfeed_url = 'http://digitalmedia.fws.gov/cdm4/rss.php'
    html = urllib2.urlopen(rssfeed_url).read()
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("beautifulsoup"))
    soup = parser.parse(html)
    last_item = soup.findAll('item')[-0]
    last_id = last_item.description.contents[0].split('CISOPTR=')[1].split('&')[0]
    last_id = int(last_id)
    return last_id
def get_first_result_index_from_quick_search_results(html):
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("beautifulsoup"))
    soup = parser.parse(html)
    # isolate the table of data on the first result
    block = soup.find('', {'id': 'photoresult'})
    block = block.findAll('', {'class': 'photobox'})[0]
    id = block.find('p').find('a').contents[0]
    id = int(id)
    return id
def get_first_result_index_from_quick_search_results(html):
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("beautifulsoup"))
    soup = parser.parse(html)
    # isolate the table of data on the first result
    block = soup.find(border="0", bgcolor="white")
    id_str = block.find('font').contents[0]  # contents of first <font>
    # this should look like: 'ID#:11901'
    # parse out the actual id and cast as int
    id = int(id_str.partition(':')[2])
    print id
    return id
def encodingTest(self, data=test['data'], encoding=test['encoding']):
    p = HTMLParser()
    t = p.parse(data, useChardet=False)
    errorMessage = ("Input:\n%s\nExpected:\n%s\nReceived\n%s\n" %
                    (data, repr(encoding.lower()), repr(p.tokenizer.stream.charEncoding)))
    self.assertEquals(encoding.lower(), p.tokenizer.stream.charEncoding[0], errorMessage)
def test_parser_reparse():
    data = "<title>Caf\u00E9</title><!--a--><meta charset='utf-8'>".encode('utf-8')
    pad = 10240 - len(data) + 1
    data = data.replace(b"-a-", b"-" + (b"a" * pad) + b"-")
    assert len(data) == 10240  # Sanity
    stream = _inputstream.HTMLBinaryInputStream(data, useChardet=False)
    assert 'windows-1252' == stream.charEncoding[0].name
    p = HTMLParser(namespaceHTMLElements=False)
    doc = p.parse(data, useChardet=False)
    assert 'utf-8' == p.documentEncoding
    assert doc.find(".//title").text == "Caf\u00E9"
def sanitize_html(html):
    """Sanitizes an HTML fragment."""
    p = HTMLParser(tokenizer=HTMLSanitizer,
                   tree=treebuilders.getTreeBuilder("dom"))
    dom_tree = p.parseFragment(html)
    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)
    s = serializer.HTMLSerializer(omit_optional_tags=False,
                                  quote_attr_values=True)
    output_generator = s.serialize(stream)
    return u''.join(output_generator)
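# Usage sketch for sanitize_html (illustrative input; the exact output depends
# on the html5lib version and its sanitizer defaults):
#
#   safe = sanitize_html(u'<p onclick="evil()">hi</p><script>alert(1)</script>')
#   # disallowed attributes such as onclick are dropped, and disallowed elements
#   # such as <script> are escaped/neutralized rather than passed through.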
def parse_img_html_page(html):
    if not html or html == '':
        print "wait, the page appears blank. abort mission!"
        return None

    metadict = init_dict()

    # soupify the html
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("beautifulsoup"))
    soup = parser.parse(html)
    if not soup:
        print "wait, we couldn't make a soup. i don't know WHY..."
        return None

    try:
        metadict['id'] = int(soup.find('input', {'type': 'hidden', 'name': 'CISOPTR'})['value'])
    except:
        favorite_link_href = soup.find("a", {"title": u"Add to My Favorites"})['href']
        the_split = favorite_link_href.split("'")
        the_split.pop()
        metadict['id'] = int(the_split.pop())

    # TODO: this is kinda hackey but probably fine
    metadict['url_to_thumb_img'] = u'http://digitalmedia.fws.gov/cgi-bin/thumbnail.exe?CISOROOT=/natdiglib&CISOPTR=' + str(metadict['id'])

    hires_link = soup.find(text=lambda str: str.strip() == u'(Full Resolution Image Link)', recursive=True).parent.find('a')
    metadict['url_to_hires_img'] = hires_link['href']

    try:
        metadict['url_to_lores_img'] = u'http://digitalmedia.fws.gov' + soup.find("img", {"id": "imagexy"})['src']
    except:
        metadict['url_to_lores_img'] = u'http://digitalmedia.fws.gov' + soup.find("input", {"type": "image"})['src']

    data_table = soup.find("table", {"style": "border-top: 1px solid #cccccc"}).find("tbody")
    parsed_tuples = []
    for data_label_cell in data_table.findAll("td", {"width": "150"}):
        try:
            label = get_text_within(data_label_cell)
            print label
        except:
            continue
        data_cell = data_label_cell.findNextSibling("td")
        if label == 'Subject':
            data = data_cell.findAll(text=True)
        else:
            data = get_text_within(data_cell).strip()
        parsed_tuples.append((label, data))

    # now we have a list of tuples of the parsed metadata
    print parsed_tuples
    for label, data in parsed_tuples:
        field_key = data_schema.get_field_key_by_full_name(label)
        if not field_key:
            continue
        metadict[field_key] = data

    return metadict
def schoolscrape(categoryurl, name, url):
    print ""
    print name
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("lxml"))
    page = parser.parse(specialscrape(url))
    # pre = "{http://www.w3.org/1999/xhtml}"
    pre = ""
    keyvaluepairs = {}

    def addkeyvaluepair(k, v):
        keyvaluepairs[k] = v
        print k + ": " + v

    data_rows = [t for t in page.findall(path(["body", "div", "div", "div", "div"], pre))
                 if t.attrib.get("class", "") == "detailsRow"]
    for row in data_rows:
        key = [t for t in row.findall(path(["span"], pre))
               if t.attrib.get("class", "") == "leftColumn"][0].text.rstrip(": ")
        valuetag = [t for t in row.findall(path(["span"], pre))
                    if t.attrib.get("class", "") == "rightColumn"][0]
        if valuetag.text:
            if key == "Address":
                raw_address = [valuetag.text] + [br.tail for br in valuetag.findall(path(["br"], pre))]
                addkeyvaluepair("Address", " / ".join(raw_address[:-1]))
                addkeyvaluepair("Postcode", raw_address[-1])
            else:
                addkeyvaluepair(key, valuetag.text)
        else:
            links = valuetag.findall(path(["a"], pre))
            if len(links) == 1:
                addkeyvaluepair(key, links[0].attrib["href"])
            else:
                for link in links:
                    href = link.attrib["href"]
                    if href[:7] != "http://":
                        href = categoryurl + "details/" + href
                    addkeyvaluepair(link.text, href)

    datastore.save(unique_keys=["Name"], data=keyvaluepairs)
def from_tiddler(handle):
    """
    generates a tiddler from a Cook-style .tiddler file
    """
    content = handle.read().decode('utf-8', 'replace')
    content = _escape_brackets(content)
    parser = HTMLParser(tree=treebuilders.getTreeBuilder('dom'))
    dom = parser.parse(content)
    node = dom.getElementsByTagName('div')[0]
    return _get_tiddler_from_div(node)
def strict_validator(self):
    """
    Strict validation method.

    We just call the html5lib parser with strict=True. Error messages are awful,
    and it complains about many small errors, so it can be annoying.
    """
    strict_parser = HTMLParser(strict=True)
    try:
        strict_parser.parse(self.data)
    except ParseError as ex:
        raise ValidationError(str(ex))