def scrape_committees(self):
    SEARCH_COMMITTEES_URL = "http://cfreports.sbe.virginia.gov/"
    _, resp = self.urlretrieve(SEARCH_COMMITTEES_URL)
    d = etree.fromstring(resp.content, parser=HTMLParser())
    ptp = '//span[@id="PagingTotalPages"]/text()'
    ptr = '//span[@id="PagingTotalRecords"]/text()'
    number_of_result_pages = int(d.xpath(ptp)[0])
    number_of_results = int(d.xpath(ptr)[0])
    committee_list = []
    for index in range(number_of_result_pages):
    # for index in range(2):  # Reduce time for debugging
        _, resp = self.urlretrieve(SEARCH_COMMITTEES_URL + '?page=' +
                                   str(index + 1))
        d = etree.fromstring(resp.content, parser=HTMLParser())
        target_table = d.xpath('//table/tbody')[0]
        committee_list = committee_list + \
            self.parse_committee_table(target_table)
    assert len(committee_list) == number_of_results
    for result in committee_list:
        org = Organization(
            name=result['org_name'],
            classification='political action committee',
        )
        org.add_source(url=SEARCH_COMMITTEES_URL)
        org.source_identified = True
        yield org
def check_same_tpl(html_a, html_b):
    """
    Given html_a and html_b, two HTML pages, check that they contain the same
    structure. Raises an exception if it's not the case. Otherwise, returns
    html_a.
    """
    structa = fromstring(str(html_a), parser=HTMLParser(remove_blank_text=True))
    structb = fromstring(str(html_b), parser=HTMLParser(remove_blank_text=True))
    if not elements_equal(structa, structb):
        raise Exception("The two templates do not contain the same thing!")
    return html_a
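# `check_same_tpl` above depends on an `elements_equal` helper that is not
# shown in this snippet. A minimal sketch of what such a recursive structural
# comparison might look like (hypothetical, not the original helper):
def elements_equal(a, b):
    """Compare two lxml elements by tag, attributes, text, and children."""
    if a.tag != b.tag or a.attrib != b.attrib:
        return False
    if (a.text or '').strip() != (b.text or '').strip():
        return False
    if len(a) != len(b):
        return False
    # recurse pairwise over the child elements
    return all(elements_equal(ca, cb) for ca, cb in zip(a, b))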
def __init__(self, encoding='utf8'):
    # Session object, reused across requests. Needed so the headers can be
    # set once instead of being passed to every request separately.
    self.session = requests.Session()
    self.session.headers.update({
        'Accept': 'text/html,application/xhtml+xml,'
                  'application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'pl,en-US;q=0.7,en;q=0.3',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        # 'Host': 'www.krs-online.com.pl',
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:28.0) '
                      'Gecko/20100101 Firefox/28.0',
        # 'Referer': 'http://www.krs-online.com.pl/muzeum-slaska-opolskiego-krs-1260077.html',
        # 'Cookie': 'krs_fk45=h5mfc4oblmd1e1nokkpu4694e5; krs_cookie_accepted=true',
        # 'DNT': '1',
    })
    self.parser = HTMLParser(encoding=encoding)
    self.cleaner = Cleaner(
        # remove scripts, styles, and comments
        scripts=True, javascript=True, comments=True, style=True,
        # head and body are kept
        page_structure=False)
def export(html, custom_filename=''):
    try:
        root = fromstring(html, parser=HTMLParser(collect_ids=False))
    except (ParserError, XMLSyntaxError):
        raise SystemExit('Error while parsing content')
    try:
        # unpacking raises ValueError on zero or multiple matches
        content, = root.xpath(CONTENT_PATH)
    except ValueError:
        raise SystemExit('Expected exactly one content element while parsing')
    for xpath in BAD_XPATHS:
        for bad in content.xpath(xpath):
            bad.getparent().remove(bad)
    if custom_filename:
        filename = custom_filename
    else:
        file_options = dict(
            mode='w+b',
            suffix='.html',
            prefix=f'{int(time())}_',
            dir=getcwd(),
            delete=False,
        )
        with NamedTemporaryFile(**file_options) as html_file:
            filename = html_file.name
    ElementTree(content).write(filename, encoding=HTML_ENCODING)
def convert_to_doc(html, domain_for_absolute_link=None):
    """
    Accept the HTML content document and convert it to a doc element.
    To convert relative links to absolute links, pass the domain URL as
    *domain_for_absolute_link*.

    :param html: A response HTML content or string body
    :param domain_for_absolute_link: A domain URL used for creating absolute links
    :return: doc instance of the html.
    """
    parser = HTMLParser(encoding='utf-8')
    try:
        doc = fromstring(html, parser=parser)
    except TypeError:
        doc = etree.parse(html, parser)
    except ValueError:
        doc = fromstring(html.encode('utf-8'))
    if domain_for_absolute_link:
        try:
            doc.make_links_absolute(domain_for_absolute_link)
        except Exception:
            absolute_links_patch(doc, domain_for_absolute_link)
    return doc
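# `convert_to_doc` falls back to an `absolute_links_patch` helper that is not
# shown here. A minimal sketch of such a fallback, assuming it simply rewrites
# href/src attributes with urljoin (hypothetical, not the original helper):
from urllib.parse import urljoin

def absolute_links_patch(doc, domain):
    """Manually absolutize href/src attributes when make_links_absolute fails."""
    for el in doc.iter():
        for attr in ('href', 'src'):
            value = el.get(attr)
            if value:
                el.set(attr, urljoin(domain, value))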
def test_missing_root_empty_string_defusedxml(self):
    parser = HTMLParser()
    with self.assertRaises((XMLSyntaxError, AssertionError)) as cm:
        defusedxml.lxml.parse(BytesIO(b''), parser=parser)
    self.assertTrue(
        cm.exception.args[0] is None
        or 'ElementTree not initialized, missing root' in str(cm.exception))
def translate_html_special_characters(string):
    # Parse the markup and return its text content; lxml decodes HTML
    # entities such as &amp; and &lt; during parsing. (The original version
    # mixed up the stdlib html.parser API with lxml's HTMLParser and called
    # a nonexistent .parse() method.)
    root = fromstring(string, parser=HTMLParser())
    return root.text_content()
def get_etree(fileobj, encoding='utf-8'):
    """
    Get an ElementTree instance from a given file object *fileobj*.
    The encoding defaults to utf-8.
    """
    return parse(fileobj, HTMLParser(encoding=encoding, remove_blank_text=True))
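# A minimal usage sketch for `get_etree`, assuming the lxml.html imports used
# throughout these snippets; BytesIO stands in for a real file object:
from io import BytesIO

tree = get_etree(BytesIO(b'<html><body><p>hi</p></body></html>'))
print(tree.getroot().findtext('.//p'))  # -> 'hi'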
def the_jama_dance(pma, verify=True):
    '''
    :param: pma (PubMedArticle object)
    :param: verify (bool) [default: True]
    :return: url (string)
    :raises: AccessDenied, NoPDFLink
    '''
    if not pma.doi:
        raise NoPDFLink('MISSING: doi needed for JAMA article.')
    baseurl = the_doi_2step(pma.doi)
    res = requests.get(baseurl)
    parser = HTMLParser()
    tree = etree.fromstring(res.content, parser)
    # we're looking for a meta tag like this:
    # <meta name="citation_pdf_url" content="http://archneur.jamanetwork.com/data/Journals/NEUR/13776/NOC40008.pdf" />
    # (the original raised on the first non-matching meta tag; scan them all,
    # then raise only if no citation_pdf_url tag was found)
    pdfurl = None
    for item in tree.findall('head/meta'):
        if item.get('name') == 'citation_pdf_url':
            pdfurl = item.get('content')
            break
    if pdfurl is None:
        raise NoPDFLink('DENIED: JAMA did not provide PDF link in (%s).' % baseurl)
    if verify:
        # TODO: form navigation
        verify_pdf_url(pdfurl, 'JAMA')
    return pdfurl
def processHTML(self):
    """ decodes data; calls callbacks; recodes data """
    zipencoding = None
    try:
        zipencoding = self.father.responseHeaders.getRawHeaders(
            "Content-Encoding")[-1]
        self.data = zips(zipencoding).decompress(self.data)
    except Exception:
        zipencoding = None
    if len(self.data) > 1 \
            and not self.data == "<!-- default response -->":
        # use encoding header if available. default is meta tag but not always present.
        # assumes unicode. could enforce with unicodedammit but very slow and never found necessary
        try:
            encoding = self.father.responseHeaders.getRawHeaders(
                "content-type")[-1].split('charset=')[1].split(";")[0]
        except Exception:
            encoding = "ISO-8859-1"
        try:
            tree = fromstring(self.data, parser=HTMLParser(encoding=encoding))
            events.gotResponseTree(self, tree)
            self.data = tostring(tree, encoding=encoding)
        except Exception:
            log.exception("{id} Content not parseable len={len} enc={enc}\n{data}"
                          .format(id=self.father.id, len=len(self.data),
                                  enc=encoding, data=self.data[:100]))
        # gotResponseTree is used by most plugins, but sometimes we may want
        # to see the raw text
        events.gotResponseText(self)
    if zipencoding:
        self.data = zips(zipencoding).compress(self.data)
def query_initial_packages(search_term):
    """
    Perform an initial package search on PyPI with the given
    :attr:`search_term`, and return a list of :attr:`PypiSearchResult`
    named objects.

    :param str search_term: The initial search query
    :return: The list of search results
    :rtype: list[PypiSearchResult]
    """
    logging.info("Querying initial packages for %s...", search_term)
    result_page = requests.get("https://pypi.python.org/pypi", params={
        ":action": "search", "term": search_term})
    result_tree = etree.fromstring(result_page.content, HTMLParser())
    result_tree.make_links_absolute(result_page.url)
    result_tags = result_tree.xpath("//table[@class='list']/tr[@class][td]")
    results = []
    for lxml_element in result_tags:
        result_obj = PypiJsonSearchResult(
            link="{0}/json".format(lxml_element[0][0].get("href")),
            weight=int(lxml_element[1].text),
            summary=lxml_element[2].text or '')
        if result_obj.is_pip_result(search_term):
            results.append(result_obj)
    return results
def view_page():
    url = flask.request.args.get("url")
    o = urllib.parse.urlparse(url)
    if o.scheme == '':
        print("Invalid url: Scheme error")
        exit(2)
    if o.netloc == '':
        print("Invalid url: Netloc error")
        exit(2)
    req = urllib.request.Request(url)
    req.add_header('User-Agent',
                   'PurdueUniversityClassProject/1.0 '
                   '([email protected] https://goo.gl/dk8u5S)')
    with urllib.request.urlopen(req) as response:
        htm = response.read()
    html = htm.decode("UTF-8")
    # Credit: Adapted from example in Python 3.4 Documentation, urllib.request
    # License: PSFL https://www.python.org/download/releases/3.4.1/license/
    # https://docs.python.org/3.4/library/urllib.request.html
    parser = HTMLParser(encoding="UTF-8")
    root = document_fromstring(html, parser=parser, base_url=url)
    # inject a <base> tag so relative links resolve against the source url
    for node in root.iter():
        if node.tag == 'head':
            newstr = "<base href = {}>".format(url)
            html = html.replace('<head>', '<head>' + '\n' + newstr)
        if node.tag == 'HEAD':
            newstr = "<BASE HREF = {}>".format(url)
            html = html.replace('<TITLE>', '<TITLE>' + '\n' + newstr)
    path = copy_profile_photo_to_static(root)
    static_url = flask.url_for('static', filename=os.path.basename(path),
                               _external=True)
    expr = r"(/home/ecegridfs/a/ee364a13/hpo/static/)([\w]+)(\.[\w]+)"
    o = re.match(expr, path)
    photo = o.group(2)
    # rewrite the profile photo's src to point at the local static copy
    for node in root.iter():
        if node.tag == "img":
            url = node.get("src")
            with urllib.request.urlopen(url) as response:
                type = response.info().get('Content-Type')
            extension = mimetypes.guess_extension(type)
            filename = make_filename(url, extension)
            col = filename.split(".")
            name = col[0]
            match_name = name
            if match_name == photo:
                html = html.replace(node.attrib['src'], static_url)
    return html
def parse_html(fileobj, encoding):
    """
    Given a file object *fileobj*, get an ElementTree instance.
    The HTML is decoded using the given *encoding*.
    """
    parser = HTMLParser(encoding=encoding, remove_blank_text=True)
    return parse(fileobj, parser)
def pre_parse(self):
    http_content_type = self.response.headers.get('content-type', '')
    target = HTMLEncodings(http_content_type)
    # parser will fail on non-ascii unless we set it explicitly
    parser = HTMLParser(target=target, encoding='ISO-8859-1')
    total_bytes = 0
    self.response.seek(0)
    while target:
        chunk = self.response.read(PRE_PARSE_CHUNK_SIZE)
        if not chunk:
            try:
                parser.close()
            except XMLSyntaxError:
                pass
            break
        if self.bom is None:
            assert PRE_PARSE_CHUNK_SIZE >= 4
            self.bom = b''
            for i in range(4, 1, -1):
                if chunk[:i] in BOM_ENC:
                    self.bom = chunk[:i]
                    target.encodings.append(('bom', BOM_ENC[self.bom]))
                    # there can only be one BOM - stop here
                    break
        parser.feed(chunk)
        total_bytes += len(chunk)
        if total_bytes >= MAX_PRE_PARSE_BYTES:
            break
    return target.encodings
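# `pre_parse` drives an `HTMLEncodings` parser target that is not shown. A
# minimal sketch of such a target, assuming it collects candidate encodings
# from the HTTP header and <meta> tags and turns falsy once the <head> has
# been consumed (hypothetical, not the original class):
import re

class HTMLEncodings:
    def __init__(self, http_content_type):
        self.encodings = []
        self._done = False
        match = re.search(r'charset=([\w-]+)', http_content_type)
        if match:
            self.encodings.append(('http', match.group(1).lower()))

    def __bool__(self):
        # the `while target:` loop above keeps feeding chunks while truthy
        return not self._done

    # lxml target-parser interface: start/end/data/close
    def start(self, tag, attrib):
        if tag == 'meta':
            if attrib.get('charset'):
                self.encodings.append(('meta', attrib['charset'].lower()))
            elif attrib.get('http-equiv', '').lower() == 'content-type':
                match = re.search(r'charset=([\w-]+)',
                                  attrib.get('content', ''))
                if match:
                    self.encodings.append(('meta', match.group(1).lower()))
        elif tag == 'body':
            self._done = True  # no declarations expected past <head>

    def end(self, tag):
        pass

    def data(self, data):
        pass

    def close(self):
        self._done = True
        return self.encodings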
def the_wolterskluwer_volta(pma, verify=True):
    '''
    :param: pma (PubMedArticle object)
    :param: verify (bool) [default: True]
    :return: url
    :raises: AccessDenied, NoPDFLink
    '''
    doiurl = 'http://content.wkhealth.com/linkback/openurl?doi=%s'
    volissurl = ('http://content.wkhealth.com/linkback/openurl?'
                 'issn={a.issn}&volume={a.volume}&issue={a.issue}&spage={a.first_page}')
    if pma.doi:
        baseurl = requests.get(doiurl % pma.doi).url
    elif pma.issn:
        pma = rectify_pma_for_vip_links(pma)  # raises NoPDFLink if missing data
        baseurl = requests.get(volissurl.format(a=pma)).url
    else:
        # original left baseurl undefined on this path
        raise NoPDFLink('MISSING: doi or issn needed for wolterskluwer article.')
    res = requests.get(baseurl)
    tree = etree.fromstring(res.content, HTMLParser())
    try:
        item = tree.cssselect('li.ej-box-01-body-li-article-tools-pdf')[0]
    except IndexError:
        raise NoPDFLink(
            'DENIED: wolterskluwer did not provide PDF link for this article')
    link = item.getchildren()[0]
    url = link.get('href')
    if verify:
        verify_pdf_url(url)
    return url
def _load(self):
    """ Load the ElementTree from the source """
    # Convert directional quotation marks to regular quotes
    double_quotes = ur'[\u201c\u201d]'
    self.source = re.sub(double_quotes, u'"', self.source)
    single_quotes = ur'[\u2019\u2018]'
    self.source = re.sub(single_quotes, u"'", self.source)
    # Convert colons
    self.source = self.source.replace(u'\uff1a', u':')
    # Remove line breaks and tabs
    self.source = self.source.replace(u'\n', u'')
    self.source = self.source.replace(u'\t', u'')
    # There are also some "zero width joiners" (\u200d) in random places in
    # the text. Remove them here, since they make string search unreliable.
    zero_width_joiners = u'\u200d'
    self.source = self.source.replace(zero_width_joiners, u'')
    # Also previously had some non-breaking spaces in unicode \u00a0, but this
    # may have been fixed by changing the parser below
    # Use the lxml cleaner
    cleaner = Cleaner()
    parser = HTMLParser(encoding='utf-8')
    # Finally, load the cleaned string to an ElementTree
    self.tree = cleaner.clean_html(
        lxml.html.fromstring(to_string(self.source), parser=parser))
def read_image(recipe_id: int) -> Optional[str]:
    # bind the path to a name so the log messages below can reference it
    filename = f"./data/crawl/{Kind.RECIPE.name.lower()}/{recipe_id}.html"
    tree = parse(filename, parser=HTMLParser(encoding="utf-8"))
    elements = tree.xpath(JSON_LD)
    if len(elements) == 0:
        logging.debug(log(f"{filename}: no recipe"))
        return None
    for element in elements:
        try:
            j = json.loads(element.text)
        except JSONDecodeError:
            logging.debug(log(f"{filename} invalid json: {element.text}"))
            return None
        if "@type" in j.keys():
            if j["@type"] == "Recipe":
                return j["image"].strip()
        if "@graph" in j.keys():
            for node in j["@graph"]:
                if node["@type"] == "Recipe":
                    # the image lives on the Recipe node, not the wrapper
                    return node["image"].strip()
        else:
            logging.debug(log(
                f"{filename}: no @graph or @type=Recipe element in json"))
            logging.debug(log(json.dumps(j, indent=2, ensure_ascii=False)))
            return None
    return None
def parse(self, **kwargs):
    from lxml.html import HTMLParser
    html_parser = HTMLParser()
    etree_root = etree.fromstring(kwargs['root'], parser=html_parser)
    return super().parse(root=etree_root, document_id=kwargs['document_id'])
def __init__(self, resp: Response):
    self.origin: str = resp.text  # raw response data
    utf8_parser = HTMLParser(encoding="utf-8")
    data = PyQuery(fromstring(self.origin, parser=utf8_parser))
    self.raw: List[EHentaiItem] = [
        EHentaiItem(i) for i in data.find(".glcat").parents("tr").items()
    ]
    self.url: str = str(resp.url)
def get_forms(self):
    """Returns a list of form elements available on the page."""
    source, encoding = self.get_source(buffered=True)
    return parse(
        source,
        parser=HTMLParser(encoding=encoding, collect_ids=False)).xpath(
            "descendant-or-self::form|descendant-or-self::x:form",
            namespaces={'x': XHTML_NAMESPACE})
def _make_etree(html, url):
    from lxml.html import HTMLParser, document_fromstring
    parser = HTMLParser(encoding="UTF-8")
    root = document_fromstring(html, parser=parser, base_url=url)
    root.make_links_absolute(root.base_url)
    return root
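# A minimal usage sketch for `_make_etree`; the HTML and URL here are
# illustrative only:
html = '<html><body><a href="/about">About</a></body></html>'
root = _make_etree(html, "https://example.com/")
print(root.xpath('//a/@href'))  # -> ['https://example.com/about']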
def __init__(self, fpath=None):
    if not fpath:
        self.file_path = ('/home/vera/work/aerostat/AEROSTATICA/aerostatica.ru/'
                          '2019/04/28/728-beltain-flook-2019/index.html')
    else:
        self.file_path = fpath
    with open(self.file_path, 'rb') as f:
        self.soup = fromstring(f.read(), parser=HTMLParser(encoding='utf8'))
def fetch_winpython_lib_page():
    """
    Fetch the Windows Python compiled libraries page and return the parsed
    element tree.
    """
    resp = requests.get(WINPYTHON_LIBS_URL, timeout=30)
    tree = etree.fromstring(resp.content, HTMLParser())
    tree.make_links_absolute(resp.url)
    return tree
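# A minimal follow-on sketch: once the tree has absolute links, wheel links
# can be pulled out with a plain xpath. The '.whl' filter is an assumption
# about the page layout, not something the original function guarantees:
tree = fetch_winpython_lib_page()
wheel_links = [href for href in tree.xpath('//a/@href') if '.whl' in href]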
def __init__(self, path: str, dev: bool):
    self.path = path
    self.dev = dev
    kinxfile = load_fm(self.__get_path("Kinxfile"))
    self.markdown = Markdown(
        renderer=Renderer(escape=False, hard_wrap=True),
        inline=InlineLexer,
        block=BlockLexer,
    )
    try:
        try:
            jinja_extensions = kinxfile["extensions"]["jinja"]
        except KeyError:
            jinja_extensions = ()
        self.env = Environment(
            extensions=jinja_extensions,
            autoescape=True,
            loader=FileSystemLoader(self.__get_path(kinxfile["root"])),
        )
        del jinja_extensions
    except KeyError:
        print("Unexpected error: 'root' key not found in Kinxfile")
        raise
    project_dir: dict = {}
    self.headers: List[Union[Tuple[str, int], str]] = [self.DEFAULT_HEADER]
    for i in etree.fromstring(self.markdown(kinxfile.content), HTMLParser())[0]:
        if i.tag == "ul":
            project_dir[self.headers[-1]].extend(self.__get_links_md(i))
        elif (i.tag[:1] == "h"
              and i.tag[1:] in (str(j) for j in range(1, 7))):
            self.headers.append((i.text, i.tag[1:]))
            project_dir[self.headers[-1]] = []
        else:
            project_dir[self.headers[-1]].append(i.tag)
            if i.tag not in ["hr"]:
                print("{} is not read in Kinxfile".format(etree.tostring(i)))
    self.kx: dict = {}
    for i in ["title", "author", "description", "url", "copyright", "theme"]:
        self.kx[i] = kinxfile[i]
    del kinxfile
    self.kx["content"] = project_dir
    del project_dir
    print(self.kx)
    self.pages: Dict[str, str] = {}
def lxmldom(self, url):
    req = self.fetch(url)
    parser = HTMLParser(encoding='utf-8', remove_pis=True,
                        remove_comments=True, remove_blank_text=True)
    dom = lxml.html.fromstring(req.text, base_url=url, parser=parser)
    dom.make_links_absolute(url)
    return dom
def parse_xhtml_file(file: Path) -> HtmlElement:
    """
    Parse an XHTML file into plain HTML, i.e. a tree of HtmlElements without
    namespace annotations.

    :param file: XHTML file
    :return: the 'html' element
    """
    # The HTMLParser seems to do what I want okay...
    return parse(str(file), parser=HTMLParser()).getroot()
def _build_doc(self):
    """
    Raises
    ------
    ValueError
        * If a URL that lxml cannot parse is passed.

    Exception
        * Any other ``Exception`` thrown. For example, trying to parse a
          URL that is syntactically correct on a machine with no internet
          connection will fail.

    See Also
    --------
    pandas.io.html._HtmlFrameParser._build_doc
    """
    from lxml.etree import XMLSyntaxError
    from lxml.html import (
        HTMLParser,
        fromstring,
        parse,
    )

    parser = HTMLParser(recover=True, encoding=self.encoding)

    try:
        if is_url(self.io):
            with urlopen(self.io) as f:
                r = parse(f, parser=parser)
        else:
            # try to parse the input in the simplest way
            r = parse(self.io, parser=parser)
        try:
            r = r.getroot()
        except AttributeError:
            pass
    except (UnicodeDecodeError, OSError) as e:
        # if the input is a blob of html goop
        if not is_url(self.io):
            r = fromstring(self.io, parser=parser)
            try:
                r = r.getroot()
            except AttributeError:
                pass
        else:
            raise e
    else:
        if not hasattr(r, "text_content"):
            raise XMLSyntaxError("no text parsed from document", 0, 0, 0)

    for br in r.xpath("*//br"):
        br.tail = "\n" + (br.tail or "")

    return r
def test_missing_root_only_comment_defusedxml(self):
    parser = HTMLParser()
    self.assertRaises(AssertionError, defusedxml.lxml.parse,
                      BytesIO(b'<!-- foo -->'), parser=parser)
    try:
        defusedxml.lxml.parse(BytesIO(b'<!-- foo -->'), parser=parser)
    except Exception as ex:
        self.assertTrue('ElementTree not initialized, missing root' in str(ex))
class SESCSchedule:
    parser = HTMLParser(encoding='utf-8')
    baseurl = 'http://www.sescsp.org.br/sesc/programa_new/busca.cfm'
    basepage = parse(baseurl, parser).getroot()
    units = {}
    for option in basepage.cssselect('select[name="unidade_id"] option'):
        value = int(option.get('value'))
        units[value] = option.text_content()

    def __init__(self, unit):
        # checks whether it is a valid unit
        if unit in SESCSchedule.units:
            self.unit = unit
            self.events = []
        else:
            # a bare `raise` outside an except block is an error in itself
            raise ValueError("unit {0} doesn't exist".format(unit))

    def get_page_url(self, page):
        return SESCSchedule.baseurl + '?' + urlencode({'unidade_id': self.unit,
                                                       'page': page})

    def get_events(self, page):
        self.page_url = self.get_page_url(page)
        self.page_tree = parse(self.page_url).getroot()
        entries = self.page_tree.cssselect('#box')
        if entries:
            print 'found {0} entries in page {1}'.format(len(entries), page)
            page_events = []
            for entry_count, entry in enumerate(entries):
                if entry.cssselect('.tit2'):
                    events = SESCProgram(entry, page).events
                elif entry.cssselect('.tit'):
                    events = [SESCEvent(entry, page).dictionaries]
                else:
                    events = []
                page_events.extend(events)
                print '{0} of {1} entries have been scraped'.format(
                    entry_count, len(entries))
            # let the world know the scraper was successful
            return page_events
        else:
            print 'no more events in unit {0} schedule'.format(self.unit)
            return 0

    def __iter__(self, page=1):
        events = self.get_events(page)
        while events:
            yield events
            page += 1
            events = self.get_events(page)
        print 'iteration complete over {0} pages'.format(page)
def _build_doc(self):
    """
    Raises
    ------
    ValueError
        * If a URL that lxml cannot parse is passed.

    Exception
        * Any other ``Exception`` thrown. For example, trying to parse a
          URL that is syntactically correct on a machine with no internet
          connection will fail.

    See Also
    --------
    pandas.io.html._HtmlFrameParser._build_doc
    """
    from lxml.html import parse, fromstring, HTMLParser
    from lxml.etree import XMLSyntaxError

    parser = HTMLParser(recover=False, encoding=self.encoding)

    try:
        # try to parse the input in the simplest way
        r = parse(self.io, parser=parser)
        try:
            r = r.getroot()
        except AttributeError:
            pass
    except (UnicodeDecodeError, IOError):
        # if the input is a blob of html goop
        if not _is_url(self.io):
            r = fromstring(self.io, parser=parser)
            try:
                r = r.getroot()
            except AttributeError:
                pass
        else:
            # not a url
            scheme = parse_url(self.io).scheme
            if scheme not in _valid_schemes:
                # lxml can't parse it
                msg = (('{invalid!r} is not a valid url scheme, valid '
                        'schemes are {valid}')
                       .format(invalid=scheme, valid=_valid_schemes))
                raise ValueError(msg)
            else:
                # something else happened: maybe a faulty connection
                raise
    else:
        if not hasattr(r, 'text_content'):
            raise XMLSyntaxError("no text parsed from document", 0, 0, 0)

    return r
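# The two `_build_doc` methods above are internal to pandas' lxml-backed HTML
# parser; end users reach them through `pandas.read_html`. A minimal usage
# sketch (StringIO avoids the deprecation of passing literal HTML directly):
from io import StringIO
import pandas as pd

tables = pd.read_html(
    StringIO("<table><tr><th>a</th></tr><tr><td>1</td></tr></table>"),
    flavor="lxml",
)
print(tables[0])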