def fetch_images(etree):
    base_url = etree.xpath('//base/@url')[0]
    with pushd_temp_dir():
        filename_to_node = collections.OrderedDict()

        # Extract the image files into the current directory
        imgurls = etree.xpath('//img//@src')
        for imgurl in imgurls:
            newimgurl = imgurl
            # Protocol-relative URLs start with "//"
            if imgurl[:2] == '//':
                newimgurl = 'http:' + imgurl
            elif imgurl[:4] != 'http':
                newimgurl = base_url + imgurl
            headers = {
                'User-Agent': 'PurdueUniversityClassProject/1.0 ({}@purdue.edu https://goo.gl/dk8u5S)'
                              .format(USERNAME)
            }
            request = urllib.request.Request(url=newimgurl, headers=headers)
            resp = urllib.request.urlopen(request)
            filename = make_filename(newimgurl, 'jpg')
            with open(filename, "wb") as f:
                f.write(resp.read())
            filename_to_node[filename] = imgurl
            app.logger.info('fetch_images {} {}'.format(imgurl, filename))
        yield filename_to_node

def init_response_attributes(self, etree):
    try:
        self.backend_transaction_id = etree.xpath('//MSS_SignatureResp')[0].attrib['MSSP_TransID']
        self.status = etree.xpath('//ns6:StatusCode', namespaces={'ns6': self.ns_namespace})[0].attrib['Value']
    except (IndexError, KeyError, lxml.etree.XMLSchemaError) as e:
        raise ResponseParseError('Cannot parse signature response: %s. Response content: %s' % (
            e, lxml.etree.tostring(etree)))

def init_response_attributes(self, etree):
    try:
        status_code = etree.xpath('//ns5:StatusCode', namespaces={'ns5': self.ns_namespace})[0].attrib['Value']
    except (IndexError, KeyError, lxml.etree.XMLSchemaError) as e:
        raise ResponseParseError(
            'Cannot parse status response: %s. Response content: %s' % (e, lxml.etree.tostring(etree)))
    self.status = Statuses.map(status_code)
    try:
        civil_number_tag = etree.xpath(
            '//ns4:UserIdentifier', namespaces={'ns4': self.ns_namespace})[0]
    except IndexError:
        # civil number tag does not exist - this is possible if request is still processing
        return
    else:
        try:
            self.civil_number = civil_number_tag.text.split('=')[1]
        except IndexError:
            raise ResponseParseError(
                'Cannot get civil_number from tag text: %s' % civil_number_tag.text)

def insert(cls, etree):
    """Create and return an empty <item> element."""
    new_elt = lxml.etree.Element("item")
    existing_items = etree.xpath("//item")
    if existing_items:
        existing_items[0].addprevious(new_elt)
    else:
        # xpath() returns a list of matching elements; append to the channel element itself
        etree.xpath("/rss/channel")[0].append(new_elt)
    new_item = cls(etree, new_elt)
    new_item.parent_id = 0
    return new_item

def get_info(self):
    headers = config.headers
    url = self._url.format(self._pripid)
    result, status_code = Send_Request().send_requests(url, headers)
    info = {}
    if status_code == 200:
        data = etree.HTML(result, parser=etree.HTMLParser(encoding='utf-8'))
        tr_list = data.xpath(
            "//table[@id='table_jyyc']//tr[@name='jyyc']")
        for i, singledata in enumerate(tr_list):
            temp = {}
            td_list = singledata.xpath("./td")
            temp["types"] = '经营异常'  # "operating abnormality"
            temp["in_reason"] = deal_html_code.remove_symbol(
                td_list[1].xpath("string(.)"))
            in_date = deal_html_code.remove_symbol(
                td_list[2].xpath("string(.)"))
            temp["in_date"] = deal_html_code.change_chinese_date(in_date)
            temp["out_reason"] = deal_html_code.remove_symbol(
                td_list[4].xpath("string(.)"))
            out_date = deal_html_code.remove_symbol(
                td_list[5].xpath("string(.)"))
            temp["out_date"] = deal_html_code.change_chinese_date(out_date)
            temp["gov_dept"] = deal_html_code.remove_symbol(
                td_list[6].xpath("string(.)"))
            temp["out_gov"] = deal_html_code.remove_symbol(
                td_list[7].xpath("string(.)"))
            info[i] = temp
    return info

def get_words_from_line(etree, line_id, line_num):
    """
    Return a list of word dicts for the spans inside the ocr_line with the given id.
    """
    word_array = []
    xpath_query = ("//body/div[@class='ocr_page']/div[@class='ocr_carea']/p[@class='ocr_par']"
                   "/span[@class='ocr_line' and @id='" + line_id + "']"
                   "/span[starts-with(@class, 'ocr')]")
    # Have noodled with this some -- these used to work. It's not clear that xpath really can
    # handle the range of hOCR formats that will be thrown at us, but it works when formats
    # are fairly uniform.
    # xpath_query1 = "//body/div[@class='ocr_page']/div[@class='ocr_carea']/p[@class='ocr_par']/span[@class='ocr_line' and @id='" + line_id + "']/span[@class='ocr_word']"
    # xpath_query2 = "//body/div[@class='ocr_page']/div[@class='ocr_carea']/p[@class='ocr_par']/span[@class='ocr_line' and @id='" + line_id + "']/span[@class='ocrx_word']"
    hocr_words = etree.xpath(xpath_query)
    # hocr_words = etree.xpath(xpath_query1)
    # if not hocr_words:
    #     hocr_words = etree.xpath(xpath_query2)
    for word in hocr_words:
        # The text may be contained in this span, but it may also be contained in a child element.
        word_array.append({
            'bbox': get_bbox_from_title(word.attrib['title']),
            'text': simple_clean(tostring(word, method="text", encoding='UTF-8')),
            'word_num': word.attrib['id'].replace("word_", ""),
            'line_num': line_num
        })
    return word_array

def set_adjustment_values(self, builder, etree):
    # fix defaults not loading
    for object in etree.xpath('/interface/object[@class="GtkAdjustment"]'):
        property = object.xpath('property[@name="value"]')
        if len(property):
            obj = builder.get_object(object.get('id'))
            obj.set_value(float(property[0].text))

def set_adjustment_values(builder, etree):
    """ Glade default adjustment values fix """
    for object in etree.xpath('/interface/object[@class="GtkAdjustment"]'):
        property = object.xpath('property[@name="value"]')
        if len(property):
            obj = builder.get_object(object.get('id'))
            obj.set_value(float(property[0].text))

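# Hedged illustration (ids and values invented, not from the original sources): the shape of
# Glade XML that the adjustment fix above walks. The same XPath used in set_adjustment_values()
# finds the <object> node, and its nested <property name="value"> holds the default that
# Gtk.Builder sometimes fails to apply, hence the explicit set_value() call.
import lxml.etree

GLADE_SNIPPET = b"""
<interface>
  <object class="GtkAdjustment" id="volume_adjustment">
    <property name="value">50</property>
    <property name="lower">0</property>
    <property name="upper">100</property>
  </object>
</interface>
"""

ui = lxml.etree.fromstring(GLADE_SNIPPET)
node = ui.xpath('/interface/object[@class="GtkAdjustment"]')[0]
print(node.get('id'), node.xpath('property[@name="value"]')[0].text)  # volume_adjustment 50
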
def copy_profile_photo_to_static(etree):
    base_url = etree.xpath('//base/@url')[0]
    profileUrl = find_profile_photo_filename(etree)
    if profileUrl != '':
        proj_dir = sys.path[0]  # e.g., "/home/ecegridfs/a/ee364z15/hpo"
        static_dir = os.path.join(proj_dir, "static")  # e.g., "/home/ecegridfs/a/ee364z15/hpo/static"
        newProfileUrl = profileUrl
        # Protocol-relative URLs start with "//"
        if profileUrl[:2] == '//':
            newProfileUrl = 'http:' + profileUrl
        elif profileUrl[:4] != 'http':
            newProfileUrl = base_url + profileUrl
        headers = {
            'User-Agent': 'PurdueUniversityClassProject/1.0 ({}@purdue.edu https://goo.gl/dk8u5S)'
                          .format(USERNAME)
        }
        app.logger.info('request for {}'.format(newProfileUrl))
        request = urllib.request.Request(url=newProfileUrl, headers=headers)
        resp = urllib.request.urlopen(request)
        filename = make_filename(newProfileUrl, 'jpg')
        with open(os.path.join(static_dir, filename), "wb") as f:
            f.write(resp.read())
        return profileUrl, filename
    return '', ''

def get_attributes_of_xpaths(etree, x_paths, namespaces, attr='id'):
    """ Determine the values of the attributes of the objects that match each XPath

    Args:
        etree (:obj:`lxml.etree._ElementTree`): element tree for XML document
        x_paths (:obj:`list` of :obj:`str`): XPaths
        namespaces (:obj:`dict`): dictionary that maps the prefixes of namespaces to their URIs
        attr (:obj:`str` or :obj:`dict`, optional): attribute to get values of

    Returns:
        :obj:`dict` of :obj:`str` to :obj:`list` of :obj:`str`: dictionary that maps each XPath to the
            values of the attribute of the objects in the XML file that match the XPath
    """
    # resolve a namespaced attribute into Clark notation ({uri}name)
    if isinstance(attr, dict):
        attr = '{{{}}}{}'.format(attr['namespace']['uri'], attr['name'])

    # determine the values of the attributes of the objects that match each XPath
    x_path_attrs = {}
    for x_path in x_paths:
        try:
            objects = etree.xpath(x_path, namespaces=get_namespaces_with_prefixes(namespaces))
            x_path_attrs[x_path] = [obj.attrib.get(attr, None) for obj in objects]
        except Exception:
            x_path_attrs[x_path] = []

    return x_path_attrs

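# Self-contained sketch (document and names invented) of the attr-resolution step above:
# lxml stores namespaced attributes under Clark notation, "{uri}localname", which is exactly
# what the '{{{}}}{}'.format(...) call builds before the attrib.get() lookup.
import lxml.etree

doc = lxml.etree.fromstring(
    '<model xmlns:meta="https://example.org/meta"><species meta:id="s1"/></model>')
attr = {'namespace': {'uri': 'https://example.org/meta'}, 'name': 'id'}
clark_name = '{{{}}}{}'.format(attr['namespace']['uri'], attr['name'])
print(doc.xpath('//species')[0].attrib.get(clark_name))  # -> s1
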
def name(url):
    headers = config.headers_detail
    content, status_code = Send_Request().send_request(url, headers)
    info = {}
    if status_code == 200:
        flag = 1
        result = etree.HTML(content, parser=etree.HTMLParser(encoding='utf-8'))
        dl = result.xpath("//div[@class= 'viewBox']/dl")[0]
        datallist = etree.tostring(dl).split(
            '<dd style="border-bottom:1px solid #AE0000;padding-bottom:10px;">')
        datallist.remove(datallist[-1])
        for i, single in enumerate(datallist):
            # parse each <dd> fragment on its own before extracting fields
            single = etree.HTML(single, parser=etree.HTMLParser(encoding='utf-8'))
            string = u"股东"      # shareholder
            name = deal_dd_content(string, single)
            string = u"变更前"    # before the change
            percent_pre = deal_dd_content(string, single)
            string = u"变更后"    # after the change
            percent_after = deal_dd_content(string, single)
            string = u"变更日期"  # date of the change
            dates = deal_dd_content(string, single)
            info[i] = [name, percent_pre, percent_after, dates]
    else:
        flag = 100000004
    if flag == 1:
        deal_html_code.remove_repeat(info)
    return info, flag

def get_detail_info(self, detail_url, info):
    field_map = {
        u"种类": "cates",    # category
        u"范围": "ranges",   # scope
        u"期限": "period",   # term
        u"备注": "remark",   # notes
    }
    headers = config.headers
    result, status_code = Send_Request().send_requests(detail_url, headers)
    if status_code == 200:
        data = etree.HTML(result, parser=etree.HTMLParser(encoding='utf-8'))
        string = u"被担保债权概况信息"  # summary of the secured claim
        table = data.xpath("//*[contains(.,'%s')]" % string)[0]
        for key, value in field_map.iteritems():
            info[value] = deal_html_code.get_match_info(key, table)
        string = u"抵押权人概况信息"    # summary of the mortgagee
        person_info = data.xpath("//*[contains(.,'%s')]" % string)[0]
        string = u"抵押权物概况信息"    # summary of the mortgaged property
        goods_info = data.xpath("//*[contains(.,'%s')]" % string)[0]
    else:
        info["cates"] = ''
        info["ranges"] = ''
        info["period"] = ''
        info["remark"] = ''
        person_info = {}
        goods_info = {}
    return person_info, goods_info

def __init__(self, filename):
    self.objects = []
    self.indices = {}
    self.data_class = type(self.data_class_name, (self.data_class_base, ), {})

    with open(filename, 'rb') as f:
        etree = lxml.etree.parse(f)

    for entry in etree.xpath('//%s' % self.xml_tag):
        mapped_data = {}
        for key in entry.keys():
            mapped_data[self.field_map[key]] = entry.get(key)
        entry_obj = self.data_class(entry, **mapped_data)
        self.objects.append(entry_obj)

    # Create indices
    for key in self.field_map.values():
        self.indices[key] = {}

    # Update indices
    for obj in self.objects:
        for key in self.field_map.values():
            value = getattr(obj, key, None)
            if value is None:
                continue
            self.indices[key][value] = obj

def find_elements(etree):
    elements = []
    TRs = etree.xpath('//tr')
    # set to 1 once the "domestic high-anonymity HTTPS" section header has been seen
    HighHiding = 0
    for i, tr in enumerate(TRs):
        th = tr.xpath('th')
        h2 = []
        if len(th):
            h2 = th[0].xpath('h2')
            if len(h2):
                print(h2[0].text)
        if HighHiding == 0:
            if len(h2) and h2[0].text == "国内高匿代理IP":  # "domestic high-anonymity proxy IPs"
                HighHiding = 1
                continue
            else:
                continue
        if len(h2) and h2[0].text == "国内透明代理IP":  # "domestic transparent proxy IPs": end of section
            break
        TDs = tr.xpath('td')
        if len(TDs) and TDs[5].text == "HTTPS":
            yield {'ip': TDs[1].text, 'port': TDs[2].text}

def init_response_attributes(self, etree):
    self.status = Statuses.ERRED
    try:
        self.details = etree.xpath('//soapenv:Text',
                                   namespaces={'soapenv': self.soapenv_namespace})[0].text
    except (IndexError, lxml.etree.XMLSchemaError) as e:
        raise ResponseParseError('Cannot parse error status response: %s. Response content: %s' % (
            e, lxml.etree.tostring(etree)))

def get_topics(spider, id, topic_f, topic_log_f):
    '''
    spider is a requests session object, id is the topic id
    topic_f is the layer*.json file handle
    topic_log_f is the xxxx-xx-xx_crawler.log file handle
    '''
    url = 'https://www.zhihu.com/topic/' + id + '/organize/entire'
    time.sleep(0.1)
    for counter1 in range(1000):
        try:
            res = spider.get(url, headers=CUR_HEADERS_BASE, timeout=CUR_TIMEOUT_QUERY)
            content = res.content
            etree = lxml.html.fromstring(content)
            people_xpath = '//div[@class="zm-topic-side-followers-info"]//strong/text()'
            des_xpath = '//div[@id="zh-topic-desc"]'
            name_xpath = '//h1/text()'
            etree_id = etree.xpath(people_xpath)
            etree_des = etree.xpath(des_xpath)
            etree_name = etree.xpath(name_xpath)
            topic = dict()
            topic['id'] = id
            topic['followers'] = etree_id[0] if len(etree_id) > 0 else 0
            topic['des'] = lxml.html.tostring(
                etree_des[0], encoding='utf8') if len(etree_des) > 0 else 'NULL'
            topic['name'] = etree_name[0]
            with open(_XSRF_GLOBAL_FIELNAME) as f:
                _xsrf = f.read()
            data_xsrf = {'_xsrf': _xsrf}
        except BaseException as e:
            if str(e) != 'None':
                logging.debug('Fail to fetch the page. Error: {0}.'.format(e))
            logging.debug(url + ' wait for a sec to recrawl')
            time.sleep(1)
        else:
            if counter1 > 1:
                logging.info(url + ' html data get!')
            break

def query_etree(etree):
    # Returns a list of top level elements from XML etree
    return etree.xpath(
        "//informationTable:infoTable",
        namespaces={
            "informationTable": "http://www.sec.gov/edgar/document/thirteenf/informationtable"
        })

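# Hedged usage sketch for query_etree(): a minimal 13F information-table document with
# invented values. Only the namespace URI is taken from the function above; the elements
# inside <infoTable> are illustrative.
import lxml.etree

SAMPLE_13F = b"""
<informationTable xmlns="http://www.sec.gov/edgar/document/thirteenf/informationtable">
  <infoTable>
    <nameOfIssuer>EXAMPLE CORP</nameOfIssuer>
    <value>1234</value>
  </infoTable>
</informationTable>
"""

print(len(query_etree(lxml.etree.fromstring(SAMPLE_13F))))  # -> 1
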
def extract(self) -> List[str]:
    texts = []
    for etree in self.etrees:
        texts.extend([
            t.text for t in etree.xpath('//a:t', namespaces=self.namespaces)
            if t.text.strip() != ''
        ])
    return texts

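# Hedged note on the '//a:t' query above: in PowerPoint slide XML the text runs live in
# <a:t> elements from the DrawingML namespace, so self.namespaces presumably contains a
# mapping like the one below. The slide fragment here is invented for the sketch.
import lxml.etree

DRAWINGML_NS = {'a': 'http://schemas.openxmlformats.org/drawingml/2006/main'}
slide = lxml.etree.fromstring(
    b'<p:sp xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"'
    b'      xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main">'
    b'<p:txBody><a:p><a:r><a:t>Hello slides</a:t></a:r></a:p></p:txBody></p:sp>')
print([t.text for t in slide.xpath('//a:t', namespaces=DRAWINGML_NS)])  # ['Hello slides']
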
def test_add_mdreference(testpath):
    """Test add_reference function. Calls the function twice and writes the
    md-references file.
    """
    md_creator = utils.MdCreator(testpath)

    md_creator.add_reference('abcd1234', 'path/to/file1')
    md_creator.add_reference('abcd1234', 'path/to/file2')
    md_creator.write_references()

    # Read created file. Reference should be found for both files
    etree = lxml.etree.parse(os.path.join(testpath, 'md-references.xml'))
    reference = etree.xpath('/mdReferences/mdReference[@file="path/to/file1"]')
    assert reference[0].text == 'abcd1234'
    reference = etree.xpath('/mdReferences/mdReference[@file="path/to/file2"]')
    assert reference[0].text == 'abcd1234'

def _get_xpath_attribute(etree, path, attribute):
    """
    Get an attribute from a node grabbed from xpath.
    If not found, return None.
    """
    try:
        return etree.xpath(path)[0].attrib[attribute]
    except (IndexError, KeyError):
        return None

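# Minimal usage sketch for _get_xpath_attribute() (document content invented): a hit
# returns the attribute value; a missing node or missing attribute both fall through
# to None via the except clause.
import lxml.etree

doc = lxml.etree.fromstring('<root><a href="https://example.org">link</a></root>')
assert _get_xpath_attribute(doc, '//a', 'href') == 'https://example.org'
assert _get_xpath_attribute(doc, '//a', 'title') is None       # attribute missing
assert _get_xpath_attribute(doc, '//missing', 'href') is None  # node missing
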
def get(self, url, headers, encode, xpath, func, num):
    rps1 = requests.get(url, headers=headers)
    rps1.encoding = encode
    self.textData = rps1.text
    if len(xpath) > 0:
        # parse the fetched page before evaluating the XPath expression
        ex = etree.HTML(self.textData).xpath(xpath[0])
        for i in ex:
            self.nextPageUrl.append(func[0](i))
    print(f'Page {num} ok')

def swap(etree, texts: List[str]) -> List[str]:
    for t in etree.xpath('//a:t', namespaces=self.namespaces):
        if t.text.strip() == '':
            continue
        text = texts.pop()
        if text is None:
            continue
        t.text = text
    return texts

def deal_single_info(datallist, info, j):
    for i, single in enumerate(datallist, j):
        # parse each <dd> fragment on its own before extracting fields
        single = etree.HTML(single, parser=etree.HTMLParser(encoding='utf-8'))
        string = u"类型"  # type
        types = deal_dd_content(string, single)
        string = u"名称"  # name
        name = deal_dd_content(string, single)
        string = u"网址"  # website
        website = deal_dd_content(string, single)
        uuid = ''
        info[i] = [name, types, website, uuid]

def get_lines(etree):
    elems = etree.xpath("/html/body//article/div")
    # elems = [collapse(e) for e in elems[0].iterchildren()]
    elems = [e for e in elems[0].iterchildren()]
    elems = [collapse(e) for e in elems]
    txt = [e.text for e in elems]
    txt = ["" if t is None else t for t in txt]
    txt = [xlat_utf8(t) for t in txt]
    txt = [t.strip() for t in txt]
    txt = [t for t in txt if t]
    return txt

def football(url, date):
    html = lxml.html.parse(url)
    rows_xpath = xpath(
        "//*[@id='content-primary']/table[1]/tbody/tr[td[1]/span/span//text()='%s']" % (date))
    time_xpath = xpath("td[1]/span/span//text()[2]")
    team_xpath = xpath("td[2]/a/text()")
    result_xpath = xpath("td[3]/a/span/text()")
    place_xpath = xpath("td[4]/a/text()")
    details = []
    for row in rows_xpath(html):
        time = time_xpath(row)[0].strip()
        team = team_xpath(row)[0]
        score = result_xpath(row)[0]
        venue = place_xpath(row)[0]
        details.append([time, team, score, venue])
    return details

def mmio_regions(etree):
    ret = []
    resources = etree.xpath("//resources/mmio")
    for res in resources:
        base = res.get("min")
        top = res.get("max")
        dev = res.getparent().getparent()
        obj = dev.get("object")
        ret.append((obj, int(base, base=16), int(top, base=16)))
    return sorted(ret, key=lambda x: (x[1], x[2]))

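# Hedged sketch of the XML shape mmio_regions() appears to expect, inferred from the two
# getparent() calls: each <mmio> lives under <resources>, and the grandparent element
# carries the "object" attribute. The surrounding tag names and addresses are invented.
import lxml.etree

SAMPLE = b"""
<devices>
  <device object="uart0">
    <resources>
      <mmio min="0xfe201000" max="0xfe201fff"/>
    </resources>
  </device>
</devices>
"""

tree = lxml.etree.fromstring(SAMPLE)
print(mmio_regions(tree))  # one (object, base, top) tuple for uart0, addresses in decimal
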
def name(url):
    headers = config.headers_detail
    content, status_code = Send_Request().send_request(url, headers)
    info = {}
    if status_code == 200:
        flag = 1
        result = etree.HTML(content, parser=etree.HTMLParser(encoding='utf-8'))
        dl = result.xpath("//div[@class = 'viewBox']//dl")[0]
        if "企业名称" in content:  # "company name"
            datallist = etree.tostring(dl).split(
                '<dd style="border-bottom:1px solid #AE0000;padding-bottom:10px;">')
            datallist.remove(datallist[-1])
            pattern = re.compile(u".*共(.*?)页.*")  # "... N pages in total ..."
            number = re.findall(pattern, content)
            if len(number) == 1:
                totalpage = int(number[0])
            else:
                totalpage = 0
            if int(totalpage) == 1:
                j = 0
                deal_single_info(datallist, info, j)
            else:
                j = 0
                deal_single_info(datallist, info, j)
                entid = deal_html_code.match_entid(url)
                cid = deal_html_code.match_cid(url)
                href = out_invest_url.format(entid, cid)
                for k in xrange(2, totalpage + 1):
                    content, status_code = Send_Request().send_request(href)
                    if status_code == 200:
                        start = k * 5 + 1
                        result = etree.HTML(
                            content, parser=etree.HTMLParser(encoding='utf-8'))
                        dl = result.xpath("//div[@class='viewBox']/dl")[0]
                        datalist = etree.tostring(dl).split(
                            '<dd style="border-bottom:1px solid #AE0000;padding-bottom:10px;">')
                        if len(datalist) > 0:
                            datalist.remove(datalist[-1])
                        deal_single_info(datalist, info, start)
                    else:
                        pass
        else:
            flag = 100000004
    else:
        flag = 100000004
    if flag == 1:
        info = deal_html_code.remove_repeat(info)
    return info, flag

def parsesys(self, etree, syspath):
    """Give blocks and lines objects lists from Block and Line elements in XML

    args : etree of the xml file / path of the <System> tag in the xml
    return : blocks dict and lines list"""
    blocks = {}
    lines = []
    blockclass = {
        'Gain': Gain,
        'Delay': Delay,
        'Sum': Sum,
        'SubSystem': SubSystem,
        'Inport': Inport,
        'Outport': Outport
    }
    for mysys in etree.xpath(syspath):
        # iter() instead of findall(): recursive search
        blockels = mysys.iter("Block")
        lineels = mysys.iter("Line")

        # Fill blocks dict
        for b in blockels:
            blocktype = b.get("BlockType")
            try:
                if blocktype == 'Gain':
                    newblock = blockclass[blocktype](b, self.constants)
                else:
                    newblock = blockclass[blocktype](b)
            except KeyError:
                print(blocktype + " is an unsupported block")
                sys.exit()
            blocks[b.get("SID")] = newblock

        # Fill lines list
        for l in lineels:
            # find all children 'P' tags; there must be only one 'Src'
            for p in l.findall("P"):
                if p.get("Name") == "Src":
                    srctext = p.text
            # find all 'P' tags recursively; there can be more than one 'Dst'
            for p in l.iter("P"):
                if p.get("Name") == "Dst":
                    dsttext = p.text
            newline = Line(srctext, dsttext)
            lines.append(newline)
    return (blocks, lines)

def get_words_only(word_tree):
    """Flatten the entire page into just an array of words, discarding line information."""
    word_array = []
    # Select every OCR word span on the page. The original query filtered on a line id that
    # is not available here, so the per-line predicate is dropped (assumed intent).
    xpath_query = ("//body/div[@class='ocr_page']/div[@class='ocr_carea']/p[@class='ocr_par']"
                   "/span[@class='ocr_line']/span[starts-with(@class, 'ocr')]")
    # Older variants matched span[@class='ocr_word'] or span[@class='ocrx_word'] explicitly.
    hocr_words = word_tree.xpath(xpath_query)
    for word in hocr_words:
        # The text may be contained in this span, but it may also be contained in a child element.
        word_array.append({
            'bbox': get_bbox_from_title(word.attrib['title']),
            'text': tostring(word, method="text", encoding='UTF-8'),
            'word_num': word.attrib['id'].replace("word_", "")
        })
    return word_array

def get_dl():
    from lxml import etree
    url = 'https://www.kxjf.com/user/login?mainSiteName=kxd'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36',
        'Host': 'www.kxjf.com',
        'Referer': 'https://www.gkkxd.com/userAuth/login',
    }
    response = requests.get(url, headers=headers)
    tree = etree.HTML(response.text)
    dlmy = tree.xpath('//*[@id="dlmy"]/@value')[0]
    return dlmy

def topic2question(spider, url):
    topic_id = re.search('[0-9]+', url).group()
    rs_f = file(os.path.join(DATA_DIR, topic_id + '.txt'), 'w')
    cur_page = 0
    next_url = url
    for tmp in range(51):
        time.sleep(TIME_SLEEP)
        content = spider_get(spider, next_url, retry=RETRY)
        etree = lxml.html.fromstring(content)
        link_xpath = '//a[@class="question_link"]/@href'
        link = etree.xpath(link_xpath)
        link2 = set(link)
        for i in link2:
            rs_f.write("http://zhihu.com" + i + '\n')
        # next page
        next_page_link_xpath = '//div[@class="zm-invite-pager"]/span/a/@href'
        next_page_link = etree.xpath(next_page_link_xpath)
        if len(next_page_link) == 0:
            with open(
                    os.path.join(ERROR_DIR, topic_id + '_' + str(cur_page) + ".txt"),
                    'w') as f:
                f.write(content)
            break
        last_page = next_page_link[-1]
        tmp = re.search('[0-9]+', last_page)
        if tmp:
            last_page = tmp.group()
        else:
            break
        print topic_id, cur_page, last_page
        if cur_page < int(last_page):
            cur_page = int(last_page)
            next_url = url + '?page=' + last_page
        else:
            break

</div>
<div class="row">
    <div class="col-lg-8 col-lg-offset-2">
        <!-- To configure the contact form email address, go to mail/contact_me.php and update the email address in the PHP file on line 19. -->
        <!-- The form should work on most web servers, but if the form is not working you may need to configure your web server differently. -->
        <form METHOD="post" ACTION="resultaat.py">
            <div class="row control-group">
                <div class="form-group col-xs-12 floating-label-form-group controls">
                    <label>Welke Agent analyseren</label><BR>
                    <BR>
                    <SELECT class="form-control" NAME="ip">
                        <OPTION value=""> --- Welke Agent analyseren --- </OPTION>
''')

# Create the dropdown options from the client list in dropdown.xml
# (Dutch UI labels: "Welke Agent analyseren" = "Which agent to analyse")
etree = etree.parse('dropdown.xml')
aantal = int(etree.xpath('/clients/aantal[1]/text()')[0])
for x in range(1, aantal + 1):
    location = '/clients/ipadres[{}]/text()'.format(x)
    ipaddress = etree.xpath(location)[0]
    print('<OPTION value="' + ipaddress + '">' + ipaddress + '</OPTION>')

print('''
                        <OPTION value="Anders">Anders</OPTION>
                    </SELECT>
                </div>
            </div>
            <div class="row control-group">
                <div class="form-group col-xs-12 floating-label-form-group controls">
                    <label>IP Adres</label>
                    <input class="col-xs-12" type="text" Name="invoer_ip" placeholder="Indien anders vul hier het IP Adres in.">

def do_xpath(xpath, etree):
    """Process an uncompiled XPath expression on an Element Tree"""
    try:
        return etree.xpath(xpath)
    except (lxml.etree.XPathSyntaxError, lxml.etree.XPathEvalError):
        raise InvalidXPathExpression(sys.exc_info()[:2], value=xpath)

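# Minimal usage sketch for do_xpath() (document invented): a valid expression returns lxml's
# normal xpath() result, while a malformed one surfaces as the project's InvalidXPathExpression
# instead of a raw lxml error.
import lxml.etree

doc = lxml.etree.fromstring('<root><a/><a/></root>')
print(len(do_xpath('//a', doc)))   # -> 2
try:
    do_xpath('//a[', doc)          # malformed expression
except InvalidXPathExpression:
    print('bad xpath rejected')
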