Exemplo n.º 1
0
def fetch_images(etree):
    """Download every <img> referenced by the page into a temp directory.

    Yields an OrderedDict mapping the local filename of each downloaded
    image to the original ``src`` attribute value, while the current
    working directory is the temporary directory (see pushd_temp_dir).
    """
    base_url = etree.xpath('//base/@url')[0]
    with pushd_temp_dir():
        filename_to_node = collections.OrderedDict()
        #
        # Extract the image files into the current directory
        #
        imgurls = etree.xpath('//img//@src')
        for imgurl in imgurls:
            # Normalize to an absolute URL:
            # - protocol-relative ("//host/...") URLs need a scheme; the
            #   original 'http' + url produced the invalid "http//..." form
            # - site-relative paths resolve against the <base> URL
            newimgurl = imgurl
            if imgurl[:2] == '//':
                newimgurl = 'http:' + imgurl
            elif imgurl[:4] != 'http':
                newimgurl = base_url + imgurl

            headers = {
                'User-Agent':
                'PurdueUniversityClassProject/1.0 ({}@purdue.edu https://goo.gl/dk8u5S)'
                .format(USERNAME)
            }
            request = urllib.request.Request(url=newimgurl, headers=headers)
            resp = urllib.request.urlopen(request)
            filename = make_filename(newimgurl, 'jpg')
            with open(filename, "wb") as f:
                f.write(resp.read())
            filename_to_node[filename] = imgurl
            app.logger.info('fetch_images {} {}'.format(imgurl, filename))
        yield filename_to_node
Exemplo n.º 2
0
 def init_response_attributes(self, etree):
     """Populate backend_transaction_id and status from a signature
     response tree; raise ResponseParseError when the expected elements
     or attributes are missing."""
     ns = {'ns6': self.ns_namespace}
     try:
         trans_elem = etree.xpath('//MSS_SignatureResp')[0]
         status_elem = etree.xpath('//ns6:StatusCode', namespaces=ns)[0]
         self.backend_transaction_id = trans_elem.attrib['MSSP_TransID']
         self.status = status_elem.attrib['Value']
     except (IndexError, KeyError, lxml.etree.XMLSchemaError) as e:
         raise ResponseParseError(
             'Cannot parse signature response: %s. Response content: %s'
             % (e, lxml.etree.tostring(etree)))
Exemplo n.º 3
0
    def init_response_attributes(self, etree):
        """Set self.status (and self.civil_number when present) from a
        status response tree; raise ResponseParseError on malformed
        responses."""
        try:
            status_code = etree.xpath(
                '//ns5:StatusCode',
                namespaces={'ns5': self.ns_namespace})[0].attrib['Value']
        except (IndexError, KeyError, lxml.etree.XMLSchemaError) as e:
            raise ResponseParseError(
                'Cannot parse status response: %s. Response content: %s' %
                (e, lxml.etree.tostring(etree)))
        self.status = Statuses.map(status_code)

        identifiers = etree.xpath('//ns4:UserIdentifier',
                                  namespaces={'ns4': self.ns_namespace})
        if not identifiers:
            # civil number tag does not exist - this is possible if the
            # request is still processing
            return
        civil_number_tag = identifiers[0]
        parts = civil_number_tag.text.split('=')
        if len(parts) < 2:
            raise ResponseParseError(
                'Cannot get civil_number from tag text: %s' %
                civil_number_tag.text)
        self.civil_number = parts[1]
Exemplo n.º 4
0
 def insert(cls, etree):
     """Create an empty <item> element, attach it to the tree, and return
     a new instance wrapping it.

     The element is placed before the first existing <item>, or appended
     to the <channel> element when the feed has no items yet.
     """
     new_elt = lxml.etree.Element("item")
     existing_items = etree.xpath("//item")
     if existing_items:
         existing_items[0].addprevious(new_elt)
     else:
         # xpath() returns a list; the element itself must receive the
         # append — the original appended to the Python list, so the
         # node was never inserted into the tree.
         etree.xpath("/rss/channel")[0].append(new_elt)
     new_item = cls(etree, new_elt)
     new_item.parent_id = 0
     return new_item
Exemplo n.º 5
0
 def get_info(self):
     """Scrape the company's abnormal-operation ("经营异常") records.

     Returns a dict keyed by row index; each value holds the reason/date
     the record was entered and removed plus the authorities involved.
     Returns an empty dict when the page cannot be fetched.
     """
     headers = config.headers
     url = self._url.format(self._pripid)
     result, status_code = Send_Request().send_requests(url, headers)
     info = {}
     if status_code == 200:
         # etree.HTML parses the raw page; the original called
         # etree.xpath(result, ...) which does not exist at module level.
         data = etree.HTML(result,
                           parser=etree.HTMLParser(encoding='utf-8'))
         # '@id' attribute test — the original bare 'id' element test
         # matched nothing.
         tr_list = data.xpath(
             "//table[@id='table_jyyc']//tr[@name='jyyc']")
         for i, singledata in enumerate(tr_list):
             temp = {}
             td_list = singledata.xpath("./td")
             temp["types"] = '经营异常'
             temp["in_reason"] = deal_html_code.remove_symbol(
                 td_list[1].xpath("string(.)"))
             in_date = deal_html_code.remove_symbol(
                 td_list[2].xpath("string(.)"))
             temp["in_date"] = deal_html_code.change_chinese_date(in_date)
             temp["out_reason"] = deal_html_code.remove_symbol(
                 td_list[4].xpath("string(.)"))
             out_date = deal_html_code.remove_symbol(
                 td_list[5].xpath("string(.)"))
             temp["out_date"] = deal_html_code.change_chinese_date(out_date)
             temp["gov_dept"] = deal_html_code.remove_symbol(
                 td_list[6].xpath("string(.)"))
             temp["out_gov"] = deal_html_code.remove_symbol(
                 td_list[7].xpath("string(.)"))
             info[i] = temp
     return info
Exemplo n.º 6
0
def get_words_from_line(etree, line_id, line_num):
    """Return a list of word dicts (bbox/text/word_num/line_num) for one
    hOCR line span, identified by its id attribute.

    Flattens the line into word records; line layout information is kept
    only through the caller-supplied line_num.
    """
    # hOCR producers vary; starts-with(@class, 'ocr') covers both
    # 'ocr_word' and 'ocrx_word' spans on fairly uniform documents.
    query = ("//body/div[@class='ocr_page']/div[@class='ocr_carea']"
             "/p[@class='ocr_par']/span[@class='ocr_line' and @id='"
             + line_id + "']/span[starts-with(@class, 'ocr')]")
    words = []
    for span in etree.xpath(query):
        # The text may live on this span or in a child element, so
        # serialize with method="text" to collect it either way.
        words.append({
            'bbox': get_bbox_from_title(span.attrib['title']),
            'text': simple_clean(tostring(span, method="text",
                                          encoding='UTF-8')),
            'word_num': span.attrib['id'].replace("word_", ""),
            'line_num': line_num,
        })
    return words
Exemplo n.º 7
0
 def set_adjustment_values(self, builder,
                           etree):  # fix defaults not loading
     """Work around Glade/GtkBuilder not applying GtkAdjustment default
     values by pushing each declared value onto the built object."""
     adjustments = etree.xpath('/interface/object[@class="GtkAdjustment"]')
     for adjustment in adjustments:
         value_props = adjustment.xpath('property[@name="value"]')
         if value_props:
             widget = builder.get_object(adjustment.get('id'))
             widget.set_value(float(value_props[0].text))
Exemplo n.º 8
0
 def set_adjustment_values(builder, etree):
     """ Glade default adjustment values fix """
     # Glade/GtkBuilder sometimes fails to apply an adjustment's default
     # value, so push each declared value onto the built object manually.
     for adjustment in etree.xpath('/interface/object[@class="GtkAdjustment"]'):
         value_props = adjustment.xpath('property[@name="value"]')
         if value_props:
             builder.get_object(adjustment.get('id')).set_value(
                 float(value_props[0].text))
Exemplo n.º 9
0
def copy_profile_photo_to_static(etree):
    """Download the page's profile photo into the project's static dir.

    Returns (original_url, local_filename), or ('', '') when the page has
    no profile photo.
    """
    base_url = etree.xpath('//base/@url')[0]
    profileUrl = find_profile_photo_filename(etree)
    if profileUrl == '':
        return '', ''
    proj_dir = sys.path[0]  # e.g., "/home/ecegridfs/a/ee364z15/hpo"
    static_dir = os.path.join(
        proj_dir, "static")  # e.g., "/home/ecegridfs/a/ee364z15/hpo/static"
    # Normalize to an absolute URL:
    # - protocol-relative ("//host/...") URLs need a scheme; the original
    #   'http' + url produced the invalid "http//..." form
    # - site-relative paths resolve against the <base> URL
    newProfileUrl = profileUrl
    if profileUrl[:2] == '//':
        newProfileUrl = 'http:' + profileUrl
    elif profileUrl[:4] != 'http':
        newProfileUrl = base_url + profileUrl
    headers = {
        'User-Agent':
        'PurdueUniversityClassProject/1.0 ({}@purdue.edu https://goo.gl/dk8u5S)'
        .format(USERNAME)
    }
    app.logger.info('request for {}'.format(newProfileUrl))
    request = urllib.request.Request(url=newProfileUrl, headers=headers)
    resp = urllib.request.urlopen(request)
    filename = make_filename(newProfileUrl, 'jpg')
    with open(os.path.join(static_dir, filename), "wb") as f:
        f.write(resp.read())
    return profileUrl, filename
Exemplo n.º 10
0
def get_attributes_of_xpaths(etree, x_paths, namespaces, attr='id'):
    """ Determine the values of the attributes of the objects that match each XPath

    Args:
        etree (:obj:`lxml.etree._ElementTree`): element tree for XML document
        x_paths (:obj:`list` of `str`): XPaths
        namespaces (:obj:`dict`): dictionary that maps the prefixes of namespaces to their URIs
        attr (:obj:`str` or :obj:`dict`, optional): attribute to get values of

    Returns:
        :obj:`dict` of :obj:`str` to :obj:`list` of :obj:`str`: dictionary that maps each XPath to the
            values of the attribute of the objects in the XML file that match the XPath
    """
    # An attribute given as a dict is namespace-qualified: '{uri}name'
    if isinstance(attr, dict):
        attr = '{{{}}}{}'.format(attr['namespace']['uri'], attr['name'])

    results = {}
    for expression in x_paths:
        try:
            matches = etree.xpath(
                expression,
                namespaces=get_namespaces_with_prefixes(namespaces))
            results[expression] = [el.attrib.get(attr, None)
                                   for el in matches]
        except Exception:
            # Invalid expressions simply yield no attribute values
            results[expression] = []

    return results
Exemplo n.º 11
0
def name(url):
    """Scrape shareholder-change records from a company detail page.

    Returns (info, flag): info maps row index -> [shareholder, percent
    before, percent after, change date]; flag is 1 on success, 100000004
    on a failed fetch.
    """
    headers = config.headers_detail
    content, status_code = Send_Request().send_request(url, headers)
    info = {}
    if status_code == 200:
        flag = 1
        result = etree.HTML(content, parser=etree.HTMLParser(encoding='utf-8'))
        # .xpath(), not .path() (which does not exist on lxml elements)
        dl = result.xpath("//div[@class= 'viewBox']/dl")[0]
        datallist = etree.tostring(dl).split(
            '<dd style="border-bottom:1px solid #AE0000;padding-bottom:10px;">'
        )
        datallist.remove(datallist[-1])
        for i, single in enumerate(datallist):
            # Parse this row's HTML fragment; the original re-parsed the
            # whole page through a non-existent module-level etree.xpath.
            single = etree.HTML(single,
                                parser=etree.HTMLParser(encoding='utf-8'))
            string = u"股东"
            name = deal_dd_content(string, single)
            string = u"变更前"
            percent_pre = deal_dd_content(string, single)
            string = u"变更后"
            percent_after = deal_dd_content(string, single)
            string = u"变更日期"
            dates = deal_dd_content(string, single)
            info[i] = [name, percent_pre, percent_after, dates]
    else:
        flag = 100000004
    if flag == 1:
        deal_html_code.remove_repeat(info)
    return info, flag
Exemplo n.º 12
0
 def get_detail_info(self, detail_url, info):
     """Fill *info* with mortgage detail fields scraped from detail_url.

     Returns (person_info, goods_info): the table elements describing the
     mortgagee and the mortgaged goods, or empty dicts on fetch failure.
     """
     # Chinese table captions -> keys written into *info*
     # (renamed from 'dict', which shadowed the builtin)
     field_map = {
         u"种类": "cates",
         u"范围": "ranges",
         u"期限": "period",
         u"备注": "remark",
     }
     headers = config.headers
     result, status_code = Send_Request().send_requests(detail_url, headers)
     if status_code == 200:
         # etree.HTML parses the page; the original called the
         # non-existent module-level etree.xpath.
         data = etree.HTML(result,
                           parser=etree.HTMLParser(encoding='utf-8'))
         string = u"被担保债权概况信息"
         table = data.xpath("//*[contains(.,'%s')]" % string)[0]
         for key, value in field_map.iteritems():
             info[value] = deal_html_code.get_match_info(key, table)
         string = u"抵押权人概况信息"
         person_info = data.xpath("//*[contains(.,'%s')]" % string)[0]
         string = u"抵押权物概况信息"
         goods_info = data.xpath("//*[contains(.,'%s')]" % string)[0]
     else:
         info["cates"] = ''
         info["ranges"] = ''
         info["period"] = ''
         info["remark"] = ''
         person_info = {}
         goods_info = {}
     return person_info, goods_info
Exemplo n.º 13
0
 def set_adjustment_values(builder,etree):
     """ Glade default adjustment values fix """
     # Push each GtkAdjustment's declared default value onto the built
     # object, since GtkBuilder does not always apply it.
     adjustments = etree.xpath('/interface/object[@class="GtkAdjustment"]')
     for adjustment in adjustments:
         value_props = adjustment.xpath('property[@name="value"]')
         if value_props:
             target = builder.get_object(adjustment.get('id'))
             target.set_value(float(value_props[0].text))
Exemplo n.º 14
0
    def __init__(self, filename):
        """Load XML records from *filename* into typed objects and build
        per-field lookup indices.
        """
        self.objects = []
        self.indices = {}

        # Concrete record type built from the configured name/base class
        self.data_class = type(self.data_class_name, (self.data_class_base, ),
                               {})

        # 'with' guarantees the handle is closed; the original leaked it.
        with open(filename, 'rb') as f:
            etree = lxml.etree.parse(f)

        for entry in etree.xpath('//%s' % self.xml_tag):
            mapped_data = {}
            for key in entry.keys():
                mapped_data[self.field_map[key]] = entry.get(key)
            entry_obj = self.data_class(entry, **mapped_data)
            self.objects.append(entry_obj)

        # Create indices
        for key in self.field_map.values():
            self.indices[key] = {}

        # Update indices (the last object with a given value wins)
        for obj in self.objects:
            for key in self.field_map.values():
                value = getattr(obj, key, None)
                if value is None:
                    continue
                self.indices[key][value] = obj
Exemplo n.º 15
0
def find_elements(etree):
    """Yield {'ip', 'port'} dicts for HTTPS proxies listed in the
    "domestic high-anonymity" (国内高匿) section of the page's table."""
    # 0 until the high-anonymity section header has been seen
    in_high_anonymity = 0
    for row in etree.xpath('//tr'):
        headers = row.xpath('th')
        h2 = headers[0].xpath('h2') if len(headers) else []
        if len(h2):
            print(h2[0].text)
        if in_high_anonymity == 0:
            # Skip rows until the section of interest starts
            if len(h2) and h2[0].text == "国内高匿代理IP":
                in_high_anonymity = 1
            continue
        if len(h2) and h2[0].text == "国内透明代理IP":
            # The next section started; stop scanning
            break
        cells = row.xpath('td')
        if len(cells) and cells[5].text == "HTTPS":
            yield {'ip': cells[1].text, 'port': cells[2].text}
Exemplo n.º 16
0
 def init_response_attributes(self, etree):
     """Record an ERRED status and extract the SOAP fault text into
     self.details; raise ResponseParseError on malformed responses."""
     self.status = Statuses.ERRED
     ns = {'soapenv': self.soapenv_namespace}
     try:
         fault_texts = etree.xpath('//soapenv:Text', namespaces=ns)
         self.details = fault_texts[0].text
     except (IndexError, lxml.etree.XMLSchemaError) as e:
         raise ResponseParseError(
             'Cannot parse error status response: %s. Response content: %s'
             % (e, lxml.etree.tostring(etree)))
def get_topics(spider, id, topic_f, topic_log_f):
    '''Fetch one Zhihu topic's metadata, retrying until the page loads.

    spider: requests session object
    id: topic id
    topic_f: handle of layer*.json
    topic_log_f: handle of xxxx-xx-xx_crawler.log
    '''
    url = 'https://www.zhihu.com/topic/' + id + '/organize/entire'
    time.sleep(0.1)

    for counter1 in range(1000):
        try:
            res = spider.get(url,
                             headers=CUR_HEADERS_BASE,
                             timeout=CUR_TIMEOUT_QUERY)

            content = res.content
            etree = lxml.html.fromstring(content)

            people_xpath = '//div[@class="zm-topic-side-followers-info"]//strong/text()'
            des_xpath = '//div[@id="zh-topic-desc"]'
            name_xpath = '//h1/text()'
            etree_id = etree.xpath(people_xpath)
            etree_des = etree.xpath(des_xpath)
            etree_name = etree.xpath(name_xpath)

            topic = dict()
            topic['id'] = id
            topic['followers'] = etree_id[0] if len(etree_id) > 0 else 0
            # Guard on etree_des — the original tested etree_id here,
            # which raised IndexError whenever the description block was
            # missing but the follower count was present.
            topic['des'] = lxml.html.tostring(
                etree_des[0], encoding='utf8') if len(etree_des) > 0 else 'NULL'
            topic['name'] = etree_name[0]

            with open(_XSRF_GLOBAL_FIELNAME) as f:
                _xsrf = f.read()

            data_xsrf = {'_xsrf': _xsrf}

        except BaseException as e:
            # 'as' form is valid on both Python 2.6+ and Python 3
            if str(e) != 'None':
                logging.debug('Fail to fetch the page. Error: {0}.'.format(e))
            logging.debug(url + ' wait for a sec to recrawl')
            time.sleep(1)
        else:
            # NOTE(review): the first two successful iterations do not
            # break (counter1 > 1) — looks like an intentional re-fetch,
            # but confirm; left unchanged here.
            if counter1 > 1:
                logging.info(url + ' html data get!')
                break
Exemplo n.º 18
0
def query_etree(etree):
    """Return every infoTable element (one per holding) from a 13F
    information-table XML etree."""
    ns = {
        "informationTable":
        "http://www.sec.gov/edgar/document/thirteenf/informationtable"
    }
    return etree.xpath("//informationTable:infoTable", namespaces=ns)
Exemplo n.º 19
0
 def extract(self) -> List[str]:
     """Collect the non-blank text of every <a:t> run across all trees."""
     collected: List[str] = []
     for tree in self.etrees:
         for node in tree.xpath('//a:t', namespaces=self.namespaces):
             if node.text.strip() != '':
                 collected.append(node.text)
     return collected
Exemplo n.º 20
0
def test_add_mdreference(testpath):
    """Test add_reference function. Registers the same reference for two
    files, writes md-references.xml, and checks both entries exist.
    """
    creator = utils.MdCreator(testpath)
    creator.add_reference('abcd1234', 'path/to/file1')
    creator.add_reference('abcd1234', 'path/to/file2')
    creator.write_references()

    # Parse the written file; a reference should exist for each path
    tree = lxml.etree.parse(os.path.join(testpath, 'md-references.xml'))
    for path in ('path/to/file1', 'path/to/file2'):
        hits = tree.xpath(
            '/mdReferences/mdReference[@file="%s"]' % path)
        assert hits[0].text == 'abcd1234'
Exemplo n.º 21
0
 def extract(self) -> List[str]:
     """Gather the text of all non-blank <a:t> runs from every tree."""
     out: List[str] = []
     for tree in self.etrees:
         nodes = tree.xpath('//a:t', namespaces=self.namespaces)
         out.extend(n.text for n in nodes if n.text.strip() != '')
     return out
Exemplo n.º 22
0
def _get_xpath_attribute(etree, path, attribute):
    """
    Get an attribute from a node grabbed from xpath.
    If not found, return None.
    """
    try:
        return etree.xpath(path)[0].attrib[attribute]
    except IndexError, KeyError:
        return None
	def get(self,url,headers,encode,xpath,func,num):
		"""Fetch *url*, store its decoded body on self.textData, and queue
		follow-up page URLs extracted by the first xpath/func pair.

		xpath: list of XPath strings (only xpath[0] is used here)
		func: list of callables; func[0] maps an XPath match to a URL
		num: page number, used only for the progress message
		"""
		rps1=requests.get(url,headers=headers)
		rps1.encoding=encode
		self.textData=rps1.text
		if len(xpath)>0:
			# NOTE(review): lxml.etree has no module-level xpath();
			# presumably this should parse self.textData first, e.g.
			# etree.HTML(self.textData).xpath(xpath[0]) — confirm.
			ex=etree.xpath(xpath[0])
			for i in ex:
				self.nextPageUrl.append(func[0](i))
		print(f'Page {num} ok')
Exemplo n.º 24
0
 def swap(etree, texts: List[str]) -> List[str]:
     """Replace the text of each non-blank <a:t> run in *etree* with
     values popped from the end of *texts*; return the remaining texts.

     Runs whose replacement value is None are left unchanged.
     """
     # NOTE(review): 'self' is not in scope here — the namespaces lookup
     # raises NameError as written. Presumably this method should take
     # self (or close over it) — confirm against the caller.
     for t in etree.xpath('//a:t', namespaces=self.namespaces):
         if t.text.strip() == '':
             continue
         text = texts.pop()
         if text is None:
             continue
         t.text = text
     return texts
Exemplo n.º 25
0
def _get_xpath_attribute(etree, path, attribute):
    """
    Get an attribute from a node grabbed from xpath.
    If not found, return None.
    """
    try:
        return etree.xpath(path)[0].attrib[attribute]
    except IndexError, KeyError:
        return None
Exemplo n.º 26
0
 def swap(etree, texts: List[str]) -> List[str]:
     """Replace the text of each non-blank <a:t> run in *etree* with
     values popped from the end of *texts*; return the remaining texts.

     Runs whose replacement value is None are left unchanged.
     """
     # NOTE(review): 'self' is not in scope here — the namespaces lookup
     # raises NameError as written. Presumably this method should take
     # self (or close over it) — confirm against the caller.
     for t in etree.xpath('//a:t', namespaces=self.namespaces):
         if t.text.strip() == '':
             continue
         text = texts.pop()
         if text is None:
             continue
         t.text = text
     return texts
Exemplo n.º 27
0
def deal_single_info(datallist, info, j):
    """Parse each HTML fragment in *datallist* and store a
    [name, type, website, uuid] row in *info*, indexed from *j*.
    """
    for i, single in enumerate(datallist, j):
        # etree.HTML parses the fragment; the original called the
        # non-existent module-level etree.xpath.
        single = etree.HTML(single, parser=etree.HTMLParser(encoding='utf-8'))
        string = u"类型"
        types = deal_dd_content(string, single)
        string = u"名称"
        name = deal_dd_content(string, single)
        string = u"网址"
        website = deal_dd_content(string, single)
        uuid = ''
        info[i] = [name, types, website, uuid]
Exemplo n.º 28
0
def get_lines(etree):
    """Return the cleaned, non-empty text of each child element under the
    article's first div."""
    containers = etree.xpath("/html/body//article/div")
    collapsed = [collapse(child) for child in containers[0].iterchildren()]
    lines = []
    for elem in collapsed:
        raw = elem.text if elem.text is not None else ""
        cleaned = xlat_utf8(raw).strip()
        if cleaned:
            lines.append(cleaned)
    return lines
Exemplo n.º 29
0
def football(url, date):
    """Scrape fixtures for *date* from the results table at *url*.

    Returns a list of [time, team, score, venue] rows.
    """
    # Parse the requested page; the original referenced an undefined
    # 'pages' variable and an undefined bare 'xpath' factory.
    html = lxml.html.parse(url)

    # Compiled XPath expressions (lxml.etree.XPath objects are callable)
    rows_xpath = lxml.etree.XPath(
        "//*[@id='content-primary']/table[1]/tbody/tr[td[1]/span/span//text()='%s']"
        % (date))
    time_xpath = lxml.etree.XPath("td[1]/span/span//text()[2]")
    team_xpath = lxml.etree.XPath("td[2]/a/text()")
    result_xpath = lxml.etree.XPath("td[3]/a/span/text()")
    place_xpath = lxml.etree.XPath("td[4]/a/text()")

    details = []
    for row in rows_xpath(html):
        time = time_xpath(row)[0].strip()
        team = team_xpath(row)[0]
        score = result_xpath(row)[0]
        venue = place_xpath(row)[0]
        details.append([time, team, score, venue])

    return details
Exemplo n.º 30
0
def mmio_regions(etree):
    """Return (object, min, max) tuples for every //resources/mmio entry,
    sorted by address range. The min/max attributes are hex strings."""
    regions = []
    for node in etree.xpath("//resources/mmio"):
        # The owning device is two levels above the <mmio> entry
        device = node.getparent().getparent()
        regions.append((
            device.get("object"),
            int(node.get("min"), base=16),
            int(node.get("max"), base=16),
        ))
    regions.sort(key=lambda region: (region[1], region[2]))
    return regions
Exemplo n.º 31
0
def name(url):
    """Scrape outbound-investment entries from a company detail page,
    following pagination when the list spans multiple pages.

    Returns (info, flag): info maps row index -> entry data; flag is 1 on
    success, 100000004 on failure.
    """
    headers = config.headers_detail
    content, status_code = Send_Request().send_request(url, headers)
    # Initialized up-front: the original left 'info' unbound (NameError)
    # when the first request did not return 200.
    info = {}
    if status_code == 200:
        flag = 1
        # etree.HTML parses the page; module-level etree.xpath does not
        # exist in lxml.
        result = etree.HTML(content,
                            parser=etree.HTMLParser(encoding='utf-8'))
        # Fixed broken quoting: "@class = viewBox']" was an invalid XPath
        dl = result.xpath("//div[@class='viewBox']//dl")[0]
        if "企业名称" in content:
            datallist = etree.tostring(dl).split(
                '<dd style="border-bottom:1px solid #AE0000;padding-bottom:10px;">'
            )
            datallist.remove(datallist[-1])
            pattern = re.compile(u".*共(.*?)页.*")
            number = re.findall(pattern, content)
            if len(number) == 1:
                totalpage = int(number[0])
            else:
                totalpage = 0
            if int(totalpage) == 1:
                j = 0
                deal_single_info(datallist, info, j)
            else:
                j = 0
                deal_single_info(datallist, info, j)
                entid = deal_html_code.match_entid(url)
                cid = deal_html_code.match_cid(url)
                href = out_invest_url.format(entid, cid)
                for k in xrange(2, totalpage + 1):
                    content, status_code = Send_Request().send_request(href)
                    if status_code == 200:
                        # 5 rows per page; offset into the global index
                        start = k * 5 + 1
                        result = etree.HTML(
                            content, parser=etree.HTMLParser(encoding='utf-8'))
                        dl = result.xpath("//div[@class='viewBox']/dl")[0]
                        datalist = etree.tostring(dl).split(
                            '<dd style="border-bottom:1px solid #AE0000;padding-bottom:10px;">'
                        )

                        if len(datalist) > 0:
                            datalist.remove(datalist[-1])
                            deal_single_info(datalist, info, start)
                    else:
                        pass
        else:
            flag = 100000004

    else:
        flag = 100000004
    if flag == 1:
        info = deal_html_code.remove_repeat(info)
    return info, flag
Exemplo n.º 32
0
    def parsesys(self, etree, syspath):
        """Build block and line objects from the Block and Line elements
        found under the <System> tag located by *syspath* in the XML.

        args: etree of the xml file / path of the <System> tag in the xml
        return: (blocks, lines) — blocks is a dict keyed by each block's
        SID attribute; lines is a flat list of Line objects.
        """

        blocks = {}
        lines = []
        # Supported Simulink block types mapped to their wrapper classes
        blockclass = {
            'Gain': Gain,
            'Delay': Delay,
            'Sum': Sum,
            'SubSystem': SubSystem,
            'Inport': Inport,
            'Outport': Outport
        }

        # NOTE(review): if syspath matches several systems, only the last
        # match's iterators survive this loop; if it matches none,
        # blockels/lineels are never bound and the loops below raise
        # NameError — confirm syspath always matches exactly one system.
        for mysys in etree.xpath(syspath):
            #print etree.getpath(mysys)
            # iter() instead of findall(): recursive search
            blockels = mysys.iter("Block")
            lineels = mysys.iter("Line")

        ## Fill blocks dict, keyed by the SID attribute
        for b in blockels:
            blocktype = b.get("BlockType")
            try:
                # Gain blocks additionally receive the constants table
                if blocktype == 'Gain':
                    newblock = blockclass[blocktype](b, self.constants)
                else:
                    newblock = blockclass[blocktype](b)
            except KeyError:
                # Unknown BlockType: abort the whole parse
                print(blocktype + " is an unsupported block")
                sys.exit()
            blocks[b.get("SID")] = newblock

        ## Fill lines list
        for l in lineels:
            # find all children 'P' tag
            # there must be only one 'Src
            for p in l.findall("P"):
                if p.get("Name") == "Src":
                    srctext = p.text

            # find all 'P' tag recursively
            # there can be more than one 'Dst'
            for p in l.iter("P"):
                if p.get("Name") == "Dst":
                    dsttext = p.text
                    newline = Line(srctext, dsttext)
                    lines.append(newline)

        return (blocks, lines)
Exemplo n.º 33
0
def get_words_only(word_tree):
    """Return a list of word dicts (bbox / raw text / word number) for one
    hOCR line.

    NOTE(review): as written this cannot run — 'line_id' and 'etree' are
    not defined in this scope and the 'word_tree' parameter is unused;
    presumably the query should take a line_id argument and run against
    word_tree — confirm against callers.
    """
    word_array = []
    xpath_query = "//body/div[@class='ocr_page']/div[@class='ocr_carea']/p[@class='ocr_par']/span[@class='ocr_line' and @id='" + line_id + "']/span[starts-with(@class, 'ocr')]"
    #xpath_query1 = "//body/div[@class='ocr_page']/div[@class='ocr_carea']/p[@class='ocr_par']/span[@class='ocr_line' and @id='" + line_id + "']/span[@class='ocr_word']"
    #xpath_query2 = "//body/div[@class='ocr_page']/div[@class='ocr_carea']/p[@class='ocr_par']/span[@class='ocr_line' and @id='" + line_id + "']/span[@class='ocrx_word']"
    hocr_words = etree.xpath(xpath_query)
    #if not hocr_words:
    #    hocr_words = etree.xpath(xpath_query2)
    for word in hocr_words:
        #print word.attrib
        # the text may be contained in this span, but it may also be contained in a child element.
        word_array.append({'bbox':get_bbox_from_title(word.attrib['title']), 'text':tostring(word, method="text", encoding='UTF-8'), 'word_num':word.attrib['id'].replace("word_","")})
    return word_array
Exemplo n.º 34
0
 def get_dl():
     """Fetch the kxjf login page and return the value of the #dlmy input."""
     from lxml import etree
     url = 'https://www.kxjf.com/user/login?mainSiteName=kxd'
     headers = {
         'User-Agent':
         'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36',
         'Host': 'www.kxjf.com',
         'Referer': 'https://www.gkkxd.com/userAuth/login',
     }
     response = requests.get(url, headers=headers)
     # Parse under a distinct name (the original rebound the imported
     # 'etree' module itself)
     page = etree.HTML(response.text)
     return page.xpath('//*[@id="dlmy"]/@value')[0]
Exemplo n.º 35
0
def topic2question(spider, url):
    topic_id = re.search('[0-9]+', url).group()
    rs_f = file(os.path.join(DATA_DIR, topic_id + '.txt'), 'w')
    cur_page = 0
    next_url = url

    for tmp in range(51):
        time.sleep(TIME_SLEEP)

        content = spider_get(spider, next_url, retry=RETRY)
        etree = lxml.html.fromstring(content)
        link_xpath = '//a[@class="question_link"]/@href'
        link = etree.xpath(link_xpath)
        link2 = set(link)
        for i in link2:
            rs_f.write("http://zhihu.com" + i + '\n')
        #next page
        next_page_link_xpath = '//div[@class="zm-invite-pager"]/span/a/@href'
        next_page_link = etree.xpath(next_page_link_xpath)
        if len(next_page_link) == 0:
            with open(
                    os.path.join(ERROR_DIR,
                                 topic_id + '_' + str(cur_page) + ".txt"),
                    'w') as f:
                f.write(content)
                break
        last_page = next_page_link[-1]
        tmp = re.search('[0-9]+', last_page)
        if tmp:
            last_page = tmp.group()
        else:
            break
        print topic_id, cur_page, last_page
        if cur_page < int(last_page):
            cur_page = int(last_page)
            next_url = url + '?page=' + last_page
        else:
            break
Exemplo n.º 36
0
            </div>
            <div class="row">
                <div class="col-lg-8 col-lg-offset-2">
                    <!-- To configure the contact form email address, go to mail/contact_me.php and update the email address in the PHP file on line 19. -->
                    <!-- The form should work on most web servers, but if the form is not working you may need to configure your web server differently. -->
                    <form METHOD="post" ACTION="resultaat.py">
                        <div class="row control-group">
                            <div class="form-group col-xs-12 floating-label-form-group controls">
							<label>Welke Agent analyseren</label><BR>
							<BR>
							<SELECT class="form-control" NAME="ip" >
							<OPTION value="" > --- Welke Agent analyseren --- </OPTION>
							''')
#Create Dropdown
# Read the list of known agents from dropdown.xml and emit one <OPTION>
# per IP address into the HTML form printed above.
# NOTE(review): this rebinds the imported 'etree' module name to the
# parsed tree — works, but shadows the module from here on.
etree = etree.parse('dropdown.xml')
aantal = int(etree.xpath('/clients/aantal[1]/text()')[0])
for x in range(aantal):
	x+=1
	# XPath positions are 1-based, hence the increment above
	location = '/clients/ipadres[{}]/text()'.format(x)
	ipaddress = etree.xpath(location)[0]
	print ('<OPTION value="'+ipaddress+'">'+ipaddress+'</OPTION>')
print('''
							<OPTION value="Anders">Anders</OPTION>
							</SELECT>
							
                            </div>
                        </div>
                        <div class="row control-group">
                            <div class="form-group col-xs-12 floating-label-form-group controls">
								<label>IP Adres</label>
                                <input class="col-xs-12" type="text" Name="invoer_ip" placeholder="Indien anders vul hier het IP Adres in.">
def do_xpath(xpath, etree):
    """Process an uncompiled XPath expression on an Element Tree.

    Raises InvalidXPathExpression when the expression cannot be compiled
    or evaluated.
    """
    try:
        result = etree.xpath(xpath)
    except (lxml.etree.XPathSyntaxError, lxml.etree.XPathEvalError):
        raise InvalidXPathExpression(sys.exc_info()[:2], value=xpath)
    return result
Exemplo n.º 38
0
 def set_adjustment_values(self,builder,etree): # fix defaults not loading
     """Apply GtkAdjustment default values that Glade fails to load."""
     for adjustment in etree.xpath('/interface/object[@class="GtkAdjustment"]'):
         props = adjustment.xpath('property[@name="value"]')
         if props:
             target = builder.get_object(adjustment.get('id'))
             target.set_value(float(props[0].text))