def fetch_images(etree):
    base_url = etree.xpath('//base/@url')[0]
    with pushd_temp_dir():
        filename_to_node = collections.OrderedDict()

        # Extract the image files into the current directory
        imgurls = etree.xpath('//img//@src')
        for imgurl in imgurls:
            newimgurl = imgurl
            # Protocol-relative URLs start with "//"
            if imgurl[:2] == '//':
                newimgurl = 'http:' + imgurl
            elif imgurl[:4] != 'http':
                newimgurl = base_url + imgurl
            headers = {
                'User-Agent': 'PurdueUniversityClassProject/1.0 ({}@purdue.edu https://goo.gl/dk8u5S)'
                              .format(USERNAME)
            }
            request = urllib.request.Request(url=newimgurl, headers=headers)
            resp = urllib.request.urlopen(request)
            filename = make_filename(newimgurl, 'jpg')
            with open(filename, "wb") as f:
                f.write(resp.read())
            filename_to_node[filename] = imgurl
            app.logger.info('fetch_images {} {}'.format(imgurl, filename))
        yield filename_to_node

def init_response_attributes(self, etree):
    try:
        self.backend_transaction_id = etree.xpath('//MSS_SignatureResp')[0].attrib['MSSP_TransID']
        self.status = etree.xpath('//ns6:StatusCode', namespaces={'ns6': self.ns_namespace})[0].attrib['Value']
    except (IndexError, KeyError, lxml.etree.XMLSchemaError) as e:
        raise ResponseParseError('Cannot parse signature response: %s. Response content: %s' % (
            e, lxml.etree.tostring(etree)))

def init_response_attributes(self, etree):
    try:
        status_code = etree.xpath('//ns5:StatusCode', namespaces={'ns5': self.ns_namespace})[0].attrib['Value']
    except (IndexError, KeyError, lxml.etree.XMLSchemaError) as e:
        raise ResponseParseError(
            'Cannot parse status response: %s. Response content: %s' % (e, lxml.etree.tostring(etree)))
    self.status = Statuses.map(status_code)
    try:
        civil_number_tag = etree.xpath(
            '//ns4:UserIdentifier', namespaces={'ns4': self.ns_namespace})[0]
    except IndexError:
        # civil number tag does not exist - this is possible if request is still processing
        return
    else:
        try:
            self.civil_number = civil_number_tag.text.split('=')[1]
        except IndexError:
            raise ResponseParseError(
                'Cannot get civil_number from tag text: %s' % civil_number_tag.text)

def insert(cls, etree):
    """Create and return an empty <item> element."""
    new_elt = lxml.etree.Element("item")
    existing_items = etree.xpath("//item")
    if existing_items:
        existing_items[0].addprevious(new_elt)
    else:
        # xpath() returns a list of matching elements; append to the channel element itself
        etree.xpath("/rss/channel")[0].append(new_elt)
    new_item = cls(etree, new_elt)
    new_item.parent_id = 0
    return new_item

def get_info(self):
    headers = config.headers
    url = self._url.format(self._pripid)
    result, status_code = Send_Request().send_requests(url, headers)
    info = {}
    if status_code == 200:
        data = etree.HTML(result, parser=etree.HTMLParser(encoding='utf-8'))
        tr_list = data.xpath(
            "//table[@id='table_jyyc']//tr[@name='jyyc']")
        for i, singledata in enumerate(tr_list):
            temp = {}
            td_list = singledata.xpath("./td")
            temp["types"] = '经营异常'  # "operating abnormality"
            temp["in_reason"] = deal_html_code.remove_symbol(
                td_list[1].xpath("string(.)"))
            in_date = deal_html_code.remove_symbol(
                td_list[2].xpath("string(.)"))
            temp["in_date"] = deal_html_code.change_chinese_date(in_date)
            temp["out_reason"] = deal_html_code.remove_symbol(
                td_list[4].xpath("string(.)"))
            out_date = deal_html_code.remove_symbol(
                td_list[5].xpath("string(.)"))
            temp["out_date"] = deal_html_code.change_chinese_date(out_date)
            temp["gov_dept"] = deal_html_code.remove_symbol(
                td_list[6].xpath("string(.)"))
            temp["out_gov"] = deal_html_code.remove_symbol(
                td_list[7].xpath("string(.)"))
            info[i] = temp
    return info

def get_words_from_line(etree, line_id, line_num):
    """
    Return a list of word dicts for the spans inside the ocr_line with the given id.
    """
    word_array = []
    xpath_query = ("//body/div[@class='ocr_page']/div[@class='ocr_carea']/p[@class='ocr_par']"
                   "/span[@class='ocr_line' and @id='" + line_id + "']"
                   "/span[starts-with(@class, 'ocr')]")
    # Have noodled with this some -- these used to work. It's not clear that xpath really can
    # handle the range of hOCR formats that will be thrown at us, but it works when formats
    # are fairly uniform.
    # xpath_query1 = "//body/div[@class='ocr_page']/div[@class='ocr_carea']/p[@class='ocr_par']/span[@class='ocr_line' and @id='" + line_id + "']/span[@class='ocr_word']"
    # xpath_query2 = "//body/div[@class='ocr_page']/div[@class='ocr_carea']/p[@class='ocr_par']/span[@class='ocr_line' and @id='" + line_id + "']/span[@class='ocrx_word']"
    hocr_words = etree.xpath(xpath_query)
    # hocr_words = etree.xpath(xpath_query1)
    # if not hocr_words:
    #     hocr_words = etree.xpath(xpath_query2)
    for word in hocr_words:
        # The text may be contained in this span, but it may also be contained in a child element.
        word_array.append({
            'bbox': get_bbox_from_title(word.attrib['title']),
            'text': simple_clean(tostring(word, method="text", encoding='UTF-8')),
            'word_num': word.attrib['id'].replace("word_", ""),
            'line_num': line_num
        })
    return word_array

def set_adjustment_values(self, builder, etree):
    # fix defaults not loading
    for object in etree.xpath('/interface/object[@class="GtkAdjustment"]'):
        property = object.xpath('property[@name="value"]')
        if len(property):
            obj = builder.get_object(object.get('id'))
            obj.set_value(float(property[0].text))

def set_adjustment_values(builder, etree):
    """ Glade default adjustment values fix """
    for object in etree.xpath('/interface/object[@class="GtkAdjustment"]'):
        property = object.xpath('property[@name="value"]')
        if len(property):
            obj = builder.get_object(object.get('id'))
            obj.set_value(float(property[0].text))

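# Hedged illustration (ids and values invented, not from the original sources): the shape of
# Glade XML that the adjustment fix above walks. The same XPath used in set_adjustment_values()
# finds the <object> node, and its nested <property name="value"> holds the default that
# Gtk.Builder sometimes fails to apply, hence the explicit set_value() call.
import lxml.etree

GLADE_SNIPPET = b"""
<interface>
  <object class="GtkAdjustment" id="volume_adjustment">
    <property name="value">50</property>
    <property name="lower">0</property>
    <property name="upper">100</property>
  </object>
</interface>
"""

ui = lxml.etree.fromstring(GLADE_SNIPPET)
node = ui.xpath('/interface/object[@class="GtkAdjustment"]')[0]
print(node.get('id'), node.xpath('property[@name="value"]')[0].text)  # volume_adjustment 50
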
def copy_profile_photo_to_static(etree):
    base_url = etree.xpath('//base/@url')[0]
    profileUrl = find_profile_photo_filename(etree)
    if profileUrl != '':
        proj_dir = sys.path[0]  # e.g., "/home/ecegridfs/a/ee364z15/hpo"
        static_dir = os.path.join(proj_dir, "static")  # e.g., "/home/ecegridfs/a/ee364z15/hpo/static"
        newProfileUrl = profileUrl
        # Protocol-relative URLs start with "//"
        if profileUrl[:2] == '//':
            newProfileUrl = 'http:' + profileUrl
        elif profileUrl[:4] != 'http':
            newProfileUrl = base_url + profileUrl
        headers = {
            'User-Agent': 'PurdueUniversityClassProject/1.0 ({}@purdue.edu https://goo.gl/dk8u5S)'
                          .format(USERNAME)
        }
        app.logger.info('request for {}'.format(newProfileUrl))
        request = urllib.request.Request(url=newProfileUrl, headers=headers)
        resp = urllib.request.urlopen(request)
        filename = make_filename(newProfileUrl, 'jpg')
        with open(os.path.join(static_dir, filename), "wb") as f:
            f.write(resp.read())
        return profileUrl, filename
    return '', ''

def get_attributes_of_xpaths(etree, x_paths, namespaces, attr='id'):
    """ Determine the values of the attributes of the objects that match each XPath

    Args:
        etree (:obj:`lxml.etree._ElementTree`): element tree for XML document
        x_paths (:obj:`list` of :obj:`str`): XPaths
        namespaces (:obj:`dict`): dictionary that maps the prefixes of namespaces to their URIs
        attr (:obj:`str` or :obj:`dict`, optional): attribute to get values of

    Returns:
        :obj:`dict` of :obj:`str` to :obj:`list` of :obj:`str`: dictionary that maps each XPath to the
            values of the attribute of the objects in the XML file that match the XPath
    """
    # resolve a namespaced attribute into Clark notation ({uri}name)
    if isinstance(attr, dict):
        attr = '{{{}}}{}'.format(attr['namespace']['uri'], attr['name'])

    # determine the values of the attributes of the objects that match each XPath
    x_path_attrs = {}
    for x_path in x_paths:
        try:
            objects = etree.xpath(x_path, namespaces=get_namespaces_with_prefixes(namespaces))
            x_path_attrs[x_path] = [obj.attrib.get(attr, None) for obj in objects]
        except Exception:
            x_path_attrs[x_path] = []

    return x_path_attrs

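# Self-contained sketch (document and names invented) of the attr-resolution step above:
# lxml stores namespaced attributes under Clark notation, "{uri}localname", which is exactly
# what the '{{{}}}{}'.format(...) call builds before the attrib.get() lookup.
import lxml.etree

doc = lxml.etree.fromstring(
    '<model xmlns:meta="https://example.org/meta"><species meta:id="s1"/></model>')
attr = {'namespace': {'uri': 'https://example.org/meta'}, 'name': 'id'}
clark_name = '{{{}}}{}'.format(attr['namespace']['uri'], attr['name'])
print(doc.xpath('//species')[0].attrib.get(clark_name))  # -> s1
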
def name(url):
    headers = config.headers_detail
    content, status_code = Send_Request().send_request(url, headers)
    info = {}
    if status_code == 200:
        flag = 1
        result = etree.HTML(content, parser=etree.HTMLParser(encoding='utf-8'))
        dl = result.xpath("//div[@class= 'viewBox']/dl")[0]
        datallist = etree.tostring(dl).split(
            '<dd style="border-bottom:1px solid #AE0000;padding-bottom:10px;">')
        datallist.remove(datallist[-1])
        for i, single in enumerate(datallist):
            # parse each <dd> fragment on its own before extracting fields
            single = etree.HTML(single, parser=etree.HTMLParser(encoding='utf-8'))
            string = u"股东"      # shareholder
            name = deal_dd_content(string, single)
            string = u"变更前"    # before the change
            percent_pre = deal_dd_content(string, single)
            string = u"变更后"    # after the change
            percent_after = deal_dd_content(string, single)
            string = u"变更日期"  # date of the change
            dates = deal_dd_content(string, single)
            info[i] = [name, percent_pre, percent_after, dates]
    else:
        flag = 100000004
    if flag == 1:
        deal_html_code.remove_repeat(info)
    return info, flag

def get_detail_info(self, detail_url, info):
    field_map = {
        u"种类": "cates",    # category
        u"范围": "ranges",   # scope
        u"期限": "period",   # term
        u"备注": "remark",   # notes
    }
    headers = config.headers
    result, status_code = Send_Request().send_requests(detail_url, headers)
    if status_code == 200:
        data = etree.HTML(result, parser=etree.HTMLParser(encoding='utf-8'))
        string = u"被担保债权概况信息"  # summary of the secured claim
        table = data.xpath("//*[contains(.,'%s')]" % string)[0]
        for key, value in field_map.iteritems():
            info[value] = deal_html_code.get_match_info(key, table)
        string = u"抵押权人概况信息"    # summary of the mortgagee
        person_info = data.xpath("//*[contains(.,'%s')]" % string)[0]
        string = u"抵押权物概况信息"    # summary of the mortgaged property
        goods_info = data.xpath("//*[contains(.,'%s')]" % string)[0]
    else:
        info["cates"] = ''
        info["ranges"] = ''
        info["period"] = ''
        info["remark"] = ''
        person_info = {}
        goods_info = {}
    return person_info, goods_info

def __init__(self, filename):
    self.objects = []
    self.indices = {}
    self.data_class = type(self.data_class_name, (self.data_class_base, ), {})

    with open(filename, 'rb') as f:
        etree = lxml.etree.parse(f)

    for entry in etree.xpath('//%s' % self.xml_tag):
        mapped_data = {}
        for key in entry.keys():
            mapped_data[self.field_map[key]] = entry.get(key)
        entry_obj = self.data_class(entry, **mapped_data)
        self.objects.append(entry_obj)

    # Create indices
    for key in self.field_map.values():
        self.indices[key] = {}

    # Update indices
    for obj in self.objects:
        for key in self.field_map.values():
            value = getattr(obj, key, None)
            if value is None:
                continue
            self.indices[key][value] = obj

def find_elements(etree):
    elements = []
    TRs = etree.xpath('//tr')
    # set to 1 once the "domestic high-anonymity HTTPS" section header has been seen
    HighHiding = 0
    for i, tr in enumerate(TRs):
        th = tr.xpath('th')
        h2 = []
        if len(th):
            h2 = th[0].xpath('h2')
            if len(h2):
                print(h2[0].text)
        if HighHiding == 0:
            if len(h2) and h2[0].text == "国内高匿代理IP":  # "domestic high-anonymity proxy IPs"
                HighHiding = 1
                continue
            else:
                continue
        if len(h2) and h2[0].text == "国内透明代理IP":  # "domestic transparent proxy IPs": end of section
            break
        TDs = tr.xpath('td')
        if len(TDs) and TDs[5].text == "HTTPS":
            yield {'ip': TDs[1].text, 'port': TDs[2].text}

def init_response_attributes(self, etree):
    self.status = Statuses.ERRED
    try:
        self.details = etree.xpath('//soapenv:Text',
                                   namespaces={'soapenv': self.soapenv_namespace})[0].text
    except (IndexError, lxml.etree.XMLSchemaError) as e:
        raise ResponseParseError('Cannot parse error status response: %s. Response content: %s' % (
            e, lxml.etree.tostring(etree)))

def get_topics(spider, id, topic_f, topic_log_f):
    '''
    spider is a requests session object, id is the topic id
    topic_f is the layer*.json file handle
    topic_log_f is the xxxx-xx-xx_crawler.log file handle
    '''
    url = 'https://www.zhihu.com/topic/' + id + '/organize/entire'
    time.sleep(0.1)
    for counter1 in range(1000):
        try:
            res = spider.get(url, headers=CUR_HEADERS_BASE, timeout=CUR_TIMEOUT_QUERY)
            content = res.content
            etree = lxml.html.fromstring(content)
            people_xpath = '//div[@class="zm-topic-side-followers-info"]//strong/text()'
            des_xpath = '//div[@id="zh-topic-desc"]'
            name_xpath = '//h1/text()'
            etree_id = etree.xpath(people_xpath)
            etree_des = etree.xpath(des_xpath)
            etree_name = etree.xpath(name_xpath)
            topic = dict()
            topic['id'] = id
            topic['followers'] = etree_id[0] if len(etree_id) > 0 else 0
            topic['des'] = lxml.html.tostring(
                etree_des[0], encoding='utf8') if len(etree_des) > 0 else 'NULL'
            topic['name'] = etree_name[0]
            with open(_XSRF_GLOBAL_FIELNAME) as f:
                _xsrf = f.read()
            data_xsrf = {'_xsrf': _xsrf}
        except BaseException as e:
            if str(e) != 'None':
                logging.debug('Fail to fetch the page. Error: {0}.'.format(e))
            logging.debug(url + ' wait for a sec to recrawl')
            time.sleep(1)
        else:
            if counter1 > 1:
                logging.info(url + ' html data get!')
            break

def query_etree(etree):
    # Returns a list of top level elements from XML etree
    return etree.xpath(
        "//informationTable:infoTable",
        namespaces={
            "informationTable": "http://www.sec.gov/edgar/document/thirteenf/informationtable"
        })

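# Hedged usage sketch for query_etree(): a minimal 13F information-table document with
# invented values. Only the namespace URI is taken from the function above; the elements
# inside <infoTable> are illustrative.
import lxml.etree

SAMPLE_13F = b"""
<informationTable xmlns="http://www.sec.gov/edgar/document/thirteenf/informationtable">
  <infoTable>
    <nameOfIssuer>EXAMPLE CORP</nameOfIssuer>
    <value>1234</value>
  </infoTable>
</informationTable>
"""

print(len(query_etree(lxml.etree.fromstring(SAMPLE_13F))))  # -> 1
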
def extract(self) -> List[str]:
    texts = []
    for etree in self.etrees:
        texts.extend([
            t.text for t in etree.xpath('//a:t', namespaces=self.namespaces)
            if t.text.strip() != ''
        ])
    return texts

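# Hedged note on the '//a:t' query above: in PowerPoint slide XML the text runs live in
# <a:t> elements from the DrawingML namespace, so self.namespaces presumably contains a
# mapping like the one below. The slide fragment here is invented for the sketch.
import lxml.etree

DRAWINGML_NS = {'a': 'http://schemas.openxmlformats.org/drawingml/2006/main'}
slide = lxml.etree.fromstring(
    b'<p:sp xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"'
    b'      xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main">'
    b'<p:txBody><a:p><a:r><a:t>Hello slides</a:t></a:r></a:p></p:txBody></p:sp>')
print([t.text for t in slide.xpath('//a:t', namespaces=DRAWINGML_NS)])  # ['Hello slides']
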
def test_add_mdreference(testpath):
    """Test add_reference function. Calls the function twice and writes the
    md-references file.
    """
    md_creator = utils.MdCreator(testpath)

    md_creator.add_reference('abcd1234', 'path/to/file1')
    md_creator.add_reference('abcd1234', 'path/to/file2')
    md_creator.write_references()

    # Read created file. Reference should be found for both files
    etree = lxml.etree.parse(os.path.join(testpath, 'md-references.xml'))
    reference = etree.xpath('/mdReferences/mdReference[@file="path/to/file1"]')
    assert reference[0].text == 'abcd1234'
    reference = etree.xpath('/mdReferences/mdReference[@file="path/to/file2"]')
    assert reference[0].text == 'abcd1234'

def _get_xpath_attribute(etree, path, attribute):
    """
    Get an attribute from a node grabbed from xpath.
    If not found, return None.
    """
    try:
        return etree.xpath(path)[0].attrib[attribute]
    except (IndexError, KeyError):
        return None

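# Minimal usage sketch for _get_xpath_attribute() (document content invented): a hit
# returns the attribute value; a missing node or missing attribute both fall through
# to None via the except clause.
import lxml.etree

doc = lxml.etree.fromstring('<root><a href="https://example.org">link</a></root>')
assert _get_xpath_attribute(doc, '//a', 'href') == 'https://example.org'
assert _get_xpath_attribute(doc, '//a', 'title') is None       # attribute missing
assert _get_xpath_attribute(doc, '//missing', 'href') is None  # node missing
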
def get(self, url, headers, encode, xpath, func, num):
    rps1 = requests.get(url, headers=headers)
    rps1.encoding = encode
    self.textData = rps1.text
    if len(xpath) > 0:
        # parse the fetched page before evaluating the XPath expression
        ex = etree.HTML(self.textData).xpath(xpath[0])
        for i in ex:
            self.nextPageUrl.append(func[0](i))
    print(f'Page {num} ok')

def swap(etree, texts: List[str]) -> List[str]:
    for t in etree.xpath('//a:t', namespaces=self.namespaces):
        if t.text.strip() == '':
            continue
        text = texts.pop()
        if text is None:
            continue
        t.text = text
    return texts

def deal_single_info(datallist, info, j):
    for i, single in enumerate(datallist, j):
        # parse each <dd> fragment on its own before extracting fields
        single = etree.HTML(single, parser=etree.HTMLParser(encoding='utf-8'))
        string = u"类型"  # type
        types = deal_dd_content(string, single)
        string = u"名称"  # name
        name = deal_dd_content(string, single)
        string = u"网址"  # website
        website = deal_dd_content(string, single)
        uuid = ''
        info[i] = [name, types, website, uuid]

def get_lines(etree):
    elems = etree.xpath("/html/body//article/div")
    # elems = [collapse(e) for e in elems[0].iterchildren()]
    elems = [e for e in elems[0].iterchildren()]
    elems = [collapse(e) for e in elems]
    txt = [e.text for e in elems]
    txt = ["" if t is None else t for t in txt]
    txt = [xlat_utf8(t) for t in txt]
    txt = [t.strip() for t in txt]
    txt = [t for t in txt if t]
    return txt

def football(url, date):
    html = lxml.html.parse(url)
    rows_xpath = xpath(
        "//*[@id='content-primary']/table[1]/tbody/tr[td[1]/span/span//text()='%s']" % (date))
    time_xpath = xpath("td[1]/span/span//text()[2]")
    team_xpath = xpath("td[2]/a/text()")
    result_xpath = xpath("td[3]/a/span/text()")
    place_xpath = xpath("td[4]/a/text()")
    details = []
    for row in rows_xpath(html):
        time = time_xpath(row)[0].strip()
        team = team_xpath(row)[0]
        score = result_xpath(row)[0]
        venue = place_xpath(row)[0]
        details.append([time, team, score, venue])
    return details

def mmio_regions(etree):
    ret = []
    resources = etree.xpath("//resources/mmio")
    for res in resources:
        base = res.get("min")
        top = res.get("max")
        dev = res.getparent().getparent()
        obj = dev.get("object")
        ret.append((obj, int(base, base=16), int(top, base=16)))
    return sorted(ret, key=lambda x: (x[1], x[2]))

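# Hedged sketch of the XML shape mmio_regions() appears to expect, inferred from the two
# getparent() calls: each <mmio> lives under <resources>, and the grandparent element
# carries the "object" attribute. The surrounding tag names and addresses are invented.
import lxml.etree

SAMPLE = b"""
<devices>
  <device object="uart0">
    <resources>
      <mmio min="0xfe201000" max="0xfe201fff"/>
    </resources>
  </device>
</devices>
"""

tree = lxml.etree.fromstring(SAMPLE)
print(mmio_regions(tree))  # one (object, base, top) tuple for uart0, addresses in decimal
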
def name(url):
    headers = config.headers_detail
    content, status_code = Send_Request().send_request(url, headers)
    info = {}
    if status_code == 200:
        flag = 1
        result = etree.HTML(content, parser=etree.HTMLParser(encoding='utf-8'))
        dl = result.xpath("//div[@class = 'viewBox']//dl")[0]
        if "企业名称" in content:  # "company name"
            datallist = etree.tostring(dl).split(
                '<dd style="border-bottom:1px solid #AE0000;padding-bottom:10px;">')
            datallist.remove(datallist[-1])
            pattern = re.compile(u".*共(.*?)页.*")  # "... N pages in total ..."
            number = re.findall(pattern, content)
            if len(number) == 1:
                totalpage = int(number[0])
            else:
                totalpage = 0
            if int(totalpage) == 1:
                j = 0
                deal_single_info(datallist, info, j)
            else:
                j = 0
                deal_single_info(datallist, info, j)
                entid = deal_html_code.match_entid(url)
                cid = deal_html_code.match_cid(url)
                href = out_invest_url.format(entid, cid)
                for k in xrange(2, totalpage + 1):
                    content, status_code = Send_Request().send_request(href)
                    if status_code == 200:
                        start = k * 5 + 1
                        result = etree.HTML(
                            content, parser=etree.HTMLParser(encoding='utf-8'))
                        dl = result.xpath("//div[@class='viewBox']/dl")[0]
                        datalist = etree.tostring(dl).split(
                            '<dd style="border-bottom:1px solid #AE0000;padding-bottom:10px;">')
                        if len(datalist) > 0:
                            datalist.remove(datalist[-1])
                        deal_single_info(datalist, info, start)
                    else:
                        pass
        else:
            flag = 100000004
    else:
        flag = 100000004
    if flag == 1:
        info = deal_html_code.remove_repeat(info)
    return info, flag

def parsesys(self, etree, syspath):
    """Give blocks and lines objects lists from Block and Line elements in XML

    args : etree of the xml file / path of the <System> tag in the xml
    return : blocks dict and lines list"""
    blocks = {}
    lines = []
    blockclass = {
        'Gain': Gain,
        'Delay': Delay,
        'Sum': Sum,
        'SubSystem': SubSystem,
        'Inport': Inport,
        'Outport': Outport
    }
    for mysys in etree.xpath(syspath):
        # iter() instead of findall(): recursive search
        blockels = mysys.iter("Block")
        lineels = mysys.iter("Line")

        # Fill blocks dict
        for b in blockels:
            blocktype = b.get("BlockType")
            try:
                if blocktype == 'Gain':
                    newblock = blockclass[blocktype](b, self.constants)
                else:
                    newblock = blockclass[blocktype](b)
            except KeyError:
                print(blocktype + " is an unsupported block")
                sys.exit()
            blocks[b.get("SID")] = newblock

        # Fill lines list
        for l in lineels:
            # find all children 'P' tags; there must be only one 'Src'
            for p in l.findall("P"):
                if p.get("Name") == "Src":
                    srctext = p.text
            # find all 'P' tags recursively; there can be more than one 'Dst'
            for p in l.iter("P"):
                if p.get("Name") == "Dst":
                    dsttext = p.text
            newline = Line(srctext, dsttext)
            lines.append(newline)
    return (blocks, lines)

def get_words_only(word_tree):
    """Flatten the entire page into just an array of words, discarding line information."""
    word_array = []
    # Select every OCR word span on the page. The original query filtered on a line id that
    # is not available here, so the per-line predicate is dropped (assumed intent).
    xpath_query = ("//body/div[@class='ocr_page']/div[@class='ocr_carea']/p[@class='ocr_par']"
                   "/span[@class='ocr_line']/span[starts-with(@class, 'ocr')]")
    # Older variants matched span[@class='ocr_word'] or span[@class='ocrx_word'] explicitly.
    hocr_words = word_tree.xpath(xpath_query)
    for word in hocr_words:
        # The text may be contained in this span, but it may also be contained in a child element.
        word_array.append({
            'bbox': get_bbox_from_title(word.attrib['title']),
            'text': tostring(word, method="text", encoding='UTF-8'),
            'word_num': word.attrib['id'].replace("word_", "")
        })
    return word_array

def get_dl():
    from lxml import etree
    url = 'https://www.kxjf.com/user/login?mainSiteName=kxd'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36',
        'Host': 'www.kxjf.com',
        'Referer': 'https://www.gkkxd.com/userAuth/login',
    }
    response = requests.get(url, headers=headers)
    tree = etree.HTML(response.text)
    dlmy = tree.xpath('//*[@id="dlmy"]/@value')[0]
    return dlmy

def topic2question(spider, url):
    topic_id = re.search('[0-9]+', url).group()
    rs_f = file(os.path.join(DATA_DIR, topic_id + '.txt'), 'w')
    cur_page = 0
    next_url = url
    for tmp in range(51):
        time.sleep(TIME_SLEEP)
        content = spider_get(spider, next_url, retry=RETRY)
        etree = lxml.html.fromstring(content)
        link_xpath = '//a[@class="question_link"]/@href'
        link = etree.xpath(link_xpath)
        link2 = set(link)
        for i in link2:
            rs_f.write("http://zhihu.com" + i + '\n')
        # next page
        next_page_link_xpath = '//div[@class="zm-invite-pager"]/span/a/@href'
        next_page_link = etree.xpath(next_page_link_xpath)
        if len(next_page_link) == 0:
            with open(
                    os.path.join(ERROR_DIR, topic_id + '_' + str(cur_page) + ".txt"),
                    'w') as f:
                f.write(content)
            break
        last_page = next_page_link[-1]
        tmp = re.search('[0-9]+', last_page)
        if tmp:
            last_page = tmp.group()
        else:
            break
        print topic_id, cur_page, last_page
        if cur_page < int(last_page):
            cur_page = int(last_page)
            next_url = url + '?page=' + last_page
        else:
            break

</div>
<div class="row">
    <div class="col-lg-8 col-lg-offset-2">
        <!-- To configure the contact form email address, go to mail/contact_me.php and update the email address in the PHP file on line 19. -->
        <!-- The form should work on most web servers, but if the form is not working you may need to configure your web server differently. -->
        <form METHOD="post" ACTION="resultaat.py">
            <div class="row control-group">
                <div class="form-group col-xs-12 floating-label-form-group controls">
                    <label>Welke Agent analyseren</label><BR>
                    <BR>
                    <SELECT class="form-control" NAME="ip">
                        <OPTION value=""> --- Welke Agent analyseren --- </OPTION>
''')

# Create the dropdown options from the client list in dropdown.xml
# (Dutch UI labels: "Welke Agent analyseren" = "Which agent to analyse")
etree = etree.parse('dropdown.xml')
aantal = int(etree.xpath('/clients/aantal[1]/text()')[0])
for x in range(1, aantal + 1):
    location = '/clients/ipadres[{}]/text()'.format(x)
    ipaddress = etree.xpath(location)[0]
    print('<OPTION value="' + ipaddress + '">' + ipaddress + '</OPTION>')

print('''
                        <OPTION value="Anders">Anders</OPTION>
                    </SELECT>
                </div>
            </div>
            <div class="row control-group">
                <div class="form-group col-xs-12 floating-label-form-group controls">
                    <label>IP Adres</label>
                    <input class="col-xs-12" type="text" Name="invoer_ip" placeholder="Indien anders vul hier het IP Adres in.">

def do_xpath(xpath, etree):
    """Process an uncompiled XPath expression on an Element Tree"""
    try:
        return etree.xpath(xpath)
    except (lxml.etree.XPathSyntaxError, lxml.etree.XPathEvalError):
        raise InvalidXPathExpression(sys.exc_info()[:2], value=xpath)

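# Minimal usage sketch for do_xpath() (document invented): a valid expression returns lxml's
# normal xpath() result, while a malformed one surfaces as the project's InvalidXPathExpression
# instead of a raw lxml error.
import lxml.etree

doc = lxml.etree.fromstring('<root><a/><a/></root>')
print(len(do_xpath('//a', doc)))   # -> 2
try:
    do_xpath('//a[', doc)          # malformed expression
except InvalidXPathExpression:
    print('bad xpath rejected')
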