def get_charges(stree: etree) -> List[Charge]:
    """ Find a list of the charges in a parsed docket. """
    # find the charges in the Charges section
    charges = stree.xpath("//section[@name='section_charges']//charge")
    # charges is temporarily a list of tuples of [(sequence_num, Charge)]
    charges = [
        (
            xpath_or_blank(charge, "./seq_num"),
            Charge(
                offense=xpath_or_blank(charge, "./statute_description"),
                grade=xpath_or_blank(charge, "./grade"),
                statute=xpath_or_blank(charge, "./statute"),
                disposition="Unknown",
                disposition_date=None,
                sentences=[],
            ),
        )
        for charge in charges
    ]
    # figure out the disposition dates by looking for a final disposition date
    # that matches a charge.
    final_disposition_events = stree.xpath(
        "//section[@name='section_disposition_sentencing']//case_event[case_event_desc_and_date/is_final[contains(text(),'Final Disposition')]]"
    )
    for final_disp_event in final_disposition_events:
        final_disp_date = xpath_date_or_blank(final_disp_event, ".//case_event_date")
        applies_to_sequences = xpath_or_empty_list(final_disp_event, ".//sequence_number")
        for seq_num in applies_to_sequences:
            # set the final_disp date for the charge with sequence number seq_num
            for sn, charge in charges:
                if sn == seq_num:
                    charge.disposition_date = final_disp_date
    # Figure out the disposition of each charge from the disposition section.
    # Do this by finding the last sequence in the disposition section for
    # the sequence with seq_num. The disposition of the charge is that
    # sequence's disposition. The sentence is in that xml element too.
    try:
        disposition_section = stree.xpath(
            "//section[@name='section_disposition_sentencing']")[0]
        for seq_num, charge in charges:
            try:
                # seq is the last sequence for the charge seq_num.
                seq = disposition_section.xpath(
                    f"./disposition_section/disposition_subsection/disposition_details/case_event/sequences/sequence[sequence_number/text()=' {seq_num} ']"
                )[-1]
                charge.disposition = xpath_or_blank(seq, "./offense_disposition")
                charge.sentences = get_sentences(seq)
            except IndexError:
                continue
    except IndexError:
        pass
    return [charge for _, charge in charges]
def _parse_html(self, html: etree) -> None:
    """Parse the html code returned from the server."""
    self.name = html.xpath('//div[@class="mytitle h4"]')[0].text
    self.iban = html.xpath('//div[@class="mysubtitle h4"]')[0].text
    panel = html.xpath('//div[@class="myPanelData"]')[0]
    # use a relative xpath so the search is scoped to the panel element
    self.currency = panel.xpath('.//span[@class="mycurr"]')[0].text
    self.balance = self._parse_float(
        panel.xpath(AMOUNT_SEARCH_PATH.format(1))[0].text)
    self.interest_sum = self._parse_float(
        panel.xpath(AMOUNT_SEARCH_PATH.format(2))[0].text)
    self.interest_rate = self._parse_float(
        panel.xpath(AMOUNT_SEARCH_PATH.format(3))[0].text)
    self._notify_listeners()
def get_all_contents(dom: etree, xpaths: list, alt_to_p: bool = False) -> Tuple[str, str]:
    """Get content of all xpaths provided.

    Args:
        dom (etree): dom from which to get the content
        xpaths (list): list of xpath expressions used to extract content from the dom object
        alt_to_p (bool, optional): If true, when an alt is found, a new <p>
            element is added with the alt content (useful for readability).
            Defaults to False.

    Returns:
        Tuple[str, str]: the concatenated content, and the alt texts found
    """
    content: str = ""
    alts: str = ""
    for xpath in xpaths:
        results = dom.xpath(xpath)
        for result in results:
            # keep <p> elements as-is; wrap anything else in a <p> tag
            enclosing: str = "%s"
            if result.tag != "p":
                enclosing = "<p>%s</p>"
            alts = _get_alts(alt_to_p, result)
            content += enclosing % to_string(result)
    return content, alts
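# Usage sketch for get_all_contents (hypothetical markup; assumes lxml.etree is
# available as etree and that to_string serializes an element back to markup):
#
#   dom = etree.fromstring("<html><body><p>hi</p><span>there</span></body></html>")
#   content, alts = get_all_contents(dom, ["//p", "//span"])
#   # content == "<p>hi</p><p><span>there</span></p>": the non-<p> result is
#   # wrapped in a <p> tag, per the `enclosing` templates above.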
def add_ids_aux(element: etree, ids: defaultdict, parent_id: str = "") -> defaultdict:
    """Add ids to xml element

    Args:
        element (etree): Element to add ids to
        ids (defaultdict): counters for ids assigned so far, by tag type
        parent_id (str): Optional; id of parent element, by default ''

    Returns:
        defaultdict: ids, with new counts added by tag type
    """
    if element.tag is etree.Comment:
        return ids
    tag = etree.QName(element.tag).localname
    if tag in TAGS_TO_IGNORE:
        return ids
    if is_do_not_align(element):
        if tag == "w":
            raise RuntimeError(
                'Found <w> element with do-not-align="true" attribute. '
                "This is not allowed, please verify your XML input.")
        if element.xpath(".//w"):
            raise RuntimeError(
                'Found <w> nested inside a do-not-align="true" element. '
                "This is not allowed, please verify your XML input.")
        return ids
    if "id" not in element.attrib:
        if tag in TAG_TO_ID:
            id = TAG_TO_ID[tag]
        elif tag == "seg" and element.attrib.get("type") == "syll":
            id = "y"
        elif tag == "seg" and element.attrib.get("type") in [
            "morph",
            "morpheme",
            "base",
            "root",
            "prefix",
            "suffix",
        ]:
            id = "m"
        else:
            id = tag
        if id not in ids:
            ids[id] = 0
        element.attrib["id"] = parent_id + id + str(ids[id])
        ids[id] += 1
    full_id = element.attrib["id"]
    # This deep copy of ids means that the ids counters are shared recursively
    # between siblings, but not between grand-children. Thus, if processing a p
    # element, the next p element will see its counter incremented, but the s
    # elements of the next p element will start again at 0. ids always has the
    # counters of all ancestors and their siblings, by tag, but not the
    # descendants of siblings of ancestors.
    new_ids = deepcopy(ids)
    for child in element:
        new_ids = add_ids_aux(child, new_ids, full_id)
    return ids
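# Illustrative sketch of add_ids_aux (hypothetical input; assumes lxml.etree as
# etree, defaultdict from collections, and that "p" and "s" either map to those
# same letters in TAG_TO_ID or fall through to the tag name itself):
#
#   root = etree.fromstring("<body><p><s/><s/></p><p><s/></p></body>")
#   ids = defaultdict(int)
#   for child in root:
#       ids = add_ids_aux(child, ids)
#   # yields ids p0, p0s0, p0s1, p1, p1s0: sibling <p> elements share one "p"
#   # counter, while each <p> restarts the "s" counter for its own children,
#   # because the recursion hands children a deep copy of the counters.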
def _find_ebay_products_info(tree: etree) -> Optional[list]:
    """ Find necessary eBay products info in html elements """
    products = tree.xpath('//li[@class="s-item "]')
    if not products:
        logger.warning('Empty eBay products list before finding info')
        return None
    ebay_ids = []
    for product in products:
        links = product.xpath('.//a[@class="s-item__link"]')
        if not links:
            continue
        href = links[0].get('href')
        if href is None:
            continue
        # the eBay id is the 12-digit number in the product url
        match = search(r'/\d{12}\?', href)
        if match is None:
            continue
        ebay_id = match.group()[1:-1]
        if len(ebay_id) != constants.ebay_id_length or ebay_id in ebay_ids:
            continue
        ebay_ids.append(ebay_id)
    if ebay_ids:
        return ebay_ids
    return None
def _find_products_info(self, tree: etree) -> None:
    """ Find necessary products info in html elements """
    products = tree.xpath('//div[@data-asin]')
    if not products:
        logger.warning('Empty products list before finding info')
        return
    for product in products:
        asin = product.get('data-asin')
        if asin is None or len(asin) != constants.asin_length:
            continue
        images = product.xpath('.//img')
        if not images:
            continue
        title = images[0].get('alt')
        if not title:
            continue
        # normalize the title: lowercase, strip non-alphanumerics,
        # collapse whitespace, and drop stopwords
        title = sub(r'[^0-9a-z ]', '', title.lower())
        title = sub(r' {2,}', ' ', ' ' + title + ' ')
        title = sub(r' ({0}) '.format('|'.join(constants.stopwords)), ' ', title)
        title = sub(r'^ | $', '', title)
        words = title.split()
        if len(words) > constants.title_max_words:
            words = words[:constants.title_n_words]
        self._products[asin] = {'title': ' '.join(words)}
def get_person(stree: etree) -> Person:
    """
    Extract a Person from the xml of a docket, parsed into sections.

    Returns a Person with blank names on errors.

    Args:
        stree: xml tree of a docket, parsed into a header and some number of sections

    Returns:
        a Person object
    """
    try:
        name = stree.xpath(
            "docket/header/caption/defendant_line")[0].text.strip()
        first_name, last_name = split_first_name(name)
    except IndexError:
        first_name = ""
        last_name = ""
    aliases = xpath_or_empty_list(stree, "//alias")
    date_of_birth = xpath_date_or_blank(stree, "//birth_date")
    return Person(first_name=first_name,
                  last_name=last_name,
                  date_of_birth=date_of_birth,
                  aliases=aliases)
def xpath_date_or_blank(tree: etree, xpath: str) -> Optional[date]:
    """ Given an etree and an xpath expression, return the value of the
    expression as a date, or None. """
    try:
        return datetime.strptime(
            tree.xpath(xpath)[0].text.strip(), r"%m/%d/%Y").date()
    except (IndexError, ValueError, AttributeError):
        # IndexError: nothing matched; AttributeError: the matched element has
        # no text; ValueError: the text is not an mm/dd/yyyy date
        return None
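# Usage sketch for xpath_date_or_blank (hypothetical data; assumes lxml.etree
# as etree and the datetime/date imports used above):
#
#   doc = etree.fromstring("<docket><birth_date> 01/15/1990 </birth_date></docket>")
#   xpath_date_or_blank(doc, "//birth_date")   # -> date(1990, 1, 15)
#   xpath_date_or_blank(doc, "//no_such_tag")  # -> None (IndexError is caught)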
def _parse_list(self, tree: etree) -> Optional[str]:
    nomore: list = tree.xpath('//div[@class="not-found"]')
    if nomore:
        return 'nomore'
    try:
        products: list = tree.xpath(
            '//div[@id="shop-all-list"]//div[@class="svr-info"]//a[@data-click-name="shop_info_gooddeal_click"]/@title'
        )
        product_links: list = tree.xpath(
            '//div[@id="shop-all-list"]//div[@class="svr-info"]//a[@data-click-name="shop_info_gooddeal_click"]/@href'
        )
        for product, link in zip(products, product_links):
            # collect links whose title mentions the target brand ("伊婉")
            if '伊婉' in product:
                self._ids.add(link)
        return 'hasmore'
    except Exception:
        return None
def _get_params(self, html: etree) -> str:
    """ Determine the product's parameters """
    params = ""
    span_params = html.xpath(self.locators['txbParams'])
    for span_param in span_params:
        params += "|" + span_param.text.strip()
    return params
def xpath_or_blank(stree: etree, xpath: str) -> str:
    """ Given an etree and an xpath expression, return the value of the
    expression, or an empty string. A helper method. """
    try:
        return stree.xpath(xpath)[0].text.strip()
    except IndexError:
        return ""
def get_img_url(self, node: etree) -> str:
    """get img url from enclosure or media:content tag if any

    Arguments:
        node {etree} -- item node of rss feed

    Returns:
        str -- the url of the image found in enclosure or media:content tag
    """
    img_url = ""
    enclosures = node.xpath(".//enclosure")
    # media:content tag
    medias = node.xpath(".//*[local-name()='content'][@url]")
    if len(enclosures) > 0:
        img_url = enclosures[0].get('url')
    elif len(medias) > 0:
        img_url = medias[0].get('url')
    return img_url
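# Usage sketch for get_img_url (hypothetical RSS item; `feed` stands in for an
# instance of the class above; the enclosure wins because it is checked first):
#
#   item = etree.fromstring(
#       '<item><enclosure url="http://example.com/a.jpg"/>'
#       '<content xmlns="http://search.yahoo.com/mrss/" url="http://example.com/b.jpg"/></item>'
#   )
#   feed.get_img_url(item)  # -> "http://example.com/a.jpg"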
def get_first_node(dom: etree, xpaths: list):
    """get first node found in the list of xpath expressions"""
    node: Optional[etree._Element] = None
    for xpath in xpaths:
        results = dom.xpath(xpath)
        if len(results) > 0:
            node = results[0]
            break
    return node
def _replace_urls_process_links(dom: etree, attribute: str):
    # note: relies on `self` from the enclosing scope (closure over the handler)
    for o in dom.xpath("//*[@%s]" % attribute):
        if o.attrib[attribute].startswith("//"):
            # protocol-relative url: prepend the original site's scheme
            protocol: str = "http:"
            if self.handler.get_original_website().find("https") > -1:
                protocol = "https:"
            o.attrib[attribute] = protocol + o.attrib[attribute]
        elif o.attrib[attribute].startswith("/"):
            # root-relative url: prepend the original website
            o.attrib[attribute] = (
                self.handler.get_original_website() + o.attrib[attribute][1:])
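# Sketch of the rewrites performed by _replace_urls_process_links (hypothetical
# site; assumes self.handler.get_original_website() returns "https://example.com/"):
#
#   //cdn.example.com/pic.png  ->  https://cdn.example.com/pic.png
#   /static/app.css            ->  https://example.com/static/app.css
#
# Absolute urls (with a scheme) and relative urls without a leading slash are
# left untouched.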
def _parse_item(self, tree: etree, link: str):
    try:
        item = {}
        item['product'] = tree.xpath(
            '//p[@class="product-name bold"]/text()')[0]
        item['link'] = link
        item['price'] = tree.xpath('//div[@class="price"]//text()')[-1]
        item['hospital'] = tree.xpath(
            '//div[@class="shop-item"]/p[@class="shop-name"]/text()')[0]
        item['address'] = tree.xpath(
            '//div[@class="shop-item"]/p[@class="shop-addr"]/text()'
        )[0].replace('地址:', '')  # strip the "Address:" label
        item['phone'] = tree.xpath(
            '//div[@class="shop-item"]/p[@class="shop-phone"]/text()'
        )[0].replace('电话:', '')  # strip the "Phone:" label
        print(item)
        self.content.append(item)
        return 'success'
    except Exception:
        return None
def return_word_from_id(xml: etree, el_id: str) -> str:
    """Given an XML document, return the innertext at id

    Args:
        xml (etree): XML document
        el_id (str): ID

    Returns:
        str: Innertext of element with el_id in xml
    """
    return xml.xpath('//*[@id="%s"]/text()' % el_id)[0]
def _check_location(tree: etree) -> bool:
    """ Check current session location on Amazon """
    try:
        span = tree.xpath('//span[@id="glow-ingress-line2"]')[0]
    except IndexError:
        return False
    else:
        return span.text != CURRENT_AMAZON_LOCATION
def return_word_from_id(xml: etree, el_id: str) -> str:
    """
    Given an XML document, return the innertext at id

    Parameters
    ----------
    xml : etree
        XML document
    el_id : str
        ID

    Returns
    -------
    str
        Innertext of element with el_id in xml
    """
    return xml.xpath('//*[@id="%s"]/text()' % el_id)[0]
def get_img_url(self, node: etree) -> str:
    """get img url from media:thumbnail tag if any

    Arguments:
        node {etree} -- item node of rss feed

    Returns:
        str -- the url of the image found in media:thumbnail tag
    """
    img_url = ""
    # media:thumbnail tag
    medias = node.xpath(
        ".//*[local-name()='thumbnail'][@url]", namespaces=NAMESPACES)
    if len(medias) > 0:
        img_url = medias[0].get('url')
    return img_url
def get_sentences(stree: etree) -> List[Sentence]:
    """Find the sentences in a sequence (as an xml tree) from a
    disposition section of a docket.
    """
    # use relative xpaths so each query is scoped to the sequence element
    # passed in, not the whole docket
    sequence_date = xpath_date_or_blank(stree, ".//action_date")
    sentences = stree.xpath(".//sentence_info")
    sentences = [
        Sentence(
            sentence_date=sequence_date,
            sentence_type=xpath_or_blank(s, ".//program"),
            sentence_period="...",
            sentence_length=SentenceLength(
                min_time=(
                    s.xpath(".//sentence_length/min_length/time")[0].text,
                    s.xpath(".//sentence_length/min_length/unit")[0].text),
                max_time=(
                    s.xpath(".//sentence_length/max_length/time")[0].text,
                    s.xpath(".//sentence_length/max_length/unit")[0].text),
            ))
        for s in sentences
    ]
    return sentences
def add_images(element: etree, config: dict) -> etree:
    """Add images from configuration object to xml

    Args:
        element (etree): xml without images
        config (dict): standard ReadAlong-Studio configuration

    Returns:
        etree: xml with images markup
    """
    if "images" not in config:
        raise KeyError(
            "Configuration tried to add images, but no images were found in configuration"
        )
    if not isinstance(config["images"], dict):
        raise TypeError(
            f"Image configuration is of type {type(config['images'])} but a dict is required."
        )
    pages = element.xpath('//div[@type="page"]')
    for i, url in config["images"].items():
        image_el = etree.Element("graphic", url=url)
        try:
            i = int(i)
        except ValueError as e:
            raise ValueError(
                f"Images must be indexed using integers, you provided {i}"
            ) from e
        try:
            pages[i].append(image_el)
        except IndexError as e:
            raise IndexError(
                f"No page found at index {i}, please verify your configuration"
            ) from e
    return element
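# Example configuration for add_images (hypothetical file names; keys are page
# indices, as strings or ints, per the int() conversion above):
#
#   config = {"images": {"0": "cover.jpg", "1": "page-one.jpg"}}
#   element = add_images(element, config)
#   # appends <graphic url="cover.jpg"/> to the first <div type="page">, and
#   # <graphic url="page-one.jpg"/> to the second.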
def do_xPath_queries(tree: etree):
    lessons = tree.xpath(QUERIES.GET_LESSONS)
    print('All lessons: {}'.format(lessons))
    audience = tree.xpath(QUERIES.GET_AUDIENCE)
    print('Audience: {}'.format(audience))
    practicals = tree.xpath(QUERIES.GET_PRACTICALS)
    print('Practicals: {}'.format(practicals))
    practicals_239 = tree.xpath(QUERIES.GET_PRACTICALS_FROM_239)
    print('Practicals from 239: {}'.format(practicals_239))
    teachers = tree.xpath(QUERIES.GET_TEACHERS_FROM_239)
    print('Teachers from 239: {}'.format(teachers))
    last_lessons = tree.xpath(QUERIES.GET_LAST_LESSONS)
    print('Last lessons: {}'.format(last_lessons))
    lessons_count = int(tree.xpath(QUERIES.GET_LESSONS_COUNT))
    print('Lessons count: {}'.format(lessons_count))
def add_supplementary_xml(element: etree, config: dict) -> etree:
    """Add arbitrary xml from configuration object to xml

    Args:
        element (etree): original xml document
        config (dict): standard ReadAlong-Studio configuration

    Returns:
        etree: xml with supplementary markup
    """
    if "xml" not in config:
        raise KeyError(
            "Configuration tried to add supplementary xml, but no declarations were found in configuration"
        )
    for el in config["xml"]:
        parents = element.xpath(el["xpath"])
        if not parents:
            LOGGER.warning(
                f"No elements found at {el['xpath']}, please verify your configuration."
            )
        for parent in parents:
            parent.append(etree.XML(el["value"]))
    return element
def parse_delivery_time_response(tree: etree) -> Optional[int]:
    """ Find and parse date string in html response """
    for location in ('//strong[@class="vi-acc-del-range"]/b/text()',
                     '//strong[@class="vi-acc-del-range"]/text()',
                     '//span[@class="vi-acc-del-range"]/b/text()',
                     '//span[@class="vi-acc-del-range"]/text()'):
        date = tree.xpath(location)
        if date:
            break
    else:
        return None
    # extract an abbreviated date like "Sep. 14"
    match = search(r'[A-Z][a-z]{2}\. \d{1,2}', date[0])
    if match is None:
        return None
    date = match.group()
    # calculate number of delivery days
    current_date = datetime.now(current_timezone).date()
    delivery_date = current_date.replace(
        day=int(date[5:]), month=constants.ebay_delivery_months[date[:3]])
    if current_date > delivery_date:
        return None
    return (delivery_date - current_date).days
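# Worked example for parse_delivery_time_response (hypothetical page; assumes
# current_timezone and constants.ebay_delivery_months are defined as used
# above): if the page contains "Sep. 14" inside a vi-acc-del-range element and
# today is Sep. 10 of the same year, the function returns 4; if the extracted
# date falls before today, it returns None.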
def delete_xpaths(dom: etree, xpaths: List[str]):
    """delete nodes of the given dom matching xpath expressions"""
    for xpath in xpaths:
        delete_nodes(dom.xpath(xpath))
def get_items(self, dom: etree) -> list:
    return dom.xpath("//item")
def xpath_or_empty_list(tree: etree, xpath: str) -> List[str]:
    """ Given an etree, find a list of strings, or return an empty list. """
    return [el.text.strip() for el in tree.xpath(xpath)]
def get_descriptions(self, item: etree) -> list:
    return item.xpath(".//description")
def get_links(self, item: etree) -> list:
    return item.xpath(".//link")
def sections_from_pages(ptree: etree) -> etree:
    """
    Splice together sections in `ptree` that are separated across pages,
    and get rid of the `page` level of the `ptree` entirely.

    From

        <docket>
          <page>
            <section> </section>
          </page>
          <page>
            <section_continued> </section_continued>
          </page>
        </docket>

    To

        <docket>
          <section> </section>
        </docket>
    """
    # create an empty tree to add all the other sections onto.
    stitched_xml = etree.Element("docket")
    stitched_xml.append(ptree.xpath("//header[1]")[0])
    pages = ptree.xpath("//page")
    logging.info(f" {len(pages)} pages in this docket.")
    # Recombine a section if it carries onto the following page(s).
    combined_sections = []
    for page in pages:
        sections = page.xpath(".//section")
        for section in sections:
            # if the section last added to combined_sections is the same kind
            # of section, add the current section's text to the most recent
            # combined section's text.
            if combined_sections and section.xpath("@name")[0] == \
                    combined_sections[-1].xpath("@name")[0]:
                # here is where we remove the overflowing header lines from
                # this section, before appending it to the previous section.
                section_header_remover = create_section_header_remover(
                    section.xpath("@name")[0])
                # strip() removes empty lines at the beginning of the section,
                # which is good. But it also removes spaces at the beginning of
                # the first line with text. Some grammar pieces rely on the
                # indentation of a line to know what kind of line it is, and
                # this strip() removes that indentation.
                section_text = "\n".join(
                    [ln for ln in section.text.split("\n") if ln.strip()])
                section_text = section_header_remover(section_text)
                # now combine the previous section with this section, because
                # this section is just the overflow of the last one on a
                # different page.
                combined_sections[-1].text = "\n".join(
                    [combined_sections[-1].text.strip(), section_text])
            # else the current section is new, so add it to the end of
            # combined_sections
            else:
                combined_sections.append(section)
    for section_node in combined_sections:
        stitched_xml.append(section_node)
    last_page = pages[-1]
    if len(last_page.xpath(".//section")) == 0:
        # add the trailing <body> lines to the last section in combined_sections
        last_page_body = last_page.xpath("body")[0].text
        stitched_xml.xpath("//section[last()]")[0].text += last_page_body
    docket_tree = etree.ElementTree(stitched_xml)
    return docket_tree