def get_charges(stree: etree) -> List[Charge]:
    """ Find a list of the charges in a parsed docket. """
    # find the charges in the Charges section
    charges = stree.xpath("//section[@name='section_charges']//charge")
    # charges is temporarily a list of tuples of [(sequence_num, Charge)]
    charges = [
        (
            xpath_or_blank(charge, "./seq_num"),
            Charge(
                offense=xpath_or_blank(charge, "./statute_description"),
                grade=xpath_or_blank(charge, "./grade"),
                statute=xpath_or_blank(charge, "./statute"),
                disposition="Unknown",
                disposition_date=None,
                sentences=[],
            ),
        )
        for charge in charges
    ]
    # figure out the disposition dates by looking for a final disposition date
    # that matches a charge.
    final_disposition_events = stree.xpath(
        "//section[@name='section_disposition_sentencing']//case_event[case_event_desc_and_date/is_final[contains(text(),'Final Disposition')]]"
    )
    for final_disp_event in final_disposition_events:
        final_disp_date = xpath_date_or_blank(final_disp_event, ".//case_event_date")
        applies_to_sequences = xpath_or_empty_list(final_disp_event, ".//sequence_number")
        for seq_num in applies_to_sequences:
            # set the final_disp date for the charge with sequence number seq_num
            for sn, charge in charges:
                if sn == seq_num:
                    charge.disposition_date = final_disp_date
    # Figure out the disposition of each charge from the disposition section.
    # Do this by finding the last sequence in the disposition section for
    # the sequence with seq_num. The disposition of the charge is that
    # sequence's disposition. The sentence is in that xml element too.
    try:
        disposition_section = stree.xpath(
            "//section[@name='section_disposition_sentencing']")[0]
        for seq_num, charge in charges:
            try:
                # seq is the last sequence for the charge seq_num.
                seq = disposition_section.xpath(
                    f"./disposition_section/disposition_subsection/disposition_details/case_event/sequences/sequence[sequence_number/text()=' {seq_num} ']"
                )[-1]
                charge.disposition = xpath_or_blank(seq, "./offense_disposition")
                charge.sentences = get_sentences(seq)
            except IndexError:
                continue
    except IndexError:
        pass
    return [charge for _, charge in charges]
def _parse_html(self, html: etree) -> None:
    """Parse the html code returned from the server."""
    self.name = html.xpath('//div[@class="mytitle h4"]')[0].text
    self.iban = html.xpath('//div[@class="mysubtitle h4"]')[0].text
    panel = html.xpath('//div[@class="myPanelData"]')[0]
    # use a relative xpath so the search is scoped to the panel element
    self.currency = panel.xpath('.//span[@class="mycurr"]')[0].text
    self.balance = self._parse_float(
        panel.xpath(AMOUNT_SEARCH_PATH.format(1))[0].text)
    self.interest_sum = self._parse_float(
        panel.xpath(AMOUNT_SEARCH_PATH.format(2))[0].text)
    self.interest_rate = self._parse_float(
        panel.xpath(AMOUNT_SEARCH_PATH.format(3))[0].text)
    self._notify_listeners()
def get_all_contents(dom: etree, xpaths: list, alt_to_p: bool = False) -> Tuple[str, str]:
    """Get content of all xpaths provided.

    Args:
        dom (etree): dom from which to get the content
        xpaths (list): list of xpath expressions used to extract content from the dom object
        alt_to_p (bool, optional): If true, when an alt is found, a new <p>
            element is added with the alt content (useful for readability).
            Defaults to False.

    Returns:
        Tuple[str, str]: the concatenated content, and the alt texts found
    """
    content: str = ""
    alts: str = ""
    for xpath in xpaths:
        results = dom.xpath(xpath)
        for result in results:
            # keep <p> elements as-is; wrap anything else in a <p> tag
            enclosing: str = "%s"
            if result.tag != "p":
                enclosing = "<p>%s</p>"
            alts = _get_alts(alt_to_p, result)
            content += enclosing % to_string(result)
    return content, alts
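# Usage sketch for get_all_contents (hypothetical markup; assumes lxml.etree is
# available as etree and that to_string serializes an element back to markup):
#
#   dom = etree.fromstring("<html><body><p>hi</p><span>there</span></body></html>")
#   content, alts = get_all_contents(dom, ["//p", "//span"])
#   # content == "<p>hi</p><p><span>there</span></p>": the non-<p> result is
#   # wrapped in a <p> tag, per the `enclosing` templates above.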
def add_ids_aux(element: etree, ids: defaultdict, parent_id: str = "") -> defaultdict:
    """Add ids to xml element

    Args:
        element (etree): Element to add ids to
        ids (defaultdict): counters for ids assigned so far, by tag type
        parent_id (str): Optional; id of parent element, by default ''

    Returns:
        defaultdict: ids, with new counts added by tag type
    """
    if element.tag is etree.Comment:
        return ids
    tag = etree.QName(element.tag).localname
    if tag in TAGS_TO_IGNORE:
        return ids
    if is_do_not_align(element):
        if tag == "w":
            raise RuntimeError(
                'Found <w> element with do-not-align="true" attribute. '
                "This is not allowed, please verify your XML input.")
        if element.xpath(".//w"):
            raise RuntimeError(
                'Found <w> nested inside a do-not-align="true" element. '
                "This is not allowed, please verify your XML input.")
        return ids
    if "id" not in element.attrib:
        if tag in TAG_TO_ID:
            id = TAG_TO_ID[tag]
        elif tag == "seg" and element.attrib.get("type") == "syll":
            id = "y"
        elif tag == "seg" and element.attrib.get("type") in [
            "morph",
            "morpheme",
            "base",
            "root",
            "prefix",
            "suffix",
        ]:
            id = "m"
        else:
            id = tag
        if id not in ids:
            ids[id] = 0
        element.attrib["id"] = parent_id + id + str(ids[id])
        ids[id] += 1
    full_id = element.attrib["id"]
    # This deep copy of ids means that the ids counters are shared recursively
    # between siblings, but not between grand-children. Thus, if processing a p
    # element, the next p element will see its counter incremented, but the s
    # elements of the next p element will start again at 0. ids always has the
    # counters of all ancestors and their siblings, by tag, but not the
    # descendants of siblings of ancestors.
    new_ids = deepcopy(ids)
    for child in element:
        new_ids = add_ids_aux(child, new_ids, full_id)
    return ids
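# Illustrative sketch of add_ids_aux (hypothetical input; assumes lxml.etree as
# etree, defaultdict from collections, and that "p" and "s" either map to those
# same letters in TAG_TO_ID or fall through to the tag name itself):
#
#   root = etree.fromstring("<body><p><s/><s/></p><p><s/></p></body>")
#   ids = defaultdict(int)
#   for child in root:
#       ids = add_ids_aux(child, ids)
#   # yields ids p0, p0s0, p0s1, p1, p1s0: sibling <p> elements share one "p"
#   # counter, while each <p> restarts the "s" counter for its own children,
#   # because the recursion hands children a deep copy of the counters.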
def _find_ebay_products_info(tree: etree) -> Optional[list]:
    """ Find necessary eBay products info in html elements """
    products = tree.xpath('//li[@class="s-item "]')
    if not products:
        logger.warning('Empty eBay products list before finding info')
        return None
    ebay_ids = []
    for product in products:
        links = product.xpath('.//a[@class="s-item__link"]')
        if not links:
            continue
        href = links[0].get('href')
        if href is None:
            continue
        # the eBay id is the 12-digit number in the product url
        match = search(r'/\d{12}\?', href)
        if match is None:
            continue
        ebay_id = match.group()[1:-1]
        if len(ebay_id) != constants.ebay_id_length or ebay_id in ebay_ids:
            continue
        ebay_ids.append(ebay_id)
    if ebay_ids:
        return ebay_ids
    return None
def _find_products_info(self, tree: etree) -> None:
    """ Find necessary products info in html elements """
    products = tree.xpath('//div[@data-asin]')
    if not products:
        logger.warning('Empty products list before finding info')
        return
    for product in products:
        asin = product.get('data-asin')
        if asin is None or len(asin) != constants.asin_length:
            continue
        images = product.xpath('.//img')
        if not images:
            continue
        title = images[0].get('alt')
        if not title:
            continue
        # normalize the title: lowercase, strip non-alphanumerics,
        # collapse whitespace, and drop stopwords
        title = sub(r'[^0-9a-z ]', '', title.lower())
        title = sub(r' {2,}', ' ', ' ' + title + ' ')
        title = sub(r' ({0}) '.format('|'.join(constants.stopwords)), ' ', title)
        title = sub(r'^ | $', '', title)
        words = title.split()
        if len(words) > constants.title_max_words:
            words = words[:constants.title_n_words]
        self._products[asin] = {'title': ' '.join(words)}
def get_person(stree: etree) -> Person:
    """
    Extract a Person from the xml of a docket, parsed into sections.

    Returns a Person with blank names on errors.

    Args:
        stree: xml tree of a docket, parsed into a header and some number of sections

    Returns:
        a Person object
    """
    try:
        name = stree.xpath(
            "docket/header/caption/defendant_line")[0].text.strip()
        first_name, last_name = split_first_name(name)
    except IndexError:
        first_name = ""
        last_name = ""
    aliases = xpath_or_empty_list(stree, "//alias")
    date_of_birth = xpath_date_or_blank(stree, "//birth_date")
    return Person(first_name=first_name,
                  last_name=last_name,
                  date_of_birth=date_of_birth,
                  aliases=aliases)
def xpath_date_or_blank(tree: etree, xpath: str) -> Optional[date]:
    """ Given an etree and an xpath expression, return the value of the
    expression as a date, or None. """
    try:
        return datetime.strptime(
            tree.xpath(xpath)[0].text.strip(), r"%m/%d/%Y").date()
    except (IndexError, ValueError, AttributeError):
        # IndexError: nothing matched; AttributeError: the matched element has
        # no text; ValueError: the text is not an mm/dd/yyyy date
        return None
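# Usage sketch for xpath_date_or_blank (hypothetical data; assumes lxml.etree
# as etree and the datetime/date imports used above):
#
#   doc = etree.fromstring("<docket><birth_date> 01/15/1990 </birth_date></docket>")
#   xpath_date_or_blank(doc, "//birth_date")   # -> date(1990, 1, 15)
#   xpath_date_or_blank(doc, "//no_such_tag")  # -> None (IndexError is caught)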
def _parse_list(self, tree: etree) -> Optional[str]:
    nomore: list = tree.xpath('//div[@class="not-found"]')
    if nomore:
        return 'nomore'
    try:
        products: list = tree.xpath(
            '//div[@id="shop-all-list"]//div[@class="svr-info"]//a[@data-click-name="shop_info_gooddeal_click"]/@title'
        )
        product_links: list = tree.xpath(
            '//div[@id="shop-all-list"]//div[@class="svr-info"]//a[@data-click-name="shop_info_gooddeal_click"]/@href'
        )
        for product, link in zip(products, product_links):
            # collect links whose title mentions the target brand ("伊婉")
            if '伊婉' in product:
                self._ids.add(link)
        return 'hasmore'
    except Exception:
        return None
def _get_params(self, html: etree) -> str:
    """ Determine the product's parameters """
    params = ""
    span_params = html.xpath(self.locators['txbParams'])
    for span_param in span_params:
        params += "|" + span_param.text.strip()
    return params
def xpath_or_blank(stree: etree, xpath: str) -> str:
    """ Given an etree and an xpath expression, return the value of the
    expression, or an empty string. A helper method. """
    try:
        return stree.xpath(xpath)[0].text.strip()
    except IndexError:
        return ""
def get_img_url(self, node: etree) -> str:
    """get img url from enclosure or media:content tag if any

    Arguments:
        node {etree} -- item node of rss feed

    Returns:
        str -- the url of the image found in enclosure or media:content tag
    """
    img_url = ""
    enclosures = node.xpath(".//enclosure")
    # media:content tag
    medias = node.xpath(".//*[local-name()='content'][@url]")
    if len(enclosures) > 0:
        img_url = enclosures[0].get('url')
    elif len(medias) > 0:
        img_url = medias[0].get('url')
    return img_url
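# Usage sketch for get_img_url (hypothetical RSS item; `feed` stands in for an
# instance of the class above; the enclosure wins because it is checked first):
#
#   item = etree.fromstring(
#       '<item><enclosure url="http://example.com/a.jpg"/>'
#       '<content xmlns="http://search.yahoo.com/mrss/" url="http://example.com/b.jpg"/></item>'
#   )
#   feed.get_img_url(item)  # -> "http://example.com/a.jpg"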
def get_first_node(dom: etree, xpaths: list):
    """get first node found in the list of xpath expressions"""
    node: Optional[etree._Element] = None
    for xpath in xpaths:
        results = dom.xpath(xpath)
        if len(results) > 0:
            node = results[0]
            break
    return node
def _replace_urls_process_links(dom: etree, attribute: str):
    # note: relies on `self` from the enclosing scope (closure over the handler)
    for o in dom.xpath("//*[@%s]" % attribute):
        if o.attrib[attribute].startswith("//"):
            # protocol-relative url: prepend the original site's scheme
            protocol: str = "http:"
            if self.handler.get_original_website().find("https") > -1:
                protocol = "https:"
            o.attrib[attribute] = protocol + o.attrib[attribute]
        elif o.attrib[attribute].startswith("/"):
            # root-relative url: prepend the original website
            o.attrib[attribute] = (
                self.handler.get_original_website() + o.attrib[attribute][1:])
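# Sketch of the rewrites performed by _replace_urls_process_links (hypothetical
# site; assumes self.handler.get_original_website() returns "https://example.com/"):
#
#   //cdn.example.com/pic.png  ->  https://cdn.example.com/pic.png
#   /static/app.css            ->  https://example.com/static/app.css
#
# Absolute urls (with a scheme) and relative urls without a leading slash are
# left untouched.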
def _parse_item(self, tree: etree, link: str):
    try:
        item = {}
        item['product'] = tree.xpath(
            '//p[@class="product-name bold"]/text()')[0]
        item['link'] = link
        item['price'] = tree.xpath('//div[@class="price"]//text()')[-1]
        item['hospital'] = tree.xpath(
            '//div[@class="shop-item"]/p[@class="shop-name"]/text()')[0]
        item['address'] = tree.xpath(
            '//div[@class="shop-item"]/p[@class="shop-addr"]/text()'
        )[0].replace('地址:', '')  # strip the "Address:" label
        item['phone'] = tree.xpath(
            '//div[@class="shop-item"]/p[@class="shop-phone"]/text()'
        )[0].replace('电话:', '')  # strip the "Phone:" label
        print(item)
        self.content.append(item)
        return 'success'
    except Exception:
        return None
def return_word_from_id(xml: etree, el_id: str) -> str:
    """Given an XML document, return the innertext at id

    Args:
        xml (etree): XML document
        el_id (str): ID

    Returns:
        str: Innertext of element with el_id in xml
    """
    return xml.xpath('//*[@id="%s"]/text()' % el_id)[0]
def _check_location(tree: etree) -> bool:
    """ Check current session location on Amazon """
    try:
        span = tree.xpath('//span[@id="glow-ingress-line2"]')[0]
    except IndexError:
        return False
    else:
        return span.text != CURRENT_AMAZON_LOCATION
def return_word_from_id(xml: etree, el_id: str) -> str:
    """
    Given an XML document, return the innertext at id

    Parameters
    ----------
    xml : etree
        XML document
    el_id : str
        ID

    Returns
    -------
    str
        Innertext of element with el_id in xml
    """
    return xml.xpath('//*[@id="%s"]/text()' % el_id)[0]
def get_img_url(self, node: etree) -> str:
    """get img url from media:thumbnail tag if any

    Arguments:
        node {etree} -- item node of rss feed

    Returns:
        str -- the url of the image found in media:thumbnail tag
    """
    img_url = ""
    # media:thumbnail tag
    medias = node.xpath(
        ".//*[local-name()='thumbnail'][@url]", namespaces=NAMESPACES)
    if len(medias) > 0:
        img_url = medias[0].get('url')
    return img_url
def get_sentences(stree: etree) -> List[Sentence]:
    """Find the sentences in a sequence (as an xml tree) from a
    disposition section of a docket.
    """
    # use relative xpaths so each query is scoped to the sequence element
    # passed in, not the whole docket
    sequence_date = xpath_date_or_blank(stree, ".//action_date")
    sentences = stree.xpath(".//sentence_info")
    sentences = [
        Sentence(
            sentence_date=sequence_date,
            sentence_type=xpath_or_blank(s, ".//program"),
            sentence_period="...",
            sentence_length=SentenceLength(
                min_time=(
                    s.xpath(".//sentence_length/min_length/time")[0].text,
                    s.xpath(".//sentence_length/min_length/unit")[0].text),
                max_time=(
                    s.xpath(".//sentence_length/max_length/time")[0].text,
                    s.xpath(".//sentence_length/max_length/unit")[0].text),
            ))
        for s in sentences
    ]
    return sentences
def add_images(element: etree, config: dict) -> etree:
    """Add images from configuration object to xml

    Args:
        element (etree): xml without images
        config (dict): standard ReadAlong-Studio configuration

    Returns:
        etree: xml with images markup
    """
    if "images" not in config:
        raise KeyError(
            "Configuration tried to add images, but no images were found in configuration"
        )
    if not isinstance(config["images"], dict):
        raise TypeError(
            f"Image configuration is of type {type(config['images'])} but a dict is required."
        )
    pages = element.xpath('//div[@type="page"]')
    for i, url in config["images"].items():
        image_el = etree.Element("graphic", url=url)
        try:
            i = int(i)
        except ValueError as e:
            raise ValueError(
                f"Images must be indexed using integers, you provided {i}"
            ) from e
        try:
            pages[i].append(image_el)
        except IndexError as e:
            raise IndexError(
                f"No page found at index {i}, please verify your configuration"
            ) from e
    return element
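# Example configuration for add_images (hypothetical file names; keys are page
# indices, as strings or ints, per the int() conversion above):
#
#   config = {"images": {"0": "cover.jpg", "1": "page-one.jpg"}}
#   element = add_images(element, config)
#   # appends <graphic url="cover.jpg"/> to the first <div type="page">, and
#   # <graphic url="page-one.jpg"/> to the second.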
def do_xPath_queries(tree: etree):
    lessons = tree.xpath(QUERIES.GET_LESSONS)
    print('All lessons: {}'.format(lessons))
    audience = tree.xpath(QUERIES.GET_AUDIENCE)
    print('Audience: {}'.format(audience))
    practicals = tree.xpath(QUERIES.GET_PRACTICALS)
    print('Practicals: {}'.format(practicals))
    practicals_239 = tree.xpath(QUERIES.GET_PRACTICALS_FROM_239)
    print('Practicals from 239: {}'.format(practicals_239))
    teachers = tree.xpath(QUERIES.GET_TEACHERS_FROM_239)
    print('Teachers from 239: {}'.format(teachers))
    last_lessons = tree.xpath(QUERIES.GET_LAST_LESSONS)
    print('Last lessons: {}'.format(last_lessons))
    lessons_count = int(tree.xpath(QUERIES.GET_LESSONS_COUNT))
    print('Lessons count: {}'.format(lessons_count))
def add_supplementary_xml(element: etree, config: dict) -> etree:
    """Add arbitrary xml from configuration object to xml

    Args:
        element (etree): original xml document
        config (dict): standard ReadAlong-Studio configuration

    Returns:
        etree: xml with supplementary markup
    """
    if "xml" not in config:
        raise KeyError(
            "Configuration tried to add supplementary xml, but no declarations were found in configuration"
        )
    for el in config["xml"]:
        parents = element.xpath(el["xpath"])
        if not parents:
            LOGGER.warning(
                f"No elements found at {el['xpath']}, please verify your configuration."
            )
        for parent in parents:
            parent.append(etree.XML(el["value"]))
    return element
def parse_delivery_time_response(tree: etree) -> Optional[int]:
    """ Find and parse date string in html response """
    for location in ('//strong[@class="vi-acc-del-range"]/b/text()',
                     '//strong[@class="vi-acc-del-range"]/text()',
                     '//span[@class="vi-acc-del-range"]/b/text()',
                     '//span[@class="vi-acc-del-range"]/text()'):
        date = tree.xpath(location)
        if date:
            break
    else:
        return None
    # extract an abbreviated date like "Sep. 14"
    match = search(r'[A-Z][a-z]{2}\. \d{1,2}', date[0])
    if match is None:
        return None
    date = match.group()
    # calculate number of delivery days
    current_date = datetime.now(current_timezone).date()
    delivery_date = current_date.replace(
        day=int(date[5:]), month=constants.ebay_delivery_months[date[:3]])
    if current_date > delivery_date:
        return None
    return (delivery_date - current_date).days
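# Worked example for parse_delivery_time_response (hypothetical page; assumes
# current_timezone and constants.ebay_delivery_months are defined as used
# above): if the page contains "Sep. 14" inside a vi-acc-del-range element and
# today is Sep. 10 of the same year, the function returns 4; if the extracted
# date falls before today, it returns None.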
def delete_xpaths(dom: etree, xpaths: List[str]):
    """delete nodes of the given dom matching xpath expressions"""
    for xpath in xpaths:
        delete_nodes(dom.xpath(xpath))
def get_items(self, dom: etree) -> list:
    return dom.xpath("//item")
def xpath_or_empty_list(tree: etree, xpath: str) -> List[str]:
    """ Given an etree, find a list of strings, or return an empty list. """
    return [el.text.strip() for el in tree.xpath(xpath)]
def get_descriptions(self, item: etree) -> list:
    return item.xpath(".//description")
def get_links(self, item: etree) -> list:
    return item.xpath(".//link")
def sections_from_pages(ptree: etree) -> etree:
    """
    Splice together sections in `ptree` that are separated across pages,
    and get rid of the `page` level of the `ptree` entirely.

    From

        <docket>
          <page>
            <section> </section>
          </page>
          <page>
            <section_continued> </section_continued>
          </page>
        </docket>

    To

        <docket>
          <section> </section>
        </docket>
    """
    # create an empty tree to add all the other sections onto.
    stitched_xml = etree.Element("docket")
    stitched_xml.append(ptree.xpath("//header[1]")[0])
    pages = ptree.xpath("//page")
    logging.info(f" {len(pages)} pages in this docket.")
    # Recombine a section if it carries onto the following page(s).
    combined_sections = []
    for page in pages:
        sections = page.xpath(".//section")
        for section in sections:
            # if the section last added to combined_sections is the same kind
            # of section, add the current section's text to the most recent
            # combined section's text.
            if combined_sections and section.xpath("@name")[0] == \
                    combined_sections[-1].xpath("@name")[0]:
                # here is where we remove the overflowing header lines from
                # this section, before appending it to the previous section.
                section_header_remover = create_section_header_remover(
                    section.xpath("@name")[0])
                # strip() removes empty lines at the beginning of the section,
                # which is good. But it also removes spaces at the beginning of
                # the first line with text. Some grammar pieces rely on the
                # indentation of a line to know what kind of line it is, and
                # this strip() removes that indentation.
                section_text = "\n".join(
                    [ln for ln in section.text.split("\n") if ln.strip()])
                section_text = section_header_remover(section_text)
                # now combine the previous section with this section, because
                # this section is just the overflow of the last one on a
                # different page.
                combined_sections[-1].text = "\n".join(
                    [combined_sections[-1].text.strip(), section_text])
            # else the current section is new, so add it to the end of
            # combined_sections
            else:
                combined_sections.append(section)
    for section_node in combined_sections:
        stitched_xml.append(section_node)
    last_page = pages[-1]
    if len(last_page.xpath(".//section")) == 0:
        # add the trailing <body> lines to the last section in combined_sections
        last_page_body = last_page.xpath("body")[0].text
        stitched_xml.xpath("//section[last()]")[0].text += last_page_body
    docket_tree = etree.ElementTree(stitched_xml)
    return docket_tree