def uniform_yes_no(tt: etree) -> None:
    """Normalize yes/no attribute values on tt to 'Y' / 'N' in place."""
    for k in tt.attrib:
        v = tt.attrib[k]
        if v.lower() == 'y' or v.lower() == 'yes':
            tt.attrib[k] = 'Y'
        elif v.lower() == 'n' or v.lower() == 'no':
            tt.attrib[k] = 'N'
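
# A minimal usage sketch for uniform_yes_no (illustrative only; assumes lxml is
# installed and uniform_yes_no is in scope):
def _example_uniform_yes_no():
    from lxml import etree as _etree
    el = _etree.fromstring('<opt enabled="yes" archived="No" draft="maybe"/>')
    uniform_yes_no(el)
    # el.attrib is now {'enabled': 'Y', 'archived': 'N', 'draft': 'maybe'}
    return dict(el.attrib)
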
def add_ids_aux(element: etree, ids: defaultdict, parent_id: str = "") -> defaultdict:
    """ Add ids to xml element

    Args:
        element (etree): Element to add ids to
        ids (defaultdict): counters for ids assigned so far by tag type
        parent_id (str): Optional; id of parent element, by default ''

    Returns:
        defaultdict: ids, with new counts added by tag type
    """
    if element.tag is etree.Comment:
        return ids
    tag = etree.QName(element.tag).localname
    if tag in TAGS_TO_IGNORE:
        return ids
    if is_do_not_align(element):
        if tag == "w":
            raise RuntimeError(
                'Found <w> element with do-not-align="true" attribute. '
                "This is not allowed, please verify your XML input.")
        if element.xpath(".//w"):
            raise RuntimeError(
                'Found <w> nested inside a do-not-align="true" element. '
                "This is not allowed, please verify your XML input.")
        return ids
    if "id" not in element.attrib:
        if tag in TAG_TO_ID:
            id = TAG_TO_ID[tag]
        elif tag == "seg" and "type" in element.attrib:
            if element.attrib["type"] == "syll":
                id = "y"
            elif element.attrib["type"] in [
                "morph",
                "morpheme",
                "base",
                "root",
                "prefix",
                "suffix",
            ]:
                id = "m"
        else:
            id = tag
        if id not in ids:
            ids[id] = 0
        element.attrib["id"] = parent_id + id + str(ids[id])
        ids[id] += 1
    full_id = element.attrib["id"]
    # This deep copy of ids means that the ids counters are shared recursively
    # between siblings, but not between grand-children. Thus, if processing a p
    # element, the next p element will see its counter incremented, but the s
    # elements of the next p element will start again at 0. ids always has the
    # counters of all ancestors and their siblings, by tag, but not the
    # descendants of siblings of ancestors.
    new_ids = deepcopy(ids)
    for child in element:
        new_ids = add_ids_aux(child, new_ids, full_id)
    return ids
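
# A usage sketch for add_ids_aux (illustrative only; it assumes the module-level
# TAG_TO_ID / TAGS_TO_IGNORE tables and the is_do_not_align helper are in scope,
# so the exact ids produced depend on those tables):
def _example_add_ids_aux():
    from collections import defaultdict
    from lxml import etree as _etree
    root = _etree.fromstring("<document><s><w>hello</w><w>world</w></s></document>")
    add_ids_aux(root, defaultdict(int))
    # Every element now carries an "id" attribute prefixed by its parent's id,
    # e.g. the second <w> might end up with something like "...s0w1".
    return [el.get("id") for el in root.iter()]
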
def _error_check(self, command_response: etree) -> None:
    """command_response will be an XML Etree object."""
    error_list = command_response.find("./clierror")
    command_obj = command_response.find("./input")
    if error_list is not None:
        command = command_obj.text if command_obj is not None else "Unknown command"
        msg = etree.tostring(error_list).decode()
        raise NXAPICommandError(command, msg)
def create_game(root: Element):
    """Build a Game from the root XML element and populate its sub-sections."""
    game = game_models.Game(root.attrib['name'])
    game.max_players = root.attrib['max_players']
    game.min_players = root.attrib['min_players']
    add_actions(root.find("actions"), game)
    add_collections(root.find("collections"), game)
    add_turns(root.find("turns"), game)
    add_pieces(root.find("pieces"), game)
    return game
def trim_and_sort_attrs(tt: etree) -> None:
    """Strip whitespace from attribute values and re-insert the attributes in sorted key order."""
    for k in list(tt.attrib):
        tt.attrib[k] = tt.attrib[k].strip()
    dd = {}
    for k in list(tt.attrib):  # copy the keys: we pop while iterating
        dd[k] = tt.attrib.pop(k)
    for k in sorted(dd):
        tt.attrib[k] = dd[k]
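
# A minimal usage sketch for trim_and_sort_attrs (illustrative only):
def _example_trim_and_sort_attrs():
    from lxml import etree as _etree
    el = _etree.fromstring('<item b=" two " a="one "/>')
    trim_and_sort_attrs(el)
    # Values are stripped and attributes re-inserted in key order:
    # b'<item a="one" b="two"/>'
    return _etree.tostring(el)
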
def get_charges(stree: etree) -> List[Charge]:
    """
    Find a list of the charges in a parsed docket.
    """
    # find the charges in the Charges section
    charges = stree.xpath("//section[@name='section_charges']//charge")
    # charges is temporarily a list of tuples of [(sequence_num, Charge)]
    charges = [(
        xpath_or_blank(charge, "./seq_num"),
        Charge(
            offense=xpath_or_blank(charge, "./statute_description"),
            grade=xpath_or_blank(charge, "./grade"),
            statute=xpath_or_blank(charge, "./statute"),
            disposition="Unknown",
            disposition_date=None,
            sentences=[],
        ),
    ) for charge in charges]
    # figure out the disposition dates by looking for a final disposition date
    # that matches a charge.
    final_disposition_events = stree.xpath(
        "//section[@name='section_disposition_sentencing']//case_event[case_event_desc_and_date/is_final[contains(text(),'Final Disposition')]]"
    )
    for final_disp_event in final_disposition_events:
        final_disp_date = xpath_date_or_blank(final_disp_event, ".//case_event_date")
        applies_to_sequences = xpath_or_empty_list(final_disp_event, ".//sequence_number")
        for seq_num in applies_to_sequences:
            # set the final_disp date for the charge with sequence number seq_num
            for sn, charge in charges:
                if sn == seq_num:
                    charge.disposition_date = final_disp_date
    # Figure out the disposition of each charge from the disposition section.
    # Do this by finding the last sequence in the disposition section for
    # the sequence with seq_num. The disposition of the charge is that
    # sequence's disposition. Sentence is in that xml element too.
    try:
        disposition_section = stree.xpath(
            "//section[@name='section_disposition_sentencing']")[0]
        for seq_num, charge in charges:
            try:
                # seq is the last sequence for the charge seq_num.
                seq = disposition_section.xpath(
                    f"./disposition_section/disposition_subsection/disposition_details/case_event/sequences/sequence[sequence_number/text()=' {seq_num} ']"
                )[-1]
                charge.disposition = xpath_or_blank(seq, "./offense_disposition")
                charge.sentences = get_sentences(seq)
            except IndexError:
                continue
    except IndexError:
        pass
    return [c for i, c in charges]
def save_file(self, obj):
    """Ask the user for a destination path and write obj.xmlroot to it as XML."""
    qfd = QtWidgets.QFileDialog()
    (name, file_type) = QtWidgets.QFileDialog.getSaveFileName(
        qfd, 'Save File', "", "XML files (*.xml)")
    try:
        with open(name, 'w') as xmlfile:
            tree = ET(obj.xmlroot)
            tree.write(xmlfile, encoding='unicode')
    except OSError:
        # Dialog cancelled (empty name) or file could not be written.
        print("No file saved")
def _parse_html(self, html: etree) -> None:
    """Parse the HTML code returned from the server."""
    self.name = html.xpath('//div[@class="mytitle h4"]')[0].text
    self.iban = html.xpath('//div[@class="mysubtitle h4"]')[0].text
    panel = html.xpath('//div[@class="myPanelData"]')[0]
    self.currency = panel.xpath('//span[@class="mycurr"]')[0].text
    self.balance = self._parse_float(
        panel.xpath(AMOUNT_SEARCH_PATH.format(1))[0].text)
    self.interest_sum = self._parse_float(
        panel.xpath(AMOUNT_SEARCH_PATH.format(2))[0].text)
    self.interest_rate = self._parse_float(
        panel.xpath(AMOUNT_SEARCH_PATH.format(3))[0].text)
    self._notify_listeners()
def get_abstract(parsed_document: etree, alt_text: str = 'n/a') -> str:
    """Return the abstract text of a parsed article, or alt_text if none can be found."""
    try:
        abstract = parsed_document.find('front/article-meta/abstract/p').text
        if abstract is None:
            abstract = ''
            for section in parsed_document.findall(
                    'front/article-meta/abstract/sec/p'):  # for sectioned abstracts
                abstract += str(section.text)
            if abstract == '':
                abstract = alt_text
    except AttributeError:
        abstract = alt_text
    return abstract
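
# A usage sketch for get_abstract (illustrative only; the XML below is a minimal
# JATS-style fragment made up for the example):
def _example_get_abstract():
    from lxml import etree as _etree
    doc = _etree.fromstring(
        "<article><front><article-meta><abstract>"
        "<p>We study XML parsing.</p>"
        "</abstract></article-meta></front></article>"
    )
    return get_abstract(doc)  # -> 'We study XML parsing.'
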
def convert_metars(root: etree) -> List[Metar]:
    """
    Convert metar data for the database.

    :param root: XML etree root.
    :return: List of SQLAlchemy Base classes for Metars.
    """
    def process(kids: List[Element], xml_class: MetarXML) -> MetarXML:
        """
        Process the XML data so that it can be mapped to the database.

        :param kids: child branches of the etree.
        :param xml_class: Empty XML class object.
        :return: Instantiated class with loaded data.
        """
        for elt in kids:
            if elt.attrib:
                xml_class.add_child(process_attrib_metar(elt))
            else:
                kwarg = {elt.tag: elt.text}
                xml_class.set(**kwarg)
        return xml_class

    data = root.find("data")
    elems = data.findall("METAR")
    maps = []
    for elm in elems:
        children = list(elm)
        proc = process(children, MetarXML())
        mapped = proc.create_mapping()
        maps.append(mapped)
    return maps
def _node_to_dictionary(node: etree, ignore_attributes: bool = False):
    """
    Convert an lxml.etree node tree recursively into a nested dictionary.

    The node's attributes and child items will be added to its dictionary.

    Args:
        node (etree): The etree node
        ignore_attributes (bool): Optional parameter; whether or not to
            skip the node's attributes. Default is False.
    """
    result = {} if ignore_attributes else dict(node.attrib)
    for child_node in node.iterchildren():
        key = child_node.tag.split("}")[1]
        if child_node.text and child_node.text.strip():
            value = child_node.text
        else:
            value = DIMRParser._node_to_dictionary(child_node)
        if key in result:
            if type(result[key]) is list:
                result[key].append(value)
            else:
                first_value = result[key].copy()
                result[key] = [first_value, value]
        else:
            result[key] = value
    return result
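
# A usage sketch for the helper above (illustrative only; in the real code it is
# reached through the DIMRParser class, and the tag handling assumes namespaced
# tags of the form "{uri}name"):
def _example_node_to_dictionary():
    from lxml import etree as _etree
    node = _etree.fromstring(
        '<config xmlns="urn:example:dimr">'
        '<component><name>flow</name></component>'
        '<component><name>waves</name></component>'
        '</config>'
    )
    return DIMRParser._node_to_dictionary(node)
    # -> {'component': [{'name': 'flow'}, {'name': 'waves'}]}
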
def get_all_contents(dom: etree, xpaths: list, alt_to_p: bool = False) -> Tuple[str, str]:
    """Get content of all xpaths provided.

    Args:
        dom (etree): dom where to get the content
        xpaths (list): list of xpath expressions used to extract content from the dom object
        alt_to_p (bool, optional): If true, when an alt is found, a new element <p> is
            added with the alt content (useful for readability). Defaults to False.

    Returns:
        Tuple[str, str]: the concatenated content and the collected alt texts
    """
    content: str = ""
    alts: str = ""
    for xpath in xpaths:
        results = dom.xpath(xpath)
        if len(results) > 0:
            for result in results:
                # Wrap non-<p> nodes in a <p>; <p> nodes are passed through unchanged.
                enclosing: str = "%s"
                if result.tag != "p":
                    enclosing = "<p>%s</p>"
                alts = _get_alts(alt_to_p, result)
                content += enclosing % to_string(result)
    return content, alts
def _find_ebay_products_info(tree: etree) -> Optional[list]:
    """ Find necessary eBay products info in html elements """
    products = tree.xpath('//li[@class="s-item "]')
    if not len(products):
        logger.warning('Empty eBay products list before finding info')
        return
    ebay_ids = []
    for product in products:
        ebay_id = product.xpath('.//a[@class="s-item__link"]')[0].get('href')
        if ebay_id is None:
            continue
        ebay_id = search(r'/\d{12}\?', ebay_id)
        if ebay_id is None:
            continue
        ebay_id = ebay_id.group()[1:-1]
        if len(ebay_id) != constants.ebay_id_length or ebay_id in ebay_ids:
            continue
        ebay_ids.append(ebay_id)
    if len(ebay_ids):
        return ebay_ids
def _find_products_info(self, tree: etree) -> None:
    """ Find necessary products info in html elements """
    products = tree.xpath('//div[@data-asin]')
    if not len(products):
        logger.warning('Empty products list before finding info')
        return
    for product in products:
        asin = product.get('data-asin')
        title = product.xpath('.//img')[0].get('alt')
        if asin is None or len(asin) != constants.asin_length:
            continue
        if title is None or not len(title):
            continue
        title = sub(r'[^0-9a-z ]', '', title.lower())
        title = sub(r' {2,}', ' ', ' ' + title + ' ')
        title = sub(r' ({0}) '.format('|'.join(constants.stopwords)), ' ', title)
        title = sub(r'^ | $', '', title)
        words = title.split()
        if len(words) > constants.title_max_words:
            words = words[:constants.title_n_words]
        self._products[asin] = {'title': ' '.join(words)}
def get_person(stree: etree) -> Person:
    """
    Extract a Person from the xml of a docket, parsed into sections.

    Returns an empty Person object on errors.

    Args:
        stree: xml tree of a docket, parsed into a header and some number of sections

    Returns:
        a Person object
    """
    try:
        name = stree.xpath(
            "docket/header/caption/defendant_line")[0].text.strip()
        first_name, last_name = split_first_name(name)
    except IndexError:
        first_name = ""
        last_name = ""
    aliases = xpath_or_empty_list(stree, "//alias")
    date_of_birth = xpath_date_or_blank(stree, "//birth_date")
    return Person(first_name=first_name,
                  last_name=last_name,
                  date_of_birth=date_of_birth,
                  aliases=aliases)
def _parse_list(self, tree: etree) -> str:
    nomore: list = tree.xpath('//div[@class="not-found"]')
    if nomore:
        return 'nomore'
    try:
        products: list = tree.xpath(
            '//div[@id="shop-all-list"]//div[@class="svr-info"]//a[@data-click-name="shop_info_gooddeal_click"]/@title'
        )
        product_link: list = tree.xpath(
            '//div[@id="shop-all-list"]//div[@class="svr-info"]//a[@data-click-name="shop_info_gooddeal_click"]/@href'
        )
        for product, link in zip(products, product_link):
            if '伊婉' in product:
                self._ids.add(link)
        return 'hasmore'
    except Exception:
        pass
def get_post_url(etroot: lxml.etree) -> str:
    """
    get post url from etree object

    :param etroot: lxml.etree object
    :return: url
    """
    post_url = "http:" + etroot.find('post_url').text
    return post_url
def xpath_date_or_blank(tree: etree, xpath: str) -> Optional[date]:
    """ Given an etree and an xpath expression, return the value of the
    expression as a date, or None"""
    try:
        return datetime.strptime(
            tree.xpath(xpath)[0].text.strip(), r"%m/%d/%Y").date()
    except (IndexError, ValueError):
        return None
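
# A minimal usage sketch for xpath_date_or_blank (illustrative only):
def _example_xpath_date_or_blank():
    from lxml import etree as _etree
    tree = _etree.fromstring("<docket><birth_date> 01/15/1990 </birth_date></docket>")
    return xpath_date_or_blank(tree, "//birth_date")  # -> datetime.date(1990, 1, 15)
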
def get_img_url(self, node: etree) -> str:
    """get img url from enclosure or media:content tag if any

    Arguments:
        node {etree} -- item node of rss feed

    Returns:
        str -- the url of the image found in enclosure or media:content tag
    """
    img_url = ""
    enclosures = node.xpath(".//enclosure")
    # media:content tag
    medias = node.xpath(".//*[local-name()='content'][@url]")
    if len(enclosures) > 0:
        img_url = enclosures[0].get('url')
    elif len(medias) > 0:
        img_url = medias[0].get('url')
    return img_url
def insert_element(xml_tree: et, element: et.Element) -> None:
    """Insert element as a sibling immediately after the TEI <div type="deposition">."""
    namespaces = {'default': "http://www.tei-c.org/ns/1.0"}
    div_deposition = xml_tree.find('.//default:div[@type="deposition"]',
                                   namespaces=namespaces)
    div_deposition_parent = div_deposition.getparent()
    injection_position = div_deposition_parent.index(div_deposition) + 1
    div_deposition_parent.insert(injection_position, element)
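
# A usage sketch for insert_element (illustrative only; a tiny TEI-namespaced tree
# is built inline and a made-up <note> element is inserted after the deposition div):
def _example_insert_element():
    from lxml import etree as _etree
    tree = _etree.fromstring(
        '<TEI xmlns="http://www.tei-c.org/ns/1.0">'
        '<text><body><div type="deposition"/></body></text>'
        '</TEI>'
    )
    note = _etree.Element('{http://www.tei-c.org/ns/1.0}note')
    note.text = 'inserted after the deposition'
    insert_element(tree, note)
    return _etree.tostring(tree)
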
def remove(cls, xml_node: etree):
    """Detach xml_node from its parent, if both exist."""
    if xml_node is None:
        return
    parent_node = xml_node.getparent()
    if parent_node is None:
        return
    parent_node.remove(xml_node)
def xpath_or_blank(stree: etree, xpath: str) -> str:
    """ given an etree and an xpath expression, return the value of the
    expression, or an empty string. A helper method"""
    try:
        return stree.xpath(xpath)[0].text.strip()
    except IndexError:
        return ""
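
# A minimal usage sketch for xpath_or_blank (illustrative only):
def _example_xpath_or_blank():
    from lxml import etree as _etree
    tree = _etree.fromstring("<charge><grade> M1 </grade></charge>")
    present = xpath_or_blank(tree, "./grade")    # -> 'M1'
    missing = xpath_or_blank(tree, "./statute")  # -> ''
    return present, missing
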
def get_first_node(dom: etree, xpaths: list):
    """get first node found in the list of xpath expressions"""
    node: Optional[etree._Element] = None
    for xpath in xpaths:
        results = dom.xpath(xpath)
        if len(results) > 0:
            node = results[0]
            break
    return node
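
# A minimal usage sketch for get_first_node (illustrative only): the first xpath
# that matches wins.
def _example_get_first_node():
    from lxml import etree as _etree
    dom = _etree.fromstring("<html><body><h1>Title</h1><p>Text</p></body></html>")
    node = get_first_node(dom, ['//article//h1', '//h1', '//p'])
    return node.text  # -> 'Title'
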
def _get_params(self, html: etree) -> str:
    """ Extract the product parameters """
    params = ""
    span_params = html.xpath(self.locators['txbParams'])
    for span_param in span_params:
        params += "|" + span_param.text.strip()
    return params
def _replace_urls_process_links(dom: etree, attribute: str):
    """Rewrite protocol-relative ("//...") and root-relative ("/...") URLs held in
    the given attribute into absolute URLs; relies on `self.handler` from the
    enclosing scope."""
    for o in dom.xpath("//*[@%s]" % attribute):
        if o.attrib[attribute].startswith("//"):
            protocol: str = "http:"
            if self.handler.get_original_website().find("https") > -1:
                protocol = "https:"
            o.attrib[attribute] = protocol + o.attrib[attribute]
        elif o.attrib[attribute].startswith("/"):
            o.attrib[attribute] = self.handler.get_original_website() + o.attrib[attribute][1:]
def create_action(element: Element):
    if "id" in element.attrib:
        name = element.attrib["id"]
    else:
        name = str(element.sourceline)
        for parent in element.iterancestors():
            if "id" in parent.attrib:
                name += parent.attrib["id"]
                break
    return game_models.Action(list(parse_step(step) for step in element), name)
def _parse_item(self, tree: etree, link: str):
    try:
        item = {}
        item['product'] = tree.xpath(
            '//p[@class="product-name bold"]/text()')[0]
        item['link'] = link
        item['price'] = tree.xpath('//div[@class="price"]//text()')[-1]
        item['hospital'] = tree.xpath(
            '//div[@class="shop-item"]/p[@class="shop-name"]/text()')[0]
        item['address'] = tree.xpath(
            '//div[@class="shop-item"]/p[@class="shop-addr"]/text()'
        )[0].replace('地址:', '')  # strip the leading "地址:" ("Address:") label
        item['phone'] = tree.xpath(
            '//div[@class="shop-item"]/p[@class="shop-phone"]/text()'
        )[0].replace('电话:', '')  # strip the leading "电话:" ("Phone:") label
        print(item)
        self.content.append(item)
        return 'success'
    except Exception:
        pass
def return_word_from_id(xml: etree, el_id: str) -> str:
    """Given an XML document, return the innertext at id

    Args:
        xml (etree): XML document
        el_id (str): ID

    Returns:
        str: Innertext of element with el_id in xml
    """
    return xml.xpath('//*[@id="%s"]/text()' % el_id)[0]
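
# A minimal usage sketch for return_word_from_id (illustrative only):
def _example_return_word_from_id():
    from lxml import etree as _etree
    xml = _etree.fromstring('<s><w id="w0">hello</w><w id="w1">world</w></s>')
    return return_word_from_id(xml, "w1")  # -> 'world'
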
def _check_location(tree: etree):
    """ Check current session location on Amazon """
    try:
        span = tree.xpath('//span[@id="glow-ingress-line2"]')[0]
    except IndexError:
        return False
    else:
        return span.text != CURRENT_AMAZON_LOCATION
def get_priority_params_from_html(etroot: lxml.etree) -> dict:
    """Collect the name/value pairs of all <input> elements into a dict."""
    keys = []
    values = []
    for child in etroot.iter('input'):
        try:
            # Read both attributes before appending so keys and values stay
            # aligned when one of them is missing.
            name = child.attrib['name']
            value = child.attrib['value']
        except KeyError:
            continue
        keys.append(name)
        values.append(value)
    d = dict(zip(keys, values))
    return d
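
# A minimal usage sketch for get_priority_params_from_html (illustrative only):
def _example_get_priority_params_from_html():
    from lxml import etree as _etree
    form = _etree.fromstring(
        '<form>'
        '<input name="csrf" value="abc123"/>'
        '<input name="page" value="2"/>'
        '<input type="submit"/>'
        '</form>'
    )
    return get_priority_params_from_html(form)  # -> {'csrf': 'abc123', 'page': '2'}
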
def parse_select(element: Element):
    return step_models.Select(
        selectorparser.parse(element.attrib["from"], selectorparser.item),
        element.get("label"),
        element.sourceline)