def bl_parse_achievement_data(parsed: etree._Element, mode="quickplay"):
    """Extract achievement completion state from a parsed profile page.

    Returns a dict mapping lower-cased category name to a dict of
    sanitized achievement name -> bool (True when unlocked), or None
    when the achievements section is absent.
    """
    section_matches = parsed.xpath(".//section[@id='achievements-section']")
    if not section_matches:
        return
    section = section_matches[0]

    result = {}
    category_options = section.xpath(".//select[@data-group-id='achievements']")[0].xpath(".//option")
    for option in category_options:
        cat_name = option.text
        cat_id = option.get("value")
        boxes = section.xpath(
            ".//div[@data-group-id='achievements' and @data-category-id='{0}']/ul/div/div[@data-tooltip]".format(cat_id))
        per_category = {}
        for box in boxes:
            title = box.xpath("./div/div")[0].text
            if title == '?':
                # Sombra ARG clue, not a real achievement
                continue
            # Unlocked achievements lack the "m-disabled" CSS class.
            per_category[util.sanitize_string(title)] = "m-disabled" not in box.get("class")
        result[cat_name.lower()] = per_category
    return result
def bl_find_heroes(parsed: etree._Element):
    """Parse a hero detail page into its role, difficulty and abilities.

    Difficulty is the number of filled star elements; abilities map
    ability name to its description text.
    """
    star_count = len(parsed.findall(".//span[@class='star']"))
    role_name = parsed.xpath(".//h4[@class='h2 hero-detail-role-name']")[0].text

    ability_map = {}
    for descriptor in parsed.findall(".//div[@class='hero-ability-descriptor']"):
        # First child holds the name, second the description.
        ability_map[descriptor[0].text] = descriptor[1].text

    return {
        "role": role_name,
        "difficulty": star_count,
        "abilities": ability_map,
    }
def walk(e: etree._Element, l: ModelListener) -> None:
    """Depth-first walk of the element tree, firing listener callbacks.

    For each element, fires 'enter_every_before', 'enter_<tag>' and
    'enter_every_after' before recursing into children, then the three
    matching 'exit_*' callbacks on the way back up.

    :param e: the element subtree root to walk
    :param l: listener whose ``call(name, element)`` hooks are invoked
    """
    tag = e.tag
    l.call('enter_every_before', e)
    l.call('enter_' + tag, e)
    l.call('enter_every_after', e)
    # Iterate the element directly: getchildren() is deprecated in lxml
    # and was removed from xml.etree in Python 3.9.
    for c in e:
        walk(c, l)
    l.call('exit_every_before', e)
    l.call('exit_' + tag, e)
    l.call('exit_every_after', e)
def bl_parse_hero_data(parsed: etree._Element, mode="quickplay"):
    """Parse per-hero stat tables for the given mode.

    :param parsed: parsed profile page
    :param mode: "competitive" or anything else for "quickplay"
    :return: {hero_name: {"hero_stats": {...}, "general_stats": {...}}},
        or None when the mode's root div is missing. Rows whose name
        contains "average" are skipped.
    """
    built_dict = {}
    _root = parsed.xpath(
        ".//div[@id='{}']".format("competitive" if mode == "competitive" else "quickplay")
    )
    if not _root:
        return

    for hero_name, requested_hero_div_id in hero_data_div_ids.items():
        n_dict = {}
        _stat_groups = _root[0].xpath(
            ".//div[@data-group-id='stats' and @data-category-id='{0}']".format(requested_hero_div_id)
        )
        if not _stat_groups:
            continue
        stat_groups = _stat_groups[0]

        # BUGFIX: the hero-specific box is only present for some heroes.
        # Check the first box's title (as the sibling parser in this file
        # does) instead of assuming the hero-specific box is always first.
        subbox_offset = 0
        hero_stats = {}
        title_el = stat_groups.find(".//span[@class='stat-title']")
        if title_el is not None and title_el.text == "Hero Specific":
            subbox_offset = 1
            for subval in stat_groups[0].findall(".//tbody/tr"):
                name, value = util.sanitize_string(subval[0].text), subval[1].text
                if 'average' in name.lower():
                    # No averages, ty
                    continue
                hero_stats[name] = util.try_extract(value)
        # Always present (possibly empty) so callers can index it safely.
        n_dict["hero_stats"] = hero_stats

        general_stats = {}
        for subbox in stat_groups[subbox_offset:]:
            for subval in subbox.findall(".//tbody/tr"):
                name, value = util.sanitize_string(subval[0].text), subval[1].text
                if 'average' in name.lower():
                    # No averages, ty
                    continue
                general_stats[name] = util.try_extract(value)
        n_dict["general_stats"] = general_stats

        built_dict[hero_name] = n_dict
    return built_dict
def processSchema(root: etree._Element, base: str, params: dict) -> int:
    """Process one XSD schema document.

    Registers its namespaces, processes its elements, then follows its
    linkbase references and imported schemas.

    :param root: schema document root element
    :param base: base URI/path of the schema (used for logging and resolution)
    :param params: shared processing state (must contain 'namespaces_to_skip')
    :return: 0 for skipped core schemas, otherwise the combined result of
        linkbase and import processing (first non-zero wins).
    """
    # skip core schemas
    targetNs = root.attrib.get("targetNamespace", None)
    if targetNs in params['namespaces_to_skip']:
        return 0
    # Lazy %-style args: message is only formatted when INFO is enabled.
    logging.info("processing schema %s", base)
    registerNamespaces(root, base, params)
    processElements(root, base, targetNs, params)
    xpathobj = root.xpath(
        "//link:linkbaseRef",
        namespaces={"link": "http://www.xbrl.org/2003/linkbase"})
    res1 = processLinkBases(xpathobj, base, targetNs, params)
    res2 = processImportedSchema(root, base, targetNs, params)
    return res1 or res2
def askJsonParams(
    self,
    requestElem: Element,
    path: str,
    data: Dict[str, Any],
) -> Optional[str]:
    """
    recursive function to ask all json/body parameters

    updates `data` argument

    returns error string or None on success
    """
    # FIXME: do we need the path
    # Iterate the element directly; getchildren() is deprecated.
    for child in requestElem:
        t = getElemTag(child)
        if t != "param":
            continue
        name = child.get("name", "")
        if not name:
            # BUGFIX: print() does not interpolate %-style lazy args the way
            # logging does — format the message explicitly.
            print("WARNING: element %r with tag %r has no name" % (child, t))
            continue
        typ = child.get("type", "")
        if not typ:
            print("WARNING: element %r with tag %r has no type" % (child, t))
            continue
        completer = getParamCompleter(child)
        multiline = child.get("multiline", "") == "true"
        # Secret parameters get no persistent prompt history.
        history = None
        if child.get("secret", "") != "true":
            history = FileHistory(self.paramHistoryPath(name))
        try:
            valueRaw = prompt(
                f"> Parameter: {name} = ",
                multiline=multiline,
                history=history,
                auto_suggest=AutoSuggestFromHistory(),
                completer=completer,
            )
        except KeyboardInterrupt:
            return "Canceled"
        # Empty input means "skip this parameter".
        if valueRaw != "":
            value, err = parseInputValue(valueRaw, typ)
            if err:
                return err
            data[name] = value
def parse(self, root: etree._Element) -> AnnotationCollection:
    """
    Parse XML tree and return a collection of annotations.

    :param root: The XML tree root.
    :type root: etree._Element
    :return: A collection of parsed annotations.
    :rtype: AnnotationCollection
    """
    parsed_viewstates = [
        self._parse_viewstate(raw_viewstate)
        for raw_viewstate in root.findall("ndpviewstate")
    ]
    return AnnotationCollection(parsed_viewstates)
def xml_get(
    item: XMLElement,
    path: Optional[str] = None,
    attr: Optional[str] = None,
    *,
    xpath: bool = False,
) -> str:
    """
    Helper for LXML to get an attribute at a path from an XML element.

    Raises a ValueError if the path or attribute do not exist.
    """
    result = xml_get_opt(item, path, attr, xpath=xpath)
    if result is not None:
        return result
    # Distinguish "path missing" from "attribute missing at path".
    if path and item.find(path) is None:
        raise ValueError(f"Could not find XML path '{path}'.")
    raise ValueError(f"Could not find XML attr '{attr}' at path '{path}'.")
def get_misconfigured_resources(
    resources_el: _Element,
) -> Tuple[List[_Element], List[_Element], List[_Element]]:
    """
    Return stonith: all, 'action' option set, 'method' option set to 'cycle'
    """
    all_stonith = []
    with_action = []
    with_method_cycle = []
    for primitive in resources_el.iterfind("primitive[@class='stonith']"):
        all_stonith.append(primitive)
        for nvpair in primitive.iterfind("instance_attributes/nvpair"):
            name = nvpair.get("name")
            value = nvpair.get("value")
            # 'action' counts only when it has a non-empty value.
            if name == "action" and value:
                with_action.append(primitive)
            if name == "method" and value == "cycle":
                with_method_cycle.append(primitive)
    return all_stonith, with_action, with_method_cycle
def __add_kobo_divs_to_body(self, root: etree._Element) -> None: body = root.xpath("./xhtml:body", namespaces={"xhtml": XHTML_NAMESPACE})[0] # save node content for later body_text = body.text body_children = deepcopy(body.getchildren()) body_attrs = {} for key in list(body.keys()): body_attrs[key] = body.get(key) # reset current node, to start from scratch body.clear() # restore node attributes for key in body_attrs: body.set(key, body_attrs[key]) # Wrap the full body in a div inner_div = etree.Element( f"{{{XHTML_NAMESPACE}}}div", attrib={"id": "book-inner"} ) # Handle the node text if body_text is not None: inner_div.text = body_text # re-add the node children, but as children of the div for child in body_children: # save child tail for later child_tail = child.tail child.tail = None inner_div.append(child) # Handle the child tail if child_tail is not None: inner_div[-1].tail = child_tail # Finally, wrap that div in another one... outer_div = etree.Element( f"{{{XHTML_NAMESPACE}}}div", attrib={"id": "book-columns"} ) outer_div.append(inner_div) # And re-chuck the full div pyramid in the now empty body body.append(outer_div)
def _parse_next_page_request(self, task: Task, html: _Element) -> Optional[Task]:
    """Build a Task for the "next page" link, or return None when the
    pager is missing or the last link is not the next-page anchor."""
    anchors = html.xpath('//div[@id="_function_code_page"]/a')
    if not anchors:
        return None
    last_anchor = anchors[-1]
    if last_anchor.text != '下一页':
        return None
    # Resolve relative href against the current task URL, then normalize
    # the path component (collapse '.'/'..' segments).
    absolute_url = urljoin(task.url, last_anchor.attrib['href'])
    parts = urlparse(absolute_url)
    normalized_url = urlunparse((
        parts.scheme,
        parts.netloc,
        normpath(parts.path),
        parts.params,
        parts.query,
        parts.fragment,
    ))
    return Task(normalized_url, '', task.url, metadata=task.metadata)
def printElem(elem: Element, level: int):
    """Pretty-print one element (indented by *level*) and recurse into
    its children, with tag-specific summary formats."""
    tag = getElemTag(elem)
    # Reuse the already-computed tag instead of calling getElemTag twice.
    prefix = indent * level + tag
    if tag == "resource":
        print(f"{prefix}: {elemPath(elem)}")
    elif tag == "method":
        print(f"{prefix}: {elemName(elem)} ({elemID(elem)})")
    elif tag in ("param", "element"):
        print(f"{prefix}: {elemName(elem)} (type={elemType(elem)})")
    elif tag == "item":
        print(f"{prefix} (type={elemType(elem)})")
    elif tag == "option":
        print(f"{prefix}: {elemValue(elem)}")
    elif tag == "representation":
        # Intentionally silent: representations are not printed.
        pass
    else:
        print(prefix)
    # Iterate the element directly; getchildren() is deprecated.
    for child in elem:
        printElem(child, level + 1)
def parse(cls, element: etree._Element, subcls: T, tag: "Optional[str]" = None,
          **kwargs) -> "Optional[XMLGroup]":
    """
    With the given :class:`lxml.etree.Element`, parses the :attr:`item_tag`
    and creates a new :class:`XMLGroup` with its data. In addition to
    :class:`XMLItem`, finds and parses any subitem contained by the tag.

    :param lxml.etree._Element element: the element to parse.
    :param T subcls: the subclass type used when parsing found objects.
    :param str tag: the XML tag itself.
    :param kwargs: arbitrary arguments for custom parsing options.
    :return: the new group of tags.
    :rtype: XMLItem
    :raises ValueError: if the `element.tag` is different than `tag`.
    :raises AttributeError: if `subcls` is not a subclass of
        :class:`XMLItem` or :class:`XMLGroup`.
    """
    if not issubclass(subcls, XMLItem) and not issubclass(subcls, XMLGroup):
        raise AttributeError(f"Class {subcls} must inherit from XMLItem or "
                             f"XMLGroup")
    fields = []
    dirs = {}
    for field in element:
        # Groups recurse with their member class; plain items parse directly.
        if issubclass(subcls, XMLGroup):
            value = subcls.parse(element=field, subcls=subcls.cls,
                                 tag=tag or subcls.subitem_tag, **kwargs)
        else:
            value = subcls.parse(element=field,
                                 tag=tag or subcls.item_tag, **kwargs)
        if value is not None:
            # dirs maps a parsed tag to its position in fields; appending
            # keeps the two structures in sync without a manual counter.
            dirs[value.tag] = len(fields)
            fields.append(value)
    return cls(element.get('tag'), element.tag, fields, dirs)
def _parse_address(self, html: _Element, num_font_url: str,
                   address_font_url: str) -> str:
    """Reassemble the obfuscated address: plain text fragments are kept,
    while children classed 'address'/'num' are decoded via the custom
    web-font parser before being joined back together."""
    span = html.xpath('//span[@id="address"]')[0]
    pieces = []
    if span.text is not None:
        pieces.append(span.text.strip())
    for child in span.iterchildren():
        css_class = child.attrib['class']
        if css_class == 'address':
            decoded = self.font_parser.parse('address', address_font_url, child.text)
        elif css_class == 'num':
            decoded = self.font_parser.parse('num', num_font_url, child.text)
        else:
            decoded = child.text
        pieces.append(decoded.strip())
        # Text following the child element belongs to the address too.
        if child.tail is not None:
            pieces.append(child.tail.strip())
    return ''.join(pieces).strip()
def __init__(self, doc: Element):
    """Populate the attributes declared in ``__annotations__`` from an
    XML element.

    Resolution order per annotated attribute:
      1. combinator annotations are called with the element's children;
      2. an XML attribute of the same name (annotation used as converter);
      3. the first child element whose tag matches (entity annotations);
      4. an already-present instance value;
      5. ``None``.
    """
    # getchildren() is deprecated in lxml / removed from xml.etree in 3.9;
    # kept as-is here.
    children = doc.getchildren()
    for key, value in self.__annotations__.items():
        if is_combinator(value):
            # Combinators consume the whole child list at once.
            setattr(self, key, value(children))
        else:
            if key in doc.attrib:
                setattr(self, key, value(doc.attrib[key]))
            elif is_entity(value):
                # Find the first child whose tag matches the attribute name.
                dummy = DummyXMLElement(key)
                for child in filter(dummy.match_tag, children):
                    setattr(self, key, value(child.text.strip()))
                    break
                else:
                    # for/else: no matching child element was found.
                    setattr(self, key, None)
            elif key in self.__dict__:
                # Keep an already-set instance value (effectively a no-op).
                setattr(self, key, self.__dict__[key])
            else:
                setattr(self, key, None)
def _parse_phone_number(self, shop_id: str, html: _Element, num_font_url: str) -> str:
    """Fetch the hidden phone-number snippet for a shop and decode it.

    :param shop_id: shop identifier used in the AJAX request
    :param html: unused; kept for signature compatibility with callers
    :param num_font_url: URL of the obfuscation font for digits
    :return: the decoded phone number, or None when the shop has none
    :raises Exception: re-raises the JSON decode error on a bad response
    """
    url = f'http://www.dianping.com/ajax/json/shopDynamic/basicHideInfo?shopId={shop_id}'
    logger.info(f'Request for {url}')
    response = requests.get(url, headers=config.HEADERS)
    try:
        data = json.loads(response.text)
    except Exception:
        logger.error(f'Failed to load data as json {response.text}')
        raise
    content = data['msg']['shopInfo']['phoneNo']
    if not content:
        return None
    # Use a fresh local instead of shadowing the `html` parameter.
    phone_fragment = etree.HTML(content, etree.HTMLParser())
    body_elements = phone_fragment.xpath('//body')
    return self._parse_number(body_elements[0], num_font_url)
def xml_to_tree(x: etree._Element, lvl: int = 1, preserve_ns: bool = False,
                text_strip: bool = True):
    """Yield one indented description line per element, depth-first.

    :param x: element subtree root
    :param lvl: current depth (controls indentation)
    :param preserve_ns: keep the '{ns}' prefix on tags when True
    :param text_strip: strip whitespace around element text when True
    """
    # print(inspect.currentframe().f_code.co_name)
    if preserve_ns:
        tag = x.tag
    else:
        # Drop a leading '{namespace}' prefix, if any.
        tag = re.sub(r'\{.*\}(.*)', r'\1', x.tag)

    if x.text:
        text = x.text.strip() if text_strip else x.text
    else:
        text = ""

    indent = ' ' * (lvl - 1)
    yield f"{indent}|--Tag: {tag:<}{indent}| Text: {text:<}"
    for child in x:
        yield from xml_to_tree(child, lvl + 1, preserve_ns, text_strip)
def uninent_block_literal_el(el: ET._Element):
    """Unindent the literal inner content of a block element and return a
    re-parsed element; empty leaf elements are returned unchanged.

    NOTE: the name typo ('uninent') is kept for API compatibility.
    """
    # Empty leaf: nothing to unindent.
    if (0 == len(el.getchildren())) and (el.text is None):
        return el

    def get_inner_content_xml(el_):
        # Serialize the whole element, then split it into
        # (opening tag, inner content, closing tag) with a regex.
        s = ET.tostring(el_, encoding="unicode", xml_declaration=False)
        # lxml provides no way to extract literal XML of node content
        # excluding the node itself
        # HACK: regex-on-serialized-XML; assumes the first '>' ends the
        # opening tag and the last '<...>' is the closing tag.
        m = re.match('([^>]*>)(.+)(<[^>]+>)\\s*$', s, re.DOTALL)
        return m.groups()

    start, contents, end = get_inner_content_xml(el)
    lines = rm_lead_trail_empty_lines(contents)
    lines = unindent(lines)
    # Reassemble and re-parse so the caller gets a clean element.
    xml_s = start + "\n".join(lines) + end
    xml = ET.XML(xml_s)
    return xml
def match_daoset_ids(c01: etree._Element, volume_files: set[str]) -> None:
    """Cross-check dao ids of a volume against its scan directory.

    Removes each referenced id from *volume_files*; ids without a scan and
    scans without a description are reported (bilingually) to the log file.
    """
    with open("outputs/missing_files", "a", encoding="utf-8") as log:
        missing_files = set()
        for dao in c01.iterdescendants("dao"):
            dao_id = str(dao.attrib["id"])
            if dao_id in volume_files:
                volume_files.remove(dao_id)
            else:
                missing_files.add(dao.attrib["id"])

        if not missing_files and not volume_files:
            return

        unitid = c01.find('did').find('unitid').text  # type: ignore[union-attr]
        print(f"\n** Controlling MS{unitid} **", file=log)
        if missing_files:
            print(
                "The following files are described in excel but do not have an associated scan:",
                file=log,
            )
            print(
                "I seguenti file sono descritti in Excel ma non hanno una scansione associata:",
                file=log,
            )
            for file_name in sorted(missing_files):
                print(f" - {file_name!r}", file=log)
        if volume_files:
            print(
                "The following files exist as scan but are not described in excel:",
                file=log,
            )
            print(
                "I seguenti file esistono come scansione ma non sono descritti in excel:",
                file=log,
            )
            for file_name in sorted(volume_files):
                print(f" + {file_name}", file=log)
def find_constraints_referencing_tag(
    constraints_section: _Element,
    tag_id: str,
) -> List[_Element]:
    """
    Find constraint elements which are referencing specified tag.

    Matches colocation/location/order/ticket constraints that reference
    the tag directly via their rsc/with-rsc/first/then attributes (but
    only when they have no resource_set), plus any of those constraint
    types referencing the tag through a resource_set/resource_ref.

    constraints_section -- element constraints
    tag_id -- tag id
    """
    # $tag_id is passed as an XPath variable to avoid string interpolation.
    constraint_list = constraints_section.xpath(
        """
        ./rsc_colocation[
            not (descendant::resource_set)
            and
            (@rsc=$tag_id or @with-rsc=$tag_id)
        ]
        |
        ./rsc_location[
            not (descendant::resource_set)
            and
            @rsc=$tag_id
        ]
        |
        ./rsc_order[
            not (descendant::resource_set)
            and
            (@first=$tag_id or @then=$tag_id)
        ]
        |
        ./rsc_ticket[
            not (descendant::resource_set)
            and
            @rsc=$tag_id
        ]
        |
        (./rsc_colocation|./rsc_location|./rsc_order|./rsc_ticket)[
            ./resource_set/resource_ref[@id=$tag_id]
        ]
        """,
        tag_id=tag_id,
    )
    # xpath() returns a loosely-typed result; we know it is elements here.
    return cast(List[_Element], constraint_list)
def check_is_without_duplication(
    report_processor: ReportProcessor,
    constraint_section: _Element,
    element: _Element,
    are_duplicate: Callable[[_Element, _Element], bool],
    export_element: Callable[[_Element], Dict[str, Any]],
    duplication_allowed: bool = False,
) -> None:
    """Report duplicates of *element* among same-tag constraints.

    Collects every other element with the same tag for which
    *are_duplicate* returns True, reports the list, and raises
    LibraryError unless *duplication_allowed* downgrades the severity.

    :raises LibraryError: when duplicates exist and are not allowed
    """
    duplicate_element_list = [
        duplicate_element
        for duplicate_element in cast(
            # The xpath method has a complicated return value, but we know our
            # xpath expression returns only elements.
            List[_Element],
            constraint_section.xpath(
                ".//*[local-name()=$tag_name]", tag_name=element.tag
            ),
        )
        # Exclude the element itself (identity, not equality).
        if (
            element is not duplicate_element
            and are_duplicate(element, duplicate_element)
        )
    ]
    if not duplicate_element_list:
        return
    if report_processor.report_list(
        [
            ReportItem.info(
                reports.messages.DuplicateConstraintsList(
                    element.tag,
                    [
                        export_element(duplicate_element)
                        for duplicate_element in duplicate_element_list
                    ],
                )
            ),
            ReportItem(
                # Severity is downgraded when duplication is forced/allowed.
                severity=reports.item.get_severity(
                    reports.codes.FORCE,
                    duplication_allowed,
                ),
                message=reports.messages.DuplicateConstraintsExist(
                    [
                        str(duplicate.attrib["id"])
                        for duplicate in duplicate_element_list
                    ]
                ),
            ),
        ]
    ).has_errors:
        raise LibraryError()
def translate_dom(dom: etree._Element, dest_language: str,
                  original_url: Optional[str] = None):
    """Translate every text/tail node in the DOM in place when the target
    language is supported; optionally append an 'Untranslated' link back
    to the original page."""
    if dest_language in LANGUAGES:
        translator = Translator()
        for node in dom.iter():
            node.text = _translate(node.text, translator, dest_language)
            node.tail = _translate(node.tail, translator, dest_language)
    if original_url is None:
        return
    link = etree.Element("a")
    link.text = "Untranslated"
    # quick & dirty
    link.attrib["href"] = (
        original_url
        .replace("&translateto=", "&nop=")
        .replace("?translateto=", "?nop=")
    )
    dom.append(etree.Element("hr"))
    dom.append(link)
    dom.append(etree.Element("hr"))
def _find_primitives_by_agent(resources_section: _Element,
                              agent_name: ResourceAgentName) -> List[_Element]:
    """
    Returns list of primitive resource elements which are using same resource
    agent as specified by resource_agent_obj.

    resources_section -- element <resources/> from CIB
    agent_name -- name of an agent resources of which should be returned
    """
    # Values are bound as XPath variables ($class_, $type_, $provider_);
    # only the optional provider predicate is spliced into the expression,
    # since agents without a provider must not be filtered on it.
    return cast(
        List[_Element],
        resources_section.xpath(
            ".//primitive[@class=$class_ and @type=$type_ {provider_part}]"
            .format(
                provider_part=" and @provider=$provider_"
                if agent_name.provider
                else "",
            ),
            class_=agent_name.standard,
            provider_=agent_name.provider or "",
            type_=agent_name.type,
        ),
    )
def test_links(xhtml_file: Path, xhtml: _Element) -> bool:
    """
    Test the that all 'a' links to relative URLs links are not broken.

    :param xhtml_file: the XHTML file's path
    :param xhtml: the XHTML files' root
    :return: True if the links are okay
    """
    success = True
    for anchor in xhtml.xpath("//xhtml:a", namespaces=XMLNS):
        href = str(anchor.attrib["href"])
        if ":" in href:
            # Scheme-qualified (absolute) URL — not checked here.
            continue
        target = xhtml_file.parent / Path(url2pathname(href))
        if settings.verbose:
            print("\t", target)
        if not target.exists():
            print(f"{xhtml_file}:1:0: broken relative link {target}", file=stderr)
            success = False
    return success
def parse_port(wsdl: "Definition", xmlelement: etree._Element) -> definitions.Port:
    """Create a Port object from a xml element.

    This is called via the parse_service function since ports are part of
    the service xml elements.

    Definition::

        <wsdl:port name="nmtoken" binding="qname"> *
           <wsdl:documentation .... /> ?
           <-- extensibility element -->
        </wsdl:port>

    :param wsdl: The parent definition instance
    :param xmlelement: The XML node
    """
    binding_qname = qname_attr(xmlelement, "binding", wsdl.target_namespace)
    return definitions.Port(
        xmlelement.get("name"),
        binding_name=binding_qname,
        xmlelement=xmlelement,
    )
def clone_element_to_dto(
    clone_element: _Element,
    rule_eval: Optional[rule.RuleInEffectEval] = None,
) -> CibResourceCloneDto:
    """Convert a CIB clone element into its DTO, evaluating nvset rules
    with *rule_eval* (a no-op evaluator when not supplied)."""
    if rule_eval is None:
        rule_eval = rule.RuleInEffectEvalDummy()

    def _nvset_dtos(nvset_kind):
        # Shared conversion for meta/instance attribute sets.
        return [
            nvpair_multi.nvset_element_to_dto(nvset, rule_eval)
            for nvset in nvpair_multi.find_nvsets(clone_element, nvset_kind)
        ]

    return CibResourceCloneDto(
        id=str(clone_element.attrib["id"]),
        description=clone_element.get("description"),
        member_id=str(get_inner_resource(clone_element).attrib["id"]),
        meta_attributes=_nvset_dtos(nvpair_multi.NVSET_META),
        instance_attributes=_nvset_dtos(nvpair_multi.NVSET_INSTANCE),
    )
def __init__(self, el: etree._Element):
    """Parse an absolute dating from a date element's attributes
    (from/to/notBefore/notAfter/when; type='normalized').

    :raises InvalidDatingError: if no date is present at all, or if the
        resulting bounds are ordered backwards.
    """
    super().__init__(el)
    # Each attribute is optional; parse_datestr returns None when absent.
    self.from_ = parse_datestr(el.get('from', None))
    self.to = parse_datestr(el.get('to', None))
    self.not_before = parse_datestr(el.get('notBefore', None))
    self.not_after = parse_datestr(el.get('notAfter', None))
    self.when = parse_datestr(el.get('when', None))
    self.normalized = el.get('type', '') == 'normalized'
    # NOTE(review): start/end and date_before/date_after are presumably
    # properties derived from the fields above, defined on the superclass
    # or elsewhere in this class — confirm before relying on this.
    if self.start is None and self.end is None:
        raise InvalidDatingError('Absolute dating without a date', el)
    elif self.date_before is not None and self.date_after is not None and not self.date_before < self.date_after:
        raise InvalidDatingError(
            'Backwards dating (%s), this would have caused a conflict' % self, el)
def _parse_publish_time(self, html: "_Element") -> "datetime.datetime":
    """Extract the publish time from the post's "from" links, trying the
    instance's four timestamp patterns in turn.

    Pattern 1 matches month/day/hour/minute (year hard-coded to 2020),
    pattern 4 a full date-time, pattern 2 a same-day hour:minute, and
    pattern 3 a "N minutes ago" offset.

    :raises Exception: when no pattern matches any link text
    """
    elements = html.xpath('p[@class="from"]/a')
    publish_time_str = ''
    for element in elements:
        publish_time_str = element.text.strip()

        matches = self.publish_time_pattern_1.findall(publish_time_str)
        if len(matches) == 1:
            return datetime.datetime(2020, int(matches[0][0]),
                                     int(matches[0][1]), int(matches[0][2]),
                                     int(matches[0][3]))

        matches = self.publish_time_pattern_4.findall(publish_time_str)
        if len(matches) == 1:
            return datetime.datetime(int(matches[0][0]), int(matches[0][1]),
                                     int(matches[0][2]), int(matches[0][3]),
                                     int(matches[0][4]))

        matches = self.publish_time_pattern_2.findall(publish_time_str)
        if len(matches) == 1:
            # BUGFIX: datetime.replace() returns a new object; the original
            # discarded the result and returned the unmodified "now".
            return datetime.datetime.now().replace(
                hour=int(matches[0][0]), minute=int(matches[0][1]))

        matches = self.publish_time_pattern_3.findall(publish_time_str)
        if len(matches) == 1:
            # "N minutes ago" style timestamp.
            return datetime.datetime.now() - datetime.timedelta(
                minutes=int(matches[0]))

    raise Exception(f'Failed to parse publish time {publish_time_str}')
def _update_digest_attrs_in_lrm_rsc_op(
    lrm_rsc_op: _Element, calculated_digests: Dict[str, Optional[str]]
) -> None:
    """
    Update digest attributes in lrm_rsc_op elements. If there are missing
    digests values from pacemaker or missing digests attributes in lrm_rsc_op
    element then report an error.

    lrm_rsc_op -- element whose digests attributes needs to be updated in
        order to do restartless update of resource
    calculated_digests -- digests calculated by pacemaker for this lrm_rsc_op
        element
    """
    # Only touch digest attributes actually present on the element.
    common_digests_attrs = set(DIGEST_ATTRS).intersection(
        lrm_rsc_op.attrib.keys()
    )
    if not common_digests_attrs:
        # this should not happen and when it does it is pacemaker fault
        raise LibraryError(
            ReportItem.error(
                reports.messages.StonithRestartlessUpdateUnableToPerform(
                    "no digests attributes in lrm_rsc_op element",
                )
            )
        )
    for attr in common_digests_attrs:
        # Map the CIB attribute name to pacemaker's digest-type key.
        new_digest = calculated_digests[DIGEST_ATTR_TO_TYPE_MAP[attr]]
        if new_digest is None:
            # this should not happen and when it does it is pacemaker fault
            raise LibraryError(
                ReportItem.error(
                    reports.messages.StonithRestartlessUpdateUnableToPerform(
                        (
                            f"necessary digest for '{attr}' attribute is "
                            "missing"
                        )
                    )
                )
            )
        # update digest in cib
        lrm_rsc_op.attrib[attr] = new_digest
def _get_host(cls, xhost: etree._Element, task: IscoutTask,
              level: int) -> RangeCHost:
    """parse xml, yield return RangeCHost

    Builds a RangeCHost from an nmap-style <host> element: its IP address
    plus every open port (with transport protocol and service name).
    Returns None when the IP cannot be extracted; per-port failures are
    logged and skipped (best effort).
    """
    host = None  # may legitimately stay None when no valid IP is found
    try:
        # get ip address
        ip, iptype = cls._get_ip(xhost)
        if not isinstance(ip, str) or ip == '':
            return host

        host = RangeCHost(task, level, ip)
        host.iptype = iptype

        # open ports
        xports = xhost.findall("ports/port")
        if xports is None or len(xports) < 1:
            return host
        for xp in xports:
            try:
                if not cls._check_port_open(xp):
                    continue
                port: int = int(xp.get("portid").strip())
                transprotocol = xp.get("protocol")
                service: str = None
                xservice = xp.find('service')
                # idiomatic identity test (was: "not xservice is None")
                if xservice is not None:
                    service = xservice.get("name")
                host.set_port(port, transprotocol, service)
            except Exception:
                # Best effort: a malformed port entry must not drop the host.
                cls._logger.debug("Get ports warn: {}".format(
                    traceback.format_exc()))
    except Exception:
        cls._logger.error(
            "Parse one alive host error: taskid={} batchid={}\nerror:{}".
            format(task.taskid, task.batchid, traceback.format_exc()))
    return host
def print_schematron_error_log(xhtml: _Element, schematron: Schematron) -> None:
    """
    Print a Schematron's error log in a readable format.

    :param xhtml: the root of the XHTML file with the errors
    :param schematron: the Schematron with the error log
    """
    for err in schematron.error_log:
        # Each log message is an XML string with an 'svrl:failed-assert'.
        report = fromstring(err.message)
        # The faulty element's location is given as an XPath selector;
        # resolve it against the source document to recover the line.
        locator = report.xpath('//svrl:failed-assert/@location', namespaces=XMLNS)[0]
        line = xhtml.xpath(locator, namespaces=XMLNS)[0].sourceline
        message = report.xpath('normalize-space(//svrl:text)', namespaces=XMLNS)
        print(f"{err.filename}:{line}:0: {message}", file=stderr)
def _call_inner(self, root: etree._Element):
    """Process text/tails of the whole subtree, then assign word ids.

    Uses a BFS rather than etree._Element.iter(): iter() would re-visit
    nodes added to the tree during processing, while with the BFS the
    parents are already queued when their children are created, so no
    node is double-visited.
    """
    # PERF: an index cursor instead of list.pop(0) — pop(0) shifts the
    # whole list and makes the traversal quadratic on large documents.
    queue = [root]
    cursor = 0
    while cursor < len(queue):
        node = queue[cursor]
        cursor += 1
        queue.extend(node)
        self._handle_text(node)
        self._handle_text(node, do_handle_tail_instead=True)

    docwide_word_id = self._starting_word_id
    for node in root.iter():
        if node.tag == self.WORD_TAG:
            # TODO: factor out word_id definition
            word_id = f'word_{docwide_word_id:0>6d}'
            node.attrib[self.WORD_ID_ATTRIB_NAME] = word_id
            self._used_word_ids[word_id] = node
            docwide_word_id += 1
def parse_ports(self, doc: etree._Element) -> typing.Dict[str, PortType]:
    """Return dict with `PortType` instances as values

    Definition::

        <wsdl:definitions .... >
            <wsdl:portType name="nmtoken">
                <wsdl:operation name="nmtoken" .... /> *
            </wsdl:portType>
        </wsdl:definitions>

    :param doc: The source document
    :type doc: lxml.etree._Element
    """
    port_types = {}
    for node in doc.findall("wsdl:portType", namespaces=NSMAP):
        parsed = parse.parse_port_type(self, node)
        key = parsed.name.text
        port_types[key] = parsed
        logger.debug("Adding port: %s", key)
    return port_types
def __init__(self, root: ET._Element):
    """Collect recurring first-line text patterns of page blocks.

    patterns counts occurrences per pattern; patterns_first records the
    physical image number of the page where a pattern first appeared.
    """
    self.patterns = {}
    self.patterns_first = {}
    # Identify block patterns — this is a block feature extractor.
    for page in root.findall(f".//{ALTO}Page"):
        blocks = page.findall(f".//{ALTO}TextBlock")
        # Only the first two and the last block of each page are sampled.
        for block in blocks[:2] + blocks[-1:]:
            first_line = misc.get_text(block).split("\n")[0]
            pattern = misc.get_pattern(first_line)
            # Very short patterns are too ambiguous to be useful.
            if len(pattern) <= 8:
                continue
            if pattern not in self.patterns:
                self.patterns[pattern] = 1
                self.patterns_first[pattern] = page.get("PHYSICAL_IMG_NR")
            else:
                self.patterns[pattern] += 1
def bl_get_all_heroes(parsed: etree._Element):
    """Return the data-hero-id of every detailed hero portrait link."""
    return [
        portrait.get("data-hero-id")
        for portrait in parsed.findall(".//a[@class='hero-portrait-detailed']")
    ]
def bl_parse_hero_data(parsed: etree._Element, mode="quickplay"):
    """Parse per-hero stat tables for the given mode.

    :param parsed: parsed profile page
    :param mode: "competitive" or anything else for "quickplay"
    :return: {hero_name: {"hero_stats": {...}, "general_stats": {...},
        "average_stats": {...}}}, or None when the mode root is missing or
        the page reports having no data for this mode.
    """
    built_dict = {}
    _root = parsed.xpath(
        ".//div[@id='{}']".format("competitive" if mode == "competitive" else "quickplay")
    )
    if not _root:
        return None

    try:
        # XPath for the `u-align-center` h6 which signifies there's no data.
        # (Removed a dead `.format(mode)` call — the string has no placeholder.)
        no_data = _root[0].xpath(".//ul/h6[@class='u-align-center']")[0]
    except IndexError:
        pass
    else:
        if no_data.text.strip() == "We don't have any data for this account in this mode yet.":
            return None

    def _collect_rows(box, averages, stats):
        # Shared tbody/tr scraping: "average" rows go to `averages`,
        # everything else to `stats`.
        for row in box.findall(".//tbody/tr"):
            name, value = util.sanitize_string(row[0].text), row[1].text
            target = averages if "average" in name else stats
            target[name] = util.try_extract(value)

    for hero_name, requested_hero_div_id in hero_data_div_ids.items():
        n_dict = {}
        _stat_groups = _root[0].xpath(
            ".//div[@data-group-id='stats' and @data-category-id='{0}']"
            .format(requested_hero_div_id)
        )
        if not _stat_groups:
            continue
        stat_groups = _stat_groups[0]

        _average_stats = {}
        hero_stats = {}

        # If the first box is titled "Hero Specific" it holds hero-only
        # stats and the general boxes start at offset 1; otherwise all
        # boxes are general.
        subbox_offset = 0
        hbtitle = stat_groups.find(".//span[@class='stat-title']").text
        if hbtitle == "Hero Specific":
            subbox_offset = 1
            _collect_rows(stat_groups[0], _average_stats, hero_stats)
        n_dict["hero_stats"] = hero_stats

        general_stats = {}
        for subbox in stat_groups[subbox_offset:]:
            _collect_rows(subbox, _average_stats, general_stats)
        n_dict["general_stats"] = general_stats
        n_dict["average_stats"] = _average_stats

        built_dict[hero_name] = n_dict
    return built_dict