def articlecollector(path_articles_xml, outpath_articles, articleids):
    print("\nCollecting articles for '%s' from %s\n..." % (wantedCategory, path_articles_xml))
    title_path = etree.ETXPath("child::" + Ttitle)
    id_path = etree.ETXPath("child::" + Tid)
    text_path = etree.ETXPath("child::" + Trev + "/" + Ttext)
    extracted_count = 0
    start = time.time()
    try:
        with BZ2File(outpath_articles, "w", compresslevel=9) as file, \
                etree.xmlfile(file, encoding="utf-8") as newfile, \
                newfile.element("mediawiki", xmlns=Header):
            context = etree.iterparse(path_articles_xml, events=("end",), tag={Tnamespaces, Tpage})
            for action, elem in context:
                if elem.tag == Tpage and id_path(elem)[0].text in articleids:
                    create_page(elem, title_path, id_path, text_path, articleids, newfile)
                    extracted_count += 1
                elif elem.tag == Tnamespaces:
                    create_namespace(elem, newfile)
                elem.clear()
                while elem.getprevious() is not None:
                    del elem.getparent()[0]
    except FileNotFoundError as e:
        print(e.filename, "not found")
        raise e
    end = time.time()
    printTime(start, end)
    return extracted_count
def test_value(self):
    """ Test object profile values. """
    my_object_resource = object_resource.ObjectResource()
    xml = my_object_resource._get_object_profile('boaty:mcboatface', 'intertubes',
                                                 ['do:thereisnotry'],
                                                 datetime.now(pytz.utc),
                                                 datetime.now(pytz.utc), 'A', 'dr who')
    tree = etree.fromstring(xml)
    label_xpath = etree.ETXPath('//{{{}}}objLabel/text()'.format(api.FEDORA_ACCESS_URI))
    self.assertEqual(label_xpath(tree)[0], 'intertubes')
    state_xpath = etree.ETXPath('//{{{}}}objState/text()'.format(api.FEDORA_ACCESS_URI))
    self.assertEqual(state_xpath(tree)[0], 'A')
    pid_xpath = etree.ETXPath('/{{{}}}objectProfile/@pid'.format(api.FEDORA_ACCESS_URI))
    self.assertEqual(pid_xpath(tree)[0], 'boaty:mcboatface')
    model_xpath = etree.ETXPath('//{{{}}}model/text()'.format(api.FEDORA_ACCESS_URI))
    self.assertIn('do:thereisnotry', model_xpath(tree))
    owner_xpath = etree.ETXPath('//{{{}}}objOwnerId/text()'.format(api.FEDORA_ACCESS_URI))
    self.assertEqual(owner_xpath(tree)[0], 'dr who')
def analyze_wsdl(config_file):
    # execute ws-i tests
    # don't execute Analyzer.sh directly since it needs bash
    os.system(WSI_EXECUTION_COMMAND + config_file)

    # parse result
    e = etree.parse(SOAPLIB_REPORT_FILE).getroot()
    summary = etree.ETXPath('{%s}summary' % e.nsmap['wsi-report'])(e)
    if summary:
        # retrieve overall result of the test
        result = summary[0].get('result')
        if result == 'failed':
            outs = etree.ETXPath('{%s}artifact' % (e.nsmap['wsi-report'], ))(e)
            # filter for the object describing the wsdl test
            desc = [o for o in outs if o.get('type') == 'description'][0]
            # loop over every group test
            for entry in desc.iterchildren():
                # loop over every single test
                for test in entry.iterchildren():
                    # simply print the error if there is one
                    # an html can be generated using files in wsi-test-tools/common/xsl
                    if test.get('result') == 'failed':
                        fail_msg = etree.ETXPath('{%s}failureMessage' % e.nsmap['wsi-report'])(test)
                        fail_det = etree.ETXPath('{%s}failureDetail' % e.nsmap['wsi-report'])(test)
                        if fail_msg:
                            print '\nFAILURE in test %s\n' % test.get('id')
                            print fail_msg[0].text
                        if fail_det:
                            print '\nFAILURE MSG\n'
                            print fail_det[0].text
def parse_file(cls, xml_file):
    print("Parsing: " + xml_file)
    with open(xml_file, 'r') as fd:
        et = etree.parse(fd)
    for table1_group1 in etree.ETXPath(
            './{rptAllCDE}table1/{rptAllCDE}table1_Group1_Collection/{rptAllCDE}table1_Group1'
    )(et.getroot()):
        pvg = None
        for table1_group2 in etree.ETXPath(
                './{rptAllCDE}table1_Group2_Collection/{rptAllCDE}table1_Group2'
        )(table1_group1):
            pvg = NINDSReportParser.parse_pvg(table1_group2)
        NINDSReportParser.parse_cde(table1_group1, pvg)
def print_update(self, doc):
    xpath_expr = "//{{{0}}}job".format(settings.PX_NS)
    xpath_fn = etree.ETXPath(xpath_expr)
    results = xpath_fn(doc)
    jobid = results[0].attrib['id']
    xpath_expr = "//{{{0}}}job/{{{0}}}messages/{{{0}}}message".format(settings.PX_NS)
    xpath_fn = etree.ETXPath(xpath_expr)
    results = xpath_fn(doc)
    print "JOB UPDATE\n\tID {0}\n\n\tMessage(s):".format(jobid)
    for m in results:
        print "\t#{0}. {1} - {2}".format(m.attrib['sequence'], m.attrib['level'], m.text)
    print ""
def update_feed(feedentry, feed_path):
    """Update the feed with the last individual feed entry.

    * return None if nothing has changed
    * add a new entry, delete the last if a new post
    * add a new entry, remove the old entry if post has changed.
    """
    new_entry = False
    feed = helper.parse_feed(feed_path)
    # XPath for finding tagid
    find_entry = etree.ETXPath("//{%s}entry" % ATOMNS)
    find_id = etree.ETXPath("{%s}id/text()" % ATOMNS)
    find_date = etree.ETXPath("{%s}updated/text()" % ATOMNS)
    # We need the information about the new entry
    new_id = find_id(feedentry)[0]
    new_updated = find_date(feedentry)[0]
    # Processing and comparing
    entries = find_entry(feed)
    posts_number = len(entries)
    for entry in entries:
        old_id = find_id(entry)[0]
        old_updated = find_date(entry)[0]
        if old_id == new_id:
            if old_updated == new_updated:
                logging.info("The feed has not changed.")
                return None
            else:
                logging.info("The feed has been updated.")
                # we remove from feed the specific entry
                entry.getparent().remove(entry)
                # Find the first entry element in the feed
                position = feed.getroot().index(feed.find("//{%s}entry" % ATOMNS))
                feed.getroot().insert(position, feedentry.getroot())
                # Change the <updated> date of the feed
                feed.find("//{%s}updated" % ATOMNS).text = new_updated
                return lxml.html.tostring(feed, encoding='utf-8')
        else:
            logging.info("This is a new feed entry.")
            new_entry = True
    if new_entry:
        if posts_number > FEED_MAX_POSTS:
            entries[-1].getparent().remove(entries[-1])
        position = feed.getroot().index(feed.find("//{%s}entry" % ATOMNS))
        feed.getroot().insert(position, feedentry.getroot())
        # Change the <updated> date of the feed
        feed.find("//{%s}updated" % ATOMNS).text = new_updated
        return lxml.html.tostring(feed, encoding='utf-8')
    return None
def extract_articles_inscope(csvwriter, adict):
    t = start_time()
    idexp = etree.ETXPath("child::" + ID)
    titlexp = etree.ETXPath("child::" + TITLE)
    textxp = etree.ETXPath("child::" + REV + "/" + TEXT)
    context = etree.iterparse(
        DATAP + '/dump/enwiki-20180901-pages-articles-multistream.xml',
        events=('end', ),
        tag=PAGE)
    fast_iter(
        context,
        lambda elem: extract_with_xpath(elem, idexp, titlexp, textxp, csvwriter, adict))
    stop_time(t)
def eval_test(tree, strid):
    # registry_test
    path = "//{http://oval.mitre.org/XMLSchema/oval-definitions-5#windows}registry_test[@id='" + strid + "']"
    findall = etree.ETXPath(path)
    if len(findall(tree)) > 0:
        return eval_registry_test(tree, strid)
    # family_test
    path = "//{http://oval.mitre.org/XMLSchema/oval-definitions-5#independent}family_test[@id='" + strid + "']"
    findall = etree.ETXPath(path)
    if len(findall(tree)) > 0:
        return eval_family_test(tree, strid)
    # cannot evaluate
    return -1
def to_entry_dict(entry_index_xml):
    """Convert an XML entry index into a dictionary."""
    # Search paths
    find_href = etree.ETXPath("a/@href")
    find_short_date = etree.ETXPath("time/text()")
    find_created = etree.ETXPath("time/@datetime")
    find_title = etree.ETXPath("a/text()")
    # extract data
    entry_index = {
        'created': find_created(entry_index_xml)[0],
        'iso_short_date': find_short_date(entry_index_xml)[0],
        'path': find_href(entry_index_xml)[0],
        'title': find_title(entry_index_xml)[0],
    }
    return entry_index
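# Hedged usage sketch for to_entry_dict (not from the original project): the
# shape of the index item below is an assumption inferred from the relative
# XPaths above, namely a namespace-free <li> holding a <time> and an <a>.
from lxml import etree

li = etree.fromstring(
    '<li><time datetime="2018-09-01T10:00:00Z">2018-09-01</time>'
    '<a href="/2018/09/hello.html">Hello</a></li>')
print(to_entry_dict(li))
# {'created': '2018-09-01T10:00:00Z', 'iso_short_date': '2018-09-01',
#  'path': '/2018/09/hello.html', 'title': 'Hello'}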
def parse_pvg(cls, elem):
    pvg_code = elem.get(cls.PVG_CODE)
    # dummy element
    if pvg_code is None:
        return
    try:
        CDEPermittedValueGroup.objects.get(code__exact=pvg_code)
        print("PVG %s already exists." % pvg_code)
        return
    except CDEPermittedValueGroup.DoesNotExist:
        pass
    pvg = CDEPermittedValueGroup(code=pvg_code)
    pvg.save()
    print("Created: ", pvg)
    for detail in etree.ETXPath('./{rptAllCDE}Detail_Collection/{rptAllCDE}Detail')(elem):
        pv_code = detail.get(cls.PV_CODE)
        if pv_code is None:
            continue
        pv = CDEPermittedValue(code=detail.get(cls.PV_CODE),
                               value=detail.get(cls.PV_VALUE),
                               desc=detail.get(cls.PV_DESC),
                               pv_group=pvg)
        print("Created:", pv)
        pv.save()
    return pvg
def eval_family_test(tree, strid):
    l_regtest = []
    path = "//{http://oval.mitre.org/XMLSchema/oval-definitions-5#independent}family_test[@id='" + strid + "']"
    findall = etree.ETXPath(path)
    try:
        rtest = findall(tree)[0]
    except:
        return -1
    # state
    ste = rtest.findall(
        './{http://oval.mitre.org/XMLSchema/oval-definitions-5#independent}state')
    try:
        l_state = [get_family_state(tree, ste[0].get('state_ref'))]
    except:
        print "non fatal error in family state for strid: " + strid
    # logical operation
    # TODO: ops are different for family_test than for registry_test
    regop = 'AND'
    try:
        if rtest.get('check') == 'at least one':
            regop = 'OR'
    except:
        print "non fatal error in determining op for strid: " + strid
    try:
        return eval_family_state(l_state[0]['famtext'])
    except:
        return -1
def xmlPath(element):
    '''Return a simple, unambiguous path for an XML element'''
    path = []
    while True:
        parent = element.getparent()
        name = element.tag
        if name.startswith(POM_NS_PREFIX):
            name = name[len(POM_NS_PREFIX):]
        if parent is None:
            path.insert(0, '/%s' % name)
            break
        expr = etree.ETXPath(element.tag)
        children = expr(parent)
        #print 'xmlPath', element.tag, children
        index = children.index(element)
        if len(children) == 1:
            item = '/%s' % name
        else:
            item = '/%s[%d]' % (name, index)
        path.insert(0, item)
        element = parent
    return ''.join(path)
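# Hedged usage sketch for xmlPath; POM_NS_PREFIX is assumed here to be the
# Clark-notation prefix of the Maven POM namespace that the surrounding module
# defines, and the <project> fixture is purely illustrative.
from lxml import etree

POM_NS_PREFIX = '{http://maven.apache.org/POM/4.0.0}'
pom = etree.fromstring(
    '<project xmlns="http://maven.apache.org/POM/4.0.0">'
    '<dependencies><dependency/><dependency/></dependencies></project>')
deps = pom.find(POM_NS_PREFIX + 'dependencies')
print(xmlPath(deps[0]))
# '/project/dependencies/dependency[0]'  (note: indices come from list.index and are 0-based)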
def parseXML(xmlFile):
    '''Parse a local xml file'''
    xml = etree.parse(xmlFile).getroot()
    lines = []
    # get the number of the enclosing book
    # FIXME: make this more elegant
    book_n = 0
    book_xpath = './/{http://www.tei-c.org/ns/1.0}div[@type="textpart" and @subtype="book"]'
    bookFinder = etree.ETXPath(book_xpath)
    book_list = bookFinder(xml)
    if len(book_list) > 0:
        book_n = book_list[0].get('n', 0)
    for l in xml.iter('{http://www.tei-c.org/ns/1.0}l'):
        for note in l.iter('{http://www.tei-c.org/ns/1.0}note'):
            tail = note.tail
            note.clear()
            note.tail = tail
        line_n = l.get('n')
        lines.append(('{}.{}'.format(book_n, line_n), l.xpath('string()').strip()))
    return lines
def locate_node(arch, spec):
    """ Locate a node in a source (parent) architecture.

    Given a complete source (parent) architecture (i.e. the field `arch` in a
    view), and a 'spec' node (a node in an inheriting view that specifies the
    location in the source view of what should be changed), return (if it
    exists) the node in the source view matching the specification.

    :param arch: a parent architecture to modify
    :param spec: a modifying node in an inheriting view
    :return: a node in the source matching the spec
    """
    if spec.tag == 'xpath':
        nodes = etree.ETXPath(spec.get('expr'))(arch)
        return nodes[0] if nodes else None
    elif spec.tag == 'field':
        # Only compare the field name: a field can be only once in a given view
        # at a given level (and for multilevel expressions, we should use xpath
        # inheritance spec anyway).
        for node in arch.iter('field'):
            if node.get('name') == spec.get('name'):
                return node
        return None
    for node in arch.iter(spec.tag):
        if isinstance(node, SKIPPED_ELEMENT_TYPES):
            continue
        if all(node.get(attr) == spec.get(attr)
               for attr in spec.attrib
               if attr not in ('position', 'version')):
            # Version spec should match parent's root element's version
            if spec.get('version') and spec.get('version') != arch.get('version'):
                return None
            return node
    return None
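# Hedged sketch of how locate_node resolves a spec against a parent arch; the
# <form>/<field>/<group> fixtures are assumptions for illustration, not real
# view definitions, and only the 'xpath' and 'field' branches are exercised.
from lxml import etree

arch = etree.fromstring('<form><field name="partner_id"/><group/></form>')
spec = etree.fromstring('<xpath expr="//group" position="inside"/>')
print(locate_node(arch, spec).tag)              # 'group'
spec2 = etree.fromstring('<field name="partner_id" position="after"/>')
print(locate_node(arch, spec2).get('name'))     # 'partner_id'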
def edit_dash_playlist(*args, **kwargs):
    '''
    create dash chunks for every video in the transcoded folder
    '''
    # print args, kwargs
    context = args[0]
    tree = LXML.parse(get_dash_mpd_file_path(context))
    root = tree.getroot()
    # Namespace map
    nsmap = root.nsmap.get(None)
    # Function to find all the BaseURL
    find_baseurl = LXML.ETXPath("//{%s}BaseURL" % nsmap)
    results = find_baseurl(root)
    audio_file = results[-1].text
    # Warning: this is quite dirty! We suppose the last element is the only audio element.
    results[-1].text = "audio/" + results[-1].text
    tree.write(get_dash_mpd_file_path(context))
    # Move audio files into audio directory
    os.makedirs(os.path.join(get_dash_folder(context), "audio"))
    shutil.move(os.path.join(get_dash_folder(context), audio_file),
                os.path.join(get_dash_folder(context), "audio", audio_file))
    # Create .htaccess for apache
    f = open(os.path.join(get_dash_folder(context), "audio", ".htaccess"), "w")
    f.write("AddType audio/mp4 .mp4 \n")
    f.close()
    return context
def save_chart(self):
    """This method uses a pre-defined XML layout and adds the random strings
    generated above and then saves the password chart as a png.

    :return: None
    """
    xml_data = etree.fromstring(layout.get_layout())
    for i in range(36):
        tmp = getattr(self, "label_{}".format(i)).text()
        etree.ETXPath("//{%s}*[@id='label_%s']" %
                      (u"http://www.w3.org/2000/svg", i))(xml_data)[0].text = tmp
    svg = etree.tostring(xml_data)
    file_handle, filename = tempfile.mkstemp()
    try:
        os.write(file_handle, svg)
        subprocess.call(
            # Default location of inkscape x64. Change if inkscape is
            # installed in a separate location
            [
                r"C:\Program Files\Inkscape\inkscape.exe", filename,
                "--export-png", "password_chart.png", "--export-dpi", "96"
            ])
        os.close(file_handle)
    finally:
        os.remove(filename)
def commandFlat(sInFile, sOutFile):
    """ Flattens all RTE segments into a single segment by appending all
    points to the first RTE segment """
    eGpx = etree.parse(sInFile).getroot()
    if eGpx is None:
        raise commandError("NOROOT")
    NS = getNS(eGpx)
    eRtes = eGpx.findall(NS % 'rte')
    if eRtes is None:
        raise commandError("NOSEG")
    for eRte in eRtes[1:]:
        eRtePts = eRte.findall(NS % 'rtept')
        if eRtePts is not None:
            eRtes[0].extend(eRtePts)
        eGpx.remove(eRte)
    eRteName = eRtes[0].find(NS % 'name')
    eRteName.text = os.path.splitext(os.path.basename(sOutFile))[0]
    lLatLons = [getLatLon(ePt) for ePt in eRtePts]
    writeGpxFile(eGpx, lLatLons, sOutFile)
    return len(etree.ETXPath(NS % 'rte')(eGpx))
def delete_all(args):
    """Delete all jobs"""
    doc = resources.get_jobs()
    xpath_expr = "//{{{0}}}job/@id".format(settings.PX_NS)
    xpath_fn = etree.ETXPath(xpath_expr)
    results = xpath_fn(doc)
    for r in results:
        delete_job(r)
def create(fname, ca_ext, data_dir='.', lang='', app_type=None):
    urn, extension = parse_urn(fname)
    tree = etree.parse(path.join(data_dir, fname))
    root = tree.getroot()
    NS = f'{{{root.nsmap[None]}}}'
    if not lang:
        lang = etree.ETXPath(f'//{NS}text/{NS}body/{NS}div/@xml:lang')(root)[0]
    print(f"Language: {lang}")
    # find refsDecl
    refsDecl = to_string(etree.ETXPath(f'//{NS}refsDecl')(root)[0])
    # Catch first cRefPattern
    cRef = etree.ETXPath(f'//{NS}refsDecl/{NS}cRefPattern/@replacementPattern')(root)[0]
    levels = regex.findall(r'tei:([a-zA-Z]+)\[@n=[\'"]\$[0-9]+[\'"]\]', cRef)
    print(f"cRefPattern: {', '.join(levels)}")
    # find apps
    passage = r'\.'.join(r'(\w+)' for _ in range(len(levels)))
    urn_wo_ext = regex.sub('-.+?$', '', urn)
    print(fr'Searching for {urn_wo_ext}-[\w\-]+?:{passage}')
    re_passage = regex.compile(fr'{urn_wo_ext}-[\w\-]+?:{passage}')
    app_xpath = f'//{NS}app[@loc]'
    if app_type:
        app_xpath = f'//{NS}listApp[@type="{app_type}"]' + app_xpath
    apps = etree.ETXPath(app_xpath)(root)
    ca_dict = collect_apps(apps, re_passage, NS)
    ca_string = format_critapp(ca_dict, levels)
    # Create new file
    new_urn = regex.sub(f'{extension}$', f'{ca_ext}', urn)
    new_fname = regex.sub(f'{extension}(?=.xml$)', f'{ca_ext}', fname)
    content = template.format(content=ca_string, id=new_urn,
                              refsDecl=refsDecl, lang=lang)
    # Indentation
    tree = etree.fromstring(content)
    etree.indent(tree)
    with open(path.join(data_dir, new_fname), 'w') as f:
        f.write(etree.tostring(tree, encoding="unicode"))
def import_cisco_cvrf():
    flist = []
    exdb = db.ExistDB()
    validateCollection(exdb, db_cvrf_cisco_collection)
    # -------------------------------------------------------------------------
    # get list of cvrf urls
    # -------------------------------------------------------------------------
    nurl = "http://tools.cisco.com/security/center/cvrfListing.x"
    request = urllib2.Request(nurl)
    rawPage = urllib2.urlopen(request)
    read = rawPage.read()
    #print read
    tree = etree.HTML(read)
    tpath = "//a[contains(@href,'cvrf.xml')]"
    findall = etree.ETXPath(tpath)
    arefs = findall(tree)
    urls = []
    for a in arefs:
        urls.append(a.get('href').replace('\t', '').replace('\n', ''))
    # just for tracking for now, need to get cisco to fix or apply a fix
    # i might ignore if it wasn't for poodle
    badfiles = [
        "/cisco-sa-20040420-tcp-nonios_cvrf.xml",
        "cisco-sa-20120328-msdp_cvrf.xml",
        "cisco-sa-20141015-poodle_cvrf.xml",
    ]
    # -------------------------------------------------------------------------
    # download files if they don't exist
    # -------------------------------------------------------------------------
    for u in urls:
        uname = u.split('/')[-1]
        # if file does not exist, download
        #if (not os.path.isfile(cisco_data_dir+uname) and os.access(".", os.W_OK)):
        if (os.access(".", os.W_OK)):
            try:
                print("downloading " + uname)
                urllib.urlretrieve(u, cisco_data_dir + uname)
                try:
                    fo = open(cisco_data_dir + uname, 'rb')
                    if exdb.load(fo, db_cvrf_cisco_collection + '/' + uname, True):
                        flist.append(uname + ": data import successful")
                    else:
                        flist.append(uname + ": data import failed")
                    fo.close()
                except:
                    flist.append(uname + ": file read failed")
            except:
                flist.append(uname + ": file download failed")
        else:
            flist.append(uname + ": file write failed")
    return flist
def get_abstract():
    with open(DATAP + '/dump/article_ids_reverse.json', "r", encoding="UTF8") as f:
        title_to_id = load(f)
    with open(DATAP + '/dump/articles_inscope.json', "r", encoding="UTF8") as f:
        scope = load(f)
    scope = {int(key): values for key, values in scope.items()}
    f = open(DATAP + '/dump/articles_all_abstracts.csv', "w", encoding="UTF8")
    abstract_xpath = etree.ETXPath("child::abstract")
    url_xpath = etree.ETXPath("child::url")
    context = etree.iterparse(DATAP + '/dump/enwiki-20180901-abstract.xml',
                              events=('end', ),
                              tag="doc")
    fast_iter(
        context,
        lambda elem: extract_with_xpath(elem, abstract_xpath, url_xpath,
                                        title_to_id, scope, f))
    f.close()
def get_job_status(job_id):
    """Return the status of the given job"""
    doc = get_job(job_id)
    if doc is None:
        return ""
    xpath_expr = "//{{{0}}}job".format(settings.PX_NS)
    xpath_fn = etree.ETXPath(xpath_expr)
    results = xpath_fn(doc)
    return results[0].attrib['status']
def entries_as_dict(month_index):
    """Convert index xml list to list of dictionaries."""
    # Search path
    findentrylist = etree.ETXPath("//section[@id='month-index']/ul/li")
    # Extract data
    entries_xml = findentrylist(month_index)
    entries = [to_entry_dict(entry_index_xml) for entry_index_xml in entries_xml]
    return entries
def get_definition_cpe(tree, strid):
    path = "//{http://oval.mitre.org/XMLSchema/oval-definitions-5}definition[@id='" + strid + "']"
    findall = etree.ETXPath(path)
    df = findall(tree)[0]
    try:
        cpath = ".//{http://oval.mitre.org/XMLSchema/oval-definitions-5}reference"
        cpe = df.find(cpath)
        return cpe.get('ref_id')
    except:
        return -1
def get_title(doc):
    """Return the text content of the document's first <h1> title."""
    target = '//{%s}h1[text()]' % HTMLNS
    findtitle = etree.ETXPath(target)
    if not findtitle(doc):
        sys.exit("ERROR: The document has no title")
    title = findtitle(doc)[0]
    titletext = etree.tostring(title, encoding="utf-8", method="text")
    titletext = titletext.strip()
    return titletext.decode('utf-8').strip()
def get_content(doc):
    """Return the full content of an article."""
    findcontent = etree.ETXPath("//{%s}article" % HTMLNS)
    try:
        content = findcontent(doc)[0]
        import lxml.html
        # print(lxml.html.tostring(content))
    except IndexError as e:
        raise IndexError('Ooops. No article.')
    # We want the content without the dates and the title
    findheader = etree.ETXPath("//{%s}header" % HTMLNS)
    try:
        # header = findheader(content)[0]
        # content.remove(header)
        header = findheader(doc)[0]
        # content.remove(header)
    except IndexError as e:
        logging.info('No header inside article: {e}'.format(e=e))
    return content
def last_posts(feed_path):
    """Create a list of dictionaries of the last posts using the Atom feed."""
    entries = []
    feed_root = helper.parse_feed(feed_path)
    # Information we need: title, dates, link
    find_entry = etree.ETXPath("//{%s}entry" % ATOMNS)
    find_title = etree.ETXPath("{%s}title/text()" % ATOMNS)
    find_published = etree.ETXPath("{%s}published/text()" % ATOMNS)
    find_updated = etree.ETXPath("{%s}updated/text()" % ATOMNS)
    # Only the link pointing to the blog post
    find_url = etree.ETXPath("{%s}link[@rel='alternate']/@href" % ATOMNS)
    # Extract all the entries
    feed_entries = find_entry(feed_root)
    # We iterate through them
    for entry in feed_entries:
        entry_data = {'title': find_title(entry)[0],
                      'published': find_published(entry)[0],
                      'updated': find_updated(entry)[0],
                      'url': find_url(entry)[0]}
        entries.append(entry_data)
    return entries
def __init__(self):
    conf_str = ipc.get_conf_str()
    self.xml_root = etree.fromstring(conf_str)
    namespaces = self.xml_root.nsmap
    root_namespace = namespaces[None]
    extra_namespace = namespaces["extra"]
    self.rn = root_namespace
    self.en = extra_namespace
    query = etree.ETXPath("{%s}device/{%s}unit/@{%s}url" % (self.rn, self.rn, self.en))
    self.URL = str(query(self.xml_root)[0])
    self.URL2 = str(query(self.xml_root)[1])
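# Hedged, self-contained illustration of the technique __init__ relies on: a
# Clark-notation ETXPath built from the document's nsmap, spanning a default
# and an "extra" namespace. The config string below is an assumption, not the
# project's real configuration.
from lxml import etree

conf_str = (
    '<config xmlns="urn:conf" xmlns:extra="urn:extra">'
    '  <device><unit extra:url="http://host-1/"/></device>'
    '  <device><unit extra:url="http://host-2/"/></device>'
    '</config>')
root = etree.fromstring(conf_str)
rn, en = root.nsmap[None], root.nsmap["extra"]
query = etree.ETXPath("{%s}device/{%s}unit/@{%s}url" % (rn, rn, en))
print(query(root))  # ['http://host-1/', 'http://host-2/']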
def is_valid_query(query):
    """Function to check for XPath validity.

    Tries to create an etree ETXPath instance from the query. If this fails,
    the XPathSyntaxError is caught and False is returned. Returns True
    otherwise.

    :param query: XPath query
    :type query: string
    :returns: True/False"""
    try:
        etree.ETXPath(query)
        return True
    except etree.XPathSyntaxError:
        return False
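# Hedged usage sketch (assuming `from lxml import etree` in this module): a
# well-formed Clark-notation XPath compiles, a malformed one does not.
print(is_valid_query("//{http://www.w3.org/2005/Atom}entry"))  # True
print(is_valid_query("//entry[@"))                              # False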
def compine_view_by_xpath(view_arch, xpath_arch):
    view_arch_tree = etree.fromstring(view_arch)
    xpath_tree = etree.fromstring(xpath_arch)
    if xpath_tree.tag != 'data':
        raise Exception('The root node of an inheriting view must be a data node')
    for xpath_element in xpath_tree:
        if xpath_element.tag != 'xpath':
            continue
        expr = xpath_element.get('expr', None)
        if expr is None:
            raise Exception('The expr attribute of an xpath node must not be empty')
        nodes = etree.ETXPath(expr)(view_arch_tree)
        node = nodes[0] if nodes else None
        if node is None:
            raise Exception('Could not locate a node in the parent view with the expression ' + expr)
        pos = xpath_element.get('position', 'inside')
        if pos == 'replace':
            if node.getparent() is None:
                raise Exception('You cannot replace the root node of the parent view')
            else:
                for child in xpath_element:
                    node.addprevious(child)
                node.getparent().remove(node)
        elif pos == 'attributes':
            for child in xpath_element.getiterator('attribute'):
                attribute = child.get('name')
                value = child.text or ''
                node.set(attribute, value)
        elif pos == 'inside':
            add_text_inside(node, xpath_element.text)
            for child in xpath_element:
                node.append(child)
        elif pos == 'after':
            # add a sentinel element right after node, insert content of
            # spec before the sentinel, then remove the sentinel element
            sentinel = E.sentinel()
            node.addnext(sentinel)
            add_text_before(sentinel, xpath_element.text)
            for child in xpath_element:
                sentinel.addprevious(child)
            remove_element(sentinel)
        elif pos == 'before':
            add_text_before(node, xpath_element.text)
            for child in xpath_element:
                node.addprevious(child)
        else:
            raise Exception(
                'Unsupported position attribute (' + pos + '); position must be '
                'inside, replace, after, before or attributes')
    return etree.tostring(view_arch_tree, encoding='utf-8')
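# Hedged usage sketch for compine_view_by_xpath, exercising only the 'replace'
# branch so no helper functions beyond lxml are needed; the view strings are
# illustrative assumptions.
parent_arch = '<form><field name="old"/></form>'
inherit_arch = ('<data><xpath expr="//field[@name=\'old\']" position="replace">'
                '<field name="new"/></xpath></data>')
print(compine_view_by_xpath(parent_arch, inherit_arch))
# b'<form><field name="new"/></form>'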