def test_find_missing(self):
    fp = etree.parse(StringIO(b'<a>\n<b>bar</b>\n</a>'))
    xml = stylechecker.XML(fp)
    xml.xmlschema = etree.XMLSchema(etree.parse(sample_xsd))
    # missing elements fallback to the root element
    self.assertEquals(xml.find_element('c', 2), fp.getroot())
def test_example1(self):
    testdir = path(l2emod.__file__).parent / 'testtex'
    fn = testdir / 'example1.tex'
    print "file %s" % fn
    with make_temp_directory() as tmdir:
        nfn = '%s/%s' % (tmdir, fn.basename())
        os.system('cp %s/* %s' % (testdir, tmdir))
        os.chdir(tmdir)
        l2e = latex2edx(nfn, output_dir=tmdir)
        l2e.convert()
        xbfn = nfn[:-4] + '.xbundle'
        self.assertTrue(os.path.exists(xbfn))
        # xb = open(xbfn).read()
        # self.assertIn('<chapter display_name="Unit 1" start="2013-11-22" url_name="Unit_1">', xb)
        xml = etree.parse(xbfn).getroot()
        chapter = xml.find('.//chapter')
        self.assertTrue(chapter.get('display_name') == 'Unit 1')
        self.assertTrue(chapter.get('start') == '2013-11-22')
        self.assertTrue(chapter.get('url_name') == 'Unit_1')
        cfn = path(tmdir) / 'course/2013_Fall.xml'
        self.assertTrue(os.path.exists(cfn))
        cfn = path(tmdir) / 'chapter/Unit_1.xml'
        self.assertTrue(os.path.exists(cfn))
        # self.assertIn('<sequential display_name="Introduction" due="2013-11-22" url_name="Introduction"', open(cfn).read())
        xml = etree.parse(cfn).getroot()
        seq = xml.find('.//sequential')
        self.assertTrue(seq.get('display_name') == 'Introduction')
        self.assertTrue(seq.get('due') == '2013-11-22')
        self.assertTrue(seq.get('url_name') == 'Introduction')
        self.assertIn('<problem url_name="p1"/>', open(cfn).read())
def transform_file(request, shortkey, *args, **kwargs):
    res = hydroshare.get_resource_by_shortkey(shortkey)
    if res.reference_type == 'soap':
        client = Client(res.url)
        response = client.service.GetValues(':' + res.data_site_code, ':' + res.variable_code, '', '', '')
    elif res.reference_type == 'rest':
        r = requests.get(res.url)
        response = str(r.text)
    waterml_1 = etree.XML(response)
    wml_string = etree.tostring(waterml_1)
    s = StringIO(wml_string)
    dom = etree.parse(s)
    module_dir = os.path.dirname(__file__)
    xsl_location = os.path.join(module_dir, "static/ref_ts/xslt/WaterML1_1_timeSeries_to_WaterML2.xsl")
    xslt = etree.parse(xsl_location)
    transform = etree.XSLT(xslt)
    newdom = transform(dom)
    d = datetime.date.today()
    date = '{0}_{1}_{2}'.format(d.month, d.day, d.year)
    xml_name = '{0}-{1}-{2}'.format(res.title.replace(" ", ""), date, 'wml_2_0.xml')
    with open(xml_name, 'wb') as f:
        f.write(newdom)
    xml_file = open(xml_name, 'r')
    ResourceFile.objects.filter(object_id=res.pk, resource_file__contains='wml_2_0').delete()
    hydroshare.add_resource_files(res.short_id, xml_file)
    f = ResourceFile.objects.filter(object_id=res.pk, resource_file__contains='wml_2_0')[0].resource_file
    data = {
        'status_code': 200,
        'xml_name': xml_name,
        'xml_size': f.size,
        'xml_link': f.url
    }
    os.remove(xml_name)
    # print(etree.tostring(newdom, pretty_print=True))
    return json_or_jsonp(request, data)
def test_find_missing_without_fallback(self):
    fp = etree.parse(StringIO(b'<a>\n<b>bar</b>\n</a>'))
    xml = stylechecker.XML(fp)
    xml.xmlschema = etree.XMLSchema(etree.parse(sample_xsd))
    # with fallback disabled, missing elements raise ValueError
    self.assertRaises(ValueError, lambda: xml.find_element('c', 2, fallback=False))
def studio_submit(self, submissions, suffix=''):
    xml_config = submissions['xml_config']
    try:
        etree.parse(StringIO(xml_config))
    except etree.XMLSyntaxError as e:
        response = {
            'result': 'error',
            'message': e.message
        }
    else:
        response = {
            'result': 'success',
        }
        self.xml_config = xml_config
        self.display_name = submissions['display_name']
        self.content_id = submissions['content_id']
        self.transcript_file_id = submissions['transcript_file_id']
        self.transcript_project_id = submissions['transcript_project_id']
        self.enable_player_token = submissions['enable_player_token']
        self.partner_code = submissions['partner_code']
        self.api_key = submissions['api_key']
        self.api_secret_key = submissions['api_secret_key']
        self.expiration_time = submissions['expiration_time']
    return response
def _load_nested_xml(cls, filename, xml_node_filename_map):
    """ Load the XML, including all referenced Include files.

    We also populate a dictionary, ``xml_node_filename_map``, which maps each
    node to the name of the file that it was originally in, so that when we
    load single components from a file, which are hierarchical and contain
    references to other components, we can find the components that were in
    the file specified.
    """
    if filename[:5] == "https":
        # lxml only supports http and ftp
        doc = etree.parse(urlopen(filename))
    else:
        doc = etree.parse(filename)
    # Store the source filenames of all the nodes:
    for node in doc.getroot().getiterator():
        xml_node_filename_map[node] = filename
    root = doc.getroot()
    if root.nsmap[None] != nineml_namespace:
        errmsg = ("The XML namespace is not compatible with this version "
                  "of the NineML library. Expected {}, file contains {}")
        raise Exception(errmsg.format(nineml_namespace, root.nsmap[None]))
    # Recursively load Include nodes:
    for include_element in root.getiterator(tag=NINEML + 'Include'):
        cls._load_include(include_element=include_element,
                          basedir=os.path.dirname(filename),
                          xml_node_filename_map=xml_node_filename_map)
    return root
def __build_lxml(target, source, env):
    """
    General XSLT builder (HTML/FO), using the lxml module.
    """
    from lxml import etree
    xslt_ac = etree.XSLTAccessControl(read_file=True,
                                      write_file=True,
                                      create_dir=True,
                                      read_network=False,
                                      write_network=False)
    xsl_style = env.subst('$DOCBOOK_XSL')
    xsl_tree = etree.parse(xsl_style)
    transform = etree.XSLT(xsl_tree, access_control=xslt_ac)
    doc = etree.parse(str(source[0]))
    # Support for additional parameters
    parampass = {}
    if parampass:
        result = transform(doc, **parampass)
    else:
        result = transform(doc)
    try:
        of = open(str(target[0]), "w")
        of.write(etree.tostring(result, pretty_print=True))
        of.close()
    except:
        pass
    return None
def make_tests(self):
    if self.url.startswith('http'):
        self.tree = etree.parse(urllib.request.urlopen(self.url))
    else:
        self.tree = etree.parse(open(self.url))
    self.passes = 0
    self.total = 0
    text = None
    for e in self.tree.getroot().getiterator():
        if e.tag == self.ns + "title":
            self.title = e.text
        if e.tag == self.ns + "revision":
            self.revision = e[0].text
        if e.tag == self.ns + "text":
            text = e.text
    if not text:
        raise AttributeError("No text element?")
    self.tests = defaultdict(OrderedDict)
    rtests = text.split('\n')
    rtests = [self.wrg.search(j) for j in rtests if self.wrg.search(j)]
    for i in rtests:
        test = i.group(1).split('|')
        if len(test) < 3:
            continue
        comment = None
        if len(test) >= 3:
            lang, left, right = test[0:3]
            if not left.endswith('.'):
                left += '[_].'
        if len(test) >= 4:
            comment = test[3].strip()
        self.tests[lang.strip()][left.strip()] = [right.strip(), comment]
    self.out = StringIO()
def validate_StationXML(path_or_object):
    """
    Checks if the given path is a valid StationXML file.

    Returns a tuple. The first item is a boolean describing if the validation
    was successful or not. The second item is a list of all found validation
    errors, if existent.

    :path_or_object: Filename or file-like object. Can also be an etree
        element.
    """
    # Get the schema location.
    schema_location = os.path.dirname(inspect.getfile(inspect.currentframe()))
    schema_location = os.path.join(schema_location, "docs", "fdsn-station-1.0.xsd")
    xmlschema = etree.XMLSchema(etree.parse(schema_location))
    if isinstance(path_or_object, etree._Element):
        xmldoc = path_or_object
    else:
        try:
            xmldoc = etree.parse(path_or_object)
        except etree.XMLSyntaxError:
            return (False, ("Not a XML file.",))
    valid = xmlschema.validate(xmldoc)
    # Pretty error printing if the validation fails.
    if valid is not True:
        return (False, xmlschema.error_log)
    return (True, ())
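# A brief usage sketch for the validator above. The file name "station.xml" is
# hypothetical; the function returns a (bool, errors) tuple as described in its
# docstring.
is_valid, errors = validate_StationXML("station.xml")
if not is_valid:
    for error in errors:
        print(error)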
def getdates():
    print "1) Output all unitdates to a csv"
    print "2) Output all unitdates to a csv that do not have a normal attribute or are not 'undated'"
    choice = raw_input("Enter a number: ")
    path = 'Real_Masters_all'
    if choice == "1":
        outfile = raw_input("Enter a filename for the csv: ")
        for filename in os.listdir(path):
            tree = etree.parse(join(path, filename))
            d = tree.xpath('//unitdate')
            for i in d:
                with open(outfile + '.csv', 'ab') as csvfile:
                    writer = csv.writer(csvfile, dialect='excel')
                    writer.writerow([filename, tree.getpath(i), i.text])
            print filename
        print outfile + '.csv complete'
    elif choice == "2":
        outfile = raw_input("Enter a filename for the csv: ")
        for filename in os.listdir(path):
            tree = etree.parse(join(path, filename))
            d = tree.xpath('//unitdate')
            for i in d:
                # yyyy = re.compile('^[\d]{4}s?$')
                # yyyy_yyyy = re.compile('^[\d]{4}s?[-][\d]{4}s?$')
                undated = re.compile('^[Uu]ndated$')
                if not undated.match(i.text) and not 'normal' in i.attrib:
                    with open(outfile + '.csv', 'ab') as csvfile:
                        writer = csv.writer(csvfile, dialect='excel')
                        writer.writerow([filename, tree.getpath(i), i.text])
            print filename
        print outfile + '.csv complete'
def verify(self, trusted_certs=None, schema=None, trusted_certs_required=True):
    if not self.xml:
        self.decode()
    # validate against RelaxNG schema
    if HAVELXML and not self.legacy:
        if schema and os.path.exists(schema):
            tree = etree.parse(StringIO(self.xml))
            schema_doc = etree.parse(schema)
            xmlschema = etree.XMLSchema(schema_doc)
            if not xmlschema.validate(tree):
                error = xmlschema.error_log.last_error
                message = "%s: %s (line %s)" % (self.get_summary_tostring(), error.message, error.line)
                raise CredentialNotVerifiable(message)
    if trusted_certs_required and trusted_certs is None:
        trusted_certs = []
    # trusted_cert_objects = [GID(filename=f) for f in trusted_certs]
    trusted_cert_objects = []
    ok_trusted_certs = []
    # If caller explicitly passed in None that means skip cert chain validation.
    # Strange and not typical
    if trusted_certs is not None:
        for f in trusted_certs:
            try:
                # Failures here include unreadable files
                # or non PEM files
                trusted_cert_objects.append(GID(filename=f))
                ok_trusted_certs.append(f)
            except Exception, exc:
                logger.error("Failed to load trusted cert from %s: %r" % (f, exc))
        trusted_certs = ok_trusted_certs
def studio_submit(self, submissions, suffix=''):
    self.display_name = submissions['display_name']
    xml_content = submissions['data']
    max_score = submissions['max_score']
    if not max_score:
        # empty = default
        max_score = 1
    else:
        try:
            # not an integer, then default
            max_score = int(max_score)
        except:
            max_score = 1
    self.weight = max_score
    try:
        etree.parse(StringIO(xml_content))
        self.data = xml_content
    except etree.XMLSyntaxError as e:
        return {
            'result': 'error',
            'message': e.message
        }
    return {
        'result': 'success',
    }
def getextents():
    print "1) Output only collection level extents to a csv"
    print "2) Output only component level extents to a csv"
    choice = raw_input("Enter a number: ")
    path = 'Real_Masters_all'
    if choice == "1":
        outfile = raw_input("Enter a filename for the csv: ")
        for filename in os.listdir(path):
            tree = etree.parse(join(path, filename))
            e = tree.xpath('//ead/archdesc/did//physdesc/extent')
            for e in e:
                extent = e.text or "EMPTY EXTENT"
                extentpath = tree.getpath(e)
                with open(outfile + '.csv', 'ab') as csvfile:
                    writer = csv.writer(csvfile, dialect='excel')
                    writer.writerow([filename, extentpath, extent])
                csvfile.close()
            print filename
        print outfile + '.csv complete'
    elif choice == "2":
        outfile = raw_input("Enter a filename for the csv: ")
        for filename in os.listdir(path):
            tree = etree.parse(join(path, filename))
            e = tree.xpath('//dsc//did//extent')
            for e in e:
                extent = e.text or "EMPTY EXTENT"
                extentpath = tree.getpath(e)
                with open(outfile + '.csv', 'ab') as csvfile:
                    writer = csv.writer(csvfile, dialect='excel')
                    writer.writerow([filename, extentpath, extent])
                csvfile.close()
            print filename
        print outfile + '.csv complete'
def xml_assert_equal(expected, actual, max_lines=1000, normalizer=None, ignore_blank_text=True):
    # Transform both documents into an element tree if strings were passed in
    if isinstance(expected, (str, unicode)):
        expected = etree.parse(StringIO(expected))
    if isinstance(actual, (str, unicode)):
        actual = etree.parse(StringIO(actual))
    # Create a canonical representation of both documents
    if normalizer is not None:
        expected = normalizer(expected)
        actual = normalizer(actual)
    expected = xml_as_canonical_string(expected, remove_blank_text=ignore_blank_text)
    actual = xml_as_canonical_string(actual, remove_blank_text=ignore_blank_text)
    # Then, compute a unified diff from there
    diff = difflib.unified_diff(expected, actual, fromfile='expected.xml', tofile='actual.xml')
    # Print the discrepancies out in unified diff format
    had_differences = False
    line_counter = 0
    for line in diff:
        sys.stdout.write(line)
        had_differences = True
        line_counter += 1
        if line_counter == max_lines:
            sys.stdout.write('<unified diff abbreviated for clarity\'s sake, more lines still to come>')
            break
    if had_differences:
        raise AssertionError('Expected and actual XML seem to differ')
def testSetupSolrConfig(self): solrDataDir = join(self.tempdir, 'solr-data') self._createServer(stateDir=solrDataDir, port=8042, config={'core1': {}, 'córë2': {}}) self.assertEquals(set(['lib', 'contexts', 'cores', 'start.config', 'solr.xml', 'etc', 'resources']), set(listdir(solrDataDir))) self.assertEquals(set(['webdefault.xml', 'jetty.xml']), set(listdir(join(solrDataDir, 'etc')))) jetty_xml = parse(open(join(solrDataDir, 'etc', 'jetty.xml'))) self.assertEquals(['8042'], jetty_xml.xpath('//SystemProperty[@name="jetty.port"]/@default')) f = open(join(solrDataDir, 'start.config')) for line in f: if line.startswith('jetty.home'): break else: self.fail("No jetty.home line found") f.close() self.assertEquals('jetty.home=%s\n' % solrDataDir, line) self.assertTrue('jetty.lib=/usr/share/java/solr4.5.1' in open(join(solrDataDir, 'start.config')).read()) context_solr_xml = parse(open(join(solrDataDir, 'contexts', 'solr.xml'))) self.assertEquals(['/usr/share/java/webapps/solr-%s.war' % version], context_solr_xml.xpath('//Set[@name="war"]/text()')) self.assertEquals(set(['core1', 'córë2']), set(listdir(join(solrDataDir, 'cores')))) solr_xml = parse(open(join(solrDataDir, 'solr.xml'))) self.assertEquals(set([u'córë2', 'core1']), set(solr_xml.xpath("//core/@name"))) self.assertEquals(set(['cores/core1', u'cores/córë2']), set(solr_xml.xpath("//core/@instanceDir"))) schema_core1_xml = parse(open(join(solrDataDir, 'cores', 'core1', 'conf', 'schema.xml'))) self.assertEquals(['meresco-core1'], schema_core1_xml.xpath("/schema/@name")) schema_core2_xml = parse(open(join(solrDataDir, 'cores', 'córë2', 'conf', 'schema.xml'))) self.assertEquals(['meresco-córë2'], schema_core2_xml.xpath("/schema/@name"))
def test_run(self):
    print "Entering test run"
    # xml config
    config_file = os.path.join(self.config_location, "test_config.xml")
    # xsd
    xsd_file = os.path.join(self.config_location, "test_xsd.xsd")
    # load xml and xsd
    logger.log_status("Loading XML file: {0}".format(config_file))
    library = open(config_file)
    logger.log_status("Loading XSD file: {0}".format(xsd_file))
    schema = open(xsd_file)
    # create object instance of xsd for xml validation
    xmlschema_doc = etree.parse(schema)
    xmlschema = etree.XMLSchema(xmlschema_doc)
    # parsing xml file
    library_doc = etree.parse(library)
    logger.log_status("Validating XML")
    result = xmlschema.validate(library_doc)
    self.assertTrue(result == True)
    logger.log_status("Leaving test run")
def chopit(xmlfile, outfile=OUTFILE, xmltag=WP_TAG, chunksize=CHUNKSIZE):
    parser = etree.XMLParser(resolve_entities=False, encoding="utf-8", strip_cdata=False)
    doc = etree.parse(xmlfile, parser)
    matches = doc.xpath(xmltag)
    print "Found %s blog posts!" % len(matches)
    matcheslist = split_seq(matches, chunksize)
    channel = doc.getroot().find("channel")
    # Create an empty wordpress xml file
    for e in matches:
        channel.remove(e)
    doc.write(TMPFILE, encoding="utf-8", method="xml", pretty_print=True)
    # Now, create smaller wordpress xml files
    ctr = len(matcheslist)
    print "Breaking WordPress XML into %s smaller files." % ctr
    for entities in matcheslist:
        doc = etree.parse(TMPFILE)
        channel = doc.getroot().find("channel")
        for entity in entities:
            channel.append(entity)
        output = "%s%03d.xml" % (outfile, ctr)
        doc.write(output, encoding="utf-8", method="xml", pretty_print=True)
        print " - File %s has %s posts." % (output, len(entities))
        ctr -= 1
    print "Done!"
def __init__(self, plugin, sprite_name, size=None): super(GenreConfiguredSpriteSheet, self).__init__(plugin, sprite_name, size) self.genre_alternate = {} # contains GenreType tuples self._alt_icons = {} self._sprite_name = sprite_name self._size = size popups = rb.find_plugin_file(plugin, 'img/popups.xml') root = ET.parse(open(popups)).getroot() self._parse_popups(plugin, root, self.GENRE_SYSTEM) try: # self._user_popups = RB.find_user_data_file('plugins/coverart_browser/img/usericons/popups.xml') self._user_popups = RB.user_cache_dir() + "/coverart_browser/usericons/popups.xml" root = ET.parse(open(self._user_popups)).getroot() self._parse_popups(plugin, root, self.GENRE_USER) elem = root.xpath(self._sprite_name + '/index') curr_index = int(elem[0].text) for index in range(0, curr_index + 1): key = RB.ExtDBKey.create_lookup('icon', str(index)) icon_location = self._genre_db.lookup(key) sprite = GdkPixbuf.Pixbuf.new_from_file(icon_location) if self._size: sprite = sprite.scale_simple(self._size[0], self._size[1], GdkPixbuf.InterpType.BILINEAR) self._alt_icons[str(index)] = sprite self.names.append(str(index)) except: pass
def _is_xml_valid(self, path):
    xml_doc = etree.parse(path)
    # test that the doc matches the schema
    schema_path = os.path.join(helpers.SCHEMA_DIR, xml.NRML_SCHEMA_FILE)
    xmlschema = etree.XMLSchema(etree.parse(schema_path))
    xmlschema.assertValid(xml_doc)
def _validate_sc3ml(path_or_object, verbose=False):
    """
    Validates a SC3ML file against the SC3ML 0.9 schema. Returns either True
    or False.

    :param path_or_object: File name or file like object. Can also be an
        etree element.
    :type verbose: bool
    :param verbose: Print error log if True.
    """
    # Get the schema location.
    schema_location = os.path.join(os.path.dirname(__file__), 'data', 'sc3ml_0.9.xsd')
    xmlschema = etree.XMLSchema(etree.parse(schema_location))
    if isinstance(path_or_object, etree._Element):
        xmldoc = path_or_object
    else:
        try:
            xmldoc = etree.parse(path_or_object)
        except etree.XMLSyntaxError:
            if verbose:
                print('Not an XML file')
            return False
    valid = xmlschema.validate(xmldoc)
    # Pretty error printing if the validation fails.
    if verbose and valid is not True:
        print("Error validating SC3ML file:")
        for entry in xmlschema.error_log:
            print("\t%s" % entry)
    return valid
def studio_submit(self, submissions, suffix=''):
    '''
    Save studio edits
    '''
    self.display_name = submissions['display_name']
    self.weight = self._get_natural_number(submissions['weight'])
    max_attempts = self._get_natural_number(submissions['max_attempts'])
    if max_attempts > 0:
        self.max_attempts = max_attempts
    self.your_answer_label = submissions['your_answer_label']
    self.our_answer_label = submissions['our_answer_label']
    self.submit_button_label = submissions['submit_button_label']
    xml_content = submissions['data']
    try:
        etree.parse(StringIO(xml_content))
        self.question_string = xml_content
    except etree.XMLSyntaxError as e:
        return {
            'result': 'error',
            'message': e.message,
        }
    return {
        'result': 'success',
    }
def beautify_book(root, f): from lib.epubqfix import pack_epub from lib.epubqfix import unpack_epub from lib.epubqfix import clean_temp from lib.epubqfix import find_roots f = f.replace('.epub', '_moh.epub') print('START beautify for: ' + f.decode(SFENC)) tempdir = unpack_epub(os.path.join(root, f)) opf_dir, opf_file, is_fixed = find_roots(tempdir) epub_dir = os.path.join(tempdir, opf_dir) opf_path = os.path.join(tempdir, opf_file) parser = etree.XMLParser(remove_blank_text=True) opftree = etree.parse(opf_path, parser) ncxfile = etree.XPath( '//opf:item[@media-type="application/x-dtbncx+xml"]', namespaces=OPFNS )(opftree)[0].get('href') ncx_path = os.path.join(epub_dir, ncxfile) ncxtree = etree.parse(ncx_path, parser) rename_calibre_cover(opftree, ncxtree, epub_dir) rename_cover_img(opftree, ncxtree, epub_dir) fix_body_id_links(opftree, epub_dir, ncxtree) make_cover_item_first(opftree) cont_src_list = make_content_src_list(ncxtree) fix_display_none(opftree, epub_dir, cont_src_list) # replace_fonts(epub_dir, ncxtree, opftree, 'TeXGyreSchola', 'Bookerly') write_file_changes_back(opftree, opf_path) write_file_changes_back(ncxtree, ncx_path) pack_epub(os.path.join(root, f), tempdir) clean_temp(tempdir) print('FINISH beautify for: ' + f.decode(SFENC))
def __init__(self, ac_id):
    self.ac_id = ac_id
    paparazzi_home = os.getenv("PAPARAZZI_HOME")
    conf_xml_path = "%s/conf/conf.xml" % paparazzi_home
    conf_tree = etree.parse(conf_xml_path)
    # extract aircraft node from conf.xml file
    ac_node = conf_tree.xpath('/conf/aircraft[@ac_id=%i]' % ac_id)
    if (len(ac_node) != 1):
        print "Aircraft ID %i not found." % ac_id
    # get settings file path from aircraft xml node
    settings_xml_path = "%s/conf/%s" % (paparazzi_home, ac_node[0].attrib['settings'])
    # save AC name for reference
    self.name = ac_node[0].attrib['name']
    tree = etree.parse(settings_xml_path)
    index = 0  # keep track of index/id of setting starting at 0
    for the_tab in tree.xpath("//dl_settings"):
        if the_tab.attrib.has_key('NAME'):
            setting_group = PaparazziSettingsGroup(the_tab.attrib['NAME'])
        elif the_tab.attrib.has_key('name'):
            setting_group = PaparazziSettingsGroup(the_tab.attrib['name'])
        else:
            continue
        for the_setting in the_tab.xpath('dl_setting'):
            if the_setting.attrib.has_key('shortname'):
                name = the_setting.attrib['shortname']
            elif the_setting.attrib.has_key('VAR'):
                name = the_setting.attrib['VAR']
            else:
                name = the_setting.attrib['var']
            settings = PaparazziSetting(name)
            settings.index = index
            if the_setting.attrib.has_key('MIN'):
                settings.min_value = float(the_setting.attrib['MIN'])
            else:
                settings.min_value = float(the_setting.attrib['min'])
            if the_setting.attrib.has_key('MAX'):
                settings.max_value = float(the_setting.attrib['MAX'])
            else:
                settings.max_value = float(the_setting.attrib['max'])
            if the_setting.attrib.has_key('STEP'):
                settings.step = float(the_setting.attrib['STEP'])
            else:
                settings.step = float(the_setting.attrib['step'])
            if (the_setting.attrib.has_key('values')):
                settings.values = the_setting.attrib['values'].split('|')
                count = int((settings.max_value - settings.min_value + settings.step) / settings.step)
                if (len(settings.values) != count):
                    print "Warning: wrong number of values (%i) for %s (expected %i)" % (len(settings.values), name, count)
            setting_group.member_list.append(settings)
            self.lookup.append(settings)
            self.name_lookup[name] = settings
            index = index + 1
        self.groups.append(setting_group)
def testFile(self, filename):
    logging.debug('>>')
    logging.debug("Testing " + filename)
    try:
        # parse filename as xml
        xmldoc = etree.parse(filename)
        # Parse XML schema
        xmlschema_doc = etree.parse(self.main_data_path + "schemas/kml20-geodistance.xsd")
        xmlschema = etree.XMLSchema(xmlschema_doc)
        if (xmlschema.validate(xmldoc)):
            self.activities.append(xmldoc)  # Assuming one activity per file
            # Valid file
            self.xmldoc = xmldoc
            self.startTime = datetime.datetime.now(tzlocal())
            inDatabase = False  # can't really check, as don't have start time etc
            duration = 0
            distance = self.getDistance(xmldoc)
            index = "%d:%d" % (0, 0)
            sport = "Running"
            self.activitiesSummary.append((
                index,
                inDatabase,
                self.startTime.strftime("%Y-%m-%dT%H:%M:%S%z"),
                distance,
                str(duration),
                sport,
            ))
            # print self.activitiesSummary
            return True
    except:
        # Not valid file
        return False
    return False
def merge_xmls(folder, name):
    main_xml_file = "../Annotations/" + folder + "/" + name.replace("jpg", "xml")
    if not os.path.isfile(main_xml_file):
        try:
            main_xml = ET.parse("../annotationCache/XMLTemplates/labelme.xml")
        except:
            return "False"
        main_xml.find("filename").text = name
        main_xml.find("folder").text = folder
        main_xml.write(main_xml_file, pretty_print=True)
    else:
        try:
            main_xml = ET.parse(main_xml_file)
        except:
            return "False"
    main_root = main_xml.getroot()
    object_files = glob.glob(main_xml_file + ".*")
    if len(object_files) == 0:
        return "False"
    for object_file in object_files:
        try:
            object_xml = ET.parse(object_file)
        except:
            continue
        object_xml.find('id').text = str(int(main_xml.xpath('count(//object)')))
        object_root = object_xml.getroot()
        main_root.append(object_root)
        os.remove(object_file)
    main_xml.write(main_xml_file, pretty_print=True)
    return "True"
def fetchRecords(conf):
    '''
    Generator to fetch all records using a resumptionToken if supplied.
    '''
    server = conf["server"]
    path = conf["path"]
    verb = conf["verb"]
    metadataPrefix = conf["metadataPrefix"]
    set = conf["set"]
    params = {
        "verb": verb,
        "metadataPrefix": metadataPrefix
    }
    if set != None:
        params["set"] = set
    body = makeRequest("%s%s" % (server, path), **params)
    f = StringIO(body)
    tree = etree.parse(f)
    tokenList = tree.xpath("oai:ListRecords/oai:resumptionToken/text()", namespaces=namespaces)
    yield tree.xpath("oai:ListRecords/oai:record", namespaces=namespaces)
    del params["metadataPrefix"]
    while (len(tokenList) == 1):
        try:
            params["resumptionToken"] = tokenList[0]
            body = makeRequest("%s%s" % (server, path), **params)
            f = StringIO(body)
            tree = etree.parse(f)
            yield tree.xpath("oai:ListRecords/oai:record", namespaces=namespaces)
            tokenList = tree.xpath("oai:ListRecords/oai:resumptionToken/text()", namespaces=namespaces)
        except Exception as e:
            tokenList = []
            log.error(sys.exc_info())
            log.exception("Problem trying to get next segment.")
def pretty_print_xml(aFDescriptor, aOutput):
    """ XML pretty printing from a stream.
        Takes a file descriptor (fd or StringIO, for example).
    """
    # str = aFDescriptor.read()
    # print " Result = %s\n" % (str)
    # f = open("/tmp/res.xml", "w")
    # f.write(str)
    # f.flush()
    # f.close()
    offset = 0
    is_xml = False
    while not is_xml:
        c = aFDescriptor.read(1)
        if c == "<":
            is_xml = True
        else:
            offset += 1
    if is_xml == True:
        aFDescriptor.seek(offset)
        tree = etree.parse(aFDescriptor)
        # get xslt stylesheet doing the transformation
        xsltPath = Conf.get_instance().get("Transformer", "xsltPrettyPrinter")
        transform = etree.XSLT(etree.parse(open(xsltPath)))
        result = transform(tree)
        ctbto.common.utils.printInFile(str(result), aOutput)
    else:
        raise Exception("Error. The file doesn't seem to be an XML file. Check its content.")
def studio_submit(self, submissions, suffix=''):
    """
    Handle the action of the submit button when using the block from Studio
    """
    self.display_name = submissions['display_name']
    if submissions.get('hotspot_coordinates_centered', False):
        self._hotspot_coordinates_centered = True
    xml_content = submissions['data']
    try:
        etree.parse(StringIO(xml_content))
        self.data = xml_content
    except etree.XMLSyntaxError as err:
        # Python 2 and 3 compatibility fix
        # Switch to _, error_message = err.args
        try:
            error_message = err.message  # pylint: disable=exception-message-attribute
        except:  # pylint: disable=bare-except
            _, error_message = err.args
        return {
            'result': 'error',
            'message': error_message,
        }
    return {
        'result': 'success',
    }
def test_invalid(self):
    fp = etree.parse(StringIO(b'<a><c>bar</c></a>'))
    xml = stylechecker.XML(fp)
    xml.xmlschema = etree.XMLSchema(etree.parse(sample_xsd))
    result, _ = xml.validate()
    self.assertFalse(result)
def getXML(fIn, folder):
    parser = etree.XMLParser(ns_clean=True)
    try:
        doc = etree.parse(fIn)
    except:
        doc = None
        xsd = None
        return doc, xsd
    root = doc.getroot()
    citygmlversion = ""
    for key in root.nsmap.keys():
        if root.nsmap[key].find('www.opengis.net/citygml') != -1:
            if (root.nsmap[key][-3:] == '0.4'):
                citygmlversion = '0.4'
                break
            if (root.nsmap[key][-3:] == '1.0'):
                citygmlversion = '1.0'
                break
            if (root.nsmap[key][-3:] == '2.0'):
                citygmlversion = '2.0'
                break
    if citygmlversion == "":
        return None, None
    if citygmlversion == "0.4":
        xsd = etree.parse(folder + "schemas/v0.4/CityGML.xsd")
    elif citygmlversion == "1.0":
        xsd = etree.parse(folder + "schemas/v1.0/CityGML.xsd")
    else:
        xsd = etree.parse(folder + "schemas/v2.0/CityGML.xsd")
    return doc, xsd
from lxml import etree
from tqdm import tqdm

NEWLINE_TOKEN = "<|n|>"
END_TOKEN = "<|endoftext|>"
PARAPHRASE_TOKEN = " ПРФРЗ: "
PARAGRAPH_END = "\n" + NEWLINE_TOKEN + "\n"
ARTICLE_END = "\n" + NEWLINE_TOKEN + "\n" + END_TOKEN + "\n"

root = etree.parse(
    r'C:\Users\kiva0319\IdeaProjects\hrdmd1803\Strong-Paraphrase-Generation-2020\raw_data\paraphrases.xml'
)
root = root.getroot()

non_negative_class_count = 0
with open(
        "C:/Users/kiva0319/IdeaProjects/hrdmd1803/Strong-Paraphrase-Generation-2020/processed/for_train"
        "/article_paraphrase_marked.txt",
        'w', encoding="utf-8") as outputFile:
    for element in tqdm(root[1]):
        id_1 = element[1].text
        id_2 = element[2].text
        clas = element[6].text
        text_1 = "none"
        text_2 = "none"
        if clas != '-1':
            non_negative_class_count += 1
def _jsonify_ace2005_instance(docid, base_path): instance = dict(docid=docid, mentions=[], relations=[]) with open(base_path + '.sgm', 'r') as f: sgm_content = f.read() # sgm_content = re.sub(r'\<[A-Z]+[.\n]*?\>', '', sgm_content, flags=re.M) # sgm_content = re.sub(r'\<\/[A-Z]+\>', '', sgm_content) # instance['content'] = sgm_content sgm_content = re.sub(r'\&', '\u039d', sgm_content) sgm_root = etree.fromstring(sgm_content) content = ''.join(sgm_root.itertext()) content = content.replace('\u039d', '&') # sgm_tree = etree.parse(base_path + '.sgm') # sgm_root = sgm_tree.getroot() instance['content'] = content apf_tree = etree.parse(base_path + '.apf.xml') apf_root = apf_tree.getroot() relation = [] for relation in apf_root.iterfind('.//relation'): relation_type = relation.get('TYPE') relation_subtype = relation.get('SUBTYPE') for relation_mention in relation.iterfind('./relation_mention'): relation_id = relation_mention.get('ID') relation_dict = dict(id=relation_id, type=relation_type, subtype=relation_subtype) for relation_mention_argument in relation_mention.iterfind( './relation_mention_argument'): mention_id = relation_mention_argument.get('REFID') charseq = relation_mention_argument.find( './extent/charseq') start_char = int(charseq.get('START')) end_char = int(charseq.get('END')) text = re.sub(r'\&([^a])', r'&\1', charseq.text) assert mention_id in [ 'BACONSREBELLION_20050226.1317-E39-74', 'BACONSREBELLION_20050226.1317-E38-73' ] or instance['content'][start_char:end_char + 1] == text mention_dict = dict(id=mention_id, start_char=start_char, end_char=end_char, text=text) entity_mention = apf_root.find( './/entity_mention[@ID="{}"]'.format(mention_id)) if entity_mention is not None: mention_dict['type'] = entity_mention.get('TYPE') mention_dict['role'] = entity_mention.get('ROLE') entity = entity_mention.getparent() mention_dict['entity_type'] = entity.get('TYPE') mention_dict['entity_subtype'] = entity.get('SUBTYPE') #end if instance['mentions'].append(mention_dict) # if instance['content'][start_char:end_char + 1] != text: # print(base_path, mention_id) # # print(instance['content']) # print('instance', instance['content'][start_char:end_char + 1]) # print('text', text) # #end if #end if role = relation_mention_argument.get('ROLE') m = re.match(r'^Arg\-(\d+)$', role) if m: i = int(m.group(1)) relation_dict['arg{}'.format(i)] = mention_id else: relation_dict[role] = mention_id #end for instance['relations'].append(relation_dict) #end for #end for return instance
def read_xml(file):
    xml = etree.parse(file)
    return xml
'''
Parse and modify XML
'''
from lxml import etree

if __name__ == "__main__":
    doc = etree.parse("pred.xml", parser=etree.XMLParser())
    root = doc.getroot()
    print(root)
    root.remove(root.find("sri"))
    root.remove(root.find("cr"))
    print(root.getchildren().index(root.find("nm")))
    e = etree.Element('spam')
    e.text = "this is a test"
    root.insert(2, e)
    doc.write("newpred.xml", xml_declaration=True)
def haka_attribute_parser(filename): """ Using CamelCase instead of regular underscore attribute names in element tree. """ parser = etree.XMLParser(ns_clean=True, remove_comments=True, remove_blank_text=True, resolve_entities=False, no_network=True) tree = etree.parse(filename, parser) root = tree.getroot() attribute_filter_policy_group = etree.Element( "AttributeFilterPolicyGroup", id="urn:mace:funet.fi:haka", nsmap={"xmlns": 'urn:mace:shibboleth:2.0:afp'}) attribute_filter_policy_group.attrib[ '{urn:mace:shibboleth:2.0:afp}basic'] = "urn:mace:shibboleth:2.0:afp:mf:basic" attribute_filter_policy_group.attrib[ '{urn:mace:shibboleth:2.0:afp}saml'] = "urn:mace:shibboleth:2.0:afp:mf:saml" attribute_filter_policy_group.attrib[ '{http://www.w3.org/2001/XMLSchema-instance}schemaLocation' ] = "urn:mace:shibboleth:2.0:afp classpath:/schema/shibboleth-2.0-afp.xsd " \ "urn:mace:shibboleth:2.0:afp:mf:basic " \ "classpath:/schema/shibboleth-2.0-afp-mf-basic.xsd " \ "urn:mace:shibboleth:2.0:afp:mf:saml " \ "classpath:/schema/shibboleth-2.0-afp-mf-saml.xsd" for a in root: entity_id = a.get("entityID") if entity_id: for b in a: if etree.QName(b.tag).localname == "SPSSODescriptor": attributes = [] for c in b: if etree.QName( c.tag ).localname == "AttributeConsumingService": for d in c: if etree.QName( d.tag ).localname == "RequestedAttribute": friendlyname = d.get("FriendlyName") name = d.get("Name") if friendlyname: attribute = Attribute.objects.filter( name=name).first() if not attribute: attribute = Attribute.objects.filter( friendlyname=friendlyname ).first() if attribute: attributes.append(attribute) else: print("Could not add attribute " + friendlyname + ", " + name + " for " + entity_id) if attributes: attribute_filter_policy = etree.SubElement( attribute_filter_policy_group, "AttributeFilterPolicy", id="haka-default-" + entity_id) policy_requirement_rule = etree.SubElement( attribute_filter_policy, "PolicyRequirementRule", value=entity_id) policy_requirement_rule.attrib[ '{http://www.w3.org/2001/XMLSchema-instance}type'] = "basic:AttributeRequesterString" for attribute in attributes: attribute_rule = etree.SubElement( attribute_filter_policy, "AttributeRule", attributeID=attribute.attributeid) permit_value_rule = etree.SubElement( attribute_rule, "PermitValueRule") permit_value_rule.attrib[ '{http://www.w3.org/2001/XMLSchema-instance}type'] = "basic:ANY" return etree.tostring(attribute_filter_policy_group, pretty_print=True, encoding='UTF-8')
def update_cpes(xml_file, cpe_vp_map, r7_vp_map):
    parser = etree.XMLParser(remove_comments=False, remove_blank_text=True)
    doc = etree.parse(xml_file, parser)

    for fingerprint in doc.xpath('//fingerprint'):
        # collect all the params, grouping by os and service params that could be used to compute a CPE
        params = {}
        for param in fingerprint.xpath('./param'):
            name = param.attrib['name']
            # remove any existing CPE params
            if re.match(r'^.*\.cpe\d{0,2}$', name):
                param.getparent().remove(param)
                continue
            match = re.search(r'^(?P<fp_type>hw|os|service(?:\.component)?)\.', name)
            if match:
                fp_type = match.group('fp_type')
                if not fp_type in params:
                    params[fp_type] = {}
                if name in params[fp_type]:
                    raise ValueError(
                        'Duplicated fingerprint named {} in fingerprint {} in file {}'
                        .format(name, fingerprint.attrib['pattern'], xml_file))
                params[fp_type][name] = param

        # for each of the applicable os/service param groups, build a CPE
        for fp_type in params:
            if fp_type == 'os':
                cpe_type = 'o'
            elif fp_type.startswith('service'):
                cpe_type = 'a'
            elif fp_type == 'hw':
                cpe_type = 'h'
            else:
                raise ValueError('Unhandled param type {}'.format(fp_type))

            # extract the vendor/product/version values from each os/service group,
            # using the static value ('Apache', for example) when pos is 0, and
            # otherwise use a value that contains interpolation markers such that
            # products/projects that use recog content can insert the value
            # extracted from the banner/other data via regex capturing groups
            fp_data = {
                'vendor': None,
                'product': None,
                'version': '-',
            }
            for fp_datum in fp_data:
                fp_datum_param_name = "{}.{}".format(fp_type, fp_datum)
                if fp_datum_param_name in params[fp_type]:
                    fp_datum_e = params[fp_type][fp_datum_param_name]
                    if fp_datum_e.attrib['pos'] == '0':
                        fp_data[fp_datum] = fp_datum_e.attrib['value']
                    else:
                        fp_data[fp_datum] = "{{{}}}".format(fp_datum_e.attrib['name'])

            vendor = fp_data['vendor']
            product = fp_data['product']
            version = fp_data['version']

            # build a reasonable looking CPE value from the vendor/product/version,
            # lowercasing, replacing whitespace with _, and more
            if vendor and product:
                if not cpe_type in cpe_vp_map:
                    logging.error("Didn't find CPE type '%s' for '%s' '%s'", cpe_type, vendor, product)
                    continue

                vendor = vendor.lower().replace(' ', '_').replace(',', '')
                product = product.lower().replace(' ', '_').replace(',', '').replace('!', '%21')
                if 'unknown' in [vendor, product]:
                    continue

                if (vendor.startswith('{') and vendor.endswith('}')) or (
                        product.startswith('{') and product.endswith('}')):
                    continue

                success, vendor, product = lookup_cpe(vendor, product, cpe_type, cpe_vp_map, r7_vp_map)
                if not success:
                    continue

                # Sanity check the value to ensure that no invalid values will
                # slip in due to logic or mapping bugs.
                # If it's not in the official NIST list then log it and kick it out
                if product not in cpe_vp_map[cpe_type][vendor]:
                    logging.error(
                        "Invalid CPE type %s created for vendor %s and product %s. "
                        "This may be due to an invalid mapping.",
                        cpe_type, vendor, product)
                    continue

                # building the CPE string
                # Last minute escaping of '/' and `!`
                product = product.replace('/', '\/').replace('%21', '\!')
                cpe_value = 'cpe:/{}:{}:{}'.format(cpe_type, vendor, product)
                if version:
                    cpe_value += ":{}".format(version)

                cpe_param = etree.Element('param')
                cpe_param.attrib['pos'] = '0'
                cpe_param.attrib['name'] = '{}.cpe23'.format(fp_type)
                cpe_param.attrib['value'] = cpe_value

                for param_name in params[fp_type]:
                    param = params[fp_type][param_name]
                    parent = param.getparent()
                    index = parent.index(param) + 1
                    parent.insert(index, cpe_param)

    root = doc.getroot()
    with open(xml_file, 'wb') as xml_out:
        xml_out.write(etree.tostring(root, pretty_print=True,
                                     xml_declaration=True,
                                     encoding=doc.docinfo.encoding))
video_mxf_id = None
audio_mxf_id = None
reel_id = None

for r, d, f in os.walk('DCP'):
    for n in f:
        if n.endswith('cpl.xml'):
            cpl_id = n[0:-8]
        elif n.endswith('pkl.xml'):
            pkl_id = n[0:-8]

# (along the way, rename the CPL/PKL files)
os.rename('DCP/%s_cpl.xml' % cpl_id, 'DCP/%s_cpl.xml' % wanted_cpl_id)
os.rename('DCP/%s_pkl.xml' % pkl_id, 'DCP/%s_pkl.xml' % wanted_pkl_id)

xml = etree.parse('DCP/ASSETMAP.xml')
assetmap_id = xml.getroot().find('{%s}Id' % assetmap_namespace).text
assetmap_id = assetmap_id.replace('urn:uuid:', '')

def cpl_name(s):
    return '{%s}%s' % (cpl_namespace, s)

xml = etree.parse('DCP/%s_cpl.xml' % wanted_cpl_id)
video_mxf_id = xml.getroot().find(cpl_name('ReelList')). \
    find(cpl_name('Reel')). \
    find(cpl_name('AssetList')). \
    find(cpl_name('MainPicture')). \
    find(cpl_name('Id')).text
class Clone( TestCase, get_assert_pcs_effect_mixin( lambda cib: etree.tostring( # pylint:disable=undefined-variable etree.parse(cib).findall(".//resources")[0] ) ), ): # pylint: disable=too-many-public-methods empty_cib = rc("cib-empty.xml") def setUp(self): self.temp_cib = get_tmp_file("tier1_cib_resource_clone_unclone_clone") self.pcs_runner = PcsRunner(self.temp_cib.name) self.set_cib_file(FIXTURE_PRIMITIVE_FOR_CLONE) def tearDown(self): self.temp_cib.close() def set_cib_file(self, *xml_string_list): xml_manip = XmlManipulation.from_file(self.empty_cib) xml_manip.append_to_first_tag_name("resources", *xml_string_list) write_data_to_tmpfile(str(xml_manip), self.temp_cib) def test_clone(self): self.assert_effect( "resource clone C".split(), fixture_resources_xml(fixture_clone("C-clone", "C")), ) def test_clone_custom_id(self): self.assert_effect( "resource clone C CustomCloneId".split(), fixture_resources_xml(fixture_clone("CustomCloneId", "C")), ) def test_clone_id_increment(self): self.set_cib_file( fixture_clone("C-clone", "Dummy"), FIXTURE_PRIMITIVE_FOR_CLONE, ) self.assert_effect( "resource clone C".split(), fixture_resources_xml( fixture_clone("C-clone", "Dummy"), fixture_clone("C-clone-1", "C"), ), ) def test_clone_id_is_stonith(self): self.set_cib_file(FIXTURE_STONITH_FOR_CLONE) self.assert_pcs_fail( "resource clone fence-device".split(), fixture_clone_stonith_msg(), ) self.assert_resources_xml_in_cib( fixture_resources_xml(FIXTURE_STONITH_FOR_CLONE) ) def test_clone_id_is_stonith_forced(self): self.set_cib_file(FIXTURE_STONITH_FOR_CLONE) self.assert_effect( "resource clone fence-device --force".split(), fixture_resources_xml(FIXTURE_STONITH_CLONE), output=fixture_clone_stonith_msg(forced=True), ) def test_clone_group_with_stonith(self): self.set_cib_file(FIXTURE_GROUP_WITH_STONITH) self.assert_effect( "resource clone Group".split(), fixture_resources_xml(FIXTURE_CLONED_GROUP_WITH_STONITH), ) def test_clone_group_with_stonith_forced(self): self.set_cib_file(FIXTURE_GROUP_WITH_STONITH) self.assert_effect( "resource clone Group --force".split(), fixture_resources_xml(FIXTURE_CLONED_GROUP_WITH_STONITH), ) def test_promotable_clone(self): self.assert_effect( "resource promotable C".split(), fixture_resources_xml( fixture_clone("C-clone", "C", promotable=True) ), ) def test_promotable_clone_custom_id(self): self.assert_effect( "resource promotable C CustomPromotableId".split(), fixture_resources_xml( fixture_clone("CustomPromotableId", "C", promotable=True) ), ) def test_promotable_clone_id_is_stonith(self): self.set_cib_file(FIXTURE_STONITH_FOR_CLONE) self.assert_pcs_fail( "resource promotable fence-device".split(), fixture_clone_stonith_msg(), ) self.assert_resources_xml_in_cib( fixture_resources_xml(FIXTURE_STONITH_FOR_CLONE) ) def test_promotable_clone_id_is_stonith_forced(self): self.set_cib_file(FIXTURE_STONITH_FOR_CLONE) self.assert_effect( "resource promotable fence-device --force".split(), fixture_resources_xml(FIXTURE_STONITH_PROMOTABLE), output=fixture_clone_stonith_msg(forced=True), ) def test_promotable_keyword_and_option(self): self.assert_pcs_fail( "resource promotable C CustomCloneId promotable=false".split(), ( "Error: you cannot specify both promotable option and " "promotable keyword\n" ), ) self.assert_resources_xml_in_cib( fixture_resources_xml(FIXTURE_PRIMITIVE_FOR_CLONE) ) def test_clone_with_options(self): self.assert_effect( ( "resource clone C CustomCloneId globally-unique=true meta a=b " "c=d" ).split(), fixture_resources_xml(FIXTURE_CLONE_WITH_OPTIONS), ) 
def test_group_last_member(self): self.set_cib_file(FIXTURE_GROUP_LAST_MEMBER) self.assert_effect( "resource clone C".split(), fixture_resources_xml(fixture_clone("C-clone", "C")), ) def test_nonexistent_resource(self): self.assert_pcs_fail( "resource clone NonExistentClone".split(), "Error: unable to find group or resource: NonExistentClone\n", ) self.assert_resources_xml_in_cib( fixture_resources_xml(FIXTURE_PRIMITIVE_FOR_CLONE) ) def test_invalid_clone_id(self): self.assert_pcs_fail( "resource clone C 1invalid".split(), "Error: invalid id '1invalid'\n", ) self.assert_resources_xml_in_cib( fixture_resources_xml(FIXTURE_PRIMITIVE_FOR_CLONE) ) def test_clone_id_already_exist(self): self.assert_pcs_fail( "resource clone C C".split(), "Error: id 'C' already exists\n", ) self.assert_resources_xml_in_cib( fixture_resources_xml(FIXTURE_PRIMITIVE_FOR_CLONE) ) def test_group_already_cloned(self): self.set_cib_file(FIXTURE_CLONED_GROUP) self.assert_pcs_fail( "resource clone Group".split(), "Error: cannot clone a group that has already been cloned\n", ) self.assert_resources_xml_in_cib( fixture_resources_xml(FIXTURE_CLONED_GROUP) ) def test_already_a_clone_resource(self): self.set_cib_file(FIXTURE_CLONED_GROUP) self.assert_pcs_fail( "resource clone G1".split(), "Error: G1 is already a clone resource\n", ) self.assert_resources_xml_in_cib( fixture_resources_xml(FIXTURE_CLONED_GROUP) ) def test_bundle_resource(self): self.set_cib_file(FIXTURE_BUNDLE_RESOURCE) self.assert_pcs_fail( "resource clone Dummy".split(), "Error: cannot clone bundle resource\n", ) self.assert_resources_xml_in_cib( fixture_resources_xml(FIXTURE_BUNDLE_RESOURCE) )
import lxml.etree as ET
import sys
import os

if len(sys.argv) < 3:
    print 'usage: python xsl-transform.py <source-xml-file> <xslt-file> [destination-file]'
    sys.exit()

print 'Transforming ' + sys.argv[1] + ' ...'
dom = ET.parse(sys.argv[1])
xslt = ET.parse(sys.argv[2])
transform = ET.XSLT(xslt)
newdom = transform(dom)
str = ET.tostring(newdom, pretty_print=True)

if len(sys.argv) < 4:
    print(str)
else:
    text_file = open(sys.argv[3], "w")
    text_file.write(str)
    text_file.close()
    print os.stat(sys.argv[3]).st_size, ' Bytes'
print
print 'Done!'
from lxml import etree

html = etree.parse('./test.html', etree.HTMLParser())
result = etree.tostring(html)
print(result.decode('utf-8'))
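# A small follow-up sketch (assuming the ./test.html parsed above): the `html`
# tree returned by etree.parse() can be queried with XPath, for example to
# collect all link targets from the document.
links = html.xpath('//a/@href')  # all <a href="..."> attribute values
print(links)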
def add_attachment(anthology_id, path, attach_type, overwrite=False): """ Adds a single attachment to the Anthology data files. Arguments: - The ACL ID of the paper (e.g., P17-1012) - The path to the attachment (can be a URL) - The attachment type (poster, presentation, note, software) - Whether to overwrite the downloaded file. """ collection_id, volume_id, paper_id = deconstruct_anthology_id(anthology_id) if path.startswith("http"): _, input_file_path = tempfile.mkstemp() try: print( f"-> Downloading file from {path} to {input_file_path}", file=sys.stderr ) request = urllib.request.Request(path, headers={'User-Agent': 'Mozilla/5.0'}) with urllib.request.urlopen(request) as url, open( input_file_path, mode="wb" ) as input_file_fh: input_file_fh.write(url.read()) except ssl.SSLError: raise Exception(f"Could not download {path}") except Exception as e: raise e else: input_file_path = path file_extension = path.replace("?dl=1", "").split(".")[-1] # Many links from file sharing services are not informative and don't have # extensions, so we could try to guess. if file_extension not in ALLOWED_TYPES: detected = filetype.guess(input_file_path) if detected is not None: file_extension = detected.mime.split("/")[-1] if file_extension not in ALLOWED_TYPES: print( f"Could not determine file extension for {anthology_id} at {path}", file=sys.stderr, ) with open(input_file_path, "rb") as f: checksum = compute_hash(f.read()) # Update XML xml_file = os.path.join( os.path.dirname(sys.argv[0]), "..", "data", "xml", f"{collection_id}.xml" ) tree = ET.parse(xml_file) attachment_file_name = f"{anthology_id}.{attach_type}.{file_extension}" paper = tree.getroot().find(f"./volume[@id='{volume_id}']/paper[@id='{paper_id}']") if paper is not None: # Check if attachment already exists for attachment in paper.findall("attachment"): if attachment.text == attachment_file_name: print( f"-> attachment {attachment_file_name} already exists in the XML", file=sys.stderr, ) break else: attachment = ET.Element("attachment") attachment.attrib["type"] = attach_type.lower() attachment.attrib["hash"] = checksum attachment.text = attachment_file_name paper.append(attachment) indent(tree.getroot()) tree.write(xml_file, encoding="UTF-8", xml_declaration=True) print( f"-> added attachment {attachment_file_name} to the XML", file=sys.stderr ) else: print(f"Paper {anthology_id} not found in the Anthology", file=sys.stderr) # Make sure directory exists output_dir = os.path.join(args.attachment_root, collection_id[0], collection_id) if not os.path.exists(output_dir): # print(f"-> Creating directory {output_dir}", file=sys.stderr) os.makedirs(output_dir) # Copy file dest_path = os.path.join(output_dir, attachment_file_name) if os.path.exists(dest_path) and not overwrite: print( f"-> target file {dest_path} already in place, refusing to overwrite", file=sys.stderr, ) return None shutil.copy(input_file_path, dest_path) os.chmod(dest_path, 0o644) print(f"-> copied {input_file_path} to {dest_path} and fixed perms", file=sys.stderr) # Clean up if path.startswith("http"): os.remove(input_file_path) return dest_path
class Unclone( TestCase, get_assert_pcs_effect_mixin( lambda cib: etree.tostring( # pylint:disable=undefined-variable etree.parse(cib).findall(".//resources")[0] ) ), ): empty_cib = rc("cib-empty.xml") def assert_tags_xml(self, expected_xml): self.assert_resources_xml_in_cib( expected_xml, get_cib_part_func=lambda cib: etree.tostring( etree.parse(cib).findall(".//tags")[0], ), ) def assert_constraint_xml(self, expected_xml): self.assert_resources_xml_in_cib( expected_xml, get_cib_part_func=lambda cib: etree.tostring( etree.parse(cib).findall(".//constraints")[0], ), ) def setUp(self): # pylint: disable=invalid-name self.temp_cib = get_tmp_file("tier1_cib_resource_group_ungroup") self.pcs_runner = PcsRunner(self.temp_cib.name) xml_manip = XmlManipulation.from_file(self.empty_cib) xml_manip.append_to_first_tag_name( "resources", FIXTURE_CLONE, FIXTURE_DUMMY, ) xml_manip.append_to_first_tag_name( "configuration", FIXTURE_TAGS_CONFIG_XML, ) xml_manip.append_to_first_tag_name( "constraints", """ <rsc_location id="location-C-clone-rh7-1-INFINITY" node="rh7-1" rsc="C-clone" score="INFINITY"/> """, """ <rsc_location id="location-TagCloneOnly-rh7-1-INFINITY" node="rh7-1" rsc="TagCloneOnly" score="INFINITY"/> """, ) write_data_to_tmpfile(str(xml_manip), self.temp_cib) def tearDown(self): # pylint: disable=invalid-name self.temp_cib.close() def test_nonexistent_clone(self): self.assert_pcs_fail( "resource unclone NonExistentClone".split(), "Error: could not find resource: NonExistentClone\n", ) self.assert_resources_xml_in_cib(FIXTURE_CLONE_AND_RESOURCE) self.assert_tags_xml(FIXTURE_TAGS_CONFIG_XML) self.assert_constraint_xml(FIXTURE_CONSTRAINTS_CONFIG_XML) def test_not_clone_resource(self): self.assert_pcs_fail( "resource unclone Dummy".split(), "Error: 'Dummy' is not a clone resource\n", ) self.assert_resources_xml_in_cib(FIXTURE_CLONE_AND_RESOURCE) self.assert_tags_xml(FIXTURE_TAGS_CONFIG_XML) self.assert_constraint_xml(FIXTURE_CONSTRAINTS_CONFIG_XML) def test_unclone_clone_id(self): self.assert_effect( "resource unclone C-clone".split(), FIXTURE_RESOURCES ) self.assert_tags_xml(FIXTURE_TAGS_RESULT_XML) self.assert_constraint_xml("<constraints/>") def test_unclone_resoruce_id(self): self.assert_effect("resource unclone C".split(), FIXTURE_RESOURCES) self.assert_tags_xml(FIXTURE_TAGS_RESULT_XML) self.assert_constraint_xml("<constraints/>")
    'slug': '华南理工大学校园aaa',
    'seo_title': '',
    'search_description': '',
    'go_live_at': '',
    'expire_at': '',
    'action-publish': 'action-publish',
}
content = s.post(add_wikihome_url, data=data)

# ---------------- Parse and import the articles
from lxml import etree  # import the lxml library
from datetime import datetime

tree = etree.parse(r'D:/谷歌下载/wordpress.2017-04-13.xml')  # parse the XML into a tree
root = tree.getroot()  # get the root of the tree
ns = {'content': "http://purl.org/rss/1.0/modules/content/",
      'dc': 'http://purl.org/dc/elements/1.1/'}
posts = []
for i in root.find('channel').findall('item'):
    posts.append([
        i.find('title').text,
        str(
            datetime.strptime(
                i.find('pubDate').text,
                "%a, %d %b %Y %H:%M:%S %z"
            ).date()
        ),
        ','.join([c.text for c in i.findall('category') if c.text != '未分类']),
        i.findall('dc:creator', ns)[0].text,
        i.findall('content:encoded', ns)[0].text,
from lxml import etree as ET

# Open the XML file
stream = open('ejemplo.xml', 'r')
# Parse the data into an ElementTree object
xml = ET.parse(stream)
# Get the root tag of the XML
root = xml.getroot()
# Iterate over each branch of the root element
for e in root:
    # Print each element as a string
    print(ET.tostring(e))
    print("")
    # Print the Id attribute of each element
    print(e.get("Id"))
def main(): module = AnsibleModule(argument_spec=dict( path=dict(type='path', aliases=['dest', 'file']), xmlstring=dict(type='str'), xpath=dict(type='str', default='/'), namespaces=dict(type='dict', default={}), state=dict(type='str', default='present', choices=['absent', 'present'], aliases=['ensure']), value=dict(), attribute=dict(), add_children=dict(type='list'), set_children=dict(type='list'), count=dict(type='bool', default=False), print_match=dict(type='bool', default=False), pretty_print=dict(type='bool', default=False), content=dict(type='str', choices=['attribute', 'text']), input_type=dict(type='str', default='yaml', choices=['xml', 'yaml'])), supports_check_mode=True, mutually_exclusive=[ ['value', 'set_children'], ['value', 'add_children'], ['set_children', 'add_children'], ['path', 'xmlstring'], ['content', 'set_children'], ['content', 'add_children'], ['content', 'value'], ]) xml_file = module.params['path'] xml_string = module.params['xmlstring'] xpath = module.params['xpath'] namespaces = module.params['namespaces'] state = module.params['state'] value = json_dict_bytes_to_unicode(module.params['value']) attribute = module.params['attribute'] set_children = json_dict_bytes_to_unicode(module.params['set_children']) add_children = json_dict_bytes_to_unicode(module.params['add_children']) pretty_print = module.params['pretty_print'] content = module.params['content'] input_type = module.params['input_type'] print_match = module.params['print_match'] count = module.params['count'] # Check if we have lxml 2.3.0 or newer installed if not HAS_LXML: module.fail_json( msg= 'The xml ansible module requires the lxml python library installed on the managed machine' ) elif LooseVersion('.'.join( to_native(f) for f in etree.LXML_VERSION)) < LooseVersion('2.3.0'): module.fail_json( msg= 'The xml ansible module requires lxml 2.3.0 or newer installed on the managed machine' ) elif LooseVersion('.'.join( to_native(f) for f in etree.LXML_VERSION)) < LooseVersion('3.0.0'): module.warn( 'Using lxml version lower than 3.0.0 does not guarantee predictable element attribute order.' ) # Check if the file exists if xml_string: infile = BytesIO(to_bytes(xml_string, errors='surrogate_or_strict')) elif os.path.isfile(xml_file): infile = open(xml_file, 'rb') else: module.fail_json(msg="The target XML source '%s' does not exist." % xml_file) # Try to parse in the target XML file try: parser = etree.XMLParser(remove_blank_text=pretty_print) doc = etree.parse(infile, parser) except etree.XMLSyntaxError as e: module.fail_json(msg="Error while parsing path: %s" % e) if print_match: print_match(module, doc, xpath, namespaces) if count: count_nodes(module, doc, xpath, namespaces) if content == 'attribute': get_element_attr(module, doc, xpath, namespaces) elif content == 'text': get_element_text(module, doc, xpath, namespaces) # module.fail_json(msg="OK. Well, etree parsed the xml file...") # module.exit_json(what_did={"foo": "bar"}, changed=True) # File exists: if state == 'absent': # - absent: delete xpath target delete_xpath_target(module, doc, xpath, namespaces) # Exit # - present: carry on # children && value both set?: should have already aborted by now # add_children && set_children both set?: should have already aborted by now # set_children set? if set_children: set_target_children(module, doc, xpath, namespaces, set_children, input_type) # add_children set? 
if add_children: add_target_children(module, doc, xpath, namespaces, add_children, input_type) # No?: Carry on # Is the xpath target an attribute selector? if value is not None: set_target(module, doc, xpath, namespaces, attribute, value) # Format the xml only? if pretty_print: pretty(module, doc) ensure_xpath_exists(module, doc, xpath, namespaces)
def main(): ## --- arguments parser = argparse.ArgumentParser( description="ArcGet: retrieve imaging data from XNAT") group = parser.add_mutually_exclusive_group(required=True) group.add_argument("--host", "-host", required=False, help="XNAT URL") group.add_argument("-a", "--alias", required=False, help="XNAT config file alias") parser.add_argument( "-l", "--legacy", required=False, action="store_true", help="Return legacy XNAT 1.4 zipfile and directory structure") parser.add_argument( "--new-structure", required=False, default=False, action="store_true", help= "Don't create SessionID/RAW directory, output files in current working directory or --out-dir" ) # parser.add_argument("--clean-names", required=False, action="store_true", help="Make all file have the SESSIONID_series_SERIESNUM_file_FILENUM.dcm format") parser.add_argument("-u", "--username", required=False, help="XNAT username") parser.add_argument("-p", "--password", required=False, help="XNAT password") parser.add_argument("-s", "--session-label", action="append", required=False, dest="session_labels", help="MR Session label") parser.add_argument( "-r", "--raw-types", required=False, default="ALL", help="raw scan types or numbers e.g. 1,MEMPRAGE,21,22,DSI") parser.add_argument("-o", "--out-dir", default=".", required=False, help="output directory") parser.add_argument( '-q', "--quiet", nargs=0, action=ArgParseSubAction, dest='quiet', help="Decrease verbosity by 1. Can be used several times.") parser.add_argument( '-v', "--verbose", nargs=0, action=ArgParseAddAction, dest='verbose', help="Increase verbosity by 1. Can be used several times.") parser.add_argument('--zip64', required=False, action="store_true", help="Use Zip64 extensions when creating zip archives") parser.add_argument( "-W", "--no-warn", required=False, action="store_true", help= "Don't show me the annoying warning about downloading everything again. I like wasting bandwidth." ) parser.add_argument( "--show-all", action='store_true', dest='show_all', help="Show some information on all available sessions and exit.") (args, sessions) = parser.parse_known_args() if (args.password and not args.legacy): error( "DO NOT put passwords on the command line unless absolutely necessary!! 
--password only allowed with --legacy" ) # print "=================================================================" # print "Before:" # for arg in args: # print arg + "='"+str(args[arg])+"'" # print "=================================================================" ## --- read username and password from XNAT config file config_file = os.path.expanduser("~/.xnat_auth") if (not os.path.isfile(config_file)): info("No config file found: " + config_file) if (args.alias): error("You cannot specify an --alias without a config file") else: info("Reading config file: " + config_file, verbosity_level=3) xml = etree.parse(os.path.expanduser(config_file)) if (args.alias): if (not args.alias.isalnum()): error("--alias must be alphanumeric", parser=parser) ## --- get host args.host = xml.xpath("/xnat/" + args.alias + "/url/text()") if (args.host): args.host = args.host[0] ## --- get username if (not args.username): args.username = xml.xpath("/xnat/" + args.alias + "/username/text()") if (args.username): args.username = args.username[0] ## --- get password args.password = xml.xpath("/xnat/" + args.alias + "/password/text()") if (args.password): args.password = args.password[0] elif (args.host): ## --- get username if (not args.username): args.username = xml.xpath("/xnat/*[url='" + args.host + "']/username/text()") if (args.username): args.username = args.username[0] ## --- get password if (not args.password): args.password = xml.xpath("/xnat/*[url='" + args.host + "']/password/text()") if (args.password): args.password = args.password[0] ## --- prompt for host, username, password if necessary if (sys.stdin.isatty()): if (not args.host): args.host = raw_input("Enter host: ") if (not args.username): args.username = raw_input("Enter username: ") if (not args.password): args.password = getpass.getpass("Enter password: ") # getpass assumed here if (not args.host): error("Could not retrieve a host from config file or command line") if (not args.username): error("Could not retrieve a username from config file or command line") if (not args.password): error("Could not retrieve a password from config file or command line") ## --- strip any slashes from right side of host args.host = str(args.host).rstrip("/") if not os.path.exists(args.out_dir): os.makedirs(args.out_dir) args.out_dir = os.path.abspath(args.out_dir) info("Saving output to '" + args.out_dir + "'") if args.session_labels is None: args.session_labels = [] if len(sessions) > 0: args.session_labels.extend(sessions) # print str(args) # sys.exit() # log(str(args)) # print "=================================================================" # print "After:" # for arg in args: # print arg + "='"+str(args[arg])+"'" # print "=================================================================" # sys.exit() arcget = ArcGet(args) if args.show_all: arcget.getShowAllSessionInfo() sys.exit() sessions = arcget.getSessionInfoByLabels(args.session_labels) for session in sessions: subject = arcget.getSubjectInfoById(session['subject_id']) if "ALL" == args.raw_types: if not args.no_warn: warn( "+-------------------------------------------------------------------+", verbosity_level=-1) warn( "| --==>> WARNING: READ CAREFULLY <<==-- |", verbosity_level=-1) warn( "+-------------------------------------------------------------------+", verbosity_level=-1) warn( "| By not specifying which scans/series to download from the session |", verbosity_level=-1) warn( "| you will be downloading EVERYTHING, including report files, text |", verbosity_level=-1) warn( "| files, pictures, and EVERY SINGLE scan. 
If you don't REALLY NEED |", verbosity_level=-1) warn( "| it all you are saying that you really do want to waste EVERYONE's |", verbosity_level=-1) warn( "| space, processing power, time, and slow down XNAT, the cluster |", verbosity_level=-1) warn( "| etc, etc. So DON'T DO IT. Use the -r or --raw-types option, for |", verbosity_level=-1) warn( "| example, to get the first, third and all BOLD scans, use: |", verbosity_level=-1) warn( "| --raw-types 1,3,BOLD |", verbosity_level=-1) warn( "+-------------------------------------------------------------------+", verbosity_level=-1) session['subject'] = subject series_list = arcget.getSeriesInfoBySession(session) if verbosity > 0: arcget.outputSessionDetails(session) if (args.legacy): info("Creating legacy XNAT zipfile") arcget.downloadSeriesToZipFile(session) else: info("Getting Data...") arcget.downloadSeriesToDir(session)
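The ~/.xnat_auth lookup above boils down to a few XPath text() queries; a small self-contained sketch follows, with a made-up auth document (alias 'central', dummy credentials) standing in for the real config file.

from lxml import etree

# Made-up ~/.xnat_auth document mirroring the structure the lookups above expect.
auth = etree.fromstring(
    '<xnat><central><url>https://central.xnat.org</url>'
    '<username>demo</username><password>not-a-real-password</password></central></xnat>')
host = auth.xpath('/xnat/central/url/text()')
username = auth.xpath('/xnat/central/username/text()')
print(host[0] if host else None, username[0] if username else None)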
from lxml import etree

text = '''
<div>
<url>
<li class="item-0"><a href="link1.html">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link1.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</url>
</div>
'''
html = etree.HTML(text)        # etree.HTML() builds an XPath-capable parse tree from the text
result = etree.tostring(html)  # tostring() outputs the corrected (well-formed) HTML
print(result.decode('utf-8'))

You can also parse a text file directly:

from lxml import etree
html = etree.parse('./test.html', etree.HTMLParser())
result = etree.tostring(html)
print(result.decode('utf-8'))

text = '''
<div>
<url>
<li class="item-0"><a href="link1.html">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link1.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</url>
</div>
'''
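A minimal follow-up sketch: once the sample above is parsed, XPath can pull values straight out of it; the expression below (the link text of every li) is just an example.

from lxml import etree

html = etree.HTML(text)
print(html.xpath('//li/a/text()'))  # ['first item', 'second item', 'third item', 'fourth item', 'fifth item']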
from lxml import etree
import sys
import csv

if len(sys.argv) != 3:
    print "Require tdd_file and output_file name parameters"
    sys.exit()

tdd = etree.parse(open(sys.argv[1]))
csvf = csv.writer(open(sys.argv[2], "wb"))
csvf.writerow(['code', 'value'])

root = tdd.getroot()
# Get the default namespace
tddns = root.nsmap[None]
# Create a namespace map using this
nsm = {'tdd': tddns}

# Select all the codes for the point_description_code attribute
for v in root.findall('.//tdd:attribute[@name="point_description_code"]/tdd:values/tdd:value', nsm):
    csvf.writerow([v.attrib['text'], v.attrib['description']])
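The nsmap[None] trick above is what makes findall() work against a document with a default namespace; a self-contained sketch with an invented urn:example:tdd document shows the same pattern.

from lxml import etree

# Toy document with a default namespace (the URI is invented for the example).
doc = etree.fromstring(
    '<tdd xmlns="urn:example:tdd">'
    '<attribute name="point_description_code"><values>'
    '<value text="A1" description="Alpha"/>'
    '</values></attribute></tdd>')
# ElementPath has no syntax for the default (None-prefixed) namespace,
# so bind it to an explicit prefix before calling findall().
nsm = {'tdd': doc.nsmap[None]}
for v in doc.findall('.//tdd:attribute[@name="point_description_code"]/tdd:values/tdd:value', nsm):
    print(v.get('text'), v.get('description'))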
from lxml import etree

parser = etree.HTMLParser()
tree = etree.parse("app.html", parser)

name_xpath_1 = '/html/body/div[1]/div[7]/div[4]/div[1]/div[2]/div[2]/div[2]/div/div[3]/text()'
name_xpath_2 = '/html/body/div[1]/div[7]/div[4]/div[1]/div[2]/div[1]/div[2]/div/div[3]/text()'

name_1 = tree.xpath(name_xpath_1)
name_2 = tree.xpath(name_xpath_2)

print(name_1)
print(type(name_1))
print(name_2)
print(type(name_2))
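For comparison, a relative, attribute-based expression is usually more robust than absolute positional paths like those above; the class name used here is purely illustrative.

from lxml import etree

parser = etree.HTMLParser()
tree = etree.parse("app.html", parser)
# Match on an attribute rather than on a fixed chain of div indexes.
names = tree.xpath('//div[contains(@class, "profile-name")]/text()')
print(names)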
def __init__(self, ontology_path): self._ontology_path = ontology_path self._ontology_xml = ET.parse(ontology_path).getroot()
def checkForPreconfiguredXML(self): ret = None xmlFilePath = os.path.join( self.unit.currentPath.replace( "%sharedPath%", django_settings.SHARED_DIRECTORY, 1) + "/", django_settings.PROCESSING_XML_FILE) if os.path.isfile(xmlFilePath): # For a list of items with pks: # SELECT TasksConfigs.description, choiceAvailableAtLink, ' ' AS 'SPACE', MicroServiceChains.description, chainAvailable FROM MicroServiceChainChoice Join MicroServiceChains on MicroServiceChainChoice.chainAvailable = MicroServiceChains.pk Join MicroServiceChainLinks on MicroServiceChainLinks.pk = MicroServiceChainChoice.choiceAvailableAtLink Join TasksConfigs on TasksConfigs.pk = MicroServiceChainLinks.currentTask ORDER BY choiceAvailableAtLink desc; try: this_choice_point = choice_unifier.get(self.jobChainLink.pk, self.jobChainLink.pk) tree = etree.parse(xmlFilePath) root = tree.getroot() for preconfiguredChoice in root.findall( ".//preconfiguredChoice"): if preconfiguredChoice.find( "appliesTo").text == this_choice_point: desiredChoice = preconfiguredChoice.find( "goToChain").text desiredChoice = choice_unifier.get( desiredChoice, desiredChoice) try: link = self.jobChainLink.workflow.get_link( this_choice_point) except KeyError: return for replacement in link.config["replacements"]: if replacement["id"] == desiredChoice: # In our JSON-encoded document, the items in # the replacements are not wrapped, do it here. # Needed by ReplacementDict. ret = self._format_items(replacement["items"]) break else: return try: # <delay unitAtime="yes">30</delay> delayXML = preconfiguredChoice.find("delay") unitAtimeXML = None if delayXML: unitAtimeXML = delayXML.get("unitCtime") if unitAtimeXML is not None and unitAtimeXML.lower( ) != "no": delaySeconds = int(delayXML.text) unitTime = os.path.getmtime( self.unit.currentPath.replace( "%sharedPath%", django_settings.SHARED_DIRECTORY, 1)) nowTime = time.time() timeDifference = nowTime - unitTime timeToGo = delaySeconds - timeDifference LOGGER.info('Time to go: %s', timeToGo) self.jobChainLink.setExitMessage( "Waiting till: " + datetime.datetime.fromtimestamp( (nowTime + timeToGo)).ctime()) rd = ReplacementDict(ret) if self.jobChainLink.passVar is not None: if isinstance(self.jobChainLink.passVar, ReplacementDict): new = {} new.update( self.jobChainLink.passVar.dic) new.update(rd.dic) rd.dic = new t = threading.Timer( timeToGo, self.jobChainLink.linkProcessingComplete, args=[0, rd], kwargs={}) t.daemon = True t.start() t2 = threading.Timer( timeToGo, self.jobChainLink.setExitMessage, args=[Job.STATUS_COMPLETED_SUCCESSFULLY], kwargs={}) t2.start() return waitingOnTimer except Exception: LOGGER.info('Error parsing XML', exc_info=True) except Exception: LOGGER.warning( 'Error parsing xml at %s for pre-configured choice', xmlFilePath, exc_info=True) return ret
if __name__ == "__main__": import_root = sys.argv[1] if not os.path.isdir(import_root): print "import path does not exist or is not a directory" sys.exit(1) export_root = sys.argv[2] if not os.path.isdir(export_root): print "export path does not exist or is not a directory" sys.exit(1) for xliff_path in glob.glob(import_root + "/*/firefox-ios.xliff"): print "Exporting", xliff_path with open(xliff_path) as fp: tree = etree.parse(fp) root = tree.getroot() # Make sure there are <file> nodes in this xliff file. file_nodes = root.xpath("//x:file", namespaces=NS) if len(file_nodes) == 0: print " ERROR: No translated files. Skipping." continue # Take the target language from the first <file>. Not sure if that # is a bug in the XLIFF, but in some files only the first node has # the target-language set. target_language = file_nodes[0].get('target-language') if not target_language: print " ERROR: Missing target-language. Skipping." continue
def _create_resource(self, package, output_files): """ Given a package, create an Atom resource entry to send to LOCKSS. Parses metadata for the Atom entry from the METS file, uses LOCKSS-o-matic-specific tags to describe size and checksums. """ # Parse METS to get information for atom entry relative_mets_path = os.path.join( os.path.splitext(os.path.basename(package.current_path))[0], "data", 'METS.{}.xml'.format(package.uuid)) (mets_path, temp_dir) = package.extract_file(relative_mets_path) mets = etree.parse(mets_path) # Delete temp dir if created if os.path.exists(temp_dir): shutil.rmtree(temp_dir) # Parse out name and description if found slug = str(package.uuid) title = os.path.basename(package.current_path) summary = 'AIP generated by Archivematica with uuid {}'.format( package.uuid) dublincore = mets.find( 'mets:dmdSec/mets:mdWrap[@MDTYPE="DC"]/mets:xmlData/dcterms:dublincore', namespaces=utils.NSMAP) if dublincore is not None: title = dublincore.findtext('dcterms:title', namespaces=utils.NSMAP, default=title) slug = dublincore.findtext('dcterms:title', namespaces=utils.NSMAP, default=slug) summary = dublincore.findtext('dcterms:description', namespaces=utils.NSMAP, default=summary) # Parse out Agent for author authors = mets.xpath( ".//mets:mdWrap[@MDTYPE='PREMIS:AGENT']//mets:agentType[text()='organization']/ancestor::mets:agent/*/mets:agentIdentifierValue", namespaces=utils.NSMAP) author = authors[0].text if authors else None # Create atom entry entry = sword2.Entry(title=title, id='urn:uuid:' + package.uuid, author={'name': author}, summary=summary) # Add each chunk to the atom entry if not self.pointer_root: self.pointer_root = etree.parse(package.full_pointer_file_path) entry.register_namespace('lom', utils.NSMAP['lom']) for index, file_path in enumerate(output_files): # Get external URL if len(output_files) == 1: external_url = self._download_url(package.uuid) else: external_url = self._download_url(package.uuid, index + 1) # Get checksum and size from pointer file (or generate if not found) file_e = self.pointer_root.find( ".//mets:fileGrp[@USE='LOCKSS chunk']/mets:file[@ID='{}']". format(os.path.basename(file_path)), namespaces=utils.NSMAP) if file_e is not None: checksum_name = file_e.get('CHECKSUMTYPE') checksum_value = file_e.get('CHECKSUM') size = int(file_e.get('SIZE')) else: # Not split, generate try: checksum = utils.generate_checksum(file_path, self.checksum_type) except ValueError: # Invalid checksum type checksum = utils.generate_checksum(file_path, 'md5') checksum_name = checksum.name.upper().replace('SHA', 'SHA-') checksum_value = checksum.hexdigest() size = os.path.getsize(file_path) # Convert size to kB size = str(math.ceil(size / 1000)) # Add new content entry and values entry.add_field('lom_content', external_url) content_entry = entry.entry[-1] content_entry.set('size', size) content_entry.set('checksumType', checksum_name) content_entry.set('checksumValue', checksum_value) LOGGER.debug('LOCKSS atom entry: %s', entry) return entry, slug
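The Dublin Core lookups above lean on findtext() returning a supplied default when the element is missing; a self-contained sketch (sample XML invented, dcterms URI is the standard Dublin Core terms namespace) shows the behaviour.

from lxml import etree

NSMAP = {'dcterms': 'http://purl.org/dc/terms/'}
doc = etree.fromstring(
    '<dublincore xmlns:dcterms="http://purl.org/dc/terms/">'
    '<dcterms:title>Example AIP</dcterms:title></dublincore>')
# findtext() returns the element text, or the supplied default when the element is absent.
title = doc.findtext('dcterms:title', default='Untitled', namespaces=NSMAP)
summary = doc.findtext('dcterms:description', default='No description', namespaces=NSMAP)
print(title, summary)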
def GetTransectTimesIDX(filename, code): doc2 = etree.parse(filename) distance_list = doc2.find('distance_list') old_stop = '' Start = 0 End = [] transect_IDX = [] transect_i = 1 # new_start = False start_time = [] # log_start = [] stop_time = [] # lat_start = [] # lon_start = [] # lat_stop = [] # lon_stop = [] for i in distance_list: start_time = i.get('start_time').replace(' ', 'T').replace('-', '').replace( ':', '') # log_start = i.get('log_start') stop_time = i.find('stop_time').text.replace(' ', 'T').replace( '-', '').replace(':', '') # lat_start = i.find('lat_start').text # lon_start = i.find('lon_start').text # lat_stop = i.find('lat_stop').text # lon_stop = i.find('lon_start').text # # return start_time,log_start,stop_time,lat_start,lat_stop,lon_start,lon_stop # print(start_time) # print(log_start,lon_stop) # if transect_i < 10: trnsID = '00' + str(transect_i) elif transect_i < 100: trnsID = '0' + str(transect_i) else: trnsID = str(transect_i) if old_stop != start_time: End = np.hstack((End, old_stop)) Start = np.hstack((Start, start_time)) transect_IDX = np.hstack((transect_IDX, code + '_' + trnsID)) transect_i = transect_i + 1 if Start == 0: Start = start_time transect_IDX = np.hstack((transect_IDX, code + '_' + trnsID)) transect_i = transect_i + 1 old_stop = stop_time #add last time End = np.hstack((End, stop_time)) TimeIDX = np.vstack((transect_IDX.T, Start[1:].T, End[1:].T)).T return TimeIDX
def update_package_status(self, package): """ Poll LOM for SWORD statement and update status from response. Query the state_iri for this package and parse it for the server states. If all are in agreement, add those URLs to the pointer file for each LOCKSS chunk. """ status = package.status # Need to have state and edit IRI to talk to LOM if 'state_iri' not in package.misc_attributes or 'edit_iri' not in package.misc_attributes: self.post_move_from_storage_service(None, None, package) # After retry - verify that state & edit IRI exist now if 'state_iri' not in package.misc_attributes or 'edit_iri' not in package.misc_attributes: return (None, _('Unable to contact Lockss-o-matic')) if not self.sword_connection and not self.update_service_document(): return (None, _('Error contacting LOCKSS-o-matic.')) # SWORD2 client has only experimental support for getting SWORD2 # statements, so implementing the fetch and parse here. (March 2014) response = self.sword_connection.get_resource( package.misc_attributes['state_iri'], headers={'Accept': 'application/atom+xml;type=feed'}) if response.code != 200: return (None, _('Error polling LOCKSS-o-matic for SWORD statement.')) statement_root = etree.fromstring(response.content) # TODO Check that number of lom:content entries is same as number of chunks # TODO what to do if was quorum, and now not?? # Package not safely stored, return immediately servers = statement_root.findall('.//lom:server', namespaces=utils.NSMAP) LOGGER.info('All states are agreement: %s', all(s.get('state') == 'agreement' for s in servers)) if not all(s.get('state') == 'agreement' for s in servers): # TODO update pointer file for new failed status? return (status, _('LOCKSS servers not in agreement')) status = Package.UPLOADED # Add LOCKSS URLs to each chunk if not self.pointer_root: self.pointer_root = etree.parse(package.full_pointer_file_path) files = self.pointer_root.findall( ".//mets:fileSec/mets:fileGrp[@USE='LOCKSS chunk']/mets:file", namespaces=utils.NSMAP) # If not files, find AIP fileGrp (package unsplit) if not files: files = self.pointer_root.findall( ".//mets:fileSec/mets:fileGrp[@USE='Archival Information Package']/mets:file", namespaces=utils.NSMAP) # Add new FLocat elements for each LOCKSS URL to each file element for index, file_e in enumerate(files): LOGGER.debug('file element: %s', etree.tostring(file_e, pretty_print=True)) if len(files) == 1: lom_id = self._download_url(package.uuid) else: lom_id = self._download_url(package.uuid, index + 1) LOGGER.debug('LOM id: %s', lom_id) lom_servers = statement_root.find( ".//lom:content[@id='{}']/lom:serverlist".format(lom_id), namespaces=utils.NSMAP) LOGGER.debug('lom_servers: %s', lom_servers) # Remove existing LOCKSS URLs, if they exist for old_url in file_e.findall("mets:FLocat[@LOCTYPE='URL']", namespaces=utils.NSMAP): file_e.remove(old_url) # Add URLs from SWORD statement for server in lom_servers: # TODO check that size and checksum are the same # TODO what to do if size & checksum different? 
LOGGER.debug('LOM URL: %s', server.get('src')) flocat = etree.SubElement(file_e, utils.PREFIX_NS['mets'] + 'FLocat', LOCTYPE="URL") flocat.set(utils.PREFIX_NS['xlink'] + 'href', server.get('src')) # Delete local files # Note: This will tell LOCKSS to stop harvesting, even if the file was # not split, and will not be deleted locally lom_content = statement_root.findall('.//lom:content', namespaces=utils.NSMAP) delete_lom_ids = [e.get('id') for e in lom_content] error = self._delete_update_lom(package, delete_lom_ids) if error is None: self._delete_files() LOGGER.info('update_package_status: new status: %s', status) # Write out pointer file again with open(package.full_pointer_file_path, 'w') as f: f.write( etree.tostring(self.pointer_root, pretty_print=True, xml_declaration=True, encoding='utf-8')) # Update value if different package.status = status package.save() return (status, error)
def __init__(self, file, test=None): parser = le.XMLParser(resolve_entities=False, huge_tree=True) w3scan = le.parse(file, parser) root = w3scan.getroot() dupes = {} for vulnerability in root.findall("vulnerability"): name = vulnerability.attrib["name"] severity = vulnerability.attrib["severity"] description = "%s are:\n\n" % vulnerability.find( "description").text.split("are:")[0] transactions = vulnerability.find("http-transactions") if transactions is not None: transactions = transactions.findall("http-transaction") for transaction in transactions: request = transaction.find("http-request") response = transaction.find("http-response") status = request.find("status").text.split(" ") response_code = response.find("status").text.split(" ")[1] http_method = status[0] request_url = status[1] data = "" for part in [request, response]: headers = [ f"{h.attrib['field']} -> {h.attrib['content']}" for h in part.find("headers").findall("header") ] headers = "\n".join(headers) request_body = part.find("body") if request_body.attrib['content-encoding'] == "base64": if request_body.text: request_body = base64.b64decode( request_body.text).decode("utf-8", errors="ignore") else: request_body = "" else: request_body = request_body.text if request_body.text else "" if not data: data = f"Request: {request_url} {http_method} {response_code} \n\n" else: data += "Response: \n" data += f"Headers: {headers}\n\nBody:{request_body}\n\n" dupe_url = urlparse(request_url) # Creating dupe path ned to think on more intelligent implementation dupe_path = dupe_url.path[:dupe_url.path.index( "%")] if "%" in dupe_url.path else dupe_url.path dupe_path = dupe_path[:dupe_path.index( "+")] if "+" in dupe_path else dupe_path dupe_path = dupe_path[:dupe_path.index( ".")] if "." in dupe_path else dupe_path dupe_path = dupe_path[:dupe_path.rindex( "/")] if "/" in dupe_path else dupe_path dupe_url = f"{dupe_url.scheme}://{dupe_url.netloc}{dupe_path}" dupe_code = f"{str(response_code)[0]}xx" dupe_key = hashlib.md5( f"{name} {dupe_url} {http_method} {dupe_code}".encode( 'utf-8')).hexdigest() if dupe_key not in dupes: dupes[dupe_key] = Finding( title=f"{name} {dupe_url} {dupe_code}", tool='W3AF', test=test, description=description, severity=severity, numerical_severity=Finding.get_numerical_severity( severity), references=data, dynamic_finding=True) elif data not in dupes[dupe_key].finding['references']: dupes[dupe_key].finding['references'] += data if request_url not in dupes[dupe_key].unsaved_endpoints: dupes[dupe_key].finding[ 'description'] += f"- {request_url}\n\n" dupes[dupe_key].unsaved_endpoints.append(request_url) self.items = dupes.values() print(len(self.items))
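The body handling above keys off a content-encoding attribute and base64-decodes when needed; a minimal sketch with an invented sample body illustrates the rule.

import base64
from lxml import etree

# Invented sample transaction body; "SGVsbG8gd29ybGQ=" is base64 for "Hello world".
body = etree.fromstring('<body content-encoding="base64">SGVsbG8gd29ybGQ=</body>')
if body.get('content-encoding') == 'base64':
    text = base64.b64decode(body.text).decode('utf-8', errors='ignore') if body.text else ''
else:
    text = body.text or ''
print(text)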
# parse from file
from lxml import etree
from io import StringIO, BytesIO

file_name = 'simple.html'
fhtml = open(file_name)

parser = etree.HTMLParser()
tree = etree.parse(fhtml, parser)

result = etree.tostring(tree.getroot(), pretty_print=True, method='html')
print(result)
def _split_package(self, package): """ Splits the package into chunks of size self.au_size. Returns list of paths to the chunks. If the package has already been split (and an event is in the pointer file), returns the list if file paths from the pointer file. Updates the pointer file with the new LOCKSS chunks, and adds 'division' event. """ # Parse pointer file if not self.pointer_root: self.pointer_root = etree.parse(package.full_pointer_file_path) # Check if file is already split, and if so just return split files if self.pointer_root.xpath('.//premis:eventType[text()="division"]', namespaces=utils.NSMAP): chunks = self.pointer_root.findall( ".//mets:div[@TYPE='Archival Information Package']/mets:div[@TYPE='LOCKSS chunk']", namespaces=utils.NSMAP) output_files = [ c.find('mets:fptr', namespaces=utils.NSMAP).get('FILEID') for c in chunks ] return output_files file_path = package.full_path expected_num_files = math.ceil( os.path.getsize(file_path) / self.au_size) LOGGER.debug('expected_num_files: %s', expected_num_files) # No split needed - just return the file path if expected_num_files <= 1: LOGGER.debug('Only one file expected, not splitting') output_files = [file_path] # No events or structMap changes needed LOGGER.info('LOCKSS: after splitting: %s', output_files) return output_files # Split file # Strip extension, add .tar-1 ('-1' to make rename script happy) output_path = os.path.splitext(file_path)[0] + '.tar-1' command = [ 'tar', '--create', '--multi-volume', '--tape-length', str(self.au_size), '--new-volume-script', 'common/tar_new_volume.sh', '-f', output_path, file_path ] # TODO reserve space in quota for extra files LOGGER.info('LOCKSS split command: %s', command) try: subprocess.check_call(command) except Exception: LOGGER.exception("Split of %s failed with command %s", file_path, command) raise output_path = output_path[:-2] # Remove '-1' dirname, basename = os.path.split(output_path) output_files = sorted([ os.path.join(dirname, entry) for entry in os.listdir(dirname) if entry.startswith(basename) ]) # Update pointer file amdsec = self.pointer_root.find('mets:amdSec', namespaces=utils.NSMAP) # Add 'division' PREMIS:EVENT try: event_detail = subprocess.check_output(['tar', '--version']) except subprocess.CalledProcessError as e: event_detail = e.output or _( 'Error: getting tool info; probably GNU tar') utils.mets_add_event( amdsec, event_type='division', event_detail=event_detail, event_outcome_detail_note='{} LOCKSS chunks created'.format( len(output_files)), ) # Update structMap & fileSec self.pointer_root.find('mets:structMap', namespaces=utils.NSMAP).set('TYPE', 'logical') aip_div = self.pointer_root.find( "mets:structMap/mets:div[@TYPE='Archival Information Package']", namespaces=utils.NSMAP) filesec = self.pointer_root.find('mets:fileSec', namespaces=utils.NSMAP) filegrp = etree.SubElement(filesec, utils.PREFIX_NS['mets'] + 'fileGrp', USE='LOCKSS chunk') # Move ftpr to Local copy div local_ftpr = aip_div.find('mets:fptr', namespaces=utils.NSMAP) if local_ftpr is not None: div = etree.SubElement(aip_div, utils.PREFIX_NS['mets'] + 'div', TYPE='Local copy') div.append(local_ftpr) # This moves local_fptr # Add each split chunk to structMap & fileSec for idx, out_path in enumerate(output_files): # Add div to structMap div = etree.SubElement(aip_div, utils.PREFIX_NS['mets'] + 'div', TYPE='LOCKSS chunk', ORDER=str(idx + 1)) etree.SubElement(div, utils.PREFIX_NS['mets'] + 'fptr', FILEID=os.path.basename(out_path)) # Get checksum and size for fileSec try: checksum = 
utils.generate_checksum(out_path, self.checksum_type) except ValueError: # Invalid checksum type checksum = utils.generate_checksum(out_path, 'md5') checksum_name = checksum.name.upper().replace('SHA', 'SHA-') size = os.path.getsize(out_path) # Add file & FLocat to fileSec file_e = etree.SubElement(filegrp, utils.PREFIX_NS['mets'] + 'file', ID=os.path.basename(out_path), SIZE=str(size), CHECKSUM=checksum.hexdigest(), CHECKSUMTYPE=checksum_name) flocat = etree.SubElement(file_e, utils.PREFIX_NS['mets'] + 'FLocat', OTHERLOCTYPE="SYSTEM", LOCTYPE="OTHER") flocat.set(utils.NSMAP['xlink'] + 'href', out_path) # Write out pointer file again with open(package.full_pointer_file_path, 'w') as f: f.write( etree.tostring(self.pointer_root, pretty_print=True, xml_declaration=True, encoding='utf-8')) return output_files
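The pointer-file updates above build namespaced METS elements and xlink attributes; here is a small standalone sketch of the same construction pattern, using the real METS and XLink namespace URIs but invented IDs, sizes, and paths.

from lxml import etree

NS = {'mets': 'http://www.loc.gov/METS/', 'xlink': 'http://www.w3.org/1999/xlink'}
filegrp = etree.Element('{%s}fileGrp' % NS['mets'], nsmap=NS, USE='LOCKSS chunk')
file_e = etree.SubElement(filegrp, '{%s}file' % NS['mets'],
                          ID='chunk-1.tar', SIZE='1024', CHECKSUMTYPE='MD5',
                          CHECKSUM='d41d8cd98f00b204e9800998ecf8427e')
flocat = etree.SubElement(file_e, '{%s}FLocat' % NS['mets'],
                          LOCTYPE='OTHER', OTHERLOCTYPE='SYSTEM')
flocat.set('{%s}href' % NS['xlink'], '/tmp/chunk-1.tar')
print(etree.tostring(filegrp, pretty_print=True).decode('utf-8'))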
def readXML(file): root = etree.parse("xml/"+file+".xml"); conf = [] site ={} searchParameters = [] searchTags =[] allReviewTags =[] reviewHeading =[] reviewText =[] reviewUpvote =[] reviewNextPage =[] reviewStarRating = [] tempDict = {} attr = {} prevTag ='' for element in root.getiterator(): #print element.tag, element.items(), element.text if(element.tag == "site") : if(site) : site['searchTags'] = searchTags site['allReviewTags'] = allReviewTags site['reviewHeading'] = reviewHeading site['reviewText'] = reviewText site['reviewUpvote'] = reviewUpvote site['reviewNextPage'] = reviewNextPage site['reviewStarRating'] = reviewStarRating searchTags =[] allReviewTags =[] reviewHeading =[] reviewText =[] reviewUpvote =[] reviewNextPage =[] reviewStarRating = [] tempDict = {} attr = {} conf.append(site) site = {} elif(element.tag == "name"): site['name'] = element.text.strip() elif(element.tag == "prefix"): site['prefix'] = '' if element.text: site['prefix'] = element.text.strip() elif(element.tag == "searchURL"): site['searchURL'] = element.text.strip() elif(element.tag == "searchParameters"): site['searchParameters'] = searchParameters #CHANGE HERE elif(element.tag == "searchTags"): prevTag = "searchTags" elif(element.tag == "reviewStarRating"): prevTag = "reviewStarRating" elif(element.tag == "allReviewTags"): prevTag = "allReviewTags" elif(element.tag == "reviewHeading"): prevTag = "reviewHeading" elif(element.tag == "reviewText"): prevTag = "reviewText" elif(element.tag == "reviewUpvote"): prevTag = "reviewUpvote" elif(element.tag == "reviewNextPage"): prevTag = "reviewNextPage" elif(element.tag == "filter"): tempDict = {} elif(element.tag == "attributes"): if(element.text): tempStr = element.text.strip().split('\'') attr[tempStr[1]] = tempStr[3] tempDict['attributes'] = attr attr ={} else : tempDict['attributes'] = {} elif(element.tag == "recursive"): if(element.text): if element.text == 'True': tempDict['recursive'] = True else: tempDict['recursive'] = False else : tempDict['recursive'] = True elif(element.tag == "tag"): if(element.text): tempDict['tag'] = element.text.strip() else : tempDict['tag'] = '' if(prevTag == "searchTags") : searchTags.append(tempDict) elif(prevTag == "allReviewTags") : allReviewTags.append(tempDict) elif(prevTag == "reviewHeading") : reviewHeading.append(tempDict) elif(prevTag == "reviewText") : reviewText.append(tempDict) elif(prevTag == "reviewUpvote") : reviewUpvote.append(tempDict) elif(prevTag == "reviewNextPage") : reviewNextPage.append(tempDict) elif(prevTag == "reviewStarRating") : reviewStarRating.append(tempDict) site['searchTags'] = searchTags site['allReviewTags'] = allReviewTags site['reviewHeading'] = reviewHeading site['reviewText'] = reviewText site['reviewUpvote'] = reviewUpvote site['reviewNextPage'] = reviewNextPage site['reviewStarRating'] = reviewStarRating conf.append(site) return conf
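A side note on traversal: getiterator() as used above still works but is deprecated in recent lxml releases; iter() walks the same document order, as this toy sketch (invented config document) shows.

from lxml import etree

root = etree.fromstring('<conf><site><name>example</name><prefix/></site></conf>')
for element in root.iter():
    print(element.tag, element.items(), (element.text or '').strip())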
def main(): module = AnsibleModule( argument_spec=dict( path=dict(type='path', aliases=['dest', 'file']), xmlstring=dict(type='str'), xpath=dict(type='str'), namespaces=dict(type='dict', default={}), state=dict(type='str', default='present', choices=['absent', 'present'], aliases=['ensure']), value=dict(type='raw'), attribute=dict(type='raw'), add_children=dict(type='list'), set_children=dict(type='list'), count=dict(type='bool', default=False), print_match=dict(type='bool', default=False), pretty_print=dict(type='bool', default=False), content=dict(type='str', choices=['attribute', 'text']), input_type=dict(type='str', default='yaml', choices=['xml', 'yaml']), backup=dict(type='bool', default=False), strip_cdata_tags=dict(type='bool', default=False), insertbefore=dict(type='bool', default=False), insertafter=dict(type='bool', default=False), ), supports_check_mode=True, required_by=dict( add_children=['xpath'], # TODO: Reinstate this in Ansible v2.12 when we have deprecated the incorrect use below # attribute=['value'], content=['xpath'], set_children=['xpath'], value=['xpath'], ), required_if=[ ['count', True, ['xpath']], ['print_match', True, ['xpath']], ['insertbefore', True, ['xpath']], ['insertafter', True, ['xpath']], ], required_one_of=[ ['path', 'xmlstring'], ['add_children', 'content', 'count', 'pretty_print', 'print_match', 'set_children', 'value'], ], mutually_exclusive=[ ['add_children', 'content', 'count', 'print_match', 'set_children', 'value'], ['path', 'xmlstring'], ['insertbefore', 'insertafter'], ], ) xml_file = module.params['path'] xml_string = module.params['xmlstring'] xpath = module.params['xpath'] namespaces = module.params['namespaces'] state = module.params['state'] value = json_dict_bytes_to_unicode(module.params['value']) attribute = module.params['attribute'] set_children = json_dict_bytes_to_unicode(module.params['set_children']) add_children = json_dict_bytes_to_unicode(module.params['add_children']) pretty_print = module.params['pretty_print'] content = module.params['content'] input_type = module.params['input_type'] print_match = module.params['print_match'] count = module.params['count'] backup = module.params['backup'] strip_cdata_tags = module.params['strip_cdata_tags'] insertbefore = module.params['insertbefore'] insertafter = module.params['insertafter'] # Check if we have lxml 2.3.0 or newer installed if not HAS_LXML: module.fail_json(msg=missing_required_lib("lxml"), exception=LXML_IMP_ERR) elif LooseVersion('.'.join(to_native(f) for f in etree.LXML_VERSION)) < LooseVersion('2.3.0'): module.fail_json(msg='The xml ansible module requires lxml 2.3.0 or newer installed on the managed machine') elif LooseVersion('.'.join(to_native(f) for f in etree.LXML_VERSION)) < LooseVersion('3.0.0'): module.warn('Using lxml version lower than 3.0.0 does not guarantee predictable element attribute order.') # Report wrongly used attribute parameter when using content=attribute # TODO: Remove this in Ansible v2.12 (and reinstate strict parameter test above) and remove the integration test example if content == 'attribute' and attribute is not None: module.deprecate("Parameter 'attribute=%s' is ignored when using 'content=attribute' only 'xpath' is used. Please remove entry." % attribute, '2.12') # Check if the file exists if xml_string: infile = BytesIO(to_bytes(xml_string, errors='surrogate_or_strict')) elif os.path.isfile(xml_file): infile = open(xml_file, 'rb') else: module.fail_json(msg="The target XML source '%s' does not exist." 
% xml_file) # Parse and evaluate xpath expression if xpath is not None: try: etree.XPath(xpath) except etree.XPathSyntaxError as e: module.fail_json(msg="Syntax error in xpath expression: %s (%s)" % (xpath, e)) except etree.XPathEvalError as e: module.fail_json(msg="Evaluation error in xpath expression: %s (%s)" % (xpath, e)) # Try to parse in the target XML file try: parser = etree.XMLParser(remove_blank_text=pretty_print, strip_cdata=strip_cdata_tags) doc = etree.parse(infile, parser) except etree.XMLSyntaxError as e: module.fail_json(msg="Error while parsing document: %s (%s)" % (xml_file or 'xml_string', e)) # Ensure we have the original copy to compare global orig_doc orig_doc = copy.deepcopy(doc) if print_match: do_print_match(module, doc, xpath, namespaces) if count: count_nodes(module, doc, xpath, namespaces) if content == 'attribute': get_element_attr(module, doc, xpath, namespaces) elif content == 'text': get_element_text(module, doc, xpath, namespaces) # File exists: if state == 'absent': # - absent: delete xpath target delete_xpath_target(module, doc, xpath, namespaces) # - present: carry on # children && value both set?: should have already aborted by now # add_children && set_children both set?: should have already aborted by now # set_children set? if set_children: set_target_children(module, doc, xpath, namespaces, set_children, input_type) # add_children set? if add_children: add_target_children(module, doc, xpath, namespaces, add_children, input_type, insertbefore, insertafter) # No?: Carry on # Is the xpath target an attribute selector? if value is not None: set_target(module, doc, xpath, namespaces, attribute, value) # If an xpath was provided, we need to do something with the data if xpath is not None: ensure_xpath_exists(module, doc, xpath, namespaces) # Otherwise only reformat the xml data? if pretty_print: make_pretty(module, doc) module.fail_json(msg="Don't know what to do")