def test_find_missing(self):
    fp = etree.parse(StringIO(b'<a>\n<b>bar</b>\n</a>'))
    xml = stylechecker.XML(fp)
    xml.xmlschema = etree.XMLSchema(etree.parse(sample_xsd))
    # missing elements fallback to the root element
    self.assertEquals(xml.find_element('c', 2), fp.getroot())
def test_example1(self):
    testdir = path(l2emod.__file__).parent / 'testtex'
    fn = testdir / 'example1.tex'
    print "file %s" % fn
    with make_temp_directory() as tmdir:
        nfn = '%s/%s' % (tmdir, fn.basename())
        os.system('cp %s/* %s' % (testdir, tmdir))
        os.chdir(tmdir)
        l2e = latex2edx(nfn, output_dir=tmdir)
        l2e.convert()
        xbfn = nfn[:-4] + '.xbundle'
        self.assertTrue(os.path.exists(xbfn))
        # xb = open(xbfn).read()
        # self.assertIn('<chapter display_name="Unit 1" start="2013-11-22" url_name="Unit_1">', xb)
        xml = etree.parse(xbfn).getroot()
        chapter = xml.find('.//chapter')
        self.assertTrue(chapter.get('display_name') == 'Unit 1')
        self.assertTrue(chapter.get('start') == '2013-11-22')
        self.assertTrue(chapter.get('url_name') == 'Unit_1')
        cfn = path(tmdir) / 'course/2013_Fall.xml'
        self.assertTrue(os.path.exists(cfn))
        cfn = path(tmdir) / 'chapter/Unit_1.xml'
        self.assertTrue(os.path.exists(cfn))
        # self.assertIn('<sequential display_name="Introduction" due="2013-11-22" url_name="Introduction"', open(cfn).read())
        xml = etree.parse(cfn).getroot()
        seq = xml.find('.//sequential')
        self.assertTrue(seq.get('display_name') == 'Introduction')
        self.assertTrue(seq.get('due') == '2013-11-22')
        self.assertTrue(seq.get('url_name') == 'Introduction')
        self.assertIn('<problem url_name="p1"/>', open(cfn).read())
def transform_file(request, shortkey, *args, **kwargs):
    res = hydroshare.get_resource_by_shortkey(shortkey)
    if res.reference_type == 'soap':
        client = Client(res.url)
        response = client.service.GetValues(':' + res.data_site_code, ':' + res.variable_code, '', '', '')
    elif res.reference_type == 'rest':
        r = requests.get(res.url)
        response = str(r.text)
    waterml_1 = etree.XML(response)
    wml_string = etree.tostring(waterml_1)
    s = StringIO(wml_string)
    dom = etree.parse(s)
    module_dir = os.path.dirname(__file__)
    xsl_location = os.path.join(module_dir, "static/ref_ts/xslt/WaterML1_1_timeSeries_to_WaterML2.xsl")
    xslt = etree.parse(xsl_location)
    transform = etree.XSLT(xslt)
    newdom = transform(dom)
    d = datetime.date.today()
    date = '{0}_{1}_{2}'.format(d.month, d.day, d.year)
    xml_name = '{0}-{1}-{2}'.format(res.title.replace(" ", ""), date, 'wml_2_0.xml')
    with open(xml_name, 'wb') as f:
        f.write(newdom)
    xml_file = open(xml_name, 'r')
    ResourceFile.objects.filter(object_id=res.pk, resource_file__contains='wml_2_0').delete()
    hydroshare.add_resource_files(res.short_id, xml_file)
    f = ResourceFile.objects.filter(object_id=res.pk, resource_file__contains='wml_2_0')[0].resource_file
    data = {
        'status_code': 200,
        'xml_name': xml_name,
        'xml_size': f.size,
        'xml_link': f.url
    }
    os.remove(xml_name)
    # print(etree.tostring(newdom, pretty_print=True))
    return json_or_jsonp(request, data)
def test_find_missing_without_fallback(self):
    fp = etree.parse(StringIO(b'<a>\n<b>bar</b>\n</a>'))
    xml = stylechecker.XML(fp)
    xml.xmlschema = etree.XMLSchema(etree.parse(sample_xsd))
    # with fallback disabled, missing elements raise ValueError
    self.assertRaises(ValueError, lambda: xml.find_element('c', 2, fallback=False))
def studio_submit(self, submissions, suffix=''):
    xml_config = submissions['xml_config']
    try:
        etree.parse(StringIO(xml_config))
    except etree.XMLSyntaxError as e:
        response = {
            'result': 'error',
            'message': e.message
        }
    else:
        response = {
            'result': 'success',
        }
        self.xml_config = xml_config
        self.display_name = submissions['display_name']
        self.content_id = submissions['content_id']
        self.transcript_file_id = submissions['transcript_file_id']
        self.transcript_project_id = submissions['transcript_project_id']
        self.enable_player_token = submissions['enable_player_token']
        self.partner_code = submissions['partner_code']
        self.api_key = submissions['api_key']
        self.api_secret_key = submissions['api_secret_key']
        self.expiration_time = submissions['expiration_time']
    return response
def _load_nested_xml(cls, filename, xml_node_filename_map):
    """ Load the XML, including all referenced Include files.

    We also populate a dictionary, ``xml_node_filename_map``, which maps each
    node to the name of the file that it was originally in, so that when we
    load single components from a file, which are hierarchical and contain
    references to other components, we can find the components that were in
    the file specified.
    """
    if filename[:5] == "https":
        # lxml only supports http and ftp
        doc = etree.parse(urlopen(filename))
    else:
        doc = etree.parse(filename)
    # Store the source filenames of all the nodes:
    for node in doc.getroot().getiterator():
        xml_node_filename_map[node] = filename
    root = doc.getroot()
    if root.nsmap[None] != nineml_namespace:
        errmsg = ("The XML namespace is not compatible with this version "
                  "of the NineML library. Expected {}, file contains {}")
        raise Exception(errmsg.format(nineml_namespace, root.nsmap[None]))
    # Recursively load Include nodes:
    for include_element in root.getiterator(tag=NINEML + 'Include'):
        cls._load_include(include_element=include_element,
                          basedir=os.path.dirname(filename),
                          xml_node_filename_map=xml_node_filename_map)
    return root
def __build_lxml(target, source, env):
    """
    General XSLT builder (HTML/FO), using the lxml module.
    """
    from lxml import etree
    xslt_ac = etree.XSLTAccessControl(read_file=True,
                                      write_file=True,
                                      create_dir=True,
                                      read_network=False,
                                      write_network=False)
    xsl_style = env.subst('$DOCBOOK_XSL')
    xsl_tree = etree.parse(xsl_style)
    transform = etree.XSLT(xsl_tree, access_control=xslt_ac)
    doc = etree.parse(str(source[0]))
    # Support for additional parameters
    parampass = {}
    if parampass:
        result = transform(doc, **parampass)
    else:
        result = transform(doc)
    try:
        of = open(str(target[0]), "w")
        of.write(etree.tostring(result, pretty_print=True))
        of.close()
    except:
        pass
    return None
def make_tests(self):
    if self.url.startswith('http'):
        self.tree = etree.parse(urllib.request.urlopen(self.url))
    else:
        self.tree = etree.parse(open(self.url))
    self.passes = 0
    self.total = 0
    text = None
    for e in self.tree.getroot().getiterator():
        if e.tag == self.ns + "title":
            self.title = e.text
        if e.tag == self.ns + "revision":
            self.revision = e[0].text
        if e.tag == self.ns + "text":
            text = e.text
    if not text:
        raise AttributeError("No text element?")
    self.tests = defaultdict(OrderedDict)
    rtests = text.split('\n')
    rtests = [self.wrg.search(j) for j in rtests if self.wrg.search(j)]
    for i in rtests:
        test = i.group(1).split('|')
        if len(test) < 3:
            continue
        comment = None
        if len(test) >= 3:
            lang, left, right = test[0:3]
            if not left.endswith('.'):
                left += '[_].'
        if len(test) >= 4:
            comment = test[3].strip()
        self.tests[lang.strip()][left.strip()] = [right.strip(), comment]
    self.out = StringIO()
def validate_StationXML(path_or_object):
    """
    Checks if the given path is a valid StationXML file.

    Returns a tuple. The first item is a boolean describing if the validation
    was successful or not. The second item is a list of all found validation
    errors, if existent.

    :path_or_object: Filename or file-like object. Can also be an etree
        element.
    """
    # Get the schema location.
    schema_location = os.path.dirname(inspect.getfile(inspect.currentframe()))
    schema_location = os.path.join(schema_location, "docs", "fdsn-station-1.0.xsd")
    xmlschema = etree.XMLSchema(etree.parse(schema_location))
    if isinstance(path_or_object, etree._Element):
        xmldoc = path_or_object
    else:
        try:
            xmldoc = etree.parse(path_or_object)
        except etree.XMLSyntaxError:
            return (False, ("Not a XML file.",))
    valid = xmlschema.validate(xmldoc)
    # Pretty error printing if the validation fails.
    if valid is not True:
        return (False, xmlschema.error_log)
    return (True, ())
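# A brief usage sketch for the validator above. The file name "station.xml" is
# hypothetical; the function returns a (bool, errors) tuple as described in its
# docstring.
is_valid, errors = validate_StationXML("station.xml")
if not is_valid:
    for error in errors:
        print(error)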
def getdates():
    print "1) Output all unitdates to a csv"
    print "2) Output all unitdates to a csv that do not have a normal attribute or are not 'undated'"
    choice = raw_input("Enter a number: ")
    path = 'Real_Masters_all'
    if choice == "1":
        outfile = raw_input("Enter a filename for the csv: ")
        for filename in os.listdir(path):
            tree = etree.parse(join(path, filename))
            d = tree.xpath('//unitdate')
            for i in d:
                with open(outfile + '.csv', 'ab') as csvfile:
                    writer = csv.writer(csvfile, dialect='excel')
                    writer.writerow([filename, tree.getpath(i), i.text])
            print filename
        print outfile + '.csv complete'
    elif choice == "2":
        outfile = raw_input("Enter a filename for the csv: ")
        for filename in os.listdir(path):
            tree = etree.parse(join(path, filename))
            d = tree.xpath('//unitdate')
            for i in d:
                # yyyy = re.compile('^[\d]{4}s?$')
                # yyyy_yyyy = re.compile('^[\d]{4}s?[-][\d]{4}s?$')
                undated = re.compile('^[Uu]ndated$')
                if not undated.match(i.text) and not 'normal' in i.attrib:
                    with open(outfile + '.csv', 'ab') as csvfile:
                        writer = csv.writer(csvfile, dialect='excel')
                        writer.writerow([filename, tree.getpath(i), i.text])
            print filename
        print outfile + '.csv complete'
def verify(self, trusted_certs=None, schema=None, trusted_certs_required=True):
    if not self.xml:
        self.decode()
    # validate against RelaxNG schema
    if HAVELXML and not self.legacy:
        if schema and os.path.exists(schema):
            tree = etree.parse(StringIO(self.xml))
            schema_doc = etree.parse(schema)
            xmlschema = etree.XMLSchema(schema_doc)
            if not xmlschema.validate(tree):
                error = xmlschema.error_log.last_error
                message = "%s: %s (line %s)" % (self.get_summary_tostring(), error.message, error.line)
                raise CredentialNotVerifiable(message)
    if trusted_certs_required and trusted_certs is None:
        trusted_certs = []
    # trusted_cert_objects = [GID(filename=f) for f in trusted_certs]
    trusted_cert_objects = []
    ok_trusted_certs = []
    # If caller explicitly passed in None that means skip cert chain validation.
    # Strange and not typical
    if trusted_certs is not None:
        for f in trusted_certs:
            try:
                # Failures here include unreadable files
                # or non PEM files
                trusted_cert_objects.append(GID(filename=f))
                ok_trusted_certs.append(f)
            except Exception, exc:
                logger.error("Failed to load trusted cert from %s: %r" % (f, exc))
        trusted_certs = ok_trusted_certs
def studio_submit(self, submissions, suffix=''):
    self.display_name = submissions['display_name']
    xml_content = submissions['data']
    max_score = submissions['max_score']
    if not max_score:
        # empty = default
        max_score = 1
    else:
        try:
            # not an integer, then default
            max_score = int(max_score)
        except:
            max_score = 1
    self.weight = max_score
    try:
        etree.parse(StringIO(xml_content))
        self.data = xml_content
    except etree.XMLSyntaxError as e:
        return {
            'result': 'error',
            'message': e.message
        }
    return {
        'result': 'success',
    }
def getextents():
    print "1) Output only collection level extents to a csv"
    print "2) Output only component level extents to a csv"
    choice = raw_input("Enter a number: ")
    path = 'Real_Masters_all'
    if choice == "1":
        outfile = raw_input("Enter a filename for the csv: ")
        for filename in os.listdir(path):
            tree = etree.parse(join(path, filename))
            e = tree.xpath('//ead/archdesc/did//physdesc/extent')
            for e in e:
                extent = e.text or "EMPTY EXTENT"
                extentpath = tree.getpath(e)
                with open(outfile + '.csv', 'ab') as csvfile:
                    writer = csv.writer(csvfile, dialect='excel')
                    writer.writerow([filename, extentpath, extent])
                csvfile.close()
            print filename
        print outfile + '.csv complete'
    elif choice == "2":
        outfile = raw_input("Enter a filename for the csv: ")
        for filename in os.listdir(path):
            tree = etree.parse(join(path, filename))
            e = tree.xpath('//dsc//did//extent')
            for e in e:
                extent = e.text or "EMPTY EXTENT"
                extentpath = tree.getpath(e)
                with open(outfile + '.csv', 'ab') as csvfile:
                    writer = csv.writer(csvfile, dialect='excel')
                    writer.writerow([filename, extentpath, extent])
                csvfile.close()
            print filename
        print outfile + '.csv complete'
def xml_assert_equal(expected, actual, max_lines=1000, normalizer=None, ignore_blank_text=True):
    # Transform both documents into an element tree if strings were passed in
    if isinstance(expected, (str, unicode)):
        expected = etree.parse(StringIO(expected))
    if isinstance(actual, (str, unicode)):
        actual = etree.parse(StringIO(actual))
    # Create a canonical representation of both documents
    if normalizer is not None:
        expected = normalizer(expected)
        actual = normalizer(actual)
    expected = xml_as_canonical_string(expected, remove_blank_text=ignore_blank_text)
    actual = xml_as_canonical_string(actual, remove_blank_text=ignore_blank_text)
    # Then, compute a unified diff from there
    diff = difflib.unified_diff(expected, actual, fromfile='expected.xml', tofile='actual.xml')
    # Print the discrepancies out in unified diff format
    had_differences = False
    line_counter = 0
    for line in diff:
        sys.stdout.write(line)
        had_differences = True
        line_counter += 1
        if line_counter == max_lines:
            sys.stdout.write('<unified diff abbreviated for clarity\'s sake, more lines still to come>')
            break
    if had_differences:
        raise AssertionError('Expected and actual XML seem to differ')
def testSetupSolrConfig(self): solrDataDir = join(self.tempdir, 'solr-data') self._createServer(stateDir=solrDataDir, port=8042, config={'core1': {}, 'córë2': {}}) self.assertEquals(set(['lib', 'contexts', 'cores', 'start.config', 'solr.xml', 'etc', 'resources']), set(listdir(solrDataDir))) self.assertEquals(set(['webdefault.xml', 'jetty.xml']), set(listdir(join(solrDataDir, 'etc')))) jetty_xml = parse(open(join(solrDataDir, 'etc', 'jetty.xml'))) self.assertEquals(['8042'], jetty_xml.xpath('//SystemProperty[@name="jetty.port"]/@default')) f = open(join(solrDataDir, 'start.config')) for line in f: if line.startswith('jetty.home'): break else: self.fail("No jetty.home line found") f.close() self.assertEquals('jetty.home=%s\n' % solrDataDir, line) self.assertTrue('jetty.lib=/usr/share/java/solr4.5.1' in open(join(solrDataDir, 'start.config')).read()) context_solr_xml = parse(open(join(solrDataDir, 'contexts', 'solr.xml'))) self.assertEquals(['/usr/share/java/webapps/solr-%s.war' % version], context_solr_xml.xpath('//Set[@name="war"]/text()')) self.assertEquals(set(['core1', 'córë2']), set(listdir(join(solrDataDir, 'cores')))) solr_xml = parse(open(join(solrDataDir, 'solr.xml'))) self.assertEquals(set([u'córë2', 'core1']), set(solr_xml.xpath("//core/@name"))) self.assertEquals(set(['cores/core1', u'cores/córë2']), set(solr_xml.xpath("//core/@instanceDir"))) schema_core1_xml = parse(open(join(solrDataDir, 'cores', 'core1', 'conf', 'schema.xml'))) self.assertEquals(['meresco-core1'], schema_core1_xml.xpath("/schema/@name")) schema_core2_xml = parse(open(join(solrDataDir, 'cores', 'córë2', 'conf', 'schema.xml'))) self.assertEquals(['meresco-córë2'], schema_core2_xml.xpath("/schema/@name"))
def test_run(self):
    print "Entering test run"
    # xml config
    config_file = os.path.join(self.config_location, "test_config.xml")
    # xsd
    xsd_file = os.path.join(self.config_location, "test_xsd.xsd")
    # load xml and xsd
    logger.log_status("Loading XML file: {0}".format(config_file))
    library = open(config_file)
    logger.log_status("Loading XSD file: {0}".format(xsd_file))
    schema = open(xsd_file)
    # create object instance of xsd for xml validation
    xmlschema_doc = etree.parse(schema)
    xmlschema = etree.XMLSchema(xmlschema_doc)
    # parsing xml file
    library_doc = etree.parse(library)
    logger.log_status("Validating XML")
    result = xmlschema.validate(library_doc)
    self.assertTrue(result == True)
    logger.log_status("Leaving test run")
def chopit(xmlfile, outfile=OUTFILE, xmltag=WP_TAG, chunksize=CHUNKSIZE):
    parser = etree.XMLParser(resolve_entities=False, encoding="utf-8", strip_cdata=False)
    doc = etree.parse(xmlfile, parser)
    matches = doc.xpath(xmltag)
    print "Found %s blog posts!" % len(matches)
    matcheslist = split_seq(matches, chunksize)
    channel = doc.getroot().find("channel")
    # Create an empty wordpress xml file
    for e in matches:
        channel.remove(e)
    doc.write(TMPFILE, encoding="utf-8", method="xml", pretty_print=True)
    # Now, create smaller wordpress xml files
    ctr = len(matcheslist)
    print "Breaking WordPress XML into %s smaller files." % ctr
    for entities in matcheslist:
        doc = etree.parse(TMPFILE)
        channel = doc.getroot().find("channel")
        for entity in entities:
            channel.append(entity)
        output = "%s%03d.xml" % (outfile, ctr)
        doc.write(output, encoding="utf-8", method="xml", pretty_print=True)
        print " - File %s has %s posts." % (output, len(entities))
        ctr -= 1
    print "Done!"
def __init__(self, plugin, sprite_name, size=None): super(GenreConfiguredSpriteSheet, self).__init__(plugin, sprite_name, size) self.genre_alternate = {} # contains GenreType tuples self._alt_icons = {} self._sprite_name = sprite_name self._size = size popups = rb.find_plugin_file(plugin, 'img/popups.xml') root = ET.parse(open(popups)).getroot() self._parse_popups(plugin, root, self.GENRE_SYSTEM) try: # self._user_popups = RB.find_user_data_file('plugins/coverart_browser/img/usericons/popups.xml') self._user_popups = RB.user_cache_dir() + "/coverart_browser/usericons/popups.xml" root = ET.parse(open(self._user_popups)).getroot() self._parse_popups(plugin, root, self.GENRE_USER) elem = root.xpath(self._sprite_name + '/index') curr_index = int(elem[0].text) for index in range(0, curr_index + 1): key = RB.ExtDBKey.create_lookup('icon', str(index)) icon_location = self._genre_db.lookup(key) sprite = GdkPixbuf.Pixbuf.new_from_file(icon_location) if self._size: sprite = sprite.scale_simple(self._size[0], self._size[1], GdkPixbuf.InterpType.BILINEAR) self._alt_icons[str(index)] = sprite self.names.append(str(index)) except: pass
def _is_xml_valid(self, path):
    xml_doc = etree.parse(path)
    # test that the doc matches the schema
    schema_path = os.path.join(helpers.SCHEMA_DIR, xml.NRML_SCHEMA_FILE)
    xmlschema = etree.XMLSchema(etree.parse(schema_path))
    xmlschema.assertValid(xml_doc)
def _validate_sc3ml(path_or_object, verbose=False):
    """
    Validates a SC3ML file against the SC3ML 0.9 schema. Returns either True
    or False.

    :param path_or_object: File name or file like object. Can also be an
        etree element.
    :type verbose: bool
    :param verbose: Print error log if True.
    """
    # Get the schema location.
    schema_location = os.path.join(os.path.dirname(__file__), 'data', 'sc3ml_0.9.xsd')
    xmlschema = etree.XMLSchema(etree.parse(schema_location))
    if isinstance(path_or_object, etree._Element):
        xmldoc = path_or_object
    else:
        try:
            xmldoc = etree.parse(path_or_object)
        except etree.XMLSyntaxError:
            if verbose:
                print('Not an XML file')
            return False
    valid = xmlschema.validate(xmldoc)
    # Pretty error printing if the validation fails.
    if verbose and valid is not True:
        print("Error validating SC3ML file:")
        for entry in xmlschema.error_log:
            print("\t%s" % entry)
    return valid
def studio_submit(self, submissions, suffix=''):
    '''
    Save studio edits
    '''
    self.display_name = submissions['display_name']
    self.weight = self._get_natural_number(submissions['weight'])
    max_attempts = self._get_natural_number(submissions['max_attempts'])
    if max_attempts > 0:
        self.max_attempts = max_attempts
    self.your_answer_label = submissions['your_answer_label']
    self.our_answer_label = submissions['our_answer_label']
    self.submit_button_label = submissions['submit_button_label']
    xml_content = submissions['data']
    try:
        etree.parse(StringIO(xml_content))
        self.question_string = xml_content
    except etree.XMLSyntaxError as e:
        return {
            'result': 'error',
            'message': e.message,
        }
    return {
        'result': 'success',
    }
def beautify_book(root, f): from lib.epubqfix import pack_epub from lib.epubqfix import unpack_epub from lib.epubqfix import clean_temp from lib.epubqfix import find_roots f = f.replace('.epub', '_moh.epub') print('START beautify for: ' + f.decode(SFENC)) tempdir = unpack_epub(os.path.join(root, f)) opf_dir, opf_file, is_fixed = find_roots(tempdir) epub_dir = os.path.join(tempdir, opf_dir) opf_path = os.path.join(tempdir, opf_file) parser = etree.XMLParser(remove_blank_text=True) opftree = etree.parse(opf_path, parser) ncxfile = etree.XPath( '//opf:item[@media-type="application/x-dtbncx+xml"]', namespaces=OPFNS )(opftree)[0].get('href') ncx_path = os.path.join(epub_dir, ncxfile) ncxtree = etree.parse(ncx_path, parser) rename_calibre_cover(opftree, ncxtree, epub_dir) rename_cover_img(opftree, ncxtree, epub_dir) fix_body_id_links(opftree, epub_dir, ncxtree) make_cover_item_first(opftree) cont_src_list = make_content_src_list(ncxtree) fix_display_none(opftree, epub_dir, cont_src_list) # replace_fonts(epub_dir, ncxtree, opftree, 'TeXGyreSchola', 'Bookerly') write_file_changes_back(opftree, opf_path) write_file_changes_back(ncxtree, ncx_path) pack_epub(os.path.join(root, f), tempdir) clean_temp(tempdir) print('FINISH beautify for: ' + f.decode(SFENC))
def __init__(self, ac_id):
    self.ac_id = ac_id
    paparazzi_home = os.getenv("PAPARAZZI_HOME")
    conf_xml_path = "%s/conf/conf.xml" % paparazzi_home
    conf_tree = etree.parse(conf_xml_path)
    # extract aircraft node from conf.xml file
    ac_node = conf_tree.xpath('/conf/aircraft[@ac_id=%i]' % ac_id)
    if (len(ac_node) != 1):
        print "Aircraft ID %i not found." % ac_id
    # get settings file path from aircraft xml node
    settings_xml_path = "%s/conf/%s" % (paparazzi_home, ac_node[0].attrib['settings'])
    # save AC name for reference
    self.name = ac_node[0].attrib['name']
    tree = etree.parse(settings_xml_path)
    index = 0  # keep track of index/id of setting starting at 0
    for the_tab in tree.xpath("//dl_settings"):
        if the_tab.attrib.has_key('NAME'):
            setting_group = PaparazziSettingsGroup(the_tab.attrib['NAME'])
        elif the_tab.attrib.has_key('name'):
            setting_group = PaparazziSettingsGroup(the_tab.attrib['name'])
        else:
            continue
        for the_setting in the_tab.xpath('dl_setting'):
            if the_setting.attrib.has_key('shortname'):
                name = the_setting.attrib['shortname']
            elif the_setting.attrib.has_key('VAR'):
                name = the_setting.attrib['VAR']
            else:
                name = the_setting.attrib['var']
            settings = PaparazziSetting(name)
            settings.index = index
            if the_setting.attrib.has_key('MIN'):
                settings.min_value = float(the_setting.attrib['MIN'])
            else:
                settings.min_value = float(the_setting.attrib['min'])
            if the_setting.attrib.has_key('MAX'):
                settings.max_value = float(the_setting.attrib['MAX'])
            else:
                settings.max_value = float(the_setting.attrib['max'])
            if the_setting.attrib.has_key('STEP'):
                settings.step = float(the_setting.attrib['STEP'])
            else:
                settings.step = float(the_setting.attrib['step'])
            if (the_setting.attrib.has_key('values')):
                settings.values = the_setting.attrib['values'].split('|')
                count = int((settings.max_value - settings.min_value + settings.step) / settings.step)
                if (len(settings.values) != count):
                    print "Warning: wrong number of values (%i) for %s (expected %i)" % (len(settings.values), name, count)
            setting_group.member_list.append(settings)
            self.lookup.append(settings)
            self.name_lookup[name] = settings
            index = index + 1
        self.groups.append(setting_group)
def testFile(self, filename):
    logging.debug('>>')
    logging.debug("Testing " + filename)
    try:
        # parse filename as xml
        xmldoc = etree.parse(filename)
        # Parse XML schema
        xmlschema_doc = etree.parse(self.main_data_path + "schemas/kml20-geodistance.xsd")
        xmlschema = etree.XMLSchema(xmlschema_doc)
        if (xmlschema.validate(xmldoc)):
            self.activities.append(xmldoc)  # Assuming one activity per file
            # Valid file
            self.xmldoc = xmldoc
            self.startTime = datetime.datetime.now(tzlocal())
            inDatabase = False  # can't really check, as don't have start time etc
            duration = 0
            distance = self.getDistance(xmldoc)
            index = "%d:%d" % (0, 0)
            sport = "Running"
            self.activitiesSummary.append((
                index,
                inDatabase,
                self.startTime.strftime("%Y-%m-%dT%H:%M:%S%z"),
                distance,
                str(duration),
                sport,
            ))
            # print self.activitiesSummary
            return True
    except:
        # Not valid file
        return False
    return False
def merge_xmls(folder, name):
    main_xml_file = "../Annotations/" + folder + "/" + name.replace("jpg", "xml")
    if not os.path.isfile(main_xml_file):
        try:
            main_xml = ET.parse("../annotationCache/XMLTemplates/labelme.xml")
        except:
            return "False"
        main_xml.find("filename").text = name
        main_xml.find("folder").text = folder
        main_xml.write(main_xml_file, pretty_print=True)
    else:
        try:
            main_xml = ET.parse(main_xml_file)
        except:
            return "False"
    main_root = main_xml.getroot()
    object_files = glob.glob(main_xml_file + ".*")
    if len(object_files) == 0:
        return "False"
    for object_file in object_files:
        try:
            object_xml = ET.parse(object_file)
        except:
            continue
        object_xml.find('id').text = str(int(main_xml.xpath('count(//object)')))
        object_root = object_xml.getroot()
        main_root.append(object_root)
        os.remove(object_file)
    main_xml.write(main_xml_file, pretty_print=True)
    return "True"
def fetchRecords(conf):
    '''
    Generator to fetch all records using a resumptionToken if supplied.
    '''
    server = conf["server"]
    path = conf["path"]
    verb = conf["verb"]
    metadataPrefix = conf["metadataPrefix"]
    set = conf["set"]
    params = {
        "verb": verb,
        "metadataPrefix": metadataPrefix
    }
    if set != None:
        params["set"] = set
    body = makeRequest("%s%s" % (server, path), **params)
    f = StringIO(body)
    tree = etree.parse(f)
    tokenList = tree.xpath("oai:ListRecords/oai:resumptionToken/text()", namespaces=namespaces)
    yield tree.xpath("oai:ListRecords/oai:record", namespaces=namespaces)
    del params["metadataPrefix"]
    while (len(tokenList) == 1):
        try:
            params["resumptionToken"] = tokenList[0]
            body = makeRequest("%s%s" % (server, path), **params)
            f = StringIO(body)
            tree = etree.parse(f)
            yield tree.xpath("oai:ListRecords/oai:record", namespaces=namespaces)
            tokenList = tree.xpath("oai:ListRecords/oai:resumptionToken/text()", namespaces=namespaces)
        except Exception as e:
            tokenList = []
            log.error(sys.exc_info())
            log.exception("Problem trying to get next segment.")
def pretty_print_xml(aFDescriptor, aOutput):
    """ XML pretty printing from a stream.
        Takes a file descriptor (fd or StringIO, for example).
    """
    # str = aFDescriptor.read()
    # print " Result = %s\n" % (str)
    # f = open("/tmp/res.xml", "w")
    # f.write(str)
    # f.flush()
    # f.close()
    offset = 0
    is_xml = False
    while not is_xml:
        c = aFDescriptor.read(1)
        if c == "<":
            is_xml = True
        else:
            offset += 1
    if is_xml == True:
        aFDescriptor.seek(offset)
        tree = etree.parse(aFDescriptor)
        # get xslt stylesheet doing the transformation
        xsltPath = Conf.get_instance().get("Transformer", "xsltPrettyPrinter")
        transform = etree.XSLT(etree.parse(open(xsltPath)))
        result = transform(tree)
        ctbto.common.utils.printInFile(str(result), aOutput)
    else:
        raise Exception("Error. The file doesn't seem to be an XML file. Check its content.")
def studio_submit(self, submissions, suffix=''):
    """
    Handle the action of the submit button when using the block from Studio
    """
    self.display_name = submissions['display_name']
    if submissions.get('hotspot_coordinates_centered', False):
        self._hotspot_coordinates_centered = True
    xml_content = submissions['data']
    try:
        etree.parse(StringIO(xml_content))
        self.data = xml_content
    except etree.XMLSyntaxError as err:
        # Python 2 and 3 compatibility fix
        # Switch to _, error_message = err.args
        try:
            error_message = err.message  # pylint: disable=exception-message-attribute
        except:  # pylint: disable=bare-except
            _, error_message = err.args
        return {
            'result': 'error',
            'message': error_message,
        }
    return {
        'result': 'success',
    }
def test_invalid(self):
    fp = etree.parse(StringIO(b'<a><c>bar</c></a>'))
    xml = stylechecker.XML(fp)
    xml.xmlschema = etree.XMLSchema(etree.parse(sample_xsd))
    result, _ = xml.validate()
    self.assertFalse(result)
def getXML(fIn, folder):
    parser = etree.XMLParser(ns_clean=True)
    try:
        doc = etree.parse(fIn)
    except:
        doc = None
        xsd = None
        return doc, xsd
    root = doc.getroot()
    citygmlversion = ""
    for key in root.nsmap.keys():
        if root.nsmap[key].find('www.opengis.net/citygml') != -1:
            if (root.nsmap[key][-3:] == '0.4'):
                citygmlversion = '0.4'
                break
            if (root.nsmap[key][-3:] == '1.0'):
                citygmlversion = '1.0'
                break
            if (root.nsmap[key][-3:] == '2.0'):
                citygmlversion = '2.0'
                break
    if citygmlversion == "":
        return None, None
    if citygmlversion == "0.4":
        xsd = etree.parse(folder + "schemas/v0.4/CityGML.xsd")
    elif citygmlversion == "1.0":
        xsd = etree.parse(folder + "schemas/v1.0/CityGML.xsd")
    else:
        xsd = etree.parse(folder + "schemas/v2.0/CityGML.xsd")
    return doc, xsd
from lxml import etree
from tqdm import tqdm

NEWLINE_TOKEN = "<|n|>"
END_TOKEN = "<|endoftext|>"
PARAPHRASE_TOKEN = " ПРФРЗ: "
PARAGRAPH_END = "\n" + NEWLINE_TOKEN + "\n"
ARTICLE_END = "\n" + NEWLINE_TOKEN + "\n" + END_TOKEN + "\n"

root = etree.parse(
    r'C:\Users\kiva0319\IdeaProjects\hrdmd1803\Strong-Paraphrase-Generation-2020\raw_data\paraphrases.xml'
)
root = root.getroot()

non_negative_class_count = 0
with open(
        "C:/Users/kiva0319/IdeaProjects/hrdmd1803/Strong-Paraphrase-Generation-2020/processed/for_train"
        "/article_paraphrase_marked.txt",
        'w', encoding="utf-8") as outputFile:
    for element in tqdm(root[1]):
        id_1 = element[1].text
        id_2 = element[2].text
        clas = element[6].text
        text_1 = "none"
        text_2 = "none"
        if clas != '-1':
            non_negative_class_count += 1
def _jsonify_ace2005_instance(docid, base_path): instance = dict(docid=docid, mentions=[], relations=[]) with open(base_path + '.sgm', 'r') as f: sgm_content = f.read() # sgm_content = re.sub(r'\<[A-Z]+[.\n]*?\>', '', sgm_content, flags=re.M) # sgm_content = re.sub(r'\<\/[A-Z]+\>', '', sgm_content) # instance['content'] = sgm_content sgm_content = re.sub(r'\&', '\u039d', sgm_content) sgm_root = etree.fromstring(sgm_content) content = ''.join(sgm_root.itertext()) content = content.replace('\u039d', '&') # sgm_tree = etree.parse(base_path + '.sgm') # sgm_root = sgm_tree.getroot() instance['content'] = content apf_tree = etree.parse(base_path + '.apf.xml') apf_root = apf_tree.getroot() relation = [] for relation in apf_root.iterfind('.//relation'): relation_type = relation.get('TYPE') relation_subtype = relation.get('SUBTYPE') for relation_mention in relation.iterfind('./relation_mention'): relation_id = relation_mention.get('ID') relation_dict = dict(id=relation_id, type=relation_type, subtype=relation_subtype) for relation_mention_argument in relation_mention.iterfind( './relation_mention_argument'): mention_id = relation_mention_argument.get('REFID') charseq = relation_mention_argument.find( './extent/charseq') start_char = int(charseq.get('START')) end_char = int(charseq.get('END')) text = re.sub(r'\&([^a])', r'&\1', charseq.text) assert mention_id in [ 'BACONSREBELLION_20050226.1317-E39-74', 'BACONSREBELLION_20050226.1317-E38-73' ] or instance['content'][start_char:end_char + 1] == text mention_dict = dict(id=mention_id, start_char=start_char, end_char=end_char, text=text) entity_mention = apf_root.find( './/entity_mention[@ID="{}"]'.format(mention_id)) if entity_mention is not None: mention_dict['type'] = entity_mention.get('TYPE') mention_dict['role'] = entity_mention.get('ROLE') entity = entity_mention.getparent() mention_dict['entity_type'] = entity.get('TYPE') mention_dict['entity_subtype'] = entity.get('SUBTYPE') #end if instance['mentions'].append(mention_dict) # if instance['content'][start_char:end_char + 1] != text: # print(base_path, mention_id) # # print(instance['content']) # print('instance', instance['content'][start_char:end_char + 1]) # print('text', text) # #end if #end if role = relation_mention_argument.get('ROLE') m = re.match(r'^Arg\-(\d+)$', role) if m: i = int(m.group(1)) relation_dict['arg{}'.format(i)] = mention_id else: relation_dict[role] = mention_id #end for instance['relations'].append(relation_dict) #end for #end for return instance
def read_xml(file):
    xml = etree.parse(file)
    return xml
'''
Parse and modify XML
'''
from lxml import etree

if __name__ == "__main__":
    doc = etree.parse("pred.xml", parser=etree.XMLParser())
    root = doc.getroot()
    print(root)
    root.remove(root.find("sri"))
    root.remove(root.find("cr"))
    print(root.getchildren().index(root.find("nm")))
    e = etree.Element('spam')
    e.text = "this is a test"
    root.insert(2, e)
    doc.write("newpred.xml", xml_declaration=True)
def haka_attribute_parser(filename): """ Using CamelCase instead of regular underscore attribute names in element tree. """ parser = etree.XMLParser(ns_clean=True, remove_comments=True, remove_blank_text=True, resolve_entities=False, no_network=True) tree = etree.parse(filename, parser) root = tree.getroot() attribute_filter_policy_group = etree.Element( "AttributeFilterPolicyGroup", id="urn:mace:funet.fi:haka", nsmap={"xmlns": 'urn:mace:shibboleth:2.0:afp'}) attribute_filter_policy_group.attrib[ '{urn:mace:shibboleth:2.0:afp}basic'] = "urn:mace:shibboleth:2.0:afp:mf:basic" attribute_filter_policy_group.attrib[ '{urn:mace:shibboleth:2.0:afp}saml'] = "urn:mace:shibboleth:2.0:afp:mf:saml" attribute_filter_policy_group.attrib[ '{http://www.w3.org/2001/XMLSchema-instance}schemaLocation' ] = "urn:mace:shibboleth:2.0:afp classpath:/schema/shibboleth-2.0-afp.xsd " \ "urn:mace:shibboleth:2.0:afp:mf:basic " \ "classpath:/schema/shibboleth-2.0-afp-mf-basic.xsd " \ "urn:mace:shibboleth:2.0:afp:mf:saml " \ "classpath:/schema/shibboleth-2.0-afp-mf-saml.xsd" for a in root: entity_id = a.get("entityID") if entity_id: for b in a: if etree.QName(b.tag).localname == "SPSSODescriptor": attributes = [] for c in b: if etree.QName( c.tag ).localname == "AttributeConsumingService": for d in c: if etree.QName( d.tag ).localname == "RequestedAttribute": friendlyname = d.get("FriendlyName") name = d.get("Name") if friendlyname: attribute = Attribute.objects.filter( name=name).first() if not attribute: attribute = Attribute.objects.filter( friendlyname=friendlyname ).first() if attribute: attributes.append(attribute) else: print("Could not add attribute " + friendlyname + ", " + name + " for " + entity_id) if attributes: attribute_filter_policy = etree.SubElement( attribute_filter_policy_group, "AttributeFilterPolicy", id="haka-default-" + entity_id) policy_requirement_rule = etree.SubElement( attribute_filter_policy, "PolicyRequirementRule", value=entity_id) policy_requirement_rule.attrib[ '{http://www.w3.org/2001/XMLSchema-instance}type'] = "basic:AttributeRequesterString" for attribute in attributes: attribute_rule = etree.SubElement( attribute_filter_policy, "AttributeRule", attributeID=attribute.attributeid) permit_value_rule = etree.SubElement( attribute_rule, "PermitValueRule") permit_value_rule.attrib[ '{http://www.w3.org/2001/XMLSchema-instance}type'] = "basic:ANY" return etree.tostring(attribute_filter_policy_group, pretty_print=True, encoding='UTF-8')
def update_cpes(xml_file, cpe_vp_map, r7_vp_map):
    parser = etree.XMLParser(remove_comments=False, remove_blank_text=True)
    doc = etree.parse(xml_file, parser)

    for fingerprint in doc.xpath('//fingerprint'):
        # collect all the params, grouping by os and service params that could be used to compute a CPE
        params = {}
        for param in fingerprint.xpath('./param'):
            name = param.attrib['name']
            # remove any existing CPE params
            if re.match(r'^.*\.cpe\d{0,2}$', name):
                param.getparent().remove(param)
                continue
            match = re.search(r'^(?P<fp_type>hw|os|service(?:\.component)?)\.', name)
            if match:
                fp_type = match.group('fp_type')
                if not fp_type in params:
                    params[fp_type] = {}
                if name in params[fp_type]:
                    raise ValueError(
                        'Duplicated fingerprint named {} in fingerprint {} in file {}'
                        .format(name, fingerprint.attrib['pattern'], xml_file))
                params[fp_type][name] = param

        # for each of the applicable os/service param groups, build a CPE
        for fp_type in params:
            if fp_type == 'os':
                cpe_type = 'o'
            elif fp_type.startswith('service'):
                cpe_type = 'a'
            elif fp_type == 'hw':
                cpe_type = 'h'
            else:
                raise ValueError('Unhandled param type {}'.format(fp_type))

            # extract the vendor/product/version values from each os/service group,
            # using the static value ('Apache', for example) when pos is 0, and
            # otherwise use a value that contains interpolation markers such that
            # products/projects that use recog content can insert the value
            # extracted from the banner/other data via regex capturing groups
            fp_data = {
                'vendor': None,
                'product': None,
                'version': '-',
            }
            for fp_datum in fp_data:
                fp_datum_param_name = "{}.{}".format(fp_type, fp_datum)
                if fp_datum_param_name in params[fp_type]:
                    fp_datum_e = params[fp_type][fp_datum_param_name]
                    if fp_datum_e.attrib['pos'] == '0':
                        fp_data[fp_datum] = fp_datum_e.attrib['value']
                    else:
                        fp_data[fp_datum] = "{{{}}}".format(fp_datum_e.attrib['name'])

            vendor = fp_data['vendor']
            product = fp_data['product']
            version = fp_data['version']

            # build a reasonable looking CPE value from the vendor/product/version,
            # lowercasing, replacing whitespace with _, and more
            if vendor and product:
                if not cpe_type in cpe_vp_map:
                    logging.error("Didn't find CPE type '%s' for '%s' '%s'", cpe_type, vendor, product)
                    continue

                vendor = vendor.lower().replace(' ', '_').replace(',', '')
                product = product.lower().replace(' ', '_').replace(',', '').replace('!', '%21')
                if 'unknown' in [vendor, product]:
                    continue

                if (vendor.startswith('{') and vendor.endswith('}')) or (
                        product.startswith('{') and product.endswith('}')):
                    continue

                success, vendor, product = lookup_cpe(vendor, product, cpe_type, cpe_vp_map, r7_vp_map)
                if not success:
                    continue

                # Sanity check the value to ensure that no invalid values will
                # slip in due to logic or mapping bugs.
                # If it's not in the official NIST list then log it and kick it out
                if product not in cpe_vp_map[cpe_type][vendor]:
                    logging.error(
                        "Invalid CPE type %s created for vendor %s and product %s. "
                        "This may be due to an invalid mapping.",
                        cpe_type, vendor, product)
                    continue

                # building the CPE string
                # Last minute escaping of '/' and `!`
                product = product.replace('/', '\/').replace('%21', '\!')
                cpe_value = 'cpe:/{}:{}:{}'.format(cpe_type, vendor, product)
                if version:
                    cpe_value += ":{}".format(version)

                cpe_param = etree.Element('param')
                cpe_param.attrib['pos'] = '0'
                cpe_param.attrib['name'] = '{}.cpe23'.format(fp_type)
                cpe_param.attrib['value'] = cpe_value

                for param_name in params[fp_type]:
                    param = params[fp_type][param_name]
                    parent = param.getparent()
                    index = parent.index(param) + 1
                    parent.insert(index, cpe_param)

    root = doc.getroot()
    with open(xml_file, 'wb') as xml_out:
        xml_out.write(etree.tostring(root, pretty_print=True,
                                     xml_declaration=True,
                                     encoding=doc.docinfo.encoding))
video_mxf_id = None
audio_mxf_id = None
reel_id = None

for r, d, f in os.walk('DCP'):
    for n in f:
        if n.endswith('cpl.xml'):
            cpl_id = n[0:-8]
        elif n.endswith('pkl.xml'):
            pkl_id = n[0:-8]

# (along the way, rename the CPL/PKL files)
os.rename('DCP/%s_cpl.xml' % cpl_id, 'DCP/%s_cpl.xml' % wanted_cpl_id)
os.rename('DCP/%s_pkl.xml' % pkl_id, 'DCP/%s_pkl.xml' % wanted_pkl_id)

xml = etree.parse('DCP/ASSETMAP.xml')
assetmap_id = xml.getroot().find('{%s}Id' % assetmap_namespace).text
assetmap_id = assetmap_id.replace('urn:uuid:', '')

def cpl_name(s):
    return '{%s}%s' % (cpl_namespace, s)

xml = etree.parse('DCP/%s_cpl.xml' % wanted_cpl_id)
video_mxf_id = xml.getroot().find(cpl_name('ReelList')). \
    find(cpl_name('Reel')). \
    find(cpl_name('AssetList')). \
    find(cpl_name('MainPicture')). \
    find(cpl_name('Id')).text
class Clone( TestCase, get_assert_pcs_effect_mixin( lambda cib: etree.tostring( # pylint:disable=undefined-variable etree.parse(cib).findall(".//resources")[0] ) ), ): # pylint: disable=too-many-public-methods empty_cib = rc("cib-empty.xml") def setUp(self): self.temp_cib = get_tmp_file("tier1_cib_resource_clone_unclone_clone") self.pcs_runner = PcsRunner(self.temp_cib.name) self.set_cib_file(FIXTURE_PRIMITIVE_FOR_CLONE) def tearDown(self): self.temp_cib.close() def set_cib_file(self, *xml_string_list): xml_manip = XmlManipulation.from_file(self.empty_cib) xml_manip.append_to_first_tag_name("resources", *xml_string_list) write_data_to_tmpfile(str(xml_manip), self.temp_cib) def test_clone(self): self.assert_effect( "resource clone C".split(), fixture_resources_xml(fixture_clone("C-clone", "C")), ) def test_clone_custom_id(self): self.assert_effect( "resource clone C CustomCloneId".split(), fixture_resources_xml(fixture_clone("CustomCloneId", "C")), ) def test_clone_id_increment(self): self.set_cib_file( fixture_clone("C-clone", "Dummy"), FIXTURE_PRIMITIVE_FOR_CLONE, ) self.assert_effect( "resource clone C".split(), fixture_resources_xml( fixture_clone("C-clone", "Dummy"), fixture_clone("C-clone-1", "C"), ), ) def test_clone_id_is_stonith(self): self.set_cib_file(FIXTURE_STONITH_FOR_CLONE) self.assert_pcs_fail( "resource clone fence-device".split(), fixture_clone_stonith_msg(), ) self.assert_resources_xml_in_cib( fixture_resources_xml(FIXTURE_STONITH_FOR_CLONE) ) def test_clone_id_is_stonith_forced(self): self.set_cib_file(FIXTURE_STONITH_FOR_CLONE) self.assert_effect( "resource clone fence-device --force".split(), fixture_resources_xml(FIXTURE_STONITH_CLONE), output=fixture_clone_stonith_msg(forced=True), ) def test_clone_group_with_stonith(self): self.set_cib_file(FIXTURE_GROUP_WITH_STONITH) self.assert_effect( "resource clone Group".split(), fixture_resources_xml(FIXTURE_CLONED_GROUP_WITH_STONITH), ) def test_clone_group_with_stonith_forced(self): self.set_cib_file(FIXTURE_GROUP_WITH_STONITH) self.assert_effect( "resource clone Group --force".split(), fixture_resources_xml(FIXTURE_CLONED_GROUP_WITH_STONITH), ) def test_promotable_clone(self): self.assert_effect( "resource promotable C".split(), fixture_resources_xml( fixture_clone("C-clone", "C", promotable=True) ), ) def test_promotable_clone_custom_id(self): self.assert_effect( "resource promotable C CustomPromotableId".split(), fixture_resources_xml( fixture_clone("CustomPromotableId", "C", promotable=True) ), ) def test_promotable_clone_id_is_stonith(self): self.set_cib_file(FIXTURE_STONITH_FOR_CLONE) self.assert_pcs_fail( "resource promotable fence-device".split(), fixture_clone_stonith_msg(), ) self.assert_resources_xml_in_cib( fixture_resources_xml(FIXTURE_STONITH_FOR_CLONE) ) def test_promotable_clone_id_is_stonith_forced(self): self.set_cib_file(FIXTURE_STONITH_FOR_CLONE) self.assert_effect( "resource promotable fence-device --force".split(), fixture_resources_xml(FIXTURE_STONITH_PROMOTABLE), output=fixture_clone_stonith_msg(forced=True), ) def test_promotable_keyword_and_option(self): self.assert_pcs_fail( "resource promotable C CustomCloneId promotable=false".split(), ( "Error: you cannot specify both promotable option and " "promotable keyword\n" ), ) self.assert_resources_xml_in_cib( fixture_resources_xml(FIXTURE_PRIMITIVE_FOR_CLONE) ) def test_clone_with_options(self): self.assert_effect( ( "resource clone C CustomCloneId globally-unique=true meta a=b " "c=d" ).split(), fixture_resources_xml(FIXTURE_CLONE_WITH_OPTIONS), ) 
def test_group_last_member(self): self.set_cib_file(FIXTURE_GROUP_LAST_MEMBER) self.assert_effect( "resource clone C".split(), fixture_resources_xml(fixture_clone("C-clone", "C")), ) def test_nonexistent_resource(self): self.assert_pcs_fail( "resource clone NonExistentClone".split(), "Error: unable to find group or resource: NonExistentClone\n", ) self.assert_resources_xml_in_cib( fixture_resources_xml(FIXTURE_PRIMITIVE_FOR_CLONE) ) def test_invalid_clone_id(self): self.assert_pcs_fail( "resource clone C 1invalid".split(), "Error: invalid id '1invalid'\n", ) self.assert_resources_xml_in_cib( fixture_resources_xml(FIXTURE_PRIMITIVE_FOR_CLONE) ) def test_clone_id_already_exist(self): self.assert_pcs_fail( "resource clone C C".split(), "Error: id 'C' already exists\n", ) self.assert_resources_xml_in_cib( fixture_resources_xml(FIXTURE_PRIMITIVE_FOR_CLONE) ) def test_group_already_cloned(self): self.set_cib_file(FIXTURE_CLONED_GROUP) self.assert_pcs_fail( "resource clone Group".split(), "Error: cannot clone a group that has already been cloned\n", ) self.assert_resources_xml_in_cib( fixture_resources_xml(FIXTURE_CLONED_GROUP) ) def test_already_a_clone_resource(self): self.set_cib_file(FIXTURE_CLONED_GROUP) self.assert_pcs_fail( "resource clone G1".split(), "Error: G1 is already a clone resource\n", ) self.assert_resources_xml_in_cib( fixture_resources_xml(FIXTURE_CLONED_GROUP) ) def test_bundle_resource(self): self.set_cib_file(FIXTURE_BUNDLE_RESOURCE) self.assert_pcs_fail( "resource clone Dummy".split(), "Error: cannot clone bundle resource\n", ) self.assert_resources_xml_in_cib( fixture_resources_xml(FIXTURE_BUNDLE_RESOURCE) )
import lxml.etree as ET
import sys
import os

if len(sys.argv) < 3:
    print 'usage: python xsl-transform.py <source-xml-file> <xslt-file> [destination-file]'
    sys.exit()

print 'Transforming ' + sys.argv[1] + ' ...'
dom = ET.parse(sys.argv[1])
xslt = ET.parse(sys.argv[2])
transform = ET.XSLT(xslt)
newdom = transform(dom)
str = ET.tostring(newdom, pretty_print=True)

if len(sys.argv) < 4:
    print(str)
else:
    text_file = open(sys.argv[3], "w")
    text_file.write(str)
    text_file.close()
    print os.stat(sys.argv[3]).st_size, ' Bytes'
print
print 'Done!'
from lxml import etree

html = etree.parse('./test.html', etree.HTMLParser())
result = etree.tostring(html)
print(result.decode('utf-8'))
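# A small follow-up sketch (assuming the ./test.html parsed above): the `html`
# tree returned by etree.parse() can be queried with XPath, for example to
# collect all link targets from the document.
links = html.xpath('//a/@href')  # all <a href="..."> attribute values
print(links)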
def add_attachment(anthology_id, path, attach_type, overwrite=False): """ Adds a single attachment to the Anthology data files. Arguments: - The ACL ID of the paper (e.g., P17-1012) - The path to the attachment (can be a URL) - The attachment type (poster, presentation, note, software) - Whether to overwrite the downloaded file. """ collection_id, volume_id, paper_id = deconstruct_anthology_id(anthology_id) if path.startswith("http"): _, input_file_path = tempfile.mkstemp() try: print( f"-> Downloading file from {path} to {input_file_path}", file=sys.stderr ) request = urllib.request.Request(path, headers={'User-Agent': 'Mozilla/5.0'}) with urllib.request.urlopen(request) as url, open( input_file_path, mode="wb" ) as input_file_fh: input_file_fh.write(url.read()) except ssl.SSLError: raise Exception(f"Could not download {path}") except Exception as e: raise e else: input_file_path = path file_extension = path.replace("?dl=1", "").split(".")[-1] # Many links from file sharing services are not informative and don't have # extensions, so we could try to guess. if file_extension not in ALLOWED_TYPES: detected = filetype.guess(input_file_path) if detected is not None: file_extension = detected.mime.split("/")[-1] if file_extension not in ALLOWED_TYPES: print( f"Could not determine file extension for {anthology_id} at {path}", file=sys.stderr, ) with open(input_file_path, "rb") as f: checksum = compute_hash(f.read()) # Update XML xml_file = os.path.join( os.path.dirname(sys.argv[0]), "..", "data", "xml", f"{collection_id}.xml" ) tree = ET.parse(xml_file) attachment_file_name = f"{anthology_id}.{attach_type}.{file_extension}" paper = tree.getroot().find(f"./volume[@id='{volume_id}']/paper[@id='{paper_id}']") if paper is not None: # Check if attachment already exists for attachment in paper.findall("attachment"): if attachment.text == attachment_file_name: print( f"-> attachment {attachment_file_name} already exists in the XML", file=sys.stderr, ) break else: attachment = ET.Element("attachment") attachment.attrib["type"] = attach_type.lower() attachment.attrib["hash"] = checksum attachment.text = attachment_file_name paper.append(attachment) indent(tree.getroot()) tree.write(xml_file, encoding="UTF-8", xml_declaration=True) print( f"-> added attachment {attachment_file_name} to the XML", file=sys.stderr ) else: print(f"Paper {anthology_id} not found in the Anthology", file=sys.stderr) # Make sure directory exists output_dir = os.path.join(args.attachment_root, collection_id[0], collection_id) if not os.path.exists(output_dir): # print(f"-> Creating directory {output_dir}", file=sys.stderr) os.makedirs(output_dir) # Copy file dest_path = os.path.join(output_dir, attachment_file_name) if os.path.exists(dest_path) and not overwrite: print( f"-> target file {dest_path} already in place, refusing to overwrite", file=sys.stderr, ) return None shutil.copy(input_file_path, dest_path) os.chmod(dest_path, 0o644) print(f"-> copied {input_file_path} to {dest_path} and fixed perms", file=sys.stderr) # Clean up if path.startswith("http"): os.remove(input_file_path) return dest_path
class Unclone( TestCase, get_assert_pcs_effect_mixin( lambda cib: etree.tostring( # pylint:disable=undefined-variable etree.parse(cib).findall(".//resources")[0] ) ), ): empty_cib = rc("cib-empty.xml") def assert_tags_xml(self, expected_xml): self.assert_resources_xml_in_cib( expected_xml, get_cib_part_func=lambda cib: etree.tostring( etree.parse(cib).findall(".//tags")[0], ), ) def assert_constraint_xml(self, expected_xml): self.assert_resources_xml_in_cib( expected_xml, get_cib_part_func=lambda cib: etree.tostring( etree.parse(cib).findall(".//constraints")[0], ), ) def setUp(self): # pylint: disable=invalid-name self.temp_cib = get_tmp_file("tier1_cib_resource_group_ungroup") self.pcs_runner = PcsRunner(self.temp_cib.name) xml_manip = XmlManipulation.from_file(self.empty_cib) xml_manip.append_to_first_tag_name( "resources", FIXTURE_CLONE, FIXTURE_DUMMY, ) xml_manip.append_to_first_tag_name( "configuration", FIXTURE_TAGS_CONFIG_XML, ) xml_manip.append_to_first_tag_name( "constraints", """ <rsc_location id="location-C-clone-rh7-1-INFINITY" node="rh7-1" rsc="C-clone" score="INFINITY"/> """, """ <rsc_location id="location-TagCloneOnly-rh7-1-INFINITY" node="rh7-1" rsc="TagCloneOnly" score="INFINITY"/> """, ) write_data_to_tmpfile(str(xml_manip), self.temp_cib) def tearDown(self): # pylint: disable=invalid-name self.temp_cib.close() def test_nonexistent_clone(self): self.assert_pcs_fail( "resource unclone NonExistentClone".split(), "Error: could not find resource: NonExistentClone\n", ) self.assert_resources_xml_in_cib(FIXTURE_CLONE_AND_RESOURCE) self.assert_tags_xml(FIXTURE_TAGS_CONFIG_XML) self.assert_constraint_xml(FIXTURE_CONSTRAINTS_CONFIG_XML) def test_not_clone_resource(self): self.assert_pcs_fail( "resource unclone Dummy".split(), "Error: 'Dummy' is not a clone resource\n", ) self.assert_resources_xml_in_cib(FIXTURE_CLONE_AND_RESOURCE) self.assert_tags_xml(FIXTURE_TAGS_CONFIG_XML) self.assert_constraint_xml(FIXTURE_CONSTRAINTS_CONFIG_XML) def test_unclone_clone_id(self): self.assert_effect( "resource unclone C-clone".split(), FIXTURE_RESOURCES ) self.assert_tags_xml(FIXTURE_TAGS_RESULT_XML) self.assert_constraint_xml("<constraints/>") def test_unclone_resoruce_id(self): self.assert_effect("resource unclone C".split(), FIXTURE_RESOURCES) self.assert_tags_xml(FIXTURE_TAGS_RESULT_XML) self.assert_constraint_xml("<constraints/>")
    'slug': '华南理工大学校园aaa',
    'seo_title': '',
    'search_description': '',
    'go_live_at': '',
    'expire_at': '',
    'action-publish': 'action-publish',
}
content = s.post(add_wikihome_url, data=data)

# ---------------- Parse and import the articles
from lxml import etree  # import the lxml library
from datetime import datetime

tree = etree.parse(r'D:/谷歌下载/wordpress.2017-04-13.xml')  # parse the XML into a tree
root = tree.getroot()  # get the root of the tree
ns = {'content': "http://purl.org/rss/1.0/modules/content/",
      'dc': 'http://purl.org/dc/elements/1.1/'}
posts = []
for i in root.find('channel').findall('item'):
    posts.append([
        i.find('title').text,
        str(
            datetime.strptime(
                i.find('pubDate').text,
                "%a, %d %b %Y %H:%M:%S %z"
            ).date()
        ),
        ','.join([c.text for c in i.findall('category') if c.text != '未分类']),
        i.findall('dc:creator', ns)[0].text,
        i.findall('content:encoded', ns)[0].text,
from lxml import etree as ET

# Open the XML file
stream = open('ejemplo.xml', 'r')
# Parse the data into an ElementTree object
xml = ET.parse(stream)
# Get the root tag of the XML
root = xml.getroot()
# Iterate over each branch of the root element
for e in root:
    # Print each element as a string
    print(ET.tostring(e))
    print("")
    # Print the Id attribute of each element
    print(e.get("Id"))
def main(): module = AnsibleModule(argument_spec=dict( path=dict(type='path', aliases=['dest', 'file']), xmlstring=dict(type='str'), xpath=dict(type='str', default='/'), namespaces=dict(type='dict', default={}), state=dict(type='str', default='present', choices=['absent', 'present'], aliases=['ensure']), value=dict(), attribute=dict(), add_children=dict(type='list'), set_children=dict(type='list'), count=dict(type='bool', default=False), print_match=dict(type='bool', default=False), pretty_print=dict(type='bool', default=False), content=dict(type='str', choices=['attribute', 'text']), input_type=dict(type='str', default='yaml', choices=['xml', 'yaml'])), supports_check_mode=True, mutually_exclusive=[ ['value', 'set_children'], ['value', 'add_children'], ['set_children', 'add_children'], ['path', 'xmlstring'], ['content', 'set_children'], ['content', 'add_children'], ['content', 'value'], ]) xml_file = module.params['path'] xml_string = module.params['xmlstring'] xpath = module.params['xpath'] namespaces = module.params['namespaces'] state = module.params['state'] value = json_dict_bytes_to_unicode(module.params['value']) attribute = module.params['attribute'] set_children = json_dict_bytes_to_unicode(module.params['set_children']) add_children = json_dict_bytes_to_unicode(module.params['add_children']) pretty_print = module.params['pretty_print'] content = module.params['content'] input_type = module.params['input_type'] print_match = module.params['print_match'] count = module.params['count'] # Check if we have lxml 2.3.0 or newer installed if not HAS_LXML: module.fail_json( msg= 'The xml ansible module requires the lxml python library installed on the managed machine' ) elif LooseVersion('.'.join( to_native(f) for f in etree.LXML_VERSION)) < LooseVersion('2.3.0'): module.fail_json( msg= 'The xml ansible module requires lxml 2.3.0 or newer installed on the managed machine' ) elif LooseVersion('.'.join( to_native(f) for f in etree.LXML_VERSION)) < LooseVersion('3.0.0'): module.warn( 'Using lxml version lower than 3.0.0 does not guarantee predictable element attribute order.' ) # Check if the file exists if xml_string: infile = BytesIO(to_bytes(xml_string, errors='surrogate_or_strict')) elif os.path.isfile(xml_file): infile = open(xml_file, 'rb') else: module.fail_json(msg="The target XML source '%s' does not exist." % xml_file) # Try to parse in the target XML file try: parser = etree.XMLParser(remove_blank_text=pretty_print) doc = etree.parse(infile, parser) except etree.XMLSyntaxError as e: module.fail_json(msg="Error while parsing path: %s" % e) if print_match: print_match(module, doc, xpath, namespaces) if count: count_nodes(module, doc, xpath, namespaces) if content == 'attribute': get_element_attr(module, doc, xpath, namespaces) elif content == 'text': get_element_text(module, doc, xpath, namespaces) # module.fail_json(msg="OK. Well, etree parsed the xml file...") # module.exit_json(what_did={"foo": "bar"}, changed=True) # File exists: if state == 'absent': # - absent: delete xpath target delete_xpath_target(module, doc, xpath, namespaces) # Exit # - present: carry on # children && value both set?: should have already aborted by now # add_children && set_children both set?: should have already aborted by now # set_children set? if set_children: set_target_children(module, doc, xpath, namespaces, set_children, input_type) # add_children set? 
if add_children: add_target_children(module, doc, xpath, namespaces, add_children, input_type) # No?: Carry on # Is the xpath target an attribute selector? if value is not None: set_target(module, doc, xpath, namespaces, attribute, value) # Format the xml only? if pretty_print: pretty(module, doc) ensure_xpath_exists(module, doc, xpath, namespaces)
def main(): ## --- arguments parser = argparse.ArgumentParser( description="ArcGet: retrieve imaging data from XNAT") group = parser.add_mutually_exclusive_group(required=True) group.add_argument("--host", "-host", required=False, help="XNAT URL") group.add_argument("-a", "--alias", required=False, help="XNAT config file alias") parser.add_argument( "-l", "--legacy", required=False, action="store_true", help="Return legacy XNAT 1.4 zipfile and directory structure") parser.add_argument( "--new-structure", required=False, default=False, action="store_true", help= "Don't create SessionID/RAW directory, output files in current working directory or --out-dir" ) # parser.add_argument("--clean-names", required=False, action="store_true", help="Make all file have the SESSIONID_series_SERIESNUM_file_FILENUM.dcm format") parser.add_argument("-u", "--username", required=False, help="XNAT username") parser.add_argument("-p", "--password", required=False, help="XNAT password") parser.add_argument("-s", "--session-label", action="append", required=False, dest="session_labels", help="MR Session label") parser.add_argument( "-r", "--raw-types", required=False, default="ALL", help="raw scan types or numbers e.g. 1,MEMPRAGE,21,22,DSI") parser.add_argument("-o", "--out-dir", default=".", required=False, help="output directory") parser.add_argument( '-q', "--quiet", nargs=0, action=ArgParseSubAction, dest='quiet', help="Decrease verbosity by 1. Can be used several times.") parser.add_argument( '-v', "--verbose", nargs=0, action=ArgParseAddAction, dest='verbose', help="Increase verbosity by 1. Can be used several times.") parser.add_argument('--zip64', required=False, action="store_true", help="Use Zip64 extensions when creating zip archives") parser.add_argument( "-W", "--no-warn", required=False, action="store_true", help= "Don't show me the annoying warning about downloading everything again. I like wasting bandwidth." ) parser.add_argument( "--show-all", action='store_true', dest='show_all', help="Show some information on all available sessions and exit.") (args, sessions) = parser.parse_known_args() if (args.password and not args.legacy): error( "DO NOT put passwords on the command line unless absolutely necessary!! 
--password only allowed with --legacy" ) # print "=================================================================" # print "Before:" # for arg in args: # print arg + "='"+str(args[arg])+"'" # print "=================================================================" ## --- read username and password from XNAT config file config_file = os.path.expanduser("~/.xnat_auth") if (not os.path.isfile(config_file)): info("No config file found: " + config_file) if (args.alias): error("You cannot specify an --alias without a config file") else: info("Reading config file: " + config_file, verbosity_level=3) xml = etree.parse(os.path.expanduser(config_file)) if (args.alias): if (not args.alias.isalnum()): error("--alias must be alphanumeric", parser=parser) ## --- get host args.host = xml.xpath("/xnat/" + args.alias + "/url/text()") if (args.host): args.host = args.host[0] ## --- get username if (not args.username): args.username = xml.xpath("/xnat/" + args.alias + "/username/text()") if (args.username): args.username = args.username[0] ## --- get password args.password = xml.xpath("/xnat/" + args.alias + "/password/text()") if (args.password): args.password = args.password[0] elif (args.host): ## --- get username if (not args.username): args.username = xml.xpath("/xnat/*[url='" + args.host + "']/username/text()") if (args.username): args.username = args.username[0] ## --- get password if (not args.password): args.password = xml.xpath("/xnat/*[url='" + args.host + "']/password/text()") if (args.password): args.password = args.password[0] ## --- prompt for host, username, password if necessary if (sys.stdin.isatty()): if (not args.host): args.host = raw_input("Enter host: ") if (not args.username): args.username = raw_input("Enter username: ") if (not args.password): args.password = getpass.getpass("Enter password: ") # getpass assumed here if (not args.host): error("Could not retrieve a host from config file or command line") if (not args.username): error("Could not retrieve a username from config file or command line") if (not args.password): error("Could not retrieve a password from config file or command line") ## --- strip any slashes from right side of host args.host = str(args.host).rstrip("/") if not os.path.exists(args.out_dir): os.makedirs(args.out_dir) args.out_dir = os.path.abspath(args.out_dir) info("Saving output to '" + args.out_dir + "'") if args.session_labels is None: args.session_labels = [] if len(sessions) > 0: args.session_labels.extend(sessions) # print str(args) # sys.exit() # log(str(args)) # print "=================================================================" # print "After:" # for arg in args: # print arg + "='"+str(args[arg])+"'" # print "=================================================================" # sys.exit() arcget = ArcGet(args) if args.show_all: arcget.getShowAllSessionInfo() sys.exit() sessions = arcget.getSessionInfoByLabels(args.session_labels) for session in sessions: subject = arcget.getSubjectInfoById(session['subject_id']) if "ALL" == args.raw_types: if not args.no_warn: warn( "+-------------------------------------------------------------------+", verbosity_level=-1) warn( "| --==>> WARNING: READ CAREFULLY <<==-- |", verbosity_level=-1) warn( "+-------------------------------------------------------------------+", verbosity_level=-1) warn( "| By not specifying which scans/series to download from the session |", verbosity_level=-1) warn( "| you will be downloading EVERYTHING, including report files, text |", verbosity_level=-1) warn( "| files, pictures, and EVERY SINGLE scan. 
If you don't REALLY NEED |", verbosity_level=-1) warn( "| it all you are saying that you really do want to waste EVERYONE's |", verbosity_level=-1) warn( "| space, processing power, time, and slow down XNAT, the cluster |", verbosity_level=-1) warn( "| etc, etc. So DON'T DO IT. Use the -r or --raw-types option, for |", verbosity_level=-1) warn( "| example, to get the first, third and all BOLD scans, use: |", verbosity_level=-1) warn( "| --raw-types 1,3,BOLD |", verbosity_level=-1) warn( "+-------------------------------------------------------------------+", verbosity_level=-1) session['subject'] = subject series_list = arcget.getSeriesInfoBySession(session) if verbosity > 0: arcget.outputSessionDetails(session) if (args.legacy): info("Creating legacy XNAT zipfile") arcget.downloadSeriesToZipFile(session) else: info("Getting Data...") arcget.downloadSeriesToDir(session)
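The ~/.xnat_auth lookup above boils down to a few XPath text() queries; a small self-contained sketch follows, with a made-up auth document (alias 'central', dummy credentials) standing in for the real config file.

from lxml import etree

# Made-up ~/.xnat_auth document mirroring the structure the lookups above expect.
auth = etree.fromstring(
    '<xnat><central><url>https://central.xnat.org</url>'
    '<username>demo</username><password>not-a-real-password</password></central></xnat>')
host = auth.xpath('/xnat/central/url/text()')
username = auth.xpath('/xnat/central/username/text()')
print(host[0] if host else None, username[0] if username else None)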
from lxml import etree

text = '''
<div>
<url>
<li class="item-0"><a href="link1.html">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link1.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</url>
</div>
'''
html = etree.HTML(text)        # etree.HTML() builds an XPath-capable parse tree from the text
result = etree.tostring(html)  # tostring() outputs the corrected (well-formed) HTML
print(result.decode('utf-8'))

You can also parse a text file directly:

from lxml import etree
html = etree.parse('./test.html', etree.HTMLParser())
result = etree.tostring(html)
print(result.decode('utf-8'))

text = '''
<div>
<url>
<li class="item-0"><a href="link1.html">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link1.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</url>
</div>
'''
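A minimal follow-up sketch: once the sample above is parsed, XPath can pull values straight out of it; the expression below (the link text of every li) is just an example.

from lxml import etree

html = etree.HTML(text)
print(html.xpath('//li/a/text()'))  # ['first item', 'second item', 'third item', 'fourth item', 'fifth item']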
from lxml import etree
import sys
import csv

if len(sys.argv) != 3:
    print "Require tdd_file and output_file name parameters"
    sys.exit()

tdd = etree.parse(open(sys.argv[1]))
csvf = csv.writer(open(sys.argv[2], "wb"))
csvf.writerow(['code', 'value'])

root = tdd.getroot()
# Get the default namespace
tddns = root.nsmap[None]
# Create a namespace map using this
nsm = {'tdd': tddns}

# Select all the codes for the point_description_code attribute
for v in root.findall('.//tdd:attribute[@name="point_description_code"]/tdd:values/tdd:value', nsm):
    csvf.writerow([v.attrib['text'], v.attrib['description']])
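The nsmap[None] trick above is what makes findall() work against a document with a default namespace; a self-contained sketch with an invented urn:example:tdd document shows the same pattern.

from lxml import etree

# Toy document with a default namespace (the URI is invented for the example).
doc = etree.fromstring(
    '<tdd xmlns="urn:example:tdd">'
    '<attribute name="point_description_code"><values>'
    '<value text="A1" description="Alpha"/>'
    '</values></attribute></tdd>')
# ElementPath has no syntax for the default (None-prefixed) namespace,
# so bind it to an explicit prefix before calling findall().
nsm = {'tdd': doc.nsmap[None]}
for v in doc.findall('.//tdd:attribute[@name="point_description_code"]/tdd:values/tdd:value', nsm):
    print(v.get('text'), v.get('description'))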
from lxml import etree

parser = etree.HTMLParser()
tree = etree.parse("app.html", parser)

name_xpath_1 = '/html/body/div[1]/div[7]/div[4]/div[1]/div[2]/div[2]/div[2]/div/div[3]/text()'
name_xpath_2 = '/html/body/div[1]/div[7]/div[4]/div[1]/div[2]/div[1]/div[2]/div/div[3]/text()'

name_1 = tree.xpath(name_xpath_1)
name_2 = tree.xpath(name_xpath_2)

print(name_1)
print(type(name_1))
print(name_2)
print(type(name_2))
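For comparison, a relative, attribute-based expression is usually more robust than absolute positional paths like those above; the class name used here is purely illustrative.

from lxml import etree

parser = etree.HTMLParser()
tree = etree.parse("app.html", parser)
# Match on an attribute rather than on a fixed chain of div indexes.
names = tree.xpath('//div[contains(@class, "profile-name")]/text()')
print(names)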
def __init__(self, ontology_path): self._ontology_path = ontology_path self._ontology_xml = ET.parse(ontology_path).getroot()
def checkForPreconfiguredXML(self): ret = None xmlFilePath = os.path.join( self.unit.currentPath.replace( "%sharedPath%", django_settings.SHARED_DIRECTORY, 1) + "/", django_settings.PROCESSING_XML_FILE) if os.path.isfile(xmlFilePath): # For a list of items with pks: # SELECT TasksConfigs.description, choiceAvailableAtLink, ' ' AS 'SPACE', MicroServiceChains.description, chainAvailable FROM MicroServiceChainChoice Join MicroServiceChains on MicroServiceChainChoice.chainAvailable = MicroServiceChains.pk Join MicroServiceChainLinks on MicroServiceChainLinks.pk = MicroServiceChainChoice.choiceAvailableAtLink Join TasksConfigs on TasksConfigs.pk = MicroServiceChainLinks.currentTask ORDER BY choiceAvailableAtLink desc; try: this_choice_point = choice_unifier.get(self.jobChainLink.pk, self.jobChainLink.pk) tree = etree.parse(xmlFilePath) root = tree.getroot() for preconfiguredChoice in root.findall( ".//preconfiguredChoice"): if preconfiguredChoice.find( "appliesTo").text == this_choice_point: desiredChoice = preconfiguredChoice.find( "goToChain").text desiredChoice = choice_unifier.get( desiredChoice, desiredChoice) try: link = self.jobChainLink.workflow.get_link( this_choice_point) except KeyError: return for replacement in link.config["replacements"]: if replacement["id"] == desiredChoice: # In our JSON-encoded document, the items in # the replacements are not wrapped, do it here. # Needed by ReplacementDict. ret = self._format_items(replacement["items"]) break else: return try: # <delay unitAtime="yes">30</delay> delayXML = preconfiguredChoice.find("delay") unitAtimeXML = None if delayXML: unitAtimeXML = delayXML.get("unitCtime") if unitAtimeXML is not None and unitAtimeXML.lower( ) != "no": delaySeconds = int(delayXML.text) unitTime = os.path.getmtime( self.unit.currentPath.replace( "%sharedPath%", django_settings.SHARED_DIRECTORY, 1)) nowTime = time.time() timeDifference = nowTime - unitTime timeToGo = delaySeconds - timeDifference LOGGER.info('Time to go: %s', timeToGo) self.jobChainLink.setExitMessage( "Waiting till: " + datetime.datetime.fromtimestamp( (nowTime + timeToGo)).ctime()) rd = ReplacementDict(ret) if self.jobChainLink.passVar is not None: if isinstance(self.jobChainLink.passVar, ReplacementDict): new = {} new.update( self.jobChainLink.passVar.dic) new.update(rd.dic) rd.dic = new t = threading.Timer( timeToGo, self.jobChainLink.linkProcessingComplete, args=[0, rd], kwargs={}) t.daemon = True t.start() t2 = threading.Timer( timeToGo, self.jobChainLink.setExitMessage, args=[Job.STATUS_COMPLETED_SUCCESSFULLY], kwargs={}) t2.start() return waitingOnTimer except Exception: LOGGER.info('Error parsing XML', exc_info=True) except Exception: LOGGER.warning( 'Error parsing xml at %s for pre-configured choice', xmlFilePath, exc_info=True) return ret
if __name__ == "__main__": import_root = sys.argv[1] if not os.path.isdir(import_root): print "import path does not exist or is not a directory" sys.exit(1) export_root = sys.argv[2] if not os.path.isdir(export_root): print "export path does not exist or is not a directory" sys.exit(1) for xliff_path in glob.glob(import_root + "/*/firefox-ios.xliff"): print "Exporting", xliff_path with open(xliff_path) as fp: tree = etree.parse(fp) root = tree.getroot() # Make sure there are <file> nodes in this xliff file. file_nodes = root.xpath("//x:file", namespaces=NS) if len(file_nodes) == 0: print " ERROR: No translated files. Skipping." continue # Take the target language from the first <file>. Not sure if that # is a bug in the XLIFF, but in some files only the first node has # the target-language set. target_language = file_nodes[0].get('target-language') if not target_language: print " ERROR: Missing target-language. Skipping." continue
def _create_resource(self, package, output_files): """ Given a package, create an Atom resource entry to send to LOCKSS. Parses metadata for the Atom entry from the METS file, uses LOCKSS-o-matic-specific tags to describe size and checksums. """ # Parse METS to get information for atom entry relative_mets_path = os.path.join( os.path.splitext(os.path.basename(package.current_path))[0], "data", 'METS.{}.xml'.format(package.uuid)) (mets_path, temp_dir) = package.extract_file(relative_mets_path) mets = etree.parse(mets_path) # Delete temp dir if created if os.path.exists(temp_dir): shutil.rmtree(temp_dir) # Parse out name and description if found slug = str(package.uuid) title = os.path.basename(package.current_path) summary = 'AIP generated by Archivematica with uuid {}'.format( package.uuid) dublincore = mets.find( 'mets:dmdSec/mets:mdWrap[@MDTYPE="DC"]/mets:xmlData/dcterms:dublincore', namespaces=utils.NSMAP) if dublincore is not None: title = dublincore.findtext('dcterms:title', namespaces=utils.NSMAP, default=title) slug = dublincore.findtext('dcterms:title', namespaces=utils.NSMAP, default=slug) summary = dublincore.findtext('dcterms:description', namespaces=utils.NSMAP, default=summary) # Parse out Agent for author authors = mets.xpath( ".//mets:mdWrap[@MDTYPE='PREMIS:AGENT']//mets:agentType[text()='organization']/ancestor::mets:agent/*/mets:agentIdentifierValue", namespaces=utils.NSMAP) author = authors[0].text if authors else None # Create atom entry entry = sword2.Entry(title=title, id='urn:uuid:' + package.uuid, author={'name': author}, summary=summary) # Add each chunk to the atom entry if not self.pointer_root: self.pointer_root = etree.parse(package.full_pointer_file_path) entry.register_namespace('lom', utils.NSMAP['lom']) for index, file_path in enumerate(output_files): # Get external URL if len(output_files) == 1: external_url = self._download_url(package.uuid) else: external_url = self._download_url(package.uuid, index + 1) # Get checksum and size from pointer file (or generate if not found) file_e = self.pointer_root.find( ".//mets:fileGrp[@USE='LOCKSS chunk']/mets:file[@ID='{}']". format(os.path.basename(file_path)), namespaces=utils.NSMAP) if file_e is not None: checksum_name = file_e.get('CHECKSUMTYPE') checksum_value = file_e.get('CHECKSUM') size = int(file_e.get('SIZE')) else: # Not split, generate try: checksum = utils.generate_checksum(file_path, self.checksum_type) except ValueError: # Invalid checksum type checksum = utils.generate_checksum(file_path, 'md5') checksum_name = checksum.name.upper().replace('SHA', 'SHA-') checksum_value = checksum.hexdigest() size = os.path.getsize(file_path) # Convert size to kB size = str(math.ceil(size / 1000)) # Add new content entry and values entry.add_field('lom_content', external_url) content_entry = entry.entry[-1] content_entry.set('size', size) content_entry.set('checksumType', checksum_name) content_entry.set('checksumValue', checksum_value) LOGGER.debug('LOCKSS atom entry: %s', entry) return entry, slug
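The Dublin Core lookups above lean on findtext() returning a supplied default when the element is missing; a self-contained sketch (sample XML invented, dcterms URI is the standard Dublin Core terms namespace) shows the behaviour.

from lxml import etree

NSMAP = {'dcterms': 'http://purl.org/dc/terms/'}
doc = etree.fromstring(
    '<dublincore xmlns:dcterms="http://purl.org/dc/terms/">'
    '<dcterms:title>Example AIP</dcterms:title></dublincore>')
# findtext() returns the element text, or the supplied default when the element is absent.
title = doc.findtext('dcterms:title', default='Untitled', namespaces=NSMAP)
summary = doc.findtext('dcterms:description', default='No description', namespaces=NSMAP)
print(title, summary)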
def GetTransectTimesIDX(filename, code): doc2 = etree.parse(filename) distance_list = doc2.find('distance_list') old_stop = '' Start = 0 End = [] transect_IDX = [] transect_i = 1 # new_start = False start_time = [] # log_start = [] stop_time = [] # lat_start = [] # lon_start = [] # lat_stop = [] # lon_stop = [] for i in distance_list: start_time = i.get('start_time').replace(' ', 'T').replace('-', '').replace( ':', '') # log_start = i.get('log_start') stop_time = i.find('stop_time').text.replace(' ', 'T').replace( '-', '').replace(':', '') # lat_start = i.find('lat_start').text # lon_start = i.find('lon_start').text # lat_stop = i.find('lat_stop').text # lon_stop = i.find('lon_start').text # # return start_time,log_start,stop_time,lat_start,lat_stop,lon_start,lon_stop # print(start_time) # print(log_start,lon_stop) # if transect_i < 10: trnsID = '00' + str(transect_i) elif transect_i < 100: trnsID = '0' + str(transect_i) else: trnsID = str(transect_i) if old_stop != start_time: End = np.hstack((End, old_stop)) Start = np.hstack((Start, start_time)) transect_IDX = np.hstack((transect_IDX, code + '_' + trnsID)) transect_i = transect_i + 1 if Start == 0: Start = start_time transect_IDX = np.hstack((transect_IDX, code + '_' + trnsID)) transect_i = transect_i + 1 old_stop = stop_time #add last time End = np.hstack((End, stop_time)) TimeIDX = np.vstack((transect_IDX.T, Start[1:].T, End[1:].T)).T return TimeIDX
def update_package_status(self, package): """ Poll LOM for SWORD statement and update status from response. Query the state_iri for this package and parse it for the server states. If all are in agreement, add those URLs to the pointer file for each LOCKSS chunk. """ status = package.status # Need to have state and edit IRI to talk to LOM if 'state_iri' not in package.misc_attributes or 'edit_iri' not in package.misc_attributes: self.post_move_from_storage_service(None, None, package) # After retry - verify that state & edit IRI exist now if 'state_iri' not in package.misc_attributes or 'edit_iri' not in package.misc_attributes: return (None, _('Unable to contact Lockss-o-matic')) if not self.sword_connection and not self.update_service_document(): return (None, _('Error contacting LOCKSS-o-matic.')) # SWORD2 client has only experimental support for getting SWORD2 # statements, so implementing the fetch and parse here. (March 2014) response = self.sword_connection.get_resource( package.misc_attributes['state_iri'], headers={'Accept': 'application/atom+xml;type=feed'}) if response.code != 200: return (None, _('Error polling LOCKSS-o-matic for SWORD statement.')) statement_root = etree.fromstring(response.content) # TODO Check that number of lom:content entries is same as number of chunks # TODO what to do if was quorum, and now not?? # Package not safely stored, return immediately servers = statement_root.findall('.//lom:server', namespaces=utils.NSMAP) LOGGER.info('All states are agreement: %s', all(s.get('state') == 'agreement' for s in servers)) if not all(s.get('state') == 'agreement' for s in servers): # TODO update pointer file for new failed status? return (status, _('LOCKSS servers not in agreement')) status = Package.UPLOADED # Add LOCKSS URLs to each chunk if not self.pointer_root: self.pointer_root = etree.parse(package.full_pointer_file_path) files = self.pointer_root.findall( ".//mets:fileSec/mets:fileGrp[@USE='LOCKSS chunk']/mets:file", namespaces=utils.NSMAP) # If not files, find AIP fileGrp (package unsplit) if not files: files = self.pointer_root.findall( ".//mets:fileSec/mets:fileGrp[@USE='Archival Information Package']/mets:file", namespaces=utils.NSMAP) # Add new FLocat elements for each LOCKSS URL to each file element for index, file_e in enumerate(files): LOGGER.debug('file element: %s', etree.tostring(file_e, pretty_print=True)) if len(files) == 1: lom_id = self._download_url(package.uuid) else: lom_id = self._download_url(package.uuid, index + 1) LOGGER.debug('LOM id: %s', lom_id) lom_servers = statement_root.find( ".//lom:content[@id='{}']/lom:serverlist".format(lom_id), namespaces=utils.NSMAP) LOGGER.debug('lom_servers: %s', lom_servers) # Remove existing LOCKSS URLs, if they exist for old_url in file_e.findall("mets:FLocat[@LOCTYPE='URL']", namespaces=utils.NSMAP): file_e.remove(old_url) # Add URLs from SWORD statement for server in lom_servers: # TODO check that size and checksum are the same # TODO what to do if size & checksum different? 
LOGGER.debug('LOM URL: %s', server.get('src')) flocat = etree.SubElement(file_e, utils.PREFIX_NS['mets'] + 'FLocat', LOCTYPE="URL") flocat.set(utils.PREFIX_NS['xlink'] + 'href', server.get('src')) # Delete local files # Note: This will tell LOCKSS to stop harvesting, even if the file was # not split, and will not be deleted locally lom_content = statement_root.findall('.//lom:content', namespaces=utils.NSMAP) delete_lom_ids = [e.get('id') for e in lom_content] error = self._delete_update_lom(package, delete_lom_ids) if error is None: self._delete_files() LOGGER.info('update_package_status: new status: %s', status) # Write out pointer file again with open(package.full_pointer_file_path, 'w') as f: f.write( etree.tostring(self.pointer_root, pretty_print=True, xml_declaration=True, encoding='utf-8')) # Update value if different package.status = status package.save() return (status, error)
def __init__(self, file, test=None): parser = le.XMLParser(resolve_entities=False, huge_tree=True) w3scan = le.parse(file, parser) root = w3scan.getroot() dupes = {} for vulnerability in root.findall("vulnerability"): name = vulnerability.attrib["name"] severity = vulnerability.attrib["severity"] description = "%s are:\n\n" % vulnerability.find( "description").text.split("are:")[0] transactions = vulnerability.find("http-transactions") if transactions is not None: transactions = transactions.findall("http-transaction") for transaction in transactions: request = transaction.find("http-request") response = transaction.find("http-response") status = request.find("status").text.split(" ") response_code = response.find("status").text.split(" ")[1] http_method = status[0] request_url = status[1] data = "" for part in [request, response]: headers = [ f"{h.attrib['field']} -> {h.attrib['content']}" for h in part.find("headers").findall("header") ] headers = "\n".join(headers) request_body = part.find("body") if request_body.attrib['content-encoding'] == "base64": if request_body.text: request_body = base64.b64decode( request_body.text).decode("utf-8", errors="ignore") else: request_body = "" else: request_body = request_body.text if request_body.text else "" if not data: data = f"Request: {request_url} {http_method} {response_code} \n\n" else: data += "Response: \n" data += f"Headers: {headers}\n\nBody:{request_body}\n\n" dupe_url = urlparse(request_url) # Creating dupe path ned to think on more intelligent implementation dupe_path = dupe_url.path[:dupe_url.path.index( "%")] if "%" in dupe_url.path else dupe_url.path dupe_path = dupe_path[:dupe_path.index( "+")] if "+" in dupe_path else dupe_path dupe_path = dupe_path[:dupe_path.index( ".")] if "." in dupe_path else dupe_path dupe_path = dupe_path[:dupe_path.rindex( "/")] if "/" in dupe_path else dupe_path dupe_url = f"{dupe_url.scheme}://{dupe_url.netloc}{dupe_path}" dupe_code = f"{str(response_code)[0]}xx" dupe_key = hashlib.md5( f"{name} {dupe_url} {http_method} {dupe_code}".encode( 'utf-8')).hexdigest() if dupe_key not in dupes: dupes[dupe_key] = Finding( title=f"{name} {dupe_url} {dupe_code}", tool='W3AF', test=test, description=description, severity=severity, numerical_severity=Finding.get_numerical_severity( severity), references=data, dynamic_finding=True) elif data not in dupes[dupe_key].finding['references']: dupes[dupe_key].finding['references'] += data if request_url not in dupes[dupe_key].unsaved_endpoints: dupes[dupe_key].finding[ 'description'] += f"- {request_url}\n\n" dupes[dupe_key].unsaved_endpoints.append(request_url) self.items = dupes.values() print(len(self.items))
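The body handling above keys off a content-encoding attribute and base64-decodes when needed; a minimal sketch with an invented sample body illustrates the rule.

import base64
from lxml import etree

# Invented sample transaction body; "SGVsbG8gd29ybGQ=" is base64 for "Hello world".
body = etree.fromstring('<body content-encoding="base64">SGVsbG8gd29ybGQ=</body>')
if body.get('content-encoding') == 'base64':
    text = base64.b64decode(body.text).decode('utf-8', errors='ignore') if body.text else ''
else:
    text = body.text or ''
print(text)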
# parse from file
from lxml import etree
from io import StringIO, BytesIO

file_name = 'simple.html'
fhtml = open(file_name)

parser = etree.HTMLParser()
tree = etree.parse(fhtml, parser)

result = etree.tostring(tree.getroot(), pretty_print=True, method='html')
print(result)
def _split_package(self, package): """ Splits the package into chunks of size self.au_size. Returns list of paths to the chunks. If the package has already been split (and an event is in the pointer file), returns the list if file paths from the pointer file. Updates the pointer file with the new LOCKSS chunks, and adds 'division' event. """ # Parse pointer file if not self.pointer_root: self.pointer_root = etree.parse(package.full_pointer_file_path) # Check if file is already split, and if so just return split files if self.pointer_root.xpath('.//premis:eventType[text()="division"]', namespaces=utils.NSMAP): chunks = self.pointer_root.findall( ".//mets:div[@TYPE='Archival Information Package']/mets:div[@TYPE='LOCKSS chunk']", namespaces=utils.NSMAP) output_files = [ c.find('mets:fptr', namespaces=utils.NSMAP).get('FILEID') for c in chunks ] return output_files file_path = package.full_path expected_num_files = math.ceil( os.path.getsize(file_path) / self.au_size) LOGGER.debug('expected_num_files: %s', expected_num_files) # No split needed - just return the file path if expected_num_files <= 1: LOGGER.debug('Only one file expected, not splitting') output_files = [file_path] # No events or structMap changes needed LOGGER.info('LOCKSS: after splitting: %s', output_files) return output_files # Split file # Strip extension, add .tar-1 ('-1' to make rename script happy) output_path = os.path.splitext(file_path)[0] + '.tar-1' command = [ 'tar', '--create', '--multi-volume', '--tape-length', str(self.au_size), '--new-volume-script', 'common/tar_new_volume.sh', '-f', output_path, file_path ] # TODO reserve space in quota for extra files LOGGER.info('LOCKSS split command: %s', command) try: subprocess.check_call(command) except Exception: LOGGER.exception("Split of %s failed with command %s", file_path, command) raise output_path = output_path[:-2] # Remove '-1' dirname, basename = os.path.split(output_path) output_files = sorted([ os.path.join(dirname, entry) for entry in os.listdir(dirname) if entry.startswith(basename) ]) # Update pointer file amdsec = self.pointer_root.find('mets:amdSec', namespaces=utils.NSMAP) # Add 'division' PREMIS:EVENT try: event_detail = subprocess.check_output(['tar', '--version']) except subprocess.CalledProcessError as e: event_detail = e.output or _( 'Error: getting tool info; probably GNU tar') utils.mets_add_event( amdsec, event_type='division', event_detail=event_detail, event_outcome_detail_note='{} LOCKSS chunks created'.format( len(output_files)), ) # Update structMap & fileSec self.pointer_root.find('mets:structMap', namespaces=utils.NSMAP).set('TYPE', 'logical') aip_div = self.pointer_root.find( "mets:structMap/mets:div[@TYPE='Archival Information Package']", namespaces=utils.NSMAP) filesec = self.pointer_root.find('mets:fileSec', namespaces=utils.NSMAP) filegrp = etree.SubElement(filesec, utils.PREFIX_NS['mets'] + 'fileGrp', USE='LOCKSS chunk') # Move ftpr to Local copy div local_ftpr = aip_div.find('mets:fptr', namespaces=utils.NSMAP) if local_ftpr is not None: div = etree.SubElement(aip_div, utils.PREFIX_NS['mets'] + 'div', TYPE='Local copy') div.append(local_ftpr) # This moves local_fptr # Add each split chunk to structMap & fileSec for idx, out_path in enumerate(output_files): # Add div to structMap div = etree.SubElement(aip_div, utils.PREFIX_NS['mets'] + 'div', TYPE='LOCKSS chunk', ORDER=str(idx + 1)) etree.SubElement(div, utils.PREFIX_NS['mets'] + 'fptr', FILEID=os.path.basename(out_path)) # Get checksum and size for fileSec try: checksum = 
utils.generate_checksum(out_path, self.checksum_type) except ValueError: # Invalid checksum type checksum = utils.generate_checksum(out_path, 'md5') checksum_name = checksum.name.upper().replace('SHA', 'SHA-') size = os.path.getsize(out_path) # Add file & FLocat to fileSec file_e = etree.SubElement(filegrp, utils.PREFIX_NS['mets'] + 'file', ID=os.path.basename(out_path), SIZE=str(size), CHECKSUM=checksum.hexdigest(), CHECKSUMTYPE=checksum_name) flocat = etree.SubElement(file_e, utils.PREFIX_NS['mets'] + 'FLocat', OTHERLOCTYPE="SYSTEM", LOCTYPE="OTHER") flocat.set(utils.NSMAP['xlink'] + 'href', out_path) # Write out pointer file again with open(package.full_pointer_file_path, 'w') as f: f.write( etree.tostring(self.pointer_root, pretty_print=True, xml_declaration=True, encoding='utf-8')) return output_files
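The pointer-file updates above build namespaced METS elements and xlink attributes; here is a small standalone sketch of the same construction pattern, using the real METS and XLink namespace URIs but invented IDs, sizes, and paths.

from lxml import etree

NS = {'mets': 'http://www.loc.gov/METS/', 'xlink': 'http://www.w3.org/1999/xlink'}
filegrp = etree.Element('{%s}fileGrp' % NS['mets'], nsmap=NS, USE='LOCKSS chunk')
file_e = etree.SubElement(filegrp, '{%s}file' % NS['mets'],
                          ID='chunk-1.tar', SIZE='1024', CHECKSUMTYPE='MD5',
                          CHECKSUM='d41d8cd98f00b204e9800998ecf8427e')
flocat = etree.SubElement(file_e, '{%s}FLocat' % NS['mets'],
                          LOCTYPE='OTHER', OTHERLOCTYPE='SYSTEM')
flocat.set('{%s}href' % NS['xlink'], '/tmp/chunk-1.tar')
print(etree.tostring(filegrp, pretty_print=True).decode('utf-8'))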
def readXML(file): root = etree.parse("xml/"+file+".xml"); conf = [] site ={} searchParameters = [] searchTags =[] allReviewTags =[] reviewHeading =[] reviewText =[] reviewUpvote =[] reviewNextPage =[] reviewStarRating = [] tempDict = {} attr = {} prevTag ='' for element in root.getiterator(): #print element.tag, element.items(), element.text if(element.tag == "site") : if(site) : site['searchTags'] = searchTags site['allReviewTags'] = allReviewTags site['reviewHeading'] = reviewHeading site['reviewText'] = reviewText site['reviewUpvote'] = reviewUpvote site['reviewNextPage'] = reviewNextPage site['reviewStarRating'] = reviewStarRating searchTags =[] allReviewTags =[] reviewHeading =[] reviewText =[] reviewUpvote =[] reviewNextPage =[] reviewStarRating = [] tempDict = {} attr = {} conf.append(site) site = {} elif(element.tag == "name"): site['name'] = element.text.strip() elif(element.tag == "prefix"): site['prefix'] = '' if element.text: site['prefix'] = element.text.strip() elif(element.tag == "searchURL"): site['searchURL'] = element.text.strip() elif(element.tag == "searchParameters"): site['searchParameters'] = searchParameters #CHANGE HERE elif(element.tag == "searchTags"): prevTag = "searchTags" elif(element.tag == "reviewStarRating"): prevTag = "reviewStarRating" elif(element.tag == "allReviewTags"): prevTag = "allReviewTags" elif(element.tag == "reviewHeading"): prevTag = "reviewHeading" elif(element.tag == "reviewText"): prevTag = "reviewText" elif(element.tag == "reviewUpvote"): prevTag = "reviewUpvote" elif(element.tag == "reviewNextPage"): prevTag = "reviewNextPage" elif(element.tag == "filter"): tempDict = {} elif(element.tag == "attributes"): if(element.text): tempStr = element.text.strip().split('\'') attr[tempStr[1]] = tempStr[3] tempDict['attributes'] = attr attr ={} else : tempDict['attributes'] = {} elif(element.tag == "recursive"): if(element.text): if element.text == 'True': tempDict['recursive'] = True else: tempDict['recursive'] = False else : tempDict['recursive'] = True elif(element.tag == "tag"): if(element.text): tempDict['tag'] = element.text.strip() else : tempDict['tag'] = '' if(prevTag == "searchTags") : searchTags.append(tempDict) elif(prevTag == "allReviewTags") : allReviewTags.append(tempDict) elif(prevTag == "reviewHeading") : reviewHeading.append(tempDict) elif(prevTag == "reviewText") : reviewText.append(tempDict) elif(prevTag == "reviewUpvote") : reviewUpvote.append(tempDict) elif(prevTag == "reviewNextPage") : reviewNextPage.append(tempDict) elif(prevTag == "reviewStarRating") : reviewStarRating.append(tempDict) site['searchTags'] = searchTags site['allReviewTags'] = allReviewTags site['reviewHeading'] = reviewHeading site['reviewText'] = reviewText site['reviewUpvote'] = reviewUpvote site['reviewNextPage'] = reviewNextPage site['reviewStarRating'] = reviewStarRating conf.append(site) return conf
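A side note on traversal: getiterator() as used above still works but is deprecated in recent lxml releases; iter() walks the same document order, as this toy sketch (invented config document) shows.

from lxml import etree

root = etree.fromstring('<conf><site><name>example</name><prefix/></site></conf>')
for element in root.iter():
    print(element.tag, element.items(), (element.text or '').strip())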
def main(): module = AnsibleModule( argument_spec=dict( path=dict(type='path', aliases=['dest', 'file']), xmlstring=dict(type='str'), xpath=dict(type='str'), namespaces=dict(type='dict', default={}), state=dict(type='str', default='present', choices=['absent', 'present'], aliases=['ensure']), value=dict(type='raw'), attribute=dict(type='raw'), add_children=dict(type='list'), set_children=dict(type='list'), count=dict(type='bool', default=False), print_match=dict(type='bool', default=False), pretty_print=dict(type='bool', default=False), content=dict(type='str', choices=['attribute', 'text']), input_type=dict(type='str', default='yaml', choices=['xml', 'yaml']), backup=dict(type='bool', default=False), strip_cdata_tags=dict(type='bool', default=False), insertbefore=dict(type='bool', default=False), insertafter=dict(type='bool', default=False), ), supports_check_mode=True, required_by=dict( add_children=['xpath'], # TODO: Reinstate this in Ansible v2.12 when we have deprecated the incorrect use below # attribute=['value'], content=['xpath'], set_children=['xpath'], value=['xpath'], ), required_if=[ ['count', True, ['xpath']], ['print_match', True, ['xpath']], ['insertbefore', True, ['xpath']], ['insertafter', True, ['xpath']], ], required_one_of=[ ['path', 'xmlstring'], ['add_children', 'content', 'count', 'pretty_print', 'print_match', 'set_children', 'value'], ], mutually_exclusive=[ ['add_children', 'content', 'count', 'print_match', 'set_children', 'value'], ['path', 'xmlstring'], ['insertbefore', 'insertafter'], ], ) xml_file = module.params['path'] xml_string = module.params['xmlstring'] xpath = module.params['xpath'] namespaces = module.params['namespaces'] state = module.params['state'] value = json_dict_bytes_to_unicode(module.params['value']) attribute = module.params['attribute'] set_children = json_dict_bytes_to_unicode(module.params['set_children']) add_children = json_dict_bytes_to_unicode(module.params['add_children']) pretty_print = module.params['pretty_print'] content = module.params['content'] input_type = module.params['input_type'] print_match = module.params['print_match'] count = module.params['count'] backup = module.params['backup'] strip_cdata_tags = module.params['strip_cdata_tags'] insertbefore = module.params['insertbefore'] insertafter = module.params['insertafter'] # Check if we have lxml 2.3.0 or newer installed if not HAS_LXML: module.fail_json(msg=missing_required_lib("lxml"), exception=LXML_IMP_ERR) elif LooseVersion('.'.join(to_native(f) for f in etree.LXML_VERSION)) < LooseVersion('2.3.0'): module.fail_json(msg='The xml ansible module requires lxml 2.3.0 or newer installed on the managed machine') elif LooseVersion('.'.join(to_native(f) for f in etree.LXML_VERSION)) < LooseVersion('3.0.0'): module.warn('Using lxml version lower than 3.0.0 does not guarantee predictable element attribute order.') # Report wrongly used attribute parameter when using content=attribute # TODO: Remove this in Ansible v2.12 (and reinstate strict parameter test above) and remove the integration test example if content == 'attribute' and attribute is not None: module.deprecate("Parameter 'attribute=%s' is ignored when using 'content=attribute' only 'xpath' is used. Please remove entry." % attribute, '2.12') # Check if the file exists if xml_string: infile = BytesIO(to_bytes(xml_string, errors='surrogate_or_strict')) elif os.path.isfile(xml_file): infile = open(xml_file, 'rb') else: module.fail_json(msg="The target XML source '%s' does not exist." 
% xml_file) # Parse and evaluate xpath expression if xpath is not None: try: etree.XPath(xpath) except etree.XPathSyntaxError as e: module.fail_json(msg="Syntax error in xpath expression: %s (%s)" % (xpath, e)) except etree.XPathEvalError as e: module.fail_json(msg="Evaluation error in xpath expression: %s (%s)" % (xpath, e)) # Try to parse in the target XML file try: parser = etree.XMLParser(remove_blank_text=pretty_print, strip_cdata=strip_cdata_tags) doc = etree.parse(infile, parser) except etree.XMLSyntaxError as e: module.fail_json(msg="Error while parsing document: %s (%s)" % (xml_file or 'xml_string', e)) # Ensure we have the original copy to compare global orig_doc orig_doc = copy.deepcopy(doc) if print_match: do_print_match(module, doc, xpath, namespaces) if count: count_nodes(module, doc, xpath, namespaces) if content == 'attribute': get_element_attr(module, doc, xpath, namespaces) elif content == 'text': get_element_text(module, doc, xpath, namespaces) # File exists: if state == 'absent': # - absent: delete xpath target delete_xpath_target(module, doc, xpath, namespaces) # - present: carry on # children && value both set?: should have already aborted by now # add_children && set_children both set?: should have already aborted by now # set_children set? if set_children: set_target_children(module, doc, xpath, namespaces, set_children, input_type) # add_children set? if add_children: add_target_children(module, doc, xpath, namespaces, add_children, input_type, insertbefore, insertafter) # No?: Carry on # Is the xpath target an attribute selector? if value is not None: set_target(module, doc, xpath, namespaces, attribute, value) # If an xpath was provided, we need to do something with the data if xpath is not None: ensure_xpath_exists(module, doc, xpath, namespaces) # Otherwise only reformat the xml data? if pretty_print: make_pretty(module, doc) module.fail_json(msg="Don't know what to do")