def _split_configuration(self, projectfile, temp_dir):
    """Split a ScanTailor project file into one partial project per CPU core.

    The project is re-parsed once per piece; in each copy, the children of
    the ``files``, ``images``, ``pages`` and ``file-name-disambiguation``
    sections are pruned down to the index range assigned to that piece, and
    the result is written to ``<temp_dir>/<project basename>-<idx>.ScanTailor``.

    :param projectfile: Path of the project file to split.
    :param temp_dir:    Directory that receives the partial project files.
    :returns:           Paths of the written files, one per CPU core, in
                        piece order.

    Note that the number of pieces always equals the CPU count: when there
    are fewer ``files/file`` entries than cores, the trailing pieces are
    still written but their sections end up without children.
    """
    num_pieces = multiprocessing.cpu_count()
    num_files = len(ET(file=projectfile).findall('./files/file'))
    files_per_job = int(math.ceil(float(num_files) / num_pieces))
    base_name = os.path.splitext(os.path.basename(projectfile))[0]

    splitfiles = []
    for idx in range(num_pieces):
        # Every piece is pruned from a freshly parsed, unmodified tree.
        tree = ET(file=projectfile)
        root = tree.getroot()
        start = idx * files_per_job
        end = start + files_per_job
        if end > num_files:
            # The piece that would overshoot keeps everything up to the end
            # of each section (sections may hold more entries than 'files').
            end = None
        for elem in ('files', 'images', 'pages', 'file-name-disambiguation'):
            elem_root = root.find(elem)
            if elem_root is None:
                # Section absent from this project: nothing to prune.
                continue
            # list(element) replaces Element.getchildren(), which no longer
            # exists on Python 3.9+; pruning by position also avoids a
            # quadratic "is this node in the kept list" scan.
            children = list(elem_root)
            for node in children[:start]:
                elem_root.remove(node)
            if end is not None:
                for node in children[end:]:
                    elem_root.remove(node)
        out_file = os.path.join(
            temp_dir, "{0}-{1}.ScanTailor".format(base_name, idx))
        tree.write(out_file)
        splitfiles.append(out_file)
    return splitfiles
def dump(self, stream):
    """Write ``self.xml`` to *stream* as an encoded XML document.

    When ``self.prettyprint`` is true, ``self.indent`` is first applied to
    the tree (modifying it in place). The output is an XML declaration
    naming ``self.encoding`` followed directly by the serialized tree, both
    encoded with ``self.encoding``.

    :param stream: Writable binary file-like object receiving the bytes.
    """
    if self.prettyprint:
        self.indent(self.xml)
    document = ET(self.xml)
    header = '<?xml version="1.0" encoding="%s"?>' % self.encoding
    stream.write(header.encode(self.encoding))
    # The declaration was just written by hand. Left at its default,
    # ElementTree.write() adds its own declaration for every encoding other
    # than UTF-8/US-ASCII, which would yield two declarations and therefore
    # a malformed document; suppress it explicitly.
    document.write(stream, encoding=self.encoding, xml_declaration=False)
`assin-eval.py` script. """ import argparse from xml.etree.cElementTree import ElementTree as ET import numpy as np from collections import Counter if __name__ == '__main__': parser = argparse.ArgumentParser(description=__doc__) parser.add_argument('train', help='XML file with training data') parser.add_argument('test', help='XML file with test data') parser.add_argument('output', help='Output tagged XML file') args = parser.parse_args() tree = ET() root_train = tree.parse(args.train) similarities_train = np.array( [float(pair.get('similarity')) for pair in root_train]) similarity_avg = similarities_train.mean() entailments_train = [pair.get('entailment') for pair in root_train] entailment_counter = Counter(entailments_train) majority_entailment, _ = entailment_counter.most_common(1)[0] root_test = tree.parse(args.test) for pair in root_test: pair.set('similarity', str(similarity_avg)) pair.set('entailment', majority_entailment) tree.write(args.output, 'utf-8')
def __call__(self, stream):
    """Parse XML from *stream* and lazily yield one graph per graph element.

    This is a generator: nothing is read until iteration starts. On the
    first step the parsed document is stored on ``self.xml`` and
    ``self.find_graphml_keys`` is consulted once; afterwards each ``graph``
    element (in the ``self.NS_GRAPHML`` namespace) matched by ``findall`` on
    the document is handed to ``self.make_graph`` and the result is yielded.

    :param stream: File name or file object accepted by ``ElementTree``.
    """
    self.xml = ET(file=stream)
    key_info, default_info = self.find_graphml_keys(self.xml)
    graph_tag = "{%s}graph" % self.NS_GRAPHML
    graph_elements = self.xml.findall(graph_tag)
    for graph_element in graph_elements:
        yield self.make_graph(graph_element, key_info, default_info)
# Convenience script for interactive use:
#   myougiden $ python3.2
#   >>> from qjm import *
import gzip

try:
    from xml.etree.cElementTree import ElementTree as ET, tostring
except ImportError:
    # xml.etree.cElementTree no longer exists on Python 3.9+; the plain
    # module provides the same names.
    from xml.etree.ElementTree import ElementTree as ET, tostring

from myougiden import config

et = ET()
# parse() reads the whole document, so the gzip handle can be closed as soon
# as it returns instead of being left open for the rest of the session.
with gzip.open(config['paths']['jmdictgz'], 'r') as _jmdict_file:
    jm = et.parse(_jmdict_file)


def tos(element):
    """Return *element* serialized to XML, decoded to a ``str``."""
    return tostring(element).decode()