Exemplo n.º 1
0
 def _split_configuration(self, projectfile, temp_dir):
     """Split a ScanTailor project file into partial project files.

     The ``./files/file`` entries of ``projectfile`` are divided into at
     most ``multiprocessing.cpu_count()`` contiguous slices.  For every
     slice the project is parsed afresh, the children of the ``files``,
     ``images``, ``pages`` and ``file-name-disambiguation`` elements that
     lie outside the slice are removed, and the result is written to
     ``<temp_dir>/<project basename>-<idx>.ScanTailor``.

     Slices that would contain no files (fewer files than CPU cores, or no
     files at all) are skipped, so no empty project files are written.

     :param projectfile: path of the project file to split
     :param temp_dir:    directory that receives the partial project files
     :returns:           list with the paths of the written partial files,
                         in slice order
     """
     num_pieces = multiprocessing.cpu_count()
     num_files = len(ET(file=projectfile).findall('./files/file'))
     files_per_job = int(math.ceil(float(num_files) / num_pieces))
     base_name = os.path.splitext(os.path.basename(projectfile))[0]
     sections = ('files', 'images', 'pages', 'file-name-disambiguation')

     splitfiles = []
     for idx in range(num_pieces):
         start = idx * files_per_job
         if start >= num_files:
             # Every file is already assigned; the remaining slices would
             # only yield empty projects.
             break
         end = start + files_per_job

         # Re-parse for every slice because the tree is pruned in place.
         tree = ET(file=projectfile)
         root = tree.getroot()
         for elem in sections:
             elem_root = root.find(elem)
             if elem_root is None:
                 # Section absent from this project; nothing to prune.
                 continue
             children = list(elem_root)
             # Slicing past the end is safe, so the last (shorter) slice
             # needs no special casing.
             for node in children[:start] + children[end:]:
                 elem_root.remove(node)

         out_file = os.path.join(
             temp_dir, "{0}-{1}.ScanTailor".format(base_name, idx))
         tree.write(out_file)
         splitfiles.append(out_file)
     return splitfiles
Exemplo n.º 2
0
 def dump(self, stream):
     """Write ``self.xml`` to ``stream`` preceded by one XML declaration.

     If ``self.prettyprint`` is true, ``self.indent`` is applied to the
     element tree first.  The declaration and the document are both
     encoded with ``self.encoding``.

     :param stream: writable binary file-like object
     """
     if self.prettyprint:
         self.indent(self.xml)
     document = ET(self.xml)
     header = '<?xml version="1.0" encoding="%s"?>' % self.encoding
     stream.write(header.encode(self.encoding))
     # The declaration has just been written by hand.  Without
     # xml_declaration=False, ElementTree.write() emits its own declaration
     # for every encoding other than UTF-8/US-ASCII, which yields two
     # declarations and a document that is not well-formed.
     document.write(stream, encoding=self.encoding, xml_declaration=False)
Exemplo n.º 3
0
`assin-eval.py` script.
"""

import argparse
from xml.etree.cElementTree import ElementTree as ET
import numpy as np
from collections import Counter

if __name__ == '__main__':
    # Command line: training XML, test XML and the path of the tagged output.
    cli = argparse.ArgumentParser(description=__doc__)
    for arg_name, arg_help in (
            ('train', 'XML file with training data'),
            ('test', 'XML file with test data'),
            ('output', 'Output tagged XML file')):
        cli.add_argument(arg_name, help=arg_help)
    args = cli.parse_args()

    tree = ET()

    # Statistics taken from the training pairs: the mean of the
    # 'similarity' attribute and the most frequent 'entailment' value.
    train_pairs = list(tree.parse(args.train))
    similarity_avg = np.array(
        [float(item.get('similarity')) for item in train_pairs]).mean()
    label_counts = Counter(item.get('entailment') for item in train_pairs)
    majority_entailment = label_counts.most_common(1)[0][0]

    # Every test pair receives the same two constant values.
    avg_text = str(similarity_avg)
    for pair in tree.parse(args.test):
        pair.set('similarity', avg_text)
        pair.set('entailment', majority_entailment)

    # The tree now holds the test document, which is what gets written.
    tree.write(args.output, 'utf-8')
Exemplo n.º 4
0
 def __call__(self, stream):
     """Parse ``stream`` and lazily yield one graph per ``graph`` element.

     The parsed tree is stored on ``self.xml``.  Keys and defaults come
     from ``self.find_graphml_keys``; every element matched by
     ``{<NS_GRAPHML>}graph`` is passed to ``self.make_graph`` and the
     result is yielded.  Being a generator, nothing is parsed until
     iteration starts.
     """
     self.xml = ET(file=stream)
     key_info = self.find_graphml_keys(self.xml)
     keys, defaults = key_info
     graph_tag = "{%s}graph" % self.NS_GRAPHML
     graph_elements = self.xml.findall(graph_tag)
     for graph_element in graph_elements:
         yield self.make_graph(graph_element, keys, defaults)
Exemplo n.º 5
0
# convenience script for interactive use:
# myougiden $ python3.2
# >>> from qjm import *

from myougiden import config
from xml.etree.cElementTree import ElementTree as ET
import gzip

et = ET()
# Parse the gzip file named by config['paths']['jmdictgz'] and keep the root
# element in `jm`.  The handle is closed as soon as parsing finishes instead
# of staying open for the rest of the interactive session; the leading
# underscore keeps it out of `from qjm import *`.
with gzip.open(config['paths']['jmdictgz'], 'r') as _jmdict_file:
    jm = et.parse(_jmdict_file)

from xml.etree.cElementTree import tostring


def tos(element):
    """Return ``tostring(element)`` decoded from bytes to ``str``."""
    serialized = tostring(element)
    return serialized.decode()