예제 #1
0
 def test_parse(self):
     """Smoke-test pulldom.parse() with a filename and with a file object."""
     # Filename source: register a cleanup that closes the stream held by
     # the returned event stream once the test is over.
     path_events = pulldom.parse(tstfile)
     self.addCleanup(path_events.stream.close)
     for _ in path_events:
         pass
     # File-object source: the ``with`` block closes the handle we opened.
     with open(tstfile, 'rb') as binary_source:
         for _ in pulldom.parse(binary_source):
             pass
예제 #2
0
 def read(self):
     """Yield an ``Entry`` for every ``<entry>`` element of ``self.filename``.

     The document is pulled incrementally; only the ``<entry>`` subtree that
     is currently being handed out is expanded into a DOM node.

     :return: generator of ``Entry`` objects, in document order.
     """
     # Parse the file this instance was configured with. A second parse of
     # sys.argv[1] used to replace this stream immediately, which leaked the
     # first file handle and silently ignored ``self.filename``.
     events = pulldom.parse(self.filename)
     for event, node in events:
         if event == pulldom.START_ELEMENT and node.tagName == "entry":
             # Pull the rest of the subtree so Entry sees a complete node.
             events.expandNode(node)
             yield Entry(node)
예제 #3
0
    def test_parse(self):
        """Smoke-test pulldom.parse() on a filename and on an open file."""

        # Only checks that a stream can be parsed end to end; parser
        # semantics are tested using parseString with a more focused XML
        # fragment.

        # Source given as a filename: close the stream held by the event
        # stream once the test is over.
        path_events = pulldom.parse(tstfile)
        self.addCleanup(path_events.stream.close)
        for _ in path_events:
            pass

        # Source given as an already-open binary file object.
        with open(tstfile, "rb") as binary_source:
            for _ in pulldom.parse(binary_source):
                pass
예제 #4
0
    def test_parse(self):
        """Check that pulldom.parse() can consume a whole stream."""

        # This is a stream-level check only; parser semantics are tested
        # using parseString with a more focused XML fragment.

        # Case 1 - a filename. The event stream keeps a stream object, so
        # schedule it to be closed when the test finishes.
        events_from_path = pulldom.parse(tstfile)
        self.addCleanup(events_from_path.stream.close)
        for _ in events_from_path:
            pass

        # Case 2 - a file object opened in binary mode by the test itself.
        with open(tstfile, "rb") as opened:
            for _ in pulldom.parse(opened):
                pass
예제 #5
0
파일: pullParse.py 프로젝트: roolin/xmlbd
def PullParse(fileName, katalog, baseName):
    """Stream an XML file into an ``XmlDB`` as numbered node tuples.

    A running counter is advanced on every element start, element end and
    accepted text node. Each element is written when it closes, with the
    counter value at its start, the counter value at its end and the start
    number of the element that encloses it; text nodes are written
    immediately.

    :param fileName: XML source handed to ``pulldom.parse``.
    :param katalog: directory part of the database location.
    :param baseName: name part of the database location.
    :return: the opened ``XmlDB`` with all tuples added.
    """
    db = XmlDB(katalog + "/" + baseName)
    db.Open()
    events = pulldom.parse(fileName)
    counter = 1
    # Synthetic "doc" entry numbered 0 sits below every real element.
    open_nodes = [XmlNodeTuple(0, type="elem", data="doc")]
    for event, node in events:
        if event == "START_ELEMENT":
            open_nodes.append(
                XmlNodeTuple(counter, type="elem", data=node.tagName))
            counter += 1
        elif event == "END_ELEMENT":
            closed = open_nodes.pop()
            parent = open_nodes[-1].pre if open_nodes else 0
            db.AddTuple(closed.pre, counter, parent, closed.type, closed.data)
            counter += 1
        elif event == "CHARACTERS" and node.data not in ("\n", "  "):
            # Text takes two numbers; its parent is the previous number.
            db.AddTuple(counter, counter + 1, counter - 1, "text", node.data)
            counter += 2
    # Whatever is left on top of the stack is written with parent -1.
    remaining = open_nodes.pop()
    db.AddTuple(remaining.pre, counter, -1, remaining.type, remaining.data)
    return db
예제 #6
0
def process_file(path):
    """Scan a bz2-compressed XML dump and report pages matching the filter.

    For every ``<page>`` element the module-level ``page_count`` is
    incremented (a progress line is printed every 1000 pages). Pages whose
    ``<ns>`` text is ``'0'`` and whose text does not contain ``#redirect``
    are inspected: if the lower-cased text contains ``'living people'``,
    ``blp_count`` is incremented, and if it additionally lacks ``'ref'``,
    ``found`` is incremented and the title is printed.

    :param path: path of the ``.bz2`` file to read.
    """
    global page_count, blp_count, found
    # Keep the decompressed stream in a ``with`` block so it is closed when
    # the scan ends or fails; it used to be left open.
    with bz2.open(path) as dump:
        doc = pulldom.parse(dump)
        for event, node in doc:
            if event != pulldom.START_ELEMENT or node.tagName != 'page':
                continue
            page_count += 1
            if page_count % 1000 == 0:
                dt = datetime.now() - t0
                print(f'Done with {humanize.intcomma(page_count)} pages, {humanize.intcomma(blp_count)} blps, found {found} in {dt}')
            # Materialise the whole <page> subtree before querying it.
            doc.expandNode(node)
            ns = node.getElementsByTagName('ns')[0].childNodes[0].nodeValue
            title = node.getElementsByTagName('title')[0].childNodes[0].nodeValue
            if ns != '0':
                continue

            text_nodes = node.getElementsByTagName('text')[0].childNodes
            content = ' '.join(part.nodeValue for part in text_nodes).lower()
            if '#redirect' in content:
                continue

            if 'living people' in content:
                blp_count += 1
                if 'ref' not in content:
                    found += 1
                    print('Found:', title)
예제 #7
0
def read_xml_or_gz_file(input_file_path):
    """
    Open an ``.xml`` or gzip-compressed ``.gz`` file as a pulldom event stream.

    The file extension alone decides how the file is opened. For any other
    extension an error is printed and the process exits with status 1.

    :param input_file_path: path of the file to parse
    :return: the event stream returned by ``pulldom.parse``
    """
    _, extension = os.path.splitext(input_file_path)
    if extension == '.gz':
        # Decompress on the fly and let pulldom read from the gzip stream.
        return pulldom.parse(gzip.open(input_file_path))
    if extension == '.xml':
        return pulldom.parse(input_file_path)
    print('Error: invalid file extension')
    sys.exit(1)
예제 #8
0
 def __init__(self, gmlFile):
     """Open ``gmlFile`` and prepare a pull-parse over its contents.

     The file is opened in binary mode and kept on ``self.gml``; its size in
     bytes is stored as a float on ``self._filesize``; the pulldom event
     stream over the handle is stored on ``self.events``.

     :param gmlFile: path of the file to read.
     """
     self.gmlFile = gmlFile
     handle = open(gmlFile, 'rb')
     self.gml = handle
     size_in_bytes = os.fstat(handle.fileno()).st_size
     self._filesize = float(size_in_bytes)
     self.events = pulldom.parse(handle)
     self.numFeatures = 0
예제 #9
0
    def process(self, ctx, m):
        """Split an XML file by tag and yield one message per matching element.

        The file named by the interpolated ``self.path`` is read in pull
        mode. Every element whose tag equals ``self.tagname`` is expanded,
        serialised, wrapped in a ``<root>`` element and re-parsed with a
        recovering lxml parser; the resulting tree is stored under ``'xml'``
        in a copy of ``m``.

        :param ctx: processing context (provides ``interpolate`` and
            ``copy_message``).
        :param m: incoming message, copied for each emitted element.
        :return: generator of message copies carrying the parsed fragment.
        """

        path = ctx.interpolate(ctx, self.path)
        logger.debug("Reading XML in pull mode (splitting by tag '%s'): %s" % (self.tagname, path))

        # Binary mode: the XML parser then decodes the bytes itself using the
        # document's own encoding declaration instead of the locale default.
        with open(path, "rb") as xmlfile:

            doc = pulldom.parse(xmlfile)
            for event, node in doc:
                if event != pulldom.START_ELEMENT or node.tagName != self.tagname:
                    continue
                doc.expandNode(node)

                m2 = ctx.copy_message(m)
                # Build the wrapper entirely from bytes; mixing str literals
                # with the encoded payload raises TypeError on Python 3.
                xmltext = b"<root>" + node.toxml().encode('utf-8') + b"</root>"
                parser = etree.XMLParser(recover=True, encoding="utf-8")
                xml = etree.fromstring(xmltext, parser=parser)

                # Drop everything up to the first colon of each tag name.
                for elem in xml.iter():
                    if ":" in elem.tag:
                        elem.tag = ":".join(elem.tag.split(":")[1:])

                m2['xml'] = xml

                yield m2
예제 #10
0
def handle_children(xmlfile, handle_parsenode):
    """Feed every direct child of the root element to ``handle_parsenode``.

    :param xmlfile: filename or stream accepted by ``pulldom.parse``.
    :param handle_parsenode: callable invoked with each fully expanded
        child element of the root.
    :return: ``(root_open, root_close)`` - the root's opening tag (with
        ``edges_file.xsd`` replaced by ``edgediff_file.xsd``) and its closing
        tag, each terminated by a newline. Both are ``None`` if no element
        was seen.
    """
    opening_tag = None
    closing_tag = None
    depth = 0
    stream = pulldom.parse(xmlfile)
    for kind, element in stream:
        if kind == pulldom.END_ELEMENT:
            depth -= 1
            continue
        if kind != pulldom.START_ELEMENT:
            continue
        if depth == 0:
            # The root is not expanded, so it serialises as an empty element
            # ending in "/>\n"; swap that ending for ">\n".
            serialized = element.toprettyxml(indent="")
            opening_tag = serialized[:-3] + ">\n"
            # change the schema for edge diffs
            opening_tag = opening_tag.replace("edges_file.xsd",
                                              "edgediff_file.xsd")
            closing_tag = "</%s>\n" % element.localName
        if depth == 1:
            # expandNode consumes the matching END_ELEMENT, so the depth
            # must not be increased here.
            stream.expandNode(element)
            handle_parsenode(element)
        else:
            depth += 1
    return opening_tag, closing_tag
예제 #11
0
def analyze_document(filename: str = "data/dblp.xml",
                     expected_event_count: T.Optional[int] = 248393285):
    """
    Collect the distinct ``type`` attribute values of selected dblp elements.

    Streams the document and, for every ``note``, ``ee``, ``url`` and
    ``isbn`` start tag that carries a ``type`` attribute, adds the value to
    the matching module-level set (``note_types``, ``ee_types``,
    ``url_types``, ``isbn_types``). The four sets are printed at the end.

    Side effect: changes the process working directory to the directory
    containing ``filename``.

    :param filename: path of the XML file to analyse
    :param expected_event_count: total passed to ``tqdm`` for the progress
        bar; ``None`` lets tqdm run without a known total
    :return: None
    """
    # Resolve the path *before* changing directory. A relative path such as
    # the default "data/dblp.xml" would otherwise be looked up relative to
    # the new working directory, and a bare filename would make chdir("")
    # fail.
    filename = os.path.abspath(filename)
    os.chdir(
        os.path.dirname(filename)
    )  # so that relative reference to dtd file can be read by XML parser
    doc = pulldom.parse(filename, parser=parser, bufsize=2**14)

    # Tag name -> set that collects its ``type`` attribute values.
    type_sets = {
        "note": note_types,
        "ee": ee_types,
        "url": url_types,
        "isbn": isbn_types,
    }

    for event, node in tqdm(doc, total=expected_event_count):
        if event != pulldom.START_ELEMENT:
            continue
        collected = type_sets.get(node.tagName)
        # ``is not None`` because an empty set is falsy but still a target.
        if collected is not None and node.hasAttribute("type"):
            collected.add(node.getAttribute("type"))

    for s in [note_types, ee_types, url_types, isbn_types]:
        print(list(s))
예제 #12
0
def parse(xmlfile, element_names, element_attrs={}, attr_conversions={}):
    """
    Parses the given element_names from xmlfile and yield compound objects for
    their xml subtrees (no extra objects are returned if element_names appear in
    the subtree) The compound objects provide all element attributes of
    the root of the subtree as attributes unless element_attrs are supplied. In
    this case element_attrs maps element names to a list of attributes which
    are supplied. If attr_conversions is not empty it must map attribute names
    to callables which will be called upon the attribute value before storing
    under the attribute name.
    The compound objects gives dictionary style access to list of compound
    objects o for any children with the given element name
    o['child_element_name'] = [osub0, osub1, ...]
    As a shorthand, attribute style access to the list of child elements is
    provided unless an attribute with the same name as the child elements
    exists (i.e. o.child_element_name = [osub0, osub1, ...])
    @Note: All elements with the same name must have the same type regardless of
    the subtree in which they occur
    @Note: Attribute names may be modified to avoid name clashes
    with python keywords.
    @Note: The element_names may be either a single string or a list of strings.
    @Example: parse('plain.edg.xml', ['edge'])
    """
    # A bare string would turn the membership test below into a substring
    # test (e.g. element "e" matching "edge"), so wrap it in a list.
    if isinstance(element_names, str):
        element_names = [element_names]
    # NOTE: the dict defaults are only passed through, never mutated here.
    elementTypes = {}
    xml_doc = pulldom.parse(xmlfile)
    for event, parsenode in xml_doc:
        if event == pulldom.START_ELEMENT and parsenode.localName in element_names:
            # Pull the complete subtree before converting it.
            xml_doc.expandNode(parsenode)
            yield _get_compound_object(parsenode, elementTypes,
                                       parsenode.localName, element_attrs,
                                       attr_conversions)
예제 #13
0
파일: parser.py 프로젝트: janbrohl/PyXML
def parse(source):
    """parse(source) : Schema
    Parses the XML from the input stream and returns a ``relaxng.Schema``
    wrapping the pattern tree found inside ``<grammar><start>``.

    ``<define>`` sections following ``<start>`` are parsed into
    ``relaxng.Element`` objects keyed by the value of their ``name``
    attribute; ``Ref`` instances reachable through ``p1``/``p2`` attributes
    are then replaced by the element they name.

    Raises RuntimeError if an unexpected event follows the start section,
    and KeyError if a ``Ref`` names an unknown definition.
    """
    stream = util.DOMTokenStream(pulldom.parse(source))
    element_map = {}

    # Process the document prologue and the first two elements
    assert stream.next()[0] == pulldom.START_DOCUMENT
    stream.expect_element('grammar')
    stream.expect_element('start')

    # Parse the main pattern body
    pattern = parse_top(stream)

    stream.expect_end_element('start')

    # Process definition section
    while True:
        event, node = stream.get_next_event()
        if event == pulldom.END_ELEMENT and node.localName == 'grammar':
            # We're done
            break
        elif event == pulldom.START_ELEMENT and node.localName == 'define':
            # Parse definition
            ncname = node.getAttributeNS(RNG.BASE, 'name')
            stream.expect_element('element')
            nc = parse_nameclass(stream)
            # Use a separate name: assigning to ``pattern`` here used to
            # overwrite the start pattern, so the schema returned below was
            # built from the last definition's body instead.
            definition = parse_top(stream)
            stream.expect_end_element('element')
            stream.expect_end_element('define')
            element_map[ncname] = relaxng.Element(nc=nc,
                                                  p1=definition)
        else:
            # Call form of raise is valid on both Python 2 and Python 3.
            raise RuntimeError('Unexpected event: %r, %r' % (event, node))

    # Loop through all the patterns, replacing Ref instances
    # with the corresponding Element instance
    # XXX does this always terminate, given that there can be
    # cycles of Elements?
    # XXX on the other hand, does this cover every single pattern
    # node that could contain a Ref instance?
    # list() because dict.values() is a view on Python 3 and cannot be
    # concatenated to a list.
    queue = [pattern] + list(element_map.values())
    while queue:
        head = queue.pop()
        if hasattr(head, 'p1'):
            if isinstance(head.p1, Ref):
                head.p1 = element_map[head.p1.ref_name]
            else:
                queue.append(head.p1)

        if hasattr(head, 'p2'):
            if isinstance(head.p2, Ref):
                head.p2 = element_map[head.p2.ref_name]
            else:
                queue.append(head.p2)

    return relaxng.Schema(pattern)
예제 #14
0
def parse(xmlfile, element_names, element_attrs={}, attr_conversions={}):
    """
    Yield one compound object per matching element subtree of xmlfile.

    Every element whose local name is listed in element_names is expanded
    into a full subtree and converted with _get_compound_object; elements of
    the same names nested inside such a subtree do not produce additional
    objects.

    element_attrs and attr_conversions are forwarded unchanged to the
    conversion helper. As described by the original documentation:
    element_attrs maps element names to the list of attributes to expose
    (all attributes when absent), and attr_conversions maps attribute names
    to callables applied to the attribute value before it is stored.
    The compound objects give dictionary style access to the list of
    compound objects for any children with a given element name
    (o['child_element_name'] = [osub0, osub1, ...]) and, unless an attribute
    of the same name exists, attribute style access as well
    (o.child_element_name = [osub0, osub1, ...]).
    @Note: All elements with the same name must have the same type regardless
    of the subtree in which they occur
    @Note: Attribute names may be modified to avoid name clashes
    with python keywords.
    @Note: The element_names may be either a single string or a list of strings.
    @Example: parse('plain.edg.xml', ['edge'])
    """
    wanted = [element_names] if isinstance(element_names, str) else element_names
    type_cache = {}
    stream = pulldom.parse(xmlfile)
    for kind, element in stream:
        if kind != pulldom.START_ELEMENT:
            continue
        if element.localName not in wanted:
            continue
        # Pull the whole subtree so the helper sees a complete DOM node.
        stream.expandNode(element)
        yield _get_compound_object(element, type_cache, element.localName,
                                   element_attrs, attr_conversions)
예제 #15
0
def extract(input_xml):
    """Process entire input XML document, firing on events.

    Walks the pull-parser event stream once and builds an output string:

    * start tags of elements listed in ``ignore`` are skipped;
    * comments are copied intact;
    * elements in ``inlineEmpty`` are emitted as serialised by ``toxml()``;
    * elements in ``inlineContent`` get their start tag (with
      ``regexEmptyTag`` replaced by ``'>'``) and a matching end tag;
    * elements in ``blockElement`` get bare start/end tags on their own
      lines;
    * character data is passed through ``normalizeSpace``.

    :param input_xml: filename or stream accepted by ``pulldom.parse``.
    :return: the assembled output string.
    """
    # Start pulling; it continues automatically
    doc = pulldom.parse(input_xml)
    # Collect fragments and join once instead of repeated string +=.
    pieces = []
    for event, node in doc:
        if event == pulldom.START_ELEMENT:
            name = node.localName
            # elements to ignore: xml
            if name in ignore:
                continue
            # empty inline elements: pb, milestone
            if name in inlineEmpty:
                pieces.append(node.toxml())
            # non-empty inline elements: note, hi, head, l, lg, div, p, ab,
            elif name in inlineContent:
                pieces.append(regexEmptyTag.sub('>', node.toxml()))
            elif name in blockElement:
                pieces.append('\n<' + name + '>\n')
        elif event == pulldom.END_ELEMENT:
            name = node.localName
            if name in inlineContent:
                pieces.append('</' + name + '>')
            elif name in blockElement:
                pieces.append('\n</' + name + '>')
        elif event == pulldom.COMMENT:
            # copy comments intact. A comment node is already complete when
            # its event arrives; calling expandNode() on it made pulldom try
            # to attach the following events to the comment, which fails
            # because comment nodes cannot have children.
            pieces.append(node.toxml())
        elif event == pulldom.CHARACTERS:
            pieces.append(normalizeSpace(node.data))
    return ''.join(pieces)
예제 #16
0
 def pulldom(self, filename, element_name):
     """
     Return an iterator over dictionaries for elements that match
     `element_name`.

     Each dictionary maps the node name of every non-text child of the
     matching element to the value of that child's first child node, or to
     '' when the child is empty. This uses the pulldom parser, so it's more
     memory efficient for large xml files.

     :param filename: path of the XML file to read.
     :param element_name: node name of the elements to turn into records.
     """
     from xml.dom import pulldom
     from xml.dom import Node
     # ``with`` closes the file even when the consumer stops iterating early
     # or an error occurs; a trailing close() only ran on full exhaustion.
     # Binary mode lets the parser apply the document's declared encoding.
     with open(filename, 'rb') as fh:
         events = pulldom.parse(fh)
         for node_type, node in events:
             if node_type != pulldom.START_ELEMENT or node.nodeName != element_name:
                 continue
             events.expandNode(node)
             # Merge adjacent text nodes so firstChild holds the whole text.
             node.normalize()
             record = {}
             # TODO: This only parses a flat list of elements. It should
             # probably handle attributes or nested elements as well.
             for subnode in node.childNodes:
                 if subnode.nodeType == Node.TEXT_NODE:
                     continue
                 if subnode.hasChildNodes():
                     record[subnode.nodeName] = subnode.firstChild.nodeValue
                 else:
                     record[subnode.nodeName] = ''
             yield record
예제 #17
0
파일: sort_routes.py 프로젝트: zeroset/sumo
def sort_departs(routefilename, outfile):
    """Copy a route file to ``outfile`` with departing elements sorted.

    The root tag is re-emitted without attributes. Direct children whose
    local name has an entry in ``DEPART_ATTRS`` are buffered and written
    after all other children, ordered by their departure value; every other
    child is copied straight through. Departure values containing ``':'``
    are converted with ``sumolib.miscutils.parseTime``, ``"triggered"``
    sorts first (-1), anything else is read as a float.

    :param routefilename: filename or stream accepted by ``pulldom.parse``.
    :param outfile: object with a ``write`` method receiving the output.
    """
    routes_doc = pulldom.parse(routefilename)
    vehicles = []
    root = None
    for event, parsenode in routes_doc:
        if event != pulldom.START_ELEMENT:
            continue
        if root is None:
            # First element seen is the root: emit its opening tag only.
            root = parsenode.localName
            outfile.write("<%s>\n" % root)
            continue
        routes_doc.expandNode(parsenode)
        departAttr = DEPART_ATTRS.get(parsenode.localName)
        if departAttr is None:
            # copy to output
            outfile.write(" " * 4 +
                          parsenode.toprettyxml(indent="", newl="") + "\n")
            continue
        startString = parsenode.getAttribute(departAttr)
        if ':' in startString:
            start = sumolib.miscutils.parseTime(startString)
        elif startString == "triggered":
            start = -1  # before everything else
        else:
            start = float(startString)
        vehicles.append(
            (start, parsenode.toprettyxml(indent="", newl="")))

    print('read %s elements.' % len(vehicles))
    vehicles.sort(key=lambda entry: entry[0])
    for _depart, vehiclexml in vehicles:
        outfile.write(" " * 4)
        outfile.write(vehiclexml)
        outfile.write("\n")
    outfile.write("</%s>\n" % root)
    print('wrote %s elements.' % len(vehicles))
예제 #18
0
def parseXML(stream, parser=None):
    """Build an ``XMLNode`` tree from XML text or from an XML stream.

    :param stream: a string holding the XML itself (parsed with
        ``pulldom.parseString``) or anything accepted by ``pulldom.parse``.
    :param parser: optional parser forwarded to pulldom.
    :return: the ``XMLNode("DOCUMENT", {})`` created at document start once
        the document has ended; otherwise the first node still on the stack.
    """
    parse_events = (pulldom.parseString
                    if isinstance(stream, six.string_types)
                    else pulldom.parse)
    events = parse_events(stream, parser)

    document = None
    # Stack of nodes that are currently open; the innermost one is last.
    open_nodes = []
    for event, dom_node in events:
        if event == "START_DOCUMENT":
            open_nodes.append(XMLNode("DOCUMENT", {}))

        elif event == "START_ELEMENT":
            converted = XMLNode.fromDOMNode(dom_node)
            if open_nodes:
                open_nodes[-1].children.append(converted)
            open_nodes.append(converted)

        elif event == "END_ELEMENT":
            open_nodes.pop()

        elif event == "CHARACTERS":
            # Text is accumulated on the innermost open node.
            open_nodes[-1].data += dom_node.data

        elif event == "END_DOCUMENT":
            document = open_nodes.pop()
    return document or open_nodes[0]
예제 #19
0
def process_metadata(f):
    """Collect per-file metadata from every ``<gpxFile>`` element.

    For each ``gpxFile`` element a dict is built with the string attributes
    ``visibility``/``user``, the integer attributes ``id``/``uid``/``points``,
    ``date`` (first 10 characters of ``timestamp``), ``description`` (first
    500 characters of the first description's text) and ``tags`` (text of
    every non-empty ``<tag>``) - each only when present. A dot is written to
    stdout after every 10000 elements as a progress marker.

    :param f: filename or stream accepted by ``pulldom.parse``.
    :return: dict mapping the ``filename`` attribute to the metadata dict.
    """
    metadata = {}
    handled = 0
    stream = pulldom.parse(f)
    for kind, element in stream:
        if kind != pulldom.START_ELEMENT or element.localName != 'gpxFile':
            continue
        entry = {}
        for name in ('visibility', 'user'):
            if element.hasAttribute(name):
                entry[name] = element.getAttribute(name)
        for name in ('id', 'uid', 'points'):
            if element.hasAttribute(name):
                entry[name] = int(element.getAttribute(name))
        if element.hasAttribute('timestamp'):
            entry['date'] = element.getAttribute('timestamp')[:10]
        # Child elements only become reachable after expanding the node.
        stream.expandNode(element)
        descriptions = element.getElementsByTagName('description')
        if descriptions and descriptions[0].firstChild:
            entry['description'] = descriptions[0].firstChild.data[:500]
        tag_elements = element.getElementsByTagName('tag')
        if tag_elements:
            entry['tags'] = [tag.firstChild.data
                             for tag in tag_elements if tag.firstChild]
        metadata[element.getAttribute('filename')] = entry
        handled += 1
        if handled % 10000 == 0:
            sys.stdout.write('.')
            sys.stdout.flush()
    return metadata
예제 #20
0
    def process(self, ctx, m):
        """Read an XML file in pull mode and emit one message per element.

        Each element whose tag equals ``self.tagname`` is expanded,
        serialised, wrapped in ``<root>...</root>`` and parsed again with a
        recovering lxml parser. The parsed tree is attached under the key
        ``'xml'`` to a copy of the incoming message, which is then yielded.

        :param ctx: processing context; its ``interpolate`` resolves
            ``self.path`` and its ``copy_message`` clones ``m``.
        :param m: incoming message used as the template for each output.
        :return: generator of message copies, one per matching element.
        """

        path = ctx.interpolate(ctx, self.path)
        logger.debug("Reading XML in pull mode (splitting by tag '%s'): %s" % (self.tagname, path))

        # Open as bytes so the XML parser applies the encoding declared in
        # the document rather than the platform's default text encoding.
        with open(path, "rb") as xmlfile:

            doc = pulldom.parse(xmlfile)
            for event, node in doc:
                if event != pulldom.START_ELEMENT or node.tagName != self.tagname:
                    continue
                doc.expandNode(node)

                m2 = ctx.copy_message(m)
                # The payload is encoded to bytes, so the wrapper must be
                # bytes as well; str + bytes is a TypeError on Python 3.
                xmltext = b"<root>" + node.toxml().encode('utf-8') + b"</root>"
                parser = etree.XMLParser(recover=True, encoding="utf-8")
                xml = etree.fromstring(xmltext, parser=parser)

                # Remove the part of each tag name up to its first colon.
                for elem in xml.iter():
                    if ":" in elem.tag:
                        elem.tag = ":".join(elem.tag.split(":")[1:])

                m2['xml'] = xml

                yield m2
예제 #21
0
def handle_children(xmlfile, handle_parsenode):
    """Pass each direct child of the root element to ``handle_parsenode``.

    :param xmlfile: filename or stream accepted by ``pulldom.parse``.
    :param handle_parsenode: callable invoked with every fully expanded
        child element of the root.
    :return: ``(root, schema, version)`` where ``root`` is the root's local
        name, ``schema`` is ``"edgediff_file.xsd"`` for ``edges``,
        ``"tllogic_file.xsd"`` for ``tlLogics`` and ``None`` otherwise, and
        ``version`` is ``' version="..."'`` when the root has that attribute,
        ``""`` when it has not, or ``None`` when the root is not one of
        ``edges``, ``nodes``, ``connections``, ``tlLogics``.
    """
    schema_by_root = {
        "edges": "edgediff_file.xsd",
        "tlLogics": "tllogic_file.xsd",
    }
    root = None
    schema = None
    version = ""
    depth = 0
    stream = pulldom.parse(xmlfile)
    for kind, element in stream:
        if kind == pulldom.END_ELEMENT:
            depth -= 1
            continue
        if kind != pulldom.START_ELEMENT:
            continue
        if depth == 0:
            root = element.localName
            # Unknown roots leave the schema as it was.
            schema = schema_by_root.get(root, schema)
            if element.hasAttribute("version"):
                version = ' version="%s"' % element.getAttribute("version")
            if root not in ("edges", "nodes", "connections", "tlLogics"):
                # do not write schema information
                version = None
        if depth == 1:
            # expandNode consumes the matching END_ELEMENT, so the depth
            # must stay unchanged here.
            stream.expandNode(element)
            handle_parsenode(element)
        else:
            depth += 1
    return root, schema, version
예제 #22
0
def process_metadata(f):
    """Build a ``filename -> metadata`` mapping from ``<gpxFile>`` elements.

    Per element the metadata dict may contain: ``visibility`` and ``user``
    (strings), ``id``, ``uid`` and ``points`` (ints), ``date`` (the first 10
    characters of ``timestamp``), ``description`` (at most 500 characters of
    the first description's text) and ``tags`` (list with the text of each
    non-empty ``<tag>``). Keys are only set when the source data exists.
    Every 10000 elements a dot is written to stdout.

    :param f: filename or stream accepted by ``pulldom.parse``.
    :return: the mapping keyed by each element's ``filename`` attribute.
    """
    string_attrs = ('visibility', 'user')
    int_attrs = ('id', 'uid', 'points')

    result = {}
    processed = 0
    event_stream = pulldom.parse(f)
    for event_kind, gpx in event_stream:
        if event_kind != pulldom.START_ELEMENT or gpx.localName != 'gpxFile':
            continue
        info = {}
        for attr in string_attrs:
            if gpx.hasAttribute(attr):
                info[attr] = gpx.getAttribute(attr)
        for attr in int_attrs:
            if gpx.hasAttribute(attr):
                info[attr] = int(gpx.getAttribute(attr))
        if gpx.hasAttribute('timestamp'):
            info['date'] = gpx.getAttribute('timestamp')[:10]
        # Load the subtree so description and tag children can be queried.
        event_stream.expandNode(gpx)
        found_descriptions = gpx.getElementsByTagName('description')
        if found_descriptions and found_descriptions[0].firstChild:
            info['description'] = found_descriptions[0].firstChild.data[:500]
        found_tags = gpx.getElementsByTagName('tag')
        if found_tags:
            info['tags'] = [item.firstChild.data
                            for item in found_tags if item.firstChild]
        result[gpx.getAttribute('filename')] = info
        processed += 1
        if processed % 10000 == 0:
            sys.stdout.write('.')
            sys.stdout.flush()
    return result
예제 #23
0
def parse(self, source, base_uri):
    """parse(source, base_uri) : Schema
    Parses the XML from the input stream and returns a ``relaxng.Schema``
    wrapping the start pattern of the root grammar.

    ``base_uri`` and the token stream are stored on the instance
    (``self.base_uri``, ``self.stream``) before parsing starts.

    Raises RuntimeError when the document contains no first element or when
    the schema reduces to ``relaxng.NotAllowed``.
    """
    self.base_uri = base_uri
    stream = util.DOMTokenStream(pulldom.parse(source))
    stream.set_legal_attributes(_legal_attributes)
    self.stream = stream

    # Process the document prologue
    assert stream.get_next_event()[0] == pulldom.START_DOCUMENT
    try:
        event, node = stream.get_next_event()
    except StopIteration:
        # Call form of raise: valid on Python 2 and Python 3 alike, unlike
        # the old "raise Class, message" statement.
        raise RuntimeError("No first element found -- "
                           "missing RELAX NG namespace declaration?")

    assert is_start_of_pattern(event, node)
    root_grammar = stream.root_grammar
    root_grammar.add_start_sym("", self.parse_rest_of_pattern(event, node))
    root_grammar.combine()

    pattern = root_grammar.start_symbol
    if pattern is relaxng.NotAllowed:
        raise RuntimeError("Schema reduces to NotAllowed (can never be valid)")

    return relaxng.Schema(pattern)
예제 #24
0
파일: netdiff.py 프로젝트: planetsumo/sumo
def handle_children(xmlfile, handle_parsenode):
    """Call ``handle_parsenode`` for every direct child of the root element.

    :param xmlfile: filename or stream accepted by ``pulldom.parse``.
    :param handle_parsenode: callable receiving each expanded child element.
    :return: tuple ``(root_open, root_close)`` holding the root's opening
        tag, with ``edges_file.xsd`` rewritten to ``edgediff_file.xsd``, and
        its closing tag; both end with a newline and are ``None`` when the
        stream contained no element.
    """
    root_open = root_close = None
    nesting = 0
    events = pulldom.parse(xmlfile)
    for event_type, current in events:
        if event_type == pulldom.END_ELEMENT:
            nesting -= 1
            continue
        if event_type != pulldom.START_ELEMENT:
            continue
        if nesting == 0:
            # Unexpanded, the root serialises as "<tag .../>\n"; strip the
            # trailing "/>\n" and close the tag with ">\n" instead.
            unexpanded = current.toprettyxml(indent="")
            root_open = unexpanded[:-3] + ">\n"
            # change the schema for edge diffs
            root_open = root_open.replace(
                "edges_file.xsd", "edgediff_file.xsd")
            root_close = "</%s>\n" % current.localName
        if nesting == 1:
            # expandNode swallows the END_ELEMENT of this child, therefore
            # the nesting level is left untouched.
            events.expandNode(current)
            handle_parsenode(current)
        else:
            nesting += 1
    return root_open, root_close
예제 #25
0
 def __init__(self, stream_or_string, **options):
     """Set up a namespace-aware pull parser over the input.

     :param stream_or_string: input forwarded to the base-class initialiser.
     :param options: extra options, also forwarded; ``using`` selects the
         database alias and defaults to ``DEFAULT_DB_ALIAS``.
     """
     super(Deserializer, self).__init__(stream_or_string, **options)
     # SAX parser with namespace processing switched on.
     namespace_parser = sax.make_parser()
     namespace_parser.setFeature(sax.handler.feature_namespaces, 1)
     #namespace_parser.setFeature(sax.handler.feature_namespace_prefixes,1)
     # NOTE(review): self.stream is presumably set by the base class.
     self.event_stream = pulldom.parse(self.stream, namespace_parser)
     self.db = options.pop('using', DEFAULT_DB_ALIAS)
예제 #26
0
def sort_departs(routefilename, outfile):
    """Copy a route file to ``outfile`` with departing elements sorted.

    The root tag is re-emitted without attributes. Direct children whose
    local name has an entry in ``DEPART_ATTRS`` are buffered and written
    after all other children, ordered by the float value of the attribute
    named there; every other child is copied straight through.

    :param routefilename: filename or stream accepted by ``pulldom.parse``.
    :param outfile: object with a ``write`` method receiving the output.
    """
    # Parse the source that was passed in; sys.argv[1] was used here before,
    # which ignored the parameter and broke every non-command-line caller.
    routes_doc = pulldom.parse(routefilename)
    vehicles = []
    root = None
    for event, parsenode in routes_doc:
        if event == pulldom.START_ELEMENT:
            if root is None:
                # First element seen is the root: emit its opening tag only.
                root = parsenode.localName
                outfile.write("<%s>\n" % root)
                continue
            routes_doc.expandNode(parsenode)
            departAttr = DEPART_ATTRS.get(parsenode.localName)
            if departAttr is not None:
                start = float(parsenode.getAttribute(departAttr))
                vehicles.append(
                    (start, parsenode.toprettyxml(indent="", newl="")))
            else:
                # copy to output
                outfile.write(
                    " " * 4 + parsenode.toprettyxml(indent="", newl="") + "\n")

    print('read %s elements.' % len(vehicles))
    vehicles.sort(key=lambda v: v[0])
    for depart, vehiclexml in vehicles:
        outfile.write(" " * 4)
        outfile.write(vehiclexml)
        outfile.write("\n")
    outfile.write("</%s>\n" % root)
    print('wrote %s elements.' % len(vehicles))
예제 #27
0
def filter_samples():
    """Filter the gzip-compressed samples in ``INPUT_FILE`` into ``OUTPUT_FILE``.

    Every ``<BioSample>`` element is serialised and kept when
    ``is_homo_sapiens_sample`` accepts it and
    ``has_minimum_relevant_attributes_count`` accepts it for
    ``constants.NCBI_FILTER_MIN_RELEVANT_ATTS``. Kept samples are written
    inside a ``<BioSampleSet>`` root as UTF-8. Progress is printed every 5000
    processed samples and totals are printed at the end.
    """
    output_dir = os.path.dirname(OUTPUT_FILE)
    # dirname is '' for a bare filename, and makedirs('') would raise.
    if output_dir and not os.path.exists(output_dir):
        os.makedirs(output_dir)
    print('Input file: ' + INPUT_FILE)
    print('Processing NCBI samples...')
    processed_samples_count = 0
    selected_samples_count = 0
    # Both files are managed by the ``with`` statement: the gzip input used
    # to stay open, and the output was closed a second time by hand.
    with gzip.open(INPUT_FILE) as source, \
            codecs.open(OUTPUT_FILE, 'w', 'utf-8') as f:
        # Read biosamples from XML file
        content = pulldom.parse(source)
        f.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n")
        f.write("<BioSampleSet>")
        for event, node in content:
            if event == 'START_ELEMENT' and node.tagName == 'BioSample':
                content.expandNode(node)
                node_xml = node.toxml()
                processed_samples_count = processed_samples_count + 1
                if processed_samples_count % 5000 == 0:
                    print('Processed samples: ' + str(processed_samples_count))
                    print('Selected samples: ' + str(selected_samples_count))
                if is_homo_sapiens_sample(node_xml) and \
                        has_minimum_relevant_attributes_count(
                            node_xml, constants.NCBI_FILTER_MIN_RELEVANT_ATTS):
                    # Reuse the serialisation instead of calling toxml() again.
                    f.write('\n' + node_xml)
                    selected_samples_count = selected_samples_count + 1

        f.write("\n</BioSampleSet>\n")

    print('Finished processing NCBI samples')
    print('- Total samples processed: ' + str(processed_samples_count))
    print('- Total samples selected: ' + str(selected_samples_count))
예제 #28
0
 def call(self):
     """
     Send the request to the server and stream back the results.

     For the JSON format the items under ``response.docs.item`` are yielded
     as they are. Otherwise the XML response is pulled element by element:
     every ``<doc>`` is wrapped in a ``Result`` and passed through
     ``patch_result`` before being yielded, and the ``numFound`` attribute of
     ``<result>`` is stored on ``self.hits``.
     """
     self.patch_input_data()
     query = self.build_query()
     url = '%s%s' % (self.server_url, self.uri)
     if query:
         url = '%s?%s' % (url, query)
     xml = self.get_source_file(url)

     if self.format == self.FORMAT_JSON:
         for item in ijson.items(xml, 'response.docs.item'):
             yield item
         return

     # http://docs.python.org/dev/library/xml.dom.pulldom.html
     doc = pulldom.parse(xml)
     for event, node in doc:
         if event != pulldom.START_ELEMENT:
             continue
         if node.tagName == 'doc':
             doc.expandNode(node)
             # convert to python object
             # http://docs.python.org/2/library/xml.etree.elementtree.html
             result_xml = node.toxml(encoding='utf-8')
             result = Result(ElementTree.fromstring(result_xml))
             yield self.patch_result(result, result_xml)
         elif node.tagName == 'result':
             self.hits = int(node.getAttribute('numFound'))
예제 #29
0
 def _parsePage(self, xml, pageResults, translationLanguage, includeSentences):
     """Load the lists, items and sentences of one page into ``pageResults``.

     Tag names are compared case-insensitively. ``list`` elements are always
     loaded; ``item`` elements are loaded together with their sentences when
     ``includeSentences`` is true; stand-alone ``sentence`` elements are only
     loaded when ``includeSentences`` is true.

     :return: how many ``addList``/``addItem`` calls returned a true value.
     """
     changed = 0
     stream = pulldom.parse(xml)
     for kind, element in stream:
         if kind != pulldom.START_ELEMENT:
             continue
         tag = element.tagName.lower()
         if tag == "list":
             stream.expandNode(element)
             found_list = SmartFMList()
             found_list.loadFromDOM(element)
             if pageResults.addList(found_list):
                 changed += 1
         elif tag == "item":
             stream.expandNode(element)
             vocab = SmartFMVocab()
             vocab.loadFromDOM(element)
             if pageResults.addItem(vocab):
                 changed += 1
             if not includeSentences:
                 continue
             for sentence in vocab.sentencesFromDOM(element, translationLanguage):
                 if pageResults.addItem(sentence):
                     changed += 1
                 elif sentence.uniqIdStr() in pageResults.items:
                     # Not added but already stored: link the stored
                     # sentence to this vocab item.
                     pageResults.items[sentence.uniqIdStr()].linkToVocab(vocab)
                     pageResults.updateIndexToMatch(sentence.uniqIdStr(), vocab.uniqIdStr())
         elif tag == "sentence" and includeSentences:
             stream.expandNode(element)
             standalone = SmartFMSentence()
             standalone.loadFromDOM(element, translationLanguage)
             if pageResults.addItem(standalone):
                 changed += 1
     return changed
예제 #30
0
def parseXML(stream, parser=None):
    """Convert XML into a tree of ``XMLNode`` objects.

    :param stream: XML text as a string (handled by ``pulldom.parseString``)
        or any source accepted by ``pulldom.parse``.
    :param parser: optional parser passed through to pulldom.
    :return: the ``"DOCUMENT"`` node popped at the end of the document, or
        the bottom node of the stack when no such node was produced.
    """
    if isinstance(stream, six.string_types):
        reader = pulldom.parseString
    else:
        reader = pulldom.parse

    document = None
    # Nodes whose end has not been reached yet, outermost first.
    stack = []
    for kind, source_node in reader(stream, parser):
        if kind == "START_DOCUMENT":
            stack.append(XMLNode("DOCUMENT", {}))

        elif kind == "START_ELEMENT":
            child = XMLNode.fromDOMNode(source_node)
            if stack:
                stack[-1].children.append(child)
            stack.append(child)

        elif kind == "END_ELEMENT":
            stack.pop()

        elif kind == "CHARACTERS":
            # Character data belongs to the innermost open node.
            stack[-1].data += source_node.data

        elif kind == "END_DOCUMENT":
            document = stack.pop()
    return document or stack[0]
예제 #31
0
def load_chunks(fpath, limit=None):
    """Yield a ``Chunk`` for every 'chunk' element found in the XML file.

    The file is read in binary mode and streamed through pulldom using the
    parser from ``_create_parser()``. Each token collects its 'orth' value
    and, per 'lex' child, a 'base'/'ctag' pair: the pair goes to the
    disambiguated slot when the lex has ``disamb="1"``, otherwise to the
    candidate lists.

    Iteration stops once ``limit`` chunks have been produced; ``None``
    means no limit.
    """
    with open(fpath, 'rb') as xml_file:
        event_stream = pulldom.parse(xml_file, parser=_create_parser())
        produced = 0
        started = _start_events(event_stream, 'chunk')
        for expanded in tqdm(started, desc=f'Loading chunks from {fpath}'):
            for chunk_node in _findall(expanded, 'chunk'):
                if produced == limit:
                    return
                produced += 1

                chunk_tokens = []
                for tok_node in _findall(chunk_node, 'tok'):
                    orth = _findvalue(tok_node, 'orth')
                    candidate_lemmas, candidate_ctags = [], []
                    chosen_lemma = chosen_ctag = None
                    for lex_node in _findall(tok_node, 'lex'):
                        base = _findvalue(lex_node, 'base')
                        tag = _findvalue(lex_node, 'ctag')
                        if lex_node.getAttribute('disamb') != '1':
                            candidate_lemmas.append(base)
                            candidate_ctags.append(tag)
                        else:
                            chosen_lemma, chosen_ctag = base, tag
                    chunk_tokens.append(
                        Token(orth, candidate_lemmas, candidate_ctags,
                              chosen_lemma, chosen_ctag))

                yield Chunk(chunk_tokens)
예제 #32
0
 def __init__(self, handle, namespace=None):
     """Create the object and initialize the XML parser."""
     self._namespace = namespace
     self.source = self.source_version = None
     self.version = None
     self.speciesName = None
     self.ncbiTaxID = None
     # pulldom.parse takes either a file handle or a file name, but it does
     # not use a context manager. When it opens the file itself, the file
     # stays open until this object is deallocated or the DOMEventStream it
     # returned is dropped, so the stream is discarded below on any error.
     self._events = pulldom.parse(handle)
     try:
         try:
             first = next(self._events)
         except StopIteration:
             raise_from(ValueError("Empty file."), None)
         event, node = first
         at_document_start = (event == "START_DOCUMENT"
                              and node.localName is None)
         if not at_document_start:
             raise ValueError("Failed to find start of XML")
         self._read_header()
     except Exception:
         # Release the event stream before re-raising.
         self._events = None
         raise
예제 #33
0
파일: __init__.py 프로젝트: k0emt/macts
def parse(xmlfile, element_name):
    """Return ``get_attrs`` results for every ``element_name`` element.

    Per the original notes: the result is a list of namedtuples, the first
    matching node in ``xmlfile`` determines the set of attributes, and
    attribute names that are Python keywords are prefixed with 'attr_'.
    """
    element_type = []  # mutable holder shared across get_attrs calls
    records = []
    for event, parsenode in pulldom.parse(xmlfile):
        if event != pulldom.START_ELEMENT:
            continue
        if parsenode.localName != element_name:
            continue
        records.append(get_attrs(parsenode, element_type, element_name))
    return records
 def _initFromFile(self, path):
     """Feed every <character> element of the XML file to ``_processNode``.

     The file at ``path`` is streamed with pulldom; each element whose tag
     name is 'character' (compared case-insensitively) is expanded into a
     full DOM subtree and passed to ``self._processNode``.
     """
     # The ``with`` block closes the file once parsing ends or fails; the
     # previous code opened it and never closed it.
     with open(path) as kanjiDicFile:
         events = pulldom.parse(kanjiDicFile)
         for (event, node) in events:
             if event != pulldom.START_ELEMENT:
                 continue
             if node.tagName.lower() == 'character':
                 events.expandNode(node)
                 self._processNode(node)
예제 #35
0
 def __init__(self, stream):
     """Wrap ``stream`` in an ``XMLStream`` and start a pulldom event stream.

     The event stream is created with a 256-byte read buffer. All cursor
     fields start out as ``None``.
     """
     self._items = pulldom.parse(XMLStream(stream), bufsize=256)
     # The current item, and a one-item pushback buffer.
     self._item = self._next = None
     # Public fields, all unset until populated elsewhere in the class.
     self.kind = self.name = self.attrs = self.value = None
 def _initFromFile(self, path):
     """Feed every <character> element of the XML file to ``_processNode``.

     The file at ``path`` is streamed with pulldom; each element whose tag
     name is 'character' (compared case-insensitively) is expanded into a
     full DOM subtree and passed to ``self._processNode``.
     """
     # The ``with`` block closes the file once parsing ends or fails; the
     # previous code opened it and never closed it.
     with open(path) as kanjiDicFile:
         events = pulldom.parse(kanjiDicFile)
         for (event, node) in events:
             if event != pulldom.START_ELEMENT:
                 continue
             if node.tagName.lower() == 'character':
                 events.expandNode(node)
                 self._processNode(node)
예제 #37
0
def process_map(filename):
    """Tally key categories over every element start in ``filename``.

    The counter dict is passed through ``key_type`` once per START_ELEMENT
    event, and whatever the last call returned is the result.
    """
    counts = {"lower": 0, "lower_colon": 0, "problemchars": 0, "other": 0}
    for kind, element in pulldom.parse(filename):
        if kind != pulldom.START_ELEMENT:
            continue
        counts = key_type(element, counts)
    return counts
예제 #38
0
    def testXInclude(self):
        """Expand <data> in the XInclude sample and check its first child.

        The first child is expected to be named "xi:include".
        """
        xml_path = "../../xml_files_windows/xinclude.xml"
        wanted = "data"

        stream = _PULLDOM.parse(xml_path)
        for kind, element in stream:
            if kind != _PULLDOM.START_ELEMENT or element.tagName != wanted:
                continue
            stream.expandNode(element)
            self.assertEqual("xi:include", element.firstChild.nodeName)
예제 #39
0
def dmoz_reader(filename):
    """Yield ``(url, topics)`` for each ExternalPage element in the dump.

    ``url`` is the element's 'about' attribute; ``topics`` is the data of
    the first child node of its first <topic> descendant.
    """
    stream = pulldom.parse(filename)
    for kind, page in stream:
        if kind != pulldom.START_ELEMENT or page.tagName != 'ExternalPage':
            continue
        # Pull the whole subtree in so the children can be inspected.
        stream.expandNode(page)
        url = page.attributes['about'].value
        first_topic = page.getElementsByTagName('topic')[0]
        yield url, first_topic.childNodes[0].data
def process_map(filename):
    """Run ``key_type`` over every element start and return the tallies.

    Starts from zeroed counters for "lower", "lower_colon", "problemchars"
    and "other"; each START_ELEMENT node replaces the counters with the
    value ``key_type`` returns.
    """
    tallies = {"lower": 0, "lower_colon": 0, "problemchars": 0, "other": 0}
    for event_kind, elem in pulldom.parse(filename):
        if event_kind != pulldom.START_ELEMENT:
            continue
        tallies = key_type(elem, tallies)
    return tallies
예제 #41
0
def get_wk_nodes():
	"""Yield each fully expanded <page> element parsed from standard input.

	Any ``Exception`` raised while iterating is written to stderr as a
	single line and ends the generator instead of propagating.
	"""
	stream = pulldom.parse(sys.stdin)
	try:
		for kind, element in stream:
			if kind != pulldom.START_ELEMENT or element.tagName != "page":
				continue
			stream.expandNode(element)
			yield element
	except Exception as err:
		sys.stderr.write(str(err)+"\n")
예제 #42
0
    def testXSLT(self):
        """Expand the xsl:stylesheet element and check its node name."""
        xml_path = "../../xml_files_windows/optional/xslt.xsl"
        wanted = "xsl:stylesheet"

        stream = _PULLDOM.parse(xml_path)
        for kind, element in stream:
            if kind != _PULLDOM.START_ELEMENT or element.tagName != wanted:
                continue
            stream.expandNode(element)
            self.assertEqual("xsl:stylesheet", element.nodeName)
예제 #43
0
def WalkNodesForAttributes(path):
  """Parse the xml file getting all attributes.

  <venue>
    <attribute>value</attribute>
  </venue>

  Args:
    path: file name or file object, handed to pulldom.parse.

  Returns:
    A (type_name, top_node_name, attributes) tuple:
    type_name - The java-style name the top node will have. "Venue"
    top_node_name - unadulterated name of the xml stanza, probably the type
    of java class we're creating. "venue"
    attributes - dict mapping each element tag name below the top node to
    a (type, [child]) tuple built from its 'type' and 'child' xml
    attributes; only the first occurrence of a tag name is kept. Types in
    neither COMPLEX nor TYPES are recorded as STRING.
  """
  doc = pulldom.parse(path)

  type_name = None
  top_node_name = None
  attributes = {}

  # Nesting depth inside a complex element being skipped; 0 = not skipping.
  level = 0
  for event, node in doc:
    # For skipping parts of a tree.
    if level > 0:
      if event == pulldom.END_ELEMENT:
        level -= 1
        # logging.warning replaces logging.warn, removed in Python 3.13.
        logging.warning('(%s) Skip end: %s', level, node)
        continue
      elif event == pulldom.START_ELEMENT:
        logging.warning('(%s) Skipping: %s', level, node)
        level += 1
        continue

    if event == pulldom.START_ELEMENT:
      logging.warning('Parsing: %s', node.tagName)
      # Get the type name to use: the first element seen is the top node.
      if type_name is None:
        type_name = ''.join([word.capitalize()
                             for word in node.tagName.split('_')])
        top_node_name = node.tagName
        logging.warning('Found Top Node Name: %s', top_node_name)
        continue

      typ = node.getAttribute('type')
      child = node.getAttribute('child')
      # We don't want to walk complex types.
      if typ in COMPLEX:
        logging.warning('Found Complex: %s', node.tagName)
        level = 1
      elif typ not in TYPES:
        logging.warning('Found String: %s', typ)
        typ = STRING
      else:
        logging.warning('Found Type: %s', typ)
      logging.warning('Adding: %s', str((node, typ)))
      attributes.setdefault(node.tagName, (typ, [child]))
  logging.warning('Attr: %s', str((type_name, top_node_name, attributes)))
  return type_name, top_node_name, attributes
예제 #44
0
def WalkNodesForAttributes(path):
    """Parse the xml file getting all attributes.

    <venue>
      <attribute>value</attribute>
    </venue>

    Args:
      path: file name or file object, handed to pulldom.parse.

    Returns:
      A (type_name, top_node_name, attributes) tuple:
      type_name - The java-style name the top node will have. "Venue"
      top_node_name - unadulterated name of the xml stanza, probably the
      type of java class we're creating. "venue"
      attributes - dict mapping each element tag name below the top node to
      a (type, [child]) tuple built from its 'type' and 'child' xml
      attributes; only the first occurrence of a tag name is kept. Types in
      neither COMPLEX nor TYPES are recorded as STRING.
    """
    doc = pulldom.parse(path)

    type_name = None
    top_node_name = None
    attributes = {}

    # Nesting depth inside a complex element being skipped; 0 = not skipping.
    level = 0
    for event, node in doc:
        # For skipping parts of a tree.
        if level > 0:
            if event == pulldom.END_ELEMENT:
                level -= 1
                # logging.warning replaces logging.warn, removed in
                # Python 3.13.
                logging.warning('(%s) Skip end: %s', level, node)
                continue
            elif event == pulldom.START_ELEMENT:
                logging.warning('(%s) Skipping: %s', level, node)
                level += 1
                continue

        if event == pulldom.START_ELEMENT:
            logging.warning('Parsing: %s', node.tagName)
            # Get the type name to use: the first element is the top node.
            if type_name is None:
                type_name = ''.join(
                    [word.capitalize() for word in node.tagName.split('_')])
                top_node_name = node.tagName
                logging.warning('Found Top Node Name: %s', top_node_name)
                continue

            typ = node.getAttribute('type')
            child = node.getAttribute('child')
            # We don't want to walk complex types.
            if typ in COMPLEX:
                logging.warning('Found Complex: %s', node.tagName)
                level = 1
            elif typ not in TYPES:
                logging.warning('Found String: %s', typ)
                typ = STRING
            else:
                logging.warning('Found Type: %s', typ)
            logging.warning('Adding: %s', str((node, typ)))
            attributes.setdefault(node.tagName, (typ, [child]))
    logging.warning('Attr: %s',
                    str((type_name, top_node_name, attributes)))
    return type_name, top_node_name, attributes
예제 #45
0
 def testTextNodes(self):
     """Check that the CHARACTERS events of the test file join as expected.

     The joined text must be a unicode string equal to the expected value,
     which includes five Greek capital letters (U+0391..U+0395).
     """
     text = [node.data
             for event, node in pulldom.parse(self.testFile)
             if event == pulldom.CHARACTERS]
     # Only the join is guarded, so an assertion mismatch below is reported
     # as a mismatch rather than as an "unexpected exception".
     try:
         result = "".join(text)
     except Exception as x:
         self.fail("Unexpected exception joining text pieces: %s" % str(x))
     # Compare the value and its type instead of repr(): the old expected
     # repr carried a Python 2 ``u'...'`` prefix and \u escapes, which no
     # Python 3 repr produces.
     self.assertTrue(isinstance(result, type(u"")))
     self.assertEqual(result, u"\n    Some greek: \u0391\u0392\u0393\u0394\u0395\n    \n    \n    \n")
예제 #46
0
 def testComment(self):
     """Check that the COMMENT events of the test file join as expected.

     The expected value keeps the ``&#x...;`` sequences verbatim, so the
     comment data is compared exactly as written in the expectation.
     """
     commentText = [node.data
                    for event, node in pulldom.parse(self.testFile)
                    if event == pulldom.COMMENT]
     # Only the join is guarded, so an assertion mismatch below is reported
     # as a mismatch rather than as an "unexpected exception".
     try:
         result = "".join(commentText)
     except Exception as x:
         self.fail("Unexpected exception joining comment data pieces: %s" % str(x))
     # Compare the value and its type instead of repr(): the old expected
     # repr carried a Python 2 ``u'...'`` prefix, which no Python 3 repr
     # produces.
     self.assertTrue(isinstance(result, type(u"")))
     self.assertEqual(result, u"&#x39b;&#x39c;&#x39d;&#x39e;&#x39f;")
예제 #47
0
def parse_stats(metrics=METRICS):
    """Yield a ``web.storage`` per <representative> element, per metric.

    For each metric, the files matching ``STATS_XML % metric`` are parsed
    in turn; a file whose parse setup raises ``IOError`` is skipped. Each
    storage is built from the element's attribute (name, value) pairs.
    """
    for metric in metrics:
        for xml_path in glob.glob(STATS_XML % metric):
            try:
                stream = pulldom.parse(xml_path)
            except IOError:
                continue
            for kind, element in stream:
                if kind != "START_ELEMENT":
                    continue
                if element.tagName != 'representative':
                    continue
                yield web.storage(element.attributes.items())
예제 #48
0
def parse_fec():
    """Yield ``{'fecid', 'bioguideid'}`` dicts from the FEC_XML document.

    Each <candidate> element is expanded and the text of its first <id>
    and <uri> descendants is read. Candidates whose id occurs inside the
    uri are skipped; for the rest, the last path segment of the uri is
    reported as the bioguide id.
    """
    stream = pulldom.parse(FEC_XML)
    for kind, candidate in stream:
        if kind != "START_ELEMENT" or candidate.tagName != 'candidate':
            continue
        stream.expandNode(candidate)
        fec_id = candidate.getElementsByTagName('id')[0].firstChild.nodeValue
        uri = candidate.getElementsByTagName('uri')[0].firstChild.nodeValue
        if fec_id not in uri:
            yield {'fecid': fec_id, 'bioguideid': uri.split('/')[-1]}
예제 #49
0
    def testXXE(self):
        """Expand <data> in the XXE sample and check its name and text.

        The element's first child is expected to hold "it_works".
        """
        xml_path = "../../xml_files_windows/xxe/xxe.xml"
        wanted = "data"

        stream = _PULLDOM.parse(xml_path)
        for kind, element in stream:
            if kind != _PULLDOM.START_ELEMENT or element.tagName != wanted:
                continue
            stream.expandNode(element)
            self.assertEqual("data", element.nodeName)
            self.assertEqual("it_works", element.firstChild.data)
예제 #50
0
def parse_fec():
    """Stream FEC_XML and yield one id-mapping dict per usable candidate.

    For every <candidate> element the first <id> and first <uri> texts are
    read. When the id appears within the uri the candidate is skipped;
    otherwise the dict pairs 'fecid' with the id and 'bioguideid' with the
    uri's final '/'-separated segment.
    """
    events = pulldom.parse(FEC_XML)
    for event_kind, elem in events:
        is_candidate = (event_kind == "START_ELEMENT"
                        and elem.tagName == 'candidate')
        if not is_candidate:
            continue
        events.expandNode(elem)
        fec_id = elem.getElementsByTagName('id')[0].firstChild.nodeValue
        uri = elem.getElementsByTagName('uri')[0].firstChild.nodeValue
        if fec_id in uri:
            continue
        segments = uri.split('/')
        yield {'fecid': fec_id, 'bioguideid': segments[-1]}
예제 #51
0
 def testComment(self):
     """Check that the COMMENT events of the test file join as expected.

     The expected value keeps the ``&#x...;`` sequences verbatim, so the
     comment data is compared exactly as written in the expectation.
     """
     commentText = [node.data
                    for event, node in pulldom.parse(self.testFile)
                    if event == pulldom.COMMENT]
     # ``except ... as`` replaces the Python-2-only ``except Exception, x``.
     # Only the join is guarded, so an assertion mismatch below is reported
     # as a mismatch rather than as an "unexpected exception".
     try:
         result = u"".join(commentText)
     except Exception as x:
         self.fail("Unexpected exception joining comment data pieces: %s" % str(x))
     # assertEqual replaces the removed failUnlessEqual alias. The value and
     # its type are compared instead of repr(), whose ``u'...'`` form only
     # exists on Python 2.
     self.assertTrue(isinstance(result, type(u"")))
     self.assertEqual(result, u"&#x39b;&#x39c;&#x39d;&#x39e;&#x39f;")
예제 #52
0
 def testTextNodes(self):
     """Check that the CHARACTERS events of the test file join as expected.

     The joined text must be a unicode string equal to the expected value,
     which includes five Greek capital letters (U+0391..U+0395).
     """
     text = [node.data
             for event, node in pulldom.parse(self.testFile)
             if event == pulldom.CHARACTERS]
     # ``except ... as`` replaces the Python-2-only ``except Exception, x``.
     # Only the join is guarded, so an assertion mismatch below is reported
     # as a mismatch rather than as an "unexpected exception".
     try:
         result = u"".join(text)
     except Exception as x:
         self.fail("Unexpected exception joining text pieces: %s" % str(x))
     # assertEqual replaces the removed failUnlessEqual alias. The value and
     # its type are compared instead of repr(), whose ``u'...'`` form and
     # \u escapes only exist on Python 2.
     self.assertTrue(isinstance(result, type(u"")))
     self.assertEqual(result, u"\n    Some greek: \u0391\u0392\u0393\u0394\u0395\n    \n    \n    \n")
예제 #53
0
    def testDefault_noAttack(self):
        """Expand <data> in the standard sample and check its name and text.

        The element's first child is expected to hold "4".
        """
        xml_path = "../../xml_files_windows/standard.xml"
        wanted = "data"

        stream = _PULLDOM.parse(xml_path)
        for kind, element in stream:
            if kind != _PULLDOM.START_ELEMENT or element.tagName != wanted:
                continue
            stream.expandNode(element)
            self.assertEqual("data", element.nodeName)
            self.assertEqual("4", element.firstChild.data)
예제 #54
0
def processAntFile(filename):
	"""Stream the XML file and run printNode/processNode on each element.

	Before iterating, ``resetCurrent()`` is called and the module-level
	``currentFile`` is set to ``filename``. Every START_ELEMENT node is
	passed to ``printNode`` and then to ``processNode``.
	"""
	global currentFile
	# The ``with`` block closes the file once parsing ends or fails; the
	# previous code opened it and never closed it.
	with open(filename) as handle:
		doc = pulldom.parse(handle)
		resetCurrent()
		currentFile = filename
		for event, node in doc:
			if event == pulldom.START_ELEMENT:
				printNode(node)
				processNode(node)
예제 #55
0
    def testParameterEntity_core(self):
        """Expand <data> in the parameter-entity sample; expect it empty.

        The serialized element must be exactly "<data/>".
        """
        xml_path = "../../xml_files_windows/xxep/parameterEntity_core.xml"
        wanted = "data"

        stream = _PULLDOM.parse(xml_path)
        for kind, element in stream:
            if kind != _PULLDOM.START_ELEMENT or element.tagName != wanted:
                continue
            stream.expandNode(element)
            self.assertEqual("<data/>", element.toxml())
예제 #56
0
 def __init__(self, stream_or_string, *, using=DEFAULT_DB_ALIAS,
              ignorenonexistent=False, **options):
     """Set up the base class, then open a pulldom event stream.

     ``using`` and ``ignorenonexistent`` are keyword-only and are stored
     as ``self.db`` and ``self.ignore``; the remaining options go to the
     base class. The event stream reads ``self.stream`` with the parser
     returned by ``self._make_parser()``.
     """
     super().__init__(stream_or_string, **options)
     self.event_stream = pulldom.parse(self.stream, self._make_parser())
     self.db, self.ignore = using, ignorenonexistent
예제 #57
0
    def testDOS_recursion(self):
        """Parsing the recursive-entity sample must raise SAXParseException.

        The parse, the expansion of <data> and its serialization all run
        inside the ``assertRaises`` block.
        """
        xml_path = "../../xml_files_windows/dos/dos_recursion.xml"
        wanted = "data"

        with self.assertRaises(_SAX.SAXParseException):
            stream = _PULLDOM.parse(xml_path)
            for kind, element in stream:
                if kind != _PULLDOM.START_ELEMENT or element.tagName != wanted:
                    continue
                stream.expandNode(element)
                element.toxml()
예제 #58
0
 def read(self):
     """Yield an ``Entry`` for each <entry> element in ``self.filename``.

     Entries are numbered from 1 in document order; each element is
     expanded into a full DOM subtree before being wrapped.
     """
     stream = pulldom.parse(self.filename)
     next_id = 1
     for kind, element in stream:
         if kind != "START_ELEMENT" or element.tagName != "entry":
             continue
         stream.expandNode(element)
         yield Entry(element, next_id)
         next_id += 1
예제 #59
0
def process_gpx(db, gpx_id, f, options):
    """Stream a GPX document and insert its track lines into ``gpx_data``.

    Args:
        db: connection-like object; only ``cursor()`` is used here.
        gpx_id: value stored in the ``gpx_id`` column of every inserted row.
        f: file name or file object handed to ``pulldom.parse``.
        options: object providing ``reproject``, ``dmin``, ``dmax``,
            ``pmin`` and ``pmax``.

    Track points are collected per <trkseg> and written as one
    LINESTRINGM row whenever a flush is requested. The cursor is closed at
    the end; no commit is issued in this function.
    """
    cur = db.cursor()
    # SQL expression wrapping the geometry parameter; optionally
    # transformed to SRID 900913.
    geomfromtext = 'ST_GeomFromText(%s)'
    if options.reproject:
        geomfromtext = 'ST_Transform({0}, 900913)'.format(geomfromtext)
    # Running segment_id, incremented after each inserted row.
    segment = 0
    needWrite = False
    events = pulldom.parse(f)
    for event, node in events:
        if event == pulldom.START_ELEMENT:
            if node.localName == 'trkseg':
                # A new track segment resets all per-segment state.
                points = []
                polledPoints = []
                needWrite = False
                lastNode = None
                lastDate = None
            elif node.localName == 'trkpt':
                lat = float(node.getAttribute('lat'))
                lon = float(node.getAttribute('lon'))
                # Sum of absolute lon/lat differences to the previous point;
                # with no previous point the value is twice dmin.
                dist = abs(lon - lastNode[0]) + abs(
                    lat - lastNode[1]) if lastNode else options.dmin * 2

                events.expandNode(node)
                t = node.getElementsByTagName('time')
                # NOTE(review): t[0] is unguarded, so a <trkpt> without a
                # <time> child fails here; the guarded check further down
                # is never reached in that case.
                time = t[0].firstChild.data
                # The previous point is updated even when the current one
                # is not appended below.
                lastNode = (lon, lat, time)

                if dist > options.dmax:
                    # Too far from the previous point: flush what has been
                    # collected and let this point start the next line.
                    needWrite = True
                    polledPoints = [(lon, lat, time)]
                elif dist >= options.dmin:
                    points.append((lon, lat, time))
                    if len(points) >= options.pmax:
                        needWrite = True

                    t = node.getElementsByTagName('time')
                    if t and t[0].firstChild and len(
                            t[0].firstChild.data) >= 10:
                        # First 10 characters of the time text; presumably
                        # the date part of an ISO timestamp.
                        lastDate = t[0].firstChild.data[0:10]

        elif event == pulldom.END_ELEMENT and node.localName == 'trkseg':
            needWrite = True
        if needWrite:
            # Lines with fewer than max(2, pmin) points are discarded.
            if points and len(points) >= max(2, options.pmin):
                # Each vertex is written as "lon lat time".
                geom = 'SRID=4326;LINESTRINGM(' + ','.join(
                    ['{0} {1} {2}'.format(x[0], x[1], x[2])
                     for x in points]) + ')'
                cur.execute(
                    'insert into gpx_data (gpx_id, segment_id, track_date, track) values (%s, %s, %s, {0})'
                    .format(geomfromtext), (gpx_id, segment, lastDate, geom))
                segment += 1
            # Continue with the carried-over points, whether or not a row
            # was inserted.
            points = polledPoints
            polledPoints = []
            needWrite = False
    cur.close()
예제 #60
0
파일: parser.py 프로젝트: superssnails/edd
    def __init__(self, stream_or_string, **options):

        self.rawImportRecordBuffer = []
        self.options = options
        if isinstance(stream_or_string, six.string_types):
            self.stream = six.StringIO(stream_or_string)
        else:
            self.stream = stream_or_string

        self.event_stream = pulldom.parse(self.stream, self._make_parser())
        self.thin = options.pop('thin', 0)