예제 #1
0
        def datastream(xmlfile):
            """Yield one list of attribute values per inner-element line of *xmlfile*.

            The file is scanned line by line as text.  Attributes that live on
            the outer element (the one with the lower nesting level in
            ``elem2level``) are remembered and combined with the attributes
            found on each following inner-element line.  The yielded list is
            ordered like ``attrs``.

            Relies on ``allElems``, ``elem2level``, ``attrs``, ``attr2elem``
            and ``_open`` from the enclosing scope.

            Inner-element lines for which a value is unavailable (missing on
            the line itself or on the current outer element) are skipped; the
            number of skipped lines per attribute is reported on stderr once
            the file has been read completely.
            """
            skippedLines = defaultdict(int)
            # outer element first, inner element second
            elems = sorted(allElems, key=lambda e: elem2level[e])
            mE0 = "<%s " % elems[0]
            mE1 = "<%s " % elems[1]
            attrs0 = [a for a in attrs if attr2elem[a] == elems[0]]
            attrs1 = [a for a in attrs if attr2elem[a] == elems[1]]
            # (?<!\S) anchors the name at a whitespace boundary so that e.g.
            # 'x' does not match inside 'max="..."'; re.escape keeps '.' and
            # other regex metacharacters in attribute names literal.
            mAs0 = [(a, re.compile(r'(?<!\S)%s="([^"]*)"' % re.escape(a)))
                    for a in attrs0]
            mAs1 = [(a, re.compile(r'(?<!\S)%s="([^"]*)"' % re.escape(a)))
                    for a in attrs1]

            values = {}  # attr -> value
            for line in _open(xmlfile):
                if mE0 in line:
                    for a, r in mAs0:
                        m = r.search(line)
                        if m:
                            values[a] = m.group(1)
                        else:
                            # do not carry over the value of the previous
                            # outer element
                            values.pop(a, None)
                if mE1 in line:
                    skip = False
                    for a, r in mAs1:
                        m = r.search(line)
                        if m:
                            values[a] = m.group(1)
                        else:
                            skip = True
                            skippedLines[a] += 1
                    # the current outer element may not have provided all of
                    # its attributes (or no outer element was seen yet)
                    for a in attrs0:
                        if a not in values:
                            skip = True
                            skippedLines[a] += 1
                    if not skip:
                        yield [values[a] for a in attrs]

            for attr, count in skippedLines.items():
                print(
                    "Warning: Skipped %s lines because of missing attributes '%s'."
                    % (count, attr),
                    file=sys.stderr)
예제 #2
0
 def datastream(xmlfile):
     """Yield the attribute values of every matching element line in *xmlfile*.

     The file is scanned line by line as text.  For each line that contains
     the opening tag of ``allElems[0]`` and carries all attributes listed in
     ``attrs``, a list of the attribute values (ordered like ``attrs``) is
     yielded.  Lines lacking any of the attributes are skipped silently.

     Relies on ``allElems``, ``attrs`` and ``_open`` from the enclosing scope.
     """
     mE = "<%s " % allElems[0]
     # (?<!\S) anchors the name at a whitespace boundary so that e.g. 'x'
     # does not match inside 'max="..."'; re.escape keeps '.' and other
     # regex metacharacters in attribute names literal.
     mAs = [re.compile(r'(?<!\S)%s="([^"]*)"' % re.escape(a)) for a in attrs]
     for line in _open(xmlfile):
         if mE in line:
             matches = [r.search(line) for r in mAs]
             if all(matches):
                 yield [m.group(1) for m in matches]
예제 #3
0
def getDataStream(options):
    """Return a generator function that extracts the configured attributes.

    The first file in ``options.files`` is parsed to determine which element
    carries each requested attribute and at which nesting level that element
    occurs.  Based on this, a line-based reader ``datastream(xmlfile)`` is
    returned which yields, for every matching line, the attribute values as
    a list of strings ordered like ``options.attrOptions``.

    Args:
        options: object providing
            ``attrOptions`` - names of the option fields holding the
            attribute names,
            ``attrElems`` - per entry of ``attrOptions`` the required element
            tag or None to accept any element,
            ``files`` - input files (only the first one is inspected here),
            and one field per entry of ``attrOptions``.

    Returns:
        The function ``datastream(xmlfile)``.

    Exits (``sys.exit``) if an attribute does not occur in the first file or
    if the attributes are spread over more than two different elements.
    """
    attrOptions = options.attrOptions
    attrs = [getattr(options, a) for a in attrOptions]
    # number of distinct attribute names that have to be located
    numAttrs = len(set(attrs))
    attr2elem = {}
    elem2level = {}

    # determine elements and nesting for the given attributes
    # by reading from the first file
    level = 0
    for event, elem in ET.iterparse(_open(options.files[0]), ("start", "end")):
        if event == "start":
            level += 1
            for a, e in zip(attrOptions, options.attrElems):
                attr = getattr(options, a)
                if attr not in elem.keys():
                    continue
                if e is not None and e != elem.tag:
                    # the attribute was requested for another element
                    continue
                elem2level[elem.tag] = level
                if attr in attr2elem:
                    oldTag = attr2elem[attr]
                    if oldTag != elem.tag:
                        # prefer the more deeply nested element
                        if elem2level[oldTag] < level:
                            attr2elem[attr] = elem.tag
                        print(
                            "Warning: found %s '%s' in element '%s' (level %s) and element '%s' (level %s)."
                            " Using '%s'." %
                            (a, attr, oldTag, elem2level[oldTag], elem.tag,
                             level, attr2elem[attr]))
                else:
                    attr2elem[attr] = elem.tag
            if len(attr2elem) == numAttrs:
                # all attributes have been seen
                break
        elif event == "end":
            level -= 1

    for a, attr in zip(attrOptions, attrs):
        if attr not in attr2elem:
            sys.exit("%s '%s' not found in %s" %
                     (a, attr, options.files[0]))

    allElems = list(set(attr2elem.values()))

    def attrRegex(attr):
        # (?<!\S) anchors the name at a whitespace boundary so that e.g. 'x'
        # does not match inside 'max="..."'; re.escape keeps '.' and other
        # regex metacharacters in attribute names literal.
        return re.compile(r'(?<!\S)%s="([^"]*)"' % re.escape(attr))

    # we don't know the order of the elements and we cannot get it from our xml parser

    if len(allElems) == 2:
        # outer element first, inner element second
        elems = sorted(allElems, key=lambda e: elem2level[e])
        mE0 = "<%s " % elems[0]
        mE1 = "<%s " % elems[1]
        attrs0 = [a for a in attrs if attr2elem[a] == elems[0]]
        attrs1 = [a for a in attrs if attr2elem[a] == elems[1]]
        mAs0 = [(a, attrRegex(a)) for a in attrs0]
        mAs1 = [(a, attrRegex(a)) for a in attrs1]

        def datastream(xmlfile):
            """Yield one value list per inner-element line of *xmlfile*.

            Values of the outer element are remembered and combined with the
            values of each following inner-element line.  Lines for which a
            value is unavailable are skipped and counted; the counts are
            reported on stderr after the file has been read completely.
            """
            skippedLines = defaultdict(int)
            values = {}  # attr -> value
            for line in _open(xmlfile):
                if mE0 in line:
                    for a, r in mAs0:
                        m = r.search(line)
                        if m:
                            values[a] = m.group(1)
                        else:
                            # do not carry over the value of the previous
                            # outer element
                            values.pop(a, None)
                if mE1 in line:
                    skip = False
                    for a, r in mAs1:
                        m = r.search(line)
                        if m:
                            values[a] = m.group(1)
                        else:
                            skip = True
                            skippedLines[a] += 1
                    # the current outer element may not have provided all of
                    # its attributes (or no outer element was seen yet)
                    for a in attrs0:
                        if a not in values:
                            skip = True
                            skippedLines[a] += 1
                    if not skip:
                        yield [values[a] for a in attrs]

            for attr, count in skippedLines.items():
                print(
                    "Warning: Skipped %s lines because of missing attributes '%s'."
                    % (count, attr),
                    file=sys.stderr)

        return datastream

    elif len(allElems) == 1:
        mE = "<%s " % allElems[0]
        mAs = [attrRegex(a) for a in attrs]

        def datastream(xmlfile):
            """Yield one value list per element line carrying all attributes.

            Lines lacking any of the attributes are skipped silently.
            """
            for line in _open(xmlfile):
                if mE in line:
                    matches = [r.search(line) for r in mAs]
                    if all(matches):
                        yield [m.group(1) for m in matches]

        return datastream

    else:
        sys.exit(
            "Found attributes at elements %s but at most 2 elements are supported"
            % allElems)