예제 #1
0
파일: helpers.py 프로젝트: gpratt/gffutils
def _unjsonify(x, isattributes=False):
    """
    Decode the JSON string `x`.

    If `isattributes` is true, the decoded object is wrapped in
    `feature.dict_class` before being returned; otherwise the decoded object
    is returned exactly as the JSON parser produced it.
    """
    decoded = simplejson.loads(x)
    if not isattributes:
        return decoded
    return feature.dict_class(decoded)
예제 #2
0
파일: parser.py 프로젝트: roryk/gffutils
def _split_keyvals(keyval_str, dialect=None):
    """
    Given the string attributes field of a GFF-like line, split it into an
    attributes dictionary and a "dialect" dictionary which contains information
    needed to reconstruct the original string.

    Lots of logic here to handle all the corner cases.

    If `dialect` is None, then do all the logic to infer a dialect from this
    attribute string.

    Otherwise, use the provided dialect (and return it at the end).

    Parameters
    ----------
    keyval_str : str
        The raw attributes field.  If it is empty, an empty attributes
        dictionary is returned immediately.
    dialect : dict or None
        Dialect to parse with.  When provided, the keys "trailing semicolon",
        "field separator", "keyval separator", "fmt", "leading semicolon" and
        "quoted GFF2 values" are read and the dict is returned unmodified.
        When None, a copy of `constants.dialect` is made and filled in from
        what is observed in `keyval_str`.

    Returns
    -------
    (quals, dialect)
        `quals` is a `feature.dict_class` instance mapping each key to a list
        of its values (values are split on ","; a key without a value maps to
        an empty list).  `dialect` is the provided or inferred dialect.

    Raises
    ------
    helpers.AttributeStringError
        While inferring a dialect, if a comma-separated multi-value is seen
        after a repeated key has already been seen.
    AssertionError
        If a GFF3-style field does not split into exactly one or two pieces
        on the key/value separator.
    """
    infer_dialect = False
    if dialect is None:
        # Make a copy of default dialect so it can be modified as needed
        dialect = copy.copy(constants.dialect)
        infer_dialect = True

    quals = feature.dict_class()
    if not keyval_str:
        return quals, dialect

    # If a dialect was provided, then use that directly.
    if not infer_dialect:
        if dialect["trailing semicolon"]:
            keyval_str = keyval_str.rstrip(";")

        parts = keyval_str.split(dialect["field separator"])
        kvsep = dialect["keyval separator"]

        if dialect["fmt"] == "gff3":
            key_vals = [p.split(kvsep) for p in parts]
        else:
            # Key is the first item of each field; everything after it is the
            # value.  When the dialect says keys may carry a misplaced leading
            # semicolon, strip it -- but only when it is really there, so a
            # line without the stray ";" does not lose the first character of
            # its key.  This mirrors what the inference branch below does.
            strip_semicolon = dialect["leading semicolon"]
            pieces = []
            for p in parts:
                if strip_semicolon and p.startswith(";"):
                    p = p[1:]
                pieces.append(p.strip().split(kvsep))
            # Build the (key, value) pairs once, after all fields are split.
            key_vals = [(p[0], " ".join(p[1:])) for p in pieces]

        quoted = dialect["quoted GFF2 values"]
        for item in key_vals:
            # Easy if it follows spec
            if len(item) == 2:
                key, val = item

            # Only key provided?
            else:
                assert len(item) == 1, item
                key = item[0]
                val = ""

            if key not in quals:
                quals[key] = []

            if quoted and len(val) > 0 and val[0] == '"' and val[-1] == '"':
                val = val[1:-1]

            if val:
                # TODO: if there are extra commas for a value, just use empty
                # strings
                # quals[key].extend([v for v in val.split(',') if v])
                quals[key].extend(val.split(","))

        return quals, dialect

    # If we got here, then we need to infer the dialect....
    #
    # Reset the order to an empty list so that it will only be populated with
    # keys that are found in the file.
    dialect["order"] = []

    # ensembl GTF has trailing semicolon
    if keyval_str.endswith(";"):
        keyval_str = keyval_str[:-1]
        dialect["trailing semicolon"] = True

    # GFF2/GTF has a semicolon with at least one space after it.
    # Spaces can be on both sides (e.g. wormbase)
    # GFF3 works with no spaces.
    # So split on the first one we can recognize...
    # (If none of them splits the string, `parts` is the single-field result
    # of the last attempt and the dialect's field separator is left as-is.)
    for sep in (" ; ", "; ", ";"):
        parts = keyval_str.split(sep)
        if len(parts) > 1:
            dialect["field separator"] = sep
            break

    # Is it GFF3?  They have key-vals separated by "="
    if gff3_kw_pat.match(parts[0]):
        key_vals = [p.split("=") for p in parts]
        dialect["fmt"] = "gff3"
        dialect["keyval separator"] = "="

    # Otherwise, key-vals separated by space.  Key is first item.
    else:
        dialect["keyval separator"] = " "
        pieces = []
        for p in parts:
            # Fix misplaced semicolons in keys in some GFF2 files
            if p and p[0] == ";":
                p = p[1:]
                dialect["leading semicolon"] = True
            pieces.append(p.strip().split(" "))
        key_vals = [(p[0], " ".join(p[1:])) for p in pieces]

    for item in key_vals:

        # Easy if it follows spec
        if len(item) == 2:
            key, val = item

        # Only key provided?
        else:
            assert len(item) == 1, item
            key = item[0]
            val = ""

        # Is the key already in there?
        if key in quals:
            dialect["repeated keys"] = True
        else:
            quals[key] = []

        # Remove quotes in GFF2
        if len(val) > 0 and val[0] == '"' and val[-1] == '"':
            val = val[1:-1]
            dialect["quoted GFF2 values"] = True

        if val:
            # TODO: if there are extra commas for a value, just use empty
            # strings
            # quals[key].extend([v for v in val.split(',') if v])
            vals = val.split(",")
            # A line cannot express multiple values both ways at once.
            if (len(vals) > 1) and dialect["repeated keys"]:
                raise helpers.AttributeStringError(
                    "Internally inconsistent attributes formatting: some have repeated keys, some do not."
                )
            quals[key].extend(vals)

        # keep track of the order of keys
        dialect["order"].append(key)

    # TODO: values are deliberately NOT passed through urllib.unquote here.
    # Unquoting breaks round trip invariance for the "hybrid1.gff3" test
    # file.  This is because the "Note" field has %xx escape chars, but
    # "Dbxref" has ":" which, if everything were consistent, should have also
    # been escaped.
    #
    # (By the way, GFF3 spec says only literal use of \t, \n, \r, %, and
    # control characters should be encoded)
    #
    # Solution 1: don't unquote
    # Solution 2: store, along with each attribute, whether or not it
    #             should be quoted later upon reconstruction
    # Solution 3: don't care about invariance

    if (dialect["keyval separator"] == " ") and (dialect["quoted GFF2 values"]):
        dialect["fmt"] = "gtf"

    return quals, dialect