Пример #1
0
def _adaptive_gene_list(name, ties):
    """Return the gene calls of one segment as a list of names.

    A resolved call yields a one-element list; the literal "unresolved"
    yields the comma-separated entries of *ties*, each stripped of
    surrounding whitespace.
    """
    if name != "unresolved":
        return [name]
    return [tie.strip() for tie in ties.split(",")]


def adaptive_parseline(line, index2col):
    """Parse one tab-separated data line into a Clone.

    Only columns whose names are returned by adaptive_columns() are kept.

    Args:
        line: Raw text of one data line (tab-delimited).
        index2col: Dict mapping a column index (used to index the split
            fields of *line*) to the column name from the header line.

    Returns:
        A Clone, or None when the number of fields differs from the number
        of header columns (a warning is written to stderr) or when any of
        the required columns is missing.

    Raises:
        ValueError: If a numeric column cannot be converted by int()/float().
    """
    # Strip only the line terminator so that empty edge columns survive.
    items = line.rstrip("\r\n").split("\t")
    if len(items) != len(index2col):
        sys.stderr.write(
            "Inconsistent number of columns between the following line "
            "and the header line, skipped it:\nLine:\n%s\n" % line
        )
        return None

    valid_cols = adaptive_columns()
    col2val = {}
    for i, col in index2col.items():
        if col in valid_cols:
            # Every kept value has "/" rewritten as ", " (all columns).
            col2val[col] = items[i].replace("/", ", ")

    # Return None if line does not have minimum required fields.
    required_cols = (
        "count",
        "frequencyCount",
        "nucleotide",
        "vGeneName",
        "jGeneName",
        "vGeneNameTies",
        "jGeneNameTies",
    )
    if any(col not in col2val for col in required_cols):
        return None

    count = int(col2val["count"])
    freq = float(col2val["frequencyCount"]) / 100.0  # percentage -> fraction
    nuc = col2val["nucleotide"]
    vgenes = _adaptive_gene_list(col2val["vGeneName"], col2val["vGeneNameTies"])
    jgenes = _adaptive_gene_list(col2val["jGeneName"], col2val["jGeneNameTies"])

    # Clone with required fields
    clone = Clone(count, freq, nuc, vgenes, jgenes)

    # Additional information if available
    # Gene info:
    if "dGeneName" in col2val:
        dname = col2val["dGeneName"]
        if dname != "unresolved":
            clone.dgenes = [dname]
        elif "dGeneNameTies" in col2val:
            # The ties column is optional; without it dgenes is left as is.
            clone.dgenes = _adaptive_gene_list(dname, col2val["dGeneNameTies"])

    if "sequenceStatus" in col2val:
        status = col2val["sequenceStatus"].lower()
        if status == "in":
            clone.productive = True
        elif status in ("out", "stop"):
            clone.productive = False
        else:
            sys.stderr.write("Unknown status: %s\n" % status)
    if "aminoAcid" in col2val:
        clone.cdr3aa = col2val["aminoAcid"]

    # Junctional info:
    # offset = number of leading bases trimmed from clone.nuc below; every
    # index read from the file is shifted by it to stay aligned.
    offset = 0
    if "vIndex" in col2val:
        vindex = int(col2val["vIndex"])
        if clone.productive:
            # Make sure nuc is inframe: trim so that it starts on the codon
            # boundary implied by vindex and its length is a multiple of 3.
            offset = vindex % 3
            nuclen = len(clone.nuc)
            endoffset = (nuclen - offset) % 3
            clone.nuc = clone.nuc[offset : nuclen - endoffset]
            clone.aa = libcommon.nt2aa(clone.nuc)
        if clone.cdr3aa:
            # vindex refers to the untrimmed sequence, so shift it by offset,
            # and cap the end at the sequence length (min, not max).
            cdr3start = vindex - offset
            cdr3end = min(len(clone.nuc), cdr3start + len(clone.cdr3aa) * 3)
            clone.cdr3nuc = clone.nuc[cdr3start:cdr3end]
    if "dIndex" in col2val:
        clone.firstdpos = int(col2val["dIndex"]) - offset
    if "n2Index" in col2val:
        n2index = int(col2val["n2Index"])
        if n2index != -1:
            clone.lastvpos = n2index - 1 - offset
        elif clone.firstdpos:  # No d5ins
            # NOTE(review): truthiness test also skips firstdpos == 0.
            clone.lastvpos = clone.firstdpos - 1
    if "jIndex" in col2val:
        clone.firstjpos = int(col2val["jIndex"]) - offset
    if "n1Index" in col2val:
        n1index = int(col2val["n1Index"])
        if n1index != -1:
            clone.lastdpos = n1index - 1 - offset
        elif clone.firstjpos:  # No d3ins
            clone.lastdpos = clone.firstjpos - 1

    # Deletion info:
    if "vDeletion" in col2val:
        clone.vdel = int(col2val["vDeletion"])
    if "d5Deletion" in col2val:
        clone.d5del = int(col2val["d5Deletion"])
    if "d3Deletion" in col2val:
        clone.d3del = int(col2val["d3Deletion"])
    if "jDeletion" in col2val:
        clone.jdel = int(col2val["jDeletion"])

    return clone
Пример #2
0
def _sequenta_has_value(text):
    """Return True unless *text* is empty, "NAN", or starts with "-"."""
    return not text.startswith("-") and text not in ("", "NAN")


def sequenta_parseline(line, index2col):
    """Parse one tab-separated data line into a Clone.

    Only columns whose names are returned by sequenta_columns() are kept.

    Args:
        line: Raw text of one data line (tab-delimited).
        index2col: Dict mapping a column index (used to index the split
            fields of *line*) to the column name from the header line.

    Returns:
        A Clone, or None when the line is skipped. A line is skipped when:
        - the field count differs from the header (warning on stderr);
        - the "Patient" column equals "Water";
        - a required column is missing or holds "NAN", "" or "-";
        - the frequency cannot be computed;
        - the CDR3 cannot be translated by sequenta_getaa();
        - the CDR3 is found neither in the clone sequence nor in its
          reverse complement.
    """
    # Strip only the line terminator so that empty edge columns survive.
    items = line.rstrip("\r\n").split("\t")
    if len(items) != len(index2col):
        sys.stderr.write(
            "Inconsistent number of columns between the following line "
            "and the header line, skipped it:\nLine:\n%s\n" % line
        )
        return None

    valid_cols = sequenta_columns()
    col2val = {}
    for i, col in index2col.items():
        if col in valid_cols:
            col2val[col] = items[i]

    # Return None if clone is "Water"
    if col2val.get("Patient") == "Water":
        return None

    # Return None if line does not have minimum required fields.
    missing = ("NAN", "", "-")
    required_cols = (
        "Total_Read_Count",
        "Log10_Frequency",
        "Clone_Sequence",
        "V_Segment_Major_Gene",
        "J_Segment_Major_Gene",
    )
    for col in required_cols:
        if col not in col2val or col2val[col] in missing:
            return None

    count = libcommon.soft_int(col2val["Total_Read_Count"])
    try:
        freq = 10 ** float(col2val["Log10_Frequency"])
    except (ValueError, OverflowError):
        # Non-numeric text, or an exponent too large to represent.
        return None
    nuc = col2val["Clone_Sequence"]
    vgenes = col2val["V_Segment_Major_Gene"].split("; ")
    jgenes = col2val["J_Segment_Major_Gene"].split("; ")
    # Clone with required fields
    clone = Clone(count, freq, nuc, vgenes, jgenes)

    # Additional information if available
    # Gene info:
    if "D_Segment_Major_Allele" in col2val:
        dstr = col2val["D_Segment_Major_Allele"]
        if dstr not in missing:
            dalleles = dstr.split("; ")
            # Gene name = allele name up to "*"; keep first-seen order.
            dgenes = []
            for allele in dalleles:
                gene = allele.split("*")[0]
                if gene not in dgenes:
                    dgenes.append(gene)
            clone.dgenes = dgenes
            clone.dalleles = dalleles
    if not clone.dgenes:  # no dgenes info
        # Fall back on the J group; when it is not exactly ["1"] the D gene
        # is picked at random, so the result is not deterministic.
        if get_j_groups(clone.jgenes) == ["1"]:
            clone.dgenes = ["TRBD1"]
        else:
            clone.dgenes = [random.choice(["TRBD1", "TRBD2"])]

    if "V_Segment_Major_Allele" in col2val:
        clone.valleles = col2val["V_Segment_Major_Allele"].split("; ")
    if "J_Segment_Major_Allele" in col2val:
        clone.jalleles = col2val["J_Segment_Major_Allele"].split("; ")

    # Sequence ID, status and cdr3aa:
    if "Sample" in col2val:
        clone.samplename = col2val["Sample"]
    if "Patient" in col2val:
        clone.patient = col2val["Patient"]
    if "Clone_Index" in col2val:
        clone.id = col2val["Clone_Index"]
    if "Is_Good_Frame" in col2val:
        clone.productive = col2val["Is_Good_Frame"].lower() == "true"
    if "Clone_Protein_Sequence" in col2val:
        clone.aa = col2val["Clone_Protein_Sequence"].replace("*", "Z")

    # offset = number of leading bases trimmed from clone.nuc below.
    offset = 0
    if "CDR3_Sense_Sequence" in col2val:
        clone.cdr3nuc = col2val["CDR3_Sense_Sequence"]
        # Plain substring test: the CDR3 is literal text, not a regex.
        if clone.cdr3nuc not in clone.nuc:
            clone.nuc = libcommon.rc(clone.nuc)
        try:
            clone.cdr3aa = sequenta_getaa(clone.cdr3nuc)
        except Exception:  # return None if cannot translate cdr3nuc
            return None
        cdr3start = clone.nuc.find(clone.cdr3nuc)
        if cdr3start < 0:
            # CDR3 is on neither strand; skip the line instead of crashing.
            return None
        # Make sure nuc is in frame
        offset = cdr3start % 3
        nuclen = len(clone.nuc)
        endoffset = (nuclen - offset) % 3
        clone.nuc = clone.nuc[offset : nuclen - endoffset]

    # Junctional info:
    if "V_Segment_Extension_Length" in col2val:
        vins = libcommon.soft_int(col2val["V_Segment_Extension_Length"])
        clone.lastvpos = vins - 1 - offset
        if "N_Bases_adjacent_V" in col2val:
            d5ins = col2val["N_Bases_adjacent_V"]
            if _sequenta_has_value(d5ins):
                clone.firstdpos = clone.lastvpos + int(d5ins) + 1
    if "J_Segment_Extension_Length" in col2val:
        jins = libcommon.soft_int(col2val["J_Segment_Extension_Length"])
        clone.firstjpos = len(clone.nuc) - jins
        if "N_Bases_adjacent_J" in col2val:
            d3ins = col2val["N_Bases_adjacent_J"]
            if _sequenta_has_value(d3ins):
                clone.lastdpos = clone.firstjpos - int(d3ins) - 1

    # Deletions:
    if "V_Segment_Deletion_Length" in col2val:
        vdel = col2val["V_Segment_Deletion_Length"]
        if _sequenta_has_value(vdel):
            clone.vdel = libcommon.soft_int(vdel)
    if "J_Segment_Deletion_Length" in col2val:
        jdel = col2val["J_Segment_Deletion_Length"]
        if _sequenta_has_value(jdel):
            clone.jdel = libcommon.soft_int(jdel)

    # Special treatment for D info:
    d2fulllen = {"TRBD1": 12, "TRBD2": 16}
    if "D_Segment_length" in col2val:
        # NOTE(review): raises KeyError for a D gene other than TRBD1/TRBD2.
        dfulllen = d2fulllen[clone.dgenes[0]]
        dlen = col2val["D_Segment_length"]
        if _sequenta_has_value(dlen):
            clone.d5del, clone.d3del = get_ddels(dfulllen - int(dlen))
        else:  # all D was deleted
            clone.d5del, clone.d3del = get_ddels(dfulllen)
            # Place an empty D (lastdpos == firstdpos - 1) in the middle of
            # the V-J gap; // keeps the position integral on Python 3 too.
            ndn = clone.firstjpos - clone.lastvpos
            clone.firstdpos = clone.lastvpos + ndn // 2 + 1
            clone.lastdpos = clone.firstdpos - 1

    return clone
Пример #3
0
def mitcr_parseline(line, index2col):
    """Parse one tab-separated data line into a Clone.

    Only columns whose names are returned by mitcr_columns() are kept.

    Args:
        line: Raw text of one data line (tab-delimited).
        index2col: Dict mapping a column index (used to index the split
            fields of *line*) to the column name from the header line.

    Returns:
        A Clone, or None when the number of fields differs from the number
        of header columns (a warning is written to stderr) or when a
        required column is missing or empty.

    Raises:
        ValueError: If a numeric column holds text that int()/float()
            cannot parse.
    """
    # Strip only the line terminator: a bare strip() would also eat leading
    # and trailing tabs and so drop empty first/last columns.
    items = line.rstrip("\r\n").split("\t")
    if len(items) != len(index2col):
        sys.stderr.write(
            "Inconsistent number of columns between the following line "
            "and the header line, skipped it:\nLine:\n%s\n" % line
        )
        return None

    valid_cols = mitcr_columns()
    col2val = {}
    for i, col in index2col.items():
        if col in valid_cols:
            col2val[col] = items[i]

    # Return None if line does not have minimum required fields.
    required_cols = (
        "Read count",
        "Percentage",
        "CDR3 nucleotide sequence",
        "V segments",
        "J segments",
    )
    for col in required_cols:
        if not col2val.get(col):
            return None

    count = int(col2val["Read count"])
    freq = float(col2val["Percentage"]) / 100.0  # percentage -> fraction
    nuc = col2val["CDR3 nucleotide sequence"]
    vgenes = col2val["V segments"].split(", ")
    jgenes = col2val["J segments"].split(", ")

    # The same sequence is passed both as the clone sequence and as cdr3nuc.
    clone = Clone(count, freq, nuc, vgenes, jgenes, cdr3nuc=nuc)

    clone.productive = True  # Assuming MiTCR only output productive clones

    # Optional ", "-separated list columns -> Clone attributes.
    list_fields = (
        ("dgenes", "D segments"),
        ("valleles", "V alleles"),
        ("jalleles", "J alleles"),
        ("dalleles", "D alleles"),
    )
    for attr, col in list_fields:
        if col in col2val:
            setattr(clone, attr, col2val[col].split(", "))

    if "CDR3 amino acid sequence" in col2val:
        clone.aa = col2val["CDR3 amino acid sequence"]
        clone.cdr3aa = col2val["CDR3 amino acid sequence"]

    # Optional integer position columns -> Clone attributes. A blank field
    # is treated like an absent column instead of failing in int("").
    position_fields = (
        ("lastvpos", "Last V nucleotide position"),
        ("firstdpos", "First D nucleotide position"),
        ("lastdpos", "Last D nucleotide position"),
        ("firstjpos", "First J nucleotide position"),
    )
    for attr, col in position_fields:
        if col2val.get(col, "").strip():
            setattr(clone, attr, int(col2val[col]))

    return clone