def conserved_cysteines(folders, outfile, regex=None, frequency=False):
    """Summarise conserved-cysteine counts per CDR3 length across folders.

    For every folder, ``<folder[0]>/6_Junction.txt`` is read as a
    tab-delimited table with a header row (presumably IMGT/HighV-QUEST
    output -- confirm).  A row is considered only when its 'Functionality'
    starts with 'productive' and its 'D-REGION reading frame' is '2'.  Of
    those rows, only two groups are counted:

    * D gene IGHD2-2, IGHD2-8 or IGHD2-15 with 'D-REGION-nt nb' > 24,
      tested with ``helper_functions.spacing_4``;
    * D gene IGHD2-21 with 'D-REGION-nt nb' > 21, tested with
      ``helper_functions.spacing_3``.

    A counted row contributes its frequency to the "conserved" tally when
    the spacing helper returns a true value for 'CDR3-IMGT (AA)'.  Rows
    are grouped by ``len()`` of that CDR3 string in one ``Cdr`` per length
    and folder; the per-folder results are then folded into one
    ``Cdr_all_files`` per length.

    Args:
        folders: iterable of indexable items; only element 0 (the
            directory path) is used here.
        outfile: path of the CSV report to (over)write.
        regex: pattern applied with ``match`` to 'Sequence ID'; capture
            group 1 must be an integer count.  Only used when
            ``frequency`` is true.
        frequency: when true, weight each row by the count parsed from
            its 'Sequence ID'; otherwise every row weighs 1.

    Returns:
        A list of ``[length, item._cysteines]`` pairs ordered by CDR3
        length -- the first two columns of the rows written to
        ``outfile``.

    Raises:
        TypeError: ``frequency`` is true but ``regex`` is None.
        AttributeError: ``frequency`` is true and ``regex`` does not match
            the 'Sequence ID' of a considered row.
    """
    # Raw string: '\*' in a plain literal is an invalid escape sequence.
    dgene_pat = re.compile(r'IGHD[^\*]+')
    freq_pat = re.compile(regex) if frequency else None

    cdr_list_all = []
    for folder in folders:
        cdr_list = []
        with open(folder[0] + '/6_Junction.txt') as f:
            reader = csv.DictReader(f, delimiter='\t')
            for row in reader:
                if not row['Functionality'].startswith('productive'):
                    continue
                if row['D-REGION reading frame'] != '2':
                    continue

                # A missing column, an empty cell or an unrecognised gene
                # name all mean "no D gene", recorded as ''.
                allele = row.get('D-GENE and allele') or ''
                found = dgene_pat.search(allele)
                dgene = found.group(0) if found else ''

                if frequency:
                    freq = int(freq_pat.match(row['Sequence ID']).group(1))
                else:
                    freq = 1

                # Pick the minimum D-region length and the spacing test
                # that belong to this D gene; other genes are not counted.
                if dgene in ('IGHD2-2', 'IGHD2-8', 'IGHD2-15'):
                    min_nt = 24
                    has_spacing = helper_functions.spacing_4
                elif dgene == 'IGHD2-21':
                    min_nt = 21
                    has_spacing = helper_functions.spacing_3
                else:
                    continue
                if int(row['D-REGION-nt nb']) <= min_nt:
                    continue

                cdr3 = row['CDR3-IMGT (AA)']
                cysteines = freq if has_spacing(cdr3) else 0
                length = len(cdr3)

                for item in cdr_list:
                    if item._length == length:
                        item._increment(cysteines, freq)
                        break
                else:
                    cdr_list.append(Cdr(length, cysteines, freq))

        # Fold this folder's per-length results into the overall tallies.
        for item in cdr_list:
            item._calculate_stats()
            for total in cdr_list_all:
                if total._length == item._length:
                    total._increment(item._fraction_conserved, item._number)
                    break
            else:
                cdr_list_all.append(
                    Cdr_all_files(item._length, item._fraction_conserved, item._number)
                )

    cdr_list_all.sort(key=lambda x: x._length)

    data = []
    with open(outfile, 'w') as out:
        out.write('CDR3 length,fraction cysteines conserved,number of seqs\n')
        for item in cdr_list_all:
            item._calculate_stats()
            fields = (item._length, item._cysteines, item._number)
            out.write(','.join(str(value) for value in fields) + '\n')
            data.append([item._length, item._cysteines])
    return data
Exemplo n.º 2
0
def analyse_dsegs(folders, outfile, regex=None, frequency=False):
    """Report, per D segment, weighted fractions of multi-cysteine CDR3s.

    For every folder, ``<folder[0]>/5_AA-sequences.txt`` is read as a
    tab-delimited table with a header row (presumably IMGT/HighV-QUEST
    output -- confirm).  Rows whose 'Functionality' starts with
    'productive' and whose 'CDR3-IMGT' contains more than one 'C' are
    tallied in a per-folder ``Cdr``: via ``increment`` when
    ``helper_functions.spacing_4`` returns a true value for the CDR3, via
    ``increment_nc`` otherwise.  After ``calculate_stats()`` each per-gene
    fraction is multiplied by ``fractions.Fraction(folder[1])`` and added
    to a shared ``Cdr_all_files``.

    The report written to ``outfile`` is a two-line CSV: a header with the
    '<gene>nc' columns followed by the '<gene>' columns, then one data row
    in the same order.  Sequences without a recognised D gene are stored
    under the key '' and labelled 'none' in the header.

    Args:
        folders: iterable of indexable items; element 0 is the directory
            path and element 1 is a value accepted by
            ``fractions.Fraction`` used as that folder's weight.
        outfile: path of the CSV report to (over)write.
        regex: pattern applied with ``match`` to 'Sequence ID'; capture
            group 1 must be an integer count.  Only used when
            ``frequency`` is true.
        frequency: when true, weight each row by the count parsed from
            its 'Sequence ID'; otherwise every row weighs 1.

    Returns:
        ``[dseg_list_nc, dseg_list]``: the values of
        ``cdr_all.d_segs_nc`` and ``cdr_all.d_segs`` in report column
        order.

    Raises:
        TypeError: ``frequency`` is true but ``regex`` is None.
        AttributeError: ``frequency`` is true and ``regex`` does not match
            the 'Sequence ID' of a tallied row.
        KeyError: one of the report keys is absent from ``d_segs_nc`` or
            ``d_segs`` (unless those mappings supply defaults).
    """
    # Report column order; '' stands for "no D gene recognised".
    dseg_keys = (
        "IGHD1-1", "IGHD1-2", "IGHD1-3", "IGHD1-4",
        "IGHD1-5", "IGHD1-6", "IGHD1-7", "IGHD1-8",
        "IGHD2-1", "IGHD2-2", "IGHD2-3", "IGHD2-4", "IGHD2-5", "IGHD2-6",
        "IGHD3-1", "IGHD3-2", "IGHD3-3", "IGHD3-4",
        "IGHD4-1", "IGHD4-2", "IGHD4-3", "IGHD4-4",
        "IGHD5-1", "IGHD5-2", "IGHD5-3",
        "IGHD6-1", "IGHD6-2", "IGHD6-3", "IGHD6-4", "IGHD6-5", "IGHD6-6",
        "IGHD7-1",
        "",
    )

    # Raw string: "\*" in a plain literal is an invalid escape sequence.
    dgene_pat = re.compile(r"IGHD[^\*]+")
    freq_pat = re.compile(regex) if frequency else None

    cdr_all = Cdr_all_files()
    for folder in folders:
        with open(folder[0] + "/5_AA-sequences.txt") as f:
            cdr = Cdr()
            reader = csv.DictReader(f, delimiter="\t")
            for row in reader:
                if not row["Functionality"].startswith("productive"):
                    continue
                if row["CDR3-IMGT"].count("C") <= 1:
                    continue

                # A missing column, an empty cell or an unrecognised gene
                # name all mean "no D gene", recorded as "".
                allele = row.get("D-GENE and allele") or ""
                found = dgene_pat.search(allele)
                dgene = found.group(0) if found else ""

                if frequency:
                    freq = int(freq_pat.match(row["Sequence ID"]).group(1))
                else:
                    freq = 1

                if helper_functions.spacing_4(row["CDR3-IMGT"]):
                    cdr.increment(dgene, freq)
                else:
                    cdr.increment_nc(dgene, freq)

        cdr.calculate_stats()
        for key in cdr.fractions:
            weight = fractions.Fraction(folder[1])
            cdr_all.increment(key, cdr.fractions[key] * weight)
            cdr_all.increment_nc(key, cdr.fractions_nc[key] * weight)

    # Collected before the file is opened so a missing key cannot leave a
    # half-written report behind.
    dseg_list_nc = [cdr_all.d_segs_nc[key] for key in dseg_keys]
    dseg_list = [cdr_all.d_segs[key] for key in dseg_keys]

    # Header and data row are joined from the same key order, so they
    # always have the same number of columns and no stray whitespace ends
    # up inside a column name.
    labels = [key or "none" for key in dseg_keys]
    header = [label + "nc" for label in labels] + labels
    with open(outfile, "w") as out:
        out.write(",".join(header) + "\n")
        out.write(",".join(str(value) for value in dseg_list_nc + dseg_list))

    return [dseg_list_nc, dseg_list]