def conserved_cysteines(folders, outfile, regex=None, frequency=False):
    """Summarise, per CDR3 length, how often the cysteine spacing check passes.

    For every entry of ``folders`` the file ``<folder[0]>/6_Junction.txt`` is
    read as a tab-separated table.  A row is counted only when

    * ``Functionality`` starts with ``productive``,
    * ``D-REGION reading frame`` equals ``'2'``, and
    * the D gene is IGHD2-2, IGHD2-8 or IGHD2-15 with ``D-REGION-nt nb`` > 24
      (checked with ``helper_functions.spacing_4``), or IGHD2-21 with
      ``D-REGION-nt nb`` > 21 (checked with ``helper_functions.spacing_3``).

    Counted rows are grouped by the length of ``CDR3-IMGT (AA)`` in one
    ``Cdr`` per length and file; the per-file results are then merged into
    one ``Cdr_all_files`` per length.  ``Cdr`` and ``Cdr_all_files`` are
    defined elsewhere; how they turn the increments into statistics is not
    visible here.

    Args:
        folders: iterable of indexable items; only ``folder[0]`` (a directory
            path) is used by this function.
        outfile: path of the CSV summary to write (overwritten).
        regex: pattern matched against the start of ``Sequence ID``; its
            first group must be an integer read count.  Only used when
            ``frequency`` is true.
        frequency: when true every row is weighted by the count extracted
            with ``regex``; otherwise every row weighs 1.

    Returns:
        A list of ``[length, item._cysteines]`` pairs sorted by length, i.e.
        the first two columns of the rows written to ``outfile``.

    Raises:
        TypeError / re.error: if ``frequency`` is true and ``regex`` is
            missing or invalid (raised before any file is read).
        AttributeError: if ``frequency`` is true and ``regex`` does not match
            a ``Sequence ID``.
    """
    # Compile once instead of once per row.  The raw string avoids the
    # invalid "\*" escape of the old pattern; the regex itself is unchanged.
    d_gene_pat = re.compile(r'IGHD[^*]+')
    freq_pat = re.compile(regex) if frequency else None

    totals_by_length = {}
    for folder in folders:
        per_file = {}
        # newline='' is what the csv module expects for its input files.
        with open(folder[0] + '/6_Junction.txt', newline='') as f:
            for row in csv.DictReader(f, delimiter='\t'):
                if not row['Functionality'].startswith('productive'):
                    continue
                if row['D-REGION reading frame'] != '2':
                    continue

                # A missing/empty D-gene field or an unparsable name means
                # "no D gene" (''), as before, but without a bare except.
                d_match = d_gene_pat.search(row.get('D-GENE and allele') or '')
                dgene = d_match.group(0) if d_match else ''

                if freq_pat is not None:
                    freq = int(freq_pat.match(row['Sequence ID']).group(1))
                else:
                    freq = 1

                # cysteines stays 0 unless the spacing helper accepts the
                # CDR3; the row is counted (d2) either way.
                cysteines = 0
                d2 = False
                if (dgene in ('IGHD2-2', 'IGHD2-8', 'IGHD2-15')
                        and int(row['D-REGION-nt nb']) > 24):
                    d2 = True
                    if helper_functions.spacing_4(row['CDR3-IMGT (AA)']):
                        cysteines = freq
                elif dgene == 'IGHD2-21' and int(row['D-REGION-nt nb']) > 21:
                    d2 = True
                    if helper_functions.spacing_3(row['CDR3-IMGT (AA)']):
                        cysteines = freq
                if not d2:
                    continue

                length = len(row['CDR3-IMGT (AA)'])
                entry = per_file.get(length)
                if entry is None:
                    per_file[length] = Cdr(length, cysteines, freq)
                else:
                    entry._increment(cysteines, freq)

        # Fold this file's per-length statistics into the overall totals.
        for item in per_file.values():
            item._calculate_stats()
            total = totals_by_length.get(item._length)
            if total is None:
                totals_by_length[item._length] = Cdr_all_files(
                    item._length, item._fraction_conserved, item._number)
            else:
                total._increment(item._fraction_conserved, item._number)

    summary = sorted(totals_by_length.values(), key=lambda entry: entry._length)
    data = []
    with open(outfile, 'w') as out:
        out.write('CDR3 length,fraction cysteines conserved,number of seqs\n')
        for item in summary:
            item._calculate_stats()
            fields = (item._length, item._cysteines, item._number)
            out.write(','.join(str(field) for field in fields) + '\n')
            data.append([item._length, item._cysteines])
    return data
def analyse_dsegs(folders, outfile, regex=None, frequency=False):
    """Write per-D-segment totals for multi-cysteine CDR3s and return them.

    For every entry of ``folders`` the file ``<folder[0]>/5_AA-sequences.txt``
    is read as a tab-separated table.  Rows whose ``Functionality`` starts
    with ``productive`` and whose ``CDR3-IMGT`` contains more than one ``C``
    are assigned to their D gene ('' when none can be parsed) and recorded on
    a per-file ``Cdr``: via ``increment`` when
    ``helper_functions.spacing_4`` accepts the CDR3, via ``increment_nc``
    otherwise.  After ``calculate_stats()`` the per-file ``fractions`` /
    ``fractions_nc`` values are multiplied by ``Fraction(folder[1])`` and
    accumulated on a single ``Cdr_all_files``.  ``Cdr`` and
    ``Cdr_all_files`` are defined elsewhere; their internals are not visible
    here.

    The output file holds one header line and one data line: the 33 ``nc``
    columns followed by the 33 plain columns.  The data line previously
    contained a doubled comma between the two groups, which shifted every
    plain column one place to the right of its header; this is fixed so
    header and data line up.

    Args:
        folders: iterable of indexable items; ``folder[0]`` is a directory
            path and ``folder[1]`` a value accepted by ``fractions.Fraction``
            that scales this file's contribution (presumably a per-sample
            weight -- confirm with callers).
        outfile: path of the CSV file to write (overwritten).
        regex: pattern matched against the start of ``Sequence ID``; its
            first group must be an integer read count.  Only used when
            ``frequency`` is true.
        frequency: when true every row is weighted by the count extracted
            with ``regex``; otherwise every row weighs 1.

    Returns:
        ``[dseg_list_nc, dseg_list]``: the values of ``cdr_all.d_segs_nc``
        and ``cdr_all.d_segs`` in header column order.

    Raises:
        TypeError / re.error: if ``frequency`` is true and ``regex`` is
            missing or invalid (raised before any file is read).
        AttributeError: if ``frequency`` is true and ``regex`` does not match
            a ``Sequence ID``.
    """
    # Column order of the report; '' collects rows without a parsable D gene
    # and is labelled "none" in the header.
    d_segments = (
        "IGHD1-1", "IGHD1-2", "IGHD1-3", "IGHD1-4",
        "IGHD1-5", "IGHD1-6", "IGHD1-7", "IGHD1-8",
        "IGHD2-1", "IGHD2-2", "IGHD2-3", "IGHD2-4", "IGHD2-5", "IGHD2-6",
        "IGHD3-1", "IGHD3-2", "IGHD3-3", "IGHD3-4",
        "IGHD4-1", "IGHD4-2", "IGHD4-3", "IGHD4-4",
        "IGHD5-1", "IGHD5-2", "IGHD5-3",
        "IGHD6-1", "IGHD6-2", "IGHD6-3", "IGHD6-4", "IGHD6-5", "IGHD6-6",
        "IGHD7-1",
        "",
    )

    # Compile once instead of once per row.  The raw string avoids the
    # invalid "\*" escape of the old pattern; the regex itself is unchanged.
    d_gene_pat = re.compile(r"IGHD[^*]+")
    freq_pat = re.compile(regex) if frequency else None

    cdr_all = Cdr_all_files()
    for folder in folders:
        cdr = Cdr()
        # newline='' is what the csv module expects for its input files.
        with open(folder[0] + "/5_AA-sequences.txt", newline="") as f:
            for row in csv.DictReader(f, delimiter="\t"):
                if not row["Functionality"].startswith("productive"):
                    continue
                cdr3 = row["CDR3-IMGT"]
                if cdr3.count("C") <= 1:
                    continue

                # A missing/empty D-gene field or an unparsable name means
                # "no D gene" (''), as before, but without a bare except.
                d_match = d_gene_pat.search(row.get("D-GENE and allele") or "")
                dgene = d_match.group(0) if d_match else ""

                if freq_pat is not None:
                    freq = int(freq_pat.match(row["Sequence ID"]).group(1))
                else:
                    freq = 1

                if helper_functions.spacing_4(cdr3):
                    cdr.increment(dgene, freq)
                else:
                    cdr.increment_nc(dgene, freq)

        cdr.calculate_stats()
        weight = fractions.Fraction(folder[1])
        for key in cdr.fractions:
            cdr_all.increment(key, cdr.fractions[key] * weight)
            cdr_all.increment_nc(key, cdr.fractions_nc[key] * weight)

    dseg_list_nc = [cdr_all.d_segs_nc[name] for name in d_segments]
    dseg_list = [cdr_all.d_segs[name] for name in d_segments]

    labels = [name or "none" for name in d_segments]
    header = ",".join([label + "nc" for label in labels] + labels)
    with open(outfile, "w") as out:
        out.write(header + "\n")
        # Exactly one comma between every pair of values so the data row has
        # the same number of fields as the header.
        out.write(",".join(str(value) for value in dseg_list_nc + dseg_list))

    return [dseg_list_nc, dseg_list]