def _set_decision(cursor, decision, gis, chunk_size=500):
    """Set ``blast.Decision`` to *decision* for every row whose GI is in *gis*.

    The GIs are bound as SQL parameters rather than pasted into the statement
    text, and are sent in chunks so that one statement never carries more
    bound parameters than SQLite allows.  An empty *gis* is a no-op.
    """
    gis = list(gis)
    for offset in range(0, len(gis), chunk_size):
        chunk = gis[offset:offset + chunk_size]
        placeholders = ",".join("?" * len(chunk))
        cursor.execute(
            "UPDATE blast SET Decision=? WHERE GI IN (" + placeholders + ")",
            [decision] + chunk)


def _covered_length(intervals):
    """Return how many positions the union of inclusive (start, stop) pairs covers."""
    merged = []
    current_start = -1
    current_stop = -1
    for start, stop in sorted(intervals):
        if start > current_stop:
            merged.append((start, stop))
            current_start, current_stop = start, stop
        else:
            current_stop = max(current_stop, stop)
            merged[-1] = (current_start, current_stop)
    return sum(stop - start + 1 for start, stop in merged)


def resolve_seqs(blastdb):
    """Pick hits per tc_id/gene in the ``blast`` table and record the reason.

    Every row of ``blast`` with ``tc_id != '0'`` is grouped by tc_id and
    gene.  The outcome for each GI is written to the ``Decision`` column:

    * groups with a single hit are marked chosen (hits longer than 5000 are
      labelled as mito/chloro sequences);
    * in groups with several hits, the first hit of length >= 5000 wins and
      everything else in the group is marked not chosen;
    * otherwise each hit is placed on the query with ``tiling``; the smallest
      set of ranges covering more than 95% of the merged range is kept, and
      ranges shared by several GIs are passed to the blastlib
      ``resolve_seqs`` helper.

    Groups where nothing aligns above 70% are appended to
    ``fulllength_hand_check.txt``.

    Args:
        blastdb: path of the SQLite database holding the ``blast`` table.

    The changes are committed once at the end; the connection is closed even
    if an error interrupts the run (in which case nothing is committed).
    """
    import os, sys, sqlite3, re, time, itertools
    from Bio import Seq, Entrez, SeqIO
    # NOTE: from here on the local name `resolve_seqs` is the blastlib helper,
    # not this function.
    from blastlib.clean_seq_funcs import resolve_seqs, alignment_reg, alignment_comp, alignment_rev_comp, identity_calc, tiling
    from cleanlib.databasing import get_seqs_from_sqldb_GI, get_seqs_from_sqldb_GI_no_gene

    conn = sqlite3.connect(blastdb)
    try:
        c = conn.cursor()
        separators = re.compile(r'_|\|')

        def entry(row):
            # Encodes one hit as "tc_id_gene|GI_hit_length".
            return str(row[0])+"_"+str(row[1])+"|"+str(row[2])+"_"+str(row[3])

        def gi_of(encoded):
            # The GI is always the second-to-last field, whatever the gene
            # name contains.
            return separators.split(encoded)[-2]

        # all tc_id/gene hits, whether or not the group has several choices
        all_choices = set(entry(row) for row in c.execute(
            "SELECT tc_id, Gene_name, GI, hit_length FROM blast WHERE tc_id != '0' GROUP BY tc_id, Gene_name, GI;"))
        # tc_id/gene groups that contain exactly one hit
        single_choices = set(entry(row) for row in c.execute(
            "SELECT tc_id, Gene_name, GI, hit_length FROM blast WHERE tc_id != '0' GROUP BY tc_id, Gene_name HAVING COUNT(*) =1;"))
        # hits belonging to groups with several choices
        multiple_choices = all_choices - single_choices

        # singletons: long hits are labelled separately as mito/chloro
        organelle_singles = set(
            e for e in single_choices if int(e.split("_")[-1]) > 5000)
        single_choices -= organelle_singles
        _set_decision(c, 'Mito or chloro sequence/Chosen',
                      [gi_of(e) for e in organelle_singles])
        _set_decision(c, 'Only choice/chosen',
                      [gi_of(e) for e in single_choices])

        # "tc_id_gene" -> ["GI_hit_length", ...] for every multi-choice group
        choices = {}
        for encoded in multiple_choices:
            parts = encoded.split("|")
            choices.setdefault(parts[0], []).append(parts[1])

        print("Trying to resolve sequences")
        for done, key in enumerate(choices, 1):
            print(str(round(float(done)/float(len(choices))*100, 2))+'%')
            lengths = [int(m.split('_')[1]) for m in choices[key]]
            short_hits = [choices[key][x] for x, l in enumerate(lengths) if l < 5000]
            long_hits = [choices[key][x] for x, l in enumerate(lengths) if l >= 5000]

            if len(long_hits) > 0:
                # the first long hit is picked when there are several
                picked = long_hits[0].split("_")[0]
                c.execute("UPDATE blast SET Decision='Mito or chloro sequence/Chosen' WHERE GI=?", (picked,))
                other_long = set(x.split("_")[0] for x in long_hits) - set([picked])
                _set_decision(c, 'Mito or chloro sequence/Randomly not chosen', other_long)
                shorter = set(x.split('_')[0] for x in short_hits) - set([picked])
                _set_decision(c, 'Short or less info/Not chosen', shorter)
                continue
            if len(short_hits) == 0:
                continue

            short_gis = [x.split('_')[0] for x in short_hits]
            gene = key.split("_", 1)[1]

            # (start, end) on the query -> GIs that align to exactly that range
            ranges = {}
            for gi in short_gis:
                idens, start_stop = tiling([gi], gene, blastdb, c)
                start, end = start_stop[0]
                for iden in idens:
                    if iden > 70:
                        ranges.setdefault((start, end), []).append(gi)
                    else:
                        print('Alignment below 70')
                        c.execute("UPDATE blast SET Decision='Sequence does not align well (<70%) to input query sequence/not chosen' WHERE GI=?", (gi,))

            if len(ranges) == 0:
                c.execute("UPDATE blast SET Decision='Sequence does not align well (<70%) to input query sequence/not chosen' WHERE Name_num=?", (key,))
                print('All alignments below 70, printing to file')
                with open("fulllength_hand_check.txt", "a") as a:
                    a.write(key + '\n')
                continue

            whole_length = _covered_length(ranges.keys())

            # Try combinations from fewest ranges upwards and stop at the
            # first size whose best combination covers more than 95% of the
            # merged range.  Ties at the best coverage are all kept.
            for size in range(1, len(ranges)+1):
                max_perc = 0
                for subset in itertools.combinations(ranges.keys(), size):
                    comb_length = _covered_length(subset)
                    if whole_length < comb_length:
                        continue
                    perc = float(comb_length)/float(whole_length)
                    if perc > max_perc:
                        max_perc = perc
                        best_subsets = set([subset])
                    elif perc == max_perc:
                        best_subsets.add(subset)
                # every combination of this size is scored before stopping
                if max_perc > .95:
                    break

            # among tied combinations keep, position by position, the longest
            # fragment
            final_tiling = [(0, 0)]*size
            for combination in best_subsets:
                for x, comb_frag in enumerate(sorted(combination)):
                    if comb_frag[1] - comb_frag[0] + 1 > final_tiling[x][1] - final_tiling[x][0] + 1:
                        final_tiling[x] = comb_frag
            if (0, 0) in final_tiling:
                possible_GIs = []
            else:
                possible_GIs = [ranges[x] for x in final_tiling]

            tiled = set(GI for GI_tiles in possible_GIs for GI in GI_tiles)
            _set_decision(c, 'Better tiling/Not chosen', set(short_gis) - tiled)

            tile_number = 1
            for tile_gis in possible_GIs:
                if len(tile_gis) == 1:
                    c.execute("UPDATE blast SET Decision='Only or best choice in tiling analysis/chosen' WHERE GI=?", (tile_gis[0],))
                    continue
                picked_gis = resolve_seqs(tile_gis, blastdb, gene, c)
                rejected = set(tile_gis) - set(picked_gis)
                if len(picked_gis) == 1:
                    _set_decision(c, "Longest or most info (not checked), tile "+str(tile_number)+" /Chosen", picked_gis)
                    _set_decision(c, "Short or less info, tile "+str(tile_number)+"/Not chosen", rejected)
                    tile_number += 1
                else:
                    _set_decision(c, 'To cluster analysis/Chosen', picked_gis)
                    _set_decision(c, 'Short or less info/Not chosen', rejected)

        conn.commit()
    finally:
        conn.close()
alignment = alignment_reg(list_of_GIs) iden = identity_calc(alignment) if iden < 95: print("Low Aligned Identity: " + str(iden)) alignment = alignment_rev_comp(list_of_GIs) iden = identity_calc(alignment) if iden < 95: #get taxonomy for query(main species) print("Low Reverse Complement Aligned Identity: " + str(iden)) alignment = alignment_comp(list_of_GIs) iden = identity_calc(alignment) if iden < 95: print("Low Complement Aligned Identity: " + str(iden)) #add tiling thing gene_name = '_'.join(i.split('_')[1:]) idens, start_stop = tiling(list_of_GIs, gene_name) current_start = -1 current_stop = -1 result = [] if all(m > 70 for m in idens): for start, stop in sorted(start_stop): if start > current_stop: result.append((start, stop)) current_start, current_stop = start, stop else: current_stop = max(current_stop, stop) result[-1] = (current_start, current_stop) if len(result) == len(start_stop): print( "Seqs align to different regions of probe, choosing all" )
def test_resolved_seqs(infile, blastdb, taxdb):
    """Check chosen sequences by BLASTing them against GenBank.

    The FASTA file *infile* is submitted to NCBI ``blastn``; the XML result
    is saved to ``infile + ".xml"`` and parsed.  A query is accepted when one
    of its best non-self hits (or, failing that, one of its five best
    identities) belongs to the same ``tc_id`` according to the ``blast``
    table.  Queries that fail are compared with the other sequences of the
    same tc_id/gene by pairwise alignment, then ``tiling``, then
    ``blast_all``, and finally the blastlib ``resolve_seqs`` helper.

    Args:
        infile: path of the FASTA file to BLAST.
        blastdb: path of the SQLite database holding the ``blast`` table.
        taxdb: path of the taxonomy database, attached as ``tax``.

    Output files (all opened in append mode): ``final_GIs.txt``,
    ``multiple_gene_choices.txt``, ``these_seqs_overlap.txt`` and
    ``seqs_to_be_blasted.txt``.
    """
    import sqlite3, sys, time
    from Bio.Blast import NCBIWWW, NCBIXML
    from blastlib.clean_seq_funcs import resolve_seqs, alignment_comp, alignment_reg, alignment_rev_comp, blast, blast_all, identity_calc, tiling

    conn = sqlite3.connect(blastdb)
    try:
        c = conn.cursor()
        c.execute("ATTACH ? as 'tax'", (taxdb,))

        def species_of(gis):
            # tc_ids (as str) recorded in the blast table for the given GIs
            found = set()
            for gi in gis:
                for row in c.execute("SELECT tc_id FROM blast WHERE GI=?", (gi,)):
                    found.add(str(row[0]))
            return found

        # NOTE(review): keyed by tc_id, so a second failing gene of the same
        # tc_id replaces the first one -- confirm this is intended.
        error_dic = {}
        blast_dic_nums = {}
        blast_dic_tcids = {}
        seqs_to_blast = []
        finalseqs = set()
        multseqs = []

        # do self blast and print to file
        with open(infile) as fasta_file:
            print("Blasting " + infile)
            records = fasta_file.read()
        numqueries = records.count('>')
        while True:
            try:
                result_handle = NCBIWWW.qblast(
                    "blastn",
                    "nt",
                    records,
                    entrez_query='((Papilionoidea[Organism]) OR Hedylidae[Organism]) OR Hesperiidae[Organism]',
                    word_size=28,
                    hitlist_size=100)
                break
            except Exception:
                # Retry until the request succeeds, but pause between
                # attempts and let KeyboardInterrupt/SystemExit through.
                time.sleep(5)
        # get rid of this extra step of printing xml using NCBIXML
        with open(infile + ".xml", "w") as save_file:
            save_file.write(result_handle.read())
        result_handle.close()

        # Parse the self blast; collect query species -> query GI for queries
        # whose best hits do not include their own species.
        with open(infile + ".xml") as p:
            print("Parsing blast output")
            count = 0
            for rec in NCBIXML.parse(p):
                count += 1
                print(str(round(float(count) / float(numqueries) * 100, 2)) + "%")
                queryAcc = str(rec.query.split()[0])
                queryGI = None
                for row in c.execute("SELECT GI FROM blast WHERE accession=?", (queryAcc,)):
                    queryGI = str(row[0])
                if queryGI is None:
                    # Without this guard the GI of the previous record would
                    # silently be reused for this one.
                    print("No GI in blast table for accession " + queryAcc + ", skipping")
                    continue

                # hit GI -> identity of its last HSP, self hit excluded
                hitdic = {}
                for alignment in rec.alignments:
                    hit_gi = str(alignment.title.split("|")[1])
                    if hit_gi == queryGI:
                        continue
                    for hsp in alignment.hsps:
                        hitdic[hit_gi] = float(hsp.identities) / float(hsp.align_length)

                # queryGI was read from this same table, so a row exists
                for row in c.execute("SELECT tc_id FROM blast WHERE GI=?", (queryGI,)):
                    querySp = str(row[0])

                # A record with no hit other than itself has an empty hitdic;
                # it then simply fails the species test below.
                hitGIs = []
                if hitdic:
                    maxiden = max(hitdic.values())
                    hitGIs = [GI for GI, iden in hitdic.items() if iden == maxiden]
                hitSp = species_of(hitGIs)

                # only look at top 5 if it doesnt hit the top hit - faster
                if querySp not in hitSp:
                    top_idens = sorted(hitdic.values(), reverse=True)[0:5]
                    hitGIs = [GI for GI, iden in hitdic.items() if iden in top_idens]
                    hitSp = species_of(hitGIs)

                if querySp in hitSp:
                    finalseqs.add(queryGI)
                else:
                    error_dic[querySp] = queryGI

        # go through error dictionary and align the 'same' gene/species to
        # see if they're weird looking
        count = 0
        print("Checking nonmatching sequences")
        for tc_id in error_dic:
            count += 1
            print(str(round(float(count) / float(len(error_dic)) * 100, 2)) + "%")
            for row in c.execute("SELECT Gene_name FROM blast WHERE GI =?", (error_dic[tc_id],)):
                gene_name = str(row[0])
            list_of_GIs = [
                str(row[0]) for row in c.execute(
                    "SELECT GI FROM blast WHERE tc_id = ? and Gene_name = ?",
                    (tc_id, gene_name))
            ]
            first_GI = list_of_GIs[0]
            other_GIs = list_of_GIs[1:]

            # align each seq to the first one - first one acts as the
            # 'default' direction; a pair passes as soon as one orientation
            # reaches 90% identity
            pair_check = []
            for GI in other_GIs:
                GI_pair = [first_GI, GI]
                pair_ok = 0
                for align_pair in (alignment_reg, alignment_rev_comp, alignment_comp):
                    if identity_calc(align_pair(GI_pair)) >= 90:
                        pair_ok = 1
                        break
                pair_check.append(pair_ok)

            if all(flag == 1 for flag in pair_check):
                finalseqs.add(error_dic[tc_id])
                continue

            idens, start_stop = tiling(list_of_GIs, gene_name)
            if all(iden > 70 for iden in idens):
                # merge overlapping ranges; if nothing merged, every sequence
                # sits on its own region of the probe
                current_start = -1
                current_stop = -1
                result = []
                for start, stop in sorted(start_stop):
                    if start > current_stop:
                        result.append((start, stop))
                        current_start, current_stop = start, stop
                    else:
                        current_stop = max(current_stop, stop)
                        result[-1] = (current_start, current_stop)
                if len(result) == len(start_stop):
                    finalseqs.update(list_of_GIs)
                else:
                    with open('these_seqs_overlap.txt', 'a') as a:
                        a.write(str(list_of_GIs) + '\n')
            else:
                blast_dic_nums[list_of_GIs[0]] = len(list_of_GIs)
                blast_dic_tcids[list_of_GIs[0]] = tc_id
                seqs_to_blast.append(list_of_GIs)
                with open('seqs_to_be_blasted.txt', 'a') as a:
                    a.write(str(list_of_GIs) + '\n')

        if len(seqs_to_blast) > 0:
            print("Blasting error seqeuences (seqs don't align together and one doesn't align to whole)")
            seqs_to_blast_flat = [item for sublist in seqs_to_blast for item in sublist]
            try:
                hits_all = blast_all(seqs_to_blast_flat, blast_dic_nums, blast_dic_tcids, c)
            except Exception:
                # one retry after a short pause; a second failure propagates
                time.sleep(5)
                hits_all = blast_all(seqs_to_blast_flat, blast_dic_nums, blast_dic_tcids, c)
            print("Parsing taxonomy for error sequences")
            for position, list_of_GIs in enumerate(seqs_to_blast):
                hits = hits_all[position]
                tc_id = blast_dic_tcids[list_of_GIs[0]]
                lowest = min(hits)
                if hits.count(lowest) == 1:
                    # a single lowest taxonomy hit wins, whether or not it
                    # was the one picked before
                    finalseqs.add(list_of_GIs[hits.index(lowest)])
                    continue
                # there are multiple lowest taxonomy hits
                mult_GIs = [list_of_GIs[idx] for idx, hit in enumerate(hits) if hit == lowest]
                GI_to_pick = resolve_seqs(mult_GIs)
                if len(GI_to_pick) == 1:
                    finalseqs.add(GI_to_pick[0])
                else:
                    # this only happens if the originally picked one is
                    # crappy and the rest are the same
                    multseqs.append(error_dic[tc_id])

        # Go to cluster analysis
        print('length of resolved=' + str(len(finalseqs)))
        print('length of not resolved = ' + str(len(multseqs)))
        with open("final_GIs.txt", "a") as o:
            for m in finalseqs:
                o.write(str(m) + "\n")
        # to cluster
        with open("multiple_gene_choices.txt", "a") as o:
            for m in multseqs:
                o.write(str(m) + "\n")
    finally:
        conn.close()
#to pipeline dic[i] = whole if len(individual) > 0: #pick individual if len(individual) == 1: GI_nums_single.add(i + "|" + individual[0]) else: print(individual) result = [] ranges = {} current_start = -1 current_stop = -1 whole_length = 0 #do tiling for m in [x.split('_')[0] for x in individual]: iden, start_stop = tiling([m], 'COI_trnL_COII') start, end = start_stop[0] #uses gi for danaus chrysippus COI_trnL_COII if iden > 70: #make dic of lists if (start, end) in ranges.keys(): ranges_dic_list = ranges[(start, end)] ranges_dic_list.append(m) ranges[(start, end)] = ranges_dic_list else: ranges[(start, end)] = [m] else: print('Alignment below 70') print(GIs_to_align) if len(ranges) == 0: print('All alignments below 70, printing to file')
def cluster(blastdb, taxdb):
    """Resolve the tc_id/gene groups listed in ``multiple_gene_choices.txt``.

    Each line of that file is ``key<TAB>list-of-GIs``.

    * Groups of more than two GIs: the sequences are fetched from NCBI and
      aligned with MUSCLE, a gap consensus is built, and each sequence is
      then aligned to the consensus.  The GI with the single highest identity
      is chosen; ties are all kept.
    * Groups of exactly two GIs: the pair is aligned in regular,
      reverse-complement and complement orientation.  If none reaches 95%
      identity, ``tiling`` and then ``blast`` decide.

    Args:
        blastdb: path of the SQLite database to open.
        taxdb: path of the taxonomy database, attached as ``tax``.

    Output files (append mode): ``final_GIs.txt``, ``choose_mult.txt`` and
    ``these_seqs_overlap_cluster.txt``.
    """
    Entrez.email = "*****@*****.**"
    conn = sqlite3.connect(blastdb)
    try:
        c = conn.cursor()
        c.execute("ATTACH ? as 'tax'", (taxdb,))
        muscle_cline = MuscleCommandline(clwstrict=True)
        input_dic = {}
        multiple_dic = {}
        two_dic = {}
        finalseqs = set()
        multfinalseqs = []
        unresolved = []

        with open("multiple_gene_choices.txt") as o:
            for line in o:
                if not line.strip():
                    # a blank line carries no group; skip it instead of
                    # failing on the missing second column
                    continue
                input_dic[line.split("\t")[0]] = line.strip().split(
                    "\t")[1].replace("[", "").replace("]", "").replace("'", "")

        for key in input_dic:
            GIs_list = input_dic[key].split(", ")
            if len(GIs_list) > 2:
                multiple_dic[key] = GIs_list
            if len(GIs_list) == 2:
                two_dic[key] = GIs_list

        for key in multiple_dic:
            group = multiple_dic[key]
            identities = []
            # consensus of the whole group
            handle = Entrez.efetch(db="nucleotide",
                                   rettype="fasta",
                                   retmode="text",
                                   id=",".join(group))
            seqs = SeqIO.parse(handle, "fasta")
            handle_string = StringIO()
            SeqIO.write(seqs, handle_string, "fasta")
            stdout, stderr = muscle_cline(stdin=handle_string.getvalue())
            align = AlignIO.read(StringIO(stdout), "clustal")
            summary_align = AlignInfo.SummaryInfo(align)
            consensus = summary_align.gap_consensus(threshold=.5, ambiguous='N')
            consensus_record = SeqRecord(consensus, id="Consensus_all")

            # identity of every member against that consensus
            for gi in group:
                while True:
                    try:
                        handle = Entrez.efetch(db="nucleotide",
                                               rettype="fasta",
                                               retmode="text",
                                               id=gi)
                        break
                    except Exception:
                        print('Error, trying again')
                        time.sleep(10)
                seqs = SeqIO.read(handle, "fasta")
                handle_string = StringIO()
                SeqIO.write(seqs, handle_string, "fasta")
                SeqIO.write(consensus_record, handle_string, "fasta")
                stdout, stderr = muscle_cline(stdin=handle_string.getvalue())
                align = AlignIO.read(StringIO(stdout), "clustal")
                count = 0
                gaps = 0
                for col in range(0, len(align[0])):
                    column = align[:, col]
                    if "-" not in column:
                        if column[1:] == column[:-1]:
                            count = count + 1
                    else:
                        gaps = gaps + 1
                ungapped = len(align[0]) - gaps
                # an alignment with a gap in every column has no comparable
                # position; score it 0 instead of dividing by zero
                iden = 100 * (count / float(ungapped)) if ungapped else 0.0
                identities.append(iden)

            best = max(identities)
            if identities.count(best) == 1:
                finalseqs.add(group[identities.index(best)])
            else:
                multfinalseqs.append(
                    [group[m] for m, x in enumerate(identities) if x == best])

        for key in two_dic:
            # align the two seqs; the pair is fine as soon as one orientation
            # reaches 95% identity
            list_of_GIs = two_dic[key]
            pair_ok = False
            for align_pair in (alignment_reg, alignment_rev_comp, alignment_comp):
                if identity_calc(align_pair(list_of_GIs)) >= 95:
                    pair_ok = True
                    break
            if pair_ok:
                multfinalseqs.append(list_of_GIs)
                continue

            # add tiling thing
            gene_name = '_'.join(key.split('_')[1:])
            idens, start_stop = tiling(list_of_GIs, gene_name)
            if all(m > 70 for m in idens):
                current_start = -1
                current_stop = -1
                result = []
                for start, stop in sorted(start_stop):
                    if start > current_stop:
                        result.append((start, stop))
                        current_start, current_stop = start, stop
                    else:
                        current_stop = max(current_stop, stop)
                        result[-1] = (current_start, current_stop)
                if len(result) == len(start_stop):
                    # seqs align to different regions of probe, choosing all
                    multfinalseqs.append(list_of_GIs)
                else:
                    # seqs overlap: written to file for hand checking
                    unresolved.append(list_of_GIs)
                    with open('these_seqs_overlap_cluster.txt', 'a') as a:
                        a.write(str(list_of_GIs) + '\n')
            else:
                # get taxonomy for query(main species)
                print("Parsing taxonomy for error sequences")
                hits = blast(key, list_of_GIs, c)
                lowest = min(hits)
                if hits.count(lowest) == 1:
                    # only one lowest taxonomy hit: take that sequence
                    # (the original indexed an undefined `hit_levels` here)
                    finalseqs.add(str(list_of_GIs[hits.index(lowest)]))
                else:
                    # there are multiple lowest taxonomy hits
                    multfinalseqs.append(list_of_GIs)

        print("length of resolved = " + str(len(finalseqs)))
        print("length of choose multiple = " + str(len(multfinalseqs)))
        print("length of unresolved = " + str(len(unresolved)))
        with open("final_GIs.txt", "a") as o:
            for m in finalseqs:
                o.write(str(m) + "\n")
        with open("choose_mult.txt", "a") as o:
            for m in [num for pair in multfinalseqs for num in pair]:
                o.write(str(m) + "\n")
    finally:
        conn.close()
def resolve_seqs(blastdb):
    """Resolve multiple hits per tc_id/gene, with special handling for COI_trnL_COII.

    Reads the ``blast`` table of the SQLite database *blastdb* and writes:

    * ``final_GIs.txt`` (append): GIs collected in ``GI_nums_single``;
    * ``<gene>_accession_nums_resolved.txt``: GIs for which the blastlib
      ``resolve_seqs`` helper returned a single choice;
    * ``multiple_gene_choices.txt``: groups for which it returned several;
    * ``mito_COI.fa``: regions fetched from NCBI for long (> 3000) COI hits;
    * ``COI_hand_check.txt`` (append): groups that could not be placed.

    NOTE(review): the nesting below was reconstructed from statement order;
    the points where more than one nesting is syntactically possible are
    marked with NOTE(review) comments and should be confirmed.
    """
    import os, sys, sqlite3, re, time, itertools
    from Bio import Seq, Entrez, SeqIO
    # Inside this function the name `resolve_seqs` is rebound to the blastlib
    # helper of the same name.
    from blastlib.clean_seq_funcs import resolve_seqs, alignment_reg, alignment_comp, alignment_rev_comp, identity_calc, tiling
    conn = sqlite3.connect(blastdb)
    c = conn.cursor()
    Entrez.email = "*****@*****.**"
    # Entries in the sets below are strings of the form
    # "tc_id_gene|GI_hit_length".
    GI_nums_all = set()
    GI_nums_single = set()
    GI_nums_all_COI = set()
    GI_nums_single_COI = set()
    genes = set()
    dic = {}
    dic_COI = {}
    dic_single = {}
    dic_mult = {}
    count2 = 1
    records = []
    #this gets list of all taxa/genes regardless if they have multiple gene choices or not
    for iter in c.execute(
            "SELECT tc_id, Gene_name, GI, hit_length FROM blast WHERE tc_id NOT NULL AND Gene_name != 'COI_trnL_COII' GROUP BY tc_id, Gene_name, GI;"
    ):
        GI_nums_all.add(
            str(iter[0]) + "_" + str(iter[1]) + "|" + str(iter[2]) + "_" +
            str(iter[3]))
    #this gets a list of all taxa/genes if they only have one gene choice
    for iter in c.execute(
            "SELECT tc_id, Gene_name, GI, hit_length FROM blast WHERE tc_id NOT NULL AND Gene_name != 'COI_trnL_COII' GROUP BY tc_id, Gene_name HAVING COUNT(*) =1;"
    ):
        GI_nums_single.add(
            str(iter[0]) + "_" + str(iter[1]) + "|" + str(iter[2]) + "_" +
            str(iter[3]))
    #this give me all the GIs that have multiple gene choices
    GI_nums = GI_nums_all - GI_nums_single
    #do the same with COI
    #this gets list of all taxa/genes regardless if they have multiple gene choices or not
    for iter in c.execute(
            "SELECT tc_id, Gene_name, GI, hit_length FROM blast WHERE tc_id NOT NULL AND Gene_name == 'COI_trnL_COII' GROUP BY tc_id, Gene_name, GI;"
    ):
        GI_nums_all_COI.add(
            str(iter[0]) + "_" + str(iter[1]) + "|" + str(iter[2]) + "_" +
            str(iter[3]))
    #this gets a list of all taxa/genes if they only have one gene choice
    for iter in c.execute(
            "SELECT tc_id, Gene_name, GI, hit_length FROM blast WHERE tc_id NOT NULL AND Gene_name = 'COI_trnL_COII' GROUP BY tc_id HAVING COUNT(*) =1;"
    ):
        GI_nums_single_COI.add(
            str(iter[0]) + "_" + str(iter[1]) + "|" + str(iter[2]) + "_" +
            str(iter[3]))
    #this give me all the GIs that have multiple gene choices
    GI_nums_mult_COI = GI_nums_all_COI - GI_nums_single_COI
    # Single COI hits longer than 3000 are also sent through the loop below,
    # where hits of that length go through the whole-mito branch.
    for i in GI_nums_single_COI:
        if int(i.split("_")[-1]) > 3000:
            GI_nums_mult_COI.add(i)
    #makes a dictionary of lists for each taxa/gene choice
    # dic: "tc_id_gene" -> ["GI_hit_length", ...]
    for i in GI_nums:
        if i.split("|")[0] in dic.keys():
            dic_list = dic[i.split("|")[0]]
            dic_list.append(i.split("|")[1])
            dic[i.split("|")[0]] = dic_list
        else:
            dic[i.split("|")[0]] = [i.split("|")[1]]
        # second field after splitting on "_" or "|"
        genes.add(re.split('_|\|', i)[1])
    #same for COI
    for i in GI_nums_mult_COI:
        if i.split("|")[0] in dic_COI.keys():
            dic_list = dic_COI[i.split("|")[0]]
            dic_list.append(i.split("|")[1])
            dic_COI[i.split("|")[0]] = dic_list
        else:
            dic_COI[i.split("|")[0]] = [i.split("|")[1]]
        genes.add(re.split('_|\|', i)[1])
    count = 0
    #deal with lengths of COI and add to dic to try and resolve
    print("Trying to resolve COI/COII sequences")
    for i in dic_COI:
        count += 1
        print(str(round(float(count) / float(len(dic_COI)) * 100, 2)) + '%')
        lengths = [int(m.split('_')[1]) for m in dic_COI[i]]
        # Three length classes; a hit of exactly 2000 or 3000 falls in none.
        individual = [dic_COI[i][x] for x, l in enumerate(lengths) if l < 2000]
        whole = [
            dic_COI[i][x] for x, l in enumerate(lengths)
            if l > 2000 and l < 3000
        ]
        mito = [dic_COI[i][x] for x, l in enumerate(lengths) if l > 3000]
        if len(mito) > 0:
            # Align the first long hit to the fixed reference GU365907 and
            # fetch only the aligned region.
            GIs_to_align = [mito[0].split("_")[0], 'GU365907']
            alignment = alignment_reg(GIs_to_align)
            iden = identity_calc(alignment)
            if iden > 80:
                #have to do span to account for random small blocks that dont align
                span = 0
                #get start
                # scan left to right for the first run of 10 gap-free columns
                for l in range(len(alignment[0])):
                    col = alignment[:, l]
                    if '-' not in col:
                        span += 1
                        if span == 10:
                            break
                    elif span > 0 and '-' in col:
                        span = 0
                start = l - 8
                span = 0
                #get stop
                # same scan from the right-hand end
                for l in reversed(range(len(alignment[0]))):
                    col = alignment[:, l]
                    if '-' not in col:
                        span += 1
                        if span == 10:
                            break
                    elif span > 0 and '-' in col:
                        span = 0
                end = l + 10
                # NOTE(review): start/end are alignment column indices but
                # are passed as sequence coordinates -- confirm the offsets.
                handle = Entrez.efetch(db="nucleotide",
                                       rettype="fasta",
                                       retmode="text",
                                       id=mito[0].split("_")[0],
                                       seq_start=start - 1,
                                       seq_stop=end - 2)
                record = SeqIO.read(handle, "fasta")
                records.append(record)
            else:
                print('Low iden when matching COI to whole mito')
                with open("COI_hand_check.txt", "a") as a:
                    a.write(i + '\n')
        elif len(whole) > 0:
            #pick whole
            if len(whole) == 1:
                GI_nums_single.add(i + "|" + whole[0])
            else:
                #to pipeline
                dic[i] = whole
        # NOTE(review): written as a plain `if`, so as laid out here it runs
        # for every group, including ones handled by the mito/whole branches
        # above -- confirm whether `elif` (or deeper nesting) was intended.
        if len(individual) > 0:
            #pick individual
            if len(individual) == 1:
                GI_nums_single.add(i + "|" + individual[0])
            else:
                # print(individual)
                result = []
                # (start, end) -> GIs aligning to exactly that range
                ranges = {}
                current_start = -1
                current_stop = -1
                whole_length = 0
                #do tiling
                for m in [x.split('_')[0] for x in individual]:
                    iden, start_stop = tiling([m], 'COI_trnL_COII')
                    start, end = start_stop[0]
                    #uses gi for danaus chrysippus COI_trnL_COII
                    if iden > 70:
                        #make dic of lists
                        if (start, end) in ranges.keys():
                            ranges_dic_list = ranges[(start, end)]
                            ranges_dic_list.append(m)
                            ranges[(start, end)] = ranges_dic_list
                        else:
                            ranges[(start, end)] = [m]
                    else:
                        print('Alignment below 70')
                        # NOTE(review): GIs_to_align is only assigned in the
                        # mito branch above, so this prints a value from
                        # another group or raises NameError if that branch
                        # never ran.
                        print(GIs_to_align)
                if len(ranges) == 0:
                    print('All alignments below 70, printing to file')
                    with open("COI_hand_check.txt", "a") as a:
                        a.write(i + '\n')
                # NOTE(review): there is no `else`/`continue` after the empty
                # case; with no ranges the `for L` loop below does not run,
                # so `L` and `max_subset` keep values from an earlier group
                # or are undefined.
                #get merged range
                for start, stop in sorted(ranges.keys()):
                    if start > current_stop:
                        result.append((start, stop))
                        current_start, current_stop = start, stop
                    else:
                        current_stop = max(current_stop, stop)
                        result[-1] = (current_start, current_stop)
                for n in result:
                    whole_length += n[1] - n[0] + 1
                #go through each combination of ranges to get 95% of whole range
                for L in range(1, len(ranges) + 1):
                    max_perc = 0
                    for subset in itertools.combinations(ranges.keys(), L):
                        comb_length = 0
                        #for each combination, get merged length
                        current_start = -1
                        current_stop = -1
                        result = []
                        for start, stop in sorted(subset):
                            if start > current_stop:
                                result.append((start, stop))
                                current_start, current_stop = start, stop
                            else:
                                current_stop = max(current_stop, stop)
                                result[-1] = (current_start, current_stop)
                        for x in result:
                            comb_length += x[1] - x[0] + 1
                        if whole_length >= comb_length:
                            perc = float(comb_length) / float(whole_length)
                            if perc > max_perc:
                                max_perc = perc
                                max_subset = set()
                                max_subset.add(subset)
                            elif perc == max_perc:
                                max_subset.add(subset)
                            else:
                                pass
                        else:
                            pass
                    # goes through all combinations in a level before breaking
                    if max_perc > .95:
                        break
                # Among the tied best combinations keep, position by
                # position, the longest fragment.
                final_tiling = [(0, 0)] * L
                for combination in max_subset:
                    for x, comb_frag in enumerate(sorted(combination)):
                        if comb_frag[1] - comb_frag[0] + 1 > final_tiling[x][
                                1] - final_tiling[x][0] + 1:
                            final_tiling[x] = comb_frag
                # print(final_tiling)
                possible_GIs = [ranges[x] for x in final_tiling]
                # print(possible_GIs)
                count = 0
                # NOTE(review): `count` is also the progress counter of the
                # enclosing loop; resetting it here affects the percentage
                # printed for later groups.
                for m in possible_GIs:
                    if len(m) == 1:
                        # "_0" stands in for the hit_length field so the GI
                        # is still the second-to-last field
                        GI_nums_single.add(i + "|" + m[0] + "_0")
                    else:
                        # first multi-GI tile reuses key i; later tiles get a
                        # numeric suffix
                        if count == 0:
                            dic[i] = m
                            count += 1
                        else:
                            dic[i + "_" + str(count)] = m
                            count += 1
    # merge ranges - if same number of ranges as original, keep all,
    # want the least number to overlap 95% of whole range
    # try each comb from low to high and if hits 95%, choose those
    # if multiple higher than 95%, choose one with best %
    # if multiple with same % and same numb of combs- multiple
    # NOTE(review): placed after the loop here; it may have been inside it.
    print(dic)
    # sys.exit()
    SeqIO.write(records, "mito_COI.fa", "fasta")
    #pulls out the GIs with first, the longest number of ATCGs and second, the longest length and makes dictionary
    print("Trying to resolve all other sequences")
    count = 0
    for i in dic:
        GIlist = []
        for n in dic[i]:
            GIlist.append(n.split("_")[0])
        # replace the candidate list by whatever the blastlib helper returns
        dic[i] = resolve_seqs(GIlist)
        print(str(round((float(count) / float(len(dic))) * 100, 2)) + "%")
        count += 1
    #splits the ones that still have multiple (so the longest had multiple choices) and the ones that are resolved
    for i in dic:
        if len(dic[i]) > 1:
            dic_mult[i] = dic[i]
        else:
            dic_single[i] = dic[i]
    for i in genes:
        finalGInums_only1 = set()
        finalGInums_longest = set()
        for n in dic_single.keys():
            if i == re.split('_|\|', n)[1]:
                finalGInums_longest.add(''.join(dic_single[n]))
        for n in GI_nums_single:
            if i == re.split('_|\|', n)[1]:
                finalGInums_only1.add(re.split('_|\|', n)[-2])
        with open("final_GIs.txt", "a") as o:
            for m in finalGInums_only1:
                o.write(str(m) + "\n")
        #this needs to go to blast_sp_parse.py
        with open(i + "_accession_nums_resolved.txt", "w") as o:
            for m in finalGInums_longest:
                o.write(str(m) + "\n")
    #this needs to go to cluster.py
    with open("multiple_gene_choices.txt", "w") as w:
        for i in dic_mult:
            w.write(i + "\t" + str(dic_mult[i]) + "\n")
    conn.close()