Python tiling примеры использования

Язык программирования: Python

Пространство имен/Пакет: blastlib.clean_seq_funcs

Метод/Функция: tiling

Примеров на hotexamples.com: 6

Python tiling - 6 примеров найдено. Это лучшие примеры Python кода для blastlib.clean_seq_funcs.tiling, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Пример #1

Показать файл

def resolve_seqs(blastdb):
    """Record, for every row of the ``blast`` table, which sequence was chosen.

    Rows with ``tc_id != '0'`` are grouped by (tc_id, Gene_name).  The
    ``Decision`` column is then updated as follows:

    * groups with a single GI: 'Only choice/chosen', or 'Mito or chloro
      sequence/Chosen' when hit_length is above 5000;
    * groups with several GIs where at least one has hit_length >= 5000: the
      first such GI is chosen and all others are marked as not chosen;
    * remaining groups: every GI is located on the query with ``tiling``; the
      smallest combination of ranges covering more than 95% of the merged
      range is kept, and ties inside one range are handed to
      ``blastlib.clean_seq_funcs.resolve_seqs``.

    Groups in which no sequence aligns above 70 are appended to
    ``fulllength_hand_check.txt`` in the current directory.

    Args:
        blastdb: path of the SQLite database containing the ``blast`` table.

    The changes are committed only when the whole run succeeds; the connection
    is closed in every case.
    """
    import itertools
    import re
    import sqlite3
    # The library function has the same name as this one, so it is imported
    # under an alias to make the call further down unambiguous.
    from blastlib.clean_seq_funcs import resolve_seqs as pick_best_seqs, tiling

    long_hit = 5000        # hit lengths from here on count as mito/chloro
    min_identity = 70      # tiling identity a sequence has to exceed
    min_coverage = .95     # share of the merged range a tiling has to exceed
    low_identity_msg = ('Sequence does not align well (<70%) to input query '
                        'sequence/not chosen')
    key_query = ("SELECT tc_id, Gene_name, GI, hit_length FROM blast "
                 "WHERE tc_id != '0' GROUP BY tc_id, Gene_name")

    def gi_of(key):
        # Keys look like "tcid_gene|GI_hitlength"; the GI is the token before
        # the last one when splitting on both separators.
        return re.split(r'_|\|', key)[-2]

    conn = sqlite3.connect(blastdb)
    try:
        c = conn.cursor()

        def fetch_keys(query):
            return {
                str(tc_id) + "_" + str(gene) + "|" + str(gi) + "_" + str(length)
                for tc_id, gene, gi, length in c.execute(query)
            }

        # every taxon/gene/GI combination, whether or not there is a choice
        all_keys = fetch_keys(key_query + ", GI;")
        # taxon/gene combinations that have exactly one candidate
        single_keys = fetch_keys(key_query + " HAVING COUNT(*) =1;")
        # what is left has several candidates and needs resolving
        multi_keys = all_keys - single_keys

        # Singletons: mito/chloro-sized hits are labelled separately.
        # (Strictly greater than 5000 here, >= 5000 for the groups below;
        # kept as in the original.)
        mito_keys = {k for k in single_keys if int(k.split("_")[-1]) > long_hit}
        single_keys -= mito_keys
        _set_decision(c, 'Mito or chloro sequence/Chosen',
                      [gi_of(k) for k in mito_keys])
        _set_decision(c, 'Only choice/chosen', [gi_of(k) for k in single_keys])

        # "tcid_gene" -> ["GI_hitlength", ...] for every group with a choice
        choices = {}
        for key in multi_keys:
            parts = key.split("|")
            choices.setdefault(parts[0], []).append(parts[1])

        print("Trying to resolve sequences")
        for done, (name, entries) in enumerate(choices.items(), 1):
            print(str(round(float(done) / float(len(choices)) * 100, 2)) + '%')
            short_gis = [e.split('_')[0] for e in entries
                         if int(e.split('_')[1]) < long_hit]
            long_gis = [e.split('_')[0] for e in entries
                        if int(e.split('_')[1]) >= long_hit]

            if long_gis:
                # picks the first one if there are several
                chosen = long_gis[0]
                _set_decision(c, 'Mito or chloro sequence/Chosen', [chosen])
                _set_decision(c, 'Mito or chloro sequence/Randomly not chosen',
                              set(long_gis) - {chosen})
                _set_decision(c, 'Short or less info/Not chosen',
                              set(short_gis) - {chosen})
                continue
            if not short_gis:
                continue

            # Tiling: map every query range to the GIs that align to it.
            gene = name.split("_", 1)[1]
            ranges = {}
            for gi in short_gis:
                idens, start_stop = tiling([gi], gene, blastdb, c)
                start, end = start_stop[0]
                for iden in idens:
                    if iden > min_identity:
                        ranges.setdefault((start, end), []).append(gi)
                    else:
                        print('Alignment below 70')
                        _set_decision(c, low_identity_msg, [gi])

            if not ranges:
                # NOTE(review): this is the only statement that filters on
                # Name_num instead of GI; kept as in the original.
                c.execute("UPDATE blast SET Decision=? WHERE Name_num=?;",
                          (low_identity_msg, name))
                print('All alignments below 70, printing to file')
                with open("fulllength_hand_check.txt", "a") as a:
                    a.write(name + '\n')
                continue

            # Look for the smallest number of ranges whose union covers more
            # than 95% of the union of all ranges.  Every combination of a
            # given size is examined before moving on to the next size.
            whole_length = _covered_length(ranges)
            for size in range(1, len(ranges) + 1):
                max_perc = 0
                # Reset per size so results of an earlier size or an earlier
                # group can never leak into this one.
                best_subsets = set()
                for subset in itertools.combinations(ranges, size):
                    comb_length = _covered_length(subset)
                    if whole_length < comb_length:
                        continue
                    perc = float(comb_length) / float(whole_length)
                    if perc > max_perc:
                        max_perc = perc
                        best_subsets = {subset}
                    elif perc == max_perc:
                        best_subsets.add(subset)
                if max_perc > min_coverage:
                    break

            # Among equally good combinations keep, per position, the longest
            # fragment.
            final_tiling = [(0, 0)] * size
            for combination in best_subsets:
                for pos, frag in enumerate(sorted(combination)):
                    if frag[1] - frag[0] > final_tiling[pos][1] - final_tiling[pos][0]:
                        final_tiling[pos] = frag
            if (0, 0) in final_tiling:
                tiles = []
            else:
                tiles = [ranges[frag] for frag in final_tiling]

            picked = {gi for tile in tiles for gi in tile}
            _set_decision(c, 'Better tiling/Not chosen', set(short_gis) - picked)

            tile_no = 1
            for tile in tiles:
                if len(tile) == 1:
                    _set_decision(
                        c, 'Only or best choice in tiling analysis/chosen', tile)
                    continue
                picks = pick_best_seqs(tile, blastdb, gene, c)
                rejected = set(tile) - set(picks)
                if len(picks) == 1:
                    _set_decision(
                        c,
                        'Longest or most info (not checked), tile '
                        + str(tile_no) + ' /Chosen',
                        picks)
                    _set_decision(
                        c,
                        'Short or less info, tile ' + str(tile_no) + '/Not chosen',
                        rejected)
                    tile_no += 1
                else:
                    _set_decision(c, 'To cluster analysis/Chosen', picks)
                    _set_decision(c, 'Short or less info/Not chosen', rejected)

        conn.commit()
    finally:
        conn.close()


def _merge_ranges(ranges):
    """Merge (start, stop) pairs into a start-sorted list of disjoint spans.

    A range is folded into the current span when it starts at or before the
    span's stop; otherwise it opens a new span.
    """
    merged = []
    current_start = current_stop = -1
    for start, stop in sorted(ranges):
        if start > current_stop:
            merged.append((start, stop))
            current_start, current_stop = start, stop
        else:
            current_stop = max(current_stop, stop)
            merged[-1] = (current_start, current_stop)
    return merged


def _covered_length(ranges):
    """Return how many positions the union of *ranges* covers (inclusive ends)."""
    return sum(stop - start + 1 for start, stop in _merge_ranges(ranges))


def _set_decision(cursor, decision, gis, chunk_size=500):
    """Set ``Decision`` to *decision* for every GI in *gis*.

    Values are bound as parameters instead of being pasted into the SQL text.
    The list is processed in chunks so that the number of bound parameters per
    statement stays below SQLite's limit; an empty *gis* executes nothing.
    """
    gis = list(gis)
    for offset in range(0, len(gis), chunk_size):
        chunk = gis[offset:offset + chunk_size]
        placeholders = ", ".join("?" * len(chunk))
        cursor.execute(
            "UPDATE blast SET Decision=? WHERE GI IN (" + placeholders + ");",
            [decision] + chunk)

Пример #2

Показать файл

 # Excerpt: `list_of_GIs` and `i` are defined outside the visible code.
 # The sequences are aligned in up to three ways (alignment_reg, then
 # alignment_rev_comp, then alignment_comp); each later attempt only runs
 # when the previous identity was below 95.
 alignment = alignment_reg(list_of_GIs)
 iden = identity_calc(alignment)
 if iden < 95:
     print("Low Aligned Identity: " + str(iden))
     alignment = alignment_rev_comp(list_of_GIs)
     iden = identity_calc(alignment)
     if iden < 95:
         #get taxonomy for query(main species)
         print("Low Reverse Complement Aligned Identity: " + str(iden))
         alignment = alignment_comp(list_of_GIs)
         iden = identity_calc(alignment)
         if iden < 95:
             print("Low Complement Aligned Identity: " + str(iden))
             #add tiling thing
             # gene name = everything after the first "_"-separated field of i
             gene_name = '_'.join(i.split('_')[1:])
             # tiling yields one identity per sequence and a list of
             # (start, stop) pairs
             idens, start_stop = tiling(list_of_GIs, gene_name)
             current_start = -1
             current_stop = -1
             result = []
             if all(m > 70 for m in idens):
                 # Merge overlapping (start, stop) ranges: a range starting
                 # after the current stop opens a new span, anything else
                 # extends the span opened last.
                 for start, stop in sorted(start_stop):
                     if start > current_stop:
                         result.append((start, stop))
                         current_start, current_stop = start, stop
                     else:
                         current_stop = max(current_stop, stop)
                         result[-1] = (current_start, current_stop)
                 # Same number of spans as input ranges means nothing was
                 # merged, i.e. no two ranges overlap.
                 if len(result) == len(start_stop):
                     print(
                         "Seqs align to different regions of probe, choosing all"
                     )

Пример #3

Показать файл

Файл: blast_sp_parse.py Проект: sunray1/working_scripts

def test_resolved_seqs(infile, blastdb, taxdb):
    """BLAST the chosen sequences against NCBI and re-check suspicious ones.

    The FASTA file *infile* is sent to NCBI blastn (``nt``, restricted to
    butterflies by the Entrez query) and the XML answer is saved as
    ``infile + ".xml"``.  For every query the best non-self hits are looked up
    in the ``blast`` table; if none of the best hits (and none of the hits
    with the five highest identities) has the query's ``tc_id``, the query is
    re-examined:

    * if all sequences of that tc_id/gene align to the first one (identity of
      at least 90 in one of three orientations) the original GI is kept;
    * otherwise, if all sequences tile onto the query with identity above 70
      and without overlapping, all of them are kept; overlapping sets are
      written to ``these_seqs_overlap.txt``;
    * everything else is written to ``seqs_to_be_blasted.txt`` and passed to
      ``blast_all``; the GI with the unique lowest value returned for it wins,
      ties go through ``resolve_seqs``.

    Resolved GIs are appended to ``final_GIs.txt`` and unresolved ones to
    ``multiple_gene_choices.txt``.

    Args:
        infile: path of the FASTA file with the sequences to check.
        blastdb: path of the SQLite database with the ``blast`` table.
        taxdb: path of the taxonomy database, attached as ``tax``.
    """
    import sqlite3
    import time
    from Bio.Blast import NCBIWWW, NCBIXML
    from blastlib.clean_seq_funcs import (
        resolve_seqs, alignment_comp, alignment_reg, alignment_rev_comp,
        blast_all, identity_calc, tiling)

    conn = sqlite3.connect(blastdb)
    try:
        c = conn.cursor()
        c.execute("ATTACH ? AS 'tax'", (taxdb,))

        def last_value(query, params):
            # First column of the last matching row as str, None without rows.
            value = None
            for row in c.execute(query, params):
                value = str(row[0])
            return value

        def species_of(gis):
            # tc_ids (as str) of all blast rows belonging to the given GIs.
            found = set()
            for gi in gis:
                for row in c.execute("SELECT tc_id FROM blast WHERE GI=?", (gi,)):
                    found.add(str(row[0]))
            return found

        def aligns_in_some_orientation(gi_pair):
            # Regular, reverse-complement and complement alignment are tried
            # in turn; the first identity that is not below 90 settles it.
            for align in (alignment_reg, alignment_rev_comp, alignment_comp):
                if not identity_calc(align(gi_pair)) < 90:
                    return True
            return False

        def merged_span_count(ranges):
            # Number of disjoint spans left after merging overlapping ranges.
            spans = 0
            current_stop = -1
            for start, stop in sorted(ranges):
                if start > current_stop:
                    spans += 1
                    current_stop = stop
                else:
                    current_stop = max(current_stop, stop)
            return spans

        error_dic = {}
        blast_dic_nums = {}
        blast_dic_tcids = {}
        seqs_to_blast = []
        finalseqs = set()
        multseqs = []

        #do self blast and print to file
        with open(infile) as fasta_file:
            print("Blasting " + infile)
            records = fasta_file.read()
        numqueries = records.count('>')
        # Retry until NCBI answers.  Only Exception is caught so Ctrl-C still
        # stops the script, and a pause keeps failed requests from being
        # re-sent back to back.
        while True:
            try:
                result_handle = NCBIWWW.qblast(
                    "blastn",
                    "nt",
                    records,
                    entrez_query=
                    '((Papilionoidea[Organism]) OR Hedylidae[Organism]) OR Hesperiidae[Organism]',
                    word_size=28,
                    hitlist_size=100)
                break
            except Exception as err:
                print("BLAST request failed (" + str(err) + "), trying again")
                time.sleep(5)
        #get rid of this extra step of printing xml using NCBIXML
        with open(infile + ".xml", "w") as save_file:
            save_file.write(result_handle.read())
            result_handle.close()

        #open self blast file for parsing
        #collect query species -> query GI for queries whose top hits belong
        #to a different species
        with open(infile + ".xml") as p:
            print("Parsing blast output")
            count = 0
            for rec in NCBIXML.parse(p):
                count += 1
                print(str(round(float(count) / float(numqueries) * 100, 2)) + "%")
                #figure out a new way to do this
                queryAcc = str(rec.query.split()[0])
                queryGI = last_value("SELECT GI FROM blast WHERE accession=?",
                                     (queryAcc,))
                if queryGI is None:
                    # Without a GI the record cannot be matched to a species;
                    # carrying on would reuse the GI of the previous record.
                    print("No GI found for accession " + queryAcc + ", skipping")
                    continue
                hitdic = {}
                for alignment in rec.alignments:
                    # as in the original, the last HSP of an alignment decides
                    for hsp in alignment.hsps:
                        identity = float(hsp.identities) / float(hsp.align_length)
                    hit_gi = alignment.title.split("|")[1]
                    if hit_gi != queryGI:
                        hitdic[str(hit_gi)] = identity
                querySp = last_value("SELECT tc_id FROM blast WHERE GI=?",
                                     (queryGI,))
                # A query with no hit other than itself has nothing to take a
                # maximum of; it falls through to the error dictionary.
                hitSp = set()
                if hitdic:
                    maxiden = max(hitdic.values())
                    hitSp = species_of(
                        [gi for gi, iden in hitdic.items() if iden == maxiden])
                if querySp in hitSp:
                    finalseqs.add(queryGI)
                    continue
                #only look at top 5 if it doesnt hit the top hit - faster
                top_idens = sorted(hitdic.values(), reverse=True)[:5]
                hitSp = species_of(
                    [gi for gi, iden in hitdic.items() if iden in top_idens])
                if querySp in hitSp:
                    finalseqs.add(queryGI)
                else:
                    error_dic[querySp] = queryGI

        ##go through error dictionary and align the 'same' gene/species to see
        ##if they're weird looking
        count = 0
        print("Checking nonmatching sequences")
        for tc_id in error_dic:
            count += 1
            print(str(round(float(count) / float(len(error_dic)) * 100, 2)) + "%")
            gene_name = last_value("SELECT Gene_name FROM blast WHERE GI =?",
                                   (error_dic[tc_id],))
            list_of_GIs = [
                str(row[0]) for row in c.execute(
                    "SELECT GI FROM blast WHERE tc_id = ? and Gene_name = ?",
                    (tc_id, gene_name))
            ]
            #align each seq to the first one - first one acts as the 'default'
            #direction
            first_GI = list_of_GIs[0]
            pair_check = [
                aligns_in_some_orientation([first_GI, gi])
                for gi in list_of_GIs[1:]
            ]
            if all(pair_check):
                finalseqs.add(error_dic[tc_id])
                continue

            idens, start_stop = tiling(list_of_GIs, gene_name)
            if all(i > 70 for i in idens):
                if merged_span_count(start_stop) == len(start_stop):
                    # seqs align to different regions of probe, choosing all
                    finalseqs.update(list_of_GIs)
                else:
                    # seqs overlap: written to file for hand checking
                    with open('these_seqs_overlap.txt', 'a') as a:
                        a.write(str(list_of_GIs) + '\n')
            else:
                # something is up with a sequence - will blast
                blast_dic_nums[list_of_GIs[0]] = len(list_of_GIs)
                blast_dic_tcids[list_of_GIs[0]] = tc_id
                seqs_to_blast.append(list_of_GIs)
                with open('seqs_to_be_blasted.txt', 'a') as a:
                    a.write(str(list_of_GIs) + '\n')

        if seqs_to_blast:
            print(
                "Blasting error seqeuences (seqs don't align together and one doesn't align to whole)"
            )
            seqs_to_blast_flat = [
                item for sublist in seqs_to_blast for item in sublist
            ]
            try:
                hits_all = blast_all(seqs_to_blast_flat, blast_dic_nums,
                                     blast_dic_tcids, c)
            except Exception:
                # one more attempt after a short pause
                time.sleep(5)
                hits_all = blast_all(seqs_to_blast_flat, blast_dic_nums,
                                     blast_dic_tcids, c)
            print("Parsing taxonomy for error sequences")
            for x, list_of_GIs in enumerate(seqs_to_blast):
                hits = hits_all[x]
                tc_id = blast_dic_tcids[list_of_GIs[0]]
                lowest = min(hits)
                if hits.count(lowest) == 1:
                    # a single lowest taxonomy hit wins, whether or not it is
                    # the GI that was picked before
                    finalseqs.add(list_of_GIs[hits.index(lowest)])
                    continue
                #there are multiple lowest taxonomy hits
                mult_GIs = [
                    list_of_GIs[pos] for pos, hit in enumerate(hits)
                    if hit == lowest
                ]
                GI_to_pick = resolve_seqs(mult_GIs)
                if len(GI_to_pick) == 1:
                    finalseqs.add(GI_to_pick[0])
                else:
                    #this only happens if the originally picked one is crappy
                    #and the rest are the same - goes to cluster analysis
                    multseqs.append(error_dic[tc_id])

        print('length of resolved=' + str(len(finalseqs)))
        print('length of not resolved = ' + str(len(multseqs)))

        with open("final_GIs.txt", "a") as o:
            for m in finalseqs:
                o.write(str(m) + "\n")

        #to cluster
        with open("multiple_gene_choices.txt", "a") as o:
            for m in multseqs:
                o.write(str(m) + "\n")
    finally:
        conn.close()

Пример #4

Показать файл

         #to pipeline
         dic[i] = whole
 if len(individual) > 0:
     #pick individual
     if len(individual) == 1:
         GI_nums_single.add(i + "|" + individual[0])
     else:
         print(individual)
         result = []
         ranges = {}
         current_start = -1
         current_stop = -1
         whole_length = 0
         #do tiling
         for m in [x.split('_')[0] for x in individual]:
             iden, start_stop = tiling([m], 'COI_trnL_COII')
             start, end = start_stop[0]
             #uses gi for danaus chrysippus COI_trnL_COII
             if iden > 70:
                 #make dic of lists
                 if (start, end) in ranges.keys():
                     ranges_dic_list = ranges[(start, end)]
                     ranges_dic_list.append(m)
                     ranges[(start, end)] = ranges_dic_list
                 else:
                     ranges[(start, end)] = [m]
             else:
                 print('Alignment below 70')
                 print(GIs_to_align)
         if len(ranges) == 0:
             print('All alignments below 70, printing to file')

Пример #5

Показать файл

def cluster(blastdb, taxdb):
    """Pick sequences for the entries listed in ``multiple_gene_choices.txt``.

    Each non-blank line of that file is read as ``<name>\\t<list of GIs>``.

    * Entries with more than two GIs: all sequences are fetched from Entrez
      and aligned with MUSCLE, a consensus is built, and every sequence is
      aligned to that consensus.  The GI with the unique highest identity is
      resolved; with a tie, all tied GIs are kept.
    * Entries with exactly two GIs: the pair is aligned in three orientations.
      If one reaches an identity of 95 both are kept.  Otherwise ``tiling``
      decides: non-overlapping ranges keep both, overlapping ranges are
      written to ``these_seqs_overlap_cluster.txt``, and when a tiling
      identity is 70 or lower ``blast`` is called and the GI with the unique
      lowest value returned by it is resolved.

    Resolved GIs are appended to ``final_GIs.txt``, kept groups to
    ``choose_mult.txt``.

    Args:
        blastdb: path of the SQLite database with the ``blast`` table.
        taxdb: path of the taxonomy database, attached as ``tax``.
    """
    Entrez.email = "*****@*****.**"
    conn = sqlite3.connect(blastdb)
    try:
        c = conn.cursor()
        c.execute("ATTACH ? AS 'tax'", (taxdb,))
        muscle_cline = MuscleCommandline(clwstrict=True)
        input_dic = {}
        multiple_dic = {}
        two_dic = {}
        finalseqs = set()
        multfinalseqs = []
        unresolved = []

        def pair_aligns(gis):
            # Regular, reverse-complement and complement alignment are tried
            # in turn; the first identity that is not below 95 settles it.
            for align in (alignment_reg, alignment_rev_comp, alignment_comp):
                if not identity_calc(align(gis)) < 95:
                    return True
            return False

        def merged_span_count(ranges):
            # Number of disjoint spans left after merging overlapping ranges.
            spans = 0
            current_stop = -1
            for start, stop in sorted(ranges):
                if start > current_stop:
                    spans += 1
                    current_stop = stop
                else:
                    current_stop = max(current_stop, stop)
            return spans

        with open("multiple_gene_choices.txt") as o:
            for line in o:
                if not line.strip():
                    # a blank line carries no entry
                    continue
                input_dic[line.split("\t")[0]] = line.strip().split(
                    "\t")[1].replace("[", "").replace("]", "").replace("'", "")
        for name, GIs in input_dic.items():
            GIs_list = GIs.split(", ")
            if len(GIs_list) > 2:
                multiple_dic[name] = GIs_list
            elif len(GIs_list) == 2:
                two_dic[name] = GIs_list

        for i in multiple_dic:
            identities = []
            joined_GIs = ",".join(multiple_dic[i])
            handle = Entrez.efetch(db="nucleotide",
                                   rettype="fasta",
                                   retmode="text",
                                   id=joined_GIs)
            seqs = SeqIO.parse(handle, "fasta")
            handle_string = StringIO()
            SeqIO.write(seqs, handle_string, "fasta")
            # the parser is lazy, so the handle is closed only after writing
            handle.close()
            data = handle_string.getvalue()
            stdout, stderr = muscle_cline(stdin=data)
            align = AlignIO.read(StringIO(stdout), "clustal")
            summary_align = AlignInfo.SummaryInfo(align)
            consensus = summary_align.gap_consensus(threshold=.5, ambiguous='N')
            consensus_record = SeqRecord(consensus, id="Consensus_all")
            for m in multiple_dic[i]:
                # Retry until Entrez answers; only Exception is caught so
                # Ctrl-C still stops the script.
                while True:
                    try:
                        handle = Entrez.efetch(db="nucleotide",
                                               rettype="fasta",
                                               retmode="text",
                                               id=m)
                        break
                    except Exception:
                        print('Error, trying again')
                        time.sleep(10)
                seqs = SeqIO.read(handle, "fasta")
                handle.close()
                handle_string = StringIO()
                SeqIO.write(seqs, handle_string, "fasta")
                SeqIO.write(consensus_record, handle_string, "fasta")
                data = handle_string.getvalue()
                stdout, stderr = muscle_cline(stdin=data)
                align = AlignIO.read(StringIO(stdout), "clustal")
                # identity = matching columns / columns without a gap
                count = 0
                gaps = 0
                for col in range(0, len(align[0])):
                    column = align[:, col]
                    if "-" not in column:
                        if column[1:] == column[:-1]:
                            count = count + 1
                    else:
                        gaps = gaps + 1
                ungapped = len(align[0]) - gaps
                # an alignment with a gap in every column has no identity
                iden = 100 * (count / float(ungapped)) if ungapped else 0.0
                identities.append(iden)
            best = max(identities)
            if identities.count(best) == 1:
                finalseqs.add(multiple_dic[i][identities.index(best)])
            else:
                GI_to_pick = [
                    multiple_dic[i][m] for m, x in enumerate(identities)
                    if x == best
                ]
                multfinalseqs.append(GI_to_pick)

        for i in two_dic:
            #align the two seqs
            list_of_GIs = two_dic[i]
            if pair_aligns(list_of_GIs):
                # high identity in one orientation, so the pair is fine
                multfinalseqs.append(list_of_GIs)
                continue
            #add tiling thing
            gene_name = '_'.join(i.split('_')[1:])
            idens, start_stop = tiling(list_of_GIs, gene_name)
            if all(m > 70 for m in idens):
                if merged_span_count(start_stop) == len(start_stop):
                    # seqs align to different regions of probe, choosing all
                    multfinalseqs.append(list_of_GIs)
                else:
                    # seqs overlap: written to file for hand checking
                    with open('these_seqs_overlap_cluster.txt', 'a') as a:
                        unresolved.append(list_of_GIs)
                        a.write(str(list_of_GIs) + '\n')
            else:
                #get taxonomy for query(main species)
                print("Parsing taxonomy for error sequences")
                hits = blast(i, list_of_GIs, c)
                lowest = min(hits)
                #if theres only one lowest taxonomy hit, change
                if hits.count(lowest) == 1:
                    # index into `hits`; the original used the undefined name
                    # `hit_levels` here and raised NameError
                    finalseqs.add(str(list_of_GIs[hits.index(lowest)]))
                else:  #there are multiple lowest taxonomy hits
                    multfinalseqs.append(list_of_GIs)

        print("length of resolved = " + str(len(finalseqs)))
        print("length of choose multiple = " + str(len(multfinalseqs)))
        print("length of unresolved = " + str(len(unresolved)))

        with open("final_GIs.txt", "a") as o:
            for m in finalseqs:
                o.write(str(m) + "\n")

        with open("choose_mult.txt", "a") as o:
            for m in [num for pair in multfinalseqs for num in pair]:
                o.write(str(m) + "\n")
    finally:
        conn.close()

Пример #6

Показать файл

Файл: multiple.py Проект: sunray1/working_scripts

def _fetch_gi_keys(cursor, query):
    """Run *query* and return its rows as a set of "tc_id_gene|GI_hitlength" strings.

    Each row is expected to yield four columns (tc_id, Gene_name, GI,
    hit_length), in that order.
    """
    return set(
        str(row[0]) + "_" + str(row[1]) + "|" + str(row[2]) + "_" + str(row[3])
        for row in cursor.execute(query))


def _group_by_taxon_gene(keys):
    """Group "taxon_gene|GI_length" strings into {taxon_gene: [GI_length, ...]}."""
    grouped = {}
    for key in keys:
        parts = key.split("|")
        grouped.setdefault(parts[0], []).append(parts[1])
    return grouped


def _merged_span_length(intervals):
    """Return the total length covered by the union of inclusive (start, stop) ranges."""
    merged = []
    current_start = -1
    current_stop = -1
    for start, stop in sorted(intervals):
        if start > current_stop:
            # no overlap with the running block: open a new one
            merged.append((start, stop))
            current_start, current_stop = start, stop
        else:
            # overlaps the running block: extend it
            current_stop = max(current_stop, stop)
            merged[-1] = (current_start, current_stop)
    return sum(stop - start + 1 for start, stop in merged)


def _scan_for_gapless_run(alignment, columns):
    """Scan *columns* until 10 consecutive gap-free alignment columns are seen.

    Returns the column index at which the scan stopped: the 10th column of the
    first gap-free run, or the last index visited when no such run exists
    (None when *columns* is empty).
    """
    run = 0
    index = None
    for index in columns:
        if '-' in alignment[:, index]:
            # a gap restarts the run; small aligned blocks are ignored
            run = 0
        else:
            run += 1
            if run == 10:
                break
    return index


def resolve_seqs(blastdb):
    """Choose one sequence per taxon/gene from the ``blast`` table of *blastdb*.

    Taxon/gene pairs with a single GI are accepted directly.  Pairs with
    several GIs are passed to ``blastlib.clean_seq_funcs.resolve_seqs``.
    ``COI_trnL_COII`` hits are pre-processed by hit length first: hits longer
    than 3000 are trimmed via an alignment against accession GU365907 and
    fetched from NCBI, hits between 2000 and 3000 are used whole, and hits
    shorter than 2000 are tiled so that the fewest fragments covering more
    than 95% of the merged range are kept.

    Parameters:
        blastdb: path of the SQLite database holding the ``blast`` table.

    Returns:
        None.  Results are written to files in the working directory:
        ``final_GIs.txt`` and ``COI_hand_check.txt`` (appended),
        ``mito_COI.fa``, ``<gene>_accession_nums_resolved.txt`` and
        ``multiple_gene_choices.txt`` (overwritten).
    """
    import itertools
    import re
    import sqlite3
    from Bio import Entrez, SeqIO
    from blastlib.clean_seq_funcs import alignment_reg, identity_calc, tiling
    # imported under another name so it is not confused with this function
    from blastlib.clean_seq_funcs import resolve_seqs as resolve_gi_list

    Entrez.email = "*****@*****.**"
    # splits "tc_id_gene|GI_length" style keys on "_" or "|"
    key_splitter = re.compile(r'_|\|')

    # The database is only needed for these four queries, so the connection
    # is closed right away, even if one of the queries fails.
    conn = sqlite3.connect(blastdb)
    try:
        c = conn.cursor()
        #this gets list of all taxa/genes regardless if they have multiple gene choices or not
        GI_nums_all = _fetch_gi_keys(
            c,
            "SELECT tc_id, Gene_name, GI, hit_length FROM blast WHERE tc_id NOT NULL AND Gene_name != 'COI_trnL_COII' GROUP BY tc_id, Gene_name, GI;"
        )
        #this gets a list of all taxa/genes if they only have one gene choice
        GI_nums_single = _fetch_gi_keys(
            c,
            "SELECT tc_id, Gene_name, GI, hit_length FROM blast WHERE tc_id NOT NULL AND Gene_name != 'COI_trnL_COII' GROUP BY tc_id, Gene_name HAVING COUNT(*) =1;"
        )
        #do the same with COI
        GI_nums_all_COI = _fetch_gi_keys(
            c,
            "SELECT tc_id, Gene_name, GI, hit_length FROM blast WHERE tc_id NOT NULL AND Gene_name == 'COI_trnL_COII' GROUP BY tc_id, Gene_name, GI;"
        )
        GI_nums_single_COI = _fetch_gi_keys(
            c,
            "SELECT tc_id, Gene_name, GI, hit_length FROM blast WHERE tc_id NOT NULL AND Gene_name = 'COI_trnL_COII' GROUP BY tc_id HAVING COUNT(*) =1;"
        )
    finally:
        conn.close()

    #this give me all the GIs that have multiple gene choices
    GI_nums = GI_nums_all - GI_nums_single
    GI_nums_mult_COI = GI_nums_all_COI - GI_nums_single_COI
    # single COI hits longer than 3000 still go through the COI processing below
    for i in GI_nums_single_COI:
        if int(i.split("_")[-1]) > 3000:
            GI_nums_mult_COI.add(i)

    #makes a dictionary of lists for each taxa/gene choice
    dic = _group_by_taxon_gene(GI_nums)
    dic_COI = _group_by_taxon_gene(GI_nums_mult_COI)
    genes = set(
        key_splitter.split(i)[1] for i in GI_nums | GI_nums_mult_COI)

    records = []
    coi_done = 0
    #deal with lengths of COI and add to dic to try and resolve
    print("Trying to resolve COI/COII sequences")
    for i in dic_COI:
        coi_done += 1
        print(str(round(float(coi_done) / float(len(dic_COI)) * 100, 2)) + '%')
        lengths = [int(m.split('_')[1]) for m in dic_COI[i]]
        # NOTE(review): lengths of exactly 2000 or 3000 fall into none of the
        # three buckets below - confirm that this is intended.
        individual = [dic_COI[i][x] for x, l in enumerate(lengths) if l < 2000]
        whole = [
            dic_COI[i][x] for x, l in enumerate(lengths) if 2000 < l < 3000
        ]
        mito = [dic_COI[i][x] for x, l in enumerate(lengths) if l > 3000]
        if len(mito) > 0:
            GIs_to_align = [mito[0].split("_")[0], 'GU365907']
            alignment = alignment_reg(GIs_to_align)
            iden = identity_calc(alignment)
            if iden > 80:
                #have to do span to account for random small blocks that dont align
                width = len(alignment[0])
                #get start
                start = _scan_for_gapless_run(alignment, range(width)) - 8
                #get stop
                end = _scan_for_gapless_run(alignment,
                                            reversed(range(width))) + 10
                handle = Entrez.efetch(db="nucleotide",
                                       rettype="fasta",
                                       retmode="text",
                                       id=mito[0].split("_")[0],
                                       seq_start=start - 1,
                                       seq_stop=end - 2)
                try:
                    records.append(SeqIO.read(handle, "fasta"))
                finally:
                    handle.close()
            else:
                print('Low iden when matching COI to whole mito')
                with open("COI_hand_check.txt", "a") as a:
                    a.write(i + '\n')
        elif len(whole) > 0:
            #pick whole
            if len(whole) == 1:
                GI_nums_single.add(i + "|" + whole[0])
            else:
                #to pipeline
                dic[i] = whole
        if len(individual) > 0:
            #pick individual
            if len(individual) == 1:
                GI_nums_single.add(i + "|" + individual[0])
                continue
            #do tiling: map each aligned (start, end) range to the GIs covering it
            ranges = {}
            for m in [x.split('_')[0] for x in individual]:
                #uses gi for danaus chrysippus COI_trnL_COII
                iden, start_stop = tiling([m], 'COI_trnL_COII')
                start, end = start_stop[0]
                if iden > 70:
                    ranges.setdefault((start, end), []).append(m)
                else:
                    print('Alignment below 70')
                    # report the accession that was just tiled
                    print(m)
            if len(ranges) == 0:
                print('All alignments below 70, printing to file')
                with open("COI_hand_check.txt", "a") as a:
                    a.write(i + '\n')
                # nothing to tile for this taxon; it is left for a hand check
                continue
            #get merged range
            whole_length = _merged_span_length(ranges)
            # Want the least number of ranges covering >95% of the whole range:
            # try each combination size from low to high, keep the best
            # coverage (ties are all kept), and stop at the first size that
            # passes 95%.
            max_subset = set()
            for level in range(1, len(ranges) + 1):
                max_perc = 0
                for subset in itertools.combinations(ranges.keys(), level):
                    #for each combination, get merged length
                    comb_length = _merged_span_length(subset)
                    if whole_length >= comb_length:
                        perc = float(comb_length) / float(whole_length)
                        if perc > max_perc:
                            max_perc = perc
                            max_subset = set()
                            max_subset.add(subset)
                        elif perc == max_perc:
                            max_subset.add(subset)
                # goes through all combinations in a level before breaking
                if max_perc > .95:
                    break
            # for each position of the sorted combinations keep the longest
            # fragment seen among the tied best combinations
            final_tiling = [(0, 0)] * level
            for combination in max_subset:
                for x, comb_frag in enumerate(sorted(combination)):
                    best = final_tiling[x]
                    if comb_frag[1] - comb_frag[0] + 1 > best[1] - best[0] + 1:
                        final_tiling[x] = comb_frag
            possible_GIs = [ranges[x] for x in final_tiling]
            # separate counter: must not clobber the progress counter above
            group_index = 0
            for m in possible_GIs:
                if len(m) == 1:
                    GI_nums_single.add(i + "|" + m[0] + "_0")
                else:
                    # several GIs share this range: send them to the pipeline
                    if group_index == 0:
                        dic[i] = m
                    else:
                        dic[i + "_" + str(group_index)] = m
                    group_index += 1
    print(dic)
    SeqIO.write(records, "mito_COI.fa", "fasta")
    #pulls out the GIs with first, the longest number of ATCGs and second, the longest length and makes dictionary
    print("Trying to resolve all other sequences")
    for done, i in enumerate(list(dic), 1):
        dic[i] = resolve_gi_list([n.split("_")[0] for n in dic[i]])
        print(str(round((float(done) / float(len(dic))) * 100, 2)) + "%")
    #splits the ones that still have multiple (so the longest had multiple choices) and the ones that are resolved
    dic_single = {}
    dic_mult = {}
    for i in dic:
        if len(dic[i]) > 1:
            dic_mult[i] = dic[i]
        else:
            dic_single[i] = dic[i]
    for gene in genes:
        finalGInums_longest = set(
            ''.join(dic_single[n]) for n in dic_single
            if gene == key_splitter.split(n)[1])
        finalGInums_only1 = set(
            key_splitter.split(n)[-2] for n in GI_nums_single
            if gene == key_splitter.split(n)[1])
        with open("final_GIs.txt", "a") as o:
            for m in finalGInums_only1:
                o.write(str(m) + "\n")
        #this needs to go to blast_sp_parse.py
        with open(gene + "_accession_nums_resolved.txt", "w") as o:
            for m in finalGInums_longest:
                o.write(str(m) + "\n")
    #this needs to go to cluster.py
    with open("multiple_gene_choices.txt", "w") as w:
        for i in dic_mult:
            w.write(i + "\t" + str(dic_mult[i]) + "\n")