示例#1
0
def get_bbh(d_allvall,min_simil = 0.0):
    '''Collect the bidirectional best hits (bbhs) between every pair of genomes.

    d_allvall is indexed as d_allvall[genome_q][genome_s][gene_q][gene_s];
    element 2 of each entry is read as the similarity of that hit. If several
    genes tie as the best hit of a query gene, each of them that is reciprocal
    is recorded. A pair is kept only when the similarity in both directions is
    strictly greater than min_simil.

    Returns a tuple (all_pairs, pairs_per_genome, pair_dict):
      all_pairs        -- set with every bbh stored as (gene, gene) in both orders
      pairs_per_genome -- (genome, genome) -> list of (gene, gene) tuples, filled
                          for both orders of the genome pair
      pair_dict        -- gene -> set of the genes it forms a bbh with
    '''
    bbh_by_genome_pair = autovivify(levels=1,final=list)
    bbh_lookup = set()
    bbh_partners = autovivify(levels=1,final=set)

    for query_genome, subject_genomes in d_allvall.items():
        for subject_genome, query_genes in subject_genomes.items():
            # Every unordered genome pair is processed exactly once
            if subject_genome <= query_genome:
                continue
            for query_gene in query_genes:
                _, _, forward_hits = get_best_scoring_genes(
                    d_allvall, query_genome, subject_genome, query_gene)
                for hit_gene in forward_hits:
                    # Check whether the best hit points back at the query gene
                    _, _, reverse_hits = get_best_scoring_genes(
                        d_allvall, subject_genome, query_genome, hit_gene)
                    if query_gene not in reverse_hits:
                        continue
                    forward_sim = d_allvall[query_genome][subject_genome][query_gene][hit_gene][2]
                    reverse_sim = d_allvall[subject_genome][query_genome][hit_gene][query_gene][2]
                    if not (forward_sim > min_simil and reverse_sim > min_simil):
                        continue
                    # Register the pair in both directions
                    bbh_lookup.add((hit_gene, query_gene))
                    bbh_lookup.add((query_gene, hit_gene))
                    bbh_by_genome_pair[(query_genome, subject_genome)].append((query_gene, hit_gene))
                    bbh_by_genome_pair[(subject_genome, query_genome)].append((hit_gene, query_gene))
                    bbh_partners[query_gene].add(hit_gene)
                    bbh_partners[hit_gene].add(query_gene)

    return bbh_lookup, bbh_by_genome_pair, bbh_partners
示例#2
0
def filter_cogs(cutoff_dict,score_dict_exp,base_truecogs,genomes,propagate_truecogs,truecog_pairs):
    '''Select the gene pairs whose score reaches the cutoff of their genome pair.

    Parameters:
      cutoff_dict        -- cutoff per genome pair, read as cutoff_dict[x][y] with x < y
      score_dict_exp     -- hits indexed as [genome1][genome2][gene1][gene2]; each hit
                            is a 5-item sequence whose element 2 is the score compared
                            against the cutoff
      base_truecogs      -- not used by this function; kept for interface compatibility
      genomes            -- container of genome names; hits involving any other
                            genome are ignored
      propagate_truecogs -- if true, pairs found in truecog_pairs are kept even when
                            their score is below the cutoff
      truecog_pairs      -- container of (gene1, gene2) tuples

    Returns a two-level mapping passed[gene_x][gene_y] (gene_x < gene_y) holding the
    score of each accepted pair, or 100 for a pair forced in through truecog_pairs.
    '''
    passed=autovivify(levels=2,final=float)
    for t1, t2s in score_dict_exp.items():
        for t2, g1s in t2s.items():
            if t2 == t1 or t1 not in genomes or t2 not in genomes:
                continue
            # Cutoffs are stored once per genome pair, under the sorted genome names.
            # Only the ordering is hoisted; the lookup itself stays in the inner loop
            # so that no cutoff entry is touched for genome pairs without hits.
            t_low, t_high = (t1, t2) if t1 < t2 else (t2, t1)
            for g1, g2s in g1s.items():
                for g2, hit in g2s.items():
                    _, _, score, _, _ = hit
                    cutoff = cutoff_dict[t_low][t_high]
                    # Results are likewise stored under the sorted gene names
                    g_low, g_high = (g1, g2) if g1 < g2 else (g2, g1)
                    if score >= cutoff:
                        passed[g_low][g_high] = score
                    elif propagate_truecogs and score < cutoff and (g1,g2) in truecog_pairs:
                        # True cogs are forced as cogs even if below the threshold
                        passed[g_low][g_high] = 100
    return passed
示例#3
0
def read_allvall(allVallfile,genome_dict,genomes=False):
    '''Parse the all-vs-all BLAST results from a single tab-separated file.

    Every data line must contain ten tab-separated columns (BLAST outfmt 6 with
    the keywords qseqid sseqid pident length mismatch gapopen qlen slen evalue
    bitscore). Lines containing a '#' anywhere and blank lines are skipped.

    Parameters:
      allVallfile -- path of the file to read
      genome_dict -- maps a sequence id to an object exposing .genome.name
      genomes     -- optional container of genome names; when given, only hits
                     where both genomes are in it are kept. This allows analysing
                     a subgroup of genomes without rerunning the BLAST.

    Returns the expanded score dictionary, indexed as
    [genome_q][genome_s][qseqid][sseqid] -> [qlen, slen, pident, bitscore, evalue].
    qlen, pident, bitscore and evalue are floats; slen is stored as read from
    the file (a string).
    '''
    score_dict_exp = autovivify(levels=4,final=list)
    with open(allVallfile, 'r') as f:
        for line in f:
            if '#' in line:
                continue
            line = line.rstrip()
            if not line:
                # A blank (e.g. trailing) line has no columns to unpack
                continue
            (a,b,pid,length,mismatch,gapopen,ql,sl,evalue,bits) = line.split('\t')
            t1 = genome_dict[a].genome.name
            t2 = genome_dict[b].genome.name
            pid = float(pid)
            ql = float(ql)
            length = int(length)
            # Hits are only added if the alignment length is at least 75% as long as the query
            if float(length) < 0.75*ql:
                continue
            # If an iterable of genomes is given, these are the only genomes allowed
            if genomes and (t1 not in genomes or t2 not in genomes):
                continue
            bits = float(bits)
            evalue = float(evalue)
            score_dict_exp[t1][t2][a][b] = [ql,sl,pid,bits,evalue]
    return score_dict_exp
示例#4
0
def add_genome_cog(base_group,genome,true_pair_dict,base_truecogs,genome_dict):
    '''Add a genome to a group of genomes and report which truecogs remain.

    Parameters:
      base_group     -- genome names already in the group
      genome         -- name of the genome to be added
      true_pair_dict -- gene -> iterable of genes it forms a truecog pair with
      base_truecogs  -- iterable of truecog groups; each group is a tuple of gene
                        names (it is extended with tuple concatenation)
      genome_dict    -- maps a gene name to an object exposing .genome.name

    A truecog group is conserved when at least one gene of the new genome is a
    truecog pair with a gene of the group in every genome of base_group. Such
    genes are appended to the group.

    Returns (truecogs_missing, new_truecogs): the number of groups that are not
    conserved, and the list of extended groups that are.
    '''
    truecogs_missing = 0
    new_truecogs = []
    n_base_genomes = len(base_group)
    for truecogs in base_truecogs:
        # candidates counts, per gene of the new genome, the number of base
        # genomes in which it is a truecog pair with a member of this group
        candidates = {}
        for base_genome in base_group:
            basegenome_candidates = set()
            for truecog in truecogs:
                if genome_dict[truecog].genome.name != base_genome:
                    continue
                for gene in true_pair_dict[truecog]:
                    if genome_dict[gene].genome.name == genome:
                        basegenome_candidates.add(gene)
            if not basegenome_candidates:
                # No gene of the new genome pairs with this group's genes in this
                # base genome, so the truecog is not conserved
                truecogs_missing += 1
                break
            for cand in basegenome_candidates:
                candidates[cand] = candidates.get(cand, 0) + 1
        else:
            # Keep only candidates that are true pairs with all base genomes. This
            # prevents the case where several candidates exist, but none of them is
            # a conserved true pair across the whole base group.
            to_add = [cand for cand, hits in candidates.items() if hits >= n_base_genomes]
            if to_add:
                new_truecogs.append(truecogs + tuple(to_add))
            else:
                truecogs_missing += 1
    return truecogs_missing,new_truecogs
示例#5
0
def get_truecog_pairs(pairs,all_pairs,genome_dict):
    '''Pick the pairs that are truecog pairs, i.e. flanked by other pairs.

    Parameters:
      pairs       -- (genome_a, genome_b) -> iterable of (gene_a, gene_b) name tuples
      all_pairs   -- container of (gene, gene) name tuples used for the flank lookup
      genome_dict -- maps a gene name to an object exposing left_flank and
                     right_flank (each falsy or an object with a .name)

    A pair qualifies when both genes have a left and a right flanking gene and
    the flanks are themselves paired in all_pairs, either left-with-left and
    right-with-right, or left-with-right and right-with-left.

    Returns (truecog_pairs_per_genome, all_truecog_pairs, true_pair_dict):
      per genome pair the list of qualifying (gene_a, gene_b) tuples, the set of
      all such tuples, and a gene -> set of partner genes mapping.
    '''
    per_genome_pair = {}
    accepted = set()
    partners = autovivify(levels=1,final=set)

    def flanks_paired(flank_x, flank_y):
        # True when the two flanking genes are listed as a pair
        return (flank_x.name, flank_y.name) in all_pairs

    for genome_a, genome_b in pairs:
        genome_key = (genome_a, genome_b)
        hits = []
        per_genome_pair[genome_key] = hits
        for genename_a, genename_b in pairs[genome_key]:
            # Get the objects from the names
            gene_a = genome_dict[genename_a]
            gene_b = genome_dict[genename_b]
            # Both genes need flanking genes on either side
            if not (gene_a.left_flank and gene_b.left_flank
                    and gene_a.right_flank and gene_b.right_flank):
                continue
            same_orientation = (flanks_paired(gene_a.left_flank, gene_b.left_flank)
                                and flanks_paired(gene_a.right_flank, gene_b.right_flank))
            if same_orientation or (flanks_paired(gene_a.left_flank, gene_b.right_flank)
                                    and flanks_paired(gene_a.right_flank, gene_b.left_flank)):
                # The flanking genes are pairs as well: record a truecog pair
                hits.append((genename_a, genename_b))
                accepted.add((genename_a, genename_b))
                partners[genename_a].add(genename_b)
                partners[genename_b].add(genename_a)

    return per_genome_pair, accepted, partners
示例#6
0
def _estimate_cutoff(scores,cutoff_mode,cutoff_factor,cutoff_sgm):
    # Return (pars, cutoff) for the score distribution of one genome pair.
    if cutoff_mode == 'sgm':
        result = fit_truecogs(scores,bins='auto',plot=False,model=SkewedGaussianModel)
        if result:
            pars = result.params
            skew = skewnorm(pars['gamma'],pars['center'],pars['sigma'])
            cutoff = skew.ppf(cutoff_sgm)
            # Sometimes the data is fit to just the tail of the sgm, and the cutoff
            # ends up outside the 0-100 range; the Gaussian rule is used in that case
            if not (cutoff > 100.0 or cutoff < 0.0):
                return pars,cutoff
    # Gaussian rule: used for 'gm', for a failed or out-of-range sgm fit, and for
    # any other cutoff_mode
    mean = np.mean(scores)
    std = np.std(scores)
    cutoff = round(mean-cutoff_factor*(std),2)
    pars = {'center':mean,'sigma':std,'amplitude':1,'gamma':0}
    return pars,cutoff


def calculate_taxon_pairwise(true_list,genome_dict,score_dict_exp,cutoff_mode,cutoff_factor,\
                                genomes_allowed=False,cutoff_sgm = 0.05,outf=False,bins='auto'):
    '''Calculate the cutoff for each pair of genomes based on the truecogs given.

    Parameters:
      true_list       -- truecog groups; reduced with get_unique_truecogs first
      genome_dict     -- maps a gene name to an object exposing .genome.name
      score_dict_exp  -- hits indexed as [genome_a][genome_b][gene_a][gene_b];
                         element 2 of each hit is used as the score
      cutoff_mode     -- 'sgm' fits a skewed Gaussian and takes its cutoff_sgm
                         quantile; 'gm' uses mean - cutoff_factor * std. The 'gm'
                         rule is also the fallback for a failed or out-of-range
                         sgm fit and for any other mode value.
      cutoff_factor   -- number of standard deviations below the mean ('gm' rule)
      genomes_allowed -- optional container of genome names; a genome pair is
                         skipped only when neither genome is in it
      cutoff_sgm      -- quantile of the fitted skewed Gaussian used as cutoff
      outf            -- optional path; when given, one line per genome pair with
                         the mean score and the cutoff is written to it
      bins            -- currently unused; kept for interface compatibility

    Results per sorted score tuple are cached in the module-level scores_tested.

    Returns (cutoff_dict, true_scores, all_cutoffs, histogram_data); the three
    mappings are indexed as [genome_a][genome_b] with genome_a < genome_b.
    '''
    # First cleanup: get all unique truecogs (remove extra truecogs due to some
    # truecogs pairing up with multiple genes)
    unique_truecogs = get_unique_truecogs(true_list)

    # Now gather all the similarity scores for each pair of genomes
    true_scores = autovivify(2, list)
    for cog_group in unique_truecogs:
        for a in cog_group:
            t_a = genome_dict[a].genome.name
            for b in cog_group:
                t_b = genome_dict[b].genome.name
                # NOTE(review): with 'and', pairs with exactly one allowed genome are
                # still included - confirm this is intended rather than 'or'
                if t_a >= t_b or (genomes_allowed and (t_a not in genomes_allowed and t_b not in genomes_allowed)):
                    continue
                score_ab = score_dict_exp[t_a][t_b][a][b][2]
                score_ba = score_dict_exp[t_b][t_a][b][a][2]
                # Just in case the score from prot a vs b and b vs a is slightly
                # different, take the average. t_a < t_b always holds here.
                true_scores[t_a][t_b].append((score_ab+score_ba)/2.0)

    cutoff_table = open(outf,'wt') if outf else None
    cutoff_dict = autovivify(2, float)
    all_cutoffs = []
    histogram_data = autovivify(2, list)
    try:
        for t1 in true_scores:
            for t2 in true_scores[t1]:
                scores = true_scores[t1][t2]
                scores_sorted = tuple(sorted(scores))
                # To save time when doing multiple group formations in combination with sgm
                # NOTE(review): the cache key does not include cutoff_mode, cutoff_factor
                # or cutoff_sgm, so a cached entry is reused even if those differ - confirm
                if scores_sorted in scores_tested:
                    pars,cutoff = scores_tested[scores_sorted]
                else:
                    pars,cutoff = _estimate_cutoff(scores,cutoff_mode,cutoff_factor,cutoff_sgm)
                    scores_tested[scores_sorted] = pars,cutoff
                cutoff_dict[t1][t2]=cutoff
                all_cutoffs.append(cutoff)
                histogram_data[t1][t2] = [pars,cutoff]

                if cutoff_table:
                    cutoff_table.write( "%s\t%s\tmean=\t%s\tcutoff=\t%s\n" % (t1,t2,np.mean(scores),cutoff) )
    finally:
        # Close the table even if a fit or lookup raises
        if cutoff_table:
            cutoff_table.close()

    return cutoff_dict,true_scores,all_cutoffs,histogram_data