def get_bbh(d_allvall, min_simil=0.0):
    """Find all bidirectional best hits (BBHs) between every pair of genomes.

    Each unordered genome pair is examined once (only when the subject genome
    name sorts after the query genome name). For every query gene, all of its
    best-scoring hits in the subject genome are considered; a hit is a BBH when
    the query gene is in turn among the best-scoring hits of that hit in the
    reverse direction. Ties are therefore kept: every tied best hit that is
    reciprocal is added.

    Args:
        d_allvall: nested mapping indexed as
            ``d_allvall[genome_q][genome_s][gene_q][gene_s]``; element 2 of
            each hit record is used as the similarity of that hit.
        min_simil: both directions must have a similarity strictly greater
            than this value for the pair to be kept.

    Returns:
        A tuple ``(all_pairs, pairs_per_genome, pair_dict)``:
        ``all_pairs`` is a set holding every BBH in both orientations,
        ``pairs_per_genome`` maps ``(genome_x, genome_y)`` to a list of
        ``(gene_x, gene_y)`` tuples (filled for both genome orders), and
        ``pair_dict`` maps each gene to the set of its BBH partners.
    """
    all_pairs = set()
    pairs_per_genome = autovivify(levels=1, final=list)
    pair_dict = autovivify(levels=1, final=set)

    for genome_q, hits_by_subject in d_allvall.items():
        for genome_s in hits_by_subject:
            # Handle each genome pair a single time.
            if genome_s <= genome_q:
                continue
            for gene_q in hits_by_subject[genome_s]:
                _, _, forward_best = get_best_scoring_genes(
                    d_allvall, genome_q, genome_s, gene_q)
                # Walk over every tied best hit, not just one of them.
                for hit in forward_best:
                    _, _, reverse_best = get_best_scoring_genes(
                        d_allvall, genome_s, genome_q, hit)
                    if gene_q not in reverse_best:
                        continue
                    # Reciprocal best hit: look up the similarity each way.
                    forward_sim = d_allvall[genome_q][genome_s][gene_q][hit][2]
                    reverse_sim = d_allvall[genome_s][genome_q][hit][gene_q][2]
                    if not (forward_sim > min_simil and reverse_sim > min_simil):
                        continue
                    # Register the pair in both orientations.
                    all_pairs.add((hit, gene_q))
                    all_pairs.add((gene_q, hit))
                    pairs_per_genome[(genome_q, genome_s)].append((gene_q, hit))
                    pairs_per_genome[(genome_s, genome_q)].append((hit, gene_q))
                    pair_dict[gene_q].add(hit)
                    pair_dict[hit].add(gene_q)
    return (all_pairs, pairs_per_genome, pair_dict)
def filter_cogs(cutoff_dict, score_dict_exp, base_truecogs, genomes, propagate_truecogs, truecog_pairs):
    """Select the gene pairs whose score reaches the cutoff of their genome pair.

    Args:
        cutoff_dict: nested mapping ``cutoff_dict[genome_lo][genome_hi]`` with
            the genome names in sorted order (smaller name first).
        score_dict_exp: nested mapping
            ``score_dict_exp[t1][t2][g1][g2]`` whose values are 5-element
            records; element 2 is the score compared against the cutoff.
        base_truecogs: accepted for interface compatibility; not used here.
        genomes: container of genome names to consider; hits involving any
            other genome are ignored.
        propagate_truecogs: when true, pairs listed in ``truecog_pairs`` are
            kept even if they score below the cutoff.
        truecog_pairs: container of ``(g1, g2)`` tuples treated as truecog
            pairs when ``propagate_truecogs`` is set.

    Returns:
        A two-level mapping ``passed[gene_lo][gene_hi]`` (gene names in sorted
        order) holding the score of each accepted pair, or ``100`` for a pair
        that was forced in as a truecog pair.
    """
    passed = autovivify(levels=2, final=float)
    for t1 in score_dict_exp:
        if t1 not in genomes:
            continue
        t2s = score_dict_exp[t1]
        for t2 in t2s:
            # Skip self-comparisons and genomes outside the requested set.
            if t2 == t1 or t2 not in genomes:
                continue
            # Cutoffs are stored once per genome pair, smaller name first.
            lo_genome, hi_genome = (t1, t2) if t1 < t2 else (t2, t1)
            g1s = t2s[t2]
            for g1 in g1s:
                g2s = g1s[g1]
                for g2 in g2s:
                    _ql, _sl, score, _bits, _ev = g2s[g2]
                    # Looked up per hit (not hoisted) so that genome pairs
                    # without any hits never touch cutoff_dict.
                    cutoff = cutoff_dict[lo_genome][hi_genome]
                    first, second = (g1, g2) if g1 < g2 else (g2, g1)
                    if score >= cutoff:
                        passed[first][second] = score
                    elif propagate_truecogs and score < cutoff and (g1, g2) in truecog_pairs:
                        # Force truecog pairs in even though they are below
                        # the threshold.
                        passed[first][second] = 100
    return passed
def read_allvall(allVallfile, genome_dict, genomes=False):
    """Parse the all-vs-all BLAST results from a single tabular file.

    The file is expected to contain ten tab-separated columns per hit, in the
    order ``qseqid sseqid pident length mismatch gapopen qlen slen evalue
    bitscore``. Lines containing a ``#`` anywhere are skipped, as are blank
    lines.

    Args:
        allVallfile: path of the BLAST output file.
        genome_dict: mapping from gene identifier to an object exposing
            ``.genome.name``; used to find the genome of query and subject.
        genomes: optional container of genome names. When given (truthy), only
            hits where both genomes are in it are kept. This allows analysing
            a subgroup of genomes without rerunning the BLAST.

    Returns:
        The expanded score dictionary
        ``score_dict_exp[genome_q][genome_s][gene_q][gene_s] =
        [qlen, slen, pident, bitscore, evalue]``. ``qlen``, ``pident``,
        ``bitscore`` and ``evalue`` are floats; ``slen`` is stored as the raw
        string read from the file.

    Raises:
        ValueError: if a data line does not have exactly ten fields or a
            numeric field cannot be converted.
        KeyError: if a gene identifier is missing from ``genome_dict``.
    """
    score_dict_exp = autovivify(levels=4, final=list)
    with open(allVallfile, 'r') as f:
        for line_no, line in enumerate(f, 1):
            if '#' in line:
                continue
            line = line.rstrip()
            if not line:
                # Blank lines (e.g. a trailing newline) carry no hit.
                continue
            fields = line.split('\t')
            if len(fields) != 10:
                raise ValueError(
                    "%s, line %i: expected 10 tab-separated fields, found %i"
                    % (allVallfile, line_no, len(fields)))
            (a, b, pid, length, mismatch, gapopen, ql, sl, evalue, bits) = fields
            t1 = genome_dict[a].genome.name
            t2 = genome_dict[b].genome.name
            pid = float(pid)
            ql = float(ql)
            length = int(length)
            # Hits are only added if the alignment length is at least 75% of
            # the query length.
            if length < 0.75 * ql:
                continue
            # Restrict to the requested genomes, if any were given.
            if genomes and (t1 not in genomes or t2 not in genomes):
                continue
            bits = float(bits)
            evalue = float(evalue)
            score_dict_exp[t1][t2][a][b] = [ql, sl, pid, bits, evalue]
    return score_dict_exp
def _genes_paired_with_all(truecogs, base_group, genome, true_pair_dict, genome_dict):
    """Return the genes of `genome` that are true pairs with every base genome."""
    # Group the members of this truecog group by owning genome once, instead
    # of rescanning the whole group for every base genome.
    members_by_genome = {}
    for member in truecogs:
        members_by_genome.setdefault(genome_dict[member].genome.name, []).append(member)

    shared = None
    for base_genome in base_group:
        # Genes of the new genome that pair with any member from this base genome.
        paired = {
            gene
            for member in members_by_genome.get(base_genome, ())
            for gene in true_pair_dict[member]
            if genome_dict[gene].genome.name == genome
        }
        shared = paired if shared is None else shared & paired
        if not shared:
            # Either this base genome has no true pair in the new genome, or
            # no single candidate is shared by all base genomes seen so far.
            return set()
    return shared if shared is not None else set()


def add_genome_cog(base_group, genome, true_pair_dict, base_truecogs, genome_dict):
    """Add a genome to a group of genomes and report which truecogs remain.

    A truecog group stays conserved only if at least one gene of the new
    genome is a true pair with a member of the group from *every* genome in
    ``base_group``. Requiring the same gene for all base genomes prevents the
    case where several candidates exist but none is shared by all of them.

    Args:
        base_group: iterable of genome names already in the group.
        genome: name of the genome being added.
        true_pair_dict: mapping from a gene to the collection of genes it
            forms a true pair with.
        base_truecogs: iterable of truecog groups (sequences of gene names).
        genome_dict: mapping from gene name to an object exposing
            ``.genome.name``.

    Returns:
        ``(truecogs_missing, new_truecogs)``: the number of truecog groups
        that are lost when the genome is added, and the list of surviving
        groups, each extended (as a tuple) with the conserved genes of the new
        genome. The added genes are appended in sorted order so the result
        does not depend on set iteration order.
    """
    truecogs_missing = 0
    new_truecogs = []
    for truecogs in base_truecogs:
        conserved = _genes_paired_with_all(
            truecogs, base_group, genome, true_pair_dict, genome_dict)
        if conserved:
            new_truecogs.append(tuple(truecogs) + tuple(sorted(conserved)))
        else:
            truecogs_missing += 1
    return truecogs_missing, new_truecogs
def get_truecog_pairs(pairs, all_pairs, genome_dict):
    """Identify the truecog pairs: gene pairs whose flanking genes are pairs too.

    A gene pair qualifies when both genes have a left and a right flanking
    gene, and the flanks are themselves present in ``all_pairs`` either in the
    same orientation (left with left, right with right) or in the opposite
    orientation (left with right, right with left).

    Args:
        pairs: mapping from ``(genome_a, genome_b)`` to an iterable of
            ``(gene_name_a, gene_name_b)`` tuples.
        all_pairs: container of ``(gene_name, gene_name)`` tuples used to test
            whether two flanking genes form a pair.
        genome_dict: mapping from gene name to a gene object exposing
            ``left_flank``, ``right_flank`` (gene objects or a falsy value)
            and ``name``.

    Returns:
        ``(truecog_pairs_per_genome, all_truecog_pairs, true_pair_dict)``:
        a dict mapping each genome pair to its list of truecog pairs, the set
        of all truecog pairs, and a mapping from each gene name to the set of
        its truecog partners.
    """
    true_pair_dict = autovivify(levels=1, final=set)
    all_truecog_pairs = set()
    truecog_pairs_per_genome = {}

    for (genome_a, genome_b), gene_pairs in pairs.items():
        kept = truecog_pairs_per_genome[genome_a, genome_b] = []
        for name_a, name_b in gene_pairs:
            # Resolve the names to their gene objects.
            gene_a = genome_dict[name_a]
            gene_b = genome_dict[name_b]

            # Both genes need a neighbour on each side.
            if not (gene_a.left_flank and gene_b.left_flank
                    and gene_a.right_flank and gene_b.right_flank):
                continue

            left_a = gene_a.left_flank.name
            right_a = gene_a.right_flank.name
            left_b = gene_b.left_flank.name
            right_b = gene_b.right_flank.name

            same_orientation = ((left_a, left_b) in all_pairs
                                and (right_a, right_b) in all_pairs)
            if not same_orientation:
                flipped = ((left_a, right_b) in all_pairs
                           and (right_a, left_b) in all_pairs)
                if not flipped:
                    continue

            # The flanking genes pair up as well, so this is a truecog pair.
            kept.append((name_a, name_b))
            all_truecog_pairs.add((name_a, name_b))
            true_pair_dict[name_a].add(name_b)
            true_pair_dict[name_b].add(name_a)

    return (truecog_pairs_per_genome, all_truecog_pairs, true_pair_dict)
def calculate_taxon_pairwise(true_list, genome_dict, score_dict_exp, cutoff_mode, cutoff_factor,
                             genomes_allowed=False, cutoff_sgm=0.05, outf=False, bins='auto'):
    """Calculate a score cutoff for each pair of genomes from the given truecogs.

    The truecogs are first reduced with ``get_unique_truecogs``. For every
    pair of genes in a truecog group that belong to different genomes, the
    scores of both directions (element 2 of the hit records) are averaged and
    collected per genome pair. A cutoff is then derived per genome pair.

    Args:
        true_list: truecog groups, passed to ``get_unique_truecogs``.
        genome_dict: mapping from gene name to an object exposing
            ``.genome.name``.
        score_dict_exp: nested mapping
            ``score_dict_exp[t_a][t_b][a][b]`` of hit records.
        cutoff_mode: ``'sgm'`` fits the scores with ``fit_truecogs`` using
            ``SkewedGaussianModel`` and takes the ``cutoff_sgm`` quantile of
            the fitted distribution. Any other value (normally ``'gm'``) uses
            ``mean - cutoff_factor * std``, rounded to two decimals. The same
            Gaussian fallback is used when the fit fails or its cutoff is not
            within 0-100.
        cutoff_factor: number of standard deviations below the mean for the
            Gaussian cutoff.
        genomes_allowed: optional container of genome names; when given, only
            pairs where both genomes are allowed are used.
        cutoff_sgm: quantile used for the skewed-Gaussian cutoff.
        outf: optional path; when given, a tab-separated table with the mean
            score and cutoff per genome pair is written to it.
        bins: accepted for backward compatibility; currently unused.

    Returns:
        ``(cutoff_dict, true_scores, all_cutoffs, histogram_data)`` where
        ``cutoff_dict[t1][t2]`` is the cutoff, ``true_scores[t1][t2]`` the list
        of averaged scores, ``all_cutoffs`` the flat list of cutoffs and
        ``histogram_data[t1][t2]`` is ``[pars, cutoff]``. Genome names are in
        sorted order (``t1 < t2``).
    """
    # Remove extra truecogs caused by some truecogs pairing up with multiple genes.
    unique_truecogs = get_unique_truecogs(true_list)

    # Gather the similarity scores for each pair of genomes.
    true_scores = autovivify(2, list)
    for cog_group in unique_truecogs:
        for a in cog_group:
            t_a = genome_dict[a].genome.name
            for b in cog_group:
                t_b = genome_dict[b].genome.name
                # Visit each unordered genome pair once, smaller name first.
                if t_a >= t_b:
                    continue
                # Both genomes have to be allowed for the pair to count.
                if genomes_allowed and (t_a not in genomes_allowed or t_b not in genomes_allowed):
                    continue
                score_ab = score_dict_exp[t_a][t_b][a][b][2]
                score_ba = score_dict_exp[t_b][t_a][b][a][2]
                # The two directions can differ slightly; use their average.
                true_scores[t_a][t_b].append((score_ab + score_ba) / 2.0)

    cutoff_dict = autovivify(2, float)
    all_cutoffs = []
    histogram_data = autovivify(2, list)
    cutoff_table = open(outf, 'wt') if outf else None
    try:
        for t1 in true_scores:
            for t2 in true_scores[t1]:
                scores = true_scores[t1][t2]
                scores_sorted = tuple(sorted(scores))
                # NOTE(review): the module-level scores_tested cache is keyed
                # on the scores only, so a cached entry is reused even when
                # cutoff_mode / cutoff_factor / cutoff_sgm differ between calls.
                if scores_sorted in scores_tested:
                    # Saves time when forming multiple groups with sgm.
                    pars, cutoff = scores_tested[scores_sorted]
                else:
                    result = None
                    pars = cutoff = None
                    if cutoff_mode == 'sgm':
                        # NOTE(review): the fit always uses bins='auto', not
                        # the `bins` argument; kept as in the original.
                        result = fit_truecogs(scores, bins='auto', plot=False, model=SkewedGaussianModel)
                        if result:
                            pars = result.params
                            skew = skewnorm(pars['gamma'], pars['center'], pars['sigma'])
                            cutoff = skew.ppf(cutoff_sgm)
                    # Sometimes the data is fit to just the tail of the sgm,
                    # which puts the cutoff outside the 0-100 range (or makes
                    # it NaN); fall back to the plain Gaussian cutoff then.
                    if not result or not (0.0 <= cutoff <= 100.0):
                        mean = np.mean(scores)
                        std = np.std(scores)
                        cutoff = round(mean - cutoff_factor * std, 2)
                        pars = {'center': mean, 'sigma': std, 'amplitude': 1, 'gamma': 0}
                    scores_tested[scores_sorted] = pars, cutoff
                cutoff_dict[t1][t2] = cutoff
                all_cutoffs.append(cutoff)
                histogram_data[t1][t2] = [pars, cutoff]
                if cutoff_table is not None:
                    cutoff_table.write("%s\t%s\tmean=\t%s\tcutoff=\t%s\n" % (t1, t2, np.mean(scores), cutoff))
    finally:
        # Close the table even if a fit or lookup raised part-way through.
        if cutoff_table is not None:
            cutoff_table.close()
    return cutoff_dict, true_scores, all_cutoffs, histogram_data