def run(self, aa_gene_files, evalue, per_identity, output_dir):
    """Run an all-vs-all diamond search over the genes of every genome.

    All gene files are pooled into a single FASTA file, which is used both
    to build the diamond database and as the query set. The resulting hits
    are then exported to a flat tab-separated table.

    Parameters
    ----------
    aa_gene_files : list of str
        Amino acid fasta files to process via reciprocal blast.
    evalue : float
        E-value threshold for reporting hits.
    per_identity : float
        Percent identity threshold for reporting hits.
    output_dir : str
        Directory to store blast results.
    """

    # every artefact of this step lives directly in the output directory
    pooled_genes = os.path.join(output_dir, 'all_genes.faa')
    db_prefix = os.path.join(output_dir, 'all_genes')
    daa_prefix = os.path.join(output_dir, 'all_hits')
    hits_tsv = os.path.join(output_dir, 'all_hits.tsv')

    # pool the genes of all genomes and index them as one database
    self.logger.info(' Creating diamond database (be patient!).')
    concatenate_files(aa_gene_files, pooled_genes)

    diamond = Diamond(self.cpus)
    diamond.make_database(pooled_genes, db_prefix)

    # search the pooled genes against their own database; the number of
    # reported targets per query scales with the number of genomes
    self.logger.info('')
    self.logger.info(' Identifying hits between all pairs of genomes (be patient!).')
    max_targets = len(aa_gene_files) * 10
    diamond.blastp(pooled_genes, db_prefix, evalue, per_identity, max_targets, daa_prefix)

    # convert the diamond alignment archive into a flat table
    self.logger.info(' Creating table with hits.')
    diamond.view(daa_prefix + '.daa', hits_tsv)
def rblast(self, options):
    """Reciprocal blast command.

    Collects the protein files in ``options.protein_dir``, writes a copy of
    each to ``<output_dir>/genes`` in which every gene identifier is suffixed
    with ``~<genome_id>`` and trailing stop-codon asterisks are removed, and
    then runs the reciprocal search with either blastp or diamond.

    Parameters
    ----------
    options : object
        Parsed command-line options. The attributes read here are
        ``protein_dir``, ``protein_ext``, ``output_dir``, ``blastp``,
        ``cpus``, ``evalue`` and ``per_identity``.

    Notes
    -----
    Terminates the process via ``sys.exit()`` if no gene files are found.
    """

    self.logger.info('')
    self.logger.info('*******************************************************************************')
    self.logger.info(' [CompareM - rblast] Performing reciprocal blast between genomes.')
    self.logger.info('*******************************************************************************')

    check_dir_exists(options.protein_dir)
    make_sure_path_exists(options.output_dir)

    aa_gene_files = [os.path.join(options.protein_dir, f)
                     for f in os.listdir(options.protein_dir)
                     if f.endswith(options.protein_ext)]

    if not aa_gene_files:
        self.logger.warning(' [Warning] No gene files found. Check the --protein_ext flag used to identify gene files.')
        sys.exit()

    # modify gene ids to include genome ids in order to ensure
    # all gene identifiers are unique across the set of genomes,
    # also removes the trailing asterisk used to identify the stop
    # codon
    self.logger.info('')
    self.logger.info(' Appending genome identifiers to all gene identifiers.')
    gene_out_dir = os.path.join(options.output_dir, 'genes')
    make_sure_path_exists(gene_out_dir)

    modified_aa_gene_files = []
    for gf in aa_gene_files:
        genome_id = remove_extension(gf)

        aa_file = os.path.join(gene_out_dir, genome_id + '.faa')
        # the context manager guarantees the handle is closed even if
        # reading the input or writing the output fails part-way through
        with open(aa_file, 'w') as fout:
            for seq_id, seq, annotation in seq_io.read_fasta_seq(gf, keep_annotation=True):
                fout.write('>' + seq_id + '~' + genome_id + ' ' + annotation + '\n')
                # endswith() is safe for empty sequences, unlike seq[-1]
                if seq.endswith('*'):
                    seq = seq[:-1]
                fout.write(seq + '\n')

        modified_aa_gene_files.append(aa_file)

    # perform the reciprocal blast with blastp or diamond
    self.logger.info('')
    if options.blastp:
        rblast = ReciprocalBlast(options.cpus)
        rblast.run(modified_aa_gene_files, options.evalue, options.output_dir)

        # concatenate all blast tables to mimic output of diamond, all hits
        # for a given genome MUST be in consecutive order to fully mimic
        # the expected results from diamond
        self.logger.info('')
        self.logger.info(' Creating single file with all blast hits (be patient!).')
        blast_files = sorted(f for f in os.listdir(options.output_dir)
                             if f.endswith('.blastp.tsv'))
        hit_tables = [os.path.join(options.output_dir, f) for f in blast_files]
        concatenate_files(hit_tables, os.path.join(options.output_dir, 'all_hits.tsv'))
    else:
        rdiamond = ReciprocalDiamond(options.cpus)
        rdiamond.run(modified_aa_gene_files, options.evalue, options.per_identity, options.output_dir)

    self.logger.info('')
    self.logger.info(' Reciprocal blast hits written to: %s' % options.output_dir)

    self.time_keeper.print_time_stamp()
def run(self, query_gene_file, target_gene_file, sorted_hit_table,
        evalue_threshold, per_iden_threshold, per_aln_len_threshold,
        keep_rbhs, output_dir):
    """Calculate amino acid identity (AAI) between pairs of genomes.

    Parameters
    ----------
    query_gene_file : str
        File with all query genes in FASTA format.
    target_gene_file : str or None
        File with all target genes in FASTA format, or
        None if performing a reciprocal AAI calculation.
    sorted_hit_table : str
        Sorted table indicating genes with sequence similarity.
    evalue_threshold : float
        Evalue threshold used to define a homologous gene.
    per_iden_threshold : float
        Percent identity threshold used to define a homologous gene.
    per_aln_len_threshold : float
        Alignment length threshold used to define a homologous gene.
    keep_rbhs : boolean
        Flag indicating if RBH should be written to file.
    output_dir : str
        Directory to store AAI results.

    Returns
    -------
    tuple of (str, str or None), or None
        Path to the AAI summary table and path to the concatenated RBH
        table (None unless `keep_rbhs` is set). If no genome pairs are
        identified, a warning is logged and None is returned instead of
        a tuple.
    """

    self.sorted_hit_table = sorted_hit_table
    self.evalue_threshold = evalue_threshold
    self.per_identity_threshold = per_iden_threshold
    self.per_aln_len_threshold = per_aln_len_threshold
    self.keep_rbhs = keep_rbhs
    self.output_dir = output_dir

    # calculate length of genes and number of genes in each genome
    self.logger.info('Calculating length of genes.')
    self.gene_lengths = {}
    self.query_gene_count = defaultdict(int)
    self.target_gene_count = defaultdict(int)
    query_genomes = set()
    target_genomes = set()

    gene_sources = [(query_gene_file, self.query_gene_count, query_genomes)]
    if target_gene_file:
        gene_sources.append((target_gene_file, self.target_gene_count, target_genomes))
    else:
        # reciprocal calculation: targets are the query genomes themselves
        self.target_gene_count = self.query_gene_count

    for gene_file, gene_count, genome_ids in gene_sources:
        for seq_id, seq in seq_io.read_fasta_seq(gene_file):
            # a trailing asterisk marks the stop codon and is not counted;
            # endswith() also copes with empty sequences, unlike seq[-1]
            if seq.endswith('*'):
                self.gene_lengths[seq_id] = len(seq) - 1
            else:
                self.gene_lengths[seq_id] = len(seq)

            # gene ids have the form <gene>~<genome>... as consumed here:
            # everything before the first '~' is taken as the genome id
            # NOTE(review): if an id has no '~', find() returns -1 and the
            # last character is dropped - confirm ids always contain '~'
            genome_id = seq_id[0:seq_id.find('~')]
            gene_count[genome_id] += 1
            genome_ids.add(genome_id)

    # get byte offset of hits from each genome
    self.logger.info('Indexing sorted hit table.')
    self.offset_table = self._genome_offsets(self.sorted_hit_table)

    # calculate AAI between each pair of genomes in parallel
    if target_genomes:
        # compare query genomes to target genomes
        self.num_pairs = len(query_genomes) * len(target_genomes)
        self.logger.info('Calculating AAI between %d query and %d target genomes:'
                         % (len(query_genomes), len(target_genomes)))
    else:
        # compute pairwise values between query genomes; floor division
        # keeps the count an integer on both Python 2 and Python 3
        ng = len(query_genomes)
        self.num_pairs = (ng * ng - ng) // 2
        self.logger.info('Calculating AAI between all %d pairs of genomes:' % self.num_pairs)

    if self.num_pairs == 0:
        self.logger.warning('No genome pairs identified.')
        return

    # pair each query genome with the genomes it must be compared against
    query_genomes = list(query_genomes)
    target_genomes = list(target_genomes)
    genome_id_lists = []
    for i, genome_idI in enumerate(query_genomes):
        if target_genomes:
            genome_id_list = target_genomes
        else:
            # only later genomes, so each unordered pair is processed once
            genome_id_list = query_genomes[i + 1:]
        genome_id_lists.append((genome_idI, genome_id_list))

    self.processed_paired = 0
    parallel = Parallel(self.cpus)
    progress_func = None if self.logger.is_silent else self._progress
    consumer_data = parallel.run(self._producer, self._consumer,
                                 genome_id_lists, progress_func)

    # write results for each genome pair
    self.logger.info('Summarizing AAI results.')
    aai_summay_file = os.path.join(output_dir, 'aai_summary.tsv')
    with open(aai_summay_file, 'w') as fout:
        fout.write('Genome A\tGenes in A\tGenome B\tGenes in B\t# orthologous genes\tMean AAI\tStd AAI\tOrthologous fraction (OF)\n')
        for data in consumer_data:
            fout.write('%s\t%d\t%s\t%d\t%d\t%.2f\t%.2f\t%.2f\n' % data)

    # concatenate RBH files
    rbh_output_file = None
    if self.keep_rbhs:
        self.logger.info('Concatenating RBH files.')
        rbh_files = [os.path.join(self.output_dir, genome_id + '.rbh.tsv')
                     for genome_id in query_genomes]

        rbh_output_file = os.path.join(self.output_dir, 'rbh.tsv')
        concatenate_files(rbh_files, rbh_output_file, common_header=True)

        # per-genome files are redundant once merged
        for f in rbh_files:
            os.remove(f)

    return aai_summay_file, rbh_output_file
def run(self, query_proteins, db_file, custom_db_file, taxonomy_file,
        custom_taxonomy_file, evalue, per_identity, per_aln_len,
        max_matches, homology_search, min_per_taxa, consensus, min_per_bp,
        use_trimAl, restrict_taxon, msa_program, tree_program, prot_model,
        skip_rooting, output_dir):
    """Infer a gene tree for homologous genes identified by blast.

    Workflow for inferring a gene tree from sequences identified as being
    homologs to a set of query proteins. Homologs are identified with the
    search method given by `homology_search` and a set of user-defined
    thresholds. All results are written to `output_dir`; nothing is
    returned.

    Parameters
    ----------
    query_proteins : str
        Fasta file containing query proteins.
    db_file : str
        BLAST database of reference proteins.
    custom_db_file : str
        Custom database of proteins.
    taxonomy_file : str
        Taxonomic assignment of each reference genomes.
    custom_taxonomy_file : str
        Taxonomic assignment of genomes in custom database.
    evalue : float
        E-value threshold used to define homolog.
    per_identity : float
        Percent identity threshold used to define a homolog.
    per_aln_len : float
        Alignment length threshold used to define a homolog.
    max_matches : int
        Maximum matches per query protein.
    homology_search : str
        Type of homology search to perform. The value 'diamond' selects
        diamond; any other value is passed to blastp as its task.
    min_per_taxa : float
        Minimum percentage of taxa required to retain a column.
    consensus : float
        Minimum percentage of the same amino acid required to retain column.
    min_per_bp : float
        Minimum percentage of base pairs required to keep trimmed sequence.
    use_trimAl : boolean
        Filter columns using trimAl.
    restrict_taxon : str
        Restrict alignment to specific taxonomic group (e.g., k__Archaea).
    msa_program : str
        Program to use for multiple sequence alignment ['mafft', 'muscle'].
    tree_program : str
        Program to use for tree inference ['fasttree', 'raxml'].
    prot_model : str
        Protein substitution model for tree inference ['WAG', 'LG', 'AUTO'].
    skip_rooting : boolean
        Skip midpoint rooting if True.
    output_dir : str
        Directory to store results.

    Notes
    -----
    Terminates the process via ``sys.exit()`` if no homologs remain in the
    reference database after the (optional) taxonomic restriction.
    """

    # local import keeps this block self-contained; an argument list avoids
    # shell interpretation of the file paths passed to tax2tree
    import subprocess

    # validate query sequence names for use with GeneTreeTk
    validate_seq_ids(query_proteins)

    # read taxonomy file
    self.logger.info('Reading taxonomy file.')
    taxonomy = Taxonomy().read(taxonomy_file)

    if custom_taxonomy_file:
        custom_taxonomy = Taxonomy().read(custom_taxonomy_file)
        taxonomy.update(custom_taxonomy)

    # report distribution of query genes
    mean_len, max_len, min_len, p10, p50, p90 = self._gene_distribution(query_proteins)
    self.logger.info('Query gene lengths: min, mean, max = %d, %.1f, %d | p10, p50, p90 = %.1f, %.1f, %.1f'
                     % (min_len, mean_len, max_len, p10, p50, p90))

    # identify homologs using BLASTP
    self.logger.info('Identifying homologs using %s.' % homology_search)
    blast = Blast(self.cpus)

    def find_homologs(database, hits_file):
        # search the query proteins against `database`, write the hit
        # table to `hits_file` and return the hits passing the thresholds
        if homology_search == 'diamond':
            diamond = Diamond(self.cpus)
            diamond.blastp(query_proteins, database, evalue, per_identity,
                           per_aln_len, max_matches, hits_file,
                           output_fmt='custom')
        else:
            blast.blastp(query_proteins, database, hits_file, evalue,
                         max_matches, output_fmt='custom',
                         task=homology_search)

        return blast.identify_homologs(hits_file, evalue, per_identity, per_aln_len)

    blast_output = os.path.join(output_dir, 'reference_hits.tsv')
    homologs = find_homologs(db_file, blast_output)
    self.logger.info('Identified %d homologs in reference database.' % len(homologs))

    custom_homologs = None
    if custom_db_file:
        custom_blast_output = os.path.join(output_dir, 'custom_hits.tsv')
        custom_homologs = find_homologs(custom_db_file, custom_blast_output)
        self.logger.info('Identified %d homologs in custom database.' % len(custom_homologs))

    # restrict homologs to specific taxonomic group
    if restrict_taxon:
        self.logger.info('Restricting homologs to %s.' % restrict_taxon)
        restricted_homologs = {}
        for query_id, hit in homologs.items():
            genome_id = hit.subject_id.split('~')[0]
            # genomes without a taxonomy entry cannot be placed in the
            # requested group, so they are excluded rather than raising
            if restrict_taxon in taxonomy.get(genome_id, []):
                restricted_homologs[query_id] = hit

        self.logger.info('%d of %d homologs in reference database are from the specified group.'
                         % (len(restricted_homologs), len(homologs)))
        homologs = restricted_homologs

    if len(homologs) == 0:
        self.logger.error('Too few homologs were identified. Gene tree cannot be inferred.')
        sys.exit()

    # extract homologs
    self.logger.info('Extracting homologs and determining local gene context.')
    db_homologs_tmp = os.path.join(output_dir, 'homologs_db.tmp')
    gene_precontext, gene_postcontext = self.extract_homologs_and_context(
        homologs.keys(), db_file, db_homologs_tmp)

    # report gene length distribution of homologs
    mean_len, max_len, min_len, p10, p50, p90 = self._gene_distribution(db_homologs_tmp)
    self.logger.info('Homolog gene lengths: min, mean, max = %d, %.1f, %d | p10, p50, p90 = %.1f, %.1f, %.1f'
                     % (min_len, mean_len, max_len, p10, p50, p90))

    # concatenate homologs with initial query genes
    homolog_output_tmp = os.path.join(output_dir, 'homologs.faa.tmp')
    if custom_homologs:
        custom_db_homologs_tmp = os.path.join(output_dir, 'custom_homologs_db.tmp')
        custom_gene_precontext, custom_gene_postcontext = self.extract_homologs_and_context(
            custom_homologs.keys(), custom_db_file, custom_db_homologs_tmp)
        gene_precontext.update(custom_gene_precontext)
        gene_postcontext.update(custom_gene_postcontext)
        homologs.update(custom_homologs)
        concatenate_files([query_proteins, db_homologs_tmp, custom_db_homologs_tmp],
                          homolog_output_tmp)
        os.remove(custom_db_homologs_tmp)
    else:
        concatenate_files([query_proteins, db_homologs_tmp], homolog_output_tmp)

    os.remove(db_homologs_tmp)

    # remove stop codons
    homolog_output = os.path.join(output_dir, 'homologs.faa')
    self._remove_stop_codons(homolog_output_tmp, homolog_output)
    os.remove(homolog_output_tmp)

    # infer multiple sequence alignment
    msa = MsaWorkflow(self.cpus)
    trimmed_msa_output = msa.run(homolog_output, min_per_taxa, consensus,
                                 min_per_bp, use_trimAl, msa_program,
                                 output_dir)

    # infer tree
    tw = TreeWorkflow(self.cpus)
    tree_output = tw.run(trimmed_msa_output, tree_program, prot_model,
                         skip_rooting, output_dir)

    # create tax2tree consensus map and decorate tree
    self.logger.info('Decorating internal tree nodes with tax2tree.')
    output_taxonomy_file = os.path.join(output_dir, 'taxonomy.tsv')
    with open(output_taxonomy_file, 'w') as fout:
        for homolog_id in homologs:
            genome_id = homolog_id.split('~')[0]
            t = taxonomy.get(genome_id, None)
            if t:
                fout.write(homolog_id + '\t' + ';'.join(t) + '\n')

    t2t_tree = os.path.join(output_dir, 'homologs.tax2tree.tree')
    t2t_cmd = ['t2t', 'decorate',
               '-m', output_taxonomy_file,
               '-t', tree_output,
               '-o', t2t_tree]
    if subprocess.call(t2t_cmd) != 0:
        self.logger.warning('tax2tree (t2t decorate) returned a non-zero exit status.')

    # create tree with leaf nodes given as genome accessions
    tree = dendropy.Tree.get_from_path(t2t_tree,
                                       schema='newick',
                                       rooting='force-rooted',
                                       preserve_underscores=True)

    for leaf in tree.leaf_node_iter():
        leaf.taxon.label = leaf.taxon.label.split('~')[0]

    genome_tree = os.path.join(output_dir, 'homologs.tax2tree.genome_accessions.tree')
    tree.write_to_path(genome_tree,
                       schema='newick',
                       suppress_rooting=True,
                       unquoted_underscores=True)

    # setup metadata for ARB file; the version string is read from the
    # VERSION file located next to this source file
    src_dir = os.path.dirname(os.path.realpath(__file__))
    with open(os.path.join(src_dir, 'VERSION')) as version_file:
        genetreetk_version = version_file.read().strip()

    metadata = {
        'genetreetk_version': genetreetk_version,
        'genetreetk_query_proteins': query_proteins,
        'genetreetk_db_file': db_file,
        'genetreetk_taxonomy_file': taxonomy_file,
        'genetreetk_blast_evalue': str(evalue),
        'genetreetk_blast_per_identity': str(per_identity),
        'genetreetk_blast_per_aln_len': str(per_aln_len),
        'genetreetk_blast_max_matches': str(max_matches),
        'genetreetk_homology_search': homology_search,
        'genetreetk_msa_min_per_taxa': str(min_per_taxa),
        'genetreetk_msa_consensus': str(consensus),
        'genetreetk_msa_min_per_bp': str(min_per_bp),
        'genetreetk_msa_program': msa_program,
        'genetreetk_tree_program': tree_program,
        'genetreetk_tree_prot_model': prot_model,
    }

    # create ARB metadata file
    self.logger.info('Creating ARB metadata file.')
    arb_metadata_file = os.path.join(output_dir, 'arb.metadata.txt')
    self.create_arb_metadata(homologs, trimmed_msa_output, taxonomy,
                             metadata,
                             gene_precontext, gene_postcontext,
                             arb_metadata_file)
def run(self, query_gene_file, target_gene_file, sorted_hit_table,
        evalue_threshold, per_iden_threshold, per_aln_len_threshold,
        keep_rbhs, output_dir):
    """Calculate amino acid identity (AAI) between pairs of genomes.

    Parameters
    ----------
    query_gene_file : str
        File with all query genes in FASTA format.
    target_gene_file : str or None
        File with all target genes in FASTA format, or
        None if performing a reciprocal AAI calculation.
    sorted_hit_table : str
        Sorted table indicating genes with sequence similarity.
    evalue_threshold : float
        Evalue threshold used to define a homologous gene.
    per_iden_threshold : float
        Percent identity threshold used to define a homologous gene.
    per_aln_len_threshold : float
        Alignment length threshold used to define a homologous gene.
    keep_rbhs : boolean
        Flag indicating if RBH should be written to file.
    output_dir : str
        Directory to store AAI results.

    Returns
    -------
    tuple of (str, str or None), or None
        Path to the AAI summary table and path to the concatenated RBH
        table (None unless `keep_rbhs` is set). If no genome pairs are
        identified, a warning is logged and None is returned instead of
        a tuple.
    """

    self.sorted_hit_table = sorted_hit_table
    self.evalue_threshold = evalue_threshold
    self.per_identity_threshold = per_iden_threshold
    self.per_aln_len_threshold = per_aln_len_threshold
    self.keep_rbhs = keep_rbhs
    self.output_dir = output_dir

    # calculate length of genes and number of genes in each genome
    self.logger.info('Calculating length of genes.')
    self.gene_lengths = {}
    self.query_gene_count = defaultdict(int)
    self.target_gene_count = defaultdict(int)
    query_genomes = set()
    target_genomes = set()

    # (FASTA file, per-genome gene counter, set of genome ids) to populate
    inputs = [(query_gene_file, self.query_gene_count, query_genomes)]
    if target_gene_file:
        inputs.append((target_gene_file, self.target_gene_count, target_genomes))
    else:
        # reciprocal calculation: targets are the query genomes themselves
        self.target_gene_count = self.query_gene_count

    for fasta_file, counts, genomes in inputs:
        for seq_id, seq in seq_io.read_fasta_seq(fasta_file):
            # a trailing asterisk marks the stop codon and is not counted;
            # endswith() also copes with empty sequences, unlike seq[-1]
            has_stop = seq.endswith('*')
            self.gene_lengths[seq_id] = len(seq) - 1 if has_stop else len(seq)

            # everything before the first '~' is taken as the genome id
            # NOTE(review): if an id has no '~', find() returns -1 and the
            # last character is dropped - confirm ids always contain '~'
            genome_id = seq_id[0:seq_id.find('~')]
            counts[genome_id] += 1
            genomes.add(genome_id)

    # get byte offset of hits from each genome
    self.logger.info('Indexing sorted hit table.')
    self.offset_table = self._genome_offsets(self.sorted_hit_table)

    # calculate AAI between each pair of genomes in parallel
    if target_genomes:
        # compare query genomes to target genomes
        self.num_pairs = len(query_genomes) * len(target_genomes)
        self.logger.info('Calculating AAI between %d query and %d target genomes:'
                         % (len(query_genomes), len(target_genomes)))
    else:
        # compute pairwise values between query genomes; floor division is
        # required so the pair count stays an integer under Python 3, where
        # '/' would produce a float
        ng = len(query_genomes)
        self.num_pairs = (ng * ng - ng) // 2
        self.logger.info('Calculating AAI between all %d pairs of genomes:' % self.num_pairs)

    if self.num_pairs == 0:
        self.logger.warning('No genome pairs identified.')
        return

    # pair each query genome with the genomes it must be compared against
    query_genomes = list(query_genomes)
    target_genomes = list(target_genomes)
    genome_id_lists = []
    for i, genome_idI in enumerate(query_genomes):
        if target_genomes:
            genome_id_list = target_genomes
        else:
            # only later genomes, so each unordered pair is processed once
            genome_id_list = query_genomes[i + 1:]
        genome_id_lists.append((genome_idI, genome_id_list))

    self.processed_paired = 0
    parallel = Parallel(self.cpus)
    progress_func = None if self.logger.is_silent else self._progress
    consumer_data = parallel.run(self._producer, self._consumer,
                                 genome_id_lists, progress_func)

    # write results for each genome pair
    self.logger.info('Summarizing AAI results.')
    aai_summay_file = os.path.join(output_dir, 'aai_summary.tsv')
    with open(aai_summay_file, 'w') as fout:
        fout.write('#Genome A\tGenes in A\tGenome B\tGenes in B\t# orthologous genes\tMean AAI\tStd AAI\tOrthologous fraction (OF)\n')
        for data in consumer_data:
            fout.write('%s\t%d\t%s\t%d\t%d\t%.2f\t%.2f\t%.2f\n' % data)

    # concatenate RBH files
    rbh_output_file = None
    if self.keep_rbhs:
        self.logger.info('Concatenating RBH files.')
        rbh_files = [os.path.join(self.output_dir, genome_id + '.rbh.tsv')
                     for genome_id in query_genomes]

        rbh_output_file = os.path.join(self.output_dir, 'rbh.tsv')
        concatenate_files(rbh_files, rbh_output_file, common_header=True)

        # per-genome files are redundant once merged
        for f in rbh_files:
            os.remove(f)

    return aai_summay_file, rbh_output_file