def list_genomes_dir(userdir):
    """List fasta files in a specified directory.

    Parameters
    ----------
    userdir : str
        Directory path containing the fasta files.

    Returns
    -------
    dict
        Dictionary indicating the genomic file for each genome.
    """
    if not os.path.exists(userdir):
        raise ValueError('{0} does not exist.'.format(userdir))

    onlygenomefiles = {f: os.path.join(userdir, f)
                       for f in os.listdir(userdir)
                       if os.path.isfile(os.path.join(userdir, f))}

    # verify that every file in the directory can be parsed as fasta
    for potential_file in onlygenomefiles:
        try:
            read_fasta(os.path.join(userdir, potential_file))
        except Exception:
            raise IOError("{0} is not a fasta file.".format(
                os.path.join(userdir, potential_file)))

    return onlygenomefiles
def _producer(self, gene_file):
    """Calculates codon usage of a genome.

    This function is intended to be used as a producer within a
    producer/consumer multiprocessing framework. It calculates the
    codon usage for a single genome and returns the results for
    consumption by the consumer function.

    Parameters
    ----------
    gene_file : str
        Fasta file containing nucleotide sequences.

    Returns
    -------
    str
        Unique identifier of genome.
    dict : d[codon] -> count
        Occurrence of each codon.
    dict : d[codon] -> length
        Average length of genes for a given stop codon.
    """
    genome_id = ntpath.basename(gene_file)
    genome_id = genome_id.replace('.genes.fna', '')
    genome_id = os.path.splitext(genome_id)[0]

    seqs = seq_io.read_fasta(gene_file)
    codon_usage, gene_length = self.codon_usage(seqs)

    return (genome_id, codon_usage, gene_length)
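# Illustrative sketch only: codon_usage() above belongs to the surrounding class and its
# implementation is not shown here. Under the assumption that it tallies codon occurrences
# across all gene sequences of a genome, a minimal standalone version could look like this.
from collections import defaultdict

def simple_codon_counts(seqs):
    """Count codon occurrences across a dict of nucleotide gene sequences."""
    counts = defaultdict(int)
    for seq in seqs.values():
        seq = seq.upper()
        for i in range(0, len(seq) - len(seq) % 3, 3):
            counts[seq[i:i + 3]] += 1
    return counts

# e.g. simple_codon_counts({'gene1': 'ATGAAATAG'}) -> {'ATG': 1, 'AAA': 1, 'TAG': 1}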
def _msa_filter_by_taxa(self, concatenated_file, gtdb_taxonomy, taxa_filter, outgroup_taxon):
    """Filter GTDB MSA to genomes from the specified taxa."""

    msa = read_fasta(concatenated_file)
    self.logger.info('Read concatenated alignment for %d GTDB genomes.' % len(msa))

    if taxa_filter is not None:
        taxa_to_keep = set(taxa_filter.split(','))
        if outgroup_taxon not in taxa_to_keep and outgroup_taxon is not None:
            taxa_to_keep.add(outgroup_taxon)

        filtered_genomes = 0
        for genome_id, taxa in gtdb_taxonomy.iteritems():
            common_taxa = taxa_to_keep.intersection(taxa)
            if len(common_taxa) == 0:
                if genome_id in msa:
                    del msa[genome_id]
                    filtered_genomes += 1
        self.logger.info('Filtered %d taxa based on assigned taxonomy.' % filtered_genomes)

    return msa
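# Toy example (not from the package) of the taxa-filter rule applied above: a genome is
# kept only if its GTDB taxonomy shares at least one taxon with the requested filter
# (plus the optional outgroup taxon).
def keep_genome(taxa, taxa_filter, outgroup_taxon=None):
    taxa_to_keep = set(taxa_filter.split(','))
    if outgroup_taxon is not None:
        taxa_to_keep.add(outgroup_taxon)
    return len(taxa_to_keep.intersection(taxa)) > 0

# keep_genome(['d__Bacteria', 'p__Proteobacteria'], 'p__Proteobacteria')  -> True
# keep_genome(['d__Archaea', 'p__Crenarchaeota'], 'p__Proteobacteria')    -> False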
def create_concatenated_alignment(genome_ids, marker_genes, alignment_dir,
                                  concatenated_alignment_file, marker_file):
    """Create concatenated multiple sequence alignment for all genomes.

    Parameters
    ----------
    genome_ids : iterable
        Genomes of interest.
    marker_genes : iterable
        Unique ids of marker genes.
    alignment_dir : str
        Directory containing multiple sequence alignments.
    concatenated_alignment_file : str
        File containing the concatenated alignment.
    marker_file : str
        File indicating length of each marker in the alignment.
    """

    # Read alignment files. Some genomes may have multiple
    # copies of a marker gene in which case the last one
    # is arbitrarily taken. This is acceptable as all genes
    # have already been screened to be conspecific.
    alignments = defaultdict(dict)
    marker_length = {}
    for mg in marker_genes:
        f = mg + '.aln.masked.faa'
        seqs = seq_io.read_fasta(os.path.join(alignment_dir, f))

        for seq_id, seq in seqs.iteritems():
            genome_id = seq_id[0:seq_id.find(DefaultValues.SEQ_CONCAT_CHAR)]
            alignments[mg][genome_id] = seq
            marker_length[mg] = len(seq)

    # create marker file
    fout = open(marker_file, 'w')
    for mg in marker_genes:
        fout.write('%s\t%s\t%s\t%d\n' % (mg, mg, mg, marker_length[mg]))
    fout.close()

    # create concatenated alignment
    concatenated_seqs = {}
    for mg in marker_genes:
        seqs = alignments[mg]

        for genome_id in genome_ids:
            if genome_id in seqs:
                # append alignment
                concatenated_seqs[genome_id] = concatenated_seqs.get(genome_id, '') + seqs[genome_id]
            else:
                # missing gene
                concatenated_seqs[genome_id] = concatenated_seqs.get(genome_id, '') + '-' * marker_length[mg]

    # save concatenated alignment
    seq_io.write_fasta(concatenated_seqs, concatenated_alignment_file)
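# Toy illustration (assumed behaviour, hypothetical helper) of the concatenation rule in
# create_concatenated_alignment(): genomes missing a marker are padded with '-' so every
# concatenated sequence ends up the same length.
def concat_markers(genome_ids, alignments, marker_lengths):
    concatenated = {}
    for mg in marker_lengths:
        for gid in genome_ids:
            seq = alignments.get(mg, {}).get(gid, '-' * marker_lengths[mg])
            concatenated[gid] = concatenated.get(gid, '') + seq
    return concatenated

# concat_markers(['g1', 'g2'],
#                {'mgA': {'g1': 'MKL'}, 'mgB': {'g1': 'VW', 'g2': 'VW'}},
#                {'mgA': 3, 'mgB': 2})
# -> {'g1': 'MKLVW', 'g2': '---VW'}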
def _read_seq_ids(self, bin_files):
    """Read sequence IDs of all bin files."""
    bins = {}
    for bin_file in bin_files:
        bin_id = self.bin_id_from_filename(bin_file)
        bins[bin_id] = set(read_fasta(bin_file).keys())
    return bins
def _parse_sequence_file(self, fna_file, prefix, ssu_query_id):
    metadata = []
    all_genes_dict = read_fasta(fna_file, False)
    sequence = all_genes_dict[ssu_query_id]
    if prefix == 'lsu_silva_23s':
        metadata.append(('lsu_23s_sequence', sequence))
    elif prefix == 'ssu_silva':
        metadata.append(('ssu_sequence', sequence))
    else:
        metadata.append(('{0}_sequence'.format(prefix), sequence))
    return metadata
def run(self, msa, mask, outf):
    outfwriter = open(outf, 'w')
    dict_genomes = read_fasta(msa, False)
    with open(mask, 'r') as f:
        maskstr = f.readline()
    print maskstr
    print len(maskstr)
    for k, v in dict_genomes.iteritems():
        # keep only the alignment columns flagged with '1' in the mask
        aligned_seq = ''.join([v[i] for i in xrange(0, len(maskstr)) if maskstr[i] == '1'])
        fasta_outstr = ">%s\n%s\n" % (k, aligned_seq)
        outfwriter.write(fasta_outstr)
    outfwriter.close()
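# Minimal sketch of the masking step used in run() above, assuming the mask is a string of
# '0'/'1' characters with one character per alignment column.
def apply_mask(aligned_seq, maskstr):
    return ''.join(ch for ch, keep in zip(aligned_seq, maskstr) if keep == '1')

# apply_mask('MK-LV', '11010') -> 'MKL'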
def _producer(self, gene_file):
    """Calculates dinucleotide usage statistics of a genome.

    Parameters
    ----------
    gene_file : str
        Fasta file containing nucleotide sequences.
    """
    genome_id = ntpath.basename(gene_file)
    genome_id = genome_id.replace('.genes.fna', '')
    genome_id = os.path.splitext(genome_id)[0]

    seqs = seq_io.read_fasta(gene_file)
    self.dinucleotide_usage(seqs, genome_id)

    return True
def run(self, msa, mask, marker_list, taxonomy_file, metadata_file, output):
    dict_marker = {}
    print "readmsa"
    dict_genomes = read_fasta(msa, False)
    print len(dict_genomes)
    sub_list_genomes = self.selectGenomes(dict_genomes, taxonomy_file, metadata_file)
    print len(sub_list_genomes)

    with open(mask, 'r') as f:
        maskstr = f.readline()

    with open(marker_list, 'r') as f:
        f.readline()  # skip header line
        for line in f:
            list_info = line.split("\t")
            dict_marker[list_info[0]] = int(list_info[3])

    new_mask, output_seqs = self.trim_seqs(dict_genomes, sub_list_genomes, maskstr, dict_marker)

    if not os.path.exists(output):
        os.makedirs(output)

    # write mask
    mask_file = open(os.path.join(output, "mask.txt"), 'w')
    mask_file.write(''.join([str(n) for n in new_mask]))
    mask_file.close()

    # write MSA
    trimmed_file = open(os.path.join(output, "trimmed_sequences.fa"), 'w')
    nbr_aa_seqs = open(os.path.join(output, "number_AA_genomes.tsv"), 'w')
    for genome_id, aligned_seq in output_seqs.iteritems():
        fasta_outstr = ">%s\n%s\n" % (genome_id, aligned_seq)
        trimmed_file.write(fasta_outstr)
        lenaa = len(aligned_seq.replace('-', ''))  # number of non-gap positions
        len_outstr = "%s\t%s\t%s\n" % (genome_id, lenaa, len(aligned_seq))
        nbr_aa_seqs.write(len_outstr)
    trimmed_file.close()
    nbr_aa_seqs.close()
def _parse_lsu_5S_files(self, accession, fout, fna_file, summary_file): """Parse information from 5S LSU files.""" # check if a 5S sequence was identified if not os.path.exists(fna_file): return 0 # write header if self.write_lsu_5S_header: fout.write( 'genome_id\tlsu_5s_query_id\tlsu_5s_length\tlsu_5s_contig_len\tlsu_5s_sequence\n' ) self.write_lsu_5S_header = False seqs = read_fasta(fna_file) identified_genes = 0 longest_seq = 0 longest_seq_id = None longest_contig_len = None if os.path.exists(summary_file): with open(summary_file) as fsum: header_line = fsum.readline() # consume header line header_list = [x.strip() for x in header_line.split('\t')] idx_seq_len = header_list.index("Sequence length") for line in fsum: identified_genes += 1 line_split = list(map(str.strip, line.strip().split('\t'))) seq_id = line_split[0] contig_len = int(line_split[idx_seq_len]) seq_len = len(seqs[seq_id]) if seq_len > longest_seq: longest_seq_id = seq_id longest_seq = seq_len longest_contig_len = contig_len if longest_seq_id: fout.write('%s\t%s\t%d\t%d\t%s\n' % (accession, longest_seq_id, longest_seq, longest_contig_len, seqs[longest_seq_id])) return identified_genes
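# Sketch (illustrative only) of the "report the longest identified gene" rule used in
# _parse_lsu_5S_files(): among all sequences parsed from the fasta file, the hit with the
# greatest sequence length is the one written to the metadata table.
def longest_hit(seqs):
    """Return the id of the longest sequence, or None if seqs is empty."""
    if not seqs:
        return None
    return max(seqs, key=lambda seq_id: len(seqs[seq_id]))

# longest_hit({'5S_a': 'ACGT' * 25, '5S_b': 'ACGT' * 30}) -> '5S_b'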
def trim_msa(self, untrimmed_msa, mask_type, maskid, output_file):
    if maskid == 'bac' and mask_type == 'reference':
        mask = os.path.join(Config.MASK_DIR, Config.MASK_BAC120)
    elif maskid == 'arc' and mask_type == 'reference':
        mask = os.path.join(Config.MASK_DIR, Config.MASK_AR122)
    elif mask_type == 'file':
        mask = maskid

    with open(mask, 'r') as f:
        maskstr = f.readline()

    outfwriter = open(output_file, 'w')
    dict_genomes = read_fasta(untrimmed_msa, False)
    for k, v in dict_genomes.iteritems():
        aligned_seq = ''.join([v[i] for i in xrange(0, len(maskstr)) if maskstr[i] == '1'])
        fasta_outstr = ">%s\n%s\n" % (k, aligned_seq)
        outfwriter.write(fasta_outstr)
    outfwriter.close()
    return True
def run(self, msa_file, constraint_dir, outfile):
    msa_dict = read_fasta(msa_file)
    outdict = dict((key, []) for key in msa_dict.iterkeys())
    onlyfiles = [os.path.join(constraint_dir, f)
                 for f in os.listdir(constraint_dir)
                 if os.path.isfile(os.path.join(constraint_dir, f))]
    for constraintfile in onlyfiles:
        constraintlist = []
        with open(constraintfile) as f:
            for line in f:
                constraintlist.append(line.strip())
        # record presence ('1') or absence ('0') of each genome in this constraint file
        for k, v in outdict.iteritems():
            if k in constraintlist:
                outdict.get(k).append('1')
            else:
                outdict.get(k).append('0')
    outf = open(outfile, 'w')
    for outk, outval in outdict.iteritems():
        outf.write(">{0}\n{1}\n".format(outk, ''.join(outval)))
    outf.close()
def _producer(self, genome_file):
    """Calculates kmer usage of a genome.

    Parameters
    ----------
    genome_file : str
        Fasta file containing genomic sequences.

    Returns
    -------
    str
        Unique identifier of genome.
    dict : d[kmer] -> count
        Occurrence of each kmer.
    """
    genome_id = ntpath.basename(genome_file)
    genome_id = os.path.splitext(genome_id)[0]

    seqs = seq_io.read_fasta(genome_file)
    kmer_usage = self.signatures.counts(seqs)

    return (genome_id, kmer_usage)
def _producer(self, genome_file):
    """Calculates kmer usage of a genome.

    Parameters
    ----------
    genome_file : str
        Fasta file containing genomic sequences.

    Returns
    -------
    str
        Unique identifier of genome.
    dict : d[kmer] -> count
        Occurrence of each kmer.
    """
    genome_id = ntpath.basename(genome_file)
    genome_id = os.path.splitext(genome_id)[0]

    seqs = seq_io.read_fasta(genome_file)
    kmer_usage = self.signatures.calculate(seqs)

    return (genome_id, kmer_usage)
def run(self, msa_file, marker_list):
    """Randomly select a subset of columns from the MSA of each marker."""

    # read multiple sequence alignment
    self.logger.info('Reading multiple sequence alignment.')
    msa = read_fasta(msa_file, False)
    self.logger.info('Read MSA for %d genomes.' % len(msa))

    filtered_seqs, pruned_seqs = self.trim(msa, marker_list)
    self.logger.info('Removed %d taxa with amino acids in <%.1f%% of columns in filtered MSA.'
                     % (len(pruned_seqs), self.min_perc_aa))

    # write out trimmed sequences
    filter_file = open(os.path.join(self.output_dir, "filtered_msa.faa"), 'w')
    for gid, seq in filtered_seqs.items():
        fasta_outstr = ">%s\n%s\n" % (gid, seq)
        filter_file.write(fasta_outstr)
    filter_file.close()

    self.logger.info('Done.')
def _producer(self, gene_file):
    """Calculates amino acid usage of a genome.

    Parameters
    ----------
    gene_file : str
        Fasta file containing amino acid sequences.

    Returns
    -------
    str
        Unique identifier of genome.
    dict : d[aa] -> count
        Occurrence of each amino acid.
    """
    genome_id = ntpath.basename(gene_file)
    genome_id = genome_id.replace('.genes.faa', '')
    genome_id = os.path.splitext(genome_id)[0]

    seqs = seq_io.read_fasta(gene_file)
    aa_usage = self.amino_acid_usage(seqs)

    return [genome_id, aa_usage]
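# Sketch (assumption, not the library code) of what amino_acid_usage() computes above:
# the occurrence of each amino acid across all gene sequences of a genome. Prodigal's
# trailing '*' stop characters are ignored here.
from collections import Counter

def simple_aa_usage(seqs):
    counts = Counter()
    for seq in seqs.values():
        counts.update(seq.replace('*', ''))
    return counts

# simple_aa_usage({'gene1': 'MKV*', 'gene2': 'MK*'}) -> Counter({'M': 2, 'K': 2, 'V': 1})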
def _runHmmMultiAlign(self, db_genome_id, path, marker_ids): ''' Selects markers that are not aligned for a specific genome. :param db_genome_id: Selected genome :param path: Path to the genomic fasta file for the genome :param marker_ids: list of marker ids for the selected sets ''' temp_con = GenomeDatabaseConnection() temp_con.MakePostgresConnection() temp_cur = temp_con.cursor() # gather information for all marker genes final_genome = [] final_markerid = [] final_seq = [] final_multihits = [] final_evalue = [] final_bitscore = [] marker_dbs = {"PFAM": self.pfam_top_hit_suffix, "TIGR": self.tigrfam_top_hit_suffix} for marker_db, marker_suffix in marker_dbs.iteritems(): query = ("SELECT m.id_in_database,m.marker_file_location,m.size,m.id " + "FROM genomes as g, markers as m " + "LEFT JOIN marker_databases as md " + "ON md.id=m.marker_database_id " + "WHERE NOT EXISTS (" + "SELECT * FROM aligned_markers as am " + "WHERE am.genome_id = g.id and am.marker_id = m.id) " + "AND g.id = %s " + "AND m.id in %s " + "AND md.external_id_prefix like %s") temp_cur.execute( query, (db_genome_id, tuple(marker_ids,), marker_db)) raw_results = temp_cur.fetchall() marker_dict_original = { a: {"path": b, "size": c, "db_marker_id": d} for a, b, c, d in raw_results} # get all gene sequences genome_path = str(path) tophit_path = genome_path.replace( self.protein_file_suffix, marker_suffix) # we load the list of all the genes detected in the genome protein_file = tophit_path.replace( marker_suffix, self.protein_file_suffix) all_genes_dict = read_fasta(protein_file, False) # we store the tophit file line by line and store the # information in a dictionary with open(tophit_path) as tp: # first line is header line tp.readline() gene_dict = {} for line_tp in tp: linelist = line_tp.split("\t") genename = linelist[0] sublist = linelist[1] if ";" in sublist: diff_markers = sublist.split(";") else: diff_markers = [sublist] for each_gene in diff_markers: sublist = each_gene.split(",") markerid = sublist[0] if markerid not in marker_dict_original: continue evalue = sublist[1] bitscore = sublist[2].strip() if markerid in gene_dict: oldbitscore = gene_dict.get( markerid).get("bitscore") if oldbitscore < bitscore: gene_dict[markerid] = {"marker_path": marker_dict_original.get(markerid).get("path"), "gene": genename, "gene_seq": all_genes_dict.get(genename), "evalue": evalue, "bitscore": bitscore, "multihit": True, "db_marker_id": marker_dict_original.get(markerid).get("db_marker_id")} else: gene_dict.get(markerid)["multihit"] = True else: gene_dict[markerid] = {"marker_path": marker_dict_original.get(markerid).get("path"), "gene": genename, "gene_seq": all_genes_dict.get(genename), "evalue": evalue, "bitscore": bitscore, "multihit": False, "db_marker_id": marker_dict_original.get(markerid).get("db_marker_id")} for mid, info in marker_dict_original.iteritems(): if mid not in gene_dict: final_genome.append(db_genome_id) final_markerid.append(info.get("db_marker_id")) final_seq.append("-" * info.get("size")) final_multihits.append(False) final_evalue.append(None) final_bitscore.append(None) result_aligns = self._runHmmAlign(gene_dict, db_genome_id) for result_align in result_aligns: final_genome.append(result_align[0]) final_markerid.append(result_align[1]) final_seq.append(result_align[2]) final_multihits.append(result_align[3]) final_evalue.append(result_align[4]) final_bitscore.append(result_align[5]) if final_genome: query = "SELECT upsert_aligned_markers(%s,%s,%s,%s,%s,%s)" temp_cur.execute(query, (final_genome, 
final_markerid, final_seq, final_multihits, final_evalue, final_bitscore)) temp_con.commit() temp_cur.close() temp_con.ClosePostgresConnection() return True
def _producer(self, genome_file): """Apply prodigal to genome with most suitable translation table. Parameters ---------- genome_file : queue Fasta file for genome. """ genome_id = remove_extension(genome_file) aa_gene_file = os.path.join(self.output_dir, genome_id + '_genes.faa') nt_gene_file = os.path.join(self.output_dir, genome_id + '_genes.fna') gff_file = os.path.join(self.output_dir, genome_id + '.gff') best_translation_table = -1 table_coding_density = {4:-1, 11:-1} if self.called_genes: os.system('cp %s %s' % (os.path.abspath(genome_file), aa_gene_file)) else: tmp_dir = tempfile.mkdtemp() seqs = read_fasta(genome_file) # determine number of bases total_bases = 0 for seq in seqs.values(): total_bases += len(seq) # call genes under different translation tables if self.translation_table: translation_tables = [self.translation_table] else: translation_tables = [4, 11] for translation_table in translation_tables: os.makedirs(os.path.join(tmp_dir, str(translation_table))) aa_gene_file_tmp = os.path.join(tmp_dir, str(translation_table), genome_id + '_genes.faa') nt_gene_file_tmp = os.path.join(tmp_dir, str(translation_table), genome_id + '_genes.fna') gff_file_tmp = os.path.join(tmp_dir, str(translation_table), genome_id + '.gff') # check if there is sufficient bases to calculate prodigal parameters if total_bases < 100000 or self.meta: proc_str = 'meta' # use best precalculated parameters else: proc_str = 'single' # estimate parameters from data args = '-m' if self.closed_ends: args += ' -c' cmd = 'prodigal %s -p %s -q -f gff -g %d -a %s -d %s -i %s > %s 2> /dev/null' % (args, proc_str, translation_table, aa_gene_file_tmp, nt_gene_file_tmp, genome_file, gff_file_tmp) os.system(cmd) # determine coding density prodigalParser = ProdigalGeneFeatureParser(gff_file_tmp) codingBases = 0 for seq_id, _seq in seqs.items(): codingBases += prodigalParser.coding_bases(seq_id) codingDensity = float(codingBases) / total_bases table_coding_density[translation_table] = codingDensity # determine best translation table if not self.translation_table: best_translation_table = 11 if (table_coding_density[4] - table_coding_density[11] > 0.05) and table_coding_density[4] > 0.7: best_translation_table = 4 else: best_translation_table = self.translation_table shutil.copyfile(os.path.join(tmp_dir, str(best_translation_table), genome_id + '_genes.faa'), aa_gene_file) shutil.copyfile(os.path.join(tmp_dir, str(best_translation_table), genome_id + '_genes.fna'), nt_gene_file) shutil.copyfile(os.path.join(tmp_dir, str(best_translation_table), genome_id + '.gff'), gff_file) # clean up temporary files shutil.rmtree(tmp_dir) return (genome_id, aa_gene_file, nt_gene_file, gff_file, best_translation_table, table_coding_density[4], table_coding_density[11])
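# Sketch of the translation-table decision rule used in _producer() above (assumption:
# densities are fractions in [0, 1]): table 4 is only preferred when it clearly improves
# the coding density over table 11.
def pick_translation_table(density_tb4, density_tb11):
    if (density_tb4 - density_tb11 > 0.05) and density_tb4 > 0.7:
        return 4
    return 11

# pick_translation_table(0.90, 0.80) -> 4
# pick_translation_table(0.72, 0.70) -> 11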
def _run_hmm_align(self, genome_ids, genome_dirs, genes_in_genomes, ignore_multi_copy, output_msa_dir, output_model_dir, queue_in, queue_out): """Run each marker gene in a separate thread. Only the gene with the highest bitscore is used for genomes with multiple hits to a given protein family. Parameters ---------- genome_ids : iterable Genomes of interest. genome_dirs : d[assembly_accession] -> directory Path to files for individual genomes. genes_in_genomes : d[genome_id][family_id] -> [(gene_id_1, bitscore), ..., (gene_id_N, bitscore)] Genes within each genome. ignore_multi_copy : bool Flag indicating if genes with multiple hits should be ignored (True) or the gene with the highest bitscore taken (False). output_msa_dir : str Output directory for multiple sequence alignment. output_model_dir : str Output directory for HMMs. queue_in : Queue Input queue for parallel processing. queue_out : Queue Output queue for parallel processing. """ while True: marker_id = queue_in.get(block=True, timeout=None) if marker_id == None: break marker_seq_file = os.path.join(output_msa_dir, marker_id + '.faa') fout = open(marker_seq_file, 'w') for genome_id in genome_ids: genome_dir = genome_dirs[genome_id] assembly = genome_dir[genome_dir.rfind('/') + 1:] genes_file = os.path.join(genome_dir, assembly + self.protein_file_ext) seqs = seq_io.read_fasta(genes_file) hits = genes_in_genomes[genome_id].get(marker_id, None) if not hits or (ignore_multi_copy and len(hits) > 1): continue # get gene with highest bitscore hits.sort(key=lambda x: x[1], reverse=True) gene_id, _bitscore = hits[0] fout.write('>' + genome_id + DefaultValues.SEQ_CONCAT_CHAR + gene_id + '\n') fout.write(seqs[gene_id] + '\n') fout.close() hmmer = HMMER('align') hmmer.align(os.path.join(output_model_dir, marker_id + '.hmm'), marker_seq_file, os.path.join(output_msa_dir, marker_id + '.aln.faa'), trim=False, outputFormat='Pfam') self._mask_alignment(os.path.join(output_msa_dir, marker_id + '.aln.faa'), os.path.join(output_msa_dir, marker_id + '.aln.masked.faa')) queue_out.put(marker_id)
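# Illustrative sketch of the "highest bitscore wins" selection in _run_hmm_align(): when a
# genome has several hits to the same marker and multi-copy genes are not ignored, only the
# hit with the highest bitscore is aligned.
def best_hit(hits):
    """hits: list of (gene_id, bitscore) tuples; returns the gene_id with the highest bitscore."""
    return max(hits, key=lambda hit: hit[1])[0] if hits else None

# best_hit([('gene_3', 85.2), ('gene_7', 112.6)]) -> 'gene_7'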
def run(self, dirin, dirout, gtr, release): """ renaming genome files for fastani""" # get list of genomes to retain (based on genome list 1014) genomes_to_retain = set() with open(gtr) as f: # f.readline() for line in f: line_split = line.strip().split('\t') genomes_to_retain.add(line_split[0]) print('Genome to retain: %d' % len(genomes_to_retain)) # get mapping from published UBA genomes to NCBI accessions __location__ = os.path.realpath( os.path.join(os.getcwd(), os.path.dirname(__file__))) uba_acc = {} with open(os.path.join(__location__, 'uba_ncbi_accessions.tsv')) as ub: for line in ub: line_split = line.strip().split('\t') if line_split[2] != "None": uba_acc[line_split[0]] = { "uba": line_split[1], "gca": 'GB_' + line_split[2] } else: uba_acc[line_split[0]] = {"uba": line_split[1]} # renaming taxonomy: taxout = open(os.path.join(dirout, 'gtdb_taxonomy.tsv'), 'w') with open(os.path.join(dirin, 'gtdb_taxonomy.tsv')) as gt: for line in gt: info = line.strip().split("\t") if info[0] in genomes_to_retain: if info[0].startswith("U_"): subdict = uba_acc.get(info[0]) if "gca" in subdict.keys(): taxout.write("{0}\t{1}\n".format( subdict.get("gca"), info[1])) else: taxout.write("{0}\t{1}\n".format( subdict.get("uba"), info[1])) else: taxout.write(line) taxout.close() # renaming genome files for fastani fastanis = glob.glob(os.path.join(dirin, 'fastani', "*")) fastani_dir = os.path.join(dirout, 'fastani') if not os.path.exists(fastani_dir): os.makedirs(fastani_dir) for genome in fastanis: filenamef = os.path.basename(genome) filenamef = filenamef.replace("_genomic.fna", "") if filenamef.startswith("U_"): subdict = uba_acc.get(filenamef) if filenamef == "U_74684": print(subdict) print(genome) print( os.path.join(fastani_dir, subdict.get("gca")[3:] + "_genomic.fna")) if "gca" in subdict.keys(): copyfile( genome, os.path.join(fastani_dir, subdict.get("gca")[3:] + "_genomic.fna")) else: copyfile( genome, os.path.join(fastani_dir, subdict.get("uba") + "_genomic.fna")) else: copyfile(genome, os.path.join(fastani_dir, filenamef + "_genomic.fna")) for dom in ['bac120', 'ar122']: # MSA renaming msadir = os.path.join(dirout, dom, 'msa') if not os.path.exists(msadir): os.makedirs(msadir) msa_dict = read_fasta( os.path.join(dirin, dom, 'gtdb_concatenated.faa')) seqout = open( os.path.join(msadir, 'gtdb_r' + release + '_' + dom + '.faa'), 'w') for id, seq in msa_dict.items(): if id in genomes_to_retain: if id.startswith("U_"): subdict = uba_acc.get(id) if "gca" in subdict.keys(): seqout.write(">{0}\n{1}\n".format( subdict.get("gca"), seq)) else: seqout.write(">{0}\n{1}\n".format( subdict.get("uba"), seq)) else: seqout.write(">{0}\n{1}\n".format(id, seq)) seqout.close() # PPLACER renaming pplacerdir = os.path.join(dirout, dom, 'pplacer') if not os.path.exists(pplacerdir): os.makedirs(pplacerdir) trees = glob.glob(os.path.join(dirin, dom, 'pplacer', "*.tree")) if len(trees) != 1: print("Error") sys.exit() else: treef = trees[0] fastas = glob.glob(os.path.join(dirin, dom, 'pplacer', "*.fa")) if len(fastas) != 1: print("Error") sys.exit() else: seqfile = fastas[0] logs = glob.glob(os.path.join(dirin, dom, 'pplacer', "*.log")) if len(logs) != 1: print("Error") sys.exit() else: logfile = logs[0] # produce corrected tree tree = dendropy.Tree.get_from_path(os.path.join(treef), schema='newick', rooting='force-rooted', preserve_underscores=True) for n in tree.leaf_node_iter(): if n.taxon.label.startswith("U_"): subdict = uba_acc.get(n.taxon.label) if "gca" in subdict.keys(): n.taxon.label = subdict.get("gca") else: 
n.taxon.label = subdict.get("uba") tree.write_to_path(os.path.join(dirout, dom, 'pplacer', dom + "_r" + release + ".tree"), schema='newick', suppress_rooting=True, unquoted_underscores=True) trimmed_seqout = open( os.path.join(dirout, dom, 'pplacer', 'trimmed_msa_' + dom + '.faa'), 'w') trimmed_fasta = read_fasta(seqfile) for id, seq in trimmed_fasta.items(): if id in genomes_to_retain: if id.startswith("U_"): subdict = uba_acc.get(id) if "gca" in subdict.keys(): trimmed_seqout.write(">{0}\n{1}\n".format( subdict.get("gca"), seq)) else: trimmed_seqout.write(">{0}\n{1}\n".format( subdict.get("uba"), seq)) else: trimmed_seqout.write(">{0}\n{1}\n".format(id, seq)) trimmed_seqout.close() logoutf = open( os.path.join(dirout, dom, 'pplacer', 'fitting_' + dom + '.log'), 'w') with open(logfile) as logfin: for line in logfin: for k, subdict in uba_acc.items(): if "gca" in subdict.keys(): line = line.replace(k + ":", subdict.get("gca") + ":") else: line = line.replace(k + ":", subdict.get("uba") + ":") logoutf.write(line) logoutf.close()
def run(self, outf): # Check if all directories are here actual_dirs = os.listdir(self.pack_dir) if len(actual_dirs) != len(self.list_dirsinpackage): print 'ERROR:' if len(set(actual_dirs) & set(self.list_dirsinpackage)) != len( self.list_dirsinpackage): print 'ERROR:' with open(os.path.join(self.pack_dir, 'metadata', 'metadata.txt')) as metafile: for line in metafile: if line.startswith('VERSION_DATA'): version = line.strip().split('=')[1] # List genomes in fastani folder list_genomes = glob.glob( os.path.join(self.pack_dir, 'fastani', 'database/*.gz')) # Archaeal genome MSA is untrimmed ar_msa_file = glob.glob(os.path.join(self.pack_dir, 'msa/*ar122.faa'))[0] ar_msa = read_fasta(ar_msa_file) first_seq = ar_msa.get(ar_msa.keys()[0]) if len(first_seq) != 32675: print 'ERROR: len(first_seq) != 32675' # Bacterial genome MSA is untrimmed bac_msa_file = glob.glob(os.path.join(self.pack_dir, 'msa/*bac120.faa'))[0] bac_msa = read_fasta(bac_msa_file) first_seq = bac_msa.get(bac_msa.keys()[0]) if len(first_seq) != 41155: print 'ERROR: len(first_seq) != 41155' # Bacterial MASK is same length as the untrimmed bacterial genomes bac_mask_file = glob.glob( os.path.join(self.pack_dir, 'masks/*bac120.mask'))[0] bac_mask = '' with open(bac_mask_file) as bmf: bac_mask = bmf.readline() if len(bac_mask) != 41155: print 'ERROR: len(bac_mask) != 41155' # Archaeal MASK is same length as the untrimmed archaeal genomes ar_mask_file = glob.glob( os.path.join(self.pack_dir, 'masks/*ar122.mask'))[0] ar_mask = '' with open(ar_mask_file) as amf: ar_mask = amf.readline() if len(ar_mask) != 32675: print 'ERROR: len(ar_mask) != 32675' # Archaeal Pplacer MSA should have the same number of genomes as the # Archaeal untrimmed MSA ar_pplacer_msa_file = glob.glob( os.path.join(self.pack_dir, 'pplacer', 'gtdb_' + version + '_ar122.refpkg', 'trimmed_msa_ar122.faa'))[0] ar_pplacer_msa = read_fasta(ar_pplacer_msa_file) if len(ar_pplacer_msa) != len(ar_msa): print 'ERROR: len(ar_pplacer_msa) != len(ar_msa)' print 'len(ar_pplacer_msa): {}'.format(len(ar_pplacer_msa)) print 'len(ar_msa): {}'.format(len(ar_msa)) print 'difference genomes: {}'.format( list(set(ar_msa.keys()).difference(ar_pplacer_msa.keys()))) first_seq = ar_pplacer_msa.get(ar_pplacer_msa.keys()[0]) # Archaeal Pplacer MSA should have the same length as the Archaeal mask if len(first_seq) != len([a for a in ar_mask if a == '1']): print 'ERROR: len(first_seq) != len([a for a in ar_mask if a ==1])' print 'len(first_seq): {}'.format(len(first_seq)) print 'len([a for a in ar_mask if a ==1]): {}'.format( len([a for a in ar_mask if a == '1'])) # Bacterial Pplacer MSA should have the same number of genomes as the # Bacterial untrimmed MSA bac_pplacer_msa_file = os.path.join( self.pack_dir, 'pplacer', 'gtdb_' + version + '_bac120.refpkg', 'trimmed_msa_bac120.faa') bac_pplacer_msa = read_fasta(bac_pplacer_msa_file) if len(bac_pplacer_msa) != len(bac_msa): print 'ERROR: len(bac_pplacer_msa) != len(bac_msa)' print 'len(bac_pplacer_msa): {}'.format(len(bac_pplacer_msa)) print 'len(bac_msa): {}'.format(len(bac_msa)) print 'difference genomes: {}'.format( list(set(bac_msa.keys()).difference(bac_pplacer_msa.keys()))) first_seq = bac_pplacer_msa.get(bac_pplacer_msa.keys()[0]) # Bacterial Pplacer MSA should have the same length as the Bacterial # mask if len(first_seq) != len([a for a in bac_mask if a == '1']): print 'ERROR: len(first_seq) != len([a for a in bac_mask if a ==1])' print 'len(first_seq): {}'.format(len(first_seq)) print 'len([a for a in bac_mask if a ==1]): {}'.format( 
len([a for a in bac_mask if a == '1'])) # Archaeal Tree should have the same number of leaves than nomber of # genomes in the MSA arc_tree = dendropy.Tree.get_from_path(os.path.join( self.pack_dir, 'pplacer', 'gtdb_' + version + '_ar122.refpkg', 'ar122_' + version + '.tree'), schema='newick', rooting='force-rooted', preserve_underscores=True) list_leaves = arc_tree.leaf_nodes() if len(list_leaves) != len(ar_pplacer_msa): print 'ERROR: len(list_leaves) != len(ar_pplacer_msa)' print 'len(list_leaves): {}'.format(len(list_leaves)) print 'len(ar_pplacer_msa): {}'.format(len(ar_pplacer_msa)) # Bacterial Tree should have the same number of leaves than nomber of # genomes in the MSA bac_tree = dendropy.Tree.get_from_path(os.path.join( self.pack_dir, 'pplacer', 'gtdb_' + version + '_bac120.refpkg', 'bac120_' + version + '.tree'), schema='newick', rooting='force-rooted', preserve_underscores=True) list_leaves = bac_tree.leaf_nodes() if len(list_leaves) != len(bac_pplacer_msa): print 'ERROR: len(list_leaves) != len(bac_pplacer_msa)' print 'len(list_leaves): {}'.format(len(list_leaves)) print 'len(bac_pplacer_msa): {}'.format(len(bac_pplacer_msa)) # Taxonomy file should have as many genomes as bac120 and ar122 MSA # combined tax_file = os.path.join(self.pack_dir, 'taxonomy', 'gtdb_taxonomy.tsv') tax_dict = {} with open(tax_file) as tf: for line in tf: infos = line.strip().split('\t') tax_dict[infos[0]] = infos[1] if len(tax_dict) != (len(ar_msa) + len(bac_msa)): print 'ERROR: len(tax_dict) != (len(ar_msa) + len(bac_msa))' print 'len(tax_dict): {}'.format(len(tax_dict)) print 'len(ar_msa) + len(bac_msa): {}'.format( len(ar_msa) + len(bac_msa)) # Radii file should have as many genomes as bac120 and ar122 MSA # combined radii_file = os.path.join(self.pack_dir, 'radii', 'gtdb_radii.tsv') radii_dict = {} with open(radii_file) as rf: for line in rf: infos = line.strip().split('\t') radii_dict[infos[1]] = infos[2] if len(radii_dict) != (len(ar_msa) + len(bac_msa)): print 'ERROR: len(radii_dict) != (len(ar_msa) + len(bac_msa))' print 'len(radii_dict): {}'.format(len(radii_dict)) print 'len(ar_msa) + len(bac_msa): {}'.format( len(ar_msa) + len(bac_msa)) if len(set(radii_dict.keys()).symmetric_difference( tax_dict.keys())) != 0: print 'ERROR: len(set(radii_dict.keys()).symmetric_difference(tax_dict.keys()))' print 'set(radii_dict.keys()).symmetric_difference(tax_dict.keys()): {}'.format( set(radii_dict.keys()).symmetric_difference(tax_dict.keys())) if len(list_genomes) != len(radii_dict): print 'ERROR: len(list_genomes) != len(radii_dict)' print 'len(list_genomes): {}'.format(len(list_genomes)) print 'len(radii_dict): {}'.format(len(radii_dict)) print '\n\nVERSION: {}'.format(version) print 'Length trimmed bac120 MSA: {}'.format( len(bac_pplacer_msa.get(bac_pplacer_msa.keys()[0]))) print 'Length trimmed ar122 MSA: {}'.format( len(ar_pplacer_msa.get(ar_pplacer_msa.keys()[0]))) print '' print 'Number of genomes in fastani/database: {}'.format( len(list_genomes)) print 'Number of genomes in radii file: {}'.format(len(radii_dict)) print 'Number of genomes in taxonomy file: {}'.format(len(tax_dict)) print 'Would you like to archive the folder? 
' # raw_input returns the empty string for "enter" yes = {'yes', 'y', 'yep', ''} no = {'no', 'n'} final_choice = False choice = raw_input().lower() if choice in yes: with tarfile.open(outf, "w:gz") as tar: packdir = copy.copy(self.pack_dir) if packdir.endswith('/'): packdir = packdir[:-1] tar.add(self.pack_dir, arcname=os.path.basename(packdir)) elif choice in no: return False else: sys.stdout.write("Please respond with 'yes' or 'no'")
def compare(self, bin_files1, bin_files2, assembly_file, output_file): """Compare bins from two different binning methods.""" # determine total number of sequences self.logger.info('Reading bins.') seqs = read_fasta(assembly_file) seq_lens = {} total_bases = 0 num_seq1K = 0 total_bases1K = 0 num_seq5K = 0 total_bases5K = 0 for seq_id, seq in seqs.items(): seq_len = len(seq) seq_lens[seq_id] = seq_len total_bases += seq_len if seq_len >= 1000: num_seq1K += 1 total_bases1K += seq_len if seq_len >= 5000: num_seq5K += 1 total_bases5K += seq_len # determine sequences in each bin bins1 = self._read_seq_ids(bin_files1) bins2 = self._read_seq_ids(bin_files2) # determine bin stats bin_stats1, total_uniq_binned_seqs1, tota_uniq_binned_bases1, num_repeats1 = self._binning_stats(bins1, seq_lens) bin_stats2, total_uniq_binned_seqs2, tota_uniq_binned_bases2, num_repeats2 = self._binning_stats(bins2, seq_lens) # sort bins by size bin_stats1 = sorted(bin_stats1.items(), key=lambda x: x[1][1], reverse=True) bin_stats2 = sorted(bin_stats2.items(), key=lambda x: x[1][1], reverse=True) # report summary results print() print('Assembled sequences = %d (%.2f Mbp)' % (len(seqs), total_bases / 1e6)) print(' No. seqs > 1 kbp = %d (%.2f Mbp)' % (num_seq1K, total_bases1K / 1e6)) print(' No. seqs > 5 kbp = %d (%.2f Mbp)' % (num_seq5K, total_bases5K / 1e6)) print() print('Binning statistics:') print(' 1) No. bins: %s, No. binned seqs: %d (%.2f%%), No. binned bases: %.2f Mbp (%.2f%%), No. seqs in multiple bins: %d' % (len(bins1), total_uniq_binned_seqs1, total_uniq_binned_seqs1 * 100 / len(seqs), tota_uniq_binned_bases1 / 1e6, tota_uniq_binned_bases1 * 100 / total_bases, num_repeats1)) print(' 2) No. bins: %s, No. binned seqs: %d (%.2f%%), No. binned bases: %.2f Mbp (%.2f%%), No. seqs in multiple bins: %d' % (len(bins2), total_uniq_binned_seqs2, total_uniq_binned_seqs2 * 100 / len(seqs), tota_uniq_binned_bases2 / 1e6, tota_uniq_binned_bases2 * 100 / total_bases, num_repeats2)) print() # output report fout = open(output_file, 'w') for data in bin_stats2: fout.write('\t' + data[0]) fout.write('\tUnbinned\tNo. Sequences\tNo. 
Bases (Mbp)\tBest Match\tBases in Common (%)\tSequences in Common (%)\n') total_seqs_in_common2 = defaultdict(int) max_bases_in_common2 = defaultdict(int) max_seqs_in_common2 = defaultdict(int) best_matching_bins2 = {} binned_seqs2 = defaultdict(set) for data1 in bin_stats1: bin_id1 = data1[0] fout.write(bin_id1) seqs1 = bins1[bin_id1] max_bases_in_common = 0 max_seqs_in_common = 0 best_matching_bin = 'n/a' binned_seqs = set() for data2 in bin_stats2: bin_id2 = data2[0] seqs2 = bins2[bin_id2] seqs_in_common = seqs1.intersection(seqs2) binned_seqs.update(seqs_in_common) num_seqs_in_common = len(seqs_in_common) fout.write('\t' + str(num_seqs_in_common)) bases_in_common = 0 for seq_id in seqs_in_common: bases_in_common += seq_lens[seq_id] if bases_in_common > max_bases_in_common: max_bases_in_common = bases_in_common max_seqs_in_common = num_seqs_in_common best_matching_bin = bin_id2 if bases_in_common > max_bases_in_common2[bin_id2]: max_bases_in_common2[bin_id2] = bases_in_common max_seqs_in_common2[bin_id2] = num_seqs_in_common best_matching_bins2[bin_id2] = bin_id1 binned_seqs2[bin_id2].update(seqs_in_common) fout.write('\t%d\t%d\t%.2f\t%s\t%.2f\t%.2f\n' % (len(seqs1) - len(binned_seqs), data1[1][0], data1[1][1] / 1e6, best_matching_bin, max_bases_in_common * 100 / data1[1][1], max_seqs_in_common * 100 / data1[1][0], )) fout.write('Unbinned') for data in bin_stats2: binId = data[0] fout.write('\t%d' % (len(bins2[binId]) - len(binned_seqs2[binId]))) fout.write('\n') fout.write('No. Sequences') for data in bin_stats2: fout.write('\t%d' % data[1][0]) fout.write('\n') fout.write('No. Bases (Mbp)') for data in bin_stats2: fout.write('\t%.2f' % (data[1][1] / 1e6)) fout.write('\n') fout.write('Best Match') for data in bin_stats2: binId = data[0] fout.write('\t%s' % best_matching_bins2.get(binId, 'n/a')) fout.write('\n') fout.write('Bases in Common (%)') for data in bin_stats2: binId = data[0] fout.write('\t%.2f' % (max_bases_in_common2[binId] * 100 / data[1][1])) fout.write('\n') fout.write('Sequences in Common (%)') for data in bin_stats2: binId = data[0] fout.write('\t%.2f' % (max_seqs_in_common2[binId] * 100 / data[1][0])) fout.write('\n') fout.close()
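# Minimal sketch of the "bases in common" statistic computed in compare() above: the total
# length of the sequences shared by two bins.
def bases_in_common(bin1_seqs, bin2_seqs, seq_lens):
    """bin*_seqs: sets of sequence ids; seq_lens: d[seq_id] -> length in bp."""
    return sum(seq_lens[seq_id] for seq_id in bin1_seqs & bin2_seqs)

# bases_in_common({'c1', 'c2'}, {'c2', 'c3'}, {'c1': 5000, 'c2': 12000, 'c3': 800}) -> 12000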
def _trim_seqs(self, input_msa, output_msa, remove_identical=False, min_per_taxa=0.5, min_bp=1000):
    """Trim ends of sequences.

    input_msa : str
        File with MSA to trim.
    output_msa : str
        New file with trimmed MSA.
    remove_identical : boolean
        Flag indicating if identical sequences should be removed.
    min_per_taxa : float [0, 1.0]
        Minimum required taxa to retain leading and trailing columns.
    min_bp : int
        Minimum required length to retain sequence.
    """

    # read seqs
    seqs = seq_io.read_fasta(input_msa)

    # filter identical seqs
    identical_seqs = set()
    if remove_identical:
        self.logger.info('Filtering identical sequences.')

        seq_ids = list(seqs.keys())
        for i in range(0, len(seq_ids)):
            seq_id_I = seq_ids[i]

            if seq_id_I in identical_seqs:
                continue

            for j in range(i + 1, len(seq_ids)):
                seq_id_J = seq_ids[j]
                if seqs[seq_id_I] == seqs[seq_id_J]:
                    self.logger.info('Seq %s and %s are identical.' % (seq_id_I, seq_id_J))
                    identical_seqs.add(seq_id_J)

        self.logger.info('Identified %d of %d sequences as identical.'
                         % (len(identical_seqs), len(seqs)))

    # trim start and end columns to consensus alignment
    first_char = []
    last_char = []
    for seq_id, seq in seqs.items():
        if seq_id in identical_seqs:
            continue

        for i, ch in enumerate(seq):
            if ch != '.' and ch != '-':
                first_char.append(i)
                break

        for i in range(len(seq) - 1, -1, -1):
            if seq[i] != '.' and seq[i] != '-':
                last_char.append(i)
                break

    first_char.sort()
    last_char.sort(reverse=True)

    trim_index = int(len(seqs) * min_per_taxa)

    start = first_char[trim_index]
    end = last_char[trim_index]

    self.logger.info('Trimming seqs from %d to %d leaving a %dbp length alignment.'
                     % (start, end, end - start + 1))

    short_seq_file = output_msa + '.short'

    fout = open(output_msa, 'w')
    fout_short = open(short_seq_file, 'w')
    num_filtered_seq = 0
    for seq_id, seq in seqs.items():
        if seq_id in identical_seqs:
            continue

        valid_bp = 0
        for i in range(start, min(len(seq), end + 1)):
            ch = seq[i]
            if ch != '.' and ch != '-':
                valid_bp += 1

        if valid_bp >= min_bp:
            fout.write('>' + seq_id + '\n')
            fout.write(seq[start:end + 1] + '\n')
        else:
            self.logger.info('Filtering seq %s with %d of %d (%.1f%%) aligned bases.'
                             % (seq_id, valid_bp, (end - start + 1), valid_bp * 100.0 / (end - start + 1)))
            num_filtered_seq += 1
            fout_short.write('>' + seq_id + '\n')
            fout_short.write(seq[start:end + 1] + '\n')
    fout.close()
    fout_short.close()

    self.logger.info('Filtered %d of %d sequences due to length.'
                     % (num_filtered_seq, len(seqs) - len(identical_seqs)))
    self.logger.info('Short sequences written to: %s' % short_seq_file)
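# Toy sketch of the consensus end-trimming rule in _trim_seqs(): leading/trailing columns
# are kept only once at least min_per_taxa of the sequences have started/ended their
# alignment. Assumes every sequence has at least one non-gap character and min_per_taxa < 1.
def consensus_bounds(seqs, min_per_taxa=0.5):
    first = sorted(next(i for i, ch in enumerate(s) if ch not in '.-') for s in seqs)
    last = sorted((max(i for i, ch in enumerate(s) if ch not in '.-') for s in seqs), reverse=True)
    idx = int(len(seqs) * min_per_taxa)
    return first[idx], last[idx]

# consensus_bounds(['--MKL-', '-AMKLV', 'WAMKL-']) -> (1, 4)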
def run(self, homolog_file, min_per_taxa, consensus, min_per_bp, use_trimAl, msa_program, output_dir): """Create multiple sequence alignment. Parameters ---------- homolog_file : str File containing sequences to align min_per_taxa : float Minimum percentage of taxa required to retain a column. consensus : float Minimum percentage of the same amino acid required to retain column. min_per_bp : float Minimum percentage of base pairs required to keep trimmed sequence. use_trimAl : boolean Filter columns using trimAl. msa_program : str Program to use for multiple sequence alignment ['mafft', 'muscle']. output_dir : str Directory to store results. """ # infer multiple sequence alignment self.logger.info('Inferring multiple sequence alignment with %s.' % msa_program) output_file = ntpath.basename(homolog_file) prefix = output_file[0:output_file.rfind('.')] suffix = output_file[output_file.rfind('.') + 1:] msa_output = os.path.join(output_dir, prefix + '.aligned.' + suffix) if msa_program == 'mafft': mafft = Mafft(self.cpus) msa_log = os.path.join(output_dir, 'mafft.log') mafft.run(homolog_file, msa_output, msa_log) elif msa_program == 'muscle': muscle = Muscle() msa_log = os.path.join(output_dir, 'muscle.log') muscle.run(homolog_file, msa_output, msa_log) # trim multiple sequence alignment trimmed_msa_output = os.path.join( output_dir, prefix + '.trimmed.aligned.' + suffix) if use_trimAl: self.logger.info( 'Using trimAl to filter poorly represented columns from alignment.' ) # convert MSA to relaxed phylip format phylip_msa_output = msa_output.replace('.faa', '.phyx') cmd = 'seqmagick convert %s %s' % (msa_output, phylip_msa_output) os.system(cmd) tmp_output = os.path.join(output_dir, 'tmp.faa') cmd = 'trimal -in %s -out %s -automated1 -fasta' % ( phylip_msa_output, tmp_output) os.system(cmd) cmd = 'trimal -in %s -out %s -resoverlap 0.75 -seqoverlap %f' % ( tmp_output, trimmed_msa_output, min_per_bp) os.system(cmd) seqs = seq_io.read_fasta(msa_output) tmp_seqs = seq_io.read_fasta(tmp_output) trimmed_seqs = seq_io.read_fasta(trimmed_msa_output) self.logger.info( 'Trimmed alignment from %d to %d AA.' % (len(seqs.values()[0]), len(trimmed_seqs.values()[0]))) self.logger.info( '%d of %d taxa were deemed to be too short and removed.' % (len(tmp_seqs) - len(trimmed_seqs), len(seqs))) os.remove(tmp_output) else: self.logger.info( 'Trimming poorly represented columns from alignment.') seqs = seq_io.read_fasta(msa_output, keep_annotation=True) trimmed_seqs, pruned_seqs, min_taxa_filtered, consensus_filtered = seq_tk.trim_seqs( seqs, min_per_taxa / 100.0, consensus / 100.0, min_per_bp / 100.0) self.logger.info( 'Trimmed alignment from %d to %d AA (%d by minimum taxa percent, %d by consensus).' % (len(seqs.values()[0]), len(trimmed_seqs.values()[0]), min_taxa_filtered, consensus_filtered)) self.logger.info( '%d of %d taxa were deemed to be too short and removed.' % (len(pruned_seqs), len(seqs))) if len(pruned_seqs) > 0: prune_seqs_out = os.path.join(output_dir, 'filtered_seqs.too_short.txt') self.logger.info('Pruned sequences written to %s.' % prune_seqs_out) seq_io.write_fasta(pruned_seqs, prune_seqs_out) if len(pruned_seqs) == len(seqs): self.logger.error( 'Too many sequences were pruned. Gene tree cannot be inferred.' ) sys.exit() seq_io.write_fasta(trimmed_seqs, trimmed_msa_output) return trimmed_msa_output
def _parse_taxonomy_file(self, genome_id, metadata_taxonomy_file, fout, prefix, fna_file, summary_file=None): """Parse metadata file with taxonomic information for 16S rRNA genes. Parameters ---------- genome_id : str Unique identifier of genome. metadata_taxonomy_file : str Full path to file containing 16S rRNA metadata. fout : file Output stream to populate with metadata. Prefix : str Prefix to append to metadata fields. Returns ------- int Number of 16S rRNA genes identified in genome. """ if not os.path.exists(metadata_taxonomy_file): return 0 with open(metadata_taxonomy_file) as f: header_line = f.readline() # consume header line if prefix not in self.taxonomy_headers: self.taxonomy_headers.add(prefix) fout.write('genome_id') headers = [prefix + '_' + x.strip().replace('ssu_', '') for x in header_line.split('\t')] fout.write('\t' + '\t'.join(headers)) fout.write('\t{0}_sequence\t{0}_contig_len\n'.format(prefix)) # Check the CheckM headers are consistent split_headers = header_line.rstrip().split("\t") for pos in range(0, len(split_headers)): header = split_headers[pos] if header == 'query_id': query_id_pos = pos break # Report hit to longest 16S rRNA gene. It is possible that # the HMMs identified a putative 16S rRNA gene, but that # there was no valid BLAST hit. longest_query_len = 0 longest_ssu_hit_info = None identified_ssu_genes = 0 for line in f: line_split = line.strip().split('\t') query_len = int(line_split[2]) if query_len > longest_query_len: longest_query_len = query_len longest_ssu_hit_info = line_split ssu_query_id = line_split[query_id_pos] if longest_ssu_hit_info: fout.write(genome_id) fout.write('\t' + '\t'.join(longest_ssu_hit_info)) all_genes_dict = read_fasta(fna_file, False) sequence = all_genes_dict[ssu_query_id] fout.write('\t{0}'.format(sequence)) if summary_file is not None and os.path.exists(summary_file): with open(summary_file) as fsum: header_line = fsum.readline() # consume header line header_list = [x.strip() for x in header_line.split('\t')] idx_seq = header_list.index("Sequence length") for line in fsum: identified_ssu_genes += 1 sum_list = [x.strip() for x in line.split('\t')] if sum_list[0] == ssu_query_id: fout.write("\t{0}".format(sum_list[idx_seq])) fout.write('\n') return identified_ssu_genes
def run(self, genomes, align_dir, out_dir, prefix, debugopt=False): try: """Classify genomes based on position in reference tree.""" for marker_set_id in ('bac120', 'ar122'): user_msa_file = os.path.join( align_dir, prefix + '.%s.user_msa.fasta' % marker_set_id) if not os.path.exists(user_msa_file): # file will not exist if there are no User genomes from a given domain continue classify_tree = self.place_genomes(user_msa_file, marker_set_id, out_dir, prefix) # get taxonomic classification of each user genome tree = dendropy.Tree.get_from_path(classify_tree, schema='newick', rooting='force-rooted', preserve_underscores=True) gtdb_taxonomy = Taxonomy().read(self.taxonomy_file) fout = open( os.path.join( out_dir, prefix + '.%s.classification.tsv' % marker_set_id), 'w') fastaniout = open( os.path.join( out_dir, prefix + '.%s.fastani_results.tsv' % marker_set_id), 'w') redfout = open( os.path.join(out_dir, prefix + '.%s.summary.tsv' % marker_set_id), 'w') if debugopt: parchiinfo = open( os.path.join( out_dir, prefix + '.%s.debug_file.tsv' % marker_set_id), 'w') reddictfile = open( os.path.join( out_dir, prefix + '.%s.red_dictionary.tsv' % marker_set_id), 'w') marker_dict = {} if marker_set_id == 'bac120': marker_dict = Config.RED_DIST_BAC_DICT elif marker_set_id == 'ar122': marker_dict = Config.RED_DIST_ARC_DICT reddictfile.write('Phylum\t{0}\n'.format( marker_dict.get('p__'))) reddictfile.write('Class\t{0}\n'.format( marker_dict.get('c__'))) reddictfile.write('Order\t{0}\n'.format( marker_dict.get('o__'))) reddictfile.write('Family\t{0}\n'.format( marker_dict.get('f__'))) reddictfile.write('Genus\t{0}\n'.format( marker_dict.get('g__'))) reddictfile.close() fastaniout.write("User genome\tReference genome\tANI\n") redfout.write( "user_genome\tclassification_method\tred_value\n") if debugopt: parchiinfo.write( "User genome\tHigher rank\tHigher value\tLower rank\tLower value\tcase\tclosest_rank\n" ) # Genomes can be classified by using Mash or RED values # We go through all leaves of the tree. if the leaf is a user genome we take it's parent node and look at all the leaves for this node. # If the parent node has only one Reference genome ( GB or RS ) we calculate the mash distance between the user genome and the reference genome analysed_nodes = [] fastani_dict = {} all_fastani_dict = {} fastani_list = [] # some genomes of Case C are handled here, if Mash distance is close enough self.logger.info( 'Calculating Average Nucleotide Identity using FastANI.') for nd in tree.preorder_node_iter(): #We store the prefixes of each leaves to check if one starts with GB_ or RS_ list_subnode_initials = [ subnd.taxon.label.replace("'", '')[0:3] for subnd in nd.leaf_iter() ] list_subnode = [ subnd.taxon.label.replace("'", '') for subnd in nd.leaf_iter() ] #if only one genome is a reference genome if (list_subnode_initials.count('RS_') + list_subnode_initials.count('GB_') + list_subnode_initials.count('UBA')) == 1 and len( list_subnode_initials ) > 1 and list_subnode[0] not in analysed_nodes: fastani_list.append(list_subnode) analysed_nodes.extend(list_subnode) manager = multiprocessing.Manager() out_q = manager.dict() procs = [] nprocs = self.cpus if len(fastani_list) > 0: for item in splitchunks_list(fastani_list, nprocs): p = multiprocessing.Process(target=self._fastaniWorker, args=(item, genomes, out_q)) procs.append(p) p.start() # Collect all results into a single result dict. We know how many dicts # with results to expect. 
#while out_q.empty(): # time.sleep(1) # Wait for all worker processes to finish for p in procs: p.join() if p.exitcode == 1: raise ValueError("Stop!!") all_fastani_dict = dict(out_q) for k, v in all_fastani_dict.iteritems(): fastaniout.write("{0}\t{1}\t{2}\n".format( k, v.get("ref_genome"), v.get("ani"))) if Config.FASTANI_SPECIES_THRESHOLD <= v.get("ani"): suffixed_name = add_ncbi_prefix(v.get("ref_genome")) taxa_str = ";".join(gtdb_taxonomy.get(suffixed_name)) if taxa_str.endswith("s__"): taxa_str = taxa_str + v.get("ref_genome") fout.write('%s\t%s\n' % (k, taxa_str)) fastani_dict[k] = v redfout.write("{0}\tani\tNone\n".format(k)) fastaniout.close() self.logger.info( '{0} genomes have been classify with FastANI.'.format( len(fastani_dict))) scaled_tree = self._calculate_red_distances( classify_tree, out_dir) user_genome_ids = set(read_fasta(user_msa_file).keys()) user_genome_ids = user_genome_ids.difference( set(fastani_dict.keys())) # for all other cases we measure the RED distance between a leaf and a parent node ( RED = 1-edge_length). This RED value will tell us # the rank level that can be associated with a User genome. # As an example if the RED value is close to the order level, the user genome will take the order level of the Reference genome under the same parent node. # Is there are multiple orders under the parent node. The user genome is considered as a new order for leaf in scaled_tree.leaf_node_iter(): if leaf.taxon.label in user_genome_ids: taxa = [] # In some cases , pplacer can associate 2 user genomes on the same parent node so we need to go up the tree to find a node with a reference genome as leaf. cur_node = leaf.parent_node list_subnode_initials = [ subnd.taxon.label.replace("'", '')[0:3] for subnd in cur_node.leaf_iter() ] while 'RS_' not in list_subnode_initials and 'GB_' not in list_subnode_initials and 'UBA' not in list_subnode_initials: cur_node = cur_node.parent_node list_subnode_initials = [ subnd.taxon.label.replace("'", '')[0:3] for subnd in cur_node.leaf_iter() ] current_rel_list = cur_node.rel_dist parent_taxon_node = cur_node.parent_node _support, parent_taxon, _aux_info = parse_label( parent_taxon_node.label) while parent_taxon_node is not None and not parent_taxon: parent_taxon_node = parent_taxon_node.parent_node _support, parent_taxon, _aux_info = parse_label( parent_taxon_node.label) parent_rank = parent_taxon.split(";")[-1][0:3] parent_rel_dist = parent_taxon_node.rel_dist genome_parent_child = [ leaf.taxon.label, parent_rank, parent_rel_dist, '', '', '', '' ] child_taxons = [] closest_rank = None detection = "RED" # if the genome is placed between the genus and specie ranks , it will be associated with the genus when _get_closest_red_rank is called if parent_rank != 'g__': child_rk = self.order_rank[ self.order_rank.index(parent_rank) + 1] list_subnode = [ childnd.taxon.label.replace("'", '') for childnd in cur_node.leaf_iter() if (childnd.taxon.label.startswith('RS_') or childnd.taxon.label.startswith('GB_')) ] list_ranks = [ gtdb_taxonomy.get(name)[self.order_rank.index( child_rk)] for name in list_subnode ] if len(set(list_ranks)) == 1: for subranknd in cur_node.preorder_iter(): _support, subranknd_taxon, _aux_info = parse_label( subranknd.label) if subranknd.is_internal( ) and subranknd_taxon is not None and subranknd_taxon.startswith( child_rk): child_taxons = subranknd_taxon.split( ";") child_taxon_node = subranknd child_rel_dist = child_taxon_node.rel_dist break else: #case 2a and 2b closest_rank = parent_rank detection = "Topology" else: 
#case 1a closest_rank = parent_rank detection = "Topology" #case 1b if len(child_taxons) == 0 and closest_rank is None: list_leaves = [ childnd.taxon.label.replace("'", '') for childnd in cur_node.leaf_iter() if (childnd.taxon.label.startswith('RS_') or childnd.taxon.label.startswith('GB_')) ] if len(list_leaves) != 1: self.logger.error( 'There should be only one leaf.') sys.exit(-1) list_leaf_ranks = gtdb_taxonomy.get( list_leaves[0])[self.order_rank.index(child_rk ):-1] for leaf_taxon in reversed(list_leaf_ranks): if leaf_taxon == list_leaf_ranks[0]: if abs(current_rel_list - marker_dict.get( leaf_taxon[:3])) < abs( (current_rel_list) - marker_dict.get(parent_rank)): #and current_rel_list - marker_dict.get(leaf_taxon[:3]) > 0 ): closest_rank = leaf_taxon[:3] genome_parent_child[3] = leaf_taxon genome_parent_child[ 5] = 'case 1b - III' break else: pchildrank = list_leaf_ranks[ list_leaf_ranks.index(leaf_taxon) - 1] if abs( current_rel_list - marker_dict.get(leaf_taxon[:3]) ) < abs(current_rel_list - marker_dict.get(pchildrank[:3])): #and current_rel_list - marker_dict.get(leaf_taxon[:3]) > 0 ) : closest_rank = leaf_taxon[:3] genome_parent_child[1] = pchildrank genome_parent_child[2] = 1.0 genome_parent_child[3] = leaf_taxon genome_parent_child[5] = 'case 1b - II' break if closest_rank is None: closest_rank = parent_rank genome_parent_child[3] = list_leaf_ranks[0] genome_parent_child[5] = 'case 1b - IV' #if there is multiple ranks on the child node (i.e genome between p__Nitrospirae and c__Nitrospiria;o__Nitrospirales;f__Nitropiraceae) #we loop through the list of rank from f_ to c_ rank for child_taxon in reversed(child_taxons): # if lower rank is c__Nitropiria if child_taxon == child_taxons[0]: if (abs(current_rel_list - marker_dict.get(child_taxon[:3])) < abs(child_rel_dist - marker_dict.get(child_taxon[:3])) and abs(current_rel_list - marker_dict.get(child_taxon[:3])) < abs(current_rel_list - marker_dict.get(parent_rank))): genome_parent_child[3] = ';'.join( child_taxons) genome_parent_child[4] = child_rel_dist genome_parent_child[5] = 'case 3b - II' closest_rank = child_taxon[:3] elif closest_rank is None: closest_rank = parent_rank genome_parent_child[3] = ';'.join( child_taxons) genome_parent_child[4] = child_rel_dist genome_parent_child[5] = 'case 3b - III' else: pchildrank = child_taxons[ child_taxons.index(child_taxon) - 1] if (abs(current_rel_list - marker_dict.get(child_taxon[:3])) < abs(current_rel_list - marker_dict.get(pchildrank[:3])) and abs(current_rel_list - marker_dict.get(child_taxon[:3])) < abs(child_rel_dist - marker_dict.get(child_taxon[:3]))): closest_rank = child_taxon genome_parent_child[3] = ';'.join( child_taxons) genome_parent_child[4] = child_rel_dist genome_parent_child[5] = 'case 3b - I' break # case 1b if closest_rank is None: print "IT SHOULDN'T HAPPEN!!!" 
genome_parent_child[6] = closest_rank list_subnode = [ subnd.taxon.label.replace("'", '') for subnd in cur_node.leaf_iter() ] red_taxonomy = self._get_redtax( list_subnode, closest_rank, gtdb_taxonomy) fout.write('{0}\t{1}\n'.format(leaf.taxon.label, red_taxonomy)) del genome_parent_child[0] redfout.write("{0}\t{1}\t{2}\n".format( leaf.taxon.label, detection, current_rel_list)) if debugopt: parchiinfo.write('{0}\t{1}\t{2}\t{3}\n'.format( leaf.taxon.label, current_rel_list, '\t'.join(str(x) for x in genome_parent_child), detection)) redfout.close() fout.close() if debugopt: parchiinfo.close() pplaceout = open( os.path.join( out_dir, prefix + '.%s.classification_pplacer.tsv' % marker_set_id), 'w') # We get the pplacer taxonomy for comparison user_genome_ids = set(read_fasta(user_msa_file).keys()) for leaf in tree.leaf_node_iter(): if leaf.taxon.label in user_genome_ids: taxa = [] cur_node = leaf while cur_node.parent_node: _support, taxon, _aux_info = parse_label( cur_node.label) if taxon: for t in taxon.split(';')[::-1]: taxa.append(t.strip()) cur_node = cur_node.parent_node taxa_str = ';'.join(taxa[::-1]) pplaceout.write('%s\t%s\n' % (leaf.taxon.label, taxa_str)) pplaceout.close() except ValueError as error: print "GTDB-Tk has stopped before finishing" sys.exit(-1) except Exception as error: print "GTDB-Tk has stopped before finishing" sys.exit(-1)
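# Toy sketch of the RED-based rank assignment idea used in run() above: among the median
# RED values per rank (marker_dict in the code), pick the rank whose value is closest to
# the RED of the placement. The dictionary values below are made up for illustration.
def closest_rank(red_value, red_dict):
    return min(red_dict, key=lambda rank: abs(red_dict[rank] - red_value))

# closest_rank(0.62, {'p__': 0.32, 'c__': 0.49, 'o__': 0.63, 'f__': 0.77, 'g__': 0.92}) -> 'o__'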
def _report_identified_marker_genes(self, gene_dict, outdir, prefix): """Report statistics for identified marker genes.""" bac_outfile = open( os.path.join(outdir, prefix + "_bac120_markers_summary.tsv"), "w") arc_outfile = open( os.path.join(outdir, prefix + "_ar122_markers_summary.tsv"), "w") header = "Name\tnumber_unique_genes\tnumber_multiple_genes\tnumber_missing_genes\tlist_unique_genes\tlist_multiple_genes\tlist_missing_genes\n" bac_outfile.write(header) arc_outfile.write(header) # gather information for all marker genes marker_dbs = { "PFAM": ConfigMetadata.PFAM_TOP_HIT_SUFFIX, "TIGR": ConfigMetadata.TIGRFAM_TOP_HIT_SUFFIX } marker_bac_list_original = [] for db_marker in Config.BAC120_MARKERS.keys(): marker_bac_list_original.extend([ marker.replace(".HMM", "").replace(".hmm", "") for marker in Config.BAC120_MARKERS[db_marker] ]) marker_arc_list_original = [] for db_marker in Config.AR122_MARKERS.keys(): marker_arc_list_original.extend([ marker.replace(".HMM", "").replace(".hmm", "") for marker in Config.AR122_MARKERS[db_marker] ]) for db_genome_id, info in gene_dict.items(): unique_genes_bac, multi_hits_bac, missing_genes_bac = [], [], [] unique_genes_arc, multi_hits_arc, missing_genes_arc = [], [], [] gene_bac_dict, gene_arc_dict = {}, {} path = info.get("aa_gene_path") for _marker_db, marker_suffix in marker_dbs.iteritems(): # get all gene sequences protein_file = str(path) tophit_path = protein_file.replace( ConfigMetadata.PROTEIN_FILE_SUFFIX, marker_suffix) # we load the list of all the genes detected in the genome all_genes_dict = read_fasta(protein_file, False) # Prodigal adds an asterisks at the end of each called genes. # These asterisks sometimes appear in the MSA, which can be # an issue for some downstream software for seq_id, seq in all_genes_dict.iteritems(): if seq[-1] == '*': all_genes_dict[seq_id] = seq[:-1] # we store the tophit file line by line and store the # information in a dictionary with open(tophit_path) as tp: # first line is header line tp.readline() for line_tp in tp: linelist = line_tp.split("\t") genename = linelist[0] sublist = linelist[1] if ";" in sublist: diff_markers = sublist.split(";") else: diff_markers = [sublist] for each_mark in diff_markers: sublist = each_mark.split(",") markerid = sublist[0] if (markerid not in marker_bac_list_original and markerid not in marker_arc_list_original): continue if markerid in marker_bac_list_original: if markerid in gene_bac_dict: gene_bac_dict.get( markerid)["multihit"] = True else: gene_bac_dict[markerid] = { "gene": genename, "multihit": False } if markerid in marker_arc_list_original: if markerid in gene_arc_dict: gene_arc_dict.get( markerid)["multihit"] = True else: gene_arc_dict[markerid] = { "gene": genename, "multihit": False } for mid in marker_bac_list_original: if mid not in gene_bac_dict: missing_genes_bac.append(mid) elif gene_bac_dict[mid]["multihit"]: multi_hits_bac.append(mid) else: unique_genes_bac.append(mid) for mid in marker_arc_list_original: if mid not in gene_arc_dict: missing_genes_arc.append(mid) elif gene_arc_dict[mid]["multihit"]: multi_hits_arc.append(mid) else: unique_genes_arc.append(mid) bac_outfile.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\n".format( db_genome_id, len(unique_genes_bac), len(multi_hits_bac), len(missing_genes_bac), ','.join(unique_genes_bac), ','.join(multi_hits_bac), ','.join(missing_genes_bac))) arc_outfile.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\n".format( db_genome_id, len(unique_genes_arc), len(multi_hits_arc), len(missing_genes_arc), 
','.join(unique_genes_arc), ','.join(multi_hits_arc), ','.join(missing_genes_arc))) bac_outfile.close() arc_outfile.close()
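# --- Illustrative sketch (not part of the original code path) ---
# _report_identified_marker_genes above classifies every marker of a set as
# unique, multi-copy or missing by reading the "top hit" files produced for
# each genome.  The minimal parser below reproduces that bookkeeping for a
# single top-hit file.  The assumed layout (header line, then
# "gene<TAB>marker,evalue,bitscore[;marker,evalue,bitscore...]") mirrors the
# parsing done above; the function name is hypothetical.

def summarise_marker_hits(tophit_file, marker_set):
    """Return (unique, multi, missing) marker id lists for one genome."""
    hit_count = {}
    with open(tophit_file) as fh:
        fh.readline()  # skip header line
        for line in fh:
            gene_id, hits = line.rstrip('\n').split('\t')[:2]
            for hit in hits.split(';'):
                marker_id = hit.split(',')[0]
                if marker_id in marker_set:
                    hit_count[marker_id] = hit_count.get(marker_id, 0) + 1
    unique = [m for m in marker_set if hit_count.get(m, 0) == 1]
    multi = [m for m in marker_set if hit_count.get(m, 0) > 1]
    missing = [m for m in marker_set if m not in hit_count]
    return unique, multi, missing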
def _runHmmMultiAlign(self, db_genome_id, path, marker_ids): ''' Selects markers that are not aligned for a specific genome. :param db_genome_id: Selected genome :param path: Path to the genomic fasta file for the genome :param marker_ids: list of marker ids for the selected sets ''' temp_con = GenomeDatabaseConnection() temp_con.MakePostgresConnection(self.release) temp_cur = temp_con.cursor() # gather information for all marker genes final_genome = [] final_markerid = [] final_seq = [] final_multihits = [] final_evalue = [] final_bitscore = [] marker_dbs = { "PFAM": self.pfam_top_hit_suffix, "TIGR": self.tigrfam_top_hit_suffix } for marker_db, marker_suffix in marker_dbs.iteritems(): query = ( "SELECT m.id_in_database,m.marker_file_location,m.size,m.id " + "FROM genomes as g, markers as m " + "LEFT JOIN marker_databases as md " + "ON md.id=m.marker_database_id " + "WHERE NOT EXISTS (" + "SELECT * FROM aligned_markers as am " + "WHERE am.genome_id = g.id and am.marker_id = m.id) " + "AND g.id = %s " + "AND m.id in %s " + "AND md.external_id_prefix like %s") temp_cur.execute(query, (db_genome_id, tuple(marker_ids, ), marker_db)) raw_results = temp_cur.fetchall() marker_dict_original = { a: { "path": b, "size": c, "db_marker_id": d } for a, b, c, d in raw_results } # get all gene sequences genome_path = str(path) tophit_path = genome_path.replace(self.protein_file_suffix, marker_suffix) # we load the list of all the genes detected in the genome protein_file = tophit_path.replace(marker_suffix, self.protein_file_suffix) all_genes_dict = read_fasta(protein_file, False) # Prodigal adds an asterisks at the end of each called genes, # These asterisks sometimes appear in the MSA, which can be an issue for some softwares downstream for seq_id, seq in all_genes_dict.iteritems(): if seq[-1] == '*': all_genes_dict[seq_id] = seq[:-1] # we store the tophit file line by line and store the # information in a dictionary with open(tophit_path) as tp: # first line is header line tp.readline() gene_dict = {} for line_tp in tp: linelist = line_tp.split("\t") genename = linelist[0] sublist = linelist[1] if ";" in sublist: diff_markers = sublist.split(";") else: diff_markers = [sublist] for each_gene in diff_markers: sublist = each_gene.split(",") markerid = sublist[0] if markerid not in marker_dict_original: continue evalue = sublist[1] bitscore = sublist[2].strip() if markerid in gene_dict: oldbitscore = gene_dict.get(markerid).get( "bitscore") if oldbitscore < bitscore: gene_dict[markerid] = { "marker_path": marker_dict_original.get(markerid).get( "path"), "gene": genename, "gene_seq": all_genes_dict.get(genename), "evalue": evalue, "bitscore": bitscore, "multihit": True, "db_marker_id": marker_dict_original.get(markerid).get( "db_marker_id") } else: gene_dict.get(markerid)["multihit"] = True else: gene_dict[markerid] = { "marker_path": marker_dict_original.get(markerid).get("path"), "gene": genename, "gene_seq": all_genes_dict.get(genename), "evalue": evalue, "bitscore": bitscore, "multihit": False, "db_marker_id": marker_dict_original.get(markerid).get( "db_marker_id") } for mid, info in marker_dict_original.iteritems(): if mid not in gene_dict: final_genome.append(db_genome_id) final_markerid.append(info.get("db_marker_id")) final_seq.append("-" * info.get("size")) final_multihits.append(False) final_evalue.append(None) final_bitscore.append(None) result_aligns = self._runHmmAlign(gene_dict, db_genome_id) for result_align in result_aligns: final_genome.append(result_align[0]) 
final_markerid.append(result_align[1]) final_seq.append(result_align[2]) final_multihits.append(result_align[3]) final_evalue.append(result_align[4]) final_bitscore.append(result_align[5]) if final_genome: query = "SELECT upsert_aligned_markers(%s,%s,%s,%s,%s,%s)" temp_cur.execute(query, (final_genome, final_markerid, final_seq, final_multihits, final_evalue, final_bitscore)) temp_con.commit() temp_cur.close() temp_con.ClosePostgresConnection() return True
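# --- Illustrative sketch (not part of the original code path) ---
# _runHmmMultiAlign keeps, for each marker, the single gene with the best
# bitscore and flags the marker as multi-hit whenever more than one gene
# matched it.  The helper below shows that selection in isolation.  Top-hit
# rows are assumed to be (gene_id, marker_id, evalue, bitscore) tuples; note
# the explicit float conversion, since bitscores read from a text file should
# be compared numerically rather than as strings.

def best_hit_per_marker(tophit_rows):
    """Return {marker_id: {'gene', 'evalue', 'bitscore', 'multihit'}}."""
    best = {}
    for gene_id, marker_id, evalue, bitscore in tophit_rows:
        bitscore = float(bitscore)
        if marker_id in best:
            best[marker_id]['multihit'] = True
            if bitscore > best[marker_id]['bitscore']:
                best[marker_id].update(gene=gene_id,
                                       evalue=evalue,
                                       bitscore=bitscore)
        else:
            best[marker_id] = {'gene': gene_id,
                               'evalue': evalue,
                               'bitscore': bitscore,
                               'multihit': False}
    return best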
def _parse_sequence_file(self, fna_file, prefix, ssu_query_id):
    """Extract the sequence of the identified SSU gene and report it as metadata."""
    metadata = []
    all_genes_dict = read_fasta(fna_file, False)
    sequence = all_genes_dict[ssu_query_id]
    metadata.append(('{0}_sequence'.format(prefix), sequence))
    return metadata
def __workerThread(self, probe_size, probe_step_size, mismatch, min_aln_len,
                   keep_fragments, results_dir, queueIn, queueOut):
    """Process each data item in parallel.

    The reference genome is the genome from which probes are designed. To
    determine how many of these reference probes will hybridize to the target
    genome, the target genome is fragmented into probe-sized windows and each
    window is checked for near identity to the reference genome.
    """

    while True:
        ref_genome, target_genome = queueIn.get(block=True, timeout=None)
        if ref_genome is None:
            break

        ref_name = ntpath.basename(ref_genome).replace('.fasta', '').replace('.fna', '')
        target_name = ntpath.basename(target_genome).replace('.fasta', '').replace('.fna', '')

        if keep_fragments:
            fragment_dir = os.path.join(results_dir, 'fragments')
        else:
            fragment_dir = tempfile.mkdtemp()

        # count total number of reference genome probes
        ref_seqs = read_fasta(ref_genome)
        ref_genome_size = 0
        num_ref_probes = 0
        for seq in ref_seqs.values():
            num_ref_probes += (len(seq) - probe_size) / probe_step_size + 1
            # i.e. sum([1 for i in range(0, len(seq)-probe_size, probe_step_size)])
            ref_genome_size += len(seq)

        # fragment target genome into probe sized windows
        window_file = os.path.join(fragment_dir, ref_name + '~' + target_name + '.fna')
        fout = open(window_file, 'w')
        target_seqs = read_fasta(target_genome)
        num_target_probes = 0
        target_windows = {}
        target_genome_size = 0
        for seq in target_seqs.values():
            target_genome_size += len(seq)
            for i in range(0, len(seq) - probe_size, probe_step_size):
                fout.write('>probe_%d\n' % num_target_probes)
                fout.write(seq[i:i + probe_size] + '\n')
                target_windows[str(num_target_probes)] = seq[i:i + probe_size]
                num_target_probes += 1
        fout.close()

        # BLAST target probes against reference genome
        output_table = os.path.join(results_dir, ref_name + '~' + target_name + '.blast_hits.tsv')
        self._blastn(window_file, ref_genome, output_table,
                     evalue=1e-2, max_matches=1, task='dc-megablast')

        window_hits = set()
        failed_similarity_test = set()
        output_file = os.path.join(results_dir, ref_name + '~' + target_name + '.probe_hits.tsv')
        fout = open(output_file, 'w')
        fout.write('Probe ID\tSubject ID\tProbe percent alignment\tPercent identity\tAdjusted percent identity\n')
        for hit in self._read_hit(output_table):
            adj_aln_len = hit.aln_len - hit.gaps
            query_aln_frac = adj_aln_len * 100.0 / hit.query_len
            adjusted_perc_identity = (adj_aln_len - hit.mismatch) * 100.0 / hit.query_len
            if (query_aln_frac >= (100 * min_aln_len)
                    and adjusted_perc_identity >= (100 * (1.0 - mismatch))):
                if hit.query_id not in window_hits:
                    window_hits.add(hit.query_id)
                    fout.write('%s\t%s\t%.1f\t%.1f\t%.1f\n' % (hit.query_id,
                                                               hit.subject_id,
                                                               query_aln_frac,
                                                               hit.perc_identity,
                                                               adjusted_perc_identity))
            else:
                failed_similarity_test.add(hit.query_id)
        fout.close()

        num_failed_similarity_test = len(target_windows) - len(window_hits)

        # write summary statistics for this genome pair
        output_file = os.path.join(results_dir, ref_name + '~' + target_name + '.summary.tsv')
        fout = open(output_file, 'w')
        fout.write('Reference ID\tReference genome size (bp)')
        fout.write('\tTarget ID\tTarget genome size (bp)')
        fout.write('\tNo. reference probes\tNo. target probes\tNo. hybridized probes\tNo. probes failing genomic similarity test')
        fout.write('\tPredicted signal intensity\n')
        fout.write('%s\t%d' % (ref_name, ref_genome_size))
        fout.write('\t%s\t%d' % (target_name, target_genome_size))
        fout.write('\t%d\t%d\t%d\t%d' % (num_ref_probes,
                                         len(target_windows),
                                         len(window_hits),
                                         num_failed_similarity_test))
        fout.write('\t%.1f' % (len(window_hits) * 100.0 / len(target_windows)))
        fout.write('\n')
        fout.close()

        if not keep_fragments:
            shutil.rmtree(fragment_dir)

        # allow results to be processed or written to file
        queueOut.put(ref_name)
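# --- Illustrative sketch (not part of the original code path) ---
# Two quantities drive the hybridization test above: the target genome is cut
# into probe-sized windows every probe_step_size bases, and each BLAST hit is
# scored with an "adjusted" percent identity in which gap columns are removed
# from the alignment length before mismatches are subtracted.  The helpers
# below restate those two calculations; the names are hypothetical.

def probe_windows(seq, probe_size, probe_step_size):
    """Yield successive probe-sized windows across a sequence."""
    for i in range(0, len(seq) - probe_size, probe_step_size):
        yield seq[i:i + probe_size]


def adjusted_percent_identity(aln_len, gaps, mismatches, query_len):
    """Percent identity over the full query after discounting gap columns."""
    adj_aln_len = aln_len - gaps
    return (adj_aln_len - mismatches) * 100.0 / query_len

# A window is considered to hybridize when its aligned fraction is at least
# 100*min_aln_len and its adjusted identity is at least 100*(1 - mismatch),
# matching the thresholds applied in __workerThread above.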
def run(self, homolog_file, gene_id_file, taxonomy_file, min_per_taxa, consensus, min_per_bp, use_trimAl, msa_program, tree_program, prot_model, output_dir): """Infer a tree over a reduced set of genes. Filter a set of homolgs to a specified set of gene ids, and infer tree over this reduced set of proteins. Parameters ---------- homolog_file : str Fasta file containing homologs. gene_ids : str File with gene ids to retain in tree. taxonomy_file : str Taxonomic assignment of each reference genomes. min_per_taxa : float Minimum percentage of taxa required to retain a column. consensus : float Minimum percentage of the same amino acid required to retain column. min_per_bp : float Minimum percentage of base pairs required to keep trimmed sequence. use_trimAl : boolean Filter columns using trimAl. msa_program : str Program to use for multiple sequence alignment ['mafft', 'muscle']. tree_program : str Program to use for tree inference ['fasttree', 'raxml']. prot_model : str Protein substitution model for tree inference ['WAG', 'LG', 'AUTO']. output_dir: str Output directory. """ # generate msa with reduced sequences self.logger.info('Extracting sequences to retain.') genes_to_retain = self.read_ids(gene_id_file) self.logger.info(' ...identified %d sequences to retain.' % len(genes_to_retain)) seqs = seq_io.read_fasta(homolog_file) reduced_seqs = {} for seq_id, seq in seqs.iteritems(): if seq_id in genes_to_retain: reduced_seqs[seq_id] = seq reduced_homolog_file = homolog_file[0:homolog_file.rfind('.')] reduced_homolog_file += '.reduced.' + homolog_file[homolog_file. rfind('.') + 1:] seq_io.write_fasta(reduced_seqs, reduced_homolog_file) self.logger.info('Retained %d sequences.' % len(reduced_seqs)) # infer multiple sequence alignment msa = MsaWorkflow(self.cpus) trimmed_msa_output = msa.run(reduced_homolog_file, min_per_taxa, consensus, min_per_bp, use_trimAl, msa_program, output_dir) # infer tree tw = TreeWorkflow(self.cpus) tree_output = tw.run(trimmed_msa_output, tree_program, prot_model, False, output_dir) # create tax2tree consensus map and decorate tree self.logger.info('Decorating internal tree nodes with tax2tree.') t2t_tree = tree_output.replace('.tree', '.tax2tree.tree') os.system('t2t decorate -m %s -t %s -o %s' % (taxonomy_file, tree_output, t2t_tree))
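# --- Illustrative sketch (not part of the original code path) ---
# Before handing off to the MSA and tree workflows, run() above does two
# simple things: it keeps only the homologs whose ids appear in the gene id
# file, and it writes them to a file named by inserting ".reduced" before the
# original extension.  The helpers below restate both steps; the names are
# hypothetical.

def filter_seqs_by_id(seqs, ids_to_keep):
    """Return the subset of a {seq_id: seq} dict whose ids are in ids_to_keep."""
    return {seq_id: seq for seq_id, seq in seqs.items() if seq_id in ids_to_keep}


def reduced_file_name(homolog_file, tag='reduced'):
    """Insert a tag before the file extension.

    e.g. 'homologs.faa' -> 'homologs.reduced.faa'
    """
    stem, ext = homolog_file.rsplit('.', 1)
    return '{0}.{1}.{2}'.format(stem, tag, ext)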
def _parse_taxonomy_file(self, genome_id, metadata_taxonomy_file, fout, prefix, fna_file, summary_file=None): """Parse metadata file with taxonomic information for 16S rRNA genes. Parameters ---------- genome_id : str Unique identifier of genome. metadata_taxonomy_file : str Full path to file containing rRNA metadata. fout : file Output stream to populate with metadata. Prefix : str Prefix to append to metadata fields. Returns ------- int Number of 16S rRNA genes identified in genome. """ if not os.path.exists(metadata_taxonomy_file): return 0 with open(metadata_taxonomy_file) as f: header_line = f.readline() # consume header line if prefix not in self.taxonomy_headers: self.taxonomy_headers.add(prefix) fout.write('genome_id') headers = [ prefix + '_' + x.strip().replace('ssu_', '') for x in header_line.split('\t') ] headers.append("{0}_sequence".format(prefix)) headers.append("{0}_contig_len".format(prefix)) if prefix == 'lsu_silva_23s': for n, i in enumerate(headers): if i == 'lsu_silva_23s_sequence': headers[n] = 'lsu_23s_sequence' elif i == 'lsu_silva_23s_query_id': headers[n] = 'lsu_23s_query_id' elif i == 'lsu_silva_23s_length': headers[n] = 'lsu_23s_length' elif i == 'lsu_silva_23s_contig_len': headers[n] = 'lsu_23s_contig_len' elif prefix == 'ssu_silva': for n, i in enumerate(headers): if i == 'ssu_silva_sequence': headers[n] = 'ssu_sequence' elif i == 'ssu_silva_query_id': headers[n] = 'ssu_query_id' elif i == 'ssu_silva_length': headers[n] = 'ssu_length' elif i == 'ssu_silva_contig_len': headers[n] = 'ssu_contig_len' fout.write('\t' + '\t'.join(headers) + "\n") # Check the CheckM headers are consistent split_headers = header_line.rstrip().split("\t") for pos in range(0, len(split_headers)): header = split_headers[pos] if header == 'query_id': query_id_pos = pos break # Report hit to longest 16S rRNA gene. It is possible that # the HMMs identified a putative 16S rRNA gene, but that # there was no valid BLAST hit. longest_query_len = 0 longest_ssu_hit_info = None identified_ssu_genes = 0 for line in f: line_split = line.strip().split('\t') query_len = int(line_split[2]) if query_len > longest_query_len: longest_query_len = query_len longest_ssu_hit_info = line_split ssu_query_id = line_split[query_id_pos] if longest_ssu_hit_info: fout.write(genome_id) fout.write('\t' + '\t'.join(longest_ssu_hit_info)) all_genes_dict = read_fasta(fna_file, False) sequence = all_genes_dict[ssu_query_id] fout.write('\t{0}'.format(sequence)) if summary_file is not None and os.path.exists(summary_file): with open(summary_file) as fsum: header_line = fsum.readline() # consume header line header_list = [ x.strip() for x in header_line.split('\t') ] idx_seq = header_list.index("Sequence length") for line in fsum: identified_ssu_genes += 1 sum_list = [x.strip() for x in line.split('\t')] if sum_list[0] == ssu_query_id: fout.write("\t{0}".format(sum_list[idx_seq])) fout.write('\n') return identified_ssu_genes
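# --- Illustrative sketch (not part of the original code path) ---
# _parse_taxonomy_file reports only the longest 16S rRNA hit for a genome,
# since the HMMs may identify several putative genes.  The helper below shows
# that selection on pre-split metadata rows, where the sequence length is
# assumed to sit in the third column (index 2) as in the parsing above; the
# function name is hypothetical.

def longest_hit(rows, length_idx=2):
    """Return the row with the largest sequence length, or None if empty."""
    best_row = None
    best_len = 0
    for row in rows:
        row_len = int(row[length_idx])
        if row_len > best_len:
            best_len, best_row = row_len, row
    return best_row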
def _producer(self, genome_pair): """Identify reciprocal best blast hits between pairs of genomes. Parameters ---------- genome_pair : list Identifier of genomes to process. """ blast_stream = open(self.blast_table, 'rb', 32 * (10 ** 6)) genome_fileA, genome_fileB = genome_pair # count number of genes in each genome genes_in_genomeA = seq_io.read_fasta(genome_fileA) genes_in_genomeB = seq_io.read_fasta(genome_fileB) genome_idA = remove_extension(genome_fileA) genome_idB = remove_extension(genome_fileB) # find blast hits between genome A and B, and vice versa hitsAB = self._valid_hits(blast_stream, self.offset_table, self.per_identity_threshold, self.per_aln_len_threshold, genome_idA, genome_idB) hitsBA = self._valid_hits(blast_stream, self.offset_table, self.per_identity_threshold, self.per_aln_len_threshold, genome_idB, genome_idA) # report reciprocal best blast hits if self.write_shared_genes: fout_seqs = open(os.path.join(self.shared_genes_dir, genome_idA + '-' + genome_idB + '.shared_genes.faa'), 'w') fout_stats = open(os.path.join(self.shared_genes_dir, genome_idA + '-' + genome_idB + '.rbb_hits.tsv'), 'w') fout_stats.write(genome_idA + '\t' + genome_idB + '\tPercent Identity\tPercent Alignment Length\te-value\tbitscore\n') per_identity_hits = [] for query_id, hit_stats in hitsAB.iteritems(): subject_id, per_identA, per_aln_lenA, evalueA, bitscoreA = hit_stats if subject_id in hitsBA and query_id == hitsBA[subject_id][0]: _subject_id, per_identB, per_aln_lenB, evalueB, bitscoreB = hitsBA[subject_id] # take average of statistics in both blast directions as # the results will be similar, but not identical per_ident = 0.5 * (per_identA + per_identB) per_identity_hits.append(per_ident) per_aln_len = 0.5 * (per_aln_lenA + per_aln_lenB) evalue = 0.5 * (evalueA + evalueB) bitscore = 0.5 * (bitscoreA + bitscoreB) fout_stats.write('%s\t%s\t%.2f\t%.2f\t%.2g\t%.2f\n' % (query_id, subject_id, per_ident, per_aln_len, evalue, bitscore)) # write out shared genes if self.write_shared_genes: fout_seqs.write('>' + query_id + '\n') fout_seqs.write(genes_in_genomeA[query_id] + '\n') fout_seqs.write('>' + subject_id + '\n') fout_seqs.write(genes_in_genomeB[subject_id] + '\n') if self.write_shared_genes: fout_seqs.close() fout_stats.close() mean_per_identity_hits = 0 if len(per_identity_hits) > 0: mean_per_identity_hits = mean(per_identity_hits) std_per_identity_hits = 0 if len(per_identity_hits) >= 2: std_per_identity_hits = std(per_identity_hits) return (genome_idA, len(genes_in_genomeA), genome_idB, len(genes_in_genomeB), len(per_identity_hits), mean_per_identity_hits, std_per_identity_hits)
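# --- Illustrative sketch (not part of the original code path) ---
# The producer above calls a pair of genes reciprocal best hits when the best
# hit of a gene in genome A points to a gene in genome B whose own best hit
# points straight back, and it then averages the statistics from the two
# BLAST directions.  hits_ab and hits_ba are assumed to map query ids to
# (subject_id, perc_identity, perc_aln_len, evalue, bitscore) tuples, as in
# the dictionaries returned by _valid_hits above; the function name is
# hypothetical.

def reciprocal_best_hits(hits_ab, hits_ba):
    """Yield (query_id, subject_id, mean identity, aln len, evalue, bitscore)."""
    for query_id, (subject_id, identA, alnA, evalA, bitA) in hits_ab.items():
        if subject_id in hits_ba and hits_ba[subject_id][0] == query_id:
            _, identB, alnB, evalB, bitB = hits_ba[subject_id]
            yield (query_id, subject_id,
                   0.5 * (identA + identB),
                   0.5 * (alnA + alnB),
                   0.5 * (evalA + evalB),
                   0.5 * (bitA + bitB))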