def run(self, genome_files, scaffold_file, min_seq_len):
    """Identify scaffolds not assigned to any bin.

    Parameters
    ----------
    genome_files : list of str
        Fasta files of genomes to process.
    scaffold_file : str
        Scaffolds binned to generate putative genomes.
    min_seq_len : int
        Ignore scaffolds shorter than the specified length.

    Returns
    -------
    dict : d[seq_id] -> seq
        Dictionary of unbinned sequences.
    """

    check_file_exists(scaffold_file)

    # get list of sequences in bins
    self.logger.info('Reading binned scaffolds.')

    binned_seq_ids = set()
    total_binned_bases = 0
    for genome_file in genome_files:
        for seq_id, seq in seq_io.read_seq(genome_file):
            binned_seq_ids.add(seq_id)
            total_binned_bases += len(seq)

    self.logger.info('Read %d (%.2f Mbp) binned scaffolds.' % (
        len(binned_seq_ids),
        float(total_binned_bases) / 1e6))

    # write all unbinned sequences
    self.logger.info('Identifying unbinned scaffolds >= %d bp.' % min_seq_len)

    unbinned_bases = 0
    unbinned_seqs = {}
    for seq_id, seq in seq_io.read_seq(scaffold_file):
        if seq_id not in binned_seq_ids and len(seq) >= min_seq_len:
            unbinned_seqs[seq_id] = seq
            unbinned_bases += len(seq)

    self.logger.info('Identified %d (%.2f Mbp) unbinned scaffolds.' % (
        len(unbinned_seqs),
        float(unbinned_bases) / 1e6))

    self.logger.info('Percentage of unbinned scaffolds: %.2f%%' % (
        len(unbinned_seqs) * 100.0 / (len(unbinned_seqs) + len(binned_seq_ids))))
    self.logger.info('Percentage of unbinned bases: %.2f%%' % (
        unbinned_bases * 100.0 / (unbinned_bases + total_binned_bases)))

    return unbinned_seqs
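# Usage sketch for the method above. The owning class is not shown in this
# snippet, so 'Unbinned' below is a hypothetical name used for illustration;
# seq_io is assumed to be biolib's sequence I/O module.
#
# unbinned_seqs = Unbinned().run(genome_files=['bin1.fna', 'bin2.fna'],
#                                scaffold_file='scaffolds.fna',
#                                min_seq_len=1000)
# seq_io.write_fasta(unbinned_seqs, 'unbinned.fna')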
def _fragment_genomes(self, genome_file, window_size, step_size, profile, fout):
    """Fragment genome sequences into fragments of a fixed size.

    This is a helper function for fragmenting sequences within
    a genome which will be classified in order to create a
    taxonomic profile.

    Parameters
    ----------
    genome_file : str
        Fasta file with genome sequences to fragment.
    window_size : int
        Size of each fragment.
    step_size : int
        Number of bases to move after each window.
    profile : Profile
        Class for classifying fragments.
    fout : stream
        Output stream to store all fragments.
    """

    for seq_id, seq in seq_io.read_seq(genome_file):
        fragments = seq_tk.fragment(seq, window_size, step_size)
        for i, frag in enumerate(fragments):
            fout.write('>' + seq_id + '~' + str(i) + '\n')
            fout.write(frag + '\n')

        profile.fragments_from_seq[seq_id] = len(fragments)
        profile.seq_len[seq_id] = len(seq)
def concatenate_gene_files(gene_files, concatenated_gene_file):
    """Combine all gene files into a single file.

    Gene ids are modified to include genome ids in order to ensure
    all gene identifiers are unique across the set of genomes.

    Parameters
    ----------
    gene_files : list of str
        Fasta files of called genes to process.
    concatenated_gene_file : str
        Name of file to contain concatenated gene files.
    """

    fout = open(concatenated_gene_file, 'w')

    for gf in gene_files:
        genome_id = remove_extension(gf)
        for seq_id, seq in seq_io.read_seq(gf):
            fout.write('>' + genome_id + '~' + seq_id + '\n')
            # trim trailing stop codon marker added by some gene callers;
            # guard against empty sequences
            if seq and seq[-1] == '*':
                seq = seq[0:-1]
            fout.write(seq + '\n')

    fout.close()
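# Usage sketch for concatenate_gene_files (file names are hypothetical).
# Given bin1.faa and bin2.faa, genes are written with headers such as
# '>bin1~gene_001' so identifiers remain unique across the set of genomes:
#
# concatenate_gene_files(['bin1.faa', 'bin2.faa'], 'all_genes.faa')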
def read_bins(bin_dirs):
    """Read sequences in bins."""

    bins = defaultdict(lambda: defaultdict(set))
    contigs = {}
    contigs_in_bins = defaultdict(lambda: {})
    for method_id, (bin_dir, bin_ext) in bin_dirs.items():
        for bf in os.listdir(bin_dir):
            if not bf.endswith(bin_ext):
                continue

            bin_id = bf[0:bf.rfind(bin_ext)]
            if bin_id[-1] == '.':
                bin_id = bin_id[0:-1]

            bf_path = os.path.join(bin_dir, bf)
            for seq_id, seq in seq_io.read_seq(bf_path):
                bins[method_id][bin_id].add(seq_id)
                contigs[seq_id] = seq
                contigs_in_bins[seq_id][method_id] = bin_id

            if len(bins[method_id][bin_id]) == 0:
                # module-level function, so there is no self.logger;
                # report empty bins through the standard logging module
                logging.warning('Bin %s from %s is empty.' % (bf, method_id))

    return bins, contigs, contigs_in_bins
def create_arb_metadata(self, msa_output, taxonomy, metadata, output_file):
    """Create metadata file suitable for import into ARB.

    Parameters
    ----------
    msa_output : str
        Fasta file with aligned homologs.
    taxonomy : d[genome_id] -> list of taxa
        Taxonomic information for genomes.
    metadata : d[key] -> string
        Additional metadata to write to ARB file.
    output_file : str
        File to write metadata information.
    """

    arb_metadata_list = []
    for seq_id, seq in seq_io.read_seq(msa_output):
        arb_metadata = {}
        arb_metadata['db_name'] = seq_id
        arb_metadata['genome_id'] = seq_id
        arb_metadata['gtdb_tax_string'] = ';'.join(taxonomy.get(seq_id, ''))
        arb_metadata['aligned_seq'] = seq

        for k, v in metadata.items():
            arb_metadata[k] = v

        arb_metadata_list.append(arb_metadata)

    fout = open(output_file, 'w')
    arb_parser = ArbParser()
    arb_parser.write(arb_metadata_list, fout)
    fout.close()
def concatenate_gene_files(gene_files, concatenated_gene_file):
    """Combine all gene files into a single file.

    Gene ids are modified to include genome ids in order to ensure
    all gene identifiers are unique across the set of genomes.

    Parameters
    ----------
    gene_files : list of str
        Fasta files of called genes to process.
    concatenated_gene_file : str
        Name of file to contain concatenated gene files.
    """

    fout = open(concatenated_gene_file, 'w')

    for gf in gene_files:
        genome_id = remove_extension(gf)
        for seq_id, seq in seq_io.read_seq(gf):
            fout.write('>' + seq_id + '~' + genome_id + '\n')
            # trim trailing stop codon marker added by some gene callers;
            # guard against empty sequences
            if seq and seq[-1] == '*':
                seq = seq[0:-1]
            fout.write(seq + '\n')

    fout.close()
def run(self, ssu_gtdb_taxonomy_file, silva_parc_fasta_file):
    """Check INSDC primary accession numbers for GTDB SSU sequences with those in SILVA."""

    print('Reading SILVA INSDC accession numbers.')
    silva_ids = set()
    for seq_id, seq in seq_io.read_seq(silva_parc_fasta_file):
        silva_ids.add(seq_id)
    print('Read %d accession numbers.' % len(silva_ids))

    print('Checking GTDB SSU INSDC accession numbers.')
    missing_silva_acc = 0
    num_genes = 0
    for line in open(ssu_gtdb_taxonomy_file):
        line_split = line.strip().split('\t')

        gid = line_split[0]
        gene_id = line_split[1]
        gene_id = gene_id[0:gene_id.rfind('.')]
        start = int(line_split[2])
        stop = int(line_split[3])
        accession = '%s.%d.%d' % (gene_id, start, stop)
        num_genes += 1

        if accession not in silva_ids and (stop - start) > 300:
            print('Missing INSDC accession in SILVA for genome %s: %s (len=%d)'
                  % (gid, accession, stop - start))
            missing_silva_acc += 1

    print('Identified %d of %d (%.2f%%) genes without a SILVA accession.'
          % (missing_silva_acc, num_genes, missing_silva_acc * 100.0 / num_genes))
def add_compatible(self, scaffold_file, genome_file, compatible_file, min_len, out_genome):
    """Add sequences specified as compatible.

    Parameters
    ----------
    scaffold_file : str
        Fasta file containing scaffolds to add.
    genome_file : str
        Fasta file of binned scaffolds.
    compatible_file : str
        File specifying compatible scaffolds.
    min_len : int
        Minimum length to add scaffold.
    out_genome : str
        Name of output genome.
    """

    cur_bin_id = remove_extension(genome_file)

    # determine statistics for each potentially compatible scaffold
    scaffold_ids = set()
    with open(compatible_file) as f:
        headers = [x.strip() for x in f.readline().split('\t')]
        scaffold_gc_index = headers.index('Scaffold GC')
        genome_gc_index = headers.index('Median genome GC')
        td_dist_index = headers.index('Scaffold TD')
        scaffold_cov_index = headers.index('Scaffold coverage')
        genome_cov_index = headers.index('Median genome coverage')

        for line in f:
            line_split = line.split('\t')
            scaffold_id = line_split[0]
            bin_id = line_split[1].strip()

            scaffold_gc = float(line_split[scaffold_gc_index])
            genome_gc = float(line_split[genome_gc_index])
            gc_dist = abs(scaffold_gc - genome_gc)

            td_dist = float(line_split[td_dist_index])

            scaffold_cov = float(line_split[scaffold_cov_index])
            genome_cov = float(line_split[genome_cov_index])
            cov_dist = abs(scaffold_cov - genome_cov)

            if bin_id == cur_bin_id:
                scaffold_ids.add(scaffold_id)

    # add compatible sequences to genome
    added_seqs = 0
    genome_seqs = seq_io.read(genome_file)
    for seq_id, seq in seq_io.read_seq(scaffold_file):
        if seq_id in scaffold_ids:
            if len(seq) >= min_len:
                genome_seqs[seq_id] = seq
                added_seqs += 1

    self.logger.info('Added {:,} scaffolds meeting length criterion.'.format(added_seqs))

    # save modified bin
    seq_io.write_fasta(genome_seqs, out_genome)
def add_compatible_unique(self, scaffold_file, genome_file, compatible_file, min_len, out_genome):
    """Add sequences specified as compatible.

    Only sequences specified exactly once in the compatibility
    file are added.

    Parameters
    ----------
    scaffold_file : str
        Fasta file containing scaffolds to add.
    genome_file : str
        Fasta file of binned scaffolds.
    compatible_file : str
        File specifying compatible scaffolds.
    min_len : int
        Minimum length to add scaffold.
    out_genome : str
        Name of output genome.
    """

    cur_bin_id = remove_extension(genome_file)

    # determine scaffolds compatible with genome
    scaffold_ids = []
    bin_ids = {}
    with open(compatible_file) as f:
        f.readline()

        for line in f:
            line_split = line.split('\t')
            scaffold_id = line_split[0]
            bin_id = line_split[1].strip()

            scaffold_ids.append(scaffold_id)
            bin_ids[scaffold_id] = bin_id

    compatible_scaffolds = set()
    for scaffold_id, bin_id in bin_ids.items():
        if scaffold_ids.count(scaffold_id) == 1 and bin_id == cur_bin_id:
            compatible_scaffolds.add(scaffold_id)

    self.logger.info('Identified %d compatible scaffolds.' % len(compatible_scaffolds))

    # add compatible sequences to genome
    added_seqs = 0
    genome_seqs = seq_io.read(genome_file)
    for seq_id, seq in seq_io.read_seq(scaffold_file):
        if seq_id in compatible_scaffolds:
            if len(seq) >= min_len:
                genome_seqs[seq_id] = seq
                added_seqs += 1

    self.logger.info('Added %d scaffolds meeting length criterion.' % added_seqs)

    # save modified bin
    seq_io.write_fasta(genome_seqs, out_genome)
def run(self, gtdb_bac_taxonomy_file, gtdb_ar_taxonomy_file, silva_ssu_ref, silva_lsu_ref,
        ssu_blast_table, lsu_blast_table, output_dir):
    """Create table assigning GTDB taxonomy to SILVA accessions based on SSU and LSU BLAST results."""

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # read GTDB taxonomy
    print('Reading GTDB taxonomy.')
    gtdb_bac_taxonomy = Taxonomy().read(gtdb_bac_taxonomy_file)
    gtdb_ar_taxonomy = Taxonomy().read(gtdb_ar_taxonomy_file)
    gtdb_taxonomy = gtdb_bac_taxonomy.copy()
    gtdb_taxonomy.update(gtdb_ar_taxonomy)
    print('Identified %d bacterial genomes to process.' % len(gtdb_bac_taxonomy))
    print('Identified %d archaeal genomes to process.' % len(gtdb_ar_taxonomy))
    print('Identified %d genomes to process.' % len(gtdb_taxonomy))

    # read SILVA taxonomy
    print('Reading SILVA 16S and 23S rRNA taxonomies.')
    silva_ssu_taxonomy = {}
    for seq_id, seq, taxonomy in seq_io.read_seq(silva_ssu_ref, keep_annotation=True):
        silva_ssu_taxonomy[seq_id] = taxonomy

    silva_lsu_taxonomy = {}
    for seq_id, seq, taxonomy in seq_io.read_seq(silva_lsu_ref, keep_annotation=True):
        silva_lsu_taxonomy[seq_id] = taxonomy

    # parse BLAST tables
    print('Parsing BLAST tables.')
    ssu_table = os.path.join(output_dir, 'ssu_silva.tsv')
    self._parse_blast_table(ssu_blast_table, gtdb_taxonomy, silva_ssu_taxonomy, self.min_ssu_len, ssu_table)

    lsu_table = os.path.join(output_dir, 'lsu_silva.tsv')
    self._parse_blast_table(lsu_blast_table, gtdb_taxonomy, silva_lsu_taxonomy, self.min_lsu_len, lsu_table)
def read_msa_file(self, msa_file):
    """Determine percentage of amino acids for each genome in MSA file."""

    msa_perc = {}
    for seq_id, seq in read_seq(msa_file):
        seq = seq.upper()
        aa = len(seq) - seq.count('-') - seq.count('_') - seq.count('*')
        msa_perc[seq_id] = aa * 100.0 / len(seq)

    return msa_perc
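# Worked example of the percentage computation above, using only the standard
# library; the toy alignment is illustrative, not real data.
def _example_msa_percentage():
    seq = 'MK--LV*_AR'.upper()
    aa = len(seq) - seq.count('-') - seq.count('_') - seq.count('*')
    return aa * 100.0 / len(seq)  # 6 of 10 columns are residues -> 60.0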
def _gene_distribution(self, seq_file):
    """Calculate length distribution of sequences."""

    gene_lens = []
    for seq_id, seq in seq_io.read_seq(seq_file):
        gene_lens.append(len(seq))

    p10, p50, p90 = np_percentile(gene_lens, [10, 50, 90])

    return np_mean(gene_lens), max(gene_lens), min(gene_lens), p10, p50, p90
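# Quick check of the percentile call above; np_percentile and np_mean are
# assumed to be the usual 'from numpy import ...' aliases, so plain numpy is
# used here with toy lengths.
def _example_gene_distribution():
    import numpy as np
    gene_lens = [120, 300, 450, 800, 2400]
    p10, p50, p90 = np.percentile(gene_lens, [10, 50, 90])
    return np.mean(gene_lens), max(gene_lens), min(gene_lens), p10, p50, p90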
def _remove_stop_codons(self, input_file, output_file):
    """Remove stop codons at end of sequences."""

    fout = open(output_file, 'w')
    for seq_id, seq, annotation in seq_io.read_seq(input_file, keep_annotation=True):
        fout.write('>%s %s\n' % (seq_id, annotation))

        # trim trailing stop codon marker; guard against empty sequences
        if seq and seq[-1] == '*':
            seq = seq[0:-1]

        fout.write('%s\n' % seq)

    fout.close()
def place_genomes(self, user_msa_file, marker_set_id, out_dir, prefix):
    """Place genomes into reference tree using pplacer."""

    # rename user MSA file for compatibility with pplacer
    if not user_msa_file.endswith('.fasta'):
        t = os.path.join(out_dir, prefix + '.user_msa.fasta')
        shutil.copyfile(user_msa_file, t)
        user_msa_file = t

    # run pplacer to place bins in reference genome tree
    num_genomes = sum([1 for _seq_id, _seq in read_seq(user_msa_file)])

    # get path to pplacer reference package
    if marker_set_id == 'bac120':
        self.logger.info('Placing %d bacterial genomes into reference tree with pplacer (be patient).'
                         % num_genomes)
        pplacer_ref_pkg = os.path.join(Config.PPLACER_DIR, Config.PPLACER_BAC120_REF_PKG)
    elif marker_set_id == 'ar122':
        self.logger.info('Placing %d archaeal genomes into reference tree with pplacer (be patient).'
                         % num_genomes)
        pplacer_ref_pkg = os.path.join(Config.PPLACER_DIR, Config.PPLACER_AR122_REF_PKG)
    elif marker_set_id == 'rps23':
        self.logger.info('Placing %d genomes into reference tree with pplacer (be patient).'
                         % num_genomes)
        pplacer_ref_pkg = os.path.join(Config.PPLACER_DIR, Config.PPLACER_RPS23_REF_PKG)

    pplacer_out_dir = os.path.join(out_dir, 'pplacer')
    if not os.path.exists(pplacer_out_dir):
        os.makedirs(pplacer_out_dir)

    pplacer_out = os.path.join(pplacer_out_dir, 'pplacer.%s.out' % marker_set_id)
    pplacer_json_out = os.path.join(pplacer_out_dir, 'pplacer.%s.json' % marker_set_id)
    cmd = 'pplacer -j %d -c %s -o %s %s > %s' % (self.cpus,
                                                 pplacer_ref_pkg,
                                                 pplacer_json_out,
                                                 user_msa_file,
                                                 pplacer_out)
    os.system(cmd)

    # extract tree
    tree_file = os.path.join(out_dir, prefix + '.%s.classify.tree' % marker_set_id)
    cmd = 'guppy tog -o %s %s' % (tree_file, pplacer_json_out)
    os.system(cmd)

    return tree_file
def read_ssu_file(self, ssu_fasta_file):
    """Read length of SSU sequences for genomes."""

    ssu_length = {}
    for seq_id, seq in read_seq(ssu_fasta_file):
        gid = seq_id.split('~')[0]
        if gid in ssu_length and len(seq) < ssu_length[gid]:
            continue

        ssu_length[gid] = len(seq) - seq.upper().count('N')

    return ssu_length
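# Minimal sketch of the keep-longest bookkeeping above with toy records;
# the '~' convention pairs a genome id with a gene id (ids are illustrative).
def _example_longest_ssu():
    records = [('g1~ssu_1', 'ACGTACGT'), ('g1~ssu_2', 'ACGT'), ('g2~ssu_1', 'AANN')]
    ssu_length = {}
    for seq_id, seq in records:
        gid = seq_id.split('~')[0]
        if gid in ssu_length and len(seq) < ssu_length[gid]:
            continue
        ssu_length[gid] = len(seq) - seq.upper().count('N')
    return ssu_length  # {'g1': 8, 'g2': 2}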
def _derep_msa(self, msa_file, selected_taxa, output_msa):
    """Dereplicate multiple sequence alignment."""

    selected_taxa_labels = set()
    for taxon in selected_taxa:
        selected_taxa_labels.add(taxon.label)

    fout = open(output_msa, 'w')
    for seq_id, seq, annotation in read_seq(msa_file, keep_annotation=True):
        if seq_id in selected_taxa_labels:
            fout.write('>%s %s\n' % (seq_id, annotation))
            fout.write('%s\n' % seq)
    fout.close()
def modify(input_file, scaffold_file, seqs_to_add, seqs_to_remove, output_file):
    """Add or remove scaffolds from a fasta file.

    Parameters
    ----------
    input_file : str
        Fasta file to modify.
    scaffold_file : str
        Fasta file containing scaffolds to add.
    seqs_to_add : iterable
        Unique ids of scaffolds to add.
    seqs_to_remove : iterable
        Unique ids of scaffolds to remove.
    output_file : str
        Desired name of modified fasta file.

    Returns
    -------
    iterable, iterable
        Unique ids of sequences that could not be added,
        unique ids of sequences that could not be removed.
    """

    seqs = seq_io.read(input_file)

    # add sequences to bin
    failed_to_add = set()
    if seqs_to_add:
        failed_to_add = set(seqs_to_add)
        for seq_id, seq in seq_io.read_seq(scaffold_file):
            if seq_id in seqs_to_add:
                failed_to_add.remove(seq_id)
                seqs[seq_id] = seq

    # remove sequences from bin
    failed_to_remove = set()
    if seqs_to_remove:
        failed_to_remove = set(seqs_to_remove)
        for seq_id in seqs_to_remove:
            if seq_id in seqs:
                failed_to_remove.remove(seq_id)
                seqs.pop(seq_id)

    # save modified bin
    seq_io.write_fasta(seqs, output_file)

    return failed_to_add, failed_to_remove
def add_compatible_unique(self, scaffold_file, genome_file, compatible_file, out_genome):
    """Add sequences specified as compatible.

    Only sequences specified exactly once in the compatibility
    file are added.

    Parameters
    ----------
    scaffold_file : str
        Fasta file containing scaffolds to add.
    genome_file : str
        Fasta file of binned scaffolds.
    compatible_file : str
        File specifying compatible scaffolds.
    out_genome : str
        Name of output genome.
    """

    cur_bin_id = remove_extension(genome_file)

    # determine scaffolds compatible with genome
    scaffold_ids = []
    bin_ids = {}
    with open(compatible_file) as f:
        f.readline()

        for line in f:
            line_split = line.split('\t')
            scaffold_id = line_split[0]
            bin_id = line_split[1].strip()

            scaffold_ids.append(scaffold_id)
            bin_ids[scaffold_id] = bin_id

    compatible_scaffolds = set()
    for scaffold_id, bin_id in bin_ids.items():
        if scaffold_ids.count(scaffold_id) == 1 and bin_id == cur_bin_id:
            compatible_scaffolds.add(scaffold_id)

    # add compatible sequences to genome
    genome_seqs = seq_io.read(genome_file)
    for seq_id, seq in seq_io.read_seq(scaffold_file):
        if seq_id in compatible_scaffolds:
            genome_seqs[seq_id] = seq

    # save modified bin
    seq_io.write_fasta(genome_seqs, out_genome)
def unique(genome_files):
    """Check if sequences are assigned to multiple bins.

    Parameters
    ----------
    genome_files : iterable
        Path to genome fasta files.

    Returns
    -------
    dict : d[genome_id][genome_id] -> [shared sequences]
        List of any sequences within a genome observed multiple times.
    """

    # read sequence IDs from all genomes,
    # while checking for duplicate sequences within a genome
    duplicates = defaultdict(lambda: defaultdict(list))

    genome_seqs = {}
    for f in genome_files:
        genome_id = remove_extension(f)

        seq_ids = set()
        for seq_id, _seq in seq_io.read_seq(f):
            if seq_id in seq_ids:
                duplicates[genome_id][genome_id].append(seq_id)

            seq_ids.add(seq_id)

        genome_seqs[genome_id] = seq_ids

    # check for sequences assigned to multiple bins
    genome_ids = list(genome_seqs.keys())
    for i in range(0, len(genome_ids)):
        seq_idsI = genome_seqs[genome_ids[i]]

        for j in range(i + 1, len(genome_ids)):
            seq_idsJ = genome_seqs[genome_ids[j]]

            seq_intersection = seq_idsI.intersection(seq_idsJ)

            if len(seq_intersection) > 0:
                duplicates[genome_ids[i]][genome_ids[j]] = seq_intersection
                duplicates[genome_ids[j]][genome_ids[i]] = seq_intersection

    return duplicates
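# Usage sketch for unique (file names are hypothetical). Reports contigs
# shared between bins, as well as contigs repeated within a single bin:
#
# duplicates = unique(['bins/bin1.fna', 'bins/bin2.fna'])
# for gid1, shared in duplicates.items():
#     for gid2, seq_ids in shared.items():
#         print(gid1, gid2, sorted(seq_ids))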
def validate_seq_ids(query_proteins):
    """Ensure all sequence identifiers contain only acceptable characters.

    Parameters
    ----------
    query_proteins : str
        Fasta file containing query proteins.
    """

    invalid_chars = set('()[],;=')
    for seq_id, _seq in seq_io.read_seq(query_proteins):
        if any((c in invalid_chars) for c in seq_id):
            logging.getLogger('no_timestamp').error('Invalid sequence header in file %s' % query_proteins)
            logging.getLogger('no_timestamp').error('Sequence contains an invalid character: %s' % seq_id)
            logging.getLogger('no_timestamp').error('Sequence identifiers must not contain the following characters: '
                                                    + ''.join(invalid_chars))
            sys.exit()
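# The character check above in isolation, on a toy header that would be
# rejected because it contains ';' and '=':
def _example_invalid_header():
    invalid_chars = set('()[],;=')
    seq_id = 'contig_1;len=500'
    return any(c in invalid_chars for c in seq_id)  # True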
def _genome_seqs(self, genome_files):
    """Get unique id of sequences in each genome.

    Parameters
    ----------
    genome_files : iterable
        Genome files in fasta format.

    Returns
    -------
    dict : d[genome_id] -> set(seq_id1, ..., seq_idN)
        Ids of sequences in each genome.
    """

    genome_seqs = defaultdict(set)
    for genome_file in genome_files:
        genome_id = remove_extension(genome_file)
        for seq_id, _seq in seq_io.read_seq(genome_file):
            genome_seqs[genome_id].add(seq_id)

    return genome_seqs
def write_windows(self, scaffold_file, output_dir, window_size, window_gap):
    """Write windows created from the scaffolds in a fasta file.

    Takes a scaffold file in fasta format and writes a similarly named
    fasta file of the windows made from the scaffolds, along with a links
    file pairing each window with its source scaffold.

    Parameters
    ----------
    scaffold_file : str
        Name of the fasta file to turn into windows.

    Returns
    -------
    list
        Names of the window file and links file.
    """

    seq_win_id = {}   # pairs a scaffold with the windows made from it
    window_dict = {}  # dictionary of window_dict[win_id] = seq_win

    for seq_id, sequence in seq_io.read_seq(scaffold_file):
        win_id, seq_win = self.make_windows([seq_id, sequence], window_size, window_gap)
        seq_win_id[seq_id] = win_id
        for i in range(0, len(win_id)):
            window_dict[win_id[i]] = seq_win[i]

    filename = os.path.split(scaffold_file)[1]
    start = '.'.join(filename.split('.')[:-1])
    end = filename.split('.')[-1]
    window_file = os.path.join(output_dir, start + 'windows.' + end)

    print(len(window_dict))
    seq_io.write_fasta(window_dict, window_file)

    links_file = os.path.join(output_dir, 'links_file.tsv')
    self.write_links(seq_win_id, links_file)

    return [window_file, links_file]
def combine(self, ssu_msa, ssu_tree, lsu_msa, lsu_tree, output_dir):
    """Infer 16S + 23S tree spanning GTDB genomes."""

    # identify common 16S and 23S sequences
    ssu_seqs = {}
    for seq_id, seq, annotation in seq_io.read_seq(ssu_msa, keep_annotation=True):
        genome_id = seq_id.split('~')[0]
        ssu_seqs[genome_id] = [seq, annotation]
    self.logger.info('Read %d SSU rRNA sequences.' % len(ssu_seqs))

    lsu_seqs = {}
    for seq_id, seq, annotation in seq_io.read_seq(lsu_msa, keep_annotation=True):
        genome_id = seq_id.split('~')[0]
        lsu_seqs[genome_id] = [seq, annotation]
    self.logger.info('Read %d LSU rRNA sequences.' % len(lsu_seqs))

    common_seqs = set(ssu_seqs.keys()).intersection(set(lsu_seqs.keys()))
    self.logger.info('Identified %d sequences in common.' % len(common_seqs))

    # identify incongruent taxonomic order classifications between trees
    self.logger.info('Identifying incongruent order-level taxonomic classifications between trees.')
    ssu_taxonomy = Taxonomy().read_from_tree(ssu_tree)
    lsu_taxonomy = Taxonomy().read_from_tree(lsu_tree)

    order_index = Taxonomy.rank_labels.index('order')
    seqs_to_filter = set()
    for seq_id in common_seqs:
        ssu_order = ssu_taxonomy.get(seq_id)[order_index][3:]
        lsu_order = lsu_taxonomy.get(seq_id)[order_index][3:]

        # remove designator of paraphyletic orders
        # (since in the concatenated tree this may be resolved)
        ssu_order = ssu_order.split('_')[0]
        lsu_order = lsu_order.split('_')[0]

        if ssu_order != lsu_order:
            seqs_to_filter.add(seq_id)

    self.logger.info('Identified %d sequences with incongruent classifications.' % len(seqs_to_filter))
    common_seqs.difference_update(seqs_to_filter)

    # write out MSA
    concatenated_msa = os.path.join(output_dir, 'ssu_lsu_concatenated.fna')
    fout = open(concatenated_msa, 'w')
    for seq_id in common_seqs:
        fout.write('>%s %s %s\n' % (seq_id, ssu_seqs[seq_id][1], lsu_seqs[seq_id][1]))
        fout.write('%s%s\n' % (ssu_seqs[seq_id][0], lsu_seqs[seq_id][0]))
    fout.close()

    # infer tree
    output_tree = os.path.join(output_dir, 'ssu_lsu_concatenated.tree')
    os.system('FastTreeMP -nosupport -nt -gtr -gamma %s > %s' % (concatenated_msa, output_tree))
def add_compatible_closest(self, scaffold_file, genome_file, compatible_file, min_len, out_genome):
    """Add sequences specified as compatible.

    A sequence is added to a bin if and only if it is closest
    to that bin in GC, tetranucleotide, and coverage space.

    Parameters
    ----------
    scaffold_file : str
        Fasta file containing scaffolds to add.
    genome_file : str
        Fasta file of binned scaffolds.
    compatible_file : str
        File specifying compatible scaffolds.
    min_len : int
        Minimum length to add scaffold.
    out_genome : str
        Name of output genome.
    """

    cur_bin_id = remove_extension(genome_file)

    # determine statistics for each potentially compatible scaffold
    scaffold_ids = defaultdict(dict)
    with open(compatible_file) as f:
        headers = [x.strip() for x in f.readline().split('\t')]
        scaffold_gc_index = headers.index('Scaffold GC')
        genome_gc_index = headers.index('Median genome GC')
        td_dist_index = headers.index('Scaffold TD')
        scaffold_cov_index = headers.index('Scaffold coverage')
        genome_cov_index = headers.index('Median genome coverage')

        for line in f:
            line_split = line.split('\t')
            scaffold_id = line_split[0]
            bin_id = line_split[1].strip()

            scaffold_gc = float(line_split[scaffold_gc_index])
            genome_gc = float(line_split[genome_gc_index])
            gc_dist = abs(scaffold_gc - genome_gc)

            td_dist = float(line_split[td_dist_index])

            scaffold_cov = float(line_split[scaffold_cov_index])
            genome_cov = float(line_split[genome_cov_index])
            cov_dist = abs(scaffold_cov - genome_cov)

            scaffold_ids[scaffold_id][bin_id] = [gc_dist, td_dist, cov_dist]

    # determine scaffolds that are closest to a single bin
    # in terms of GC, tetranucleotide distance, and coverage
    compatible_scaffolds = set()
    for scaffold_id, bin_stats in scaffold_ids.items():
        best_gc = [1e9, None]
        best_td = [1e9, None]
        best_cov = [1e9, None]
        for bin_id, stats in bin_stats.items():
            gc, td, cov = stats
            if gc < best_gc[0]:
                best_gc = [gc, bin_id]
            if td < best_td[0]:
                best_td = [td, bin_id]
            if cov < best_cov[0]:
                best_cov = [cov, bin_id]

        # check if scaffold is closest to a single bin
        if (best_gc[1] == best_td[1] == best_cov[1]) and best_gc[1] == cur_bin_id:
            compatible_scaffolds.add(scaffold_id)

    self.logger.info('Identified {:,} compatible scaffolds.'.format(len(compatible_scaffolds)))

    # add compatible sequences to genome
    added_seqs = 0
    genome_seqs = seq_io.read(genome_file)
    for seq_id, seq in seq_io.read_seq(scaffold_file):
        if seq_id in compatible_scaffolds:
            if len(seq) >= min_len:
                genome_seqs[seq_id] = seq
                added_seqs += 1

    self.logger.info('Added {:,} scaffolds meeting length criterion.'.format(added_seqs))

    # save modified bin
    seq_io.write_fasta(genome_seqs, out_genome)
def run_seqs_file(self, producer, consumer, seq_file, progress=None):
    """Process sequences in parallel.

    The producer function must be specified and must not return None.
    Consumer and progress can be set to None.

    Parameters
    ----------
    producer : function
        Function to process data items.
    consumer : function
        Function to consume processed data items.
    seq_file : str
        Name of fasta/q file to read.
    progress : function
        Function to report progress string.

    Returns
    -------
    <user specified>
        Set by caller in the consumer function.
    """

    # populate producer queue with data to process
    seq_iter = seq_io.read_seq(seq_file)
    producer_queue = mp.Queue()
    read_all_seqs = False
    for _ in range(self.cpus):
        try:
            seq_data = next(seq_iter)
            producer_queue.put(seq_data)
        except StopIteration:
            read_all_seqs = True
            for _ in range(self.cpus):
                producer_queue.put(None)  # signal processes to terminate
            break

    # count total number of sequences to process
    data_items = sum(1 for _ in seq_io.read_seq(seq_file))

    try:
        consumer_queue = mp.Queue()
        manager_proc = mp.Process(target=self.__process_manager,
                                  args=(producer, producer_queue, consumer_queue))
        manager_proc.start()

        # process items produced by workers
        items_processed = 0
        consumer_data = None
        while True:
            if progress:
                status = progress(items_processed, data_items)
                sys.stdout.write('%s\r' % status)
                sys.stdout.flush()

            produced_data = consumer_queue.get(block=True, timeout=None)
            if produced_data is None:
                break

            if not read_all_seqs:
                try:
                    seq_data = next(seq_iter)
                    producer_queue.put(seq_data)
                except StopIteration:
                    read_all_seqs = True
                    for _ in range(self.cpus):
                        producer_queue.put(None)  # signal processes to terminate

            if consumer:
                consumer_data = consumer(produced_data, consumer_data)

            items_processed += 1

        if progress:
            sys.stdout.write('\n')

        manager_proc.join()

        return consumer_data
    except Exception as _err:
        print(sys.exc_info()[0])
        print(traceback.format_exc())
        self.logger.warning('Exception encountered while processing data.')
        manager_proc.terminate()
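# Usage sketch for run_seqs_file; the callbacks below are illustrative, and
# 'parallel' stands in for an instance of the owning class. The producer
# receives one (seq_id, seq) tuple per sequence and must not return None;
# the consumer folds produced items into an accumulator:
#
# def gc_content(seq_data):
#     seq_id, seq = seq_data
#     gc = sum(1 for b in seq.upper() if b in 'GC')
#     return seq_id, gc * 100.0 / len(seq)
#
# def tally(produced, total):
#     total = total if total is not None else {}
#     seq_id, gc = produced
#     total[seq_id] = gc
#     return total
#
# gc_by_seq = parallel.run_seqs_file(gc_content, tally, 'scaffolds.fna')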
def add_compatible_closest(self, scaffold_file, genome_file, compatible_file, out_genome):
    """Add sequences specified as compatible.

    A sequence is added to a bin if and only if it is closest
    to that bin in GC, tetranucleotide, and coverage space.

    Parameters
    ----------
    scaffold_file : str
        Fasta file containing scaffolds to add.
    genome_file : str
        Fasta file of binned scaffolds.
    compatible_file : str
        File specifying compatible scaffolds.
    out_genome : str
        Name of output genome.
    """

    cur_bin_id = remove_extension(genome_file)

    # determine statistics for each potentially compatible scaffold
    scaffold_ids = defaultdict(dict)
    with open(compatible_file) as f:
        headers = [x.strip() for x in f.readline().split('\t')]
        scaffold_gc_index = headers.index('Scaffold GC')
        genome_gc_index = headers.index('Mean genome GC')
        td_dist_index = headers.index('Scaffold TD')
        scaffold_cov_index = headers.index('Mean scaffold coverage')
        genome_cov_index = headers.index('Mean genome coverage')

        for line in f:
            line_split = line.split('\t')
            scaffold_id = line_split[0]
            bin_id = line_split[1].strip()

            scaffold_gc = float(line_split[scaffold_gc_index])
            genome_gc = float(line_split[genome_gc_index])
            gc_dist = abs(scaffold_gc - genome_gc)

            td_dist = float(line_split[td_dist_index])

            scaffold_cov = float(line_split[scaffold_cov_index])
            genome_cov = float(line_split[genome_cov_index])
            cov_dist = abs(scaffold_cov - genome_cov)

            scaffold_ids[scaffold_id][bin_id] = [gc_dist, td_dist, cov_dist]

    # determine scaffolds that are closest to a single bin
    # in terms of GC, tetranucleotide distance, and coverage
    compatible_scaffolds = set()
    for scaffold_id, bin_stats in scaffold_ids.items():
        best_gc = [1e9, None]
        best_td = [1e9, None]
        best_cov = [1e9, None]
        for bin_id, stats in bin_stats.items():
            gc, td, cov = stats
            if gc < best_gc[0]:
                best_gc = [gc, bin_id]
            if td < best_td[0]:
                best_td = [td, bin_id]
            if cov < best_cov[0]:
                best_cov = [cov, bin_id]

        # check if scaffold is closest to a single bin
        if (best_gc[1] == best_td[1] == best_cov[1]) and best_gc[1] == cur_bin_id:
            compatible_scaffolds.add(scaffold_id)

    # add compatible sequences to genome
    genome_seqs = seq_io.read(genome_file)
    for seq_id, seq in seq_io.read_seq(scaffold_file):
        if seq_id in compatible_scaffolds:
            genome_seqs[seq_id] = seq

    # save modified bin
    seq_io.write_fasta(genome_seqs, out_genome)
def run(self, scaffold_gene_file, stat_file, ref_genome_gene_files, db_file, evalue, per_identity):
    """Create taxonomic profiles for a set of genomes.

    Parameters
    ----------
    scaffold_gene_file : str
        Fasta file of genes on scaffolds in amino acid space.
    stat_file : str
        File with statistics for individual scaffolds.
    ref_genome_gene_files : list of str
        Fasta files of called genes on reference genomes of interest.
    db_file : str
        Database of competing reference genes.
    evalue : float
        E-value threshold used by blast.
    per_identity : float
        Percent identity threshold used by blast.
    """

    # read statistics file
    self.logger.info('')
    self.logger.info(' Reading scaffold statistics.')
    scaffold_stats = ScaffoldStats()
    scaffold_stats.read(stat_file)

    # perform homology searches
    self.logger.info('')
    self.logger.info(' Creating diamond database for reference genomes.')
    ref_gene_file = os.path.join(self.output_dir, 'ref_genes.faa')
    concatenate_gene_files(ref_genome_gene_files, ref_gene_file)

    diamond = Diamond(self.cpus)
    ref_diamond_db = os.path.join(self.output_dir, 'ref_genes')
    diamond.make_database(ref_gene_file, ref_diamond_db)

    self.logger.info(' Identifying homologs within reference genomes of interest (be patient!).')
    self.diamond_dir = os.path.join(self.output_dir, 'diamond')
    make_sure_path_exists(self.diamond_dir)
    hits_ref_genomes_daa = os.path.join(self.diamond_dir, 'ref_hits')
    diamond.blastp(scaffold_gene_file, ref_diamond_db, evalue, per_identity, 1, hits_ref_genomes_daa)

    hits_ref_genomes = os.path.join(self.diamond_dir, 'ref_hits.tsv')
    diamond.view(hits_ref_genomes_daa + '.daa', hits_ref_genomes)

    self.logger.info(' Identifying homologs within competing reference genomes (be patient!).')
    hits_comp_ref_genomes_daa = os.path.join(self.diamond_dir, 'competing_ref_hits')
    diamond.blastp(scaffold_gene_file, db_file, evalue, per_identity, 1, hits_comp_ref_genomes_daa)

    hits_comp_ref_genomes = os.path.join(self.diamond_dir, 'competing_ref_hits.tsv')
    diamond.view(hits_comp_ref_genomes_daa + '.daa', hits_comp_ref_genomes)

    # get list of genes with a top hit to the reference genomes of interest
    hits_to_ref = self._top_hits_to_reference(hits_ref_genomes, hits_comp_ref_genomes)

    # get number of genes on each scaffold
    num_genes_on_scaffold = defaultdict(int)
    for seq_id, _seq in seq_io.read_seq(scaffold_gene_file):
        scaffold_id = seq_id[0:seq_id.rfind('_')]
        num_genes_on_scaffold[scaffold_id] += 1

    # get hits to each scaffold
    hits_to_scaffold = defaultdict(list)
    for query_id, hit in hits_to_ref.items():
        gene_id = query_id[0:query_id.rfind('~')]
        scaffold_id = gene_id[0:gene_id.rfind('_')]
        hits_to_scaffold[scaffold_id].append(hit)

    # report summary stats for each scaffold
    reference_out = os.path.join(self.output_dir, 'references.tsv')
    fout = open(reference_out, 'w')
    fout.write('Scaffold id\tSubject scaffold ids\tSubject genome ids')
    fout.write('\tGenome id\tLength (bp)\tGC\tMean coverage')
    fout.write('\t# genes\t# hits\t% genes\tAvg. align. length (bp)\tAvg. % identity\tAvg. e-value\tAvg. bitscore\n')

    for scaffold_id, hits in hits_to_scaffold.items():
        aln_len = []
        perc_iden = []
        evalues = []  # renamed from 'evalue' to avoid shadowing the parameter
        bitscore = []
        subject_scaffold_ids = defaultdict(int)
        subject_bin_ids = defaultdict(int)
        for hit in hits:
            aln_len.append(hit.aln_length)
            perc_iden.append(hit.perc_identity)
            evalues.append(hit.evalue)
            bitscore.append(hit.bitscore)

            subject_id, subject_bin_id = hit.subject_id.split('~')
            subject_scaffold_id = subject_id[0:subject_id.rfind('_')]
            subject_scaffold_ids[subject_scaffold_id] += 1
            subject_bin_ids[subject_bin_id] += 1

        subject_scaffold_id_str = []
        for subject_id, num_hits in subject_scaffold_ids.items():
            subject_scaffold_id_str.append(subject_id + ':' + str(num_hits))
        subject_scaffold_id_str = ','.join(subject_scaffold_id_str)

        subject_bin_id_str = []
        for bin_id, num_hits in subject_bin_ids.items():
            subject_bin_id_str.append(bin_id + ':' + str(num_hits))
        subject_bin_id_str = ','.join(subject_bin_id_str)

        fout.write('%s\t%s\t%s\t%s\t%.2f\t%d\t%d\t%.2f\t%d\t%.2f\t%.2g\t%.2f\n' % (
            scaffold_id,
            subject_scaffold_id_str,
            subject_bin_id_str,
            scaffold_stats.print_stats(scaffold_id),
            mean(scaffold_stats.coverage(scaffold_id)),
            num_genes_on_scaffold[scaffold_id],
            len(hits),
            len(hits) * 100.0 / num_genes_on_scaffold[scaffold_id],
            mean(aln_len),
            mean(perc_iden),
            mean(evalues),
            mean(bitscore)))

    fout.close()

    return reference_out
def _run_reciprocal_diamond(self, query_gene_file, target_gene_file, evalue, per_identity, per_aln_len,
                            max_hits, sensitive, high_mem, tmp_dir, output_dir):
    """Perform similarity search of query genes against target genes, and reciprocal hits.

    Parameters
    ----------
    query_gene_file : str
        File with all query proteins.
    target_gene_file : str
        File with all target proteins.
    evalue : float
        E-value threshold for reporting hits.
    per_identity : float
        Percent identity threshold for reporting hits.
    per_aln_len : float
        Percent query coverage threshold for reporting hits.
    max_hits : int
        Maximum number of hits to report per query sequence.
    tmp_dir : str
        Directory to store temporary files.
    output_dir : str
        Directory to store blast results.
    """

    self.logger.info('Creating DIAMOND database of query proteins (be patient!).')
    diamond = Diamond(self.cpus)
    query_diamond_db = os.path.join(output_dir, 'query_genes')
    diamond.create_db(query_gene_file, query_diamond_db)

    self.logger.info('Creating DIAMOND database of target proteins (be patient!).')
    target_diamond_db = os.path.join(output_dir, 'target_genes')
    diamond.create_db(target_gene_file, target_diamond_db)

    # blast query genes against target proteins
    self.logger.info('Performing similarity search between query and target proteins (be patient!).')

    if tmp_dir:
        tmp_query_hits_table = tempfile.NamedTemporaryFile(prefix='comparem_hits_', dir=tmp_dir, delete=False)
    else:
        tmp_query_hits_table = tempfile.NamedTemporaryFile(prefix='comparem_hits_', delete=False)
    tmp_query_hits_table.close()

    query_hits_daa_file = os.path.join(output_dir, 'query_hits')

    if high_mem:
        diamond.blastp(query_gene_file, target_diamond_db, evalue, per_identity, per_aln_len, max_hits,
                       sensitive, tmp_query_hits_table.name, 'standard', tmp_dir, chunk_size=1, block_size=8)
    else:
        diamond.blastp(query_gene_file, target_diamond_db, evalue, per_identity, per_aln_len, max_hits,
                       sensitive, tmp_query_hits_table.name, 'standard', tmp_dir)

    # get target genes hit by one or more query proteins
    self.logger.info('Creating file with target proteins with similarity to query proteins.')
    target_hit = set()
    for line in open(tmp_query_hits_table.name):
        line_split = line.split('\t')
        target_hit.add(line_split[1])

    target_genes_hits = os.path.join(output_dir, 'target_genes_hit.faa')
    fout = open(target_genes_hits, 'w')
    for seq_id, seq in seq_io.read_seq(target_gene_file):
        if seq_id in target_hit:
            fout.write('>' + seq_id + '\n')
            fout.write(seq + '\n')
    fout.close()

    self.logger.info('Identified %d target proteins to be used in reciprocal search.' % len(target_hit))

    # perform reciprocal blast
    self.logger.info('Performing reciprocal similarity search between target and query proteins (be patient!).')

    if tmp_dir:
        tmp_target_hits_table = tempfile.NamedTemporaryFile(prefix='comparem_hits_', dir=tmp_dir, delete=False)
    else:
        tmp_target_hits_table = tempfile.NamedTemporaryFile(prefix='comparem_hits_', delete=False)
    tmp_target_hits_table.close()

    if high_mem:
        diamond.blastp(target_genes_hits, query_diamond_db, evalue, per_identity, per_aln_len, max_hits,
                       sensitive, tmp_target_hits_table.name, 'standard', tmp_dir, chunk_size=1, block_size=8)
    else:
        diamond.blastp(target_genes_hits, query_diamond_db, evalue, per_identity, per_aln_len, max_hits,
                       sensitive, tmp_target_hits_table.name, 'standard', tmp_dir)

    # combine hit tables and sort
    os.system('cat %s >> %s' % (tmp_target_hits_table.name, tmp_query_hits_table.name))
    os.remove(tmp_target_hits_table.name)
    hits_table_file = os.path.join(output_dir, 'hits_sorted.tsv')
    self._sort_hit_table(tmp_query_hits_table.name, hits_table_file)
def run(self, genome_files, db_file, taxonomy_file, evalue, per_identity, window_size, step_size):
    """Create taxonomic profiles for a set of genomes.

    Parameters
    ----------
    genome_files : list of str
        Fasta files of genomes to process.
    db_file : str
        Database of reference genes.
    taxonomy_file : str
        File containing GreenGenes taxonomy strings for reference genomes.
    evalue : float
        E-value threshold used by blast.
    per_identity : float
        Percent identity threshold used by blast.
    window_size : int
        Size of each fragment.
    step_size : int
        Number of bases to move after each window.
    """

    # parse taxonomy file
    self.logger.info(' Reading taxonomic assignment of reference genomes.')
    taxonomy = Taxonomy().read(taxonomy_file)

    # fragment each genome into fixed size windows
    self.logger.info('')
    self.logger.info(' Fragmenting sequences in each bin:')
    diamond_output_dir = os.path.join(self.output_dir, 'diamond')
    make_sure_path_exists(diamond_output_dir)

    fragment_file = os.path.join(diamond_output_dir, 'fragments.fna')
    fragment_out = open(fragment_file, 'w')
    contig_id_to_genome_id = {}
    for genome_file in genome_files:
        genome_id = remove_extension(genome_file)
        self.profiles[genome_id] = Profile(genome_id, taxonomy)
        self._fragment_genomes(genome_file, window_size, step_size, self.profiles[genome_id], fragment_out)

        for seq_id, _seq in seq_io.read_seq(genome_file):
            contig_id_to_genome_id[seq_id] = genome_id

    # run diamond
    self.logger.info('')
    self.logger.info(' Running diamond blastx with %d processes (be patient!)' % self.cpus)

    diamond = Diamond(self.cpus)
    diamond_daa_out = os.path.join(diamond_output_dir, 'diamond_hits')
    diamond.blastx(fragment_file, db_file, evalue, per_identity, 1, diamond_daa_out)

    diamond_table_out = os.path.join(diamond_output_dir, 'diamond_hits.tsv')
    diamond.view(diamond_daa_out + '.daa', diamond_table_out)

    self.logger.info('')
    self.logger.info(' Creating taxonomic profile for each genome.')
    self._taxonomic_profiles(diamond_table_out, taxonomy, contig_id_to_genome_id)

    self.logger.info('')
    self.logger.info(' Writing taxonomic profile for each genome.')

    report_dir = os.path.join(self.output_dir, 'bin_reports')
    make_sure_path_exists(report_dir)

    for genome_id, profile in self.profiles.items():
        seq_summary_out = os.path.join(report_dir, genome_id + '.sequences.tsv')
        profile.write_seq_summary(seq_summary_out)

        genome_profile_out = os.path.join(report_dir, genome_id + '.profile.tsv')
        profile.write_genome_profile(genome_profile_out)

    genome_summary_out = os.path.join(self.output_dir, 'genome_summary.tsv')
    self._write_genome_summary(genome_summary_out)

    # create Krona plot
    krona_profiles = defaultdict(lambda: defaultdict(int))
    for genome_id, profile in self.profiles.items():
        seq_assignments = profile.classify_seqs(taxonomy)

        for seq_id, classification in seq_assignments.items():
            taxa = []
            for r in range(0, len(profile.rank_labels)):
                taxa.append(classification[r][0])

            krona_profiles[genome_id][';'.join(taxa)] += profile.seq_len[seq_id]

    krona = Krona()
    krona_output_file = os.path.join(self.output_dir, 'taxonomic_profiles.krona.html')
    krona.create(krona_profiles, krona_output_file)
def run(self, genome_files1, genome_files2, seq_file, output_file):
    """Get basic statistics about genomes.

    Parameters
    ----------
    genome_files1 : iterable
        First set of genome files in fasta format.
    genome_files2 : iterable
        Second set of genome files in fasta format.
    seq_file : str
        Scaffolds/contigs binned to create genomes.
    output_file : str
        Desired file to write results.
    """

    # determine total number of sequences
    self.logger.info('Reading sequences.')

    seq_lens = {}
    total_bases = 0
    num_seqs_over_length = defaultdict(int)
    total_bases_over_length = defaultdict(int)
    lengths_to_check = [1000, 5000, 10000, 20000, 50000]
    for seq_id, seq in seq_io.read_seq(seq_file):
        seq_len = len(seq)
        seq_lens[seq_id] = seq_len
        total_bases += seq_len

        for length in lengths_to_check:
            if seq_len >= length:
                num_seqs_over_length[length] += 1
                total_bases_over_length[length] += seq_len

    # determine sequences in each bin
    genome_seqs1 = self._genome_seqs(genome_files1)
    genome_seqs2 = self._genome_seqs(genome_files2)

    # determine bin stats
    genome_stats1, total_uniq_binned_seqs1, total_uniq_binned_bases1, num_repeats1 = \
        self._genome_stats(genome_seqs1, seq_lens)
    genome_stats2, total_uniq_binned_seqs2, total_uniq_binned_bases2, num_repeats2 = \
        self._genome_stats(genome_seqs2, seq_lens)

    # sort bins by size
    genome_stats1 = sorted(genome_stats1.items(), key=lambda x: x[1][1], reverse=True)
    genome_stats2 = sorted(genome_stats2.items(), key=lambda x: x[1][1], reverse=True)

    # report summary results
    self.reporter.info('Total seqs = %d (%.2f Mbp)' % (len(seq_lens), float(total_bases) / 1e6))
    for length in lengths_to_check:
        self.reporter.info(' # seqs > %d kbp = %d (%.2f Mbp)' % (
            int(length / 1000),
            num_seqs_over_length[length],
            float(total_bases_over_length[length]) / 1e6))

    self.reporter.info('')
    self.reporter.info('Binned seqs statistics:')
    self.reporter.info(' 1) # genomes: %s, # binned seqs: %d (%.2f%%), # binned bases: %.2f Mbp (%.2f%%), # seqs in multiple bins: %d' % (
        len(genome_seqs1),
        total_uniq_binned_seqs1,
        float(total_uniq_binned_seqs1) * 100 / len(seq_lens),
        float(total_uniq_binned_bases1) / 1e6,
        float(total_uniq_binned_bases1) * 100 / total_bases,
        num_repeats1))
    self.reporter.info(' 2) # genomes: %s, # binned seqs: %d (%.2f%%), # binned bases: %.2f Mbp (%.2f%%), # seqs in multiple bins: %d' % (
        len(genome_seqs2),
        total_uniq_binned_seqs2,
        float(total_uniq_binned_seqs2) * 100 / len(seq_lens),
        float(total_uniq_binned_bases2) / 1e6,
        float(total_uniq_binned_bases2) * 100 / total_bases,
        num_repeats2))

    # output report
    fout = open(output_file, 'w')
    for data in genome_stats2:
        fout.write('\t' + data[0])
    fout.write('\tunbinned\t# seqs\t# bases (Mbp)\tBest match\t% bases in common\t% seqs in common\n')

    max_bp_common2 = defaultdict(int)
    max_seqs_common2 = defaultdict(int)
    best_matching_genome2 = {}
    binned_seqs2 = defaultdict(set)
    for data1 in genome_stats1:
        bin_id1 = data1[0]
        fout.write(bin_id1)

        seqs1 = genome_seqs1[bin_id1]

        max_bp_common = 0
        max_seqs_common = 0
        best_matching_genome = 'n/a'
        binned_seqs = set()
        for data2 in genome_stats2:
            bin_id2 = data2[0]
            seqs2 = genome_seqs2[bin_id2]

            seqs_common = seqs1.intersection(seqs2)
            binned_seqs.update(seqs_common)

            num_seqs_common = len(seqs_common)
            fout.write('\t' + str(num_seqs_common))

            bases_common = 0
            for seq_id in seqs_common:
                bases_common += seq_lens[seq_id]

            if bases_common > max_bp_common:
                max_bp_common = bases_common
                max_seqs_common = num_seqs_common
                best_matching_genome = bin_id2

            if bases_common > max_bp_common2[bin_id2]:
                max_bp_common2[bin_id2] = bases_common
                max_seqs_common2[bin_id2] = num_seqs_common
                best_matching_genome2[bin_id2] = bin_id1

            binned_seqs2[bin_id2].update(seqs_common)

        fout.write('\t%d\t%d\t%.2f\t%s\t%.2f\t%.2f\n' % (
            len(seqs1) - len(binned_seqs),
            data1[1][0],
            float(data1[1][1]) / 1e6,
            best_matching_genome,
            float(max_bp_common) * 100 / data1[1][1],
            float(max_seqs_common) * 100 / data1[1][0]))

    fout.write('unbinned')
    for data in genome_stats2:
        genome_id = data[0]
        fout.write('\t%d' % (len(genome_seqs2[genome_id]) - len(binned_seqs2[genome_id])))
    fout.write('\n')

    fout.write('# seqs')
    for data in genome_stats2:
        fout.write('\t%d' % data[1][0])
    fout.write('\n')

    fout.write('# bases (Mbp)')
    for data in genome_stats2:
        fout.write('\t%.2f' % (float(data[1][1]) / 1e6))
    fout.write('\n')

    fout.write('Best match')
    for data in genome_stats2:
        bin_id = data[0]
        fout.write('\t%s' % best_matching_genome2.get(bin_id, 'n/a'))
    fout.write('\n')

    fout.write('% bases in common')
    for data in genome_stats2:
        bin_id = data[0]
        fout.write('\t%.2f' % (float(max_bp_common2[bin_id]) * 100 / data[1][1]))
    fout.write('\n')

    fout.write('% seqs in common')
    for data in genome_stats2:
        bin_id = data[0]
        fout.write('\t%.2f' % (float(max_seqs_common2[bin_id]) * 100 / data[1][0]))
    fout.write('\n')

    fout.close()
def run(self, msa_file, tree_program, prot_model, skip_rooting, output_dir):
    """Infer tree.

    Parameters
    ----------
    msa_file : str
        Multiple sequence alignment in fasta format.
    tree_program : str
        Program to use for tree inference ['fasttree', 'raxml'].
    prot_model : str
        Protein substitution model for tree inference ['WAG', 'LG', 'AUTO'].
    skip_rooting : boolean
        Skip midpoint rooting of the inferred tree.
    output_dir : str
        Directory to store results.
    """

    num_seqs = sum([1 for _, _ in seq_io.read_seq(msa_file)])
    if num_seqs <= 2:
        self.logger.error('Insufficient number of sequences in MSA to infer tree.')
        raise SystemExit('Tree inference failed.')

    output_file = ntpath.basename(msa_file)
    prefix = output_file[0:output_file.rfind('.')]
    suffix = output_file[output_file.rfind('.') + 1:]

    if tree_program == 'fasttree':
        self.logger.info('Inferring gene tree with FastTree using %s+GAMMA.' % prot_model)
        fasttree = FastTree(multithreaded=(self.cpus > 1))

        tree_unrooted_output = os.path.join(output_dir, prefix + '.unrooted.tree')
        tree_log = os.path.join(output_dir, prefix + '.tree.log')
        tree_output_log = os.path.join(output_dir, 'fasttree.log')
        fasttree.run(msa_file, 'prot', prot_model, tree_unrooted_output, tree_log, tree_output_log)
    elif tree_program == 'raxml':
        self.logger.info('Inferring gene tree with RAxML using PROTGAMMA%s.' % prot_model)

        # create phylip MSA file
        phylip_msa_file = msa_file.replace('.faa', '.phyx')
        cmd = 'seqmagick convert %s %s' % (msa_file, phylip_msa_file)
        os.system(cmd)

        # run RAxML
        raxml_dir = os.path.abspath(os.path.join(output_dir, 'raxml'))
        tree_output_log = os.path.join(output_dir, 'raxml.log')

        raxml = RAxML(self.cpus)
        tree_unrooted_output = raxml.run(phylip_msa_file, prot_model, raxml_dir)

    # root tree at midpoint; default to the unrooted tree so tree_output is
    # always defined, even when rooting is skipped
    tree_output = tree_unrooted_output
    if not skip_rooting:
        seqs = seq_io.read(msa_file)
        if len(seqs) > 2:
            self.logger.info('Rooting tree at midpoint.')
            tree = dendropy.Tree.get_from_path(tree_unrooted_output,
                                               schema='newick',
                                               rooting='force-rooted',
                                               preserve_underscores=True)
            tree.reroot_at_midpoint(update_bipartitions=False)
            tree_output = os.path.join(output_dir, prefix + '.rooted.tree')
            tree.write_to_path(tree_output,
                               schema='newick',
                               suppress_rooting=True,
                               unquoted_underscores=True)

    return tree_output
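# Standalone sketch of the midpoint-rooting step above using dendropy
# directly; the newick string is toy data.
def _example_midpoint_rooting():
    import dendropy
    tree = dendropy.Tree.get(data='((A:1,B:1):1,(C:1,D:4):1);',
                             schema='newick',
                             rooting='force-rooted')
    tree.reroot_at_midpoint(update_bipartitions=False)
    return tree.as_string(schema='newick')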
def run(self, scaffold_file, genome_files, tetra_file, coverage_file, output_file):
    """Calculate statistics for scaffolds.

    Parameters
    ----------
    scaffold_file : str
        Fasta file containing scaffolds.
    genome_files : list of str
        Fasta files with binned scaffolds.
    tetra_file : str
        Tetranucleotide signatures for scaffolds.
    coverage_file : str
        Coverage profiles for scaffolds.
    output_file : str
        Output file for scaffold statistics.
    """

    tetra = Tetranucleotide(self.cpus)
    signatures = tetra.read(tetra_file)

    cov_profiles = None
    if coverage_file:
        coverage = Coverage(self.cpus)
        cov_profiles, _ = coverage.read(coverage_file)

    # determine bin assignment for each scaffold
    self.logger.info('Determining scaffold statistics.')

    scaffold_id_genome_id = {}
    for gf in genome_files:
        genome_id = remove_extension(gf)
        for scaffold_id, _seq in seq_io.read_seq(gf):
            scaffold_id_genome_id[scaffold_id] = genome_id

    # write out scaffold statistics
    fout = open(output_file, 'w')
    fout.write('Scaffold id\tGenome Id\tGC\tLength (bp)')

    if cov_profiles:
        first_key = list(cov_profiles.keys())[0]
        bam_ids = sorted(cov_profiles[first_key].keys())
        for bam_id in bam_ids:
            fout.write('\t' + bam_id)

    for kmer in tetra.canonical_order():
        fout.write('\t' + kmer)
    fout.write('\n')

    for scaffold_id, seq in seq_io.read_seq(scaffold_file):
        fout.write(scaffold_id)
        fout.write('\t' + scaffold_id_genome_id.get(scaffold_id, self.unbinned))
        fout.write('\t%.2f' % (seq_tk.gc(seq) * 100.0))
        fout.write('\t%d' % len(seq))

        if cov_profiles:
            for bam_id in bam_ids:
                fout.write('\t%.2f' % cov_profiles[scaffold_id][bam_id])

        fout.write('\t' + '\t'.join(map(str, signatures[scaffold_id])))
        fout.write('\n')

    fout.close()
def run(self, gene_dirs, min_per_gene, min_per_bps, tree_program, prot_model, split_chars, output_dir):
    """Infer concatenated gene tree.

    Parameters
    ----------
    gene_dirs : list
        GeneTreeTk output directories with information for individual genes.
    min_per_gene : float
        Minimum percentage of genes required to retain taxa.
    min_per_bps : float
        Minimum percentage of base pairs required to retain taxa.
    tree_program : str
        Program to use for tree inference ['fasttree', 'raxml'].
    prot_model : str
        Protein substitution model for tree inference ['WAG', 'LG', 'AUTO'].
    split_chars : str
        Character(s) used to split sequence identifiers into taxon and gene IDs.
    output_dir : str
        Directory to store results.
    """

    # read MSA files
    concat = defaultdict(lambda: defaultdict(list))
    msa_length = 0
    gene_lengths = {}
    for gene_dir in gene_dirs:
        homologs = os.path.join(gene_dir, 'homologs.trimmed.aligned.faa')
        for seq_id, seq in seq_io.read_seq(homologs):
            taxon_id, gene_id = self._split_ids(seq_id, split_chars)
            if not taxon_id:
                self.logger.error('Failed to split identifier: %s' % seq_id)
                sys.exit(-1)

            concat[taxon_id][gene_dir].append(seq)

        # all sequences in an alignment have the same length
        msa_length += len(seq)
        gene_lengths[gene_dir] = len(seq)

    # filter taxa
    mc_filter = set()
    min_per_gene_filter = set()
    min_per_bps_filter = set()
    for taxon_id in concat:
        # check for multi-copy genes
        missing = 0
        taxon_msa_len = 0
        for gene_id in gene_dirs:
            if gene_id not in concat[taxon_id]:
                missing += 1
                continue

            if len(concat[taxon_id][gene_id]) > 1:
                mc_filter.add(taxon_id)
                break

            taxon_msa_len += len(concat[taxon_id][gene_id][0])

        if taxon_id not in mc_filter:
            if missing > len(gene_dirs) * (1.0 - float(min_per_gene) / 100.0):
                min_per_gene_filter.add(taxon_id)
            elif taxon_msa_len < msa_length * float(min_per_bps) / 100.0:
                min_per_bps_filter.add(taxon_id)

    min_req_genes = math.ceil(len(gene_dirs) * float(min_per_gene) / 100.0)

    filtered_taxa = mc_filter.union(min_per_gene_filter).union(min_per_bps_filter)
    remaining_taxa = set(concat) - filtered_taxa
    self.logger.info('No. genes: %d' % len(gene_dirs))
    self.logger.info('No. taxa across all genes: %d' % len(concat))
    self.logger.info('Total filtered taxa: %d' % len(filtered_taxa))
    self.logger.info('  Due to multi-copy genes: %d' % len(mc_filter))
    self.logger.info('  Due to having <%d of the genes: %d' % (min_req_genes, len(min_per_gene_filter)))
    self.logger.info('  Due to an insufficient number of base pairs: %d' % len(min_per_bps_filter))
    self.logger.info('Remaining taxa: %d' % len(remaining_taxa))
    self.logger.info('Length of concatenated MSA: %d' % msa_length)

    # create the concatenated multiple sequence alignment
    msa_file = os.path.join(output_dir, 'concatenated.faa')
    fout = open(msa_file, 'w')
    for taxon_id in remaining_taxa:
        msa = ''
        for gene_id in gene_dirs:
            if gene_id not in concat[taxon_id]:
                msa += '-' * gene_lengths[gene_id]
            else:
                msa += concat[taxon_id][gene_id][0]

        fout.write('>%s\n' % taxon_id)
        fout.write('%s\n' % msa)
    fout.close()

    # read all taxonomy files
    # (assumes taxonomy is the same for taxa across all genes)
    taxonomy = {}
    for gene_dir in gene_dirs:
        taxonomy_file = os.path.join(gene_dir, 'taxonomy.tsv')
        t = Taxonomy().read(taxonomy_file)
        for label, taxa_str in t.items():
            taxon_id, _gene_id = self._split_ids(label, split_chars)
            taxonomy[taxon_id] = taxa_str

    # create taxonomy file for retained taxa
    self.logger.info('Creating taxonomy file for retained taxa.')
    output_taxonomy_file = os.path.join(output_dir, 'taxonomy.tsv')
    fout = open(output_taxonomy_file, 'w')
    for taxon_id in remaining_taxa:
        if taxon_id in taxonomy:  # query genomes will generally be missing
            fout.write('%s\t%s\n' % (taxon_id, ';'.join(taxonomy[taxon_id])))
    fout.close()

    # infer tree
    if tree_program == 'fasttree':
        self.logger.info('Inferring concatenated tree with FastTree using %s+GAMMA.' % prot_model)
        fasttree = FastTree(multithreaded=(self.cpus > 1))

        tree_unrooted_output = os.path.join(output_dir, 'concatenated.unrooted.tree')
        tree_log = os.path.join(output_dir, 'concatenated.tree.log')
        tree_output_log = os.path.join(output_dir, 'fasttree.log')
        fasttree.run(msa_file, 'prot', prot_model, tree_unrooted_output, tree_log, tree_output_log)
    elif tree_program == 'raxml':
        self.logger.info('Inferring concatenated tree with RAxML using PROTGAMMA%s.' % prot_model)

        # create phylip MSA file
        phylip_msa_file = msa_file.replace('.faa', '.phyx')
        cmd = 'seqmagick convert %s %s' % (msa_file, phylip_msa_file)
        os.system(cmd)

        # run RAxML
        raxml_dir = os.path.abspath(os.path.join(output_dir, 'raxml'))
        tree_output_log = os.path.join(output_dir, 'raxml.log')

        raxml = RAxML(self.cpus)
        tree_unrooted_output = raxml.run(phylip_msa_file, prot_model, raxml_dir)

    # root tree at midpoint
    self.logger.info('Rooting tree at midpoint.')
    tree = dendropy.Tree.get_from_path(tree_unrooted_output,
                                       schema='newick',
                                       rooting='force-rooted',
                                       preserve_underscores=True)
    if len(remaining_taxa) > 2:
        tree.reroot_at_midpoint(update_bipartitions=False)
    tree_output = os.path.join(output_dir, 'concatenated.rooted.tree')
    tree.write_to_path(tree_output,
                       schema='newick',
                       suppress_rooting=True,
                       unquoted_underscores=True)

    # create tax2tree consensus map and decorate tree
    t2t_tree = os.path.join(output_dir, 'concatenated.tax2tree.tree')
    cmd = 't2t decorate -m %s -t %s -o %s' % (output_taxonomy_file, tree_output, t2t_tree)
    os.system(cmd)

    # setup metadata for ARB file
    src_dir = os.path.dirname(os.path.realpath(__file__))
    metadata = {}
    with open(os.path.join(src_dir, 'VERSION')) as version_file:
        metadata['genetreetk_version'] = version_file.read().strip()
    metadata['genetreetk_tree_program'] = tree_program
    metadata['genetreetk_tree_prot_model'] = prot_model

    # create ARB metadata file
    self.logger.info('Creating ARB metadata file.')
    arb_metadata_file = os.path.join(output_dir, 'arb.metadata.txt')
    self.create_arb_metadata(msa_file, taxonomy, metadata, arb_metadata_file)
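
# --- Illustrative sketch (not part of the original source) ---
# The MSA concatenation above pads missing genes with '-' so every taxon
# receives a row of identical length. A stand-alone miniature of that idea,
# using hypothetical alignments and lengths:
gene_lengths_demo = {'geneA': 4, 'geneB': 3}
alignments_demo = {
    'taxon1': {'geneA': 'MKV-', 'geneB': 'LLS'},
    'taxon2': {'geneA': 'MRVA'},  # geneB missing -> gap padded
}
for taxon, genes in sorted(alignments_demo.items()):
    # concatenate aligned genes, substituting gaps for absent genes
    row = ''.join(genes.get(g, '-' * gene_lengths_demo[g])
                  for g in sorted(gene_lengths_demo))
    print('%s\t%s' % (taxon, row))  # taxon2 -> 'MRVA---'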
def run(self, genome_files1, genome_files2, seq_file, output_file):
    """Get basic statistics about genomes.

    Parameters
    ----------
    genome_files1 : iterable
        First set of genome files in fasta format.
    genome_files2 : iterable
        Second set of genome files in fasta format.
    seq_file : str
        Scaffolds/contigs binned to create genomes.
    output_file : str
        File to write results.
    """

    # determine total number of sequences
    self.logger.info('')
    self.logger.info('  Reading sequences.')

    seq_lens = {}
    total_bases = 0
    num_seqs_over_length = defaultdict(int)
    total_bases_over_length = defaultdict(int)
    lengths_to_check = [1000, 5000, 10000, 20000, 50000]
    for seq_id, seq in seq_io.read_seq(seq_file):
        seq_len = len(seq)
        seq_lens[seq_id] = seq_len
        total_bases += seq_len
        for length in lengths_to_check:
            if seq_len >= length:
                num_seqs_over_length[length] += 1
                total_bases_over_length[length] += seq_len

    # determine sequences in each bin
    genome_seqs1 = self._genome_seqs(genome_files1)
    genome_seqs2 = self._genome_seqs(genome_files2)

    # determine bin stats
    genome_stats1, total_uniq_binned_seqs1, total_uniq_binned_bases1, num_repeats1 = self._genome_stats(genome_seqs1, seq_lens)
    genome_stats2, total_uniq_binned_seqs2, total_uniq_binned_bases2, num_repeats2 = self._genome_stats(genome_seqs2, seq_lens)

    # sort bins by size
    genome_stats1 = sorted(genome_stats1.items(), key=lambda x: x[1][1], reverse=True)
    genome_stats2 = sorted(genome_stats2.items(), key=lambda x: x[1][1], reverse=True)

    # report summary results
    self.logger.info('  Total seqs = %d (%.2f Mbp)' % (len(seq_lens), float(total_bases) / 1e6))
    for length in lengths_to_check:
        self.logger.info('    # seqs > %d kbp = %d (%.2f Mbp)'
                         % (int(length / 1000),
                            num_seqs_over_length[length],
                            float(total_bases_over_length[length]) / 1e6))

    self.logger.info('')
    self.logger.info('  Binned seqs statistics:')
    self.logger.info('    1) # genomes: %s, # binned seqs: %d (%.2f%%), # binned bases: %.2f Mbp (%.2f%%), # seqs in multiple bins: %d'
                     % (len(genome_seqs1),
                        total_uniq_binned_seqs1,
                        float(total_uniq_binned_seqs1) * 100 / len(seq_lens),
                        float(total_uniq_binned_bases1) / 1e6,
                        float(total_uniq_binned_bases1) * 100 / total_bases,
                        num_repeats1))
    self.logger.info('    2) # genomes: %s, # binned seqs: %d (%.2f%%), # binned bases: %.2f Mbp (%.2f%%), # seqs in multiple bins: %d'
                     % (len(genome_seqs2),
                        total_uniq_binned_seqs2,
                        float(total_uniq_binned_seqs2) * 100 / len(seq_lens),
                        float(total_uniq_binned_bases2) / 1e6,
                        float(total_uniq_binned_bases2) * 100 / total_bases,
                        num_repeats2))

    # output report
    fout = open(output_file, 'w')
    for data in genome_stats2:
        fout.write('\t' + data[0])
    fout.write('\tunbinned\t# seqs\t# bases (Mbp)\tBest match\t% bases in common\t% seqs in common\n')

    max_bp_common2 = defaultdict(int)
    max_seqs_common2 = defaultdict(int)
    best_matching_genome2 = {}
    binned_seqs2 = defaultdict(set)
    for data1 in genome_stats1:
        bin_id1 = data1[0]
        fout.write(bin_id1)

        seqs1 = genome_seqs1[bin_id1]

        max_bp_common = 0
        max_seqs_common = 0
        best_matching_genome = 'n/a'
        binned_seqs = set()
        for data2 in genome_stats2:
            bin_id2 = data2[0]
            seqs2 = genome_seqs2[bin_id2]

            seqs_common = seqs1.intersection(seqs2)
            binned_seqs.update(seqs_common)

            num_seqs_common = len(seqs_common)
            fout.write('\t' + str(num_seqs_common))

            bases_common = 0
            for seq_id in seqs_common:
                bases_common += seq_lens[seq_id]

            if bases_common > max_bp_common:
                max_bp_common = bases_common
                max_seqs_common = num_seqs_common
                best_matching_genome = bin_id2

            if bases_common > max_bp_common2[bin_id2]:
                max_bp_common2[bin_id2] = bases_common
                max_seqs_common2[bin_id2] = num_seqs_common
                best_matching_genome2[bin_id2] = bin_id1

            binned_seqs2[bin_id2].update(seqs_common)

        fout.write('\t%d\t%d\t%.2f\t%s\t%.2f\t%.2f\n'
                   % (len(seqs1) - len(binned_seqs),
                      data1[1][0],
                      float(data1[1][1]) / 1e6,
                      best_matching_genome,
                      float(max_bp_common) * 100 / data1[1][1],
                      float(max_seqs_common) * 100 / data1[1][0]))

    fout.write('unbinned')
    for data in genome_stats2:
        genome_id = data[0]
        fout.write('\t%d' % (len(genome_seqs2[genome_id]) - len(binned_seqs2[genome_id])))
    fout.write('\n')

    fout.write('# seqs')
    for data in genome_stats2:
        fout.write('\t%d' % data[1][0])
    fout.write('\n')

    fout.write('# bases (Mbp)')
    for data in genome_stats2:
        fout.write('\t%.2f' % (float(data[1][1]) / 1e6))
    fout.write('\n')

    fout.write('Best match')
    for data in genome_stats2:
        bin_id = data[0]
        fout.write('\t%s' % best_matching_genome2.get(bin_id, 'n/a'))
    fout.write('\n')

    fout.write('% bases in common')
    for data in genome_stats2:
        bin_id = data[0]
        fout.write('\t%.2f' % (float(max_bp_common2[bin_id]) * 100 / data[1][1]))
    fout.write('\n')

    fout.write('% seqs in common')
    for data in genome_stats2:
        bin_id = data[0]
        fout.write('\t%.2f' % (float(max_seqs_common2[bin_id]) * 100 / data[1][0]))
    fout.write('\n')

    fout.close()
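
# --- Illustrative sketch (not part of the original source) ---
# The best-match logic above selects, for each bin in the first set, the bin
# in the second set sharing the most bases. In miniature, with hypothetical
# sequence lengths and bin assignments:
seq_lens_demo = {'s1': 100, 's2': 200, 's3': 50}
bin1_demo = {'s1', 's2'}
bins2_demo = {'binA': {'s1'}, 'binB': {'s2', 's3'}}
best_match, best_bp = 'n/a', 0
for bin_id, seqs in sorted(bins2_demo.items()):
    # bases shared between the two bin assignments
    bp_common = sum(seq_lens_demo[s] for s in bin1_demo & seqs)
    if bp_common > best_bp:
        best_match, best_bp = bin_id, bp_common
print(best_match, best_bp)  # binB 200: s2 (200 bp) outweighs binA's s1 (100 bp)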
def create_arb_metadata(self, homologs, msa_output, taxonomy,
                        metadata,
                        gene_precontext, gene_postcontext,
                        output_file):
    """Create metadata file suitable for import into ARB.

    Parameters
    ----------
    homologs : d[seq_id] -> namedtuple of BlastHit information
        BLAST results for identified homologs.
    msa_output : str
        Fasta file with aligned homologs.
    taxonomy : d[genome_id] -> list of taxa
        Taxonomic information for genomes.
    metadata : d[key] -> str
        Additional metadata to write to ARB file.
    gene_precontext : d[seq_id] -> list of annotations for pre-context genes
        Annotation for genes preceding a gene.
    gene_postcontext : d[seq_id] -> list of annotations for post-context genes
        Annotation for genes following a gene.
    output_file : str
        File to write metadata information.
    """

    arb_metadata_list = []
    for seq_id, seq, annotation in seq_io.read_seq(msa_output, keep_annotation=True):
        if '~' in seq_id:
            genome_id, scaffold_gene_id = seq_id.split('~')
        else:
            scaffold_gene_id = seq_id
            genome_id = ''

        arb_metadata = {}
        arb_metadata['db_name'] = seq_id
        arb_metadata['genome_id'] = genome_id
        arb_metadata['scaffold_id'] = scaffold_gene_id[0:scaffold_gene_id.rfind('_')]
        arb_metadata['scaffold_gene_id'] = scaffold_gene_id
        arb_metadata['gtdb_tax_string'] = ';'.join(taxonomy.get(genome_id, ''))
        arb_metadata['aligned_seq'] = seq

        for k, v in metadata.items():
            arb_metadata[k] = v

        arb_metadata['gene_precontext'] = ' -> '.join(gene_precontext.get(seq_id, []))
        arb_metadata['gene_postcontext'] = ' <- '.join(gene_postcontext.get(seq_id, []))

        hit_info = homologs.get(seq_id, None)
        if hit_info:
            arb_metadata['blast_evalue'] = '%.1g' % hit_info.evalue
            arb_metadata['blast_bitscore'] = '%.1f' % hit_info.bitscore
            arb_metadata['blast_perc_identity'] = '%.1f' % hit_info.perc_identity
            arb_metadata['blast_subject_perc_alignment_len'] = '%.1f' % hit_info.subject_perc_aln_len
            arb_metadata['blast_query_perc_alignment_len'] = '%.1f' % hit_info.query_perc_aln_len
            arb_metadata['blast_query_id'] = hit_info.query_id

        if annotation:
            annotation_split = annotation.split('] [')
            if len(annotation_split) == 3:
                # assume format is [gtdb_taxonomy] [NCBI organism name] [annotation]
                gtdb_taxonomy, organism_name, gene_annotation = annotation_split
                gtdb_taxonomy = gtdb_taxonomy.replace('[', '')
                gene_annotation = gene_annotation.replace(']', '')
            else:
                # unrecognized format, so just save the entire annotation
                gene_annotation = annotation
                organism_name = ''
                gtdb_taxonomy = ''

            arb_metadata['gene_annotation'] = gene_annotation
            arb_metadata['organism'] = organism_name
            arb_metadata['full_name'] = organism_name

        arb_metadata_list.append(arb_metadata)

    fout = open(output_file, 'w')
    arb_parser = ArbParser()
    arb_parser.write(arb_metadata_list, fout)
    fout.close()
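
# --- Illustrative sketch (not part of the original source) ---
# Identifier convention assumed by create_arb_metadata:
# '<genome_id>~<scaffold_id>_<gene_number>'. A minimal demonstration of the
# parsing performed above, using a hypothetical identifier:
seq_id_demo = 'GCF_000005845~NC_000913.3_42'
genome_id_demo, scaffold_gene_id_demo = seq_id_demo.split('~')
scaffold_id_demo = scaffold_gene_id_demo[0:scaffold_gene_id_demo.rfind('_')]
print(genome_id_demo)         # 'GCF_000005845'
print(scaffold_gene_id_demo)  # 'NC_000913.3_42'
print(scaffold_id_demo)       # 'NC_000913.3'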
def run(self, scaffold_gene_file, stat_file, ref_genome_gene_files, db_file, evalue, per_identity, per_aln_len):
    """Create taxonomic profiles for a set of genomes.

    Parameters
    ----------
    scaffold_gene_file : str
        Fasta file of genes on scaffolds in amino acid space.
    stat_file : str
        File with statistics for individual scaffolds.
    ref_genome_gene_files : list of str
        Fasta files of called genes on reference genomes of interest.
    db_file : str
        Database of competing reference genes.
    evalue : float
        E-value threshold of valid hits.
    per_identity : float
        Percent identity threshold of valid hits [0, 100].
    per_aln_len : float
        Percent query coverage of valid hits [0, 100].
    """

    # read statistics file
    self.logger.info('Reading scaffold statistics.')
    scaffold_stats = ScaffoldStats()
    scaffold_stats.read(stat_file)

    # perform homology searches
    self.logger.info('Creating DIAMOND database for reference genomes.')
    ref_gene_file = os.path.join(self.output_dir, 'ref_genes.faa')
    concatenate_gene_files(ref_genome_gene_files, ref_gene_file)

    diamond = Diamond(self.cpus)
    ref_diamond_db = os.path.join(self.output_dir, 'ref_genes')
    diamond.create_db(ref_gene_file, ref_diamond_db)

    self.logger.info('Identifying homologs within reference genomes of interest (be patient!).')
    self.diamond_dir = os.path.join(self.output_dir, 'diamond')
    make_sure_path_exists(self.diamond_dir)
    hits_ref_genomes = os.path.join(self.diamond_dir, 'ref_hits.tsv')
    diamond.blastp(scaffold_gene_file, ref_diamond_db, evalue, per_identity, per_aln_len, 1, False, hits_ref_genomes)

    self.logger.info('Identifying homologs within competing reference genomes (be patient!).')
    hits_comp_ref_genomes = os.path.join(self.diamond_dir, 'competing_ref_hits.tsv')
    diamond.blastp(scaffold_gene_file, db_file, evalue, per_identity, per_aln_len, 1, False, hits_comp_ref_genomes)

    # get list of genes with a top hit to the reference genomes of interest
    hits_to_ref = self._top_hits_to_reference(hits_ref_genomes, hits_comp_ref_genomes)

    # get number of genes on each scaffold
    num_genes_on_scaffold = defaultdict(int)
    for seq_id, _seq in seq_io.read_seq(scaffold_gene_file):
        scaffold_id = seq_id[0:seq_id.rfind('_')]
        num_genes_on_scaffold[scaffold_id] += 1

    # get hits to each scaffold
    hits_to_scaffold = defaultdict(list)
    for query_id, hit in hits_to_ref.items():
        gene_id = query_id[0:query_id.rfind('~')]
        scaffold_id = gene_id[0:gene_id.rfind('_')]
        hits_to_scaffold[scaffold_id].append(hit)

    # report summary stats for each scaffold
    reference_out = os.path.join(self.output_dir, 'references.tsv')
    fout = open(reference_out, 'w')
    fout.write('Scaffold ID\tSubject genome IDs\tSubject scaffold IDs')
    fout.write('\tGenome ID\tLength (bp)\tGC\tMean coverage')
    fout.write('\t# genes\t# hits\t% genes\tAvg. align. length (bp)\tAvg. % identity\tAvg. e-value\tAvg. bitscore\n')

    for scaffold_id, hits in hits_to_scaffold.items():
        aln_len = []
        perc_iden = []
        evalues = []  # per-hit e-values (distinct from the evalue threshold parameter)
        bitscore = []
        subject_scaffold_ids = defaultdict(int)
        subject_bin_ids = defaultdict(int)
        for hit in hits:
            aln_len.append(hit.aln_length)
            perc_iden.append(hit.perc_identity)
            evalues.append(hit.evalue)
            bitscore.append(hit.bitscore)

            subject_bin_id, subject_gene_id = hit.subject_id.split('~')
            subject_scaffold_id = subject_gene_id[0:subject_gene_id.rfind('_')]
            subject_scaffold_ids[subject_scaffold_id] += 1
            subject_bin_ids[subject_bin_id] += 1

        sorted_subject_bin_ids = sorted(subject_bin_ids.items(),
                                        key=operator.itemgetter(1),
                                        reverse=True)
        subject_bin_id_str = []
        for bin_id, num_hits in sorted_subject_bin_ids:
            subject_bin_id_str.append(bin_id + ':' + str(num_hits))
        subject_bin_id_str = ','.join(subject_bin_id_str)

        sorted_subject_scaffold_ids = sorted(subject_scaffold_ids.items(),
                                             key=operator.itemgetter(1),
                                             reverse=True)
        subject_scaffold_id_str = []
        for subject_id, num_hits in sorted_subject_scaffold_ids:
            subject_scaffold_id_str.append(subject_id + ':' + str(num_hits))
        subject_scaffold_id_str = ','.join(subject_scaffold_id_str)

        fout.write('%s\t%s\t%s\t%s\t%.2f\t%d\t%d\t%.2f\t%d\t%.2f\t%.2g\t%.2f\n' % (
            scaffold_id,
            subject_bin_id_str,
            subject_scaffold_id_str,
            scaffold_stats.print_stats(scaffold_id),
            mean(scaffold_stats.coverage(scaffold_id)),
            num_genes_on_scaffold[scaffold_id],
            len(hits),
            len(hits) * 100.0 / num_genes_on_scaffold[scaffold_id],
            mean(aln_len),
            mean(perc_iden),
            mean(evalues),
            mean(bitscore)))
    fout.close()

    return reference_out
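
# --- Illustrative sketch (not part of the original source) ---
# _top_hits_to_reference is called above but not shown in this file. A
# plausible reading of its role: keep a query gene only when its best hit to
# the reference genomes of interest scores at least as well as its best hit
# to the competing database. A minimal sketch under that assumption, taking
# standard tabular output with the bitscore in the final column:
def _top_hits_sketch(ref_hits_file, competing_hits_file):
    def best_hits(hit_file):
        best = {}  # query id -> (row, bitscore)
        for line in open(hit_file):
            cols = line.rstrip('\n').split('\t')
            query_id, bitscore = cols[0], float(cols[-1])
            if bitscore > best.get(query_id, (None, -1.0))[1]:
                best[query_id] = (cols, bitscore)
        return best

    ref_best = best_hits(ref_hits_file)
    competing_best = best_hits(competing_hits_file)

    # retain queries whose reference hit is at least as good as any competing hit
    return {q: row for q, (row, score) in ref_best.items()
            if score >= competing_best.get(q, (None, -1.0))[1]}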
def run(self, gtdb_bac_taxonomy_file, gtdb_ar_taxonomy_file, gtdb_path_file, gtdb_metadata_file, output_dir):
    """Create FASTA files with all 16S and 23S rRNA sequences from GTDB genomes."""

    # get User ID to UBA translation
    print('Reading GTDB metadata to translate User IDs to UBA IDs.')
    user_id_to_uba = {}
    with open(gtdb_metadata_file) as f:
        f.readline()
        for line in f:
            line_split = line.strip().split('\t')
            gid = line_split[0]
            org_name = line_split[1]
            if '(UBA' in org_name:
                uba_id = org_name.split('(')[-1].replace(')', '')
                user_id_to_uba[gid] = uba_id

    # read GTDB taxonomy
    print('Reading GTDB taxonomy.')
    gtdb_bac_taxonomy = Taxonomy().read(gtdb_bac_taxonomy_file)
    gtdb_ar_taxonomy = Taxonomy().read(gtdb_ar_taxonomy_file)
    gtdb_taxonomy = gtdb_bac_taxonomy.copy()
    gtdb_taxonomy.update(gtdb_ar_taxonomy)

    print('Identified %d bacterial genomes to process.' % len(gtdb_bac_taxonomy))
    print('Identified %d archaeal genomes to process.' % len(gtdb_ar_taxonomy))
    print('Identified %d genomes to process.' % len(gtdb_taxonomy))

    # read genome paths
    print('Reading path to genomes.')
    genome_paths = {}
    for line in open(gtdb_path_file):
        gid, gid_path = line.strip().split('\t')
        if gid in user_id_to_uba:
            gid = user_id_to_uba[gid]
        genome_paths[gid] = gid_path

    # sanity check data
    missing_paths = set(gtdb_taxonomy.keys()) - set(genome_paths.keys())
    if len(missing_paths) > 0:
        print('[WARNING] There are %d genomes in the taxonomy file without a specified genome path.' % len(missing_paths))

    # create FASTA files with 16S and 23S rRNA sequences
    print('Parsing 16S and 23S rRNA sequence files.')
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    fout_16S = open(os.path.join(output_dir, 'ssu.fna'), 'w')
    fout_23S = open(os.path.join(output_dir, 'lsu.fna'), 'w')

    missing_ssu = 0
    missing_lsu = 0
    for i, gid in enumerate(gtdb_taxonomy):
        if i % 1000 == 0:
            print('Processed %d genomes.' % i)

        if gid not in genome_paths:
            print('[WARNING] Genome %s does not have a specified genome path.' % gid)
            continue

        genome_path = genome_paths[gid]

        # process 16S rRNA genes; a genome missing its 16S file is still
        # checked for 23S genes below
        ssu_file = os.path.join(genome_path, 'rna_silva', 'ssu.fna')
        if not os.path.exists(ssu_file):
            missing_ssu += 1
        else:
            ssu_info_file = os.path.join(genome_path, 'rna_silva', 'ssu.hmm_summary.tsv')
            ssu_info = {}
            with open(ssu_info_file) as f:
                header = f.readline().strip().split('\t')
                contig_len_index = header.index('Sequence length')
                for line in f:
                    line_split = line.strip().split('\t')
                    gene_id = line_split[0]
                    contig_length = int(line_split[contig_len_index])
                    ssu_info[gene_id] = contig_length

            for seq_id, seq in seq_io.read_seq(ssu_file):
                fout_16S.write('>%s~%s [ssu=%d bp] [contig=%d bp]\n' % (gid, seq_id, len(seq), ssu_info[seq_id]))
                fout_16S.write('%s\n' % seq)

        # process 23S rRNA genes
        lsu_file = os.path.join(genome_path, 'rna_silva', 'lsu_23S.fna')
        if not os.path.exists(lsu_file):
            missing_lsu += 1
        else:
            lsu_info_file = os.path.join(genome_path, 'rna_silva', 'lsu_23S.hmm_summary.tsv')
            lsu_info = {}
            with open(lsu_info_file) as f:
                header = f.readline().strip().split('\t')
                contig_len_index = header.index('Sequence length')
                for line in f:
                    line_split = line.strip().split('\t')
                    gene_id = line_split[0]
                    contig_length = int(line_split[contig_len_index])
                    lsu_info[gene_id] = contig_length

            for seq_id, seq in seq_io.read_seq(lsu_file):
                fout_23S.write('>%s~%s [lsu=%d bp] [contig=%d bp]\n' % (gid, seq_id, len(seq), lsu_info[seq_id]))
                fout_23S.write('%s\n' % seq)

    fout_16S.close()
    fout_23S.close()

    print('There were %d of %d (%.2f%%) genomes without an identified 16S rRNA gene.'
          % (missing_ssu, len(gtdb_taxonomy), missing_ssu * 100.0 / len(gtdb_taxonomy)))
    print('There were %d of %d (%.2f%%) genomes without an identified 23S rRNA gene.'
          % (missing_lsu, len(gtdb_taxonomy), missing_lsu * 100.0 / len(gtdb_taxonomy)))
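
# --- Illustrative sketch (not part of the original source) ---
# FASTA headers written above follow the convention:
#   >GENOME_ID~SEQ_ID [ssu|lsu=<gene len> bp] [contig=<contig len> bp]
# A small stand-alone parser for that convention:
import re

def parse_rrna_header(header):
    m = re.match(r'>(\S+)~(\S+) \[(ssu|lsu)=(\d+) bp\] \[contig=(\d+) bp\]', header)
    if not m:
        return None
    gid, seq_id, gene, gene_len, contig_len = m.groups()
    return {'genome_id': gid,
            'seq_id': seq_id,
            'gene': gene,
            'gene_len_bp': int(gene_len),
            'contig_len_bp': int(contig_len)}

# hypothetical header, as would be written by the function above
print(parse_rrna_header('>G000001~contig_1 [ssu=1542 bp] [contig=35210 bp]'))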