def remove_outliers(self, genome_file, outlier_file, out_genome):
    """Remove sequences specified as outliers.

    Any scaffold listed in the first column of the outlier file
    is removed from the specified genome.

    Parameters
    ----------
    genome_file : str
        Fasta file of binned scaffolds.
    outlier_file : str
        File specifying outlying scaffolds.
    out_genome : str
        Name of output genome.
    """
    genome_seqs = seq_io.read(genome_file)

    # drop every scaffold named in the outlier file
    with open(outlier_file) as f:
        f.readline()  # skip header
        for line in f:
            outlier_id = line.split('\t')[0]
            genome_seqs.pop(outlier_id, None)

    # save modified bin
    seq_io.write_fasta(genome_seqs, out_genome)
def add_compatible(self, scaffold_file, genome_file, compatible_file, min_len, out_genome):
    """Add sequences specified as compatible.

    Parameters
    ----------
    scaffold_file : str
        Fasta file containing scaffolds to add.
    genome_file : str
        Fasta file of binned scaffolds.
    compatible_file : str
        File specifying compatible scaffolds.
    min_len : int
        Minimum length to add scaffold.
    out_genome : str
        Name of output genome.
    """
    cur_bin_id = remove_extension(genome_file)

    # identify scaffolds marked as compatible with this genome;
    # the former code also parsed GC, TD, and coverage distances here,
    # but never used them (dead copy-paste from add_compatible_closest)
    scaffold_ids = set()
    with open(compatible_file) as f:
        f.readline()  # skip header
        for line in f:
            line_split = line.split('\t')
            scaffold_id = line_split[0]
            bin_id = line_split[1].strip()

            if bin_id == cur_bin_id:
                scaffold_ids.add(scaffold_id)

    # add compatible sequences meeting the length criterion to the genome
    added_seqs = 0
    genome_seqs = seq_io.read(genome_file)
    for seq_id, seq in seq_io.read_seq(scaffold_file):
        if seq_id in scaffold_ids:
            if len(seq) >= min_len:
                genome_seqs[seq_id] = seq
                added_seqs += 1

    self.logger.info('Added {:,} scaffolds meeting length criterion.'.format(added_seqs))

    # save modified bin
    seq_io.write_fasta(genome_seqs, out_genome)
def create_records(self, metadata_file, msa_file, genome_list, output_file):
    """Create ARB records from GTDB metadata.

    Parameters
    ----------
    metadata_file : str
        CSV file of GTDB metadata; first column is the genome id.
    msa_file : str
        Optional fasta file with aligned sequences for each genome.
    genome_list : str
        Optional file listing the genome ids to retain.
    output_file : str
        Name of output file with ARB records.
    """
    seqs = {}
    if msa_file:
        seqs = seq_io.read(msa_file)

    genomes_to_keep = set()
    if genome_list:
        for line in open(genome_list):
            genomes_to_keep.add(line.strip())

    fout = open(output_file, 'w')

    header = True
    # csv.reader requires text mode in Python 3; the former 'rb' mode is a
    # Python 2 idiom and raises an error under Python 3
    with open(metadata_file, newline='') as metadata:
        for row in csv.reader(metadata):
            if header:
                fields = row[1:]
                header = False
            else:
                genome_id = row[0]
                values = row[1:]
                aligned_seq = seqs.get(genome_id, '')
                if not genomes_to_keep or genome_id in genomes_to_keep:
                    self._record(fout, genome_id, fields, values, aligned_seq)

    fout.close()
def add_compatible_unique(self, scaffold_file, genome_file, compatible_file, min_len, out_genome):
    """Add sequences specified as compatible.

    Only sequences specified exactly once in the compatibility
    file are added.

    Parameters
    ----------
    scaffold_file : str
        Fasta file containing scaffolds to add.
    genome_file : str
        Fasta file of binned scaffolds.
    compatible_file : str
        File specifying compatible scaffolds.
    min_len : int
        Minimum length to add scaffold.
    out_genome : str
        Name of output genome.
    """
    cur_bin_id = remove_extension(genome_file)

    # count occurrences of each scaffold and record its assigned bin;
    # counting in a dict avoids the O(n^2) cost of calling list.count()
    # once per scaffold, as the former code did
    occurrence = {}
    bin_ids = {}
    with open(compatible_file) as f:
        f.readline()  # skip header
        for line in f:
            line_split = line.split('\t')
            scaffold_id = line_split[0]
            bin_ids[scaffold_id] = line_split[1].strip()
            occurrence[scaffold_id] = occurrence.get(scaffold_id, 0) + 1

    # dict.iteritems() is Python 2 only; items() works in both 2 and 3
    compatible_scaffolds = set()
    for scaffold_id, bin_id in bin_ids.items():
        if occurrence[scaffold_id] == 1 and bin_id == cur_bin_id:
            compatible_scaffolds.add(scaffold_id)

    self.logger.info('Identified %d compatible scaffolds.' % len(compatible_scaffolds))

    # add compatible sequences meeting the length criterion to the genome
    added_seqs = 0
    genome_seqs = seq_io.read(genome_file)
    for seq_id, seq in seq_io.read_seq(scaffold_file):
        if seq_id in compatible_scaffolds:
            if len(seq) >= min_len:
                genome_seqs[seq_id] = seq
                added_seqs += 1

    self.logger.info('Added %d scaffolds meeting length criterion.' % added_seqs)

    # save modified bin
    seq_io.write_fasta(genome_seqs, out_genome)
def generate(self, genome_file, contig_break):
    """Derive metadata across nucleotide sequences.

    Parameters
    ----------
    genome_file : str
        Name of fasta file containing nucleotide sequences.
    contig_break : int
        Minimum number of ambiguous bases for defining contigs.

    Returns
    -------
    dict : d[metadata_field] -> value
        Map of metadata fields to their respective values.
    dict : d[metadata_field] -> description
        Description of each metadata field.
    """
    scaffolds = seq_io.read(genome_file)

    # scaffold-level statistics
    scaffold_count = len(scaffolds)
    gc_count = genome_tk.gc_count(scaffolds)
    gc_percentage = genome_tk.gc(scaffolds) * 100.0
    genome_size = sum(len(seq) for seq in scaffolds.values())
    n50_scaffolds = seq_tk.N50(scaffolds)
    l50_scaffolds = seq_tk.L50(scaffolds, n50_scaffolds)
    mean_scaffold_length = seq_tk.mean_length(scaffolds)
    longest_scaffold = seq_tk.max_length(scaffolds)

    # contig-level statistics; contigs are scaffold segments delimited
    # by runs of at least contig_break ambiguous bases
    contigs = seq_tk.identify_contigs(scaffolds, 'N' * contig_break)
    contig_count = len(contigs)
    ambiguous_bases = genome_tk.ambiguous_nucleotides(contigs)
    total_gap_length = genome_tk.ambiguous_nucleotides(scaffolds) - ambiguous_bases
    n50_contigs = seq_tk.N50(contigs)
    l50_contigs = seq_tk.L50(contigs, n50_contigs)
    mean_contig_length = seq_tk.mean_length(contigs)
    longest_contig = seq_tk.max_length(contigs)

    metadata = [
        ('scaffold_count', scaffold_count,
         "Number of scaffolds in genome."),
        ('gc_count', gc_count,
         "Number of G or C bases in genome."),
        ('gc_percentage', gc_percentage,
         "GC content of genome."),
        ('genome_size', genome_size,
         "Total base pairs in genome including nucleotide bases, ambiguous bases, and gaps."),
        ('n50_scaffolds', n50_scaffolds,
         "Scaffold length at which 50% of total bases in assembly are in scaffolds of that length or greater."),
        ('l50_scaffolds', l50_scaffolds,
         "Number of scaffolds longer than, or equal to, the scaffold N50 length."),
        ('mean_scaffold_length', mean_scaffold_length,
         "Mean length of scaffolds in base pairs."),
        ('longest_scaffold', longest_scaffold,
         "Number of bases in longest scaffold."),
        ('contig_count', contig_count,
         "Number of contigs in genome."),
        ('ambiguous_bases', ambiguous_bases,
         "Number of ambiguous bases in contigs."),
        ('total_gap_length', total_gap_length,
         "Number of ambiguous bases comprising gaps in scaffolds."),
        ('n50_contigs', n50_contigs,
         "Contig length at which 50% of total bases in assembly are in contigs of that length or greater."),
        ('l50_contigs', l50_contigs,
         "Number of contigs longer than, or equal to, the contig N50 length."),
        ('mean_contig_length', mean_contig_length,
         "Mean length of contigs in base pairs."),
        ('longest_contig', longest_contig,
         "Number of bases in longest contig."),
    ]

    nuc_stats = {}
    nuc_desc = {}
    for field, value, desc in metadata:
        nuc_stats[field] = value
        nuc_desc[field] = desc

    return nuc_stats, nuc_desc
def run(self, input_tree, msa_file, outgroup_file, perc_taxa_to_keep, num_replicates, model, output_dir):
    """Jackknife taxa.

    Parameters
    ----------
    input_tree : str
        Tree inferred with all data.
    msa_file : str
        File containing multiple sequence alignment for all taxa.
    outgroup_file : str
        File indicating labels of outgroup taxa.
    perc_taxa_to_keep : float
        Percentage of taxa to keep in each replicate.
    num_replicates : int
        Number of replicates to perform.
    model : str
        Desired model of evolution.
    output_dir : str
        Output directory for jackknife trees.
    """
    assert model in ['wag', 'jtt']

    self.perc_taxa_to_keep = perc_taxa_to_keep
    self.model = model

    self.replicate_dir = os.path.join(output_dir, 'replicates')
    make_sure_path_exists(self.replicate_dir)

    # read outgroup taxa
    self.outgroup_ids = set()
    if outgroup_file:
        self.outgroup_ids = {line.strip() for line in open(outgroup_file)}

    # read full multiple sequence alignment
    self.msa = seq_io.read(msa_file)

    # replicate generation is currently disabled
    #***self.logger.info('Calculating jackknife taxa replicates:')
    #***parallel = Parallel(self.cpus)
    #***parallel.run(self._producer, None, range(num_replicates), self._progress)

    # calculate support from per-replicate trees
    rep_tree_files = [
        os.path.join(self.replicate_dir, 'jk_taxa.tree.' + str(i) + '.tre')
        for i in range(num_replicates)
    ]

    tree_support = TreeSupport()
    output_tree = os.path.join(output_dir, remove_extension(input_tree) + '.jk_taxa.tree')
    tree_support.subset_taxa(input_tree, rep_tree_files, output_tree)

    return output_tree
def extract(self, genome_files, best_hits, output_dir):
    """Extract 16S rRNA genes.

    Parameters
    ----------
    genome_files : iterable
        Path to genome files to process.
    best_hits : d[genome_id][seq_id] -> information about best hit
        Information about best hits for each genome.
    output_dir : str
        Output directory.

    Returns
    -------
    d[genome_id] -> str
        Fasta file containing SSU sequences for each genome.
    """
    self.logger.info('Extracting SSU rRNA genes.')

    ssu_seq_files = {}
    for genome_file in genome_files:
        genome_id = remove_extension(genome_file)
        genome_dir = os.path.join(output_dir, genome_id)

        # skip genomes without any hits
        if len(best_hits[genome_id]) == 0:
            continue

        # write summary file and putative SSU rRNAs to file
        summary_file = os.path.join(genome_dir, 'ssu.hmm_summary.tsv')
        summary_out = open(summary_file, 'w')
        summary_out.write(
            'Sequence Id\tHMM\ti-Evalue\tStart hit\tEnd hit\tSSU gene length\tReverse Complement\tSequence length\n'
        )

        ssu_seq_files[genome_id] = os.path.join(genome_dir, 'ssu.fna')
        seq_out = open(ssu_seq_files[genome_id], 'w')

        seqs = seq_io.read(genome_file)
        for hit_id in best_hits[genome_id]:
            # hit ids may carry a '-#<n>' suffix distinguishing multiple
            # hits on the same scaffold; strip it to recover the scaffold id
            scaffold_id = hit_id
            if '-#' in scaffold_id:
                scaffold_id = scaffold_id[0:scaffold_id.rfind('-#')]

            seq_info = [hit_id] + best_hits[genome_id][hit_id]
            seq = seqs[scaffold_id]

            # NOTE(review): the header declares a 'Sequence length' column but no
            # length is appended here (the sibling _extract implementation appends
            # str(len(seq))); presumably best_hits already carries it -- confirm.
            summary_out.write('\t'.join(seq_info) + '\n')

            seq_out.write('>' + seq_info[0] + '\n')
            seq_out.write(seq[int(seq_info[3]) + 1:int(seq_info[4]) + 1] + '\n')

        summary_out.close()
        seq_out.close()

    return ssu_seq_files
def run(self, input_tree, msa_file, num_replicates, model, base_type, frac, output_dir):
    """Bootstrap multiple sequence alignment.

    Parameters
    ----------
    input_tree : str
        Tree inferred with all data.
    msa_file : str
        File containing multiple sequence alignment for all taxa.
    num_replicates : int
        Number of replicates to perform.
    model : str
        Desired model of evolution.
    base_type : str
        Indicates if bases are nucleotides or amino acids.
    frac : float
        Fraction of alignment to subsample.
    output_dir : str
        Directory for bootstrap trees.
    """
    assert model in ['wag', 'lg', 'jtt']
    assert base_type in ['nt', 'prot']

    self.model = model
    self.base_type = base_type
    self.frac = frac

    self.replicate_dir = os.path.join(output_dir, 'replicates')
    make_sure_path_exists(self.replicate_dir)

    # read full multiple sequence alignment
    self.msa = seq_io.read(msa_file)

    # generate bootstrap replicates in parallel
    self.logger.info('Calculating bootstrap replicates:')
    parallel = Parallel(self.cpus)
    parallel.run(self._producer, None, range(num_replicates), self._progress)

    # decorate the input tree with support values from the replicates
    rep_tree_files = [
        os.path.join(self.replicate_dir, 'bootstrap_tree.r_' + str(i) + '.tree')
        for i in range(num_replicates)
    ]

    output_tree = os.path.join(output_dir, remove_extension(input_tree) + '.bootstrap.tree')
    bootstrap_support(input_tree, rep_tree_files, output_tree)

    return output_tree
def run(self, input_tree, msa_file, outgroup_file, perc_taxa_to_keep, num_replicates, model, output_dir):
    """Jackknife taxa.

    Parameters
    ----------
    input_tree : str
        Tree inferred with all data.
    msa_file : str
        File containing multiple sequence alignment for all taxa.
    outgroup_file : str
        File indicating labels of outgroup taxa.
    perc_taxa_to_keep : float
        Percentage of taxa to keep in each replicate.
    num_replicates : int
        Number of replicates to perform.
    model : str
        Desired model of evolution.
    output_dir : str
        Output directory for jackknife trees.
    """
    assert(model in ['wag', 'jtt'])

    self.perc_taxa_to_keep = perc_taxa_to_keep
    self.model = model

    self.replicate_dir = os.path.join(output_dir, 'replicates')
    make_sure_path_exists(self.replicate_dir)

    # read outgroup taxa
    self.outgroup_ids = set()
    if outgroup_file:
        for line in open(outgroup_file):
            self.outgroup_ids.add(line.strip())

    # read full multiple sequence alignment
    self.msa = seq_io.read(msa_file)

    # calculate replicates (currently disabled)
    #***self.logger.info('Calculating jackknife taxa replicates:')
    #***parallel = Parallel(self.cpus)
    #***parallel.run(self._producer, None, range(num_replicates), self._progress)

    # calculate support
    # (former code used xrange, which is Python 2 only; range works in both)
    rep_tree_files = []
    for rep_index in range(num_replicates):
        rep_tree_files.append(os.path.join(self.replicate_dir, 'jk_taxa.tree.' + str(rep_index) + '.tre'))

    tree_support = TreeSupport()
    output_tree = os.path.join(output_dir, remove_extension(input_tree) + '.jk_taxa.tree')
    tree_support.subset_taxa(input_tree, rep_tree_files, output_tree)

    return output_tree
def generate(self, genome_file, gff_file):
    """Derive metadata from gene sequences.

    Parameters
    ----------
    genome_file : str
        Name of fasta file containing nucleotide sequences.
    gff_file : str
        Name of generic feature file describing genes.

    Returns
    -------
    dict : d[metadata_field] -> value
        Map of metadata fields to their respective values.
    dict : d[metadata_field] -> description
        Description of each metadata field.
    """
    gff_parser = GenericFeatureParser(gff_file)
    coding_bases = gff_parser.total_coding_bases()

    # genome size is needed to express coding bases as a density
    scaffolds = seq_io.read(genome_file)
    genome_size = sum(len(seq) for seq in list(scaffolds.values()))

    metadata = [
        ('protein_count', gff_parser.cds_count,
         "Number of protein coding genes."),
        ('tRNA_count', gff_parser.tRNA_count,
         "Number of tRNA genes."),
        ('ncRNA_count', gff_parser.ncRNA_count,
         "Number of ncRNA genes."),
        ('rRNA_count', gff_parser.rRNA_count,
         "Number of rRNA genes."),
        ('16S_count', gff_parser.rRNA_16S_count,
         "Number of 16S rRNA genes."),
        ('coding_bases', coding_bases,
         "Number of coding bases in genome."),
        ('coding_density', float(coding_bases) * 100.0 / genome_size,
         "Percentage of coding bases in genome."),
    ]

    gene_stats = {}
    gene_desc = {}
    for field, value, desc in metadata:
        gene_stats[field] = value
        gene_desc[field] = desc

    return gene_stats, gene_desc
def modify(input_file, scaffold_file, seqs_to_add, seqs_to_remove, output_file):
    """Add or remove scaffolds from a fasta file.

    Parameters
    ----------
    input_file : str
        Fasta file to modify.
    scaffold_file : str
        Fasta file containing scaffolds to add.
    seqs_to_add : iterable
        Unique ids of scaffolds to add.
    seqs_to_remove : iterable
        Unique ids of scaffolds to remove.
    output_file : str
        Desired name of modified fasta file.

    Returns
    -------
    iterable, iterable
        Unique ids of sequences that could not be added,
        unique ids of sequences that could not be removed.
    """
    seqs = seq_io.read(input_file)

    # add sequences to bin; the scaffold file is only read when there is
    # something to add (former code re-checked '!= None' and read the file
    # even for an empty id list)
    failed_to_add = set()
    if seqs_to_add:
        add_ids = set(seqs_to_add)
        failed_to_add = set(add_ids)
        for seq_id, seq in seq_io.read_seq(scaffold_file):
            if seq_id in add_ids:
                # discard() tolerates duplicate ids in the scaffold file,
                # where remove() would raise KeyError on a repeat
                failed_to_add.discard(seq_id)
                seqs[seq_id] = seq

    # remove sequences from bin
    failed_to_remove = set()
    if seqs_to_remove:
        failed_to_remove = set(seqs_to_remove)
        for seq_id in seqs_to_remove:
            if seq_id in seqs:
                failed_to_remove.discard(seq_id)
                seqs.pop(seq_id)

    # save modified bin
    seq_io.write_fasta(seqs, output_file)

    return failed_to_add, failed_to_remove
def bootstrap(self, input_tree, msa_file, seq_type, model_str, gamma, num_replicates, output_dir, cpus):
    """Perform non-parametric bootstrapping.

    Parameters
    ----------
    input_tree : str
        File containing newick tree to decorate with bootstraps.
    msa_file : str
        Fasta file containing multiple sequence alignment.
    seq_type : str
        Specifies multiple sequences alignment is of 'nt' or 'prot'.
    model_str : str
        Specified either the 'wag' or 'jtt' model.
    gamma : bool
        Indicates if GAMMA model should be used.
    num_replicates : int
        Number of replicates to perform.
    output_dir : str
        Output directory to contain bootstrap trees.
    cpus : int
        Number of cpus to use.
    """
    assert seq_type.upper() in ['NT', 'PROT']
    assert model_str.upper() in ['WAG', 'LG', 'JTT', 'GTR']

    self.output_dir = output_dir
    self.seq_type = seq_type
    self.model = model_str
    self.gamma = gamma
    self.msa = seq_io.read(msa_file)

    # generate replicates in parallel
    replicate_numbers = list(range(num_replicates))
    parallel = Parallel(cpus)
    parallel.run(self._bootstrap, None, replicate_numbers, None)

    # gather the per-replicate trees and compute support values
    rep_tree_files = [
        os.path.join(self.output_dir, 'rep_%d' % i, 'bootstrap.tree')
        for i in replicate_numbers
    ]

    tree_name = os.path.splitext(os.path.basename(input_tree))[0]
    output_tree = os.path.join(output_dir, tree_name + '.bootstrap.tree')
    bootstrap_support(input_tree, rep_tree_files, output_tree)

    return output_tree
def generate(self, genome_file, gff_file):
    """Derive metadata from gene sequences.

    Parameters
    ----------
    genome_file : str
        Name of fasta file containing nucleotide sequences.
    gff_file : str
        Name of generic feature file describing genes.

    Returns
    -------
    dict : d[metadata_field] -> value
        Map of metadata fields to their respective values.
    dict : d[metadata_field] -> description
        Description of each metadata field.
    """
    gff_parser = GenericFeatureParser(gff_file)
    coding_bases = gff_parser.total_coding_bases()

    # genome size is needed to express coding bases as a density
    scaffolds = seq_io.read(genome_file)
    genome_size = sum(len(seq) for seq in scaffolds.values())

    gene_stats = {
        'protein_count': gff_parser.cds_count,
        'tRNA_count': gff_parser.tRNA_count,
        'ncRNA_count': gff_parser.ncRNA_count,
        'rRNA_count': gff_parser.rRNA_count,
        '16S_count': gff_parser.rRNA_16S_count,
        'coding_bases': coding_bases,
        'coding_density': float(coding_bases) * 100.0 / genome_size,
    }

    gene_desc = {
        'protein_count': "Number of protein coding genes.",
        'tRNA_count': "Number of tRNA genes.",
        'ncRNA_count': "Number of ncRNA genes.",
        'rRNA_count': "Number of rRNA genes.",
        '16S_count': "Number of 16S rRNA genes.",
        'coding_bases': "Number of coding bases in genome.",
        'coding_density': "Percentage of coding bases in genome.",
    }

    return gene_stats, gene_desc
def _extract(self, genome_file, best_hits, output_dir):
    """Extract rRNA genes.

    Parameters
    ----------
    genome_file : str
        Name of fasta file containing nucleotide sequences.
    best_hits : d[seq_id] -> information about best hit
        Information about best hits.
    output_dir : str
        Output directory.

    Returns
    -------
    str
        Name of fasta file containing extracted sequences.
    """
    # write summary file and putative SSU rRNAs to file
    summary_file = os.path.join(output_dir, '%s.hmm_summary.tsv' % self.rna_name)
    summary_out = open(summary_file, 'w')
    summary_out.write(
        'Sequence Id\tHMM\ti-Evalue\tStart hit\tEnd hit\tSSU gene length\tReverse Complement\tSequence length\n'
    )

    ssu_seq_file = os.path.join(output_dir, '%s.fna' % self.rna_name)
    seq_out = open(ssu_seq_file, 'w')

    seqs = seq_io.read(genome_file)
    for hit_id in best_hits:
        # hit ids may carry a '-#<n>' suffix distinguishing multiple hits
        # on the same scaffold; strip it to recover the scaffold id
        scaffold_id = hit_id[0:hit_id.rfind('-#')] if '-#' in hit_id else hit_id

        seq_info = [hit_id] + best_hits[hit_id]
        seq = seqs[scaffold_id]
        summary_out.write('\t'.join(seq_info) + '\t' + str(len(seq)) + '\n')

        seq_out.write('>' + seq_info[0] + '\n')
        seq_out.write(seq[int(seq_info[3]) + 1:int(seq_info[4]) + 1] + '\n')

    summary_out.close()
    seq_out.close()

    return ssu_seq_file
def add_compatible_unique(self, scaffold_file, genome_file, compatible_file, out_genome):
    """Add sequences specified as compatible.

    Only sequences specified exactly once in the compatibility
    file are added.

    Parameters
    ----------
    scaffold_file : str
        Fasta file containing scaffolds to add.
    genome_file : str
        Fasta file of binned scaffolds.
    compatible_file : str
        File specifying compatible scaffolds.
    out_genome : str
        Name of output genome.
    """
    cur_bin_id = remove_extension(genome_file)

    # count occurrences of each scaffold and record its assigned bin;
    # counting in a dict avoids the O(n^2) cost of calling list.count()
    # once per scaffold, as the former code did
    occurrence = {}
    bin_ids = {}
    with open(compatible_file) as f:
        f.readline()  # skip header
        for line in f:
            line_split = line.split('\t')
            scaffold_id = line_split[0]
            bin_ids[scaffold_id] = line_split[1].strip()
            occurrence[scaffold_id] = occurrence.get(scaffold_id, 0) + 1

    # dict.iteritems() is Python 2 only; items() works in both 2 and 3
    compatible_scaffolds = set()
    for scaffold_id, bin_id in bin_ids.items():
        if occurrence[scaffold_id] == 1 and bin_id == cur_bin_id:
            compatible_scaffolds.add(scaffold_id)

    # add compatible sequences to genome
    genome_seqs = seq_io.read(genome_file)
    for seq_id, seq in seq_io.read_seq(scaffold_file):
        if seq_id in compatible_scaffolds:
            genome_seqs[seq_id] = seq

    # save modified bin
    seq_io.write_fasta(genome_seqs, out_genome)
def create_records(self, metadata_file, msa_file, taxonomy_file, genome_list, output_file):
    """Create ARB records from GTDB metadata.

    Parameters
    ----------
    metadata_file : str
        CSV or TSV file of GTDB metadata; first column is the genome id.
    msa_file : str
        Optional fasta file with aligned sequences for each genome.
    taxonomy_file : str
        Optional file with GTDB taxonomy for each genome.
    genome_list : str
        Optional file listing the genome ids to retain.
    output_file : str
        Name of output file with ARB records.
    """
    seqs = {}
    if msa_file:
        seqs = seq_io.read(msa_file)

    taxonomy = {}
    if taxonomy_file:
        taxonomy = Taxonomy().read(taxonomy_file)

    genomes_to_keep = set()
    if genome_list:
        for line in open(genome_list):
            genomes_to_keep.add(line.strip())

    fout = open(output_file, 'w')

    delimiter = ','
    if metadata_file.endswith('.tsv'):
        delimiter = '\t'

    header = True
    # csv.reader requires text mode in Python 3; the former 'rb' mode is a
    # Python 2 idiom and raises an error under Python 3
    with open(metadata_file, newline='') as metadata:
        for row in csv.reader(metadata, delimiter=delimiter):
            if header:
                # normalize column names to snake_case field identifiers
                fields = [
                    f.lower().replace(' ', '_').replace('-', '_')
                    for f in row[1:]
                ]
                if taxonomy:
                    fields.append('gtdb_taxonomy')
                header = False
            else:
                genome_id = row[0]
                values = row[1:]
                if taxonomy:
                    values.append('; '.join(taxonomy[genome_id]))

                aligned_seq = seqs.get(genome_id, '')
                if not genomes_to_keep or genome_id in genomes_to_keep:
                    self._record(fout, genome_id, fields, values, aligned_seq)

    fout.close()
def bootstrap(self, input_tree, msa_file, model_str, num_replicates, output_dir, cpus):
    """Perform non-parametric bootstrapping.

    Parameters
    ----------
    input_tree : str
        File containing newick tree to decorate with bootstraps.
    msa_file : str
        Fasta file containing multiple sequence alignment.
    model_str : str
        Specified either the 'WAG' or 'LG' model.
    num_replicates : int
        Number of replicates to perform.
    output_dir : str
        Output directory to contain bootstrap trees.
    cpus : int
        Number of cpus to use.
    """
    check_on_path('seqmagick')

    assert model_str.upper() in ['WAG', 'LG']

    self.output_dir = output_dir
    self.model = model_str
    self.msa = seq_io.read(msa_file)

    # generate replicates in parallel
    replicate_numbers = list(range(num_replicates))
    parallel = Parallel(cpus)
    parallel.run(self._bootstrap, None, replicate_numbers, None)

    # compute support values from the per-replicate RAxML trees
    rep_tree_files = [
        os.path.join(output_dir, 'rep_%d' % i, 'RAxML_bestTree.support')
        for i in replicate_numbers
    ]

    tree_name = os.path.splitext(os.path.basename(input_tree))[0]
    output_tree = os.path.join(output_dir, tree_name + '.bootstrap.tree')
    bootstrap_support(input_tree, rep_tree_files, output_tree)

    return output_tree
def remove_outliers(self, genome_file, outlier_file, out_genome, modified_only):
    """Remove sequences specified as outliers.

    Any scaffold listed in the first column of the outlier file
    is removed from the specified genome.

    Parameters
    ----------
    genome_file : str
        Fasta file of binned scaffolds.
    outlier_file : str
        File specifying outlying scaffolds.
    out_genome : str
        Name of output genome.
    modified_only : bool
        Only create output file if genome is modified.
    """
    genome_seqs = seq_io.read(genome_file)
    if not genome_seqs:
        return

    # drop scaffolds listed in the outlier file, skipping the header
    # line and any comment lines
    modified = False
    with open(outlier_file) as f:
        f.readline()  # skip header
        for line in f:
            if line[0] == '#':
                continue

            outlier_id = line.split('\t')[0]
            if genome_seqs.pop(outlier_id, None):
                modified = True

    # write the (possibly) modified bin
    if modified or not modified_only:
        seq_io.write_fasta(genome_seqs, out_genome)
def bootstrap(self, input_tree, msa_file, seq_type, model_str, num_replicates, output_tree, cpus):
    """Perform non-parametric bootstrapping.

    Parameters
    ----------
    input_tree : str
        File containing newick tree to decorate with bootstraps.
    msa_file : str
        Fasta file containing multiple sequence alignment.
    seq_type : str
        Specifies multiple sequences alignment is of 'nt' or 'prot'.
    model_str : str
        Specified either the 'wag' or 'jtt' model.
    num_replicates : int
        Number of replicates to perform.
    output_tree : str
        Output file containing tree with bootstrap values.
    cpus : int
        Number of cpus to use.
    """
    assert(seq_type in ['nt', 'prot'])
    assert(model_str in ['wag', 'jtt'])

    # replicates are written to a temporary directory removed at the end
    self.replicate_dir = tempfile.mkdtemp()
    self.seq_type = seq_type
    self.model = model_str
    self.msa = seq_io.read(msa_file)

    # calculate replicates
    # (former code used xrange, which is Python 2 only; range works in both)
    parallel = Parallel(cpus)
    parallel.run(self._bootstrap, None, range(num_replicates), None)

    # calculate support values
    rep_tree_files = []
    for rep_index in range(num_replicates):
        rep_tree_files.append(os.path.join(self.replicate_dir, 'bootstrap.tree.' + str(rep_index) + '.tre'))

    bootstrap_support(input_tree, rep_tree_files, output_tree)

    shutil.rmtree(self.replicate_dir)
def create_records(self, metadata_file, msa_file, taxonomy_file, genome_list, output_file):
    """Create ARB records from GTDB metadata.

    Parameters
    ----------
    metadata_file : str
        CSV or TSV file of GTDB metadata; first column is the genome id.
    msa_file : str
        Optional fasta file with aligned sequences for each genome.
    taxonomy_file : str
        Optional file with GTDB taxonomy for each genome.
    genome_list : str
        Optional file listing the genome ids to retain.
    output_file : str
        Name of output file with ARB records.
    """
    seqs = {}
    if msa_file:
        seqs = seq_io.read(msa_file)

    taxonomy = {}
    if taxonomy_file:
        taxonomy = Taxonomy().read(taxonomy_file)

    genomes_to_keep = set()
    if genome_list:
        for line in open(genome_list):
            genomes_to_keep.add(line.strip())

    fout = open(output_file, 'w')

    delimiter = ','
    if metadata_file.endswith('.tsv'):
        delimiter = '\t'

    header = True
    # csv.reader requires text mode in Python 3; the former 'rb' mode is a
    # Python 2 idiom and raises an error under Python 3
    with open(metadata_file, newline='') as metadata:
        for row in csv.reader(metadata, delimiter=delimiter):
            if header:
                # normalize column names to snake_case field identifiers
                fields = [f.lower().replace(' ', '_').replace('-', '_') for f in row[1:]]
                if taxonomy:
                    fields.append('gtdb_taxonomy')
                header = False
            else:
                genome_id = row[0]
                values = row[1:]
                if taxonomy:
                    values.append('; '.join(taxonomy[genome_id]))

                aligned_seq = seqs.get(genome_id, '')
                if not genomes_to_keep or genome_id in genomes_to_keep:
                    self._record(fout, genome_id, fields, values, aligned_seq)

    fout.close()
def manual(self, options):
    """Manual command.

    Partition the scaffolds of a genome into cluster fasta files
    according to the cluster ids given in the cluster file.

    Parameters
    ----------
    options : argparse.Namespace
        Command-line options; must provide cluster_file, genome_file,
        and output_dir attributes.
    """
    check_file_exists(options.cluster_file)
    check_file_exists(options.genome_file)
    make_sure_path_exists(options.output_dir)

    genome_id = remove_extension(options.genome_file)

    seqs = seq_io.read(options.genome_file)
    fout = {}
    # NOTE: former code rebound the loop variable 'f' to the per-cluster
    # output handle, shadowing the open cluster-file handle; distinct
    # names avoid that hazard
    with open(options.cluster_file) as cluster_file:
        cluster_file.readline()  # skip header
        for line in cluster_file:
            line_split = line.rstrip().split('\t')
            scaffold_id = line_split[0]
            cluster_id = int(line_split[1])

            # negative values indicate scaffolds that should
            # not be placed in a cluster
            if cluster_id < 0:
                continue

            if cluster_id not in fout:
                fout[cluster_id] = open(
                    os.path.join(options.output_dir,
                                 genome_id + '_c%d.fna' % cluster_id), 'w')

            out = fout[cluster_id]
            out.write('>' + scaffold_id + '\n')
            out.write(seqs[scaffold_id] + '\n')

    for out in fout.values():
        out.close()

    self.logger.info('Partitioned sequences written to: ' + options.output_dir)
def add_compatible_closest(self, scaffold_file, genome_file, compatible_file, min_len, out_genome):
    """Add sequences specified as compatible.

    A sequence is added to a bin if and only if it is closest
    to that bin in GC, tetranucleotide, and coverage space.

    Parameters
    ----------
    scaffold_file : str
        Fasta file containing scaffolds to add.
    genome_file : str
        Fasta file of binned scaffolds.
    compatible_file : str
        File specifying compatible scaffolds.
    min_len : int
        Minimum length to add scaffold.
    out_genome : str
        Name of output genome.
    """
    cur_bin_id = remove_extension(genome_file)

    # determine GC, tetranucleotide, and coverage distances between
    # each potentially compatible scaffold and every candidate bin
    dists = defaultdict(dict)
    with open(compatible_file) as f:
        headers = [x.strip() for x in f.readline().split('\t')]
        scaffold_gc_index = headers.index('Scaffold GC')
        genome_gc_index = headers.index('Median genome GC')
        td_dist_index = headers.index('Scaffold TD')
        scaffold_cov_index = headers.index('Scaffold coverage')
        genome_cov_index = headers.index('Median genome coverage')

        for line in f:
            fields = line.split('\t')
            scaffold_id = fields[0]
            bin_id = fields[1].strip()

            gc_dist = abs(float(fields[scaffold_gc_index]) - float(fields[genome_gc_index]))
            td_dist = float(fields[td_dist_index])
            cov_dist = abs(float(fields[scaffold_cov_index]) - float(fields[genome_cov_index]))

            dists[scaffold_id][bin_id] = [gc_dist, td_dist, cov_dist]

    # a scaffold is compatible only when this bin is strictly closest
    # in all three metrics simultaneously
    compatible_scaffolds = set()
    for scaffold_id, bin_dists in dists.items():
        best_gc = (1e9, None)
        best_td = (1e9, None)
        best_cov = (1e9, None)
        for bin_id, (gc, td, cov) in bin_dists.items():
            if gc < best_gc[0]:
                best_gc = (gc, bin_id)
            if td < best_td[0]:
                best_td = (td, bin_id)
            if cov < best_cov[0]:
                best_cov = (cov, bin_id)

        if (best_gc[1] == best_td[1] == best_cov[1]) and best_gc[1] == cur_bin_id:
            compatible_scaffolds.add(scaffold_id)

    self.logger.info('Identified {:,} compatible scaffolds.'.format(len(compatible_scaffolds)))

    # add compatible sequences meeting the length criterion to the genome
    added_seqs = 0
    genome_seqs = seq_io.read(genome_file)
    for seq_id, seq in seq_io.read_seq(scaffold_file):
        if seq_id in compatible_scaffolds and len(seq) >= min_len:
            genome_seqs[seq_id] = seq
            added_seqs += 1

    self.logger.info('Added {:,} scaffolds meeting length criterion.'.format(added_seqs))

    # save modified bin
    seq_io.write_fasta(genome_seqs, out_genome)
def run(self, input_tree, msa_file, marker_info_file, mask_file, perc_markers_to_keep, num_replicates, model, jk_dir, output_dir):
    """Jackknife marker genes.

    Marker file should have the format:
      <marker id>\t<marker name>\t<marker desc>\t<length>\n

    Parameters
    ----------
    input_tree : str
        Tree inferred with all data.
    msa_file : str
        File containing multiple sequence alignment for all taxa.
    marker_info_file : str
        File indicating database id, HMM name, description and length of each marker in the alignment.
    mask_file : str
        File indicating masking of multiple sequence alignment.
    perc_markers_to_keep : float [0, 1]
        Percentage of marker genes to keep in each replicate.
    num_replicates : int
        Number of replicates to perform.
    model : str
        Desired model of evolution.
    jk_dir : str
        Directory with pre-computed jackknife trees; falsy to compute them.
    output_dir : str
        Output directory for jackknife trees.
    """
    assert(model in ['wag', 'jtt'])

    self.model = model
    self.perc_markers_to_keep = perc_markers_to_keep

    rep_tree_files = []
    if not jk_dir:
        self.replicate_dir = os.path.join(output_dir, 'replicates')
        make_sure_path_exists(self.replicate_dir)

        # determine length of each marker gene in alignment
        marker_lengths = []
        total_len = 0
        with open(marker_info_file) as f:
            f.readline()  # skip header
            for line in f:
                line_split = line.split('\t')
                ml = int(line_split[3])
                marker_lengths.append(ml)
                total_len += ml

        self.logger.info('Concatenated length of markers: %d' % total_len)

        # read mask and determine the masked length of each marker
        mask = open(mask_file).readline().strip()

        start = 0
        self.marker_lengths = []
        total_mask_len = 0
        for ml in marker_lengths:
            end = start + ml
            zeros = mask[start:end].count('0')
            start = end

            self.marker_lengths.append(ml - zeros)
            total_mask_len += ml - zeros

        self.logger.info('Concatenated length of filtered MSA: %d' % total_mask_len)

        # read full multiple sequence alignment;
        # dict.values() is not indexable under Python 3 (former code used
        # values()[0]), so take the first sequence with next(iter(...))
        self.msa = seq_io.read(msa_file)
        if len(next(iter(self.msa.values()))) != total_mask_len:
            self.logger.error('Length of MSA does not meet length of mask.')
            sys.exit()

        # calculate replicates
        # (former code used xrange, which is Python 2 only)
        self.logger.info('Calculating jackknife marker replicates:')
        parallel = Parallel(self.cpus)
        parallel.run(self._producer, None, range(num_replicates), self._progress)

        # calculate support
        self.logger.info('Calculating support for %d replicates.' % num_replicates)
        for rep_index in range(num_replicates):
            rep_tree_files.append(os.path.join(self.replicate_dir, 'jk_markers.tree.' + str(rep_index) + '.tre'))
    else:
        for f in os.listdir(jk_dir):
            if f.endswith('.tree') or f.endswith('.tre'):
                rep_tree_files.append(os.path.join(jk_dir, f))
        self.logger.info('Read %d jackknife replicates.' % len(rep_tree_files))

    output_tree = os.path.join(output_dir, remove_extension(input_tree) + '.jk_markers.tree')
    bootstrap_support(input_tree, rep_tree_files, output_tree)

    return output_tree
def run(self, input_tree, msa_file, num_replicates, model, gamma,
        base_type, frac, boot_dir, output_dir):
    """Bootstrap multiple sequence alignment.

    Parameters
    ----------
    input_tree : str
        Tree inferred with all data.
    msa_file : str
        File containing multiple sequence alignment for all taxa.
    num_replicates : int
        Number of replicates to perform.
    model : str
        Desired model of evolution ('wag', 'lg', or 'jtt').
    gamma
        Forwarded to the replicate producer via self.gamma
        (presumably a flag for gamma-distributed rates — confirm
        against _producer).
    base_type : str
        Indicates if bases are nucleotides or amino acids ('nt' or 'prot').
    frac : float
        Fraction of alignment to subsample.
    boot_dir : str
        Directory with previously computed bootstrap trees; if set,
        replicates are read from there instead of being calculated.
    output_dir : str
        Directory for bootstrap trees.

    Returns
    -------
    str
        Name of output tree decorated with bootstrap support values.
    """
    assert model in ['wag', 'lg', 'jtt']
    assert base_type in ['nt', 'prot']

    self.model = model
    self.gamma = gamma
    self.base_type = base_type
    self.frac = frac

    rep_tree_files = []
    if not boot_dir:
        self.replicate_dir = os.path.join(output_dir, 'replicates')
        make_sure_path_exists(self.replicate_dir)

        # read full multiple sequence alignment
        self.msa = seq_io.read(msa_file)

        # calculate replicates in parallel; range (not xrange) keeps this
        # compatible with Python 3
        self.logger.info('Calculating bootstrap replicates:')
        parallel = Parallel(self.cpus)
        parallel.run(self._producer, None, range(num_replicates), self._progress)

        for rep_index in range(num_replicates):
            rep_tree_files.append(os.path.join(self.replicate_dir,
                                               'bootstrap_tree.r_' + str(rep_index) + '.tree'))
    else:
        # reuse previously computed replicate trees
        for f in os.listdir(boot_dir):
            if f.endswith('.tree') or f.endswith('.tre'):
                rep_tree_files.append(os.path.join(boot_dir, f))
        self.logger.info('Read %d bootstrap replicates.' % len(rep_tree_files))

    # calculate support values
    self.logger.info('Calculating bootstrap support values.')
    output_tree = os.path.join(output_dir,
                               remove_extension(input_tree) + '.bootstrap.tree')
    bootstrap_support(input_tree, rep_tree_files, output_tree)

    return output_tree
def _dump_seqs(self, genomic_file, gtdb_taxonomy, genomes_of_interest, prefix, min_ar_gene_len, min_bac_gene_len, min_contig_len, output_prefix, output_dir):
    """Write genes passing length filters to domain-specific output files.

    For each genome listed in genomic_file, genes from its
    <prefix>.hmm_summary.tsv / <prefix>.fna files are written to
    either the archaeal or bacterial summary/FASTA/taxonomy outputs,
    depending on the genome's GTDB taxonomy.

    Parameters
    ----------
    genomic_file : str
        File with one '<genome id> <genome path>' pair per line.
    gtdb_taxonomy : dict
        Map from (prefixed) genome id to its taxonomy; values support
        'in' and are joined with '; ' below, so presumably a list of
        rank strings — TODO confirm with caller.
    genomes_of_interest : collection
        If non-empty, restrict processing to these (prefixed) genome ids.
    prefix : str
        Basename prefix of the per-genome HMM summary and FASTA files.
    min_ar_gene_len : int
        Minimum gene length for archaeal genes.
    min_bac_gene_len : int
        Minimum gene length for bacterial genes.
    min_contig_len : int
        Minimum source contig length for a gene to be kept.
    output_prefix : str
        Prefix for all output files.
    output_dir : str
        Directory to write output files.
    """
    # one summary/FASTA/taxonomy trio per domain
    fout_ar_summary = open(
        os.path.join(output_dir, output_prefix + '_ar.tsv'), 'w')
    fout_ar_fna = open(os.path.join(output_dir, output_prefix + '_ar.fna'), 'w')
    fout_ar_taxonmy = open(
        os.path.join(output_dir, output_prefix + '_ar_taxonomy.tsv'), 'w')
    fout_bac_summary = open(
        os.path.join(output_dir, output_prefix + '_bac.tsv'), 'w')
    fout_bac_fna = open(
        os.path.join(output_dir, output_prefix + '_bac.fna'), 'w')
    fout_bac_taxonmy = open(
        os.path.join(output_dir, output_prefix + '_bac_taxonomy.tsv'), 'w')

    write_header = True
    total_seq = 0
    for line in open(genomic_file):
        gid, genome_path = [t.strip() for t in line.split()]
        # normalize accessions to GTDB-style prefixed ids
        if gid.startswith('GCA_'):
            gid = 'GB_' + gid
        elif gid.startswith('GCF_'):
            gid = 'RS_' + gid

        if genomes_of_interest and gid not in genomes_of_interest:
            continue

        # route output to the archaeal or bacterial file trio
        if 'd__Archaea' in gtdb_taxonomy[gid]:
            fout_summary = fout_ar_summary
            fout_fna = fout_ar_fna
            fout_taxonomy = fout_ar_taxonmy
            min_gene_len = min_ar_gene_len
        else:
            fout_summary = fout_bac_summary
            fout_fna = fout_bac_fna
            fout_taxonomy = fout_bac_taxonmy
            min_gene_len = min_bac_gene_len

        # extract sequences
        hmm_summary = os.path.join(genome_path, prefix + '.hmm_summary.tsv')
        if not os.path.exists(hmm_summary):
            continue

        seqs = seq_io.read(os.path.join(genome_path, prefix + '.fna'))

        gene_count = 0
        with open(hmm_summary) as f:
            header = f.readline()
            # NOTE(review): the header is written only to whichever
            # summary file (archaeal or bacterial) the first processed
            # genome routes to — the other summary file gets no header.
            # Looks unintended; confirm before relying on these files.
            if write_header:
                write_header = False
                fout_summary.write('%s\t%s' % ('Gene ID', header))
            # NOTE(review): this inner 'line' shadows the outer loop
            # variable over genomic_file; harmless here since the outer
            # value is not reused after this point.
            for line in f:
                line_split = line.strip().split('\t')
                gene_id = line_split[0]
                gene_len = int(line_split[5])
                contig_len = int(line_split[-1])
                if gene_len >= min_gene_len and contig_len >= min_contig_len:
                    # gene ids are made globally unique by prefixing the genome id
                    unique_gene_id = '%s~gene_%s' % (gid, gene_count)
                    fout_summary.write('%s\t%s' % (unique_gene_id, line))

                    fout_fna.write('>%s [%s]\n' % (unique_gene_id, gene_id))
                    fout_fna.write(seqs[gene_id] + '\n')

                    fout_taxonomy.write(
                        '%s\t%s\n' % (unique_gene_id, '; '.join(gtdb_taxonomy[gid])))

                    gene_count += 1
                    total_seq += 1

    fout_ar_summary.close()
    fout_ar_fna.close()
    fout_ar_taxonmy.close()
    fout_bac_summary.close()
    fout_bac_fna.close()
    fout_bac_taxonmy.close()

    self.logger.info('Wrote %d sequences.' % total_seq)
def generate(self, genome_file, contig_break):
    """Derive metadata across nucleotide sequences.

    Parameters
    ----------
    genome_file : str
        Name of fasta file containing nucleotide sequences.
    contig_break : int
        Minimum number of ambiguous bases for defining contigs.

    Returns
    -------
    dict : d[metadata_field] -> value
        Map of metadata fields to their respective values.
    dict : d[metadata_field] -> description
        Description of each metadata field.
    """
    # calculate nucleotide statistics
    scaffolds = seq_io.read(genome_file)

    nuc_stats = {}
    nuc_desc = {}

    # scaffold-level statistics
    nuc_stats['scaffold_count'] = len(scaffolds)
    nuc_desc['scaffold_count'] = "Number of scaffolds in genome."

    nuc_stats['gc_count'] = genome_tk.gc_count(scaffolds)
    nuc_desc['gc_count'] = "Number of G or C bases in genome."

    nuc_stats['gc_percentage'] = genome_tk.gc(scaffolds) * 100.0
    nuc_desc['gc_percentage'] = "GC content of genome."

    # generator expression avoids materializing an intermediate list
    nuc_stats['genome_size'] = sum(len(x) for x in scaffolds.values())
    nuc_desc[
        'genome_size'] = "Total base pairs in genome including nucleotide bases, ambiguous bases, and gaps."

    nuc_stats['n50_scaffolds'] = seq_tk.N50(scaffolds)
    nuc_desc[
        'n50_scaffolds'] = "Scaffold length at which 50% of total bases in assembly are in scaffolds of that length or greater."

    nuc_stats['l50_scaffolds'] = seq_tk.L50(scaffolds,
                                            nuc_stats['n50_scaffolds'])
    nuc_desc[
        'l50_scaffolds'] = "Number of scaffolds longer than, or equal to, the scaffold N50 length."

    nuc_stats['mean_scaffold_length'] = seq_tk.mean_length(scaffolds)
    nuc_desc[
        'mean_scaffold_length'] = "Mean length of scaffolds in base pairs."

    nuc_stats['longest_scaffold'] = seq_tk.max_length(scaffolds)
    nuc_desc['longest_scaffold'] = "Number of bases in longest scaffold."

    # contig-level statistics; contigs are defined by runs of at least
    # contig_break ambiguous bases within a scaffold
    contigs = seq_tk.identify_contigs(scaffolds, 'N' * contig_break)
    nuc_stats['contig_count'] = len(contigs)
    nuc_desc['contig_count'] = "Number of contigs in genome."

    nuc_stats['ambiguous_bases'] = genome_tk.ambiguous_nucleotides(contigs)
    nuc_desc['ambiguous_bases'] = "Number of ambiguous bases in contigs."

    # gap length = ambiguous bases in scaffolds not attributable to contigs
    nuc_stats['total_gap_length'] = genome_tk.ambiguous_nucleotides(
        scaffolds) - nuc_stats['ambiguous_bases']
    nuc_desc[
        'total_gap_length'] = "Number of ambiguous bases comprising gaps in scaffolds."

    nuc_stats['n50_contigs'] = seq_tk.N50(contigs)
    nuc_desc[
        'n50_contigs'] = "Contig length at which 50% of total bases in assembly are in contigs of that length or greater."

    nuc_stats['l50_contigs'] = seq_tk.L50(contigs,
                                          nuc_stats['n50_contigs'])
    nuc_desc[
        'l50_contigs'] = "Number of contigs longer than, or equal to, the contig N50 length."

    nuc_stats['mean_contig_length'] = seq_tk.mean_length(contigs)
    nuc_desc[
        'mean_contig_length'] = "Mean length of contigs in base pairs."

    nuc_stats['longest_contig'] = seq_tk.max_length(contigs)
    nuc_desc['longest_contig'] = "Number of bases in longest contig."

    return nuc_stats, nuc_desc
def run(self, scaffold_stats, num_clusters, num_components, K, no_coverage, no_pca, iterations, genome_file, output_dir):
    """Cluster genome scaffolds with k-means.

    Parameters
    ----------
    scaffold_stats : ScaffoldStats
        Statistics for individual scaffolds.
    num_clusters : int
        Number of clusters to form.
    num_components : int
        Number of PCA components to consider.
    K : int
        K-mer size to use for calculating genomic signature (0 disables
        signatures; 4 reuses the precomputed tetranucleotide signature).
    no_coverage : boolean
        Flag indicating if coverage information should be used during clustering.
    no_pca : boolean
        Flag indicating if PCA of genomic signature should be calculated.
    iterations : int
        Iterations of clustering to perform.
    genome_file : str
        Sequences being clustered.
    output_dir : str
        Directory to write results.
    """
    # get GC and mean coverage for each scaffold in genome
    self.logger.info('')
    self.logger.info(' Determining mean coverage and genomic signatures.')
    signatures = GenomicSignature(K)

    genome_stats = []
    signature_matrix = []
    seqs = seq_io.read(genome_file)
    # items() instead of iteritems() works on Python 2 and 3 alike
    for seq_id, seq in seqs.items():
        stats = scaffold_stats.stats[seq_id]

        if not no_coverage:
            genome_stats.append((np_mean(stats.coverage)))
        else:
            genome_stats.append(())

        if K == 0:
            pass
        elif K == 4:
            # tetranucleotide signature is precomputed by scaffold_stats
            signature_matrix.append(stats.signature)
        else:
            # compute and normalize signature for the requested k-mer size
            sig = signatures.seq_signature(seq)
            total_kmers = sum(sig)
            for i in range(0, len(sig)):
                sig[i] = float(sig[i]) / total_kmers
            signature_matrix.append(sig)

    # calculate PCA of tetranucleotide signatures
    if K != 0:
        if not no_pca:
            self.logger.info(' Calculating PCA of genomic signatures.')
            pc, variance = self.pca(signature_matrix)
            self.logger.info(' First %d PCs capture %.1f%% of the variance.'
                             % (num_components, sum(variance[0:num_components]) * 100))

            for i, stats in enumerate(genome_stats):
                genome_stats[i] = np_append(stats, pc[i][0:num_components])
        else:
            self.logger.info(' Using complete genomic signature.')
            for i, stats in enumerate(genome_stats):
                genome_stats[i] = np_append(stats, signature_matrix[i])

    # whiten data if feature matrix contains coverage and genomic signature data
    if not no_coverage and K != 0:
        # use the logger for consistency with every other status message
        # (this was a bare Python 2 print statement)
        self.logger.info(' Whitening data.')
        genome_stats = whiten(genome_stats)
    else:
        genome_stats = np_array(genome_stats)

    # cluster; retry on ClusterError since the 'points' initialization
    # can occasionally produce an empty cluster
    self.logger.info(' Partitioning genome into %d clusters.' % num_clusters)

    bError = True
    while bError:
        try:
            bError = False
            _centroids, labels = kmeans2(genome_stats, num_clusters, iterations, minit='points', missing='raise')
        except ClusterError:
            bError = True

    for k in range(num_clusters):
        self.logger.info(' Placed %d sequences in cluster %d.' % (sum(labels == k), (k + 1)))

    # write out clusters; index a materialized list of ids since
    # dict.keys() is not subscriptable on Python 3
    genome_id = remove_extension(genome_file)
    seq_ids = list(seqs)
    for k in range(num_clusters):
        fout = open(os.path.join(output_dir, genome_id + '_c%d' % (k + 1) + '.fna'), 'w')
        for i in np_where(labels == k)[0]:
            seq_id = seq_ids[i]
            fout.write('>' + seq_id + '\n')
            fout.write(seqs[seq_id] + '\n')
        fout.close()
def run(self, input_tree, msa_file, marker_info_file, mask_file,
        perc_markers_to_keep, num_replicates, model, jk_dir, output_dir):
    """Jackknife marker genes.

    Marker file should have the format:
      <marker id>\t<marker name>\t<marker desc>\t<length>\n

    Parameters
    ----------
    input_tree : str
        Tree inferred with all data.
    msa_file : str
        File containing multiple sequence alignment for all taxa.
    marker_info_file : str
        File indicating database id, HMM name, description and length of each marker in the alignment.
    mask_file : str
        File indicating masking of multiple sequence alignment.
    perc_markers_to_keep : float [0, 1]
        Percentage of marker genes to keep in each replicate.
    num_replicates : int
        Number of replicates to perform.
    model : str
        Desired model of evolution ('wag' or 'jtt').
    jk_dir : str
        Directory with previously computed replicate trees; if set,
        replicates are read from there instead of being calculated.
    output_dir : str
        Output directory for jackknife trees.

    Returns
    -------
    str
        Name of output tree decorated with jackknife support values.
    """
    assert model in ['wag', 'jtt']

    self.model = model
    self.perc_markers_to_keep = perc_markers_to_keep

    rep_tree_files = []
    if not jk_dir:
        self.replicate_dir = os.path.join(output_dir, 'replicates')
        make_sure_path_exists(self.replicate_dir)

        # determine length of each marker gene in alignment
        marker_lengths = []
        total_len = 0
        with open(marker_info_file) as f:
            f.readline()  # skip header
            for line in f:
                line_split = line.split('\t')
                ml = int(line_split[3])
                marker_lengths.append(ml)
                total_len += ml

        self.logger.info('Concatenated length of markers: %d' % total_len)

        # read mask; context manager ensures the handle is closed
        with open(mask_file) as f:
            mask = f.readline().strip()

        # determine post-mask length of each marker ('0' columns filtered)
        start = 0
        self.marker_lengths = []
        total_mask_len = 0
        for ml in marker_lengths:
            end = start + ml
            zeros = mask[start:end].count('0')
            start = end

            self.marker_lengths.append(ml - zeros)
            total_mask_len += ml - zeros

        self.logger.info('Concatenated length of filtered MSA: %d' % total_mask_len)

        # read full multiple sequence alignment; next(iter(...)) is
        # portable where dict.values()[0] fails on Python 3
        self.msa = seq_io.read(msa_file)
        if len(next(iter(self.msa.values()))) != total_mask_len:
            self.logger.error(
                'Length of MSA does not meet length of mask.')
            sys.exit()

        # calculate replicates in parallel
        self.logger.info('Calculating jackknife marker replicates:')
        parallel = Parallel(self.cpus)
        parallel.run(self._producer, None, range(num_replicates), self._progress)

        # calculate support
        self.logger.info('Calculating support for %d replicates.' % num_replicates)
        for rep_index in range(num_replicates):
            rep_tree_files.append(
                os.path.join(self.replicate_dir,
                             'jk_markers.tree.' + str(rep_index) + '.tre'))
    else:
        # reuse previously computed replicate trees
        for f in os.listdir(jk_dir):
            if f.endswith('.tree') or f.endswith('.tre'):
                rep_tree_files.append(os.path.join(jk_dir, f))
        self.logger.info('Read %d jackknife replicates.' % len(rep_tree_files))

    output_tree = os.path.join(
        output_dir, remove_extension(input_tree) + '.jk_markers.tree')
    bootstrap_support(input_tree, rep_tree_files, output_tree)

    return output_tree
def split(self, scaffold_stats, criteria1, criteria2, genome_file, output_dir):
    """Split genome into two bins based on genomic features.

    Each criterion is a boolean expression over one of the scaffold
    features 'gc', 'coverage', 'pc1', 'pc2', or 'pc3' (e.g. 'gc > 60').
    Scaffolds satisfying both criteria are written to <genome>_c1.fna
    and all remaining scaffolds to <genome>_c2.fna.

    Parameters
    ----------
    scaffold_stats : ScaffoldStats
        Statistics for individual scaffolds.
    criteria1 : str
        First criteria used for splitting genome.
    criteria2 : str
        Second criteria used for splitting genome.
    genome_file : str
        Sequences being clustered.
    output_dir : str
        Directory to write results.
    """
    seqs = seq_io.read(genome_file)

    # calculate PCA if necessary
    # (the original also built an unused GenomicSignature(K) here with
    # K undefined, raising NameError whenever a PC criterion was used;
    # signatures come straight from scaffold_stats, so it is removed)
    if 'pc' in criteria1 or 'pc' in criteria2:
        self.logger.info('Performing PCA.')

        signature_matrix = []
        for seq_id, seq in seqs.items():
            stats = scaffold_stats.stats[seq_id]
            signature_matrix.append(stats.signature)

        pc, _variance = self.pca(signature_matrix)
        for i, seq_id in enumerate(seqs):
            scaffold_stats.stats[seq_id].pc1 = pc[i][0]
            scaffold_stats.stats[seq_id].pc2 = pc[i][1]
            scaffold_stats.stats[seq_id].pc3 = pc[i][2]

    # split bin; 'with' guarantees the output handles are closed
    genome_id = remove_extension(genome_file)
    with open(os.path.join(output_dir, genome_id + '_c1.fna'), 'w') as fout1, \
            open(os.path.join(output_dir, genome_id + '_c2.fna'), 'w') as fout2:
        for seq_id, seq in seqs.items():
            stats = scaffold_stats.stats[seq_id]

            meet_criteria = True
            for criteria in [criteria1, criteria2]:
                # SECURITY: criteria strings are eval'd; builtins are
                # disabled but eval is still unsafe on untrusted input —
                # only pass operator-supplied expressions here.
                if 'gc' in criteria:
                    v = eval(criteria.replace('gc', str(stats.gc)), {"__builtins__": {}})
                elif 'coverage' in criteria:
                    v = eval(criteria.replace('coverage', str(stats.coverage)), {"__builtins__": {}})
                elif 'pc1' in criteria:
                    v = eval(criteria.replace('pc1', str(stats.pc1)), {"__builtins__": {}})
                elif 'pc2' in criteria:
                    v = eval(criteria.replace('pc2', str(stats.pc2)), {"__builtins__": {}})
                elif 'pc3' in criteria:
                    v = eval(criteria.replace('pc3', str(stats.pc3)), {"__builtins__": {}})

                meet_criteria = meet_criteria and v

            if meet_criteria:
                fout1.write('>' + seq_id + '\n')
                fout1.write(seqs[seq_id] + '\n')
            else:
                fout2.write('>' + seq_id + '\n')
                fout2.write(seqs[seq_id] + '\n')
def kmeans(self, scaffold_stats, num_clusters, num_components, K, no_coverage, no_pca, iterations, genome_file, output_dir):
    """Cluster genome with k-means.

    Parameters
    ----------
    scaffold_stats : ScaffoldStats
        Statistics for individual scaffolds.
    num_clusters : int
        Number of clusters to form.
    num_components : int
        Number of PCA components to consider.
    K : int
        K-mer size to use for calculating genomic signature (0 disables
        signatures; 4 reuses the precomputed tetranucleotide signature).
    no_coverage : boolean
        Flag indicating if coverage information should be used during clustering.
    no_pca : boolean
        Flag indicating if PCA of genomic signature should be calculated.
    iterations : int
        Iterations to perform during clustering.
    genome_file : str
        Sequences being clustered.
    output_dir : str
        Directory to write results.
    """
    # get GC and mean coverage for each scaffold in genome
    self.logger.info('Determining mean coverage and genomic signatures.')
    signatures = GenomicSignature(K)

    genome_stats = []
    signature_matrix = []
    seqs = seq_io.read(genome_file)
    for seq_id, seq in seqs.items():
        stats = scaffold_stats.stats[seq_id]

        if not no_coverage:
            genome_stats.append((np_mean(stats.coverage)))
        else:
            genome_stats.append(())

        if K == 0:
            pass
        elif K == 4:
            # tetranucleotide signature is precomputed by scaffold_stats
            signature_matrix.append(stats.signature)
        else:
            # compute and normalize signature for the requested k-mer size
            sig = signatures.seq_signature(seq)
            total_kmers = sum(sig)
            for i in range(0, len(sig)):
                sig[i] = float(sig[i]) / total_kmers
            signature_matrix.append(sig)

    # calculate PCA of signatures
    if K != 0:
        if not no_pca:
            self.logger.info('Calculating PCA of genomic signatures.')
            pc, variance = self.pca(signature_matrix)
            self.logger.info(
                'First {:,} PCs capture {:.1f}% of the variance.'.format(
                    num_components, sum(variance[0:num_components]) * 100))

            for i, stats in enumerate(genome_stats):
                genome_stats[i] = np_append(stats, pc[i][0:num_components])
        else:
            self.logger.info('Using complete genomic signature.')
            for i, stats in enumerate(genome_stats):
                genome_stats[i] = np_append(stats, signature_matrix[i])

    # whiten data if feature matrix contains coverage and genomic signature data
    if not no_coverage and K != 0:
        self.logger.info('Whitening data.')
        genome_stats = whiten(genome_stats)
    else:
        genome_stats = np_array(genome_stats)

    # cluster; retry on ClusterError since the 'points' initialization
    # can occasionally produce an empty cluster
    self.logger.info(
        'Partitioning genome into {:,} clusters.'.format(num_clusters))

    bError = True
    while bError:
        try:
            bError = False
            _centroids, labels = kmeans2(genome_stats, num_clusters, iterations, minit='points', missing='raise')
        except ClusterError:
            bError = True

    for k in range(num_clusters):
        self.logger.info('Placed {:,} sequences in cluster {:,}.'.format(
            sum(labels == k), (k + 1)))

    # write out clusters
    # fix: dict.keys() is not subscriptable on Python 3, so materialize
    # the ids once and index that list
    genome_id = remove_extension(genome_file)
    seq_ids = list(seqs)
    for k in range(num_clusters):
        fout = open(
            os.path.join(output_dir, genome_id + '_c%d' % (k + 1) + '.fna'), 'w')
        for i in np_where(labels == k)[0]:
            seq_id = seq_ids[i]
            fout.write('>' + seq_id + '\n')
            fout.write(seqs[seq_id] + '\n')
        fout.close()
def add_compatible_closest(self, scaffold_file, genome_file, compatible_file, out_genome):
    """Add sequences specified as compatible.

    A sequence is added to a bin if and only if it is closest to that
    bin in GC, tetranucleotide, and coverage space.

    Parameters
    ----------
    scaffold_file : str
        Fasta file containing scaffolds to add.
    genome_file : str
        Fasta file of binned scaffolds.
    compatible_file : str
        File specifying compatible scaffolds.
    out_genome : str
        Name of output genome.
    """
    cur_bin_id = remove_extension(genome_file)

    # determine statistics for each potentially compatible scaffold
    scaffold_ids = defaultdict(dict)
    with open(compatible_file) as f:
        headers = [x.strip() for x in f.readline().split('\t')]
        scaffold_gc_index = headers.index('Scaffold GC')
        genome_gc_index = headers.index('Mean genome GC')
        td_dist_index = headers.index('Scaffold TD')
        scaffold_cov_index = headers.index('Mean scaffold coverage')
        genome_cov_index = headers.index('Mean genome coverage')

        for line in f:
            line_split = line.split('\t')
            scaffold_id = line_split[0]
            bin_id = line_split[1].strip()

            scaffold_gc = float(line_split[scaffold_gc_index])
            genome_gc = float(line_split[genome_gc_index])
            gc_dist = abs(scaffold_gc - genome_gc)

            td_dist = float(line_split[td_dist_index])

            scaffold_cov = float(line_split[scaffold_cov_index])
            genome_cov = float(line_split[genome_cov_index])
            cov_dist = abs(scaffold_cov - genome_cov)

            scaffold_ids[scaffold_id][bin_id] = [gc_dist, td_dist, cov_dist]

    # determine scaffolds that are closest to a single bin
    # in terms of GC, tetranucleotide distance, and coverage;
    # items() instead of iteritems() keeps this working on Python 3
    # and consistent with add_compatible()
    compatible_scaffolds = set()
    for scaffold_id, bin_stats in scaffold_ids.items():
        best_gc = [1e9, None]
        best_td = [1e9, None]
        best_cov = [1e9, None]
        for bin_id, stats in bin_stats.items():
            gc, td, cov = stats
            if gc < best_gc[0]:
                best_gc = [gc, bin_id]
            if td < best_td[0]:
                best_td = [td, bin_id]
            if cov < best_cov[0]:
                best_cov = [cov, bin_id]

        # check if scaffold is closest to a single bin
        if (best_gc[1] == best_td[1] == best_cov[1]) and best_gc[1] == cur_bin_id:
            compatible_scaffolds.add(scaffold_id)

    # add compatible sequences to genome
    genome_seqs = seq_io.read(genome_file)
    for seq_id, seq in seq_io.read_seq(scaffold_file):
        if seq_id in compatible_scaffolds:
            genome_seqs[seq_id] = seq

    # save modified bin
    seq_io.write_fasta(genome_seqs, out_genome)
def run(self, msa_file, tree_program, prot_model, skip_rooting, output_dir):
    """Infer tree.

    Parameters
    ----------
    msa_file : str
        Multiple sequence alignment in fasta format.
    tree_program : str
        Program to use for tree inference ['fasttree', 'raxml'].
    prot_model : str
        Protein substitution model for tree inference ['WAG', 'LG', 'AUTO'].
    skip_rooting : boolean
        If True, do not midpoint root the inferred tree.
    output_dir : str
        Directory to store results.

    Returns
    -------
    str
        Path to inferred tree (midpoint rooted unless skip_rooting is set).

    Raises
    ------
    SystemExit
        If the MSA has too few sequences or the phylip conversion fails.
    """
    num_seqs = sum([1 for _, _ in seq_io.read_seq(msa_file)])
    if num_seqs <= 2:
        self.logger.error(
            'Insufficient number of sequences in MSA to infer tree.')
        raise SystemExit('Tree inference failed.')

    output_file = ntpath.basename(msa_file)
    prefix = output_file[0:output_file.rfind('.')]
    # (removed unused 'suffix' local)

    if tree_program == 'fasttree':
        self.logger.info(
            'Inferring gene tree with FastTree using %s+GAMMA.' % prot_model)
        fasttree = FastTree(multithreaded=(self.cpus > 1))

        tree_unrooted_output = os.path.join(output_dir, prefix + '.unrooted.tree')
        tree_log = os.path.join(output_dir, prefix + '.tree.log')
        tree_output_log = os.path.join(output_dir, 'fasttree.log')
        fasttree.run(msa_file, 'prot', prot_model, tree_unrooted_output,
                     tree_log, tree_output_log)
    elif tree_program == 'raxml':
        self.logger.info(
            'Inferring gene tree with RAxML using PROTGAMMA%s.' % prot_model)

        # create phylip MSA file
        # NOTE: shell invocation; paths containing spaces or shell
        # metacharacters will break or be interpreted by the shell
        phylip_msa_file = msa_file.replace('.faa', '.phyx')
        cmd = 'seqmagick convert %s %s' % (msa_file, phylip_msa_file)
        if os.system(cmd) != 0:
            # previously a failed conversion was silently ignored and
            # RAxML then failed with a confusing error
            self.logger.error('Failed to convert MSA to phylip format.')
            raise SystemExit('Tree inference failed.')

        # run RAxML
        raxml_dir = os.path.abspath(os.path.join(output_dir, 'raxml'))
        tree_output_log = os.path.join(output_dir, 'raxml.log')

        raxml = RAxML(self.cpus)
        tree_unrooted_output = raxml.run(phylip_msa_file, prot_model, raxml_dir)

    # root tree at midpoint; default to the unrooted tree so tree_output
    # is always bound (previously unbound when rooting was skipped)
    tree_output = tree_unrooted_output
    if not skip_rooting:
        seqs = seq_io.read(msa_file)
        if len(seqs) > 2:
            self.logger.info('Rooting tree at midpoint.')
            tree = dendropy.Tree.get_from_path(tree_unrooted_output,
                                               schema='newick',
                                               rooting="force-rooted",
                                               preserve_underscores=True)
            tree.reroot_at_midpoint(update_bipartitions=False)
            tree_output = os.path.join(output_dir, prefix + '.rooted.tree')
            tree.write_to_path(tree_output,
                               schema='newick',
                               suppress_rooting=True,
                               unquoted_underscores=True)

    return tree_output