def add_compatible(self, scaffold_file, genome_file, compatible_file, min_len, out_genome):
    """Add sequences specified as compatible.

    Any scaffold assigned to the current genome in the compatibility
    file is added, provided it meets the minimum length criterion.

    Parameters
    ----------
    scaffold_file : str
        Fasta file containing scaffolds to add.
    genome_file : str
        Fasta file of binned scaffolds.
    compatible_file : str
        File specifying compatible scaffolds.
    min_len : int
        Minimum length to add scaffold.
    out_genome : str
        Name of output genome.
    """

    cur_bin_id = remove_extension(genome_file)

    # Identify scaffolds flagged as compatible with the current genome.
    # Only the scaffold and bin identifiers are needed here; the original
    # code also parsed GC/TD/coverage statistics but never used them, so
    # that dead computation has been removed.
    scaffold_ids = set()
    with open(compatible_file) as f:
        f.readline()  # skip header line

        for line in f:
            line_split = line.split('\t')
            scaffold_id = line_split[0]
            bin_id = line_split[1].strip()

            if bin_id == cur_bin_id:
                scaffold_ids.add(scaffold_id)

    # add compatible sequences to genome
    added_seqs = 0
    genome_seqs = seq_io.read(genome_file)
    for seq_id, seq in seq_io.read_seq(scaffold_file):
        if seq_id in scaffold_ids and len(seq) >= min_len:
            genome_seqs[seq_id] = seq
            added_seqs += 1

    self.logger.info('Added {:,} scaffolds meeting length criterion.'.format(added_seqs))

    # save modified bin
    seq_io.write_fasta(genome_seqs, out_genome)
def remove_outliers(self, genome_file, outlier_file, out_genome):
    """Remove sequences specified as outliers.

    Any scaffolds listed in the first column of the outlier file
    are removed from the specified genome.

    Parameters
    ----------
    genome_file : str
        Fasta file of binned scaffolds.
    outlier_file : str
        File specifying outlying scaffolds.
    out_genome : str
        Name of output genome.
    """

    genome_seqs = seq_io.read(genome_file)

    # drop every scaffold named in the first column of the outlier file
    with open(outlier_file) as fin:
        fin.readline()  # header line
        for row in fin:
            outlier_id = row.split('\t')[0]
            genome_seqs.pop(outlier_id, None)

    # save modified bin
    seq_io.write_fasta(genome_seqs, out_genome)
def add_compatible_unique(self, scaffold_file, genome_file, compatible_file, min_len, out_genome):
    """Add sequences specified as compatible.

    Only sequences specified exactly once in the compatibility
    file are added.

    Parameters
    ----------
    scaffold_file : str
        Fasta file containing scaffolds to add.
    genome_file : str
        Fasta file of binned scaffolds.
    compatible_file : str
        File specifying compatible scaffolds.
    min_len : int
        Minimum length to add scaffold.
    out_genome : str
        Name of output genome.
    """

    cur_bin_id = remove_extension(genome_file)

    # Determine scaffolds compatible with genome. Occurrence counts are
    # tracked in a dict so duplicates can be rejected in O(1) per scaffold
    # (the original list.count() inside a loop was O(n^2)). iteritems()
    # was also replaced with items() for Python 3 compatibility.
    occurrences = {}
    bin_assignment = {}
    with open(compatible_file) as f:
        f.readline()  # skip header line

        for line in f:
            line_split = line.split('\t')
            scaffold_id = line_split[0]
            bin_id = line_split[1].strip()

            occurrences[scaffold_id] = occurrences.get(scaffold_id, 0) + 1
            bin_assignment[scaffold_id] = bin_id

    compatible_scaffolds = set()
    for scaffold_id, count in occurrences.items():
        if count == 1 and bin_assignment[scaffold_id] == cur_bin_id:
            compatible_scaffolds.add(scaffold_id)

    self.logger.info('Identified %d compatible scaffolds.' % len(compatible_scaffolds))

    # add compatible sequences to genome
    added_seqs = 0
    genome_seqs = seq_io.read(genome_file)
    for seq_id, seq in seq_io.read_seq(scaffold_file):
        if seq_id in compatible_scaffolds and len(seq) >= min_len:
            genome_seqs[seq_id] = seq
            added_seqs += 1

    self.logger.info('Added %d scaffolds meeting length criterion.' % added_seqs)

    # save modified bin
    seq_io.write_fasta(genome_seqs, out_genome)
def create_concatenated_alignment(genome_ids, marker_genes, alignment_dir, concatenated_alignment_file, marker_file):
    """Create concatenated multiple sequence alignment for all genomes.

    Parameters
    ----------
    genome_ids : iterable
        Genomes of interest.
    marker_genes : iterable
        Unique ids of marker genes.
    alignment_dir : str
        Directory containing multiple sequence alignments.
    concatenated_alignment_file : str
        File to containing concatenated alignment.
    marker_file : str
        File indicating length of each marker in the alignment.
    """

    # Read alignment files. Some genomes may have multiple
    # copies of a marker gene in which case the last one
    # is arbitrarily taken. This is acceptable as all genes
    # are already screened to be conspecific.
    alignments = defaultdict(dict)
    marker_length = {}
    for mg in marker_genes:
        f = mg + '.aln.masked.faa'
        seqs = seq_io.read_fasta(os.path.join(alignment_dir, f))

        # iteritems() is Python 2 only; items() works under both versions
        for seq_id, seq in seqs.items():
            genome_id = seq_id[0:seq_id.find(DefaultValues.SEQ_CONCAT_CHAR)]
            alignments[mg][genome_id] = seq
            marker_length[mg] = len(seq)

    # create marker file; context manager guarantees the handle is closed
    with open(marker_file, 'w') as fout:
        for mg in marker_genes:
            fout.write('%s\t%s\t%s\t%d\n' % (mg, mg, mg, marker_length[mg]))

    # create concatenated alignment
    concatenated_seqs = {}
    for mg in marker_genes:
        seqs = alignments[mg]
        for genome_id in genome_ids:
            if genome_id in seqs:
                # append alignment
                concatenated_seqs[genome_id] = concatenated_seqs.get(genome_id, '') + seqs[genome_id]
            else:
                # missing gene: pad with gaps so columns stay aligned
                concatenated_seqs[genome_id] = concatenated_seqs.get(genome_id, '') + '-' * marker_length[mg]

    # save concatenated alignment
    seq_io.write_fasta(concatenated_seqs, concatenated_alignment_file)
def create_concatenated_alignment(genome_ids, marker_genes, alignment_dir, concatenated_alignment_file, marker_file):
    """Create concatenated multiple sequence alignment for all genomes.

    Parameters
    ----------
    genome_ids : iterable
        Genomes of interest.
    marker_genes : iterable
        Unique ids of marker genes.
    alignment_dir : str
        Directory containing multiple sequence alignments.
    concatenated_alignment_file : str
        File to containing concatenated alignment.
    marker_file : str
        File indicating length of each marker in the alignment.
    """

    # Read alignment files. Some genomes may have multiple
    # copies of a marker gene in which case the last one
    # is arbitrarily taken. This is acceptable as all genes
    # are already screened to be conspecific.
    alignments = defaultdict(dict)
    marker_length = {}
    for mg in marker_genes:
        f = mg + '.aln.masked.faa'
        seqs = seq_io.read_fasta(os.path.join(alignment_dir, f))

        # iteritems() is Python 2 only; items() works under both versions
        for seq_id, seq in seqs.items():
            genome_id = seq_id[0:seq_id.find(DefaultValues.SEQ_CONCAT_CHAR)]
            alignments[mg][genome_id] = seq
            marker_length[mg] = len(seq)

    # create marker file; context manager guarantees the handle is closed
    with open(marker_file, 'w') as fout:
        for mg in marker_genes:
            fout.write('%s\t%s\t%s\t%d\n' % (mg, mg, mg, marker_length[mg]))

    # create concatenated alignment
    concatenated_seqs = {}
    for mg in marker_genes:
        seqs = alignments[mg]
        for genome_id in genome_ids:
            if genome_id in seqs:
                # append alignment
                concatenated_seqs[genome_id] = concatenated_seqs.get(genome_id, '') + seqs[genome_id]
            else:
                # missing gene: pad with gaps so columns stay aligned
                concatenated_seqs[genome_id] = concatenated_seqs.get(genome_id, '') + '-' * marker_length[mg]

    # save concatenated alignment
    seq_io.write_fasta(concatenated_seqs, concatenated_alignment_file)
def modify(input_file, scaffold_file, seqs_to_add, seqs_to_remove, output_file):
    """Add or remove scaffolds from a fasta file.

    Parameters
    ----------
    input_file : str
        Fasta file to modify.
    scaffold_file : str
        Fasta file containing scaffolds to add.
    seqs_to_add : iterable
        Unique ids of scaffolds to add.
    seqs_to_remove : iterable
        Unique ids of scaffolds to remove.
    output_file : str
        Desired name of modified fasta file.

    Returns
    -------
    iterable, iterable
        Unique ids of sequences that could not be added,
        unique ids of sequences that could not be removed.
    """

    seqs = seq_io.read(input_file)

    # Add sequences to bin. A single truthiness check replaces the
    # original redundant pair of checks ('if seqs_to_add' followed by
    # 'if seqs_to_add != None'); discard() is used instead of remove()
    # so a scaffold id appearing twice in the fasta file cannot raise.
    failed_to_add = set()
    if seqs_to_add:
        failed_to_add = set(seqs_to_add)
        for seq_id, seq in seq_io.read_seq(scaffold_file):
            if seq_id in seqs_to_add:
                failed_to_add.discard(seq_id)
                seqs[seq_id] = seq

    # remove sequences from bin
    failed_to_remove = set()
    if seqs_to_remove:
        failed_to_remove = set(seqs_to_remove)
        for seq_id in seqs_to_remove:
            if seq_id in seqs:
                failed_to_remove.discard(seq_id)
                seqs.pop(seq_id)

    # save modified bin
    seq_io.write_fasta(seqs, output_file)

    return failed_to_add, failed_to_remove
def add_compatible_unique(self, scaffold_file, genome_file, compatible_file, out_genome):
    """Add sequences specified as compatible.

    Only sequences specified exactly once in the compatibility
    file are added.

    Parameters
    ----------
    scaffold_file : str
        Fasta file containing scaffolds to add.
    genome_file : str
        Fasta file of binned scaffolds.
    compatible_file : str
        File specifying compatible scaffolds.
    out_genome : str
        Name of output genome.
    """

    cur_bin_id = remove_extension(genome_file)

    # Determine scaffolds compatible with genome. Occurrence counts are
    # tracked in a dict so duplicates can be rejected in O(1) per scaffold
    # (the original list.count() inside a loop was O(n^2)). iteritems()
    # was also replaced with items() for Python 3 compatibility.
    occurrences = {}
    bin_assignment = {}
    with open(compatible_file) as f:
        f.readline()  # skip header line

        for line in f:
            line_split = line.split('\t')
            scaffold_id = line_split[0]
            bin_id = line_split[1].strip()

            occurrences[scaffold_id] = occurrences.get(scaffold_id, 0) + 1
            bin_assignment[scaffold_id] = bin_id

    compatible_scaffolds = set()
    for scaffold_id, count in occurrences.items():
        if count == 1 and bin_assignment[scaffold_id] == cur_bin_id:
            compatible_scaffolds.add(scaffold_id)

    # add compatible sequences to genome
    genome_seqs = seq_io.read(genome_file)
    for seq_id, seq in seq_io.read_seq(scaffold_file):
        if seq_id in compatible_scaffolds:
            genome_seqs[seq_id] = seq

    # save modified bin
    seq_io.write_fasta(genome_seqs, out_genome)
def unbinned(self, options):
    """Unbinned Command"""

    check_dir_exists(options.genome_nt_dir)

    # all genome files must contain nucleotide sequences
    genome_files = self._genome_files(options.genome_nt_dir, options.genome_ext)
    if not self._check_nuclotide_seqs(genome_files):
        self.logger.warning('All files must contain nucleotide sequences.')
        sys.exit()

    # identify scaffolds not assigned to any genome and write them out
    unbinned_seqs = Unbinned().run(genome_files,
                                   options.scaffold_file,
                                   options.min_seq_len)
    seq_io.write_fasta(unbinned_seqs, options.output_file)

    self.logger.info('Unbinned scaffolds written to: ' + options.output_file)
def remove_outliers(self, genome_file, outlier_file, out_genome, modified_only):
    """Remove sequences specified as outliers.

    Any scaffolds listed in the first column of the outlier file
    are removed from the specified genome.

    Parameters
    ----------
    genome_file : str
        Fasta file of binned scaffolds.
    outlier_file : str
        File specifying outlying scaffolds.
    out_genome : str
        Name of output genome.
    modified_only : bool
        Only create output file if genome is modified.
    """

    genome_seqs = seq_io.read(genome_file)
    if not genome_seqs:
        return

    # remove scaffolds named in the outlier file, tracking whether the
    # genome was actually changed
    genome_modified = False
    with open(outlier_file) as fin:
        fin.readline()  # header line
        for row in fin:
            if row[0] == '#':
                continue  # skip comment lines

            outlier_id = row.split('\t')[0]
            if genome_seqs.pop(outlier_id, None):
                genome_modified = True

    # save modified bin
    if genome_modified or not modified_only:
        seq_io.write_fasta(genome_seqs, out_genome)
def unbinned(self, options):
    """Unbinned Command"""

    # banner announcing the command
    for msg in ('',
                '*******************************************************************************',
                ' [RefineM - unbinned] Identify unbinned scaffolds.',
                '*******************************************************************************'):
        self.logger.info(msg)

    check_dir_exists(options.genome_nt_dir)

    # all genome files must contain nucleotide sequences
    genomes_files = self._genome_files(options.genome_nt_dir, options.genome_ext)
    if not self._check_nuclotide_seqs(genomes_files):
        self.logger.warning('[Warning] All files must contain nucleotide sequences.')
        sys.exit()

    # identify scaffolds not assigned to any genome and write them out
    unbinned_seqs = Unbinned().run(genomes_files,
                                   options.scaffold_file,
                                   options.min_seq_len)
    seq_io.write_fasta(unbinned_seqs, options.output_file)

    self.logger.info('')
    self.logger.info(' Unbinned scaffolds written to: ' + options.output_file)

    self.time_keeper.print_time_stamp()
def write_windows(self, scaffold_file, output_dir, window_size, window_gap):
    """Write a fasta file of windows created from a scaffold file.

    Takes a scaffold file in fasta format and writes a similarly named
    fasta file containing the windows made from each scaffold, along
    with a links file pairing each scaffold with its windows.

    Parameters
    ----------
    scaffold_file : str
        Name of the fasta file to turn into windows.
    output_dir : str
        Directory to write the window and links files.
    window_size : int
        Size of each window (passed through to make_windows).
    window_gap : int
        Gap between windows (passed through to make_windows).

    Returns
    -------
    list
        [window_file, links_file] paths of the files written.
    """

    seq_win_id = {}   # pairs a scaffold with the windows made from it
    window_dict = {}  # window id -> window sequence
    for seq_id, sequence in seq_io.read_seq(scaffold_file):
        win_id, seq_win = self.make_windows([seq_id, sequence], window_size, window_gap)
        seq_win_id[seq_id] = win_id
        # zip replaces the original index loop over range(len(win_id))
        for wid, wseq in zip(win_id, seq_win):
            window_dict[wid] = wseq

    # derive window file name from the scaffold file name
    filename = os.path.split(scaffold_file)[1]
    start = ".".join(filename.split('.')[:-1])
    end = filename.split('.')[-1]
    window_file = os.path.join(output_dir, start + "windows." + end)

    # print() function call replaces the Python 2 'print len(...)'
    # statement, which is a syntax error under Python 3
    print(len(window_dict))

    seq_io.write_fasta(window_dict, window_file)

    links_file = os.path.join(output_dir, "links_file.tsv")
    self.write_links(seq_win_id, links_file)

    return [window_file, links_file]
def run(self, homolog_file, gene_id_file, taxonomy_file, min_per_taxa, consensus, min_per_bp, use_trimAl, msa_program, tree_program, prot_model, output_dir):
    """Infer a tree over a reduced set of genes.

    Filter a set of homologs to a specified set of gene ids,
    and infer tree over this reduced set of proteins.

    Parameters
    ----------
    homolog_file : str
        Fasta file containing homologs.
    gene_id_file : str
        File with gene ids to retain in tree.
    taxonomy_file : str
        Taxonomic assignment of each reference genomes.
    min_per_taxa : float
        Minimum percentage of taxa required to retain a column.
    consensus : float
        Minimum percentage of the same amino acid required to retain column.
    min_per_bp : float
        Minimum percentage of base pairs required to keep trimmed sequence.
    use_trimAl : boolean
        Filter columns using trimAl.
    msa_program : str
        Program to use for multiple sequence alignment ['mafft', 'muscle'].
    tree_program : str
        Program to use for tree inference ['fasttree', 'raxml'].
    prot_model : str
        Protein substitution model for tree inference ['WAG', 'LG', 'AUTO'].
    output_dir : str
        Output directory.
    """

    # generate msa with reduced sequences
    self.logger.info('Extracting sequences to retain.')
    genes_to_retain = self.read_ids(gene_id_file)
    self.logger.info(' ...identified %d sequences to retain.' % len(genes_to_retain))

    # iteritems() is Python 2 only; a comprehension over items() works
    # under both versions
    seqs = seq_io.read_fasta(homolog_file)
    reduced_seqs = {seq_id: seq
                    for seq_id, seq in seqs.items()
                    if seq_id in genes_to_retain}

    # write reduced set of homologs to '<prefix>.reduced.<ext>'
    dot = homolog_file.rfind('.')
    reduced_homolog_file = homolog_file[0:dot] + '.reduced.' + homolog_file[dot + 1:]
    seq_io.write_fasta(reduced_seqs, reduced_homolog_file)

    self.logger.info('Retained %d sequences.' % len(reduced_seqs))

    # infer multiple sequence alignment
    msa = MsaWorkflow(self.cpus)
    trimmed_msa_output = msa.run(reduced_homolog_file,
                                 min_per_taxa,
                                 consensus,
                                 min_per_bp,
                                 use_trimAl,
                                 msa_program,
                                 output_dir)

    # infer tree
    tw = TreeWorkflow(self.cpus)
    tree_output = tw.run(trimmed_msa_output,
                         tree_program,
                         prot_model,
                         False,
                         output_dir)

    # create tax2tree consensus map and decorate tree
    # NOTE(review): os.system with interpolated paths is shell-injection
    # prone if file names are untrusted; consider subprocess.run([...]).
    self.logger.info('Decorating internal tree nodes with tax2tree.')
    t2t_tree = tree_output.replace('.tree', '.tax2tree.tree')
    os.system('t2t decorate -m %s -t %s -o %s' % (taxonomy_file, tree_output, t2t_tree))
def add_compatible_closest(self, scaffold_file, genome_file, compatible_file, min_len, out_genome):
    """Add sequences specified as compatible.

    A sequence is added to a bin if and only if it is closest
    to that bin in GC, tetranucleotide, and coverage space.

    Parameters
    ----------
    scaffold_file : str
        Fasta file containing scaffolds to add.
    genome_file : str
        Fasta file of binned scaffolds.
    compatible_file : str
        File specifying compatible scaffolds.
    min_len : int
        Minimum length to add scaffold.
    out_genome : str
        Name of output genome.
    """

    cur_bin_id = remove_extension(genome_file)

    # gather distance statistics for each potentially compatible scaffold
    scaffold_stats = defaultdict(dict)
    with open(compatible_file) as f:
        header = [col.strip() for col in f.readline().split('\t')]
        gc_col = header.index('Scaffold GC')
        genome_gc_col = header.index('Median genome GC')
        td_col = header.index('Scaffold TD')
        cov_col = header.index('Scaffold coverage')
        genome_cov_col = header.index('Median genome coverage')

        for row in f:
            fields = row.split('\t')
            scaffold_id = fields[0]
            bin_id = fields[1].strip()

            gc_dist = abs(float(fields[gc_col]) - float(fields[genome_gc_col]))
            td_dist = float(fields[td_col])
            cov_dist = abs(float(fields[cov_col]) - float(fields[genome_cov_col]))

            scaffold_stats[scaffold_id][bin_id] = [gc_dist, td_dist, cov_dist]

    # a scaffold is compatible only when the current bin is simultaneously
    # the closest bin in GC, tetranucleotide, and coverage space
    compatible_scaffolds = set()
    for scaffold_id, bin_stats in scaffold_stats.items():
        best_gc = [1e9, None]
        best_td = [1e9, None]
        best_cov = [1e9, None]
        for bin_id, (gc, td, cov) in bin_stats.items():
            if gc < best_gc[0]:
                best_gc = [gc, bin_id]
            if td < best_td[0]:
                best_td = [td, bin_id]
            if cov < best_cov[0]:
                best_cov = [cov, bin_id]

        if best_gc[1] == best_td[1] == best_cov[1] == cur_bin_id:
            compatible_scaffolds.add(scaffold_id)

    self.logger.info('Identified {:,} compatible scaffolds.'.format(len(compatible_scaffolds)))

    # add compatible sequences to genome
    added_seqs = 0
    genome_seqs = seq_io.read(genome_file)
    for seq_id, seq in seq_io.read_seq(scaffold_file):
        if seq_id in compatible_scaffolds and len(seq) >= min_len:
            genome_seqs[seq_id] = seq
            added_seqs += 1

    self.logger.info('Added {:,} scaffolds meeting length criterion.'.format(added_seqs))

    # save modified bin
    seq_io.write_fasta(genome_seqs, out_genome)
def add_compatible_closest(self, scaffold_file, genome_file, compatible_file, out_genome):
    """Add sequences specified as compatible.

    A sequence is added to a bin if and only if it is closest
    to that bin in GC, tetranucleotide, and coverage space.

    Parameters
    ----------
    scaffold_file : str
        Fasta file containing scaffolds to add.
    genome_file : str
        Fasta file of binned scaffolds.
    compatible_file : str
        File specifying compatible scaffolds.
    out_genome : str
        Name of output genome.
    """

    cur_bin_id = remove_extension(genome_file)

    # determine statistics for each potentially compatible scaffold
    scaffold_ids = defaultdict(dict)
    with open(compatible_file) as f:
        headers = [x.strip() for x in f.readline().split('\t')]
        scaffold_gc_index = headers.index('Scaffold GC')
        genome_gc_index = headers.index('Mean genome GC')
        td_dist_index = headers.index('Scaffold TD')
        scaffold_cov_index = headers.index('Mean scaffold coverage')
        genome_cov_index = headers.index('Mean genome coverage')

        for line in f:
            line_split = line.split('\t')
            scaffold_id = line_split[0]
            bin_id = line_split[1].strip()

            scaffold_gc = float(line_split[scaffold_gc_index])
            genome_gc = float(line_split[genome_gc_index])
            gc_dist = abs(scaffold_gc - genome_gc)

            td_dist = float(line_split[td_dist_index])

            scaffold_cov = float(line_split[scaffold_cov_index])
            genome_cov = float(line_split[genome_cov_index])
            cov_dist = abs(scaffold_cov - genome_cov)

            scaffold_ids[scaffold_id][bin_id] = [gc_dist, td_dist, cov_dist]

    # Determine scaffolds that are closest to a single bin
    # in terms of GC, tetranucleotide distance, and coverage.
    # iteritems() is Python 2 only; items() works under both versions.
    compatible_scaffolds = set()
    for scaffold_id, bin_stats in scaffold_ids.items():
        best_gc = [1e9, None]
        best_td = [1e9, None]
        best_cov = [1e9, None]
        for bin_id, stats in bin_stats.items():
            gc, td, cov = stats
            if gc < best_gc[0]:
                best_gc = [gc, bin_id]
            if td < best_td[0]:
                best_td = [td, bin_id]
            if cov < best_cov[0]:
                best_cov = [cov, bin_id]

        # check if scaffold is closest to a single bin
        if (best_gc[1] == best_td[1] == best_cov[1]) and best_gc[1] == cur_bin_id:
            compatible_scaffolds.add(scaffold_id)

    # add compatible sequences to genome
    genome_seqs = seq_io.read(genome_file)
    for seq_id, seq in seq_io.read_seq(scaffold_file):
        if seq_id in compatible_scaffolds:
            genome_seqs[seq_id] = seq

    # save modified bin
    seq_io.write_fasta(genome_seqs, out_genome)
def run(self, homolog_file, min_per_taxa, consensus, min_per_bp, use_trimAl, msa_program, output_dir):
    """Create multiple sequence alignment.

    Parameters
    ----------
    homolog_file : str
        File containing sequences to align.
    min_per_taxa : float
        Minimum percentage of taxa required to retain a column.
    consensus : float
        Minimum percentage of the same amino acid required to retain column.
    min_per_bp : float
        Minimum percentage of base pairs required to keep trimmed sequence.
    use_trimAl : boolean
        Filter columns using trimAl.
    msa_program : str
        Program to use for multiple sequence alignment ['mafft', 'muscle'].
    output_dir : str
        Directory to store results.

    Returns
    -------
    str
        Path to the trimmed multiple sequence alignment.
    """

    # infer multiple sequence alignment
    self.logger.info('Inferring multiple sequence alignment with %s.' % msa_program)

    output_file = ntpath.basename(homolog_file)
    prefix = output_file[0:output_file.rfind('.')]
    suffix = output_file[output_file.rfind('.') + 1:]

    msa_output = os.path.join(output_dir, prefix + '.aligned.' + suffix)
    if msa_program == 'mafft':
        mafft = Mafft(self.cpus)
        msa_log = os.path.join(output_dir, 'mafft.log')
        mafft.run(homolog_file, msa_output, msa_log)
    elif msa_program == 'muscle':
        muscle = Muscle()
        msa_log = os.path.join(output_dir, 'muscle.log')
        muscle.run(homolog_file, msa_output, msa_log)

    # trim multiple sequence alignment
    trimmed_msa_output = os.path.join(output_dir, prefix + '.trimmed.aligned.' + suffix)
    if use_trimAl:
        self.logger.info('Using trimAl to filter poorly represented columns from alignment.')

        # convert MSA to relaxed phylip format
        phylip_msa_output = msa_output.replace('.faa', '.phyx')
        cmd = 'seqmagick convert %s %s' % (msa_output, phylip_msa_output)
        os.system(cmd)

        tmp_output = os.path.join(output_dir, 'tmp.faa')
        cmd = 'trimal -in %s -out %s -automated1 -fasta' % (phylip_msa_output, tmp_output)
        os.system(cmd)

        cmd = 'trimal -in %s -out %s -resoverlap 0.75 -seqoverlap %f' % (tmp_output, trimmed_msa_output, min_per_bp)
        os.system(cmd)

        seqs = seq_io.read_fasta(msa_output)
        tmp_seqs = seq_io.read_fasta(tmp_output)
        trimmed_seqs = seq_io.read_fasta(trimmed_msa_output)

        # dict.values() is not indexable under Python 3; next(iter(...))
        # fetches an arbitrary alignment to report its length
        self.logger.info('Trimmed alignment from %d to %d AA.' % (len(next(iter(seqs.values()))), len(next(iter(trimmed_seqs.values())))))
        self.logger.info('%d of %d taxa were deemed to be too short and removed.' % (len(tmp_seqs) - len(trimmed_seqs), len(seqs)))
        os.remove(tmp_output)
    else:
        self.logger.info('Trimming poorly represented columns from alignment.')

        seqs = seq_io.read_fasta(msa_output, keep_annotation=True)
        trimmed_seqs, pruned_seqs, min_taxa_filtered, consensus_filtered = seq_tk.trim_seqs(seqs,
                                                                                            min_per_taxa / 100.0,
                                                                                            consensus / 100.0,
                                                                                            min_per_bp / 100.0)

        # dict.values() is not indexable under Python 3; next(iter(...))
        # fetches an arbitrary alignment to report its length
        self.logger.info('Trimmed alignment from %d to %d AA (%d by minimum taxa percent, %d by consensus).' % (len(next(iter(seqs.values()))), len(next(iter(trimmed_seqs.values()))), min_taxa_filtered, consensus_filtered))
        self.logger.info('%d of %d taxa were deemed to be too short and removed.' % (len(pruned_seqs), len(seqs)))

        if len(pruned_seqs) > 0:
            prune_seqs_out = os.path.join(output_dir, 'filtered_seqs.too_short.txt')
            self.logger.info('Pruned sequences written to %s.' % prune_seqs_out)
            seq_io.write_fasta(pruned_seqs, prune_seqs_out)

        if len(pruned_seqs) == len(seqs):
            self.logger.error('Too many sequences were pruned. Gene tree cannot be inferred.')
            sys.exit()

    seq_io.write_fasta(trimmed_seqs, trimmed_msa_output)

    return trimmed_msa_output