def generate_lane_mask(infile, entropy_threshold, existing_mask=None):
    """Build a lane mask string from the per-position entropies of an alignment.

    infile: open file object for aligned fasta file
    entropy_threshold: float value that designates the fraction of entropic
     positions to be removed, i.e., 0.10 means the 10% most entropic
     positions are removed.
    existing_mask: accepted for interface compatibility; this function does
     not read it.

    Returns a string with one character per entropy value: "0" where the
     entropy is at or above the cutoff (and is not the minimum entropy),
     "1" everywhere else.
    """
    alignment = Alignment.from_fasta_records(parse_fasta(infile), DNA)
    entropies = alignment.position_entropies(nan_on_non_standard_chars=False)

    # Pick the entropy value sitting at the requested quantile.
    ranked = sorted(entropies)
    cutoff_index = int(round((len(ranked) - 1) * (1 - entropy_threshold)))
    entropy_cutoff = ranked[cutoff_index]

    # This correction is for small datasets with a small possible number of
    # uncertainty values: positions holding the lowest entropy are never
    # masked, even when the cutoff coincides with that value.
    lowest_entropy = min(ranked)

    return "".join(
        "0" if (value >= entropy_cutoff and value != lowest_entropy) else "1"
        for value in entropies)
def setUp(self):
    """Create the temp files, aligner and expected alignment for the tests."""

    def _temp_file(suffix, text=""):
        # Create a temp file, release the OS-level descriptor, and (re)write
        # it with `text`; an empty `text` just touches the file so it can be
        # reliably cleaned up later.
        fd, path = mkstemp(prefix="InfernalAlignerTests_", suffix=suffix)
        close(fd)
        with open(path, "w") as handle:
            handle.write(text)
        return path

    # input sequences and Stockholm template
    self.infernal_test1_input_fp = _temp_file(
        ".fasta", "\n".join(infernal_test1_input_fasta))
    self.infernal_test1_template_fp = _temp_file(
        "template.sto", infernal_test1_template_stockholm)

    # output locations (created empty)
    self.result_fp = _temp_file(".fasta")
    self.log_fp = _temp_file(".log")

    self._paths_to_clean_up = [
        self.infernal_test1_input_fp,
        self.result_fp,
        self.log_fp,
        self.infernal_test1_template_fp,
    ]

    aligner_params = {"template_filepath": self.infernal_test1_template_fp}
    self.infernal_test1_aligner = InfernalAligner(aligner_params)

    expected_records = parse_fasta(infernal_test1_expected_alignment)
    self.infernal_test1_expected_aln = Alignment.from_fasta_records(
        expected_records, DNA)
def remove_outliers(seqs, num_stds, fraction_seqs_for_stats=.95):
    """Drop aligned sequences that are far from the majority consensus.

    Given aligned sequences, this will:
     1. calculate a majority consensus (most common symbol at each position
        of the alignment);
     2. compute the mean/std distance of each seq to the consensus;
     3. discard sequences whose distance is greater than the cutoff, which
        is defined as being `num_stds` standard deviations above the mean.

    seqs: fasta-formatted aligned sequences, as accepted by parse_fasta
    num_stds: number of standard deviations above the mean distance that a
     sequence may be from the consensus and still be kept
    fraction_seqs_for_stats: accepted for interface compatibility; this
     function does not read it.

    Returns the alignment restricted to the retained sequences.
    """
    # load the alignment and derive its consensus sequence
    alignment = Alignment.from_fasta_records(parse_fasta(seqs), DNA)
    consensus = alignment.majority_consensus()

    # distance of every sequence in the alignment to the consensus
    distances = [seq.distance(consensus) for seq in alignment]

    # cutoff = mean + num_stds * standard deviation
    cutoff = mean(distances) + num_stds * std(distances)

    # keep the identifiers of sequences whose distance to the consensus is
    # less than or equal to the cutoff
    retained_ids = [seq_id
                    for seq_id, distance in izip(alignment.ids(), distances)
                    if distance <= cutoff]

    return alignment.subalignment(seqs_to_keep=retained_ids)
def getResult(self, aln_path, *args, **kwargs):
    """Build a tree from the fasta alignment at aln_path and return it.

    aln_path: path to a fasta-formatted alignment file
    kwargs: may contain 'root_method'. When it equals 'midpoint' the tree
     is passed through root_midpt before being returned; any other value
     (including 'tree_method_default') or its absence leaves the tree
     exactly as the tree-building module produced it.

    Currently does not allow parameter tuning of program and uses
    default parameters -- this is bad and should be fixed.

    #TODO: allow command-line access to important aln params.
    """
    module = self.Params['Module']

    # standard qiime says we just consider the first word as the unique ID
    # the rest of the defline of the fasta alignment often doesn't match
    # the otu names in the otu table
    with open(aln_path) as aln_f:
        seqs = Alignment.from_fasta_records(
            parse_fasta(aln_f, label_to_name=lambda x: x.split()[0]), DNA)

    # This ugly little line of code lets us pass a skbio Alignment when a
    # a cogent alignment is expected.
    seqs.getIntMap = seqs.int_map

    result = module.build_tree_from_alignment(seqs, moltype=DNA_cogent)

    # Look the option up with .get() instead of wrapping the whole branch
    # in try/except KeyError: that way only a *missing* option is tolerated,
    # and a KeyError raised while rooting is no longer silently discarded.
    if kwargs.get('root_method') == 'midpoint':
        result = root_midpt(result)

    return result
def getResult(self, aln_path, *args, **kwargs):
    """Build a tree from the fasta alignment at aln_path and return it.

    aln_path: path to a fasta-formatted alignment file
    kwargs: may contain 'root_method'. When it equals 'midpoint' the tree
     is passed through root_midpt before being returned; any other value
     (including 'tree_method_default') or its absence leaves the tree
     exactly as the tree-building module produced it.

    Currently does not allow parameter tuning of program and uses
    default parameters -- this is bad and should be fixed.

    #TODO: allow command-line access to important aln params.
    """
    module = self.Params['Module']

    # standard qiime says we just consider the first word as the unique ID
    # the rest of the defline of the fasta alignment often doesn't match
    # the otu names in the otu table
    with open(aln_path) as aln_f:
        fasta_records = parse_fasta(
            aln_f, label_to_name=lambda x: x.split()[0])
        seqs = Alignment.from_fasta_records(fasta_records, DNA)

    # This ugly little line of code lets us pass a skbio Alignment when a
    # a cogent alignment is expected.
    seqs.getIntMap = seqs.int_map

    result = module.build_tree_from_alignment(seqs, moltype=DNA_cogent)

    # Only a missing 'root_method' should be tolerated. Using .get() keeps
    # the rooting call outside any exception handler, so a KeyError raised
    # by root_midpt itself propagates instead of being swallowed.
    root_method = kwargs.get('root_method')
    if root_method == 'midpoint':
        result = root_midpt(result)

    return result
def remove_outliers(seqs, num_stds, fraction_seqs_for_stats=.95):
    """Remove sequences very different from the majority consensus.

    Given aligned sequences, this will:
     1. calculate a majority consensus (most common symbol at each position
        of the alignment);
     2. compute the mean/std distance of each seq to the consensus;
     3. discard sequences whose distance is greater than the cutoff, which
        is defined as being `num_stds` standard deviations above the mean.

    seqs: fasta-formatted aligned sequences, as accepted by parse_fasta
    num_stds: multiplier applied to the standard deviation when computing
     the distance cutoff
    fraction_seqs_for_stats: not read by this function; kept so existing
     callers continue to work.

    Returns the filtered alignment.
    """
    aln = Alignment.from_fasta_records(parse_fasta(seqs), DNA)
    consensus_seq = aln.majority_consensus()

    # per-sequence distance to the consensus, in alignment order
    per_seq_dist = [member.distance(consensus_seq) for member in aln]

    # a sequence is kept when its distance is less than or equal to
    # mean + num_stds * std
    threshold = mean(per_seq_dist) + num_stds * std(per_seq_dist)

    kept = []
    for identifier, dist in izip(aln.ids(), per_seq_dist):
        if dist > threshold:
            continue
        kept.append(identifier)

    # filter the alignment down to the identifiers collected above
    return aln.subalignment(seqs_to_keep=kept)
def generate_lane_mask(infile, entropy_threshold, existing_mask=None):
    """Generate a lane mask dynamically from position entropies.

    infile: open file object for aligned fasta file
    entropy_threshold: float value that designates the fraction of entropic
     positions to be removed, i.e., 0.10 means the 10% most entropic
     positions are removed.
    existing_mask: not read by this function; kept so existing callers
     continue to work.

    Returns the mask as a string of "0"/"1" characters, one per entropy
     value, where "0" marks a value at or above the cutoff that is not the
     minimum entropy.
    """
    aln = Alignment.from_fasta_records(parse_fasta(infile), DNA)
    per_position = aln.position_entropies(nan_on_non_standard_chars=False)

    ordered = sorted(per_position)
    last_index = len(ordered) - 1
    cutoff_index = int(round(last_index * (1 - entropy_threshold)))
    cutoff_value = ordered[cutoff_index]

    # This correction is for small datasets with a small possible number of
    # uncertainty values: the minimum entropy is always kept.
    minimum_value = min(ordered)

    flags = []
    for entropy in per_position:
        masked = entropy >= cutoff_value and entropy != minimum_value
        flags.append("0" if masked else "1")
    return "".join(flags)
def __call__(self, seq_path, result_path=None, log_path=None,
             failure_path=None):
    """Align the sequences in seq_path against the template with pynast_seqs.

    seq_path: path to the fasta file of candidate sequences
    result_path: if not None, the aligned sequences are written there as
     fasta and None is returned
    log_path: passed to NastLogger
    failure_path: if not None, the sequences that failed to align are
     written there as fasta

    Reads 'template_filepath', 'pairwise_alignment_method', 'min_pct' and
    'min_len' from self.Params.

    Returns the Alignment of aligned sequences when result_path is None,
    otherwise None.
    """
    # The candidate parser is handed to pynast_seqs unconsumed, so the
    # candidate file must stay open until pynast_seqs has returned.
    with open(seq_path, 'U') as seq_file:
        candidate_sequences = parse_fasta(seq_file)

        # load template sequences, replacing '.' characters with '-'
        # characters and upper-casing them
        template_alignment_fp = self.Params['template_filepath']
        with open(template_alignment_fp) as template_file:
            template_records = [
                (seq_id, seq.replace('.', '-').upper())
                for seq_id, seq in parse_fasta(template_file)]
        template_alignment = Alignment.from_fasta_records(
            template_records, DNASequence, validate=True)

        # initialize_logger
        logger = NastLogger(log_path)

        # get function for pairwise alignment method
        pairwise_alignment_f = pairwise_alignment_methods[
            self.Params['pairwise_alignment_method']]

        pynast_aligned, pynast_failed = pynast_seqs(
            candidate_sequences,
            template_alignment,
            min_pct=self.Params['min_pct'],
            min_len=self.Params['min_len'],
            align_unaligned_seqs_f=pairwise_alignment_f,
            logger=logger,
            temp_dir=get_qiime_temp_dir())

    logger.record(str(self))

    # convert the pynast results into skbio sequence objects
    pynast_failed = SequenceCollection(
        [DNASequence(str(seq), id=seq.Name) for seq in pynast_failed])
    pynast_aligned = Alignment(
        [DNASequence(str(seq), id=seq.Name) for seq in pynast_aligned])

    if failure_path is not None:
        with open(failure_path, 'w') as fail_file:
            fail_file.write(pynast_failed.to_fasta())

    if result_path is not None:
        with open(result_path, 'w') as result_file:
            result_file.write(pynast_aligned.to_fasta())
        return None

    return pynast_aligned
def __call__(self, seq_path, result_path=None, log_path=None,
             failure_path=None):
    """Align the sequences in seq_path against the template with pynast_seqs.

    seq_path: path to the fasta file of candidate sequences
    result_path: if not None, the aligned sequences are written there as
     fasta and None is returned
    log_path: passed to NastLogger
    failure_path: if not None, the sequences that failed to align are
     written there as fasta

    Reads 'template_filepath', 'pairwise_alignment_method', 'min_pct' and
    'min_len' from self.Params.

    Returns the Alignment of aligned sequences when result_path is None,
    otherwise None.
    """
    # The candidate parser is handed to pynast_seqs unconsumed, so the
    # candidate file must stay open until pynast_seqs has returned.
    with open(seq_path, 'U') as seq_file:
        candidate_sequences = parse_fasta(seq_file)

        # load template sequences, replacing '.' characters with '-'
        # characters and upper-casing them
        template_alignment_fp = self.Params['template_filepath']
        with open(template_alignment_fp) as template_file:
            template_records = [
                (seq_id, seq.replace('.', '-').upper())
                for seq_id, seq in parse_fasta(template_file)]
        template_alignment = Alignment.from_fasta_records(
            template_records, DNASequence, validate=True)

        # initialize_logger
        logger = NastLogger(log_path)

        # get function for pairwise alignment method
        pairwise_alignment_f = pairwise_alignment_methods[
            self.Params['pairwise_alignment_method']]

        pynast_aligned, pynast_failed = pynast_seqs(
            candidate_sequences,
            template_alignment,
            min_pct=self.Params['min_pct'],
            min_len=self.Params['min_len'],
            align_unaligned_seqs_f=pairwise_alignment_f,
            logger=logger,
            temp_dir=get_qiime_temp_dir())

    logger.record(str(self))

    # convert the pynast results into skbio sequence objects
    pynast_failed = SequenceCollection(
        [DNASequence(str(seq), identifier=seq.Name)
         for seq in pynast_failed])
    pynast_aligned = Alignment(
        [DNASequence(str(seq), identifier=seq.Name)
         for seq in pynast_aligned])

    if failure_path is not None:
        with open(failure_path, 'w') as fail_file:
            fail_file.write(pynast_failed.to_fasta())

    if result_path is not None:
        with open(result_path, 'w') as result_file:
            result_file.write(pynast_aligned.to_fasta())
        return None

    return pynast_aligned
def setUp(self):
    """Create the temp files, aligner and expected results for the tests."""

    def _temp_file(suffix, text=""):
        # Create a temp file, release the OS-level descriptor, and (re)write
        # it with `text`; an empty `text` just touches the file so it can be
        # reliably cleaned up later.
        fd, path = mkstemp(prefix="PyNastAlignerTests_", suffix=suffix)
        close(fd)
        with open(path, "w") as handle:
            handle.write(text)
        return path

    # candidate sequences
    self.pynast_test1_input_fp = _temp_file(
        ".fasta", pynast_test1_input_fasta)

    # the template, plus variants with '.' gaps, U instead of T, and
    # lower-case characters
    self.pynast_test1_template_fp = _temp_file(
        "template.fasta", pynast_test1_template_fasta)
    self.pynast_test_template_w_dots_fp = _temp_file(
        "template.fasta", pynast_test1_template_fasta.replace("-", "."))
    self.pynast_test_template_w_u_fp = _temp_file(
        "template.fasta", pynast_test1_template_fasta.replace("T", "U"))
    self.pynast_test_template_w_lower_fp = _temp_file(
        "template.fasta", pynast_test1_template_fasta.lower())

    # output locations (created empty)
    self.result_fp = _temp_file(".fasta")
    self.failure_fp = _temp_file(".fasta")
    self.log_fp = _temp_file(".log")

    self._paths_to_clean_up = [
        self.pynast_test1_input_fp,
        self.result_fp,
        self.failure_fp,
        self.log_fp,
        self.pynast_test1_template_fp,
        self.pynast_test_template_w_dots_fp,
        self.pynast_test_template_w_u_fp,
        self.pynast_test_template_w_lower_fp,
    ]

    aligner_params = {
        "template_filepath": self.pynast_test1_template_fp,
        "min_len": 15,
    }
    self.pynast_test1_aligner = PyNastAligner(aligner_params)

    expected_aln_records = parse_fasta(pynast_test1_expected_alignment)
    self.pynast_test1_expected_aln = Alignment.from_fasta_records(
        expected_aln_records, DNA)

    expected_fail_records = parse_fasta(pynast_test1_expected_failure)
    self.pynast_test1_expected_fail = SequenceCollection.from_fasta_records(
        expected_fail_records, DNA)
def test_call_infernal_test1_file_output(self):
    """InfernalAligner writes correct output files for infernal_test1 seqs
    """
    # With a result path supplied the aligner is expected to return None
    # and leave the alignment on disk, so the file is what gets checked.
    aligner = self.infernal_test1_aligner
    returned = aligner(self.infernal_test1_input_fp,
                       result_path=self.result_fp,
                       log_path=self.log_fp)
    self.assertTrue(returned is None,
                    "Result should be None when result path provided.")

    # Parse the written fasta back into an Alignment and compare.
    with open(self.result_fp) as written:
        written_records = parse_fasta(written)
        observed = Alignment.from_fasta_records(written_records, DNA)
    self.assertEqual(observed, self.infernal_test1_expected_aln)
def test_call_infernal_test1_file_output(self):
    """InfernalAligner writes correct output files for infernal_test1 seqs
    """
    # Results are not collected from the return value; the output file is
    # inspected instead.
    call_kwargs = {'result_path': self.result_fp, 'log_path': self.log_fp}
    outcome = self.infernal_test1_aligner(
        self.infernal_test1_input_fp, **call_kwargs)
    self.assertTrue(outcome is None,
                    "Result should be None when result path provided.")

    reference_aln = self.infernal_test1_expected_aln
    with open(self.result_fp) as handle:
        from_disk = Alignment.from_fasta_records(parse_fasta(handle), DNA)
    self.assertEqual(from_disk, reference_aln)
def test_call_pynast_test1_file_output(self):
    """PyNastAligner writes correct output files for pynast_test1 seqs
    """
    # With a result path supplied the aligner is expected to return None
    # and leave its results on disk, so the files are what get checked.
    aligner = self.pynast_test1_aligner
    returned = aligner(self.pynast_test1_input_fp,
                       result_path=self.result_fp,
                       log_path=self.log_fp,
                       failure_path=self.failure_fp)
    self.assertTrue(returned is None,
                    "Result should be None when result path provided.")

    # aligned sequences
    with open(self.result_fp) as written:
        written_records = parse_fasta(written)
        observed_aln = Alignment.from_fasta_records(written_records, DNA)
    self.assertEqual(observed_aln, self.pynast_test1_expected_aln)

    # sequences that failed to align, compared via their fasta rendering
    with open(self.failure_fp) as written:
        failed_records = parse_fasta(written)
        observed_fail = SequenceCollection.from_fasta_records(
            failed_records, DNA)
    self.assertEqual(observed_fail.to_fasta(),
                     self.pynast_test1_expected_fail.to_fasta())
def test_call_pynast_test1_file_output(self):
    """PyNastAligner writes correct output files for pynast_test1 seqs
    """
    # Results are not collected from the return value; the output files
    # are inspected instead.
    call_kwargs = {
        'result_path': self.result_fp,
        'log_path': self.log_fp,
        'failure_path': self.failure_fp,
    }
    outcome = self.pynast_test1_aligner(
        self.pynast_test1_input_fp, **call_kwargs)
    self.assertTrue(outcome is None,
                    "Result should be None when result path provided.")

    reference_aln = self.pynast_test1_expected_aln
    with open(self.result_fp) as handle:
        aln_from_disk = Alignment.from_fasta_records(
            parse_fasta(handle), DNA)
    self.assertEqual(aln_from_disk, reference_aln)

    # failures are compared through their fasta rendering
    with open(self.failure_fp) as handle:
        fail_from_disk = SequenceCollection.from_fasta_records(
            parse_fasta(handle), DNA)
    reference_fail_fasta = self.pynast_test1_expected_fail.to_fasta()
    self.assertEqual(fail_from_disk.to_fasta(), reference_fail_fasta)
def setUp(self):
    """Prepare input/template/output temp files and the aligner under test."""
    prefix = 'InfernalAlignerTests_'

    def _write_temp(suffix, payload=''):
        # mkstemp hands back an open descriptor; close it and write the
        # payload through a regular file object instead. An empty payload
        # simply touches the file so that it can be reliably cleaned up.
        descriptor, temp_path = mkstemp(prefix=prefix, suffix=suffix)
        close(descriptor)
        with open(temp_path, 'w') as out_f:
            out_f.write(payload)
        return temp_path

    self.infernal_test1_input_fp = _write_temp(
        '.fasta', '\n'.join(infernal_test1_input_fasta))
    self.infernal_test1_template_fp = _write_temp(
        'template.sto', infernal_test1_template_stockholm)

    # touched output files
    self.result_fp = _write_temp('.fasta')
    self.log_fp = _write_temp('.log')

    self._paths_to_clean_up = [
        self.infernal_test1_input_fp,
        self.result_fp,
        self.log_fp,
        self.infernal_test1_template_fp,
    ]

    self.infernal_test1_aligner = InfernalAligner(
        {'template_filepath': self.infernal_test1_template_fp})

    self.infernal_test1_expected_aln = Alignment.from_fasta_records(
        parse_fasta(infernal_test1_expected_alignment), DNA)
def __call__(self, seq_path, result_path=None, log_path=None,
             failure_path=None, cmbuild_params=None, cmalign_params=None):
    """Align the sequences in seq_path to the template with cmalign.

    seq_path: path to the fasta file of candidate sequences
    result_path: if not None, the aligned sequences are written there as
     fasta and None is returned
    log_path: if not None, the cmbuild/cmalign parameters used are written
     there
    failure_path: accepted for interface compatibility; this method does
     not read it
    cmbuild_params: optional dict of cmbuild parameters; '--gapthresh' is
     always set to 1.0 on a copy of it
    cmalign_params: optional dict of cmalign parameters; '--sub' and
     '--gapthresh' are always set on a copy of it

    Reads 'template_filepath' and 'moltype' from self.Params.

    Returns the Alignment of the candidate sequences when result_path is
    None, otherwise None.

    Raises ValueError when parsing the template raises RecordError.
    """
    log_params = []

    # load candidate sequences (fully read, so the file can be closed)
    with open(seq_path, 'U') as seq_file:
        candidate_sequences = dict(parse_fasta(seq_file))

    # load template sequences
    try:
        with open(self.Params['template_filepath'], 'U') as template_file:
            info, template_alignment, struct = list(MinimalRfamParser(
                template_file, seq_constructor=ChangedSequence))[0]
    except RecordError:
        raise ValueError(
            "Template alignment must be in Stockholm format with "
            "corresponding secondary structure annotation when using "
            "InfernalAligner.")

    moltype = self.Params['moltype']

    # Need to make separate mapping for unaligned sequences
    unaligned = SequenceCollection.from_fasta_records(
        candidate_sequences.iteritems(), DNASequence)
    mapped_seqs, new_to_old_ids = unaligned.int_map(prefix='unaligned_')
    mapped_seq_tuples = [(k, str(v)) for k, v in mapped_seqs.iteritems()]

    # Turn on --gapthresh option in cmbuild to force alignment to full
    # model. Work on a copy so the caller's dict is not modified.
    cmbuild_params = {} if cmbuild_params is None else dict(cmbuild_params)
    cmbuild_params['--gapthresh'] = 1.0

    # record cmbuild parameters
    log_params.append('cmbuild parameters:')
    log_params.append(str(cmbuild_params))

    # Turn on --sub option in Infernal, since we know the unaligned
    # sequences are fragments.
    # Also turn on --gapthresh to use same gapthresh as was used to build
    # model. Again a copy, so the caller's dict is not modified.
    cmalign_params = {} if cmalign_params is None else dict(cmalign_params)
    cmalign_params.update({'--sub': True, '--gapthresh': 1.0})

    # record cmalign parameters
    log_params.append('cmalign parameters:')
    log_params.append(str(cmalign_params))

    # Align sequences to alignment including alignment gaps.
    aligned, struct_string = cmalign_from_alignment(
        aln=template_alignment,
        structure_string=struct,
        seqs=mapped_seq_tuples,
        moltype=moltype,
        include_aln=True,
        params=cmalign_params,
        cmbuild_params=cmbuild_params)

    # Pull out original sequences from full alignment.
    # Get a dict of the identifiers to sequences (note that this is a
    # cogent alignment object, hence the call to NamedSeqs)
    aligned_dict = aligned.NamedSeqs
    infernal_aligned = [(old_id, aligned_dict[new_id])
                        for new_id, old_id in new_to_old_ids.iteritems()]

    # Create an Alignment object from the (id, sequence) pairs
    infernal_aligned = Alignment.from_fasta_records(
        infernal_aligned, DNASequence)

    if log_path is not None:
        with open(log_path, 'w') as log_file:
            log_file.write('\n'.join(log_params))

    if result_path is not None:
        with open(result_path, 'w') as result_file:
            result_file.write(infernal_aligned.to_fasta())
        return None

    return infernal_aligned
def setUp(self):
    """Prepare input/template/output temp files and the aligner under test."""
    prefix = 'PyNastAlignerTests_'

    def _write_temp(suffix, payload=''):
        # mkstemp hands back an open descriptor; close it and write the
        # payload through a regular file object instead. An empty payload
        # simply touches the file so that it can be reliably cleaned up.
        descriptor, temp_path = mkstemp(prefix=prefix, suffix=suffix)
        close(descriptor)
        with open(temp_path, 'w') as out_f:
            out_f.write(payload)
        return temp_path

    template_fasta = pynast_test1_template_fasta

    self.pynast_test1_input_fp = _write_temp(
        '.fasta', pynast_test1_input_fasta)
    self.pynast_test1_template_fp = _write_temp(
        'template.fasta', template_fasta)
    # template variants: '.' gaps, U in place of T, lower-case
    self.pynast_test_template_w_dots_fp = _write_temp(
        'template.fasta', template_fasta.replace('-', '.'))
    self.pynast_test_template_w_u_fp = _write_temp(
        'template.fasta', template_fasta.replace('T', 'U'))
    self.pynast_test_template_w_lower_fp = _write_temp(
        'template.fasta', template_fasta.lower())

    # touched output files
    self.result_fp = _write_temp('.fasta')
    self.failure_fp = _write_temp('.fasta')
    self.log_fp = _write_temp('.log')

    self._paths_to_clean_up = [
        self.pynast_test1_input_fp,
        self.result_fp,
        self.failure_fp,
        self.log_fp,
        self.pynast_test1_template_fp,
        self.pynast_test_template_w_dots_fp,
        self.pynast_test_template_w_u_fp,
        self.pynast_test_template_w_lower_fp,
    ]

    self.pynast_test1_aligner = PyNastAligner(
        {'template_filepath': self.pynast_test1_template_fp, 'min_len': 15})

    self.pynast_test1_expected_aln = Alignment.from_fasta_records(
        parse_fasta(pynast_test1_expected_alignment), DNA)
    self.pynast_test1_expected_fail = \
        SequenceCollection.from_fasta_records(
            parse_fasta(pynast_test1_expected_failure), DNA)