def generate_lane_mask(infile, entropy_threshold, existing_mask=None):
    """ Generates lane mask dynamically from per-position entropies

    infile: open file object (or other iterable of lines) for an aligned
     fasta file. An already-parsed Alignment is also accepted and is used
     as-is, without being re-parsed.
    entropy_threshold: float value that designates the percentage of
     entropic positions to be removed, i.e., 0.10 means the 10% most
     entropic positions are removed.
    existing_mask: accepted for interface compatibility; it is not used
     by this function.

    Returns the mask as a string with one character per alignment
    position: "0" for a position whose entropy is at or above the cutoff
    (unless that entropy equals the lowest entropy observed), "1"
    otherwise.
    """
    # Callers may hand over either raw fasta lines or an Alignment they
    # have already built (and possibly gap-filtered).
    if isinstance(infile, Alignment):
        aln = infile
    else:
        aln = Alignment.from_fasta_records(parse_fasta(infile), DNA)
    uncertainty = aln.position_entropies(nan_on_non_standard_chars=False)

    uncertainty_sorted = sorted(uncertainty)
    cutoff_index = int(
        round((len(uncertainty_sorted) - 1) * (1 - entropy_threshold)))
    max_uncertainty = uncertainty_sorted[cutoff_index]

    # This correction is for small datasets with a small possible number of
    # uncertainty values: positions tied with the lowest entropy are never
    # masked out, even when the cutoff lands on that value.
    highest_certainty = min(uncertainty_sorted)

    return "".join(
        "0" if (entropy >= max_uncertainty and
                entropy != highest_certainty) else "1"
        for entropy in uncertainty)
def getResult(self, aln_path, *args, **kwargs):
    """Returns a tree built from the alignment stored at aln_path.

    Currently does not allow parameter tuning of program and uses
    default parameters -- this is bad and should be fixed.

    #TODO: allow command-line access to important aln params.

    aln_path: path to the aligned fasta file
    kwargs: only 'root_method' is consulted. When it equals 'midpoint'
     the tree is passed through root_midpt; any other value (including
     'tree_method_default') or a missing key leaves the tree as returned
     by the tree-building module.
    """
    module = self.Params['Module']
    # standard qiime says we just consider the first word as the unique ID
    # the rest of the defline of the fasta alignment often doesn't match
    # the otu names in the otu table
    with open(aln_path) as aln_f:
        seqs = Alignment.from_fasta_records(
            parse_fasta(aln_f, label_to_name=lambda x: x.split()[0]), DNA)
    # This ugly little line of code lets us pass a skbio Alignment when a
    # a cogent alignment is expected.
    seqs.getIntMap = seqs.int_map
    result = module.build_tree_from_alignment(seqs)

    # Look the option up without a try/except so that a KeyError raised
    # while re-rooting is not mistaken for "no root_method supplied".
    if kwargs.get('root_method') == 'midpoint':
        result = root_midpt(result)
    return result
def remove_outliers(seqs, num_stds, fraction_seqs_for_stats=.95):
    """ drop aligned sequences that sit far from the majority consensus

    seqs: aligned fasta input, handed to parse_fasta
    num_stds: number of standard deviations above the mean distance at
     which the cutoff is placed
    fraction_seqs_for_stats: part of the signature; not referenced in
     this function body

    Steps:
     1. build the majority consensus of the alignment;
     2. measure each sequence's distance to that consensus (using the
        sequence's own distance method) and take the mean and standard
        deviation of those distances;
     3. keep only the sequences whose distance is less than or equal to
        mean + num_stds * std.

    Returns the filtered alignment.
    """
    alignment = Alignment.from_fasta_records(parse_fasta(seqs), DNA)
    consensus = alignment.majority_consensus()

    # one distance per sequence, in alignment order
    distances = [record.distance(consensus) for record in alignment]

    # cutoff sits num_stds standard deviations above the mean distance
    cutoff = mean(distances) + num_stds * std(distances)

    # ids of the sequences that are at or below the cutoff
    ids_within_cutoff = [
        seq_id
        for seq_id, distance in izip(alignment.ids(), distances)
        if distance <= cutoff]

    return alignment.subalignment(seqs_to_keep=ids_within_cutoff)
def generate_lane_mask(infile, entropy_threshold, existing_mask=None):
    """ Generates lane mask dynamically from per-position entropies

    infile: open file object (or other iterable of lines) for an aligned
     fasta file. An already-parsed Alignment is also accepted and is used
     as-is, without being re-parsed.
    entropy_threshold: float value that designates the percentage of
     entropic positions to be removed, i.e., 0.10 means the 10% most
     entropic positions are removed.
    existing_mask: accepted for interface compatibility; it is not used
     by this function.

    Returns the mask as a string with one character per alignment
    position: "0" for a position whose entropy is at or above the cutoff
    (unless that entropy equals the lowest entropy observed), "1"
    otherwise.
    """
    # Callers may hand over either raw fasta lines or an Alignment they
    # have already built (and possibly gap-filtered).
    if isinstance(infile, Alignment):
        aln = infile
    else:
        aln = Alignment.from_fasta_records(parse_fasta(infile), DNA)
    uncertainty = aln.position_entropies(nan_on_non_standard_chars=False)

    uncertainty_sorted = sorted(uncertainty)
    cutoff_index = int(
        round((len(uncertainty_sorted) - 1) * (1 - entropy_threshold)))
    max_uncertainty = uncertainty_sorted[cutoff_index]

    # This correction is for small datasets with a small possible number of
    # uncertainty values: positions tied with the lowest entropy are never
    # masked out, even when the cutoff lands on that value.
    highest_certainty = min(uncertainty_sorted)

    return "".join(
        "0" if (entropy >= max_uncertainty and
                entropy != highest_certainty) else "1"
        for entropy in uncertainty)
def __call__(self, seq_path, result_path=None, log_path=None,
             failure_path=None):
    """Align the sequences in seq_path against the configured template.

    seq_path: path to the fasta file of candidate sequences
    result_path: when provided, the aligned sequences are written there
     as fasta and None is returned
    log_path: passed to NastLogger
    failure_path: when provided, the sequences that failed to align are
     written there as fasta

    Returns the Alignment of aligned sequences when result_path is None,
    otherwise None.
    """
    # The candidate file has to stay open until pynast_seqs has returned,
    # because parse_fasta is handed the open file object.
    with open(seq_path, 'U') as seq_file:
        # load candidate sequences
        candidate_sequences = parse_fasta(seq_file)

        # load template sequences
        template_alignment_fp = self.Params['template_filepath']
        with open(template_alignment_fp) as template_file:
            # replace '.' characters with '-' characters
            template_records = [
                (seq_id, seq.replace('.', '-').upper())
                for seq_id, seq in parse_fasta(template_file)]
        template_alignment = Alignment.from_fasta_records(
            template_records, DNASequence, validate=True)

        # initialize_logger
        logger = NastLogger(log_path)

        # get function for pairwise alignment method
        pairwise_alignment_f = pairwise_alignment_methods[
            self.Params['pairwise_alignment_method']]

        pynast_aligned, pynast_failed = pynast_seqs(
            candidate_sequences,
            template_alignment,
            min_pct=self.Params['min_pct'],
            min_len=self.Params['min_len'],
            align_unaligned_seqs_f=pairwise_alignment_f,
            logger=logger,
            temp_dir=get_qiime_temp_dir())

    logger.record(str(self))

    # convert the returned sequence objects into skbio sequences
    pynast_failed = SequenceCollection(
        [DNASequence(str(seq), id=seq.Name) for seq in pynast_failed])
    pynast_aligned = Alignment(
        [DNASequence(str(seq), id=seq.Name) for seq in pynast_aligned])

    if failure_path is not None:
        with open(failure_path, 'w') as fail_file:
            fail_file.write(pynast_failed.to_fasta())

    if result_path is not None:
        with open(result_path, 'w') as result_file:
            result_file.write(pynast_aligned.to_fasta())
        return None
    return pynast_aligned
def __call__(self, seq_path, result_path=None, log_path=None,
             failure_path=None):
    """Align the sequences in seq_path against the configured template.

    seq_path: path to the fasta file of candidate sequences
    result_path: when provided, the aligned sequences are written there
     as fasta and None is returned
    log_path: passed to NastLogger
    failure_path: when provided, the sequences that failed to align are
     written there as fasta

    Returns the Alignment of aligned sequences when result_path is None,
    otherwise None.
    """
    # The candidate file has to stay open until pynast_seqs has returned,
    # because parse_fasta is handed the open file object.
    with open(seq_path, 'U') as seq_file:
        # load candidate sequences
        candidate_sequences = parse_fasta(seq_file)

        # load template sequences
        template_alignment_fp = self.Params['template_filepath']
        with open(template_alignment_fp) as template_file:
            # replace '.' characters with '-' characters
            template_records = [
                (seq_id, seq.replace('.', '-').upper())
                for seq_id, seq in parse_fasta(template_file)]
        template_alignment = Alignment.from_fasta_records(
            template_records, DNASequence, validate=True)

        # initialize_logger
        logger = NastLogger(log_path)

        # get function for pairwise alignment method
        pairwise_alignment_f = pairwise_alignment_methods[
            self.Params['pairwise_alignment_method']]

        pynast_aligned, pynast_failed = pynast_seqs(
            candidate_sequences,
            template_alignment,
            min_pct=self.Params['min_pct'],
            min_len=self.Params['min_len'],
            align_unaligned_seqs_f=pairwise_alignment_f,
            logger=logger,
            temp_dir=get_qiime_temp_dir())

    logger.record(str(self))

    # convert the returned sequence objects into skbio sequences
    pynast_failed = SequenceCollection(
        [DNASequence(str(seq), id=seq.Name) for seq in pynast_failed])
    pynast_aligned = Alignment(
        [DNASequence(str(seq), id=seq.Name) for seq in pynast_aligned])

    if failure_path is not None:
        with open(failure_path, 'w') as fail_file:
            fail_file.write(pynast_failed.to_fasta())

    if result_path is not None:
        with open(result_path, 'w') as result_file:
            result_file.write(pynast_aligned.to_fasta())
        return None
    return pynast_aligned
def apply_lane_mask_and_gap_filter(fastalines, mask,
                                   allowed_gap_frac=1. - 1e-6,
                                   entropy_threshold=None):
    """Filter alignment positions by mask/entropy and gaps; yield fasta.

    fastalines: aligned fasta input, handed to parse_fasta
    mask: pre-computed mask (e.g., Lane mask) or None
    allowed_gap_frac: gap filtering via omit_gap_positions is applied
     only when this value is below 1
    entropy_threshold: when not None, a mask is computed on the fly with
     generate_lane_mask; must lie between 0 and 1 (inclusive)

    This is a generator: nothing is parsed or validated until iteration
    starts. For every sequence of the filtered alignment it yields a
    ">id" line followed by the sequence line.

    Raises ValueError when both mask and entropy_threshold are given,
    when entropy_threshold is out of range, or when filtering leaves no
    alignment positions.
    """
    aln = Alignment.from_fasta_records(parse_fasta(fastalines), DNA)

    mask_given = mask is not None
    threshold_given = entropy_threshold is not None

    if mask_given and threshold_given:
        raise ValueError('only mask or entropy threshold can be provided.')

    if not mask_given and not threshold_given:
        # gap filtering only
        if allowed_gap_frac < 1:
            aln = aln.omit_gap_positions(allowed_gap_frac)
    elif mask_given:
        # The supplied mask is applied before the gap filter so that mask
        # positions still line up with alignment positions.
        positions = mask_to_positions(mask)
        aln = aln.subalignment(positions_to_keep=positions)
        if allowed_gap_frac < 1:
            aln = aln.omit_gap_positions(allowed_gap_frac)
    elif threshold_given:
        # Here the order is reversed: highly gapped positions go first so
        # their (low) entropy scores do not take part in choosing the
        # entropy cutoff.
        if not (0 <= entropy_threshold <= 1):
            raise ValueError('entropy_threshold needs to be between 0 and 1'
                             ' (inclusive)')
        if allowed_gap_frac < 1:
            aln = aln.omit_gap_positions(allowed_gap_frac)
        positions = mask_to_positions(
            generate_lane_mask(aln, entropy_threshold))
        aln = aln.subalignment(positions_to_keep=positions)
    else:
        # it shouldn't be possible to get here
        raise ValueError("Can't resolve parameters for "
                         "apply_lane_mask_and_gap_filter.")

    if aln.sequence_length() == 0:
        raise ValueError("Positional filtering resulted in removal of all "
                         "alignment positions.")

    for record in aln:
        yield ">%s\n" % record.id
        yield "%s\n" % record
def apply_lane_mask_and_gap_filter(fastalines, mask,
                                   allowed_gap_frac=1.-1e-6,
                                   entropy_threshold=None):
    """Filter alignment positions by mask/entropy and gaps; yield fasta.

    fastalines: aligned fasta input, handed to parse_fasta
    mask: pre-computed mask (e.g., Lane mask) or None
    allowed_gap_frac: gap filtering via omit_gap_positions is applied
     only when this value is below 1
    entropy_threshold: when not None, a mask is computed on the fly with
     generate_lane_mask; must lie between 0 and 1 (inclusive)

    This is a generator: nothing is parsed or validated until iteration
    starts. For every sequence of the filtered alignment it yields a
    ">id" line followed by the sequence line.

    Raises ValueError when both mask and entropy_threshold are given,
    when entropy_threshold is out of range, or when filtering leaves no
    alignment positions.
    """
    aln = Alignment.from_fasta_records(parse_fasta(fastalines), DNA)

    mask_given = mask is not None
    threshold_given = entropy_threshold is not None

    if mask_given and threshold_given:
        raise ValueError('only mask or entropy threshold can be provided.')

    if not mask_given and not threshold_given:
        # gap filtering only
        if allowed_gap_frac < 1:
            aln = aln.omit_gap_positions(allowed_gap_frac)
    elif mask_given:
        # The supplied mask is applied before the gap filter so that mask
        # positions still line up with alignment positions.
        positions = mask_to_positions(mask)
        aln = aln.subalignment(positions_to_keep=positions)
        if allowed_gap_frac < 1:
            aln = aln.omit_gap_positions(allowed_gap_frac)
    elif threshold_given:
        # Here the order is reversed: highly gapped positions go first so
        # their (low) entropy scores do not take part in choosing the
        # entropy cutoff.
        if not (0 <= entropy_threshold <= 1):
            raise ValueError('entropy_threshold needs to be between 0 and 1'
                             ' (inclusive)')
        if allowed_gap_frac < 1:
            aln = aln.omit_gap_positions(allowed_gap_frac)
        positions = mask_to_positions(
            generate_lane_mask(aln, entropy_threshold))
        aln = aln.subalignment(positions_to_keep=positions)
    else:
        # it shouldn't be possible to get here
        raise ValueError("Can't resolve parameters for "
                         "apply_lane_mask_and_gap_filter.")

    if aln.sequence_length() == 0:
        raise ValueError("Positional filtering resulted in removal of all "
                         "alignment positions.")

    for record in aln:
        yield ">%s\n" % record.id
        yield "%s\n" % record
def test_call_infernal_test1_file_output(self):
    """InfernalAligner writes correct output files for infernal_test1 seqs
    """
    # A result path is supplied, so the aligner should return None and
    # the alignment is checked by reading the output file back in.
    returned = self.infernal_test1_aligner(
        self.infernal_test1_input_fp,
        result_path=self.result_fp,
        log_path=self.log_fp)
    self.assertTrue(returned is None,
                    "Result should be None when result path provided.")

    with open(self.result_fp) as result_f:
        records = parse_fasta(result_f)
        written_aln = Alignment.from_fasta_records(records, DNA)
    self.assertEqual(written_aln, self.infernal_test1_expected_aln)
def test_call_infernal_test1_file_output(self):
    """InfernalAligner writes correct output files for infernal_test1 seqs
    """
    # A result path is supplied, so the aligner should return None and
    # the alignment is checked by reading the output file back in.
    returned = self.infernal_test1_aligner(
        self.infernal_test1_input_fp,
        result_path=self.result_fp,
        log_path=self.log_fp)
    self.assertTrue(returned is None,
                    "Result should be None when result path provided.")

    with open(self.result_fp) as result_f:
        records = parse_fasta(result_f)
        written_aln = Alignment.from_fasta_records(records, DNA)
    self.assertEqual(written_aln, self.infernal_test1_expected_aln)
def test_call_pynast_test1_file_output(self):
    """PyNastAligner writes correct output files for pynast_test1 seqs
    """
    # Result and failure paths are supplied, so the aligner should return
    # None; both output files are read back and compared.
    returned = self.pynast_test1_aligner(
        self.pynast_test1_input_fp,
        result_path=self.result_fp,
        log_path=self.log_fp,
        failure_path=self.failure_fp)
    self.assertTrue(returned is None,
                    "Result should be None when result path provided.")

    # aligned sequences
    with open(self.result_fp) as result_f:
        aligned_records = parse_fasta(result_f)
        written_aln = Alignment.from_fasta_records(aligned_records, DNA)
    self.assertEqual(written_aln, self.pynast_test1_expected_aln)

    # sequences that failed to align are compared via their fasta text
    with open(self.failure_fp) as failure_f:
        failed_records = parse_fasta(failure_f)
        written_fail = SequenceCollection.from_fasta_records(
            failed_records, DNA)
    self.assertEqual(written_fail.to_fasta(),
                     self.pynast_test1_expected_fail.to_fasta())
def setUp(self):
    """Create the temp files, aligner and expected alignment for tests."""

    def new_temp_file(suffix, contents=''):
        # mkstemp reserves a unique path; the descriptor is closed right
        # away and the file is then (re)written through a regular handle
        fd, path = mkstemp(prefix='InfernalAlignerTests_', suffix=suffix)
        close(fd)
        with open(path, 'w') as out_f:
            out_f.write(contents)
        return path

    self.infernal_test1_input_fp = new_temp_file(
        '.fasta', '\n'.join(infernal_test1_input_fasta))
    self.infernal_test1_template_fp = new_temp_file(
        'template.sto', infernal_test1_template_stockholm)

    # create temp file names (and touch them so we can reliably
    # clean them up)
    self.result_fp = new_temp_file('.fasta')
    self.log_fp = new_temp_file('.log')

    self._paths_to_clean_up = [
        self.infernal_test1_input_fp,
        self.result_fp,
        self.log_fp,
        self.infernal_test1_template_fp,
    ]

    aligner_params = {
        'template_filepath': self.infernal_test1_template_fp,
    }
    self.infernal_test1_aligner = InfernalAligner(aligner_params)

    expected_records = parse_fasta(infernal_test1_expected_alignment)
    self.infernal_test1_expected_aln = Alignment.from_fasta_records(
        expected_records, DNA)
def setUp(self):
    """Create the temp files, aligner and expected alignment for tests."""

    def new_temp_file(suffix, contents=''):
        # mkstemp reserves a unique path; the descriptor is closed right
        # away and the file is then (re)written through a regular handle
        fd, path = mkstemp(prefix='InfernalAlignerTests_', suffix=suffix)
        close(fd)
        with open(path, 'w') as out_f:
            out_f.write(contents)
        return path

    self.infernal_test1_input_fp = new_temp_file(
        '.fasta', '\n'.join(infernal_test1_input_fasta))
    self.infernal_test1_template_fp = new_temp_file(
        'template.sto', infernal_test1_template_stockholm)

    # create temp file names (and touch them so we can reliably
    # clean them up)
    self.result_fp = new_temp_file('.fasta')
    self.log_fp = new_temp_file('.log')

    self._paths_to_clean_up = [
        self.infernal_test1_input_fp,
        self.result_fp,
        self.log_fp,
        self.infernal_test1_template_fp,
    ]

    aligner_params = {
        'template_filepath': self.infernal_test1_template_fp,
    }
    self.infernal_test1_aligner = InfernalAligner(aligner_params)

    expected_records = parse_fasta(infernal_test1_expected_alignment)
    self.infernal_test1_expected_aln = Alignment.from_fasta_records(
        expected_records, DNA)
def setUp(self):
    """Create the temp files, aligner and expected results for tests."""

    def new_temp_file(suffix, contents=''):
        # mkstemp reserves a unique path; the descriptor is closed right
        # away and the file is then (re)written through a regular handle
        fd, path = mkstemp(prefix='PyNastAlignerTests_', suffix=suffix)
        close(fd)
        with open(path, 'w') as out_f:
            out_f.write(contents)
        return path

    # candidate sequences
    self.pynast_test1_input_fp = new_temp_file(
        '.fasta', pynast_test1_input_fasta)

    # the template, plus variants of it: '.' in place of '-', 'U' in
    # place of 'T', and all lower case
    self.pynast_test1_template_fp = new_temp_file(
        'template.fasta', pynast_test1_template_fasta)
    self.pynast_test_template_w_dots_fp = new_temp_file(
        'template.fasta', pynast_test1_template_fasta.replace('-', '.'))
    self.pynast_test_template_w_u_fp = new_temp_file(
        'template.fasta', pynast_test1_template_fasta.replace('T', 'U'))
    self.pynast_test_template_w_lower_fp = new_temp_file(
        'template.fasta', pynast_test1_template_fasta.lower())

    # create temp file names (and touch them so we can reliably
    # clean them up)
    self.result_fp = new_temp_file('.fasta')
    self.failure_fp = new_temp_file('.fasta')
    self.log_fp = new_temp_file('.log')

    self._paths_to_clean_up = [
        self.pynast_test1_input_fp,
        self.result_fp,
        self.failure_fp,
        self.log_fp,
        self.pynast_test1_template_fp,
        self.pynast_test_template_w_dots_fp,
        self.pynast_test_template_w_u_fp,
        self.pynast_test_template_w_lower_fp
    ]

    aligner_params = {
        'template_filepath': self.pynast_test1_template_fp,
        'min_len': 15,
    }
    self.pynast_test1_aligner = PyNastAligner(aligner_params)

    expected_aln_records = parse_fasta(pynast_test1_expected_alignment)
    self.pynast_test1_expected_aln = Alignment.from_fasta_records(
        expected_aln_records, DNA)

    expected_fail_records = parse_fasta(pynast_test1_expected_failure)
    self.pynast_test1_expected_fail = \
        SequenceCollection.from_fasta_records(expected_fail_records, DNA)
def setUp(self):
    """Create the temp files, aligner and expected results for tests."""

    def new_temp_file(suffix, contents=''):
        # mkstemp reserves a unique path; the descriptor is closed right
        # away and the file is then (re)written through a regular handle
        fd, path = mkstemp(prefix='PyNastAlignerTests_', suffix=suffix)
        close(fd)
        with open(path, 'w') as out_f:
            out_f.write(contents)
        return path

    # candidate sequences
    self.pynast_test1_input_fp = new_temp_file(
        '.fasta', pynast_test1_input_fasta)

    # the template, plus variants of it: '.' in place of '-', 'U' in
    # place of 'T', and all lower case
    self.pynast_test1_template_fp = new_temp_file(
        'template.fasta', pynast_test1_template_fasta)
    self.pynast_test_template_w_dots_fp = new_temp_file(
        'template.fasta', pynast_test1_template_fasta.replace('-', '.'))
    self.pynast_test_template_w_u_fp = new_temp_file(
        'template.fasta', pynast_test1_template_fasta.replace('T', 'U'))
    self.pynast_test_template_w_lower_fp = new_temp_file(
        'template.fasta', pynast_test1_template_fasta.lower())

    # create temp file names (and touch them so we can reliably
    # clean them up)
    self.result_fp = new_temp_file('.fasta')
    self.failure_fp = new_temp_file('.fasta')
    self.log_fp = new_temp_file('.log')

    self._paths_to_clean_up = [
        self.pynast_test1_input_fp,
        self.result_fp,
        self.failure_fp,
        self.log_fp,
        self.pynast_test1_template_fp,
        self.pynast_test_template_w_dots_fp,
        self.pynast_test_template_w_u_fp,
        self.pynast_test_template_w_lower_fp
    ]

    aligner_params = {
        'template_filepath': self.pynast_test1_template_fp,
        'min_len': 15,
    }
    self.pynast_test1_aligner = PyNastAligner(aligner_params)

    expected_aln_records = parse_fasta(pynast_test1_expected_alignment)
    self.pynast_test1_expected_aln = Alignment.from_fasta_records(
        expected_aln_records, DNA)

    expected_fail_records = parse_fasta(pynast_test1_expected_failure)
    self.pynast_test1_expected_fail = \
        SequenceCollection.from_fasta_records(expected_fail_records, DNA)
def __call__(self, seq_path, result_path=None, log_path=None,
             failure_path=None, cmbuild_params=None, cmalign_params=None):
    """Align the sequences in seq_path to the template with Infernal.

    seq_path: path to the fasta file of candidate sequences
    result_path: when provided, the aligned sequences are written there
     as fasta and None is returned
    log_path: when provided, the cmbuild/cmalign parameters used are
     written there
    failure_path: part of the signature; not referenced in this method
    cmbuild_params: optional dict of cmbuild parameters; '--gapthresh'
     is always set to 1.0
    cmalign_params: optional dict of cmalign parameters; '--sub' is
     always set to True and '--gapthresh' to 1.0

    NOTE: a dict passed in as cmbuild_params or cmalign_params is
    updated in place.

    Returns the Alignment of aligned candidate sequences when
    result_path is None, otherwise None.

    Raises ValueError when parsing the template raises RecordError.
    """
    log_params = []

    # load candidate sequences
    with open(seq_path, 'U') as seq_file:
        candidate_sequences = dict(parse_fasta(seq_file))

    # load template sequences; the parser output is materialised with
    # list() while the template file is still open
    try:
        with open(self.Params['template_filepath'], 'U') as template_file:
            info, template_alignment, struct = list(MinimalRfamParser(
                template_file, seq_constructor=ChangedSequence))[0]
    except RecordError:
        raise ValueError(
            "Template alignment must be in Stockholm format with "
            "corresponding secondary structure annotation when using "
            "InfernalAligner.")

    # Need to make separate mapping for unaligned sequences
    unaligned = SequenceCollection.from_fasta_records(
        candidate_sequences.iteritems(), DNASequence)
    mapped_seqs, new_to_old_ids = unaligned.int_map(prefix='unaligned_')
    mapped_seq_tuples = [(k, str(v)) for k, v in mapped_seqs.iteritems()]

    # Turn on --gapthresh option in cmbuild to force alignment to full
    # model
    if cmbuild_params is None:
        cmbuild_params = {}
    cmbuild_params.update({'--gapthresh': 1.0})

    # record cmbuild parameters
    log_params.append('cmbuild parameters:')
    log_params.append(str(cmbuild_params))

    # Turn on --sub option in Infernal, since we know the unaligned
    # sequences are fragments.
    # Also turn on --gapthresh to use same gapthresh as was used to build
    # model
    if cmalign_params is None:
        cmalign_params = {}
    cmalign_params.update({'--sub': True, '--gapthresh': 1.0})

    # record cmalign parameters
    log_params.append('cmalign parameters:')
    log_params.append(str(cmalign_params))

    # Align sequences to alignment including alignment gaps.
    aligned, struct_string = cmalign_from_alignment(
        aln=template_alignment,
        structure_string=struct,
        seqs=mapped_seq_tuples,
        include_aln=True,
        params=cmalign_params,
        cmbuild_params=cmbuild_params)

    # Pull out original sequences from full alignment.
    # Get a dict of the ids to sequences (note that this is a
    # cogent alignment object, hence the call to NamedSeqs)
    aligned_dict = aligned.NamedSeqs
    infernal_aligned = [
        (old_id, aligned_dict[new_id])
        for new_id, old_id in new_to_old_ids.iteritems()]

    # Create an Alignment object from alignment dict
    infernal_aligned = Alignment.from_fasta_records(
        infernal_aligned, DNASequence)

    if log_path is not None:
        with open(log_path, 'w') as log_file:
            log_file.write('\n'.join(log_params))

    if result_path is not None:
        with open(result_path, 'w') as result_file:
            result_file.write(infernal_aligned.to_fasta())
        return None
    return infernal_aligned