def test_uclust_search_and_align_from_fasta_filepath(self):
    """uclust_search_and_align_from_fasta_filepath functions as expected

    Runs the search/align iterator over the first query/template fixture
    pair twice: once with the default strand handling and once with
    reverse-strand matching switched off.
    """
    query_fp = self.search_align_query1_fp
    template_fp = self.search_align_template1_fp

    # Default call: reverse-complement matches are allowed, so every
    # expected record should be produced.
    both_strands = list(
        uclust_search_and_align_from_fasta_filepath(query_fp, template_fp))
    self.assertEqual(both_strands, self.search_align_out1_expected)

    # Reverse-complement matches disallowed: only the first two expected
    # records should remain.
    forward_only = list(
        uclust_search_and_align_from_fasta_filepath(
            query_fp, template_fp, enable_rev_strand_matching=False))
    self.assertEqual(forward_only, self.search_align_out1_expected[:2])
def test_uclust_search_and_align_from_fasta_filepath(self):
    """uclust_search_and_align_from_fasta_filepath functions as expected

    Checks the results for the first query/template fixture pair with
    reverse-strand matching left at its default and then disabled.
    """
    def run_search(**search_options):
        # Materialize the iterator so it can be compared with a list.
        hits = uclust_search_and_align_from_fasta_filepath(
            self.search_align_query1_fp,
            self.search_align_template1_fp,
            **search_options)
        return list(hits)

    # rev comp matches allowed (default)
    observed = run_search()
    self.assertEqual(observed, self.search_align_out1_expected)

    # rev comp matches not allowed: only the first two expected records
    observed = run_search(enable_rev_strand_matching=False)
    self.assertEqual(observed, self.search_align_out1_expected[:2])
def test_uclust_search_and_align_from_fasta_filepath_protein(self):
    """uclust_search_and_align_from_fasta_filepath functions with protein

    Uses the second query/template fixture pair with the default options
    (reverse-strand matching is left at its default).
    """
    query_fp = self.search_align_query2_fp
    template_fp = self.search_align_template2_fp

    # Materialize the iterator so it can be compared with the expected list.
    observed = list(
        uclust_search_and_align_from_fasta_filepath(query_fp, template_fp))

    self.assertEqual(observed, self.search_align_out2_expected)
def ipynast_seqs(candidate_sequences, template_alignment,
                 max_hits=30, min_pct=75.0, min_len=1000,
                 align_unaligned_seqs_f=None, log_fp=None, logger=None,
                 temp_dir=get_pynast_temp_dir(), **kwargs):
    """Iterator that yields results of pynast on candidate_sequences

    This function yields the sequence and exit status of the alignment step,
    as (sequence, exit status) tuples. Status values can be:
        0 : indicates a successful alignment, in which case the sequence
            will be aligned
        1 : indicates unsuccessful sequence search, in which case the
            sequence will be unaligned
        2 : indicates alignment did not meet minimum requirements, in which
            case the sequence will be unaligned

    All sequences are returned as DNA sequence objects.

    candidate_sequences
        an iterable object (e.g., a list) containing tuples of
        (seq_id, sequence) pairs (e.g., as returned by MinimalFastaParser)
        or a fasta filepath
    template_alignment
        a PyCogent alignment object containing the template alignment
        or a fasta filepath
    max_hits
        Maximum number of uclust hits to return. Accepted for interface
        compatibility; it is not referenced by this function.
    min_pct
        minimum % identity for best database match (given as a percentage;
        divided by 100 before being passed to uclust as percent_ID)
    min_len
        minimum length of match for alignment
    align_unaligned_seqs_f
        Function to align sequences. Must be of the form:
         align_unaligned_seqs(seqs, moltype, params=None)
         see cogent.app.muscle_v38.align_unaligned_seqs
    log_fp
        Optional path to log file
    logger
        Optional NastLogger object, takes precedence over log_fp
    temp_dir
        Directory in which the temporary fasta files are created; also
        passed to uclust as tmp_dir
    kwargs
        Passed to deprecation_warning

    Raises IOError if template_alignment is a filepath that cannot be
    opened, and ValueError if a candidate sequence contains gap characters.

    The temporary files and input handles are released in a finally
    clause, so they are cleaned up when the iterator is exhausted, when it
    is closed early, and when an error is raised.
    """
    deprecation_warning(kwargs)

    files_to_remove = []
    # Handles on input files that are read lazily; closed in the finally
    # clause below.
    input_handles = []
    try:
        if isinstance(candidate_sequences, str):
            # filepath provided for candidate sequences
            candidate_source_f = open(candidate_sequences)
            input_handles.append(candidate_source_f)
            candidate_sequences = MinimalFastaParser(candidate_source_f)

        # Write the candidate seqs to a temp file to pass to uclust. This is
        # done in all cases to convert the sequences to uppercase in case
        # they're not already. The bad handling of upper versus lower-cased
        # sequences is a uclust issue.
        # Note that delete=False here because we don't want these to be
        # deleted when they are closed (since we need to pass the filepaths
        # around after we write and close them). The path is registered for
        # removal immediately so a failure while writing does not leak it.
        with NamedTemporaryFile(prefix='pynast_candidate',
                                suffix='.fasta',
                                dir=temp_dir,
                                delete=False) as candidate_fasta_f:
            candidate_fasta_filepath = candidate_fasta_f.name
            files_to_remove.append(candidate_fasta_filepath)
            for seq_id, seq in candidate_sequences:
                candidate_fasta_f.write(
                    '>%s\n%s\n' % (seq_id, str(seq).upper()))

        # degap the template alignment for the sequence searching step and
        # write it to file. See above comment about delete=False
        with NamedTemporaryFile(prefix='pynast_template',
                                suffix='.fasta',
                                dir=temp_dir,
                                delete=False) as template_fasta_f:
            template_fasta_filepath = template_fasta_f.name
            files_to_remove.append(template_fasta_filepath)
            if isinstance(template_alignment, str):
                # the template alignment was received as a filepath
                try:
                    template_alignment_f = open(template_alignment)
                except IOError:
                    raise IOError("Cannot open specified filepath: %s"
                                  % template_alignment)
                input_handles.append(template_alignment_f)
                # template alignment provided as filepath -- process it
                # iteratively to handle potentially massive
                # template_alignments. The gapped sequences are kept in a
                # dict keyed by sequence id for the lookup further down.
                template_alignment = {}
                for seq_id, seq in MinimalFastaParser(template_alignment_f):
                    template_alignment[seq_id] = seq
                    degapped = Sequence(seq=seq, moltype=DNA).degap()
                    template_fasta_f.write('>%s\n%s\n' % (seq_id, degapped))
            else:
                # the template alignment was received as an alignment object
                template_fasta_f.write(template_alignment.degap().toFasta())

        # Set up logging. NastLogger object takes precedence over log
        # file path, if both are provided.
        if logger is None:
            if log_fp is not None:
                logger = NastLogger(log_fp)
            else:
                logger = NastLogger()

        min_pct /= 100.

        # get the alignment iterator
        pw_alignment_iterator = uclust_search_and_align_from_fasta_filepath(
            candidate_fasta_filepath,
            template_fasta_filepath,
            percent_ID=min_pct,
            enable_rev_strand_matching=True,
            tmp_dir=temp_dir)

        # None means "no (more) search results".
        current_result = next(pw_alignment_iterator, None)

        candidate_fasta_in = open(candidate_fasta_filepath)
        input_handles.append(candidate_fasta_in)
        for seq_id, seq in MinimalFastaParser(candidate_fasta_in):
            seq_len = len(seq)
            if '-' in seq:
                # stop the search iterator; the temporary files are removed
                # by the finally clause
                pw_alignment_iterator.close()
                raise ValueError(
                    "Candidate sequence contains gaps. This is not supported.")

            # The pending search result belongs to this candidate only if
            # the first whitespace-delimited token of both ids agrees.
            hit_found = False
            if current_result:
                candidate_seq_id, template_seq_id, pw_aligned_candidate,\
                    pw_aligned_template, pct_identity = current_result
                hit_found = \
                    seq_id.split()[0] == candidate_seq_id.split()[0]

            if not hit_found:
                # a suitable match was not found - don't align the sequence
                # log the failure
                logger.record(
                    seq_id,  # input sequence identifier
                    seq_len,  # input sequence length
                    "No search results.")
                # yield the unaligned sequence and failure code
                yield DNA.makeSequence(seq, Name=seq_id), 1
                # the pending result (if any) is kept for a later candidate
                continue

            # this sequence was aligned
            if align_unaligned_seqs_f:
                # if an alternate pairwise aligner was specified, unalign
                # and re-align the sequences.
                pw_aligned_template, pw_aligned_candidate = align_two_seqs(
                    pw_aligned_template.replace('-', ''),
                    pw_aligned_candidate.replace('-', ''),
                    align_unaligned_seqs_f)

            # Cast the pairwise alignments to DNA sequence objects
            pw_aligned_candidate = DNA.makeSequence(
                pw_aligned_candidate, Name=candidate_seq_id)
            pw_aligned_template = DNA.makeSequence(
                pw_aligned_template, Name=template_seq_id)

            # Remove any terminal gaps that were introduced into the
            # template sequence
            pw_aligned_candidate, pw_aligned_template = \
                remove_template_terminal_gaps(
                    pw_aligned_candidate, pw_aligned_template)
            candidate_seq_id = pw_aligned_candidate.Name

            # get the aligned template sequence from the template alignment
            try:
                template_aligned_seq = \
                    template_alignment.getGappedSeq(template_seq_id)
            except AttributeError:
                # the template was loaded from a filepath above, so it is a
                # plain dict of gapped sequences rather than an alignment
                template_aligned_seq = Sequence(
                    seq=template_alignment[template_seq_id], moltype=DNA)

            # reintroduce the gap spacing from the template alignment
            pw_aligned_template, pw_aligned_candidate, new_gaps = \
                reintroduce_template_spacing(
                    template_aligned_seq,
                    pw_aligned_template,
                    pw_aligned_candidate)

            # delete any new gaps that were introduced during the
            # pairwise alignment step
            pw_aligned_template, pw_aligned_candidate = adjust_alignment(
                pw_aligned_template, pw_aligned_candidate, new_gaps)

            # reintroduce any terminal gaps that were present in the template
            result = introduce_terminal_gaps(
                template_aligned_seq, pw_aligned_template,
                pw_aligned_candidate)

            unaligned_length = len(result.degap())

            if unaligned_length < min_len:
                # alignment is too short - log this as a failure, reporting
                # the length that was actually compared against min_len
                error = ("Alignment does not meet minimum length "
                         "requirement for alignment (%d < %d)"
                         % (unaligned_length, min_len))
                logger.record(
                    seq_id,  # input sequence identifier
                    seq_len,  # input sequence length
                    error)
                # yield the unaligned sequence and failure code
                yield DNA.makeSequence(seq, Name=seq_id), 2
            else:
                # log the alignment
                logger.record(
                    seq_id,  # input sequence identifier
                    seq_len,  # input sequence length
                    '',  # Errors
                    template_seq_id,  # best template match id
                    '%3.2f' % pct_identity,  # pct id to template
                    unaligned_length,  # post alignment sequence length
                )
                # yield the aligned sequence and success code
                yield DNA.makeSequence(result, Name=candidate_seq_id), 0

            # Get the next alignment. Running out of search results does not
            # end the loop (remaining candidates are still reported as
            # unmatched); resetting to None keeps the consumed result from
            # being matched against a later candidate.
            current_result = next(pw_alignment_iterator, None)
    finally:
        for handle in input_handles:
            handle.close()
        # clean-up temporary files if any were created
        remove_files(files_to_remove, error_on_missing=False)
def test_uclust_search_and_align_from_fasta_filepath_protein(self):
    """uclust_search_and_align_from_fasta_filepath functions with protein

    Searches the second query fixture against the second template fixture
    using the default options (reverse-strand matching is left at its
    default) and compares the full result list with the expected records.
    """
    search_results = uclust_search_and_align_from_fasta_filepath(
        self.search_align_query2_fp,
        self.search_align_template2_fp)

    # The call returns an iterator; collect it before comparing.
    collected = list(search_results)
    self.assertEqual(collected, self.search_align_out2_expected)
def ipynast_seqs(candidate_sequences, template_alignment,
                 max_hits=30, min_pct=75.0, min_len=1000,
                 align_unaligned_seqs_f=None, log_fp=None, logger=None,
                 temp_dir=get_pynast_temp_dir(), **kwargs):
    """Iterator that yields results of pynast on candidate_sequences

    This function yields the sequence and exit status of the alignment step,
    as (sequence, exit status) tuples. Status values can be:
        0 : indicates a successful alignment, in which case the sequence
            will be aligned
        1 : indicates unsuccessful sequence search, in which case the
            sequence will be unaligned
        2 : indicates alignment did not meet minimum requirements, in which
            case the sequence will be unaligned

    All sequences are returned as DNA sequence objects.

    candidate_sequences
        an iterable object (e.g., a list) containing tuples of
        (seq_id, sequence) pairs (e.g., as returned by MinimalFastaParser)
        or a fasta filepath
    template_alignment
        a PyCogent alignment object containing the template alignment
        or a fasta filepath
    max_hits
        Maximum number of uclust hits to return. Accepted for interface
        compatibility; it is not referenced by this function.
    min_pct
        minimum % identity for best database match (given as a percentage;
        divided by 100 before being passed to uclust as percent_ID)
    min_len
        minimum length of match for alignment
    align_unaligned_seqs_f
        Function to align sequences. Must be of the form:
         align_unaligned_seqs(seqs, moltype, params=None)
         see cogent.app.muscle_v38.align_unaligned_seqs
    log_fp
        Optional path to log file
    logger
        Optional NastLogger object, takes precedence over log_fp
    temp_dir
        Directory in which the temporary fasta files are created; also
        passed to uclust as tmp_dir
    kwargs
        Passed to deprecation_warning

    Raises IOError if template_alignment is a filepath that cannot be
    opened, and ValueError if a candidate sequence contains gap characters.

    The temporary files and input handles are released in a finally
    clause, so they are cleaned up when the iterator is exhausted, when it
    is closed early, and when an error is raised.
    """
    deprecation_warning(kwargs)

    files_to_remove = []
    # Handles on input files that are read lazily; closed in the finally
    # clause below.
    input_handles = []
    try:
        if isinstance(candidate_sequences, str):
            # filepath provided for candidate sequences
            candidate_source_f = open(candidate_sequences)
            input_handles.append(candidate_source_f)
            candidate_sequences = MinimalFastaParser(candidate_source_f)

        # Write the candidate seqs to a temp file to pass to uclust. This is
        # done in all cases to convert the sequences to uppercase in case
        # they're not already. The bad handling of upper versus lower-cased
        # sequences is a uclust issue.
        # Note that delete=False here because we don't want these to be
        # deleted when they are closed (since we need to pass the filepaths
        # around after we write and close them). The path is registered for
        # removal immediately so a failure while writing does not leak it.
        with NamedTemporaryFile(prefix='pynast_candidate',
                                suffix='.fasta',
                                dir=temp_dir,
                                delete=False) as candidate_fasta_f:
            candidate_fasta_filepath = candidate_fasta_f.name
            files_to_remove.append(candidate_fasta_filepath)
            for seq_id, seq in candidate_sequences:
                candidate_fasta_f.write(
                    '>%s\n%s\n' % (seq_id, str(seq).upper()))

        # degap the template alignment for the sequence searching step and
        # write it to file. See above comment about delete=False
        with NamedTemporaryFile(prefix='pynast_template',
                                suffix='.fasta',
                                dir=temp_dir,
                                delete=False) as template_fasta_f:
            template_fasta_filepath = template_fasta_f.name
            files_to_remove.append(template_fasta_filepath)
            if isinstance(template_alignment, str):
                # the template alignment was received as a filepath
                try:
                    template_alignment_f = open(template_alignment)
                except IOError:
                    raise IOError("Cannot open specified filepath: %s"
                                  % template_alignment)
                input_handles.append(template_alignment_f)
                # template alignment provided as filepath -- process it
                # iteratively to handle potentially massive
                # template_alignments. The gapped sequences are kept in a
                # dict keyed by sequence id for the lookup further down.
                template_alignment = {}
                for seq_id, seq in MinimalFastaParser(template_alignment_f):
                    template_alignment[seq_id] = seq
                    degapped = Sequence(seq=seq, moltype=DNA).degap()
                    template_fasta_f.write('>%s\n%s\n' % (seq_id, degapped))
            else:
                # the template alignment was received as an alignment object
                template_fasta_f.write(template_alignment.degap().toFasta())

        # Set up logging. NastLogger object takes precedence over log
        # file path, if both are provided.
        if logger is None:
            if log_fp is not None:
                logger = NastLogger(log_fp)
            else:
                logger = NastLogger()

        min_pct /= 100.

        # get the alignment iterator
        pw_alignment_iterator = uclust_search_and_align_from_fasta_filepath(
            candidate_fasta_filepath,
            template_fasta_filepath,
            percent_ID=min_pct,
            enable_rev_strand_matching=True,
            tmp_dir=temp_dir)

        # None means "no (more) search results".
        current_result = next(pw_alignment_iterator, None)

        candidate_fasta_in = open(candidate_fasta_filepath)
        input_handles.append(candidate_fasta_in)
        for seq_id, seq in MinimalFastaParser(candidate_fasta_in):
            seq_len = len(seq)
            if '-' in seq:
                # stop the search iterator; the temporary files are removed
                # by the finally clause
                pw_alignment_iterator.close()
                raise ValueError(
                    "Candidate sequence contains gaps. This is not supported.")

            # The pending search result belongs to this candidate only if
            # the first whitespace-delimited token of both ids agrees.
            hit_found = False
            if current_result:
                candidate_seq_id, template_seq_id, pw_aligned_candidate,\
                    pw_aligned_template, pct_identity = current_result
                hit_found = \
                    seq_id.split()[0] == candidate_seq_id.split()[0]

            if not hit_found:
                # a suitable match was not found - don't align the sequence
                # log the failure
                logger.record(
                    seq_id,  # input sequence identifier
                    seq_len,  # input sequence length
                    "No search results.")
                # yield the unaligned sequence and failure code
                yield DNA.makeSequence(seq, Name=seq_id), 1
                # the pending result (if any) is kept for a later candidate
                continue

            # this sequence was aligned
            if align_unaligned_seqs_f:
                # if an alternate pairwise aligner was specified, unalign
                # and re-align the sequences.
                pw_aligned_template, pw_aligned_candidate = align_two_seqs(
                    pw_aligned_template.replace('-', ''),
                    pw_aligned_candidate.replace('-', ''),
                    align_unaligned_seqs_f)

            # Cast the pairwise alignments to DNA sequence objects
            pw_aligned_candidate = DNA.makeSequence(
                pw_aligned_candidate, Name=candidate_seq_id)
            pw_aligned_template = DNA.makeSequence(
                pw_aligned_template, Name=template_seq_id)

            # Remove any terminal gaps that were introduced into the
            # template sequence
            pw_aligned_candidate, pw_aligned_template = \
                remove_template_terminal_gaps(
                    pw_aligned_candidate, pw_aligned_template)
            candidate_seq_id = pw_aligned_candidate.Name

            # get the aligned template sequence from the template alignment
            try:
                template_aligned_seq = \
                    template_alignment.getGappedSeq(template_seq_id)
            except AttributeError:
                # the template was loaded from a filepath above, so it is a
                # plain dict of gapped sequences rather than an alignment
                template_aligned_seq = Sequence(
                    seq=template_alignment[template_seq_id], moltype=DNA)

            # reintroduce the gap spacing from the template alignment
            pw_aligned_template, pw_aligned_candidate, new_gaps = \
                reintroduce_template_spacing(
                    template_aligned_seq,
                    pw_aligned_template,
                    pw_aligned_candidate)

            # delete any new gaps that were introduced during the
            # pairwise alignment step
            pw_aligned_template, pw_aligned_candidate = adjust_alignment(
                pw_aligned_template, pw_aligned_candidate, new_gaps)

            # reintroduce any terminal gaps that were present in the template
            result = introduce_terminal_gaps(
                template_aligned_seq, pw_aligned_template,
                pw_aligned_candidate)

            unaligned_length = len(result.degap())

            if unaligned_length < min_len:
                # alignment is too short - log this as a failure, reporting
                # the length that was actually compared against min_len
                error = ("Alignment does not meet minimum length "
                         "requirement for alignment (%d < %d)"
                         % (unaligned_length, min_len))
                logger.record(
                    seq_id,  # input sequence identifier
                    seq_len,  # input sequence length
                    error)
                # yield the unaligned sequence and failure code
                yield DNA.makeSequence(seq, Name=seq_id), 2
            else:
                # log the alignment
                logger.record(
                    seq_id,  # input sequence identifier
                    seq_len,  # input sequence length
                    '',  # Errors
                    template_seq_id,  # best template match id
                    '%3.2f' % pct_identity,  # pct id to template
                    unaligned_length,  # post alignment sequence length
                )
                # yield the aligned sequence and success code
                yield DNA.makeSequence(result, Name=candidate_seq_id), 0

            # Get the next alignment. Running out of search results does not
            # end the loop (remaining candidates are still reported as
            # unmatched); resetting to None keeps the consumed result from
            # being matched against a later candidate.
            current_result = next(pw_alignment_iterator, None)
    finally:
        for handle in input_handles:
            handle.close()
        # clean-up temporary files if any were created
        remove_files(files_to_remove, error_on_missing=False)