def test_split_fasta_diff_num_seqs_per_file_alt(self):
    """split_fasta keeps every seq for any seqs_per_file value

    Splits a 59-sequence fasta with each seqs_per_file value from 1
    through 999 and checks that the sequences read back from the split
    files form a collection equal to the one parsed from the input.
    """
    # start with 59 seqs (b/c it's prime, so should make more
    # confusing splits)
    in_seqs = SequenceCollection.from_fasta_records(
        [('seq%s' % k, 'AACCTTAA') for k in range(59)], DNA)
    infile = in_seqs.to_fasta().split('\n')

    # test seqs_per_file from 1 to 999
    for i in range(1, 1000):
        fd, filename_prefix = mkstemp(dir=get_qiime_temp_dir(),
                                      prefix='split_fasta_tests',
                                      suffix='')
        close(fd)

        actual = split_fasta(infile, i, filename_prefix)

        actual_seqs = []
        for fp in actual:
            # close each split file as soon as its lines are collected
            with open(fp) as split_f:
                actual_seqs += list(split_f)

        # remove the files now, so if the test fails they still get
        # cleaned up; mkstemp also created a placeholder file named
        # filename_prefix, which must be deleted as well
        remove_files(list(actual) + [filename_prefix])

        # building seq collections from infile and the split files result
        # in equivalent seq collections
        self.assertEqual(
            SequenceCollection.from_fasta_records(parse_fasta(infile), DNA),
            SequenceCollection.from_fasta_records(parse_fasta(actual_seqs),
                                                  DNA))
def test_init(self):
    """SequenceCollection can be built from each fixture list and from []
    """
    # Construction alone is the check: any exception fails the test.
    for seqs in (self.seqs1, self.seqs2, self.seqs3, []):
        SequenceCollection(seqs)
def test_split_fasta_diff_num_seqs_per_file(self):
    """split_fasta funcs as expected when diff num seqs go to each file

    Three sequences split two-per-file must produce exactly two output
    files, named '<prefix>.0.fasta' and '<prefix>.1.fasta', that together
    hold the same sequences as the input.
    """
    fd, filename_prefix = mkstemp(dir=get_qiime_temp_dir(),
                                  prefix='split_fasta_tests',
                                  suffix='')
    close(fd)
    infile = ['>seq1', 'AACCTTAA', '>seq2', 'TTAACC', 'AATTAA',
              '>seq3', 'CCTT--AA']

    actual = split_fasta(infile, 2, filename_prefix)

    actual_seqs = []
    for fp in actual:
        # close each split file as soon as its lines are collected
        with open(fp) as split_f:
            actual_seqs += list(split_f)

    # clean up before asserting so a failure leaves nothing behind; the
    # placeholder file created by mkstemp is removed along with the splits
    remove_files(list(actual) + [filename_prefix])

    expected = ['%s.%d.fasta' % (filename_prefix, i) for i in range(2)]
    # list of file paths is as expected
    self.assertEqual(actual, expected)
    # building seq collections from infile and the split files result in
    # equivalent seq collections
    self.assertEqual(
        SequenceCollection.from_fasta_records(parse_fasta(infile), DNA),
        SequenceCollection.from_fasta_records(parse_fasta(actual_seqs),
                                              DNA))
def __call__(self, seq_path, result_path=None, log_path=None,
             failure_path=None):
    """Align the sequences in seq_path against the template alignment

    seq_path: path to the fasta file of candidate sequences
    result_path: if provided, the aligned sequences are written to this
     path as fasta and None is returned; if None, the Alignment is
     returned instead
    log_path: path passed to NastLogger
    failure_path: if provided, the sequences pynast_seqs reports as
     failed are written to this path as fasta

    The template is read from self.Params['template_filepath']; '.'
    characters are replaced with '-' and sequences are upper-cased before
    the template Alignment is built.
    """
    # Keep the candidate file open until pynast_seqs has returned, since
    # parse_fasta may read from it lazily; the with-block guarantees the
    # handle is closed afterwards, even on error.
    with open(seq_path, 'U') as seq_file:
        # load candidate sequences
        candidate_sequences = parse_fasta(seq_file)

        # load template sequences
        template_alignment_fp = self.Params['template_filepath']
        with open(template_alignment_fp) as template_file:
            # replace '.' characters with '-' characters
            template_records = [
                (seq_id, seq.replace('.', '-').upper())
                for seq_id, seq in parse_fasta(template_file)]
        template_alignment = Alignment.from_fasta_records(
            template_records, DNASequence, validate=True)

        # initialize_logger
        logger = NastLogger(log_path)

        # get function for pairwise alignment method
        pairwise_alignment_f = pairwise_alignment_methods[
            self.Params['pairwise_alignment_method']]

        pynast_aligned, pynast_failed = pynast_seqs(
            candidate_sequences,
            template_alignment,
            min_pct=self.Params['min_pct'],
            min_len=self.Params['min_len'],
            align_unaligned_seqs_f=pairwise_alignment_f,
            logger=logger,
            temp_dir=get_qiime_temp_dir())

    logger.record(str(self))

    # convert the PyNAST sequence objects to DNASequence objects
    pynast_failed = SequenceCollection(
        [DNASequence(str(seq), id=seq.Name) for seq in pynast_failed])
    pynast_aligned = Alignment(
        [DNASequence(str(seq), id=seq.Name) for seq in pynast_aligned])

    if failure_path is not None:
        with open(failure_path, 'w') as fail_file:
            fail_file.write(pynast_failed.to_fasta())

    if result_path is None:
        return pynast_aligned

    with open(result_path, 'w') as result_file:
        result_file.write(pynast_aligned.to_fasta())
    return None
def test_init_validate(self):
    """validate=True accepts valid input and rejects invalid input
    """
    # valid DNA sequences construct without error (done twice)
    SequenceCollection(self.seqs1, validate=True)
    SequenceCollection(self.seqs1, validate=True)

    # self.invalid_s1 cannot be validated, so construction must raise
    with self.assertRaises(SequenceCollectionError):
        SequenceCollection(self.invalid_s1, validate=True)
def __call__(self, seq_path, result_path=None, log_path=None,
             failure_path=None):
    """Align the sequences in seq_path against the template alignment

    seq_path: path to the fasta file of candidate sequences
    result_path: if provided, the aligned sequences are written to this
     path as fasta and None is returned; if None, the Alignment is
     returned instead
    log_path: path passed to NastLogger
    failure_path: if provided, the sequences pynast_seqs reports as
     failed are written to this path as fasta

    The template is read from self.Params['template_filepath']; '.'
    characters are replaced with '-' and sequences are upper-cased before
    the template Alignment is built.
    """
    # Keep the candidate file open until pynast_seqs has returned, since
    # parse_fasta may read from it lazily; the with-block guarantees the
    # handle is closed afterwards, even on error.
    with open(seq_path, 'U') as seq_file:
        # load candidate sequences
        candidate_sequences = parse_fasta(seq_file)

        # load template sequences
        template_alignment_fp = self.Params['template_filepath']
        with open(template_alignment_fp) as template_file:
            # replace '.' characters with '-' characters
            template_records = [
                (seq_id, seq.replace('.', '-').upper())
                for seq_id, seq in parse_fasta(template_file)]
        template_alignment = Alignment.from_fasta_records(
            template_records, DNASequence, validate=True)

        # initialize_logger
        logger = NastLogger(log_path)

        # get function for pairwise alignment method
        pairwise_alignment_f = pairwise_alignment_methods[
            self.Params['pairwise_alignment_method']]

        pynast_aligned, pynast_failed = pynast_seqs(
            candidate_sequences,
            template_alignment,
            min_pct=self.Params['min_pct'],
            min_len=self.Params['min_len'],
            align_unaligned_seqs_f=pairwise_alignment_f,
            logger=logger,
            temp_dir=get_qiime_temp_dir())

    logger.record(str(self))

    # convert the PyNAST sequence objects to DNASequence objects
    pynast_failed = SequenceCollection(
        [DNASequence(str(seq), identifier=seq.Name)
         for seq in pynast_failed])
    pynast_aligned = Alignment(
        [DNASequence(str(seq), identifier=seq.Name)
         for seq in pynast_aligned])

    if failure_path is not None:
        with open(failure_path, 'w') as fail_file:
            fail_file.write(pynast_failed.to_fasta())

    if result_path is None:
        return pynast_aligned

    with open(result_path, 'w') as result_file:
        result_file.write(pynast_aligned.to_fasta())
    return None
def filter_samples(prefs, data, dir_path='', filename=None):
    """Filter the otus and representative seq set, writing the results

    prefs: passed through to filter_otus and filter_aln_by_otus
    data: dict providing the 'aln' and 'otus' entries to filter
    dir_path: directory the output files are written to
    filename: base name used for the output files

    Writes, in this order:
     <dir_path>/<filename>_sfiltered_otus.txt  (filtered otus)
     <dir_path>/<filename>_sremoved.fasta      (removed sequences)
     <dir_path>/<filename>_sfiltered.fasta     (remaining sequences)

    Raises ValueError if no sequences were removed, or if no sequences
    remain. The checks run in the order above, so files listed before the
    failing step have already been written when the error is raised.
    """
    aln = data['aln']
    otus = data['otus']

    # filter the otus file based on which samples to remove
    new_otus_list = filter_otus(otus, prefs)

    filtered_otus_output_filepath = '%s/%s_sfiltered_otus.txt' \
        % (dir_path, filename)

    # Write out a new otus file: one tab-separated line per entry
    with open(filtered_otus_output_filepath, 'w') as otus_file:
        for key in new_otus_list:
            otus_file.write(
                key[0] + ''.join('\t' + str(j) for j in key[1]) + '\n')

    # filter seq set
    filtered_seqs, removed_seqs = filter_aln_by_otus(aln, prefs)

    # write a fasta containing list of sequences removed from
    # representative set
    if len(removed_seqs) == 0:
        raise ValueError(
            'No sequences were removed.  Did you specify the correct Sample ID?')
    removed_seqs = SequenceCollection.from_fasta_records(
        [(e[0], str(e[1])) for e in removed_seqs], DNA)

    output_filepath2 = '%s/%s_sremoved.fasta' % (dir_path, filename)
    with open(output_filepath2, 'w') as output_file2:
        output_file2.write(removed_seqs.to_fasta())

    # write a fasta containing the filtered representative seqs
    if len(filtered_seqs) == 0:
        raise ValueError(
            'No sequences were remaining in the fasta file.  Did you remove all Sample ID\'s?')
    filtered_seqs = SequenceCollection.from_fasta_records(
        [(e[0], str(e[1])) for e in filtered_seqs], DNA)

    output_filepath = '%s/%s_sfiltered.fasta' % (dir_path, filename)
    with open(output_filepath, 'w') as output_file:
        output_file.write(filtered_seqs.to_fasta())
def test_call_write_to_file(self):
    """ReferenceRepSetPicker.__call__ result file holds the expected seqs"""
    picker_params = {'Algorithm': 'first', 'ChoiceF': first_id}
    app = ReferenceRepSetPicker(params=picker_params)
    app(self.tmp_seq_filepath,
        self.tmp_otu_filepath,
        self.ref_seq_filepath,
        result_path=self.result_filepath)

    # parse what the picker wrote
    with open(self.result_filepath) as result_f:
        observed_records = parse_fasta(result_f)
        actual = SequenceCollection.from_fasta_records(observed_records, DNA)

    # parse the reference result text
    expected_lines = rep_seqs_reference_result_file_exp.split('\n')
    expected = SequenceCollection.from_fasta_records(
        parse_fasta(expected_lines), DNA)

    # we don't care about order in the results
    self.assertEqual(set(actual), set(expected))
def test_degap(self):
    """degap returns the sequences with '.' and '-' characters removed
    """
    def strip_gaps(records):
        # expected (id, sequence) records with both gap characters dropped
        return [(id_, seq.replace('.', '').replace('-', ''))
                for id_, seq in records]

    cases = [(self.a1, self.seqs1_t, DNASequence),
             (self.a2, self.seqs2_t, RNASequence)]
    for aln, records, seq_type in cases:
        expected = SequenceCollection.from_fasta_records(
            strip_gaps(records), seq_type)
        self.assertEqual(aln.degap(), expected)
def filter_samples(prefs, data, dir_path='', filename=None):
    """Filter the otus and representative seq set, writing the results

    prefs: passed through to filter_otus and filter_aln_by_otus
    data: dict providing the 'aln' and 'otus' entries to filter
    dir_path: directory the output files are written to
    filename: base name used for the output files

    Writes, in this order:
     <dir_path>/<filename>_sfiltered_otus.txt  (filtered otus)
     <dir_path>/<filename>_sremoved.fasta      (removed sequences)
     <dir_path>/<filename>_sfiltered.fasta     (remaining sequences)

    Raises ValueError if no sequences were removed, or if no sequences
    remain. The checks run in the order above, so files listed before the
    failing step have already been written when the error is raised.
    """
    aln = data['aln']
    otus = data['otus']

    # filter the otus file based on which samples to remove
    new_otus_list = filter_otus(otus, prefs)

    filtered_otus_output_filepath = '%s/%s_sfiltered_otus.txt' \
        % (dir_path, filename)

    # Write out a new otus file: one tab-separated line per entry
    with open(filtered_otus_output_filepath, 'w') as otus_file:
        for key in new_otus_list:
            otus_file.write(
                key[0] + ''.join('\t' + str(j) for j in key[1]) + '\n')

    # filter seq set
    filtered_seqs, removed_seqs = filter_aln_by_otus(aln, prefs)

    # write a fasta containing list of sequences removed from
    # representative set
    if len(removed_seqs) == 0:
        raise ValueError(
            'No sequences were removed.  Did you specify the correct Sample ID?')
    removed_seqs = SequenceCollection.from_fasta_records(
        [(e[0], str(e[1])) for e in removed_seqs], DNA)

    output_filepath2 = '%s/%s_sremoved.fasta' % (dir_path, filename)
    with open(output_filepath2, 'w') as output_file2:
        output_file2.write(removed_seqs.to_fasta())

    # write a fasta containing the filtered representative seqs
    if len(filtered_seqs) == 0:
        raise ValueError(
            'No sequences were remaining in the fasta file.  Did you remove all Sample ID\'s?')
    filtered_seqs = SequenceCollection.from_fasta_records(
        [(e[0], str(e[1])) for e in filtered_seqs], DNA)

    output_filepath = '%s/%s_sfiltered.fasta' % (dir_path, filename)
    with open(output_filepath, 'w') as output_file:
        output_file.write(filtered_seqs.to_fasta())
def test_call_write_to_file(self):
    """ReferenceRepSetPicker.__call__ result file holds the expected seqs"""
    picker_params = {'Algorithm': 'first', 'ChoiceF': first_id}
    app = ReferenceRepSetPicker(params=picker_params)
    app(self.tmp_seq_filepath,
        self.tmp_otu_filepath,
        self.ref_seq_filepath,
        result_path=self.result_filepath)

    # parse what the picker wrote
    with open(self.result_filepath) as result_f:
        observed_records = parse_fasta(result_f)
        actual = SequenceCollection.from_fasta_records(observed_records, DNA)

    # parse the reference result text
    expected_lines = rep_seqs_reference_result_file_exp.split('\n')
    expected = SequenceCollection.from_fasta_records(
        parse_fasta(expected_lines), DNA)

    # we don't care about order in the results
    self.assertEqual(set(actual), set(expected))
def test_ne(self):
    """!= is False for a collection and itself, True for differing ones
    """
    class FakeSequenceCollection(SequenceCollection):
        pass

    self.assertFalse(self.s1 != self.s1)
    self.assertTrue(self.s1 != self.s2)

    # SequenceCollections with different number of sequences are not equal
    fewer_seqs = SequenceCollection([self.d1])
    self.assertTrue(self.s1 != fewer_seqs)

    # SequenceCollections of different types are not equal
    same_seqs = [self.d1, self.d2]
    self.assertTrue(self.s1 != FakeSequenceCollection(same_seqs))
    self.assertTrue(self.s1 != Alignment([self.d1, self.d2]))

    # SequenceCollections with different sequences are not equal
    other_seqs = SequenceCollection([self.d1, self.r1])
    self.assertTrue(self.s1 != other_seqs)
def test_distances(self):
    """distances builds a DistanceMatrix from the supplied function
    """
    ids = ['d1', 'd2']
    collection = SequenceCollection([DNA("ACGT", "d1"), DNA("ACGG", "d2")])

    # hamming: the two sequences give an off-diagonal value of 0.25
    hamming_expected = DistanceMatrix([[0, 0.25], [0.25, 0]], ids)
    self.assertEqual(collection.distances(hamming), hamming_expected)

    # alt distance function provided
    def dumb_distance(s1, s2):
        return 42.

    dumb_expected = DistanceMatrix([[0, 42.], [42., 0]], ids)
    self.assertEqual(collection.distances(dumb_distance), dumb_expected)
def setUp(self):
    """Create the temp input/template/output files and expected results
    """
    def temp_file(suffix, contents=""):
        # Make a temp file with the shared test prefix, write contents
        # to it (nothing by default, which just touches it) and return
        # its path.
        fd, path = mkstemp(prefix="PyNastAlignerTests_", suffix=suffix)
        close(fd)
        with open(path, "w") as f:
            f.write(contents)
        return path

    # input sequences and the template variants
    self.pynast_test1_input_fp = temp_file(".fasta",
                                           pynast_test1_input_fasta)
    self.pynast_test1_template_fp = temp_file(
        "template.fasta", pynast_test1_template_fasta)
    self.pynast_test_template_w_dots_fp = temp_file(
        "template.fasta", pynast_test1_template_fasta.replace("-", "."))
    self.pynast_test_template_w_u_fp = temp_file(
        "template.fasta", pynast_test1_template_fasta.replace("T", "U"))
    self.pynast_test_template_w_lower_fp = temp_file(
        "template.fasta", pynast_test1_template_fasta.lower())

    # create temp file names (and touch them so we can reliably
    # clean them up)
    self.result_fp = temp_file(".fasta")
    self.failure_fp = temp_file(".fasta")
    self.log_fp = temp_file(".log")

    self._paths_to_clean_up = [
        self.pynast_test1_input_fp,
        self.result_fp,
        self.failure_fp,
        self.log_fp,
        self.pynast_test1_template_fp,
        self.pynast_test_template_w_dots_fp,
        self.pynast_test_template_w_u_fp,
        self.pynast_test_template_w_lower_fp,
    ]

    aligner_params = {
        "template_filepath": self.pynast_test1_template_fp,
        "min_len": 15,
    }
    self.pynast_test1_aligner = PyNastAligner(aligner_params)

    self.pynast_test1_expected_aln = Alignment.from_fasta_records(
        parse_fasta(pynast_test1_expected_alignment), DNA)
    self.pynast_test1_expected_fail = \
        SequenceCollection.from_fasta_records(
            parse_fasta(pynast_test1_expected_failure), DNA)
def setUp(self):
    """Build the sequences, record tuples and collections used by tests
    """
    # (identifier, sequence) records; the sequence objects below are
    # built from these same values
    self.seqs1_t = [('d1', 'GATTACA'), ('d2', 'TTG')]
    self.seqs2_t = [('r1', 'GAUUACA'), ('r2', 'UUG'),
                    ('r3', 'U-----UGCC--')]
    self.seqs3_t = self.seqs1_t + self.seqs2_t

    # DNA sequences, upper- and lower-case
    self.seqs1 = [DNASequence(seq, identifier=id_)
                  for id_, seq in self.seqs1_t]
    self.d1, self.d2 = self.seqs1
    self.seqs1_lower = [DNASequence(seq.lower(), identifier=id_)
                        for id_, seq in self.seqs1_t]
    self.d1_lower, self.d2_lower = self.seqs1_lower

    # RNA sequences
    self.seqs2 = [RNASequence(seq, identifier=id_)
                  for id_, seq in self.seqs2_t]
    self.r1, self.r2, self.r3 = self.seqs2

    # DNA sequence containing an 'X'
    self.i1 = DNASequence('GATXACA', identifier="i1")

    self.seqs3 = self.seqs1 + self.seqs2

    # the collections under test
    self.s1 = SequenceCollection(self.seqs1)
    self.s1_lower = SequenceCollection(self.seqs1_lower)
    self.s2 = SequenceCollection(self.seqs2)
    self.s3 = SequenceCollection(self.seqs3)
    self.empty = SequenceCollection([])

    self.invalid_s1 = SequenceCollection([self.i1])
def test_split_fasta_equal_num_seqs_per_file(self):
    """split_fasta funcs as expected when equal num seqs go to each file

    Three sequences split one-per-file must produce exactly three output
    files, named '<prefix>.0.fasta' through '<prefix>.2.fasta', that
    together hold the same sequences as the input.
    """
    fd, filename_prefix = mkstemp(dir=get_qiime_temp_dir(),
                                  prefix='split_fasta_tests',
                                  suffix='')
    close(fd)
    infile = ['>seq1', 'AACCTTAA', '>seq2', 'TTAACC', 'AATTAA',
              '>seq3', 'CCTT--AA']

    actual = split_fasta(infile, 1, filename_prefix)

    actual_seqs = []
    for fp in actual:
        # close each split file as soon as its lines are collected
        with open(fp) as split_f:
            actual_seqs += list(split_f)

    # clean up before asserting so a failure leaves nothing behind; the
    # placeholder file created by mkstemp is removed along with the splits
    remove_files(list(actual) + [filename_prefix])

    expected = ['%s.%d.fasta' % (filename_prefix, i) for i in range(3)]

    # list of file paths is as expected
    self.assertEqual(actual, expected)
    # the split files hold the same sequences as the input
    self.assertEqual(
        SequenceCollection.from_fasta_records(parse_fasta(infile), DNA),
        SequenceCollection.from_fasta_records(parse_fasta(actual_seqs),
                                              DNA))
def test_filter_aln_by_otus(self):
    """filter_aln_by_otus: returns the expected pair of sequence lists"""
    self.sample_to_extract = "SampleA,SampleB"

    # expected first and second return values, as (id, sequence) tuples
    exp1 = [("SampleA", "AAAAAAAAAAAAAAA")]
    exp2 = [("SampleB", "CCCCCCC"),
            ("SampleC", "GGGGGGGGGGGGGG")]

    aln = SequenceCollection.from_fasta_records(self.aln, DNA)
    obs1, obs2 = filter_aln_by_otus(aln, self.prefs)

    self.assertEqual(obs1, exp1)
    self.assertEqual(obs2, exp2)
def test_filter_aln_by_otus(self):
    """filter_aln_by_otus: returns the expected pair of sequence lists"""
    self.sample_to_extract = 'SampleA,SampleB'

    # expected first and second return values, as (id, sequence) tuples
    exp1 = [('SampleA', 'AAAAAAAAAAAAAAA')]
    exp2 = [('SampleB', 'CCCCCCC'),
            ('SampleC', 'GGGGGGGGGGGGGG')]

    aln = SequenceCollection.from_fasta_records(self.aln, DNA)
    obs1, obs2 = filter_aln_by_otus(aln, self.prefs)

    self.assertEqual(obs1, exp1)
    self.assertEqual(obs2, exp2)
def main():
    """Parse the command line, load the inputs and run filter_samples

    Reads the fasta at opts.input_fasta_fp and the otu map at
    opts.otu_map_fp, derives the output base name from the fasta file
    name (everything before its first '.'), makes sure the output
    directory exists (defaulting to './'), and calls filter_samples.

    Raises ValueError if no samples to extract were given, and lets the
    OSError propagate if the output directory cannot be created.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    data = {}

    fasta_file = opts.input_fasta_fp

    # load the input alignment
    with open(fasta_file) as fasta_f:
        data['aln'] = SequenceCollection.from_fasta_records(
            parse_fasta(fasta_f), DNA)

    # Load the otu file
    otu_path = opts.otu_map_fp
    with open(otu_path, 'U') as otu_f:
        data['otus'] = fields_to_dict(otu_f)

    # Determine which samples to extract from representative seqs
    # and from otus file
    if opts.samples_to_extract:
        prefs = process_extract_samples(opts.samples_to_extract)
    else:
        # previously prefs was left unbound and the failure surfaced
        # later as an UnboundLocalError
        raise ValueError('No samples to extract were specified.')

    filepath = opts.input_fasta_fp
    filename = filepath.strip().split('/')[-1]
    filename = filename.split('.')[0]

    if opts.output_dir:
        dir_path = opts.output_dir
        if not os.path.exists(dir_path):
            try:
                os.mkdir(dir_path)
            except OSError:
                # tolerate the directory having appeared in the meantime;
                # any other failure is reported instead of swallowed
                if not os.path.isdir(dir_path):
                    raise
    else:
        dir_path = './'

    try:
        action = filter_samples
    except NameError:
        action = None
    # Place this outside try/except so we don't mask NameError in action
    if action:
        action(prefs, data, dir_path, filename)
def test_call_pynast_test1_file_output_alt_params(self):
    """PyNastAligner writes correct output files when no seqs align
    """
    # min_len of 1000 is chosen so that none of the input seqs align
    params = {"template_filepath": self.pynast_test1_template_fp,
              "min_len": 1000}
    aligner = PyNastAligner(params)

    output_paths = {"result_path": self.result_fp,
                    "log_path": self.log_fp,
                    "failure_path": self.failure_fp}
    actual = aligner(self.pynast_test1_input_fp, **output_paths)

    self.assertTrue(actual is None,
                    "Result should be None when result path provided.")
    self.assertEqual(getsize(self.result_fp), 0,
                     "No alignable seqs should result in an empty file.")

    # all seqs reported to fail
    with open(self.failure_fp) as failure_f:
        failed_records = parse_fasta(failure_f)
        actual_fail = SequenceCollection.from_fasta_records(
            failed_records, DNA)
    self.assertEqual(actual_fail.sequence_count(), 3)
def test_call_pynast_test1_file_output(self):
    """PyNastAligner writes correct output files for pynast_test1 seqs
    """
    # do not collect results; check output files instead
    output_paths = {"result_path": self.result_fp,
                    "log_path": self.log_fp,
                    "failure_path": self.failure_fp}
    actual = self.pynast_test1_aligner(self.pynast_test1_input_fp,
                                       **output_paths)

    self.assertTrue(actual is None,
                    "Result should be None when result path provided.")

    # the result file parses to the expected alignment
    expected_aln = self.pynast_test1_expected_aln
    with open(self.result_fp) as result_f:
        aligned_records = parse_fasta(result_f)
        actual_aln = Alignment.from_fasta_records(aligned_records, DNA)
    self.assertEqual(actual_aln, expected_aln)

    # the failure file matches the expected failures, compared as fasta
    with open(self.failure_fp) as failure_f:
        failed_records = parse_fasta(failure_f)
        actual_fail = SequenceCollection.from_fasta_records(
            failed_records, DNA)
    self.assertEqual(actual_fail.to_fasta(),
                     self.pynast_test1_expected_fail.to_fasta())
def test_call_pynast_test1_file_output(self):
    """PyNastAligner writes correct output files for pynast_test1 seqs
    """
    # do not collect results; check output files instead
    output_paths = {"result_path": self.result_fp,
                    "log_path": self.log_fp,
                    "failure_path": self.failure_fp}
    actual = self.pynast_test1_aligner(self.pynast_test1_input_fp,
                                       **output_paths)

    self.assertTrue(actual is None,
                    "Result should be None when result path provided.")

    # the result file parses to the expected alignment
    expected_aln = self.pynast_test1_expected_aln
    with open(self.result_fp) as result_f:
        aligned_records = parse_fasta(result_f)
        actual_aln = Alignment.from_fasta_records(aligned_records, DNA)
    self.assertEqual(actual_aln, expected_aln)

    # the failure file matches the expected failures, compared as fasta
    with open(self.failure_fp) as failure_f:
        failed_records = parse_fasta(failure_f)
        actual_fail = SequenceCollection.from_fasta_records(
            failed_records, DNA)
    self.assertEqual(actual_fail.to_fasta(),
                     self.pynast_test1_expected_fail.to_fasta())
def test_call_pynast_test1_file_output_alt_params(self):
    """PyNastAligner writes correct output files when no seqs align
    """
    # min_len of 1000 is chosen so that none of the input seqs align
    params = {'template_filepath': self.pynast_test1_template_fp,
              'min_len': 1000}
    aligner = PyNastAligner(params)

    output_paths = {'result_path': self.result_fp,
                    'log_path': self.log_fp,
                    'failure_path': self.failure_fp}
    actual = aligner(self.pynast_test1_input_fp, **output_paths)

    self.assertTrue(actual is None,
                    "Result should be None when result path provided.")
    self.assertEqual(getsize(self.result_fp), 0,
                     "No alignable seqs should result in an empty file.")

    # all seqs reported to fail
    with open(self.failure_fp) as failure_f:
        failed_records = parse_fasta(failure_f)
        actual_fail = SequenceCollection.from_fasta_records(
            failed_records, DNA)
    self.assertEqual(actual_fail.sequence_count(), 3)
def test_from_fasta_records(self):
    """from_fasta_records accepts (id, seq) tuples for each seq type
    """
    # Construction alone is the check: any exception fails the test.
    cases = [(self.seqs1_t, DNASequence),
             (self.seqs2_t, RNASequence),
             (self.seqs3_t, NucleotideSequence)]
    for records, seq_type in cases:
        SequenceCollection.from_fasta_records(records, seq_type)
class SequenceCollectionTests(TestCase):

    """Tests of the SequenceCollection class

    setUp builds small DNA and RNA fixtures (s1, s2, s3, empty, plus a
    lower-case and an invalid collection); each test method exercises
    one method or operator of SequenceCollection against them.
    """

    def setUp(self):
        """Initialize values to be used in tests
        """
        # DNA sequences, in upper- and lower-case forms
        self.d1 = DNASequence('GATTACA', identifier="d1")
        self.d2 = DNASequence('TTG', identifier="d2")
        self.d1_lower = DNASequence('gattaca', identifier="d1")
        self.d2_lower = DNASequence('ttg', identifier="d2")
        # RNA sequences; r3 contains '-' characters
        self.r1 = RNASequence('GAUUACA', identifier="r1")
        self.r2 = RNASequence('UUG', identifier="r2")
        self.r3 = RNASequence('U-----UGCC--', identifier="r3")

        # DNA sequence containing an 'X'; test_is_valid expects the
        # collection built from it to be invalid
        self.i1 = DNASequence('GATXACA', identifier="i1")

        self.seqs1 = [self.d1, self.d2]
        self.seqs1_lower = [self.d1_lower, self.d2_lower]
        self.seqs2 = [self.r1, self.r2, self.r3]
        self.seqs3 = self.seqs1 + self.seqs2

        # the same data as (identifier, sequence) tuples
        self.seqs1_t = [('d1', 'GATTACA'), ('d2', 'TTG')]
        self.seqs2_t = [('r1', 'GAUUACA'), ('r2', 'UUG'),
                        ('r3', 'U-----UGCC--')]
        self.seqs3_t = self.seqs1_t + self.seqs2_t

        self.s1 = SequenceCollection(self.seqs1)
        self.s1_lower = SequenceCollection(self.seqs1_lower)
        self.s2 = SequenceCollection(self.seqs2)
        self.s3 = SequenceCollection(self.seqs3)
        self.empty = SequenceCollection([])

        self.invalid_s1 = SequenceCollection([self.i1])

    def test_init(self):
        """Initialization functions as expected with varied input types
        """
        # construction alone is the check: any exception fails the test
        SequenceCollection(self.seqs1)
        SequenceCollection(self.seqs2)
        SequenceCollection(self.seqs3)
        SequenceCollection([])

    def test_init_fail(self):
        """initialization with sequences with overlapping identifiers fails
        """
        # the same sequence twice means the identifier 'd1' is duplicated
        s1 = [self.d1, self.d1]
        self.assertRaises(SequenceCollectionError, SequenceCollection, s1)

    def test_init_validate(self):
        """initialization with validation functions as expected
        """
        SequenceCollection(self.seqs1, validate=True)
        SequenceCollection(self.seqs1, validate=True)
        # self.invalid_s1 can't be validated, so construction must raise
        self.assertRaises(SequenceCollectionError, SequenceCollection,
                          self.invalid_s1, validate=True)

    def test_from_fasta_records(self):
        """Initialization from list of tuples functions as expected
        """
        # construction alone is the check: any exception fails the test
        SequenceCollection.from_fasta_records(self.seqs1_t, DNASequence)
        SequenceCollection.from_fasta_records(self.seqs2_t, RNASequence)
        SequenceCollection.from_fasta_records(self.seqs3_t, NucleotideSequence)

    def test_contains(self):
        """in operator functions as expected
        """
        # membership is tested by sequence identifier
        self.assertTrue('d1' in self.s1)
        self.assertTrue('r2' in self.s2)
        self.assertFalse('r2' in self.s1)

    def test_eq(self):
        """equality operator functions as expected
        """
        self.assertTrue(self.s1 == self.s1)
        self.assertFalse(self.s1 == self.s2)

        # different objects can be equal
        self.assertTrue(self.s1 == SequenceCollection([self.d1, self.d2]))
        self.assertTrue(SequenceCollection([self.d1, self.d2]) == self.s1)

        # SequenceCollections with different number of sequences are not equal
        self.assertFalse(self.s1 == SequenceCollection([self.d1]))

        class FakeSequenceCollection(SequenceCollection):
            pass
        # SequenceCollections of different types are not equal
        self.assertFalse(self.s1 == FakeSequenceCollection([self.d1, self.d2]))
        self.assertFalse(self.s1 == Alignment([self.d1, self.d2]))

        # SequenceCollections with different sequences are not equal
        self.assertFalse(self.s1 == SequenceCollection([self.d1, self.r1]))

    def test_getitem(self):
        """getitem functions as expected
        """
        self.assertEqual(self.s1[0], self.d1)
        self.assertEqual(self.s1[1], self.d2)
        self.assertEqual(self.s2[0], self.r1)
        self.assertEqual(self.s2[1], self.r2)

        # on an empty collection an int index raises IndexError and a
        # str key raises KeyError
        self.assertRaises(IndexError, self.empty.__getitem__, 0)
        self.assertRaises(KeyError, self.empty.__getitem__, '0')

    def test_iter(self):
        """iter functions as expected
        """
        s1_iter = iter(self.s1)
        count = 0
        for actual, expected in zip(s1_iter, self.seqs1):
            count += 1
            self.assertEqual(actual, expected)
        # every sequence was visited and the iterator is now exhausted
        self.assertEqual(count, len(self.seqs1))
        self.assertRaises(StopIteration, lambda: next(s1_iter))

    def test_len(self):
        """len functions as expected
        """
        self.assertEqual(len(self.s1), 2)
        self.assertEqual(len(self.s2), 3)
        self.assertEqual(len(self.s3), 5)
        self.assertEqual(len(self.empty), 0)

    def test_ne(self):
        """inequality operator functions as expected
        """
        self.assertFalse(self.s1 != self.s1)
        self.assertTrue(self.s1 != self.s2)

        # SequenceCollections with different number of sequences are not equal
        self.assertTrue(self.s1 != SequenceCollection([self.d1]))

        class FakeSequenceCollection(SequenceCollection):
            pass
        # SequenceCollections of different types are not equal
        self.assertTrue(self.s1 != FakeSequenceCollection([self.d1, self.d2]))
        self.assertTrue(self.s1 != Alignment([self.d1, self.d2]))

        # SequenceCollections with different sequences are not equal
        self.assertTrue(self.s1 != SequenceCollection([self.d1, self.r1]))

    def test_repr(self):
        """repr functions as expected
        """
        # repr reports the sequence count and the mean +/- std of length
        self.assertEqual(repr(self.s1),
                         "<SequenceCollection: n=2; "
                         "mean +/- std length=5.00 +/- 2.00>")
        self.assertEqual(repr(self.s2),
                         "<SequenceCollection: n=3; "
                         "mean +/- std length=7.33 +/- 3.68>")
        self.assertEqual(repr(self.s3),
                         "<SequenceCollection: n=5; "
                         "mean +/- std length=6.40 +/- 3.32>")
        self.assertEqual(repr(self.empty),
                         "<SequenceCollection: n=0; "
                         "mean +/- std length=0.00 +/- 0.00>")

    def test_reversed(self):
        """reversed functions as expected
        """
        s1_iter = reversed(self.s1)
        count = 0
        for actual, expected in zip(s1_iter, self.seqs1[::-1]):
            count += 1
            self.assertEqual(actual, expected)
        # every sequence was visited and the iterator is now exhausted
        self.assertEqual(count, len(self.seqs1))
        self.assertRaises(StopIteration, lambda: next(s1_iter))

    def test_k_word_frequencies(self):
        """k_word_frequencies functions as expected
        """
        # k=1: one frequency dict per sequence, in collection order
        expected1 = defaultdict(int)
        expected1['A'] = 3/7.
        expected1['C'] = 1/7.
        expected1['G'] = 1/7.
        expected1['T'] = 2/7.
        expected2 = defaultdict(int)
        expected2['G'] = 1/3.
        expected2['T'] = 2/3.
        self.assertEqual(self.s1.k_word_frequencies(k=1),
                         [expected1, expected2])

        # k=3 without overlap: 'GATTACA' gives the words 'GAT' and 'TAC'
        expected1 = defaultdict(int)
        expected1['GAT'] = 1/2.
        expected1['TAC'] = 1/2.
        expected2 = defaultdict(int)
        expected2['TTG'] = 1/1.
        self.assertEqual(self.s1.k_word_frequencies(k=3, overlapping=False),
                         [expected1, expected2])

        self.assertEqual(self.empty.k_word_frequencies(k=1), [])

    def test_str(self):
        """str functions as expected
        """
        # str output is fasta-formatted text
        exp1 = ">d1\nGATTACA\n>d2\nTTG\n"
        self.assertEqual(str(self.s1), exp1)
        exp2 = ">r1\nGAUUACA\n>r2\nUUG\n>r3\nU-----UGCC--\n"
        self.assertEqual(str(self.s2), exp2)
        exp4 = ""
        self.assertEqual(str(self.empty), exp4)

    def test_distribution_stats(self):
        """distribution_stats functions as expected
        """
        # the returned values are checked as (count, mean, std), matching
        # the figures asserted in test_repr
        actual1 = self.s1.distribution_stats()
        self.assertEqual(actual1[0], 2)
        self.assertAlmostEqual(actual1[1], 5.0, 3)
        self.assertAlmostEqual(actual1[2], 2.0, 3)

        actual2 = self.s2.distribution_stats()
        self.assertEqual(actual2[0], 3)
        self.assertAlmostEqual(actual2[1], 7.333, 3)
        self.assertAlmostEqual(actual2[2], 3.682, 3)

        actual3 = self.s3.distribution_stats()
        self.assertEqual(actual3[0], 5)
        self.assertAlmostEqual(actual3[1], 6.400, 3)
        self.assertAlmostEqual(actual3[2], 3.323, 3)

        actual4 = self.empty.distribution_stats()
        self.assertEqual(actual4[0], 0)
        self.assertEqual(actual4[1], 0.0)
        self.assertEqual(actual4[2], 0.0)

    def test_degap(self):
        """degap functions as expected
        """
        # expected: the RNA records with '.' and '-' characters stripped
        expected = [(id_, seq.replace('.', '').replace('-', ''))
                    for id_, seq in self.seqs2_t]
        expected = SequenceCollection.from_fasta_records(expected, RNASequence)
        actual = self.s2.degap()
        self.assertEqual(actual, expected)

    def test_get_seq(self):
        """get_seq functions as expected
        """
        self.assertEqual(self.s1.get_seq('d1'), self.d1)
        self.assertEqual(self.s1.get_seq('d2'), self.d2)

    def test_identifiers(self):
        """identifiers functions as expected
        """
        self.assertEqual(self.s1.identifiers(), ['d1', 'd2'])
        self.assertEqual(self.s2.identifiers(), ['r1', 'r2', 'r3'])
        self.assertEqual(self.s3.identifiers(),
                         ['d1', 'd2', 'r1', 'r2', 'r3'])
        self.assertEqual(self.empty.identifiers(), [])

    def test_int_map(self):
        """int_map functions as expected
        """
        # int_map returns two dicts keyed by 1-based position strings:
        # key -> sequence and key -> original identifier
        expected1 = {"1": self.d1, "2": self.d2}
        expected2 = {"1": "d1", "2": "d2"}
        self.assertEqual(self.s1.int_map(), (expected1, expected2))

        # with a prefix, each key is the prefix followed by the position
        expected1 = {"h-1": self.d1, "h-2": self.d2}
        expected2 = {"h-1": "d1", "h-2": "d2"}
        self.assertEqual(self.s1.int_map(prefix='h-'), (expected1, expected2))

    def test_is_empty(self):
        """is_empty functions as expected
        """
        self.assertFalse(self.s1.is_empty())
        self.assertFalse(self.s2.is_empty())
        self.assertFalse(self.s3.is_empty())

        self.assertTrue(self.empty.is_empty())

    def test_is_valid(self):
        """is_valid functions as expected
        """
        self.assertTrue(self.s1.is_valid())
        self.assertTrue(self.s2.is_valid())
        self.assertTrue(self.s3.is_valid())
        self.assertTrue(self.empty.is_valid())

        self.assertFalse(self.invalid_s1.is_valid())

    def test_iteritems(self):
        """iteritems functions as expected
        """
        # iteritems yields (identifier, sequence) pairs in collection order
        self.assertEqual(list(self.s1.iteritems()),
                         [(s.identifier, s) for s in self.s1])

    def test_lower(self):
        """lower functions as expected
        """
        self.assertEqual(self.s1.lower(), self.s1_lower)

    def test_sequence_count(self):
        """sequence_count functions as expected
        """
        self.assertEqual(self.s1.sequence_count(), 2)
        self.assertEqual(self.s2.sequence_count(), 3)
        self.assertEqual(self.s3.sequence_count(), 5)
        self.assertEqual(self.empty.sequence_count(), 0)

    def test_sequence_lengths(self):
        """sequence_lengths functions as expected
        """
        self.assertEqual(self.s1.sequence_lengths(), [7, 3])
        self.assertEqual(self.s2.sequence_lengths(), [7, 3, 12])
        self.assertEqual(self.s3.sequence_lengths(), [7, 3, 7, 3, 12])
        self.assertEqual(self.empty.sequence_lengths(), [])

    def test_to_fasta(self):
        """to_fasta functions as expected
        """
        exp1 = ">d1\nGATTACA\n>d2\nTTG\n"
        self.assertEqual(self.s1.to_fasta(), exp1)
        exp2 = ">r1\nGAUUACA\n>r2\nUUG\n>r3\nU-----UGCC--\n"
        self.assertEqual(self.s2.to_fasta(), exp2)

    def test_upper(self):
        """upper functions as expected
        """
        self.assertEqual(self.s1_lower.upper(), self.s1)
"""AY800210\tArchaea;Euryarchaeota;Halobacteriales;uncultured EU883771\tArchaea;Euryarchaeota;Methanomicrobiales;Methanomicrobium et rel. EF503699\tArchaea;Crenarchaeota;uncultured;uncultured DQ260310\tArchaea;Euryarchaeota;Methanobacteriales;Methanobacterium EF503697\tArchaea;Crenarchaeota;uncultured;uncultured""" blast_test_seqs = SequenceCollection.from_fasta_records([ ('s1', 'TTCCGGTTGATCCTGCCGGACCCGACTGCTATCCGGATGCGACTAAGCCATGCTAGTCTAACGGATCTTCGGATCCGTGGCATACCGCTCTGTAACACGTAGATAACCTACCCTGAGGTCGGGGAAACTCCCGGGAAACTGGGCCTAATCCCCGATAGATAATTTGTACTGGAATGTCTTTTTATTGAAACCTCCGAGGCCTCAGGATGGGTCTGCGCCAGATTATGGTCGTAGGTGGGGTAACGGCCCACCTAGCCTTTGATCTGTACCGGACATGAGAGTGTGTGCCGGGAGATGGCCACTGAGACAAGGGGCCAGGCCCTACGGGGCGCAGCAGGCGCGAAAACTTCACAATGCCCGCAAGGGTGATGAGGGTATCCGAGTGCTACCTTAGCCGGTAGCTTTTATTCAGTGTAAATAGCTAGATGAATAAGGGGAGGGCAAGGCTGGTGCCAGCCGCCGCGGTAAAACCAGCTCCCGAGTGGTCGGGATTTTTATTGGGCCTAAAGCGTCCGTAGCCGGGCGTGCAAGTCATTGGTTAAATATCGGGTCTTAAGCCCGAACCTGCTAGTGATACTACACGCCTTGGGACCGGAAGAGGCAAATGGTACGTTGAGGGTAGGGGTGAAATCCTGTAATCCCCAACGGACCACCGGTGGCGAAGCTTGTTCAGTCATGAACAACTCTACACAAGGCGATTTGCTGGGACGGATCCGACGGTGAGGGACGAAACCCAGGGGAGCGAGCGGGATTAGATACCCCGGTAGTCCTGGGCGTAAACGATGCGAACTAGGTGTTGGCGGAGCCACGAGCTCTGTCGGTGCCGAAGCGAAGGCGTTAAGTTCGCCGCCAGGGGAGTACGGCCGCAAGGCTGAAACTTAAAGGAATTGGCGGGGGAGCAC' ), ('s2', 
'TGGCGTACGGCTCAGTAACACGTGGATAACTTACCCTTAGGACTGGGATAACTCTGGGAAACTGGGGATAATACTGGATATTAGGCTATGCCTGGAATGGTTTGCCTTTGAAATGTTTTTTTTCGCCTAAGGATAGGTCTGCGGCTGATTAGGTCGTTGGTGGGGTAATGGCCCACCAAGCCGATGATCGGTACGGGTTGTGAGAGCAAGGGCCCGGAGATGGAACCTGAGACAAGGTTCCAGACCCTACGGGGTGCAGCAGGCGCGAAACCTCCGCAATGTACGAAAGTGCGACGGGGGGATCCCAAGTGTTATGCTTTTTTGTATGACTTTTCATTAGTGTAAAAAGCTTTTAGAATAAGAGCTGGGCAAGACCGGTGCCAGCCGCCGCGGTAACACCGGCAGCTCGAGTGGTGACCACTTTTATTGGGCTTAAAGCGTTCGTAGCTTGATTTTTAAGTCTCTTGGGAAATCTCACGGCTTAACTGTGAGGCGTCTAAGAGATACTGGGAATCTAGGGACCGGGAGAGGTAAGAGGTACTTCAGGGGTAGAAGTGAAATTCTGTAATCCTTGAGGGACCACCGATGGCGAAGGCATCTTACCAGAACGGCTTCGACAGTGAGGAACGAAAGCTGGGGGAGCGAACGGGATTAGATACCCCGGTAGTCCCAGCCGTAAACTATGCGCGTTAGGTGTGCCTGTAACTACGAGTTACCGGGGTGCCGAAGTGAAAACGTGAAACGTGCCGCCTGGGAAGTACGGTCGCAAGGCTGAAACTTAAAGGAATTGGCGGGGGAGCACCACAACGGGTGGAGCCTGCGGTTTAATTGGACTCAACGCCGGGCAGCTCACCGGATAGGACAGCGGAATGATAGCCGGGCTGAAGACCTTGCTTGACCAGCTGAGA' ), ('s3', 'AAGAATGGGGATAGCATGCGAGTCACGCCGCAATGTGTGGCATACGGCTCAGTAACACGTAGTCAACATGCCCAGAGGACGTGGACACCTCGGGAAACTGAGGATAAACCGCGATAGGCCACTACTTCTGGAATGAGCCATGACCCAAATCTATATGGCCTTTGGATTGGACTGCGGCCGATCAGGCTGTTGGTGAGGTAATGGCCCACCAAACCTGTAACCGGTACGGGCTTTGAGAGAAGGAGCCCGGAGATGGGCACTGAGACAAGGGCCCAGGCCCTATGGGGCGCAGCAGGCACGAAACCTCTGCAATAGGCGAAAGCTTGACAGGGTTACTCTGAGTGATGCCCGCTAAGGGTATCTTTTGGCACCTCTAAAAATGGTGCAGAATAAGGGGTGGGCAAGTCTGGTGTCAGCCGCCGCGGTAATACCAGCACCCCGAGTTGTCGGGACGATTATTGGGCCTAAAGCATCCGTAGCCTGTTCTGCAAGTCCTCCGTTAAATCCACCCGCTTAACGGATGGGCTGCGGAGGATACTGCAGAGCTAGGAGGCGGGAGAGGCAAACGGTACTCAGTGGGTAGGGGTAAAATCCTTTGATCTACTGAAGACCACCAGTGGTGAAGGCGGTTCGCCAGAACGCGCTCGAACGGTGAGGATGAAAGCTGGGGGAGCAAACCGGAATAGATACCCGAGTAATCCCAACTGTAAACGATGGCAACTCGGGGATGGGTTGGCCTCCAACCAACCCCATGGCCGCAGGGAAGCCGTTTAGCTCTCCCGCCTGGGGAATACGGTCCGCAGAATTGAACCTTAAAGGAATTTGGCGGGGAACCCCCACAAGGGGGAAAACCGTGCGGTTCAATTGGAATCCACCCCCCGGAAACTTTACCCGGGCGCG' ), ('s4', 
'GATACCCCCGGAAACTGGGGATTATACCGGATATGTGGGGCTGCCTGGAATGGTACCTCATTGAAATGCTCCCGCGCCTAAAGATGGATCTGCCGCAGAATAAGTAGTTTGCGGGGTAAATGGCCACCCAGCCAGTAATCCGTACCGGTTGTGAAAACCAGAACCCCGAGATGGAAACTGAAACAAAGGTTCAAGGCCTACCGGGCACAACAAGCGCCAAAACTCCGCCATGCGAGCCATCGCGACGGGGGAAAACCAAGTACCACTCCTAACGGGGTGGTTTTTCCGAAGTGGAAAAAGCCTCCAGGAATAAGAACCTGGGCCAGAACCGTGGCCAGCCGCCGCCGTTACACCCGCCAGCTCGAGTTGTTGGCCGGTTTTATTGGGGCCTAAAGCCGGTCCGTAGCCCGTTTTGATAAGGTCTCTCTGGTGAAATTCTACAGCTTAACCTGTGGGAATTGCTGGAGGATACTATTCAAGCTTGAAGCCGGGAGAAGCCTGGAAGTACTCCCGGGGGTAAGGGGTGAAATTCTATTATCCCCGGAAGACCAACTGGTGCCGAAGCGGTCCAGCCTGGAACCGAACTTGACCGTGAGTTACGAAAAGCCAAGGGGCGCGGACCGGAATAAAATAACCAGGGTAGTCCTGGCCGTAAACGATGTGAACTTGGTGGTGGGAATGGCTTCGAACTGCCCAATTGCCGAAAGGAAGCTGTAAATTCACCCGCCTTGGAAGTACGGTCGCAAGACTGGAACCTAAAAGGAATTGGCGGGGGGACACCACAACGCGTGGAGCCTGGCGGTTTTATTGGGATTCCACGCAGACATCTCACTCAGGGGCGACAGCAGAAATGATGGGCAGGTTGATGACCTTGCTTGACAAGCTGAAAAGGAGGTGCAT' ), ('s5', 'TAAAATGACTAGCCTGCGAGTCACGCCGTAAGGCGTGGCATACAGGCTCAGTAACACGTAGTCAACATGCCCAAAGGACGTGGATAACCTCGGGAAACTGAGGATAAACCGCGATAGGCCAAGGTTTCTGGAATGAGCTATGGCCGAAATCTATATGGCCTTTGGATTGGACTGCGGCCGATCAGGCTGTTGGTGAGGTAATGGCCCACCAAACCTGTAACCGGTACGGGCTTTGAGAGAAGTAGCCCGGAGATGGGCACTGAGACAAGGGCCCAGGCCCTATGGGGCGCAGCAGGCGCGAAACCTCTGCAATAGGCGAAAGCCTGACAGGGTTACTCTGAGTGATGCCCGCTAAGGGTATCTTTTGGCACCTCTAAAAATGGTGCAGAATAAGGGGTGGGCAAGTCTGGTGTCAGCCGCCGCGGTAATACCAGCACCCCGAGTTGTCGGGACGATTATTGGGCCTAAAGCATCCGTAGCCTGTTCTGCAAGTCCTCCGTTAAATCCACCTGCTCAACGGATGGGCTGCGGAGGATACCGCAGAGCTAGGAGGCGGGAGAGGCAAACGGTACTCAGTGGGTAGGGGTAAAATCCATTGATCTACTGAAGACCACCAGTGGCGAAGGCGGTTTGCCAGAACGCGCTCGACGGTGAGGGATGAAAGCTGGGGGAGCAAACCGGATTAGATACCCGGGGTAGTCCCAGCTGTAAACGGATGCAGACTCGGGTGATGGGGTTGGCTTCCGGCCCAACCCCAATTGCCCCCAGGCGAAGCCCGTTAAGATCTTGCCGCCCTGTCAGATGTCAGGGCCGCCAATACTCGAAACCTTAAAAGGAAATTGGGCGCGGGAAAAGTCACCAAAAGGGGGTTGAAACCCTGCGGGTTATATATTGTAAACC' ), ('s6', 
'ATAGTAGGTGATTGCGAAGACCGCGGAACCGGGACCTAGCACCCAGCCTGTACCGAGGGATGGGGAGCTGTGGCGGTCCACCGACGACCCTTTGTGACAGCCGATTCCTACAATCCCAGCAACTGCAATGATCCACTCTAGTCGGCATAACCGGGAATCGTTAACCTGGTAGGGTTCTCTACGTCTGAGTCTACAGCCCAGAGCAGTCAGGCTACTATACGGTTTGCTGCATTGCATAGGCATCGGTCGCGGGCACTCCTCGCGGTTTCAGCTAGGGTTTAAATGGAGGGTCGCTGCATGAGTATGCAAATAGTGCCACTGCTCTGATACAGAGAAGTGTTGATATGACACCTAAGACCTGGTCACAGTTTTAACCTGCCTACGCACACCAGTGTGCTATTGATTAACGATATCGGTAGACACGACCTTGGTAACCTGACTAACCTCATGGAAAGTGACTAGATAAATGGACCGGAGCCAACTTTCACCCGGAAAACGGACCGACGAATCGTCGTAGACTACCGATCTGACAAAATAAGCACGAGGGAGCATGTTTTGCGCAGGCTAGCCTATTCCCACCTCAAGCCTCGAGAACCAAGACGCCTGATCCGGTGCTGCACGAAGGGTCGCCTCTAGGTAAGGAGAGCTGGCATCTCCAGATCCGATATTTTACCCAACCTTTGCGCGCTCAGATTGTTATAGTGAAACGATTTAAGCCTGAACGGAGTTCCGCTCCATATGTGGGTTATATATGTGAGATGTATTAACTTCCGCAGTTGTCTCTTTCGGTGCAGTACGCTTGGTATGTGTCTCAAATAATCGGTATTATAGTGATCTGAGAGGTTTTAAG' ) ], DNA) blast_reference_seqs = SequenceCollection.from_fasta_records([ ('AY800210', 'TTCCGGTTGATCCTGCCGGACCCGACTGCTATCCGGATGCGACTAAGCCATGCTAGTCTAACGGATCTTCGGATCCGTGGCATACCGCTCTGTAACACGTAGATAACCTACCCTGAGGTCGGGGAAACTCCCGGGAAACTGGGCCTAATCCCCGATAGATAATTTGTACTGGAATGTCTTTTTATTGAAACCTCCGAGGCCTCAGGATGGGTCTGCGCCAGATTATGGTCGTAGGTGGGGTAACGGCCCACCTAGCCTTTGATCTGTACCGGACATGAGAGTGTGTGCCGGGAGATGGCCACTGAGACAAGGGGCCAGGCCCTACGGGGCGCAGCAGGCGCGAAAACTTCACAATGCCCGCAAGGGTGATGAGGGTATCCGAGTGCTACCTTAGCCGGTAGCTTTTATTCAGTGTAAATAGCTAGATGAATAAGGGGAGGGCAAGGCTGGTGCCAGCCGCCGCGGTAAAACCAGCTCCCGAGTGGTCGGGATTTTTATTGGGCCTAAAGCGTCCGTAGCCGGGCGTGCAAGTCATTGGTTAAATATCGGGTCTTAAGCCCGAACCTGCTAGTGATACTACACGCCTTGGGACCGGAAGAGGCAAATGGTACGTTGAGGGTAGGGGTGAAATCCTGTAATCCCCAACGGACCACCGGTGGCGAAGCTTGTTCAGTCATGAACAACTCTACACAAGGCGATTTGCTGGGACGGATCCGACGGTGAGGGACGAAACCCAGGGGAGCGAGCGGGATTAGATACCCCGGTAGTCCTGGGCGTAAACGATGCGAACTAGGTGTTGGCGGAGCCACGAGCTCTGTCGGTGCCGAAGCGAAGGCGTTAAGTTCGCCGCCAGGGGAGTACGGCCGCAAGGCTGAAACTTAAAGGAATTGGCGGGGGAGCAC' ), ('EU883771',
def __call__(self, seq_path, result_path=None, log_path=None,
             failure_path=None, cmbuild_params=None, cmalign_params=None):
    """Align the sequences in seq_path to the template alignment

    seq_path: path to the fasta file of candidate sequences
    result_path: if not None, the alignment is written to this path as
     fasta and None is returned; otherwise the Alignment is returned
    log_path: if not None, the cmbuild and cmalign parameters that were
     used are written to this path
    failure_path: not used by this method
    cmbuild_params: optional dict of cmbuild parameters; '--gapthresh'
     is always set to 1.0. The dict passed in is not modified.
    cmalign_params: optional dict of cmalign parameters; '--sub' and
     '--gapthresh' are always set. The dict passed in is not modified.

    Raises ValueError if parsing the template file raises RecordError.
    """
    log_params = []

    # load candidate sequences (the handle is closed as soon as the
    # records have been read into the dict)
    with open(seq_path, 'U') as seq_file:
        candidate_sequences = dict(parse_fasta(seq_file))

    # load template sequences; list() consumes the parser while the file
    # is still open
    try:
        with open(self.Params['template_filepath'], 'U') as template_file:
            _, template_alignment, struct = list(MinimalRfamParser(
                template_file, seq_constructor=ChangedSequence))[0]
    except RecordError:
        raise ValueError(
            "Template alignment must be in Stockholm format with "
            "corresponding secondary structure annotation when using "
            "InfernalAligner.")

    moltype = self.Params['moltype']

    # Need to make separate mapping for unaligned sequences
    unaligned = SequenceCollection.from_fasta_records(
        candidate_sequences.iteritems(), DNASequence)
    mapped_seqs, new_to_old_ids = unaligned.int_map(prefix='unaligned_')
    mapped_seq_tuples = [(k, str(v)) for k, v in mapped_seqs.iteritems()]

    # Turn on --gapthresh option in cmbuild to force alignment to full
    # model. Work on a copy so the caller's dict is left untouched.
    cmbuild_params = dict(cmbuild_params or {})
    cmbuild_params['--gapthresh'] = 1.0

    # record cmbuild parameters
    log_params.append('cmbuild parameters:')
    log_params.append(str(cmbuild_params))

    # Turn on --sub option in Infernal, since we know the unaligned
    # sequences are fragments.
    # Also turn on --gapthresh to use same gapthresh as was used to build
    # model. Again a copy, for the same reason as above.
    cmalign_params = dict(cmalign_params or {})
    cmalign_params.update({'--sub': True, '--gapthresh': 1.0})

    # record cmalign parameters
    log_params.append('cmalign parameters:')
    log_params.append(str(cmalign_params))

    # Align sequences to alignment including alignment gaps. The second
    # returned value is not needed here.
    aligned, _ = cmalign_from_alignment(aln=template_alignment,
                                        structure_string=struct,
                                        seqs=mapped_seq_tuples,
                                        moltype=moltype,
                                        include_aln=True,
                                        params=cmalign_params,
                                        cmbuild_params=cmbuild_params)

    # Pull out original sequences from full alignment.
    # Get a dict of the identifiers to sequences (note that this is a
    # cogent alignment object, hence the call to NamedSeqs)
    aligned_dict = aligned.NamedSeqs
    infernal_aligned = [(old_id, aligned_dict[new_id])
                        for new_id, old_id in new_to_old_ids.iteritems()]

    # Create an Alignment object from the (original id, sequence) pairs
    infernal_aligned = Alignment.from_fasta_records(infernal_aligned,
                                                    DNASequence)

    if log_path is not None:
        with open(log_path, 'w') as log_file:
            log_file.write('\n'.join(log_params))

    if result_path is not None:
        with open(result_path, 'w') as result_file:
            result_file.write(infernal_aligned.to_fasta())
        return None

    # Returning a local name cannot raise, so the former
    # try/except ValueError around this return was unreachable.
    return infernal_aligned
tggctcagattgaacgctggcggcaggcctaacacatgcaagtcgagcggaaacgantnntntgaaccttcggggnacgatnacggcgtcgagcggcggacgggtgagtaatgcctgggaaattgccctgatgtgggggataactattggaaacgatagctaataccgcataatgtctacggaccaaagagggggaccttcgggcctctcgcttcaggatatgcccaggtgggattagctagttggtgaggtaatggctcaccaaggcgacgatccctagctggtctgagaggatgatcagccacactggaactgag """ blast_id_to_taxonomy = \ """AY800210\tArchaea;Euryarchaeota;Halobacteriales;uncultured EU883771\tArchaea;Euryarchaeota;Methanomicrobiales;Methanomicrobium et rel. EF503699\tArchaea;Crenarchaeota;uncultured;uncultured DQ260310\tArchaea;Euryarchaeota;Methanobacteriales;Methanobacterium EF503697\tArchaea;Crenarchaeota;uncultured;uncultured""" blast_test_seqs = SequenceCollection.from_fasta_records([ ('s1', 'TTCCGGTTGATCCTGCCGGACCCGACTGCTATCCGGATGCGACTAAGCCATGCTAGTCTAACGGATCTTCGGATCCGTGGCATACCGCTCTGTAACACGTAGATAACCTACCCTGAGGTCGGGGAAACTCCCGGGAAACTGGGCCTAATCCCCGATAGATAATTTGTACTGGAATGTCTTTTTATTGAAACCTCCGAGGCCTCAGGATGGGTCTGCGCCAGATTATGGTCGTAGGTGGGGTAACGGCCCACCTAGCCTTTGATCTGTACCGGACATGAGAGTGTGTGCCGGGAGATGGCCACTGAGACAAGGGGCCAGGCCCTACGGGGCGCAGCAGGCGCGAAAACTTCACAATGCCCGCAAGGGTGATGAGGGTATCCGAGTGCTACCTTAGCCGGTAGCTTTTATTCAGTGTAAATAGCTAGATGAATAAGGGGAGGGCAAGGCTGGTGCCAGCCGCCGCGGTAAAACCAGCTCCCGAGTGGTCGGGATTTTTATTGGGCCTAAAGCGTCCGTAGCCGGGCGTGCAAGTCATTGGTTAAATATCGGGTCTTAAGCCCGAACCTGCTAGTGATACTACACGCCTTGGGACCGGAAGAGGCAAATGGTACGTTGAGGGTAGGGGTGAAATCCTGTAATCCCCAACGGACCACCGGTGGCGAAGCTTGTTCAGTCATGAACAACTCTACACAAGGCGATTTGCTGGGACGGATCCGACGGTGAGGGACGAAACCCAGGGGAGCGAGCGGGATTAGATACCCCGGTAGTCCTGGGCGTAAACGATGCGAACTAGGTGTTGGCGGAGCCACGAGCTCTGTCGGTGCCGAAGCGAAGGCGTTAAGTTCGCCGCCAGGGGAGTACGGCCGCAAGGCTGAAACTTAAAGGAATTGGCGGGGGAGCAC'), ('s2', 
'TGGCGTACGGCTCAGTAACACGTGGATAACTTACCCTTAGGACTGGGATAACTCTGGGAAACTGGGGATAATACTGGATATTAGGCTATGCCTGGAATGGTTTGCCTTTGAAATGTTTTTTTTCGCCTAAGGATAGGTCTGCGGCTGATTAGGTCGTTGGTGGGGTAATGGCCCACCAAGCCGATGATCGGTACGGGTTGTGAGAGCAAGGGCCCGGAGATGGAACCTGAGACAAGGTTCCAGACCCTACGGGGTGCAGCAGGCGCGAAACCTCCGCAATGTACGAAAGTGCGACGGGGGGATCCCAAGTGTTATGCTTTTTTGTATGACTTTTCATTAGTGTAAAAAGCTTTTAGAATAAGAGCTGGGCAAGACCGGTGCCAGCCGCCGCGGTAACACCGGCAGCTCGAGTGGTGACCACTTTTATTGGGCTTAAAGCGTTCGTAGCTTGATTTTTAAGTCTCTTGGGAAATCTCACGGCTTAACTGTGAGGCGTCTAAGAGATACTGGGAATCTAGGGACCGGGAGAGGTAAGAGGTACTTCAGGGGTAGAAGTGAAATTCTGTAATCCTTGAGGGACCACCGATGGCGAAGGCATCTTACCAGAACGGCTTCGACAGTGAGGAACGAAAGCTGGGGGAGCGAACGGGATTAGATACCCCGGTAGTCCCAGCCGTAAACTATGCGCGTTAGGTGTGCCTGTAACTACGAGTTACCGGGGTGCCGAAGTGAAAACGTGAAACGTGCCGCCTGGGAAGTACGGTCGCAAGGCTGAAACTTAAAGGAATTGGCGGGGGAGCACCACAACGGGTGGAGCCTGCGGTTTAATTGGACTCAACGCCGGGCAGCTCACCGGATAGGACAGCGGAATGATAGCCGGGCTGAAGACCTTGCTTGACCAGCTGAGA'), ('s3', 'AAGAATGGGGATAGCATGCGAGTCACGCCGCAATGTGTGGCATACGGCTCAGTAACACGTAGTCAACATGCCCAGAGGACGTGGACACCTCGGGAAACTGAGGATAAACCGCGATAGGCCACTACTTCTGGAATGAGCCATGACCCAAATCTATATGGCCTTTGGATTGGACTGCGGCCGATCAGGCTGTTGGTGAGGTAATGGCCCACCAAACCTGTAACCGGTACGGGCTTTGAGAGAAGGAGCCCGGAGATGGGCACTGAGACAAGGGCCCAGGCCCTATGGGGCGCAGCAGGCACGAAACCTCTGCAATAGGCGAAAGCTTGACAGGGTTACTCTGAGTGATGCCCGCTAAGGGTATCTTTTGGCACCTCTAAAAATGGTGCAGAATAAGGGGTGGGCAAGTCTGGTGTCAGCCGCCGCGGTAATACCAGCACCCCGAGTTGTCGGGACGATTATTGGGCCTAAAGCATCCGTAGCCTGTTCTGCAAGTCCTCCGTTAAATCCACCCGCTTAACGGATGGGCTGCGGAGGATACTGCAGAGCTAGGAGGCGGGAGAGGCAAACGGTACTCAGTGGGTAGGGGTAAAATCCTTTGATCTACTGAAGACCACCAGTGGTGAAGGCGGTTCGCCAGAACGCGCTCGAACGGTGAGGATGAAAGCTGGGGGAGCAAACCGGAATAGATACCCGAGTAATCCCAACTGTAAACGATGGCAACTCGGGGATGGGTTGGCCTCCAACCAACCCCATGGCCGCAGGGAAGCCGTTTAGCTCTCCCGCCTGGGGAATACGGTCCGCAGAATTGAACCTTAAAGGAATTTGGCGGGGAACCCCCACAAGGGGGAAAACCGTGCGGTTCAATTGGAATCCACCCCCCGGAAACTTTACCCGGGCGCG'), ('s4', 
'GATACCCCCGGAAACTGGGGATTATACCGGATATGTGGGGCTGCCTGGAATGGTACCTCATTGAAATGCTCCCGCGCCTAAAGATGGATCTGCCGCAGAATAAGTAGTTTGCGGGGTAAATGGCCACCCAGCCAGTAATCCGTACCGGTTGTGAAAACCAGAACCCCGAGATGGAAACTGAAACAAAGGTTCAAGGCCTACCGGGCACAACAAGCGCCAAAACTCCGCCATGCGAGCCATCGCGACGGGGGAAAACCAAGTACCACTCCTAACGGGGTGGTTTTTCCGAAGTGGAAAAAGCCTCCAGGAATAAGAACCTGGGCCAGAACCGTGGCCAGCCGCCGCCGTTACACCCGCCAGCTCGAGTTGTTGGCCGGTTTTATTGGGGCCTAAAGCCGGTCCGTAGCCCGTTTTGATAAGGTCTCTCTGGTGAAATTCTACAGCTTAACCTGTGGGAATTGCTGGAGGATACTATTCAAGCTTGAAGCCGGGAGAAGCCTGGAAGTACTCCCGGGGGTAAGGGGTGAAATTCTATTATCCCCGGAAGACCAACTGGTGCCGAAGCGGTCCAGCCTGGAACCGAACTTGACCGTGAGTTACGAAAAGCCAAGGGGCGCGGACCGGAATAAAATAACCAGGGTAGTCCTGGCCGTAAACGATGTGAACTTGGTGGTGGGAATGGCTTCGAACTGCCCAATTGCCGAAAGGAAGCTGTAAATTCACCCGCCTTGGAAGTACGGTCGCAAGACTGGAACCTAAAAGGAATTGGCGGGGGGACACCACAACGCGTGGAGCCTGGCGGTTTTATTGGGATTCCACGCAGACATCTCACTCAGGGGCGACAGCAGAAATGATGGGCAGGTTGATGACCTTGCTTGACAAGCTGAAAAGGAGGTGCAT'), ('s5', 'TAAAATGACTAGCCTGCGAGTCACGCCGTAAGGCGTGGCATACAGGCTCAGTAACACGTAGTCAACATGCCCAAAGGACGTGGATAACCTCGGGAAACTGAGGATAAACCGCGATAGGCCAAGGTTTCTGGAATGAGCTATGGCCGAAATCTATATGGCCTTTGGATTGGACTGCGGCCGATCAGGCTGTTGGTGAGGTAATGGCCCACCAAACCTGTAACCGGTACGGGCTTTGAGAGAAGTAGCCCGGAGATGGGCACTGAGACAAGGGCCCAGGCCCTATGGGGCGCAGCAGGCGCGAAACCTCTGCAATAGGCGAAAGCCTGACAGGGTTACTCTGAGTGATGCCCGCTAAGGGTATCTTTTGGCACCTCTAAAAATGGTGCAGAATAAGGGGTGGGCAAGTCTGGTGTCAGCCGCCGCGGTAATACCAGCACCCCGAGTTGTCGGGACGATTATTGGGCCTAAAGCATCCGTAGCCTGTTCTGCAAGTCCTCCGTTAAATCCACCTGCTCAACGGATGGGCTGCGGAGGATACCGCAGAGCTAGGAGGCGGGAGAGGCAAACGGTACTCAGTGGGTAGGGGTAAAATCCATTGATCTACTGAAGACCACCAGTGGCGAAGGCGGTTTGCCAGAACGCGCTCGACGGTGAGGGATGAAAGCTGGGGGAGCAAACCGGATTAGATACCCGGGGTAGTCCCAGCTGTAAACGGATGCAGACTCGGGTGATGGGGTTGGCTTCCGGCCCAACCCCAATTGCCCCCAGGCGAAGCCCGTTAAGATCTTGCCGCCCTGTCAGATGTCAGGGCCGCCAATACTCGAAACCTTAAAAGGAAATTGGGCGCGGGAAAAGTCACCAAAAGGGGGTTGAAACCCTGCGGGTTATATATTGTAAACC'), ('s6', 
'ATAGTAGGTGATTGCGAAGACCGCGGAACCGGGACCTAGCACCCAGCCTGTACCGAGGGATGGGGAGCTGTGGCGGTCCACCGACGACCCTTTGTGACAGCCGATTCCTACAATCCCAGCAACTGCAATGATCCACTCTAGTCGGCATAACCGGGAATCGTTAACCTGGTAGGGTTCTCTACGTCTGAGTCTACAGCCCAGAGCAGTCAGGCTACTATACGGTTTGCTGCATTGCATAGGCATCGGTCGCGGGCACTCCTCGCGGTTTCAGCTAGGGTTTAAATGGAGGGTCGCTGCATGAGTATGCAAATAGTGCCACTGCTCTGATACAGAGAAGTGTTGATATGACACCTAAGACCTGGTCACAGTTTTAACCTGCCTACGCACACCAGTGTGCTATTGATTAACGATATCGGTAGACACGACCTTGGTAACCTGACTAACCTCATGGAAAGTGACTAGATAAATGGACCGGAGCCAACTTTCACCCGGAAAACGGACCGACGAATCGTCGTAGACTACCGATCTGACAAAATAAGCACGAGGGAGCATGTTTTGCGCAGGCTAGCCTATTCCCACCTCAAGCCTCGAGAACCAAGACGCCTGATCCGGTGCTGCACGAAGGGTCGCCTCTAGGTAAGGAGAGCTGGCATCTCCAGATCCGATATTTTACCCAACCTTTGCGCGCTCAGATTGTTATAGTGAAACGATTTAAGCCTGAACGGAGTTCCGCTCCATATGTGGGTTATATATGTGAGATGTATTAACTTCCGCAGTTGTCTCTTTCGGTGCAGTACGCTTGGTATGTGTCTCAAATAATCGGTATTATAGTGATCTGAGAGGTTTTAAG')], DNA) blast_reference_seqs = SequenceCollection.from_fasta_records([ ('AY800210', 'TTCCGGTTGATCCTGCCGGACCCGACTGCTATCCGGATGCGACTAAGCCATGCTAGTCTAACGGATCTTCGGATCCGTGGCATACCGCTCTGTAACACGTAGATAACCTACCCTGAGGTCGGGGAAACTCCCGGGAAACTGGGCCTAATCCCCGATAGATAATTTGTACTGGAATGTCTTTTTATTGAAACCTCCGAGGCCTCAGGATGGGTCTGCGCCAGATTATGGTCGTAGGTGGGGTAACGGCCCACCTAGCCTTTGATCTGTACCGGACATGAGAGTGTGTGCCGGGAGATGGCCACTGAGACAAGGGGCCAGGCCCTACGGGGCGCAGCAGGCGCGAAAACTTCACAATGCCCGCAAGGGTGATGAGGGTATCCGAGTGCTACCTTAGCCGGTAGCTTTTATTCAGTGTAAATAGCTAGATGAATAAGGGGAGGGCAAGGCTGGTGCCAGCCGCCGCGGTAAAACCAGCTCCCGAGTGGTCGGGATTTTTATTGGGCCTAAAGCGTCCGTAGCCGGGCGTGCAAGTCATTGGTTAAATATCGGGTCTTAAGCCCGAACCTGCTAGTGATACTACACGCCTTGGGACCGGAAGAGGCAAATGGTACGTTGAGGGTAGGGGTGAAATCCTGTAATCCCCAACGGACCACCGGTGGCGAAGCTTGTTCAGTCATGAACAACTCTACACAAGGCGATTTGCTGGGACGGATCCGACGGTGAGGGACGAAACCCAGGGGAGCGAGCGGGATTAGATACCCCGGTAGTCCTGGGCGTAAACGATGCGAACTAGGTGTTGGCGGAGCCACGAGCTCTGTCGGTGCCGAAGCGAAGGCGTTAAGTTCGCCGCCAGGGGAGTACGGCCGCAAGGCTGAAACTTAAAGGAATTGGCGGGGGAGCAC'), ('EU883771', 
'TGGCGTACGGCTCAGTAACACGTGGATAACTTACCCTTAGGACTGGGATAACTCTGGGAAACTGGGGATAATACTGGATATTAGGCTATGCCTGGAATGGTTTGCCTTTGAAATGTTTTTTTTCGCCTAAGGATAGGTCTGCGGCTGATTAGGTCGTTGGTGGGGTAATGGCCCACCAAGCCGATGATCGGTACGGGTTGTGAGAGCAAGGGCCCGGAGATGGAACCTGAGACAAGGTTCCAGACCCTACGGGGTGCAGCAGGCGCGAAACCTCCGCAATGTACGAAAGTGCGACGGGGGGATCCCAAGTGTTATGCTTTTTTGTATGACTTTTCATTAGTGTAAAAAGCTTTTAGAATAAGAGCTGGGCAAGACCGGTGCCAGCCGCCGCGGTAACACCGGCAGCTCGAGTGGTGACCACTTTTATTGGGCTTAAAGCGTTCGTAGCTTGATTTTTAAGTCTCTTGGGAAATCTCACGGCTTAACTGTGAGGCGTCTAAGAGATACTGGGAATCTAGGGACCGGGAGAGGTAAGAGGTACTTCAGGGGTAGAAGTGAAATTCTGTAATCCTTGAGGGACCACCGATGGCGAAGGCATCTTACCAGAACGGCTTCGACAGTGAGGAACGAAAGCTGGGGGAGCGAACGGGATTAGATACCCCGGTAGTCCCAGCCGTAAACTATGCGCGTTAGGTGTGCCTGTAACTACGAGTTACCGGGGTGCCGAAGTGAAAACGTGAAACGTGCCGCCTGGGAAGTACGGTCGCAAGGCTGAAACTTAAAGGAATTGGCGGGGGAGCACCACAACGGGTGGAGCCTGCGGTTTAATTGGACTCAACGCCGGGCAGCTCACCGGATAGGACAGCGGAATGATAGCCGGGCTGAAGACCTTGCTTGACCAGCTGAGA'), ('EF503699', 'AAGAATGGGGATAGCATGCGAGTCACGCCGCAATGTGTGGCATACGGCTCAGTAACACGTAGTCAACATGCCCAGAGGACGTGGACACCTCGGGAAACTGAGGATAAACCGCGATAGGCCACTACTTCTGGAATGAGCCATGACCCAAATCTATATGGCCTTTGGATTGGACTGCGGCCGATCAGGCTGTTGGTGAGGTAATGGCCCACCAAACCTGTAACCGGTACGGGCTTTGAGAGAAGGAGCCCGGAGATGGGCACTGAGACAAGGGCCCAGGCCCTATGGGGCGCAGCAGGCACGAAACCTCTGCAATAGGCGAAAGCTTGACAGGGTTACTCTGAGTGATGCCCGCTAAGGGTATCTTTTGGCACCTCTAAAAATGGTGCAGAATAAGGGGTGGGCAAGTCTGGTGTCAGCCGCCGCGGTAATACCAGCACCCCGAGTTGTCGGGACGATTATTGGGCCTAAAGCATCCGTAGCCTGTTCTGCAAGTCCTCCGTTAAATCCACCCGCTTAACGGATGGGCTGCGGAGGATACTGCAGAGCTAGGAGGCGGGAGAGGCAAACGGTACTCAGTGGGTAGGGGTAAAATCCTTTGATCTACTGAAGACCACCAGTGGTGAAGGCGGTTCGCCAGAACGCGCTCGAACGGTGAGGATGAAAGCTGGGGGAGCAAACCGGAATAGATACCCGAGTAATCCCAACTGTAAACGATGGCAACTCGGGGATGGGTTGGCCTCCAACCAACCCCATGGCCGCAGGGAAGCCGTTTAGCTCTCCCGCCTGGGGAATACGGTCCGCAGAATTGAACCTTAAAGGAATTTGGCGGGGAACCCCCACAAGGGGGAAAACCGTGCGGTTCAATTGGAATCCACCCCCCGGAAACTTTACCCGGGCGCG'), ('DQ260310',
def setUp(self):
    """Create the temporary files and expected results used by the tests
    """
    def reserve(suffix):
        # make an empty temp file, release its descriptor, hand back its path
        handle, path = mkstemp(prefix='PyNastAlignerTests_', suffix=suffix)
        close(handle)
        return path

    def reserve_with_text(suffix, text):
        # same as reserve, then fill the file with text
        path = reserve(suffix)
        with open(path, 'w') as out:
            out.write(text)
        return path

    # candidate sequences
    self.pynast_test1_input_fp = reserve_with_text(
        '.fasta', pynast_test1_input_fasta)

    # the template, followed by variants of it: gaps written as dots,
    # T replaced by U, and all lower case
    self.pynast_test1_template_fp = reserve_with_text(
        'template.fasta', pynast_test1_template_fasta)
    self.pynast_test_template_w_dots_fp = reserve_with_text(
        'template.fasta', pynast_test1_template_fasta.replace('-', '.'))
    self.pynast_test_template_w_u_fp = reserve_with_text(
        'template.fasta', pynast_test1_template_fasta.replace('T', 'U'))
    self.pynast_test_template_w_lower_fp = reserve_with_text(
        'template.fasta', pynast_test1_template_fasta.lower())

    # create temp file names (and touch them so we can reliably
    # clean them up)
    self.result_fp = reserve('.fasta')
    open(self.result_fp, 'w').close()
    self.failure_fp = reserve('.fasta')
    open(self.failure_fp, 'w').close()
    self.log_fp = reserve('.log')
    open(self.log_fp, 'w').close()

    self._paths_to_clean_up = [
        self.pynast_test1_input_fp,
        self.result_fp,
        self.failure_fp,
        self.log_fp,
        self.pynast_test1_template_fp,
        self.pynast_test_template_w_dots_fp,
        self.pynast_test_template_w_u_fp,
        self.pynast_test_template_w_lower_fp,
    ]

    aligner_params = {
        'template_filepath': self.pynast_test1_template_fp,
        'min_len': 15,
    }
    self.pynast_test1_aligner = PyNastAligner(aligner_params)

    expected_aln_records = parse_fasta(pynast_test1_expected_alignment)
    self.pynast_test1_expected_aln = Alignment.from_fasta_records(
        expected_aln_records, DNA)

    expected_fail_records = parse_fasta(pynast_test1_expected_failure)
    self.pynast_test1_expected_fail = \
        SequenceCollection.from_fasta_records(expected_fail_records, DNA)
def setUp(self):
    """Build the sequences, tuples and collections shared by the tests
    """
    # DNA sequences, upper and lower case
    self.d1, self.d2 = (DNASequence('GATTACA', id="d1"),
                        DNASequence('TTG', id="d2"))
    self.d1_lower, self.d2_lower = (DNASequence('gattaca', id="d1"),
                                    DNASequence('ttg', id="d2"))

    # RNA sequences; the last one contains gap characters
    self.r1, self.r2, self.r3 = (RNASequence('GAUUACA', id="r1"),
                                 RNASequence('UUG', id="r2"),
                                 RNASequence('U-----UGCC--', id="r3"))

    # DNA sequence containing an 'X'
    self.i1 = DNASequence('GATXACA', id="i1")

    # lists of sequence objects
    self.seqs1 = [self.d1, self.d2]
    self.seqs1_lower = [self.d1_lower, self.d2_lower]
    self.seqs2 = [self.r1, self.r2, self.r3]
    self.seqs3 = [self.d1, self.d2, self.r1, self.r2, self.r3]

    # the same data as (id, sequence string) tuples
    self.seqs1_t = [('d1', 'GATTACA'), ('d2', 'TTG')]
    self.seqs2_t = [('r1', 'GAUUACA'),
                    ('r2', 'UUG'),
                    ('r3', 'U-----UGCC--')]
    self.seqs3_t = list(self.seqs1_t)
    self.seqs3_t.extend(self.seqs2_t)

    # collections built from the lists above
    self.s1 = SequenceCollection(self.seqs1)
    self.s1_lower = SequenceCollection(self.seqs1_lower)
    self.s2 = SequenceCollection(self.seqs2)
    self.s3 = SequenceCollection(self.seqs3)
    self.empty = SequenceCollection([])
    self.invalid_s1 = SequenceCollection([self.i1])
class SequenceCollectionTests(TestCase):
    """Unit tests exercising the SequenceCollection class
    """

    def setUp(self):
        """Build the sequences, tuples and collections shared by the tests
        """
        # DNA sequences, upper and lower case
        self.d1, self.d2 = (DNASequence('GATTACA', id="d1"),
                            DNASequence('TTG', id="d2"))
        self.d1_lower, self.d2_lower = (DNASequence('gattaca', id="d1"),
                                        DNASequence('ttg', id="d2"))

        # RNA sequences; the last one contains gap characters
        self.r1, self.r2, self.r3 = (RNASequence('GAUUACA', id="r1"),
                                     RNASequence('UUG', id="r2"),
                                     RNASequence('U-----UGCC--', id="r3"))

        # DNA sequence containing an 'X'
        self.i1 = DNASequence('GATXACA', id="i1")

        # lists of sequence objects
        self.seqs1 = [self.d1, self.d2]
        self.seqs1_lower = [self.d1_lower, self.d2_lower]
        self.seqs2 = [self.r1, self.r2, self.r3]
        self.seqs3 = [self.d1, self.d2, self.r1, self.r2, self.r3]

        # the same data as (id, sequence string) tuples
        self.seqs1_t = [('d1', 'GATTACA'), ('d2', 'TTG')]
        self.seqs2_t = [('r1', 'GAUUACA'),
                        ('r2', 'UUG'),
                        ('r3', 'U-----UGCC--')]
        self.seqs3_t = list(self.seqs1_t)
        self.seqs3_t.extend(self.seqs2_t)

        # collections built from the lists above
        self.s1 = SequenceCollection(self.seqs1)
        self.s1_lower = SequenceCollection(self.seqs1_lower)
        self.s2 = SequenceCollection(self.seqs2)
        self.s3 = SequenceCollection(self.seqs3)
        self.empty = SequenceCollection([])
        self.invalid_s1 = SequenceCollection([self.i1])

    def test_init(self):
        """Construction succeeds for each kind of input list
        """
        for seqs in (self.seqs1, self.seqs2, self.seqs3, []):
            SequenceCollection(seqs)

    def test_init_fail(self):
        """Construction from sequences sharing an id raises
        """
        duplicated = [self.d1, self.d1]
        self.assertRaises(SequenceCollectionError, SequenceCollection,
                          duplicated)

    def test_init_validate(self):
        """Construction with validate=True functions as expected
        """
        SequenceCollection(self.seqs1, validate=True)
        # NOTE(review): this repeats the call above; a different fixture
        # may have been intended here -- confirm.
        SequenceCollection(self.seqs1, validate=True)
        # the collection holding the 'GATXACA' DNA sequence must be rejected
        self.assertRaises(SequenceCollectionError, SequenceCollection,
                          self.invalid_s1, validate=True)

    def test_from_fasta_records(self):
        """Construction from (id, sequence) tuples functions as expected
        """
        inputs = ((self.seqs1_t, DNASequence),
                  (self.seqs2_t, RNASequence),
                  (self.seqs3_t, NucleotideSequence))
        for records, seq_type in inputs:
            SequenceCollection.from_fasta_records(records, seq_type)

    def test_contains(self):
        """The in operator functions as expected
        """
        for seq_id, collection in (('d1', self.s1), ('r2', self.s2)):
            self.assertTrue(seq_id in collection)
        self.assertFalse('r2' in self.s1)

    def test_eq(self):
        """The == operator functions as expected
        """
        class FakeSequenceCollection(SequenceCollection):
            pass

        self.assertTrue(self.s1 == self.s1)
        self.assertFalse(self.s1 == self.s2)

        # different objects can be equal, whichever side they are on
        rebuilt = SequenceCollection([self.d1, self.d2])
        self.assertTrue(self.s1 == rebuilt)
        self.assertTrue(rebuilt == self.s1)

        unequal = (
            # different number of sequences
            SequenceCollection([self.d1]),
            # different types holding the same sequences
            FakeSequenceCollection([self.d1, self.d2]),
            Alignment([self.d1, self.d2]),
            # different sequences
            SequenceCollection([self.d1, self.r1]),
        )
        for other in unequal:
            self.assertFalse(self.s1 == other)

    def test_getitem(self):
        """Indexing functions as expected
        """
        lookups = ((self.s1, 0, self.d1),
                   (self.s1, 1, self.d2),
                   (self.s2, 0, self.r1),
                   (self.s2, 1, self.r2))
        for collection, index, wanted in lookups:
            self.assertEqual(collection[index], wanted)

        fetch = self.empty.__getitem__
        self.assertRaises(IndexError, fetch, 0)
        self.assertRaises(KeyError, fetch, '0')

    def test_iter(self):
        """iter functions as expected
        """
        walker = iter(self.s1)
        pairs = list(zip(walker, self.seqs1))
        for observed, wanted in pairs:
            self.assertEqual(observed, wanted)
        self.assertEqual(len(pairs), len(self.seqs1))
        # nothing may be left in the iterator
        self.assertRaises(StopIteration, next, walker)

    def test_len(self):
        """len functions as expected
        """
        sizes = ((self.s1, 2), (self.s2, 3), (self.s3, 5), (self.empty, 0))
        for collection, wanted in sizes:
            self.assertEqual(len(collection), wanted)

    def test_ne(self):
        """The != operator functions as expected
        """
        class FakeSequenceCollection(SequenceCollection):
            pass

        self.assertFalse(self.s1 != self.s1)

        unequal = (
            self.s2,
            # different number of sequences
            SequenceCollection([self.d1]),
            # different types holding the same sequences
            FakeSequenceCollection([self.d1, self.d2]),
            Alignment([self.d1, self.d2]),
            # different sequences
            SequenceCollection([self.d1, self.r1]),
        )
        for other in unequal:
            self.assertTrue(self.s1 != other)

    def test_repr(self):
        """repr functions as expected
        """
        reprs = (
            (self.s1,
             "<SequenceCollection: n=2; mean +/- std length=5.00 +/- 2.00>"),
            (self.s2,
             "<SequenceCollection: n=3; mean +/- std length=7.33 +/- 3.68>"),
            (self.s3,
             "<SequenceCollection: n=5; mean +/- std length=6.40 +/- 3.32>"),
            (self.empty,
             "<SequenceCollection: n=0; mean +/- std length=0.00 +/- 0.00>"),
        )
        for collection, wanted in reprs:
            self.assertEqual(repr(collection), wanted)

    def test_reversed(self):
        """reversed functions as expected
        """
        walker = reversed(self.s1)
        pairs = list(zip(walker, list(reversed(self.seqs1))))
        for observed, wanted in pairs:
            self.assertEqual(observed, wanted)
        self.assertEqual(len(pairs), len(self.seqs1))
        # nothing may be left in the iterator
        self.assertRaises(StopIteration, next, walker)

    def test_k_word_frequencies(self):
        """k_word_frequencies functions as expected
        """
        # single characters
        first = defaultdict(int, {'A': 3 / 7., 'C': 1 / 7.,
                                  'G': 1 / 7., 'T': 2 / 7.})
        second = defaultdict(int, {'G': 1 / 3., 'T': 2 / 3.})
        self.assertEqual(self.s1.k_word_frequencies(k=1), [first, second])

        # non-overlapping words of length three
        first = defaultdict(int, {'GAT': 1 / 2., 'TAC': 1 / 2.})
        second = defaultdict(int, {'TTG': 1 / 1.})
        self.assertEqual(
            self.s1.k_word_frequencies(k=3, overlapping=False),
            [first, second])

        self.assertEqual(self.empty.k_word_frequencies(k=1), [])

    def test_str(self):
        """str functions as expected
        """
        renderings = (
            (self.s1, ">d1\nGATTACA\n>d2\nTTG\n"),
            (self.s2, ">r1\nGAUUACA\n>r2\nUUG\n>r3\nU-----UGCC--\n"),
            (self.empty, ""),
        )
        for collection, wanted in renderings:
            self.assertEqual(str(collection), wanted)

    def test_distribution_stats(self):
        """distribution_stats functions as expected
        """
        # (collection, first value, second value, third value); the last
        # two are compared to three decimal places
        approximate = ((self.s1, 2, 5.0, 2.0),
                       (self.s2, 3, 7.333, 3.682),
                       (self.s3, 5, 6.400, 3.323))
        for collection, count, second, third in approximate:
            stats = collection.distribution_stats()
            self.assertEqual(stats[0], count)
            self.assertAlmostEqual(stats[1], second, 3)
            self.assertAlmostEqual(stats[2], third, 3)

        # the empty collection is compared exactly
        stats = self.empty.distribution_stats()
        self.assertEqual(stats[0], 0)
        self.assertEqual(stats[1], 0.0)
        self.assertEqual(stats[2], 0.0)

    def test_degap(self):
        """degap functions as expected
        """
        stripped = []
        for seq_id, text in self.seqs2_t:
            stripped.append((seq_id, text.replace('.', '').replace('-', '')))
        wanted = SequenceCollection.from_fasta_records(stripped, RNASequence)
        self.assertEqual(self.s2.degap(), wanted)

    def test_get_seq(self):
        """get_seq functions as expected
        """
        for seq_id, wanted in (('d1', self.d1), ('d2', self.d2)):
            self.assertEqual(self.s1.get_seq(seq_id), wanted)

    def test_ids(self):
        """ids functions as expected
        """
        id_lists = ((self.s1, ['d1', 'd2']),
                    (self.s2, ['r1', 'r2', 'r3']),
                    (self.s3, ['d1', 'd2', 'r1', 'r2', 'r3']),
                    (self.empty, []))
        for collection, wanted in id_lists:
            self.assertEqual(collection.ids(), wanted)

    def test_int_map(self):
        """int_map functions as expected
        """
        # without a prefix
        by_key = {"1": self.d1, "2": self.d2}
        key_to_id = {"1": "d1", "2": "d2"}
        self.assertEqual(self.s1.int_map(), (by_key, key_to_id))

        # with a prefix
        prefixed_by_key = {"h-1": self.d1, "h-2": self.d2}
        prefixed_key_to_id = {"h-1": "d1", "h-2": "d2"}
        self.assertEqual(self.s1.int_map(prefix='h-'),
                         (prefixed_by_key, prefixed_key_to_id))

    def test_is_empty(self):
        """is_empty functions as expected
        """
        for populated in (self.s1, self.s2, self.s3):
            self.assertFalse(populated.is_empty())
        self.assertTrue(self.empty.is_empty())

    def test_is_valid(self):
        """is_valid functions as expected
        """
        for collection in (self.s1, self.s2, self.s3, self.empty):
            self.assertTrue(collection.is_valid())
        self.assertFalse(self.invalid_s1.is_valid())

    def test_iteritems(self):
        """iteritems functions as expected
        """
        wanted = [(seq.id, seq) for seq in self.s1]
        self.assertEqual(list(self.s1.iteritems()), wanted)

    def test_lower(self):
        """lower functions as expected
        """
        lowered = self.s1.lower()
        self.assertEqual(lowered, self.s1_lower)

    def test_sequence_count(self):
        """sequence_count functions as expected
        """
        counts = ((self.s1, 2), (self.s2, 3), (self.s3, 5), (self.empty, 0))
        for collection, wanted in counts:
            self.assertEqual(collection.sequence_count(), wanted)

    def test_sequence_lengths(self):
        """sequence_lengths functions as expected
        """
        lengths = ((self.s1, [7, 3]),
                   (self.s2, [7, 3, 12]),
                   (self.s3, [7, 3, 7, 3, 12]),
                   (self.empty, []))
        for collection, wanted in lengths:
            self.assertEqual(collection.sequence_lengths(), wanted)

    def test_to_fasta(self):
        """to_fasta functions as expected
        """
        renderings = (
            (self.s1, ">d1\nGATTACA\n>d2\nTTG\n"),
            (self.s2, ">r1\nGAUUACA\n>r2\nUUG\n>r3\nU-----UGCC--\n"),
        )
        for collection, wanted in renderings:
            self.assertEqual(collection.to_fasta(), wanted)

    def test_upper(self):
        """upper functions as expected
        """
        raised = self.s1_lower.upper()
        self.assertEqual(raised, self.s1)