def test_k_word_frequencies(self):
    # Expected frequencies of the length-1 words, one mapping per sequence
    # in self.s1.
    singles_first = defaultdict(
        float, {'A': 3 / 7., 'C': 1 / 7., 'G': 1 / 7., 'T': 2 / 7.})
    singles_second = defaultdict(float, {'G': 1 / 3., 'T': 2 / 3.})
    self.assertEqual(self.s1.k_word_frequencies(k=1),
                     [singles_first, singles_second])

    # Expected frequencies of the non-overlapping length-3 words.
    triples_first = defaultdict(float, {'GAT': 1 / 2., 'TAC': 1 / 2.})
    triples_second = defaultdict(float, {'TTG': 1 / 1.})
    self.assertEqual(
        self.s1.k_word_frequencies(k=3, overlapping=False),
        [triples_first, triples_second])

    # The empty collection yields an empty list of mappings.
    self.assertEqual(self.empty.k_word_frequencies(k=1), [])

    # Test to ensure floating point precision bug isn't present. See the
    # tests for BiologicalSequence.k_word_frequencies for more details.
    homopolymers = SequenceCollection(
        [RNA(base * 10, id=seq_id)
         for base, seq_id in (('C', 's1'), ('G', 's2'))])
    self.assertEqual(
        homopolymers.k_word_frequencies(1),
        [defaultdict(float, {base: 1.0}) for base in ('C', 'G')])
def test_k_word_frequencies(self):
    # Length-1 words: one expected frequency mapping per sequence in
    # self.s1.
    expected_by_seq = [
        defaultdict(
            float, {'A': 3 / 7., 'C': 1 / 7., 'G': 1 / 7., 'T': 2 / 7.}),
        defaultdict(float, {'G': 1 / 3., 'T': 2 / 3.}),
    ]
    self.assertEqual(self.s1.k_word_frequencies(k=1), expected_by_seq)

    # Length-3 words counted without overlap.
    expected_by_seq = [
        defaultdict(float, {'GAT': 1 / 2., 'TAC': 1 / 2.}),
        defaultdict(float, {'TTG': 1 / 1.}),
    ]
    self.assertEqual(
        self.s1.k_word_frequencies(k=3, overlapping=False),
        expected_by_seq)

    # The empty collection yields an empty list of mappings.
    self.assertEqual(self.empty.k_word_frequencies(k=1), [])

    # Test to ensure floating point precision bug isn't present. See the
    # tests for BiologicalSequence.k_word_frequencies for more details.
    first = RNA('C' * 10, id='s1')
    second = RNA('G' * 10, id='s2')
    sc = SequenceCollection([first, second])
    expected_by_seq = [defaultdict(float, {'C': 1.0}),
                       defaultdict(float, {'G': 1.0})]
    self.assertEqual(sc.k_word_frequencies(1), expected_by_seq)
def test_init(self):
    """Initialization functions as expected with varied input types
    """
    # Construction alone is the check: any exception fails the test.
    for seqs in (self.seqs1, self.seqs2, self.seqs3, []):
        SequenceCollection(seqs)
def test_init_validate(self):
    # self.seqs1 must construct cleanly with validation switched on. The
    # construction is performed twice, exactly as before.
    for _ in range(2):
        SequenceCollection(self.seqs1, validate=True)

    # self.invalid_s1 cannot be validated, so construction must raise
    with self.assertRaises(SequenceCollectionError):
        SequenceCollection(self.invalid_s1, validate=True)
def test_init_validate(self):
    """initialization with validation functions as expected
    """
    # self.seqs1 must construct cleanly with validation switched on. The
    # construction is performed twice, exactly as before.
    for _ in range(2):
        SequenceCollection(self.seqs1, validate=True)

    # self.invalid_s1 cannot be validated, so construction must raise
    with self.assertRaises(SequenceCollectionError):
        SequenceCollection(self.invalid_s1, validate=True)
def extract_seq_ids(data, fmt='fasta', variant=None):
    """
    Parse sequence data held in a string and return only the sequence IDs.

    :type data: str
    :param data: The sequence data, in the format named by ``fmt``
    :type fmt: str
    :param fmt: Format name handed to ``SequenceCollection.read``
    :param variant: Quality-score variant; forwarded to the reader only
                    when ``fmt`` is ``'fastq'`` and ignored otherwise
    :rtype: frozenset
    :return: The ``id`` of every sequence that was read
    """
    read_kwargs = {'format': fmt}
    if fmt == 'fastq':
        read_kwargs['variant'] = variant
    records = SequenceCollection.read(StringIO(data), **read_kwargs)
    return frozenset(record.id for record in records)
def test_degap(self):
    def strip_gaps(records):
        # Remove both gap characters ('.' and '-') from every sequence.
        return [(id_, seq.replace('.', '').replace('-', ''))
                for id_, seq in records]

    expected = SequenceCollection.from_fasta_records(
        strip_gaps(self.seqs1_t), DNASequence)
    actual = self.a1.degap()
    self.assertEqual(actual, expected)

    expected = SequenceCollection.from_fasta_records(
        strip_gaps(self.seqs2_t), RNASequence)
    actual = self.a2.degap()
    self.assertEqual(actual, expected)
def test_degap(self):
    """degap functions as expected
    """
    def strip_gaps(records):
        # Remove both gap characters ("." and "-") from every sequence.
        return [(id_, seq.replace(".", "").replace("-", ""))
                for id_, seq in records]

    # Each alignment is compared against its gap-free records rebuilt as
    # a SequenceCollection of the matching sequence type.
    cases = ((self.seqs1_t, DNASequence, self.a1),
             (self.seqs2_t, RNASequence, self.a2))
    for records, seq_type, alignment in cases:
        expected = SequenceCollection.from_fasta_records(
            strip_gaps(records), seq_type)
        actual = alignment.degap()
        self.assertEqual(actual, expected)
def test_distances(self):
    collection = SequenceCollection(
        [DNA(sequence, seq_id)
         for sequence, seq_id in (("ACGT", "d1"), ("ACGG", "d2"))])

    # The two sequences differ at one of four positions.
    expected = DistanceMatrix([[0, 0.25], [0.25, 0]], ['d1', 'd2'])
    self.assertEqual(collection.distances(hamming), expected)

    # alt distance function provided
    def constant_distance(s1, s2):
        return 42.

    expected = DistanceMatrix([[0, 42.], [42., 0]], ['d1', 'd2'])
    self.assertEqual(collection.distances(constant_distance), expected)
def test_degap(self):
    def collection_of(seq_type, records):
        # Build a SequenceCollection from (id, sequence) pairs.
        return SequenceCollection(
            [seq_type(sequence, metadata={'id': seq_id})
             for seq_id, sequence in records])

    expected = collection_of(DNA, (("d1", 'ACCGTTGG'),
                                   ("d2", 'TTACCGGTGGCC'),
                                   ("d3", 'ACCGTTGC')))
    self.assertEqual(self.a1.degap(), expected)

    expected = collection_of(RNA, (("r1", 'UUAU'),
                                   ("r2", 'ACGUU')))
    self.assertEqual(self.a2.degap(), expected)
def test_ne(self):
    # A collection is never unequal to itself. assertFalse is kept here
    # so that the != operator is the one exercised.
    self.assertFalse(self.s1 != self.s1)
    self.assertNotEqual(self.s1, self.s2)

    # SequenceCollections with different number of sequences are not equal
    shorter = SequenceCollection([self.d1])
    self.assertNotEqual(self.s1, shorter)

    class FakeSequenceCollection(SequenceCollection):
        pass

    # SequenceCollections of different types are not equal
    same_seqs = [self.d1, self.d3]
    self.assertNotEqual(self.s4, FakeSequenceCollection(same_seqs))
    self.assertNotEqual(self.s4, Alignment(same_seqs))

    # SequenceCollections with different sequences are not equal
    mixed = SequenceCollection([self.d1, self.r1])
    self.assertNotEqual(self.s1, mixed)
def convert_phylip(infile, outfile, format):
    """Re-write a PHYLIP file in another sequence format.

    ``infile`` is read as ``'phylip'`` into a ``SequenceCollection`` with
    ``phylip.relaxed_ids`` as the data parser, and the result is written
    to ``outfile`` in ``format``.
    """
    SequenceCollection.read(
        infile, format='phylip', data_parser=phylip.relaxed_ids
    ).write(outfile, format=format)
def test_init_fail_no_id(self):
    # A sequence built without any metadata has no 'id' entry.
    unnamed = Sequence('ACGTACGT')
    expected_msg = "'id' must be included in the sequence metadata"
    with six.assertRaisesRegex(self, SequenceCollectionError,
                               expected_msg):
        SequenceCollection([unnamed])
def test_fastq_to_sequence_collection(self):
    for valid_files, kwargs, components in self.valid_configurations:
        for valid in valid_files:
            for observed_kwargs in kwargs:
                _drop_kwargs(observed_kwargs, 'seq_num')
                constructor = observed_kwargs.get('constructor', Sequence)

                # The expected records and the reader under test both
                # receive lowercase='introns'.
                expected_kwargs = {'lowercase': 'introns'}
                observed_kwargs['lowercase'] = 'introns'

                records = []
                for c in components:
                    records.append(constructor(
                        c[2],
                        metadata={'id': c[0], 'description': c[1]},
                        positional_metadata={
                            'quality': np.array(c[3], np.uint8)},
                        **expected_kwargs))
                expected = SequenceCollection(records)

                observed = _fastq_to_sequence_collection(
                    valid, **observed_kwargs)
                self.assertEqual(observed, expected)
def test_valid_files(self):
    # Component fields copied verbatim into each expected sequence's
    # metadata.
    metadata_keys = ('id', 'machine_name', 'run_number', 'lane_number',
                     'tile_number', 'x', 'y', 'index', 'read_number')

    for valid, kwargs, components in self.valid_files:
        for kwarg in kwargs:
            _drop_kwargs(kwarg, 'seq_num')
            constructor = kwarg.get('constructor', Sequence)

            records = []
            for c in components:
                quality = np.array(c['quality'], dtype=np.uint8)
                records.append(constructor(
                    c['sequence'],
                    metadata={key: c[key] for key in metadata_keys},
                    positional_metadata={'quality': quality}))
            expected = SequenceCollection(records)

            observed = _qseq_to_sequence_collection(valid, **kwarg)
            self.assertEqual(observed, expected)
def test_fastq_to_sequence_collection(self):
    for valid_files, kwargs, components in self.valid_configurations:
        for valid in valid_files:
            for observed_kwargs in kwargs:
                _drop_kwargs(observed_kwargs, 'seq_num')
                constructor = observed_kwargs.get('constructor', Sequence)

                # Can't use partials for this because the read
                # function below can't operate on partials
                supports_lowercase = hasattr(constructor, 'lowercase')
                expected_kwargs = {}
                if supports_lowercase:
                    expected_kwargs['lowercase'] = 'introns'
                    observed_kwargs['lowercase'] = 'introns'

                records = []
                for c in components:
                    records.append(constructor(
                        c[2],
                        metadata={'id': c[0], 'description': c[1]},
                        positional_metadata={
                            'quality': np.array(c[3], np.uint8)},
                        **expected_kwargs))
                expected = SequenceCollection(records)

                observed = _fastq_to_sequence_collection(
                    valid, **observed_kwargs)
                self.assertEqual(observed, expected)
def main():
    """Write, per metagenome, the sequences that failed screening.

    For every requested metagenome the "dereplication passed" (150.1) and
    "screen passed" (299.1) files are downloaded through
    ``mgapi.mgrast_request``. The dereplication-passed sequences whose IDs
    are NOT among the screen-passed IDs are written to
    ``<out_dir>/<metagenome id>_screen_failed.fastq``.

    Exits with status 1 when ``--out_dir`` names an existing file. Progress
    messages are printed only when ``args.verbose`` is set.
    """
    args = handle_program_options()

    if osp.isfile(args.out_dir):
        print("--out_dir (-o) option must be a valid directory and not a file",
              file=sys.stderr)
        sys.exit(1)

    # will fail gracefully if dir exists
    skbu.create_dir(args.out_dir)

    # A single ID given on the command line takes precedence over a file of
    # IDs; if neither is supplied the list stays empty and the loop below
    # does nothing.
    metagenomes = []
    if args.metagenome_id is not None:
        metagenomes.append(args.metagenome_id)
    elif args.metagenome_file is not None:
        metagenomes.extend(parse_metagenome_file(args.metagenome_file))
        if args.verbose:
            msg = 'Processing requested for {} metagenome(s) found in: {}'
            print(msg.format(len(metagenomes), args.metagenome_file))

    # MG-RAST stage.file ids for downloading
    derep_passed = '150.1'
    screen_passed = '299.1'

    for mg_id in metagenomes:
        if args.verbose:
            print('Processing metagenome: {}'.format(mg_id))
            # end='' plus the explicit flush keeps the sequence count
            # printed after the download on the same output line.
            print('\tDownloading: Dereplication Passed...', end='')
            sys.stdout.flush()
        derepp_rsp = mgapi.mgrast_request('download', mg_id,
                                          {'file': derep_passed},
                                          auth_key=args.auth_key)
        derepp_sc = SequenceCollection.read(StringIO(derepp_rsp.text),
                                            format='fastq',
                                            variant='illumina1.8')
        if args.verbose:
            print('{} sequences'.format(len(derepp_sc)))
            print('\tDownloading: Screen Passed...', end='')
            sys.stdout.flush()
        screenp_rsp = mgapi.mgrast_request('download', mg_id,
                                           {'file': screen_passed},
                                           auth_key=args.auth_key)
        # Only the IDs of the screen-passed reads are kept.
        screenp_ids = extract_seq_ids(screenp_rsp.text, fmt='fastq',
                                      variant='illumina1.8')
        if args.verbose:
            print('{} sequences'.format(len(screenp_ids)))

        # filter dereplication passed with IDs from screen passed
        failed_screen = filter_seqs(derepp_sc, screenp_ids)
        if args.verbose:
            # NOTE(review): this reports the number of screen-passed IDs,
            # which equals the number actually removed only if every one of
            # those IDs occurs in the dereplication-passed set - confirm.
            nsp = len(screenp_ids)
            print('\tRemoved {} sequences from Dereplication Passed'.format(nsp))
            print('\tleaving {} sequences'.format(len(failed_screen)))

        out_fp = osp.join(args.out_dir, mg_id + '_screen_failed.fastq')
        failed_screen.write(out_fp, format='fastq', variant='illumina1.8')
        if args.verbose:
            print('Sequence data written to: ' + out_fp)
def test_make_mini_otu_files(self):
    # _make_mini_otu_files is run with a "tmp" directory present in the
    # current working directory, so one is created for the call and removed
    # afterwards.
    # The directory is managed with os/shutil rather than shelling out to
    # `mkdir` / `rm -r`, so the test runs on platforms without those
    # commands, and the cleanup sits in `finally` so a failure inside the
    # call under test no longer leaves "tmp" behind.
    import shutil  # local import: keeps this change self-contained

    scratch_dir = "tmp"
    if not os.path.exists(scratch_dir):
        os.mkdir(scratch_dir)
    try:
        # The fixture is replaced by the SequenceCollection parsed from it.
        self.extension_seqs = SequenceCollection.read(self.extension_seqs)
        result = _make_mini_otu_files(self.key_node,
                                      self.extension_genus_dic_few,
                                      self.extension_seqs)
    finally:
        shutil.rmtree(scratch_dir, ignore_errors=True)

    self.assertEqual(result, """>P1\nTTAAAAAA\n""")
def test_degap(self):
    # Expected (id, sequence) pairs; r3's expected sequence contains no
    # gap characters.
    gap_free = (("r1", 'GAUUACA'), ("r2", 'UUG'), ("r3", 'UUGCC'))
    expected = SequenceCollection(
        [RNA(sequence, metadata={'id': seq_id})
         for seq_id, sequence in gap_free])
    self.assertEqual(self.s2.degap(), expected)
def setUp(self):
    def with_id(seq_type, sequence, seq_id):
        # Every fixture sequence carries only an 'id' metadata entry.
        return seq_type(sequence, metadata={'id': seq_id})

    # Individual DNA and RNA sequences
    self.d1 = with_id(DNA, 'GATTACA', "d1")
    self.d2 = with_id(DNA, 'TTG', "d2")
    self.d3 = with_id(DNA, 'GTATACA', "d3")
    self.r1 = with_id(RNA, 'GAUUACA', "r1")
    self.r2 = with_id(RNA, 'UUG', "r2")
    self.r3 = with_id(RNA, 'U-----UGCC--', "r3")

    # Plain lists of those sequences
    self.seqs1 = [self.d1, self.d2]
    self.seqs2 = [self.r1, self.r2, self.r3]
    self.seqs3 = self.seqs1 + self.seqs2
    self.seqs4 = [self.d1, self.d3]

    # One collection per list, plus an empty collection
    for attr, seqs in (('s1', self.seqs1), ('s2', self.seqs2),
                       ('s3', self.seqs3), ('s4', self.seqs4),
                       ('empty', [])):
        setattr(self, attr, SequenceCollection(seqs))
def test_ne(self):
    """inequality operator functions as expected
    """
    # A collection is never unequal to itself. assertFalse is kept here
    # so that the != operator is the one exercised.
    self.assertFalse(self.s1 != self.s1)
    self.assertNotEqual(self.s1, self.s2)

    # SequenceCollections with different number of sequences are not equal
    shorter = SequenceCollection([self.d1])
    self.assertNotEqual(self.s1, shorter)

    class FakeSequenceCollection(SequenceCollection):
        pass

    # SequenceCollections of different types are not equal
    same_seqs = [self.d1, self.d2]
    self.assertNotEqual(self.s1, FakeSequenceCollection(same_seqs))
    self.assertNotEqual(self.s1, Alignment(same_seqs))

    # SequenceCollections with different sequences are not equal
    mixed = SequenceCollection([self.d1, self.r1])
    self.assertNotEqual(self.s1, mixed)
def test_generator_to_fasta_sequence_lowercase_exception(self):
    # Writing a plain Sequence with lowercase='introns' must raise an
    # AttributeError carrying this message.
    expected_msg = ("lowercase specified but class Sequence does not "
                    "support lowercase functionality")
    mixed_case = Sequence('ACgt', metadata={'id': ''})
    out = io.StringIO()
    with six.assertRaisesRegex(self, AttributeError, expected_msg):
        _generator_to_fasta(SequenceCollection([mixed_case]), out,
                            lowercase='introns')
    out.close()
def test_distances(self):
    collection = SequenceCollection(
        [DNA(sequence, metadata={'id': seq_id})
         for sequence, seq_id in (("ACGT", "d1"), ("ACGG", "d2"))])

    # The two sequences differ at one of four positions.
    expected = DistanceMatrix([[0, 0.25], [0.25, 0]], ['d1', 'd2'])

    def hamming_on_values(s1, s2):
        # Apply hamming to the .values of the two sequences.
        return hamming(s1.values, s2.values)

    self.assertEqual(collection.distances(hamming_on_values), expected)

    # alt distance function provided
    def constant_distance(s1, s2):
        return 42.

    expected = DistanceMatrix([[0, 42.], [42., 0]], ['d1', 'd2'])
    self.assertEqual(collection.distances(constant_distance), expected)
def test_eq(self):
    self.assertEqual(self.s1, self.s1)
    # assertFalse is kept for the negative cases so that the == operator
    # is the one exercised.
    self.assertFalse(self.s1 == self.s2)

    # different objects can be equal
    self.assertEqual(self.s1, SequenceCollection([self.d1, self.d2]))
    self.assertEqual(SequenceCollection([self.d1, self.d2]), self.s1)

    # SequenceCollections with different number of sequences are not equal
    shorter = SequenceCollection([self.d1])
    self.assertFalse(self.s1 == shorter)

    class FakeSequenceCollection(SequenceCollection):
        pass

    # SequenceCollections of different types are not equal
    same_seqs = [self.d1, self.d3]
    self.assertFalse(self.s4 == FakeSequenceCollection(same_seqs))
    self.assertFalse(self.s4 == Alignment(same_seqs))

    # SequenceCollections with different sequences are not equal
    mixed = SequenceCollection([self.d1, self.r1])
    self.assertFalse(self.s1 == mixed)
def setUp(self):
    """Initialize values to be used in tests
    """
    dna, rna = DNASequence, RNASequence

    # Individual sequences: DNA in upper and lower case, RNA, and one
    # DNA sequence (i1) used by the validation tests.
    self.d1 = dna("GATTACA", id="d1")
    self.d2 = dna("TTG", id="d2")
    self.d1_lower = dna("gattaca", id="d1")
    self.d2_lower = dna("ttg", id="d2")
    self.r1 = rna("GAUUACA", id="r1")
    self.r2 = rna("UUG", id="r2")
    self.r3 = rna("U-----UGCC--", id="r3")
    self.i1 = dna("GATXACA", id="i1")

    # Lists of sequence objects
    self.seqs1 = [self.d1, self.d2]
    self.seqs1_lower = [self.d1_lower, self.d2_lower]
    self.seqs2 = [self.r1, self.r2, self.r3]
    self.seqs3 = self.seqs1 + self.seqs2

    # The same data as (id, sequence) tuples
    self.seqs1_t = [("d1", "GATTACA"), ("d2", "TTG")]
    self.seqs2_t = [("r1", "GAUUACA"), ("r2", "UUG"),
                    ("r3", "U-----UGCC--")]
    self.seqs3_t = self.seqs1_t + self.seqs2_t

    # Collections built from the lists above
    for attr, seqs in (('s1', self.seqs1),
                       ('s1_lower', self.seqs1_lower),
                       ('s2', self.seqs2),
                       ('s3', self.seqs3),
                       ('empty', []),
                       ('invalid_s1', [self.i1])):
        setattr(self, attr, SequenceCollection(seqs))
def setUp(self):
    dna, rna = DNASequence, RNASequence

    # Individual sequences: DNA in upper and lower case, RNA, and one
    # DNA sequence (i1) used by the validation tests.
    self.d1 = dna('GATTACA', id="d1")
    self.d2 = dna('TTG', id="d2")
    self.d3 = dna('GTATACA', id="d3")
    self.d1_lower = dna('gattaca', id="d1")
    self.d2_lower = dna('ttg', id="d2")
    self.d3_lower = dna('gtataca', id="d3")
    self.r1 = rna('GAUUACA', id="r1")
    self.r2 = rna('UUG', id="r2")
    self.r3 = rna('U-----UGCC--', id="r3")
    self.i1 = dna('GATXACA', id="i1")

    # Lists of sequence objects
    self.seqs1 = [self.d1, self.d2]
    self.seqs1_lower = [self.d1_lower, self.d2_lower]
    self.seqs2 = [self.r1, self.r2, self.r3]
    self.seqs3 = self.seqs1 + self.seqs2
    self.seqs4 = [self.d1, self.d3]

    # The same data as (id, sequence) tuples
    self.seqs1_t = [('d1', 'GATTACA'), ('d2', 'TTG')]
    self.seqs2_t = [('r1', 'GAUUACA'), ('r2', 'UUG'),
                    ('r3', 'U-----UGCC--')]
    self.seqs3_t = self.seqs1_t + self.seqs2_t

    # Collections built from the lists above
    for attr, seqs in (('s1', self.seqs1),
                       ('s1_lower', self.seqs1_lower),
                       ('s2', self.seqs2),
                       ('s3', self.seqs3),
                       ('s4', self.seqs4),
                       ('empty', []),
                       ('invalid_s1', [self.i1])):
        setattr(self, attr, SequenceCollection(seqs))
def test_kmer_frequencies(self):
    # Absolute counts of the non-overlapping 3-mers, one Counter per
    # sequence in self.s1.
    counts = [Counter({'GAT': 1, 'TAC': 1}), Counter({'TTG': 1})]
    self.assertEqual(
        self.s1.kmer_frequencies(k=3, overlap=False, relative=False),
        counts)

    # Relative frequencies of the 1-mers.
    singles = [
        defaultdict(
            float, {'A': 3 / 7., 'C': 1 / 7., 'G': 1 / 7., 'T': 2 / 7.}),
        defaultdict(float, {'G': 1 / 3., 'T': 2 / 3.}),
    ]
    self.assertEqual(self.s1.kmer_frequencies(k=1, relative=True),
                     singles)

    # Relative frequencies of the non-overlapping 3-mers.
    triples = [
        defaultdict(float, {'GAT': 1 / 2., 'TAC': 1 / 2.}),
        defaultdict(float, {'TTG': 1 / 1.}),
    ]
    self.assertEqual(
        self.s1.kmer_frequencies(k=3, overlap=False, relative=True),
        triples)

    # The empty collection yields an empty list.
    self.assertEqual(self.empty.kmer_frequencies(k=1, relative=True), [])

    # Test to ensure floating point precision bug isn't present. See the
    # tests for Sequence.kmer_frequencies for more details.
    homopolymers = SequenceCollection(
        [RNA(base * 10, metadata={'id': seq_id})
         for base, seq_id in (('C', 's1'), ('G', 's2'))])
    self.assertEqual(
        homopolymers.kmer_frequencies(1, relative=True),
        [defaultdict(float, {base: 1.0}) for base in ('C', 'G')])
def test_distances(self):
    first = DNA("ACGT", metadata={'id': "d1"})
    second = DNA("ACGG", metadata={'id': "d2"})
    collection = SequenceCollection([first, second])

    # The two sequences differ at one of four positions.
    expected = DistanceMatrix([[0, 0.25], [0.25, 0]], ['d1', 'd2'])

    def hamming_on_values(s1, s2):
        # Apply hamming to the .values of the two sequences.
        return hamming(s1.values, s2.values)

    observed = collection.distances(hamming_on_values)
    self.assertEqual(observed, expected)

    # alt distance function provided
    def constant_distance(s1, s2):
        return 42.

    expected = DistanceMatrix([[0, 42.], [42., 0]], ['d1', 'd2'])
    observed = collection.distances(constant_distance)
    self.assertEqual(observed, expected)
def filter_seqs(seqs, remove_ids):
    """
    Return the sequences whose ID is not listed in ``remove_ids``.

    :type seqs: skbio.SequenceCollection
    :param seqs: The sequences to be filtered
    :type remove_ids: set
    :param remove_ids: A set of sequence IDs to remove from the sequence data
    :rtype: SequenceCollection
    :return: A new collection holding the remaining sequences, in their
             original iteration order
    """
    survivors = [record for record in seqs if record.id not in remove_ids]
    return SequenceCollection(survivors)
def test_valid_files(self):
    for valid, kwargs, components in self.valid_files:
        for kwarg in kwargs:
            _drop_kwargs(kwarg, 'seq_num')
            constructor = kwarg.get('constructor', BiologicalSequence)

            # Each component is indexed as (id, sequence, quality).
            records = []
            for c in components:
                records.append(constructor(c[1], id=c[0], quality=c[2]))
            expected = SequenceCollection(records)

            observed = _qseq_to_sequence_collection(valid, **kwarg)
            # TODO remove when #656 is resolved
            self.assertEqual(observed, expected)
            for observed_seq, expected_seq in zip(observed, expected):
                self.assertTrue(observed_seq.equals(expected_seq))
def test_kmer_frequencies(self):
    # Absolute counts of the non-overlapping 3-mers.
    counts_first = Counter({'GAT': 1, 'TAC': 1})
    counts_second = Counter({'TTG': 1})
    observed = self.s1.kmer_frequencies(k=3, overlap=False,
                                        relative=False)
    self.assertEqual(observed, [counts_first, counts_second])

    # Relative frequencies of the 1-mers.
    singles_first = defaultdict(
        float, {'A': 3 / 7., 'C': 1 / 7., 'G': 1 / 7., 'T': 2 / 7.})
    singles_second = defaultdict(float, {'G': 1 / 3., 'T': 2 / 3.})
    observed = self.s1.kmer_frequencies(k=1, relative=True)
    self.assertEqual(observed, [singles_first, singles_second])

    # Relative frequencies of the non-overlapping 3-mers.
    triples_first = defaultdict(float, {'GAT': 1 / 2., 'TAC': 1 / 2.})
    triples_second = defaultdict(float, {'TTG': 1 / 1.})
    observed = self.s1.kmer_frequencies(k=3, overlap=False,
                                        relative=True)
    self.assertEqual(observed, [triples_first, triples_second])

    # The empty collection yields an empty list.
    self.assertEqual(self.empty.kmer_frequencies(k=1, relative=True), [])

    # Test to ensure floating point precision bug isn't present. See the
    # tests for Sequence.kmer_frequencies for more details.
    all_c = RNA('C' * 10, metadata={'id': 's1'})
    all_g = RNA('G' * 10, metadata={'id': 's2'})
    sc = SequenceCollection([all_c, all_g])
    self.assertEqual(sc.kmer_frequencies(1, relative=True),
                     [defaultdict(float, {'C': 1.0}),
                      defaultdict(float, {'G': 1.0})])
def test_update_ids_default_behavior(self):
    # 3 seqs
    renamed = (('GAUUACA', "1"), ('UUG', "2"), ('U-----UGCC--', "3"))
    exp_sc = SequenceCollection(
        [RNA(sequence, id=new_id) for sequence, new_id in renamed])
    # The returned map goes from each new id back to the original id.
    exp_id_map = {'1': 'r1', '2': 'r2', '3': 'r3'}

    obs_sc, obs_id_map = self.s2.update_ids()
    self._assert_sequence_collections_equal(obs_sc, exp_sc)
    self.assertEqual(obs_id_map, exp_id_map)

    # empty
    obs_sc, obs_id_map = self.empty.update_ids()
    self._assert_sequence_collections_equal(obs_sc, self.empty)
    self.assertEqual(obs_id_map, {})
def test_update_ids_prefix(self):
    # 3 seqs
    renamed = (('GAUUACA', "abc1"), ('UUG', "abc2"),
               ('U-----UGCC--', "abc3"))
    exp_sc = SequenceCollection(
        [RNA(sequence, id=new_id) for sequence, new_id in renamed])
    # The returned map goes from each new id back to the original id.
    exp_id_map = {'abc1': 'r1', 'abc2': 'r2', 'abc3': 'r3'}

    obs_sc, obs_id_map = self.s2.update_ids(prefix='abc')
    self._assert_sequence_collections_equal(obs_sc, exp_sc)
    self.assertEqual(obs_id_map, exp_id_map)

    # empty
    obs_sc, obs_id_map = self.empty.update_ids(prefix='abc')
    self._assert_sequence_collections_equal(obs_sc, self.empty)
    self.assertEqual(obs_id_map, {})
def test_update_ids_ids_parameter(self):
    # 3 seqs
    renamed = (('GAUUACA', "abc"), ('UUG', "def"),
               ('U-----UGCC--', "ghi"))
    exp_sc = SequenceCollection(
        [RNA(sequence, id=new_id) for sequence, new_id in renamed])
    # The returned map goes from each new id back to the original id.
    exp_id_map = {'abc': 'r1', 'def': 'r2', 'ghi': 'r3'}

    obs_sc, obs_id_map = self.s2.update_ids(ids=('abc', 'def', 'ghi'))
    self._assert_sequence_collections_equal(obs_sc, exp_sc)
    self.assertEqual(obs_id_map, exp_id_map)

    # empty
    obs_sc, obs_id_map = self.empty.update_ids(ids=[])
    self._assert_sequence_collections_equal(obs_sc, self.empty)
    self.assertEqual(obs_id_map, {})
def test_sequence_collection_to_fastq_kwargs_passed(self):
    # For every fixture, write the collection with the given keyword
    # arguments and compare the output with the expected file's contents.
    import io  # local import: keeps this change self-contained

    for components, kwargs_expected_fp in self.valid_files:
        for kwargs, expected_fp in kwargs_expected_fp:
            # Each component is indexed as
            # (id, description, sequence, quality).
            obj = SequenceCollection([
                NucleotideSequence(c[2], id=c[0], description=c[1],
                                   quality=c[3])
                for c in components
            ])

            fh = StringIO()
            _sequence_collection_to_fastq(obj, fh, **kwargs)
            observed = fh.getvalue()
            fh.close()

            # Mode 'U' is deprecated and rejected by open() on Python 3.11+.
            # io.open() in its default text mode applies universal-newline
            # translation on both Python 2 and Python 3.
            with io.open(expected_fp) as f:
                expected = f.read()

            self.assertEqual(observed, expected)
def test_update_ids_fn_parameter(self):
    def append_42(ids):
        return [id_ + '-42' for id_ in ids]

    # 3 seqs
    renamed = (('GAUUACA', "r1-42"), ('UUG', "r2-42"),
               ('U-----UGCC--', "r3-42"))
    exp_sc = SequenceCollection(
        [RNA(sequence, id=new_id) for sequence, new_id in renamed])
    # The returned map goes from each new id back to the original id.
    exp_id_map = {'r1-42': 'r1', 'r2-42': 'r2', 'r3-42': 'r3'}

    obs_sc, obs_id_map = self.s2.update_ids(fn=append_42)
    self._assert_sequence_collections_equal(obs_sc, exp_sc)
    self.assertEqual(obs_id_map, exp_id_map)

    # empty
    obs_sc, obs_id_map = self.empty.update_ids(fn=append_42)
    self._assert_sequence_collections_equal(obs_sc, self.empty)
    self.assertEqual(obs_id_map, {})
def test_sequence_collection_to_fastq_kwargs_passed(self):
    # For every fixture, write the collection with the given keyword
    # arguments (plus lowercase='introns') and compare the output with the
    # expected file's contents.
    import io  # local import: keeps this change self-contained

    for components, kwargs_expected_fp in self.valid_files:
        for kwargs, expected_fp in kwargs_expected_fp:
            # Each component is indexed as
            # (id, description, sequence, quality).
            obj = SequenceCollection([
                DNA(c[2],
                    metadata={'id': c[0], 'description': c[1]},
                    positional_metadata={'quality': c[3]},
                    lowercase='introns')
                for c in components])

            fh = StringIO()
            # NOTE: this writes into the fixture's own kwargs dict, exactly
            # as the test did before.
            kwargs['lowercase'] = 'introns'
            _sequence_collection_to_fastq(obj, fh, **kwargs)
            observed = fh.getvalue()
            fh.close()

            # Mode 'U' is deprecated and rejected by open() on Python 3.11+.
            # io.open() in its default text mode applies universal-newline
            # translation on both Python 2 and Python 3.
            with io.open(expected_fp) as f:
                expected = f.read()

            self.assertEqual(observed, expected)
def generateReference(assay_list):
    """Build a SequenceCollection of reference amplicon sequences.

    One ``DNA`` sequence is produced per amplicon, with its name stored
    under the ``'id'`` metadata key.

    * Assays with a truthy ``AND`` attribute: every ``Target`` operand
      contributes its amplicons. The name starts as ``assay.name`` and has
      ``_<gene_name>`` / ``_<variant_name>`` appended whenever those are
      set.
    * All other assays: each amplicon of ``assay.target`` is named
      ``<assay.name>_<variant_name>``, or just ``assay.name`` when the
      amplicon has no variant name.

    Returns the ``SequenceCollection`` built from all generated sequences.
    """
    from skbio import DNA
    from skbio import SequenceCollection

    reference = []
    for assay in assay_list:
        name = assay.name
        if assay.AND:
            for operand in assay.AND:
                if isinstance(operand, Target):
                    # NOTE(review): the name keeps growing across operands
                    # and amplicons of the same assay (a_g_v1, a_g_v1_v2,
                    # ...). That behaviour is preserved here; confirm it
                    # is intended.
                    if operand.gene_name:
                        name = name + "_%s" % operand.gene_name
                    for amplicon in operand.amplicons:
                        if amplicon.variant_name:
                            name = name + "_%s" % amplicon.variant_name
                        # Same constructor form as the branch below; the
                        # two branches previously disagreed (id=... here,
                        # a positional metadata dict there).
                        reference.append(
                            DNA(amplicon.sequence, metadata={'id': name}))
        else:
            for amplicon in assay.target.amplicons:
                # Always start from the assay name. The old fallback reused
                # whatever name the previous amplicon had been given, so an
                # amplicon without a variant name inherited its
                # predecessor's id.
                if amplicon.variant_name:
                    name = assay.name + "_%s" % amplicon.variant_name
                else:
                    name = assay.name
                reference.append(
                    DNA(amplicon.sequence, metadata={'id': name}))
    return SequenceCollection(reference)
log_choices = ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"] parser.add_argument( '--log-level', '-l', default="INFO", choices=log_choices, help="Set logging level. Default is info." ) return parser if __name__ == '__main__': parser = get_argument_parser() args = parser.parse_args() level = getattr(logging, args.log_level.upper(), logging.INFO) logging.basicConfig(level=level) sequences = SequenceCollection.read(args.infile, format=args.format) if args.parallel == 0 and len(sequences) > 16: pool_size = multiprocessing.cpu_count() else: pool_size = 1 dmatrix = create_distance_matrix(sequences, d2.distance, pool_size, statistic=d2.d2_neighbourhood_dna) print(dmatrix) phylo_tree = nj(dmatrix) print(phylo_tree.ascii_art()) phylo_tree.write(args.outfile, format=args.target)
class SequenceCollectionTests(TestCase):
    # Unit tests for SequenceCollection: construction, container protocol,
    # equality, k-mer frequencies, distances, degapping and id updating.
    # Explanatory notes are '#' comments rather than docstrings, matching
    # the docstring-free test methods of this class.

    def setUp(self):
        # Three DNA and three RNA sequences, each carrying only an 'id'
        # metadata entry; r3 is the only one containing '-' characters.
        self.d1 = DNA('GATTACA', metadata={'id': "d1"})
        self.d2 = DNA('TTG', metadata={'id': "d2"})
        self.d3 = DNA('GTATACA', metadata={'id': "d3"})
        self.r1 = RNA('GAUUACA', metadata={'id': "r1"})
        self.r2 = RNA('UUG', metadata={'id': "r2"})
        self.r3 = RNA('U-----UGCC--', metadata={'id': "r3"})

        self.seqs1 = [self.d1, self.d2]
        self.seqs2 = [self.r1, self.r2, self.r3]
        self.seqs3 = self.seqs1 + self.seqs2
        self.seqs4 = [self.d1, self.d3]

        self.s1 = SequenceCollection(self.seqs1)
        self.s2 = SequenceCollection(self.seqs2)
        self.s3 = SequenceCollection(self.seqs3)
        self.s4 = SequenceCollection(self.seqs4)
        self.empty = SequenceCollection([])

    def test_init(self):
        # Construction alone is the check: any exception fails the test.
        SequenceCollection(self.seqs1)
        SequenceCollection(self.seqs2)
        SequenceCollection(self.seqs3)
        SequenceCollection([])

    def test_init_fail(self):
        # sequences with overlapping ids
        s1 = [self.d1, self.d1]
        self.assertRaises(SequenceCollectionError, SequenceCollection, s1)

    def test_init_fail_no_id(self):
        # A sequence built without metadata has no 'id' entry.
        seq = Sequence('ACGTACGT')
        with six.assertRaisesRegex(self, SequenceCollectionError,
                                   "'id' must be included in the sequence "
                                   "metadata"):
            SequenceCollection([seq])

    def test_contains(self):
        # Membership is tested with sequence ids.
        self.assertTrue('d1' in self.s1)
        self.assertTrue('r2' in self.s2)
        self.assertFalse('r2' in self.s1)

    def test_eq(self):
        self.assertTrue(self.s1 == self.s1)
        self.assertFalse(self.s1 == self.s2)

        # different objects can be equal
        self.assertTrue(self.s1 == SequenceCollection([self.d1, self.d2]))
        self.assertTrue(SequenceCollection([self.d1, self.d2]) == self.s1)

        # SequenceCollections with different number of sequences are not equal
        self.assertFalse(self.s1 == SequenceCollection([self.d1]))

        class FakeSequenceCollection(SequenceCollection):
            pass

        # SequenceCollections of different types are not equal
        self.assertFalse(self.s4 == FakeSequenceCollection([self.d1, self.d3]))
        self.assertFalse(self.s4 == Alignment([self.d1, self.d3]))

        # SequenceCollections with different sequences are not equal
        self.assertFalse(self.s1 == SequenceCollection([self.d1, self.r1]))

    def test_getitem(self):
        # Integer indexes return the sequences in the order they were given.
        self.assertEqual(self.s1[0], self.d1)
        self.assertEqual(self.s1[1], self.d2)
        self.assertEqual(self.s2[0], self.r1)
        self.assertEqual(self.s2[1], self.r2)

        # On the empty collection an integer index raises IndexError and a
        # string key raises KeyError.
        self.assertRaises(IndexError, self.empty.__getitem__, 0)
        self.assertRaises(KeyError, self.empty.__getitem__, '0')

    def test_iter(self):
        s1_iter = iter(self.s1)
        count = 0
        for actual, expected in zip(s1_iter, self.seqs1):
            count += 1
            self.assertEqual(actual, expected)
        self.assertEqual(count, len(self.seqs1))
        # The iterator must be exhausted once every sequence was yielded.
        self.assertRaises(StopIteration, lambda: next(s1_iter))

    def test_len(self):
        self.assertEqual(len(self.s1), 2)
        self.assertEqual(len(self.s2), 3)
        self.assertEqual(len(self.s3), 5)
        self.assertEqual(len(self.empty), 0)

    def test_ne(self):
        self.assertFalse(self.s1 != self.s1)
        self.assertTrue(self.s1 != self.s2)

        # SequenceCollections with different number of sequences are not equal
        self.assertTrue(self.s1 != SequenceCollection([self.d1]))

        class FakeSequenceCollection(SequenceCollection):
            pass

        # SequenceCollections of different types are not equal
        self.assertTrue(self.s4 != FakeSequenceCollection([self.d1, self.d3]))
        self.assertTrue(self.s4 != Alignment([self.d1, self.d3]))

        # SequenceCollections with different sequences are not equal
        self.assertTrue(self.s1 != SequenceCollection([self.d1, self.r1]))

    def test_repr(self):
        # The repr reports the sequence count and the mean and standard
        # deviation of the sequence lengths, to two decimals.
        self.assertEqual(repr(self.s1),
                         "<SequenceCollection: n=2; "
                         "mean +/- std length=5.00 +/- 2.00>")
        self.assertEqual(repr(self.s2),
                         "<SequenceCollection: n=3; "
                         "mean +/- std length=7.33 +/- 3.68>")
        self.assertEqual(repr(self.s3),
                         "<SequenceCollection: n=5; "
                         "mean +/- std length=6.40 +/- 3.32>")
        self.assertEqual(repr(self.empty),
                         "<SequenceCollection: n=0; "
                         "mean +/- std length=0.00 +/- 0.00>")

    def test_reversed(self):
        s1_iter = reversed(self.s1)
        count = 0
        for actual, expected in zip(s1_iter, self.seqs1[::-1]):
            count += 1
            self.assertEqual(actual, expected)
        self.assertEqual(count, len(self.seqs1))
        # The iterator must be exhausted once every sequence was yielded.
        self.assertRaises(StopIteration, lambda: next(s1_iter))

    def test_kmer_frequencies(self):
        # Absolute counts of the non-overlapping 3-mers.
        expected1 = Counter({'GAT': 1, 'TAC': 1})
        expected2 = Counter({'TTG': 1})
        self.assertEqual(
            self.s1.kmer_frequencies(k=3, overlap=False, relative=False),
            [expected1, expected2])

        # Relative frequencies of the 1-mers.
        expected1 = defaultdict(float)
        expected1['A'] = 3 / 7.
        expected1['C'] = 1 / 7.
        expected1['G'] = 1 / 7.
        expected1['T'] = 2 / 7.
        expected2 = defaultdict(float)
        expected2['G'] = 1 / 3.
        expected2['T'] = 2 / 3.
        self.assertEqual(self.s1.kmer_frequencies(k=1, relative=True),
                         [expected1, expected2])

        # Relative frequencies of the non-overlapping 3-mers.
        expected1 = defaultdict(float)
        expected1['GAT'] = 1 / 2.
        expected1['TAC'] = 1 / 2.
        expected2 = defaultdict(float)
        expected2['TTG'] = 1 / 1.
        self.assertEqual(
            self.s1.kmer_frequencies(k=3, overlap=False, relative=True),
            [expected1, expected2])

        self.assertEqual(self.empty.kmer_frequencies(k=1, relative=True), [])

        # Test to ensure floating point precision bug isn't present. See the
        # tests for Sequence.kmer_frequencies for more details.
        sc = SequenceCollection([RNA('C' * 10, metadata={'id': 's1'}),
                                 RNA('G' * 10, metadata={'id': 's2'})])
        self.assertEqual(sc.kmer_frequencies(1, relative=True),
                         [defaultdict(float, {'C': 1.0}),
                          defaultdict(float, {'G': 1.0})])

    def test_str(self):
        # str() gives one '>id' line followed by one sequence line per
        # sequence; the empty collection gives the empty string.
        exp1 = ">d1\nGATTACA\n>d2\nTTG\n"
        self.assertEqual(str(self.s1), exp1)
        exp2 = ">r1\nGAUUACA\n>r2\nUUG\n>r3\nU-----UGCC--\n"
        self.assertEqual(str(self.s2), exp2)
        exp4 = ""
        self.assertEqual(str(self.empty), exp4)

    def test_distances(self):
        s1 = SequenceCollection([DNA("ACGT", metadata={'id': "d1"}),
                                 DNA("ACGG", metadata={'id': "d2"})])
        expected = [[0, 0.25],
                    [0.25, 0]]
        expected = DistanceMatrix(expected, ['d1', 'd2'])

        # hamming is applied to the .values of the two sequences.
        def h(s1, s2):
            return hamming(s1.values, s2.values)
        actual = s1.distances(h)
        self.assertEqual(actual, expected)

        # alt distance function provided
        def dumb_distance(s1, s2):
            return 42.
        expected = [[0, 42.],
                    [42., 0]]
        expected = DistanceMatrix(expected, ['d1', 'd2'])
        actual = s1.distances(dumb_distance)
        self.assertEqual(actual, expected)

    def test_distribution_stats(self):
        # The returned triple is checked as (count, mean, std); the mean and
        # std figures match those shown in test_repr.
        actual1 = self.s1.distribution_stats()
        self.assertEqual(actual1[0], 2)
        self.assertAlmostEqual(actual1[1], 5.0, 3)
        self.assertAlmostEqual(actual1[2], 2.0, 3)

        actual2 = self.s2.distribution_stats()
        self.assertEqual(actual2[0], 3)
        self.assertAlmostEqual(actual2[1], 7.333, 3)
        self.assertAlmostEqual(actual2[2], 3.682, 3)

        actual3 = self.s3.distribution_stats()
        self.assertEqual(actual3[0], 5)
        self.assertAlmostEqual(actual3[1], 6.400, 3)
        self.assertAlmostEqual(actual3[2], 3.323, 3)

        actual4 = self.empty.distribution_stats()
        self.assertEqual(actual4[0], 0)
        self.assertEqual(actual4[1], 0.0)
        self.assertEqual(actual4[2], 0.0)

    def test_degap(self):
        # r3 ('U-----UGCC--') is expected to come back as 'UUGCC'.
        expected = SequenceCollection([
            RNA('GAUUACA', metadata={'id': "r1"}),
            RNA('UUG', metadata={'id': "r2"}),
            RNA('UUGCC', metadata={'id': "r3"})])
        actual = self.s2.degap()
        self.assertEqual(actual, expected)

    def test_get_seq(self):
        self.assertEqual(self.s1.get_seq('d1'), self.d1)
        self.assertEqual(self.s1.get_seq('d2'), self.d2)

    def test_ids(self):
        # ids() lists the ids in the order the sequences were given.
        self.assertEqual(self.s1.ids(), ['d1', 'd2'])
        self.assertEqual(self.s2.ids(), ['r1', 'r2', 'r3'])
        self.assertEqual(self.s3.ids(),
                         ['d1', 'd2', 'r1', 'r2', 'r3'])
        self.assertEqual(self.empty.ids(), [])

    def test_update_ids_default_behavior(self):
        # 3 seqs
        exp_sc = SequenceCollection([
            RNA('GAUUACA', metadata={'id': "1"}),
            RNA('UUG', metadata={'id': "2"}),
            RNA('U-----UGCC--', metadata={'id': "3"})
        ])
        # The returned map goes from each new id back to the original id.
        exp_id_map = {'1': 'r1', '2': 'r2', '3': 'r3'}
        obs_sc, obs_id_map = self.s2.update_ids()
        self.assertEqual(obs_sc, exp_sc)
        self.assertEqual(obs_id_map, exp_id_map)

        # empty
        obs_sc, obs_id_map = self.empty.update_ids()
        self.assertEqual(obs_sc, self.empty)
        self.assertEqual(obs_id_map, {})

    def test_update_ids_prefix(self):
        # 3 seqs
        exp_sc = SequenceCollection([
            RNA('GAUUACA', metadata={'id': "abc1"}),
            RNA('UUG', metadata={'id': "abc2"}),
            RNA('U-----UGCC--', metadata={'id': "abc3"})
        ])
        exp_id_map = {'abc1': 'r1', 'abc2': 'r2', 'abc3': 'r3'}
        obs_sc, obs_id_map = self.s2.update_ids(prefix='abc')
        self.assertEqual(obs_sc, exp_sc)
        self.assertEqual(obs_id_map, exp_id_map)

        # empty
        obs_sc, obs_id_map = self.empty.update_ids(prefix='abc')
        self.assertEqual(obs_sc, self.empty)
        self.assertEqual(obs_id_map, {})

    def test_update_ids_func_parameter(self):
        def append_42(ids):
            return [id_ + '-42' for id_ in ids]

        # 3 seqs
        exp_sc = SequenceCollection([
            RNA('GAUUACA', metadata={'id': "r1-42"}),
            RNA('UUG', metadata={'id': "r2-42"}),
            RNA('U-----UGCC--', metadata={'id': "r3-42"})
        ])
        exp_id_map = {'r1-42': 'r1', 'r2-42': 'r2', 'r3-42': 'r3'}
        obs_sc, obs_id_map = self.s2.update_ids(func=append_42)
        self.assertEqual(obs_sc, exp_sc)
        self.assertEqual(obs_id_map, exp_id_map)

        # empty
        obs_sc, obs_id_map = self.empty.update_ids(func=append_42)
        self.assertEqual(obs_sc, self.empty)
        self.assertEqual(obs_id_map, {})

    def test_update_ids_ids_parameter(self):
        # 3 seqs
        exp_sc = SequenceCollection([
            RNA('GAUUACA', metadata={'id': "abc"}),
            RNA('UUG', metadata={'id': "def"}),
            RNA('U-----UGCC--', metadata={'id': "ghi"})
        ])
        exp_id_map = {'abc': 'r1', 'def': 'r2', 'ghi': 'r3'}
        obs_sc, obs_id_map = self.s2.update_ids(ids=('abc', 'def', 'ghi'))
        self.assertEqual(obs_sc, exp_sc)
        self.assertEqual(obs_id_map, exp_id_map)

        # empty
        obs_sc, obs_id_map = self.empty.update_ids(ids=[])
        self.assertEqual(obs_sc, self.empty)
        self.assertEqual(obs_id_map, {})

    def test_update_ids_sequence_attributes_propagated(self):
        # Only the 'id' entry is expected to change: the description and
        # the 'quality' positional metadata must survive the update.
        # 1 seq
        exp_sc = Alignment([
            DNA('ACGT', metadata={'id': "abc", 'description': 'desc'},
                positional_metadata={'quality': range(4)})
        ])
        exp_id_map = {'abc': 'seq1'}

        obj = Alignment([
            DNA('ACGT', metadata={'id': "seq1", 'description': 'desc'},
                positional_metadata={'quality': range(4)})
        ])

        obs_sc, obs_id_map = obj.update_ids(ids=('abc',))
        self.assertEqual(obs_sc, exp_sc)
        self.assertEqual(obs_id_map, exp_id_map)

        # 2 seqs
        exp_sc = Alignment([
            DNA('ACGT', metadata={'id': "abc", 'description': 'desc1'},
                positional_metadata={'quality': range(4)}),
            DNA('TGCA', metadata={'id': "def", 'description': 'desc2'},
                positional_metadata={'quality': range(4)[::-1]})
        ])
        exp_id_map = {'abc': 'seq1', 'def': 'seq2'}

        obj = Alignment([
            DNA('ACGT', metadata={'id': "seq1", 'description': 'desc1'},
                positional_metadata={'quality': (0, 1, 2, 3)}),
            DNA('TGCA', metadata={'id': "seq2", 'description': 'desc2'},
                positional_metadata={'quality': (3, 2, 1, 0)})
        ])

        obs_sc, obs_id_map = obj.update_ids(ids=('abc', 'def'))
        self.assertEqual(obs_sc, exp_sc)
        self.assertEqual(obs_id_map, exp_id_map)

    def test_update_ids_invalid_parameter_combos(self):
        # ids together with func, and prefix together with either of them,
        # must each be rejected.
        with six.assertRaisesRegex(self, SequenceCollectionError,
                                   'ids and func'):
            self.s1.update_ids(func=lambda e: e, ids=['foo', 'bar'])

        with six.assertRaisesRegex(self, SequenceCollectionError, 'prefix'):
            self.s1.update_ids(ids=['foo', 'bar'], prefix='abc')

        with six.assertRaisesRegex(self, SequenceCollectionError, 'prefix'):
            self.s1.update_ids(func=lambda e: e, prefix='abc')

    def test_update_ids_invalid_ids(self):
        # incorrect number of new ids
        with six.assertRaisesRegex(self, SequenceCollectionError, '3 != 2'):
            self.s1.update_ids(ids=['foo', 'bar', 'baz'])
        with six.assertRaisesRegex(self, SequenceCollectionError, '4 != 2'):
            self.s1.update_ids(func=lambda e: ['foo', 'bar', 'baz', 'abc'])

        # duplicates
        with six.assertRaisesRegex(self, SequenceCollectionError, 'foo'):
            self.s2.update_ids(ids=['foo', 'bar', 'foo'])
        with six.assertRaisesRegex(self, SequenceCollectionError, 'bar'):
            self.s2.update_ids(func=lambda e: ['foo', 'bar', 'bar'])

    def test_is_empty(self):
        self.assertFalse(self.s1.is_empty())
        self.assertFalse(self.s2.is_empty())
        self.assertFalse(self.s3.is_empty())

        self.assertTrue(self.empty.is_empty())

    def test_iteritems(self):
        # iteritems() pairs each sequence with its 'id' metadata value.
        self.assertEqual(list(self.s1.iteritems()),
                         [(s.metadata['id'], s) for s in self.s1])

    def test_sequence_count(self):
        self.assertEqual(self.s1.sequence_count(), 2)
        self.assertEqual(self.s2.sequence_count(), 3)
        self.assertEqual(self.s3.sequence_count(), 5)
        self.assertEqual(self.empty.sequence_count(), 0)

    def test_sequence_lengths(self):
        # Lengths are listed in sequence order; r3's length of 12 includes
        # its '-' characters.
        self.assertEqual(self.s1.sequence_lengths(), [7, 3])
        self.assertEqual(self.s2.sequence_lengths(), [7, 3, 12])
        self.assertEqual(self.s3.sequence_lengths(), [7, 3, 7, 3, 12])
        self.assertEqual(self.empty.sequence_lengths(), [])
from itertools import islice

from qiime_default_reference import (get_template_alignment,
                                     get_reference_sequences)
from skbio import SequenceCollection

# Benchmark fixtures: the first 500 records of each reference data set as
# (id, sequence-string) tuples.
#
# islice() stops after 500 records, so the (id, str) tuple is only built for
# the records that are kept instead of for the whole collection followed by a
# slice. The resulting lists are identical to the slice-based version.
# NOTE(review): get_template_alignment() / get_reference_sequences()
# presumably return file paths accepted by SequenceCollection.read -- confirm.
gapped_sequences = [
    (s.id, str(s))
    for s in islice(SequenceCollection.read(get_template_alignment()), 500)
]
sequences = [
    (s.id, str(s))
    for s in islice(SequenceCollection.read(get_reference_sequences()), 500)
]

# Fixed motif string used alongside the sequence fixtures above.
motif_1 = "GGTGCAAGCCGGTGGAAACA"
class SequenceCollectionTests(TestCase):
    """Unit tests covering the SequenceCollection API."""

    def setUp(self):
        """Create the sequences and collections reused across the tests."""
        # Individual sequences: DNA (upper- and lowercase), RNA (r3 carries
        # gap characters) and i1, which the validation tests expect to be
        # rejected.
        self.d1 = DNASequence('GATTACA', id="d1")
        self.d2 = DNASequence('TTG', id="d2")
        self.d1_lower = DNASequence('gattaca', id="d1")
        self.d2_lower = DNASequence('ttg', id="d2")
        self.r1 = RNASequence('GAUUACA', id="r1")
        self.r2 = RNASequence('UUG', id="r2")
        self.r3 = RNASequence('U-----UGCC--', id="r3")
        self.i1 = DNASequence('GATXACA', id="i1")

        # Lists of sequence objects.
        self.seqs1 = [self.d1, self.d2]
        self.seqs1_lower = [self.d1_lower, self.d2_lower]
        self.seqs2 = [self.r1, self.r2, self.r3]
        self.seqs3 = self.seqs1 + self.seqs2

        # The same data as (id, sequence-string) records.
        self.seqs1_t = [('d1', 'GATTACA'), ('d2', 'TTG')]
        self.seqs2_t = [('r1', 'GAUUACA'), ('r2', 'UUG'),
                        ('r3', 'U-----UGCC--')]
        self.seqs3_t = self.seqs1_t + self.seqs2_t

        # Collections under test.
        self.s1 = SequenceCollection(self.seqs1)
        self.s1_lower = SequenceCollection(self.seqs1_lower)
        self.s2 = SequenceCollection(self.seqs2)
        self.s3 = SequenceCollection(self.seqs3)
        self.empty = SequenceCollection([])

        self.invalid_s1 = SequenceCollection([self.i1])

    def test_init(self):
        """Construction succeeds for DNA, RNA, mixed and empty input."""
        for seqs in (self.seqs1, self.seqs2, self.seqs3, []):
            SequenceCollection(seqs)

    def test_init_fail(self):
        """Construction fails when two sequences share an id."""
        duplicated = [self.d1, self.d1]
        with self.assertRaises(SequenceCollectionError):
            SequenceCollection(duplicated)

    def test_init_validate(self):
        """Construction with validate=True accepts s1 and rejects invalid_s1.
        """
        SequenceCollection(self.seqs1, validate=True)
        SequenceCollection(self.seqs1, validate=True)
        # invalid_s1 wraps i1, so validation is expected to raise
        with self.assertRaises(SequenceCollectionError):
            SequenceCollection(self.invalid_s1, validate=True)

    def test_from_fasta_records(self):
        """from_fasta_records accepts (id, sequence) records for each type.
        """
        inputs = ((self.seqs1_t, DNASequence),
                  (self.seqs2_t, RNASequence),
                  (self.seqs3_t, NucleotideSequence))
        for records, seq_type in inputs:
            SequenceCollection.from_fasta_records(records, seq_type)

    def test_contains(self):
        """Membership by sequence id works through the in operator."""
        self.assertIn('d1', self.s1)
        self.assertIn('r2', self.s2)
        self.assertNotIn('r2', self.s1)

    def test_eq(self):
        """The == operator compares type, size and sequences."""
        class FakeSequenceCollection(SequenceCollection):
            pass

        same_content = SequenceCollection([self.d1, self.d2])

        self.assertTrue(self.s1 == self.s1)
        self.assertFalse(self.s1 == self.s2)

        # distinct objects holding the same sequences compare equal, in
        # either operand order
        self.assertTrue(self.s1 == same_content)
        self.assertTrue(same_content == self.s1)

        # a different number of sequences breaks equality
        self.assertFalse(self.s1 == SequenceCollection([self.d1]))

        # so does a different collection type
        self.assertFalse(self.s1 ==
                         FakeSequenceCollection([self.d1, self.d2]))
        self.assertFalse(self.s1 == Alignment([self.d1, self.d2]))

        # and so do different sequences
        self.assertFalse(self.s1 == SequenceCollection([self.d1, self.r1]))

    def test_getitem(self):
        """Indexing returns sequences by position and raises when missing.
        """
        lookups = ((self.s1, 0, self.d1), (self.s1, 1, self.d2),
                   (self.s2, 0, self.r1), (self.s2, 1, self.r2))
        for collection, position, wanted in lookups:
            self.assertEqual(collection[position], wanted)

        self.assertRaises(IndexError, self.empty.__getitem__, 0)
        self.assertRaises(KeyError, self.empty.__getitem__, '0')

    def test_iter(self):
        """Iteration yields the sequences in insertion order, then stops."""
        s1_iter = iter(self.s1)
        pairs = list(zip(s1_iter, self.seqs1))
        for observed, wanted in pairs:
            self.assertEqual(observed, wanted)
        self.assertEqual(len(pairs), len(self.seqs1))
        # the iterator must now be exhausted
        self.assertRaises(StopIteration, next, s1_iter)

    def test_len(self):
        """len() reports the number of sequences."""
        sizes = ((self.s1, 2), (self.s2, 3), (self.s3, 5), (self.empty, 0))
        for collection, size in sizes:
            self.assertEqual(len(collection), size)

    def test_ne(self):
        """The != operator mirrors the equality rules."""
        class FakeSequenceCollection(SequenceCollection):
            pass

        self.assertFalse(self.s1 != self.s1)
        self.assertTrue(self.s1 != self.s2)

        # a different number of sequences
        self.assertTrue(self.s1 != SequenceCollection([self.d1]))

        # a different collection type
        self.assertTrue(self.s1 !=
                        FakeSequenceCollection([self.d1, self.d2]))
        self.assertTrue(self.s1 != Alignment([self.d1, self.d2]))

        # different sequences
        self.assertTrue(self.s1 != SequenceCollection([self.d1, self.r1]))

    def test_repr(self):
        """repr() summarises count plus mean and std of the lengths."""
        cases = (
            (self.s1, "<SequenceCollection: n=2; "
                      "mean +/- std length=5.00 +/- 2.00>"),
            (self.s2, "<SequenceCollection: n=3; "
                      "mean +/- std length=7.33 +/- 3.68>"),
            (self.s3, "<SequenceCollection: n=5; "
                      "mean +/- std length=6.40 +/- 3.32>"),
            (self.empty, "<SequenceCollection: n=0; "
                         "mean +/- std length=0.00 +/- 0.00>"),
        )
        for collection, text in cases:
            self.assertEqual(repr(collection), text)

    def test_reversed(self):
        """reversed() yields the sequences in reverse insertion order."""
        s1_iter = reversed(self.s1)
        pairs = list(zip(s1_iter, self.seqs1[::-1]))
        for observed, wanted in pairs:
            self.assertEqual(observed, wanted)
        self.assertEqual(len(pairs), len(self.seqs1))
        # the iterator must now be exhausted
        self.assertRaises(StopIteration, next, s1_iter)

    def test_k_word_frequencies(self):
        """k_word_frequencies returns one frequency mapping per sequence."""
        # k=1 on s1
        single_words = [
            defaultdict(int, {'A': 3 / 7., 'C': 1 / 7.,
                              'G': 1 / 7., 'T': 2 / 7.}),
            defaultdict(int, {'G': 1 / 3., 'T': 2 / 3.}),
        ]
        self.assertEqual(self.s1.k_word_frequencies(k=1), single_words)

        # k=3 without overlap on s1
        triple_words = [
            defaultdict(int, {'GAT': 1 / 2., 'TAC': 1 / 2.}),
            defaultdict(int, {'TTG': 1 / 1.}),
        ]
        self.assertEqual(
            self.s1.k_word_frequencies(k=3, overlapping=False),
            triple_words)

        # an empty collection has nothing to report
        self.assertEqual(self.empty.k_word_frequencies(k=1), [])

    def test_str(self):
        """str() renders the collection as FASTA text."""
        cases = ((self.s1, ">d1\nGATTACA\n>d2\nTTG\n"),
                 (self.s2, ">r1\nGAUUACA\n>r2\nUUG\n>r3\nU-----UGCC--\n"),
                 (self.empty, ""))
        for collection, text in cases:
            self.assertEqual(str(collection), text)

    def test_distances(self):
        """distances() builds a DistanceMatrix from a pairwise function."""
        def dumb_distance(s1, s2):
            return 42.

        pair = SequenceCollection([DNA("ACGT", "d1"), DNA("ACGG", "d2")])
        labels = ['d1', 'd2']

        by_hamming = DistanceMatrix([[0, 0.25], [0.25, 0]], labels)
        self.assertEqual(pair.distances(hamming), by_hamming)

        # alternative distance function supplied by the caller
        by_constant = DistanceMatrix([[0, 42.], [42., 0]], labels)
        self.assertEqual(pair.distances(dumb_distance), by_constant)

    def test_distribution_stats(self):
        """distribution_stats() returns count, mean and std of lengths."""
        cases = ((self.s1, 2, 5.0, 2.0),
                 (self.s2, 3, 7.333, 3.682),
                 (self.s3, 5, 6.400, 3.323))
        for collection, count, mean, std in cases:
            stats = collection.distribution_stats()
            self.assertEqual(stats[0], count)
            self.assertAlmostEqual(stats[1], mean, 3)
            self.assertAlmostEqual(stats[2], std, 3)

        # the empty collection is compared exactly rather than approximately
        empty_stats = self.empty.distribution_stats()
        self.assertEqual(empty_stats[0], 0)
        self.assertEqual(empty_stats[1], 0.0)
        self.assertEqual(empty_stats[2], 0.0)

    def test_degap(self):
        """degap() strips '.' and '-' characters from every sequence."""
        stripped = []
        for id_, seq in self.seqs2_t:
            stripped.append((id_, seq.replace('.', '').replace('-', '')))
        wanted = SequenceCollection.from_fasta_records(stripped, RNASequence)
        self.assertEqual(self.s2.degap(), wanted)

    def test_get_seq(self):
        """get_seq() returns the sequence stored under an id."""
        for seq_id, wanted in (('d1', self.d1), ('d2', self.d2)):
            self.assertEqual(self.s1.get_seq(seq_id), wanted)

    def test_ids(self):
        """ids() lists the sequence ids in order."""
        cases = ((self.s1, ['d1', 'd2']),
                 (self.s2, ['r1', 'r2', 'r3']),
                 (self.s3, ['d1', 'd2', 'r1', 'r2', 'r3']),
                 (self.empty, []))
        for collection, wanted in cases:
            self.assertEqual(collection.ids(), wanted)

    def test_int_map(self):
        """int_map() returns integer-keyed sequence and id mappings."""
        # default keys: "1", "2", ...
        plain = ({"1": self.d1, "2": self.d2}, {"1": "d1", "2": "d2"})
        self.assertEqual(self.s1.int_map(), plain)

        # keys carry the requested prefix
        prefixed = ({"h-1": self.d1, "h-2": self.d2},
                    {"h-1": "d1", "h-2": "d2"})
        self.assertEqual(self.s1.int_map(prefix='h-'), prefixed)

    def test_is_empty(self):
        """is_empty() is true only for the empty collection."""
        for populated in (self.s1, self.s2, self.s3):
            self.assertFalse(populated.is_empty())

        self.assertTrue(self.empty.is_empty())

    def test_is_valid(self):
        """is_valid() accepts the valid fixtures and rejects invalid_s1."""
        for valid in (self.s1, self.s2, self.s3, self.empty):
            self.assertTrue(valid.is_valid())

        self.assertFalse(self.invalid_s1.is_valid())

    def test_iteritems(self):
        """iteritems() yields (id, sequence) pairs in order."""
        wanted = [(s.id, s) for s in self.s1]
        self.assertEqual(list(self.s1.iteritems()), wanted)

    def test_lower(self):
        """lower() returns the lowercase collection."""
        self.assertEqual(self.s1.lower(), self.s1_lower)

    def test_sequence_count(self):
        """sequence_count() reports the number of sequences."""
        counts = ((self.s1, 2), (self.s2, 3), (self.s3, 5), (self.empty, 0))
        for collection, count in counts:
            self.assertEqual(collection.sequence_count(), count)

    def test_sequence_lengths(self):
        """sequence_lengths() lists the length of every sequence."""
        cases = ((self.s1, [7, 3]),
                 (self.s2, [7, 3, 12]),
                 (self.s3, [7, 3, 7, 3, 12]),
                 (self.empty, []))
        for collection, lengths in cases:
            self.assertEqual(collection.sequence_lengths(), lengths)

    def test_to_fasta(self):
        """to_fasta() renders the collection as FASTA text."""
        cases = ((self.s1, ">d1\nGATTACA\n>d2\nTTG\n"),
                 (self.s2, ">r1\nGAUUACA\n>r2\nUUG\n>r3\nU-----UGCC--\n"))
        for collection, text in cases:
            self.assertEqual(collection.to_fasta(), text)

    def test_toFasta(self):
        # toFasta is called with all warnings silenced for the duration of
        # the call
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            fasta_text = ">d1\nGATTACA\n>d2\nTTG\n"
            self.assertEqual(self.s1.toFasta(), fasta_text)

    def test_upper(self):
        """upper() returns the uppercase collection."""
        self.assertEqual(self.s1_lower.upper(), self.s1)
def test_from_fasta_records(self):
    """from_fasta_records accepts (id, sequence) records for each type.
    """
    # Each record list is paired with the sequence type it is built with;
    # the test only checks that construction does not raise.
    inputs = ((self.seqs1_t, DNASequence),
              (self.seqs2_t, RNASequence),
              (self.seqs3_t, NucleotideSequence))
    for records, seq_type in inputs:
        SequenceCollection.from_fasta_records(records, seq_type)
class SequenceCollectionTests(TestCase):
    def setUp(self):
        # Individual sequences: DNA (upper- and lowercase), RNA (r3 carries
        # gap characters) and i1, which the validation tests expect to be
        # rejected.
        self.d1 = DNASequence('GATTACA', id="d1")
        self.d2 = DNASequence('TTG', id="d2")
        self.d3 = DNASequence('GTATACA', id="d3")
        self.d1_lower = DNASequence('gattaca', id="d1")
        self.d2_lower = DNASequence('ttg', id="d2")
        self.d3_lower = DNASequence('gtataca', id="d3")
        self.r1 = RNASequence('GAUUACA', id="r1")
        self.r2 = RNASequence('UUG', id="r2")
        self.r3 = RNASequence('U-----UGCC--', id="r3")
        self.i1 = DNASequence('GATXACA', id="i1")

        # Lists of sequence objects.
        self.seqs1 = [self.d1, self.d2]
        self.seqs1_lower = [self.d1_lower, self.d2_lower]
        self.seqs2 = [self.r1, self.r2, self.r3]
        self.seqs3 = self.seqs1 + self.seqs2
        self.seqs4 = [self.d1, self.d3]

        # The same data as (id, sequence-string) records.
        self.seqs1_t = [('d1', 'GATTACA'), ('d2', 'TTG')]
        self.seqs2_t = [('r1', 'GAUUACA'), ('r2', 'UUG'),
                        ('r3', 'U-----UGCC--')]
        self.seqs3_t = self.seqs1_t + self.seqs2_t

        # Collections under test.
        self.s1 = SequenceCollection(self.seqs1)
        self.s1_lower = SequenceCollection(self.seqs1_lower)
        self.s2 = SequenceCollection(self.seqs2)
        self.s3 = SequenceCollection(self.seqs3)
        self.s4 = SequenceCollection(self.seqs4)
        self.empty = SequenceCollection([])

        self.invalid_s1 = SequenceCollection([self.i1])

    def test_init(self):
        # construction must not raise for DNA, RNA, mixed or empty input
        for seqs in (self.seqs1, self.seqs2, self.seqs3, []):
            SequenceCollection(seqs)

    def test_init_fail(self):
        # two sequences sharing an id are rejected
        duplicated = [self.d1, self.d1]
        with self.assertRaises(SequenceCollectionError):
            SequenceCollection(duplicated)

    def test_init_validate(self):
        SequenceCollection(self.seqs1, validate=True)
        SequenceCollection(self.seqs1, validate=True)
        # invalid_s1 wraps i1, so validation is expected to raise
        with self.assertRaises(SequenceCollectionError):
            SequenceCollection(self.invalid_s1, validate=True)

    def test_contains(self):
        # membership is tested by sequence id
        self.assertIn('d1', self.s1)
        self.assertIn('r2', self.s2)
        self.assertNotIn('r2', self.s1)

    def test_eq(self):
        class FakeSequenceCollection(SequenceCollection):
            pass

        same_content = SequenceCollection([self.d1, self.d2])

        self.assertTrue(self.s1 == self.s1)
        self.assertFalse(self.s1 == self.s2)

        # distinct objects holding the same sequences compare equal, in
        # either operand order
        self.assertTrue(self.s1 == same_content)
        self.assertTrue(same_content == self.s1)

        # a different number of sequences breaks equality
        self.assertFalse(self.s1 == SequenceCollection([self.d1]))

        # so does a different collection type
        self.assertFalse(self.s4 ==
                         FakeSequenceCollection([self.d1, self.d3]))
        self.assertFalse(self.s4 == Alignment([self.d1, self.d3]))

        # and so do different sequences
        self.assertFalse(self.s1 == SequenceCollection([self.d1, self.r1]))

    def test_getitem(self):
        lookups = ((self.s1, 0, self.d1), (self.s1, 1, self.d2),
                   (self.s2, 0, self.r1), (self.s2, 1, self.r2))
        for collection, position, wanted in lookups:
            self.assertEqual(collection[position], wanted)

        # missing positions and missing ids raise different errors
        self.assertRaises(IndexError, self.empty.__getitem__, 0)
        self.assertRaises(KeyError, self.empty.__getitem__, '0')

    def test_iter(self):
        s1_iter = iter(self.s1)
        pairs = list(zip(s1_iter, self.seqs1))
        for observed, wanted in pairs:
            self.assertEqual(observed, wanted)
        self.assertEqual(len(pairs), len(self.seqs1))
        # the iterator must now be exhausted
        self.assertRaises(StopIteration, next, s1_iter)

    def test_len(self):
        sizes = ((self.s1, 2), (self.s2, 3), (self.s3, 5), (self.empty, 0))
        for collection, size in sizes:
            self.assertEqual(len(collection), size)

    def test_ne(self):
        class FakeSequenceCollection(SequenceCollection):
            pass

        self.assertFalse(self.s1 != self.s1)
        self.assertTrue(self.s1 != self.s2)

        # a different number of sequences
        self.assertTrue(self.s1 != SequenceCollection([self.d1]))

        # a different collection type
        self.assertTrue(self.s4 !=
                        FakeSequenceCollection([self.d1, self.d3]))
        self.assertTrue(self.s4 != Alignment([self.d1, self.d3]))

        # different sequences
        self.assertTrue(self.s1 != SequenceCollection([self.d1, self.r1]))

    def test_repr(self):
        cases = (
            (self.s1, "<SequenceCollection: n=2; "
                      "mean +/- std length=5.00 +/- 2.00>"),
            (self.s2, "<SequenceCollection: n=3; "
                      "mean +/- std length=7.33 +/- 3.68>"),
            (self.s3, "<SequenceCollection: n=5; "
                      "mean +/- std length=6.40 +/- 3.32>"),
            (self.empty, "<SequenceCollection: n=0; "
                         "mean +/- std length=0.00 +/- 0.00>"),
        )
        for collection, text in cases:
            self.assertEqual(repr(collection), text)

    def test_reversed(self):
        s1_iter = reversed(self.s1)
        pairs = list(zip(s1_iter, self.seqs1[::-1]))
        for observed, wanted in pairs:
            self.assertEqual(observed, wanted)
        self.assertEqual(len(pairs), len(self.seqs1))
        # the iterator must now be exhausted
        self.assertRaises(StopIteration, next, s1_iter)

    def test_k_word_frequencies(self):
        # k=1 on s1
        single_words = [
            defaultdict(float, {'A': 3 / 7., 'C': 1 / 7.,
                                'G': 1 / 7., 'T': 2 / 7.}),
            defaultdict(float, {'G': 1 / 3., 'T': 2 / 3.}),
        ]
        self.assertEqual(self.s1.k_word_frequencies(k=1), single_words)

        # k=3 without overlap on s1
        triple_words = [
            defaultdict(float, {'GAT': 1 / 2., 'TAC': 1 / 2.}),
            defaultdict(float, {'TTG': 1 / 1.}),
        ]
        self.assertEqual(
            self.s1.k_word_frequencies(k=3, overlapping=False),
            triple_words)

        # an empty collection has nothing to report
        self.assertEqual(self.empty.k_word_frequencies(k=1), [])

        # Guard against a floating point precision bug: a homopolymer must
        # report a frequency of exactly 1.0. The tests for
        # BiologicalSequence.k_word_frequencies give more details.
        homopolymers = SequenceCollection([RNA('C' * 10, id='s1'),
                                           RNA('G' * 10, id='s2')])
        self.assertEqual(homopolymers.k_word_frequencies(1),
                         [defaultdict(float, {'C': 1.0}),
                          defaultdict(float, {'G': 1.0})])

    def test_str(self):
        cases = ((self.s1, ">d1\nGATTACA\n>d2\nTTG\n"),
                 (self.s2, ">r1\nGAUUACA\n>r2\nUUG\n>r3\nU-----UGCC--\n"),
                 (self.empty, ""))
        for collection, text in cases:
            self.assertEqual(str(collection), text)

    def test_distances(self):
        def dumb_distance(s1, s2):
            return 42.

        pair = SequenceCollection([DNA("ACGT", "d1"), DNA("ACGG", "d2")])
        labels = ['d1', 'd2']

        by_hamming = DistanceMatrix([[0, 0.25], [0.25, 0]], labels)
        self.assertEqual(pair.distances(hamming), by_hamming)

        # alternative distance function supplied by the caller
        by_constant = DistanceMatrix([[0, 42.], [42., 0]], labels)
        self.assertEqual(pair.distances(dumb_distance), by_constant)

    def test_distribution_stats(self):
        # (collection, count, mean length, std of lengths)
        cases = ((self.s1, 2, 5.0, 2.0),
                 (self.s2, 3, 7.333, 3.682),
                 (self.s3, 5, 6.400, 3.323))
        for collection, count, mean, std in cases:
            stats = collection.distribution_stats()
            self.assertEqual(stats[0], count)
            self.assertAlmostEqual(stats[1], mean, 3)
            self.assertAlmostEqual(stats[2], std, 3)

        # the empty collection is compared exactly rather than approximately
        empty_stats = self.empty.distribution_stats()
        self.assertEqual(empty_stats[0], 0)
        self.assertEqual(empty_stats[1], 0.0)
        self.assertEqual(empty_stats[2], 0.0)

    def test_degap(self):
        # r3 loses its gap characters; r1 and r2 are unchanged
        wanted = SequenceCollection([
            RNASequence('GAUUACA', id="r1"),
            RNASequence('UUG', id="r2"),
            RNASequence('UUGCC', id="r3")])
        self.assertEqual(self.s2.degap(), wanted)

    def test_get_seq(self):
        for seq_id, wanted in (('d1', self.d1), ('d2', self.d2)):
            self.assertEqual(self.s1.get_seq(seq_id), wanted)

    def test_ids(self):
        cases = ((self.s1, ['d1', 'd2']),
                 (self.s2, ['r1', 'r2', 'r3']),
                 (self.s3, ['d1', 'd2', 'r1', 'r2', 'r3']),
                 (self.empty, []))
        for collection, wanted in cases:
            self.assertEqual(collection.ids(), wanted)

    def _assert_sequence_collections_equal(self, observed, expected):
        """Assert the collections are equal and so is each sequence pair."""
        # TODO drop this helper once SequenceCollection gains an equals
        # method (part of #656). It exists so that IDs take part in the
        # comparison, which SequenceCollection.__eq__ does not include.
        self.assertEqual(observed, expected)
        for got, wanted in zip(observed, expected):
            self.assertTrue(got.equals(wanted))

    def test_update_ids_default_behavior(self):
        # 3 seqs: ids become "1", "2", "3"
        renamed = SequenceCollection([
            RNA('GAUUACA', id="1"),
            RNA('UUG', id="2"),
            RNA('U-----UGCC--', id="3")
        ])
        new_to_old = {'1': 'r1', '2': 'r2', '3': 'r3'}
        result, id_map = self.s2.update_ids()
        self._assert_sequence_collections_equal(result, renamed)
        self.assertEqual(id_map, new_to_old)

        # empty
        result, id_map = self.empty.update_ids()
        self._assert_sequence_collections_equal(result, self.empty)
        self.assertEqual(id_map, {})

    def test_update_ids_prefix(self):
        # 3 seqs: ids become the prefix followed by "1", "2", "3"
        renamed = SequenceCollection([
            RNA('GAUUACA', id="abc1"),
            RNA('UUG', id="abc2"),
            RNA('U-----UGCC--', id="abc3")
        ])
        new_to_old = {'abc1': 'r1', 'abc2': 'r2', 'abc3': 'r3'}
        result, id_map = self.s2.update_ids(prefix='abc')
        self._assert_sequence_collections_equal(result, renamed)
        self.assertEqual(id_map, new_to_old)

        # empty
        result, id_map = self.empty.update_ids(prefix='abc')
        self._assert_sequence_collections_equal(result, self.empty)
        self.assertEqual(id_map, {})

    def test_update_ids_fn_parameter(self):
        def append_42(ids):
            return [id_ + '-42' for id_ in ids]

        # 3 seqs: every id gains the '-42' suffix
        renamed = SequenceCollection([
            RNA('GAUUACA', id="r1-42"),
            RNA('UUG', id="r2-42"),
            RNA('U-----UGCC--', id="r3-42")
        ])
        new_to_old = {'r1-42': 'r1', 'r2-42': 'r2', 'r3-42': 'r3'}
        result, id_map = self.s2.update_ids(fn=append_42)
        self._assert_sequence_collections_equal(result, renamed)
        self.assertEqual(id_map, new_to_old)

        # empty
        result, id_map = self.empty.update_ids(fn=append_42)
        self._assert_sequence_collections_equal(result, self.empty)
        self.assertEqual(id_map, {})

    def test_update_ids_ids_parameter(self):
        # 3 seqs: ids are replaced by the ones supplied
        renamed = SequenceCollection([
            RNA('GAUUACA', id="abc"),
            RNA('UUG', id="def"),
            RNA('U-----UGCC--', id="ghi")
        ])
        new_to_old = {'abc': 'r1', 'def': 'r2', 'ghi': 'r3'}
        result, id_map = self.s2.update_ids(ids=('abc', 'def', 'ghi'))
        self._assert_sequence_collections_equal(result, renamed)
        self.assertEqual(id_map, new_to_old)

        # empty
        result, id_map = self.empty.update_ids(ids=[])
        self._assert_sequence_collections_equal(result, self.empty)
        self.assertEqual(id_map, {})

    def test_update_ids_sequence_attributes_propagated(self):
        # 1 seq: description and quality must survive the id change
        renamed = Alignment([
            DNA('ACGT', id="abc", description='desc', quality=range(4))
        ])
        new_to_old = {'abc': 'seq1'}
        source = Alignment([
            DNA('ACGT', id="seq1", description='desc', quality=range(4))
        ])
        result, id_map = source.update_ids(ids=('abc',))
        self._assert_sequence_collections_equal(result, renamed)
        self.assertEqual(id_map, new_to_old)

        # 2 seqs
        renamed = Alignment([
            DNA('ACGT', id="abc", description='desc1', quality=range(4)),
            DNA('TGCA', id="def", description='desc2',
                quality=range(4)[::-1])
        ])
        new_to_old = {'abc': 'seq1', 'def': 'seq2'}
        source = Alignment([
            DNA('ACGT', id="seq1", description='desc1',
                quality=(0, 1, 2, 3)),
            DNA('TGCA', id="seq2", description='desc2',
                quality=(3, 2, 1, 0))
        ])
        result, id_map = source.update_ids(ids=('abc', 'def'))
        self._assert_sequence_collections_equal(result, renamed)
        self.assertEqual(id_map, new_to_old)

    def test_update_ids_invalid_parameter_combos(self):
        # ids together with fn
        with self.assertRaisesRegexp(SequenceCollectionError, 'ids and fn'):
            self.s1.update_ids(fn=lambda e: e, ids=['foo', 'bar'])

        # prefix together with ids
        with self.assertRaisesRegexp(SequenceCollectionError, 'prefix'):
            self.s1.update_ids(ids=['foo', 'bar'], prefix='abc')

        # prefix together with fn
        with self.assertRaisesRegexp(SequenceCollectionError, 'prefix'):
            self.s1.update_ids(fn=lambda e: e, prefix='abc')

    def test_update_ids_invalid_ids(self):
        # incorrect number of new ids (s1 holds 2 sequences)
        with self.assertRaisesRegexp(SequenceCollectionError, '3 != 2'):
            self.s1.update_ids(ids=['foo', 'bar', 'baz'])
        with self.assertRaisesRegexp(SequenceCollectionError, '4 != 2'):
            self.s1.update_ids(fn=lambda e: ['foo', 'bar', 'baz', 'abc'])

        # duplicates: the message is expected to mention the repeated id
        with self.assertRaisesRegexp(SequenceCollectionError, 'foo'):
            self.s2.update_ids(ids=['foo', 'bar', 'foo'])
        with self.assertRaisesRegexp(SequenceCollectionError, 'bar'):
            self.s2.update_ids(fn=lambda e: ['foo', 'bar', 'bar'])

    def test_is_empty(self):
        for populated in (self.s1, self.s2, self.s3):
            self.assertFalse(populated.is_empty())

        self.assertTrue(self.empty.is_empty())

    def test_is_valid(self):
        for valid in (self.s1, self.s2, self.s3, self.empty):
            self.assertTrue(valid.is_valid())

        self.assertFalse(self.invalid_s1.is_valid())

    def test_iteritems(self):
        # (id, sequence) pairs in iteration order
        wanted = [(s.id, s) for s in self.s1]
        self.assertEqual(list(self.s1.iteritems()), wanted)

    def test_lower(self):
        self.assertEqual(self.s1.lower(), self.s1_lower)

    def test_sequence_count(self):
        counts = ((self.s1, 2), (self.s2, 3), (self.s3, 5), (self.empty, 0))
        for collection, count in counts:
            self.assertEqual(collection.sequence_count(), count)

    def test_sequence_lengths(self):
        cases = ((self.s1, [7, 3]),
                 (self.s2, [7, 3, 12]),
                 (self.s3, [7, 3, 7, 3, 12]),
                 (self.empty, []))
        for collection, lengths in cases:
            self.assertEqual(collection.sequence_lengths(), lengths)

    def test_upper(self):
        self.assertEqual(self.s1_lower.upper(), self.s1)