def test_filter_sample_categories(self):
    """Filtering by category size keeps/drops whole sample groups."""
    # drop a non-existent sample id, so the experiment content is unchanged
    exp = self.test1.filter_ids(['badsample'], axis=0, negate=True)
    # with threshold 9 no category is removed
    res = exp.filter_sample_categories('group', 9)
    assert_experiment_equal(res, exp)
    # with threshold 10 the small group is removed; only group '1' samples remain
    res = exp.filter_sample_categories('group', 10)
    assert_experiment_equal(res, exp.filter_samples('group', '1'))
def test_filter_features_edge_cases(self):
    """Edge case: no feature matches the filter value."""
    # no feature has oxygen == 'facultative', so all features are dropped
    res = self.test2.filter_features('oxygen', ['facultative'])
    self.assertEqual(res.shape, (9, 0))
    # negating the same filter keeps everything
    res = self.test2.filter_features('oxygen', ['facultative'], negate=True)
    assert_experiment_equal(res, self.test2)
def test_copy(self):
    """copy() returns an equal but independent experiment."""
    dup = self.test1.copy()
    assert_experiment_equal(dup, self.test1)
    self.assertIsNot(dup, self.test1)
    # verify a deep copy: mutating the duplicate's data matrix must not
    # affect the original
    dup.data[0, 0] = dup.data[0, 0] + 1
    self.assertNotEqual(dup.data[0, 0], self.test1.data[0, 0])
def test_join_metadata_fields_complex(self):
    """Join feature metadata fields with explicit name, separator, inplace and align.

    Two scenarios: joining a field with itself, and joining two different
    fields with left-alignment padding on the first field.
    """
    # join a feature field with itself, custom new field name, separator and inplace
    exp = deepcopy(self.test1)
    newexp = exp.join_metadata_fields('taxonomy', 'taxonomy', 'test', axis=1, sep=';', inplace=True)
    # inplace=True returns the same object
    self.assertIs(newexp, exp)
    # the joined column lands in feature metadata only
    self.assertIn('test', exp.feature_metadata.columns)
    self.assertNotIn('test', exp.sample_metadata.columns)
    self.assertEqual(exp.feature_metadata['test'].iloc[11], 'bad_bacteria;bad_bacteria')
    # verify nothing changed besides the new feature metadata column
    assert_experiment_equal(exp, self.test1, ignore_md_fields=['test'])
    # join two different feature fields with separator, inplace and align
    exp = deepcopy(self.test1)
    newexp = exp.join_metadata_fields('taxonomy', 'ph', 'test', axis=1, sep=';', align='<', inplace=True)
    self.assertIs(newexp, exp)
    self.assertIn('test', exp.feature_metadata.columns)
    self.assertNotIn('test', exp.sample_metadata.columns)
    # align='<' pads the first field (note the trailing space before ';4.1')
    self.assertEqual(
        exp.feature_metadata.loc['AT', 'test'],
        'k__Bacteria; p__Tenericutes; c__Mollicutes; o__Mycoplasmatales; f__Mycoplasmataceae; g__Mycoplasma; s__ ;4.1'
    )
    # verify nothing changed besides the new feature metadata column
    assert_experiment_equal(exp, self.test1, ignore_md_fields=['test'])
def test_read_amplicon(self):
    """read_amplicon with min_reads/normalize equals manual read+filter+normalize."""
    res = ca.read_amplicon(self.test1_biom, min_reads=1000, normalize=10000)
    # rebuild the same result step by step
    ref = ca.read(self.test1_biom, normalize=None)
    ref.filter_by_data('abundance', axis=0, cutoff=1000, inplace=True, mean_or_sum='sum')
    ref.normalize(inplace=True)
    assert_experiment_equal(res, ref)
    # the taxonomy from the biom table should be attached to feature metadata
    self.assertIn('taxonomy', res.feature_metadata.columns)
def test_filter_by_data_sample_edge_cases(self):
    """Edge cases: sample filter removes everything / removes nothing."""
    # a huge cutoff drops every sample
    res = self.test2.filter_by_data('abundance', axis=0, cutoff=100000, mean_or_sum='sum')
    self.assertEqual(res.shape, (0, 8))
    # a tiny cutoff keeps everything, but still returns a new object
    res = self.test2.filter_by_data('abundance', axis=0, cutoff=1, mean_or_sum='sum')
    assert_experiment_equal(res, self.test2)
    self.assertIsNot(res, self.test2)
def test_sort_by_metadata_feature(self):
    """Two-key feature sort: minor key first, then stable sort on major key."""
    minor_sorted = self.test2.sort_by_metadata(field='level2', axis=1)
    res = minor_sorted.sort_by_metadata(field='level1', axis=1)
    self.assertIsNot(res, self.test2)
    # the resulting order should match reordering the original by 'ori.order'
    expected = self.test2.reorder(res.feature_metadata['ori.order'], axis=1)
    assert_experiment_equal(res, expected)
    # 'new.order' records the expected post-sort positions
    self.assertListEqual(res.feature_metadata['new.order'].tolist(), list(range(8)))
def test_split_train_test(self):
    """Stratified split with a fixed random_state is reproducible."""
    train, test = self.test2_dense.split_train_test(
        test_size=3, stratify='categorical', random_state=7)
    # with random_state=7 these exact sample ids land in each split
    expected_test = self.test2_dense.filter_ids(['S3', 'S8', 'S1'], axis='s')
    assert_experiment_equal(test, expected_test)
    expected_train = self.test2_dense.filter_ids(
        ['S9', 'S6', 'S5', 'S2', 'S4', 'S7'], axis='s')
    assert_experiment_equal(train, expected_train)
def test_downsample_sample(self):
    """Downsampling on 'group' equalizes group sizes without touching features."""
    res = self.test2.downsample('group')
    # 4 samples remain; feature count is unchanged
    self.assertEqual(res.shape, (4, 8))
    kept_ids = res.sample_metadata.index.tolist()
    all_ids = self.test2.sample_metadata.index.tolist()
    # the kept samples, in order, should equal a reorder of the original
    positions = [all_ids.index(sid) for sid in kept_ids]
    assert_experiment_equal(res, self.test2.reorder(positions))
def test_filter_by_data_feature_edge_cases(self):
    """Edge cases: feature filter removes everything / removes nothing."""
    # a huge cutoff drops every feature
    res = self.test2.filter_by_data('sum_abundance', axis=1, cutoff=10000)
    self.assertEqual(res.shape, (9, 0))
    # a tiny cutoff keeps everything, but still returns a new object
    res = self.test2.filter_by_data('sum_abundance', axis=1, cutoff=1)
    assert_experiment_equal(res, self.test2)
    self.assertIsNot(res, self.test2)
def test_save(self):
    """Round-trip an experiment through save() in json biom format.

    The temporary directory is removed in a ``finally`` block so a failing
    assertion no longer leaks it on disk.
    """
    exp = ca.read(self.test2_biom, self.test2_samp, normalize=None)
    d = mkdtemp()
    try:
        f = join(d, 'test1.save')
        # test the json biom format
        exp.save(f, fmt='json')
        newexp = ca.read(f + '.biom', f + '_sample.txt', normalize=None)
        # '#SampleID.1' is an artifact column added by the save/read cycle
        assert_experiment_equal(newexp, exp, ignore_md_fields=['#SampleID.1'])
    finally:
        shutil.rmtree(d)
def test_join_metadata_fields(self):
    """Default join of two sample metadata fields creates 'field1_field2'."""
    res = self.test1.join_metadata_fields('id', 'group', inplace=False)
    # default new column name is the two field names joined with '_'
    self.assertIn('id_group', res.sample_metadata.columns)
    self.assertEqual(res.sample_metadata.loc['S12', 'id_group'], '12.0_2')
    # only the new sample metadata column should differ from the original
    assert_experiment_equal(res, self.test1, ignore_md_fields=['id_group'])
def test_from_pandas_reorder(self):
    """from_pandas preserves a dataframe whose rows/columns were reordered."""
    frame = self.test1.to_pandas(sparse=False)
    # reorder rows by one feature column, then columns by one sample row
    frame = frame.sort_values(self.test1.feature_metadata.index.values[10])
    frame = frame.sort_values(frame.index.values[0], axis=1)
    rebuilt = ca.Experiment.from_pandas(frame, self.test1)
    # apply the equivalent reordering to the original experiment
    expected = self.test1.sort_by_data(subset=[10], key='mean')
    expected = expected.sort_by_data(subset=[0], key='mean', axis=1)
    assert_experiment_equal(rebuilt, expected)
def test_read_amplicon_old_api(self):
    """Duplicate of test_read_amplicon using the older parameter names.

    NOTE(review): this method was previously also named ``test_read_amplicon``,
    which silently shadowed the earlier test of the same name so only one of
    them was ever collected and run. Renamed so both are collected. It uses
    ``filter_reads`` / default ``filter_by_data`` arguments - confirm these
    are still supported by the current API; if not, this test (rather than
    being silently skipped) will now fail visibly and should be removed.
    """
    exp = ca.read_amplicon(self.test1_biom, filter_reads=1000, normalize=10000)
    exp2 = ca.read(self.test1_biom, normalize=None)
    exp2.filter_by_data('sum_abundance', cutoff=1000, inplace=True)
    exp2.normalize(inplace=True)
    assert_experiment_equal(exp, exp2)
    self.assertIn('taxonomy', exp.feature_metadata)
def test_filter_samples_edge_cases(self):
    """Edge case: filtering samples on a value absent from the field."""
    # read fresh so we control normalization; 'group' dtype is object (str)
    exp = ca.read(self.test1_biom, self.test1_samp, self.test1_feat,
                  normalize=None)
    # there is no group '3': the positive filter keeps nothing
    res = exp.filter_samples('group', ['3'])
    self.assertEqual(res.shape, (0, 12))
    # and the negated filter keeps everything
    res = exp.filter_samples('group', ['3'], negate=True)
    assert_experiment_equal(res, exp)
def test_sort_by_data_sample(self):
    """Sort samples by data values of selected features."""
    # sorting on features 0 and 7 happens to keep the original order
    res = self.test2.sort_by_data(subset=[0, 7])
    assert_experiment_equal(res, self.test2)
    # sorting on features 0 and 3 changes the order
    res = self.test2.sort_by_data(subset=[0, 3])
    expected = self.test2.reorder(res.sample_metadata['ori.order'], axis=0)
    assert_experiment_equal(res, expected)
    # 'new.order' records the expected post-sort positions
    self.assertListEqual(res.sample_metadata['new.order'].tolist(), list(range(9)))
def test_join_experiments(self):
    """Join an experiment with a relabeled copy of itself."""
    other = deepcopy(self.test1)
    other.description = 't2'
    joined = self.test1.join_experiments(other, prefixes=('c1', ''))
    # features are shared between the two halves; samples are doubled
    self.assertEqual(len(joined.feature_metadata), len(self.test1.feature_metadata))
    self.assertEqual(len(joined.sample_metadata), len(self.test1.sample_metadata) * 2)
    # filtering on the source experiment label recovers the copy
    recovered = joined.filter_samples('experiments', ['t2'])
    assert_experiment_equal(recovered, other, ignore_md_fields=['experiments'])
def test_filter_by_metadata_sample_edge_cases(self):
    """Edge cases for sample metadata filtering (no match / full match)."""
    # group 3 does not exist: the positive filter keeps nothing
    res = self.test2.filter_by_metadata('group', [3])
    self.assertEqual(res.shape, (0, 8))
    # ...and the negated filter keeps everything
    res = self.test2.filter_by_metadata('group', [3], negate=True)
    assert_experiment_equal(res, self.test2)
    # every sample is in group 1 or 2: the positive filter keeps everything
    res = self.test2.filter_by_metadata('group', [1, 2])
    assert_experiment_equal(res, self.test2)
    # ...and the negated filter keeps nothing
    res = self.test2.filter_by_metadata('group', [1, 2], negate=True)
    self.assertEqual(res.shape, (0, 8))
def test_sort_by_metadata_sample(self):
    """Multi-key sample sort: minor keys first so the stable sort keeps them."""
    res = self.timeseries.sort_by_metadata(field='MINUTES', inplace=True)
    res = res.sort_by_metadata(field='HOUR', inplace=True)
    res = res.sort_by_metadata(field='DAY', inplace=True)
    # inplace sorting returns the same object
    self.assertIs(res, self.timeseries)
    expected = ca.read(join(self.test_data_dir, 'timeseries.sorted.time.biom'),
                       join(self.test_data_dir, 'timeseries.sample'),
                       normalize=None)
    assert_experiment_equal(res, expected, almost_equal=True)
    # the pre-recorded sample numbers should now be strictly increasing
    self.assertListEqual(res.sample_metadata['MF_SAMPLE_NUMBER'].tolist(),
                         list(range(1, 96)))
def test_cluster_data(self):
    """Cluster features after a log + per-feature scaling transform."""
    def _log_scale(e):
        # log-transform, then standardize along the feature axis
        e.log_n(inplace=True)
        e.scale(inplace=True, axis=1)
        return e

    # no minimal filtering applied before clustering
    res = self.test1.cluster_data(transform=_log_scale)
    expected = ca.read(join(self.test_data_dir, 'test1.clustered.features.biom'),
                       self.test1_samp,
                       normalize=None)
    assert_experiment_equal(res, expected, almost_equal=True)
def test_sort_samples(self):
    """sort_samples chained on minor-to-major keys matches the reference file."""
    res = self.timeseries.sort_samples('MINUTES', inplace=True)
    res = res.sort_samples('HOUR', inplace=True)
    res = res.sort_samples('DAY', inplace=True)
    # inplace sorting returns the same object
    self.assertIs(res, self.timeseries)
    expected = ca.read(join(self.test_data_dir, 'timeseries.sorted.time.biom'),
                       join(self.test_data_dir, 'timeseries.sample'),
                       normalize=None)
    assert_experiment_equal(res, expected, almost_equal=True)
    # the pre-recorded sample numbers should now be strictly increasing
    self.assertListEqual(res.sample_metadata['MF_SAMPLE_NUMBER'].tolist(),
                         list(range(1, 96)))
def test_reorder_round_trip(self):
    """Applying a random permutation and then its inverse is a no-op."""
    exp = ca.read(self.timeseries_biom, self.timeseries_samp, normalize=None)
    n_samples, n_features = exp.data.shape
    perm_s = np.random.permutation(n_samples)
    perm_f = np.random.permutation(n_features)
    # argsort of a permutation yields its inverse
    inv_s = np.argsort(perm_s)
    inv_f = np.argsort(perm_f)
    shuffled = exp.reorder(perm_f, axis=1, inplace=False)
    shuffled.reorder(perm_s, axis=0, inplace=True)
    shuffled.reorder(inv_f, axis=1, inplace=True)
    shuffled.reorder(inv_s, axis=0, inplace=True)
    assert_experiment_equal(shuffled, exp)
def test_filter_by_data_feature(self):
    """Filter features by total abundance; one feature drops at cutoff 25.

    Previously the loop variable ``sparse`` was never used, so the
    sparse/dense combinations were not actually exercised, and the
    ``inplace=True`` iteration mutated the shared ``self.test2`` fixture for
    subsequent iterations. Re-read the experiment each iteration (mirroring
    test_filter_by_data_sample) so each combination runs on fresh data with
    the requested sparsity.
    """
    for sparse, inplace in [(True, False), (True, True), (False, False), (False, True)]:
        test2 = ca.read(self.test2_biom, self.test2_samp, self.test2_feat,
                        sparse=sparse, normalize=None)
        # one feature is filtered out when cutoff is set to 25
        obs = test2.filter_by_data(
            'sum_abundance', axis=1, inplace=inplace, cutoff=25)
        self.assertEqual(obs.shape, (9, 7))
        exp = ca.read(*[get_data_path(i) for i in [
            'test2.biom.filter.feature', 'test2.sample', 'test2.feature']],
            normalize=None)
        assert_experiment_equal(obs, exp)
        if inplace:
            self.assertIs(obs, test2)
        else:
            self.assertIsNot(obs, test2)
def test_log_n(self):
    """log_n returns log2-transformed data; inplace=True mutates in place."""
    res = self.test2.log_n()
    # expected values: element-wise log2 of the test2 data matrix
    expected = np.log2([[10., 20., 1., 20., 5., 100., 844., 100.],
                        [10., 20., 2., 19., 1., 100., 849., 200.],
                        [10., 20., 3., 18., 5., 100., 844., 300.],
                        [10., 20., 4., 17., 1., 100., 849., 400.],
                        [10., 20., 5., 16., 4., 100., 845., 500.],
                        [10., 20., 6., 15., 1., 100., 849., 600.],
                        [10., 20., 7., 14., 3., 100., 846., 700.],
                        [10., 20., 8., 13., 1., 100., 849., 800.],
                        [10., 20., 9., 12., 7., 100., 842., 900.]])
    # overwrite the fixture data so metadata comparison still applies
    self.test2.data = expected
    assert_experiment_equal(res, self.test2)
    self.assertIsNot(res, self.test2)
    # inplace=True returns the same object
    res = self.test2.log_n(inplace=True)
    self.assertIs(res, self.test2)
def test_join_fields_complex(self):
    """Join a feature field with itself using a custom name/separator, in place."""
    work = deepcopy(self.test1)
    res = work.join_fields('taxonomy', 'taxonomy', newname='test', axis=1,
                           sep=';', inplace=True)
    # inplace=True returns the same object
    self.assertIs(res, work)
    # the joined column lands in feature metadata only
    self.assertIn('test', work.feature_metadata.columns)
    self.assertNotIn('test', work.sample_metadata.columns)
    self.assertEqual(work.feature_metadata['test'].iloc[11], 'bad_bacteria;bad_bacteria')
    # nothing but the new feature metadata column should have changed
    assert_experiment_equal(work, self.test1, ignore_md_fields=['test'])
def test_filter_by_data_sample(self):
    """Filter samples by total abundance across all sparse/inplace combinations."""
    for sparse, inplace in [(True, False), (True, True), (False, False), (False, True)]:
        # re-read each iteration so inplace filtering cannot leak between runs
        test2 = ca.read(self.test2_biom, self.test2_samp, self.test2_feat,
                        sparse=sparse, normalize=None)
        # cutoff 1200 removes exactly one sample (the last one)
        res = test2.filter_by_data(
            'sum_abundance', axis=0, inplace=inplace, cutoff=1200)
        self.assertEqual(res.shape, (8, 8))
        paths = [get_data_path(i) for i in [
            'test2.biom.filter.sample', 'test2.sample', 'test2.feature']]
        expected = ca.read(*paths, normalize=None)
        assert_experiment_equal(res, expected)
        # inplace returns the same object; otherwise a new one
        if inplace:
            self.assertIs(res, test2)
        else:
            self.assertIsNot(res, test2)
def test_join_experiments_featurewise(self):
    """Join two experiments feature-wise on their shared samples.

    Builds a 2-sample/2-feature experiment and a 3-sample/1-feature
    experiment with overlapping sample ids (s1, s2), joins them along the
    feature axis, and checks that only the shared samples are kept, feature
    metadata is concatenated, and the 'origin' column records each feature's
    source experiment.
    """
    # first experiment: 2 samples x 2 features; note samples listed as s2, s1
    otu1 = ca.Experiment(np.array([[0, 9], [7, 4]]), sparse=False,
                         sample_metadata=pd.DataFrame(
                             {
                                 'category': ['B', 'A'],
                                 'ph': [7.7, 6.6]
                             }, index=['s2', 's1']),
                         feature_metadata=pd.DataFrame(
                             {'motile': ['y', 'n']}, index=['16S1', '16S2']))
    # second experiment: 3 samples x 1 feature; s3 has no counterpart in otu1
    otu2 = ca.Experiment(np.array([[6], [8], [10]]), sparse=False,
                         sample_metadata=pd.DataFrame(
                             {
                                 'category': ['A', 'B', 'C'],
                                 'ph': [6.6, 7.7, 8.8]
                             }, index=['s1', 's2', 's3']),
                         feature_metadata=pd.DataFrame({'motile': [None]}, index=['ITS1']))
    # 'origin' is the new feature metadata field; ('16S', 'ITS') label the sources
    combined_obs = otu1.join_experiments_featurewise(
        otu2, 'origin', ('16S', 'ITS'))
    # expected: only the shared samples s1, s2; features from both experiments
    combined_exp = ca.Experiment(np.array([[7, 4, 6], [0, 9, 8]]), sparse=False,
                                 sample_metadata=pd.DataFrame(
                                     {
                                         'category': ['A', 'B'],
                                         'ph': [6.6, 7.7]
                                     }, index=['s1', 's2']),
                                 feature_metadata=pd.DataFrame(
                                     {
                                         'motile': ['y', 'n', None],
                                         'origin': ['16S', '16S', 'ITS']
                                     }, index=['16S1', '16S2', 'ITS1']))
    # reorder the samples to match the expected experiment before comparing
    combined_obs = combined_obs.filter_ids(
        combined_exp.sample_metadata.index, axis=0)
    assert_experiment_equal(combined_obs, combined_exp)
def test_save_biom(self):
    """Round-trip an experiment through save_biom in hdf5, txt and no-metadata modes.

    The temporary directory is removed in a ``finally`` block so a failing
    assertion no longer leaks it on disk.
    """
    # NOTE: Currently not testing the save biom hdf with taxonomy
    # as there is a bug there!
    exp = ca.read_amplicon(self.test1_biom, self.test1_samp,
                           normalize=None, min_reads=None)
    d = mkdtemp()
    try:
        f = join(d, 'test1.save.biom')
        # test the hdf5 biom format
        exp.save_biom(f, fmt='hdf5')
        newexp = ca.read_amplicon(f, self.test1_samp, normalize=None, min_reads=None)
        assert_experiment_equal(newexp, exp)
        # test the txt biom format (taxonomy is not preserved)
        exp.save_biom(f, fmt='txt')
        newexp = ca.read_amplicon(f, self.test1_samp, normalize=None, min_reads=None)
        assert_experiment_equal(newexp, exp, ignore_md_fields=['taxonomy'])
        # test the hdf5 biom format with no metadata attached
        exp.save_biom(f, add_metadata=None)
        newexp = ca.read(f, self.test1_samp, normalize=None)
        self.assertTrue('taxonomy' not in newexp.feature_metadata)
        assert_experiment_equal(newexp, exp, ignore_md_fields=['taxonomy'])
    finally:
        shutil.rmtree(d)
def test_from_pandas_with_experiment(self):
    """to_pandas followed by from_pandas round-trips the experiment."""
    frame = self.test1.to_pandas(sparse=False)
    rebuilt = ca.Experiment.from_pandas(frame, self.test1)
    assert_experiment_equal(rebuilt, self.test1)
def test_deep_copy_experiment(self):
    """deepcopy yields an equal but distinct experiment object."""
    clone = deepcopy(self.test1)
    assert_experiment_equal(clone, self.test1)
    self.assertIsNot(clone, self.test1)