def test_reconstruct_counts_unweighted(self):
    """Reconstruction with unweighted region normalization.

    Verifies the reconstructed table's counts, sample ids, and
    observation ids when ``region_normalize='unweighted'``.
    """
    # NOTE(review): this test views the alignments as `Delayed` objects
    # while the sibling tests use `pd.DataFrame` — confirm intentional.
    obs_table = reconstruct_counts(
        region=['Bludhaven', 'Gotham'],
        regional_alignment=[
            ts.region1_align.view(Delayed).copy(),
            ts.region2_align.view(Delayed).copy(),
        ],
        regional_table=[
            ts.region1_counts.view(biom.Table),
            ts.region2_counts.view(biom.Table),
        ],
        database_map=self.datbase_map['clean_name'],
        database_summary=self.database_summary,
        debug=True,
        min_counts=10,
        min_abund=1e-2,
        region_normalize='unweighted',
    )
    # Expected counts: samples on rows in the literal, transposed so
    # observations are rows, cast to float to match the dense matrix.
    exp_counts = np.array([
        [200, 100, 0, 50, 100, 100],
        [200, 50, 200, 25, 50, 50],
        [0, 200, 200, 0, 100, 100],
    ]).T * 1.
    npt.assert_array_equal(obs_table.matrix_data.todense(), exp_counts)
    npt.assert_array_equal(
        np.array(list(obs_table.ids(axis='sample'))),
        np.array(['sample1', 'sample2', 'sample3']),
    )
    npt.assert_array_equal(
        np.array(list(obs_table.ids(axis='observation'))),
        np.array(['seq1', 'seq2', 'seq3', 'seq4', 'seq5', 'seq6']),
    )
def test_reconstruct_counts_align_drop_samples_warning(self):
    """Samples under ``min_counts`` raise a UserWarning and are dropped.

    With ``min_counts=590`` two of the test samples fall below the read
    threshold; the function should warn about discarding them.
    """
    with warnings.catch_warnings(record=True) as w:
        # Fix: reset the filters inside the recording context. Without
        # this, a warning already emitted earlier in the session is
        # suppressed by the module's __warningregistry__, leaving `w`
        # empty and making `w[-1]` raise IndexError.
        warnings.simplefilter('always')
        count_table, summary, mapping = reconstruct_counts(
            region=['Bludhaven', 'Gotham'],
            regional_alignment=[
                ts.region1_align.view(pd.DataFrame).copy(),
                ts.region2_align.view(pd.DataFrame).copy(),
            ],
            kmer_map=[
                ts.region1_db_map.view(pd.DataFrame).copy(),
                ts.region2_db_map.view(pd.DataFrame).copy(),
            ],
            regional_table=[
                ts.region1_counts.view(biom.Table).copy(),
                ts.region2_counts.view(biom.Table).copy(),
            ],
            debug=True,
            min_counts=590,
            min_abund=1e-2,
        )
    self.assertTrue(issubclass(w[-1].category, UserWarning))
    self.assertEqual(str(w[-1].message),
                     'There are 2 samples with fewer than 590 '
                     'total reads. These samples will be discarded.')
def test_reconstruct_counts_align_drop_samples_error(self):
    """A ValueError is raised when no sample passes the default count floor."""
    with self.assertRaises(ValueError) as err:
        # No min_counts supplied: every sample falls short of the
        # default threshold, so reconstruction cannot proceed.
        reconstruct_counts(
            region=['Bludhaven', 'Gotham'],
            regional_alignment=[
                ts.region1_align.view(pd.DataFrame).copy(),
                ts.region2_align.view(pd.DataFrame).copy(),
            ],
            kmer_map=[
                ts.region1_db_map.view(pd.DataFrame).copy(),
                ts.region2_db_map.view(pd.DataFrame).copy(),
            ],
            regional_table=[
                ts.region1_counts.view(biom.Table).copy(),
                ts.region2_counts.view(biom.Table).copy(),
            ],
            debug=True,
            min_abund=1e-2,
        )
    self.assertEqual(
        str(err.exception),
        'None of the samples have more than the 1000 total '
        'sequences required for reconstruction.'
    )
def test_reconstruct_counts_min_counts_error(self):
    """Dropping all samples below ``min_counts`` raises a ValueError.

    Fix: this method previously shared the name
    ``test_reconstruct_counts_align_drop_samples_error`` with another
    test in this class; the later definition shadowed the earlier one,
    so only one of the two ever ran. Renamed (still ``test_``-prefixed
    so unittest discovery picks it up) to restore both tests.
    """
    with self.assertRaises(ValueError) as err:
        reconstruct_counts(
            region=['Bludhaven', 'Gotham'],
            regional_alignment=[
                ts.region1_align.view(pd.DataFrame).copy(),
                ts.region2_align.view(pd.DataFrame).copy(),
            ],
            regional_table=[
                ts.region1_counts.view(biom.Table),
                ts.region2_counts.view(biom.Table),
            ],
            database_map=self.datbase_map['clean_name'],
            database_summary=self.database_summary,
            debug=True,
            min_counts=590,
            min_abund=1e-2,
        )
    self.assertEqual(
        str(err.exception),
        'There are 2 samples with fewer than 590 total sequences. '
        'Please check your minimum counts and make sure your '
        'representative sequences are aligned with the database.'
    )
def test_reconstruct_counts_same_twice(self):
    """Reconstruction when the same region is supplied twice.

    Passing Bludhaven's alignment and kmer map in duplicate should
    collapse to a single-region reconstruction; checks the counts,
    ids, mapping frame, and summary frame.
    """
    # Expected database mapping after reconstruction.
    exp_map = pd.DataFrame(
        data=[['seq1|seq2', 'WANTCAT', 'WANTCAT', 15],
              ['seq1|seq2', 'WANTCAT', 'WANTCAT', 15],
              ['seq3', 'WANTCAT', 'WANTCAT', 15],
              ['seq5', 'WANTCAT', 'WANTCAT', 15],
              ['seq6', 'WANTCAT', 'WANTCAT', 15]],
        index=pd.Index(['seq1', 'seq2', 'seq3', 'seq5', 'seq6'],
                       name='db-seq'),
        columns=['clean_name', 'first-fwd-primer', 'last-fwd-primer',
                 'last-kmer-length'],
    )
    # Expected per-feature reconstruction summary.
    exp_summary = pd.DataFrame.from_dict(orient='index', data={
        'seq1|seq2': {'num-regions': 1.,
                      'total-kmers-mapped': 2.,
                      'mean-kmer-per-region': 2.,
                      'stdv-kmer-per-region': 0.,
                      'mapped-asvs': 'asv01'
                      },
        'seq3': {'num-regions': 1,
                 'total-kmers-mapped': 2,
                 'mean-kmer-per-region': 2,
                 'stdv-kmer-per-region': 0,
                 'mapped-asvs': 'asv02|asv03'
                 },
        'seq5': {'num-regions': 1,
                 'total-kmers-mapped': 1,
                 'mean-kmer-per-region': 1,
                 'stdv-kmer-per-region': 0,
                 'mapped-asvs': 'asv04|asv05',
                 },
        'seq6': {'num-regions': 1,
                 'total-kmers-mapped': 1,
                 'mean-kmer-per-region': 1,
                 'stdv-kmer-per-region': 0,
                 'mapped-asvs': 'asv04|asv05',
                 },
    })
    exp_summary.index.set_names('feature-id', inplace=True)
    # NOTE(review): only one regional_table is supplied while two
    # regions/alignments/kmer maps are — confirm this is intentional.
    count_table, summary, mapping = reconstruct_counts(
        region=['Bludhaven', 'Bludhaven'],
        regional_alignment=[
            ts.region1_align.view(pd.DataFrame).copy(),
            ts.region1_align.view(pd.DataFrame).copy(),
        ],
        kmer_map=[
            ts.region1_db_map.view(pd.DataFrame).copy(),
            ts.region1_db_map.view(pd.DataFrame).copy(),
        ],
        regional_table=[ts.region1_counts.view(biom.Table).copy()],
        debug=True,
        min_counts=10,
        min_abund=1e-2,
        region_normalize='unweighted',
    )
    npt.assert_array_equal(
        np.array(list(count_table.ids(axis='sample'))),
        np.array(['sample1', 'sample2', 'sample3'])
    )
    npt.assert_array_equal(
        np.array(list(count_table.ids(axis='observation'))),
        np.array(['seq1|seq2', 'seq3', 'seq5', 'seq6']),
    )
    npt.assert_array_equal(
        count_table.matrix_data.todense(),
        np.array([[150, 0, 50, 50],
                  [125, 100, 25, 25],
                  [100, 100, 50, 50]]).T * 1.,
    )
    pdt.assert_frame_equal(mapping, exp_map)
    pdt.assert_frame_equal(summary.to_dataframe(), exp_summary)
def test_reconstruct_counts(self):
    """Two-region reconstruction with default (weighted) normalization.

    Checks the reconstructed count matrix, sample ids, observation
    ids, database mapping, and summary frame.
    """
    # Expected database mapping after reconstruction.
    exp_map = pd.DataFrame(
        data=[['seq1', 'WANTCAT', 'CACCTCGTN', 15],
              ['seq2', 'WANTCAT', 'CACCTCGTN', 15],
              ['seq3', 'WANTCAT', 'CACCTCGTN', 15],
              ['seq4', 'CACCTCGTN', 'CACCTCGTN', 15],
              ['seq5', 'WANTCAT', 'CACCTCGTN', 15],
              ['seq6', 'WANTCAT', 'CACCTCGTN', 15]],
        index=pd.Index(['seq1', 'seq2', 'seq3', 'seq4', 'seq5', 'seq6'],
                       name='db-seq'),
        columns=['clean_name', 'first-fwd-primer', 'last-fwd-primer',
                 'last-kmer-length'],
    )
    # Expected per-feature reconstruction summary.
    exp_summary = pd.DataFrame.from_dict(orient='index', data={
        'seq1': {'num-regions': 2.,
                 'total-kmers-mapped': 2.,
                 'mean-kmer-per-region': 1.,
                 'stdv-kmer-per-region': 0.,
                 'mapped-asvs': 'asv01|asv06'
                 },
        'seq2': {'num-regions': 2,
                 'total-kmers-mapped': 2,
                 'mean-kmer-per-region': 1,
                 'stdv-kmer-per-region': 0,
                 'mapped-asvs': 'asv01|asv07',
                 },
        'seq3': {'num-regions': 2,
                 'total-kmers-mapped': 3,
                 'mean-kmer-per-region': 1.5,
                 'stdv-kmer-per-region': np.std([1, 2], ddof=1),
                 'mapped-asvs': 'asv02|asv03|asv08'
                 },
        'seq4': {'num-regions': 1,
                 'total-kmers-mapped': 1,
                 'mean-kmer-per-region': 1,
                 'stdv-kmer-per-region': 0,
                 'mapped-asvs': 'asv09'
                 },
        'seq5': {'num-regions': 2,
                 'total-kmers-mapped': 2,
                 'mean-kmer-per-region': 1,
                 'stdv-kmer-per-region': 0,
                 'mapped-asvs': 'asv04|asv05|asv10',
                 },
        'seq6': {'num-regions': 2,
                 'total-kmers-mapped': 2,
                 'mean-kmer-per-region': 1,
                 'stdv-kmer-per-region': 0,
                 'mapped-asvs': 'asv04|asv05|asv11',
                 },
    })
    exp_summary.index.set_names('feature-id', inplace=True)
    count_table, summary, mapping = reconstruct_counts(
        region=['Bludhaven', 'Gotham'],
        regional_alignment=[
            ts.region1_align.view(pd.DataFrame).copy(),
            ts.region2_align.view(pd.DataFrame).copy(),
        ],
        kmer_map=[
            ts.region1_db_map.view(pd.DataFrame).copy(),
            ts.region2_db_map.view(pd.DataFrame).copy(),
        ],
        regional_table=[
            ts.region1_counts.view(biom.Table),
            ts.region2_counts.view(biom.Table),
        ],
        debug=True,
        min_counts=10,
        min_abund=1e-2,
    )
    npt.assert_array_equal(
        np.array(count_table.matrix_data.todense()),
        np.array([[100, 50, 0, 50, 50, 50],
                  [100, 25, 100, 25, 25, 25],
                  [0, 100, 100, 0, 50, 50]]).T
    )
    npt.assert_array_equal(
        np.array(list(count_table.ids(axis='sample'))),
        np.array(['sample1', 'sample2', 'sample3'])
    )
    npt.assert_array_equal(
        np.array(list(count_table.ids(axis='observation'))),
        np.array(['seq1', 'seq2', 'seq3', 'seq4', 'seq5', 'seq6']),
    )
    pdt.assert_frame_equal(exp_map, mapping)
    pdt.assert_frame_equal(exp_summary, summary.to_dataframe())