def test_invalid_header(self):
    # Loading this fixture must fail with a MetadataFileError whose message
    # matches the pattern below.
    path = get_data_path('invalid/invalid-header.tsv')
    pattern = 'unrecognized ID column name.*invalid_id_header'

    with self.assertRaisesRegex(MetadataFileError, pattern):
        Metadata.load(path)
def test_empty_file(self):
    # The 'data/empty' package resource must be rejected with an error that
    # mentions the missing header and the possibility of an empty file.
    path = pkg_resources.resource_filename('qiime2.metadata.tests',
                                           'data/empty')
    pattern = 'locate header.*file may be empty'

    with self.assertRaisesRegex(MetadataFileError, pattern):
        Metadata.load(path)
def test_column_types_unrecognized_column_name(self):
    # Passing a column_types entry keyed by 'not_a_column' must raise a
    # MetadataFileError matching the pattern below.
    path = get_data_path('valid/simple.tsv')
    overrides = {'not_a_column': 'numeric'}
    pattern = 'not_a_column.*column_types.*not a column in the metadata file'

    with self.assertRaisesRegex(MetadataFileError, pattern):
        Metadata.load(path, column_types=overrides)
def test_duplicate_column_names_with_whitespace(self):
    # Loading this fixture must raise a uniqueness error that mentions
    # 'col1'.
    path = get_data_path(
        'invalid/duplicate-column-names-with-whitespace.tsv')
    pattern = 'Column names must be unique.*col1'

    with self.assertRaisesRegex(MetadataFileError, pattern):
        Metadata.load(path)
def test_directive_before_header(self):
    # Loading this fixture must raise an error mentioning the #q2:types
    # directive and the search for the header.
    path = get_data_path('invalid/directive-before-header.tsv')
    pattern = 'directive.*#q2:types.*searching for header'

    with self.assertRaisesRegex(MetadataFileError, pattern):
        Metadata.load(path)
def test_column_types_override_directive_not_convertible_to_numeric(self):
    # Requesting 'col3' as numeric through column_types must fail, and the
    # error must list 'bar' and 'foo' as the values that could not be
    # interpreted as numeric.
    path = get_data_path('valid/simple-with-directive.tsv')
    pattern = ("column 'col3' to numeric.*"
               "could not be interpreted as numeric: 'bar', 'foo'")

    with self.assertRaisesRegex(MetadataFileError, pattern):
        Metadata.load(path, column_types={'col3': 'numeric'})
def test_unrecognized_column_type_in_directive(self):
    # Loading this fixture must raise an error that mentions 'col2', the
    # unrecognized type 'foo', and the #q2:types directive.
    path = get_data_path('invalid/unrecognized-column-type.tsv')
    pattern = 'col2.*unrecognized column type.*foo.*#q2:types directive'

    with self.assertRaisesRegex(MetadataFileError, pattern):
        Metadata.load(path)
def test_unrecognized_directive(self):
    # Loading this fixture must raise an error that mentions '#q2:foo' and
    # states that the #q2:types directive is supported.
    path = get_data_path('invalid/unrecognized-directive.tsv')
    pattern = ('Unrecognized directive.*'
               '#q2:foo.*#q2:types directive is supported')

    with self.assertRaisesRegex(MetadataFileError, pattern):
        Metadata.load(path)
def test_data_longer_than_header(self):
    # Loading this fixture must raise an error reporting a 5-cell row
    # against a 4-cell header.
    path = get_data_path('invalid/data-longer-than-header.tsv')
    pattern = 'row has 5 cells.*header declares 4 cells'

    with self.assertRaisesRegex(MetadataFileError, pattern):
        Metadata.load(path)
def test_path_is_directory(self):
    # The path is built from 'valid' alone, with no file name (presumably a
    # fixture directory, per the test name); loading it must raise the
    # "something other than a file" error.
    path = get_data_path('valid')
    pattern = "path points to something other than a file"

    with self.assertRaisesRegex(MetadataFileError, pattern):
        Metadata.load(path)
def test_comments_and_empty_rows_only(self):
    # Loading this fixture must raise the "locate header" error that
    # mentions comments or empty rows.
    path = get_data_path('invalid/comments-and-empty-rows-only.tsv')
    pattern = 'locate header.*only of comments or empty rows'

    with self.assertRaisesRegex(MetadataFileError, pattern):
        Metadata.load(path)
def test_qiime1_empty_mapping_file(self):
    # The 'data/qiime1-empty.tsv' package resource must be rejected with an
    # error matching 'at least one ID.*empty'.
    path = pkg_resources.resource_filename('qiime2.metadata.tests',
                                           'data/qiime1-empty.tsv')
    pattern = 'at least one ID.*empty'

    with self.assertRaisesRegex(MetadataFileError, pattern):
        Metadata.load(path)
def test_non_standard_characters(self):
    # Round trip: load the fixture, save it to self.filepath, load that file
    # back, and require the two Metadata objects to compare equal.
    source_path = get_data_path('valid/non-standard-characters.tsv')
    original = Metadata.load(source_path)

    original.save(self.filepath)
    reloaded = Metadata.load(self.filepath)

    self.assertEqual(original, reloaded)
def test_all_cells_padded(self):
    # Round trip: what is saved to self.filepath must reload as metadata
    # equal to what was loaded from the fixture.
    source_path = get_data_path('valid/all-cells-padded.tsv')
    original = Metadata.load(source_path)

    original.save(self.filepath)
    reloaded = Metadata.load(self.filepath)

    self.assertEqual(original, reloaded)
def test_numeric_column(self):
    # Round trip: load the fixture, write it to self.filepath, read it back,
    # and compare the two Metadata objects for equality.
    source_path = get_data_path('valid/numeric-column.tsv')
    original = Metadata.load(source_path)

    original.save(self.filepath)
    reloaded = Metadata.load(self.filepath)

    self.assertEqual(original, reloaded)
def test_directive_after_directives_section(self):
    # Loading this fixture must raise an error stating that #q2:types
    # appears outside of the directives section.
    path = get_data_path('invalid/directive-after-directives-section.tsv')
    pattern = '#q2:types.*outside of the directives section'

    with self.assertRaisesRegex(MetadataFileError, pattern):
        Metadata.load(path)
def test_minimal_file(self):
    # Round trip: the fixture must survive a save to self.filepath and a
    # reload with equality preserved.
    source_path = get_data_path('valid/minimal.tsv')
    original = Metadata.load(source_path)

    original.save(self.filepath)
    reloaded = Metadata.load(self.filepath)

    self.assertEqual(original, reloaded)
def test_column_types_unrecognized_column_type(self):
    # Passing 'CATEGORICAL' as the column_types value for 'col2' must raise
    # an error naming both the column and the rejected type string.
    path = get_data_path('valid/simple.tsv')
    overrides = {'col1': 'numeric', 'col2': 'CATEGORICAL'}
    pattern = 'col2.*column_types.*unrecognized column type.*CATEGORICAL'

    with self.assertRaisesRegex(MetadataFileError, pattern):
        Metadata.load(path, column_types=overrides)
def test_column_name_conflicts_with_id_header(self):
    # Loading this fixture must raise an error stating that the column name
    # 'featureid' conflicts with an ID column header.
    path = get_data_path('invalid/column-name-conflicts-with-id-header.tsv')
    pattern = "column name 'featureid' conflicts.*ID column header"

    with self.assertRaisesRegex(MetadataFileError, pattern):
        Metadata.load(path)
def test_column_types_directive_not_convertible_to_numeric(self):
    path = get_data_path('invalid/types-directive-non-numeric.tsv')

    # The pattern is deliberately long. The error message shows a sorted
    # list of every value that could not be converted to a number, so one
    # test case can assert that many kinds of non-numeric strings are not
    # interpreted as numbers.
    rejected_values = (
        r"'\$42', '\+inf', '-inf', '0xAF', '1,000', '1\.000\.0', "
        r"'1_000_000', '1e3e4', 'Infinity', 'NA', 'NaN', 'a', 'e3', 'foo', "
        r"'inf', 'nan', 'sample-1'")
    pattern = (r"column 'col2' to numeric.*could not be interpreted as "
               r"numeric: " + rejected_values)

    with self.assertRaisesRegex(MetadataFileError, pattern):
        Metadata.load(path)
def test_numeric_metadata_column(self):
    # Round trip a single column: take 'col1' from the loaded metadata, save
    # that column on its own to self.filepath, reload the file, and require
    # the reloaded 'col1' to be a NumericMetadataColumn equal to the original.
    fp = get_data_path('valid/simple.tsv')
    md1 = Metadata.load(fp)
    mdc1 = md1.get_column('col1')
    self.assertIsInstance(mdc1, NumericMetadataColumn)

    mdc1.save(self.filepath)

    md2 = Metadata.load(self.filepath)
    mdc2 = md2.get_column('col1')
    # The type check targets the reloaded column so that type preservation
    # across the save/load round trip is actually verified.
    self.assertIsInstance(mdc2, NumericMetadataColumn)
    self.assertEqual(mdc1, mdc2)
def test_numeric_metadata_column(self):
    # Save the 'col1' column by itself to self.filepath and load it back.
    # Both the original and the reloaded column must be
    # NumericMetadataColumn instances, and they must compare equal.
    fp = get_data_path('valid/simple.tsv')
    md1 = Metadata.load(fp)
    mdc1 = md1.get_column('col1')
    self.assertIsInstance(mdc1, NumericMetadataColumn)

    mdc1.save(self.filepath)

    md2 = Metadata.load(self.filepath)
    mdc2 = md2.get_column('col1')
    # Assert on mdc2 here: this is the only place the round-tripped column's
    # type is verified.
    self.assertIsInstance(mdc2, NumericMetadataColumn)
    self.assertEqual(mdc1, mdc2)
def test_column_types_directive_not_convertible_to_numeric(self):
    path = get_data_path('invalid/types-directive-non-numeric.tsv')

    # Verbose on purpose: the error message displays a sorted list of all
    # values that could not be converted to numbers, which lets a single
    # test case cover a variety of non-numeric strings.
    prefix = r"column 'col2' to numeric.*could not be interpreted as numeric: "
    rejected_values = (
        r"'\$42', '\+inf', '-inf', '0xAF', "
        r"'1,000', '1\.000\.0', '1_000_000', '1e3e4', "
        r"'Infinity', 'NA', 'NaN', 'a', 'e3', 'foo', 'inf', 'nan', "
        r"'sample-1'")
    pattern = prefix + rejected_values

    with self.assertRaisesRegex(MetadataFileError, pattern):
        Metadata.load(path)
def test_bom_simple_txt(self):
    # This fixture uses the encoding that notepad.exe will most commonly
    # produce. It must load as metadata equal to self.simple_md.
    observed = Metadata.load(get_data_path('valid/BOM-simple.txt'))

    self.assertEqual(observed, self.simple_md)
def test_jagged_trailing_columns(self):
    # Test case based on https://github.com/qiime2/qiime2/issues/335
    # The fixture must load as metadata equal to self.simple_md.
    path = get_data_path('valid/jagged-trailing-columns.tsv')
    observed = Metadata.load(path)

    self.assertEqual(observed, self.simple_md)
def test_artifacts(self):
    # Metadata loaded from the 'data/simple.tsv' package resource must
    # report an empty tuple for its `artifacts` attribute.
    path = pkg_resources.resource_filename(
        'qiime2.metadata.tests', 'data/simple.tsv')
    loaded = Metadata.load(path)

    self.assertEqual(loaded.artifacts, ())
def test_jagged_trailing_columns(self):
    # Test case based on https://github.com/qiime2/qiime2/issues/335
    # Loading the fixture must give metadata equal to self.simple_md.
    observed = Metadata.load(
        get_data_path('valid/jagged-trailing-columns.tsv'))

    self.assertEqual(observed, self.simple_md)
def test_bom_simple_txt(self):
    # The fixture is in the encoding notepad.exe will use most commonly.
    # Loading it must give metadata equal to self.simple_md.
    path = get_data_path('valid/BOM-simple.txt')
    observed = Metadata.load(path)

    self.assertEqual(observed, self.simple_md)
def test_artifacts(self):
    # The `artifacts` attribute of metadata loaded from the
    # 'data/simple.tsv' package resource must equal the empty tuple.
    package, resource = 'qiime2.metadata.tests', 'data/simple.tsv'
    loaded = Metadata.load(pkg_resources.resource_filename(package, resource))

    self.assertEqual(loaded.artifacts, ())
def test_non_standard_characters(self):
    # Checks that non-standard characters in IDs, column names, and cells
    # are read correctly. This is not exhaustive (it does not try every
    # Unicode character, which would be a nice future test). It is meant as
    # an integration test of the reader's robustness to non-standard data.
    # Many of the characters and where they sit in the data file come from
    # use-cases/bugs reported on the forum, Slack, etc.; the data file's own
    # comments explain the choices in more detail.
    observed = Metadata.load(
        get_data_path('valid/non-standard-characters.tsv'))

    missing = np.nan
    ids = pd.Index(
        ['©id##1', '((id))2', "'id_3<>'", '"id#4"', 'i d\r\t\n5'],
        name='id')
    column_names = [
        '↩c@l1™', 'col(#2)', "#col'3", '"<col_4>"', 'col\t \r\n5']
    rows = [
        ['ƒoo', '(foo)', '#f o #o', 'fo\ro', missing],
        ["''2''", 'b#r', 'ba\nr', missing, missing],
        ['b"ar', 'c\td', '4\r\n2', missing, missing],
        ['b__a_z', '<42>', '>42', missing, missing],
        ['baz', missing, '42'],
    ]
    expected = Metadata(
        pd.DataFrame(rows, index=ids, columns=column_names))

    self.assertEqual(observed, expected)
def test_non_standard_characters(self):
    # Integration-style check that the reader copes with non-standard
    # characters in IDs, column names, and cells. It is intentionally not
    # exhaustive (e.g. not every Unicode character is covered; that would be
    # a useful addition later). Many of the characters and their placement
    # are based on use-cases/bugs reported on the forum, Slack, etc., and the
    # data file carries comments describing these choices in more detail.
    path = get_data_path('valid/non-standard-characters.tsv')
    observed = Metadata.load(path)

    nan = np.nan
    expected_frame = pd.DataFrame(
        [['ƒoo', '(foo)', '#f o #o', 'fo\ro', nan],
         ["''2''", 'b#r', 'ba\nr', nan, nan],
         ['b"ar', 'c\td', '4\r\n2', nan, nan],
         ['b__a_z', '<42>', '>42', nan, nan],
         ['baz', nan, '42']],
        index=pd.Index(
            ['©id##1', '((id))2', "'id_3<>'", '"id#4"', 'i d\r\t\n5'],
            name='id'),
        columns=['↩c@l1™', 'col(#2)', "#col'3", '"<col_4>"',
                 'col\t \r\n5'])

    self.assertEqual(observed, Metadata(expected_frame))
def test_type_mismatch(self):
    # A Metadata object and one of its columns are different types, and
    # assertReallyNotEqual must hold between them.
    path = pkg_resources.resource_filename(
        'qiime2.metadata.tests', 'data/simple.tsv')
    table = Metadata.load(path)
    column = table.get_column('col1')

    # Confirm the two operands really are of the expected, distinct types
    # before comparing them.
    self.assertIsInstance(table, Metadata)
    self.assertIsInstance(column, NumericMetadataColumn)
    self.assertReallyNotEqual(table, column)
def test_no_columns(self):
    # get_ids() on metadata loaded from 'data/no-columns.tsv' must return
    # exactly the set {'a', 'b', 'my-id'}.
    path = pkg_resources.resource_filename('qiime2.metadata.tests',
                                           'data/no-columns.tsv')
    loaded = Metadata.load(path)

    self.assertEqual(loaded.get_ids(), {'a', 'b', 'my-id'})
def test_type_mismatch(self):
    # Compare a Metadata object against its own 'col1' column: the two are
    # instances of different classes and must satisfy assertReallyNotEqual.
    package, resource = 'qiime2.metadata.tests', 'data/simple.tsv'
    table = Metadata.load(pkg_resources.resource_filename(package, resource))
    column = table.get_column('col1')

    self.assertIsInstance(table, Metadata)
    self.assertIsInstance(column, NumericMetadataColumn)
    self.assertReallyNotEqual(table, column)
def test_no_columns(self):
    # The IDs reported for 'data/no-columns.tsv' must be the set
    # {'a', 'b', 'my-id'}.
    package, resource = 'qiime2.metadata.tests', 'data/no-columns.tsv'
    loaded = Metadata.load(pkg_resources.resource_filename(package, resource))

    observed_ids = loaded.get_ids()
    self.assertEqual(observed_ids, {'a', 'b', 'my-id'})
def test_no_columns(self):
    # The fixture must load as metadata equal to a column-less frame indexed
    # by the IDs 'a', 'b', 'my-id' under the index name 'id'.
    observed = Metadata.load(get_data_path('valid/no-columns.tsv'))

    ids = pd.Index(['a', 'b', 'my-id'], name='id')
    expected = Metadata(pd.DataFrame({}, index=ids))

    self.assertEqual(observed, expected)
def test_single_column(self):
    # The fixture must load as metadata equal to a frame with one float
    # column, 'col1', indexed by 'id'.
    observed = Metadata.load(get_data_path('valid/single-column.tsv'))

    ids = pd.Index(['id1', 'id2', 'id3'], name='id')
    expected = Metadata(
        pd.DataFrame({'col1': [1.0, 2.0, 3.0]}, index=ids))

    self.assertEqual(observed, expected)
def test_simple(self):
    # A plain metadata file: no comments, empty rows, jaggedness, missing
    # data, odd IDs or column names, directives, etc. It has several column
    # types (numeric, categorical, and a mix of numbers and strings, which
    # must be interpreted as categorical).
    observed = Metadata.load(get_data_path('valid/simple.tsv'))

    self.assertEqual(observed, self.simple_md)
def test_no_columns(self):
    # Loading the fixture must give metadata equal to an empty-column frame
    # whose 'id' index holds 'a', 'b' and 'my-id'.
    path = get_data_path('valid/no-columns.tsv')
    observed = Metadata.load(path)

    expected_frame = pd.DataFrame(
        {}, index=pd.Index(['a', 'b', 'my-id'], name='id'))

    self.assertEqual(observed, Metadata(expected_frame))
def test_single_column(self):
    # Loading the fixture must give metadata equal to a single-column frame
    # ('col1' = 1.0, 2.0, 3.0) indexed by 'id'.
    path = get_data_path('valid/single-column.tsv')
    observed = Metadata.load(path)

    expected_frame = pd.DataFrame(
        {'col1': [1.0, 2.0, 3.0]},
        index=pd.Index(['id1', 'id2', 'id3'], name='id'))

    self.assertEqual(observed, Metadata(expected_frame))
def test_simple(self):
    # Simple metadata file: it has no comments, empty rows, jaggedness,
    # missing data, odd IDs or column names, directives, etc. Its columns
    # are of multiple types (numeric, categorical, and one mixing numbers
    # and strings, which must be interpreted as categorical).
    path = get_data_path('valid/simple.tsv')
    observed = Metadata.load(path)

    self.assertEqual(observed, self.simple_md)
def test_recommended_ids(self):
    # The fixture must load with its two IDs (a UUID-style string and
    # 'My.ID') intact as the 'id' index, alongside a string column 'col1'.
    observed = Metadata.load(get_data_path('valid/recommended-ids.tsv'))

    ids = pd.Index(
        ['c6ca034a-223f-40b4-a0e0-45942912a5ea', 'My.ID'], name='id')
    expected = Metadata(
        pd.DataFrame({'col1': ['foo', 'bar']}, index=ids))

    self.assertEqual(observed, expected)
def test_no_columns(self):
    # The 'data/no-columns.tsv' package resource must load as metadata equal
    # to a column-less, object-dtype frame indexed by 'a', 'b', 'my-id'.
    path = pkg_resources.resource_filename(
        'qiime2.metadata.tests', 'data/no-columns.tsv')
    observed = Metadata.load(path)

    ids = pd.Index(['a', 'b', 'my-id'], dtype=object, name='id')
    expected = Metadata(pd.DataFrame({}, index=ids, dtype=object))

    self.assertEqual(observed, expected)
def test_minimal_file(self):
    # The simplest possible metadata file is one ID with zero columns.
    observed = Metadata.load(get_data_path('valid/minimal.tsv'))

    only_id = pd.Index(['a'], name='id')
    expected = Metadata(pd.DataFrame({}, index=only_id))

    self.assertEqual(observed, expected)
def test_single_id(self):
    # The fixture must load as metadata equal to a one-row frame (ID 'id1')
    # with a float column and two string columns.
    observed = Metadata.load(get_data_path('valid/single-id.tsv'))

    columns = {'col1': [1.0], 'col2': ['a'], 'col3': ['foo']}
    expected = Metadata(
        pd.DataFrame(columns, index=pd.Index(['id1'], name='id')))

    self.assertEqual(observed, expected)
def test_with_partial_types_directive(self):
    # The fixture must load as metadata equal to a frame in which all three
    # columns, including 'col1' ('1', '2', '3'), hold strings.
    observed = Metadata.load(
        get_data_path('valid/partial-types-directive.tsv'))

    columns = {'col1': ['1', '2', '3'],
               'col2': ['a', 'b', 'c'],
               'col3': ['foo', 'bar', '42']}
    ids = pd.Index(['id1', 'id2', 'id3'], name='id')
    expected = Metadata(pd.DataFrame(columns, index=ids))

    self.assertEqual(observed, expected)
def test_numeric_column(self):
    # The fixture must load as metadata equal to a single float column of
    # twelve values (including NaN and negative zero) under IDs id1..id12.
    observed = Metadata.load(get_data_path('valid/numeric-column.tsv'))

    ids = pd.Index(['id%d' % n for n in range(1, 13)], name='id')
    values = [0.0, 2.0, 0.0003, -4.2, 1e-4, 1e4,
              1.5e2, np.nan, 1.0, 0.5, 1e-8, -0.0]
    expected = Metadata(pd.DataFrame({'col1': values}, index=ids))

    self.assertEqual(observed, expected)
def test_all_cells_padded(self):
    # The fixture must load as metadata equal to a three-column frame in
    # which every cell is NaN.
    observed = Metadata.load(get_data_path('valid/all-cells-padded.tsv'))

    columns = {'col1': [np.nan] * 3,
               'col2': [np.nan] * 3,
               'col3': [np.nan] * 3}
    ids = pd.Index(['id1', 'id2', 'id3'], name='id')
    expected = Metadata(pd.DataFrame(columns, index=ids))

    self.assertEqual(observed, expected)
def test_biom_observation_metadata_file(self):
    # The fixture must load with '#OTUID' as the index name, a string
    # 'taxonomy' column and a float 'confidence' column.
    observed = Metadata.load(
        get_data_path('valid/biom-observation-metadata.tsv'))

    rows = [['k__Bacteria;p__Firmicutes', 0.890],
            ['k__Bacteria', 0.9999]]
    otu_ids = pd.Index(['OTU_1', 'OTU_2'], name='#OTUID')
    expected = Metadata(pd.DataFrame(
        rows, columns=['taxonomy', 'confidence'], index=otu_ids))

    self.assertEqual(observed, expected)
def test_column_types_without_directive(self):
    # Loading with column_types={'col1': 'categorical'} must give 'col1' as
    # the strings '1', '2', '3'.
    observed = Metadata.load(get_data_path('valid/simple.tsv'),
                             column_types={'col1': 'categorical'})

    columns = {'col1': ['1', '2', '3'],
               'col2': ['a', 'b', 'c'],
               'col3': ['foo', 'bar', '42']}
    ids = pd.Index(['id1', 'id2', 'id3'], name='id')
    expected = Metadata(pd.DataFrame(columns, index=ids))

    self.assertEqual(observed, expected)
def test_qiime1_mapping_file(self):
    # The fixture must load with '#SampleID' as the index name, a float
    # 'col1', and string 'col2' and 'col3' columns.
    observed = Metadata.load(get_data_path('valid/qiime1.tsv'))

    columns = {'col1': [1.0, 2.0, 3.0],
               'col2': ['a', 'b', 'c'],
               'col3': ['foo', 'bar', '42']}
    sample_ids = pd.Index(['id1', 'id2', 'id3'], name='#SampleID')
    expected = Metadata(pd.DataFrame(columns, index=sample_ids))

    self.assertEqual(observed, expected)
def test_qiita_sample_information_file(self):
    # The fixture must load with 'sample_name' as the index name and string
    # 'DESCRIPTION' and 'TITLE' columns.
    observed = Metadata.load(
        get_data_path('valid/qiita-sample-information.tsv'))

    columns = {'DESCRIPTION': ['description 1', 'description 2'],
               'TITLE': ['A Title', 'Another Title']}
    sample_names = pd.Index(['id.1', 'id.2'], name='sample_name')
    expected = Metadata(pd.DataFrame(columns, index=sample_names))

    self.assertEqual(observed, expected)
def test_padding_rows_shorter_than_header(self):
    # The fixture must load as metadata equal to a frame whose trailing
    # cells are NaN, with 'col3' entirely NaN.
    observed = Metadata.load(
        get_data_path('valid/rows-shorter-than-header.tsv'))

    missing = np.nan
    columns = {'col1': [1.0, 2.0, missing],
               'col2': ['a', missing, missing],
               'col3': [missing, missing, missing]}
    ids = pd.Index(['id1', 'id2', 'id3'], name='id')
    expected = Metadata(pd.DataFrame(columns, index=ids))

    self.assertEqual(observed, expected)
def test_with_case_insensitive_types_directive(self):
    # The fixture must load with 'col1' as the strings '1', '2', '3' and
    # 'col3' as the floats -5.0, 0.0, 42.0.
    observed = Metadata.load(
        get_data_path('valid/case-insensitive-types-directive.tsv'))

    columns = {'col1': ['1', '2', '3'],
               'col2': ['a', 'b', 'c'],
               'col3': [-5.0, 0.0, 42.0]}
    ids = pd.Index(['id1', 'id2', 'id3'], name='id')
    expected = Metadata(pd.DataFrame(columns, index=ids))

    self.assertEqual(observed, expected)
def test_empty_rows(self):
    # The fixture must load as metadata equal to the three-row frame below,
    # which uses an object-dtype 'id' index.
    observed = Metadata.load(self.get_data_path('valid/empty-rows.tsv'))

    columns = {'col1': [1.0, 2.0, 3.0],
               'col2': ['a', 'b', 'c'],
               'col3': ['foo', 'bar', '42']}
    ids = pd.Index(['id1', 'id2', 'id3'], dtype=object, name='id')
    expected = Metadata(pd.DataFrame(columns, index=ids))

    self.assertEqual(observed, expected)
def test_does_not_cast_ids_or_column_names(self):
    # Number-like IDs ('0.000001', ...) and column names ('42.0', '1000',
    # '-4.2') are expected as strings in the loaded metadata.
    observed = Metadata.load(
        get_data_path('valid/no-id-or-column-name-type-cast.tsv'))

    ids = pd.Index(['0.000001', '0.004000', '0.000000'],
                   name='id', dtype=object)
    rows = [[2.0, 'b', 2.5],
            [1.0, 'b', 4.2],
            [3.0, 'c', -9.999]]
    expected = Metadata(
        pd.DataFrame(rows, index=ids, columns=['42.0', '1000', '-4.2']))

    self.assertEqual(observed, expected)
def test_jagged_trailing_columns(self):
    # Test case based on https://github.com/qiime2/qiime2/issues/335
    # The fixture must load as metadata equal to the three-row,
    # three-column frame below.
    observed = Metadata.load(
        self.get_data_path('valid/jagged-trailing-columns.tsv'))

    columns = {'col1': [1.0, 2.0, 3.0],
               'col2': ['a', 'b', 'c'],
               'col3': ['foo', 'bar', '42']}
    ids = pd.Index(['id1', 'id2', 'id3'], dtype=object, name='id')
    expected = Metadata(pd.DataFrame(columns, index=ids))

    self.assertEqual(observed, expected)
def test_numeric_column_as_categorical(self):
    # Loading with column_types={'col1': 'categorical'} must give 'col1' as
    # the strings listed below (e.g. '0.00030', '+1.5E+2', '1.'), with one
    # NaN cell.
    observed = Metadata.load(get_data_path('valid/numeric-column.tsv'),
                             column_types={'col1': 'categorical'})

    ids = pd.Index(['id%d' % n for n in range(1, 13)], name='id')
    values = ['0', '2.0', '0.00030', '-4.2', '1e-4', '1e4',
              '+1.5E+2', np.nan, '1.', '.5', '1e-08', '-0']
    expected = Metadata(pd.DataFrame({'col1': values}, index=ids))

    self.assertEqual(observed, expected)
def test_qiita_preparation_information_file(self):
    # The fixture must load with 'sample_name' as the index name and string
    # 'BARCODE' and 'EXPERIMENT_DESIGN_DESCRIPTION' columns.
    observed = Metadata.load(
        get_data_path('valid/qiita-preparation-information.tsv'))

    columns = {
        'BARCODE': ['ACGT', 'TGCA'],
        'EXPERIMENT_DESIGN_DESCRIPTION': ['longitudinal study',
                                          'longitudinal study'],
    }
    sample_names = pd.Index(['id.1', 'id.2'], name='sample_name')
    expected = Metadata(pd.DataFrame(columns, index=sample_names))

    self.assertEqual(observed, expected)
def test_does_not_cast_ids(self):
    # Number-like IDs ('0.000001', '0.004000', '0.000000') are expected as
    # strings in an object-dtype index.
    observed = Metadata.load(self.get_data_path('valid/no-type-cast.tsv'))

    columns = {'col1': [2.0, 1.0, 3.0],
               'col2': ['b', 'b', 'c'],
               'col3': [2.5, 4.2, -9.999]}
    ids = pd.Index(['0.000001', '0.004000', '0.000000'],
                   name='id', dtype=object)
    expected = Metadata(pd.DataFrame(columns, index=ids))

    self.assertEqual(observed, expected)