def preprocess_mapping_file(data, headers, columns, unique=False, single=False,
                            clones=0):
    """Process a mapping file to expand the data or remove unuseful fields

    Inputs:
    data: mapping file data
    headers: mapping file headers
    columns: list of headers to keep, if one of these headers includes two
    ampersands, this function will create a new column by merging the
    delimited columns.
    unique: remove columns where all values are unique
    single: remove columns where all values are the same
    clones: number of times to replicate the metadata

    Outputs:
    data: processed mapping file data
    headers: processed mapping file headers
    """
    # The sample ID must always be there, else it's meaningless data
    if 'SampleID' != columns[0]:
        columns = ['SampleID'] + columns

    # process concatenated columns if needed
    merge = [column for column in columns if '&&' in column]

    # each element needs several columns to be merged
    for new_column in merge:
        indices = [headers.index(header_name)
                   for header_name in new_column.split('&&')]

        # join all the fields of the metadata that are listed in indices
        for line in data:
            line.append(''.join([line[index] for index in indices]))
        headers.append(new_column)

    # remove all unique or single valued columns
    if unique or single:
        columns_to_remove = []
        metadata = MetadataMap(mapping_file_to_dict(data, headers), [])

        # find columns that have values that are all unique
        if unique:
            columns_to_remove += [column_name
                                  for column_name in headers[1:]
                                  if metadata.hasUniqueCategoryValues(
                                      column_name)]

        # remove categories where there is only one value
        if single:
            columns_to_remove += [column_name
                                  for column_name in headers[1:]
                                  if metadata.hasSingleCategoryValue(
                                      column_name)]

        columns_to_remove = list(set(columns_to_remove))

        # remove the single or unique columns
        data, headers = keep_columns_from_mapping_file(data, headers,
                                                       columns_to_remove,
                                                       negate=True)

    # remove anything not specified in the input
    data, headers = keep_columns_from_mapping_file(data, headers, columns)

    # sanitize the mapping file data and headers
    data, headers = sanitize_mapping_file(data, headers)

    # clones mean: replicate the metadata retagging the sample ids with a
    # suffix
    if clones:
        out_data = []
        for index in range(clones):
            out_data.extend([[element[0] + '_%d' % index] + element[1:]
                             for element in data])
        data = out_data

    return data, headers
class MetadataMapTests(TestCase):
    """Tests for the MetadataMap class."""

    def setUp(self):
        """Create MetadataMap objects that will be used in the tests."""
        # Create a map using the overview tutorial mapping file.
        self.overview_map_str = [
            "#SampleID\tBarcodeSequence\tTreatment\tDOB\tDescription",
            "PC.354\tAGCACGAGCCTA\tControl\t20061218\t354",
            "PC.355\tAACTCGTCGATG\tControl\t20061218\t355",
            "PC.356\tACAGACCACTCA\tControl\t20061126\t356",
            "PC.481\tACCAGCGACTAG\tControl\t20070314\t481",
            "PC.593\tAGCAGCACTTGT\tControl\t20071210\t593",
            "PC.607\tAACTGTGCGTAC\tFast\t20071112\t607",
            "PC.634\tACAGAGTCGGCT\tFast\t20080116\t634",
            "PC.635\tACCGCAGAGTCA\tFast\t20080116\t635",
            "PC.636\tACGGTGAGTGTC\tFast\t20080116\t636"
        ]
        self.overview_map = MetadataMap(
            *parse_mapping_file_to_dict(self.overview_map_str))

        # Create the same overview tutorial map, but this time with some
        # comments.
        self.comment = "# Some comments about this mapping file"
        self.map_with_comments_str = self.overview_map_str[:]
        self.map_with_comments_str.insert(1, self.comment)
        self.map_with_comments = MetadataMap(
            *parse_mapping_file_to_dict(self.map_with_comments_str))

        # Create a MetadataMap object that has no metadata (i.e. no sample
        # IDs, so no metadata about samples).
        self.empty_map = MetadataMap({}, [])

        # Create a MetadataMap object that has samples (i.e. sample IDs) but
        # not associated metadata (i.e. no columns other than SampleID).
        self.no_metadata_str = [
            "#SampleID",
            "PC.354",
            "PC.355",
            "PC.356",
            "PC.481",
            "PC.593",
            "PC.607",
            "PC.634",
            "PC.635",
            "PC.636"
        ]
        self.no_metadata = MetadataMap(
            *parse_mapping_file_to_dict(self.no_metadata_str))

        # Create a MetadataMap object that has a category with only one value
        # throughout the entire column.
        self.single_value_str = [
            "#SampleID\tFoo",
            "PC.354\tfoo",
            "PC.355\tfoo",
            "PC.356\tfoo",
            "PC.481\tfoo",
            "PC.593\tfoo",
            "PC.607\tfoo",
            "PC.634\tfoo",
            "PC.635\tfoo",
            "PC.636\tfoo"
        ]
        self.single_value = MetadataMap(
            *parse_mapping_file_to_dict(self.single_value_str))

    def test_parseMetadataMap(self):
        """Test parsing a mapping file into a MetadataMap instance."""
        obs = MetadataMap.parseMetadataMap(self.overview_map_str)
        self.assertEqual(obs, self.overview_map)

    def test_parseMetadataMap_empty(self):
        """Test parsing empty mapping file contents."""
        self.assertRaises(QiimeParseError, MetadataMap.parseMetadataMap, [])

    def test_eq(self):
        """Test whether two MetadataMaps are equal."""
        self.assertTrue(self.empty_map == MetadataMap({}, []))
        self.assertTrue(self.overview_map == MetadataMap(
            self.overview_map._metadata, self.overview_map.Comments))

    def test_ne(self):
        """Test whether two MetadataMaps are not equal."""
        self.assertTrue(self.empty_map != MetadataMap({}, ["foo"]))
        self.assertTrue(self.overview_map != MetadataMap(
            self.overview_map._metadata, ["foo"]))
        self.assertTrue(
            self.overview_map != MetadataMap({}, self.overview_map.Comments))
        self.assertTrue(self.overview_map != self.empty_map)
        self.assertTrue(self.overview_map != self.map_with_comments)
        self.assertTrue(self.overview_map != self.no_metadata)

    def test_getSampleMetadata(self):
        """Test metadata by sample ID accessor with valid sample IDs."""
        exp = {'BarcodeSequence': 'AGCACGAGCCTA', 'Treatment': 'Control',
               'DOB': '20061218', 'Description': '354'}
        obs = self.overview_map.getSampleMetadata('PC.354')
        self.assertEqual(obs, exp)

        exp = {'BarcodeSequence': 'ACCAGCGACTAG', 'Treatment': 'Control',
               'DOB': '20070314', 'Description': '481'}
        obs = self.map_with_comments.getSampleMetadata('PC.481')
        self.assertEqual(obs, exp)

        exp = {'BarcodeSequence': 'ACGGTGAGTGTC', 'Treatment': 'Fast',
               'DOB': '20080116', 'Description': '636'}
        obs = self.map_with_comments.getSampleMetadata('PC.636')
        self.assertEqual(obs, exp)

        # A map that only has sample IDs yields an empty metadata dict.
        exp = {}
        obs = self.no_metadata.getSampleMetadata('PC.636')
        self.assertEqual(obs, exp)

    def test_getSampleMetadata_bad_sample_id(self):
        """Test metadata by sample ID accessor with invalid sample IDs."""
        # Nonexistent sample ID.
        self.assertRaises(KeyError, self.overview_map.getSampleMetadata,
                          'PC.000')
        self.assertRaises(KeyError, self.no_metadata.getSampleMetadata,
                          'PC.000')
        # Integer sample ID.
        self.assertRaises(KeyError, self.overview_map.getSampleMetadata, 42)
        # Sample ID of type None.
        self.assertRaises(KeyError, self.overview_map.getSampleMetadata, None)

        # Sample ID on empty map.
        self.assertRaises(KeyError, self.empty_map.getSampleMetadata, 's1')
        # Integer sample ID on empty map.
        self.assertRaises(KeyError, self.empty_map.getSampleMetadata, 1)
        # Sample ID of None on empty map.
        self.assertRaises(KeyError, self.empty_map.getSampleMetadata, None)

    def test_getCategoryValue(self):
        """Test category value by sample ID/category name accessor."""
        exp = "Fast"
        obs = self.overview_map.getCategoryValue('PC.634', 'Treatment')
        self.assertEqual(obs, exp)

        exp = "20070314"
        obs = self.overview_map.getCategoryValue('PC.481', 'DOB')
        self.assertEqual(obs, exp)

        exp = "ACGGTGAGTGTC"
        obs = self.map_with_comments.getCategoryValue('PC.636',
                                                      'BarcodeSequence')
        self.assertEqual(obs, exp)

    def test_getCategoryValues(self):
        """Test category value list by sample ID/category name accessor."""
        smpl_ids = ['PC.354', 'PC.355', 'PC.356', 'PC.481', 'PC.593',
                    'PC.607', 'PC.634', 'PC.635', 'PC.636']
        exp = ['Control', 'Control', 'Control', 'Control', 'Control', 'Fast',
               'Fast', 'Fast', 'Fast']
        obs = self.overview_map.getCategoryValues(smpl_ids, 'Treatment')
        self.assertEqual(obs, exp)

    def test_isNumericCategory(self):
        """Test checking if a category is numeric."""
        obs = self.overview_map.isNumericCategory('Treatment')
        self.assertEqual(obs, False)

        obs = self.overview_map.isNumericCategory('DOB')
        self.assertEqual(obs, True)

    def test_hasUniqueCategoryValues(self):
        """Test checking if a category has unique values."""
        obs = self.overview_map.hasUniqueCategoryValues('Treatment')
        self.assertEqual(obs, False)

        obs = self.overview_map.hasUniqueCategoryValues('DOB')
        self.assertEqual(obs, False)

        obs = self.overview_map.hasUniqueCategoryValues('Description')
        self.assertEqual(obs, True)

    def test_hasSingleCategoryValue(self):
        """Test checking if a category has only a single value."""
        obs = self.overview_map.hasSingleCategoryValue('Treatment')
        self.assertEqual(obs, False)

        obs = self.single_value.hasSingleCategoryValue('Foo')
        self.assertEqual(obs, True)

    def test_getCategoryValue_bad_sample_id(self):
        """Test category value by sample ID accessor with bad sample IDs."""
        # Nonexistent sample ID.
        self.assertRaises(KeyError, self.overview_map.getCategoryValue,
                          'PC.000', 'Treatment')
        self.assertRaises(KeyError, self.no_metadata.getCategoryValue,
                          'PC.000', 'Treatment')
        # Integer sample ID.
        self.assertRaises(KeyError, self.overview_map.getCategoryValue,
                          42, 'DOB')
        # Sample ID of type None.
        self.assertRaises(KeyError, self.overview_map.getCategoryValue,
                          None, 'Treatment')

        # Sample ID on empty map.
        self.assertRaises(KeyError, self.empty_map.getCategoryValue,
                          's1', 'foo')
        # Integer sample ID on empty map.
        self.assertRaises(KeyError, self.empty_map.getCategoryValue,
                          1, 'bar')
        # Sample ID of None on empty map.
        self.assertRaises(KeyError, self.empty_map.getCategoryValue,
                          None, 'baz')

    def test_getCategoryValue_bad_category(self):
        """Test category value by sample ID accessor with bad categories."""
        # Nonexistent category.
        self.assertRaises(KeyError, self.overview_map.getCategoryValue,
                          'PC.354', 'foo')
        # Integer category.
        self.assertRaises(KeyError, self.overview_map.getCategoryValue,
                          'PC.354', 42)
        # Category of type None.
        self.assertRaises(KeyError, self.overview_map.getCategoryValue,
                          'PC.354', None)

        # Category on map with no metadata, but that has sample IDs.
        self.assertRaises(KeyError, self.no_metadata.getCategoryValue,
                          'PC.354', 'Treatment')
        # Integer category on map with no metadata.
        self.assertRaises(KeyError, self.no_metadata.getCategoryValue,
                          'PC.354', 34)
        # Category of type None on map with no metadata.
        self.assertRaises(KeyError, self.no_metadata.getCategoryValue,
                          'PC.354', None)

    def test_SampleIds(self):
        """Test sample IDs accessor."""
        exp = ["PC.354", "PC.355", "PC.356", "PC.481", "PC.593", "PC.607",
               "PC.634", "PC.635", "PC.636"]
        obs = self.overview_map.SampleIds
        self.assertEqual(obs, exp)

        obs = self.no_metadata.SampleIds
        self.assertEqual(obs, exp)

        obs = self.empty_map.SampleIds
        self.assertEqual(obs, [])

    def test_CategoryNames(self):
        """Test category names accessor."""
        exp = ["BarcodeSequence", "DOB", "Description", "Treatment"]
        obs = self.overview_map.CategoryNames
        self.assertEqual(obs, exp)

        obs = self.no_metadata.CategoryNames
        self.assertEqual(obs, [])

        obs = self.empty_map.CategoryNames
        self.assertEqual(obs, [])

    def test_filterSamples(self):
        """Test filtering out samples from metadata map."""
        exp = ['PC.356', 'PC.593']
        self.overview_map.filterSamples(['PC.593', 'PC.356'])
        obs = self.overview_map.SampleIds
        self.assertEqual(obs, exp)

        self.overview_map.filterSamples([])
        self.assertEqual(self.overview_map.SampleIds, [])

    def test_filterSamples_strict(self):
        """Test strict checking of sample prescence when filtering."""
        with self.assertRaises(ValueError):
            self.overview_map.filterSamples(['PC.356', 'abc123'])

        with self.assertRaises(ValueError):
            self.empty_map.filterSamples(['foo'])

    def test_filterSamples_no_strict(self):
        """Test missing samples does not raise error."""
        self.overview_map.filterSamples(['PC.356', 'abc123'], strict=False)
        self.assertEqual(self.overview_map.SampleIds, ['PC.356'])

        self.empty_map.filterSamples(['foo'], strict=False)
        self.assertEqual(self.empty_map.SampleIds, [])

    def test_is_valid_git_refname(self):
        """Test correct validation of refnames"""
        # NOTE(review): unichr and the u'' literal below are Python 2 only —
        # confirm the project still targets Python 2 before porting.
        # valid branchnames
        self.assertTrue(is_valid_git_refname('master'))
        self.assertTrue(is_valid_git_refname('debuggatron_2000'))
        self.assertTrue(is_valid_git_refname('refname/bar'))
        self.assertTrue(is_valid_git_refname('ref.nameslu/_eggs_/spam'))
        self.assertTrue(is_valid_git_refname('valid{0}char'.format(
            unichr(40))))
        self.assertTrue(is_valid_git_refname('master@head'))
        self.assertTrue(is_valid_git_refname('bar{thing}foo'))

        # case happening with git < 1.6.6
        self.assertFalse(is_valid_git_refname(
            '--abbrev-ref\nbaa350d7b7063d585ca293fc16ef15e0765dc9ee'))

        # different invalid refnames, for a description of each group see the
        # man page of git check-ref-format
        self.assertFalse(is_valid_git_refname('bar/.spam/eggs'))
        self.assertFalse(is_valid_git_refname('bar.lock/spam/eggs'))
        self.assertFalse(is_valid_git_refname('bar.lock'))
        self.assertFalse(is_valid_git_refname('.foobar'))
        self.assertFalse(is_valid_git_refname('ref..name'))
        self.assertFalse(is_valid_git_refname(u'invalid{0}char'.format(
            unichr(177))))
        self.assertFalse(is_valid_git_refname('invalid{0}char'.format(
            unichr(39))))
        self.assertFalse(is_valid_git_refname('ref~name/bar'))
        self.assertFalse(is_valid_git_refname('refname spam'))
        self.assertFalse(is_valid_git_refname('bar/foo/eggs~spam'))
        self.assertFalse(is_valid_git_refname('bar:_spam_'))
        self.assertFalse(is_valid_git_refname('eggtastic^2'))
        self.assertFalse(is_valid_git_refname('areyourandy?'))
        self.assertFalse(is_valid_git_refname('bar/*/spam'))
        self.assertFalse(is_valid_git_refname('bar[spam]/eggs'))
        self.assertFalse(is_valid_git_refname('/barfooeggs'))
        self.assertFalse(is_valid_git_refname('barfooeggs/'))
        self.assertFalse(is_valid_git_refname('bar/foo//////eggs'))
        self.assertFalse(is_valid_git_refname('dotEnding.'))
        self.assertFalse(is_valid_git_refname('@{branch'))
        self.assertFalse(is_valid_git_refname('contains\\slash'))
        self.assertFalse(is_valid_git_refname('$newbranch'))

    def test_is_valid_git_sha1(self):
        """Test correct validation of sha1 strings"""
        # valid sha1 strings
        self.assertTrue(is_valid_git_sha1(
            '65a9ba2ef4b126fb5b054ea6b89b457463db4ec6'))
        self.assertTrue(is_valid_git_sha1(
            'a29a9911e41253405494c43889925a6d79ca26db'))
        self.assertTrue(is_valid_git_sha1(
            'e099cd5fdea89eba929d6051fbd26cc9e7a0c961'))
        self.assertTrue(is_valid_git_sha1(
            '44235d322c3386bd5ce872d9d7ea2e10d27c86cb'))
        self.assertTrue(is_valid_git_sha1(
            '7d2fc23E04540EE92c742948cca9ed5bc54d08d1'))
        self.assertTrue(is_valid_git_sha1(
            'fb5dc0285a8b11f199c4f3a7547a2da38138373f'))
        self.assertTrue(is_valid_git_sha1(
            '0b2abAEb195ba7ebc5cfdb53213a66fbaddefdb8'))

        # invalid length
        self.assertFalse(is_valid_git_sha1('cca9ed5bc54d08d1'))
        self.assertFalse(is_valid_git_sha1(''))

        # invalid characters
        self.assertFalse(is_valid_git_sha1(
            'fb5dy0f85a8b11f199c4f3a75474a2das8138373'))
        self.assertFalse(is_valid_git_sha1(
            '0x5dcc816fbc1c2e8eX087d7d2ed8d2950a7c16b'))
class MetadataMapTests(TestCase):
    """Tests for the MetadataMap class."""

    def setUp(self):
        """Create MetadataMap objects that will be used in the tests."""
        # Create a map using the overview tutorial mapping file.
        self.overview_map_str = [
            "#SampleID\tBarcodeSequence\tTreatment\tDOB\tDescription",
            "PC.354\tAGCACGAGCCTA\tControl\t20061218\t354",
            "PC.355\tAACTCGTCGATG\tControl\t20061218\t355",
            "PC.356\tACAGACCACTCA\tControl\t20061126\t356",
            "PC.481\tACCAGCGACTAG\tControl\t20070314\t481",
            "PC.593\tAGCAGCACTTGT\tControl\t20071210\t593",
            "PC.607\tAACTGTGCGTAC\tFast\t20071112\t607",
            "PC.634\tACAGAGTCGGCT\tFast\t20080116\t634",
            "PC.635\tACCGCAGAGTCA\tFast\t20080116\t635",
            "PC.636\tACGGTGAGTGTC\tFast\t20080116\t636"]
        self.overview_map = MetadataMap(
            *parse_mapping_file_to_dict(self.overview_map_str))

        # Create the same overview tutorial map, but this time with some
        # comments.
        self.comment = "# Some comments about this mapping file"
        self.map_with_comments_str = self.overview_map_str[:]
        self.map_with_comments_str.insert(1, self.comment)
        self.map_with_comments = MetadataMap(*parse_mapping_file_to_dict(
            self.map_with_comments_str))

        # Create a MetadataMap object that has no metadata (i.e. no sample
        # IDs, so no metadata about samples).
        self.empty_map = MetadataMap({}, [])

        # Create a MetadataMap object that has samples (i.e. sample IDs) but
        # not associated metadata (i.e. no columns other than SampleID).
        self.no_metadata_str = ["#SampleID",
                                "PC.354",
                                "PC.355",
                                "PC.356",
                                "PC.481",
                                "PC.593",
                                "PC.607",
                                "PC.634",
                                "PC.635",
                                "PC.636"]
        self.no_metadata = MetadataMap(*parse_mapping_file_to_dict(
            self.no_metadata_str))

        # Create a MetadataMap object that has a category with only one value
        # throughout the entire column.
        self.single_value_str = ["#SampleID\tFoo",
                                 "PC.354\tfoo",
                                 "PC.355\tfoo",
                                 "PC.356\tfoo",
                                 "PC.481\tfoo",
                                 "PC.593\tfoo",
                                 "PC.607\tfoo",
                                 "PC.634\tfoo",
                                 "PC.635\tfoo",
                                 "PC.636\tfoo"]
        self.single_value = MetadataMap(*parse_mapping_file_to_dict(
            self.single_value_str))

    def test_parseMetadataMap(self):
        """Test parsing a mapping file into a MetadataMap instance."""
        obs = MetadataMap.parseMetadataMap(self.overview_map_str)
        self.assertEqual(obs, self.overview_map)

    def test_parseMetadataMap_empty(self):
        """Test parsing empty mapping file contents."""
        self.assertRaises(QiimeParseError, MetadataMap.parseMetadataMap, [])

    def test_eq(self):
        """Test whether two MetadataMaps are equal."""
        self.assertTrue(self.empty_map == MetadataMap({}, []))
        self.assertTrue(self.overview_map == MetadataMap(
            self.overview_map._metadata, self.overview_map.Comments))

    def test_ne(self):
        """Test whether two MetadataMaps are not equal."""
        self.assertTrue(self.empty_map != MetadataMap({}, ["foo"]))
        self.assertTrue(self.overview_map != MetadataMap(
            self.overview_map._metadata, ["foo"]))
        self.assertTrue(self.overview_map != MetadataMap({},
                        self.overview_map.Comments))
        self.assertTrue(self.overview_map != self.empty_map)
        self.assertTrue(self.overview_map != self.map_with_comments)
        self.assertTrue(self.overview_map != self.no_metadata)

    def test_getSampleMetadata(self):
        """Test metadata by sample ID accessor with valid sample IDs."""
        exp = {'BarcodeSequence': 'AGCACGAGCCTA', 'Treatment': 'Control',
               'DOB': '20061218', 'Description': '354'}
        obs = self.overview_map.getSampleMetadata('PC.354')
        self.assertEqual(obs, exp)

        exp = {'BarcodeSequence': 'ACCAGCGACTAG', 'Treatment': 'Control',
               'DOB': '20070314', 'Description': '481'}
        obs = self.map_with_comments.getSampleMetadata('PC.481')
        self.assertEqual(obs, exp)

        exp = {'BarcodeSequence': 'ACGGTGAGTGTC', 'Treatment': 'Fast',
               'DOB': '20080116', 'Description': '636'}
        obs = self.map_with_comments.getSampleMetadata('PC.636')
        self.assertEqual(obs, exp)

        # A map that only has sample IDs yields an empty metadata dict.
        exp = {}
        obs = self.no_metadata.getSampleMetadata('PC.636')
        self.assertEqual(obs, exp)

    def test_getSampleMetadata_bad_sample_id(self):
        """Test metadata by sample ID accessor with invalid sample IDs."""
        # Nonexistent sample ID.
        self.assertRaises(KeyError, self.overview_map.getSampleMetadata,
                          'PC.000')
        self.assertRaises(KeyError, self.no_metadata.getSampleMetadata,
                          'PC.000')
        # Integer sample ID.
        self.assertRaises(KeyError, self.overview_map.getSampleMetadata, 42)
        # Sample ID of type None.
        self.assertRaises(KeyError, self.overview_map.getSampleMetadata, None)

        # Sample ID on empty map.
        self.assertRaises(KeyError, self.empty_map.getSampleMetadata, 's1')
        # Integer sample ID on empty map.
        self.assertRaises(KeyError, self.empty_map.getSampleMetadata, 1)
        # Sample ID of None on empty map.
        self.assertRaises(KeyError, self.empty_map.getSampleMetadata, None)

    def test_getCategoryValue(self):
        """Test category value by sample ID/category name accessor."""
        exp = "Fast"
        obs = self.overview_map.getCategoryValue('PC.634', 'Treatment')
        self.assertEqual(obs, exp)

        exp = "20070314"
        obs = self.overview_map.getCategoryValue('PC.481', 'DOB')
        self.assertEqual(obs, exp)

        exp = "ACGGTGAGTGTC"
        obs = self.map_with_comments.getCategoryValue(
            'PC.636', 'BarcodeSequence')
        self.assertEqual(obs, exp)

    def test_getCategoryValues(self):
        """Test category value list by sample ID/category name accessor."""
        smpl_ids = ['PC.354', 'PC.355', 'PC.356', 'PC.481', 'PC.593',
                    'PC.607', 'PC.634', 'PC.635', 'PC.636']
        exp = ['Control', 'Control', 'Control', 'Control', 'Control', 'Fast',
               'Fast', 'Fast', 'Fast']
        obs = self.overview_map.getCategoryValues(smpl_ids, 'Treatment')
        self.assertEqual(obs, exp)

    def test_isNumericCategory(self):
        """Test checking if a category is numeric."""
        obs = self.overview_map.isNumericCategory('Treatment')
        self.assertEqual(obs, False)

        obs = self.overview_map.isNumericCategory('DOB')
        self.assertEqual(obs, True)

    def test_hasUniqueCategoryValues(self):
        """Test checking if a category has unique values."""
        obs = self.overview_map.hasUniqueCategoryValues('Treatment')
        self.assertEqual(obs, False)

        obs = self.overview_map.hasUniqueCategoryValues('DOB')
        self.assertEqual(obs, False)

        obs = self.overview_map.hasUniqueCategoryValues('Description')
        self.assertEqual(obs, True)

    def test_hasSingleCategoryValue(self):
        """Test checking if a category has only a single value."""
        obs = self.overview_map.hasSingleCategoryValue('Treatment')
        self.assertEqual(obs, False)

        obs = self.single_value.hasSingleCategoryValue('Foo')
        self.assertEqual(obs, True)

    def test_getCategoryValue_bad_sample_id(self):
        """Test category value by sample ID accessor with bad sample IDs."""
        # Nonexistent sample ID.
        self.assertRaises(KeyError, self.overview_map.getCategoryValue,
                          'PC.000', 'Treatment')
        self.assertRaises(KeyError, self.no_metadata.getCategoryValue,
                          'PC.000', 'Treatment')
        # Integer sample ID.
        self.assertRaises(KeyError, self.overview_map.getCategoryValue,
                          42, 'DOB')
        # Sample ID of type None.
        self.assertRaises(KeyError, self.overview_map.getCategoryValue,
                          None, 'Treatment')

        # Sample ID on empty map.
        self.assertRaises(KeyError, self.empty_map.getCategoryValue,
                          's1', 'foo')
        # Integer sample ID on empty map.
        self.assertRaises(KeyError, self.empty_map.getCategoryValue,
                          1, 'bar')
        # Sample ID of None on empty map.
        self.assertRaises(KeyError, self.empty_map.getCategoryValue,
                          None, 'baz')

    def test_getCategoryValue_bad_category(self):
        """Test category value by sample ID accessor with bad categories."""
        # Nonexistent category.
        self.assertRaises(KeyError, self.overview_map.getCategoryValue,
                          'PC.354', 'foo')
        # Integer category.
        self.assertRaises(KeyError, self.overview_map.getCategoryValue,
                          'PC.354', 42)
        # Category of type None.
        self.assertRaises(KeyError, self.overview_map.getCategoryValue,
                          'PC.354', None)

        # Category on map with no metadata, but that has sample IDs.
        self.assertRaises(KeyError, self.no_metadata.getCategoryValue,
                          'PC.354', 'Treatment')
        # Integer category on map with no metadata.
        self.assertRaises(KeyError, self.no_metadata.getCategoryValue,
                          'PC.354', 34)
        # Category of type None on map with no metadata.
        self.assertRaises(KeyError, self.no_metadata.getCategoryValue,
                          'PC.354', None)

    def test_SampleIds(self):
        """Test sample IDs accessor."""
        exp = ["PC.354", "PC.355", "PC.356", "PC.481", "PC.593", "PC.607",
               "PC.634", "PC.635", "PC.636"]
        obs = self.overview_map.SampleIds
        self.assertEqual(obs, exp)

        obs = self.no_metadata.SampleIds
        self.assertEqual(obs, exp)

        obs = self.empty_map.SampleIds
        self.assertEqual(obs, [])

    def test_CategoryNames(self):
        """Test category names accessor."""
        exp = ["BarcodeSequence", "DOB", "Description", "Treatment"]
        obs = self.overview_map.CategoryNames
        self.assertEqual(obs, exp)

        obs = self.no_metadata.CategoryNames
        self.assertEqual(obs, [])

        obs = self.empty_map.CategoryNames
        self.assertEqual(obs, [])

    def test_filterSamples(self):
        """Test filtering out samples from metadata map."""
        exp = ['PC.356', 'PC.593']
        self.overview_map.filterSamples(['PC.593', 'PC.356'])
        obs = self.overview_map.SampleIds
        self.assertEqual(obs, exp)

        self.overview_map.filterSamples([])
        self.assertEqual(self.overview_map.SampleIds, [])

    def test_filterSamples_strict(self):
        """Test strict checking of sample prescence when filtering."""
        with self.assertRaises(ValueError):
            self.overview_map.filterSamples(['PC.356', 'abc123'])

        with self.assertRaises(ValueError):
            self.empty_map.filterSamples(['foo'])

    def test_filterSamples_no_strict(self):
        """Test missing samples does not raise error."""
        self.overview_map.filterSamples(['PC.356', 'abc123'], strict=False)
        self.assertEqual(self.overview_map.SampleIds, ['PC.356'])

        self.empty_map.filterSamples(['foo'], strict=False)
        self.assertEqual(self.empty_map.SampleIds, [])

    def test_is_valid_git_refname(self):
        """Test correct validation of refnames"""
        # NOTE(review): unichr and the u'' literal below are Python 2 only —
        # confirm the project still targets Python 2 before porting.
        # valid branchnames
        self.assertTrue(is_valid_git_refname('master'))
        self.assertTrue(is_valid_git_refname('debuggatron_2000'))
        self.assertTrue(is_valid_git_refname('refname/bar'))
        self.assertTrue(is_valid_git_refname('ref.nameslu/_eggs_/spam'))
        self.assertTrue(is_valid_git_refname('valid{0}char'.format(
            unichr(40))))
        self.assertTrue(is_valid_git_refname('master@head'))
        self.assertTrue(is_valid_git_refname('bar{thing}foo'))

        # case happening with git < 1.6.6
        self.assertFalse(is_valid_git_refname(
            '--abbrev-ref\nbaa350d7b7063d585ca293fc16ef15e0765dc9ee'))

        # different invalid refnames, for a description of each group see the
        # man page of git check-ref-format
        self.assertFalse(is_valid_git_refname('bar/.spam/eggs'))
        self.assertFalse(is_valid_git_refname('bar.lock/spam/eggs'))
        self.assertFalse(is_valid_git_refname('bar.lock'))
        self.assertFalse(is_valid_git_refname('.foobar'))
        self.assertFalse(is_valid_git_refname('ref..name'))
        self.assertFalse(is_valid_git_refname(u'invalid{0}char'.format(
            unichr(177))))
        self.assertFalse(is_valid_git_refname('invalid{0}char'.format(
            unichr(39))))
        self.assertFalse(is_valid_git_refname('ref~name/bar'))
        self.assertFalse(is_valid_git_refname('refname spam'))
        self.assertFalse(is_valid_git_refname('bar/foo/eggs~spam'))
        self.assertFalse(is_valid_git_refname('bar:_spam_'))
        self.assertFalse(is_valid_git_refname('eggtastic^2'))
        self.assertFalse(is_valid_git_refname('areyourandy?'))
        self.assertFalse(is_valid_git_refname('bar/*/spam'))
        self.assertFalse(is_valid_git_refname('bar[spam]/eggs'))
        self.assertFalse(is_valid_git_refname('/barfooeggs'))
        self.assertFalse(is_valid_git_refname('barfooeggs/'))
        self.assertFalse(is_valid_git_refname('bar/foo//////eggs'))
        self.assertFalse(is_valid_git_refname('dotEnding.'))
        self.assertFalse(is_valid_git_refname('@{branch'))
        self.assertFalse(is_valid_git_refname('contains\\slash'))
        self.assertFalse(is_valid_git_refname('$newbranch'))

    def test_is_valid_git_sha1(self):
        """Test correct validation of sha1 strings"""
        # valid sha1 strings
        self.assertTrue(is_valid_git_sha1(
            '65a9ba2ef4b126fb5b054ea6b89b457463db4ec6'))
        self.assertTrue(is_valid_git_sha1(
            'a29a9911e41253405494c43889925a6d79ca26db'))
        self.assertTrue(is_valid_git_sha1(
            'e099cd5fdea89eba929d6051fbd26cc9e7a0c961'))
        self.assertTrue(is_valid_git_sha1(
            '44235d322c3386bd5ce872d9d7ea2e10d27c86cb'))
        self.assertTrue(is_valid_git_sha1(
            '7d2fc23E04540EE92c742948cca9ed5bc54d08d1'))
        self.assertTrue(is_valid_git_sha1(
            'fb5dc0285a8b11f199c4f3a7547a2da38138373f'))
        self.assertTrue(is_valid_git_sha1(
            '0b2abAEb195ba7ebc5cfdb53213a66fbaddefdb8'))

        # invalid length
        self.assertFalse(is_valid_git_sha1('cca9ed5bc54d08d1'))
        self.assertFalse(is_valid_git_sha1(''))

        # invalid characters
        self.assertFalse(is_valid_git_sha1(
            'fb5dy0f85a8b11f199c4f3a75474a2das8138373'))
        self.assertFalse(is_valid_git_sha1(
            '0x5dcc816fbc1c2e8eX087d7d2ed8d2950a7c16b'))
def preprocess_mapping_file(data, headers, columns, unique=False, single=False,
                            clones=0):
    """Process a mapping file to expand the data or remove unuseful fields

    Inputs:
    data: mapping file data
    headers: mapping file headers
    columns: list of headers to keep, if one of these headers includes two
    ampersands, this function will create a new column by merging the
    delimited columns.
    unique: remove columns where all values are unique
    single: remove columns where all values are the same
    clones: number of times to replicate the metadata

    Outputs:
    data: processed mapping file data
    headers: processed mapping file headers
    """
    # The sample ID must always be there, else it's meaningless data
    if 'SampleID' != columns[0]:
        columns = ['SampleID'] + columns

    # process concatenated columns if needed
    merge = [column for column in columns if '&&' in column]

    # each element needs several columns to be merged
    for new_column in merge:
        indices = [headers.index(header_name)
                   for header_name in new_column.split('&&')]

        # join all the fields of the metadata that are listed in indices
        for line in data:
            line.append(''.join([line[index] for index in indices]))
        headers.append(new_column)

    # remove all unique or single valued columns
    if unique or single:
        columns_to_remove = []
        metadata = MetadataMap(mapping_file_to_dict(data, headers), [])

        # find columns that have values that are all unique
        if unique:
            columns_to_remove += [column_name
                                  for column_name in headers[1:]
                                  if metadata.hasUniqueCategoryValues(
                                      column_name)]

        # remove categories where there is only one value
        if single:
            columns_to_remove += [column_name
                                  for column_name in headers[1:]
                                  if metadata.hasSingleCategoryValue(
                                      column_name)]

        columns_to_remove = list(set(columns_to_remove))

        # remove the single or unique columns
        data, headers = keep_columns_from_mapping_file(data, headers,
                                                       columns_to_remove,
                                                       negate=True)

    # remove anything not specified in the input
    data, headers = keep_columns_from_mapping_file(data, headers, columns)

    # sanitize the mapping file data and headers
    data, headers = sanitize_mapping_file(data, headers)

    # clones mean: replicate the metadata retagging the sample ids with a
    # suffix
    if clones:
        out_data = []
        for index in range(clones):
            out_data.extend([[element[0] + '_%d' % index] + element[1:]
                             for element in data])
        data = out_data

    return data, headers
def preprocess_mapping_file(data, headers, columns, unique=False, single=False,
                            clones=0):
    """Process a mapping file to expand the data or remove unuseful fields

    Inputs:
    data: mapping file data
    headers: mapping file headers
    columns: list of headers to keep, if one of these headers includes two
    ampersands, this function will create a new column by merging the
    delimited columns. The list may contain a None element when the caller
    did not explicitly select categories (see below).
    unique: remove columns where all values are unique (unless the column is
    explicitly listed in columns)
    single: remove columns where all values are the same (unless the column
    is explicitly listed in columns)
    clones: number of times to replicate the metadata

    Outputs:
    data: processed mapping file data
    headers: processed mapping file headers
    """
    # The sample ID must always be there, else it's meaningless data
    if 'SampleID' != columns[0]:
        columns = ['SampleID'] + columns

    # process concatenated columns if needed
    merge = []
    for column in columns:
        # the list can contain None so check "if column" before treating it
        # as a string
        if column and '&&' in column:
            merge.append(column)

    # each element needs several columns to be merged
    for new_column in merge:
        indices = [headers.index(header_name)
                   for header_name in new_column.split('&&')]

        # join all the fields of the metadata that are listed in indices
        for line in data:
            line.append(''.join([line[index] for index in indices]))
        headers.append(new_column)

    # remove all unique or single valued columns that are not included in
    # the list of categories that should be kept i.e. columns
    if unique or single:
        columns_to_remove = []
        metadata = MetadataMap(mapping_file_to_dict(data, headers), [])

        # the --color_by option in the script interface allows the user to
        # specify the categories to use in the generated plot; the default
        # behaviour is to color by all categories that are not unique. If the
        # user specifies a category with the --color_by option and this
        # category contains unique values, this category must still be added,
        # thus the two routes below: (1) no value was specified in the CLI
        # (the value of columns will be [None, x1, x2, x3] where x{1,2,3} are
        # categories requested in other CLI options) and (2) a value was
        # specified in the CLI.
        #
        # TL;DR
        # see https://github.com/biocore/emperor/issues/271
        if None in columns:
            columns = headers[:]
            f_unique = metadata.hasUniqueCategoryValues
            f_single = metadata.hasSingleCategoryValue
        else:
            # plain defs instead of lambda assignments (PEP 8 / E731); a
            # requested category is never removed even if unique/single
            def f_unique(category):
                return (metadata.hasUniqueCategoryValues(category) and
                        category not in columns)

            def f_single(category):
                return (metadata.hasSingleCategoryValue(category) and
                        category not in columns)

        # find columns that have values that are all unique
        if unique:
            columns_to_remove += [c for c in headers[1:] if f_unique(c)]

        # remove categories where there is only one value
        if single:
            columns_to_remove += [c for c in headers[1:] if f_single(c)]

        columns_to_remove = list(set(columns_to_remove))

        # remove the single or unique columns
        data, headers = keep_columns_from_mapping_file(data, headers,
                                                       columns_to_remove,
                                                       negate=True)
    else:
        # when a None is contained in columns, we imply we want to use all
        # the available categories in the mapping file, thus just overwrite
        # the value
        if None in columns:
            columns = headers[:]

    # remove anything not specified in the input
    data, headers = keep_columns_from_mapping_file(data, headers, columns)

    # sanitize the mapping file data and headers
    data, headers = sanitize_mapping_file(data, headers)

    # clones mean: replicate the metadata retagging the sample ids with a
    # suffix
    if clones:
        out_data = []
        for index in range(clones):
            out_data.extend([[element[0] + '_%d' % index] + element[1:]
                             for element in data])
        data = out_data

    return data, headers