def setUp(self):
    """Create the matrix fixtures and the parallel lists describing them."""
    super(DissimilarityMatrixTests, self).setUp()

    # Build the four matrices once, then expose each under its own name.
    self.dms = [
        DissimilarityMatrix(self.dm_1x1_data, ['a']),
        DissimilarityMatrix(self.dm_2x2_data, ['a', 'b']),
        DissimilarityMatrix(self.dm_2x2_asym_data, ['a', 'b']),
        DissimilarityMatrix(self.dm_3x3_data, ['a', 'b', 'c']),
    ]
    (self.dm_1x1, self.dm_2x2,
     self.dm_2x2_asym, self.dm_3x3) = self.dms

    # Every list below is index-aligned with ``self.dms``.
    self.dm_f_lines = [
        DM_1x1_F,
        DM_2x2_F,
        self.dm_2x2_asym_lines,
        self.dm_3x3_lines,
    ]
    self.dm_fs = [
        self.dm_1x1_f,
        self.dm_2x2_f,
        self.dm_2x2_asym_f,
        self.dm_3x3_f,
    ]
    self.dm_shapes = [(1, 1), (2, 2), (2, 2), (3, 3)]
    self.dm_sizes = [1, 4, 4, 9]

    # Only the asymmetric entry gets a distinct matrix here; the other
    # slots reuse the fixture objects themselves.
    asym_transposed = DissimilarityMatrix([[0, -2], [1, 0]], ['a', 'b'])
    self.dm_transposes = [
        self.dm_1x1,
        self.dm_2x2,
        asym_transposed,
        self.dm_3x3,
    ]

    raw_inputs = (
        self.dm_1x1_data,
        self.dm_2x2_data,
        self.dm_2x2_asym_data,
        self.dm_3x3_data,
    )
    self.dm_redundant_forms = [np.array(raw) for raw in raw_inputs]
def test_init_invalid_input(self):
    # Each (data, ids) pair below must make the constructor raise
    # DissimilarityMatrixError.
    square = [[0, 1], [1, 0]]
    rejected = [
        # Empty data.
        ([], []),
        # Another type of empty data.
        (np.empty((0, 0)), []),
        # Invalid number of dimensions.
        ([1, 2, 3], ['a']),
        # Dimensions don't match.
        ([[1, 2, 3]], ['a']),
        # Duplicate IDs.
        (square, ['a', 'a']),
        # Number of IDs doesn't match dimensions.
        (square, ['a', 'b', 'c']),
        (square, []),
        # Non-hollow.
        ([[0.0, 1.0], [1.0, 0.01]], ['a', 'b']),
    ]

    for data, ids in rejected:
        with self.assertRaises(DissimilarityMatrixError):
            DissimilarityMatrix(data, ids)
def test_filter_asymmetric(self):
    # 2x2, IDs requested as ['b', 'a'].
    swapped_ids = ['b', 'a']
    expected = DissimilarityMatrix([[0, -2], [1, 0]], swapped_ids)
    self.assertEqual(self.dm_2x2_asym.filter(swapped_ids), expected)

    # 3x3, keeping two of the three IDs.
    full = DissimilarityMatrix(
        [[0, 10, 53],
         [42, 0, 22.5],
         [53, 1, 0]],
        ('bro', 'brah', 'breh'))
    kept_ids = ['breh', 'brah']
    expected = DissimilarityMatrix([[0, 1], [22.5, 0]], kept_ids)
    self.assertEqual(full.filter(kept_ids), expected)
def test_from_iterable_asymmetric_data(self):
    # The metric is the signed difference ``b - a``, so the expected
    # matrix has opposite signs on either side of the diagonal.
    def signed_difference(a, b):
        return b - a

    values = (n for n in range(4))
    expected = DissimilarityMatrix([
        [0, 1, 2, 3],
        [-1, 0, 1, 2],
        [-2, -1, 0, 1],
        [-3, -2, -1, 0],
    ])

    built = DissimilarityMatrix.from_iterable(values, signed_difference)
    self.assertEqual(built, expected)
def test_preprocess_input_raises_error(self):
    # Requires a DistanceMatrix.
    with self.assertRaises(TypeError):
        wrong_type = DissimilarityMatrix([[0, 2], [3, 0]], ['a', 'b'])
        _preprocess_input(wrong_type, [1, 2], None)

    # Each (grouping, column) pair below must raise ValueError when
    # combined with the ``self.dm`` fixture.
    invalid_calls = [
        # Requires column if DataFrame.
        (self.df, None),
        # Cannot provide column if not data frame.
        (self.grouping, 'Group'),
        # Column must exist in data frame.
        (self.df, 'foo'),
        # All distance matrix IDs must be in data frame.
        (self.df_missing_id, 'Group'),
        # Grouping vector length must match number of objects in dm.
        ([1, 2], None),
        # Grouping vector cannot have only unique values.
        ([1, 2, 3], None),
        # Grouping vector cannot have only a single group.
        ([1, 1, 1], None),
    ]

    for grouping, column in invalid_calls:
        with self.assertRaises(ValueError):
            _preprocess_input(self.dm, grouping, column)
def test_init_invalid_input(self):
    # Requires a DistanceMatrix.
    with self.assertRaises(TypeError):
        wrong_type = DissimilarityMatrix([[0, 2], [3, 0]], ['a', 'b'])
        CategoricalStats(wrong_type, [1, 2])

    # Each (grouping, keyword-arguments) pair below must raise ValueError
    # when combined with the ``self.dm`` fixture.
    invalid_calls = [
        # Requires column if DataFrame.
        (self.df, {}),
        # Cannot provide column if not data frame.
        (self.grouping, {'column': 'Group'}),
        # Column must exist in data frame.
        (self.df, {'column': 'foo'}),
        # All distance matrix IDs must be in data frame.
        (self.df_missing_id, {'column': 'Group'}),
        # Grouping vector length must match number of objects in dm.
        ([1, 2], {}),
        # Grouping vector cannot have only unique values.
        ([1, 2, 3], {}),
        # Grouping vector cannot have only a single group.
        ([1, 1, 1], {}),
    ]

    for grouping, kwargs in invalid_calls:
        with self.assertRaises(ValueError):
            CategoricalStats(self.dm, grouping, **kwargs)
def test_eq(self):
    """Check ``==`` in both directions against an equal matrix."""
    # A DissimilarityMatrix carrying the same data and IDs as the 3x3
    # fixture must compare equal regardless of operand order.
    twin = DissimilarityMatrix(self.dm_3x3_data, ['a', 'b', 'c'])
    for left, right in ((self.dm_3x3, twin), (twin, self.dm_3x3)):
        self.assertTrue(left == right)
def test_filter_reorder(self):
    # All three IDs are kept; only their order changes.
    new_order = ['c', 'a', 'b']
    reordered_data = [
        [0, 4.2, 12],
        [4.2, 0, 0.01],
        [12, 0.01, 0],
    ]
    expected = DissimilarityMatrix(reordered_data, new_order)
    self.assertEqual(self.dm_3x3.filter(new_order), expected)
def test_from_iterable_non_hollow_data(self):
    # A constant metric yields 1 everywhere, including on the diagonal.
    def always_one(a, b):
        return 1

    values = (n for n in range(4))
    expected = DissimilarityMatrix([[1] * 4 for _ in range(4)])

    built = DissimilarityMatrix.from_iterable(values, always_one)
    self.assertEqual(built, expected)
def test_from_iterable_no_key(self):
    # No ``key`` is passed; the expected matrix is likewise built without
    # explicit IDs.
    def absolute_difference(a, b):
        return abs(b - a)

    values = (n for n in range(4))
    expected = DissimilarityMatrix([
        [0, 1, 2, 3],
        [1, 0, 1, 2],
        [2, 1, 0, 1],
        [3, 2, 1, 0],
    ])

    built = DissimilarityMatrix.from_iterable(values, absolute_difference)
    self.assertEqual(built, expected)
def test_filter_subset(self):
    # Two-ID subsets of the 3x3 fixture.
    subsets = [
        (('c', 'a'), [[0, 4.2], [4.2, 0]]),
        (('b', 'a'), [[0, 0.01], [0.01, 0]]),
    ]
    for ids, expected_data in subsets:
        expected = DissimilarityMatrix(expected_data, ids)
        self.assertEqual(self.dm_3x3.filter(ids), expected)

    # 4x4 built without explicit IDs, filtered with a NumPy array of
    # ID strings.
    four_by_four = DissimilarityMatrix([
        [0, 1, 55, 7],
        [1, 0, 16, 1],
        [55, 16, 0, 23],
        [7, 1, 23, 0],
    ])
    ids = np.asarray(['3', '0', '1'])
    expected = DissimilarityMatrix([[0, 7, 1], [7, 0, 1], [1, 1, 0]], ids)
    self.assertEqual(four_by_four.filter(ids), expected)
def test_from_iterable_with_key(self):
    # ``key`` maps each element to its ID: the square of the value,
    # rendered as a string.
    def absolute_difference(a, b):
        return abs(b - a)

    def squared_label(x):
        return str(x**2)

    values = (n for n in range(4))
    expected = DissimilarityMatrix(
        [
            [0, 1, 2, 3],
            [1, 0, 1, 2],
            [2, 1, 0, 1],
            [3, 2, 1, 0],
        ],
        ['0', '1', '4', '9'])

    built = DissimilarityMatrix.from_iterable(
        values, absolute_difference, key=squared_label)
    self.assertEqual(built, expected)
def setUp(self):
    """Build matrix objects, expected strings and file handles in parallel lists."""
    super(DissimilarityAndDistanceMatrixReaderWriterTests, self).setUp()

    self.lsmat_1x1_data = [[0.0]]
    self.lsmat_2x2_data = [[0.0, 0.123], [0.123, 0.0]]
    self.lsmat_2x2_asym_data = [[0.0, 1.0], [-2.0, 0.0]]
    self.lsmat_3x3_data = [
        [0.0, 0.01, 4.2],
        [0.01, 0.0, 12.0],
        [4.2, 12.0, 0.0],
    ]

    # The 3x3 example appears twice because it has two file format
    # representations, one messy and one clean. Both must be read into an
    # equivalent object and written to an equivalent format, hence the
    # duplicated 3x3 objects and strings.
    dissim_specs = [
        (self.lsmat_1x1_data, ['a']),
        (self.lsmat_2x2_data, ['a', 'b']),
        (self.lsmat_2x2_asym_data, ['a', 'b']),
        (self.lsmat_3x3_data, ['a', 'b', 'c']),
        (self.lsmat_3x3_data, ['a', 'b', 'c']),
    ]
    self.dissim_objs = [
        DissimilarityMatrix(data, ids) for data, ids in dissim_specs
    ]
    self.dissim_strs = [
        LSMat_1x1,
        LSMat_2x2,
        LSMat_2x2_ASYM,
        LSMat_3x3,
        LSMat_3x3,
    ]
    self.dissim_fhs = [
        self.lsmat_1x1_fh,
        self.lsmat_2x2_fh,
        self.lsmat_2x2_asym_fh,
        self.lsmat_3x3_fh,
        self.lsmat_3x3_whitespace_fh,
    ]

    # The DistanceMatrix lists leave out the asymmetric 2x2 example.
    dist_specs = [
        (self.lsmat_1x1_data, ['a']),
        (self.lsmat_2x2_data, ['a', 'b']),
        (self.lsmat_3x3_data, ['a', 'b', 'c']),
        (self.lsmat_3x3_data, ['a', 'b', 'c']),
    ]
    self.dist_objs = [DistanceMatrix(data, ids) for data, ids in dist_specs]
    self.dist_strs = [LSMat_1x1, LSMat_2x2, LSMat_3x3, LSMat_3x3]
    self.dist_fhs = [
        self.lsmat_1x1_fh,
        self.lsmat_2x2_fh,
        self.lsmat_3x3_fh,
        self.lsmat_3x3_whitespace_fh,
    ]
def create_correlation_plot(EDM, artists, cmap='Reds',
                            title='Lyrical Similarity', pause_seconds=500):
    """Plot a heatmap of a matrix labeled with artist names.

    The matrix is wrapped in a scikit-bio ``DissimilarityMatrix`` and drawn
    with its ``plot`` method. The figure is then shown and kept open for
    ``pause_seconds`` via ``matplotlib.pyplot.pause``.

    Parameters
    ----------
    EDM : array-like
        Square matrix passed as the data of ``DissimilarityMatrix``.
        (Presumably a Euclidean distance matrix -- confirm with the caller.)
    artists : sequence of str
        Labels passed as the IDs of ``DissimilarityMatrix``.
    cmap : str, optional
        Colormap forwarded to ``DissimilarityMatrix.plot``.
    title : str, optional
        Figure title forwarded to ``DissimilarityMatrix.plot``.
    pause_seconds : float, optional
        Interval passed to ``matplotlib.pyplot.pause`` after the figure is
        shown. The default matches the previously hard-coded 500 seconds.
    """
    # Imported locally so the plotting dependencies are only required when
    # this function is actually called.
    from skbio.stats.distance import DissimilarityMatrix
    import matplotlib.pyplot as plt

    dm = DissimilarityMatrix(EDM, artists)
    fig = dm.plot(cmap=cmap, title=title)
    fig.show()
    # pause() keeps the window on screen for the interval before returning.
    plt.pause(pause_seconds)
def test_init_from_dm(self):
    new_ids = ['foo', 'bar', 'baz']

    # DissimilarityMatrix -> DissimilarityMatrix
    expected = DissimilarityMatrix(self.dm_3x3_data, new_ids)
    rebuilt = DissimilarityMatrix(self.dm_3x3, new_ids)
    self.assertEqual(rebuilt, expected)

    # The data must be shared rather than copied: it is the same array
    # object, so a write through one matrix is visible through the other.
    self.assertIs(rebuilt.data, self.dm_3x3.data)
    rebuilt.data[0, 1] = 424242
    self.assertTrue(np.array_equal(rebuilt.data, self.dm_3x3.data))

    # DistanceMatrix -> DissimilarityMatrix
    expected = DissimilarityMatrix(self.dm_3x3_data, new_ids)
    source = DistanceMatrix(self.dm_3x3_data, ('a', 'b', 'c'))
    rebuilt = DissimilarityMatrix(source, new_ids)
    self.assertEqual(rebuilt, expected)

    # DissimilarityMatrix -> DistanceMatrix
    with self.assertRaises(DistanceMatrixError):
        DistanceMatrix(self.dm_2x2_asym, ['foo', 'bar'])
def test_avoid_copy_on_construction(self):
    # Each case is (input data, whether ``.data`` is expected to be a
    # different object from the input).
    square = [[0, 1], [1, 0]]
    cases = (
        ([[0, 1], [1, 0]], True),
        ([(0, 1), (1, 0)], True),
        (((0, 1), (1, 0)), True),
        (np.array(square, dtype='int'), True),
        (np.array(square, dtype='float'), False),
        (np.array(square, dtype=np.float32), False),
        (np.array(square, dtype=np.float64), False),
        (np.array(square, dtype='double'), False),
    )

    for source, expect_copy in cases:
        matrix = DissimilarityMatrix(source)
        was_copied = matrix.data is not source
        self.assertEqual(was_copied, expect_copy)
def test_plot_no_default(self):
    # Plot with a non-default colormap and title, then inspect the figure.
    expected_labels = ['0', 'one', '2', 'three', '4.000']
    ids = list(expected_labels)
    data = (
        [0, 1, 2, 3, 4],
        [1, 0, 1, 2, 3],
        [2, 1, 0, 1, 2],
        [3, 2, 1, 0, 1],
        [4, 3, 2, 1, 0],
    )

    fig = DissimilarityMatrix(data, ids).plot(cmap='Reds', title='Testplot')
    self.assertIsInstance(fig, mpl.figure.Figure)

    all_axes = fig.get_axes()
    self.assertEqual(len(all_axes), 2)

    # The first axes carries the title and the ID tick labels.
    main_ax = all_axes[0]
    self.assertEqual(main_ax.get_title(), 'Testplot')

    x_labels = [tick.get_text() for tick in main_ax.get_xticklabels()]
    self.assertEqual(x_labels, expected_labels)

    y_labels = [tick.get_text() for tick in main_ax.get_yticklabels()]
    self.assertEqual(y_labels, expected_labels)
def test_from_iterable_skbio_hamming_metric_with_metadata(self):
    # test for #1254
    # The sequences differ in which metadata they carry: none, ``metadata``
    # only, or ``positional_metadata`` only.
    plain = Sequence('ACGT')
    with_id_1 = Sequence('ACGA', metadata={'id': 'seq1'})
    with_id_2 = Sequence('AAAA', metadata={'id': 'seq2'})
    with_quality = Sequence('AAAA', positional_metadata={'qual': range(4)})
    seqs = [plain, with_id_1, with_id_2, with_quality]

    expected_data = [
        [0, 0.25, 0.75, 0.75],
        [0.25, 0.0, 0.5, 0.5],
        [0.75, 0.5, 0.0, 0.0],
        [0.75, 0.5, 0.0, 0.0],
    ]
    expected = DissimilarityMatrix(expected_data, ['a', 'b', 'c', 'd'])

    built = DissimilarityMatrix.from_iterable(
        seqs,
        metric=skbio.sequence.distance.hamming,
        keys=['a', 'b', 'c', 'd'])
    self.assertEqual(built, expected)
def test_init_no_ids(self):
    # Omitting IDs must give the same matrix as passing '0', '1', '2'.
    with_default_ids = DissimilarityMatrix(self.dm_3x3_data)
    with_explicit_ids = DissimilarityMatrix(self.dm_3x3_data,
                                            ('0', '1', '2'))
    self.assertEqual(with_default_ids, with_explicit_ids)

    # The default IDs are usable for lookup.
    self.assertEqual(with_default_ids['1', '2'], 12.0)
def test_eq(self):
    # A DissimilarityMatrix carrying the same data and IDs as the 3x3
    # fixture must compare equal regardless of operand order.
    twin = DissimilarityMatrix(self.dm_3x3_data, ['a', 'b', 'c'])
    for left, right in ((self.dm_3x3, twin), (twin, self.dm_3x3)):
        self.assertTrue(left == right)
def test_to_data_frame_default_ids(self):
    # With no IDs given, the frame is expected to be labeled '0' and '1'
    # on both axes.
    default_ids = ['0', '1']
    expected = pd.DataFrame(
        [[0.0, 0.123], [0.123, 0.0]],
        index=list(default_ids),
        columns=list(default_ids))

    matrix = DissimilarityMatrix(self.dm_2x2_data)
    assert_data_frame_almost_equal(matrix.to_data_frame(), expected)
def test_filter_missing_ids_strict_false(self):
    # With strict=False no exception should be raised for the unknown ID;
    # the result must equal the matrix built for 'c' and 'a' alone.
    present_ids = ('c', 'a')
    expected = DissimilarityMatrix([[0, 4.2], [4.2, 0]], present_ids)

    requested = ['c', 'a', 'not found']
    filtered = self.dm_3x3.filter(requested, strict=False)
    self.assertEqual(filtered, expected)
dm = np.zeros([len(samples),len(samples)]) pm = np.zeros([len(samples),len(samples)]) # fill matrices with values for s1, s2, d, p in zip(mash_vec[0],mash_vec[1],mash_vec[2],mash_vec[3]): i1 = samples.index(s1) i2 = samples.index(s2) print('s1: %s, s2: %s, i1: %s, i2: %s, d: %s, p: %s' % (s1, s2, i1, i2, d, p)) dm[i1,i2] = d dm[i2,i1] = d pm[i1,i2] = p pm[i2,i1] = p ids = [os.path.basename(x) for x in samples] sk_dm = DissimilarityMatrix(dm, ids=ids) sk_pm = DissimilarityMatrix(pm, ids=ids) sk_dm.write(output['dist_matrix']) sk_pm.write(output['p_matrix']) #### Mash rules rule mash: input: expand(mash_dir + '{sample}/mash/{sample}.msh', sample=samples), expand(mash_dir + '{sample}/mash/{sample}.refseq.txt', sample=samples), mash_dir + 'combined_analysis/mash.dist.dm', mash_dir + 'combined_analysis/mash.dist.p'
def test_filter_single_id(self):
    # Filtering down to one ID must give a 1x1 matrix holding zero.
    only_id = ['b']
    expected = DissimilarityMatrix([[0]], only_id)
    self.assertEqual(self.dm_2x2_asym.filter(only_id), expected)
# Build a dissimilarity matrix over five objects labeled A-E:
from skbio.stats.distance import DissimilarityMatrix

labels = ['A', 'B', 'C', 'D', 'E']
distances = [
    [0, 1, 2, 3, 4],
    [1, 0, 1, 2, 3],
    [2, 1, 0, 1, 2],
    [3, 2, 1, 0, 1],
    [4, 3, 2, 1, 0],
]
dm = DissimilarityMatrix(distances, labels)

# Draw the matrix as a heatmap:
fig = dm.plot(cmap='Reds', title='Example heatmap')
def test_from_iterable_single(self):
    # A one-element iterable gives a 1x1 matrix whose only entry is the
    # metric's value.
    def constant_metric(a, b):
        return 100

    expected = DissimilarityMatrix([[100]])
    built = DissimilarityMatrix.from_iterable(["boo"], constant_metric)
    self.assertEqual(built, expected)
def test_constructor(self):
    """Check that ``randdm`` honors the ``constructor`` argument."""
    expected = DissimilarityMatrix(np.asarray([[0.0]]), ['1'])
    generated = randdm(1, constructor=DissimilarityMatrix)

    self.assertEqual(generated, expected)
    # The result must be exactly a DissimilarityMatrix, not a subclass.
    self.assertIs(type(generated), DissimilarityMatrix)
def test_constructor(self):
    # ``randdm`` must build its result with the requested constructor.
    expected = DissimilarityMatrix(np.asarray([[0.0]]), ['1'])
    generated = randdm(1, constructor=DissimilarityMatrix)

    self.assertEqual(generated, expected)
    # The result must be exactly a DissimilarityMatrix, not a subclass.
    self.assertIs(type(generated), DissimilarityMatrix)
def test_init_non_hollow_dm(self):
    # A matrix with a non-zero diagonal must be accepted as-is.
    all_ones = [[1, 1], [1, 1]]
    matrix = DissimilarityMatrix(all_ones, ['a', 'b'])
    self.assertTrue(np.array_equal(matrix.data, all_ones))

    # The hollowness helper must report False for the stored data.
    self.assertEqual(
        skbio.stats.distance._utils.is_hollow(matrix.data), False)
def test_init_non_hollow_dm(self):
    # A matrix with a non-zero diagonal must be accepted as-is.
    all_ones = [[1, 1], [1, 1]]
    matrix = DissimilarityMatrix(all_ones, ['a', 'b'])
    self.assertTrue(np.array_equal(matrix.data, all_ones))