def test_qm_result_matrices(self):
    """tests that qm_result_matrices() returns one result matrix per input
    matrix. The expected values below are consistent with each input value
    being replaced by the tmp_mean entry at the value's rank within its own
    matrix"""
    m1 = dm.DataMatrix(2, 2, values=[[2, 1], [3, 4]])
    m2 = dm.DataMatrix(2, 2, values=[[6, 5], [4, 3]])
    tmp_mean = np.array([1.0, 2.0, 3.0, 4.0])
    result = dm.qm_result_matrices([m1, m2], tmp_mean)

    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12
    self.assertEqual(2, len(result))
    qm1 = result[0]
    qm2 = result[1]
    self.assertTrue((qm1.values == [[2, 1], [3, 4]]).all())
    self.assertTrue((qm2.values == [[4, 3], [2, 1]]).all())
def test_as_sorted_flat_values(self):
    """tests that the flat values of the input matrices are all put in one
    big numpy array"""
    m1 = dm.DataMatrix(2, 2, values=[[2, np.nan], [3, 4]])
    m2 = dm.DataMatrix(2, 2, values=[[6, 5], [4, 3]])
    # NOTE(review): called unqualified here, while the other helpers are
    # reached through the dm module - confirm this name is imported directly
    flat_values = as_sorted_flat_values([m1, m2])

    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12
    self.assertEqual(4, len(flat_values))
    self.assertTrue((flat_values[0] == [2, 3]).all())
    self.assertTrue((flat_values[1] == [3, 4]).all())
    self.assertTrue((flat_values[2] == [4, 5]).all())
    # NaN never compares equal to itself, so the last row has to be
    # checked element by element
    self.assertTrue(np.isnan(flat_values[3][0]))
    self.assertEqual(6, flat_values[3][1])
def test_residual2(self):
    """checks residual() against a precomputed value for a 3x3 matrix
    with entries of alternating sign"""
    cell_values = [[1000, -4000, 7000],
                   [-2000, 5000, -8000],
                   [3000, -6000, 9000]]
    matrix = dm.DataMatrix(3, 3, values=cell_values)
    expected_residual = 4049.38271604938
    self.assertAlmostEqual(expected_residual, matrix.residual())
def compute_row_scores(membership, matrix, num_clusters, config_params):
    """computes the row scores of every cluster and returns them as a
    DataMatrix that has one row per row of the input matrix and one column
    per cluster (column 0 holds the scores of the first cluster)"""
    started = util.current_millis()
    scores_per_cluster = __compute_row_scores_for_clusters(
        membership, matrix, num_clusters, config_params)
    # TODO: replace the nan/inf-Values with the quantile-thingy in the R-version
    compute_secs = (util.current_millis() - started) / 1000.0
    logging.debug("__compute_row_scores_for_clusters() in %f s.", compute_secs)

    # turn the per-cluster score vectors into the columns of one table
    started = util.current_millis()
    score_table = np.zeros((matrix.num_rows, num_clusters))
    for column in xrange(num_clusters):
        score_table[:, column] = scores_per_cluster[column]
    row_score_matrix = dm.DataMatrix(matrix.num_rows, num_clusters,
                                     row_names=matrix.row_names,
                                     values=score_table)
    assemble_secs = (util.current_millis() - started) / 1000.0
    logging.debug("made result matrix in %f s.", assemble_secs)
    return row_score_matrix
def test_min(self):
    """tests the min() method: the expected minimum is 1 although the
    matrix also holds -inf and NaN entries"""
    matrix = dm.DataMatrix(2, 2,
                           row_names=['R0', 'R1'],
                           col_names=['C0', 'C1'],
                           values=[[1, -np.inf], [np.nan, 4]])
    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12
    self.assertEqual(1, matrix.min())
def test_quantile_normalize_scores_with_all_defined_weights(self):
    """quantile normalization of two matrices when every weight is set"""
    first = dm.DataMatrix(2, 2, values=[[1, 3], [2, 4]])
    second = dm.DataMatrix(2, 2, values=[[2.3, 2.5], [2.1, 2.31]])
    normalized = dm.quantile_normalize_scores([first, second], [6.0, 1.0])

    # expected cell values, one 2x2 table per output matrix
    expected_tables = [
        [[0.5785714, 1.45071428], [1.02142857, 1.89285714]],
        [[1.02142857, 1.89285714], [0.5785714, 1.45071428]],
    ]
    for index, expected in enumerate(expected_tables):
        actual = normalized[index].values
        for row in (0, 1):
            for col in (0, 1):
                self.assertAlmostEqual(expected[row][col], actual[row][col])
def test_combine_single(self):
    """runs combine() on a list that holds only one matrix"""
    # NOTE(review): the nested list sits in the third positional slot, which
    # the other DataMatrix(...) calls in this file use for the row names -
    # confirm whether values=... was intended here
    single = dm.DataMatrix(2, 2, [[0.1, 0.2], [0.1, 0.2]])
    config_params = {
        'quantile_normalize': True,
        'debug': {},
        'num_clusters': 42
    }
    # NOTE(review): the result is never asserted on, so this test only
    # shows that combine() does not raise for a single matrix
    result = s.combine([single], [1.0], None, 1, config_params)
def test_quantile_normalize_scores_with_no_weights(self):
    """quantile normalization when None is passed instead of weights"""
    first = dm.DataMatrix(2, 2, values=[[1, 3], [2, 4]])
    second = dm.DataMatrix(2, 2, values=[[2.3, 2.5], [2.1, 2.31]])
    normalized = dm.quantile_normalize_scores([first, second], None)

    # cells in row-major order: (row, column, expected value)
    expected_first = [(0, 0, 1.55), (0, 1, 2.655), (1, 0, 2.15), (1, 1, 3.25)]
    expected_second = [(0, 0, 2.15), (0, 1, 3.25), (1, 0, 1.55), (1, 1, 2.655)]

    actual_first = normalized[0].values
    for row, col, expected in expected_first:
        self.assertAlmostEqual(expected, actual_first[row][col])

    actual_second = normalized[1].values
    for row, col, expected in expected_second:
        self.assertAlmostEqual(expected, actual_second[row][col])
def test_quantile_normalize_scores_with_undefined_weight(self):
    """quantile normalization where the second weight is NaN"""
    first = dm.DataMatrix(2, 2, values=[[1, 3], [2, 4]])
    second = dm.DataMatrix(2, 2, values=[[2.3, 2.5], [2.1, 2.31]])
    weights = [6.0, np.nan]
    normalized = dm.quantile_normalize_scores([first, second], weights)

    # expected cell values, one 2x2 table per output matrix
    expected_tables = (
        ((1.0, 3.0), (2.0, 4.0)),
        ((2.0, 4.0), (1.0, 3.0)),
    )
    for position, expected in enumerate(expected_tables):
        actual = normalized[position].values
        for row in range(2):
            for col in range(2):
                self.assertAlmostEqual(expected[row][col], actual[row][col])
def test_remove_column(self):
    """remove one column: of the two input columns only the second one
    is expected to be left after nochange_filter()"""
    matrix = dm.DataMatrix(2, 2, ['R1', 'R2'], ['C1', 'C2'],
                           values=[[0.001, -0.35], [np.nan, 0.42]])
    filtered = dm.nochange_filter(matrix)

    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12
    self.assertEqual(2, filtered.num_rows)
    self.assertEqual(1, filtered.num_columns)
    self.assertTrue((filtered.values == [[-0.35], [0.42]]).all())
def test_column_values(self):
    """column_values() returns the requested column, and writing to the
    returned array must leave the matrix itself untouched"""
    matrix = dm.DataMatrix(2, 3, values=[[1.0, 2.0, 3.0],
                                         [4.0, 5.0, 6.0]])
    unchanged = [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]

    second_column = matrix.column_values(1)
    self.assertTrue((second_column == [2.0, 5.0]).all())

    # modify the returned column and verify that only the copy changed
    second_column[1] = 42.0
    self.assertTrue((second_column == [2.0, 42.0]).all())
    self.assertTrue((matrix.values == unchanged).all())
def test_row_values(self):
    """row_values() returns the requested row, and writing to the
    returned array must leave the matrix itself untouched"""
    matrix = dm.DataMatrix(2, 3, values=[[1.0, 2.0, 3.0],
                                         [4.0, 5.0, 6.0]])
    unchanged = [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]

    first_row = matrix.row_values(0)
    self.assertTrue((first_row == [1.0, 2.0, 3.0]).all())

    # modify the returned row and verify that only the copy changed
    first_row[0] = 42.0
    self.assertTrue((first_row == [42.0, 2.0, 3.0]).all())
    self.assertTrue((matrix.values == unchanged).all())
def test_create_without_names(self):
    """create DataMatrix without row and column names: the cells are
    expected to start at 0.0 and the names to default to 'Row <n>' and
    'Col <n>'"""
    matrix = dm.DataMatrix(3, 4)

    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12
    self.assertEqual(3, matrix.num_rows)
    self.assertEqual(4, matrix.num_columns)
    self.assertEqual(0.0, matrix.values[0][0])
    self.assertEqual("Row 0", matrix.row_names[0])
    self.assertEqual("Row 1", matrix.row_names[1])
    self.assertEqual("Col 0", matrix.column_names[0])
    self.assertEqual("Col 1", matrix.column_names[1])
def test_filter(self):
    """center_scale_filter() on a 2x2 matrix: in both rows the first cell
    is expected to become the negative and the second cell the positive
    of the same magnitude"""
    matrix = dm.DataMatrix(2, 2, ['R1', 'R2'], ['C1', 'C2'],
                           values=[[2, 3], [3, 4]])
    scaled = dm.center_scale_filter(matrix).values
    magnitude = 0.70710678237309499
    for row in (0, 1):
        self.assertAlmostEqual(-magnitude, scaled[row][0])
        self.assertAlmostEqual(magnitude, scaled[row][1])
def test_simple(self):
    """simplest test case: everything kept"""
    matrix = dm.DataMatrix(2, 2, ['R1', 'R2'], ['C1', 'C2'],
                           values=[[0.24, -0.35], [-0.42, 0.42]])
    filtered = dm.nochange_filter(matrix)

    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12
    self.assertEqual(2, filtered.num_rows)
    self.assertEqual(2, filtered.num_columns)
    self.assertTrue((filtered.values == [[0.24, -0.35],
                                         [-0.42, 0.42]]).all())
def test_sorted_by_rowname(self):
    """tests sorted_by_row_name(): the row names come back in sorted
    order and each row's values move together with its name"""
    matrix = dm.DataMatrix(3, 3,
                           row_names=['R0', 'R2', 'R1'],
                           col_names=['C0', 'C1', 'C2'],
                           values=[[1, 2, 3], [4, 5, 6], [8, 9, 10]])
    sorted_matrix = matrix.sorted_by_row_name()

    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12
    self.assertEqual(sorted_matrix.row_names, ['R0', 'R1', 'R2'])
    self.assertTrue((sorted_matrix.values == [[1, 2, 3],
                                              [8, 9, 10],
                                              [4, 5, 6]]).all())
def test_residual_var_normalize(self):
    """checks residual() when the matrix' own row_variance() is passed as
    max_row_variance. The comparison is limited to 4 places because the
    method seems to make rounding errors in the 5th place"""
    cell_values = [[1000, -4000, 7000],
                   [-2000, 5000, -8000],
                   [3000, -6000, 9000]]
    matrix = dm.DataMatrix(3, 3, values=cell_values)
    row_variance = matrix.row_variance()
    normalized_residual = matrix.residual(max_row_variance=row_variance)
    self.assertAlmostEqual(0.000105128205128205, normalized_residual,
                           places=4)
def test_write_csv(self):
    """writes a matrix with write_tsv_file() and reads it back with
    create_from_csv(), checking the names and the first value"""
    # imported locally so that the test does not depend on the
    # module-level import list
    import os
    import shutil
    import tempfile

    ratios = dm.DataMatrix(2, 2, ['Gene 1', 'Gene 2'], ['Cond 1', 'Cond 2'],
                           np.array([[1, 2], [3, 4]]))

    # use a private temporary directory instead of a fixed path under /tmp,
    # so that runs cannot collide and no file is left behind
    tmp_dir = tempfile.mkdtemp()
    try:
        path = os.path.join(tmp_dir, 'simple_ratios.tsv')
        ratios.write_tsv_file(path, compressed=False)
        # check the written file
        ratios = dm.create_from_csv(path)
    finally:
        shutil.rmtree(tmp_dir, ignore_errors=True)

    # assertEqual/assertAlmostEqual replace the deprecated aliases
    # assertEquals/assertAlmostEquals, which were removed in Python 3.12
    self.assertEqual(ratios.row_names, ['Gene 1', 'Gene 2'])
    self.assertEqual(ratios.column_names, ['Cond 1', 'Cond 2'])
    self.assertAlmostEqual(ratios.values[0][0], 1.0)
def test_fix_extreme_values(self):
    """checks the in-place adjustment done by fix_extreme_values(): in
    this fixture the NaN, the inf and the two smallest entries are all
    expected to be replaced, while -1.01 and -19.9 stay as they are"""
    before = [[-1.01, np.nan],
              [np.inf, -22.0],
              [-19.9, -25.3]]
    after = [[-1.01, -1.01],
             [-1.01, -19.9],
             [-19.9, -19.9]]
    matrix = dm.DataMatrix(3, 2,
                           row_names=['R0', 'R1', 'R3'],
                           col_names=['C0', 'C1'],
                           values=before)
    matrix.fix_extreme_values()
    cells_match = matrix.values == after
    self.assertTrue(cells_match.all())
def test_submatrix_by_rows(self):
    """test creating sub matrices by providing row indexes"""
    matrix = dm.DataMatrix(4, 2,
                           row_names=['R0', 'R1', 'R2', 'R3'],
                           col_names=['C0', 'C1'],
                           values=[[1, 2], [3, 4], [5, 6], [7, 8]])
    submatrix = matrix.submatrix_by_rows([1, 3])

    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12
    self.assertEqual(submatrix.row_names, ['R1', 'R3'])
    self.assertEqual(submatrix.column_names, ['C0', 'C1'])
    self.assertTrue((submatrix.values == [[3, 4], [7, 8]]).all())
def test_submatrix_by_name_rows_only(self):
    """test creating sub matrices by row names only: all columns of the
    selected rows are expected in the result"""
    matrix = dm.DataMatrix(4, 4,
                           row_names=['R0', 'R1', 'R2', 'R3'],
                           col_names=['C0', 'C1', 'C2', 'C3'],
                           values=[[1, 2, 3, 4],
                                   [4, 5, 6, 7],
                                   [8, 9, 10, 11],
                                   [12, 13, 14, 15]])
    submatrix = matrix.submatrix_by_name(row_names=['R0', 'R2'])

    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12
    self.assertEqual(submatrix.row_names, ['R0', 'R2'])
    self.assertTrue((submatrix.values == [[1, 2, 3, 4],
                                          [8, 9, 10, 11]]).all())
def test_multiply_column_by(self):
    """tests the multiply_column_by method: column 1 is multiplied by 2
    while the names stay the same"""
    matrix = dm.DataMatrix(2, 2,
                           row_names=['R0', 'R1'],
                           col_names=['C0', 'C1'],
                           values=[[1, 2], [3, 4]])
    multiplied = matrix.multiply_column_by(1, 2)

    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12
    self.assertEqual(multiplied.row_names, ['R0', 'R1'])
    self.assertEqual(multiplied.column_names, ['C0', 'C1'])
    # the return value has to compare equal to the receiver
    # NOTE(review): presumably the method works in place and returns the
    # matrix itself - confirm against DataMatrix
    self.assertEqual(matrix, multiplied)
    self.assertTrue((multiplied.values == [[1, 4], [3, 8]]).all())
def test_subtract_with_quantile(self):
    """tests subtract_with_quantile(): the matrix is changed in place. For
    this fixture the expected results correspond to -19.9 being subtracted
    from every cell (-1.01 becomes 18.89, -19.9 becomes 0.0)"""
    matrix = dm.DataMatrix(3, 2, values=[[-1.01, -1.01],
                                         [-1.01, -19.9],
                                         [-19.9, -19.9]])
    mean_before = matrix.mean()
    matrix.subtract_with_quantile(0.25)

    # assertNotEqual/assertAlmostEqual replace the deprecated aliases
    # assertNotEquals/assertAlmostEquals, which were removed in Python 3.12
    self.assertNotEqual(mean_before, matrix.mean())
    self.assertAlmostEqual(matrix.values[0, 0], 18.89)
    self.assertAlmostEqual(matrix.values[0, 1], 18.89)
    self.assertAlmostEqual(matrix.values[1, 0], 18.89)
    self.assertAlmostEqual(matrix.values[1, 1], 0.0)
    self.assertAlmostEqual(matrix.values[2, 0], 0.0)
    self.assertAlmostEqual(matrix.values[2, 1], 0.0)
def test_create_with_names(self):
    """create DataMatrix with row and column names"""
    matrix = dm.DataMatrix(3, 2,
                           ["MyRow1", "MyRow2", "MyRow3"],
                           ["MyCol1", "MyCol2"])

    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12
    self.assertEqual(3, matrix.num_rows)
    self.assertEqual(2, matrix.num_columns)
    self.assertEqual(0.0, matrix.values[0][0])
    self.assertEqual("MyRow1", matrix.row_names[0])
    self.assertEqual("MyRow2", matrix.row_names[1])
    self.assertEqual("MyCol1", matrix.column_names[0])
    self.assertEqual("MyCol2", matrix.column_names[1])
    # the string conversion only has to work, its content is not checked
    self.assertIsNotNone(str(matrix))
def test_submatrix_by_name_rows_and_cols_with_nonexisting(self):
    """test creating sub matrices by row/column name selection using
    non-existing names: 'R5' and 'C5' are not in the matrix and are
    expected to be left out of the result"""
    matrix = dm.DataMatrix(4, 4,
                           row_names=['R0', 'R1', 'R2', 'R3'],
                           col_names=['C0', 'C1', 'C2', 'C3'],
                           values=[[1, 2, 3, 4],
                                   [4, 5, 6, 7],
                                   [8, 9, 10, 11],
                                   [12, 13, 14, 15]])
    submatrix = matrix.submatrix_by_name(row_names=['R0', 'R2', 'R5'],
                                         column_names=['C1', 'C3', 'C5'])

    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12
    self.assertEqual(submatrix.row_names, ['R0', 'R2'])
    self.assertEqual(submatrix.column_names, ['C1', 'C3'])
    self.assertTrue((submatrix.values == [[2, 4], [9, 11]]).all())
def pvalues2matrix(all_pvalues, num_clusters, gene_names, reverse_map):
    """builds a gene x cluster scoring matrix out of a map of the form
    {cluster: {feature: pvalue}}. Clusters are 1-based, each feature id is
    translated to its gene through reverse_map, and apply_log() is called
    on the filled matrix before it is returned"""
    row_of_gene = dict((gene, row) for row, gene in enumerate(gene_names))

    # fill the cells directly through the matrix' value array
    score_matrix = dm.DataMatrix(len(gene_names), num_clusters, gene_names)
    cells = score_matrix.values
    for cluster in all_pvalues:
        for feature_id, pval in all_pvalues[cluster].items():
            gene = reverse_map[feature_id]
            cells[row_of_gene[gene], cluster - 1] = pval

    score_matrix.apply_log()
    return score_matrix
def get_col_density_scores(membership, col_scores):
    """computes the density scores for the column scores: the result has
    the same dimensions and names as col_scores, and the column of each
    cluster is filled with the output of get_cc_scores()"""
    num_clusters = membership.num_clusters()

    # bandwidth is 1/100 of the score range, but never less than 0.001
    score_span = abs(col_scores.max() - col_scores.min())
    bandwidth = max(score_span / 100.0, 0.001)

    density = dm.DataMatrix(col_scores.num_rows, col_scores.num_columns,
                            col_scores.row_names, col_scores.column_names)
    cells = density.values

    started = util.current_millis()
    for cluster in xrange(1, num_clusters + 1):
        # clusters are 1-based, matrix columns are 0-based
        cells[:, cluster - 1] = get_cc_scores(membership, col_scores,
                                              bandwidth, cluster)
    seconds = (util.current_millis() - started) / 1000.0
    logging.debug("CC_SCORES IN %f s.", seconds)
    return density
def get_row_density_scores(membership, row_scores):
    """computes the density scores for the row scores (getting density
    scores improves small clusters): the result has the same dimensions and
    names as row_scores, and the column of each cluster is filled with the
    output of get_rr_scores()"""
    num_clusters = membership.num_clusters()

    # bandwidth is 1/100 of the score range, but never less than 0.001
    score_span = abs(row_scores.max() - row_scores.min())
    bandwidth = max(score_span / 100.0, 0.001)

    density = dm.DataMatrix(row_scores.num_rows, row_scores.num_columns,
                            row_scores.row_names, row_scores.column_names)
    cells = density.values

    started = util.current_millis()
    for cluster in xrange(1, num_clusters + 1):
        # clusters are 1-based, matrix columns are 0-based
        cells[:, cluster - 1] = get_rr_scores(membership, row_scores,
                                              bandwidth, cluster)
    seconds = (util.current_millis() - started) / 1000.0
    logging.debug("RR_SCORES IN %f s.", seconds)
    return density
def do_compute(self, iteration_result, ref_matrix=None):
    """computes the gene x cluster score matrix over all networks. Each
    network's cluster scores are merged into the matrix using the network's
    weight, and the score means are stored in self.score_means.
    iteration_result and ref_matrix are not used by this method"""
    matrix = dm.DataMatrix(len(self.gene_names()), self.num_clusters(),
                           self.gene_names())
    scores_by_network = {}

    for network in self.networks():
        logging.debug("Compute scores for network '%s', WEIGHT: %f",
                      network.name, network.weight)
        started = util.current_millis()
        cluster_scores = self.__compute_network_cluster_scores(network)
        scores_by_network[network.name] = cluster_scores
        self.__update_score_matrix(matrix, cluster_scores, network.weight)
        seconds = (util.current_millis() - started) / 1000.0
        logging.debug("NETWORK '%s' SCORING TIME: %f s.",
                      network.name, seconds)

    # the means are derived from the per-network scores and kept on self
    self.score_means = self.__update_score_means(scores_by_network)
    return matrix
def do_compute(self, iteration_result, ref_matrix):
    """computes the set enrichment score matrix.

    For every set type the enrichment scores of all clusters are computed
    (optionally through a multiprocessing pool), multiplied by the set
    type's weight and added to a gene x cluster matrix. The best enriched
    set and its p-value per cluster are appended as one line each to
    'setEnrichment_set.csv' and 'setEnrichment_pvalue.csv' in the
    configured output directory; a header line is written first when the
    set file does not exist yet.

    iteration_result -- map that has to provide the key 'iteration'
    ref_matrix -- matrix whose min() is handed to the cluster scoring
    returns the accumulated score matrix
    """
    global SET_MATRIX, SET_MEMBERSHIP, SET_SET_TYPE, SET_SYNONYMS, CANONICAL_ROWNAMES, CANONICAL_ROW_INDEXES
    logging.info("Compute scores for set enrichment...")
    start_time = util.current_millis()
    matrix = dm.DataMatrix(len(self.gene_names()), self.num_clusters(),
                           self.gene_names())
    use_multiprocessing = self.config_params[scoring.KEY_MULTIPROCESSING]

    # the scoring function reads its inputs from these module globals
    SET_MATRIX = self.ratios
    SET_MEMBERSHIP = self.membership
    SET_SYNONYMS = self.organism.thesaurus()

    # the canonical names/indexes are only built once and are kept
    # across calls (they are not reset in the cleanup below)
    if CANONICAL_ROWNAMES is None:
        CANONICAL_ROWNAMES = {SET_SYNONYMS[n] if n in SET_SYNONYMS else n
                              for n in self.ratios.row_names}

    if CANONICAL_ROW_INDEXES is None:
        CANONICAL_ROW_INDEXES = {}
        for index, row in enumerate(self.ratios.row_names):
            if row in SET_SYNONYMS:
                CANONICAL_ROW_INDEXES[SET_SYNONYMS[row]] = index
            else:
                CANONICAL_ROW_INDEXES[row] = index

    ref_min_score = ref_matrix.min()
    logging.info('REF_MIN_SCORE: %f', ref_min_score)

    set_filepath = os.path.join(self.config_params['output_dir'],
                                'setEnrichment_set.csv')
    pval_filepath = os.path.join(self.config_params['output_dir'],
                                 'setEnrichment_pvalue.csv')

    for set_type in self.__set_types:
        SET_SET_TYPE = set_type
        logging.info("PROCESSING SET TYPE '%s'", set_type.name)
        start1 = util.current_millis()
        # clusters are 1-based
        score_args = [(cluster, self.bonferroni_cutoff(), ref_min_score)
                      for cluster in xrange(1, self.num_clusters() + 1)]
        if use_multiprocessing:
            with util.get_mp_pool(self.config_params) as pool:
                results = pool.map(compute_cluster_score, score_args)
        else:
            results = [compute_cluster_score(args) for args in score_args]

        elapsed1 = util.current_millis() - start1
        logging.info("ENRICHMENT SCORES COMPUTED in %f s, STORING...",
                     elapsed1 / 1000.0)

        # accumulate the weighted scores before any file is touched
        min_sets = []
        p_values = []
        for cluster in xrange(1, self.num_clusters() + 1):
            # store the best enriched set determined
            scores, min_set, min_pvalue = results[cluster - 1]
            min_sets.append(min_set)
            p_values.append(min_pvalue)

            for row in xrange(len(self.gene_names())):
                matrix.values[row][cluster - 1] += scores[row] * set_type.weight

        # as before, the existence of the set file alone decides whether
        # both files are created with a header or appended to
        write_header = not os.path.exists(set_filepath)
        mode = 'w' if write_header else 'a'
        # the with statement closes both files even if a write fails
        with open(set_filepath, mode) as set_file, open(pval_filepath, mode) as pv_file:
            if write_header:
                header = ',' + ','.join(
                    [str(i) for i in xrange(1, self.num_clusters() + 1)])
                set_file.write(header)
                pv_file.write(header)
            iteration = str(iteration_result['iteration'])
            set_file.write('\n' + iteration + ',' +
                           ','.join([str(i) for i in min_sets]))
            pv_file.write('\n' + iteration + ',' +
                          ','.join([str(i) for i in p_values]))

    logging.info("SET ENRICHMENT FINISHED IN %f s.\n",
                 (util.current_millis() - start_time) / 1000.0)
    # cleanup
    SET_SET_TYPE = None
    SET_MATRIX = None
    SET_MEMBERSHIP = None
    SET_SYNONYMS = None

    return matrix