Пример #1
0
 def test_qm_result_matrices(self):
     """qm_result_matrices() yields one result matrix per input matrix.

     Two 2x2 matrices and a four-element mean vector go in; the values of
     both returned matrices are checked.
     """
     m1 = dm.DataMatrix(2, 2, values=[[2, 1], [3, 4]])
     m2 = dm.DataMatrix(2, 2, values=[[6, 5], [4, 3]])
     tmp_mean = np.array([1.0, 2.0, 3.0, 4.0])
     result = dm.qm_result_matrices([m1, m2], tmp_mean)
     # assertEqual is the supported spelling, assertEquals is a deprecated alias
     self.assertEqual(2, len(result))
     qm1 = result[0]
     qm2 = result[1]
     self.assertTrue((qm1.values == [[2, 1], [3, 4]]).all())
     self.assertTrue((qm2.values == [[4, 3], [2, 1]]).all())
Пример #2
0
 def test_as_sorted_flat_values(self):
     """tests that the flat values of the input matrices are
     all put in one big numpy array.

     Four entries are expected, each holding one value per input matrix;
     the NaN of the first matrix is expected in the last entry."""
     m1 = dm.DataMatrix(2, 2, values=[[2, np.nan], [3, 4]])
     m2 = dm.DataMatrix(2, 2, values=[[6, 5], [4, 3]])
     flat_values = as_sorted_flat_values([m1, m2])
     self.assertEqual(4, len(flat_values))
     self.assertTrue((flat_values[0] == [2, 3]).all())
     self.assertTrue((flat_values[1] == [3, 4]).all())
     self.assertTrue((flat_values[2] == [4, 5]).all())
     # NaN never compares equal, so the last entry is checked element-wise
     self.assertTrue(np.isnan(flat_values[3][0]))
     self.assertEqual(6, flat_values[3][1])
Пример #3
0
 def test_residual2(self):
     """checks residual() against a known value for a 3x3 matrix"""
     rows = [[1000, -4000, 7000],
             [-2000, 5000, -8000],
             [3000, -6000, 9000]]
     data = dm.DataMatrix(3, 3, values=rows)
     expected_residual = 4049.38271604938
     self.assertAlmostEqual(expected_residual, data.residual())
Пример #4
0
def compute_row_scores(membership, matrix, num_clusters, config_params):
    """Build the row score matrix for the clusters 1..num_clusters.

    The per-cluster scores come from __compute_row_scores_for_clusters();
    each cluster's scores are then copied into one column of the result.
    The duration of both steps is reported through logging.debug().

    Returns a DataMatrix with matrix.num_rows rows, named like the rows of
    the input matrix, and num_clusters columns.
    """
    scoring_started = util.current_millis()
    per_cluster_scores = __compute_row_scores_for_clusters(
        membership, matrix, num_clusters, config_params)
    # TODO: replace the nan/inf-Values with the quantile-thingy in the R-version

    scoring_seconds = (util.current_millis() - scoring_started) / 1000.0
    logging.debug("__compute_row_scores_for_clusters() in %f s.",
                  scoring_seconds)

    # lay the scores out as rows = genes, columns = clusters
    assembly_started = util.current_millis()
    score_table = np.zeros((matrix.num_rows, num_clusters))

    # the column index is the 0-based cluster index
    for column in xrange(num_clusters):
        score_table[:, column] = per_cluster_scores[column]

    score_matrix = dm.DataMatrix(matrix.num_rows, num_clusters,
                                 row_names=matrix.row_names,
                                 values=score_table)
    assembly_seconds = (util.current_millis() - assembly_started) / 1000.0
    logging.debug("made result matrix in %f s.", assembly_seconds)
    return score_matrix
Пример #5
0
 def test_min(self):
     """tests the min() method.

     The matrix contains -inf and NaN; the expected minimum is 1, so both
     are expected to be skipped."""
     matrix = dm.DataMatrix(2, 2,
                            row_names=['R0', 'R1'],
                            col_names=['C0', 'C1'],
                            values=[[1, -np.inf],
                                    [np.nan, 4]])
     self.assertEqual(1, matrix.min())
Пример #6
0
    def test_quantile_normalize_scores_with_all_defined_weights(self):
        """quantile normalization of two matrices, both weights defined"""
        first = dm.DataMatrix(2, 2, values=[[1, 3], [2, 4]])
        second = dm.DataMatrix(2, 2, values=[[2.3, 2.5], [2.1, 2.31]])
        normalized = dm.quantile_normalize_scores([first, second], [6.0, 1.0])

        # expected cell values, one 2x2 table per returned matrix
        expected_tables = [
            [[0.5785714, 1.45071428], [1.02142857, 1.89285714]],
            [[1.02142857, 1.89285714], [0.5785714, 1.45071428]],
        ]
        for index, expected in enumerate(expected_tables):
            actual = normalized[index].values
            for row in (0, 1):
                for col in (0, 1):
                    self.assertAlmostEqual(expected[row][col],
                                           actual[row][col])
Пример #7
0
 def test_combine_single(self):
     """Test combine with a single matrix.

     There are no assertions: the test passes as long as combine() does
     not raise.
     """
     # the values must be passed by keyword: the third positional parameter
     # of DataMatrix is the list of row names, not the values
     m = dm.DataMatrix(2, 2, values=[[0.1, 0.2], [0.1, 0.2]])
     params = {
         'quantile_normalize': True,
         'debug': {},
         'num_clusters': 42
     }
     s.combine([m], [1.0], None, 1, params)
Пример #8
0
    def test_quantile_normalize_scores_with_no_weights(self):
        """None is passed as weights -> fall back to row means"""
        first = dm.DataMatrix(2, 2, values=[[1, 3], [2, 4]])
        second = dm.DataMatrix(2, 2, values=[[2.3, 2.5], [2.1, 2.31]])
        normalized = dm.quantile_normalize_scores([first, second], None)

        # expected cell values, one 2x2 table per returned matrix
        expected_tables = [
            [[1.55, 2.655], [2.15, 3.25]],
            [[2.15, 3.25], [1.55, 2.655]],
        ]
        for index, expected in enumerate(expected_tables):
            actual = normalized[index].values
            for row in (0, 1):
                for col in (0, 1):
                    self.assertAlmostEqual(expected[row][col],
                                           actual[row][col])
Пример #9
0
    def test_quantile_normalize_scores_with_undefined_weight(self):
        """the weight of the second matrix is NaN"""
        first = dm.DataMatrix(2, 2, values=[[1, 3], [2, 4]])
        second = dm.DataMatrix(2, 2, values=[[2.3, 2.5], [2.1, 2.31]])
        normalized = dm.quantile_normalize_scores([first, second],
                                                  [6.0, np.nan])

        # expected cell values, one 2x2 table per returned matrix
        expected_tables = [
            [[1.0, 3.0], [2.0, 4.0]],
            [[2.0, 4.0], [1.0, 3.0]],
        ]
        for index, expected in enumerate(expected_tables):
            actual = normalized[index].values
            for row in (0, 1):
                for col in (0, 1):
                    self.assertAlmostEqual(expected[row][col],
                                           actual[row][col])
Пример #10
0
 def test_remove_column(self):
     """remove one column.

     nochange_filter() is expected to drop the first column (0.001 and
     NaN) and to keep both rows."""
     matrix = dm.DataMatrix(2,
                            2, ['R1', 'R2'], ['C1', 'C2'],
                            values=[[0.001, -0.35], [np.nan, 0.42]])
     filtered = dm.nochange_filter(matrix)
     self.assertEqual(2, filtered.num_rows)
     self.assertEqual(1, filtered.num_columns)
     self.assertTrue((filtered.values == [[-0.35], [0.42]]).all())
Пример #11
0
 def test_column_values(self):
     """column_values() returns the requested column; writing to the
     returned values is expected to leave the matrix unchanged"""
     matrix = dm.DataMatrix(2, 3, values=[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
     second_column = matrix.column_values(1)
     self.assertTrue((second_column == [2.0, 5.0]).all())

     # modify the returned values only
     second_column[1] = 42.0
     self.assertTrue((second_column == [2.0, 42.0]).all())

     untouched = [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]
     self.assertTrue((matrix.values == untouched).all())
Пример #12
0
 def test_row_values(self):
     """row_values() returns the requested row; writing to the returned
     values is expected to leave the matrix unchanged"""
     matrix = dm.DataMatrix(2, 3, values=[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
     first_row = matrix.row_values(0)
     self.assertTrue((first_row == [1.0, 2.0, 3.0]).all())

     # modify the returned values only
     first_row[0] = 42.0
     self.assertTrue((first_row == [42.0, 2.0, 3.0]).all())

     untouched = [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]
     self.assertTrue((matrix.values == untouched).all())
Пример #13
0
 def test_create_without_names(self):
     """create DataMatrix without row and column names.

     The values are expected to start at 0.0 and the generated names to
     follow the 'Row <index>' / 'Col <index>' pattern."""
     matrix = dm.DataMatrix(3, 4)
     self.assertEqual(3, matrix.num_rows)
     self.assertEqual(4, matrix.num_columns)
     self.assertEqual(0.0, matrix.values[0][0])
     self.assertEqual("Row 0", matrix.row_names[0])
     self.assertEqual("Row 1", matrix.row_names[1])
     self.assertEqual("Col 0", matrix.column_names[0])
     self.assertEqual("Col 1", matrix.column_names[1])
Пример #14
0
 def test_filter(self):
     """center_scale_filter() on a 2x2 matrix: both rows are expected to
     end up as (-x, x)"""
     source = dm.DataMatrix(2, 2, ['R1', 'R2'], ['C1', 'C2'],
                            values=[[2, 3], [3, 4]])
     scaled = dm.center_scale_filter(source).values
     magnitude = 0.70710678237309499
     for row in (0, 1):
         self.assertAlmostEqual(-magnitude, scaled[row][0])
         self.assertAlmostEqual(magnitude, scaled[row][1])
Пример #15
0
 def test_simple(self):
     """simplest test case: everything kept.

     nochange_filter() is expected to return a matrix with the same
     dimensions and values as its input."""
     matrix = dm.DataMatrix(2,
                            2, ['R1', 'R2'], ['C1', 'C2'],
                            values=[[0.24, -0.35], [-0.42, 0.42]])
     filtered = dm.nochange_filter(matrix)
     self.assertEqual(2, filtered.num_rows)
     self.assertEqual(2, filtered.num_columns)
     self.assertTrue((filtered.values == [[0.24, -0.35], [-0.42,
                                                          0.42]]).all())
Пример #16
0
 def test_sorted_by_rowname(self):
     """sorted_by_row_name() orders the rows by name and moves the
     values along with their row names"""
     matrix = dm.DataMatrix(3,
                            3,
                            row_names=['R0', 'R2', 'R1'],
                            col_names=['C0', 'C1', 'C2'],
                            values=[[1, 2, 3], [4, 5, 6], [8, 9, 10]])
     sorted_matrix = matrix.sorted_by_row_name()
     self.assertEqual(sorted_matrix.row_names, ['R0', 'R1', 'R2'])
     self.assertTrue((sorted_matrix.values == [[1, 2, 3], [8, 9, 10],
                                               [4, 5, 6]]).all())
Пример #17
0
 def test_residual_var_normalize(self):
     """checks residual() when the matrix's own row variance is passed as
     max_row_variance. The comparison only uses 4 places because the
     method seems to make rounding errors in the 5th place"""
     rows = [[1000, -4000, 7000],
             [-2000, 5000, -8000],
             [3000, -6000, 9000]]
     data = dm.DataMatrix(3, 3, values=rows)
     row_variance = data.row_variance()
     expected_residual = 0.000105128205128205
     self.assertAlmostEqual(expected_residual,
                            data.residual(max_row_variance=row_variance),
                            places=4)
Пример #18
0
    def test_write_csv(self):
        """write a matrix with write_tsv_file() and read it back with
        create_from_csv(); names and the first value must survive"""
        ratios = dm.DataMatrix(2, 2, ['Gene 1', 'Gene 2'], ['Cond 1', 'Cond 2'],
                                np.array([[1, 2], [3, 4]]))
        ratios.write_tsv_file('/tmp/simple_ratios.tsv', compressed=False)

        # check the written file
        ratios = dm.create_from_csv('/tmp/simple_ratios.tsv')
        self.assertEqual(ratios.row_names, ['Gene 1', 'Gene 2'])
        self.assertEqual(ratios.column_names, ['Cond 1', 'Cond 2'])
        self.assertAlmostEqual(ratios.values[0][0], 1.0)
Пример #19
0
 def test_fix_extreme_values(self):
     """fix_extreme_values() changes the matrix in place; the NaN, inf
     and the two lowest values are expected to be replaced"""
     raw_values = [[-1.01, np.nan], [np.inf, -22.0], [-19.9, -25.3]]
     data = dm.DataMatrix(3, 2,
                          row_names=['R0', 'R1', 'R3'],
                          col_names=['C0', 'C1'],
                          values=raw_values)
     data.fix_extreme_values()
     expected = [[-1.01, -1.01], [-1.01, -19.9], [-19.9, -19.9]]
     self.assertTrue((data.values == expected).all())
Пример #20
0
 def test_submatrix_by_rows(self):
     """test creating sub matrices by providing row indexes.

     Selecting the indexes 1 and 3 is expected to keep the rows 'R1' and
     'R3' together with all columns."""
     matrix = dm.DataMatrix(4,
                            2,
                            row_names=['R0', 'R1', 'R2', 'R3'],
                            col_names=['C0', 'C1'],
                            values=[[1, 2], [3, 4], [5, 6], [7, 8]])
     submatrix = matrix.submatrix_by_rows([1, 3])
     self.assertEqual(submatrix.row_names, ['R1', 'R3'])
     self.assertEqual(submatrix.column_names, ['C0', 'C1'])
     self.assertTrue((submatrix.values == [[3, 4], [7, 8]]).all())
Пример #21
0
 def test_submatrix_by_name_rows_only(self):
     """test creating sub matrices by row names only.

     No column names are given; the selected rows are expected to keep
     all four columns."""
     matrix = dm.DataMatrix(4,
                            4,
                            row_names=['R0', 'R1', 'R2', 'R3'],
                            col_names=['C0', 'C1', 'C2', 'C3'],
                            values=[[1, 2, 3, 4], [4, 5, 6, 7],
                                    [8, 9, 10, 11], [12, 13, 14, 15]])
     submatrix = matrix.submatrix_by_name(row_names=['R0', 'R2'])
     self.assertEqual(submatrix.row_names, ['R0', 'R2'])
     self.assertTrue((submatrix.values == [[1, 2, 3, 4], [8, 9, 10,
                                                          11]]).all())
Пример #22
0
 def test_multiply_column_by(self):
     """tests the multiply_column_by method.

     Column 1 is multiplied by 2; the returned matrix is expected to
     compare equal to the matrix the method was called on."""
     matrix = dm.DataMatrix(2, 2,
                            row_names=['R0', 'R1'],
                            col_names=['C0', 'C1'],
                            values=[[1, 2],
                                    [3, 4]])
     multiplied = matrix.multiply_column_by(1, 2)
     self.assertEqual(multiplied.row_names, ['R0', 'R1'])
     self.assertEqual(multiplied.column_names, ['C0', 'C1'])
     self.assertEqual(matrix, multiplied)
     self.assertTrue((multiplied.values == [[1, 4], [3, 8]]).all())
Пример #23
0
 def test_subtract_with_quantile(self):
     """tests subtract_with_quantile() with the 0.25 quantile.

     The matrix is changed in place: every -1.01 is expected to become
     18.89 and every -19.9 to become 0.0."""
     matrix = dm.DataMatrix(3, 2, values=[[-1.01, -1.01], [-1.01, -19.9], [-19.9, -19.9]])
     mean_before = matrix.mean()
     matrix.subtract_with_quantile(0.25)
     self.assertNotEqual(mean_before, matrix.mean())
     self.assertAlmostEqual(matrix.values[0, 0], 18.89)
     self.assertAlmostEqual(matrix.values[0, 1], 18.89)
     self.assertAlmostEqual(matrix.values[1, 0], 18.89)
     self.assertAlmostEqual(matrix.values[1, 1], 0.0)
     self.assertAlmostEqual(matrix.values[2, 0], 0.0)
     self.assertAlmostEqual(matrix.values[2, 1], 0.0)
Пример #24
0
 def test_create_with_names(self):
     """create DataMatrix with row and column names.

     The given names must be kept, the values are expected to start at
     0.0 and str() of the matrix must not be None."""
     matrix = dm.DataMatrix(3, 2, ["MyRow1", "MyRow2", "MyRow3"],
                            ["MyCol1", "MyCol2"])
     self.assertEqual(3, matrix.num_rows)
     self.assertEqual(2, matrix.num_columns)
     self.assertEqual(0.0, matrix.values[0][0])
     self.assertEqual("MyRow1", matrix.row_names[0])
     self.assertEqual("MyRow2", matrix.row_names[1])
     self.assertEqual("MyCol1", matrix.column_names[0])
     self.assertEqual("MyCol2", matrix.column_names[1])
     self.assertIsNotNone(str(matrix))
Пример #25
0
 def test_submatrix_by_name_rows_and_cols_with_nonexisting(self):
     """test creating sub matrices by row/column name selection
     using non-existing names.

     'R5' and 'C5' do not exist in the matrix and are expected to be
     left out of the result."""
     matrix = dm.DataMatrix(4,
                            4,
                            row_names=['R0', 'R1', 'R2', 'R3'],
                            col_names=['C0', 'C1', 'C2', 'C3'],
                            values=[[1, 2, 3, 4], [4, 5, 6, 7],
                                    [8, 9, 10, 11], [12, 13, 14, 15]])
     submatrix = matrix.submatrix_by_name(row_names=['R0', 'R2', 'R5'],
                                          column_names=['C1', 'C3', 'C5'])
     self.assertEqual(submatrix.row_names, ['R0', 'R2'])
     self.assertEqual(submatrix.column_names, ['C1', 'C3'])
     self.assertTrue((submatrix.values == [[2, 4], [9, 11]]).all())
Пример #26
0
def pvalues2matrix(all_pvalues, num_clusters, gene_names, reverse_map):
    """Turn a {cluster: {feature: pvalue}} map into a scoring matrix.

    The matrix has one row per entry of gene_names and num_clusters columns.
    A p-value goes into column cluster - 1 and into the row of the gene that
    reverse_map assigns to the feature id. apply_log() is called on the
    matrix before it is returned.
    """
    # gene name -> row index
    row_of_gene = dict((gene, position)
                       for position, gene in enumerate(gene_names))

    scores = dm.DataMatrix(len(gene_names), num_clusters, gene_names)
    cells = scores.values
    for cluster in all_pvalues:
        for feature_id, pval in all_pvalues[cluster].items():
            gene = reverse_map[feature_id]
            cells[row_of_gene[gene], cluster - 1] = pval

    scores.apply_log()
    return scores
Пример #27
0
def get_col_density_scores(membership, col_scores):
    """Compute the column density scores for every cluster.

    The bandwidth handed to get_cc_scores() is 1/100 of the range of the
    column scores, but at least 0.001. The result is a new DataMatrix with
    the dimensions and names of col_scores; column cluster - 1 holds the
    get_cc_scores() result for that cluster.
    """
    num_clusters = membership.num_clusters()
    score_span = abs(col_scores.max() - col_scores.min())
    bandwidth = max(score_span / 100.0, 0.001)
    density = dm.DataMatrix(col_scores.num_rows, col_scores.num_columns,
                            col_scores.row_names, col_scores.column_names)
    # rows of the transposed view are the columns of the result, so a whole
    # cluster column can be assigned in one step
    columns = density.values.T

    started = util.current_millis()
    for index in xrange(num_clusters):
        columns[index] = get_cc_scores(membership, col_scores,
                                       bandwidth, index + 1)

    seconds = (util.current_millis() - started) / 1000.0
    logging.debug("CC_SCORES IN %f s.", seconds)
    return density
Пример #28
0
def get_row_density_scores(membership, row_scores):
    """Compute the row density scores for every cluster.

    According to the original note, using density scores improves small
    clusters. The bandwidth handed to get_rr_scores() is 1/100 of the range
    of the row scores, but at least 0.001. The result is a new DataMatrix
    with the dimensions and names of row_scores; column cluster - 1 holds
    the get_rr_scores() result for that cluster.
    """
    num_clusters = membership.num_clusters()
    score_span = abs(row_scores.max() - row_scores.min())
    bandwidth = max(score_span / 100.0, 0.001)
    density = dm.DataMatrix(row_scores.num_rows, row_scores.num_columns,
                            row_scores.row_names, row_scores.column_names)
    # rows of the transposed view are the columns of the result, so a whole
    # cluster column can be assigned in one step
    columns = density.values.T

    started = util.current_millis()
    for index in xrange(num_clusters):
        columns[index] = get_rr_scores(membership, row_scores,
                                       bandwidth, index + 1)

    seconds = (util.current_millis() - started) / 1000.0
    logging.debug("RR_SCORES IN %f s.", seconds)
    return density
Пример #29
0
    def do_compute(self, iteration_result, ref_matrix=None):
        """Score all networks and collect the result in one matrix.

        Neither iteration_result nor ref_matrix is read by this method.
        For every network the cluster scores are computed, stored under the
        network's name and handed to __update_score_matrix() together with
        the network's weight. Afterwards self.score_means is set from the
        collected per-network scores.

        Returns the DataMatrix with one row per gene name and one column per
        cluster.
        """
        result = dm.DataMatrix(len(self.gene_names()), self.num_clusters(),
                               self.gene_names())
        scores_by_network = {}
        for net in self.networks():
            logging.debug("Compute scores for network '%s', WEIGHT: %f",
                          net.name, net.weight)
            started = util.current_millis()
            cluster_scores = self.__compute_network_cluster_scores(net)
            scores_by_network[net.name] = cluster_scores
            self.__update_score_matrix(result, cluster_scores, net.weight)
            elapsed_millis = util.current_millis() - started
            logging.debug("NETWORK '%s' SCORING TIME: %f s.",
                          net.name, (elapsed_millis / 1000.0))

        # compute and store score means
        self.score_means = self.__update_score_means(scores_by_network)
        return result
Пример #30
0
    def do_compute(self, iteration_result, ref_matrix):
        """Compute the set enrichment scores for every cluster and set type.

        For each set type, compute_cluster_score() is run for the clusters
        1..num_clusters (through a process pool if multiprocessing is
        configured). The returned scores are added to the result matrix,
        scaled by the set type's weight. One line with the best sets and one
        with their p-values is then written to 'setEnrichment_set.csv' and
        'setEnrichment_pvalue.csv' in the configured output directory.

        The module-level SET_* globals are set for the duration of the call
        (presumably they are read by compute_cluster_score() - confirm) and
        reset to None at the end. The CANONICAL_* globals are filled on the
        first call and kept.

        iteration_result -- mapping, its 'iteration' entry labels the lines
                            written to the CSV files
        ref_matrix -- matrix whose min() is passed to compute_cluster_score()

        Returns the DataMatrix (genes x clusters) with the weighted scores.

        NOTE(review): the previous docstring said this returns None or the
        result of a previous scoring when it is not supposed to run in this
        iteration. Nothing in this method does that, presumably the caller
        takes care of it - confirm.
        """
        global SET_MATRIX, SET_MEMBERSHIP, SET_SET_TYPE, SET_SYNONYMS, CANONICAL_ROWNAMES, CANONICAL_ROW_INDEXES
        logging.info("Compute scores for set enrichment...")
        start_time = util.current_millis()
        gene_names = self.gene_names()
        num_genes = len(gene_names)
        num_clusters = self.num_clusters()
        matrix = dm.DataMatrix(num_genes, num_clusters, gene_names)
        use_multiprocessing = self.config_params[scoring.KEY_MULTIPROCESSING]
        SET_MATRIX = self.ratios
        SET_MEMBERSHIP = self.membership
        SET_SYNONYMS = self.organism.thesaurus()

        if CANONICAL_ROWNAMES is None:
            CANONICAL_ROWNAMES = {
                SET_SYNONYMS[name] if name in SET_SYNONYMS else name
                for name in self.ratios.row_names}

        if CANONICAL_ROW_INDEXES is None:
            CANONICAL_ROW_INDEXES = {}
            for index, row in enumerate(self.ratios.row_names):
                canonical = SET_SYNONYMS[row] if row in SET_SYNONYMS else row
                CANONICAL_ROW_INDEXES[canonical] = index

        ref_min_score = ref_matrix.min()
        logging.info('REF_MIN_SCORE: %f', ref_min_score)

        set_filepath = os.path.join(self.config_params['output_dir'],
                                    'setEnrichment_set.csv')
        pval_filepath = os.path.join(self.config_params['output_dir'],
                                     'setEnrichment_pvalue.csv')
        # header line of both CSV files: empty first cell, then cluster numbers
        header = ',' + ','.join(
            [str(i) for i in xrange(1, num_clusters + 1)])

        for set_type in self.__set_types:
            SET_SET_TYPE = set_type
            logging.info("PROCESSING SET TYPE '%s'", set_type.name)
            start1 = util.current_millis()
            cluster_args = [(cluster, self.bonferroni_cutoff(), ref_min_score)
                            for cluster in xrange(1, num_clusters + 1)]
            if use_multiprocessing:
                with util.get_mp_pool(self.config_params) as pool:
                    results = pool.map(compute_cluster_score, cluster_args)
            else:
                results = [compute_cluster_score(args)
                           for args in cluster_args]

            elapsed1 = util.current_millis() - start1
            logging.info("ENRICHMENT SCORES COMPUTED in %f s, STORING...",
                         elapsed1 / 1000.0)

            min_sets = []
            pvalues = []
            weight = set_type.weight
            # results[i] belongs to cluster i + 1 and fills matrix column i
            for column, (scores, min_set, min_pvalue) in enumerate(results):
                # store the best enriched set determined
                min_sets.append(min_set)
                pvalues.append(min_pvalue)

                for row in xrange(num_genes):
                    matrix.values[row][column] += scores[row] * weight

            # a missing set file marks the first write: both files are then
            # started from scratch and get the header line
            is_first_write = not os.path.exists(set_filepath)
            mode = 'w' if is_first_write else 'a'
            iteration = str(iteration_result['iteration'])
            # the with statement closes both files even if a write fails
            with open(set_filepath, mode) as set_file, \
                    open(pval_filepath, mode) as pval_file:
                if is_first_write:
                    set_file.write(header)
                    pval_file.write(header)
                set_file.write('\n' + iteration + ',' +
                               ','.join([str(i) for i in min_sets]))
                pval_file.write('\n' + iteration + ',' +
                                ','.join([str(i) for i in pvalues]))

        logging.info("SET ENRICHMENT FINISHED IN %f s.\n",
                     (util.current_millis() - start_time) / 1000.0)
        # cleanup
        SET_SET_TYPE = None
        SET_MATRIX = None
        SET_MEMBERSHIP = None
        SET_SYNONYMS = None

        return matrix