示例#1
0
    def test_quantile_normalize(self):
        """quantile normalization should reproduce the reference matrices"""
        inputs = [
            read_matrix('testdata/rowscores_fixed.tsv'),
            read_matrix('testdata/motscores_fixed.tsv'),
            read_matrix('testdata/netscores_fixed.tsv'),
        ]
        references = [
            read_matrix('testdata/rowscores_qnorm.tsv'),
            read_matrix('testdata/motscores_qnorm.tsv'),
            read_matrix('testdata/netscores_qnorm.tsv'),
        ]
        # scaling for cluster 49
        scalings = [6.0, 0.033355570380253496, 0.016677785190126748]
        normalized = dm.quantile_normalize_scores(inputs, scalings)
        for actual, reference in zip(normalized, references):
            self.assertTrue(check_matrix_values(actual, reference))
示例#2
0
    def test_quantile_normalize(self):
        """check quantile normalization against precomputed reference files"""
        in_matrices = [
            read_matrix('testdata/rowscores_fixed.tsv'),
            read_matrix('testdata/motscores_fixed.tsv'),
            read_matrix('testdata/netscores_fixed.tsv'),
        ]
        expected = [
            read_matrix('testdata/rowscores_qnorm.tsv'),
            read_matrix('testdata/motscores_qnorm.tsv'),
            read_matrix('testdata/netscores_qnorm.tsv'),
        ]
        # scaling for cluster 49
        scalings = [6.0, 0.033355570380253496, 0.016677785190126748]
        result = dm.quantile_normalize_scores(in_matrices, scalings)
        for index, reference in enumerate(expected):
            self.assertTrue(check_matrix_values(result[index], reference))
    def test_quantile_normalize_scores_with_undefined_weight(self):
        """one undefined weight"""
        matrix_a = dm.DataMatrix(2, 2, values=[[1, 3], [2, 4]])
        matrix_b = dm.DataMatrix(2, 2, values=[[2.3, 2.5], [2.1, 2.31]])
        normalized = dm.quantile_normalize_scores([matrix_a, matrix_b],
                                                  [6.0, np.nan])

        # expected cell values for each output matrix
        expected = [
            [[1.0, 3.0], [2.0, 4.0]],
            [[2.0, 4.0], [1.0, 3.0]],
        ]
        for mat_idx, exp_matrix in enumerate(expected):
            actual = normalized[mat_idx].values
            for row_idx, exp_row in enumerate(exp_matrix):
                for col_idx, exp_value in enumerate(exp_row):
                    self.assertAlmostEqual(exp_value, actual[row_idx][col_idx])
    def test_quantile_normalize_scores_with_no_weights(self):
        """no weights -> fall back to row means"""
        matrix_a = dm.DataMatrix(2, 2, values=[[1, 3], [2, 4]])
        matrix_b = dm.DataMatrix(2, 2, values=[[2.3, 2.5], [2.1, 2.31]])
        normalized = dm.quantile_normalize_scores([matrix_a, matrix_b], None)

        # expected cell values for each output matrix
        expected = [
            [[1.55, 2.655], [2.15, 3.25]],
            [[2.15, 3.25], [1.55, 2.655]],
        ]
        for mat_idx, exp_matrix in enumerate(expected):
            actual = normalized[mat_idx].values
            for row_idx, exp_row in enumerate(exp_matrix):
                for col_idx, exp_value in enumerate(exp_row):
                    self.assertAlmostEqual(exp_value, actual[row_idx][col_idx])
    def test_quantile_normalize_scores_with_all_defined_weights(self):
        """happy path for quantile normalization"""
        matrix_a = dm.DataMatrix(2, 2, values=[[1, 3], [2, 4]])
        matrix_b = dm.DataMatrix(2, 2, values=[[2.3, 2.5], [2.1, 2.31]])
        normalized = dm.quantile_normalize_scores([matrix_a, matrix_b],
                                                  [6.0, 1.0])

        # expected cell values for each output matrix
        expected = [
            [[0.5785714, 1.45071428], [1.02142857, 1.89285714]],
            [[1.02142857, 1.89285714], [0.5785714, 1.45071428]],
        ]
        for mat_idx, exp_matrix in enumerate(expected):
            actual = normalized[mat_idx].values
            for row_idx, exp_row in enumerate(exp_matrix):
                for col_idx, exp_value in enumerate(exp_row):
                    self.assertAlmostEqual(exp_value, actual[row_idx][col_idx])
示例#6
0
    def test_quantile_normalize_scores_with_undefined_weight(self):
        """one undefined weight"""
        first = dm.DataMatrix(2, 2, values=[[1, 3], [2, 4]])
        second = dm.DataMatrix(2, 2, values=[[2.3, 2.5], [2.1, 2.31]])
        result = dm.quantile_normalize_scores([first, second], [6.0, np.nan])

        out1, out2 = result[0].values, result[1].values
        # matrix 0 comes back with its original values
        for expect, actual in [(1.0, out1[0][0]), (3.0, out1[0][1]),
                               (2.0, out1[1][0]), (4.0, out1[1][1])]:
            self.assertAlmostEqual(expect, actual)
        # matrix 1 takes the rank-matched values
        for expect, actual in [(2.0, out2[0][0]), (4.0, out2[0][1]),
                               (1.0, out2[1][0]), (3.0, out2[1][1])]:
            self.assertAlmostEqual(expect, actual)
示例#7
0
    def test_quantile_normalize_scores_with_no_weights(self):
        """no weights -> fall back to row means"""
        first = dm.DataMatrix(2, 2, values=[[1, 3], [2, 4]])
        second = dm.DataMatrix(2, 2, values=[[2.3, 2.5], [2.1, 2.31]])
        result = dm.quantile_normalize_scores([first, second], None)

        out1, out2 = result[0].values, result[1].values
        for expect, actual in [(1.55, out1[0][0]), (2.655, out1[0][1]),
                               (2.15, out1[1][0]), (3.25, out1[1][1])]:
            self.assertAlmostEqual(expect, actual)
        for expect, actual in [(2.15, out2[0][0]), (3.25, out2[0][1]),
                               (1.55, out2[1][0]), (2.655, out2[1][1])]:
            self.assertAlmostEqual(expect, actual)
示例#8
0
    def test_quantile_normalize_scores_with_all_defined_weights(self):
        """happy path for quantile normalization"""
        first = dm.DataMatrix(2, 2, values=[[1, 3], [2, 4]])
        second = dm.DataMatrix(2, 2, values=[[2.3, 2.5], [2.1, 2.31]])
        result = dm.quantile_normalize_scores([first, second], [6.0, 1.0])

        out1, out2 = result[0].values, result[1].values
        for expect, actual in [(0.5785714, out1[0][0]),
                               (1.45071428, out1[0][1]),
                               (1.02142857, out1[1][0]),
                               (1.89285714, out1[1][1])]:
            self.assertAlmostEqual(expect, actual)
        for expect, actual in [(1.02142857, out2[0][0]),
                               (1.89285714, out2[0][1]),
                               (0.5785714, out2[1][0]),
                               (1.45071428, out2[1][1])]:
            self.assertAlmostEqual(expect, actual)
示例#9
0
def combine(result_matrices, score_scalings, membership, iteration, config_params):
    """Combine n scoring matrices into a single weighted score matrix.

    Args:
        result_matrices: list of score matrices; index 0 is assumed to be
            the gene expression score matrix.
        score_scalings: per-matrix scaling weights applied in the final sum.
        membership: cluster membership object providing num_clusters() and
            rows_for_cluster().
        iteration: current iteration number (used for debug output names).
        config_params: configuration dictionary.

    Returns:
        A dm.DataMatrix holding the weighted combined scores, or None when
        result_matrices is empty.
    """
    quantile_normalize = config_params["quantile_normalize"]

    for i, m in enumerate(result_matrices):
        m.fix_extreme_values()
        m.subtract_with_quantile(0.99)

        # debug mode: print scoring matrices before combining
        if "dump_scores" in config_params["debug"] and (
            iteration == 1 or (iteration % config_params["debug_freq"] == 0)
        ):
            funs = config_params["pipeline"]["row-scoring"]["args"]["functions"]
            m.write_tsv_file(
                os.path.join(config_params["output_dir"], "score-%s-%04d.tsv" % (funs[i]["id"], iteration)),
                compressed=False,
            )

    if quantile_normalize:
        if len(result_matrices) > 1:
            start_time = util.current_millis()
            result_matrices = dm.quantile_normalize_scores(result_matrices, score_scalings)
            elapsed = util.current_millis() - start_time
            logging.debug("quantile normalize in %f s.", elapsed / 1000.0)

        in_matrices = [m.values for m in result_matrices]

    else:
        in_matrices = []
        num_clusters = membership.num_clusters()
        mat = result_matrices[0]
        index_map = {name: index for index, name in enumerate(mat.row_names)}
        # we assume matrix 0 is always the gene expression score
        # we also assume that the matrices are already extreme value
        # fixed
        rsm = []
        for cluster in range(1, num_clusters + 1):
            row_members = sorted(membership.rows_for_cluster(cluster))
            rsm.extend([mat.values[index_map[row], cluster - 1] for row in row_members])
        scale = util.mad(rsm)
        if scale == 0:  # avoid that we are dividing by 0
            scale = util.r_stddev(rsm)
        if scale != 0:
            median_rsm = util.median(rsm)
            rsvalues = (mat.values - median_rsm) / scale
            num_rows, num_cols = rsvalues.shape
            rscores = dm.DataMatrix(num_rows, num_cols, mat.row_names, mat.column_names, values=rsvalues)
            rscores.fix_extreme_values()
        else:
            # logging.warn() is deprecated; warning() is the supported spelling
            logging.warning("combiner scaling -> scale == 0 !!!")
            rscores = mat
        in_matrices.append(rscores.values)

        if len(result_matrices) > 1:
            rs_quant = util.quantile(rscores.values, 0.01)
            logging.debug("RS_QUANT = %f", rs_quant)
            for i in range(1, len(result_matrices)):
                values = result_matrices[i].values
                qqq = abs(util.quantile(values, 0.01))
                if qqq == 0:
                    logging.debug("SPARSE SCORES - %d attempt 1: pick from sorted values", i)
                    qqq = sorted(values.ravel())[9]
                if qqq == 0:
                    logging.debug("SPARSE SCORES - %d attempt 2: pick minimum value", i)
                    qqq = abs(values.min())
                if qqq != 0:
                    values = values / qqq * abs(rs_quant)
                else:
                    logging.debug("SPARSE SCORES - %d not normalizing!", i)
                in_matrices.append(values)

    if len(result_matrices) > 0:
        start_time = util.current_millis()
        # assuming same format of all matrices
        combined_score = np.zeros(in_matrices[0].shape)
        # range() instead of Python 2-only xrange() for Python 3 compatibility
        for i in range(len(in_matrices)):
            combined_score += in_matrices[i] * score_scalings[i]

        elapsed = util.current_millis() - start_time
        logging.debug("combined score in %f s.", elapsed / 1000.0)
        matrix0 = result_matrices[0]  # as reference for names
        return dm.DataMatrix(
            matrix0.num_rows, matrix0.num_columns, matrix0.row_names, matrix0.column_names, values=combined_score
        )
    else:
        return None
示例#10
0
def combine(result_matrices, score_scalings, membership, iteration,
            config_params):
    """Combine n scoring matrices into a single weighted score matrix.

    Args:
        result_matrices: list of score matrices; index 0 is assumed to be
            the gene expression score matrix.
        score_scalings: per-matrix scaling weights applied in the final sum.
        membership: cluster membership object providing num_clusters() and
            rows_for_cluster().
        iteration: current iteration number (used for debug output names).
        config_params: configuration dictionary.

    Returns:
        A dm.DataMatrix holding the weighted combined scores, or None when
        result_matrices is empty.
    """
    quantile_normalize = config_params['quantile_normalize']

    for i, m in enumerate(result_matrices):
        m.fix_extreme_values()
        m.subtract_with_quantile(0.99)

        # debug mode: print scoring matrices before combining
        if ('dump_scores' in config_params['debug']
                and (iteration == 1 or
                     (iteration % config_params['debug_freq'] == 0))):
            funs = config_params['pipeline']['row-scoring']['args'][
                'functions']
            m.write_tsv_file(os.path.join(
                config_params['output_dir'],
                'score-%s-%04d.tsv' % (funs[i]['id'], iteration)),
                             compressed=False)

    if quantile_normalize:
        if len(result_matrices) > 1:
            start_time = util.current_millis()
            result_matrices = dm.quantile_normalize_scores(
                result_matrices, score_scalings)
            elapsed = util.current_millis() - start_time
            logging.debug("quantile normalize in %f s.", elapsed / 1000.0)

        in_matrices = [m.values for m in result_matrices]

    else:
        in_matrices = []
        num_clusters = membership.num_clusters()
        mat = result_matrices[0]
        index_map = {name: index for index, name in enumerate(mat.row_names)}
        # we assume matrix 0 is always the gene expression score
        # we also assume that the matrices are already extreme value
        # fixed
        rsm = []
        for cluster in range(1, num_clusters + 1):
            row_members = sorted(membership.rows_for_cluster(cluster))
            rsm.extend([
                mat.values[index_map[row], cluster - 1] for row in row_members
            ])
        scale = util.mad(rsm)
        if scale == 0:  # avoid that we are dividing by 0
            scale = util.r_stddev(rsm)
        if scale != 0:
            median_rsm = util.median(rsm)
            rsvalues = (mat.values - median_rsm) / scale
            num_rows, num_cols = rsvalues.shape
            rscores = dm.DataMatrix(num_rows,
                                    num_cols,
                                    mat.row_names,
                                    mat.column_names,
                                    values=rsvalues)
            rscores.fix_extreme_values()
        else:
            # logging.warn() is deprecated; warning() is the supported spelling
            logging.warning("combiner scaling -> scale == 0 !!!")
            rscores = mat
        in_matrices.append(rscores.values)

        if len(result_matrices) > 1:
            rs_quant = util.quantile(rscores.values, 0.01)
            logging.debug("RS_QUANT = %f", rs_quant)
            for i in range(1, len(result_matrices)):
                values = result_matrices[i].values
                qqq = abs(util.quantile(values, 0.01))
                if qqq == 0:
                    logging.debug(
                        'SPARSE SCORES - %d attempt 1: pick from sorted values',
                        i)
                    qqq = sorted(values.ravel())[9]
                if qqq == 0:
                    logging.debug(
                        'SPARSE SCORES - %d attempt 2: pick minimum value', i)
                    qqq = abs(values.min())
                if qqq != 0:
                    values = values / qqq * abs(rs_quant)
                else:
                    logging.debug('SPARSE SCORES - %d not normalizing!', i)
                in_matrices.append(values)

    if len(result_matrices) > 0:
        start_time = util.current_millis()
        # assuming same format of all matrices
        combined_score = np.zeros(in_matrices[0].shape)
        # range() instead of Python 2-only xrange() for Python 3 compatibility
        for i in range(len(in_matrices)):
            combined_score += in_matrices[i] * score_scalings[i]

        elapsed = util.current_millis() - start_time
        logging.debug("combined score in %f s.", elapsed / 1000.0)
        matrix0 = result_matrices[0]  # as reference for names
        return dm.DataMatrix(matrix0.num_rows,
                             matrix0.num_columns,
                             matrix0.row_names,
                             matrix0.column_names,
                             values=combined_score)
    else:
        return None