Example #1
    def test_enum(self):

        self.assertTrue(check.argument_enum("A", ("A", "B")))
        self.assertTrue(check.argument_enum(["A", "B", "A"], ("A", "B")))

        with self.assertRaises(ValueError):
            check.argument_enum(["A", "B", "C"], ("A", "B"))
Example #2
    def write_output_files(self,
                           pr_calc,
                           output_dir,
                           priors,
                           beta_threshold,
                           network_data,
                           threshold_network=True):

        assert check.argument_type(pr_calc, RankSummaryPR)
        assert check.argument_path(output_dir,
                                   allow_none=True,
                                   create_if_needed=True)

        self.write_csv(pr_calc.combined_confidences(), output_dir,
                       self.confidence_file_name)
        self.write_csv(beta_threshold, output_dir, self.threshold_file_name)
        pr_calc.output_pr_curve_pdf(output_dir,
                                    file_name=self.pr_curve_file_name)

        # Threshold the network with the boolean beta_threshold if threshold_network is True
        beta_threshold = beta_threshold if threshold_network else None

        # Write output
        self.save_network_to_tsv(pr_calc,
                                 priors,
                                 output_dir,
                                 output_file_name=self.network_file_name,
                                 beta_threshold=beta_threshold,
                                 extra_columns=network_data)
Example #3
    def write_csv(data, pathname, filename):
        assert check.argument_path(pathname, allow_none=True)
        assert check.argument_type(filename, str, allow_none=True)
        assert check.argument_type(data, pd.DataFrame)

        if pathname is not None and filename is not None:
            data.to_csv(os.path.join(pathname, filename), sep='\t')
Example #4
def _split_axis(priors, split_ratio, axis=default.DEFAULT_CV_AXIS, seed=default.DEFAULT_CV_RANDOM_SEED):
    """
    Split by axis labels on the chosen axis
    :param priors: pd.DataFrame [M x N]
    :param split_ratio: float
    :param axis: int
        The axis to split on (0 for rows, 1 for columns)
    :param seed: int
        Random seed for shuffling the axis labels
    :return priors_data, gold_standard: pd.DataFrame, pd.DataFrame
        The prior and gold standard frames produced by splitting on the chosen axis
    """

    check.argument_numeric(split_ratio, 0, 1)
    check.argument_enum(axis, [0, 1])

    pc = priors.shape[axis]
    gs_count = int((1 - split_ratio) * pc)
    idx = _make_shuffled_index(pc, seed=seed)

    if axis == 0:
        axis_idx = priors.index
    elif axis == 1:
        axis_idx = priors.columns
    else:
        raise ValueError("Axis can only be 0 or 1")

    pr_idx = axis_idx[idx[0:gs_count]]
    gs_idx = axis_idx[idx[gs_count:]]

    priors_data = priors.drop(gs_idx, axis=axis)
    gold_standard = priors.drop(pr_idx, axis=axis)

    return priors_data, gold_standard
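
For readers who want to see the axis split in isolation, below is a minimal standalone sketch of the same idea using only numpy and pandas; the function name, RNG setup, and toy frame are illustrative and not part of the package's API.

import numpy as np
import pandas as pd

def split_axis_sketch(priors, split_ratio, axis=0, seed=42):
    # Shuffle the positions along the chosen axis
    n = priors.shape[axis]
    idx = np.random.default_rng(seed).permutation(n)

    # The first (1 - split_ratio) fraction of labels stays in the prior;
    # the remainder becomes the gold standard
    pr_count = int((1 - split_ratio) * n)
    labels = priors.index if axis == 0 else priors.columns
    pr_labels = labels[idx[:pr_count]]
    gs_labels = labels[idx[pr_count:]]

    priors_data = priors.drop(gs_labels, axis=axis)
    gold_standard = priors.drop(pr_labels, axis=axis)
    return priors_data, gold_standard

# Split a toy 6 x 3 prior on rows, keeping half of the row labels in each piece
prior = pd.DataFrame(np.eye(6, 3), index=list("ABCDEF"), columns=["r1", "r2", "r3"])
p, g = split_axis_sketch(prior, split_ratio=0.5, axis=0)
print(p.shape, g.shape)  # (3, 3) (3, 3)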
Example #5
def split_for_cv(all_data, split_ratio, split_axis=default.DEFAULT_CV_AXIS, seed=default.DEFAULT_CV_RANDOM_SEED):
    """
    Take a dataframe and split it according to split_ratio on split_axis into two new dataframes. This is for
    cross-validation splits of a gold standard.

    :param all_data: pd.DataFrame [G x K]
        Existing prior or gold standard data
    :param split_ratio: float
        The proportion of the priors that should go into the gold standard
    :param split_axis: int
        Splits on rows (when 0), columns (when 1), or on flattened individual data points (when None)
    :param seed: int
        Random seed for shuffling
    :return priors_data, gold_standard: pd.DataFrame, pd.DataFrame
        Returns a new prior and gold standard produced by splitting the original data according to split_ratio
    """

    check.argument_numeric(split_ratio, 0, 1)
    check.argument_enum(split_axis, [0, 1], allow_none=True)

    # Split the priors into gold standard based on axis (flatten if axis=None)
    if split_axis is None:
        priors_data, gold_standard = _split_flattened(all_data, split_ratio, seed=seed)
    else:
        priors_data, gold_standard = _split_axis(all_data, split_ratio, axis=split_axis, seed=seed)

    return priors_data, gold_standard
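
A usage sketch for split_for_cv is given below. The import line is a placeholder for wherever these cross-validation helpers live in your copy of the package; only the signature shown above is assumed.

import numpy as np
import pandas as pd

# Placeholder import; point this at the module that defines split_for_cv in your installation
from crossvalidation_helpers import split_for_cv

gold = pd.DataFrame(np.random.default_rng(0).integers(0, 2, (10, 4)),
                    index=["G{}".format(i) for i in range(10)],
                    columns=["TF{}".format(j) for j in range(4)])

# Split on rows: roughly half of the gene labels move to the gold standard
priors_rows, gs_rows = split_for_cv(gold, split_ratio=0.5, split_axis=0)

# Split on flattened entries: shapes are unchanged and the nonzero edges are partitioned
priors_flat, gs_flat = split_for_cv(gold, split_ratio=0.5, split_axis=None)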
Example #6
def _split_flattened(data, split_ratio, seed=default.DEFAULT_CV_RANDOM_SEED):
    """
    Instead of splitting by axis labels, split edges and ignore axes
    :param data: pd.DataFrame [M x N]
    :param split_ratio: float
    :param seed: int
        Random seed for shuffling
    :return priors_data: pd.DataFrame [M x N]
    :return gold_standard: pd.DataFrame [M x N]
    """

    check.argument_numeric(split_ratio, 0, 1)

    pc = np.sum(data.values != 0)
    gs_count = int(split_ratio * pc)
    idx = _make_shuffled_index(pc, seed=seed)

    pr_idx = data.values[data.values != 0].copy()
    gs_idx = data.values[data.values != 0].copy()

    pr_idx[idx[0:gs_count]] = 0
    gs_idx[idx[gs_count:]] = 0

    gs = data.values.copy()
    pr = data.values.copy()

    gs[gs != 0] = gs_idx
    pr[pr != 0] = pr_idx

    priors_data = pd.DataFrame(pr, index=data.index, columns=data.columns)
    gold_standard = pd.DataFrame(gs, index=data.index, columns=data.columns)

    return priors_data, gold_standard
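
For contrast with the axis split, here is a minimal standalone sketch of the flattened split: shapes are preserved and the nonzero entries are partitioned between the two outputs. Names and the RNG setup are illustrative, not the package's API.

import numpy as np
import pandas as pd

def split_flattened_sketch(data, split_ratio, seed=42):
    # Shuffle an index over the nonzero entries only
    values = data.values
    nonzero = values != 0
    n_edges = int(nonzero.sum())
    idx = np.random.default_rng(seed).permutation(n_edges)
    gs_count = int(split_ratio * n_edges)

    # Copy the nonzero entries twice and zero out complementary slices
    pr_edges = values[nonzero].copy()
    gs_edges = values[nonzero].copy()
    pr_edges[idx[:gs_count]] = 0   # these edges leave the prior ...
    gs_edges[idx[gs_count:]] = 0   # ... and are the only ones kept in the gold standard

    pr = values.copy()
    gs = values.copy()
    pr[nonzero] = pr_edges
    gs[nonzero] = gs_edges

    priors_data = pd.DataFrame(pr, index=data.index, columns=data.columns)
    gold_standard = pd.DataFrame(gs, index=data.index, columns=data.columns)
    return priors_data, gold_standard

# The two outputs keep the original shape but share no nonzero positions
frame = pd.DataFrame(np.random.default_rng(0).integers(0, 2, (5, 4)))
p, g = split_flattened_sketch(frame, split_ratio=0.5)
assert ((p != 0) & (g != 0)).sum().sum() == 0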
Example #7
    def test_type(self):

        self.assertTrue(check.argument_type(self, unittest.TestCase))
        self.assertTrue(
            check.argument_type(None, unittest.TestCase, allow_none=True))

        with self.assertRaises(ValueError):
            self.assertTrue(check.argument_type("0", unittest.TestCase))
Example #8
    def test_none(self):

        self.assertTrue(check.arguments_not_none(("A", "B")))
        self.assertTrue(check.arguments_not_none(("A", None), num_none=1))
        with self.assertRaises(ValueError):
            self.assertTrue(check.arguments_not_none((None, None, "A")))
        with self.assertRaises(ValueError):
            self.assertTrue(
                check.arguments_not_none((None, None, "A"), num_none=0))
Example #9
def remove_prior_circularity(priors, gold_standard, split_axis=default.DEFAULT_CV_AXIS):
    """
    Remove all row labels that occur in the gold standard from the prior
    :param priors: pd.DataFrame [M x N]
    :param gold_standard: pd.DataFrame [m x n]
    :param split_axis: int (0,1)
    :return new_priors: pd.DataFrame [M-m x N]
    :return gold_standard: pd.DataFrame [m x n]
    """

    check.argument_enum(split_axis, [0, 1])
    new_priors = priors.drop(gold_standard.axes[split_axis], axis=split_axis, errors='ignore')

    return new_priors, gold_standard
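
The drop with errors='ignore' is the whole trick here; a tiny self-contained demonstration (the frames and labels below are made up for illustration):

import pandas as pd

priors = pd.DataFrame(1, index=["G1", "G2", "G3", "G4"], columns=["TF1", "TF2"])
gold_standard = pd.DataFrame(1, index=["G2", "G4", "G9"], columns=["TF1", "TF2"])

# Drop every prior row whose label also appears in the gold standard;
# errors='ignore' skips gold standard labels ("G9") that the prior never had
new_priors = priors.drop(gold_standard.axes[0], axis=0, errors='ignore')
print(new_priors.index.tolist())  # ['G1', 'G3']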
Example #10
    def __init__(self,
                 rankable_data,
                 gold_standard,
                 filter_method='keep_all_gold_standard',
                 rank_method="sum"):

        assert check.argument_enum(filter_method,
                                   self.filter_method_lookup.keys())
        self.filter_method = getattr(self,
                                     self.filter_method_lookup[filter_method])

        # Calculate confidences based on the ranked data
        self.all_confidences = self.compute_combined_confidences(
            rankable_data, rank_method=rank_method)

        # Filter the gold standard and confidences down to a format that can be directly compared
        utils.Debug.vprint("GS: {gs}, Confidences: {conf}".format(
            gs=gold_standard.shape, conf=self.all_confidences.shape),
                           level=0)
        self.gold_standard, self.filtered_confidences = self.filter_method(
            gold_standard, self.all_confidences)
        utils.Debug.vprint("Filtered to GS: {gs}, Confidences: {conf}".format(
            gs=gold_standard.shape, conf=self.all_confidences.shape),
                           level=0)

        # Calculate the precision and recall and save the index that sorts the ranked confidences (filtered)
        self.recall, self.precision, self.ranked_idx = self.calculate_precision_recall(
            self.filtered_confidences, self.gold_standard)
        self.aupr = self.calculate_aupr(self.recall, self.precision)
Example #11
    def summarize_network(self, output_dir, gold_standard, priors):
        """
        Take the betas and rescaled beta_errors, construct a network, and test it against the gold standard
        :param output_dir: str
            Path to write files into. Don't write anything if this is None.
        :param gold_standard: pd.DataFrame [G x K]
            Gold standard to test the network against
        :param priors: pd.DataFrame [G x K]
            Prior data
        :return aupr: float
            Returns the AUPR calculated from the network and gold standard
        """

        assert check.argument_path(output_dir, allow_none=True)
        assert check.argument_type(gold_standard, pd.DataFrame)
        assert check.argument_type(priors, pd.DataFrame)

        pr_calc = RankSummaryPR(self.rescaled_betas,
                                gold_standard,
                                filter_method=self.filter_method)
        beta_sign, beta_nonzero = self.summarize(self.betas)
        beta_threshold = self.passes_threshold(beta_nonzero, len(self.betas),
                                               self.threshold)
        resc_betas_mean, resc_betas_median = self.mean_and_median(
            self.rescaled_betas)
        network_data = {
            'beta.sign.sum': beta_sign,
            'var.exp.median': resc_betas_median
        }

        utils.Debug.vprint("Model AUPR:\t{aupr}".format(aupr=pr_calc.aupr),
                           level=0)

        # Plot PR curve & Output results to a TSV
        self.write_output_files(pr_calc, output_dir, priors, beta_threshold,
                                network_data)

        return pr_calc.aupr
Example #12
    def __init__(self,
                 betas,
                 rescaled_betas,
                 threshold=0.5,
                 filter_method='overlap'):
        """
        :param betas: list(pd.DataFrame[G x K])
        :param rescaled_betas: list(pd.DataFrame[G x K])
        :param threshold: float
        :param filter_method: str
            How to handle gold standard filtering ('overlap' filters to beta, 'keep_all_gold_standard' doesn't filter)
        """

        assert check.dataframes_align(betas)
        self.betas = betas

        assert check.dataframes_align(rescaled_betas)
        self.rescaled_betas = rescaled_betas

        assert check.argument_enum(filter_method, FILTER_METHODS)
        self.filter_method = filter_method

        assert check.argument_numeric(threshold, 0, 1)
        self.threshold = threshold
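
Assuming this constructor and the summarize_network method from Example #11 belong to the same results-processing class (called ResultsProcessor below purely for illustration, with a placeholder import), a usage sketch might look like this; betas, rescaled_betas, gold_standard, priors, and output_dir would come from the surrounding workflow.

# Placeholder import; use the actual class and module from your installation
from results_processing import ResultsProcessor

# betas / rescaled_betas: one [G x K] DataFrame per bootstrap, all sharing the same axes
processor = ResultsProcessor(betas,
                             rescaled_betas,
                             threshold=0.5,
                             filter_method='overlap')

# Writes the confidence, threshold, PR-curve, and network files into output_dir
# and returns the AUPR against the gold standard
aupr = processor.summarize_network(output_dir, gold_standard, priors)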
Example #13
    def test_numeric(self):

        self.assertTrue(check.argument_numeric(0))
        self.assertTrue(check.argument_numeric(0.0))

        with self.assertRaises(ValueError):
            check.argument_numeric("0")

        self.assertTrue(check.argument_numeric(1, 0, 2))

        with self.assertRaises(ValueError):
            self.assertTrue(check.argument_numeric(2, 0, 1))

        self.assertTrue(check.argument_numeric(None, allow_none=True))
Example #14
    def test_frame_alignment(self):

        self.assertTrue(
            check.dataframes_align([self.frame1, self.frame1, self.frame1]))
        self.assertTrue(
            check.dataframes_align([self.frame1, self.frame1, self.frame3],
                                   check_order=False))

        with self.assertRaises(ValueError):
            check.dataframes_align([self.frame1, self.frame2, self.frame1])

        with self.assertRaises(ValueError):
            check.dataframes_align([self.frame1, self.frame3, self.frame1])
Example #15
    def compute_combined_confidences(rankable_data, **kwargs):
        """
        Calculate combined confidences from rank sum
        :param rankable_data: list(pd.DataFrame) R x [M x N]
            List of dataframes which have the same axes and need to be rank summed
        :return combine_conf: pd.DataFrame [M x N]
        """

        rank_method = kwargs.pop("rank_method", "sum")
        assert check.argument_enum(rank_method,
                                   ("sum", "threshold_sum", "max", "geo_mean"))

        if rank_method == "sum":
            return RankSummaryPR.rank_sum(rankable_data)
        elif rank_method == "threshold_sum":
            return RankSummaryPR.rank_sum_threshold(rankable_data,
                                                    data_threshold=kwargs.pop(
                                                        "data_threshold", 0.9))
        elif rank_method == "max":
            return RankSummaryPR.rank_max_value(rankable_data)
        elif rank_method == "geo_mean":
            return RankSummaryPR.rank_geo_mean(rankable_data)
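
The "sum" branch combines the per-bootstrap scores by ranking each frame and summing the ranks. A simplified standalone illustration of that idea follows; it is not the package's exact ranking or rescaling, just the general pattern.

import numpy as np
import pandas as pd

def flat_ranks(df):
    # Rank every entry of the frame as one flattened vector (average ranks for ties)
    return pd.Series(df.values.ravel()).rank(method="average").values.reshape(df.shape)

def rank_sum_sketch(rankable_data):
    # Sum the per-frame ranks element-wise; larger sums mean consistently high scores
    summed = sum(flat_ranks(df) for df in rankable_data)
    combined = pd.DataFrame(summed,
                            index=rankable_data[0].index,
                            columns=rankable_data[0].columns)
    # Rescale to [0, 1] so the combined ranks read as confidences
    lo, hi = combined.values.min(), combined.values.max()
    return (combined - lo) / (hi - lo)

rng = np.random.default_rng(0)
frames = [pd.DataFrame(rng.random((4, 3))) for _ in range(3)]
print(rank_sum_sketch(frames))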
Example #16
    def get_sample_index(self,
                         meta_data=None,
                         sample_ratio=None,
                         sample_size=None,
                         min_size=default.DEFAULT_MINIMUM_SAMPLE_SIZE,
                         stratified_sampling=None):
        """
        Produce an integer index to sample data using .iloc. If stratified sampling is enabled (via the
        stratified_sampling argument, falling back to self.stratified_sampling), sample separately from each
        group, as defined by the self.stratified_batch_lookup column.
        :param meta_data: pd.DataFrame [N x ?]
            Data frame to sample from. Use self.meta_data if this is not set.
        :param sample_ratio: float
            Sample expression_matrix to this proportion of data points
        :param sample_size: int
            Sample expression matrix to this absolute number of data points. If sampling from each stratified group,
            this is the absolute number of data points PER GROUP (not total)
        :param min_size: int
            Minimum number of data points to sample from each stratified group
        :param stratified_sampling: bool
            Sample separately from each group in the self.stratified_batch_lookup column. Defaults to
            self.stratified_sampling if not set.
        :return new_idx: np.ndarray
            Integer index array suitable for use with .iloc
        """

        # Sanity check inputs
        assert check.arguments_not_none((sample_ratio, sample_size),
                                        num_none=1)
        assert check.argument_numeric(sample_ratio, low=0, allow_none=True)
        assert check.argument_numeric(sample_size, low=0, allow_none=True)

        stratified_sampling = stratified_sampling if stratified_sampling is not None else self.stratified_sampling

        if stratified_sampling:
            # Use the main meta_data if there's nothing given
            meta_data = meta_data if meta_data is not None else self.meta_data

            # Copy and reindex the meta_data so that the index can be used with iloc
            meta_data = meta_data.copy()
            meta_data.index = pd.Index(range(meta_data.shape[0]))
            new_idx = np.ndarray(0, dtype=int)

            # For each factor in the batch column
            for batch in meta_data[
                    self.stratified_batch_lookup].unique().tolist():
                # Get the integer index of the data points in this batch
                batch_idx = meta_data.loc[
                    meta_data[self.stratified_batch_lookup] ==
                    batch, :].index.tolist()

                # Decide how many to collect from this batch
                size = sample_size if sample_ratio is None else max(
                    int(len(batch_idx) * sample_ratio), min_size)

                # Resample and append the new sample index to the index array
                new_idx = np.append(
                    new_idx,
                    np.random.choice(batch_idx,
                                     size=size,
                                     replace=self.sample_with_replacement))
            return new_idx
        else:
            # Decide how many to collect from the total expression matrix or the meta_data
            num_samples = self.expression_matrix.shape[
                1] if meta_data is None else meta_data.shape[0]
            size = sample_size if sample_ratio is None else max(
                int(sample_ratio * num_samples), min_size)
            return np.random.choice(num_samples,
                                    size=size,
                                    replace=self.sample_with_replacement)
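
The stratified branch above loops over the batch factor and samples each group separately, with a per-group floor of min_size. A compact standalone sketch of the same pattern (the column name and sizes are made up):

import numpy as np
import pandas as pd

def stratified_index_sketch(meta_data, batch_col, sample_ratio, min_size=1, replace=True, seed=0):
    # Reindex so the positions can be used with .iloc afterwards
    meta_data = meta_data.copy()
    meta_data.index = pd.Index(range(meta_data.shape[0]))
    rng = np.random.default_rng(seed)

    new_idx = np.ndarray(0, dtype=int)
    for batch in meta_data[batch_col].unique().tolist():
        # Integer positions belonging to this batch
        batch_idx = meta_data.loc[meta_data[batch_col] == batch, :].index.tolist()
        # Proportional to the group size, but never fewer than min_size points
        size = max(int(len(batch_idx) * sample_ratio), min_size)
        new_idx = np.append(new_idx, rng.choice(batch_idx, size=size, replace=replace))
    return new_idx

meta = pd.DataFrame({"condition": ["wt"] * 6 + ["ko"] * 4})
idx = stratified_index_sketch(meta, "condition", sample_ratio=0.5, min_size=2)
print(sorted(idx))  # positions drawn separately from the 'wt' and 'ko' groups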
Example #17
    def save_network_to_tsv(pr_calc,
                            priors,
                            output_dir,
                            confidence_threshold=0,
                            output_file_name="network.tsv",
                            beta_threshold=None,
                            extra_columns=None):
        """
        Create a network file and save it
        :param pr_calc: RankSummaryPR
            The rank-sum object with the math in it
        :param priors: pd.DataFrame [G x K]
            Prior data
        :param output_dir: str
            The path to the output file. If None, don't save anything
        :param confidence_threshold: numeric
            The minimum confidence score needed to write a network edge
        :param output_file_name: str
            The output file name. If None, don't save anything
        :param beta_threshold: pd.DataFrame [G x K]
            The thresholded betas to include in the network. If None, include everything.
        :param extra_columns: dict(col_name: pd.DataFrame [G x K])
            Any additional data to include, keyed by column name and indexable with row and column names
        """

        assert check.argument_type(pr_calc, RankSummaryPR)
        assert check.argument_type(priors, pd.DataFrame)
        assert check.argument_type(beta_threshold,
                                   pd.DataFrame,
                                   allow_none=True)
        assert check.argument_path(output_dir, allow_none=True)
        assert check.argument_type(output_file_name, str, allow_none=True)
        assert check.argument_numeric(confidence_threshold, 0, 1)

        if output_dir is None or output_file_name is None:
            return False

        header = [
            'regulator', 'target', 'combined_confidences', 'prior',
            'gold.standard', 'precision', 'recall'
        ]
        if extra_columns is not None:
            header += [k for k in sorted(extra_columns.keys())]

        output_list = [header]

        recall_data, precision_data = pr_calc.dataframe_recall_precision()

        for row_name, column_name, conf in pr_calc.confidence_ordered_generator(
        ):
            if conf < confidence_threshold:
                continue

            if beta_threshold is not None and not beta_threshold.loc[
                    row_name, column_name]:
                continue

            row_data = [column_name, row_name, conf]

            # Add prior value (or nan if the priors does not cover this interaction)
            if row_name in priors.index and column_name in priors.columns:
                row_data += [priors.loc[row_name, column_name]]
            else:
                row_data += [np.nan]

            # Add gold standard, precision, and recall (or nan if the gold standard does not cover this interaction)
            if row_name in pr_calc.gold_standard.index and column_name in pr_calc.gold_standard.columns:
                row_data += [
                    pr_calc.gold_standard.loc[row_name, column_name],
                    precision_data.loc[row_name, column_name],
                    recall_data.loc[row_name, column_name]
                ]
            else:
                row_data += [np.nan, np.nan, np.nan]

            if extra_columns is not None:
                for k in sorted(extra_columns.keys()):
                    if row_name in extra_columns[
                            k].index and column_name in extra_columns[
                                k].columns:
                        row_data += [
                            extra_columns[k].loc[row_name, column_name]
                        ]
                    else:
                        row_data += [np.nan]

            output_list.append(row_data)

        with open(os.path.join(output_dir, output_file_name), 'w') as myfile:
            wr = csv.writer(myfile, delimiter='\t')
            for row in output_list:
                wr.writerow(row)
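
The file written above is a plain tab-separated table whose columns follow the header list in the code. A hedged read-back sketch (the path below is a placeholder for the output_dir and output_file_name actually used):

import os
import pandas as pd

# Placeholder path; point this at output_dir / output_file_name from the call above
network = pd.read_csv(os.path.join("output", "network.tsv"), sep='\t')

# Columns: regulator, target, combined_confidences, prior, gold.standard,
# precision, recall, plus any extra columns that were passed in
strong_edges = network.loc[network["combined_confidences"] > 0.9, ["regulator", "target"]]
print(strong_edges.head())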