def make_PR_data(gs, confidences):
    """
    Build a single long-format data frame holding confidences alongside the gold standard

    :param gs: gold standard data, melted with utils.melt_and_reindex_dataframe
        into the gold standard column
    :param confidences: confidence data, melted with utils.melt_and_reindex_dataframe
        into the confidence column
    :return: the melted confidences (index reset into ordinary columns), outer-joined
        with the melted gold standard on the target and regulator columns
    """
    # Melt the confidences and turn the resulting index back into regular columns
    # so the target / regulator columns can be used as join keys
    melted_confidences = utils.melt_and_reindex_dataframe(
        confidences, value_name=CONFIDENCE_COLUMN)
    melted_confidences = melted_confidences.reset_index()

    # Melt the gold standard; its index is what the join keys are matched against
    melted_gs = utils.melt_and_reindex_dataframe(
        gs, value_name=GOLD_STANDARD_COLUMN)

    # Outer join keeps rows that are present in only one of the two inputs
    return melted_confidences.join(
        melted_gs,
        how='outer',
        on=[TARGET_COLUMN, REGULATOR_COLUMN])
예제 #2
0
    def process_network(metric, priors, confidence_threshold=0, beta_threshold=None, extra_columns=None):
        """
        Process rank-summed results into a network data frame
        :param metric: RankSummingMetric
            The rank-sum object with the math in it
        :param priors: pd.DataFrame [G x K]
            Prior data. If None, no prior column is joined.
        :param confidence_threshold: numeric
            Edges are kept only if their confidence score is strictly greater than this value
        :param beta_threshold: pd.DataFrame [G x K]
            The thresholded betas to include in the network.
            NOTE(review): the filter that consumes this argument is currently switched off
            (`and False` below), so the value is type-checked but otherwise ignored - confirm
            whether that is intentional before re-enabling it.
        :param extra_columns: dict(col_name: pd.DataFrame [G x K])
            Any additional data to include, keyed by column name and indexable with row and column names.
            Columns are joined in sorted key order.
        :return network_data: pd.DataFrame [(G*K) x 7+]
            Network edge dataframe

        """

        assert check.argument_type(metric, RankSummingMetric)
        assert check.argument_type(priors, pd.DataFrame, allow_none=True)
        assert check.argument_type(beta_threshold, pd.DataFrame, allow_none=True)
        assert check.argument_numeric(confidence_threshold, 0, 1)

        # Get the combined confidences and subset for confidence threshold
        # Take an explicit copy of the subset: it is modified in place at the end of this function,
        # and writing into a boolean-filtered selection would otherwise raise SettingWithCopyWarning
        # whenever no later join has replaced the frame
        network_data = metric.confidence_dataframe()
        keep_edge = network_data[CONFIDENCE_COLUMN] > confidence_threshold
        network_data = network_data.loc[keep_edge, :].copy()

        # If beta_threshold has been provided, melt and join it to the network data
        # Then discard anything which isn't meeting the threshold
        # NOTE(review): `and False` makes this branch unreachable; left disabled to preserve current output
        if beta_threshold is not None and False:
            beta_data = utils.melt_and_reindex_dataframe(beta_threshold, BETA_THRESHOLD_COLUMN)
            network_data = network_data.join(beta_data, on=[TARGET_COLUMN, REGULATOR_COLUMN])
            network_data = network_data.loc[network_data[BETA_THRESHOLD_COLUMN] == 1, :]
            del network_data[BETA_THRESHOLD_COLUMN]

        # Attach the prior value for each (target, regulator) pair
        if priors is not None:
            prior_data = utils.melt_and_reindex_dataframe(priors, PRIOR_COLUMN)
            network_data = network_data.join(prior_data, on=[TARGET_COLUMN, REGULATOR_COLUMN])

        # Add any extra columns as needed (sorted so the column order is deterministic)
        if extra_columns is not None:
            for k in sorted(extra_columns.keys()):
                extra_data = utils.melt_and_reindex_dataframe(extra_columns[k], k)
                network_data = network_data.join(extra_data, on=[TARGET_COLUMN, REGULATOR_COLUMN])

        # Make sure all missing values are NaN
        network_data[pd.isnull(network_data)] = np.nan

        return network_data
예제 #3
0
    def __init__(self,
                 rankable_data,
                 gold_standard,
                 filter_method='keep_all_gold_standard'):
        """
        Take rankable data and process it into confidence scores which are stored in this object
        :param rankable_data: list(pd.DataFrame) [B x [G x K]]
            A list of numeric dataframes (with identical axes)
        :param gold_standard: pd.DataFrame [G x K]
            A dataframe which corresponds to known, gold-standard data
        :param filter_method: str
            Key into self.filter_method_lookup. The method it names is used to filter the
            gold standard and confidences down to a form that can be directly compared.

        """

        # Get the filtering method
        # The lookup maps the public option name to the name of a method on this object
        assert check.argument_enum(filter_method,
                                   self.filter_method_lookup.keys())
        self.filter_method = getattr(self,
                                     self.filter_method_lookup[filter_method])

        # Explicitly cast the gold standard data to a boolean array [0,1]
        # (any nonzero entry becomes 1, zero stays 0)
        gold_standard = (gold_standard != 0).astype(int)
        self.gold_standard = gold_standard

        # Calculate confidences based on the ranked data
        self.all_confidences = self.compute_combined_confidences(rankable_data)

        # Convert the confidence data to long format
        confidence_data = utils.melt_and_reindex_dataframe(
            self.all_confidences,
            CONFIDENCE_COLUMN,
            idx_name=TARGET_COLUMN,
            col_name=REGULATOR_COLUMN)

        # Attach the gold standard
        confidence_data = self.attach_gs_to_confidences(
            confidence_data, gold_standard)

        # Sort by confidence (descending, missing confidences last) and reset the index
        self.confidence_data = confidence_data.sort_values(
            by=CONFIDENCE_COLUMN, ascending=False, na_position='last')
        self.confidence_data.reset_index(inplace=True)

        # Filter the gold standard and confidences down to a format that can be directly compared
        # NOTE(review): gold_standard is documented as [G x K], so shape[0] here is its row count
        # rather than a count of edges - confirm the message reports what is intended
        utils.Debug.vprint("GS: {gs} edges, Confidences: {conf} edges".format(
            gs=gold_standard.shape[0], conf=self.confidence_data.shape[0]),
                           level=0)

        self.filtered_data = self.filter_method(GOLD_STANDARD_COLUMN,
                                                CONFIDENCE_COLUMN,
                                                self.confidence_data)

        # level belongs to vprint, not to str.format (where it was silently ignored)
        utils.Debug.vprint("Filtered data to {e} edges".format(
            e=self.filtered_data.shape[0]),
                           level=0)
예제 #4
0
    def attach_gs_to_confidences(confidence_data, gold_standard):
        """
        Combine confidence data with the gold standard through an outer join

        :param confidence_data: pd.DataFrame [G*K x n]
            Confidence data; its target and regulator columns are used as the join keys
        :param gold_standard: pd.DataFrame [G x K]
            Gold standard data; melted into the gold standard column before joining
        :return: pd.DataFrame
            The confidence data outer-joined with the melted gold standard
        """

        join_keys = [TARGET_COLUMN, REGULATOR_COLUMN]

        # Reshape the gold standard with utils.melt_and_reindex_dataframe, naming its
        # index levels after the target and regulator columns
        melted_gs = utils.melt_and_reindex_dataframe(
            gold_standard,
            GOLD_STANDARD_COLUMN,
            idx_name=TARGET_COLUMN,
            col_name=REGULATOR_COLUMN)

        # Outer join so rows present on only one side are retained
        return confidence_data.join(melted_gs, on=join_keys, how='outer')