Пример #1
0
    def write_output_files(self,
                           pr_calc,
                           output_dir,
                           priors,
                           beta_threshold,
                           network_data,
                           threshold_network=True):
        """
        Write the confidence table, the beta threshold table, the PR curve, and the network file
        :param pr_calc: RankSummaryPR
            Object supplying combined_confidences() and output_pr_curve_pdf()
        :param output_dir: str
            Directory to write into. May be None; it is validated with create_if_needed=True
        :param priors: pd.DataFrame
            Prior data, forwarded unchanged to save_network_to_tsv
        :param beta_threshold: pd.DataFrame
            Thresholded betas. Always written out with write_csv; only forwarded to the network writer
            when threshold_network is truthy
        :param network_data: dict
            Forwarded to save_network_to_tsv as extra_columns
        :param threshold_network: bool
            If truthy, the network file is filtered with beta_threshold. Otherwise None is passed
        """

        assert check.argument_type(pr_calc, RankSummaryPR)
        assert check.argument_path(output_dir,
                                   allow_none=True,
                                   create_if_needed=True)

        # Tables first: the combined confidences and the threshold matrix
        confidences = pr_calc.combined_confidences()
        self.write_csv(confidences, output_dir, self.confidence_file_name)
        self.write_csv(beta_threshold, output_dir, self.threshold_file_name)

        # Then the precision-recall curve
        pr_calc.output_pr_curve_pdf(output_dir,
                                    file_name=self.pr_curve_file_name)

        # The threshold matrix only filters the network file when requested
        network_filter = None
        if threshold_network:
            network_filter = beta_threshold

        # Finally the network itself
        self.save_network_to_tsv(
            pr_calc,
            priors,
            output_dir,
            output_file_name=self.network_file_name,
            beta_threshold=network_filter,
            extra_columns=network_data,
        )
Пример #2
0
    def write_csv(data, pathname, filename):
        """
        Save a dataframe as a tab-separated file
        :param data: pd.DataFrame
            The table to write
        :param pathname: str
            Directory to write into. If None, nothing is written
        :param filename: str
            Name of the file inside pathname. If None, nothing is written
        """
        assert check.argument_path(pathname, allow_none=True)
        assert check.argument_type(filename, str, allow_none=True)
        assert check.argument_type(data, pd.DataFrame)

        # Both halves of the destination are required; otherwise silently skip
        if pathname is None or filename is None:
            return

        destination = os.path.join(pathname, filename)
        data.to_csv(destination, sep='\t')
Пример #3
0
    def summarize_network(self, output_dir, gold_standard, priors):
        """
        Build a network from the stored betas and rescaled betas, score it against the gold standard,
        and hand the results to write_output_files
        :param output_dir: str
            Path to write files into. May be None, in which case nothing should be written
        :param gold_standard: pd.DataFrame [G x K]
            Gold standard the network is scored against
        :param priors: pd.DataFrame [G x K]
            Prior data, passed through to the output writer
        :return aupr: float
            The aupr attribute of the RankSummaryPR built from the rescaled betas and the gold standard
        """

        assert check.argument_path(output_dir, allow_none=True)
        assert check.argument_type(gold_standard, pd.DataFrame)
        assert check.argument_type(priors, pd.DataFrame)

        # Rank-summarize the rescaled betas against the gold standard
        ranked = RankSummaryPR(self.rescaled_betas,
                               gold_standard,
                               filter_method=self.filter_method)

        # Summaries of the raw betas; the non-zero summary drives the threshold
        sign_summary, nonzero_summary = self.summarize(self.betas)
        passing = self.passes_threshold(nonzero_summary, len(self.betas),
                                        self.threshold)

        # Only the median of the rescaled betas is reported; the mean is discarded
        _, rescaled_median = self.mean_and_median(self.rescaled_betas)

        # Extra per-edge columns for the network file, keyed by column name
        extra_network_columns = {
            'beta.sign.sum': sign_summary,
            'var.exp.median': rescaled_median
        }

        aupr_message = "Model AUPR:\t{aupr}".format(aupr=ranked.aupr)
        utils.Debug.vprint(aupr_message, level=0)

        # Plot the PR curve and write the tabular results
        self.write_output_files(ranked, output_dir, priors, passing,
                                extra_network_columns)

        return ranked.aupr
Пример #4
0
    def save_network_to_tsv(pr_calc,
                            priors,
                            output_dir,
                            confidence_threshold=0,
                            output_file_name="network.tsv",
                            beta_threshold=None,
                            extra_columns=None):
        """
        Create a network file and save it as a tab-separated table with one row per edge.
        Edges are written in the order yielded by pr_calc.confidence_ordered_generator().
        :param pr_calc: RankSummaryPR
            The rank-sum object with the math in it
        :param priors: pd.DataFrame [G x K]
            Prior data
        :param output_dir: str
            The path to the output file. If None, don't save anything
        :param confidence_threshold: numeric
            The minimum confidence score needed to write a network edge
        :param output_file_name: str
            The output file name. If None, don't save anything
        :param beta_threshold: pd.DataFrame [G x K]
            The thresholded betas to include in the network. If None, include everything.
            Every edge yielded by pr_calc must be present in this frame (a missing label raises KeyError)
        :param extra_columns: dict(col_name: pd.DataFrame [G x K])
            Any additional data to include, keyed by column name and indexable with row and column names.
            Columns are appended in sorted key order
        :return: False if nothing was written because output_dir or output_file_name is None.
            Otherwise None
        """

        assert check.argument_type(pr_calc, RankSummaryPR)
        assert check.argument_type(priors, pd.DataFrame)
        assert check.argument_type(beta_threshold,
                                   pd.DataFrame,
                                   allow_none=True)
        assert check.argument_path(output_dir, allow_none=True)
        assert check.argument_type(output_file_name, str, allow_none=True)
        assert check.argument_numeric(confidence_threshold, 0, 1)

        if output_dir is None or output_file_name is None:
            return False

        # Fix the extra column order once so the header and every row agree
        extra_names = sorted(extra_columns) if extra_columns is not None else []

        header = [
            'regulator', 'target', 'combined_confidences', 'prior',
            'gold.standard', 'precision', 'recall'
        ] + extra_names

        output_list = [header]

        recall_data, precision_data = pr_calc.dataframe_recall_precision()
        gold_standard = pr_calc.gold_standard

        # All lookups below are label-based and use .loc; the .ix indexer was removed in pandas 1.0
        for row_name, column_name, conf in pr_calc.confidence_ordered_generator():
            if conf < confidence_threshold:
                continue

            if beta_threshold is not None and not beta_threshold.loc[row_name, column_name]:
                continue

            # Columns of the data are regulators and rows are targets
            row_data = [column_name, row_name, conf]

            # Add prior value (or nan if the priors does not cover this interaction)
            if row_name in priors.index and column_name in priors.columns:
                row_data.append(priors.loc[row_name, column_name])
            else:
                row_data.append(np.nan)

            # Add gold standard, precision, and recall (or nan if the gold standard does not cover this interaction)
            if row_name in gold_standard.index and column_name in gold_standard.columns:
                row_data.extend([
                    gold_standard.loc[row_name, column_name],
                    precision_data.loc[row_name, column_name],
                    recall_data.loc[row_name, column_name]
                ])
            else:
                row_data.extend([np.nan, np.nan, np.nan])

            # Add each extra column (or nan if that frame does not cover this interaction)
            for k in extra_names:
                extra_frame = extra_columns[k]
                if row_name in extra_frame.index and column_name in extra_frame.columns:
                    row_data.append(extra_frame.loc[row_name, column_name])
                else:
                    row_data.append(np.nan)

            output_list.append(row_data)

        # Rows are collected first so a failure while building them never leaves a partial file
        with open(os.path.join(output_dir, output_file_name), 'w') as myfile:
            wr = csv.writer(myfile, delimiter='\t')
            wr.writerows(output_list)