def test_enum(self):
    """Check that argument_enum accepts allowed values and rejects anything else."""
    allowed = ("A", "B")

    # A single value and a list of values that are all allowed must both pass
    for candidate in ("A", ["A", "B", "A"]):
        self.assertTrue(check.argument_enum(candidate, allowed))

    # A list containing a value outside the allowed set must raise
    with self.assertRaises(ValueError):
        check.argument_enum(["A", "B", "C"], allowed)
def write_output_files(self, pr_calc, output_dir, priors, beta_threshold, network_data,
                       threshold_network=True):
    """
    Write the combined confidences, the beta threshold table, the PR curve and the network file

    :param pr_calc: RankSummaryPR
        Supplies the combined confidences, the PR curve and the network edges
    :param output_dir: str
        Directory to write into. The path check allows None and is asked to create the directory if needed.
    :param priors: pd.DataFrame
        Prior data, handed to save_network_to_tsv unchanged
    :param beta_threshold: pd.DataFrame
        Written out with write_csv, and used to threshold the network when threshold_network is set
    :param network_data: dict
        Handed to save_network_to_tsv as extra_columns
    :param threshold_network: bool
        When false, the network file is written without applying beta_threshold
    """
    assert check.argument_type(pr_calc, RankSummaryPR)
    assert check.argument_path(output_dir, allow_none=True, create_if_needed=True)

    # Tables and the PR curve are always written from the unmodified inputs
    self.write_csv(pr_calc.combined_confidences(), output_dir, self.confidence_file_name)
    self.write_csv(beta_threshold, output_dir, self.threshold_file_name)
    pr_calc.output_pr_curve_pdf(output_dir, file_name=self.pr_curve_file_name)

    # Only pass the boolean beta threshold on to the network writer when thresholding is requested
    network_threshold = None
    if threshold_network:
        network_threshold = beta_threshold

    self.save_network_to_tsv(pr_calc,
                             priors,
                             output_dir,
                             output_file_name=self.network_file_name,
                             beta_threshold=network_threshold,
                             extra_columns=network_data)
def write_csv(data, pathname, filename):
    """
    Save a dataframe as a tab-delimited file

    :param data: pd.DataFrame
        Data to write
    :param pathname: str
        Directory to write into. Nothing is written if this is None.
    :param filename: str
        File name within pathname. Nothing is written if this is None.
    """
    assert check.argument_path(pathname, allow_none=True)
    assert check.argument_type(filename, str, allow_none=True)
    assert check.argument_type(data, pd.DataFrame)

    # Writing is optional: a missing directory or file name means "skip"
    if pathname is None or filename is None:
        return

    target = os.path.join(pathname, filename)
    data.to_csv(target, sep='\t')
def _split_axis(priors, split_ratio, axis=default.DEFAULT_CV_AXIS, seed=default.DEFAULT_CV_RANDOM_SEED):
    """
    Partition the labels on one axis into a prior set and a gold standard set

    :param priors: pd.DataFrame [M x N]
    :param split_ratio: float
        Approximate fraction (between 0 and 1) of the axis labels that end up in the gold standard
    :param axis: [0, 1]
        0 splits the row labels, 1 splits the column labels
    :param seed:
        Passed to _make_shuffled_index
    :return priors_data, gold_standard: pd.DataFrame, pd.DataFrame
        The input with the gold standard labels dropped, and the input with the prior labels dropped
    """
    check.argument_numeric(split_ratio, 0, 1)
    check.argument_enum(axis, [0, 1])

    n_labels = priors.shape[axis]
    # The first n_prior shuffled positions stay in the priors; the remainder go to the gold standard
    n_prior = int((1 - split_ratio) * n_labels)
    shuffled = _make_shuffled_index(n_labels, seed=seed)

    if axis == 0:
        labels = priors.index
    elif axis == 1:
        labels = priors.columns
    else:
        raise ValueError("Axis can only be 0 or 1")

    prior_labels = labels[shuffled[:n_prior]]
    gold_labels = labels[shuffled[n_prior:]]

    # Each output is produced by dropping the labels that belong to the other set
    return priors.drop(gold_labels, axis=axis), priors.drop(prior_labels, axis=axis)
def split_for_cv(all_data, split_ratio, split_axis=default.DEFAULT_CV_AXIS, seed=default.DEFAULT_CV_RANDOM_SEED):
    """
    Split a dataframe into two new dataframes according to split_ratio on split_axis.
    This is for crossvalidation splits of a gold standard

    :param all_data: pd.DataFrame [G x K]
        Existing prior or gold standard data
    :param split_ratio: float
        The proportion (between 0 and 1) of the data that should go into the gold standard
    :param split_axis: int
        Splits on rows (when 0), columns (when 1), or on flattened individual data points (when None)
    :param seed:
        Seed passed on to the helper that shuffles the data before splitting
    :return prior_data, gold_standard: pd.DataFrame, pd.DataFrame
        The new prior and the new gold standard produced by the split
    """
    check.argument_numeric(split_ratio, 0, 1)
    check.argument_enum(split_axis, [0, 1], allow_none=True)

    # No axis means the individual non-zero entries are split instead of the axis labels
    if split_axis is None:
        return _split_flattened(all_data, split_ratio, seed=seed)

    return _split_axis(all_data, split_ratio, axis=split_axis, seed=seed)
def _split_flattened(data, split_ratio, seed=default.DEFAULT_CV_RANDOM_SEED):
    """
    Split the individual non-zero entries of the data, ignoring the axis labels

    :param data: pd.DataFrame [M x N]
    :param split_ratio: float
        Approximate fraction (between 0 and 1) of the non-zero entries kept in the gold standard
    :param seed:
        Passed to _make_shuffled_index
    :return priors_data: pd.DataFrame [M x N]
        Copy of the data with the gold standard entries set to 0
    :return gold_standard: pd.DataFrame [M x N]
        Copy of the data with the prior entries set to 0
    """
    check.argument_numeric(split_ratio, 0, 1)

    values = data.values
    edge_mask = values != 0

    n_edges = np.sum(edge_mask)
    n_gold = int(split_ratio * n_edges)
    shuffled = _make_shuffled_index(n_edges, seed=seed)

    # Work on flat copies of the non-zero entries; zero the entries that belong to the other set
    prior_edges = values[edge_mask].copy()
    gold_edges = values[edge_mask].copy()
    prior_edges[shuffled[:n_gold]] = 0
    gold_edges[shuffled[n_gold:]] = 0

    # Put the partially zeroed entries back into their original positions
    gold_values = values.copy()
    prior_values = values.copy()
    gold_values[edge_mask] = gold_edges
    prior_values[edge_mask] = prior_edges

    priors_data = pd.DataFrame(prior_values, index=data.index, columns=data.columns)
    gold_standard = pd.DataFrame(gold_values, index=data.index, columns=data.columns)
    return priors_data, gold_standard
def test_type(self):
    """Check argument_type with a matching object, an allowed None, and a mismatching object."""
    expected = unittest.TestCase

    # This test case instance is itself a unittest.TestCase
    self.assertTrue(check.argument_type(self, expected))

    # None is accepted only because allow_none is set
    self.assertTrue(check.argument_type(None, expected, allow_none=True))

    # A string is not a TestCase and must raise
    with self.assertRaises(ValueError):
        self.assertTrue(check.argument_type("0", expected))
def test_none(self):
    """Check arguments_not_none with and without an allowance for None values."""
    # No None values at all
    self.assertTrue(check.arguments_not_none(("A", "B")))

    # One None value, with one None permitted
    self.assertTrue(check.arguments_not_none(("A", None), num_none=1))

    # Two None values must raise, both with the default allowance and with num_none=0
    two_missing = (None, None, "A")
    for extra_args in ({}, {"num_none": 0}):
        with self.assertRaises(ValueError):
            self.assertTrue(check.arguments_not_none(two_missing, **extra_args))
def remove_prior_circularity(priors, gold_standard, split_axis=default.DEFAULT_CV_AXIS):
    """
    Remove from the prior every label on split_axis that also occurs in the gold standard

    :param priors: pd.DataFrame [M x N]
    :param gold_standard: pd.DataFrame [m x n]
    :param split_axis: int (0,1)
        0 removes shared row labels, 1 removes shared column labels
    :return new_priors: pd.DataFrame
        The priors without the labels shared with the gold standard
    :return gold_standard: pd.DataFrame [m x n]
        The gold standard, returned unchanged
    """
    check.argument_enum(split_axis, [0, 1])

    shared_labels = gold_standard.axes[split_axis]
    # errors='ignore' because gold standard labels that are absent from the priors are fine
    new_priors = priors.drop(shared_labels, axis=split_axis, errors='ignore')

    return new_priors, gold_standard
def __init__(self, rankable_data, gold_standard, filter_method='keep_all_gold_standard', rank_method="sum"):
    """
    Combine the rankable data into confidences, filter them against the gold standard, and compute
    precision, recall and AUPR

    :param rankable_data:
        Data handed to compute_combined_confidences
        (presumably a list of aligned dataframes - confirm against that method)
    :param gold_standard:
        Gold standard to score against; must expose .shape
    :param filter_method: str
        Key into self.filter_method_lookup naming the method that filters the gold standard and confidences
    :param rank_method: str
        Passed through to compute_combined_confidences
    """
    assert check.argument_enum(filter_method, self.filter_method_lookup.keys())
    # Resolve the lookup entry to the bound method that does the filtering
    self.filter_method = getattr(self, self.filter_method_lookup[filter_method])

    # Calculate confidences based on the ranked data
    self.all_confidences = self.compute_combined_confidences(rankable_data, rank_method=rank_method)

    # Filter the gold standard and confidences down to a format that can be directly compared
    utils.Debug.vprint("GS: {gs}, Confidences: {conf}".format(gs=gold_standard.shape,
                                                               conf=self.all_confidences.shape),
                       level=0)
    self.gold_standard, self.filtered_confidences = self.filter_method(gold_standard, self.all_confidences)

    # Report the shapes AFTER filtering (the unfiltered inputs were already reported above)
    utils.Debug.vprint("Filtered to GS: {gs}, Confidences: {conf}".format(gs=self.gold_standard.shape,
                                                                           conf=self.filtered_confidences.shape),
                       level=0)

    # Calculate the precision and recall and save the index that sorts the ranked confidences (filtered)
    self.recall, self.precision, self.ranked_idx = self.calculate_precision_recall(self.filtered_confidences,
                                                                                   self.gold_standard)
    self.aupr = self.calculate_aupr(self.recall, self.precision)
def summarize_network(self, output_dir, gold_standard, priors):
    """
    Build a network from the betas and rescaled betas, score it against the gold standard, and write the results

    :param output_dir: str
        Path to write files into. Don't write anything if this is None.
    :param gold_standard: pd.DataFrame [G x K]
        Gold standard to test the network against
    :param priors: pd.DataFrame [G x K]
        Prior data
    :return aupr: float
        The AUPR calculated from the network and gold standard
    """
    assert check.argument_path(output_dir, allow_none=True)
    assert check.argument_type(gold_standard, pd.DataFrame)
    assert check.argument_type(priors, pd.DataFrame)

    # Rank the rescaled betas and score them against the gold standard
    pr_calc = RankSummaryPR(self.rescaled_betas, gold_standard, filter_method=self.filter_method)

    # Summarize the betas and decide which of them pass the threshold
    beta_sign, beta_nonzero = self.summarize(self.betas)
    beta_threshold = self.passes_threshold(beta_nonzero, len(self.betas), self.threshold)

    # Only the median of the rescaled betas is reported; the mean is not used
    _, resc_betas_median = self.mean_and_median(self.rescaled_betas)

    network_data = dict()
    network_data['beta.sign.sum'] = beta_sign
    network_data['var.exp.median'] = resc_betas_median

    model_aupr_message = "Model AUPR:\t{aupr}".format(aupr=pr_calc.aupr)
    utils.Debug.vprint(model_aupr_message, level=0)

    # Plot PR curve & Output results to a TSV
    self.write_output_files(pr_calc, output_dir, priors, beta_threshold, network_data)

    return pr_calc.aupr
def __init__(self, betas, rescaled_betas, threshold=0.5, filter_method='overlap'):
    """
    Validate and store the model betas and the settings used to summarize them

    :param betas: list(pd.DataFrame[G x K])
        Dataframes that must align with each other
    :param rescaled_betas: list(pd.DataFrame[G x K])
        Dataframes that must align with each other
    :param threshold: float
        Value between 0 and 1
    :param filter_method: str
        How to handle gold standard filtering ('overlap' filters to beta, 'keep_all_gold_standard' doesn't filter)
    """
    # Each argument is validated immediately before it is stored

    assert check.dataframes_align(betas)
    self.betas = betas

    assert check.dataframes_align(rescaled_betas)
    self.rescaled_betas = rescaled_betas

    assert check.argument_enum(filter_method, FILTER_METHODS)
    self.filter_method = filter_method

    assert check.argument_numeric(threshold, 0, 1)
    self.threshold = threshold
def test_numeric(self):
    """Check argument_numeric with numbers, a string, range limits, and an allowed None."""
    # Integers and floats are numeric
    for number in (0, 0.0):
        self.assertTrue(check.argument_numeric(number))

    # A numeric-looking string is not
    with self.assertRaises(ValueError):
        check.argument_numeric("0")

    # Inside the given limits passes
    self.assertTrue(check.argument_numeric(1, 0, 2))

    # Outside the given limits raises
    with self.assertRaises(ValueError):
        self.assertTrue(check.argument_numeric(2, 0, 1))

    # None is accepted when allow_none is set
    self.assertTrue(check.argument_numeric(None, allow_none=True))
def test_frame_alignment(self):
    """Check dataframes_align on lists built from the frame1, frame2 and frame3 fixtures."""
    first, second, third = self.frame1, self.frame2, self.frame3

    # Identical frames align
    self.assertTrue(check.dataframes_align([first, first, first]))

    # frame3 is accepted next to frame1 only when the order check is turned off
    self.assertTrue(check.dataframes_align([first, first, third], check_order=False))

    # With the default settings neither frame2 nor frame3 aligns with frame1
    for mismatched in (second, third):
        with self.assertRaises(ValueError):
            check.dataframes_align([first, mismatched, first])
def compute_combined_confidences(rankable_data, **kwargs):
    """
    Calculate combined confidences with the selected rank method

    :param rankable_data: list(pd.DataFrame) R x [M x N]
        List of dataframes which have the same axes and need to be rank summed
    :param kwargs:
        rank_method: one of "sum" (default), "threshold_sum", "max" or "geo_mean"
        data_threshold: only used by "threshold_sum" (default 0.9)
    :return combine_conf: pd.DataFrame [M x N]
    """
    rank_method = kwargs.pop("rank_method", "sum")
    assert check.argument_enum(rank_method, ("sum", "threshold_sum", "max", "geo_mean"))

    if rank_method == "sum":
        return RankSummaryPR.rank_sum(rankable_data)

    if rank_method == "threshold_sum":
        data_threshold = kwargs.pop("data_threshold", 0.9)
        return RankSummaryPR.rank_sum_threshold(rankable_data, data_threshold=data_threshold)

    if rank_method == "max":
        return RankSummaryPR.rank_max_value(rankable_data)

    if rank_method == "geo_mean":
        return RankSummaryPR.rank_geo_mean(rankable_data)

    # Not expected to be reached for a rank_method that passed the check above
    return None
def get_sample_index(self, meta_data=None, sample_ratio=None, sample_size=None,
                     min_size=default.DEFAULT_MINIMUM_SAMPLE_SIZE, stratified_sampling=None):
    """
    Produce an integer index to sample data using .iloc.

    With stratified sampling, each group in the self.stratified_batch_lookup column of the meta data is
    sampled separately and the per-group draws are joined together. Otherwise one draw is made over all
    data points. Whether draws are made with replacement is controlled by self.sample_with_replacement.

    :param meta_data: pd.DataFrame [N x ?]
        Data frame to sample from. When None, stratified sampling uses self.meta_data and unstratified
        sampling uses the number of columns of self.expression_matrix.
    :param sample_ratio: float
        Sample to this proportion of data points (per group when stratified), but never fewer than min_size
    :param sample_size: int
        Sample to this absolute number of data points; only used when sample_ratio is None.
        If sampling from each stratified group, this is the absolute number of data points PER GROUP (not total)
    :param min_size: int
        Lower bound on the number of data points drawn when sample_ratio is used
    :param stratified_sampling: bool
        Overrides self.stratified_sampling when it is not None
    :return new_idx: np.ndarray
        Integer positions of the sampled data points
    """
    # Sanity check inputs
    assert check.arguments_not_none((sample_ratio, sample_size), num_none=1)
    assert check.argument_numeric(sample_ratio, low=0, allow_none=True)
    assert check.argument_numeric(sample_size, low=0, allow_none=True)

    if stratified_sampling is None:
        stratified_sampling = self.stratified_sampling

    if not stratified_sampling:
        # Sample from the total expression matrix or the meta_data in one draw
        if meta_data is None:
            num_samples = self.expression_matrix.shape[1]
        else:
            num_samples = meta_data.shape[0]

        if sample_ratio is None:
            draw_size = sample_size
        else:
            draw_size = max(int(sample_ratio * num_samples), min_size)

        return np.random.choice(num_samples, size=draw_size, replace=self.sample_with_replacement)

    # Use the main meta_data if there's nothing given
    if meta_data is None:
        meta_data = self.meta_data

    # Copy and reindex the meta_data so that the index can be used with iloc
    meta_data = meta_data.copy()
    meta_data.index = pd.Index(range(meta_data.shape[0]))

    batch_column = self.stratified_batch_lookup
    sampled = np.ndarray(0, dtype=int)

    # For each factor in the batch column
    for batch in meta_data[batch_column].unique().tolist():
        # Get the integer index of the data points in this batch
        in_batch = meta_data[batch_column] == batch
        batch_positions = meta_data.loc[in_batch, :].index.tolist()

        # Decide how many to collect from this batch
        if sample_ratio is None:
            draw_size = sample_size
        else:
            draw_size = max(int(len(batch_positions) * sample_ratio), min_size)

        # Resample and append the new sample index to the index array
        batch_draw = np.random.choice(batch_positions, size=draw_size, replace=self.sample_with_replacement)
        sampled = np.append(sampled, batch_draw)

    return sampled
def save_network_to_tsv(pr_calc, priors, output_dir, confidence_threshold=0, output_file_name="network.tsv",
                        beta_threshold=None, extra_columns=None):
    """
    Create a network file and save it

    One row is written per edge produced by pr_calc.confidence_ordered_generator(), in the order the
    generator yields them. The column label is written as the regulator and the row label as the target.

    :param pr_calc: RankSummaryPR
        The rank-sum object with the math in it
    :param priors: pd.DataFrame [G x K]
        Prior data
    :param output_dir: str
        The directory to write the output file into. If None, don't save anything
    :param confidence_threshold: numeric
        The minimum confidence score (between 0 and 1) needed to write a network edge
    :param output_file_name: str
        The output file name. If None, don't save anything
    :param beta_threshold: pd.DataFrame [G x K]
        The thresholded betas to include in the network. If None, include everything.
    :param extra_columns: dict(col_name: pd.DataFrame [G x K])
        Any additional data to include, keyed by column name and indexable with row and column names
    :return:
        False if nothing was saved because output_dir or output_file_name is None; otherwise None
    """
    assert check.argument_type(pr_calc, RankSummaryPR)
    assert check.argument_type(priors, pd.DataFrame)
    assert check.argument_type(beta_threshold, pd.DataFrame, allow_none=True)
    assert check.argument_path(output_dir, allow_none=True)
    assert check.argument_type(output_file_name, str, allow_none=True)
    assert check.argument_numeric(confidence_threshold, 0, 1)

    if output_dir is None or output_file_name is None:
        return False

    # The extra columns are written in sorted name order; work that order out once, not once per edge
    extra_names = sorted(extra_columns.keys()) if extra_columns is not None else []

    header = ['regulator', 'target', 'combined_confidences', 'prior', 'gold.standard', 'precision', 'recall']
    header += extra_names
    output_list = [header]

    recall_data, precision_data = pr_calc.dataframe_recall_precision()
    gold_standard = pr_calc.gold_standard

    for row_name, column_name, conf in pr_calc.confidence_ordered_generator():
        if conf < confidence_threshold:
            continue

        # Label-based .loc replaces the .ix indexer, which no longer exists in current pandas
        if beta_threshold is not None and not beta_threshold.loc[row_name, column_name]:
            continue

        row_data = [column_name, row_name, conf]

        # Add prior value (or nan if the priors does not cover this interaction)
        if row_name in priors.index and column_name in priors.columns:
            row_data.append(priors.loc[row_name, column_name])
        else:
            row_data.append(np.nan)

        # Add gold standard, precision, and recall (or nan if the gold standard does not cover this interaction)
        if row_name in gold_standard.index and column_name in gold_standard.columns:
            row_data += [gold_standard.loc[row_name, column_name],
                         precision_data.loc[row_name, column_name],
                         recall_data.loc[row_name, column_name]]
        else:
            row_data += [np.nan, np.nan, np.nan]

        # Add each extra column value (or nan if that dataframe does not cover this interaction)
        for name in extra_names:
            extra_frame = extra_columns[name]
            if row_name in extra_frame.index and column_name in extra_frame.columns:
                row_data.append(extra_frame.loc[row_name, column_name])
            else:
                row_data.append(np.nan)

        output_list.append(row_data)

    with open(os.path.join(output_dir, output_file_name), 'w') as myfile:
        csv.writer(myfile, delimiter='\t').writerows(output_list)