Пример #1
0
def knn_bootstrap(emb_df, num_bootstrap=2, seed=SEED, percent_norm=False,
                  factor_analys=False):
  """Score repeated bootstrap resamples of an embedding dataframe.

  NumPy's global random state is seeded once up front. Each repetition then
  draws a bootstrap sample with `transform.get_bootstrap_sample`, runs it
  through `apply_post_processing`, drops rows via
  `transform.drop_unevaluated_comp`, averages over the MOA / compound /
  concentration / batch / treatment-group index levels and scores the
  resulting group means with `get_scores_from_means`.

  Args:
    emb_df (pandas dataframe): dataframe to resample (per the original
      documentation it includes controls).
    num_bootstrap (int): number of bootstrap repetitions.
    seed (int): value passed to `np.random.seed` before resampling.
    percent_norm (bool): forwarded to `apply_post_processing`; whether to
      apply percentile normalization.
    factor_analys (bool): forwarded to `apply_post_processing`; whether to
      apply factor analysis.

  Returns:
    dict: with two entries:
      "knn_scores": the result of `elementwise_stats` applied to the list of
        per-repetition "knn" scores.
      "clustering_scores": list holding the "clustering_score" of every
        repetition, in the order the repetitions were run.
  """
  np.random.seed(seed=seed)

  knn_per_rep = []
  clustering_per_rep = []
  for _ in range(num_bootstrap):
    resampled = transform.get_bootstrap_sample(emb_df)
    processed = apply_post_processing(
        resampled, percent_norm=percent_norm, factor_analys=factor_analys)
    evaluated = transform.drop_unevaluated_comp(processed)
    # Index levels that identify one group to average over.
    group_levels = [
        metadata.MOA,
        metadata.COMPOUND,
        metadata.CONCENTRATION,
        metadata.BATCH,
        metadata.TREATMENT_GROUP,
    ]
    group_means = evaluated.groupby(level=group_levels).mean()
    rep_scores = get_scores_from_means(
        group_means, report_confusion_matrix=False)
    knn_per_rep.append(rep_scores["knn"])
    clustering_per_rep.append(rep_scores["clustering_score"])

  # Only the knn scores are summarised; clustering scores are returned raw.
  return {
      "knn_scores": elementwise_stats(knn_per_rep),
      "clustering_scores": clustering_per_rep,
  }
Пример #2
0
def cross_val_knn_bootstrap(emb_df, contents, steps, list_of_comp_set,
                            num_bootstrap=2, seed=SEED,
                            percent_norm=False,
                            factor_analys=False):
  """Run `cross_val_train` on repeated bootstrap resamples of `emb_df`.

  NumPy's global random state is seeded once, then every repetition draws a
  fresh bootstrap sample with `transform.get_bootstrap_sample` and hands it
  to `cross_val_train` with confusion-matrix reporting switched off.

  Args:
    emb_df (pandas dataframe): dataframe to resample.
    contents (dict): contents from the Wasserstein training routine;
      forwarded unchanged to `cross_val_train`.
    steps: forwarded unchanged to `cross_val_train`. NOTE(review): the
      original documentation describes this as the timestep(s) at which to
      evaluate but is ambiguous about int vs. list -- confirm against
      `cross_val_train`.
    list_of_comp_set (list): each element is a dict for cross-validation;
      its length is also passed to `cross_val_train`.
    num_bootstrap (int): number of bootstrap repetitions.
    seed (int): value passed to `np.random.seed` before resampling.
    percent_norm (bool): whether to apply percentile normalization.
    factor_analys (bool): whether to apply factor analysis.

  Returns:
    list: one `cross_val_train` result per bootstrap repetition, in the
      order the repetitions were run.
  """
  num_comp_sets = len(list_of_comp_set)
  np.random.seed(seed=seed)

  # Keyword options shared by every cross-validation call.
  train_options = {
      "report_confusion_matrix": False,
      "percent_norm": percent_norm,
      "factor_analys": factor_analys,
  }

  def _score_one_resample():
    resampled = transform.get_bootstrap_sample(emb_df)
    return cross_val_train(resampled, contents, steps, list_of_comp_set,
                           num_comp_sets, **train_options)

  return [_score_one_resample() for _ in range(num_bootstrap)]
 def testGetBootstrapSampleRun(self):
     """Check that a bootstrap sample has the same shape as `self.data`."""
     resampled = transform.get_bootstrap_sample(self.data)
     expected_shape = self.data.shape
     actual_shape = resampled.shape
     self.assertTupleEqual(expected_shape, actual_shape)