Example #1
def test_coral(emb_df_clean, num_bootstrap=2, percent_norm=False,
               factor_analys=False):
  """test set (Mike's CORAL).

  Args:
    emb_df_clean (pandas dataframe): input dataframe
    num_bootstrap (int): number of bootstrap samples to use
    percent_norm (bool): whether to apply percentile normalization
    factor_analys (bool): whether to apply factor analysis

  Returns:
    return_dict (dict): contains batch_classification_scores, bootstrap_scores,
      and the MOA scores merged in from get_scores_from_means.
  """
  emb_df_test_coral_mike = transform.coral_without_mean_shift_batch(
      emb_df_clean)
  emb_df_post = apply_post_processing(emb_df_test_coral_mike,
                                      percent_norm, factor_analys)
  means = transform.drop_unevaluated_comp(emb_df_post).groupby(level=[
      metadata.MOA, metadata.COMPOUND, metadata.CONCENTRATION, metadata.BATCH,
      metadata.TREATMENT_GROUP
  ]).mean()
  batch_classification_scores = get_batch_classification_scores(
      emb_df_post)
  moa_scores = get_scores_from_means(means)
  bootstrap_scores = knn_bootstrap(emb_df_test_coral_mike,
                                   num_bootstrap=num_bootstrap,
                                   percent_norm=percent_norm,
                                   factor_analys=factor_analys)
  return_dict = {
      "batch_classification_scores": batch_classification_scores,
      "bootstrap_scores": bootstrap_scores
  }
  return_dict.update(moa_scores)
  return return_dict
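
The groupby(level=[...]).mean() call above collapses replicate embeddings to one mean vector per (MOA, compound, concentration, batch, treatment group) combination. The following is a minimal, self-contained illustration of that pattern on a toy MultiIndex dataframe; the level names are stand-ins for the metadata constants used in the examples.

import pandas as pd

# Toy embeddings: two replicates per (moa, compound, batch) combination.
index = pd.MultiIndex.from_tuples(
    [("moa_a", "comp_1", "batch_1"), ("moa_a", "comp_1", "batch_1"),
     ("moa_b", "comp_2", "batch_1"), ("moa_b", "comp_2", "batch_1")],
    names=["moa", "compound", "batch"])
emb = pd.DataFrame({"dim_0": [1.0, 3.0, 0.0, 2.0],
                    "dim_1": [2.0, 4.0, 1.0, 3.0]}, index=index)

# Collapse replicates to per-treatment mean profiles, as in test_coral above.
means = emb.groupby(level=["moa", "compound", "batch"]).mean()
print(means)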
Example #2
def knn_bootstrap(emb_df, num_bootstrap=2, seed=SEED, percent_norm=False,
                  factor_analys=False):
  """Generate bootstrap statistics.

  Args:
    emb_df (pandas dataframe): dataframe to use (includes controls)
    num_bootstrap (int): number of bootstrap reps
    seed (int): which seed value to use for bootstrapping
    percent_norm (bool): whether to apply percentile normalization
    factor_analys (bool): whether to apply factor analysis
  Returns:
    knn_return (dict): "knn_scores" holds the elementwise mean and standard
      deviation across bootstrap replicates for each overlapping category and
      value of k; "clustering_scores" holds the per-replicate clustering
      scores.
  """
  boot_knns = []
  boot_clustering_scores = []
  np.random.seed(seed=seed)
  for _ in range(num_bootstrap):
    boot_emb = transform.get_bootstrap_sample(emb_df)
    boot_post_proc = apply_post_processing(boot_emb,
                                           percent_norm=percent_norm,
                                           factor_analys=factor_analys)
    boot_means = transform.drop_unevaluated_comp(boot_post_proc).groupby(level=[
        metadata.MOA, metadata.COMPOUND,
        metadata.CONCENTRATION, metadata.BATCH,
        metadata.TREATMENT_GROUP]).mean()
    scores = get_scores_from_means(boot_means, report_confusion_matrix=False)
    boot_knns.append(scores["knn"])
    boot_clustering_scores.append(scores["clustering_score"])

  knn_return = {"knn_scores": elementwise_stats(boot_knns),
                "clustering_scores": boot_clustering_scores}
  return knn_return
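
elementwise_stats is referenced above but not shown in these examples. Below is a plausible minimal sketch, assuming each bootstrap replicate is a dict mapping a (category, k) key to a float score; the real structure in the codebase may differ.

import numpy as np

def elementwise_stats(replicates):
  """Mean and standard deviation for each key across bootstrap replicates.

  Sketch only: assumes every replicate is a dict with the same keys and
  float values.
  """
  keys = replicates[0].keys()
  return {key: {"mean": np.mean([rep[key] for rep in replicates]),
                "std": np.std([rep[key] for rep in replicates])}
          for key in keys}

# Example: two bootstrap replicates of k-NN accuracies.
boot_knns = [{("nsc", 1): 0.9, ("nscb", 1): 0.8},
             {("nsc", 1): 0.7, ("nscb", 1): 0.6}]
print(elementwise_stats(boot_knns))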
Example #3
def transform_and_means(contents,
                        emb_df,
                        step,
                        linear=True,
                        percent_norm=False,
                        factor_analys=False,
                        drop_controls=True):
    """Apply Wasserstein and take means.

  This functions takes the Wasserstein transform and then means across desired
  categories. It takes advantage when the transformation is linear to
  significantly speed things up, by taking the means first and then applying the
  transformation.

  If we apply the percentile normalization or factor analysis, we cannot do the
  means-first trick.

  Args:
    contents (dict): Transformation file.
    emb_df (pandas dataframe): Dataframe to transform and take means of.
    step (int): training step at which to take the Wasserstein transform.
    linear (bool): Whether or not transformation is linear.
    percent_norm (bool): whether to apply percentile normalization
    factor_analys (bool): whether to apply factor analysis
    drop_controls (bool): whether or not to drop controls

  Returns:
    transformed_means (pandas dataframe): dataframe after transform and
      taking the mean.
  """
    if "treatment_group" not in emb_df.index.names:
        raise ValueError(
            "Must have treatment_group in embeddings index names.")
    if linear and not percent_norm and not factor_analys:
        means = emb_df.groupby(level=[
            metadata.MOA, metadata.COMPOUND, metadata.CONCENTRATION,
            metadata.BATCH, metadata.TREATMENT_GROUP
        ]).mean()
        transformed_means = wasserstein_transform(contents, means, step)

    else:
        emb_df_trans = wasserstein_transform(contents, emb_df, step)
        df_post_processed = apply_post_processing(emb_df_trans, percent_norm,
                                                  factor_analys)
        transformed_means = df_post_processed.groupby(level=[
            metadata.MOA, metadata.COMPOUND, metadata.CONCENTRATION,
            metadata.BATCH, metadata.TREATMENT_GROUP
        ]).mean()
    if drop_controls:
        return transform.drop_unevaluated_comp(transformed_means)
    else:
        return transformed_means
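
The means-first trick in transform_and_means relies on the fact that a linear map commutes with averaging: mean(A @ x_i) equals A @ mean(x_i). A small numerical check of that identity, independent of the Wasserstein code:

import numpy as np

rng = np.random.default_rng(0)
embeddings = rng.normal(size=(5, 3))   # five replicate embeddings
linear_map = rng.normal(size=(3, 3))   # stand-in for a linear Wasserstein transform

transform_then_mean = (embeddings @ linear_map.T).mean(axis=0)
mean_then_transform = linear_map @ embeddings.mean(axis=0)

# The two orders agree (up to floating-point error), so taking means first is valid.
assert np.allclose(transform_then_mean, mean_then_transform)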
Example #4
def main(argv):
  del argv

  emb_df_clean = io_utils.read_dataframe_from_hdf5(FLAGS.input_df)
  if "treatment_group" not in emb_df_clean.index.names:
    raise ValueError("Must have treatment_group in embeddings index names.")
  contents = load_contents(FLAGS.transformation_file)

  ## dictionary to save things
  save_dict = {}

  ## Get steps over training
  steps = list(contents.keys())
  steps.remove("params")
  steps = np.sort(steps)

  ## Truncate list of steps
  steps = steps[:FLAGS.num_steps]

  ## embeddings without unevaluated compound
  emb_df_valid = transform.drop_unevaluated_comp(emb_df_clean)
  if "treatment_group" not in emb_df_valid.index.names:
    raise ValueError("Must have treatment_group in embeddings index names.")

  ## list of compounds and number of compounds
  comp_list = emb_df_valid.index.get_level_values(
      level=metadata.COMPOUND).unique()
  n_comp = len(comp_list)

  ## Set up data structure for leave-one-out cross validation
  list_of_comp_set = []
  for i in range(n_comp):
    comp_set = {}
    comp_set["b"] = comp_list[i]
    comp_set["a"] = list(set(comp_list).difference([comp_list[i]]))
    list_of_comp_set.append(comp_set)

  ## Cross validation training with leave-one-out and variable stopping time.
  (steps_max, cross_validated_scores) = cross_val_train(
      emb_df_clean, contents, steps, list_of_comp_set, n_comp,
      percent_norm=FLAGS.percentile_normalize,
      factor_analys=FLAGS.factor_analysis)

  ## Keep the steps between the smallest and largest stopping times, for bootstraps
  boot_steps = [steps[i] for i, v in enumerate(steps) if
                np.max(steps_max) >= v >= np.min(steps_max)]

  metrics_dict = evaluate_metrics(contents, emb_df_clean, steps_max,
                                  boot_steps, list_of_comp_set,
                                  num_bootstrap=FLAGS.num_bootstrap,
                                  percent_norm=FLAGS.percentile_normalize,
                                  factor_analys=FLAGS.factor_analysis)
  save_dict["metrics_dict"] = metrics_dict

  ## time steps where max cross validation results were found
  save_dict["list_of_time_step_max"] = steps_max

  ## accuracy for not same compound or batch, obtained at time_step_max
  ## for each individual compound.
  save_dict["metrics_dict"]["wdn"]["cross_val_scores"] = cross_validated_scores

  with gfile.GFile(FLAGS.output_file, mode="w") as f:
    f.write(pickle.dumps(save_dict))
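
The boot_steps expression in main keeps every training step that falls between the smallest and largest per-compound stopping time. A small worked example of that filter (the step values here are hypothetical):

import numpy as np

steps = [1000, 2000, 3000, 4000, 5000]
steps_max = [2000, 4000, 3000]  # hypothetical per-compound stopping times

boot_steps = [steps[i] for i, v in enumerate(steps)
              if np.max(steps_max) >= v >= np.min(steps_max)]
print(boot_steps)  # [2000, 3000, 4000]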
Example #5
def test_wdn(emb_df_clean, contents, list_of_time_step_max, steps,
             list_of_comp_set, num_bootstrap=2, percent_norm=False,
             factor_analys=False):
  """test set (WDN).

  Args:
    emb_df_clean (pandas dataframe): input dataframe
    contents (dict): Contents from Wasserstein training routine
    list_of_time_step_max (list): timesteps at which to evaluate WDN
      statistics. For example, each entry could be the time step where the
      average nsc and nscb for k=1...4 is maximized for a given compound in
      the cross-validation procedure.
    steps (list): all timesteps from analysis, used for bootstrapping.
    list_of_comp_set (list): each element is a dict for cross-validation.
    num_bootstrap (int): number of bootstrap samples to use
    percent_norm (bool): whether to apply percentile normalization
    factor_analys (bool): whether to apply factor analysis

  Returns:
    return_dict (dict): contains batch_classification_scores,
      knn_bootstrap_scores, and clustering_scores.
  """

  batch_classification_scores = {}
  clustering_scores = {}

  ##  We do not do cross validation for batch classification and Silhouette
  ##  scores. For the BBBC021 dataset, batch classification only applies to
  ##  controls, so the results of leave-one-out cross validation are the same as
  ##  taking the weighted average/standard deviation across left-out compounds.
  ##  For the Silhouette score, it is possible to do leave-one-out cross
  ##  validation, but then we would also have to do it for TVN and CORAL for
  ##  each left-out compound as well.
  unique_time_step_max = list(set(list_of_time_step_max))
  for time_step_max in unique_time_step_max:

    ## We need both the transformed embeddings as well as the means, so we do
    ## not use transform_and_means here.
    emb_df_trans = wasserstein_transform(contents, emb_df_clean, time_step_max)
    df_post_processed = apply_post_processing(emb_df_trans, percent_norm,
                                              factor_analys)
    means = transform.drop_unevaluated_comp(df_post_processed.groupby(level=[
        metadata.MOA, metadata.COMPOUND, metadata.CONCENTRATION, metadata.BATCH,
        metadata.TREATMENT_GROUP
    ]).mean())

    batch_class_at_time = get_batch_classification_scores(df_post_processed)
    batch_classification_scores[time_step_max] = batch_class_at_time
    moa_at_time = get_scores_from_means(means, report_knn=False,
                                        report_confusion_matrix=False)
    clustering_score = moa_at_time["clustering_score"]
    clustering_scores[time_step_max] = clustering_score

  knn_bootstrap_scores = cross_val_knn_bootstrap(emb_df_clean, contents, steps,
                                                 list_of_comp_set,
                                                 num_bootstrap=num_bootstrap,
                                                 percent_norm=percent_norm,
                                                 factor_analys=factor_analys)
  return_dict = {
      "batch_classification_scores": batch_classification_scores,
      "knn_bootstrap_scores": knn_bootstrap_scores,
      "clustering_scores": clustering_scores
  }
  return return_dict
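
test_wdn returns clustering_scores keyed by the unique stopping times. If a single summary number is wanted, one option (an assumption, not something the original code does) is to weight each time step by how often it was selected in list_of_time_step_max:

import numpy as np

def weighted_clustering_score(clustering_scores, list_of_time_step_max):
  """Average per-time-step clustering scores, weighted by selection frequency."""
  weights = [list_of_time_step_max.count(t) for t in clustering_scores]
  values = [clustering_scores[t] for t in clustering_scores]
  return np.average(values, weights=weights)

# Hypothetical values: two compounds stopped at step 2000, one at step 4000.
print(weighted_clustering_score({2000: 0.30, 4000: 0.25}, [2000, 2000, 4000]))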
Example #6
def cross_val_train(emb_df_clean, contents, steps, list_of_comp_set, n_comp,
                    report_confusion_matrix=True, percent_norm=False,
                    factor_analys=False):
  """Cross validation to find stopping time with each left-one-out compound.

  Args:
    emb_df_clean (pandas dataframe): embeddings WITH unevaluated compounds.
    contents (dict): Contents from Wasserstein training routine
    steps (list): Steps for training
    list_of_comp_set (list): dictionaries for each compound for leave-one-out
    n_comp (int): number of compounds
    report_confusion_matrix (bool): whether or not to include confusion matrix.
    percent_norm (bool): whether to apply percentile normalization
    factor_analys (bool): whether to apply factor analysis

  Returns:
    list_of_time_step_max (list): best stopping time for each compound
    cross_validated_scores (dict): Contains cross-validated accuracy scores and
      confusion matrices.

  """
  list_of_time_step_max = []
  correct_nsc = collections.defaultdict(list)
  mismatch_nsc = collections.defaultdict(list)

  correct_nscb = collections.defaultdict(list)
  mismatch_nscb = collections.defaultdict(list)

  emb_df_valid = transform.drop_unevaluated_comp(emb_df_clean)
  match_metadata_values = sorted(emb_df_valid.index.get_level_values(
      level=metadata.MOA).unique())
  num_moa = len(match_metadata_values)

  if report_confusion_matrix:
    confusion_matrices_nsc = collections.defaultdict(list)
    confusion_matrices_nscb = collections.defaultdict(list)
    for k in range(1, 5):
      confusion_matrices_nsc[k] = np.zeros((num_moa, num_moa))
      confusion_matrices_nscb[k] = np.zeros((num_moa, num_moa))
  else:
    confusion_matrices_nsc = None
    confusion_matrices_nscb = None

  dist_at_time = {}

  all_compounds_valid = emb_df_valid.index.get_level_values(
      level=metadata.COMPOUND)
  for i in range(n_comp):

    print("cross-validation for compound %s" %i)

    comp_set = list_of_comp_set[i]

    ## dataframe excluding the left-out compound
    emb_df_train = emb_df_valid[all_compounds_valid.isin(comp_set["a"])]
    if "treatment_group" not in emb_df_train.index.names:
      raise ValueError("Must have treatment_group in embeddings index names.")

    ## best time step for a given left-out compound
    ## In terms of speed, this is a significant bottleneck,
    ## since it has to evaluate at all timesteps.

    time_step_max = find_time_step_max(emb_df_train, contents, steps)
    # time_step_max = 20000  ## Used for testing purposes
    list_of_time_step_max.append(time_step_max)

    if time_step_max in dist_at_time:
      ## Cache dist matrix at given time.
      dist = dist_at_time[time_step_max]
    else:
      ## find cosine distances given left-out compound at time_step_max
      means = transform_and_means(contents, emb_df_clean, time_step_max,
                                  percent_norm=percent_norm,
                                  factor_analys=factor_analys)
      means_valid = transform.drop_unevaluated_comp(means)
      dist = distance_analysis.matrix(distance.cosine, means_valid)
      dist_at_time[time_step_max] = dist

    # k-NN up to k=4
    for k in range(1, 5):
      update_stats_new_compound(comp_set, dist, k,
                                evaluate.not_same_compound_filter,
                                correct_nsc, mismatch_nsc,
                                match_metadata_values,
                                confusion_matrices_nsc)

      update_stats_new_compound(comp_set, dist, k,
                                evaluate.not_same_compound_or_batch_filter,
                                correct_nscb, mismatch_nscb,
                                match_metadata_values,
                                confusion_matrices_nscb)

  ## Obtain accuracies from the correct and mismatched counts, for the cross-validated scores.
  acc_nsc = calculate_moa_accuracy(correct_nsc, mismatch_nsc)
  acc_nscb = calculate_moa_accuracy(correct_nscb, mismatch_nscb)

  cross_validated_scores = {
      "acc_nsc": acc_nsc,
      "acc_nscb": acc_nscb
  }

  if report_confusion_matrix:
    cross_validated_scores.update({
        "confusion_matrices_nsc": confusion_matrices_nsc,
        "confusion_matrices_nscb": confusion_matrices_nscb
    })
  return (list_of_time_step_max, cross_validated_scores)
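
calculate_moa_accuracy is used above but not shown in these examples. Below is a minimal sketch of what it might compute, assuming correct and mismatch are defaultdicts mapping each k to lists of per-compound counts; the real function may differ.

import collections

def calculate_moa_accuracy(correct, mismatch):
  """Accuracy per k from correct/mismatched counts accumulated over compounds.

  Sketch only: assumes correct[k] and mismatch[k] are lists of counts.
  """
  accuracy = {}
  for k in correct:
    num_correct = sum(correct[k])
    num_mismatch = sum(mismatch[k])
    accuracy[k] = num_correct / (num_correct + num_mismatch)
  return accuracy

# Hypothetical counts for k = 1 and k = 2.
correct = collections.defaultdict(list, {1: [3, 4], 2: [2, 3]})
mismatch = collections.defaultdict(list, {1: [1, 0], 2: [2, 1]})
print(calculate_moa_accuracy(correct, mismatch))  # {1: 0.875, 2: 0.625}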
Example #7
  def testDropUnevaluatedComp(self):
    pandas_testing.assert_frame_equal(
        pd.concat([self.pos_controls, self.experimental]),
        transform.drop_unevaluated_comp(self.data))