def testMatrix(self):
        """Check distance.matrix against hand-built 2x2 expected frames.

        Exercises the 'cosine' and 'euclidean' metric names and a metric
        given as a callable.  self.dcos and self.deuc provide the expected
        off-diagonal values and self.m is the input handed to
        distance.matrix (all presumably prepared by the fixture's setUp).
        """
        def expected_frame(off_diagonal):
            # Symmetric frame labelled 'a'/'b' with zeros on the diagonal.
            return pd.DataFrame(
                {
                    'a': [0.0, off_diagonal],
                    'b': [off_diagonal, 0.0],
                },
                index=['a', 'b'])

        want_cosine = expected_frame(self.dcos)
        got_cosine = distance.matrix('cosine', self.m)
        pandas_testing.assert_frame_equal(want_cosine, got_cosine)

        want_euclidean = expected_frame(self.deuc)
        got_euclidean = distance.matrix('euclidean', self.m)
        pandas_testing.assert_frame_equal(want_euclidean, got_euclidean)

        # A callable metric is compared against the same euclidean frame.
        got_callable = distance.matrix(
            lambda v1, v2: np.sqrt((v2 - v1).dot(v2 - v1)), self.m)
        pandas_testing.assert_frame_equal(want_euclidean, got_callable)
def make_knn_moa_dataframe(means, max_k=4):
    """Tabulate k-NN MOA classification results for k = 1..max_k.

    A distance matrix is built once from `means` with distance.cosine.  For
    every k, k_nearest_neighbors is run twice on that matrix: once with
    not_same_compound_filter and once with
    not_same_compound_or_batch_filter.  The sizes of the two collections
    each call returns (correct, mismatch) are recorded together with the
    resulting accuracy.

    Args:
      means: Pandas dataframe of aggregated (e.g., batch-level averaged)
        embedding vectors.
      max_k: (optional) Largest number of neighbors to evaluate.

    Returns:
      A Pandas dataframe with one row per k (first row is k=1) and the
      columns CORRECT_NSC, MISMATCH_NSC, ACCURACY_NSC, CORRECT_NSC_NSB,
      MISMATCH_NSC_NSB and ACCURACY_NSC_NSB.  Accuracies are percentages
      rounded to one decimal place.

    Raises:
      ZeroDivisionError: if, for some k, a filter yields neither a correct
        nor a mismatched entry.
    """
    dist = distance_analysis.matrix(distance.cosine, means)

    def percent_correct(num_correct, num_mismatch):
        # Share of correct classifications, in percent with one decimal.
        return round(
            100.0 * num_correct / (num_correct + num_mismatch), 1)

    nsc_hits, nsc_misses, nsc_accuracy = [], [], []
    nsc_nsb_hits, nsc_nsb_misses, nsc_nsb_accuracy = [], [], []

    for k in range(1, max_k + 1):
        # Run both filtered searches before any bookkeeping.
        matched, unmatched = k_nearest_neighbors(
            dist, k, not_same_compound_filter)
        matched_nsb, unmatched_nsb = k_nearest_neighbors(
            dist, k, not_same_compound_or_batch_filter)

        num_matched, num_unmatched = len(matched), len(unmatched)
        nsc_hits.append(num_matched)
        nsc_misses.append(num_unmatched)
        nsc_accuracy.append(percent_correct(num_matched, num_unmatched))

        num_matched_nsb = len(matched_nsb)
        num_unmatched_nsb = len(unmatched_nsb)
        nsc_nsb_hits.append(num_matched_nsb)
        nsc_nsb_misses.append(num_unmatched_nsb)
        nsc_nsb_accuracy.append(
            percent_correct(num_matched_nsb, num_unmatched_nsb))

    return pd.DataFrame(data={
        CORRECT_NSC: nsc_hits,
        MISMATCH_NSC: nsc_misses,
        ACCURACY_NSC: nsc_accuracy,
        CORRECT_NSC_NSB: nsc_nsb_hits,
        MISMATCH_NSC_NSB: nsc_nsb_misses,
        ACCURACY_NSC_NSB: nsc_nsb_accuracy,
    })
示例#3
0
def get_scores_from_means(means,
                          report_knn=True,
                          report_confusion_matrix=True):
    """Compute a clustering score and optional k-NN / confusion reports.

    A distance matrix is built from `means` with distance.cosine and scored
    with metrics.silhouette_score (metric="precomputed"), using the
    metadata.MOA index level of `means` as labels.

    Args:
      means (pandas dataframe): means for each treatment.
      report_knn (boolean): whether or not to compute KNN scores.
      report_confusion_matrix (boolean): whether or not to include confusion
        matrices.

    Returns:
      dict with the following keys:
        "clustering_score": value returned by metrics.silhouette_score.
        "knn": only if report_knn; evaluate.make_knn_moa_dataframe(means)
          converted with to_dict().
        "confusion_matrix": only if report_confusion_matrix; a dict with
          keys "nsc" and "nscb", each mapping k = 1..4 to the result of
          confusion_matrix_from_dist for
          evaluate.not_same_compound_filter and
          evaluate.not_same_compound_or_batch_filter respectively.
    """
    moa_name_index = get_index_for_name(means, "moa")
    dist = distance_analysis.matrix(distance.cosine, means)
    moa_labels = means.index.get_level_values(level=metadata.MOA)
    output_dict = {
        "clustering_score": metrics.silhouette_score(
            dist, labels=moa_labels, metric="precomputed"),
    }

    if report_knn:
        output_dict["knn"] = evaluate.make_knn_moa_dataframe(means).to_dict()

    if report_confusion_matrix:
        filters_by_key = (
            ("nsc", evaluate.not_same_compound_filter),
            ("nscb", evaluate.not_same_compound_or_batch_filter),
        )
        matrices_by_key = {"nsc": {}, "nscb": {}}
        for k in range(1, 5):
            for key, neighbor_filter in filters_by_key:
                matrices_by_key[key][k] = confusion_matrix_from_dist(
                    dist, k, neighbor_filter,
                    dist.index.levels[moa_name_index])
        output_dict["confusion_matrix"] = matrices_by_key

    return output_dict
def cross_val_train(emb_df_clean, contents, steps, list_of_comp_set, n_comp,
                    report_confusion_matrix=True, percent_norm=False,
                    factor_analys=False):
  """Leave-one-compound-out cross validation of the stopping time.

  For each of the first n_comp entries of list_of_comp_set, the best time
  step is chosen with find_time_step_max on the embeddings restricted to
  comp_set["a"].  A cosine distance matrix is then built at that time step
  from emb_df_clean (unevaluated compounds dropped afterwards), and k-NN
  statistics for k = 1..4 are accumulated through
  update_stats_new_compound.  Distance matrices are cached per time step,
  so a time step selected for several compounds is only transformed once.

  Args:
    emb_df_clean (pandas dataframe): embeddings WITH unevaluated compounds.
    contents (dict): Contents from Wasserstein training routine
    steps (list): Steps for training
    list_of_comp_set (list): dictionaries for each compound for leave-one-out
    n_comp (int): number of compounds
    report_confusion_matrix (bool): whether or not to include confusion matrix.
    percent_norm (bool): whether to apply percentile normalization
    factor_analys (bool): whether to apply factor analysis

  Returns:
    list_of_time_step_max (list): best stopping time for each compound
    cross_validated_scores (dict): "acc_nsc" and "acc_nscb" as returned by
      calculate_moa_accuracy; when report_confusion_matrix is set, also
      "confusion_matrices_nsc" and "confusion_matrices_nscb", each mapping
      k = 1..4 to a (num_moa, num_moa) array.

  Raises:
    ValueError: if "treatment_group" is not among the index names of the
      training embeddings.
  """
  emb_df_valid = transform.drop_unevaluated_comp(emb_df_clean)
  moa_level = emb_df_valid.index.get_level_values(level=metadata.MOA)
  match_metadata_values = sorted(moa_level.unique())
  num_moa = len(match_metadata_values)

  knn_sizes = range(1, 5)  # k-NN up to k=4

  # Accumulators filled by update_stats_new_compound, one pair per filter.
  hits_nsc = collections.defaultdict(list)
  misses_nsc = collections.defaultdict(list)
  hits_nscb = collections.defaultdict(list)
  misses_nscb = collections.defaultdict(list)

  if report_confusion_matrix:
    confusion_matrices_nsc = collections.defaultdict(list)
    confusion_matrices_nscb = collections.defaultdict(list)
    for matrices in (confusion_matrices_nsc, confusion_matrices_nscb):
      for k in knn_sizes:
        matrices[k] = np.zeros((num_moa, num_moa))
  else:
    confusion_matrices_nsc = confusion_matrices_nscb = None

  evaluations = (
      (evaluate.not_same_compound_filter,
       hits_nsc, misses_nsc, confusion_matrices_nsc),
      (evaluate.not_same_compound_or_batch_filter,
       hits_nscb, misses_nscb, confusion_matrices_nscb),
  )

  compounds_valid = emb_df_valid.index.get_level_values(
      level=metadata.COMPOUND)
  best_time_steps = []
  dist_by_time_step = {}

  for i in range(n_comp):
    print("cross-validation for compound %s" % i)
    comp_set = list_of_comp_set[i]

    # Training frame: rows whose compound is listed in comp_set["a"],
    # i.e. everything but the left-out compound.
    train_mask = compounds_valid.isin(comp_set["a"])
    emb_df_train = emb_df_valid[train_mask]
    if "treatment_group" not in emb_df_train.index.names:
      raise ValueError("Must have treatment_group in embeddings index names.")

    # Selecting the best step evaluates every time step, which makes this
    # call the dominant cost of each iteration.
    time_step_max = find_time_step_max(emb_df_train, contents, steps)
    best_time_steps.append(time_step_max)

    # Build the cosine distance matrix for this time step only once.
    if time_step_max not in dist_by_time_step:
      means = transform_and_means(contents, emb_df_clean, time_step_max,
                                  percent_norm=percent_norm,
                                  factor_analys=factor_analys)
      dist_by_time_step[time_step_max] = distance_analysis.matrix(
          distance.cosine, transform.drop_unevaluated_comp(means))
    dist = dist_by_time_step[time_step_max]

    for k in knn_sizes:
      for neighbor_filter, hits, misses, matrices in evaluations:
        update_stats_new_compound(comp_set, dist, k,
                                  neighbor_filter,
                                  hits, misses,
                                  match_metadata_values,
                                  matrices)

  # Turn the accumulated correct/mismatch records into accuracies.
  cross_validated_scores = {
      "acc_nsc": calculate_moa_accuracy(hits_nsc, misses_nsc),
      "acc_nscb": calculate_moa_accuracy(hits_nscb, misses_nscb),
  }
  if report_confusion_matrix:
    cross_validated_scores["confusion_matrices_nsc"] = confusion_matrices_nsc
    cross_validated_scores["confusion_matrices_nscb"] = confusion_matrices_nscb

  return best_time_steps, cross_validated_scores