def testMatrix(self):
    """Check distance.matrix with named metrics and with a callable metric.

    The expected result for the two-row fixture ``self.m`` is a 2x2 frame
    labelled 'a'/'b' with zeros on the diagonal and a single distance
    (``self.dcos`` or ``self.deuc``) in both off-diagonal cells.
    """
    labels = ['a', 'b']

    def pairwise_frame(off_diagonal):
        # Zero diagonal, the same distance in both off-diagonal cells.
        return pd.DataFrame(
            {'a': [0.0, off_diagonal], 'b': [off_diagonal, 0.0]},
            index=labels)

    def euclidean(v1, v2):
        # Hand-written Euclidean distance passed as a callable metric.
        delta = v2 - v1
        return np.sqrt(delta.dot(delta))

    cosine_expected = pairwise_frame(self.dcos)
    pandas_testing.assert_frame_equal(
        cosine_expected, distance.matrix('cosine', self.m))

    euclidean_expected = pairwise_frame(self.deuc)
    pandas_testing.assert_frame_equal(
        euclidean_expected, distance.matrix('euclidean', self.m))

    # The callable metric is checked against the same Euclidean expectation.
    pandas_testing.assert_frame_equal(
        euclidean_expected, distance.matrix(euclidean, self.m))
def make_knn_moa_dataframe(means, max_k=4):
    """Tabulate k-NN MOA classification results for k = 1 .. max_k.

    A cosine distance matrix is built from ``means`` and, for every k, the
    nearest-neighbor classification is run twice: once with the
    not-same-compound filter and once with the not-same-compound-or-batch
    filter.

    Args:
      means: Pandas dataframe of aggregated (e.g., batch-level averaged)
        embedding vectors.
      max_k: (optional) Largest number of neighbors to evaluate.

    Returns:
      A Pandas dataframe with one row per k and six columns: the number of
      correct predictions, the number of mismatches, and the accuracy in
      percent (rounded to one decimal) for each of the two filters.
    """

    def summarize(hits, misses):
        # (#correct, #mismatch, accuracy in percent rounded to one decimal).
        n_hits, n_misses = len(hits), len(misses)
        return n_hits, n_misses, round(100.0 * n_hits / (n_hits + n_misses), 1)

    def column(rows, position):
        # Pull one field out of every per-k summary tuple.
        return [row[position] for row in rows]

    cosine_dist = distance_analysis.matrix(distance.cosine, means)

    nsc_rows, nsc_nsb_rows = [], []
    for k in range(1, max_k + 1):
        nsc_hits, nsc_misses = k_nearest_neighbors(
            cosine_dist, k, not_same_compound_filter)
        nsb_hits, nsb_misses = k_nearest_neighbors(
            cosine_dist, k, not_same_compound_or_batch_filter)
        nsc_rows.append(summarize(nsc_hits, nsc_misses))
        nsc_nsb_rows.append(summarize(nsb_hits, nsb_misses))

    return pd.DataFrame(data={
        CORRECT_NSC: column(nsc_rows, 0),
        MISMATCH_NSC: column(nsc_rows, 1),
        ACCURACY_NSC: column(nsc_rows, 2),
        CORRECT_NSC_NSB: column(nsc_nsb_rows, 0),
        MISMATCH_NSC_NSB: column(nsc_nsb_rows, 1),
        ACCURACY_NSC_NSB: column(nsc_nsb_rows, 2),
    })
def get_scores_from_means(means,
                          report_knn=True,
                          report_confusion_matrix=True):
    """Compute the clustering score and, optionally, k-NN scores and confusion matrices.

    Args:
      means (pandas dataframe): means for each treatment.
      report_knn (boolean): whether or not to compute KNN scores.
      report_confusion_matrix (boolean): whether or not to include confusion
        matrices.

    Returns:
      dict with the following keys:
        "clustering_score": silhouette score of the cosine distance matrix,
          using the MOA index level as cluster labels.
        "knn": (only if report_knn) the k-NN accuracy dataframe converted to
          a dict.
        "confusion_matrix": (only if report_confusion_matrix) dict with keys
          "nsc" and "nscb", each mapping k = 1..4 to a confusion matrix.
    """
    moa_name_index = get_index_for_name(means, "moa")
    dist = distance_analysis.matrix(distance.cosine, means)

    moa_labels = means.index.get_level_values(level=metadata.MOA)
    output_dict = {
        "clustering_score": metrics.silhouette_score(
            dist, labels=moa_labels, metric="precomputed"),
    }

    if report_knn:
        output_dict["knn"] = evaluate.make_knn_moa_dataframe(means).to_dict()

    if report_confusion_matrix:
        # Each variant pairs its output key with the neighbor filter it uses.
        variants = (
            ("nsc", evaluate.not_same_compound_filter),
            ("nscb", evaluate.not_same_compound_or_batch_filter),
        )
        confusion_matrix = {"nsc": {}, "nscb": {}}
        for k in range(1, 5):
            for variant, neighbor_filter in variants:
                confusion_matrix[variant][k] = confusion_matrix_from_dist(
                    dist, k, neighbor_filter,
                    dist.index.levels[moa_name_index])
        output_dict["confusion_matrix"] = confusion_matrix

    return output_dict
def cross_val_train(emb_df_clean,
                    contents,
                    steps,
                    list_of_comp_set,
                    n_comp,
                    report_confusion_matrix=True,
                    percent_norm=False,
                    factor_analys=False):
    """Leave-one-compound-out cross validation of the stopping time.

    For each of the first ``n_comp`` compound sets, the best time step is
    searched on the embeddings restricted to that set's training compounds,
    and the k-NN statistics (k = 1..4) are then updated from the cosine
    distance matrix obtained at that time step.

    Args:
      emb_df_clean (pandas dataframe): embeddings WITH unevaluated compounds.
      contents (dict): Contents from Wasserstein training routine.
      steps (list): Steps for training.
      list_of_comp_set (list): dictionaries for each compound for
        leave-one-out; key "a" is used to select the training rows.
      n_comp (int): number of compounds.
      report_confusion_matrix (bool): whether or not to include confusion
        matrices.
      percent_norm (bool): whether to apply percentile normalization.
      factor_analys (bool): whether to apply factor analysis.

    Returns:
      list_of_time_step_max (list): best stopping time for each compound.
      cross_validated_scores (dict): "acc_nsc" and "acc_nscb" accuracy
        scores, plus "confusion_matrices_nsc" and "confusion_matrices_nscb"
        when report_confusion_matrix is set.

    Raises:
      ValueError: if "treatment_group" is not among the index names of the
        training embeddings.
    """
    neighbor_counts = range(1, 5)  # k-NN up to k=4

    list_of_time_step_max = []
    correct_nsc, mismatch_nsc = (collections.defaultdict(list),
                                 collections.defaultdict(list))
    correct_nscb, mismatch_nscb = (collections.defaultdict(list),
                                   collections.defaultdict(list))

    emb_df_valid = transform.drop_unevaluated_comp(emb_df_clean)
    moa_index_values = emb_df_valid.index.get_level_values(level=metadata.MOA)
    match_metadata_values = sorted(moa_index_values.unique())
    num_moa = len(match_metadata_values)

    confusion_matrices_nsc = None
    confusion_matrices_nscb = None
    if report_confusion_matrix:
        # One zero-initialised (num_moa x num_moa) matrix per k and filter.
        confusion_matrices_nsc = collections.defaultdict(
            list, {k: np.zeros((num_moa, num_moa)) for k in neighbor_counts})
        confusion_matrices_nscb = collections.defaultdict(
            list, {k: np.zeros((num_moa, num_moa)) for k in neighbor_counts})

    # Distance matrices keyed by time step. Among the inputs of the distance
    # computation only the time step varies between folds, so it is a
    # sufficient cache key.
    dist_at_time = {}
    all_compounds_valid = emb_df_valid.index.get_level_values(
        level=metadata.COMPOUND)

    for i in range(n_comp):
        print("cross-validation for compound %s" % i)
        comp_set = list_of_comp_set[i]

        # Embeddings without the left-out compound.
        in_training_set = all_compounds_valid.isin(comp_set["a"])
        emb_df_train = emb_df_valid[in_training_set]
        if "treatment_group" not in emb_df_train.index.names:
            raise ValueError(
                "Must have treatment_group in embeddings index names.")

        # Best time step for this fold. This is the expensive part, since
        # every time step has to be evaluated.
        time_step_max = find_time_step_max(emb_df_train, contents, steps)
        list_of_time_step_max.append(time_step_max)

        if time_step_max not in dist_at_time:
            # Cosine distances between evaluated treatments at this step.
            means = transform_and_means(contents,
                                        emb_df_clean,
                                        time_step_max,
                                        percent_norm=percent_norm,
                                        factor_analys=factor_analys)
            means_valid = transform.drop_unevaluated_comp(means)
            dist_at_time[time_step_max] = distance_analysis.matrix(
                distance.cosine, means_valid)
        dist = dist_at_time[time_step_max]

        variants = (
            (evaluate.not_same_compound_filter,
             correct_nsc, mismatch_nsc, confusion_matrices_nsc),
            (evaluate.not_same_compound_or_batch_filter,
             correct_nscb, mismatch_nscb, confusion_matrices_nscb),
        )
        for k in neighbor_counts:
            for neighbor_filter, correct, mismatch, matrices in variants:
                update_stats_new_compound(comp_set, dist, k, neighbor_filter,
                                          correct, mismatch,
                                          match_metadata_values, matrices)

    # Turn the accumulated correct/mismatch tallies into accuracies.
    cross_validated_scores = {
        "acc_nsc": calculate_moa_accuracy(correct_nsc, mismatch_nsc),
        "acc_nscb": calculate_moa_accuracy(correct_nscb, mismatch_nscb),
    }
    if report_confusion_matrix:
        cross_validated_scores["confusion_matrices_nsc"] = (
            confusion_matrices_nsc)
        cross_validated_scores["confusion_matrices_nscb"] = (
            confusion_matrices_nscb)

    return (list_of_time_step_max, cross_validated_scores)