def _get_embeddings(self, imputed_data):
    """Embed the columns of ``imputed_data`` with several reducers.

    Every model is fitted on ``imputed_data.transpose()``, so each row of a
    returned embedding corresponds to one column of the input.

    :param imputed_data: table exposing ``transpose()``.
    :return: dict mapping an embedding name to a pair of embeddings. For
        "ICA" and "tSNE" the second item is a separately fitted
        2-component embedding; for the other entries both items are the
        same object.
    """
    def embed(model):
        # One fit per model, each on a fresh transpose of the input.
        return model.fit_transform(imputed_data.transpose())

    log("Fitting PCA ...")
    pca_embedding = embed(PCA(n_components=5))

    # The 2-component variants are fitted before the larger ones.
    log("Fitting ICA ...")
    ica_embedding_2d = embed(FastICA(n_components=2))
    ica_embedding = embed(FastICA(n_components=5))

    log("Fitting TruncatedSVD ...")
    svd_embedding = embed(TruncatedSVD(n_components=5))

    log("Fitting TSNE ...")
    tsne_embedding_2d = embed(TSNE(n_components=2, method='barnes_hut'))
    tsne_embedding = embed(TSNE(n_components=3, method='barnes_hut'))

    log("Fitting UMAP ...")
    umap_embedding = embed(
        umap.UMAP(n_neighbors=4, min_dist=0.3, metric='correlation'))

    return {
        "PCA": (pca_embedding, pca_embedding),
        "ICA": (ica_embedding, ica_embedding_2d),
        "Truncated SVD": (svd_embedding, svd_embedding),
        "tSNE": (tsne_embedding, tsne_embedding_2d),
        "UMAP": (umap_embedding, umap_embedding),
    }
def generate_test_bench(self, count_file_path, **kwargs):
    """Write the RNA count file and the hidden state of this benchmark.

    The "RNA" table of ``self.data_set`` has its columns shuffled/renamed
    (unless disabled) and its all-zero rows removed. The result is written
    to ``count_file_path``; the same table, the column bookkeeping, the
    "ADT" table and ``self.protein_rna_mapping`` are pickled under
    ``settings.STORAGE_DIR``.

    :param count_file_path: destination of the count file.
    :param kwargs: must contain ``preserve_columns``, forwarded as
        ``disabled`` to ``shuffle_and_rename_columns``.
    """
    keep_column_order = kwargs['preserve_columns']
    output_path = os.path.abspath(count_file_path)

    rna_counts = self.data_set.get("RNA")
    adt_counts = self.data_set.get("ADT")

    # Shuffle columns
    shuffled = shuffle_and_rename_columns(rna_counts,
                                          disabled=keep_column_order)
    rna_counts, original_columns, column_permutation = shuffled

    # Remove zero rows
    has_counts = np.sum(rna_counts, axis=1) > 0
    rna_counts = rna_counts[has_counts].copy()

    # Save hidden data
    make_sure_dir_exists(settings.STORAGE_DIR)
    hidden_path = os.path.join(settings.STORAGE_DIR,
                               "%s.hidden.pkl.gz" % self.uid)
    hidden_state = [
        rna_counts.to_sparse(),
        original_columns,
        column_permutation,
        adt_counts.to_sparse(),
        self.protein_rna_mapping,
    ]
    dump_gzip_pickle(hidden_state, hidden_path)
    log("Benchmark hidden data saved to `%s`" % hidden_path)

    # Save the count file handed to the imputation method
    make_sure_dir_exists(os.path.dirname(output_path))
    write_csv(rna_counts, output_path)
    log("Count file saved to `%s`" % output_path)
def generate_test_bench(self, count_file_path, **kwargs):
    """Write the count file and the hidden state of this benchmark.

    The count matrix returned by ``self._load_data()`` loses its all-zero
    rows and then has its columns shuffled/renamed (unless disabled). The
    result is written to ``count_file_path``; the same matrix, the classes
    and the column bookkeeping are pickled under ``settings.STORAGE_DIR``.

    :param count_file_path: destination of the count file.
    :param kwargs: must contain ``preserve_columns``, forwarded as
        ``disabled`` to ``shuffle_and_rename_columns``.
    """
    keep_column_order = kwargs['preserve_columns']
    output_path = os.path.abspath(count_file_path)

    counts, classes = self._load_data()

    # Remove zero rows
    has_counts = np.sum(counts, axis=1) > 0
    counts = counts[has_counts].copy()

    # Shuffle columns
    shuffled = shuffle_and_rename_columns(counts, disabled=keep_column_order)
    counts, original_columns, column_permutation = shuffled

    # Save hidden data
    make_sure_dir_exists(settings.STORAGE_DIR)
    hidden_path = os.path.join(settings.STORAGE_DIR,
                               "%s.hidden.pkl.gz" % self.uid)
    hidden_state = [
        counts.to_sparse(),
        classes,
        original_columns,
        column_permutation,
    ]
    dump_gzip_pickle(hidden_state, hidden_path)
    log("Benchmark hidden data saved to `%s`" % hidden_path)

    # Save the count file handed to the imputation method
    make_sure_dir_exists(os.path.dirname(output_path))
    write_csv(counts, output_path)
    log("Count file saved to `%s`" % output_path)
def generate_test_bench(self, count_file_path, **kwargs):
    """Create a count file in which randomly chosen entries are zeroed.

    Candidate entries are those in the rows returned by
    ``self.get_hvg_genes`` whose value is at least ``min_expression``;
    ``dropout_count`` of them are drawn without replacement and set to
    zero. Rows that end up all-zero are dropped from the data, the mask and
    the masked data. The masked data is column-shuffled (unless disabled)
    and written to ``count_file_path``; the unmasked data, the mask and the
    column bookkeeping are pickled under ``settings.STORAGE_DIR``.

    :param count_file_path: destination of the count file.
    :param kwargs: must contain ``n_samples``, ``dropout_count``,
        ``min_expression``, ``hvg_frac`` and ``preserve_columns``.
    """
    n_samples = kwargs['n_samples']
    dropout_count = kwargs['dropout_count']
    min_expression = kwargs['min_expression']
    hvg_frac = kwargs['hvg_frac']
    keep_column_order = kwargs['preserve_columns']
    output_path = os.path.abspath(count_file_path)

    data = self._load_data(n_samples)
    hvg_rows = self.get_hvg_genes(data, hvg_frac)

    # Generate elimination mask
    values = data.values
    n_columns = data.shape[1]
    candidates = [(row, col)
                  for row in hvg_rows
                  for col in range(n_columns)
                  if values[row, col] >= min_expression]
    del values

    mask = np.zeros_like(data)
    chosen = np.random.choice(len(candidates), dropout_count, replace=False)
    for position in chosen:
        row, col = candidates[position]
        mask[row, col] = 1
    mask = pd.DataFrame(mask, index=data.index, columns=data.columns)

    # Elimination
    low_quality_data = data * (1 - mask.values)

    # Keep only rows that still contain counts after masking
    still_expressed = np.sum(low_quality_data, axis=1) > 0
    mask = mask[still_expressed].copy()
    data = data[still_expressed].copy()
    low_quality_data = low_quality_data[still_expressed].copy()

    # Shuffle columns
    shuffled = shuffle_and_rename_columns(low_quality_data,
                                          disabled=keep_column_order)
    low_quality_data, original_columns, column_permutation = shuffled

    # Save hidden data
    make_sure_dir_exists(settings.STORAGE_DIR)
    hidden_path = os.path.join(settings.STORAGE_DIR,
                               "%s.hidden.pkl.gz" % self.uid)
    hidden_state = [
        data.to_sparse(),
        mask.to_sparse(),
        original_columns,
        column_permutation,
    ]
    dump_gzip_pickle(hidden_state, hidden_path)
    log("Benchmark hidden data saved to `%s`" % hidden_path)

    # Save the count file handed to the imputation method
    make_sure_dir_exists(os.path.dirname(output_path))
    write_csv(low_quality_data, output_path)
    log("Count file saved to `%s`" % output_path)
def generate_test_bench(self, count_file_path, **kwargs):
    """Create a down-sampled count file by sampling individual reads.

    Every unit of count in the integer-cast data is treated as one read.
    ``int(read_ratio * total)`` read indices are drawn and each drawn read
    is credited to the entry whose cumulative-count interval contains it.
    The down-sampled table is column-shuffled (unless disabled), rows that
    became all-zero are dropped from both tables, and the result is written
    to ``count_file_path``. The original data, ``read_ratio`` and the
    column bookkeeping are pickled under ``settings.STORAGE_DIR``.

    :param count_file_path: destination of the count file.
    :param kwargs: must contain ``n_samples``, ``read_ratio``, ``replace``
        (forwarded to ``np.random.choice``) and ``preserve_columns``.
    """
    n_samples = kwargs['n_samples']
    read_ratio = kwargs['read_ratio']
    with_replacement = kwargs['replace']
    keep_column_order = kwargs['preserve_columns']
    output_path = os.path.abspath(count_file_path)

    data = self._load_data(n_samples)

    # find cumulative distribution (sum)
    counts = data.astype(int).values
    total_reads = np.sum(counts)
    cumulative = np.reshape(np.cumsum(counts), counts.shape)

    # Sample from original dataset
    kept_reads = np.sort(
        np.random.choice(total_reads,
                         int(read_ratio * total_reads),
                         replace=with_replacement))

    # Both the read indices and the cumulative sums are ascending, so one
    # forward-only cursor assigns every kept read to its entry.
    sampled = np.zeros_like(counts)
    n_kept = len(kept_reads)
    cursor = 0
    for cell in np.ndindex(*counts.shape):
        upper_bound = cumulative[cell]
        while cursor < n_kept and kept_reads[cursor] < upper_bound:
            sampled[cell] += 1
            cursor += 1

    # Convert to data frame
    low_quality_data = pd.DataFrame(sampled,
                                    index=data.index,
                                    columns=data.columns)

    # Shuffle columns
    shuffled = shuffle_and_rename_columns(low_quality_data,
                                          disabled=keep_column_order)
    low_quality_data, original_columns, column_permutation = shuffled

    # Remove zero rows
    has_reads = np.sum(low_quality_data, axis=1) > 0
    data = data[has_reads].copy()
    low_quality_data = low_quality_data[has_reads].copy()

    # Save hidden data
    make_sure_dir_exists(settings.STORAGE_DIR)
    hidden_path = os.path.join(settings.STORAGE_DIR,
                               "%s.hidden.pkl.gz" % self.uid)
    hidden_state = [
        data.to_sparse(),
        read_ratio,
        original_columns,
        column_permutation,
    ]
    dump_gzip_pickle(hidden_state, hidden_path)
    log("Benchmark hidden data saved to `%s`" % hidden_path)

    # Save the count file handed to the imputation method
    make_sure_dir_exists(os.path.dirname(output_path))
    write_csv(low_quality_data, output_path)
    log("Count file saved to `%s`" % output_path)
def generate_test_bench(self, count_file_path, **kwargs):
    """Write a filtered, column-shuffled count file and its hidden state.

    Starting from ``self._load_and_combine_data()``, rows whose label
    starts with "ERCC-" or "mt-" and columns whose total is below 1e6 are
    optionally removed, then rows whose total is zero are removed. A
    column-shuffled copy (unless disabled) is written to
    ``count_file_path``; the unshuffled table and the column bookkeeping
    are pickled under ``settings.STORAGE_DIR``.

    :param count_file_path: destination of the count file.
    :param kwargs: must contain ``rm_ercc``, ``rm_mt``, ``rm_lq`` and
        ``preserve_columns``.
    :return: ``None``.
    """
    def drop_rows_starting_with(frame, prefix):
        # Remove every row whose label begins with ``prefix``.
        doomed = [
            symbol for symbol in frame.index.values
            if symbol.startswith(prefix)
        ]
        return frame.drop(doomed)

    output_path = os.path.abspath(count_file_path)
    remove_ercc = kwargs['rm_ercc']
    remove_mt = kwargs['rm_mt']
    remove_low_quality = kwargs['rm_lq']
    keep_column_order = kwargs['preserve_columns']

    # Load dataset
    data = self._load_and_combine_data()

    # Remove some rows and columns
    if remove_ercc:
        data = drop_rows_starting_with(data, "ERCC-")

    if remove_mt:
        data = drop_rows_starting_with(data, "mt-")

    if remove_low_quality:
        low_total_columns = data.columns.values[data.sum(axis=0) < 1e6]
        data = data.drop(columns=low_total_columns)

    # Remove empty rows
    empty_rows = data.index.values[data.sum(axis=1) == 0]
    data = data.drop(empty_rows)

    # Shuffle columns
    shuffled = shuffle_and_rename_columns(data, disabled=keep_column_order)
    new_data, original_columns, column_permutation = shuffled

    # Save hidden data
    make_sure_dir_exists(settings.STORAGE_DIR)
    hidden_path = os.path.join(settings.STORAGE_DIR,
                               "%s.hidden.pkl.gz" % self.uid)
    hidden_state = [data.to_sparse(), original_columns, column_permutation]
    dump_gzip_pickle(hidden_state, hidden_path)
    log("Benchmark hidden data saved to `%s`" % hidden_path)

    # Save the count file handed to the imputation method
    make_sure_dir_exists(os.path.dirname(output_path))
    write_csv(new_data, output_path)
    log("Count file saved to `%s`" % output_path)

    return None
def _extract_PBMC_human_cells(self):
    """Write the list of columns dominated by "HUMAN" rows to a CSV file.

    Does nothing except log when "PBMC-human-cells.csv" already exists in
    ``self.DATA_SET_DIR_NAME``. Otherwise the table at
    ``self.PBMC_RNA_DATA_FILE_PATH`` is read, and every column whose total
    over rows labelled "HUMAN..." exceeds 20 times its total over rows
    labelled "MOUSE..." is written to that file under the single column
    "human".
    """
    output_path = os.path.join(self.DATA_SET_DIR_NAME,
                               "PBMC-human-cells.csv")
    if os.path.exists(output_path):
        log("PBMC human cells are already extracted.")
        return

    data = pd.read_csv(self.PBMC_RNA_DATA_FILE_PATH, index_col=0)
    row_labels = data.index.values

    human_rows = [gene for gene in row_labels if gene.startswith("HUMAN")]
    human_section = data.loc[human_rows]

    mouse_rows = [gene for gene in row_labels if gene.startswith("MOUSE")]
    mouse_section = data.loc[mouse_rows]

    # Per-column totals decide which columns count as human.
    human_totals = human_section.sum(axis=0)
    mouse_totals = mouse_section.sum(axis=0)
    human_columns = data.columns.values[human_totals > 20 * mouse_totals]

    human_table = pd.DataFrame(human_columns, columns=["human"])
    human_table.to_csv(output_path, index=None)
    log("PBMC human cells extracted.")
def evaluate_result(self, processed_count_file_path, result_dir,
                    visualization, **kwargs):
    """Score an imputed count file by clustering its embeddings.

    The imputed table is restored to the original column order, negative
    values are zeroed, and the configured normalization and transformation
    are applied. Embeddings come from ``self._get_embeddings`` (or from the
    cached ``embedded_data.pkl.gz`` unless ``clear_cache`` is set). For
    every row of the hidden ``classes`` table and every embedding, K-means
    is run with one cluster per distinct class, a CSV of the embedding is
    written, and four scores are recorded. Metrics are written to
    ``result.txt`` inside ``result_dir``.

    :param processed_count_file_path: path of the imputed count file.
    :param result_dir: directory receiving ``result.txt`` and ``files/``.
    :param visualization: output type passed to ``self.visualize_result``;
        the string "none" skips visualization.
    :param kwargs: must contain ``normalization``, ``transformation`` and
        ``clear_cache``.
    :return: dict mapping metric names to values.
    """
    normalization = kwargs['normalization']
    transformation = kwargs['transformation']
    clear_cache = kwargs['clear_cache']

    make_sure_dir_exists(os.path.join(result_dir, "files"))
    info = []

    # Load hidden state and data
    hidden_state = self._load_hidden_state()
    count_matrix, classes, original_columns, column_permutation = hidden_state

    # Load imputed data
    imputed_data = read_table_file(processed_count_file_path)

    # Restore column names and order
    imputed_data = rearrange_and_rename_columns(imputed_data,
                                                original_columns,
                                                column_permutation)

    # Data transformations
    if (imputed_data.values < 0).any():
        log("Observed some negative values!")
        imputed_data[imputed_data < 0] = 0
    normalized = normalizations[normalization](imputed_data)
    imputed_data = transformations[transformation](normalized)

    # Save class details for future
    write_csv(classes, os.path.join(result_dir, "files", "classes.csv"))

    # Evaluation
    metric_results = {}

    cache_path = os.path.join(result_dir, "files", "embedded_data.pkl.gz")
    if clear_cache or not os.path.exists(cache_path):
        embedded_data = self._get_embeddings(imputed_data)
        dump_gzip_pickle(embedded_data, cache_path)
    else:
        embedded_data = load_gzip_pickle(cache_path)

    log("Evaluating ...")
    for class_label in classes.index.values:
        class_names = classes.loc[class_label].values
        n_classes = len(set(class_names))

        for embedding_name, (emb, emb_2d) in embedded_data.items():
            embedding_slug = embedding_name.replace(" ", "_").lower()
            csv_name = "%s_%s.csv" % (class_label, embedding_slug)

            k_means = KMeans(n_clusters=n_classes)
            k_means.fit(emb)
            clusters = k_means.predict(emb)

            # Full embedding plus the 2-D coordinates and both labelings
            embedding_df = pd.DataFrame(emb)
            embedding_df["X"] = emb_2d[:, 0]
            embedding_df["Y"] = emb_2d[:, 1]
            embedding_df["class"] = class_names
            embedding_df["k_means_clusters"] = clusters

            write_csv(embedding_df,
                      os.path.join(result_dir, "files", csv_name))
            info.append({
                'filename': csv_name,
                'description':
                    '%s embedding of cells along %s labels' %
                    (embedding_name, class_label),
                'plot_description':
                    '%s embedding of cells along %s labels (Classes can be identified '
                    'with their colors and K-means clusters are marked '
                    'with different shapes)' % (embedding_name, class_label),
            })

            tag = (embedding_slug, class_label)
            metric_results.update({
                'kmeans_on_%s_%s_adjusted_mutual_info_score' % tag:
                    adjusted_mutual_info_score(class_names, clusters,
                                               average_method="arithmetic"),
                'kmeans_on_%s_%s_v_measure_score' % tag:
                    v_measure_score(class_names, clusters),
                'embedding_%s_%s_calinski_harabaz_score' % tag:
                    calinski_harabaz_score(emb, class_names),
                'embedding_%s_%s_silhouette_score' % tag:
                    silhouette_score(emb, class_names),
            })

    write_csv(pd.DataFrame(info),
              os.path.join(result_dir, "files", "info.csv"))

    # Save results to a file
    result_path = os.path.join(result_dir, "result.txt")
    with open(result_path, 'w') as result_file:
        result_file.write("## METRICS:\n")
        for metric in sorted(metric_results):
            result_file.write("%s\t%4f\n" % (metric, metric_results[metric]))
        result_file.write("##\n## ADDITIONAL INFO:\n")

    log("Evaluation results saved to `%s`" % result_path)

    if visualization != "none":
        self.visualize_result(result_dir, output_type=visualization)

    return metric_results
def evaluate_result(self, processed_count_file_path, result_dir,
                    visualization, **kwargs):
    """Score an imputed count file against the hidden high-quality counts.

    The imputed table is restored to the original column order and clipped
    at zero, then it and the high-quality table both get the configured
    normalization and transformation. For every column, only entries that
    are positive in the high-quality table and zero in the low-quality
    table are compared. If the imputed values on those entries sum to a
    positive number they are rescaled to the same total as the
    high-quality values before distances are computed. Metrics and a
    per-column breakdown are written to ``result.txt`` in ``result_dir``.

    :param processed_count_file_path: path of the imputed count file.
    :param result_dir: directory receiving ``result.txt`` and ``files/``.
    :param visualization: output type passed to ``self.visualize_result``;
        the string "none" skips visualization.
    :param kwargs: must contain ``normalization`` and ``transformation``.
    :return: dict mapping metric names to the mean of the per-column values.
    """
    normalization = kwargs['normalization']
    transformation = kwargs['transformation']

    # Load hidden state and data
    hidden_state = self._load_hidden_state()
    (count_matrix_lq, original_columns, column_permutation,
     count_matrix_hq) = hidden_state

    # Load imputed data
    imputed_data = read_table_file(processed_count_file_path)

    # Restore column names and order
    imputed_data = rearrange_and_rename_columns(imputed_data,
                                                original_columns,
                                                column_permutation)

    # Replace negative values with zero
    imputed_data = imputed_data.clip(lower=0)

    # Data transformations
    imputed_data = transformations[transformation](
        normalizations[normalization](imputed_data))
    count_matrix_hq = transformations[transformation](
        normalizations[normalization](count_matrix_hq))

    # Evaluation
    rmse_distances = []
    mae_distances = []
    euclidean_distances = []
    cosine_distances = []
    correlation_distances = []

    # The underlying arrays do not change inside the loop; fetch them once.
    hq_values = count_matrix_hq.values
    lq_values = count_matrix_lq.values
    imputed_values = imputed_data.values

    for i in range(count_matrix_hq.shape[1]):
        # Entries expressed in the high-quality data but missing in the
        # low-quality data.
        non_zeros = np.logical_and(hq_values[:, i] > 0,
                                   lq_values[:, i] == 0)
        hq = hq_values[non_zeros, i]
        y = imputed_values[non_zeros, i]

        # Rescale the imputed values to the total of the reference values.
        if np.sum(y) > 0:
            y = y * np.sum(hq) / np.sum(y)

        # Root of the mean squared error. The square root must be taken
        # after averaging; taking it per element first yields the mean
        # absolute error instead.
        rmse_distances.append(float(np.mean(np.square(hq - y)) ** 0.5))
        mae_distances.append(float(np.mean(np.abs(hq - y))))

        pair = np.vstack((hq, y))
        euclidean_distances.append(pdist(pair, 'euclidean')[0])
        cosine_distances.append(pdist(pair, 'cosine')[0])
        correlation_distances.append(pdist(pair, 'correlation')[0])

    metric_results = {
        'cell_root_mean_squared_error': np.mean(rmse_distances),
        'cell_mean_absolute_error': np.mean(mae_distances),
        'cell_mean_euclidean_distance': np.mean(euclidean_distances),
        'cell_mean_cosine_distance': np.mean(cosine_distances),
        'cell_mean_correlation_distance': np.mean(correlation_distances)
    }

    # Save results to a file
    make_sure_dir_exists(os.path.join(result_dir, "files"))
    result_path = os.path.join(result_dir, "result.txt")
    with open(result_path, 'w') as result_file:
        result_file.write("## METRICS:\n")
        for metric in sorted(metric_results):
            result_file.write("%s\t%4f\n" %
                              (metric, float(metric_results[metric])))

        result_file.write("##\n## ADDITIONAL INFO:\n")
        result_file.write(
            "# CELL\troot_mean_squared_error\tmean_absolute_error\tmean_euclidean_distance\t"
            "mean_cosine_distance\tmean_correlation_distance:\n")
        for i in range(count_matrix_hq.shape[1]):
            result_file.write(
                "# %s\t%f\t%f\t%f\t%f\t%f\n" %
                (count_matrix_hq.columns.values[i], rmse_distances[i],
                 mae_distances[i], euclidean_distances[i],
                 cosine_distances[i], correlation_distances[i]))

    log("Evaluation results saved to `%s`" % result_path)

    if visualization != "none":
        self.visualize_result(result_dir, output_type=visualization)

    return metric_results
def evaluate_result(self, processed_count_file_path, result_dir,
                    visualization, **kwargs):
    """Score an imputed RNA table by its agreement with the ADT table.

    The imputed table is restored to the original column order, and it and
    the hidden ADT table both get the configured transformation. ADT rows
    whose mapped RNA row exists in the imputed table are paired with those
    RNA rows. Spearman and Pearson correlation matrices of the combined
    rows are written to CSV files, and four summary metrics are written to
    ``result.txt`` together with the correlation blocks.

    :param processed_count_file_path: path of the imputed count file.
    :param result_dir: directory receiving ``result.txt`` and ``files/``.
    :param visualization: output type passed to ``self.visualize_result``;
        the string "none" skips visualization.
    :param kwargs: must contain ``transformation``.
    :return: dict mapping metric names to values.
    """
    files_dir = os.path.join(result_dir, "files")
    make_sure_dir_exists(files_dir)

    transformation = kwargs['transformation']

    # Load hidden state and data
    hidden_state = self._load_hidden_state()
    (_, original_columns, column_permutation, count_adt,
     protein_rna_mapping) = hidden_state

    # Load imputed data
    imputed_rna = read_table_file(processed_count_file_path)

    # Restore column names and order
    imputed_rna = rearrange_and_rename_columns(imputed_rna,
                                               original_columns,
                                               column_permutation)

    # Data transformations
    imputed_rna = transformations[transformation](imputed_rna)
    count_adt = transformations[transformation](count_adt)

    # Use related data: proteins whose mapped gene is present in the
    # imputed table, and those genes in the same order.
    available_genes = imputed_rna.index.values
    matched_proteins = [
        prot for prot in count_adt.index.values
        if protein_rna_mapping[prot] in available_genes
    ]

    adt = count_adt.loc[matched_proteins].copy()
    adt.index = ["prot_" + p for p in adt.index.values]

    rna = imputed_rna.loc[
        [protein_rna_mapping[prot] for prot in matched_proteins]]
    rna.index = ["gene_" + g for g in rna.index.values]

    info = []

    write_csv(adt, os.path.join(files_dir, "adt.csv"))
    info.append({
        'filename': "adt.csv",
        'description': 'Protein expressions (adt) after transformation',
        'plot_description': 'Protein expressions (adt) after transformation',
    })

    write_csv(rna, os.path.join(files_dir, "rna.csv"))
    info.append({
        'filename': "rna.csv",
        'description':
            'Gene expressions of genes related to adt data after transformation',
        'plot_description':
            'Gene expressions of genes related to adt data after transformation',
    })

    n = adt.shape[0]

    # ADT rows first, RNA rows second; one column per variable after the
    # transpose.
    combined_df = pd.concat((adt, rna)).transpose()

    # Calculating Spearman correlations
    spearman = combined_df.corr(method="spearman")
    adt_adt_spearmanr = spearman.iloc[:n, :n]
    rna_rna_spearmanr = spearman.iloc[n:, n:]
    adt_rna_spearmanr = spearman.iloc[:n, n:]

    write_csv(spearman,
              os.path.join(files_dir, "spearman_correlations.csv"))
    info.append({
        'filename': "spearman_correlations.csv",
        'description':
            'Pairwise Spearman correlations (first n items are '
            'adt expressions and second n items are rna expressions)',
        'plot_description':
            'Pairwise Spearman correlations (first n items are '
            'adt expressions and second n items are rna expressions)',
    })

    # Calculating Pearson correlations
    pearson = combined_df.corr(method="pearson")
    adt_adt_pearsonr = pearson.iloc[:n, :n]
    rna_rna_pearsonr = pearson.iloc[n:, n:]
    adt_rna_pearsonr = pearson.iloc[:n, n:]

    write_csv(pearson, os.path.join(files_dir, "pearson_correlations.csv"))
    info.append({
        'filename': "pearson_correlations.csv",
        'description':
            'Pairwise Pearson correlations (first n items are '
            'adt expressions and second n items are rna expressions)',
        'plot_description':
            'Pairwise Pearson correlations (first n items are '
            'adt expressions and second n items are rna expressions)',
    })

    # Evaluation
    spearman_gap = adt_adt_spearmanr.values - rna_rna_spearmanr.values
    pearson_gap = adt_adt_pearsonr.values - rna_rna_pearsonr.values
    metric_results = {
        'rna_protein_mean_spearman_correlatoin':
            np.mean(adt_rna_spearmanr.values.diagonal()),
        'rna_protein_mean_pearson_correlatoin':
            np.mean(adt_rna_pearsonr.values.diagonal()),
        'MSE_of_adt_adt_and_rna_rna_spearman_correlations':
            np.mean(spearman_gap**2),
        'MSE_of_adt_adt_and_rna_rna_pearson_correlations':
            np.mean(pearson_gap**2)
    }

    write_csv(pd.DataFrame(info), os.path.join(files_dir, "info.csv"))

    def as_comment_block(frame):
        # Render a table with every line prefixed by "## ".
        return "## " + "\n## ".join(frame.to_string().split("\n")) + "\n"

    report_sections = [
        ("## Pearson of adt/rna:\n", adt_rna_pearsonr),
        ('## Spearman of adt/rna:\n', adt_rna_spearmanr),
        ("## Pearson of adt/adt:\n", adt_adt_pearsonr),
        ("## Pearson of rna/rna:\n", rna_rna_pearsonr),
        ('## Spearman of adt/adt:\n', adt_adt_spearmanr),
        ('## Spearman of rna/rna:\n', rna_rna_spearmanr),
    ]

    # Save results to a file
    result_path = os.path.join(result_dir, "result.txt")
    with open(result_path, 'w') as result_file:
        result_file.write("## METRICS:\n")
        for metric in sorted(metric_results):
            result_file.write("%s\t%4f\n" %
                              (metric, float(metric_results[metric])))

        result_file.write("##\n## ADDITIONAL INFO:\n")
        for heading, frame in report_sections:
            result_file.write(heading)
            result_file.write(as_comment_block(frame))

    log("Evaluation results saved to `%s`" % result_path)

    if visualization != "none":
        self.visualize_result(result_dir, output_type=visualization)

    return metric_results
parser_visualize_clustering.set_defaults(function=visualize_clustering_evaluation) parser_visualize_random_mask = subparsers_visualize.add_parser('random-mask') parser_visualize_random_mask.set_defaults(function=visualize_random_mask_evaluation) parser_visualize_down_sample = subparsers_visualize.add_parser('down-sample') parser_visualize_down_sample.set_defaults(function=visualize_down_sample_evaluation) parser_visualize_paired_data = subparsers_visualize.add_parser('paired-data') parser_visualize_paired_data.set_defaults(function=visualize_paired_data_evaluation) parser_visualize_cite_seq= subparsers_visualize.add_parser('cite-seq') parser_visualize_cite_seq.set_defaults(function=visualize_cite_seq_evaluation) return main_parser if __name__ == '__main__': parser = generate_parser() args = parser.parse_args() if settings.DEBUG: log("Running with arguments: " + str(args)) handle_main_arguments(args) if 'function' in args: args.function(args) else: args.default_function()
def evaluate_result(self, processed_count_file, result_dir, visualization,
                    **kwargs):
    """Score an imputed count file on the G1/S and G2/M related rows.

    Negative imputed values are zeroed and the configured normalization and
    transformation are applied. The two related parts returned by
    ``self._get_related_part`` are written to CSV and concatenated. The
    concatenated table is classified (``self._get_classification_results``)
    and embedded (``self._get_embeddings``, or the cached
    ``embedded_data.pkl.gz`` unless ``clear_cache`` is set). For each
    embedding, and for the transposed table itself under the name
    "identity", K-means with 3 clusters is scored against the class taken
    from the prefix of each column name before the first "_". Metrics are
    written to ``result.txt`` in ``result_dir``.

    :param processed_count_file: path of the imputed count file.
    :param result_dir: directory receiving ``result.txt`` and ``files/``.
    :param visualization: output type passed to ``self.visualize_result``;
        the string "none" skips visualization.
    :param kwargs: must contain ``normalization``, ``transformation`` and
        ``clear_cache``.
    :return: dict mapping metric names to values.
    """
    normalization = kwargs['normalization']
    transformation = kwargs['transformation']
    clear_cache = kwargs['clear_cache']

    files_dir = os.path.join(result_dir, "files")
    make_sure_dir_exists(files_dir)
    info = []

    data, imputed_data = self._load_data_and_imputed_data_for_evaluation(
        processed_count_file)
    cell_names = data.columns.values
    gold_standard_classes = [name.split("_")[0] for name in cell_names]

    # Data transformations
    if (imputed_data.values < 0).any():
        log("Observed some negative values!")
        imputed_data[imputed_data < 0] = 0
    normalized = normalizations[normalization](imputed_data)
    imputed_data = transformations[transformation](normalized)

    g1_s_part, g2_m_part = self._get_related_part(imputed_data)
    related_part = pd.concat([g1_s_part, g2_m_part])

    write_csv(
        g1_s_part,
        os.path.join(files_dir, "G1_S_related_part_of_imputed_data.csv"))
    info.append({
        'filename': 'G1_S_related_part_of_imputed_data.csv',
        'description': 'Vales of genes related to G1/S',
        'plot_description': 'Heatmap of Genes related to G1/S',
    })

    write_csv(
        g2_m_part,
        os.path.join(files_dir, "G2_M_related_part_of_imputed_data.csv"))
    info.append({
        'filename': 'G2_M_related_part_of_imputed_data.csv',
        'description': 'Vales of genes related to G2/M',
        'plot_description': 'Heatmap of Genes related to G2/M',
    })

    svm_results, knn_results = self._get_classification_results(
        related_part, gold_standard_classes)

    cache_path = os.path.join(files_dir, "embedded_data.pkl.gz")
    if clear_cache or not os.path.exists(cache_path):
        embedded_data = self._get_embeddings(related_part)
        dump_gzip_pickle(embedded_data, cache_path)
    else:
        embedded_data = load_gzip_pickle(cache_path)

    metric_results = {
        "classification_svm_mean_accuracy": np.mean(svm_results),
        "classification_knn_mean_accuracy": np.mean(knn_results)
    }

    # Added after caching, so the cache file never contains this entry.
    embedded_data["identity"] = related_part.transpose()

    for embedding_name in embedded_data:
        emb = embedded_data[embedding_name]

        k_means = KMeans(n_clusters=3)
        k_means.fit(emb)
        clusters = k_means.predict(emb)

        embedding_slug = embedding_name.replace(" ", "_").lower()

        # "identity" is scored below but gets no coordinates file.
        if embedding_name != "identity":
            csv_name = "%s.csv" % embedding_slug
            embedding_df = pd.DataFrame(
                {
                    "X": emb[:, 0],
                    "Y": emb[:, 1],
                    "class": gold_standard_classes,
                    "k_means_clusters": clusters
                },
                index=cell_names)
            write_csv(embedding_df, os.path.join(files_dir, csv_name))
            info.append({
                'filename': csv_name,
                'description':
                    '%s embedding of cells considering genes related '
                    'to cell-cycle' % embedding_name,
                'plot_description':
                    '%s embedding of cells considering genes related '
                    'to cell-cycle (K-means clusters are marked '
                    'with different shapes)' % embedding_name,
            })

        metric_results.update({
            'kmeans_on_%s_adjusted_mutual_info_score' % embedding_slug:
                adjusted_mutual_info_score(gold_standard_classes, clusters,
                                           average_method="arithmetic"),
            'kmeans_on_%s_v_measure_score' % embedding_slug:
                v_measure_score(gold_standard_classes, clusters),
            'embedding_%s_calinski_harabaz_score' % embedding_slug:
                calinski_harabaz_score(emb, gold_standard_classes),
            'embedding_%s_silhouette_score' % embedding_slug:
                silhouette_score(emb, gold_standard_classes)
        })

    write_csv(pd.DataFrame(info), os.path.join(files_dir, "info.csv"))

    # Save results to a file
    result_path = os.path.join(result_dir, "result.txt")
    with open(result_path, 'w') as result_file:
        result_file.write("## METRICS:\n")
        for metric in sorted(metric_results):
            result_file.write("%s\t%4f\n" % (metric, metric_results[metric]))

        result_file.write("##\n## ADDITIONAL INFO:\n")
        result_file.write("## SVM classifiers accuracies: %s\n" %
                          str(svm_results))
        result_file.write("## KNN classifiers accuracies: %s\n" %
                          str(knn_results))

    log("Evaluation results saved to `%s`" % result_path)

    if visualization != "none":
        self.visualize_result(result_dir, output_type=visualization)

    return metric_results
def evaluate_result(self, processed_count_file_path, result_dir,
                    visualization, **kwargs):
    """Score an imputed count file on the positive entries of hidden data.

    The imputed table is restored to the original column order and clipped
    at zero; it and the hidden table both get the configured
    transformation. Errors are computed only where the transformed hidden
    table is positive: once over the whole table (MAE and RMSE) and once
    per column (RMSE, MAE, Euclidean, cosine and correlation distance).
    Metrics and the per-column breakdown are written to ``result.txt`` in
    ``result_dir``.

    :param processed_count_file_path: path of the imputed count file.
    :param result_dir: directory receiving ``result.txt`` and ``files/``.
    :param visualization: output type passed to ``self.visualize_result``;
        the string "none" skips visualization.
    :param kwargs: must contain ``transformation``.
    :return: dict mapping metric names to values.
    """
    transformation = kwargs['transformation']

    make_sure_dir_exists(os.path.join(result_dir, "files"))

    # Load hidden state and data
    hidden_state = self._load_hidden_state()
    scaled_data, original_columns, column_permutation = hidden_state

    # Load imputed data
    imputed_data = read_table_file(processed_count_file_path)

    # Restore column names and order
    imputed_data = rearrange_and_rename_columns(imputed_data,
                                                original_columns,
                                                column_permutation)

    # Replace negative values with zero
    imputed_data = imputed_data.clip(lower=0)

    # Data transformation
    scaled_data = transformations[transformation](scaled_data)
    imputed_data = transformations[transformation](imputed_data)

    # Evaluation
    reference = scaled_data.values
    predicted = imputed_data.values

    # 1 where the reference is positive, 0 elsewhere
    observed = np.where(reference > 0, 1, 0)
    n_observed = np.sum(observed)

    squared_error = np.square(reference - predicted)
    absolute_error = np.abs(reference - predicted)
    rmse = float(np.sum(observed * squared_error) / n_observed)**0.5
    mae = float(np.sum(observed * absolute_error) / n_observed)

    rmse_distances = []
    mae_distances = []
    euclidean_distances = []
    cosine_distances = []
    correlation_distances = []

    n_cells = scaled_data.shape[1]
    for i in range(n_cells):
        non_zeros = reference[:, i] > 0
        n_non_zeros = np.sum(non_zeros)
        x = reference[non_zeros, i]
        y = predicted[non_zeros, i]
        pair = np.vstack((x, y))

        rmse_distances.append(
            float(np.sum(np.square(x - y)) / n_non_zeros)**0.5)
        mae_distances.append(float(np.sum(np.abs(x - y)) / n_non_zeros))
        cosine_distances.append(pdist(pair, 'cosine')[0])
        euclidean_distances.append(pdist(pair, 'euclidean')[0])
        correlation_distances.append(pdist(pair, 'correlation')[0])

    metric_results = {
        'all_mean_absolute_error_on_non_zeros': mae,
        'all_root_mean_squared_error_on_non_zeros': rmse,
        'cell_mean_mean_absolute_error_on_non_zeros':
            np.mean(mae_distances),
        'cell_mean_root_mean_squared_error_on_non_zeros':
            np.mean(rmse_distances),
        'cell_mean_euclidean_distance': np.mean(euclidean_distances),
        'cell_mean_cosine_distance': np.mean(cosine_distances),
        'cell_mean_correlation_distance': np.mean(correlation_distances),
    }

    # Save results to a file
    result_path = os.path.join(result_dir, "result.txt")
    with open(result_path, 'w') as result_file:
        result_file.write("## METRICS:\n")
        for metric in sorted(metric_results):
            result_file.write("%s\t%4f\n" %
                              (metric, float(metric_results[metric])))

        result_file.write("##\n## ADDITIONAL INFO:\n")
        result_file.write(
            "# CELL\troot_mean_squared_error_on_non_zeros\tmean_absolute_error_on_non_zeros\t"
            "euclidean_distance_on_non_zeros\tcosine_distance_on_non_zeros\tcorrelation_distance_on_non_zeros:\n"
        )
        cell_names = scaled_data.columns.values
        for i in range(n_cells):
            result_file.write(
                "# %s\t%f\t%f\t%f\t%f\t%f\n" %
                (cell_names[i], rmse_distances[i], mae_distances[i],
                 euclidean_distances[i], cosine_distances[i],
                 correlation_distances[i]))

    log("Evaluation results saved to `%s`" % result_path)

    if visualization != "none":
        self.visualize_result(result_dir, output_type=visualization)

    return metric_results
def evaluate_result(self, processed_count_file_path, result_dir,
                    visualization, **kwargs):
    """Score an imputed count file on the entries hidden by the mask.

    The imputed table is restored to the original column order and clipped
    at zero. Absolute differences between the hidden data and the imputed
    data are taken under the "log" and "sqrt" transformations and averaged
    over entries where the mask is set and the hidden value is non-zero,
    giving RMSE and MAE for each transformation. Original and predicted
    values at the masked positions are written to ``predictions.csv`` and
    listed in ``result.txt`` in ``result_dir``.

    :param processed_count_file_path: path of the imputed count file.
    :param result_dir: directory receiving ``result.txt`` and ``files/``.
    :param visualization: output type passed to ``self.visualize_result``;
        the string "none" skips visualization.
    :param kwargs: not read by this method.
    :return: dict with keys "RMSE_sqrt", "MAE_sqrt", "RMSE_log", "MAE_log".
    """
    files_dir = os.path.join(result_dir, "files")
    make_sure_dir_exists(files_dir)
    info = []

    # Load hidden state and data
    hidden_state = self._load_hidden_state()
    data, mask, original_columns, column_permutation = hidden_state

    # Load imputed data
    imputed_data = read_table_file(processed_count_file_path)

    # Restore column names and order
    imputed_data = rearrange_and_rename_columns(imputed_data,
                                                original_columns,
                                                column_permutation)

    # Replace negative values with zero
    imputed_data = imputed_data.clip(lower=0)

    # Evaluation
    to_log = transformations["log"]
    to_sqrt = transformations["sqrt"]
    log_diff = np.abs(to_log(data) - to_log(imputed_data))
    sqrt_diff = np.abs(to_sqrt(data) - to_sqrt(imputed_data))

    # Weight 1 on masked entries whose hidden value is non-zero
    weights = mask * np.where(data != 0, 1, 0)
    total_weight = np.sum(np.sum(weights))

    def weighted_mean(values):
        # Average of ``values`` over the weighted entries.
        return float(np.sum(np.sum(weights * values)) / total_weight)

    mse_on_log = weighted_mean(np.square(log_diff))
    mae_on_log = weighted_mean(log_diff)
    mse_on_sqrt = weighted_mean(np.square(sqrt_diff))
    mae_on_sqrt = weighted_mean(sqrt_diff)

    metric_results = {
        'RMSE_sqrt': mse_on_sqrt**0.5,
        'MAE_sqrt': mae_on_sqrt,
        'RMSE_log': mse_on_log**0.5,
        'MAE_log': mae_on_log
    }

    # Positions flagged in the mask, in row-major order
    mask_values = mask.values
    n_rows, n_cols = mask_values.shape[0], mask_values.shape[1]
    masked_locations = [(x, y)
                        for x in range(n_rows)
                        for y in range(n_cols)
                        if mask_values[x, y] == 1]

    original_values = np.asarray(
        [data.iloc[x, y] for (x, y) in masked_locations])
    predicted_values = np.asarray(
        [imputed_data.iloc[x, y] for (x, y) in masked_locations])

    predictions_df = pd.DataFrame({
        'original': original_values,
        'predicted': predicted_values
    })

    write_csv(predictions_df, os.path.join(files_dir, "predictions.csv"))
    info.append({
        'filename': "predictions.csv",
        'description': 'Original masked values along predicted values',
        'plot_description': 'Predicted values vs. original masked values',
    })

    write_csv(pd.DataFrame(info), os.path.join(files_dir, "info.csv"))

    # Save results to a file
    result_path = os.path.join(result_dir, "result.txt")
    with open(result_path, 'w') as result_file:
        result_file.write("## METRICS:\n")
        for metric in sorted(metric_results):
            result_file.write("%s\t%4f\n" % (metric, metric_results[metric]))

        result_file.write("##\n## ADDITIONAL INFO:\n")
        result_file.write("# GENE\tCELL\tGOLD_STANDARD\tRESULT:\n")
        for (x, y) in masked_locations:
            result_file.write(
                "# %s\t%s\t%f\t%f\n" %
                (data.index.values[x], data.columns.values[y],
                 data.iloc[x, y], imputed_data.iloc[x, y]))

    log("Evaluation results saved to `%s`" % result_path)

    if visualization != "none":
        self.visualize_result(result_dir, output_type=visualization)

    return metric_results