Example #1
    def generate_test_bench(self, count_file_path, **kwargs):
        preserve_columns = kwargs['preserve_columns']

        count_file_path = os.path.abspath(count_file_path)

        count_rna = self.data_set.get("RNA")
        count_adt = self.data_set.get("ADT")

        # Shuffle columns
        count_rna, original_columns, column_permutation = \
            shuffle_and_rename_columns(count_rna, disabled=preserve_columns)

        # Remove zero rows
        count_rna = count_rna[np.sum(count_rna, axis=1) > 0].copy()

        # Save hidden data
        make_sure_dir_exists(settings.STORAGE_DIR)
        hidden_data_file_path = os.path.join(settings.STORAGE_DIR,
                                             "%s.hidden.pkl.gz" % self.uid)
        dump_gzip_pickle([
            count_rna.to_sparse(), original_columns, column_permutation,
            count_adt.to_sparse(), self.protein_rna_mapping
        ], hidden_data_file_path)
        log("Benchmark hidden data saved to `%s`" % hidden_data_file_path)

        make_sure_dir_exists(os.path.dirname(count_file_path))
        write_csv(count_rna, count_file_path)
        log("Count file saved to `%s`" % count_file_path)
Example #2
    def generate_test_bench(self, count_file_path, **kwargs):
        preserve_columns = kwargs['preserve_columns']

        count_file_path = os.path.abspath(count_file_path)

        count_matrix, classes = self._load_data()

        # Remove zero rows
        count_matrix = count_matrix[np.sum(count_matrix, axis=1) > 0].copy()

        # Shuffle columns
        count_matrix, original_columns, column_permutation = \
            shuffle_and_rename_columns(count_matrix, disabled=preserve_columns)

        # Save hidden data
        make_sure_dir_exists(settings.STORAGE_DIR)
        hidden_data_file_path = os.path.join(settings.STORAGE_DIR,
                                             "%s.hidden.pkl.gz" % self.uid)
        dump_gzip_pickle([
            count_matrix.to_sparse(), classes, original_columns,
            column_permutation
        ], hidden_data_file_path)
        log("Benchmark hidden data saved to `%s`" % hidden_data_file_path)

        make_sure_dir_exists(os.path.dirname(count_file_path))
        write_csv(count_matrix, count_file_path)
        log("Count file saved to `%s`" % count_file_path)
Example #3
    def generate_test_bench(self, count_file_path, **kwargs):
        n_samples = kwargs['n_samples']
        dropout_count = kwargs['dropout_count']
        min_expression = kwargs['min_expression']
        hvg_frac = kwargs['hvg_frac']
        preserve_columns = kwargs['preserve_columns']

        count_file_path = os.path.abspath(count_file_path)
        data = self._load_data(n_samples)

        hvg_indices = self.get_hvg_genes(data, hvg_frac)

        # Generate elimination mask
        non_zero_locations = []

        data_values = data.values
        for x in hvg_indices:
            for y in range(data.shape[1]):
                if data_values[x, y] >= min_expression:
                    non_zero_locations.append((x, y))
        del data_values

        mask = np.zeros_like(data)

        masked_locations = [
            non_zero_locations[index] for index in np.random.choice(
                len(non_zero_locations), dropout_count, replace=False)
        ]

        for (x, y) in masked_locations:
            mask[x, y] = 1

        mask = pd.DataFrame(mask, index=data.index, columns=data.columns)

        # Elimination
        low_quality_data = data * (1 - mask.values)

        is_nonzero = np.sum(low_quality_data, axis=1) > 0
        mask = mask[is_nonzero].copy()
        data = data[is_nonzero].copy()
        low_quality_data = low_quality_data[is_nonzero].copy()

        # Shuffle columns
        low_quality_data, original_columns, column_permutation = \
            shuffle_and_rename_columns(low_quality_data, disabled=preserve_columns)

        # Save hidden data
        make_sure_dir_exists(settings.STORAGE_DIR)
        hidden_data_file_path = os.path.join(settings.STORAGE_DIR,
                                             "%s.hidden.pkl.gz" % self.uid)
        dump_gzip_pickle([
            data.to_sparse(),
            mask.to_sparse(), original_columns, column_permutation
        ], hidden_data_file_path)
        log("Benchmark hidden data saved to `%s`" % hidden_data_file_path)

        make_sure_dir_exists(os.path.dirname(count_file_path))
        write_csv(low_quality_data, count_file_path)
        log("Count file saved to `%s`" % count_file_path)
Example #4
    def generate_test_bench(self, count_file_path, **kwargs):
        n_samples = kwargs['n_samples']
        read_ratio = kwargs['read_ratio']
        replace = kwargs['replace']
        preserve_columns = kwargs['preserve_columns']

        count_file_path = os.path.abspath(count_file_path)
        data = self._load_data(n_samples)

        # find cumulative distribution (sum)
        data_values = data.astype(int).values
        n_all_reads = np.sum(data_values)
        data_cumsum = np.reshape(np.cumsum(data_values), data_values.shape)

        # Sample from original dataset
        new_reads = np.sort(
            np.random.choice(n_all_reads,
                             int(read_ratio * n_all_reads),
                             replace=replace))

        low_quality_data = np.zeros_like(data_values)
        read_index = 0
        for x in range(data_values.shape[0]):
            for y in range(data_values.shape[1]):
                while (read_index < len(new_reads)
                       and new_reads[read_index] < data_cumsum[x, y]):
                    low_quality_data[x, y] += 1
                    read_index += 1

        # Convert to data frame
        low_quality_data = pd.DataFrame(low_quality_data,
                                        index=data.index,
                                        columns=data.columns)

        # Shuffle columns
        low_quality_data, original_columns, column_permutation = \
            shuffle_and_rename_columns(low_quality_data, disabled=preserve_columns)

        # Remove zero rows
        is_nonzero = np.sum(low_quality_data, axis=1) > 0
        data = data[is_nonzero].copy()
        low_quality_data = low_quality_data[is_nonzero].copy()

        # Save hidden data
        make_sure_dir_exists(settings.STORAGE_DIR)
        hidden_data_file_path = os.path.join(settings.STORAGE_DIR,
                                             "%s.hidden.pkl.gz" % self.uid)
        dump_gzip_pickle([
            data.to_sparse(), read_ratio, original_columns, column_permutation
        ], hidden_data_file_path)
        log("Benchmark hidden data saved to `%s`" % hidden_data_file_path)

        make_sure_dir_exists(os.path.dirname(count_file_path))
        write_csv(low_quality_data, count_file_path)
        log("Count file saved to `%s`" % count_file_path)
Example #5
    def _download_data_set(self):
        make_sure_dir_exists(self.DATA_SET_DIR_NAME)
        download_file_if_not_exists(self.PBMC_RNA_DATA_URL, self.PBMC_RNA_DATA_FILE_PATH, self.PBMC_RNA_DATA_MD5_SUM)
        download_file_if_not_exists(self.PBMC_ADT_DATA_URL, self.PBMC_ADT_DATA_FILE_PATH, self.PBMC_ADT_DATA_MD5_SUM)
        download_file_if_not_exists(self.PBMC_TRANSFORMED_ADT_DATA_URL, self.PBMC_TRANSFORMED_ADT_DATA_FILE_PATH,
                                    self.PBMC_TRANSFORMED_ADT_DATA_MD5_SUM)
        download_file_if_not_exists(self.CBMC_RNA_DATA_URL, self.CBMC_RNA_DATA_FILE_PATH, self.CBMC_RNA_DATA_MD5_SUM)
        download_file_if_not_exists(self.CBMC_ADT_DATA_URL, self.CBMC_ADT_DATA_FILE_PATH, self.CBMC_ADT_DATA_MD5_SUM)
        download_file_if_not_exists(self.CBMC_TRANSFORMED_ADT_DATA_URL, self.CBMC_TRANSFORMED_ADT_DATA_FILE_PATH,
                                    self.CBMC_TRANSFORMED_ADT_DATA_MD5_SUM)
        download_file_if_not_exists(self.CD8_RNA_DATA_URL, self.CD8_RNA_DATA_FILE_PATH, self.CD8_RNA_DATA_MD5_SUM)
        download_file_if_not_exists(self.CD8_ADT_DATA_URL, self.CD8_ADT_DATA_FILE_PATH, self.CD8_ADT_DATA_MD5_SUM)
        download_file_if_not_exists(self.CD8_TRANSFORMED_ADT_DATA_URL, self.CD8_TRANSFORMED_ADT_DATA_FILE_PATH,
                                    self.CD8_TRANSFORMED_ADT_DATA_MD5_SUM)
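
download_file_if_not_exists is another helper whose definition is not shown. A minimal sketch of what it plausibly does, using only the standard library (the error handling here is illustrative):

import hashlib
import os
import urllib.request

def download_file_if_not_exists(url, file_path, expected_md5):
    # Skip the download when the file is already present; verify the
    # checksum either way.
    if not os.path.exists(file_path):
        urllib.request.urlretrieve(url, file_path)
    with open(file_path, "rb") as f:
        md5 = hashlib.md5(f.read()).hexdigest()
    if md5 != expected_md5:
        raise ValueError("MD5 mismatch for `%s`" % file_path)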
Example #6
    def generate_test_bench(self, count_file_path, **kwargs):
        count_file_path = os.path.abspath(count_file_path)
        rm_ercc = kwargs['rm_ercc']
        rm_mt = kwargs['rm_mt']
        rm_lq = kwargs['rm_lq']
        preserve_columns = kwargs['preserve_columns']

        # Load dataset
        data = self._load_and_combine_data()

        # Remove some rows and columns
        if rm_ercc:
            remove_list = [
                symbol for symbol in data.index.values
                if symbol.startswith("ERCC-")
            ]
            data = data.drop(remove_list)
        if rm_mt:
            remove_list = [
                symbol for symbol in data.index.values
                if symbol.startswith("mt-")
            ]
            data = data.drop(remove_list)
        if rm_lq:
            remove_list = data.columns.values[data.sum(axis=0) < 1e6]
            data = data.drop(columns=remove_list)
        # Remove empty rows
        remove_list = data.index.values[data.sum(axis=1) == 0]
        data = data.drop(remove_list)

        # Shuffle columns
        new_data, original_columns, column_permutation = shuffle_and_rename_columns(
            data, disabled=preserve_columns)

        # Save hidden data
        make_sure_dir_exists(settings.STORAGE_DIR)
        hidden_data_file_path = os.path.join(settings.STORAGE_DIR,
                                             "%s.hidden.pkl.gz" % self.uid)
        dump_gzip_pickle(
            [data.to_sparse(), original_columns, column_permutation],
            hidden_data_file_path)
        log("Benchmark hidden data saved to `%s`" % hidden_data_file_path)

        make_sure_dir_exists(os.path.dirname(count_file_path))
        write_csv(new_data, count_file_path)
        log("Count file saved to `%s`" % count_file_path)

        return None
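
As an aside, the prefix and sum filters above can also be written with pandas selection directly; a behavior-equivalent sketch for string-labelled rows:

if rm_ercc:
    data = data[~data.index.str.startswith("ERCC-")]
if rm_mt:
    data = data[~data.index.str.startswith("mt-")]
if rm_lq:
    data = data.loc[:, data.sum(axis=0) >= 1e6]
data = data[data.sum(axis=1) != 0]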
Example #7
    def evaluate_result(self, processed_count_file_path, result_dir,
                        visualization, **kwargs):
        normalization = kwargs['normalization']
        transformation = kwargs['transformation']
        clear_cache = kwargs['clear_cache']

        make_sure_dir_exists(os.path.join(result_dir, "files"))
        info = []

        # Load hidden state and data
        count_matrix, classes, original_columns, column_permutation = \
            self._load_hidden_state()

        # Load imputed data
        imputed_data = read_table_file(processed_count_file_path)

        # Restore column names and order
        imputed_data = rearrange_and_rename_columns(imputed_data,
                                                    original_columns,
                                                    column_permutation)

        # Data transformations
        if np.sum(imputed_data.values < 0) > 0:
            log("Observed some negative values!")
            imputed_data[imputed_data < 0] = 0
        imputed_data = transformations[transformation](
            normalizations[normalization](imputed_data))

        # Save class details for future
        write_csv(classes, os.path.join(result_dir, "files", "classes.csv"))

        # Evaluation
        metric_results = dict()

        embedded_data_file_path = os.path.join(result_dir, "files",
                                               "embedded_data.pkl.gz")
        if os.path.exists(embedded_data_file_path) and not clear_cache:
            embedded_data = load_gzip_pickle(embedded_data_file_path)
        else:
            embedded_data = self._get_embeddings(imputed_data)
            dump_gzip_pickle(embedded_data, embedded_data_file_path)

        log("Evaluating ...")
        for class_label in classes.index.values:
            class_names = classes.loc[class_label].values
            for embedding_name in embedded_data:
                emb, emb_2d = embedded_data[embedding_name]

                embedding_slug = embedding_name.replace(" ", "_").lower()

                k_means = KMeans(n_clusters=len(set(class_names)))
                k_means.fit(emb)
                clusters = k_means.predict(emb)

                embedding_df = pd.DataFrame(emb)
                embedding_df["X"] = emb_2d[:, 0]
                embedding_df["Y"] = emb_2d[:, 1]
                embedding_df["class"] = class_names
                embedding_df["k_means_clusters"] = clusters
                write_csv(
                    embedding_df,
                    os.path.join(result_dir, "files",
                                 "%s_%s.csv" % (class_label, embedding_slug)))
                info.append({
                    'filename':
                    "%s_%s.csv" % (class_label, embedding_slug),
                    'description':
                    '%s embedding of cells along %s labels' %
                    (embedding_name, class_label),
                    'plot_description':
                    '%s embedding of cells along %s labels (Classes can be identified '
                    'with their colors and K-means clusters are marked '
                    'with different shapes)' % (embedding_name, class_label),
                })

                metric_results.update({
                    'kmeans_on_%s_%s_adjusted_mutual_info_score' % (embedding_slug, class_label):
                    adjusted_mutual_info_score(class_names,
                                               clusters,
                                               average_method="arithmetic"),
                    'kmeans_on_%s_%s_v_measure_score' % (embedding_slug, class_label):
                    v_measure_score(class_names, clusters),
                    'embedding_%s_%s_calinski_harabaz_score' % (embedding_slug, class_label):
                    calinski_harabaz_score(emb, class_names),
                    'embedding_%s_%s_silhouette_score' % (embedding_slug, class_label):
                    silhouette_score(emb, class_names)
                })

        write_csv(pd.DataFrame(info),
                  os.path.join(result_dir, "files", "info.csv"))

        result_path = os.path.join(result_dir, "result.txt")
        with open(result_path, 'w') as file:
            file.write("## METRICS:\n")
            for metric in sorted(metric_results):
                file.write("%s\t%4f\n" % (metric, metric_results[metric]))

            file.write("##\n## ADDITIONAL INFO:\n")

        log("Evaluation results saved to `%s`" % result_path)

        if visualization != "none":
            self.visualize_result(result_dir, output_type=visualization)

        return metric_results
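
_get_embeddings is not shown, but the loop above fixes its contract: a dict mapping an embedding name to a (full embedding, 2-D embedding) pair with cells as rows. A sketch under that assumption (the real method may compute other embeddings):

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

def _get_embeddings(self, imputed_data):
    # Cells are columns of the count matrix, so transpose first.
    cells = imputed_data.transpose().values
    pca = PCA(n_components=min(32, min(cells.shape))).fit_transform(cells)
    tsne = TSNE(n_components=2).fit_transform(pca)
    return {"PCA": (pca, pca[:, :2]), "tSNE": (tsne, tsne)}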
Example #8
    def evaluate_result(self, processed_count_file_path, result_dir,
                        visualization, **kwargs):
        normalization = kwargs['normalization']
        transformation = kwargs['transformation']

        # Load hidden state and data
        count_matrix_lq, original_columns, column_permutation, count_matrix_hq = \
            self._load_hidden_state()

        # Load imputed data
        imputed_data = read_table_file(processed_count_file_path)

        # Restore column names and order
        imputed_data = rearrange_and_rename_columns(imputed_data,
                                                    original_columns,
                                                    column_permutation)

        # Replace negative values with zero
        imputed_data = imputed_data.clip(lower=0)

        # Data transformations
        imputed_data = transformations[transformation](
            normalizations[normalization](imputed_data))
        count_matrix_hq = transformations[transformation](
            normalizations[normalization](count_matrix_hq))

        # Evaluation
        rmse_distances = []
        mae_distances = []
        euclidean_distances = []
        cosine_distances = []
        correlation_distances = []

        for i in range(count_matrix_hq.shape[1]):
            non_zeros = np.logical_and(count_matrix_hq.values[:, i] > 0,
                                       count_matrix_lq.values[:, i] == 0)
            hq = count_matrix_hq.values[non_zeros, i]
            lq = count_matrix_lq.values[non_zeros, i]
            y = imputed_data.values[non_zeros, i]
            if np.sum(y) > 0:
                y = y * np.sum(hq) / np.sum(y)
            rmse_distances.append(float(np.mean(np.square(hq - y)))**0.5)
            mae_distances.append(float(np.mean(np.abs(hq - y))))
            euclidean_distances.append(
                pdist(np.vstack((hq, y)), 'euclidean')[0])
            cosine_distances.append(pdist(np.vstack((hq, y)), 'cosine')[0])
            correlation_distances.append(
                pdist(np.vstack((hq, y)), 'correlation')[0])

        metric_results = {
            'cell_root_mean_squared_error': np.mean(rmse_distances),
            'cell_mean_absolute_error': np.mean(mae_distances),
            'cell_mean_euclidean_distance': np.mean(euclidean_distances),
            'cell_mean_cosine_distance': np.mean(cosine_distances),
            'cell_mean_correlation_distance': np.mean(correlation_distances)
        }

        # Save results to a file
        make_sure_dir_exists(os.path.join(result_dir, "files"))
        result_path = os.path.join(result_dir, "result.txt")
        with open(result_path, 'w') as file:
            file.write("## METRICS:\n")
            for metric in sorted(metric_results):
                file.write("%s\t%4f\n" %
                           (metric, float(metric_results[metric])))

            file.write("##\n## ADDITIONAL INFO:\n")
            file.write(
                "# CELL\troot_mean_squared_error\tmean_absolute_error\tmean_euclidean_distance\t"
                "mean_cosine_distance\tmean_correlation_distance:\n")
            for i in range(count_matrix_hq.shape[1]):
                file.write(
                    "# %s\t%f\t%f\t%f\t%f\t%f\n" %
                    (count_matrix_hq.columns.values[i], rmse_distances[i],
                     mae_distances[i], euclidean_distances[i],
                     cosine_distances[i], correlation_distances[i]))

        log("Evaluation results saved to `%s`" % result_path)

        if visualization != "none":
            self.visualize_result(result_dir, output_type=visualization)

        return metric_results
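
A note on the pdist idiom above: with exactly two rows stacked, pdist returns a single pairwise distance, so indexing [0] extracts the hq-vs-imputed distance directly. A tiny self-contained check:

import numpy as np
from scipy.spatial.distance import pdist

a = np.array([1.0, 2.0, 3.0])
b = np.array([1.0, 0.0, 3.0])
assert pdist(np.vstack((a, b)), 'euclidean')[0] == 2.0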
Example #9
    def evaluate_result(self, processed_count_file_path, result_dir,
                        visualization, **kwargs):
        make_sure_dir_exists(os.path.join(result_dir, "files"))

        transformation = kwargs['transformation']

        # Load hidden state and data
        _, original_columns, column_permutation, count_adt, protein_rna_mapping = \
            self._load_hidden_state()

        # Load imputed data
        imputed_rna = read_table_file(processed_count_file_path)

        # Restore column names and order
        imputed_rna = rearrange_and_rename_columns(imputed_rna,
                                                   original_columns,
                                                   column_permutation)

        # Data transformations
        imputed_rna = transformations[transformation](imputed_rna)
        count_adt = transformations[transformation](count_adt)

        # Use related data
        adt = count_adt.loc[[
            prot for prot in count_adt.index.values
            if (protein_rna_mapping[prot] in imputed_rna.index.values)
        ]].copy()
        adt.index = ["prot_" + p for p in adt.index.values]
        rna = imputed_rna.loc[[
            protein_rna_mapping[prot] for prot in count_adt.index.values
            if (protein_rna_mapping[prot] in imputed_rna.index.values)
        ]]
        rna.index = ["gene_" + g for g in rna.index.values]

        info = []

        write_csv(adt, os.path.join(result_dir, "files", "adt.csv"))
        info.append({
            'filename':
            "adt.csv",
            'description':
            'Protein expressions (adt) after transformation',
            'plot_description':
            'Protein expressions (adt) after transformation',
        })

        write_csv(rna, os.path.join(result_dir, "files", "rna.csv"))
        info.append({
            'filename':
            "rna.csv",
            'description':
            'Gene expressions of genes related to adt data after transformation',
            'plot_description':
            'Gene expressions of genes related to adt data after transformation',
        })

        n = adt.shape[0]

        # Calculating Spearman correlations
        combined_df = pd.concat((adt, rna)).transpose()
        correlations = combined_df.corr(method="spearman")

        adt_adt_spearmanr = correlations.iloc[:n, :n]
        rna_rna_spearmanr = correlations.iloc[n:, n:]
        adt_rna_spearmanr = correlations.iloc[:n, n:]

        write_csv(
            correlations,
            os.path.join(result_dir, "files", "spearman_correlations.csv"))
        info.append({
            'filename':
            "spearman_correlations.csv",
            'description':
            'Pairwise Spearman correlations (first n items are '
            'adt expressions and second n items are rna expressions)',
            'plot_description':
            'Pairwise Spearman correlations (first n items are '
            'adt expressions and second n items are rna expressions)',
        })

        # Calculating Pearson correlations (reusing the same combined frame)
        correlations = combined_df.corr(method="pearson")

        adt_adt_pearsonr = correlations.iloc[:n, :n]
        rna_rna_pearsonr = correlations.iloc[n:, n:]
        adt_rna_pearsonr = correlations.iloc[:n, n:]

        write_csv(
            correlations,
            os.path.join(result_dir, "files", "pearson_correlations.csv"))
        info.append({
            'filename':
            "pearson_correlations.csv",
            'description':
            'Pairwise Pearson correlations (first n items are '
            'adt expressions and second n items are rna expressions)',
            'plot_description':
            'Pairwise Pearson correlations (first n items are '
            'adt expressions and second n items are rna expressions)',
        })

        # Evaluation
        metric_results = {
            'rna_protein_mean_spearman_correlation':
            np.mean(adt_rna_spearmanr.values.diagonal()),
            'rna_protein_mean_pearson_correlation':
            np.mean(adt_rna_pearsonr.values.diagonal()),
            'MSE_of_adt_adt_and_rna_rna_spearman_correlations':
            np.mean((adt_adt_spearmanr.values - rna_rna_spearmanr.values)**2),
            'MSE_of_adt_adt_and_rna_rna_pearson_correlations':
            np.mean((adt_adt_pearsonr.values - rna_rna_pearsonr.values)**2)
        }

        write_csv(pd.DataFrame(info),
                  os.path.join(result_dir, "files", "info.csv"))

        # Save results to a file
        result_path = os.path.join(result_dir, "result.txt")
        with open(result_path, 'w') as file:
            file.write("## METRICS:\n")
            for metric in sorted(metric_results):
                file.write("%s\t%4f\n" %
                           (metric, float(metric_results[metric])))

            file.write("##\n## ADDITIONAL INFO:\n")
            file.write("## Pearson of adt/rna:\n")
            file.write("## " +
                       "\n## ".join(adt_rna_pearsonr.to_string().split("\n")) +
                       "\n")
            file.write('## Spearman of adt/rna:\n')
            file.write(
                "## " +
                "\n## ".join(adt_rna_spearmanr.to_string().split("\n")) + "\n")
            file.write("## Pearson of adt/adt:\n")
            file.write("## " +
                       "\n## ".join(adt_adt_pearsonr.to_string().split("\n")) +
                       "\n")
            file.write("## Pearson of rna/rna:\n")
            file.write("## " +
                       "\n## ".join(rna_rna_pearsonr.to_string().split("\n")) +
                       "\n")
            file.write('## Spearman of adt/adt:\n')
            file.write(
                "## " +
                "\n## ".join(adt_adt_spearmanr.to_string().split("\n")) + "\n")
            file.write('## Spearman of rna/rna:\n')
            file.write(
                "## " +
                "\n## ".join(rna_rna_spearmanr.to_string().split("\n")) + "\n")

        log("Evaluation results saved to `%s`" % result_path)

        if visualization != "none":
            self.visualize_result(result_dir, output_type=visualization)

        return metric_results
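
The block slicing above relies on the concatenation order: with n proteins stacked on top of n matched genes, .iloc[:n, n:] is the protein-vs-gene block and its diagonal pairs each protein with its own gene. A tiny illustration with made-up data:

import numpy as np
import pandas as pd

n = 2
combined = pd.DataFrame(np.random.rand(2 * n, 10),
                        index=["prot_A", "prot_B", "gene_A", "gene_B"])
corr = combined.transpose().corr(method="spearman")
adt_rna = corr.iloc[:n, n:]
print(adt_rna.values.diagonal())  # (prot_A, gene_A), (prot_B, gene_B)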
Example #10
    def _download_data_set(self):
        make_sure_dir_exists(os.path.dirname(self.DATA_SET_FILE_PATH))
        download_file_if_not_exists(self.DATA_SET_URL,
                                    self.DATA_SET_FILE_PATH,
                                    self.DATA_SET_MD5_SUM)
Example #11
    def evaluate_result(self, processed_count_file, result_dir, visualization,
                        **kwargs):
        normalization = kwargs['normalization']
        transformation = kwargs['transformation']
        clear_cache = kwargs['clear_cache']

        make_sure_dir_exists(os.path.join(result_dir, "files"))
        info = []

        data, imputed_data = self._load_data_and_imputed_data_for_evaluation(
            processed_count_file)
        gold_standard_classes = [
            column_name.split("_")[0] for column_name in data.columns.values
        ]

        # Data transformations
        if np.sum(imputed_data.values < 0) > 0:
            log("Observed some negative values!")
            imputed_data[imputed_data < 0] = 0
        imputed_data = transformations[transformation](
            normalizations[normalization](imputed_data))

        G1_S_related_part_of_imputed_data, G2_M_related_part_of_imputed_data = \
            self._get_related_part(imputed_data)

        related_part_of_imputed_data = pd.concat([
            G1_S_related_part_of_imputed_data,
            G2_M_related_part_of_imputed_data
        ])

        write_csv(
            G1_S_related_part_of_imputed_data,
            os.path.join(result_dir, "files",
                         "G1_S_related_part_of_imputed_data.csv"))
        info.append({
            'filename': 'G1_S_related_part_of_imputed_data.csv',
            'description': 'Values of genes related to G1/S',
            'plot_description': 'Heatmap of Genes related to G1/S',
        })

        write_csv(
            G2_M_related_part_of_imputed_data,
            os.path.join(result_dir, "files",
                         "G2_M_related_part_of_imputed_data.csv"))
        info.append({
            'filename': 'G2_M_related_part_of_imputed_data.csv',
            'description': 'Values of genes related to G2/M',
            'plot_description': 'Heatmap of Genes related to G2/M',
        })

        svm_results, knn_results = self._get_classification_results(
            related_part_of_imputed_data, gold_standard_classes)

        embedded_data_file_path = os.path.join(result_dir, "files",
                                               "embedded_data.pkl.gz")
        if os.path.exists(embedded_data_file_path) and not clear_cache:
            embedded_data = load_gzip_pickle(embedded_data_file_path)
        else:
            embedded_data = self._get_embeddings(related_part_of_imputed_data)
            dump_gzip_pickle(embedded_data, embedded_data_file_path)

        metric_results = {
            "classification_svm_mean_accuracy": np.mean(svm_results),
            "classification_knn_mean_accuracy": np.mean(knn_results)
        }

        embedded_data["identity"] = related_part_of_imputed_data.transpose()

        for i, embedding_name in enumerate(embedded_data):
            emb = embedded_data[embedding_name]

            k_means = KMeans(n_clusters=3)
            k_means.fit(emb)
            clusters = k_means.predict(emb)

            embedding_slug = embedding_name.replace(" ", "_").lower()

            if embedding_name != "identity":
                embedding_df = pd.DataFrame(
                    {
                        "X": emb[:, 0],
                        "Y": emb[:, 1],
                        "class": gold_standard_classes,
                        "k_means_clusters": clusters
                    },
                    index=data.columns.values)
                write_csv(
                    embedding_df,
                    os.path.join(result_dir, "files",
                                 "%s.csv" % embedding_slug))
                info.append({
                    'filename':
                    "%s.csv" % embedding_slug,
                    'description':
                    '%s embedding of cells considering genes related '
                    'to cell-cycle' % embedding_name,
                    'plot_description':
                    '%s embedding of cells considering genes related '
                    'to cell-cycle (K-means clusters are marked '
                    'with different shapes)' % embedding_name,
                })

            metric_results.update({
                'kmeans_on_%s_adjusted_mutual_info_score' % embedding_slug:
                adjusted_mutual_info_score(gold_standard_classes,
                                           clusters,
                                           average_method="arithmetic"),
                'kmeans_on_%s_v_measure_score' % embedding_slug:
                v_measure_score(gold_standard_classes, clusters),
                'embedding_%s_calinski_harabaz_score' % embedding_slug:
                calinski_harabaz_score(emb, gold_standard_classes),
                'embedding_%s_silhouette_score' % embedding_slug:
                silhouette_score(emb, gold_standard_classes)
            })

        write_csv(pd.DataFrame(info),
                  os.path.join(result_dir, "files", "info.csv"))

        result_path = os.path.join(result_dir, "result.txt")
        with open(result_path, 'w') as file:
            file.write("## METRICS:\n")
            for metric in sorted(metric_results):
                file.write("%s\t%4f\n" % (metric, metric_results[metric]))

            file.write("##\n## ADDITIONAL INFO:\n")
            file.write("## SVM classifiers accuracies: %s\n" %
                       str(svm_results))
            file.write("## KNN classifiers accuracies: %s\n" %
                       str(knn_results))

        log("Evaluation results saved to `%s`" % result_path)

        if visualization != "none":
            self.visualize_result(result_dir, output_type=visualization)

        return metric_results
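
_get_classification_results is not shown; from its use here it returns per-fold accuracies for an SVM and a KNN classifier. A plausible sketch with scikit-learn cross-validation (the classifier choices and cv=5 are assumptions):

from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

def _get_classification_results(self, related_part, classes):
    # Cells are columns of the expression matrix, so transpose to
    # get one sample per cell.
    features = related_part.transpose().values
    svm_results = cross_val_score(SVC(), features, classes, cv=5)
    knn_results = cross_val_score(KNeighborsClassifier(), features,
                                  classes, cv=5)
    return svm_results, knn_results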
Example #12
    def evaluate_result(self, processed_count_file_path, result_dir,
                        visualization, **kwargs):
        transformation = kwargs['transformation']

        make_sure_dir_exists(os.path.join(result_dir, "files"))

        # Load hidden state and data
        scaled_data, original_columns, column_permutation = \
            self._load_hidden_state()

        # Load imputed data
        imputed_data = read_table_file(processed_count_file_path)

        # Restore column names and order
        imputed_data = rearrange_and_rename_columns(imputed_data,
                                                    original_columns,
                                                    column_permutation)

        # Replace negative values with zero
        imputed_data = imputed_data.clip(lower=0)

        # Data transformation
        scaled_data = transformations[transformation](scaled_data)
        imputed_data = transformations[transformation](imputed_data)

        # Evaluation
        rmse_distances = []
        mae_distances = []
        euclidean_distances = []
        cosine_distances = []
        correlation_distances = []

        non_zero_mask = np.where(scaled_data.values > 0, 1, 0)
        rmse = float(
            np.sum(non_zero_mask *
                   np.square(scaled_data.values - imputed_data.values)) /
            np.sum(non_zero_mask))**0.5
        mae = float(
            np.sum(non_zero_mask *
                   np.abs(scaled_data.values - imputed_data.values)) /
            np.sum(non_zero_mask))

        for i in range(scaled_data.shape[1]):
            non_zeros = scaled_data.values[:, i] > 0
            x = scaled_data.values[non_zeros, i]
            y = imputed_data.values[non_zeros, i]
            rmse_distances.append(
                float(np.sum(np.square(x - y)) / np.sum(non_zeros))**0.5)
            mae_distances.append(
                float(np.sum(np.abs(x - y)) / np.sum(non_zeros)))
            cosine_distances.append(pdist(np.vstack((x, y)), 'cosine')[0])
            euclidean_distances.append(
                pdist(np.vstack((x, y)), 'euclidean')[0])
            correlation_distances.append(
                pdist(np.vstack((x, y)), 'correlation')[0])

        metric_results = {
            'all_mean_absolute_error_on_non_zeros':
            mae,
            'all_root_mean_squared_error_on_non_zeros':
            rmse,
            'cell_mean_mean_absolute_error_on_non_zeros':
            np.mean(mae_distances),
            'cell_mean_root_mean_squared_error_on_non_zeros':
            np.mean(rmse_distances),
            'cell_mean_euclidean_distance':
            np.mean(euclidean_distances),
            'cell_mean_cosine_distance':
            np.mean(cosine_distances),
            'cell_mean_correlation_distance':
            np.mean(correlation_distances),
        }

        # Save results to a file
        result_path = os.path.join(result_dir, "result.txt")
        with open(result_path, 'w') as file:
            file.write("## METRICS:\n")
            for metric in sorted(metric_results):
                file.write("%s\t%4f\n" %
                           (metric, float(metric_results[metric])))

            file.write("##\n## ADDITIONAL INFO:\n")
            file.write(
                "# CELL\troot_mean_squared_error_on_non_zeros\tmean_absolute_error_on_non_zeros\t"
                "euclidean_distance_on_non_zeros\tcosine_distance_on_non_zeros\tcorrelation_distance_on_non_zeros:\n"
            )
            for i in range(scaled_data.shape[1]):
                file.write("# %s\t%f\t%f\t%f\t%f\t%f\n" %
                           (scaled_data.columns.values[i], rmse_distances[i],
                            mae_distances[i], euclidean_distances[i],
                            cosine_distances[i], correlation_distances[i]))

        log("Evaluation results saved to `%s`" % result_path)

        if visualization != "none":
            self.visualize_result(result_dir, output_type=visualization)

        return metric_results
Example #13
    def evaluate_result(self, processed_count_file_path, result_dir,
                        visualization, **kwargs):
        make_sure_dir_exists(os.path.join(result_dir, "files"))
        info = []

        # Load hidden state and data
        data, mask, original_columns, column_permutation = \
            self._load_hidden_state()

        # Load imputed data
        imputed_data = read_table_file(processed_count_file_path)

        # Restore column names and order
        imputed_data = rearrange_and_rename_columns(imputed_data,
                                                    original_columns,
                                                    column_permutation)

        # Replace negative values with zero
        imputed_data = imputed_data.clip(lower=0)

        # Evaluation
        log_diff = np.abs(transformations["log"](data) -
                          transformations["log"](imputed_data))
        sqrt_diff = np.abs(transformations["sqrt"](data) -
                           transformations["sqrt"](imputed_data))

        observed_mask = mask * np.where(data != 0, 1, 0)
        n_observed = np.sum(np.sum(observed_mask))

        mse_on_log = float(
            np.sum(np.sum(observed_mask * np.square(log_diff))) / n_observed)
        mae_on_log = float(
            np.sum(np.sum(observed_mask * np.abs(log_diff))) / n_observed)
        mse_on_sqrt = float(
            np.sum(np.sum(observed_mask * np.square(sqrt_diff))) / n_observed)
        mae_on_sqrt = float(
            np.sum(np.sum(observed_mask * np.abs(sqrt_diff))) / n_observed)

        metric_results = {
            'RMSE_sqrt': mse_on_sqrt**0.5,
            'MAE_sqrt': mae_on_sqrt,
            'RMSE_log': mse_on_log**0.5,
            'MAE_log': mae_on_log
        }

        masked_locations = []
        mask_values = mask.values
        for x in range(mask_values.shape[0]):
            for y in range(mask_values.shape[1]):
                if mask_values[x, y] == 1:
                    masked_locations.append((x, y))

        original_values = []
        predicted_values = []
        for (x, y) in masked_locations:
            original_values.append(data.iloc[x, y])
            predicted_values.append(imputed_data.iloc[x, y])

        original_values = np.asarray(original_values)
        predicted_values = np.asarray(predicted_values)

        predictions_df = pd.DataFrame({
            'original': original_values,
            'predicted': predicted_values
        })
        write_csv(predictions_df,
                  os.path.join(result_dir, "files", "predictions.csv"))
        info.append({
            'filename':
            "predictions.csv",
            'description':
            'Original masked values along predicted values',
            'plot_description':
            'Predicted values vs. original masked values',
        })

        write_csv(pd.DataFrame(info),
                  os.path.join(result_dir, "files", "info.csv"))

        # Save results to a file
        result_path = os.path.join(result_dir, "result.txt")
        with open(result_path, 'w') as file:
            file.write("## METRICS:\n")
            for metric in sorted(metric_results):
                file.write("%s\t%4f\n" % (metric, metric_results[metric]))

            file.write("##\n## ADDITIONAL INFO:\n")
            file.write("# GENE\tCELL\tGOLD_STANDARD\tRESULT:\n")
            for (x, y) in masked_locations:
                file.write("# %s\t%s\t%f\t%f\n" %
                           (data.index.values[x], data.columns.values[y],
                            data.iloc[x, y], imputed_data.iloc[x, y]))

        log("Evaluation results saved to `%s`" % result_path)

        if visualization != "none":
            self.visualize_result(result_dir, output_type=visualization)

        return metric_results
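
Finally, the masked-location scan and the value gathering in this last example can be vectorized using the same names; np.where visits positions in the same row-major order as the nested loops:

import numpy as np

rows, cols = np.where(mask.values == 1)
masked_locations = list(zip(rows, cols))
original_values = data.values[rows, cols]
predicted_values = imputed_data.values[rows, cols]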