Example #1
    def generate_test_bench(self, count_file_path, **kwargs):
        preserve_columns = kwargs['preserve_columns']

        count_file_path = os.path.abspath(count_file_path)

        count_rna = self.data_set.get("RNA")
        count_adt = self.data_set.get("ADT")

        # Shuffle columns
        count_rna, original_columns, column_permutation = \
            shuffle_and_rename_columns(count_rna, disabled=preserve_columns)

        # Remove zero rows
        count_rna = count_rna[np.sum(count_rna, axis=1) > 0].copy()

        # Save hidden data
        make_sure_dir_exists(settings.STORAGE_DIR)
        hidden_data_file_path = os.path.join(settings.STORAGE_DIR,
                                             "%s.hidden.pkl.gz" % self.uid)
        dump_gzip_pickle([
            count_rna.to_sparse(), original_columns, column_permutation,
            count_adt.to_sparse(), self.protein_rna_mapping
        ], hidden_data_file_path)
        log("Benchmark hidden data saved to `%s`" % hidden_data_file_path)

        make_sure_dir_exists(os.path.dirname(count_file_path))
        write_csv(count_rna, count_file_path)
        log("Count file saved to `%s`" % count_file_path)
Example #2
    def generate_test_bench(self, count_file_path, **kwargs):
        preserve_columns = kwargs['preserve_columns']

        count_file_path = os.path.abspath(count_file_path)

        count_matrix, classes = self._load_data()

        # Remove zero rows
        count_matrix = count_matrix[np.sum(count_matrix, axis=1) > 0].copy()

        # Shuffle columns
        count_matrix, original_columns, column_permutation = \
            shuffle_and_rename_columns(count_matrix, disabled=preserve_columns)

        # Save hidden data
        make_sure_dir_exists(settings.STORAGE_DIR)
        hidden_data_file_path = os.path.join(settings.STORAGE_DIR,
                                             "%s.hidden.pkl.gz" % self.uid)
        dump_gzip_pickle([
            count_matrix.to_sparse(), classes, original_columns,
            column_permutation
        ], hidden_data_file_path)
        log("Benchmark hidden data saved to `%s`" % hidden_data_file_path)

        make_sure_dir_exists(os.path.dirname(count_file_path))
        write_csv(count_matrix, count_file_path)
        log("Count file saved to `%s`" % count_file_path)
Example #3
    def generate_test_bench(self, count_file_path, **kwargs):
        n_samples = kwargs['n_samples']
        dropout_count = kwargs['dropout_count']
        min_expression = kwargs['min_expression']
        hvg_frac = kwargs['hvg_frac']
        preserve_columns = kwargs['preserve_columns']

        count_file_path = os.path.abspath(count_file_path)
        data = self._load_data(n_samples)

        hvg_indices = self.get_hvg_genes(data, hvg_frac)

        # Generate elimination mask over highly variable genes
        candidate_locations = []

        data_values = data.values
        for x in hvg_indices:
            for y in range(data.shape[1]):
                if data_values[x, y] >= min_expression:
                    candidate_locations.append((x, y))
        del data_values

        mask = np.zeros_like(data)

        masked_locations = [
            candidate_locations[index] for index in np.random.choice(
                len(candidate_locations), dropout_count, replace=False)
        ]

        for (x, y) in masked_locations:
            mask[x, y] = 1

        mask = pd.DataFrame(mask, index=data.index, columns=data.columns)

        # Elimination
        low_quality_data = data * (1 - mask.values)

        is_nonzero = np.sum(low_quality_data, axis=1) > 0
        mask = mask[is_nonzero].copy()
        data = data[is_nonzero].copy()
        low_quality_data = low_quality_data[is_nonzero].copy()

        # Shuffle columns
        low_quality_data, original_columns, column_permutation = \
            shuffle_and_rename_columns(low_quality_data, disabled=preserve_columns)

        # Save hidden data
        make_sure_dir_exists(settings.STORAGE_DIR)
        hidden_data_file_path = os.path.join(settings.STORAGE_DIR,
                                             "%s.hidden.pkl.gz" % self.uid)
        dump_gzip_pickle([
            data.to_sparse(),
            mask.to_sparse(), original_columns, column_permutation
        ], hidden_data_file_path)
        log("Benchmark hidden data saved to `%s`" % hidden_data_file_path)

        make_sure_dir_exists(os.path.dirname(count_file_path))
        write_csv(low_quality_data, count_file_path)
        log("Count file saved to `%s`" % count_file_path)
Example #4
    def generate_test_bench(self, count_file_path, **kwargs):
        n_samples = kwargs['n_samples']
        read_ratio = kwargs['read_ratio']
        replace = kwargs['replace']
        preserve_columns = kwargs['preserve_columns']

        count_file_path = os.path.abspath(count_file_path)
        data = self._load_data(n_samples)

        # Compute cumulative read counts over all matrix entries
        data_values = data.astype(int).values
        n_all_reads = np.sum(data_values)
        data_cumsum = np.reshape(np.cumsum(data_values), data_values.shape)

        # Sample from original dataset
        new_reads = np.sort(
            np.random.choice(n_all_reads,
                             int(read_ratio * n_all_reads),
                             replace=replace))

        low_quality_data = np.zeros_like(data_values)
        read_index = 0
        for x in range(data_values.shape[0]):
            for y in range(data_values.shape[1]):
                while (read_index < len(new_reads)
                       and new_reads[read_index] < data_cumsum[x, y]):
                    low_quality_data[x, y] += 1
                    read_index += 1

        # Convert to data frame
        low_quality_data = pd.DataFrame(low_quality_data,
                                        index=data.index,
                                        columns=data.columns)

        # Shuffle columns
        low_quality_data, original_columns, column_permutation = \
            shuffle_and_rename_columns(low_quality_data, disabled=preserve_columns)

        # Remove zero rows
        is_nonzero = np.sum(low_quality_data, axis=1) > 0
        data = data[is_nonzero].copy()
        low_quality_data = low_quality_data[is_nonzero].copy()

        # Save hidden data
        make_sure_dir_exists(settings.STORAGE_DIR)
        hidden_data_file_path = os.path.join(settings.STORAGE_DIR,
                                             "%s.hidden.pkl.gz" % self.uid)
        dump_gzip_pickle([
            data.to_sparse(), read_ratio, original_columns, column_permutation
        ], hidden_data_file_path)
        log("Benchmark hidden data saved to `%s`" % hidden_data_file_path)

        make_sure_dir_exists(os.path.dirname(count_file_path))
        write_csv(low_quality_data, count_file_path)
        log("Count file saved to `%s`" % count_file_path)
Example #5
    def generate_test_bench(self, count_file_path, **kwargs):
        count_file_path = os.path.abspath(count_file_path)
        rm_ercc = kwargs['rm_ercc']
        rm_mt = kwargs['rm_mt']
        rm_lq = kwargs['rm_lq']
        preserve_columns = kwargs['preserve_columns']

        # Load dataset
        data = self._load_and_combine_data()

        # Remove some rows and columns
        if rm_ercc:
            remove_list = [
                symbol for symbol in data.index.values
                if symbol.startswith("ERCC-")
            ]
            data = data.drop(remove_list)
        if rm_mt:
            remove_list = [
                symbol for symbol in data.index.values
                if symbol.startswith("mt-")
            ]
            data = data.drop(remove_list)
        if rm_lq:
            remove_list = data.columns.values[data.sum(axis=0) < 1e6]
            data = data.drop(columns=remove_list)
        # Remove empty rows
        remove_list = data.index.values[data.sum(axis=1) == 0]
        data = data.drop(remove_list)

        # Shuffle columns
        new_data, original_columns, column_permutation = shuffle_and_rename_columns(
            data, disabled=preserve_columns)

        # Save hidden data
        make_sure_dir_exists(settings.STORAGE_DIR)
        hidden_data_file_path = os.path.join(settings.STORAGE_DIR,
                                             "%s.hidden.pkl.gz" % self.uid)
        dump_gzip_pickle(
            [data.to_sparse(), original_columns, column_permutation],
            hidden_data_file_path)
        log("Benchmark hidden data saved to `%s`" % hidden_data_file_path)

        make_sure_dir_exists(os.path.dirname(count_file_path))
        write_csv(new_data, count_file_path)
        log("Count file saved to `%s`" % count_file_path)

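A hypothetical invocation of this variant, with illustrative argument values (bench stands in for an instance of the class):

bench.generate_test_bench("out/counts.csv",
                          rm_ercc=True,            # drop ERCC spike-in rows
                          rm_mt=True,              # drop mitochondrial "mt-" rows
                          rm_lq=True,              # drop cells with < 1e6 total reads
                          preserve_columns=False)  # shuffle and anonymize columns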
Example #6
    def evaluate_result(self, processed_count_file_path, result_dir,
                        visualization, **kwargs):
        normalization = kwargs['normalization']
        transformation = kwargs['transformation']
        clear_cache = kwargs['clear_cache']

        make_sure_dir_exists(os.path.join(result_dir, "files"))
        info = []

        # Load hidden state and data
        count_matrix, classes, original_columns, column_permutation = \
            self._load_hidden_state()

        # Load imputed data
        imputed_data = read_table_file(processed_count_file_path)

        # Restore column names and order
        imputed_data = rearrange_and_rename_columns(imputed_data,
                                                    original_columns,
                                                    column_permutation)

        # Data transformations
        if np.sum(imputed_data.values < 0) > 0:
            log("Observed some negative values!")
            imputed_data[imputed_data < 0] = 0
        imputed_data = transformations[transformation](
            normalizations[normalization](imputed_data))

        # Save class details for future
        write_csv(classes, os.path.join(result_dir, "files", "classes.csv"))

        # Evaluation
        metric_results = dict()

        embedded_data_file_path = os.path.join(result_dir, "files",
                                               "embedded_data.pkl.gz")
        if os.path.exists(embedded_data_file_path) and not clear_cache:
            embedded_data = load_gzip_pickle(embedded_data_file_path)
        else:
            embedded_data = self._get_embeddings(imputed_data)
            dump_gzip_pickle(embedded_data, embedded_data_file_path)

        log("Evaluating ...")
        for class_label in classes.index.values:
            class_names = classes.loc[class_label].values
            for embedding_name in embedded_data:
                emb, emb_2d = embedded_data[embedding_name]

                embedding_slug = embedding_name.replace(" ", "_").lower()

                k_means = KMeans(n_clusters=len(set(class_names)))
                k_means.fit(emb)
                clusters = k_means.predict(emb)

                embedding_df = pd.DataFrame(emb)
                embedding_df["X"] = emb_2d[:, 0]
                embedding_df["Y"] = emb_2d[:, 1]
                embedding_df["class"] = class_names
                embedding_df["k_means_clusters"] = clusters
                write_csv(
                    embedding_df,
                    os.path.join(result_dir, "files",
                                 "%s_%s.csv" % (class_label, embedding_slug)))
                info.append({
                    'filename':
                    "%s_%s.csv" % (class_label, embedding_slug),
                    'description':
                    '%s embedding of cells along %s labels' %
                    (embedding_name, class_label),
                    'plot_description':
                    '%s embedding of cells along %s labels (Classes can be identified '
                    'with their colors and K-means clusters are marked '
                    'with different shapes)' % (embedding_name, class_label),
                })

                metric_results.update({
                    'kmeans_on_%s_%s_adjusted_mutual_info_score' % (embedding_slug, class_label):
                    adjusted_mutual_info_score(class_names,
                                               clusters,
                                               average_method="arithmetic"),
                    'kmeans_on_%s_%s_v_measure_score' % (embedding_slug, class_label):
                    v_measure_score(class_names, clusters),
                    'embedding_%s_%s_calinski_harabaz_score' % (embedding_slug, class_label):
                    calinski_harabaz_score(emb, class_names),
                    'embedding_%s_%s_silhouette_score' % (embedding_slug, class_label):
                    silhouette_score(emb, class_names)
                })

        write_csv(pd.DataFrame(info),
                  os.path.join(result_dir, "files", "info.csv"))

        result_path = os.path.join(result_dir, "result.txt")
        with open(result_path, 'w') as file:
            file.write("## METRICS:\n")
            for metric in sorted(metric_results):
                file.write("%s\t%4f\n" % (metric, metric_results[metric]))

            file.write("##\n## ADDITIONAL INFO:\n")

        log("Evaluation results saved to `%s`" % result_path)

        if visualization != "none":
            self.visualize_result(result_dir, output_type=visualization)

        return metric_results
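rearrange_and_rename_columns undoes the shuffle recorded at generation time. A minimal sketch, assuming column_permutation is the index array produced by the shuffle_and_rename_columns sketch after Example #1:

import numpy as np

def rearrange_and_rename_columns(data_frame, original_columns,
                                 column_permutation):
    # Invert the permutation, restore the original column order, then
    # put the original labels back.
    inverse_permutation = np.argsort(column_permutation)
    restored = data_frame.iloc[:, inverse_permutation].copy()
    restored.columns = original_columns
    return restored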
Example #7
    def evaluate_result(self, processed_count_file, result_dir, visualization,
                        **kwargs):
        normalization = kwargs['normalization']
        transformation = kwargs['transformation']
        clear_cache = kwargs['clear_cache']

        make_sure_dir_exists(os.path.join(result_dir, "files"))
        info = []

        data, imputed_data = self._load_data_and_imputed_data_for_evaluation(
            processed_count_file)
        gold_standard_classes = [
            column_name.split("_")[0] for column_name in data.columns.values
        ]

        # Data transformations
        if np.sum(imputed_data.values < 0) > 0:
            log("Observed some negative values!")
            imputed_data[imputed_data < 0] = 0
        imputed_data = transformations[transformation](
            normalizations[normalization](imputed_data))

        G1_S_related_part_of_imputed_data, G2_M_related_part_of_imputed_data = self._get_related_part(
            imputed_data)

        related_part_of_imputed_data = pd.concat([
            G1_S_related_part_of_imputed_data,
            G2_M_related_part_of_imputed_data
        ])

        write_csv(
            G1_S_related_part_of_imputed_data,
            os.path.join(result_dir, "files",
                         "G1_S_related_part_of_imputed_data.csv"))
        info.append({
            'filename': 'G1_S_related_part_of_imputed_data.csv',
            'description': 'Values of genes related to G1/S',
            'plot_description': 'Heatmap of Genes related to G1/S',
        })

        write_csv(
            G2_M_related_part_of_imputed_data,
            os.path.join(result_dir, "files",
                         "G2_M_related_part_of_imputed_data.csv"))
        info.append({
            'filename': 'G2_M_related_part_of_imputed_data.csv',
            'description': 'Values of genes related to G2/M',
            'plot_description': 'Heatmap of Genes related to G2/M',
        })

        svm_results, knn_results = self._get_classification_results(
            related_part_of_imputed_data, gold_standard_classes)

        embedded_data_file_path = os.path.join(result_dir, "files",
                                               "embedded_data.pkl.gz")
        if os.path.exists(embedded_data_file_path) and not clear_cache:
            embedded_data = load_gzip_pickle(embedded_data_file_path)
        else:
            embedded_data = self._get_embeddings(related_part_of_imputed_data)
            dump_gzip_pickle(embedded_data, embedded_data_file_path)

        metric_results = {
            "classification_svm_mean_accuracy": np.mean(svm_results),
            "classification_knn_mean_accuracy": np.mean(knn_results)
        }

        embedded_data["identity"] = related_part_of_imputed_data.transpose()

        for embedding_name in embedded_data:
            emb = embedded_data[embedding_name]

            k_means = KMeans(n_clusters=3)
            k_means.fit(emb)
            clusters = k_means.predict(emb)

            embedding_slug = embedding_name.replace(" ", "_").lower()

            if embedding_name != "identity":
                embedding_df = pd.DataFrame(
                    {
                        "X": emb[:, 0],
                        "Y": emb[:, 1],
                        "class": gold_standard_classes,
                        "k_means_clusters": clusters
                    },
                    index=data.columns.values)
                write_csv(
                    embedding_df,
                    os.path.join(result_dir, "files",
                                 "%s.csv" % embedding_slug))
                info.append({
                    'filename':
                    "%s.csv" % embedding_slug,
                    'description':
                    '%s embedding of cells considering genes related '
                    'to cell-cycle' % embedding_name,
                    'plot_description':
                    '%s embedding of cells considering genes related '
                    'to cell-cycle (K-means clusters are marked '
                    'with different shapes)' % embedding_name,
                })

            metric_results.update({
                'kmeans_on_%s_adjusted_mutual_info_score' % embedding_slug:
                adjusted_mutual_info_score(gold_standard_classes,
                                           clusters,
                                           average_method="arithmetic"),
                'kmeans_on_%s_v_measure_score' % embedding_slug:
                v_measure_score(gold_standard_classes, clusters),
                'embedding_%s_calinski_harabaz_score' % embedding_slug:
                calinski_harabaz_score(emb, gold_standard_classes),
                'embedding_%s_silhouette_score' % embedding_slug:
                silhouette_score(emb, gold_standard_classes)
            })

        write_csv(pd.DataFrame(info),
                  os.path.join(result_dir, "files", "info.csv"))

        result_path = os.path.join(result_dir, "result.txt")
        with open(result_path, 'w') as file:
            file.write("## METRICS:\n")
            for metric in sorted(metric_results):
                file.write("%s\t%4f\n" % (metric, metric_results[metric]))

            file.write("##\n## ADDITIONAL INFO:\n")
            file.write("## SVM classifiers accuracies: %s\n" %
                       str(svm_results))
            file.write("## KNN classifiers accuracies: %s\n" %
                       str(knn_results))

        log("Evaluation results saved to `%s`" % result_path)

        if visualization != "none":
            self.visualize_result(result_dir, output_type=visualization)

        return metric_results
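Both evaluation methods resolve the normalization and transformation kwargs through lookup tables of callables. A minimal sketch of that pattern (the available keys and the CPM formula here are assumptions, not the source's definitions):

import numpy as np

normalizations = {
    "none": lambda data_frame: data_frame,
    # Counts per million: scale every cell (column) to one million reads.
    "cpm": lambda data_frame: data_frame * 1e6 / data_frame.sum(axis=0),
}
transformations = {
    "none": lambda data_frame: data_frame,
    "log": lambda data_frame: np.log1p(data_frame),
}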