Example #1
    def perform_data_preprocessing(self):
        #
        # Perform the preprocessing steps
        #
        pp_start_time = time.time()

        # perform log2 transformation over the input data
        if self.perform_log2:
            self.input_data = np.log2(self.input_data,
                                      where=(self.input_data != 0.0))
            logging_print("Log 2 transformation is performed on the input data")

        # perform center scaling over the input data
        if self.pre_processing_center_scale:
            scaler = StandardScaler()
            if self.over_samples:
                scaled_np_data = scaler.fit_transform(self.input_data)
            else:
                # Transpose before scaling and transpose back afterwards,
                # so the scaling is applied over the other axis
                scaled_np_data = scaler.fit_transform(self.input_data.T).T

            # fit_transform returns a NumPy array; wrap it in a new pandas DataFrame
            self.input_data = pd.DataFrame(scaled_np_data,
                                           index=self.input_data.index,
                                           columns=self.input_data.columns)
            logging_print("Centering and scaling is performed on the input data")

        timer_print(pp_start_time,
                    prefix="Data pre-processing ready",
                    time_overview_log=self.process_time_overview)
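
A caveat worth knowing about the log2 call above: NumPy's `where=` argument without an `out=` buffer leaves the masked entries uninitialized. A minimal standalone sketch of the safer pattern, assuming a plain NumPy array rather than the project's DataFrame:

import numpy as np

values = np.array([[4.0, 0.0], [8.0, 2.0]])
# Supply an output buffer so entries skipped by the mask keep their
# original value (here: zeros stay 0.0 instead of being undefined)
result = np.log2(values, out=values.copy(), where=(values != 0.0))
print(result)  # [[2. 0.] [3. 1.]]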
Example #2
    def write_output_files(self):
        wof_start_time = time.time()

        # Create AUC output file
        create_auc_output_file(self.auc_values,
                               self.gene_pathway_count,
                               self.p_values,
                               self.output_dir,
                               self.bonf_p_values,
                               txt_gzip=not self.output_disable_gzip,
                               export_txt=not self.output_disable_txt,
                               export_pkl=not self.output_disable_pickle
                               or self.multi_node_num_nodes is not None)

        # Export gene pathway scores
        if not self.output_disable_txt:
            if self.output_disable_gzip:
                self.pathway_gene_scores.to_csv(os.path.join(
                    self.output_dir, "gene_pathway_scores.txt"),
                                                sep='\t')
            else:
                self.pathway_gene_scores.to_csv(os.path.join(
                    self.output_dir, "gene_pathway_scores.txt.gz"),
                                                sep='\t')
        if not self.output_disable_pickle or \
                self.multi_node_num_nodes is not None:
            self.pathway_gene_scores.to_pickle(
                os.path.join(self.output_dir, "gene_pathway_scores.pkl"))

        timer_print(wof_start_time, prefix="Writing output files ready")
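
The two to_csv branches above differ only in the file name: pandas infers gzip compression from a `.gz` suffix, since compression defaults to 'infer'. A minimal sketch:

import pandas as pd

df = pd.DataFrame({"score": [0.1, 0.2]}, index=["GENE_A", "GENE_B"])
df.to_csv("gene_pathway_scores.txt.gz", sep='\t')  # written gzip-compressed
df.to_csv("gene_pathway_scores.txt", sep='\t')     # written as plain text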
Example #3
    def calculate_centriods(self):
        calc_cent_start_time = time.time()
        logging_print("Start calculation of the centriods")
        initial_df = self.overall_components[0]
        gene_names = initial_df.columns
        n_components = initial_df.shape[0]
        initial_np = initial_df.to_numpy()
        signs = np.sign(initial_np)
        initial_np = np.abs(initial_np)
        processed_dfs = [initial_np * signs]

        for ica_df in self.overall_components[1:]:
            ica_df = ica_df.loc[:, gene_names]
            ica_df = ica_df.iloc[:n_components, :]
            corr_table = np.abs(
                np.corrcoef(initial_np, np.abs(ica_df.to_numpy())))
            corr_table = corr_table[n_components:, :n_components]
            max_indexes_columns = np.argmax(corr_table, axis=0)

            ica_df = ica_df.iloc[max_indexes_columns, :]
            processed_dfs.append(ica_df.to_numpy())

        centroids_array = np.abs(np.asarray(processed_dfs)) * signs

        centroids = centroids_array.mean(axis=0)
        centroid_df = pd.DataFrame(centroids,
                                   index=initial_df.index,
                                   columns=initial_df.columns)

        timer_print(calc_cent_start_time,
                    prefix="Centroid calculation is ready")
        return centroid_df
Example #4
    def calculate_wilcox_p_value(self):
        cwpv_st_time = time.time()
        wal_p_values = {}
        for pathway_id in self.auc_values.index:
            if pathway_id in self.matrix_data.columns:
                selected_pathway = self.pathway_gene_scores.loc[
                    self.matrix_data.index, pathway_id]
                include_in_pathway = selected_pathway[
                    self.matrix_data.loc[:, pathway_id]]
                include_not_in_pathway = selected_pathway[~self.matrix_data.
                                                          loc[:, pathway_id]]
                try:
                    _, p_value = scipy.stats.mannwhitneyu(
                        include_in_pathway,
                        include_not_in_pathway,
                        use_continuity=True,
                        alternative="two-sided")
                    wal_p_values[pathway_id] = p_value
                except ValueError:
                    logging_print(
                        "Value error in p value calculation, pvalue 0.0 is set to pathway: {}"
                        .format(pathway_id))
                    wal_p_values[pathway_id] = 0.0

        self.p_values = pd.Series(wal_p_values)
        timer_print(cwpv_st_time, prefix="Wilcoxon p-value calculation ready")
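
For reference, the test above is SciPy's Mann-Whitney U, the rank-sum counterpart of the Wilcoxon test the method name refers to; older SciPy versions raise ValueError when all scores are identical, which the except clause absorbs. A toy standalone call with made-up scores:

import scipy.stats

in_pathway = [2.3, 1.9, 2.8, 2.1]       # scores of genes in the pathway
not_in_pathway = [0.4, -0.2, 0.9, 0.1]  # scores of the remaining genes
_, p_value = scipy.stats.mannwhitneyu(in_pathway, not_in_pathway,
                                      use_continuity=True,
                                      alternative="two-sided")
print(p_value)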
Example #5
    def read_file(self):
        #
        # Method to read all the input files
        #

        # save the start time
        rf_start_time = time.time()

        # Read the input file if present
        if os.path.isfile(self.input_file_path):
            if self.test_run:
                # Test run: read only the first 150 rows and keep only
                # the first 100 columns
                self.input_data = pd.read_csv(self.input_file_path,
                                              sep="\t", index_col=0,
                                              nrows=150)
                self.input_data = self.input_data.iloc[:150, :100]
            else:
                # Check if only a part of the rows must be loaded instead
                # of the complete dataset
                n_rows = None
                if self.n_rows is not None and self.n_rows != '':
                    n_rows = int(self.n_rows)

                # Read the input file. This can be a (cached) pandas pickle
                # file or a tab-separated text file, which may be compressed.
                # If force is set to True, the method will not read cached
                # pickle files created from the original txt matrix even if
                # these are present. By default the cached version (with the
                # suffix _cashed.pickle) is loaded if it is present in the
                # same directory as the input matrix.

                self.input_data = read_pd_df(
                    self.input_file_path,
                    {
                        "sep": "\t",
                        "index_col": 0,
                        "nrows": n_rows
                    },
                    force=self.force)
        else:
            raise FileNotFoundError("Cannot find input file: {}".format(
                self.input_file_path
            ))

        # if the number of components is not set, we set it to the
        # smallest dimension of the input matrix
        if self.n_components is None:
            self.n_components = np.min(self.input_data.shape)

        # log some basic info
        logging_print(stats_dict_to_string({
            "Input dataframe n_row": self.input_data.shape[0],
            "Input dataframe n_col": self.input_data.shape[1],
            "first column headers": self.input_data.columns.values[:5],
            "first row index": self.input_data.index.values[:5]
        }))
        timer_print(rf_start_time,
                    prefix="Reading input file ready",
                    time_overview_log=self.process_time_overview)
Example #6
    def perform_data_whitening(self, data):
        pdw_start_time = time.time()
        logging_print("FastICA, start data whitening")
        whitened_svd_object = SVD_wrapper(svd_type=self.svd_type,
                                          n_components=self.n_components,
                                          white_data=True)
        self.whiten_data = whitened_svd_object.fit_transform(data).to_numpy()
        self.whiten_components = whitened_svd_object.components.to_numpy()
        timer_print(pdw_start_time, prefix="FastICA, data whitening ready")
Example #7
    def perform_filtering(self):
        if self.minimal_number_of_genes > 0:
            pf_start_time = time.time()
            matrix_selection = self.matrix_data.sum(
                axis=0) >= self.minimal_number_of_genes
            self.matrix_data = self.matrix_data.loc[:, matrix_selection]
            logging_print(
                "Minimal gene filtering: {} pathways left over".format(
                    self.matrix_data.shape[1]))
            timer_print(pf_start_time,
                        prefix="Minimal gene in pathway filtering ready")
Example #8
    def perform_gene_intersection(self):
        pgi_start_time = time.time()
        intersect_genes = self.components_data.index.intersection(
            self.matrix_data.index)
        self.components_data = self.components_data.loc[intersect_genes, :]
        self.matrix_data = self.matrix_data.loc[intersect_genes, :]
        if self.has_background_genes():
            self.background_genes_data = pd.Series(
                list(
                    set(self.background_genes_data).intersection(
                        set(intersect_genes))))

        timer_print(pgi_start_time, prefix="Gene intersection ready")
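
A minimal sketch of the alignment step above, with toy frames: Index.intersection keeps only the genes present in both, and .loc then selects and orders both frames identically.

import pandas as pd

components = pd.DataFrame({"IC_1": [0.1, 0.5, -0.3]},
                          index=["G1", "G2", "G3"])
matrix = pd.DataFrame({"PW1": [True, False]}, index=["G2", "G3"])
shared = components.index.intersection(matrix.index)  # Index(['G2', 'G3'])
components, matrix = components.loc[shared, :], matrix.loc[shared, :]
print(components.index.equals(matrix.index))  # True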
Example #9
    def perform_data_whitening(self, data):
        # Method to perform the manual data whitening step, which is the
        # first step in the FastICA analysis
        pdw_start_time = time.time()
        logging_print("FastICA, start data whitening")
        # Use the SVD wrapper to perform the whitening step based on a PCA
        whitened_svd_object = SVD_wrapper(svd_type=self.svd_type,
                                          n_components=self.n_components,
                                          white_data=True)
        # Save the results
        self.whiten_data = whitened_svd_object.fit_transform(data).to_numpy()
        self.whiten_components = whitened_svd_object.components.to_numpy()
        timer_print(pdw_start_time, prefix="FastICA, data whitening ready")
Example #10
    def perform_analysis(self):
        #
        # Method to run the analysis using the analysis object created
        # earlier
        #
        pa_start_time = time.time()

        # All analysis classes implement a fit method that uses
        # the given input matrix to perform the decomposition
        self.analysis_object.fit(self.get_analysis_input_df())

        # print the used time to the logging file
        timer_print(pa_start_time,
                    prefix="Performing analysis ready",
                    time_overview_log=self.process_time_overview)
Example #11
    def fit(self, data):
        # Method to fit the FastICA model

        # Check if the whitening step is already done
        if self.whiten_components is None:
            # Perform the whitening step
            self.perform_data_whitening(data)
        fit_start_time = time.time()

        # Create the FastICA object from sklearn without performing
        # the whitening step
        fastICA_object = FastICA(algorithm="parallel",
                                 whiten=False,
                                 fun='logcosh',
                                 max_iter=self.max_iter,
                                 tol=1e-10)
        # Fit the model
        fastICA_object.fit(self.whiten_data[:, :self.n_components])

        # Calculate the independent components and the sources and save
        # the results
        indep_comp = np.dot(fastICA_object.components_,
                            self.whiten_components[:self.n_components, :])
        indep_sources = np.dot(indep_comp, data.to_numpy().T).T

        components_index = pd.RangeIndex(start=1,
                                         stop=self.n_components + 1,
                                         name="IC")

        indep_sources_df = pd.DataFrame(
            indep_sources, index=data.index,
            columns=components_index).add_prefix("IC_")

        indep_comp_df = pd.DataFrame(
            indep_comp, index=components_index,
            columns=data.columns).T.add_prefix("IC_").T

        self.projected_data = indep_sources_df
        self.components = indep_comp_df

        logging_print(
            stats_dict_to_string(
                {"Number of used iterations": fastICA_object.n_iter_}))
        timer_print(fit_start_time,
                    prefix="FastICA component optimisation is ready")
Example #12
    def create_permutations_matrixes(self):
        perm_start_time = time.time()
        permutation_path = "{}_permutation_matrix.pkl".format(
            self.components_data_path)

        if os.path.isfile(permutation_path) and not self.force:
            self.components_data_permutated = pd.read_pickle(permutation_path)
            logging_print("Permutation dataframe is loaded from '{}', "
                          "shape: {}".format(
                              permutation_path,
                              self.components_data_permutated.shape))
        else:
            self.components_data_permutated = self.components_data.set_index(
                np.random.permutation(self.components_data.index))
            self.components_data_permutated.to_pickle(permutation_path)
            logging_print("Permutation table is created, shape: {}".format(
                self.components_data_permutated.shape))
        timer_print(perm_start_time, prefix="Permutation is ready")
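
A minimal sketch of the permutation step above: set_index with a shuffled copy of the index reassigns the gene labels while the component values stay in place, which is what makes the result usable as a null model.

import numpy as np
import pandas as pd

components = pd.DataFrame({"IC_1": [0.1, 0.5, -0.3]},
                          index=["G1", "G2", "G3"])
permutated = components.set_index(np.random.permutation(components.index))
print(permutated)  # same values, shuffled gene labels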
Example #13
    def calculate_centriods(self):
        # Method to calculate the centroids, i.e. the average components over
        # the components extracted from the different individual runs
        calc_cent_start_time = time.time()
        logging_print("Start calculation of the centriods")
        # start position, extracted from the first run
        initial_df = self.overall_components[0]
        gene_names = initial_df.columns
        n_components = initial_df.shape[0]
        initial_np = initial_df.to_numpy()
        signs = np.sign(initial_np)
        initial_np = np.abs(initial_np)
        processed_dfs = [initial_np * signs]
        # loop through all the other component runs
        for ica_df in self.overall_components[1:]:
            # Sort the components based on the highest correlation between
            # these components and the components from the first run. This
            # gives the same ordering of the FastICA components across all
            # runs; by default, FastICA returns its components in a random
            # order.
            ica_df = ica_df.loc[:, gene_names]
            ica_df = ica_df.iloc[:n_components, :]
            # calculate the correlation
            corr_table = np.abs(
                np.corrcoef(initial_np, np.abs(ica_df.to_numpy())))
            corr_table = corr_table[n_components:, :n_components]

            # find the order based on the maximal correlation
            max_indexes_columns = np.argmax(corr_table, axis=0)
            # Order the components
            ica_df = ica_df.iloc[max_indexes_columns, :]
            # Save the components in the right order
            processed_dfs.append(ica_df.to_numpy())

        # Calculate the centroids, i.e. the average components; reapply the
        # signs of the first run to the absolute component values
        centroids_array = np.abs(np.asarray(processed_dfs)) * signs
        centroids = centroids_array.mean(axis=0)
        centroid_df = pd.DataFrame(centroids,
                                   index=initial_df.index,
                                   columns=initial_df.columns)

        timer_print(calc_cent_start_time,
                    prefix="Centroid calculation is ready")
        # return the average components
        return centroid_df
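
A toy standalone version of the matching step above: two runs return the same two components in a different order (one with flipped sign), and the absolute correlation table recovers the pairing.

import numpy as np

run_a = np.array([[1.0, 0.0, 1.0],    # component 0
                  [0.0, 1.0, -1.0]])  # component 1
run_b = np.array([[0.1, 1.1, -0.9],   # ~ component 1
                  [0.9, 0.1, 1.2]])   # ~ component 0

corr = np.abs(np.corrcoef(run_a, run_b))       # (4, 4): both runs stacked
corr = corr[run_a.shape[0]:, :run_a.shape[0]]  # run_b rows vs run_a rows
order = np.argmax(corr, axis=0)  # best run_b row for each run_a component
print(order)  # [1 0]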
Example #14
    def read_file(self):
        rf_start_time = time.time()

        if os.path.isfile(self.input_file_path):
            if self.test_run:
                self.input_data = pd.read_csv(self.input_file_path,
                                              sep="\t",
                                              index_col=0,
                                              nrows=150)
                self.input_data = self.input_data.iloc[:150, :100]
            else:
                n_rows = None
                if self.n_rows is not None and self.n_rows != '':
                    n_rows = int(self.n_rows)
                self.input_data = read_pd_df(
                    self.input_file_path,
                    {
                        "sep": "\t",
                        "index_col": 0,
                        "nrows": n_rows
                    },
                    force=self.force)
        else:
            raise FileNotFoundError("Cannot find input file: {}".format(
                self.input_file_path))

        if self.n_components is None:
            self.n_components = np.min(self.input_data.shape)

        logging_print(stats_dict_to_string({
            "Input dataframe n_row": self.input_data.shape[0],
            "Input dataframe n_col": self.input_data.shape[1],
            "first column headers": self.input_data.columns.values[:5],
            "first row index": self.input_data.index.values[:5]
        }))
        timer_print(rf_start_time,
                    prefix="Reading input file ready",
                    time_overview_log=self.process_time_overview)
Example #15
    def calculate_auc_values(self):
        cav_st_stime = time.time()
        auc_values = {}
        for pathway_id in self.pathway_gene_scores.columns:
            if pathway_id in self.matrix_data.columns:
                pathway_gene_set = self.matrix_data.loc[:, pathway_id]
                pathway_gene_scores = self.pathway_gene_scores.loc[
                    self.matrix_data.index, pathway_id]
                try:
                    log_reg_fpr, log_reg_tpr, _ = roc_curve(
                        pathway_gene_set * 1, pathway_gene_scores)
                    auc_values[pathway_id] = auc(log_reg_fpr, log_reg_tpr)
                except ValueError:
                    logging_print(
                        "Value error in auc calculation of pathway: {}".format(
                            pathway_id))
            else:
                logging_print("HPO term: {} not in matrix".format(pathway_id))

        auc_calc_values = pd.Series(auc_values)
        auc_calc_values[auc_calc_values > 1.0] = 1.0
        self.auc_values = auc_calc_values
        timer_print(cav_st_stime, prefix="AUC value calculation ready")
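
A minimal standalone sketch of the AUC computation above: pathway membership acts as the binary label and the gene scores as the predictor.

from sklearn.metrics import roc_curve, auc

membership = [1, 1, 0, 0, 0]         # gene annotated to the pathway?
scores = [2.1, 1.4, 0.8, 0.3, -0.2]  # pathway gene scores
fpr, tpr, _ = roc_curve(membership, scores)
print(auc(fpr, tpr))  # 1.0: members all score above non-members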
Example #16
    def fit(self, data):
        if self.whiten_components is None:
            self.perform_data_whitening(data)
        fit_start_time = time.time()

        fastICA_object = FastICA(algorithm="parallel",
                                 whiten=False,
                                 fun='logcosh',
                                 max_iter=self.max_iter,
                                 tol=1e-10)

        fastICA_object.fit(self.whiten_data[:, :self.n_components])

        indep_comp = np.dot(fastICA_object.components_,
                            self.whiten_components[:self.n_components, :])
        indep_sources = np.dot(indep_comp, data.to_numpy().T).T

        components_index = pd.RangeIndex(start=1,
                                         stop=self.n_components + 1,
                                         name="IC")

        indep_sources_df = pd.DataFrame(
            indep_sources, index=data.index,
            columns=components_index).add_prefix("IC_")

        indep_comp_df = pd.DataFrame(
            indep_comp, index=components_index,
            columns=data.columns).T.add_prefix("IC_").T

        self.projected_data = indep_sources_df
        self.components = indep_comp_df

        logging_print(
            stats_dict_to_string(
                {"Number of used iterations": fastICA_object.n_iter_}))
        timer_print(fit_start_time,
                    prefix="FastICA component optimisation is ready")
Example #17
    def perform_data_preprocessing(self):
        pp_start_time = time.time()
        if self.perform_log2:
            self.input_data = np.log2(self.input_data,
                                      where=(self.input_data != 0.0))
            logging_print(
                "Log 2 transformation is performed on the input data")

        if self.pre_processing_center_scale:
            scaler = StandardScaler()
            if self.over_samples:
                scaled_np_data = scaler.fit_transform(self.input_data)
            else:
                scaled_np_data = scaler.fit_transform(self.input_data.T).T

            self.input_data = pd.DataFrame(scaled_np_data,
                                           index=self.input_data.index,
                                           columns=self.input_data.columns)
            logging_print(
                "Centering and scaling is performed on the input data")

        timer_print(pp_start_time,
                    prefix="Data pre-processing ready",
                    time_overview_log=self.process_time_overview)
Example #18
    def read_files(self):
        rf_start_time = time.time()

        # read input component file
        if os.path.isfile(self.components_data_path):
            self.components_data = read_pd_df(
                self.components_data_path,
                {
                    "sep": "\t",
                    "index_col": 0
                },
                force=self.force)
            components_data_info = (
                "Components dataframe n_row: {}, n_col: {}\n"
                "first column headers: {}\n"
                "first row index: {}".format(
                    *self.components_data.shape,
                    ", ".join(self.components_data.columns.values[:5]),
                    ", ".join(self.components_data.index.values[:5])))
            logging_print(components_data_info)
        else:
            raise FileNotFoundError("Cannot find input file: {}".format(
                self.components_data_path))

        # read matrix file
        self.matrix_data = read_pd_df(
            self.matrix_path,
            {
                "sep": "\t",
                "index_col": 0
            },
            force=self.force,
            proc_df_before_save=lambda df: df == 1.0)

        matrix_data_info = (
            "Matrix dataframe n_row: {}, n_col: {}\n"
            "first column headers: {}\n"
            "first row index: {}".format(
                *self.matrix_data.shape,
                ", ".join(self.matrix_data.columns.values[:5]),
                ", ".join(self.matrix_data.index.values[:5])))
        logging_print(matrix_data_info)

        # reading background gene path
        if self.background_genes_path is not None and self.background_genes_path != '':
            self.background_genes_data = read_pd_df(self.background_genes_path,
                                                    {
                                                        "sep": "\t",
                                                        "header": None
                                                    },
                                                    force=self.force)

            # convert to a pandas Series instead of a DataFrame
            self.background_genes_data = self.background_genes_data.iloc[:, 0]

            background_genes_data_info = (
                "Background genes file loaded:\n"
                "number of genes: {}, first genes: {}\n".format(
                    self.background_genes_data.shape[0],
                    ", ".join(self.background_genes_data.values[:5])))
            logging_print(background_genes_data_info)

        timer_print(rf_start_time, prefix="Reading input file ready")
Example #19
    def write_output_files(self, file_prefix=None):
        wof_start_time = time.time()
        if self.over_samples is False:
            eigenvectors_df = self.analysis_object.components.T
            pc_scores_df = self.analysis_object.projected_data
        else:
            eigenvectors_df = self.analysis_object.projected_data
            pc_scores_df = self.analysis_object.components.T
        eigenvectors_df.index.name = datetime.now().strftime('%d/%m/%Y')
        pc_scores_df.index.name = datetime.now().strftime('%d/%m/%Y')

        eigenvectors_file_name = "eigenvectors"
        pc_scores_file_name = "pc-scores"
        if file_prefix is not None:
            eigenvectors_file_name = "{}_{}".format(file_prefix,
                                                    eigenvectors_file_name)
            pc_scores_file_name = "{}_{}".format(file_prefix,
                                                 pc_scores_file_name)

        # export eigenvectors (components)
        if not self.output_disable_txt:
            if self.output_disable_gzip:
                eigenvectors_df.to_csv(os.path.join(
                    self.output_dir, eigenvectors_file_name + ".txt"),
                                       sep='\t')
            else:
                # use the .gz suffix so pandas' compression inference
                # actually gzips the file
                eigenvectors_df.to_csv(os.path.join(
                    self.output_dir, eigenvectors_file_name + ".txt.gz"),
                                       sep='\t')
        if not self.output_disable_pickle:
            eigenvectors_df.to_pickle(
                os.path.join(self.output_dir, eigenvectors_file_name + ".pkl"))

        # export PC scores
        if not self.output_disable_txt:
            if self.output_disable_gzip:
                pc_scores_df.to_csv(os.path.join(self.output_dir,
                                                 pc_scores_file_name + ".txt"),
                                    sep='\t')
            else:
                pc_scores_df.to_csv(os.path.join(
                    self.output_dir, pc_scores_file_name + ".txt.gz"),
                                    sep='\t')

        if not self.output_disable_pickle:
            pc_scores_df.to_pickle(
                os.path.join(self.output_dir, pc_scores_file_name + ".pkl"))

        timer_print(wof_start_time,
                    prefix="Writing output files ready",
                    time_overview_log=self.process_time_overview)

        if self.fastica_stable_safe_intermediates and self.analysis_type == decomposition_types[
                "FASTICA_STABLE"]:
            individual_run_dir = os.path.join(
                self.output_dir, "FastICA_individual_component_runs")
            create_output_dir_if_not_exists(individual_run_dir)
            for index, run_df in enumerate(
                    self.analysis_object.overall_components):
                ind_df_path = os.path.join(
                    individual_run_dir,
                    "fastica_components_run_{}.pkl".format(index + 1))
                run_df.to_pickle(ind_df_path)
Example #20
    def perform_analysis(self):
        pa_start_time = time.time()
        self.analysis_object.fit(self.get_analysis_input_df())
        timer_print(pa_start_time,
                    prefix="Performing analysis ready",
                    time_overview_log=self.process_time_overview)
Example #21
    def write_last_loginfo(self):
        timer_print(self.start_time,
                    prefix="## ANALYSIS READY",
                    time_overview_log=self.process_time_overview)
Example #22
    def perform_analysis(self):
        pa_start_time = time.time()
        total_z_score_results = []

        # check if some temp files were already present
        temp_z_score_paths = glob.glob(
            os.path.join(self.output_dir,
                         "temp_results_analysis_z_scores_*.pkl"))
        temp_z_scores = None
        temp_already_processed_pathways = None
        if len(temp_z_score_paths) > 0:
            temp_z_scores = pd.read_pickle(temp_z_score_paths[0])
            logging_print(
                "Temp file '{}' with already processed pathways loaded. size df: {}"
                .format(temp_z_score_paths[0], temp_z_scores.shape))
            temp_already_processed_pathways = list(
                temp_z_scores.columns.to_numpy())

        pathway_manager = mp.Manager()
        pathway_queue = pathway_manager.Queue()

        results_manager = mp.Manager()
        results_queue = results_manager.Queue()

        for index, row in self.matrix_data.items():
            if temp_z_scores is not None:
                if index in temp_already_processed_pathways:
                    total_z_score_results.append(temp_z_scores.loc[:, index])
                else:
                    pathway_queue.put(index)
            else:
                pathway_queue.put(index)

        logging_print("total pathways already done: {}".format(
            len(total_z_score_results)))

        n_workers = self.n_cores - 1
        processes = []
        for _ in range(n_workers):
            processes.append(
                mp.Process(target=single_pathway_worker,
                           args=(self.components_data, self.matrix_data,
                                 pathway_queue, results_queue,
                                 self.background_genes_data,
                                 self.analysis_type, -1,
                                 self.components_data_permutated)))

        for process in processes:
            process.start()

        total_done = 0
        last_save_time = time.time()
        save_time_in_minutes = 2
        z_score_file_path = None
        while True:
            try:
                if total_done >= n_workers:
                    # All workers are ready
                    break
                if results_queue.empty():
                    # Wait for the next results
                    time.sleep(5)
                else:
                    # Process the results
                    results = results_queue.get(True, timeout=1)
                    if isinstance(results, str) and results == "DONE":
                        # One worker is done
                        total_done += 1
                        logging_print("total workers done {} of {}".format(
                            total_done, n_workers))
                    else:
                        # Save the results
                        total_z_score_results.append(results)

                    # save temp results
                    if last_save_time + 60 * save_time_in_minutes < time.time():
                        logging_print("save temp results: {}".format(
                            datetime.now()))
                        last_save_time = time.time()
                        temp_dataframe_z_scores = pd.DataFrame(
                            total_z_score_results).T
                        new_z_score_file_path = os.path.join(
                            self.output_dir,
                            "temp_results_analysis_z_scores_{}.pkl".format(
                                datetime.now()))
                        temp_dataframe_z_scores.to_pickle(
                            new_z_score_file_path)

                        if z_score_file_path is not None and os.path.isfile(
                                z_score_file_path):
                            os.remove(z_score_file_path)
                        z_score_file_path = new_z_score_file_path

            except queue.Empty:
                time.sleep(1)
                continue

        for process in processes:
            process.join()

        print("all processes ready")
        self.pathway_gene_scores = pd.DataFrame(total_z_score_results).T

        pathway_gene_scores_temp_file_path = os.path.join(
            self.output_dir,
            "temp_pathway_gene_scores_temp_{}.pkl".format(datetime.now()))
        self.pathway_gene_scores.to_pickle(pathway_gene_scores_temp_file_path)
        timer_print(pa_start_time, prefix="Performing analysis ready")
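
The method above combines a task queue, a results queue, per-worker "DONE" sentinels, and periodic temp-file checkpoints. A minimal sketch of just the queue/worker skeleton, with a trivial stand-in for single_pathway_worker:

import multiprocessing as mp

def worker(task_queue, result_queue):
    # Stand-in worker: consume tasks until the sentinel, then signal
    # completion with a "DONE" token
    while True:
        task = task_queue.get()
        if task is None:
            result_queue.put("DONE")
            break
        result_queue.put(task * 2)

if __name__ == "__main__":
    manager = mp.Manager()
    tasks, results = manager.Queue(), manager.Queue()
    n_workers = 2
    for item in [1, 2, 3]:
        tasks.put(item)
    for _ in range(n_workers):
        tasks.put(None)  # one sentinel per worker
    processes = [mp.Process(target=worker, args=(tasks, results))
                 for _ in range(n_workers)]
    for process in processes:
        process.start()
    done, values = 0, []
    while done < n_workers:  # collect until every worker reported DONE
        result = results.get()
        if isinstance(result, str) and result == "DONE":
            done += 1
        else:
            values.append(result)
    for process in processes:
        process.join()
    print(sorted(values))  # [2, 4, 6]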
Example #23
    def write_output_files(self, file_prefix=None):
        #
        # Method to create the output files
        #

        wof_start_time = time.time()

        # Create the output eigenvector and pc-scores matrices, which always
        # contain the same kind of information. Normally the eigenvectors
        # file contains the components (genes x components matrix) and the
        # pc-scores file contains the rotated data (samples x components
        # matrix). If the over_samples parameter is set, this is swapped:
        # the eigenvectors file contains the rotated data (genes x
        # components matrix) and the pc-scores file contains the components
        # (samples x components matrix).

        if self.over_samples is False:
            eigenvectors_df = self.analysis_object.components.T
            pc_scores_df = self.analysis_object.projected_data
        else:
            eigenvectors_df = self.analysis_object.projected_data
            pc_scores_df = self.analysis_object.components.T

        # add the date information to the index name (first column of the
        # output files)
        eigenvectors_df.index.name = datetime.now().strftime('%d/%m/%Y')
        pc_scores_df.index.name = datetime.now().strftime('%d/%m/%Y')

        # Create the file names
        eigenvectors_file_name = "eigenvectors"
        pc_scores_file_name = "pc-scores"
        if file_prefix is not None:
            eigenvectors_file_name = "{}_{}".format(file_prefix,
                                                    eigenvectors_file_name)
            pc_scores_file_name = "{}_{}".format(file_prefix,
                                                 pc_scores_file_name)

        # export eigenvectors (components)
        if not self.output_disable_txt:
            if self.output_disable_gzip:
                eigenvectors_df.to_csv(os.path.join(self.output_dir,
                                                    eigenvectors_file_name + ".txt"),
                                       sep='\t')
            else:
                # use the .gz suffix so pandas' compression inference
                # actually gzips the file
                eigenvectors_df.to_csv(os.path.join(self.output_dir,
                                                    eigenvectors_file_name + ".txt.gz"),
                                       sep='\t')
        if not self.output_disable_pickle:
            eigenvectors_df.to_pickle(
                os.path.join(self.output_dir, eigenvectors_file_name + ".pkl"))

        # export PC scores
        if not self.output_disable_txt:
            if self.output_disable_gzip:
                pc_scores_df.to_csv(os.path.join(self.output_dir,
                                                 pc_scores_file_name + ".txt"),
                                    sep='\t')
            else:
                pc_scores_df.to_csv(os.path.join(self.output_dir,
                                                 pc_scores_file_name + ".txt.gz"),
                                    sep='\t')

        if not self.output_disable_pickle:
            pc_scores_df.to_pickle(os.path.join(self.output_dir,
                                                pc_scores_file_name + ".pkl"))

        # write the export time to the log file
        timer_print(wof_start_time,
                    prefix="Writing output files ready",
                    time_overview_log=self.process_time_overview)

        # if the FastICA-stable analysis is used and the option to save the
        # intermediate steps is set, the code below will export these data
        if self.fastica_stable_safe_intermediates and self.analysis_type == decomposition_types["FASTICA_STABLE"]:
            individual_run_dir = os.path.join(self.output_dir,
                                              "FastICA_individual_component_runs")
            create_output_dir_if_not_exists(individual_run_dir)
            for index, run_df in enumerate(self.analysis_object.overall_components):
                ind_df_path = os.path.join(individual_run_dir,
                                           "fastica_components_run_{}.pkl".format(index + 1))
                run_df.to_pickle(ind_df_path)
Example #24
    def calculate_p_value_bonferroni_correction(self, alpha=0.05):
        cpvbc_st_time = time.time()
        self.bonf_p_values = self.p_values * self.p_values.shape[0]
        timer_print(cpvbc_st_time,
                    prefix="Bonferroni p value correction ready")
Example #25
    def merge_output_files(self):
        mof_start_time = time.time()
        if self.multi_node_output_dir:
            node_output_auc_pred_files = glob.glob(
                os.path.join(self.multi_node_output_dir, "*",
                             "predictions_auc.pkl"))
            if len(node_output_auc_pred_files) == self.multi_node_num_nodes:
                logging_print("Merge output files")

                # merge AUC prediction files
                comp_df_auc_list = []
                for node_output_auc_file_path in node_output_auc_pred_files:
                    file_auc_df = pd.read_pickle(node_output_auc_file_path)
                    comp_df_auc_list.append(file_auc_df)

                overall_auc_df = pd.concat(comp_df_auc_list)

                bonf_values = overall_auc_df["pValue"] * overall_auc_df.shape[0]
                bonf_values[bonf_values > 1] = 1
                bonf_values[bonf_values < 0] = 0
                overall_auc_df["bonferroni"] = bonf_values

                if not self.output_disable_txt:
                    output_file_path_csv = os.path.join(
                        self.multi_node_output_dir,
                        "predictions_auc_bonf.txt.gz")
                    if self.output_disable_gzip:
                        output_file_path_csv = os.path.join(
                            self.multi_node_output_dir,
                            "predictions_auc_bonf.txt")
                    overall_auc_df.to_csv(
                        output_file_path_csv,
                        columns=["geneCount", "pValue", "auc", "bonferroni"],
                        sep='\t')

                if not self.output_disable_pickle:
                    overall_auc_df.to_pickle(
                        os.path.join(self.multi_node_output_dir,
                                     "predictions_auc_bonf.pkl"))

                # merge gene pathway files
                node_output_gene_pathway_pred_files = glob.glob(
                    os.path.join(self.multi_node_output_dir, "*",
                                 "gene_pathway_scores.pkl"))
                comp_df_gene_pathway_list = []
                for node_output_auc_file_path in node_output_gene_pathway_pred_files:
                    file_gene_pathway_df = pd.read_pickle(
                        node_output_auc_file_path)
                    comp_df_gene_pathway_list.append(file_gene_pathway_df)

                overall_gene_pathway_df = pd.concat(comp_df_gene_pathway_list,
                                                    axis=1)

                if not self.output_disable_txt:
                    output_gene_pathway_pred_path = os.path.join(
                        self.multi_node_output_dir,
                        "gene_pathway_scores.txt.gz")

                    if self.output_disable_gzip:
                        output_gene_pathway_pred_path = os.path.join(
                            self.multi_node_output_dir,
                            "gene_pathway_scores.txt")
                    overall_gene_pathway_df.to_csv(
                        output_gene_pathway_pred_path, sep='\t')

                if not self.output_disable_pickle:
                    overall_gene_pathway_df.to_pickle(
                        os.path.join(self.multi_node_output_dir,
                                     "gene_pathway_scores.pkl"))
        timer_print(mof_start_time, prefix="Writing merged output file ready")
Example #26
    def write_last_loginfo(self):
        #
        # Add the end information to the logfile
        #
        timer_print(self.start_time, prefix="## ANALYSIS READY",
                    time_overview_log=self.process_time_overview)
Example #27
    def write_last_loginfo(self):
        timer_print(self.start_time, prefix="## ANALYSIS READY")