def perform_data_preprocessing(self):
    """Apply the optional pre-processing steps to ``self.input_data``.

    Depending on ``self.perform_log2`` and
    ``self.pre_processing_center_scale`` the input frame is log2
    transformed and/or centered and scaled. ``self.input_data`` is replaced
    by a new DataFrame carrying the same index and columns.
    """
    pp_start_time = time.time()

    # perform log2 transformation over the input data
    if self.perform_log2:
        raw_values = self.input_data.to_numpy(dtype=float)
        # A ufunc called with ``where=`` but without ``out=`` leaves the
        # masked-out cells uninitialised. Pre-fill the output so that zero
        # entries deterministically stay 0.0 instead of becoming garbage.
        log_values = np.zeros_like(raw_values)
        np.log2(raw_values, out=log_values, where=(raw_values != 0.0))
        self.input_data = pd.DataFrame(log_values,
                                       index=self.input_data.index,
                                       columns=self.input_data.columns)
        logging_print("Log 2 transformation is performed on the input data")

    # perform center scaling over the input data
    if self.pre_processing_center_scale:
        scaler = StandardScaler()
        if self.over_samples:
            scaled_np_data = scaler.fit_transform(self.input_data)
        else:
            # Transpose the dataset before and after scaling, so the
            # scaling is done along the other axis
            scaled_np_data = scaler.fit_transform(self.input_data.T).T

        # The scaler returns a NumPy array. Create a new Pandas DataFrame
        self.input_data = pd.DataFrame(scaled_np_data,
                                       index=self.input_data.index,
                                       columns=self.input_data.columns)
        logging_print("Centering and scaling is performed on the input data")

    timer_print(pp_start_time,
                prefix="Data pre-processing ready",
                time_overview_log=self.process_time_overview)
def write_output_files(self):
    """Write the AUC overview and the gene/pathway score matrix to disk.

    Which formats are written is controlled by ``output_disable_txt``,
    ``output_disable_gzip`` and ``output_disable_pickle``. The pickle export
    is also switched on whenever ``multi_node_num_nodes`` is set.
    """
    started_at = time.time()

    txt_enabled = self.output_disable_txt == False
    gzip_enabled = self.output_disable_gzip == False
    pickle_enabled = (self.output_disable_pickle == False
                      or self.multi_node_num_nodes != None)

    # Create AUC output file
    create_auc_output_file(self.auc_values,
                           self.gene_pathway_count,
                           self.p_values,
                           self.output_dir,
                           self.bonf_p_values,
                           txt_gzip=gzip_enabled,
                           export_txt=txt_enabled,
                           export_pkl=pickle_enabled)

    # Export gene pathway scores
    if txt_enabled:
        if self.output_disable_gzip:
            scores_file_name = "gene_pathway_scores.txt"
        else:
            scores_file_name = "gene_pathway_scores.txt.gz"
        self.pathway_gene_scores.to_csv(
            os.path.join(self.output_dir, scores_file_name), sep='\t')

    if pickle_enabled:
        pickle_path = os.path.join(self.output_dir, "gene_pathway_scores.pkl")
        self.pathway_gene_scores.to_pickle(pickle_path)

    timer_print(started_at, prefix="Writing output files ready")
def calculate_centriods(self):
    """Average the components of all runs in ``self.overall_components``.

    The first run is the reference. For every other run, the row that
    correlates best (absolute correlation of absolute values) with each
    reference row is selected, so all runs share the reference order. The
    absolute values are then given the sign of the reference run and
    averaged over the runs.

    Returns:
        DataFrame with the index and columns of the first run.
    """
    started_at = time.time()
    logging_print("Start calculation of the centriods")

    reference_df = self.overall_components[0]
    n_components = reference_df.shape[0]
    reference_values = reference_df.to_numpy()
    signs = np.sign(reference_values)
    reference_abs = np.abs(reference_values)

    aligned_runs = [reference_abs * signs]
    for run_df in self.overall_components[1:]:
        # same column order and number of rows as the reference run
        run_df = run_df.loc[:, reference_df.columns].iloc[:n_components, :]

        # np.corrcoef stacks both inputs; keep the run-vs-reference part
        full_corr = np.abs(
            np.corrcoef(reference_abs, np.abs(run_df.to_numpy())))
        run_vs_reference = full_corr[n_components:, :n_components]

        # per reference component: the best matching row of this run
        best_rows = np.argmax(run_vs_reference, axis=0)
        aligned_runs.append(run_df.iloc[best_rows, :].to_numpy())

    stacked_runs = np.c_[np.abs(aligned_runs) * signs]
    centriod_df = pd.DataFrame(stacked_runs.mean(axis=0),
                               index=reference_df.index,
                               columns=reference_df.columns)
    timer_print(started_at, prefix="Centroid calculation is ready")
    return centriod_df
def calculate_wilcox_p_value(self):
    """Compute a two-sided Mann-Whitney U p-value for every pathway.

    For each pathway in ``self.auc_values.index`` that is also a column of
    ``self.matrix_data``, the scores of the genes inside the pathway are
    compared with the scores of the genes outside it. The result is stored
    as a Series in ``self.p_values``. When scipy raises ``ValueError`` the
    p-value of that pathway is set to 0.0 and a message is logged.
    """
    cwpv_st_time = time.time()

    # Build the lookup once instead of a new list for every pathway.
    known_pathways = set(self.matrix_data.columns)
    gene_index = self.matrix_data.index

    wal_p_values = {}
    for pathway_id in self.auc_values.index:
        if pathway_id not in known_pathways:
            continue

        selected_pathway = self.pathway_gene_scores.loc[gene_index,
                                                        pathway_id]
        # column of matrix_data is used as a boolean membership mask
        in_pathway_mask = self.matrix_data.loc[:, pathway_id]
        include_in_pathway = selected_pathway[in_pathway_mask]
        include_not_in_pathway = selected_pathway[~in_pathway_mask]
        try:
            _, p_value = scipy.stats.mannwhitneyu(include_in_pathway,
                                                  include_not_in_pathway,
                                                  use_continuity=True,
                                                  alternative="two-sided")
            wal_p_values[pathway_id] = p_value
        except ValueError:
            logging_print(
                "Value error in p value calculation, pvalue 0.0 is set to pathway: {}"
                .format(pathway_id))
            wal_p_values[pathway_id] = 0.0

    self.p_values = pd.Series(wal_p_values)
    timer_print(cwpv_st_time, prefix="Wilcox p value calculation ready")
def read_file(self):
    """Load the input matrix into ``self.input_data``.

    In a test run only a 150 x 100 corner of the tab separated file is
    read. Otherwise the file is loaded through ``read_pd_df``, optionally
    limited to ``self.n_rows`` rows (per the original notes that helper may
    use a cached pickle unless ``self.force`` is set; helper not shown
    here). When ``self.n_components`` is unset it becomes the smallest
    dimension of the loaded matrix.

    Raises:
        FileNotFoundError: if ``self.input_file_path`` is not a file.
    """
    # save the start time
    rf_start_time = time.time()

    if not os.path.isfile(self.input_file_path):
        raise FileNotFoundError("Cannot find input file: {}".format(
            self.input_file_path))

    if self.test_run:
        # test dataset: only the first 150 rows and 100 columns
        test_frame = pd.read_csv(self.input_file_path,
                                 sep="\t",
                                 index_col=0,
                                 nrows=150)
        self.input_data = test_frame.iloc[:150, :100]
    else:
        # only a part of the rows is loaded when n_rows is given
        has_row_limit = self.n_rows != None and self.n_rows != ''
        row_limit = int(self.n_rows) if has_row_limit else None
        read_options = {"sep": "\t", "index_col": 0, "nrows": row_limit}
        self.input_data = read_pd_df(self.input_file_path,
                                     read_options,
                                     force=self.force)

    # default: the smallest direction of the input matrix
    if self.n_components is None:
        self.n_components = np.min(self.input_data.shape)

    # log some basic info
    n_row, n_col = self.input_data.shape[0], self.input_data.shape[1]
    input_summary = {
        "Input dataframe n_row": n_row,
        "Input dataframe n_col": n_col,
        "first column headers": self.input_data.columns.values[:5],
        "first row index": self.input_data.index.values[:5]
    }
    logging_print(stats_dict_to_string(input_summary))

    timer_print(rf_start_time,
                prefix="Reading input file ready",
                time_overview_log=self.process_time_overview)
def perform_data_whitening(self, data):
    """Whiten ``data`` with ``SVD_wrapper`` and keep the results on self.

    Stores the transformed data in ``self.whiten_data`` and the wrapper's
    components in ``self.whiten_components``, both as NumPy arrays.
    """
    started_at = time.time()
    logging_print("FastICA, start data whitening")

    svd_options = {
        "svd_type": self.svd_type,
        "n_components": self.n_components,
        "white_data": True,
    }
    whitening_svd = SVD_wrapper(**svd_options)

    whitened_frame = whitening_svd.fit_transform(data)
    self.whiten_data = whitened_frame.to_numpy()
    self.whiten_components = whitening_svd.components.to_numpy()

    timer_print(started_at, prefix="FastICA, data whitening ready")
def perform_filtering(self):
    """Drop pathways (columns) that contain too few genes.

    Only active when ``self.minimal_number_of_genes`` is larger than zero.
    A column of ``self.matrix_data`` is kept when its column sum is at
    least that threshold.
    """
    if not (self.minimal_number_of_genes > 0):
        return

    started_at = time.time()

    genes_per_pathway = self.matrix_data.sum(axis=0)
    keep_mask = genes_per_pathway >= self.minimal_number_of_genes
    kept_pathways = keep_mask[keep_mask].index
    self.matrix_data = self.matrix_data.loc[:, kept_pathways]

    n_pathways_left = self.matrix_data.shape[1]
    logging_print(
        "Minimal gene filtering: {} number of pathways over".format(
            n_pathways_left))
    timer_print(started_at, prefix="Minimal gene in pathway filtering ready")
def perform_gene_intersection(self):
    """Restrict all gene based data to the genes they have in common.

    ``self.components_data`` and ``self.matrix_data`` are reduced to the
    intersection of their indexes. When background genes are available
    they are reduced to that same gene set as well.
    """
    started_at = time.time()

    shared_genes = self.components_data.index.intersection(
        self.matrix_data.index)
    self.components_data = self.components_data.loc[shared_genes, :]
    self.matrix_data = self.matrix_data.loc[shared_genes, :]

    if self.has_background_genes():
        background_set = set(self.background_genes_data)
        shared_background = background_set.intersection(set(shared_genes))
        self.background_genes_data = pd.Series(list(shared_background))

    timer_print(started_at, prefix="Gene intersection ready")
def perform_data_whitening(self, data):
    """Run the manual whitening step that precedes the FastICA fit.

    The whitening is delegated to ``SVD_wrapper`` (created with
    ``white_data=True``). The transformed data and the wrapper's components
    are saved as NumPy arrays in ``self.whiten_data`` and
    ``self.whiten_components``.
    """
    whitening_started = time.time()
    logging_print("FastICA, start data whitening")

    # whitening through the SVD wrapper
    svd_whitener = SVD_wrapper(svd_type=self.svd_type,
                               n_components=self.n_components,
                               white_data=True)
    transformed = svd_whitener.fit_transform(data)

    # save the results
    self.whiten_data = transformed.to_numpy()
    self.whiten_components = svd_whitener.components.to_numpy()

    timer_print(whitening_started, prefix="FastICA, data whitening ready")
def perform_analysis(self):
    """Fit the earlier created analysis object on the analysis input.

    The input frame comes from ``self.get_analysis_input_df()`` and is
    handed to the ``fit`` method of ``self.analysis_object``.
    """
    started_at = time.time()

    decomposition = self.analysis_object
    analysis_input = self.get_analysis_input_df()
    decomposition.fit(analysis_input)

    # report the used time
    timer_print(started_at,
                prefix="Performing analysis ready",
                time_overview_log=self.process_time_overview)
def fit(self, data):
    """Fit a FastICA model on the whitened version of ``data``.

    The whitening step is run first when ``self.whiten_components`` is
    still ``None``. sklearn's FastICA is then fitted with its own whitening
    disabled. The resulting components are stored in ``self.components``
    and the projection of ``data`` on them in ``self.projected_data``; both
    use ``IC_<n>`` labels.
    """
    # whitening is only done when it has not been done before
    if self.whiten_components is None:
        self.perform_data_whitening(data)

    fit_start_time = time.time()

    ica_model = FastICA(algorithm="parallel",
                        whiten=False,
                        fun='logcosh',
                        max_iter=self.max_iter,
                        tol=1e-10)
    whitened_input = self.whiten_data[:, :self.n_components]
    ica_model.fit(whitened_input)

    # combine the ICA result with the whitening components
    used_whiten_components = self.whiten_components[:self.n_components, :]
    indep_comp = np.dot(ica_model.components_, used_whiten_components)
    indep_sources = np.dot(indep_comp, data.to_numpy().T).T

    components_index = pd.RangeIndex(start=1,
                                     stop=self.n_components + 1,
                                     name="IC")

    sources_frame = pd.DataFrame(indep_sources,
                                 index=data.index,
                                 columns=components_index)
    self.projected_data = sources_frame.add_prefix("IC_")

    # add_prefix renames the columns, so prefix the transposed frame
    components_frame = pd.DataFrame(indep_comp,
                                    index=components_index,
                                    columns=data.columns)
    self.components = components_frame.T.add_prefix("IC_").T

    iteration_stats = {"Number of used iterations": ica_model.n_iter_}
    logging_print(stats_dict_to_string(iteration_stats))
    timer_print(fit_start_time,
                prefix="FastICA component optimalisation is ready")
def create_permutations_matrixes(self):
    """Create or load the permuted version of ``self.components_data``.

    The permuted frame is the component frame with a randomly shuffled
    index. It is cached as a pickle next to the components file. The cache
    is reused when it exists and ``self.force`` is False, otherwise a new
    permutation is drawn and written.
    """
    started_at = time.time()
    permutation_path = "{}_permutation_matrix.pkl".format(
        self.components_data_path)

    use_cached = os.path.isfile(permutation_path) and self.force == False
    if not use_cached:
        shuffled_index = np.random.permutation(self.components_data.index)
        self.components_data_permutated = self.components_data.set_index(
            shuffled_index)
        self.components_data_permutated.to_pickle(permutation_path)
        logging_print("Permutation table is created, shape: {}".format(
            self.components_data_permutated.shape))
    else:
        self.components_data_permutated = pd.read_pickle(permutation_path)
        logging_print("Permutation dataframe is loaded from '{}', "
                      "shape: {}".format(
                          permutation_path,
                          self.components_data_permutated.shape))

    timer_print(started_at, prefix="permutation is ready")
def calculate_centriods(self):
    """Calculate the centroids, i.e. the average components over all runs.

    The components of the first run in ``self.overall_components`` define
    the order and the sign. The rows of every other run are reordered by
    highest absolute correlation with the first run before averaging.

    Returns:
        DataFrame of averaged components, labelled like the first run.
    """
    calc_cent_start_time = time.time()
    logging_print("Start calculation of the centriods")

    # start position, extracted from the first run
    first_run = self.overall_components[0]
    n_components = first_run.shape[0]
    first_values = first_run.to_numpy()
    signs = np.sign(first_values)
    first_abs = np.abs(first_values)

    ordered_runs = [first_abs * signs]

    # loop through all the other component runs
    for other_run in self.overall_components[1:]:
        other_run = other_run.loc[:, first_run.columns]
        other_run = other_run.iloc[:n_components, :]

        # correlation between the components of both runs; the block
        # [n_components:, :n_components] holds "other run" x "first run"
        correlations = np.abs(
            np.corrcoef(first_abs, np.abs(other_run.to_numpy())))
        correlations = correlations[n_components:, :n_components]

        # order based on the maximal correlation per first-run component
        matching_rows = np.argmax(correlations, axis=0)
        ordered_runs.append(other_run.iloc[matching_rows, :].to_numpy())

    # average over the runs, using the sign of the first run
    signed_runs = np.c_[np.abs(ordered_runs) * signs]
    averaged = signed_runs.mean(axis=0)
    centriod_df = pd.DataFrame(averaged,
                               index=first_run.index,
                               columns=first_run.columns)

    timer_print(calc_cent_start_time, prefix="Centroid calculation is ready")
    return centriod_df
def read_file(self):
    """Read the input matrix and store it in ``self.input_data``.

    A test run reads a 150 x 100 corner of the file with pandas. A normal
    run goes through ``read_pd_df`` and honours ``self.n_rows`` when that
    is neither ``None`` nor an empty string. ``self.n_components`` defaults
    to the smallest dimension of the loaded matrix.

    Raises:
        FileNotFoundError: if ``self.input_file_path`` is not a file.
    """
    started_at = time.time()

    if not os.path.isfile(self.input_file_path):
        raise FileNotFoundError("Cannot find input file: {}".format(
            self.input_file_path))

    if self.test_run:
        first_rows = pd.read_csv(self.input_file_path,
                                 sep="\t",
                                 index_col=0,
                                 nrows=150)
        self.input_data = first_rows.iloc[:150, :100]
    else:
        if self.n_rows != None and self.n_rows != '':
            row_limit = int(self.n_rows)
        else:
            row_limit = None
        self.input_data = read_pd_df(
            self.input_file_path,
            {"sep": "\t", "index_col": 0, "nrows": row_limit},
            force=self.force)

    if self.n_components is None:
        self.n_components = np.min(self.input_data.shape)

    loaded = self.input_data
    logging_print(
        stats_dict_to_string({
            "Input dataframe n_row": loaded.shape[0],
            "Input dataframe n_col": loaded.shape[1],
            "first column headers": loaded.columns.values[:5],
            "first row index": loaded.index.values[:5]
        }))
    timer_print(started_at,
                prefix="Reading input file ready",
                time_overview_log=self.process_time_overview)
def calculate_auc_values(self):
    """Compute the ROC AUC of the gene scores for every pathway.

    For each column of ``self.pathway_gene_scores`` that is also a column
    of ``self.matrix_data``, the pathway membership is used as label and
    the gene scores as prediction. Pathways missing from the matrix, or for
    which the ROC calculation raises ``ValueError``, are logged and
    skipped. The values (capped at 1.0) are stored in ``self.auc_values``.
    """
    cav_st_stime = time.time()

    # Build the lookup once instead of a new list for every pathway.
    known_pathways = set(self.matrix_data.columns)
    gene_index = self.matrix_data.index

    auc_values = {}
    for pathway_id in self.pathway_gene_scores.columns:
        if pathway_id not in known_pathways:
            logging_print("HPO term: {} not in matrix".format(pathway_id))
            continue

        pathway_gene_set = self.matrix_data.loc[:, pathway_id]
        pathway_gene_scores = self.pathway_gene_scores.loc[gene_index,
                                                           pathway_id]
        try:
            # "* 1" turns the membership column into numeric labels
            log_reg_fpr, log_reg_tpr, _ = roc_curve(pathway_gene_set * 1,
                                                    pathway_gene_scores)
            auc_values[pathway_id] = auc(log_reg_fpr, log_reg_tpr)
        except ValueError:
            logging_print(
                "Value error in auc calculation of pathway: {}".format(
                    pathway_id))

    auc_calc_values = pd.Series(auc_values)
    auc_calc_values[auc_calc_values > 1.0] = 1.0
    self.auc_values = auc_calc_values
    timer_print(cav_st_stime, prefix="AUC value calculation ready")
def fit(self, data):
    """Fit the FastICA model and store components and projected data.

    Whitening is performed through ``perform_data_whitening`` when
    ``self.whiten_components`` is ``None``. The sklearn FastICA object is
    created with ``whiten=False`` and fitted on the first
    ``self.n_components`` columns of the whitened data. Results are written
    to ``self.components`` and ``self.projected_data`` with ``IC_<n>``
    labels.
    """
    if self.whiten_components is None:
        self.perform_data_whitening(data)

    started_at = time.time()

    fast_ica = FastICA(algorithm="parallel",
                       whiten=False,
                       fun='logcosh',
                       max_iter=self.max_iter,
                       tol=1e-10)
    fast_ica.fit(self.whiten_data[:, :self.n_components])

    whitening_part = self.whiten_components[:self.n_components, :]
    indep_comp = np.dot(fast_ica.components_, whitening_part)
    data_values_t = data.to_numpy().T
    indep_sources = np.dot(indep_comp, data_values_t).T

    ic_index = pd.RangeIndex(start=1, stop=self.n_components + 1, name="IC")

    self.projected_data = pd.DataFrame(indep_sources,
                                       index=data.index,
                                       columns=ic_index).add_prefix("IC_")
    # prefix the row labels by prefixing the columns of the transpose
    transposed_components = pd.DataFrame(indep_comp,
                                         index=ic_index,
                                         columns=data.columns).T
    self.components = transposed_components.add_prefix("IC_").T

    logging_print(
        stats_dict_to_string(
            {"Number of used iterations": fast_ica.n_iter_}))
    timer_print(started_at,
                prefix="FastICA component optimalisation is ready")
def perform_data_preprocessing(self):
    """Run the optional log2 and center/scale steps on ``self.input_data``.

    ``self.perform_log2`` enables the log2 transformation (zero cells stay
    0.0). ``self.pre_processing_center_scale`` enables scaling with
    ``StandardScaler``; without ``self.over_samples`` the matrix is
    transposed before and after scaling. The index and columns of the
    input frame are preserved.
    """
    pp_start_time = time.time()

    if self.perform_log2:
        values = self.input_data.to_numpy(dtype=float)
        # With ``where=`` and no ``out=`` NumPy leaves the skipped cells
        # uninitialised; an explicit zero-filled output keeps zeros at 0.0.
        transformed = np.zeros_like(values)
        np.log2(values, out=transformed, where=(values != 0.0))
        self.input_data = pd.DataFrame(transformed,
                                       index=self.input_data.index,
                                       columns=self.input_data.columns)
        logging_print(
            "Log 2 transformation is performed on the input data")

    if self.pre_processing_center_scale:
        scaler = StandardScaler()
        if self.over_samples:
            scaled_np_data = scaler.fit_transform(self.input_data)
        else:
            scaled_np_data = scaler.fit_transform(self.input_data.T).T
        # the scaler returns a NumPy array; restore the labels
        self.input_data = pd.DataFrame(scaled_np_data,
                                       index=self.input_data.index,
                                       columns=self.input_data.columns)
        logging_print(
            "Centering and scaling is performed on the input data")

    timer_print(pp_start_time,
                prefix="Data pre-processing ready",
                time_overview_log=self.process_time_overview)
def read_files(self):
    """Read the components, pathway matrix and optional background genes.

    Sets ``self.components_data`` and ``self.matrix_data`` (the matrix is
    converted to booleans with ``df == 1.0`` by the callback handed to
    ``read_pd_df``). When ``self.background_genes_path`` is set, the first
    column of that file is stored in ``self.background_genes_data``. A
    short summary of every loaded table is logged.

    Raises:
        FileNotFoundError: if ``self.components_data_path`` is not a file.
    """
    rf_start_time = time.time()

    def preview(labels):
        # Labels can be parsed as numbers (e.g. numeric gene ids) and
        # str.join only accepts strings, so convert before joining.
        return ", ".join(map(str, labels[:5]))

    # read input component file
    if os.path.isfile(self.components_data_path):
        self.components_data = read_pd_df(self.components_data_path, {
            "sep": "\t",
            "index_col": 0
        }, force=self.force)
        components_data_info = "Components dataframe n_row: {}, " \
                               "n_col: {}\n" \
                               "first column headers: {}\n" \
                               "first row index: {}".format(
            *self.components_data.shape,
            preview(self.components_data.columns.values),
            preview(self.components_data.index.values),
        )
        logging_print(components_data_info)
    else:
        raise FileNotFoundError("Cannot find input file: {}".format(
            self.components_data_path))

    # read matrix file
    self.matrix_data = read_pd_df(self.matrix_path, {
        "sep": "\t",
        "index_col": 0
    }, force=self.force, proc_df_before_save=lambda df: df == 1.0)
    matrix_data_info = "Matrix dataframe n_row: {}, " \
                       "n_col: {}\n" \
                       "first column headers: {}\n" \
                       "first row index: {}".format(
        *self.matrix_data.shape,
        preview(self.matrix_data.columns.values),
        preview(self.matrix_data.index.values),
    )
    print(matrix_data_info)
    logging.info(matrix_data_info)

    # reading background gene path
    if self.background_genes_path is not None and self.background_genes_path != '':
        self.background_genes_data = read_pd_df(self.background_genes_path, {
            "sep": "\t",
            "header": None
        }, force=self.force)
        # convert to serie instead of matrix
        self.background_genes_data = self.background_genes_data.iloc[:, 0]
        background_genes_data_info = "Background genes file loaded:\n" \
                                     "number of genes: {}, " \
                                     "first genes: {}\n".format(
            self.background_genes_data.shape[0],
            preview(self.background_genes_data.values)
        )
        logging_print(background_genes_data_info)

    timer_print(rf_start_time, prefix="Reading input file ready")
def write_output_files(self, file_prefix=None):
    """Export the eigenvectors and PC scores of the analysis object.

    Args:
        file_prefix: optional text that is put (followed by ``_``) in front
            of the ``eigenvectors`` / ``pc-scores`` file names.

    Without ``self.over_samples`` the eigenvectors are the transposed
    components and the PC scores the projected data; with it the roles are
    swapped. Text and pickle exports are controlled by the
    ``output_disable_*`` flags. For a stable FastICA analysis with
    ``fastica_stable_safe_intermediates`` set, every individual run is
    pickled as well.
    """
    wof_start_time = time.time()

    if self.over_samples is False:
        eigenvectors_df = self.analysis_object.components.T
        pc_scores_df = self.analysis_object.projected_data
    else:
        eigenvectors_df = self.analysis_object.projected_data
        pc_scores_df = self.analysis_object.components.T

    # the date ends up as header of the first column of the output files
    date_label = datetime.now().strftime('%d/%m/%Y')
    eigenvectors_df.index.name = date_label
    pc_scores_df.index.name = date_label

    eigenvectors_file_name = "eigenvectors"
    pc_scores_file_name = "pc-scores"
    if file_prefix is not None:
        eigenvectors_file_name = "{}_{}".format(file_prefix,
                                                eigenvectors_file_name)
        pc_scores_file_name = "{}_{}".format(file_prefix,
                                             pc_scores_file_name)

    def export_frame(frame, base_name):
        # Write one frame as text and/or pickle, following the flags.
        if self.output_disable_txt == False:
            if self.output_disable_gzip:
                frame.to_csv(os.path.join(self.output_dir,
                                          base_name + ".txt"),
                             sep='\t')
            else:
                # pandas infers gzip only from a ".gz" suffix; with
                # ".gzip" the file would silently be written uncompressed,
                # so the compression is requested explicitly. The file
                # name itself is kept for existing consumers.
                frame.to_csv(os.path.join(self.output_dir,
                                          base_name + ".txt.gzip"),
                             sep='\t',
                             compression="gzip")
        if self.output_disable_pickle == False:
            frame.to_pickle(
                os.path.join(self.output_dir, base_name + ".pkl"))

    # export eigenvectors (components)
    export_frame(eigenvectors_df, eigenvectors_file_name)
    # export PC scores
    export_frame(pc_scores_df, pc_scores_file_name)

    timer_print(wof_start_time,
                prefix="Writing output files ready",
                time_overview_log=self.process_time_overview)

    if self.fastica_stable_safe_intermediates and self.analysis_type == decomposition_types[
            "FASTICA_STABLE"]:
        individual_run_dir = os.path.join(
            self.output_dir, "FastICA_individual_component_runs")
        create_output_dir_if_not_exists(individual_run_dir)
        for index, run_df in enumerate(
                self.analysis_object.overall_components):
            ind_df_path = os.path.join(
                individual_run_dir,
                "fastica_components_run_{}.pkl".format(index + 1))
            run_df.to_pickle(ind_df_path)
def perform_analysis(self):
    """Fit ``self.analysis_object`` on ``self.get_analysis_input_df()``."""
    analysis_started = time.time()

    analysis = self.analysis_object
    input_frame = self.get_analysis_input_df()
    analysis.fit(input_frame)

    timer_print(analysis_started,
                prefix="Performing analysis ready",
                time_overview_log=self.process_time_overview)
def write_last_loginfo(self):
    """Write the closing timer entry, measured from ``self.start_time``."""
    analysis_started = self.start_time
    overview_log = self.process_time_overview
    timer_print(analysis_started,
                prefix="## ANALYSIS READY",
                time_overview_log=overview_log)
def perform_analysis(self):
    """Calculate the gene scores of all pathways with worker processes.

    Every column of ``self.matrix_data`` is a pathway. Pathways already
    present in a temporary result file in the output directory are reused,
    the others are put on a queue for ``single_pathway_worker`` processes.
    Results are collected from a second queue; a worker signals that it is
    finished by putting the string ``"DONE"`` on it (the worker itself is
    not visible here). About every two minutes the collected results are
    pickled to a temporary file. The final matrix is stored in
    ``self.pathway_gene_scores`` and pickled as well.
    """
    pa_start_time = time.time()
    total_z_score_results = []

    # check if some temp files where present
    temp_z_score_paths = glob.glob(
        os.path.join(self.output_dir,
                     "temp_results_analysis_z_scores_*.pkl"))
    temp_z_scores = None
    # set: constant time lookup for every pathway below
    already_processed_pathways = set()
    if len(temp_z_score_paths) > 0:
        temp_z_scores = pd.read_pickle(temp_z_score_paths[0])
        logging_print(
            "Temp file '{}' with already processed pathways loaded. size df: {}"
            .format(temp_z_score_paths[0], temp_z_scores.shape))
        already_processed_pathways = set(temp_z_scores.columns)

    pathway_manager = mp.Manager()
    pathway_queue = pathway_manager.Queue()
    retults_manager = mp.Manager()
    results_queue = retults_manager.Queue()

    # Only the column labels are needed. DataFrame.iteritems() was removed
    # in pandas 2.0, so iterate over the columns directly.
    for pathway_id in self.matrix_data.columns:
        if pathway_id in already_processed_pathways:
            total_z_score_results.append(temp_z_scores.loc[:, pathway_id])
        else:
            pathway_queue.put(pathway_id)

    logging_print("total pathways already done: {}".format(
        len(total_z_score_results)))

    # One core is left for this collecting process, but at least one
    # worker is needed: with zero workers the loop below would stop
    # immediately and the queued pathways would never be processed.
    n_workers = max(1, self.n_cores - 1)
    processes = []
    for _ in range(n_workers):
        processes.append(
            mp.Process(target=single_pathway_worker,
                       args=(self.components_data, self.matrix_data,
                             pathway_queue, results_queue,
                             self.background_genes_data,
                             self.analysis_type, -1,
                             self.components_data_permutated)))
    for process in processes:
        process.start()

    total_done = 0
    last_save_time = time.time()
    save_time_in_minutes = 2
    z_score_file_path = None

    while True:
        try:
            if total_done >= n_workers:
                # All workers are ready
                break

            if results_queue.empty():
                # Wait for the next results
                time.sleep(5)
            else:
                # Process the results
                results = results_queue.get(True, timeout=1)
                if isinstance(results, str) and results == "DONE":
                    # One worker is done
                    total_done += 1
                    logging_print("total workers done {} of {}".format(
                        total_done, n_workers))
                else:
                    # Save the results
                    total_z_score_results.append(results)

            # save temp results
            if last_save_time + 60 * save_time_in_minutes < time.time():
                logging_print("save temp results: {}".format(
                    datetime.now()))
                last_save_time = time.time()
                temp_dataframe_z_scores = pd.DataFrame(
                    total_z_score_results).T
                new_z_score_file_path = os.path.join(
                    self.output_dir,
                    "temp_results_analysis_z_scores_{}.pkl".format(
                        datetime.now()))
                temp_dataframe_z_scores.to_pickle(new_z_score_file_path)
                # keep only the newest temp file of this run
                if z_score_file_path is not None and os.path.isfile(
                        z_score_file_path):
                    os.remove(z_score_file_path)
                z_score_file_path = new_z_score_file_path
        except queue.Empty:
            time.sleep(1)
            continue

    for process in processes:
        process.join()
    print("all processes ready")

    self.pathway_gene_scores = pd.DataFrame(total_z_score_results).T
    pathway_gene_scores_temp_file_path = os.path.join(
        self.output_dir,
        "temp_pathway_gene_scores_temp_{}.pkl".format(datetime.now()))
    self.pathway_gene_scores.to_pickle(pathway_gene_scores_temp_file_path)
    timer_print(pa_start_time, prefix="Performing analysis ready")
def write_output_files(self, file_prefix=None):
    """Create the output files of the decomposition.

    Args:
        file_prefix: optional prefix (joined with ``_``) for the file
            names ``eigenvectors`` and ``pc-scores``.

    Normally the eigenvectors file holds the components (genes /
    components matrix) and the pc scores file the rotated data (samples /
    components matrix). When ``over_samples`` is set the eigenvectors file
    holds the rotated data and the pc scores file the components.
    """
    wof_start_time = time.time()

    if self.over_samples is False:
        eigenvectors_df = self.analysis_object.components.T
        pc_scores_df = self.analysis_object.projected_data
    else:
        eigenvectors_df = self.analysis_object.projected_data
        pc_scores_df = self.analysis_object.components.T

    # add the date information to the index name (first column of the
    # output files)
    export_date = datetime.now().strftime('%d/%m/%Y')
    eigenvectors_df.index.name = export_date
    pc_scores_df.index.name = export_date

    # Create the file names
    eigenvectors_file_name = "eigenvectors"
    pc_scores_file_name = "pc-scores"
    if file_prefix is not None:
        eigenvectors_file_name = "{}_{}".format(file_prefix,
                                                eigenvectors_file_name)
        pc_scores_file_name = "{}_{}".format(file_prefix,
                                             pc_scores_file_name)

    def write_frame(frame, base_name):
        # Text and/or pickle export of a single frame.
        if self.output_disable_txt == False:
            if self.output_disable_gzip:
                txt_path = os.path.join(self.output_dir, base_name + ".txt")
                frame.to_csv(txt_path, sep='\t')
            else:
                # ".gzip" is not a suffix pandas maps to a compression
                # method (only ".gz" is), so without an explicit
                # ``compression`` the file is written as plain text. The
                # existing file name is kept on purpose.
                gzip_path = os.path.join(self.output_dir,
                                         base_name + ".txt.gzip")
                frame.to_csv(gzip_path, sep='\t', compression="gzip")
        if self.output_disable_pickle == False:
            frame.to_pickle(
                os.path.join(self.output_dir, base_name + ".pkl"))

    # export eigenvectors (components)
    write_frame(eigenvectors_df, eigenvectors_file_name)
    # export PC scores
    write_frame(pc_scores_df, pc_scores_file_name)

    # write the export time to the log file
    timer_print(wof_start_time,
                prefix="Writing output files ready",
                time_overview_log=self.process_time_overview)

    # if fastICA Stable analysis is used and the option to save the
    # intermediate steps is set, export the individual runs as well
    if self.fastica_stable_safe_intermediates and self.analysis_type == decomposition_types["FASTICA_STABLE"]:
        individual_run_dir = os.path.join(self.output_dir,
                                          "FastICA_individual_component_runs")
        create_output_dir_if_not_exists(individual_run_dir)
        for index, run_df in enumerate(self.analysis_object.overall_components):
            ind_df_path = os.path.join(individual_run_dir,
                                       "fastica_components_run_{}.pkl".format(index + 1))
            run_df.to_pickle(ind_df_path)
def calculate_p_value_bonferroni_correction(self, alpha=0.05):
    """Bonferroni correct ``self.p_values`` into ``self.bonf_p_values``.

    Every p-value is multiplied by the number of p-values and the result is
    limited to the range [0, 1], the same limits ``merge_output_files``
    applies to its merged values.

    Args:
        alpha: not used by the calculation; kept for interface
            compatibility.
    """
    cpvbc_st_time = time.time()
    number_of_tests = self.p_values.shape[0]
    corrected = self.p_values * number_of_tests
    # a corrected p-value above 1 has no meaning, so cap it
    self.bonf_p_values = corrected.clip(lower=0.0, upper=1.0)
    timer_print(cpvbc_st_time, prefix="Bonferroni p value correction ready")
def merge_output_files(self):
    """Merge the per-node results of a multi node run.

    Nothing is merged unless ``self.multi_node_output_dir`` is set and the
    number of ``predictions_auc.pkl`` files found in its sub directories
    equals ``self.multi_node_num_nodes``. The AUC tables are concatenated
    row wise and get a ``bonferroni`` column limited to [0, 1]; the gene
    pathway score tables are concatenated column wise. Both results are
    written as text and/or pickle, depending on the ``output_disable_*``
    flags.
    """
    started_at = time.time()
    merge_dir = self.multi_node_output_dir

    if merge_dir:
        auc_pickles = glob.glob(
            os.path.join(merge_dir, "*", "predictions_auc.pkl"))

        if len(auc_pickles) == self.multi_node_num_nodes:
            logging_print("Merge output files")

            # merge AUC prediction files
            overall_auc_df = pd.concat(
                [pd.read_pickle(auc_path) for auc_path in auc_pickles])

            bonf_values = overall_auc_df["pValue"] * overall_auc_df.shape[0]
            bonf_values[bonf_values > 1] = 1
            bonf_values[bonf_values < 0] = 0
            overall_auc_df["bonferroni"] = bonf_values

            if self.output_disable_txt == False:
                if self.output_disable_gzip:
                    auc_txt_name = "predictions_auc_bonf.txt"
                else:
                    auc_txt_name = "predictions_auc_bonf.txt.gz"
                overall_auc_df.to_csv(
                    os.path.join(merge_dir, auc_txt_name),
                    columns=["geneCount", "pValue", "auc", "bonferroni"],
                    sep='\t')

            if self.output_disable_pickle == False:
                overall_auc_df.to_pickle(
                    os.path.join(merge_dir, "predictions_auc_bonf.pkl"))

            # merge gene pathway files
            score_pickles = glob.glob(
                os.path.join(merge_dir, "*", "gene_pathway_scores.pkl"))
            overall_gene_pathway_df = pd.concat(
                [pd.read_pickle(score_path) for score_path in score_pickles],
                axis=1)

            if self.output_disable_txt == False:
                if self.output_disable_gzip:
                    scores_txt_name = "gene_pathway_scores.txt"
                else:
                    scores_txt_name = "gene_pathway_scores.txt.gz"
                overall_gene_pathway_df.to_csv(
                    os.path.join(merge_dir, scores_txt_name), sep='\t')

            if self.output_disable_pickle == False:
                overall_gene_pathway_df.to_pickle(
                    os.path.join(merge_dir, "gene_pathway_scores.pkl"))

    timer_print(started_at, prefix="Writing merged outputfile ready")
def write_last_loginfo(self):
    """Add the end information of the analysis to the log.

    The timer entry is measured from ``self.start_time`` and is also given
    ``self.process_time_overview`` as time overview log.
    """
    final_timer_options = {
        "prefix": "## ANALYSIS READY",
        "time_overview_log": self.process_time_overview,
    }
    timer_print(self.start_time, **final_timer_options)
def write_last_loginfo(self):
    """Write the closing timer entry, measured from ``self.start_time``."""
    run_started_at = self.start_time
    timer_print(run_started_at, prefix="## ANALYSIS READY")