def perform_data_preprocessing(self):
    #
    # Perform the preprocessing steps
    #
    pp_start_time = time.time()
    # Perform a log2 transformation over the input data. Zeros are kept
    # at zero instead of becoming -inf; the explicit ``out`` buffer is
    # needed because entries skipped by ``where`` are otherwise left
    # uninitialised.
    if self.perform_log2:
        log_values = np.log2(self.input_data.to_numpy(),
                             out=np.zeros(self.input_data.shape),
                             where=(self.input_data.to_numpy() != 0.0))
        self.input_data = pd.DataFrame(log_values,
                                       index=self.input_data.index,
                                       columns=self.input_data.columns)
        logging_print("Log2 transformation is performed on the input data")
    # Perform centering and scaling over the input data
    if self.pre_processing_center_scale:
        scaler = StandardScaler()
        if self.over_samples:
            scaled_np_data = scaler.fit_transform(self.input_data)
        else:
            # Transpose the dataset before and after scaling
            scaled_np_data = scaler.fit_transform(self.input_data.T).T
        # The scaler returns a NumPy array; create a new pandas DataFrame
        self.input_data = pd.DataFrame(scaled_np_data,
                                       index=self.input_data.index,
                                       columns=self.input_data.columns)
        logging_print("Centering and scaling is performed on the input data")
    timer_print(pp_start_time,
                prefix="Data pre-processing ready",
                time_overview_log=self.process_time_overview)
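# --- Illustrative sketch (not part of the pipeline) --------------------------
# A minimal, self-contained demo of the two preprocessing steps above on a
# made-up genes x samples table. It shows why ``np.log2(..., where=...)``
# needs an explicit ``out`` buffer (skipped entries would otherwise be left
# uninitialised) and how the transpose trick switches the scaling axis. The
# data and the function name are hypothetical.
def _demo_preprocessing():
    import numpy as np
    import pandas as pd
    from sklearn.preprocessing import StandardScaler

    data = pd.DataFrame([[0.0, 2.0, 8.0],
                         [4.0, 1.0, 16.0]],
                        index=["gene_a", "gene_b"],
                        columns=["s1", "s2", "s3"])

    # log2 transform that keeps zeros at zero instead of producing -inf
    logged = np.log2(data.to_numpy(),
                     out=np.zeros(data.shape),
                     where=(data.to_numpy() != 0.0))

    # StandardScaler standardises each column; transposing before and
    # after therefore switches the scaling to the rows instead
    scaled_per_column = StandardScaler().fit_transform(logged)
    scaled_per_row = StandardScaler().fit_transform(logged.T).T
    return scaled_per_column, scaled_per_row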
def calculate_wilcox_p_value(self):
    cwpv_st_time = time.time()
    wal_p_values = {}
    for pathway_id in self.auc_values.index:
        if pathway_id in list(self.matrix_data.columns):
            selected_pathway = self.pathway_gene_scores.loc[
                self.matrix_data.index, pathway_id]
            # Split the gene scores into genes inside and outside the pathway
            include_in_pathway = selected_pathway[
                self.matrix_data.loc[:, pathway_id]]
            include_not_in_pathway = selected_pathway[
                ~self.matrix_data.loc[:, pathway_id]]
            try:
                _, p_value = scipy.stats.mannwhitneyu(
                    include_in_pathway,
                    include_not_in_pathway,
                    use_continuity=True,
                    alternative="two-sided")
                wal_p_values[pathway_id] = p_value
            except ValueError:
                logging_print(
                    "Value error in p value calculation; p value of 0.0 is "
                    "set for pathway: {}".format(pathway_id))
                wal_p_values[pathway_id] = 0.0
    self.p_values = pd.Series(wal_p_values)
    timer_print(cwpv_st_time, prefix="Wilcoxon p value calculation ready")
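# --- Illustrative sketch (not part of the pipeline) --------------------------
# A minimal demo of the Mann-Whitney U (Wilcoxon rank-sum) test used above:
# the scores of genes annotated to a pathway are compared against the scores
# of the remaining genes. The score values are made up for illustration.
def _demo_mann_whitney():
    import numpy as np
    import scipy.stats

    in_pathway = np.array([2.1, 2.5, 3.0, 2.8])            # hypothetical
    not_in_pathway = np.array([0.1, -0.3, 0.5, 0.0, 0.2])  # hypothetical

    _, p_value = scipy.stats.mannwhitneyu(in_pathway,
                                          not_in_pathway,
                                          use_continuity=True,
                                          alternative="two-sided")
    return p_value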
def read_file(self):
    #
    # Method to read all the input files
    #
    # Save the start time
    rf_start_time = time.time()
    # Read the input file if present
    if os.path.isfile(self.input_file_path):
        if self.test_run:
            # Read a test dataset: only the first 150 rows and 100 columns
            self.input_data = pd.read_csv(self.input_file_path,
                                          sep="\t",
                                          index_col=0,
                                          nrows=150)
            self.input_data = self.input_data.iloc[:150, :100]
        else:
            # Check if only a part of the rows must be loaded instead
            # of the complete dataset
            n_rows = None
            if self.n_rows is not None and self.n_rows != '':
                n_rows = int(self.n_rows)
            # Read the input file. This can be a (cached) pandas pickle
            # file or a tab-separated text file, which may be compressed.
            # If force is set to True, the method will not read cached
            # pickle files created from the original txt matrix if these
            # are present. By default the cached version (which carries
            # the suffix _cashed.pickle) will be loaded if this file is
            # present in the same directory as the input matrix.
            self.input_data = read_pd_df(self.input_file_path,
                                         {
                                             "sep": "\t",
                                             "index_col": 0,
                                             "nrows": n_rows
                                         },
                                         force=self.force)
    else:
        raise FileNotFoundError("Cannot find input file: {}".format(
            self.input_file_path))
    # If the number of components is not set, use the smallest
    # dimension of the input matrix
    if self.n_components is None:
        self.n_components = np.min(self.input_data.shape)
    # Log some basic info
    logging_print(stats_dict_to_string({
        "Input dataframe n_row": self.input_data.shape[0],
        "Input dataframe n_col": self.input_data.shape[1],
        "first column headers": self.input_data.columns.values[:5],
        "first row index": self.input_data.index.values[:5]
    }))
    timer_print(rf_start_time,
                prefix="Reading input file ready",
                time_overview_log=self.process_time_overview)
def perform_filtering(self):
    if self.minimal_number_of_genes > 0:
        pf_start_time = time.time()
        # Keep only pathways that contain at least the minimal number
        # of genes
        matrix_selection = self.matrix_data.sum(
            axis=0) >= self.minimal_number_of_genes
        self.matrix_data = self.matrix_data.loc[:, matrix_selection]
        logging_print(
            "Minimal gene filtering: {} pathways remaining".format(
                self.matrix_data.shape[1]))
        timer_print(pf_start_time,
                    prefix="Minimal gene in pathway filtering ready")
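# --- Illustrative sketch (not part of the pipeline) --------------------------
# A tiny demo of the filter above: columns of the boolean gene x pathway
# membership matrix are kept only when they contain at least the minimal
# number of genes. The membership matrix is made up for illustration.
def _demo_min_gene_filter(min_genes=2):
    import pandas as pd

    matrix = pd.DataFrame({"pw_a": [True, True, False],
                           "pw_b": [True, False, False]},
                          index=["gene_1", "gene_2", "gene_3"])
    # column sums count the genes per pathway; pw_a (2 genes) survives
    return matrix.loc[:, matrix.sum(axis=0) >= min_genes]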
def perform_data_whitening(self, data):
    # Method to perform a manual data whitening step, which is the
    # first step in the FastICA analysis
    pdw_start_time = time.time()
    logging_print("FastICA, start data whitening")
    # Use the SVD wrapper to perform the whitening step based on a PCA
    whitened_svd_object = SVD_wrapper(svd_type=self.svd_type,
                                      n_components=self.n_components,
                                      white_data=True)
    # Save the results
    self.whiten_data = whitened_svd_object.fit_transform(data).to_numpy()
    self.whiten_components = whitened_svd_object.components.to_numpy()
    timer_print(pdw_start_time, prefix="FastICA, data whitening ready")
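# --- Illustrative sketch (not part of the pipeline) --------------------------
# ``SVD_wrapper`` is project code; as a rough stand-in, sklearn's
# ``PCA(whiten=True)`` shows what a whitening step produces: projected data
# whose components are uncorrelated with unit variance, which is the
# starting point FastICA expects. All names and data here are hypothetical.
def _demo_whitening():
    import numpy as np
    from sklearn.decomposition import PCA

    rng = np.random.default_rng(0)
    data = rng.normal(size=(200, 10))

    pca = PCA(n_components=5, whiten=True)
    whitened = pca.fit_transform(data)

    # the covariance of the whitened data is (numerically) the identity
    cov = np.cov(whitened, rowvar=False)
    assert np.allclose(cov, np.eye(5), atol=1e-6)
    return whitened, pca.components_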
def write_base_loginfo(self):
    info = "## START GENE PATHWAY ANALYSIS (new p value method) ##\n" \
           "DATE:\t{date}\n" \
           "Component file:\t{component_file}\n" \
           "Matrix file:\t{matrix_file}\n" \
           "Background gene file:\t{background_gene_file}\n" \
           "Output dir:\t{output_dir}\n" \
           "Analysis type:\t{analysis_type}\n" \
           "Number of cores:\t{num_cores}\n" \
           "".format(date=datetime.now(),
                     component_file=self.components_data_path,
                     matrix_file=self.matrix_path,
                     background_gene_file=self.background_genes_path,
                     output_dir=self.output_dir,
                     analysis_type=self.analysis_type,
                     num_cores=self.n_cores)
    logging_print(info)
def fit(self, data):
    # Method to fit the FastICA model
    # Check if the whitening step is already done
    if self.whiten_components is None:
        # Perform the whitening step
        self.perform_data_whitening(data)
    fit_start_time = time.time()
    # Create the FastICA object from sklearn without performing
    # the whitening step
    fastICA_object = FastICA(algorithm="parallel",
                             whiten=False,
                             fun='logcosh',
                             max_iter=self.max_iter,
                             tol=1e-10)
    # Fit the model
    fastICA_object.fit(self.whiten_data[:, :self.n_components])
    # Calculate the independent components and the sources and save
    # the results
    indep_comp = np.dot(fastICA_object.components_,
                        self.whiten_components[:self.n_components, :])
    indep_sources = np.dot(indep_comp, data.to_numpy().T).T
    components_index = pd.RangeIndex(start=1,
                                     stop=self.n_components + 1,
                                     name="IC")
    indep_sources_df = pd.DataFrame(
        indep_sources,
        index=data.index,
        columns=components_index).add_prefix("IC_")
    indep_comp_df = pd.DataFrame(
        indep_comp,
        index=components_index,
        columns=data.columns).T.add_prefix("IC_").T
    self.projected_data = indep_sources_df
    self.components = indep_comp_df
    logging_print(
        stats_dict_to_string(
            {"Number of used iterations": fastICA_object.n_iter_}))
    timer_print(fit_start_time,
                prefix="FastICA component optimisation is ready")
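# --- Illustrative sketch (not part of the pipeline) --------------------------
# The key step in ``fit`` is the matrix composition: FastICA fitted on the
# whitened data yields an unmixing matrix W (n_components x n_components);
# multiplying W with the whitening components K (n_components x n_genes)
# expresses the independent components in the original gene space, and
# projecting the data onto those components gives the sources. A minimal
# numpy sketch with made-up shapes and random stand-in matrices:
def _demo_component_composition():
    import numpy as np

    rng = np.random.default_rng(1)
    n_samples, n_genes, n_components = 50, 20, 5

    data = rng.normal(size=(n_samples, n_genes))
    unmixing = rng.normal(size=(n_components, n_components))     # W
    whiten_components = rng.normal(size=(n_components, n_genes))  # K

    indep_comp = unmixing @ whiten_components    # components, gene space
    indep_sources = (indep_comp @ data.T).T      # samples x components
    assert indep_sources.shape == (n_samples, n_components)
    return indep_comp, indep_sources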
def write_base_loginfo(self):
    logging_print(
        stats_dict_to_string({
            "## START DECOMPOSITION ##": "",
            "DATE": datetime.now(),
            "Input file": self.input_file_path,
            "Output dir": self.output_dir,
            "Analysis type": self.analysis_type,
            "Over samples": self.over_samples,
            "Test run": self.test_run,
            "FastICA max iter": self.fastICA_max_iter,
            "Number of components": self.n_components,
            "Number of rows": self.n_rows,
            "Perform log2 transformation": self.perform_log2,
            "Perform centering and scaling": self.pre_processing_center_scale,
            "Force": self.force
        }))
def create_permutations_matrixes(self):
    perm_start_time = time.time()
    permutation_path = "{}_permutation_matrix.pkl".format(
        self.components_data_path)
    if os.path.isfile(permutation_path) and not self.force:
        self.components_data_permutated = pd.read_pickle(permutation_path)
        logging_print("Permutation dataframe is loaded from '{}', "
                      "shape: {}".format(
                          permutation_path,
                          self.components_data_permutated.shape))
    else:
        # Shuffle the gene index of the component table once and cache
        # the result, so the same permutation is reused between runs
        self.components_data_permutated = self.components_data.set_index(
            np.random.permutation(self.components_data.index))
        self.components_data_permutated.to_pickle(permutation_path)
        logging_print("Permutation table is created, shape: {}".format(
            self.components_data_permutated.shape))
    timer_print(perm_start_time, prefix="Permutation is ready")
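# --- Illustrative sketch (not part of the pipeline) --------------------------
# The permutation matrix above is simply the component table with its row
# index (the gene names) shuffled once, so downstream pathway tests can be
# repeated against a gene-label null. A minimal demo on made-up data:
def _demo_index_permutation():
    import numpy as np
    import pandas as pd

    components = pd.DataFrame(np.arange(6).reshape(3, 2),
                              index=["gene_a", "gene_b", "gene_c"],
                              columns=["IC_1", "IC_2"])
    # values stay in place; only the gene labels are reassigned
    permutated = components.set_index(
        np.random.permutation(components.index))
    return permutated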
def calculate_centriods(self):
    # Method to calculate the centroids, i.e. the average components,
    # from the components extracted from the individual runs
    calc_cent_start_time = time.time()
    logging_print("Start calculation of the centroids")
    # Start position, extracted from the first run
    initial_df = self.overall_components[0]
    gene_names = initial_df.columns
    n_components = initial_df.shape[0]
    initial_np = initial_df.to_numpy()
    signs = np.sign(initial_np)
    initial_np = np.abs(initial_np)
    processed_dfs = [initial_np * signs]
    # Loop through all the component runs
    for ica_df in self.overall_components[1:]:
        # Sort the components based on the highest correlation between
        # the components and the components from the first run. This
        # results in the same order of the FastICA components over all
        # runs; by default, the components of a FastICA run are
        # randomly ordered.
        ica_df = ica_df.loc[:, gene_names]
        ica_df = ica_df.iloc[:n_components, :]
        # Calculate the correlation
        corr_table = np.abs(
            np.corrcoef(initial_np, np.abs(ica_df.to_numpy())))
        corr_table = corr_table[n_components:, :n_components]
        # Find the order based on the maximal correlation
        max_indexes_columns = np.argmax(corr_table, axis=0)
        # Order the components
        ica_df = ica_df.iloc[max_indexes_columns, :]
        # Save the components in the right order
        processed_dfs.append(ica_df.to_numpy())
    # Calculate the centroids, i.e. the average components
    centroids_array = np.abs(np.array(processed_dfs)) * signs
    centroids = centroids_array.mean(axis=0)
    centroid_df = pd.DataFrame(centroids,
                               index=initial_df.index,
                               columns=initial_df.columns)
    timer_print(calc_cent_start_time,
                prefix="Centroid calculation is ready")
    # Return the average components
    return centroid_df
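# --- Illustrative sketch (not part of the pipeline) --------------------------
# The matching trick used above, isolated: components from a later run are
# reordered so each lines up with the most strongly (absolutely) correlated
# component of the first run. ``np.corrcoef`` on two stacked
# (n_components x n_genes) blocks yields a 2n x 2n matrix; the lower-left
# n x n block holds the cross-run correlations. The arrays are made up.
def _demo_component_matching():
    import numpy as np

    rng = np.random.default_rng(2)
    first_run = rng.normal(size=(3, 10))
    # a second run with the same components in a different order
    later_run = first_run[[2, 0, 1], :]

    corr = np.abs(np.corrcoef(np.abs(first_run), np.abs(later_run)))
    cross = corr[3:, :3]              # later-run rows vs first-run columns
    order = np.argmax(cross, axis=0)  # best later-run match per component
    assert np.array_equal(later_run[order], first_run)
    return order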
def perform_multinode_processing(self):
    # Handle the new start and end positions
    if self.split_end is not None:
        self.matrix_data = self.matrix_data.iloc[:, :self.split_end]
    if self.split_start is not None:
        self.matrix_data = self.matrix_data.iloc[:, self.split_start:]
    if self.split_start is not None or self.split_end is not None:
        logging_print("Trim matrix, new start: {start}, "
                      "new end: {end}, dataframe size: {df_size}".format(
                          start=self.split_start,
                          end=self.split_end,
                          df_size=self.matrix_data.shape))
    # Handle multi-node processing on a cluster
    if self.multi_node_num_nodes is not None and \
            self.multi_node_node_id is not None:
        num_pathways = self.matrix_data.shape[1]
        # Ceil division: every node gets an equal contiguous share and
        # the last node absorbs the remainder
        pathways_per_node = num_pathways // self.multi_node_num_nodes
        if pathways_per_node * self.multi_node_num_nodes < num_pathways:
            pathways_per_node += 1
        start_id = self.multi_node_node_id * pathways_per_node
        end_id = (self.multi_node_node_id + 1) * pathways_per_node
        if end_id > num_pathways:
            end_id = num_pathways
        self.matrix_data = self.matrix_data.iloc[:, start_id:end_id]
        logging_print(
            "## Process on multiple nodes ##\n"
            "Node: {node_id} of {node_num}\n"
            "Number pathways per node: {num_pathways_per_node} of {num_pathways}\n"
            "Start id: {start_id}\n"
            "End id: {end_id}\n"
            "Dataframe size: {df_size}".format(
                node_id=self.multi_node_node_id + 1,
                node_num=self.multi_node_num_nodes,
                num_pathways_per_node=pathways_per_node,
                num_pathways=num_pathways,
                start_id=start_id,
                end_id=end_id,
                df_size=self.matrix_data.shape))
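# --- Illustrative sketch (not part of the pipeline) --------------------------
# The multi-node split above boils down to a ceil division: the pathways
# are divided into ``num_nodes`` contiguous chunks and node ``node_id``
# takes its own slice. Minimal demo with made-up numbers:
def _demo_node_split(num_pathways=10, num_nodes=3, node_id=2):
    per_node = num_pathways // num_nodes
    if per_node * num_nodes < num_pathways:
        per_node += 1                            # ceil division
    start = node_id * per_node
    end = min((node_id + 1) * per_node, num_pathways)
    return start, end                            # node 2 of 3 gets (8, 10)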
def fit_auto_white(self, data):
    # Method to fit the FastICA model without the manual whitening
    logging_print("Use fastICA with auto whiten")
    # Use the sklearn implementation to perform FastICA including
    # the whitening step
    fastICA_object = FastICA(n_components=self.n_components,
                             algorithm="parallel",
                             fun='logcosh',
                             max_iter=500,
                             tol=1e-10)
    # Fit the model
    sources = fastICA_object.fit_transform(data)
    # Save the data
    self.projected_data = sources
    self.components = fastICA_object.components_
    logging_print(
        stats_dict_to_string(
            {"Number of used iterations": fastICA_object.n_iter_}))
def calculate_auc_values(self):
    cav_st_stime = time.time()
    auc_values = {}
    for pathway_id in self.pathway_gene_scores.columns:
        if pathway_id in list(self.matrix_data.columns):
            pathway_gene_set = self.matrix_data.loc[:, pathway_id]
            pathway_gene_scores = self.pathway_gene_scores.loc[
                self.matrix_data.index, pathway_id]
            try:
                # Use the boolean pathway membership (as 0/1 labels)
                # against the gene scores to build the ROC curve
                log_reg_fpr, log_reg_tpr, _ = roc_curve(
                    pathway_gene_set * 1, pathway_gene_scores)
                auc_values[pathway_id] = auc(log_reg_fpr, log_reg_tpr)
            except ValueError:
                logging_print(
                    "Value error in AUC calculation of pathway: {}".format(
                        pathway_id))
        else:
            logging_print("HPO term: {} not in matrix".format(pathway_id))
    auc_calc_values = pd.Series(auc_values)
    # Clamp numerical overshoot to the theoretical maximum of 1.0
    auc_calc_values[auc_calc_values > 1.0] = 1.0
    self.auc_values = auc_calc_values
    timer_print(cav_st_stime, prefix="AUC value calculation ready")
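# --- Illustrative sketch (not part of the pipeline) --------------------------
# A minimal demo of the AUC calculation above: boolean pathway membership is
# used as the label vector (multiplied by 1 to get 0/1 integers) and the
# gene scores as the ranking. The values are made up for illustration.
def _demo_pathway_auc():
    import numpy as np
    from sklearn.metrics import auc, roc_curve

    membership = np.array([True, True, False, False, False])
    gene_scores = np.array([2.5, 1.8, 0.3, 0.9, -0.2])

    fpr, tpr, _ = roc_curve(membership * 1, gene_scores)
    return auc(fpr, tpr)   # 1.0 here: member genes rank on top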
def read_files(self):
    rf_start_time = time.time()
    # Read the input component file
    if os.path.isfile(self.components_data_path):
        self.components_data = read_pd_df(self.components_data_path,
                                          {
                                              "sep": "\t",
                                              "index_col": 0
                                          },
                                          force=self.force)
        components_data_info = "Components dataframe n_row: {}, " \
                               "n_col: {}\n" \
                               "first column headers: {}\n" \
                               "first row index: {}".format(
                                   *self.components_data.shape,
                                   ", ".join(
                                       self.components_data.columns.values[:5]),
                                   ", ".join(
                                       self.components_data.index.values[:5]))
        logging_print(components_data_info)
    else:
        raise FileNotFoundError("Cannot find input file: {}".format(
            self.components_data_path))
    # Read the matrix file
    self.matrix_data = read_pd_df(self.matrix_path,
                                  {
                                      "sep": "\t",
                                      "index_col": 0
                                  },
                                  force=self.force,
                                  proc_df_before_save=lambda df: df == 1.0)
    matrix_data_info = "Matrix dataframe n_row: {}, " \
                       "n_col: {}\n" \
                       "first column headers: {}\n" \
                       "first row index: {}".format(
                           *self.matrix_data.shape,
                           ", ".join(self.matrix_data.columns.values[:5]),
                           ", ".join(self.matrix_data.index.values[:5]))
    logging_print(matrix_data_info)
    # Read the background gene file
    if self.background_genes_path is not None and \
            self.background_genes_path != '':
        self.background_genes_data = read_pd_df(self.background_genes_path,
                                                {
                                                    "sep": "\t",
                                                    "header": None
                                                },
                                                force=self.force)
        # Convert to a Series instead of a DataFrame
        self.background_genes_data = self.background_genes_data.iloc[:, 0]
        background_genes_data_info = "Background genes file loaded:\n" \
                                     "number of genes: {}, " \
                                     "first genes: {}\n".format(
                                         self.background_genes_data.shape[0],
                                         ", ".join(
                                             self.background_genes_data.values[:5]))
        logging_print(background_genes_data_info)
    timer_print(rf_start_time, prefix="Reading input file ready")
def perform_analysis(self):
    pa_start_time = time.time()
    total_z_score_results = []
    # Check if temp files from an earlier (interrupted) run were present
    temp_z_score_paths = glob.glob(
        os.path.join(self.output_dir,
                     "temp_results_analysis_z_scores_*.pkl"))
    temp_z_scores = None
    temp_already_processed_pathways = None
    if len(temp_z_score_paths) > 0:
        temp_z_scores = pd.read_pickle(temp_z_score_paths[0])
        logging_print(
            "Temp file '{}' with already processed pathways loaded. "
            "size df: {}".format(temp_z_score_paths[0],
                                 temp_z_scores.shape))
        temp_already_processed_pathways = list(
            temp_z_scores.columns.to_numpy())
    # Set up the task and result queues
    pathway_manager = mp.Manager()
    pathway_queue = pathway_manager.Queue()
    results_manager = mp.Manager()
    results_queue = results_manager.Queue()
    # Fill the task queue, skipping pathways that were already processed
    for index, row in self.matrix_data.items():
        if temp_z_scores is not None and \
                index in temp_already_processed_pathways:
            total_z_score_results.append(temp_z_scores.loc[:, index])
        else:
            pathway_queue.put(index)
    logging_print("total pathways already done: {}".format(
        len(total_z_score_results)))
    # Start the worker processes
    n_workers = self.n_cores - 1
    processes = []
    for _ in range(n_workers):
        processes.append(
            mp.Process(target=single_pathway_worker,
                       args=(self.components_data, self.matrix_data,
                             pathway_queue, results_queue,
                             self.background_genes_data,
                             self.analysis_type, -1,
                             self.components_data_permutated)))
    for process in processes:
        process.start()
    total_done = 0
    last_save_time = time.time()
    save_time_in_minutes = 2
    z_score_file_path = None
    while True:
        try:
            if total_done >= n_workers:
                # All workers are ready
                break
            if results_queue.empty():
                # Wait for the next results
                time.sleep(5)
            else:
                # Process the results
                results = results_queue.get(True, timeout=1)
                if isinstance(results, str) and results == "DONE":
                    # One worker is done
                    total_done += 1
                    logging_print("total workers done {} of {}".format(
                        total_done, n_workers))
                else:
                    # Save the results
                    total_z_score_results.append(results)
                    # Save intermediate results every few minutes, so an
                    # interrupted run can be resumed
                    if last_save_time + 60 * save_time_in_minutes < \
                            time.time():
                        logging_print("save temp results: {}".format(
                            datetime.now()))
                        last_save_time = time.time()
                        temp_dataframe_z_scores = pd.DataFrame(
                            total_z_score_results).T
                        new_z_score_file_path = os.path.join(
                            self.output_dir,
                            "temp_results_analysis_z_scores_{}.pkl".format(
                                datetime.now()))
                        temp_dataframe_z_scores.to_pickle(
                            new_z_score_file_path)
                        if z_score_file_path is not None and os.path.isfile(
                                z_score_file_path):
                            os.remove(z_score_file_path)
                        z_score_file_path = new_z_score_file_path
        except queue.Empty:
            time.sleep(1)
            continue
    for process in processes:
        process.join()
    logging_print("all processes ready")
    self.pathway_gene_scores = pd.DataFrame(total_z_score_results).T
    pathway_gene_scores_temp_file_path = os.path.join(
        self.output_dir,
        "temp_pathway_gene_scores_temp_{}.pkl".format(datetime.now()))
    self.pathway_gene_scores.to_pickle(pathway_gene_scores_temp_file_path)
    timer_print(pa_start_time, prefix="Performing analysis ready")
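# --- Illustrative sketch (not part of the pipeline) --------------------------
# The producer/consumer pattern used in ``perform_analysis``, reduced to its
# core: a task queue filled up front, workers that push results plus a final
# "DONE" sentinel, and a collector loop that stops once every worker has
# reported in. The squaring task is a made-up stand-in for the per-pathway
# scoring done by ``single_pathway_worker``; the sketch reuses the module's
# existing ``mp`` and ``queue`` imports.
def _demo_worker(task_queue, results_queue):
    while True:
        try:
            item = task_queue.get(timeout=1)
        except queue.Empty:
            break
        results_queue.put(item * item)
    results_queue.put("DONE")

def _demo_queue_pattern(n_workers=2):
    manager = mp.Manager()
    task_queue, results_queue = manager.Queue(), manager.Queue()
    for item in range(10):
        task_queue.put(item)

    processes = [mp.Process(target=_demo_worker,
                            args=(task_queue, results_queue))
                 for _ in range(n_workers)]
    for process in processes:
        process.start()

    results, workers_done = [], 0
    while workers_done < n_workers:
        item = results_queue.get()
        if isinstance(item, str) and item == "DONE":
            workers_done += 1
        else:
            results.append(item)

    for process in processes:
        process.join()
    return sorted(results)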
def __init__(self,
             components_data_path,
             matrix_path,
             output_dir,
             analysis_type,
             minimal_number_of_genes=None,
             background_genes_path=None,
             n_cores=None,
             split_start=None,
             split_end=None,
             multi_node_node_id=None,
             multi_node_num_nodes=None,
             multi_node_output_dir=None,
             force=False,
             output_disable_txt=False,
             output_disable_gzip=False,
             output_disable_pickle=False):
    self.start_time = time.time()
    self.components_data_path = components_data_path
    self.output_dir = output_dir
    self.analysis_type = analysis_type
    self.matrix_path = matrix_path
    self.background_genes_path = background_genes_path
    self.minimal_number_of_genes = -1
    if minimal_number_of_genes is not None and minimal_number_of_genes != "":
        self.minimal_number_of_genes = int(minimal_number_of_genes)
        if self.minimal_number_of_genes < 3:
            self.minimal_number_of_genes = 3
            logging_print(
                "Minimal number of genes must be 3 or higher, "
                "so the value is set to 3")
    # For selecting and multiprocessing
    self.split_start = split_start
    self.split_end = split_end
    self.multi_node_node_id = multi_node_node_id
    self.multi_node_num_nodes = multi_node_num_nodes
    self.multi_node_output_dir = multi_node_output_dir
    self.n_cores = mp.cpu_count()
    if n_cores is not None and n_cores != "":
        self.n_cores = int(n_cores)
    self.components_data = None
    self.background_genes_data = None
    self.matrix_data = None
    self.components_data_permutated = None
    # Results
    self.gene_pathway_count = None
    self.pathway_gene_scores = None
    self.auc_values = None
    self.p_values = None
    self.bonf_p_values = None
    self.reject = None
    self.force = force
    self.method_stats = {}
    # Output
    self.output_disable_txt = output_disable_txt
    self.output_disable_gzip = output_disable_gzip
    self.output_disable_pickle = output_disable_pickle
def merge_output_files(self):
    mof_start_time = time.time()
    if self.multi_node_output_dir:
        node_output_auc_pred_files = glob.glob(
            os.path.join(self.multi_node_output_dir, "*",
                         "predictions_auc.pkl"))
        # Only merge once the output of every node is present
        if len(node_output_auc_pred_files) == self.multi_node_num_nodes:
            logging_print("Merge output files")
            # Merge the AUC prediction files
            comp_df_auc_list = []
            for node_output_auc_file_path in node_output_auc_pred_files:
                file_auc_df = pd.read_pickle(node_output_auc_file_path)
                comp_df_auc_list.append(file_auc_df)
            overall_auc_df = pd.concat(comp_df_auc_list)
            # Bonferroni correction: multiply by the number of tests
            # and clip to the [0, 1] interval
            bonf_values = overall_auc_df["pValue"] * overall_auc_df.shape[0]
            bonf_values[bonf_values > 1] = 1
            bonf_values[bonf_values < 0] = 0
            overall_auc_df["bonferroni"] = bonf_values
            if not self.output_disable_txt:
                output_file_path_csv = os.path.join(
                    self.multi_node_output_dir,
                    "predictions_auc_bonf.txt.gz")
                if self.output_disable_gzip:
                    output_file_path_csv = os.path.join(
                        self.multi_node_output_dir,
                        "predictions_auc_bonf.txt")
                overall_auc_df.to_csv(
                    output_file_path_csv,
                    columns=["geneCount", "pValue", "auc", "bonferroni"],
                    sep='\t')
            if not self.output_disable_pickle:
                overall_auc_df.to_pickle(
                    os.path.join(self.multi_node_output_dir,
                                 "predictions_auc_bonf.pkl"))
            # Merge the gene pathway score files
            node_output_gene_pathway_pred_files = glob.glob(
                os.path.join(self.multi_node_output_dir, "*",
                             "gene_pathway_scores.pkl"))
            comp_df_gene_pathway_list = []
            for node_output_auc_file_path in \
                    node_output_gene_pathway_pred_files:
                file_gene_pathway_df = pd.read_pickle(
                    node_output_auc_file_path)
                comp_df_gene_pathway_list.append(file_gene_pathway_df)
            overall_gene_pathway_df = pd.concat(comp_df_gene_pathway_list,
                                                axis=1)
            if not self.output_disable_txt:
                output_gene_pathway_pred_path = os.path.join(
                    self.multi_node_output_dir,
                    "gene_pathway_scores.txt.gz")
                if self.output_disable_gzip:
                    output_gene_pathway_pred_path = os.path.join(
                        self.multi_node_output_dir,
                        "gene_pathway_scores.txt")
                overall_gene_pathway_df.to_csv(
                    output_gene_pathway_pred_path, sep='\t')
            if not self.output_disable_pickle:
                overall_gene_pathway_df.to_pickle(
                    os.path.join(self.multi_node_output_dir,
                                 "gene_pathway_scores.pkl"))
    timer_print(mof_start_time, prefix="Writing merged outputfile ready")
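# --- Illustrative sketch (not part of the pipeline) --------------------------
# The merge step above concatenates the per-node prediction tables and then
# applies a Bonferroni correction: each p value is multiplied by the total
# number of tests and clipped to [0, 1]. A minimal demo on made-up frames:
def _demo_merge_bonferroni():
    import pandas as pd

    node_1 = pd.DataFrame({"pValue": [0.001, 0.2]}, index=["pw_a", "pw_b"])
    node_2 = pd.DataFrame({"pValue": [0.04]}, index=["pw_c"])

    merged = pd.concat([node_1, node_2])
    merged["bonferroni"] = (merged["pValue"] * merged.shape[0]).clip(0, 1)
    return merged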