def loadData(self, input_dir, name_run, script_dir, data_type, data_type_lo,
             delTmax, delTmin, tau, tfa_bool, timehorizon, percent_LO_points,
             num_ets_lo, time_step, thres_coeff_var, prior_type, prior_file):
    """Load and preprocess the dataset, split off leave-out points, and
    return the training/test matrices.

    Steps, in order:

    1. Build a ``Preprocess`` object and, if the meta data contains
       duplicated condition names, write de-duplicated copies of the meta
       data ("meta_data_uniq.tsv") and expression ("expression_new.tsv")
       files and point the preprocessor at them.
    2. Let the preprocessor load the data (``get_data``) and compute the
       design/response matrices (``compute_common_data``).
    3. Optionally (``self.parse_4dyng3``) dump the data in dynGenie3 format
       ("TS_data.pkl", "SS_data.txt").
    4. Choose the leave-out time-series and/or steady-state points and move
       them from the design/response matrices into ``pps.leave_out_*``.
    5. Build the predictor matrix (TF activity or plain TF expression).
    6. Create the output directory, save outdegree histograms and the
       priors file, and, when ``self.flag_print`` is set, write
       "_preprocessing.txt" with a summary.

    NOTE(review): the statement nesting of this method was reconstructed
    from a copy of the source whose indentation had been lost; the scoping
    of diagnostic/logging statements should be confirmed against the
    upstream file.

    Parameters
    ----------
    input_dir : str
        Stored on the preprocessor; priors are written to
        ``input_dir + "/priors/"``.
    name_run : str
        Run name; used in the output directory name and the log.
    script_dir : str
        Base directory; output goes to ``script_dir + "/output/..."``.
    data_type : str
        Compared against "TS", "SS" and "TS-SS" to decide which conditions
        are used for training.
    data_type_lo : str
        Compared against "TS", "SS" and "TS-SS"; only consulted when
        ``data_type == "TS-SS"`` to decide which kind of points is left out.
    delTmax, delTmin, tau
        Stored unchanged on the preprocessor.
    tfa_bool : bool
        If true the predictors come from
        ``pps.compute_transcription_factor_activity``; otherwise the design
        matrix rows of the TFs are used directly.
    timehorizon
        Forwarded to the time-series leave-out selection helpers.
    percent_LO_points
        Forwarded to the leave-out selection helpers.
    num_ets_lo : int
        If > 0, ``choose_LO_timeseries_random_withTimehorizon`` is used,
        otherwise
        ``choose_timeseries_LO_lastPoints_random_withTimehorizon``.
    time_step
        Forwarded to ``pps.compute_common_data``.
    thres_coeff_var
        Forwarded to ``pps.get_data``.
    prior_type : str
        When equal to "binary_all", prior statistics are logged/plotted and
        the priors are written to disk.
    prior_file : str
        Priors file name (stored on the preprocessor and used as the
        output name under ``input_dir + "/priors/"``).

    Returns
    -------
    tuple
        ``(X, y, genelist, tf_names, goldstandard, output_path,
        pps.priors_data, X_test_ss, X_test_ts, y_test_ss, y_test_ts,
        x_test_ts_current_timepoint, y_test_ts_future_timepoint, deltas,
        x_test_ts_timepoint0, index_steady_state_new,
        index_time_points_new, pps.design, pps.delta_vect, pps.res_mat2)``.
        Test-set entries that do not apply to the selected data types are
        the empty string ``""``.

    Raises
    ------
    ValueError
        If the number of repaired ``prevCol`` entries or of duplicated
        expression columns does not match the number of duplicated
        conditions in the meta data.
    """
    str_output = ""
    uniq_dups = []
    np.random.seed(self.rnd_seed)

    pps = Preprocess(self.rnd_seed)
    pps.delTmax = delTmax
    pps.delTmin = delTmin
    pps.tau = tau
    pps.input_dir = input_dir
    pps.str_output = str_output
    pps.flag_print = self.flag_print
    pps.priors_file = prior_file

    # The same two predicates gate every leave-out step below.
    leave_out_ts = data_type == "TS" or (
        data_type == "TS-SS" and data_type_lo in ("TS-SS", "TS"))
    leave_out_ss = data_type == "SS" or (
        data_type == "TS-SS" and data_type_lo in ("TS-SS", "SS"))

    # ------------------------------------------------------------------
    # If conditions have duplicated names, write a meta data file called
    # "meta_data_uniq.tsv" with unique condition names, plus a matching
    # expression file.
    # ------------------------------------------------------------------
    metadata_1 = pps.input_dataframe(pps.meta_data_file,
                                     has_index=False,
                                     strict=False)
    duplicated_conds = metadata_1.condName[metadata_1.condName.duplicated(
        keep=False)]
    num_dups_conds = len(duplicated_conds)
    if num_dups_conds > 0:
        uniq_dups = duplicated_conds.unique()
        num_uniq_dups = len(uniq_dups)
        if self.flag_print:
            print("name of duplicated conds in meta data: ", num_dups_conds)
            print("number of unique in dups conds", num_uniq_dups)

        # Second and later occurrences of a name get the suffix "repet<k>";
        # the first occurrence keeps its name.
        metadata_1.set_index(['condName'], inplace=True)
        metadata_1_series = metadata_1.groupby(level=0).cumcount()
        metadata_1_series = "repet" + metadata_1_series.astype(str)
        metadata_1.index = metadata_1.index + metadata_1_series.replace(
            'repet0', '')

        # Fix the prevCol entries that still point at a pre-rename name:
        # every middle/last time point must reference the row just before.
        metadata_copy = metadata_1.copy()
        name_prev_cond = np.nan
        count = 0
        ts_rows = metadata_1[metadata_1.isTs == True]  # noqa: E712 (elementwise)
        for index, row in ts_rows.iterrows():
            if row['is1stLast'] in ('m', 'l'):
                if row['prevCol'] != name_prev_cond:
                    if self.flag_print:
                        print(index, row)
                    metadata_copy.at[index, 'prevCol'] = name_prev_cond
                    count += 1
            name_prev_cond = index
        if self.flag_print:
            print(count)
        if count != num_dups_conds - num_uniq_dups:
            raise ValueError('Wrong meta data format')

        metadata_copy.reset_index(inplace=True)
        metadata_copy.columns = [
            'condName', 'isTs', 'is1stLast', 'prevCol', 'del.t'
        ]
        cols = ['isTs', 'is1stLast', 'prevCol', 'del.t', 'condName']
        metadata_copy = metadata_copy[cols]
        pps.meta_data_file = "meta_data_uniq.tsv"
        path_file = pps.input_path(pps.meta_data_file)
        metadata_copy.to_csv(path_file, sep="\t", index=False, na_rep='NA')

        # Add the renamed duplicates to the expression file as copies of
        # the original column; the leave-out section relies on this.
        expression_1 = pps.input_dataframe(pps.expression_matrix_file,
                                           has_index=False,
                                           strict=False)
        count = 0
        for ud in uniq_dups:
            # Condition names may contain regex metacharacters (e.g. "+"),
            # so the name has to be escaped before building the pattern.
            pattern = re.compile(re.escape(ud) + r"repet\d")
            for cond_tmp in metadata_copy.condName:
                if pattern.match(cond_tmp):
                    expression_1[cond_tmp] = expression_1[ud]
                    count += 1
        if count != num_dups_conds - num_uniq_dups:
            raise ValueError('Wrong expression/meta_data format')

        # The first column header is blanked before the file is written.
        col_arr = np.asarray(expression_1.columns[1:])
        expression_1.columns = np.insert(col_arr, 0, "")
        pps.expression_matrix_file = "expression_new.tsv"
        path_file = pps.input_path(pps.expression_matrix_file)
        expression_1.to_csv(path_file, sep="\t", index=False, na_rep='NA')
    # End of the unique meta data / expression file section.

    str_output = pps.get_data(thres_coeff_var, str_output, prior_type)
    pps.compute_common_data(uniq_dups, time_step)

    # ------------------------------------------------------------------
    # Leave-out data
    # ------------------------------------------------------------------
    # Example values (taken from the original author's notes):
    #   TS_vectors: [OrderedDict([('S0_1', 0), ('S1_1', 60.0), ...]), ...]
    #   steady_state_cond: array(['LBexp_1', 'LBexp_2', ...])
    #   num_total_timeseries_points: 163
    (TS_vectors, steady_state_cond, _index_steady_state,
     num_total_timeseries_points) = self.readDatasetFromMetaDataFile(
         pps.meta_data)

    # Parse data to dynGenie3 format in case parse_4dyng3 is set.
    if self.parse_4dyng3:
        print("Start parsing data to dynGenie3 format")
        TS_data = []
        time_points = []
        genes = np.asarray(pps.expression_matrix.index.tolist()).astype(
            str).tolist()
        alphas = np.asarray([0.02] * len(genes)).astype(float).tolist()

        for ts_tmp in TS_vectors:  # one time series per iteration
            ts_tmp_vect = list(ts_tmp.keys())
            ts_dynGenie3 = np.transpose(
                pps.expression_matrix.loc[:, ts_tmp_vect])
            TS_data.append(np.asarray(ts_dynGenie3))
            # np.float was removed from NumPy; the builtin is equivalent.
            time_points.append(
                np.array([float(ts_tmp[key]) for key in ts_tmp_vect],
                         dtype=float))

        SS_data = np.transpose(pps.expression_matrix[steady_state_cond])
        TFs = np.asarray(pps.tf_names).astype(str).tolist()

        path_file = pps.input_path("TS_data.pkl")
        with open(path_file, 'wb') as f:
            pickle.dump([TS_data, time_points, genes, TFs, alphas], f)

        path_file = pps.input_path("SS_data.txt")
        SS_data.to_csv(path_file, sep="\t", index=False, na_rep='NA')
        print("End parsing data to dynGenie3 format")

    # Choose the leave-out time-series points.
    # Example values (from the original author's notes):
    #   timeseries_indices_lo: array([31, 15, 26, 17])
    #   ts_lopoints_x: OrderedDict([('MG+90_2', 95.0), ('SMM_1', 0), ...])
    #   ts_lopoints_y: OrderedDict([('MG+120_2', 125.0), ('Salt_1', 10.0), ...])
    t0_lopoints = None
    if leave_out_ts:
        if num_ets_lo > 0:
            # This helper returns no t0 points; t0_lopoints stays None so
            # the return section can fall back to a placeholder instead of
            # failing on an undefined name.
            (ts_lopoints_x, ts_lopoints_y, timeseries_indices_lo
             ) = self.choose_LO_timeseries_random_withTimehorizon(
                 num_ets_lo, TS_vectors, timehorizon)
        else:
            (ts_lopoints_x, ts_lopoints_y, t0_lopoints, timeseries_indices_lo
             ) = self.choose_timeseries_LO_lastPoints_random_withTimehorizon(
                 percent_LO_points, num_total_timeseries_points, TS_vectors,
                 timehorizon)
        ts_lo_conds = list(ts_lopoints_x.keys())

    # Choose the leave-out steady-state points.
    # Example values (from the original author's notes):
    #   ss_lo_cond_names: array(['H2O2_1', 'LBGexp_2', 'LBtran_2', ...])
    #   ss_lo_indices: array([100, 10, 4, 81, 97, 65, ...])
    if leave_out_ss:
        ss_lo_cond_names = np.asarray([])
        ss_lo_indices = np.asarray([])
        if len(steady_state_cond) > 0:
            ss_lo_cond_names, ss_lo_indices = (
                self.choose_steadystate_LO_points_random(
                    percent_LO_points, steady_state_cond))

    if self.flag_print:
        print("Shape of design var before leaving-out data: ",
              str(pps.design.shape))
        print("Shape of response var before leaving-out data: ",
              str(pps.response.shape))
    str_output += ("Shape of design var before leaving-out data: " +
                   str(pps.design.shape) + "\n")
    str_output += ("Shape of response var before leaving-out data: " +
                   str(pps.response.shape) + "\n")

    # Before splitting into training and test, restrict to SS only or TS
    # only if requested.
    if data_type == "SS":
        str_output += ("::::::::STEADY-STATE ONLY - LOOK AT JUST THE SHAPES "
                       "OF DESIGN AND RESPONSE VARIABLES" + "\n")
        only_steady_state_indxes = pps.design.columns.isin(steady_state_cond)
        pps.design = pps.design.loc[:, only_steady_state_indxes]
        pps.response = pps.response.loc[:, only_steady_state_indxes]
        pps.half_tau_response = pps.half_tau_response.loc[
            :, only_steady_state_indxes]
        pps.delta_vect = pps.delta_vect.loc[
            :, pps.delta_vect.columns.isin(steady_state_cond)]
    if data_type == "TS":
        str_output += ("::::::::TIME-SERIES ONLY - LOOK AT JUST THE SHAPES "
                       "OF DESIGN AND RESPONSE VARIABLES" + "\n")
        pps.design.drop(steady_state_cond, axis=1, inplace=True)
        pps.response.drop(steady_state_cond, axis=1, inplace=True)
        pps.half_tau_response.drop(steady_state_cond, axis=1, inplace=True)
        pps.delta_vect.drop(steady_state_cond, axis=1, inplace=True)

    if leave_out_ss:
        # Move the chosen steady-state points out of the training matrices.
        pps.leave_out_ss_design = pps.design[ss_lo_cond_names]
        pps.design.drop(ss_lo_cond_names, axis=1, inplace=True)
        pps.leave_out_ss_response = pps.response[ss_lo_cond_names]
        pps.response.drop(ss_lo_cond_names, axis=1, inplace=True)
        pps.half_tau_response.drop(ss_lo_cond_names, axis=1, inplace=True)
        if self.flag_print:
            print("Shape of leave out SS design var: ",
                  pps.leave_out_ss_design.shape)
            print("Shape of leave out SS response var: ",
                  pps.leave_out_ss_response.shape)
        pps.delta_vect.drop(ss_lo_cond_names, axis=1, inplace=True)

    if leave_out_ts:
        # Move the chosen time-series points out of the training matrices.
        pps.leave_out_ts_design = pps.design[ts_lo_conds]
        pps.design.drop(ts_lo_conds, axis=1, inplace=True)
        pps.leave_out_ts_response = pps.response[ts_lo_conds]
        pps.response.drop(ts_lo_conds, axis=1, inplace=True)
        pps.half_tau_response.drop(ts_lo_conds, axis=1, inplace=True)
        if self.flag_print:
            print("Shape of leave out TS design var: ",
                  pps.leave_out_ts_design.shape)
            print("Shape of leave out TS response var: ",
                  pps.leave_out_ts_response.shape)
        pps.delta_vect.drop(ts_lo_conds, axis=1, inplace=True)

    if self.flag_print:
        print("Shape of design var after leaving-out data: ",
              pps.design.shape)
        print("Shape of response var after leaving-out data: ",
              pps.response.shape)
    str_output += ("Shape of design var after leaving-out data: " +
                   str(pps.design.shape) + "\n")
    str_output += ("Shape of response var after leaving-out data: " +
                   str(pps.response.shape) + "\n")
    if leave_out_ss:
        str_output += ("Shape of leave out SS design var: " +
                       str(pps.leave_out_ss_design.shape) + "\n")
        str_output += ("Shape of leave out SS response var: " +
                       str(pps.leave_out_ss_response.shape) + "\n")
    if leave_out_ts:
        str_output += ("Shape of leave out TS design var: " +
                       str(pps.leave_out_ts_design.shape) + "\n")
        str_output += ("Shape of leave out TS response var: " +
                       str(pps.leave_out_ts_response.shape) + "\n")
    # End of the leave-out section.

    # Column positions (in the training design matrix) of the remaining
    # steady-state and time-series conditions.
    if leave_out_ss:
        steady_state_cond_new = list(steady_state_cond.copy())
        for element in ss_lo_cond_names:
            steady_state_cond_new.remove(element)
    else:
        steady_state_cond_new = steady_state_cond

    index_steady_state_new = []
    indexes_all = list(range(0, len(pps.design.columns)))
    if data_type == "SS" or data_type == "TS-SS":
        index_steady_state_new = np.asarray([
            pps.design.columns.get_loc(element)
            for element in steady_state_cond_new
        ])

    index_time_points_new = []
    if data_type == "TS" or data_type == "TS-SS":
        index_time_points_new = np.asarray(
            list(set(indexes_all) - set(index_steady_state_new)))

    # Edge counts of prior and gold standard.
    if prior_type == "binary_all":
        num_edges_prior = np.sum(pps.priors_data.values != 0)
    num_edges_gs = np.sum(pps.gold_standard.values != 0)
    if self.flag_print:
        if prior_type == "binary_all":
            print("Number of edges in the prior: ", num_edges_prior,
                  pps.priors_data.shape)
        print("Number of edges in the evaluation part of the gold standard: ",
              num_edges_gs, pps.gold_standard.shape)
    if prior_type == "binary_all":
        str_output += ("Number of edges in the prior: " +
                       str(num_edges_prior) + str(pps.priors_data.shape) +
                       "\n")
    str_output += (
        "Number of edges in the evaluation part of the gold standard: " +
        str(num_edges_gs) + str(pps.gold_standard.shape) + "\n")

    # Predictor matrix: TF activity, or plain expression of the TFs.
    tfs = list(set(pps.tf_names).intersection(pps.expression_matrix.index))
    if tfa_bool:
        if self.flag_print:
            print('Computing Transcription Factor Activity ... ')
        pps.activity = pps.compute_transcription_factor_activity(tfs)
    else:
        if self.flag_print:
            print('Using just expression, NO Transcription Factor Activity')
        expression_matrix = pps.design
        activity = pd.DataFrame(expression_matrix.loc[tfs, :].values,
                                index=tfs,
                                columns=expression_matrix.columns)
        if self.flag_print:
            print(('Design matrix of shape: {}'.format(activity.shape)))
        pps.activity = activity

    tf_names = pps.activity.index.tolist()

    # Restrict the leave-out design matrices to the TF rows, in the same
    # row order as the training predictors.
    if leave_out_ss:
        expression_matrix_lo_ss = pps.leave_out_ss_design
        pps.leave_out_ss_design = pd.DataFrame(
            expression_matrix_lo_ss.loc[tf_names, :].values,
            index=tf_names,
            columns=expression_matrix_lo_ss.columns)
    if leave_out_ts:
        expression_matrix_lo_ts = pps.leave_out_ts_design
        pps.leave_out_ts_design = pd.DataFrame(
            expression_matrix_lo_ts.loc[tf_names, :].values,
            index=tf_names,
            columns=expression_matrix_lo_ts.columns)

    expression = pps.expression_matrix  # the initial, unsplit matrix
    goldstandard = pps.gold_standard
    genelist = pps.response.index.tolist()
    numtfs = len(tf_names)

    X = pps.activity.transpose().values  # X [n_samples, n_features]
    y = pps.response.transpose().values  # y [n_samples, num_genes]

    if self.flag_print:
        print("Shape of design var X: " + str(X.shape))
        print("Shape of response var Y: " + str(y.shape))
    str_output += "Shape of design var X: " + str(X.shape) + "\n"
    str_output += "Shape of response var Y: " + str(y.shape) + "\n"

    if self.flag_print:
        print("X False", np.any(np.isnan(X)))
        print("X True", np.all(np.isfinite(X)))
        print("y False", np.any(np.isnan(y)))
        print("y True", np.all(np.isfinite(y)))

    X = np.float64(X)
    y = np.float64(y)

    output_path = (script_dir + "/output/" + name_run + "_numgenes" +
                   str(len(genelist)) + "_numtfs" + str(numtfs))
    os.makedirs(output_path, exist_ok=True)

    if prior_type == "binary_all":
        os.makedirs(input_dir + "/priors", exist_ok=True)

        # Histogram of the number of targets per TF in the prior.
        priors_data_tmp = np.abs(pps.priors_data)
        prior_col_sums = priors_data_tmp.sum(axis=0)
        index_tmp = prior_col_sums != 0
        prior_outdegrees = prior_col_sums[index_tmp]
        prior_num_tfs = np.sum(index_tmp)
        # np.int was removed from NumPy; the builtin is equivalent.
        max_outdegree = int(np.max(prior_outdegrees))

        out_prior_tfs_outdegrees = (
            "Num of TFs in prior: " + str(prior_num_tfs) +
            " Mean and var of targets for TFs in prior: " +
            str(np.mean(prior_outdegrees)) + " , " +
            str(np.std(prior_outdegrees)))
        str_output += out_prior_tfs_outdegrees + "\n"

        ax = prior_outdegrees.plot(kind="hist",
                                   bins=list(range(0, max_outdegree + 1)))
        ax.set_title("Prior outdegrees distribution")
        ax.set_xlabel("outdegree of TFs ( i.e. TFs num of targets)")
        if self.flag_print:
            plt.savefig(output_path +
                        "/Prior outdegrees distribution_numTFs" +
                        str(prior_num_tfs) + "_numEdges" +
                        str(num_edges_prior))
        # Always close so the next plot starts on a fresh figure.
        plt.close()

    # Histogram of the number of targets per TF in the evaluation gold
    # standard.
    gold_standard_tmp = np.abs(pps.gold_standard)
    gs_col_sums = gold_standard_tmp.sum(axis=0)
    index_tmp2 = gs_col_sums != 0
    gs_outdegrees = gs_col_sums[index_tmp2]
    gs_num_tfs = np.sum(index_tmp2)
    max_outdegree2 = int(np.max(gs_outdegrees))

    out_gs_tfs_outdegrees = (
        "Num of TFs in eval gold standard: " + str(gs_num_tfs) +
        " Mean and var of targets for TFs in eval GS: " +
        str(np.mean(gs_outdegrees)) + " , " + str(np.std(gs_outdegrees)))
    str_output += out_gs_tfs_outdegrees + "\n"

    ax1 = gs_outdegrees.plot(kind="hist",
                             bins=list(range(0, max_outdegree2 + 1)))
    ax1.set_title("Eval Gold standard outdegrees distribution")
    ax1.set_xlabel("outdegree of TFs ( i.e. TFs num of targets)")
    if self.flag_print:
        plt.savefig(output_path +
                    "/Eval Gold standard outdegrees distribution_numTFs" +
                    str(gs_num_tfs) + "_numEdges" + str(num_edges_gs))
    plt.close()

    if prior_type == "binary_all":
        # Write gold standard priors to file.
        pps.priors_data.to_csv(input_dir + "/priors/" + prior_file, sep="\t")

    # ------------------------------------------------------------------
    # Summary, printed and written to "_preprocessing.txt".
    # ------------------------------------------------------------------
    if self.flag_print:

        def percent(part, total):
            # An empty denominator (e.g. no steady-state conditions) is
            # reported as nan instead of raising ZeroDivisionError.
            return 100 * float(part) / total if total else float("nan")

        with open(output_path + "/_preprocessing.txt", 'w') as outfile:

            def report(label, *values):
                # Echo one summary line to stdout and to the log file.
                print(label, *values)
                outfile.write(label + "".join(str(v) for v in values) + "\n")

            outfile.write("Run name: " + str(name_run) + "\n")
            outfile.write(str_output)

            if leave_out_ss and len(steady_state_cond) > 0:
                report("Leave-out points for steady state: ",
                       ss_lo_cond_names, ss_lo_indices)
            if leave_out_ts:
                report("Leave-out points for timeseries: ", ts_lopoints_x,
                       ts_lopoints_y, timeseries_indices_lo)

            report("Expression dim: ", expression.shape)
            report("Num of tfs: ", len(tf_names))
            report("Num of genes: ", len(genelist))
            if prior_type == "binary_all":
                report("Priors dim: ", pps.priors_data.shape)
            report("Goldstandard dim: ", goldstandard.shape)

            report("The number of genes is: ", len(genelist))
            report("The number of TFs is: ", len(tf_names))
            report("The total Number of data points in the dataset is: ",
                   len(pps.meta_data))
            report("The total number of time series is: ", len(TS_vectors))
            report("The number of total time points is: ",
                   num_total_timeseries_points)
            report("The number of total steady state points is: ",
                   len(steady_state_cond))
            if leave_out_ss:
                report("The percentage of leave-out steady state points is: ",
                       percent(len(ss_lo_indices), len(steady_state_cond)))
            if leave_out_ts:
                report("The percentage of leave-out time series points is: ",
                       percent(len(timeseries_indices_lo),
                               num_total_timeseries_points))

    # ------------------------------------------------------------------
    # Test matrices for the leave-out points ("" when not applicable).
    # ------------------------------------------------------------------
    if leave_out_ss:
        X_test_ss = pps.leave_out_ss_design.transpose().values
        y_test_ss = pps.leave_out_ss_response.transpose().values
    else:
        X_test_ss = ""
        y_test_ss = ""

    deltas = []
    if leave_out_ts:
        X_test_ts = pps.leave_out_ts_design.transpose().values
        y_test_ts = pps.leave_out_ts_response.transpose().values
        ts_lopoints_y_keys = list(ts_lopoints_y.keys())
        # Time gap between each left-out point and the point preceding it;
        # the two mappings are paired by position.
        deltas = [
            t_future - t_current for t_current, t_future in zip(
                ts_lopoints_x.values(), ts_lopoints_y.values())
        ]
        y_test_ts_future_timepoint = pps.expression_matrix.loc[
            genelist, ts_lopoints_y_keys].transpose().values
        x_test_ts_current_timepoint = pps.expression_matrix.loc[
            genelist, ts_lo_conds].transpose().values
        if t0_lopoints is not None:
            x_test_ts_timepoint0 = pps.expression_matrix.loc[
                genelist, list(t0_lopoints.keys())].transpose().values
        else:
            # NOTE(review): the num_ets_lo > 0 selection path provides no
            # t0 points; the same "" placeholder as the non-TS case is
            # returned. Confirm callers do not need t0 data on this path.
            x_test_ts_timepoint0 = ""
    else:
        X_test_ts = ""
        y_test_ts = ""
        y_test_ts_future_timepoint = ""
        x_test_ts_current_timepoint = ""
        x_test_ts_timepoint0 = ""

    return (X, y, genelist, tf_names, goldstandard, output_path,
            pps.priors_data, X_test_ss, X_test_ts, y_test_ss, y_test_ts,
            x_test_ts_current_timepoint, y_test_ts_future_timepoint, deltas,
            x_test_ts_timepoint0, index_steady_state_new,
            index_time_points_new, pps.design, pps.delta_vect, pps.res_mat2)