def mbgcbuild(prot_alignment, prot_family_name, cohort_name,
              nucl_seq_directory, prot_seq_directory, seq_fmt, pair_fmt,
              r1_file_suffix, r2_file_suffix, tp_genes_nucl,
              blast_db_directory, blastn_search_directory,
              hmm_search_directory, f1_thresh, output_directory, cpu):
    startTime = time.time()
    if cpu is not None:
        CPU_THREADS = int(cpu)

    # Setup paths
    build_op_dir = output_directory + os.sep + "build"
    hmm_directory = os.path.join(build_op_dir, 'spHMMs')
    tp_genes_prot = build_op_dir + os.sep + "TPGenes.faa"
    alnOutput = os.path.join(build_op_dir, "TP_Homolog_Alignment.afa")
    gene_pos_file = os.path.join(build_op_dir, 'Gene_Interval_Pos.txt')
    gene_pos_file_aa = os.path.join(build_op_dir, 'Gene_Interval_Pos_AA.txt')
    if hmm_search_directory is None:
        hmm_search_directory = os.path.join(build_op_dir, 'hmm_result')
    allHMMResult = os.path.join(build_op_dir, "CombinedHmmSearch.txt")
    if blastn_search_directory is None:
        blastn_search_directory = os.path.join(build_op_dir, 'blastn_result')
    allBLASTResult = os.path.join(build_op_dir, "CombinedBLASTSearch.txt")

    # Create output dirs
    os.makedirs(hmm_directory, 0o777, True)

    # Translate protein sequences
    runTranSeq(tp_genes_nucl, "1", tp_genes_prot)

    # Join true positives in the sample with the BGC proteins
    tmpFile = os.path.join(build_op_dir, "TP_Homolog.faa")
    joinedSeqs = []
    tpGeneSeqs = list(SeqIO.parse(tp_genes_prot, "fasta"))
    # Remove the "_1" suffix added by TranSeq
    for seq in tpGeneSeqs:
        seq.id = seq.id[:-2]
        seq.description = ""
        joinedSeqs.append(seq)
    SeqIO.write(joinedSeqs, tp_genes_prot, "fasta")
    protAlnSeqs = list(SeqIO.parse(prot_alignment, "fasta"))
    for seq in protAlnSeqs:
        joinedSeqs.append(seq)
    SeqIO.write(joinedSeqs, tmpFile, "fasta")

    # MUSCLE-align the TP genes with the markers
    runMUSCLE(tmpFile, alnOutput)

    # Generate spHMMs and interval positions:
    # extract the spHMM coordinates from the MUSCLE alignment
    hmmDict = gensphmmfiles(prot_family_name, alnOutput, tp_genes_prot,
                            hmm_directory, gene_pos_file, gene_pos_file_aa)

    if r1_file_suffix is None:
        r1_file_suffix = ""
    if r2_file_suffix is None:
        r2_file_suffix = ""

    # Preprocess synthetic reads
    nucl_seq_directory = PreProcessReadsPar(nucl_seq_directory, seq_fmt,
                                            pair_fmt, r1_file_suffix.strip(),
                                            r2_file_suffix.strip(),
                                            build_op_dir, CPU_THREADS)

    # Check whether a BLAST DB directory was provided
    if blast_db_directory is None:
        blast_db_directory = ""

    # Translate nucleotide sequences
    if not os.path.isdir(prot_seq_directory):
        prot_seq_directory = TranseqReadsDir(build_op_dir,
                                             nucl_seq_directory, CPU_THREADS)

    # HMMER search
    if not os.path.exists(allHMMResult):
        os.makedirs(hmm_search_directory, 0o777, True)
        for hmmSeqPosKey, hmmFileObj in hmmDict.items():
            hmmInterval = str(hmmDict[hmmSeqPosKey].intervalStart) + "_" + \
                str(hmmDict[hmmSeqPosKey].intervalEnd)
            RunHMMDirectoryParallel(prot_seq_directory, hmmFileObj.hmmFile,
                                    cohort_name, prot_family_name, "30_10",
                                    hmmInterval, hmm_search_directory,
                                    CPU_THREADS)
        with open(allHMMResult, 'w') as outfile:
            for subdir, dirs, files in os.walk(hmm_search_directory):
                for file in files:
                    filePath = os.path.join(subdir, file)
                    if re.match(r".*txt$", file) and os.path.getsize(filePath) > 0:
                        with open(filePath) as infile:
                            for line in infile:
                                outfile.write(line)

    # BLAST alignment
    if not os.path.exists(allBLASTResult):
        if not os.path.isdir(blastn_search_directory):
            print("Constructing BLAST Search Dir:" + blastn_search_directory)
            os.makedirs(blastn_search_directory, 0o777, True)
        RunBLASTNDirectoryPar(nucl_seq_directory, blast_db_directory,
                              tp_genes_nucl,
                              "-max_target_seqs 10000 -perc_identity 90.0",
                              blastn_search_directory, CPU_THREADS)
        with open(allBLASTResult, 'w') as outfile:
            outfile.write("sseqid\tslen\tsstart\tsend\tqseqid\tqlen\tqstart\t"
                          "qend\tpident\tevalue\tSample\tsampleType\n")
            for subdir, dirs, files in os.walk(blastn_search_directory):
                for file in files:
                    filePath = os.path.join(subdir, file)
                    if re.match(r".*txt$", file) and os.path.getsize(filePath) > 0:
                        with open(filePath) as infile:
                            for line in infile:
                                sampleName = ntpath.basename(filePath).split(".txt")[0]
                                outfile.write(line.strip() + "\t" + sampleName +
                                              "\t" + cohort_name + "\n")

    # Evaluate the spHMMs
    rpackages.importr('base')
    utils = rpackages.importr('utils')
    packageNames = ('tidyverse', 'ggsci', 'ggpubr', 'dplyr', 'ggplot2')
    packnames_to_install = [x for x in packageNames
                            if not rpackages.isinstalled(x)]
    if len(packnames_to_install) > 0:
        utils.install_packages(StrVector(packnames_to_install))
    rpackages.importr('tidyverse')
    rpackages.importr('ggsci')
    rpackages.importr('ggpubr')
    rpackages.importr('dplyr')
    rpackages.importr('ggplot2')
    hp_hmm_directory = os.path.join(build_op_dir, 'HiPer_spHMMs')
    os.makedirs(hp_hmm_directory, 0o777, True)
    module_dir = os.path.dirname(os.path.abspath(createhmm.__file__))
    print("\nR-script path : " + module_dir)
    r_script = os.path.join(module_dir, 'EvaluateSpHMMs.R')
    with open(r_script, 'r') as f:
        rStr = f.read()
    myfunc = STAP(rStr, "EvaluateSpHMM")
    myfunc.EvaluateSpHMM(allHMMResult, allBLASTResult, gene_pos_file,
                         prot_family_name, float(f1_thresh), hmm_directory,
                         hp_hmm_directory)
    timeTaken = time.time() - startTime
    mins = int(timeTaken / 60)
    secs = int(timeTaken) % 60
    print("\nTotal time taken : " + str(mins) + " mins " + str(secs) + " seconds")
    return hp_hmm_directory
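
# --- Illustrative usage (not from the original source) ---------------------
# A minimal sketch of calling mbgcbuild(); every path, suffix, family name,
# and thread count below is a made-up placeholder, and the helpers it calls
# (runTranSeq, runMUSCLE, the MetaBGC parallel runners, the rpy2 imports)
# are assumed to be importable exactly as the function above expects.
hp_dir = mbgcbuild(
    prot_alignment="markers.afa",          # placeholder alignment file
    prot_family_name="cyclase",            # placeholder protein family
    cohort_name="demo_cohort",
    nucl_seq_directory="reads/",
    prot_seq_directory="reads_prot/",
    seq_fmt="fasta",
    pair_fmt="split",
    r1_file_suffix=".R1.fastq",
    r2_file_suffix=".R2.fastq",
    tp_genes_nucl="tp_genes.fna",
    blast_db_directory=None,               # None -> defaults built internally
    blastn_search_directory=None,
    hmm_search_directory=None,
    f1_thresh=0.5,
    output_directory="out/",
    cpu=4,
)
print("High-performance spHMMs written to", hp_dir)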
        if type(val) not in [str, int, float, list]:
            sys.exit(1)
    return initial_config


if __name__ == '__main__':
    # Our application currently contains only one pipeline
    p = Pipeline()

    # -------------------------- Stage 1 ---------------------------------
    # Read the initial configuration from an R function
    with open('setup.R', 'r') as f:
        R_code = f.read()
    initial_config = STAP(R_code, 'initial_config')
    config = initial_config.initial_config(False)
    initial_config = dict(zip(config.names, list(config)))

    if not test_initial_config(initial_config):
        sys.exit(1)
    initial_config = process_initial_config(initial_config)

    #################################################
    # Additional conversions for the dictionary     #
    #################################################

    # The first stage corresponds to the AnEn computation
    s1 = Stage()
user_name = getpass.getuser()

# An R function uniting all the other R functions needed for annotation
fullAnnotation = """
fullAnnotationInGRanges <- function(resistance_table){
    source("/home/%s/bin/makeAnnotation.R")
    source("/home/%s/bin/resist2GRanges.R")
    resist_ranges <- resist2GRanges(resistance_table)
    annots <- makeAnnotation(withRanges=TRUE, inRanges=resist_ranges,
                             outTable="annotation_rpoABC")
}
""" % (user_name, user_name)
R_Annot = STAP(fullAnnotation, "R_Annot")

help_message = "Script for annotating AA-changes in AB-resistance regions, v 1.2\n" \
               "VCF files are taken from the CWD\n" \
               "Resistance is taken from the resistance_SNPs_withoutrpoAC.csv table\n" \
               "Three R functions should be in ~/bin\n" \
               "Four arguments have to be passed:\n" \
               "1 - reference genome accession (NC_000962)\n" \
               "2 - reference genome fasta (H37Rv.fna)\n" \
               "3 - reference genome annotation (H37Rv.gff)"

if len(sys.argv) < 4:
    print(help_message)
    sys.exit()

# Check how many VCF files have SNPs in the resistance table
def main_function(args):
    """
    Calculate the BIC or AIC of a model.

    Input:  tuple of (individual defining the model parameters to be used,
            dataframe with independent variables, dataframe with all data,
            metric to be used ('bic' or 'aic'), dependent-variable string)
    Output: optimisation metric and the individual
    """
    # ------------------------------------------------------------------
    # Unpack the argument tuple (the original used Python 2 tuple
    # parameter unpacking, which is invalid in Python 3)
    individ, X_df, df, objective_str, trgt = args

    # Import necessary modules
    import numpy as np
    from rpy2.robjects import pandas2ri
    pandas2ri.activate()
    from rpy2.robjects.packages import STAP

    # ------------------------------------------------------------------
    # Remove variables from the dataframe that are not part of the
    # individual's genome
    vars_lst = list(X_df.columns)       # List of the available variables
    gen_ind_0 = np.where(individ == 0)  # Find which elements are set to 0
    gen_ind_0 = gen_ind_0[0]            # Indices of variables to be removed
    vars_lst2 = vars_lst[:]             # Copy of the full variable list
    # Remove variables
    for i in sorted(gen_ind_0, reverse=True):
        del vars_lst2[i]

    # ------------------------------------------------------------------
    # Fit the model in R
    # Create the formula to be used in R
    myString = "+".join(vars_lst2)
    stable_str = 'as.ordered(' + trgt + ') ~ '
    formula = stable_str + myString
    # Transform the pandas dataframe to R
    rdf = pandas2ri.py2ri(df)
    # Define the R function as a string
    string = """
    mdl_func <- function(formula, df) {
        library(VGAM)
        mdl1 = vglm(formula, family=propodds, data=df)
        ll = logLik(mdl1)
        return(ll)
    }
    """
    ord_ll = STAP(string, "ord_ll")
    # Calculate AIC and BIC based on the log-likelihood (ll_)
    try:
        ll_ = ord_ll.mdl_func(formula, rdf)
        ll_ = ll_[0]
    # In case the log-likelihood calculation fails
    except Exception:
        ll_ = -1000.0
    k = float(len(vars_lst2))
    n = float(len(df))
    aic_ = (2.0 * k) - (2 * ll_)
    bic_ = (np.log(n) * k) - (2 * ll_)
    # Return AIC or BIC depending on the chosen objective
    if objective_str == 'aic':
        obj_ = aic_
    elif objective_str == 'bic':
        obj_ = bic_
    else:
        obj_ = np.nan
    # Return the optimisation metric
    return obj_, individ
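
# --- Worked example (not from the original source) -------------------------
# The two information criteria computed above reduce to simple arithmetic
# once the log-likelihood is known; this self-contained check uses made-up
# numbers (ll, k, n are placeholders).
import numpy as np

ll, k, n = -150.0, 4.0, 120.0          # toy log-likelihood, params, rows
aic = 2.0 * k - 2.0 * ll               # AIC = 2k - 2*ln(L)  -> 308.0
bic = np.log(n) * k - 2.0 * ll         # BIC = k*ln(n) - 2*ln(L) -> ~319.15
print(aic, bic)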
    '/Users/gracejia/Documents/A-UW/covid19 NSF Project/COVID19_RL/COVID19_models/SEIR/states.csv'
)
thetas_init = np.array(thetas_init.iloc[0, :6], dtype=np.float64)
thetas_sd_init = np.array(thetas_sd_init.iloc[0, :6], dtype=np.float64)

#%%
#thetas_updated = np.zeros((2,6))
numpy2ri.activate()
pandas2ri.activate()
cwd = os.getcwd()
with open(
        '/Users/gracejia/Documents/A-UW/covid19 NSF Project/COVID19_RL/COVID19_models/SEIR/update_thetas_weekly.R',
        'r') as f:
    string = f.read()
seir_theta = STAP(string, "seir_theta")
seir_theta = seir_theta.getThetas
modelOut = seir_theta(thetas_so_far=thetas_init,
                      thetas_sd_so_far=thetas_sd_init,
                      pred=1)
os.chdir(cwd)
print(modelOut)

#%%
with open(
        '/Users/gracejia/Documents/A-UW/covid19 NSF Project/COVID19_RL/COVID19_models/SEIR/seir_r_weekly.R',
        'r') as f:
    string = f.read()
seir_r_weekly = STAP(string, "seir_r_weekly")
seir_r_weekly = seir_r_weekly.seirPredictions
base = importr('base')
rpart = importr('rpart')
stats = importr('stats')
base64enc = importr('base64enc')
C50 = importr('C50')
caret = importr('caret')
# pandas2ri.activate()

path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                    'scripts/graphvizC50.R')
with open(path, 'r') as f:
    graphvizC50_string = f.read()
graphvizC50 = STAP(graphvizC50_string, 'graphvizC50')


def r_options(*args, **kwargs):
    return base.options(*args, **kwargs)


def r_c(*args) -> RVector:
    """
    Generic function wrapper around the c function.
    """
    return base.c(*args)


def r_data_frame(*args, **kwargs) -> RDataFrame:
    """
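
# --- Illustrative usage (not from the original source) ---------------------
# A quick sketch of the thin wrappers above; it assumes the same rpy2
# imports as the snippet, and the values are arbitrary.
v = r_c(1, 2, 3)          # equivalent to R's c(1, 2, 3)
r_options(digits=4)       # equivalent to R's options(digits = 4)
print(list(v))            # [1, 2, 3]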
def compare(pred_Y, test_Y, predict_binary_output,
            peaks=None, save_curves=True, save_data=False):
    """
    Evaluates performance for predictions pred_Y relative to true labels test_Y.

    If predict_binary_output, pred_Y should be a set of scores and test_Y
    should be 0/1 labels. Otherwise, both pred_Y and test_Y should be
    continuous values, and the function returns the squared error and Pearson
    correlation between the predicted output and the actual output.

    Both pred_Y and test_Y must be matrices of shape
    num_examples x num_histone_marks, or they must both be matrices of shape
    num_examples x seq_length x num_histone_marks. If the latter, examples are
    concatenated together before correlations are computed.

    peaks is a list. Each element of this list corresponds to one mark and is
    a N x 2 matrix where each row contains the (start, end) coordinates of a
    peak in that mark. If passing in peaks, make sure the coordinate system
    matches that of pred_Y and test_Y! For example, if your peaks start at the
    start of the chromosome, then pred_Y and test_Y have to start at the start
    of the chromosome as well.

    If save_curves is True, it saves the full precision-recall curve.
    save_curves cannot be True if predict_binary_output is False. Right now it
    saves recalls @ 10, 20, ..., 90% precision.

    If save_data is True, it saves the first mark of pred_Y and test_Y.

    Returns results, a dictionary containing:
        'AUC'               (if predict_binary_output)
        'AUPRC'             (if predict_binary_output)
        'precision_curves'  (if save_curves)
        'recall_curves'     (if save_curves)
        'threshold_curves'  (if save_curves)
        'MSE'               (if not predict_binary_output)
        'true_var'          (if not predict_binary_output)
        'pearsonR'          (if not predict_binary_output)
        'pred_Y'            (if save_data)
        'test_Y'            (if save_data)

    AUC, AUPRC, MSE, true_var, and pearsonR are each vectors of length
    num_histone_marks. true_var is the variance of the true data; it is useful
    for interpreting whether a given MSE is good or bad.
    """
    # save_curves has to be False if predict_binary_output is also False
    if not predict_binary_output:
        save_curves = False

    pred_Y_is_binary = is_binary(pred_Y)
    test_Y_is_binary = is_binary(test_Y)

    assert pred_Y.shape == test_Y.shape, \
        "pred_Y.shape = %s doesn't match test_Y.shape = %s" % (str(pred_Y.shape), str(test_Y.shape))
    # test_Y (the true labels) ought to be binary IFF we're predicting binary
    # output. pred_Y should be a set of continuous scores, regardless of
    # whether we're predicting binary output.
    assert test_Y_is_binary == predict_binary_output
    assert len(pred_Y.shape) == 2 or len(pred_Y.shape) == 3

    # If peaks is not None, then there should be one element in peaks for
    # each mark in pred_Y.
    if peaks:
        assert len(peaks) == pred_Y.shape[-1]

    # If the input matrices are 3D, squash the first two dimensions together
    if len(pred_Y.shape) == 3:
        pred_Y = np.reshape(pred_Y,
                            [pred_Y.shape[0] * pred_Y.shape[1], pred_Y.shape[2]])
        test_Y = np.reshape(test_Y,
                            [test_Y.shape[0] * test_Y.shape[1], test_Y.shape[2]])

    num_histone_marks = pred_Y.shape[len(pred_Y.shape) - 1]

    true_var = []
    MSE = []
    pearsonR = []
    precision_curves = []
    recall_curves = []
    threshold_curves = []
    auc = []
    auprc = []
    Y_pos_frac = []

    # Load in the R code
    with open('PRROC.R', 'r') as f:
        r_fxn_string = f.read()
    r_auc_func = STAP(r_fxn_string, "auc_func")

    for mark_idx in range(num_histone_marks):
        ### Sub-select only peak regions
        if peaks:
            # If peaks exists but peaks[mark_idx] is set to None, we should
            # skip this mark. This mark should correspond to INPUT, which has
            # no peaks of its own.
            if peaks[mark_idx] is None:
                if predict_binary_output:
                    precision_curves.append(None)
                    recall_curves.append(None)
                    threshold_curves.append(None)
                    auprc.append(None)
                    auc.append(None)
                else:
                    true_var.append(None)
                    MSE.append(None)
                    pearsonR.append(None)
                continue

            # Initialize peak_idxs to all False
            num_bins = pred_Y.shape[0]
            peak_idxs = np.zeros(num_bins, dtype=bool)

            # Set peak_idxs such that it is True in each peak
            for peak_counter, peak in enumerate(peaks[mark_idx]):
                # We have to check for this, because pred_Y and test_Y might
                # only represent a fraction of any given chromosome
                if peak[1] > num_bins:
                    continue
                peak_idxs[peak[0]:peak[1]] = True

            pred_Y_mark = pred_Y[peak_idxs, mark_idx]
            test_Y_mark = test_Y[peak_idxs, mark_idx]
        else:
            pred_Y_mark = pred_Y[:, mark_idx]
            test_Y_mark = test_Y[:, mark_idx]

        ### Run evaluations on the (selected) regions
        if predict_binary_output:
            precisions, recalls, thresholds = precision_recall_curve(test_Y_mark, pred_Y_mark)
            precisions, recalls = compute_recalls_at_precision(precisions, recalls)
            precision_curves.append(list(precisions))
            recall_curves.append(list(recalls))

            if len(test_Y_mark) < 100000:
                downsample_idxs = range(len(test_Y_mark))
            else:
                downsample_idxs = sample(range(len(test_Y_mark)), 100000)

            r_auprc_results = r_auc_func.pr_curve(
                scores_class0=robjects.vectors.FloatVector(pred_Y_mark[downsample_idxs]),
                weights_class0=robjects.vectors.FloatVector(test_Y_mark[downsample_idxs]))
            auprc.append(float(r_auprc_results.rx('auc.davis.goadrich')[0][0]))

            r_auc_results = r_auc_func.roc_curve(
                scores_class0=robjects.vectors.FloatVector(pred_Y_mark[downsample_idxs]),
                weights_class0=robjects.vectors.FloatVector(test_Y_mark[downsample_idxs]))
            auc.append(float(r_auc_results.rx('auc')[0][0]))

            Y_pos_frac.append(test_Y_mark.mean())
            print("AUC %2.3f; AUPRC %2.3f" % (auc[mark_idx], auprc[mark_idx]))
        else:
            true_var.append(np.var(test_Y_mark))
            MSE.append(get_MSE(pred_Y_mark, test_Y_mark))
            pearsonR.append(get_pearsonR(pred_Y_mark, test_Y_mark))
            print("MSE %2.3f (true var %2.3f), pearsonR %2.3f" %
                  (MSE[mark_idx], true_var[mark_idx], pearsonR[mark_idx]))

    if predict_binary_output:
        assert (len(precisions) > 0) and (len(recalls) > 0)
        results = {'AUC': auc, 'AUPRC': auprc, 'Y_pos_frac': Y_pos_frac}
        results['precision_curves'] = precision_curves
        results['recall_curves'] = recall_curves
    else:
        results = {'MSE': MSE, 'true_var': true_var, 'pearsonR': pearsonR}

    if save_data:
        results['pred_Y'] = list(pred_Y[..., 0])
        results['test_Y'] = list(test_Y[..., 0])

    return results
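
# --- Illustrative sketch (not from the original source) --------------------
# The PRROC.R file read by compare() is not shown; as a self-contained
# stand-in for the same STAP pattern, the R wrappers below are my guess at
# what PRROC.R contains (it assumes the PRROC R package is installed), and
# the scores/labels are synthetic.
import numpy as np
from rpy2 import robjects
from rpy2.robjects.packages import STAP

r_code = """
library(PRROC)
pr_curve <- function(scores_class0, weights_class0) {
    pr.curve(scores.class0 = scores_class0, weights.class0 = weights_class0)
}
roc_curve <- function(scores_class0, weights_class0) {
    roc.curve(scores.class0 = scores_class0, weights.class0 = weights_class0)
}
"""
r_auc_func = STAP(r_code, "auc_func")

scores = np.random.rand(1000)                        # synthetic scores
labels = (np.random.rand(1000) < 0.1).astype(float)  # synthetic 0/1 labels
pr = r_auc_func.pr_curve(
    scores_class0=robjects.vectors.FloatVector(scores),
    weights_class0=robjects.vectors.FloatVector(labels))
print(float(pr.rx('auc.davis.goadrich')[0][0]))      # Davis-Goadrich AUPRC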
    def step(self, action):
        # Check for a valid action
        err_msg = "%r (%s) invalid" % (action, type(action))
        assert self.action_space.contains(action), err_msg

        # R <--> Python conversions: automatic conversion of numpy objects
        # to rpy2 objects
        numpy2ri.activate()

        # Update the model based on the action
        action = (action - 1) / 2
        #reduction_factor = np.reshape(action,(self.num_cities,self.num_cities))

        # Get updated thetas for later calculations
        cwd = os.getcwd()
        input_path = '/Users/gracejia/Documents/A-UW/covid19 NSF Project/COVID19_RL/COVID19_models/SEIR'
        with open(input_path + '/update_thetas_weekly.R', 'r') as f:
            string = f.read()
        seir_theta = STAP(string, "seir_theta")
        seir_theta = seir_theta.getThetas
        modelOut = seir_theta(thetas_so_far=self.thetas_init,
                              thetas_sd_so_far=self.thetas_sd_init,
                              pred=self.pred)
        os.chdir(cwd)
        self.thetas_updated = np.array(modelOut, dtype=np.float64)

        # Get the SEIR model from the R script
        with open(input_path + '/seir_r_weekly.R', 'r') as f:
            string = f.read()
        seir_r_weekly = STAP(string, 'seir_r_weekly')
        seir_r_weekly = seir_r_weekly.seirPredictions
        statesOut = seir_r_weekly(reduction_control=action,
                                  reduction_time_series=self.reduction_time_curr,
                                  thetas_data=self.thetas_updated,
                                  latent=self.latent,
                                  gamma=self.gamma,
                                  St_data=self.St_data,
                                  Et_data=self.Et_data,
                                  It_data=self.It_data,
                                  Rt_data=self.Rt_data,
                                  popu=self.popu,
                                  current=self.current,
                                  pred=self.pred)

        # Unpack the output
        S = statesOut[0][0]
        E = statesOut[1][0]
        I = statesOut[2][0]
        R = statesOut[3][0]
        beta = statesOut[4][0]

        # Update the state
        self.state = np.array((S, E, I, R))
        self.daynum += self.pred
        self.St_data = S
        self.Et_data = E
        self.It_data = I
        self.Rt_data = R
        self.beta = beta

        # Print states
        print('States:', self.state)
        print('Beta:', beta)
        print('Action picked:', action)

        # Reward
        economicCost = np.sum(action) + np.sum(self.reduction_time_curr)
        publichealthCost = -0.00001 * abs(self.It_data)
        reward = self.weight * economicCost + (1 - self.weight) * publichealthCost
        print('Reward:', reward)
        print(self.daynum)

        # Observation
        observation = np.reshape(self.state, (4,))

        # Check if the episode is over
        done = bool(I < 0.5 or self.daynum >= 199)

        return observation, reward, done, {}
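
# --- Illustrative driver loop (not from the original source) ---------------
# The Gym environment class that owns step() is not shown; assuming it
# follows the standard Gym API, a driver loop could look like this.
# "SeirEnv" is a hypothetical name standing in for that class.
env = SeirEnv()
obs = env.reset()
done = False
total_reward = 0.0
while not done:
    action = env.action_space.sample()   # random policy, for illustration
    obs, reward, done, info = env.step(action)
    total_reward += reward
print('Episode return:', total_reward)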
    def get_ml(self):
        ml = STAP(self.get_r(), "r_fct_string")
        return ml
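
# --- Illustrative sketch (not from the original source) --------------------
# get_ml() relies on rpy2's STAP (SignatureTranslatedAnonymousPackage),
# which compiles a string of R source into a Python-callable namespace.
# A minimal self-contained example of that pattern, assuming rpy2 and R
# are installed:
from rpy2.robjects.packages import STAP

r_src = """
add_one <- function(x) {
    return(x + 1)
}
"""
pkg = STAP(r_src, "r_fct_string")   # each top-level R function becomes a method
print(pkg.add_one(41)[0])           # 42 -- R returns a length-1 vector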
def mdl_fit(model_vars, df, y_param, ci_level=0.95):
    """
    Fit the final model and extract modelling statistics.

    Input:  model variables as a list, dataframe holding all the data,
            dependent variable, confidence level for reporting statistics
            (e.g. 0.95 for 95%)
    Output: dataframe with model coefficients and statistics
    """
    # ------------------------------------------------------------------
    # Import necessary modules
    import rpy2.robjects.numpy2ri
    rpy2.robjects.numpy2ri.activate()
    import numpy as np
    from rpy2.robjects import pandas2ri
    pandas2ri.activate()
    from rpy2.robjects.packages import STAP
    import scipy.stats as stats

    # ------------------------------------------------------------------
    # Fit the R model
    # R function, as a string, that fits the model and returns the results
    string_ord_mdl = """
    mdl_func <- function(formula, df) {
        library(VGAM)
        mdl1 = vglm(formula, family=propodds, data=df)
        ll = logLik(mdl1)
        coefficients_df = coef(summary(mdl1))
        coefficient_cols = colnames(coefficients_df)
        coefficient_rows = rownames(coefficients_df)
        output <- list(ll, coefficients_df, coefficient_cols, coefficient_rows)
        return(output)
    }
    """
    # Transform the pandas dataframe to R format
    rdf = pandas2ri.py2ri(df)
    # Set the R formula as a string using the model parameters and the
    # dependent variable
    formula = 'as.ordered(' + y_param + ') ~ ' + "+".join(model_vars)
    # Define the R function to be used in Python
    ord_ll = STAP(string_ord_mdl, "ord_ll")
    # Fit the model
    output_R = ord_ll.mdl_func(formula, rdf)
    # Extract the data and place it in a pandas dataframe
    coeff_df_temp = output_R[1]
    coeff_df = pandas2ri.ri2py_dataframe(coeff_df_temp)
    cols_df = list(output_R[2])
    rows_df = list(output_R[3])
    coeff_df.columns = cols_df
    coeff_df.index = rows_df

    # ------------------------------------------------------------------
    # Calculate statistics
    # Number of parameters
    n_vars = len(coeff_df)
    # Degrees of freedom for the t-distribution
    deg_free = len(df) - n_vars
    # Calculate the alpha value from the confidence level
    alpha_ = 1.0 - ci_level
    # Array to hold the low confidence-interval limits
    low_arr = np.zeros(len(coeff_df))
    # Array to hold the high confidence-interval limits
    high_arr = np.zeros(len(coeff_df))
    # Array to hold the Wald-test p-values
    p_val_arr = np.zeros(len(coeff_df))
    # Array to hold the t statistics
    t_value_arr = np.zeros(len(coeff_df))
    # Loop counter variable
    index_arr = 0
    for index, row in coeff_df.iterrows():
        # Get the standard error of the coefficient from the R model fit
        std_error = row['Std. Error']
        # Get the coefficient value from the R model fit
        coeff_value = row['Estimate']
        # Calculate the t-critical statistic for the desired confidence level
        t_critical = stats.t.ppf(1 - (alpha_ / 2.), df=deg_free)
        # Calculate the low/high confidence-interval limits
        low_arr[index_arr] = coeff_value - (t_critical * std_error)
        high_arr[index_arr] = coeff_value + (t_critical * std_error)
        # t statistic, used to get the p-value
        t_value = coeff_value / std_error
        t_value_arr[index_arr] = t_value
        # Calculate the p-value
        p_val_arr[index_arr] = 2.0 * (1.0 - stats.t.cdf(np.abs(t_value), deg_free))
        index_arr += 1

    # Set the arrays as dataframe columns
    coeff_df['Low ' + str((1.0 - alpha_) * 100) + '%'] = low_arr
    coeff_df['High ' + str((1.0 - alpha_) * 100) + '%'] = high_arr
    coeff_df['P Value'] = p_val_arr
    coeff_df['t Value'] = t_value_arr
    # Drop the statistics of the R model fit that refer to the normal
    # distribution
    coeff_df.drop(['z value', 'Pr(>|z|)'], axis=1, inplace=True)

    # Return the dataframe with the model-fit coefficients and statistics
    return coeff_df
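
# --- Worked example (not from the original source) -------------------------
# The confidence-interval and p-value arithmetic above is plain
# t-distribution math; this self-contained numeric check uses made-up
# numbers (estimate, standard error, and degrees of freedom are placeholders).
import numpy as np
import scipy.stats as stats

coeff, se, df_, alpha = 1.2, 0.4, 50, 0.05
t_crit = stats.t.ppf(1 - alpha / 2.0, df=df_)
low, high = coeff - t_crit * se, coeff + t_crit * se
t_val = coeff / se
p_val = 2.0 * (1.0 - stats.t.cdf(np.abs(t_val), df_))
print(low, high, t_val, p_val)   # CI ~ (0.397, 2.003), t = 3.0, p ~ 0.004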
# Parse the runtime configuration
OutputLog().set_path(OUTPUT_DIR)
data_config = ConfigParser.ConfigParser()
data_config.read(data_set_config)
data_parameters = ConfigSectionMap("dataset_parameters", data_config)

# Construct the data set
data_set = Container().create(data_parameters['name'], data_parameters)

# robjects.r('source("%s")' % rca_location)
with open(rca_location) as rca_file:
    rca_string = rca_file.read()
rcca_fit = STAP(rca_string, "rcca_fit")
rcca_eval = STAP(rca_string, "rcca_eval")

# mnist = robjects.r('load')('/home/aviv/Project/DoubleEncoder/DataSet/MNIST_SPLIT/mnist.data')
# x1_rcca = numpy.array(robjects.r['x_tr'])
# x2_rcca = numpy.array(robjects.r['y_tr'])
#
# x1_dataset = data_set.trainset[1]
# x2_dataset = data_set.trainset[0]
#
# x1_test = numpy.array(robjects.r['x_te'])
# x2_test = numpy.array(robjects.r['y_te'])

OutputLog().write('training rcca')
def main(): """ Performs a Sparse Partial Least Squares (sPLS) analysis over subsets of gene expression and metabolomic data. To perform this subsetting, three different methodologies can be used for the metabolites: - By generic metabolite (sphingomyelin, ...) - By MMC cluster - By generic metabolite and then by MMC cluster and four for the genes: - All the genes - Genes contained in a list with interesting genes for the analysis - Pathway related genes for an specific generic metabolite - Metagenes (PANA approach) The outputs depend on the inputs. Arguments: :param geneDataset metDataset: Gene expression/Metabolomics wide dataset, respectively. :type geneDataset metDataset: files :param geneId metId: Name of the Genes/metabolites unique identifier column, respectively. :type geneId metId: strings :param geneAnnot metAnnot: Gene Expression/Metabolomics Annotation Dataset. :type geneAnnot metAnnot: files :param geneAnnotName metAnnotName: annotation file column with gene/metabolite names. :type geneAnnotName metAnnotName: strings :param design: Design File :type design: file :param keepX: Number of genes to keep in the sPLS model :param keepX: integer :param geneOption metOption: Options for metabolite subsetting (one of 'generic', 'mmc' or 'both') and for gene expression subsetting (one of 'all', 'geneList', 'path' or 'pana') :type geneOption metOption: strings :param geneKeggAnno metKeggAnno: KEGG Annotation files for gene expression and metabolomics, respectively. From Add KEGG Anno Info Tool :type geneKeggAnno metKeggAnno: files :param geneKeggPath metKeggPath: KEGG Pathway files for gene expression and metabolomics, respectively. From Add KEGG Pathway Info Tool :type geneKeggPath metKeggPath: files :param path2genes: Downloaded KEGG file with this information: pathway_ID "\t" geneKEGG_ID :type path2genes: file Returns: :return figure1: sPLS heatmaps :rtype figure1: pdf :return splsOut: sif-like correlation matrix including a column describing the comparison. :rtype splsOut: file :return figure2: MMC plots if mmc or both metabolite subsetting option is selected. :rtype figure2: pdf :return mmcOut: MMC Output table if mmc or both metabolite subsetting option is selected. 
:rtype mmcOut: file :return panaOut: Table describing genes that forms the metagenes (1/0) :rtype panaOut: file """ args = getOptions() logger = logging.getLogger() sl.setLogger(logger) logger.info(u"Importing data with the following parameters: " "\n\tGene Dataset: {}" "\n\tGene UniqueID: {}" "\n\tGene Option: {}" "\n\tMetabolite Dataset:{}" "\n\tMetabolite UniqueID: {}" "\n\tMetabolite Option: {}".format( args.geneDataset, args.geneId, args.geneOption, args.metDataset, args.metId, args.metOption, )) pandas2ri.activate() with ires.path("gaitGM.data", "sPLS.R") as my_r_script_path: f = open(my_r_script_path, "r") rFile = f.read() sPLSScript = STAP(rFile, "sPLS") rGeneData, rMetData, multipleNames, multipleNamesId = modules.prepareSPLSData( args) rData = [] data_counter = 0 for R_met_df in rMetData: R_gene_df = rGeneData[data_counter] rData.append( sPLSScript.sPLS(geneData=R_gene_df, metData=R_met_df, keepX=args.keepX)) if args.geneOption == "path": data_counter += 1 if args.metOption == "both": sPLSScript.plotInPdf(splsObjects=rData, figurePath=args.figure1, multipleNames=multipleNamesId) # Correlation Matrix corMatrix = sPLSScript.corrMat(splsObjects=rData, multipleNames=multipleNamesId, threshold=args.thres) robjects.r["write.table"]( corMatrix, file=args.splsOut, sep="\t", quote=False, row_names=False, col_names=True, ) else: sPLSScript.plotInPdf(splsObjects=rData, figurePath=args.figure1, multipleNames=multipleNames) # Correlation Matrix corMatrix = sPLSScript.corrMat(splsObjects=rData, multipleNames=multipleNames, threshold=args.thres) robjects.r["write.table"]( corMatrix, file=args.splsOut, sep="\t", quote=False, row_names=False, col_names=True, )
used_features_payload = api.model('feature_payload', used_features)
immEd_output = api.model(
    'immEd_output', {
        'score': fields.Integer,
        'category': fields.String,
        'guidance': fields.String,
        'is_ready': fields.Boolean,
        'used_features': fields.Nested(used_features_payload)
    })

from rpy2.robjects.packages import STAP

immunomatch_ed = STAP(immunomatch_ed, "immunomatch_ed")

ns1 = Namespace('Immunomatch Ed')


@api.route('/immunomatch_ed')
class Immunomatch_ed(Resource):
    #@api.marshal_with(a_language, envelope='the_data')
    @api.doc(security='apikey')
    @token_required
    @api.expect(feat_root_objs)
    # @api.doc(body=immEd_output)
    def post(self):
        if not request.get_json():
            return bad_request('No input data provided')
        raw_dict = request.json
}

fit_betabinom_w <- function(n, k, w) {
    fit <- vglm(cbind(k, n-k) ~ 1, betabinomialff, weights=w)
    return(coef(fit, matrix = TRUE))
}

eval_betabinom <- function(n, k, a, b) {
    p <- dbetabinom.ab(k, size=n, shape1=a, shape2=b)
    return(p)
}
"""

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    bbfit = STAP(FIT_RFUN_STR, 'bbfit')


def fit_betabinom_ab(n, k, weights=None):
    assert np.all((k >= 0) & (k <= n))
    n_r = ro.FloatVector(n)
    k_r = ro.FloatVector(k)
    if weights is not None:
        assert len(weights) == len(k)
        assert len(weights) == len(n)
        weights_r = ro.FloatVector(weights)
        result_r = bbfit.fit_betabinom_w(n_r, k_r, weights_r)
    else:
        result_r = bbfit.fit_betabinom(n_r, k_r)
    result = rpyn.ri2py(result_r)
    log_a, log_b = result.flatten()
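
# --- Illustrative usage (not from the original source) ---------------------
# A hedged sketch of calling fit_betabinom_ab on synthetic counts. It
# assumes the module's imports (np, ro, rpyn) and an R installation with the
# VGAM package, and that the truncated function above is completed to return
# (log_a, log_b); the Beta(2, 5) parameters are arbitrary.
import numpy as np

rng = np.random.default_rng(0)
n = np.full(500, 30)                      # 30 trials per observation
p = rng.beta(2.0, 5.0, size=500)          # per-observation success rates
k = rng.binomial(n, p)                    # beta-binomial counts

log_a, log_b = fit_betabinom_ab(n, k)     # assumes the function returns these
print(np.exp(log_a), np.exp(log_b))       # should land near (2, 5)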
def fitcoxnet(x, y):
    """
    Fit a Cox net model.

    Input:  x: pandas data frame with covariates
            y: survival array with time and censored-event status
    Output: selected coefficients
    """
    pandas2ri.activate()
    x_raw = pandas2ri.py2ri(x)
    y_raw = pandas2ri.py2ri(pd.DataFrame(y))
    ro.globalenv['x_raw'] = x_raw
    ro.globalenv['y_raw'] = y_raw
    r_fct_string = """
    fitcoxnet <- function(DATAX, DATAY) {

        # Make dummy variables from the clinical data
        x <- model.matrix( ~ ., data = DATAX)

        # Make a survival object
        surv_obj <- Surv(time=DATAY$Survival_in_days, event=DATAY$Status)

        # Create a glmnet Cox object using lasso regularization and
        # cross-validation
        glmnet.cv <- cv.glmnet(x, surv_obj, family="cox")

        # Get the glmnet model on the full dataset
        glmnet.obj <- glmnet.cv$glmnet.fit

        # Find the lambda index for the model with the least partial
        # likelihood deviance
        optimal.lambda <- glmnet.cv$lambda.min
        lambda.index <- which(glmnet.obj$lambda==optimal.lambda)

        # Take beta for the optimal lambda
        optimal.beta <- glmnet.obj$beta[,lambda.index]

        # Find the non-zero beta coefficients
        nonzero.coef <- abs(optimal.beta)>0
        selectedBeta <- optimal.beta[nonzero.coef]

        # Take only the covariates for which beta is not zero
        selectedVar <- x[,nonzero.coef]

        # Create a dataframe for the train set with time, status and the
        # selected variables in binary representation for evaluation in pec
        reformat_dataSet <- as.data.frame(cbind(surv_obj, selectedVar))

        # glmnet.cox only with meaningful features selected by stepwise
        # bidirectional AIC feature selection
        glmnet.cox.meaningful <- step(coxph(Surv(time,status) ~ .,
                                            data=reformat_dataSet),
                                      direction="both", trace=0)

        # C-index calculation with 100 bootstrap iterations
        cIndexCoxglmnet <- c()
        features <- c()
        for (i in 1:100) {
            train <- sample(1:nrow(x), nrow(x), replace = TRUE)
            reformat_trainSet <- reformat_dataSet[train,]
            glmnet.cox.meaningful.test <- step(coxph(Surv(time,status) ~ .,
                                                     data=reformat_trainSet),
                                               direction="both", trace=0)
            lbls <- as.vector(attr(glmnet.cox.meaningful.test$terms, "term.labels"))
            features <- c(features, lbls)
            varnames <- lapply(lbls, as.name)
            testdf <- as.data.frame(x)
            selectedVarCox <- testdf[names(testdf)[names(testdf) %in% varnames]]
            reformat_testSet <- as.data.frame(cbind(surv_obj, selectedVarCox))
            reformat_testSet <- reformat_dataSet[-train,]
            cIndexCoxglmnet <- c(cIndexCoxglmnet,
                                 1 - rcorr.cens(predict(glmnet.cox.meaningful,
                                                        reformat_testSet),
                                                Surv(reformat_testSet$time,
                                                     reformat_testSet$status))[1])
        }
        cIndexm <- mean(unlist(cIndexCoxglmnet), na.rm=TRUE)
        cIndexstd <- sd(unlist(cIndexCoxglmnet))
        Finalresult = list(coefficients=glmnet.cox.meaningful,
                           Average_Concordance=cIndexm,
                           concordance_sd=cIndexstd,
                           lambda=optimal.lambda)
        return(Finalresult)
    }
    """
    r_pkg = STAP(r_fct_string, "r_pkg")
    print(r_pkg.fitcoxnet(x_raw, y_raw))
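
# --- Illustrative usage (not from the original source) ---------------------
# Toy inputs for fitcoxnet(); the data are random placeholders. Note the R
# string above never calls library(), so it assumes the glmnet, survival and
# Hmisc packages are already loaded in the embedded R session, and it expects
# y to yield Survival_in_days/Status columns once wrapped in pd.DataFrame.
import numpy as np
import pandas as pd

rng = np.random.default_rng(1)
x = pd.DataFrame({
    'age': rng.integers(40, 80, size=60),
    'stage': rng.integers(1, 4, size=60),
})
y = pd.DataFrame({
    'Survival_in_days': rng.exponential(365, size=60),
    'Status': rng.integers(0, 2, size=60),
}).to_records(index=False)   # structured array, matching the docstring

fitcoxnet(x, y)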
def main(): """ Perform a correlation analysis of a Gene Expression Dataset and a Metabolomic Dataset. Arguments: :param geneDataset metDataset: Gene expression/Metabolomics wide dataset, respectively. :type geneDataset metDataset: files :param geneId metId: Name of the Genes/metabolites unique identifier column, respectively. :type geneId metId: strings :param geneAnnot metAnnot: Gene Expression/Metabolomics Annotation Datasets, respectively. :type geneAnnot metAnnot: files :param geneAnnotName metAnnotName: Name of the column of the Annotation file that contains genes/metabolites names respectively. :type geneAnnotName metAnnotName: strings :param meth: Methodology for the correlation function. One of 'pearson', 'spearman' or 'kendall'. :type meth: string :param thres: PValue Threshold to cut the correlations for the output table. :type thres: float Returns: :return output: Output table with the following information: Metabolite "\t" Gene "\t" Correlation "\t" pvalue :rtype output: file :return corMat: Correlation Matrix :rtype corMat: file :return fig: Network-like output figure :rtype fig: pdf """ warnings.filterwarnings("ignore", category=RRuntimeWarning) args = getOptions() logger = logging.getLogger() sl.setLogger(logger) logger.info(u"Importing data with the following parameters: " "\n\tGene Dataset: {}" "\n\tGene UniqueID: {}" "\n\tMet Dataset:{}" "\n\tMet UniqueID: {}" "\n\tMethod: {}" "\n\tThreshold: {}".format( args.geneDataset, args.geneId, args.metDataset, args.metId, args.meth, args.thres, )) modules.checkForDuplicates(args.geneDataset, args.geneId) modules.checkForDuplicates(args.metDataset, args.metId) pandas2ri.activate() with ires.path("gaitGM.data", "all_by_all_correlation.R") as my_r_script_path: f = open(my_r_script_path, "r") rFile = f.read() allByAllCorrScript = STAP(rFile, "corr_main_func") # Prepare Gene Expression Data geneTable = pd.read_table(args.geneDataset, sep="\t", header=0) if args.geneAnnot: R_gene_df = modules.Ids2Names(geneTable, args.geneId, args.geneAnnot, args.geneName) else: geneTable = geneTable.set_index(args.geneId) R_gene_df = pandas2ri.py2rpy(geneTable) # Prepare Metabolomics Data metTable = pd.read_table(args.metDataset, sep="\t", header=0) if args.metAnnot: R_met_df = modules.Ids2Names(metTable, args.metId, args.metAnnot, args.metName) else: metTable = metTable.set_index(args.metId) R_met_df = pandas2ri.py2rpy(metTable) allByAllCorrScript.corr_main_func( x=R_gene_df, y=R_met_df, meth=args.meth, thres=args.thres, corrMatPath=args.corMat, outputPath=args.output, figurePath=args.fig, )
    def bin(self, method, target, event, *args):
        import rpy2.robjects as robjects
        from rpy2.robjects.vectors import StrVector
        import rpy2.robjects as ro
        from rpy2.robjects.packages import importr
        import rpy2.robjects.packages as rpackages
        #from rpy2.robjects.lib.dplyr import dplyr
        from rpy2.robjects import pandas2ri
        from rpy2.robjects import default_converter
        from rpy2.robjects.conversion import localconverter

        base = importr('base')
        utils = importr('utils')
        try:
            woeBinning = importr('woeBinning')
        except:
            print("woeBinning package is being installed")
            utils.install_packages('woeBinning')
        try:
            discretization = importr('discretization')
        except:
            print("discretization package is being installed")
            utils.install_packages('discretization')
        try:
            lazyeval = importr('lazyeval')
        except:
            print("lazyeval package is being installed")
            utils.install_packages('lazyeval')

        def load_woe():
            # Read woe.r from the CWD, prompting for a directory if it is
            # missing, and wrap it with STAP
            try:
                with open('woe.r', 'r') as f:
                    string = f.read()
            except FileNotFoundError:
                path = input("set directory to path: ")
                from os import chdir
                chdir(path)
                with open('woe.r', 'r') as f:
                    string = f.read()
            print('woe module loaded')
            return STAP(string, "woe")

        column_Name = []
        cl = []
        if method == 'woe':
            print('woe')
            for arg in args:
                print(arg)
                if self.Train[arg].dtypes != 'object':
                    print(self.Train[arg].dtypes)
                    column_Name.append(arg)
                    cl.append(arg)
                    print(column_Name)
                else:
                    print("Column Name " + arg +
                          " is Object so no binning is done for it")
            column_Name.append(target)
            df = self.Train[column_Name]
            woe = load_woe()
            pandas2ri.activate()
            print('pandas2ri is activated')
            self.woe_based_bin = pandas2ri.ri2py(
                woe.woe_based_binning(df, target, event, cl))
            pandas2ri.deactivate()
            return self.woe_based_bin

        elif method == 'Chisq':
            print('Chisq')
            cl = []
            s = list(args)
            if s[0] == 'ALL':
                print("ALL")
                for j in self.Train.columns:
                    if self.Train[j].dtypes != 'object':
                        cl.append(j)
                cl.append(target)
                df = self.Train[cl]
            else:
                for arg in args:
                    if self.Train[arg].dtypes != 'object':
                        cl.append(arg)
                    else:
                        print(arg + " is not continuous")
                if len(cl) == 0:
                    return 'Binning can not be done as none of the variables passed is continuous'
                cl.append(target)
                df = self.Train[cl].copy()
            woe = load_woe()
            pandas2ri.activate()
            print('pandas2ri is activated')
            self.chisq_based_bin = pandas2ri.ri2py(woe.Chi_sq_based_bin(df))
            pandas2ri.deactivate()
            print('pandas2ri is deactivated')
            return self.chisq_based_bin

        elif method == 'Entropy':
            print('Entropy')
            cl = []
            s = list(args)
            if s[0] == 'ALL':
                print("ALL")
                for j in self.Train.columns:
                    if self.Train[j].dtypes != 'object':
                        cl.append(j)
                cl.append(target)
                df = self.Train[cl]
            else:
                for arg in args:
                    if self.Train[arg].dtypes != 'object':
                        cl.append(arg)
                    else:
                        print(arg + " is not continuous")
                if len(cl) == 0:
                    return 'Binning can not be done as none of the variables passed is continuous'
                cl.append(target)
                df = self.Train[cl].copy()
            woe = load_woe()
            pandas2ri.activate()
            print('pandas2ri is activated')
            self.entropy_based_bin = pandas2ri.ri2py(woe.entropy_based_bin(df))
            pandas2ri.deactivate()
            print('pandas2ri is deactivated')
            return self.entropy_based_bin

    def crt():
        pass
import pprint
import numpy as np
import pickle
import rpy2.robjects as robjects
import rpy2.robjects.numpy2ri
from rpy2.robjects.packages import importr
from rpy2.robjects.packages import STAP

rpy2.robjects.numpy2ri.activate()
Rsession = rpy2.robjects.r
TSclust = importr('TSclust')

mfunc = 'myasmatrix <- function(dobj){return(as.matrix(dobj))}'
myasmatrix = STAP(mfunc, "myasmatrix")

mfunc = 'saxconv <- function(x,asize){return(convert.to.SAX.symbol( x, alpha=asize ))}'
saxconv = STAP(mfunc, "saxconv")


def znorm(x):
    return (x - x.mean()) / x.std()


def compute_and_save_metrics_R(data, n, dname, dirname, njobs=40):
    # Create a SAX symbolic representation with alphabet size 10
    Xsax = [np.asarray(saxconv.saxconv(znorm(_x), 10)) for _x in data]
    Xsax = np.vstack(Xsax)
    # p
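
# --- Illustrative usage (not from the original source) ---------------------
# A small sketch of the SAX wrapper above on a synthetic series; it requires
# R with the TSclust package (as imported above), and the series length and
# alphabet size are arbitrary.
t = np.linspace(0, 4 * np.pi, 128)
series = np.sin(t) + 0.1 * np.random.randn(128)     # noisy sine wave
symbols = np.asarray(saxconv.saxconv(znorm(series), 10))
print(symbols[:20])   # integer symbols in 1..10, one per time point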
}

c.F4 <- function(data) {
    data <- ovo(data)
    aux <- mapply(function(d) {
        nrow(removing(d))/nrow(d)
    }, d=data)
    #aux <- mean(aux)
    return(aux)
}
"""

stringr_c = STAP(string, "stringr_c")
stringr_c._rpy2r.keys()


def my_evaluate(individual):
    dataFrame['label'] = individual
    robjects.globalenv['dataFrame'] = dataFrame
    fmla = Formula('label ~ .')

    ## -- linearity
    linearityVector = stringr_c.linearity_formula(fmla, dataFrame,
                                                  measures="L2",
                                                  summary="return")
    linearity = linearityVector.rx(1)
    fitness = abs(globalLinear - linearity[0][0])
class EvalAdmix():
    'Class for executing evalAdmix commands'

    def __init__(self, prefix, mc, funcs):
        self.prefix = prefix
        self.mc = mc
        self.mcOnly = False
        if (self.mc != "none"):
            self.mcOnly = True
        self.qfiles = dict()
        self.runs = dict()
        self.qfilePaths = dict()

        if (self.mcOnly == True):
            self.parseMC()

        # Import R functions
        self.utils = importr('utils')
        self.base = importr('base')
        self.grdevices = importr('grDevices')

        # Import the R plotting functions from evalAdmix
        with open(funcs, 'r') as f:
            string = f.read()
        self.myfunc = STAP(string, "myfunc")

    def parseMC(self):
        print("Parsing MC")
        with open(self.mc) as fh:
            newlist = fh.read().splitlines()
        print(newlist)

    def loadJson(self):
        self.loadQ()
        self.loadRuns()
        self.loadQfilePaths()

    def loadQ(self):
        qfn = self.prefix + ".qfiles.json"
        if os.path.isfile(qfn):
            with open(qfn) as fh:
                self.qfiles = json.load(fh)
        else:
            print("ERROR:", qfn, "does not exist.")
            print("Exiting program...")
            raise SystemExit

    def loadRuns(self):
        rfn = "cvRuns.json"
        if os.path.isfile(rfn):
            with open(rfn) as fh:
                self.runs = json.load(fh)
        else:
            print("ERROR:", rfn, "does not exist.")
            print("Exiting program...")
            raise SystemExit

    def loadQfilePaths(self):
        rfn = "qfilePaths.json"
        if os.path.isfile(rfn):
            with open(rfn) as fh:
                self.qfilePaths = json.load(fh)
        else:
            print("ERROR:", rfn, "does not exist.")
            print("Exiting program...")
            raise SystemExit

    def evalAdmix(self, minK, maxK, np):
        ks = range(int(minK), int(maxK) + 1)
        for k in ks:
            for qf in self.qfiles[str(k)]:
                print(qf)
                temp = qf.split(".")
                # Make the .P file name
                temp[-1] = "P"
                pf = ".".join(temp)
                # Make the output .corres file name
                temp[-1] = "corres"
                eAf = ".".join(temp)
                # Build the command for evalAdmix
                evalAdmix_str_com = "evalAdmix -plink " + self.prefix + \
                    " -fname " + pf + " -qname " + qf + " -o " + eAf + \
                    " -P " + str(np)
                call = SysCall(evalAdmix_str_com)
                call.run_program()

    def averageCorres(self, funcs):
        for k in self.runs:
            matrixList = list()
            print(k)
            for run in self.runs[k]:
                temp = run.split(".")
                temp[-1] = "corres"
                eAf = ".".join(temp)
                if (os.path.isfile(eAf)):
                    cor = self.base.as_matrix(self.utils.read_table(eAf))
                    matrixList.append(cor)
                else:
                    print("ERROR:", eAf, "does not exist.")
                    print("Exiting program...")
                    raise SystemExit
            # Sum the matrices in the list
            reducedList = self.base.Reduce('+', matrixList)
            # Divide by the number of elements in the list to get the mean
            cor = reducedList.ro / float(len(matrixList))

            # Get the average corres matrix for the major or minor cluster k
            q = self.parseClumpp(self.qfilePaths[k])

            # Check whether the object q is NoneType
            if q is None:
                print("")
                print("")
                print("ERROR from evaladmix.py:")
                print("Empty matrices (Python NoneType) were returned when "
                      "trying to create average matrices for Major/Minor clusters.")
                print("Check that the paths in qfilePaths.json are valid.")
                print("This error could occur if you have moved your admixture "
                      "run folder after running distructRerun.py.")
                print("Alternatively, if you are using the Docker container, this "
                      "could have occurred if you ran distructRerun.py on your own "
                      "system outside of the container.")
                print("")
                print("")
                raise SystemExit

            famf = self.prefix + ".fam"
            pop = self.base.as_matrix(self.utils.read_table(famf))
            # Uncomment the lines below to debug object types
            #print("Type for q is:")
            #print(type(q))
            #print(type(pop))
            #print(type(famf))
            output = k + ".png"
            ordr = self.myfunc.orderInds(pop=self.base.as_vector(pop.rx(True, 2)),
                                         q=q)
            title = k
            self.grdevices.png(file=output)
            try:
                self.myfunc.plotCorRes(cor_mat=cor,
                                       pop=self.base.as_vector(pop.rx(True, 2)),
                                       ord=ordr,
                                       title=title,
                                       max_z=0.1,
                                       min_z=-0.1)
            except rpy2.rinterface_lib.embedded.RRuntimeError:
                print("Error in R code (plotting functions) from evalAdmix.")
            self.grdevices.dev_off()

    def parseClumpp(self, f):
        if (os.path.isfile(f)):
            df = pandas.read_csv(f, delimiter=r"\s+", header=None,
                                 index_col=False)
            # Even though inplace=True is used in this context, operating on
            # the dataframe directly rather than assigning to a new variable
            # should prevent creation of a "NoneType"
            df.drop(df.columns[0:5], axis=1, inplace=True)
            # Uncomment the lines below for debugging
            #print("Type for df is:")
            #print(type(df))
            with localconverter(rpy2.robjects.default_converter +
                                pandas2ri.converter):
                Rdf = rpy2.robjects.conversion.py2rpy(df)
            # Uncomment the lines below for debugging
            #print("Type for Rdf is:")
            #print(type(Rdf))
            #print(Rdf)
            return Rdf

    def Rcode(self, funcs, minK, maxK):
        # Make the file names
        famf = self.prefix + ".fam"
        ks = range(int(minK), int(maxK) + 1)
        for k in ks:
            title = "K=" + str(k)
            for qf in self.qfiles[str(k)]:
                temp = qf.split(".")
                temp[-1] = "corres"
                eAf = ".".join(temp)
                output = eAf + ".png"
                # Read in the files
                pop = self.base.as_matrix(self.utils.read_table(famf))
                print(qf)
                q = self.utils.read_table(qf)
                cor = self.base.as_matrix(self.utils.read_table(eAf))
                # Run the plotting functions
                ordr = self.myfunc.orderInds(pop=self.base.as_vector(pop.rx(True, 2)),
                                             q=q)
                self.grdevices.png(file=output)
                try:
                    self.myfunc.plotCorRes(cor_mat=cor,
                                           pop=self.base.as_vector(pop.rx(True, 2)),
                                           ord=ordr,
                                           title=title,
                                           max_z=0.1,
                                           min_z=-0.1)
                except rpy2.rinterface_lib.embedded.RRuntimeError:
                    print("Something happened.")
                self.grdevices.dev_off()