class LocfitEstimator(BaseEstimator):
    """An R ``locfit`` estimator exposed through a scikit-learn-like interface.

    The R helper script is read once at construction time and wrapped as an
    anonymous rpy2 package. ``fit`` and ``predict`` delegate to the
    ``do_fit`` and ``safepredict`` functions called on that package.
    """

    def __init__(self):
        rfile = "/net/noble/vol2/home/katecook/proj/2015HiC-differential/src/call_locfit.R"
        # Context manager so the script's file handle is always closed.
        with open(rfile, 'r') as rfh:
            string = rfh.read()
        self.locfit = SignatureTranslatedAnonymousPackage(string, "locfit")

    def fit(self, X, y):
        """Fit the R model on ``X``/``y`` and store it in ``self.fit_``.

        Returns ``self`` so calls can be chained, as scikit-learn estimators
        conventionally do.
        """
        X_robj = robjects.FloatVector(X)
        y_robj = robjects.FloatVector(y)
        self.fit_ = self.locfit.do_fit(X_robj, y_robj)
        return self

    def predict(self, X):
        """Predict values for ``X``; the result has the same shape as ``X``.

        ``X`` is flattened before being handed to R and the prediction is
        reshaped back to ``X.shape``.
        """
        check_is_fitted(self, "fit_")
        shape = X.shape
        flat = X.flatten().T
        X_robj = robjects.FloatVector(flat)
        y_robj = self.locfit.safepredict(self.fit_, X_robj)
        y_ = np.array(y_robj).T.reshape(shape)
        logging.debug(y_)
        # NOTE(review): every call overwrites "y.txt" in the current working
        # directory; this looks like leftover debug output -- confirm.
        np.savetxt("y.txt", y_, delimiter='\t')
        return y_

    def score(self, X, y):
        """Return the R^2 score of the predictions for ``X`` against ``y``."""
        # ``safepredict`` is a function of the R package, not a method of this
        # class; the estimator's own prediction entry point is ``predict``.
        yest = self.predict(X)
        return r2_score(y, yest, sample_weight=None)
def run_hydrology(init_gwstorage, init_C, init_Nash, init_Qq, init_Qs,
                  climate_type):
    """Load ``WrappableRunIhacresGw.R`` and run its ``RunIhacresGw`` function.

    The script directory is ``CONFIG.paths['hydrological']`` when configured,
    otherwise the directory containing this module. The R function receives
    that directory (without trailing slash), its ``data`` sub-directory, and
    the initial-state arguments; ``init_Nash`` is converted to an R
    ``FloatVector``.

    Returns whatever ``RunIhacresGw`` returns.
    """
    if "hydrological" in CONFIG.paths:
        path = CONFIG.paths['hydrological']
    else:
        path = os.path.dirname(__file__)

    r_path = os.path.join(path, 'WrappableRunIhacresGw.R')
    with open(r_path) as r_file:
        # Import the .R file so its functions can be called from Python.
        string = r_file.read()
    IhacresGW = SignatureTranslatedAnonymousPackage(string, "IhacresGW")

    # The R function expects the working directory WITHOUT a trailing slash.
    # Normalising here gives the same result whether or not the configured
    # path ends with "/"; blindly chopping the last character and gluing
    # "data" straight onto the path only works when it does.
    workingdir = path.rstrip("/")
    datadir = workingdir + "/data"

    return IhacresGW.RunIhacresGw(workingdir, datadir, init_gwstorage, init_C,
                                  FloatVector(init_Nash), init_Qq, init_Qs,
                                  climate_type)
def generate_solutions_tables(self):
    """Read each sample's ABSOLUTE summary RData file into a pandas table.

    (Code from Adam.) For every row of ``self.data_table`` the RData file
    named by ``row['absolute_summary_data']`` is loaded through a small R
    helper, the ``mode.tab`` of its first ``segobj.list`` entry is converted
    to a ``pandas.DataFrame`` and stored in
    ``self.pp_modes_tables[row['pair_id']]``.
    """
    col_names = [
        'alpha', 'tau', 'AT', 'b', 'delta', 'LL', 'mode_curv', 'genome mass',
        'sigma.h.hat', 'theta.z.hat', 'sigma.A.hat', 'theta.Q.hat',
        'lambda.hat', 'theta.0', 'frac.het', 'SCNA_LL', 'entropy', 'Kar_LL',
        'WGD', 'combined_LL', 'SSNV_LL', 'SCNA_Theta_integral', 'dens'
    ]

    # Build R function to be used as a python package
    load_RData_func_str = """
        load_RData <- function(file_path) {
            load(file_path)
            head_name <- ls()[1]
            file_name <- names(`segobj.list`)[1]
            r_data <- `segobj.list`[[file_name]]$mode.res$mode.tab
            return(r_data)
        }
        """

    # Pack the function above as a package
    r_pack = SignatureTranslatedAnonymousPackage(load_RData_func_str, "r_pack")

    print('Generating absolute tables for ' + str(len(self.data_table)) +
          ' samples')

    pandas2ri.activate()
    try:
        for index, row in self.data_table.iterrows():
            # Progress report every 100 samples.
            if np.mod(index, 100) == 0:
                print(str(index) + '/' + str(len(self.data_table)))
            r_data = r_pack.load_RData(row['absolute_summary_data'])
            abs_table = pd.DataFrame(pandas2ri.ri2py(r_data),
                                     columns=col_names)
            self.pp_modes_tables[row['pair_id']] = abs_table
    finally:
        # Always undo the global conversion switch, even when a file fails
        # to load, so a failure here does not leak state to other rpy2 users.
        pandas2ri.deactivate()
def extract_strain_id(busco_result):
    """Return the strain ids whose BUSCO score is at least 95.

    ``busco_result`` is passed to R's ``read.table``. Rows with ``V2 >= 95``
    are kept; for every kept id containing ``_PB``, the id with ``_PB``
    stripped is dropped. The three entries "70-15", "HO_busco" and
    "PH42_busco" are then replaced by their project names, appended at the
    end of the list. ``list.remove`` raises ``ValueError`` if any of the
    three is absent.
    """
    R_extract_95_code = """
    extract_95 <- function(busco_result) {
        require(tidyverse)
        in_fl <- read.table(busco_result)
        in_fl_95 <- in_fl %>%
            filter(V2 >= 95)
        # print(in_fl_95$V1)
        pb_protein_id=in_fl_95[grep("_PB",in_fl_95$V1,ignore.case = F),1]
        need_remove=gsub("_PB","",pb_protein_id)
        in_fl_95_removed_pb=in_fl_95[!(in_fl_95$V1%in%need_remove),]
        # print(in_fl_95_removed_pb$V1)
        return(as.character(in_fl_95_removed_pb$V1))
    }
    """
    r_package = SignatureTranslatedAnonymousPackage(R_extract_95_code,
                                                    "R_extract_95")
    strain_95_list = list(r_package.extract_95(busco_result))

    # Drop the raw BUSCO labels first, then append the renamed entries.
    for raw_label in ("70-15", "HO_busco", "PH42_busco"):
        strain_95_list.remove(raw_label)
    strain_95_list.extend(
        ["magnaporthe_oryzae_70-15_8_proteins_T0", "HO", "PH42"])
    return strain_95_list
def call_r(df):
    '''
    Fit a linear regression in R on CSV-formatted text.

    Arguments:
        df: A string replicating a CSV file. The observations for the
            dependent variable MUST be in the FIRST COLUMN

    Returns:
        an rpy2 Robject float vector which stores the coefficients of the
        linear regression
    '''
    import os
    from io import StringIO

    import rpy2.rinterface as ri
    from rpy2.robjects import DataFrame
    from rpy2.robjects.packages import SignatureTranslatedAnonymousPackage

    ri.initr()

    file_like_obj = StringIO(df)
    constructor_dict = parser(file_like_obj)
    rpy2_dataframe = DataFrame(constructor_dict)

    # Build the path portably: a literal backslash separator only resolves on
    # Windows and "\l" is an invalid escape sequence in a normal string.
    script_path = os.path.join('regression_app', 'linear_modeler_function.R')
    with open(script_path) as f:
        r_source = f.read()  # do not shadow the builtin ``str``

    mod = SignatureTranslatedAnonymousPackage(r_source, 'mod')
    return mod.linear_modeler(rpy2_dataframe)
def remove_MGG_unpresent_Augustus(
        pav_orthofinder, MGG_unpresent_Augustus_unassianed_list_file_name,
        pav_orthofinder_1574):
    '''
    Remove from pav_orthofinder the 1574 entries that appear among the
    unassigned genes.

    input 1: pav_orthofinder
    input 2: MGG_unpresent_Augustus_unassianed_list
    output 1: pav_orthofinder_1574
    '''
    r_source = '''
    R_remove_MGG_unpresent_Augustus=function(
        pav_orthofinder_file_name,
        MGG_unpresent_Augustus_unassianed_list_file_name,
        pav_orthofinder_1574_file_name
    ){
        require(readxl)
        require(WriteXLS)
        require(dplyr)
        MGG_unpresent_Augustus_unassianed_list=read.table(MGG_unpresent_Augustus_unassianed_list_file_name)
        pav_orthofinder=read_xlsx(pav_orthofinder_file_name)
        pav_orthofinder_1574=pav_orthofinder %>%
            filter(!(protein_id %in% MGG_unpresent_Augustus_unassianed_list$V1))
        pav_orthofinder_1574=pav_orthofinder_1574[,-1]
        pav_orthofinder_1574=pav_orthofinder_1574[,c(1,158,2:157)]
        WriteXLS::WriteXLS(pav_orthofinder_1574,pav_orthofinder_1574_file_name)
    }
    '''
    r_package = SignatureTranslatedAnonymousPackage(
        r_source, "R_remove_MGG_unpresent_Augustus")

    # R expects plain character paths, so every argument is stringified.
    r_args = [
        str(pav_orthofinder),
        str(MGG_unpresent_Augustus_unassianed_list_file_name),
        str(pav_orthofinder_1574),
    ]
    r_package.R_remove_MGG_unpresent_Augustus(*r_args)
def predictPrices(path):
    """Source ``rpy2/project/R/predict.R`` and call it as ``predictPrice``.

    The script is sourced through R; the string form of the first element of
    the ``source`` result is bound to the name ``predictPrice`` inside an
    anonymous package, and that function is called with ``path``.
    """
    from rpy2.robjects.packages import SignatureTranslatedAnonymousPackage

    script_location = os.path.abspath("rpy2/project/R/predict.R")
    sourced = robjects.r.source(script_location)

    # Re-wrap the sourced value under a fixed name so it can be called as an
    # attribute of the package object.
    r_definition = "predictPrice <- " + str(sourced[0])
    project = SignatureTranslatedAnonymousPackage(r_definition, "project")
    return project.predictPrice(path)
def convertRtoPandas(file_path):
    """Load the RData file at ``file_path`` and return it as a DataFrame.

    Uses the module-level ``load_RData_func_str`` R source and ``col_names``
    column labels.
    """
    # Pack the R loader function as a package
    r_pack = SignatureTranslatedAnonymousPackage(load_RData_func_str, "r_pack")

    pandas2ri.activate()
    try:
        r_data = r_pack.load_RData(file_path)
        py_data = pd.DataFrame(pandas2ri.ri2py(r_data), columns=col_names)
    finally:
        # Undo the global conversion switch even when loading fails.
        pandas2ri.deactivate()
    return py_data
def getFDRCorrection(pvals):
    """Return ``pvals`` adjusted with R's ``p.adjust(method = "fdr")``.

    ``pvals`` is converted to an R ``FloatVector``; the R result is returned
    as-is.
    """
    r_source = """
    fdr <- function(pvals) {
        return(p.adjust(pvals, method = "fdr"))
    }
    """
    r_stats = SignatureTranslatedAnonymousPackage(r_source, "rStats")
    return r_stats.fdr(robjects.FloatVector(pvals))
def r_cal(df):
    """Build (and plot) an R point pattern from the ``x``/``y`` columns of ``df``.

    The R helper calls ``ppp`` with the fixed window [-25, 25] x [-25, 25],
    plots the pattern and returns it. The R object is returned to the caller
    rather than being computed and thrown away.

    NOTE(review): ``pandas2ri.activate()`` is left switched on, as before --
    confirm whether callers rely on that.
    """
    string = """
    ptsPPP <- function(df) {
        X <- with(df, ppp(x, y, c(-25,25), c(-25,25)))
        plot(X)
        return(X)
    }
    """
    sp = SignatureTranslatedAnonymousPackage(string, "powerpack")
    pandas2ri.activate()
    r_points_df = pandas2ri.py2ri(df[["x", "y"]])
    return sp.ptsPPP(r_points_df)
def run(self):
    """Run the R calibration ("apprentissage") step and update the GUI.

    Loads ``fonctions_apprentissage.r``, asks for confirmation before
    overwriting existing output files, calls the R ``apprentissage`` function
    on ``self.calibrationInOutPath`` and reports the outcome through the
    status bar and dialogs. Any exception is shown via ``displayError``.
    """
    try:
        result = "0"
        # Grey out the calibration controls while the analysis runs.
        self.disalbledButtonsCalibration()
        self.ui.resetCalibrationPushButton.setDisabled(True)
        self.actualizeOutFiles()
        with open("fonctions_apprentissage.r", "r", encoding="utf-8") as apprentissageRopen:
            apprentissage = "".join(apprentissageRopen.readlines())
        apprentissage = SignatureTranslatedAnonymousPackage(apprentissage, "apprentissage")
        if args.debug:
            print("{}\n{}".format(apprentissage, dir(apprentissage)))

        # If the RData file already exists, remove the output files when the
        # user agrees, otherwise skip the analysis.
        if os.path.exists(self.calibrationFilesOut["RData"]):
            reply = QMessageBox.question(self, 'WARNING', 'File will be overwritten.\nDo you still want to proceed?', QMessageBox.Yes | QMessageBox.No, QMessageBox.No)
            if reply == QMessageBox.Yes:
                for key, path in self.calibrationFilesOut.items():
                    os.remove(path)
                reloadCalibration = True
            elif reply == QMessageBox.No:
                reloadCalibration = False
        else:
            reloadCalibration = True

        if reloadCalibration:
            self.ui.statusbar.showMessage(str("Running calibration, please waiting ...."), 9600)
            # The last three arguments must be, in this order, the (relative)
            # names of the background, leaf and lesion sub-directories.
            result, good = apprentissage.apprentissage(self.calibrationInOutPath, "background", "leaf", "lesion").r_repr().replace('"', '').replace("c(", "").replace(")", "").split(",")
            self.calibrationFileOpenLineEdit.setText(self.calibrationFilesOut["RData"])

        if result == "1" and os.path.exists(self.calibrationFilesOut["RData"]):
            print(result, self.calibrationFilesOut["RData"])
            self.infoDialogue(status="new")
            self.ui.statusbar.showMessage(str("FINISH, files were product on : %s" % self.calibrationInOutPath), 9600)
            self.ui.resetCalibrationPushButton.setEnabled(True)
        elif result == "0" and os.path.exists(self.calibrationFilesOut["RData"]):
            self.infoDialogue(status="already")
            print(result, self.calibrationFilesOut["RData"])
            self.calibrationFileOpenLineEdit.setText("")
            self.ui.resetCalibrationPushButton.setEnabled(True)
            self.resetLoadFolder()
            self.enableButtonsCalibration()
        elif result == "0" and not os.path.exists(self.calibrationFilesOut["RData"]):
            self.displayError(typeError="ERROR:", message="Error when running R code....")
            self.resetLoadFolder()
    except Exception as e:
        # ``str + Exception`` raises TypeError, which would replace the real
        # error with a crash inside this handler; convert explicitly.
        self.displayError(typeError="ERROR:", message="Error when running R code....\n" + str(e))
        self.resetLoadFolder()
        self.ui.resetCalibrationPushButton.setEnabled(True)
def run(self):
    """Run the R calibration on the folder containing the "leaf" directory.

    The output path is the parent of ``self.dicoFoldersCalibration["leaf"]``.
    Existing output files are only overwritten after user confirmation. The
    R ``apprentissage`` function is then called and the outcome is reported
    through the status bar and dialogs. Any exception is passed to
    ``displayError``.
    """
    try:
        result = "0"
        # Grey out the calibration controls while the analysis runs.
        self.disalbledButtonsCalibration()
        self.ui.resetCalibrationPushButton.setDisabled(True)

        with open("fonctions_apprentissage.r", "r", encoding="utf-8") as r_script:
            r_source = r_script.read()
        apprentissage = SignatureTranslatedAnonymousPackage(r_source, "apprentissage")

        # Output path = everything before the last "/" of the leaf folder;
        # basename = last component of that output path.
        leaf_folder = str(self.dicoFoldersCalibration["leaf"])
        self.CalibrationOutPath = leaf_folder.rpartition("/")[0]
        self.CalibrationBasename = self.CalibrationOutPath.rpartition("/")[2]
        self.actualizeOutFiles()

        if debug:
            print("{}\n{}".format(apprentissage, dir(apprentissage)))

        # If the RData file already exists, delete the outputs when the user
        # agrees, otherwise skip the analysis.
        if not os.path.exists(self.CalibrationFilesOut["RData"]):
            reloadCalibration = True
        else:
            reply = QMessageBox.question(
                self, 'Warning',
                'File will be overwritten.\nDo you still want to proceed?',
                QMessageBox.Yes | QMessageBox.No, QMessageBox.No)
            if reply == QMessageBox.Yes:
                for out_path in self.CalibrationFilesOut.values():
                    os.remove(out_path)
                reloadCalibration = True
            elif reply == QMessageBox.No:
                reloadCalibration = False

        if reloadCalibration:
            self.ui.statusbar.showMessage("Running Calibration, please waiting ....", 9600)
            # The R result is parsed from its textual representation,
            # presumably of the form c("<status>", "<value>").
            r_text = apprentissage.apprentissage(self.CalibrationOutPath).r_repr()
            r_text = r_text.replace('"', '').replace("c(", "").replace(")", "")
            result, good = r_text.split(",")
            self.calibrationFileOpenLineEdit.setText(self.CalibrationFilesOut["RData"])

        rdata_exists = os.path.exists(self.CalibrationFilesOut["RData"])
        if result == "1" and rdata_exists:
            print(result, self.CalibrationFilesOut["RData"])
            self.infoDialogue(status="new")
            self.ui.statusbar.showMessage(
                "FINISH, files were product on : %s" % self.CalibrationOutPath, 9600)
            self.ui.resetCalibrationPushButton.setEnabled(True)
        elif result == "0" and rdata_exists:
            self.infoDialogue(status="already")
            print(result, self.CalibrationFilesOut["RData"])
            self.calibrationFileOpenLineEdit.setText("")
            self.ui.resetCalibrationPushButton.setEnabled(True)
            self.resetLoadFolder()
            self.enableButtonsCalibration()
        elif result == "0" and not rdata_exists:
            self.displayError(error="Error when running R code....")
    except Exception as e:
        self.displayError(error=e)
def mkSimSem(variances=(0.5, 1.1, 0.8, 0.4, 0.4, 0.8, 0.8, 0.5, 0.6)):
    """Return an R package exposing ``mkdata(n)`` for a three-factor SEM.

    ``variances`` supplies the nine residual variances of ``y1`` .. ``y9``
    that are substituted into the lavaan population model; any sequence of
    exactly nine numbers is accepted (``%``-formatting raises ``TypeError``
    otherwise). The default is an immutable tuple so it cannot be altered
    between calls.
    """
    string = """
    library(lavaan)
    mkdata=function(n){
    popModel <- "
    f1 =~ 1*y1 + 0.6*y2 + 0.7*y3
    f2 =~ 1*y4 + 1.1*y5 + 0.9*y6
    f3 =~ 1*y7 + 1.2*y8 + 1.1*y9
    f1 ~~ 0.8*f1
    f2 ~~ 0.9*f2
    f3 ~~ 0.4*f3
    f1 ~~ 0.4*f2
    f1 ~~ 0.2*f3
    f2 ~~ 0.3*f3
    y1 ~~ %f*y1
    y2 ~~ %f*y2
    y3 ~~ %f*y3
    y4 ~~ %f*y4
    y5 ~~ %f*y5
    y6 ~~ %f*y6
    y7 ~~ %f*y7
    y8 ~~ %f*y8
    y9 ~~ %f*y9
    "
    analyzeModel <- "
    f1 =~ y1 + y2 + y3
    f2 =~ y4 + y5 + y6
    f3 =~ y7 + y8 + y9
    "
    s=simulateData(popModel,sample.nobs=n)
    return(s)
    }""" % tuple(variances)
    return SignatureTranslatedAnonymousPackage(string, "semsimdata")
def get_taxon_abundance_stacked_bar_plot():
    """Return the R function that writes a stacked bar plot of taxon abundance.

    The returned callable takes ``(data, plot_file_path, title, xlabel,
    ylabel)``. It orders genotypes by ``variant_allele_count``, takes the
    median abundance per (genotype, gene, taxon) and writes the plot to a PDF.
    """
    r_source = """
    require("dplyr")
    require("ggplot2")
    taxon_abundance_stacked_bar_plot <- function(data, plot_file_path, title, xlabel, ylabel) {
        temp <- data[order(data$variant_allele_count),] #sort by variant_allele_count
        temp$genotype <- factor(temp$genotype,levels=unique(temp$genotype)) #use reordered genotypes as levels

        #creates a new data frame with median abundance from each combo
        result <- temp %>%
            group_by(genotype, gene, taxon) %>%
            summarize(medianAbundance = median(abundance))

        #If you want the heights of the bars to represent values in the data,
        #use stat="identity" and map a value to the y aesthetic.
        pdf(plot_file_path, width=8, height=4)
        ap <- ggplot(data=result, aes(x=genotype,y=medianAbundance,fill=taxon)) + geom_bar(stat='identity') + ggtitle(title)
        ap <- ap + labs(x=xlabel, y=ylabel)
        ap <- ap + theme(legend.direction = 'vertical', legend.position = 'bottom')
        ap <- ap + guides(fill = guide_legend(reverse = TRUE))
        print(ap)
        dev.off()
    }
    """
    plot_package = SignatureTranslatedAnonymousPackage(r_source, 'pck')
    return plot_package.taxon_abundance_stacked_bar_plot
def get_taxon_abundance_box_plot():
    """Return the R function that writes a box plot of taxon abundance.

    The returned callable takes ``(data, plot_file_path, title, xlabel,
    ylabel)``. It orders genotypes by ``variant_allele_count`` and writes a
    jittered box plot of abundance per genotype to a PDF.
    """
    r_source = """
    require("dplyr")
    require("ggplot2")
    taxon_abundance_box_plot <- function(data, plot_file_path, title, xlabel, ylabel) {
        temp <- data[order(data$variant_allele_count),] #sort by variant_allele_count
        temp$genotype <- factor(temp$genotype,levels=unique(temp$genotype)) #use reordered genotypes as levels

        pdf(plot_file_path)
        ap <- ggplot(data=temp, aes(x=genotype,y=abundance) )
        ap <- ap + geom_boxplot()
        ap <- ap + ggtitle(title)
        ap <- ap + labs(x=xlabel, y=ylabel)
        ap <- ap + geom_jitter(position=position_jitter(w=0.1))
        print(ap)
        dev.off()
    }
    """
    plot_package = SignatureTranslatedAnonymousPackage(r_source, 'pck')
    return plot_package.taxon_abundance_box_plot
def set_minus_cut(start_point, pav_df_file_name, result_path):
    """Split a presence/absence table on the ``start_point`` column via R.

    The R helper reads the xlsx file, separates rows where ``start_point`` is
    1 ("minus" part) from rows where it is 0 ("add" part), sorts the column
    sums of the add part and writes several text/xlsx outputs whose names are
    built by pasting ``result_path`` in front of a fixed file name pattern.
    Nothing is returned.
    """
    r_source = '''
    Cut=function(start_point,pav_df_file_name,result_path){
        require(readxl)
        require(WriteXLS)
        require(tidyverse)
        pav_df=read_xlsx(pav_df_file_name)
        gene_is=pav_df %>%
            filter((!!sym(start_point))==1) %>% # filter(`70-15`)==1
            select("protein_id")
        minus_part=pav_df %>%
            filter((!!sym(start_point))==1) %>%
            column_to_rownames("protein_id")
        # pav_df_colsum=colSums(minus_part_num)
        # pav_df_colsum_sort=sort(pav_df_colsum)
        add_part=pav_df %>%
            filter((!!sym(start_point))==0) %>%
            column_to_rownames("protein_id")
        # add_part=add_part %>%
        #     column_to_rownames(pav_df_raw$...2)
        add_part_num=sapply(add_part[2:157], function(x) as.numeric(x))
        pav_df_colsum=colSums(add_part_num)
        pav_df_colsum_sort=sort(pav_df_colsum)
        write.table(attributes(pav_df_colsum_sort),paste(result_path,sprintf("set_minus_sort_protein_id_%s.txt", start_point),sep = ""),append = F,quote = F,row.names = F,col.names = F)
        write.table(pav_df_colsum_sort,paste(result_path,sprintf("set_minus_sort_protein_id_num_%s.txt", start_point),sep = ""),append = F,quote = F,row.names = T,col.names = F)
        WriteXLS::WriteXLS(
            minus_part,
            paste(result_path,sprintf("set_minus_minus_%s.xlsx", start_point),sep = ""),
            col.names = T,
            row.names = T
        )
        WriteXLS::WriteXLS(
            add_part,
            paste(result_path,sprintf("set_minus_add_%s.xlsx", start_point),sep = ""),
            col.names = T,
            row.names = T
        )
        write.table(gene_is,paste(result_path,sprintf("set_minus_gene_id_%s.txt", start_point),sep = ""),append = F,quote = F,row.names = F,col.names = F)
    }
    '''
    cutter = SignatureTranslatedAnonymousPackage(r_source, "R_set_minus_cut")
    # Only the table path is stringified; the other two are passed through.
    table_path = str(pav_df_file_name)
    cutter.Cut(start_point, table_path, result_path)
def run_mccoil(barcode_file_lines, maf_file_lines=None, verbose=False):
    """Run the categorical McCOIL R routine on a set of barcodes.

    Args:
        barcode_file_lines: lines of a barcode file, parsed by
            ``Barcode.SetOfBarcodes.readBarcodeFileLines``.
        maf_file_lines: optional lines of a MAF file; when falsy the minor
            allele frequencies are computed from the barcodes.
        verbose: print the zygosity matrix and its R data frame.

    Returns:
        ``{'mafs': zip(sites, maf_predictions),
        'cois': zip(samples, coi_predictions)}`` on success; ``[]`` when the
        barcodes fail validation and ``{}`` when the MAFs fail validation
        (the error message is printed in both cases).
    """
    ## init barcode set
    barcode_set = Barcode.SetOfBarcodes()
    barcode_set.readBarcodeFileLines(barcode_file_lines)

    # validate
    is_valid_barcode_set, err_msg = barcode_set.validate()
    if not is_valid_barcode_set:
        print(err_msg)
        return ([])

    ## init mafs
    if not maf_file_lines:
        mafs = barcode_set.computeMAFFromBarcodes(1)
    else:
        mafs = MAF.MAF()
        mafs.readMAFFileLines(maf_file_lines)
    # Built for both branches so the R call below always has its ``P``
    # argument, whichever way the MAFs were obtained.
    mafs_R_vector = robjects.Vector(mafs.minor_allele_freqs())

    # validate
    is_valid_mafs, err_msg = mafs.validate()
    if not is_valid_mafs:
        print(err_msg)
        return ({})

    ## compute zygosity matrix, then convert to R DataFrame
    zygosity_matrix = barcode_set.to_zygosity_matrix(mafs,
                                                     header=True,
                                                     index=True)
    if verbose:
        print(zygosity_matrix)
    data = to_R_zygosity_df(zygosity_matrix)
    if verbose:
        print(data)

    ## import MCCOIL (context manager so the script handle is closed)
    with open(mccoil_R, 'r') as mccoil_file:
        mccoil_R_code = mccoil_file.read()
    mccoil = SignatureTranslatedAnonymousPackage(mccoil_R_code, 'mccoil')

    ## compute result
    result = mccoil.McCOIL_categorical(data, P=mafs_R_vector)

    ## get sites/samples
    sites, samples = list(result[-2]), list(result[-1])

    ## get maf/coi predictions, which map 1-to-1 sites/samples
    # (i.e. maf_prediction[i] is prediction for site[i])
    maf_predictions, coi_predictions = list(result[6]), list(result[5])

    return ({
        'mafs': zip(sites, maf_predictions),
        'cois': zip(samples, coi_predictions)
    })
def screw_around():
    """Scratch experiment: pass a numpy array to R and call R functions.

    Prints R's ``pi`` in a few forms, builds a 5 x 10 array holding 0..49 in
    row-major order, converts it to an R data.frame and runs two small R
    functions on it through an anonymous package.
    """
    r_pi = robj.r['pi']
    print(r_pi)
    print(r_pi + 2)
    print(r_pi[0])
    print(r_pi[0] + 2)

    # Fake binned array: consecutive values filled row by row.
    nrow = 5
    ncol = 10
    binned = np.arange(nrow * ncol, dtype="float64").reshape(nrow, ncol)

    # Get the binned array into an R data.frame.
    print(numpy2ri(binned))
    rdf = robj.r['data.frame'](numpy2ri(binned), code="ID1000")

    # Now see if R can use this dataframe.
    myRcode = """
    square <- function(rdf) {
        myv = rdf$X2 + rdf$X3
        return(myv)
    }
    doit <- function() {
        source("/srv/scratch/carolyn/Dengue_code/Rtest_rpy.R")
        run_test_wrap(3)
    }
    """
    print("wwwwah")
    powerpack = SignatureTranslatedAnonymousPackage(myRcode, "powerpack")
    print(powerpack._rpy2r.keys())  # reveals the functions within powerpack
    print(powerpack.square(rdf))    # runs "square" found in powerpack
    print(powerpack.doit())
def get_arima_rsi(prices):
    """Return the ARIMA(0,0,0) coefficient and RSI of ``prices`` from R.

    ``prices`` is wrapped in a DataFrame and handed to an R helper together
    with its length. The helper returns ``c(arima$coef, rsi)`` as an array,
    which is passed back unchanged.
    """
    price_frame = pd.DataFrame(prices)
    pandas2ri.activate()

    r_source = """
    calculate <- function(x, size=100){
        x <- na.omit(x)
        library(TTR)
        library(stats)
        x <- ts(x)
        f <- function(m) class(try(solve(m),silent=T))=="matrix"
        if(f(x)){
            x[50] = x[50] + 2
        }
        arima <- arima(x, c(0,0,0))
        rsi <- RSI(x, size-1)[size]
        list <-c(arima$coef, rsi)
        return(as.array(list))
    }"""
    r_package = SignatureTranslatedAnonymousPackage(r_source, "calculate")
    return r_package.calculate(price_frame, len(price_frame))
def __init__(self, r_filename):
    """Load the R script at ``r_filename`` and keep it as ``self.r``.

    The source is wrapped as an anonymous package and, afterwards, also
    evaluated through ``robjects.r``.
    """
    # Read in the R file and initialise the data engine.
    with open(r_filename, "r") as script:
        r_code = script.read()

    package = SignatureTranslatedAnonymousPackage(r_code, "r")
    robjects.r(r_code)
    self.r = package
def load_r_file(filename, namespace):
    """Return the R package for ``namespace``, loading ``filename`` on first use.

    Loaded packages are cached in the module-level ``r_namespaces`` mapping.
    On a cache miss numpy-to-R conversion is activated, ``filename`` is
    resolved under ``PROJECT_DIR/r_src/forAndrej`` unless it already contains
    ``PROJECT_DIR``, and the file is wrapped as an anonymous package.
    """
    if namespace in r_namespaces:
        return r_namespaces[namespace]

    import rpy2.robjects.numpy2ri
    rpy2.robjects.numpy2ri.activate()

    if PROJECT_DIR not in filename:
        filename = os.path.join(PROJECT_DIR, 'r_src', 'forAndrej', filename)

    with open(filename, 'r') as r_script:
        r_code = r_script.read()

    r_namespaces[namespace] = SignatureTranslatedAnonymousPackage(
        r_code, namespace)
    return r_namespaces[namespace]
def arxiv_crawl(crawling_list,
                limit=None,
                batchsize=100,
                submission_range=None,
                update_range=None,
                delay=None):
    """
    This is a python wrapper for the aRxiv "arxiv_search" function.

    If submission_range or update_range are given, the results are filtered
    according to the date ranges.

    NOTE(review): the body visible here only creates the working folder,
    configures logging and loads the R script; it neither uses the filter
    parameters nor returns anything -- confirm against the full source.

    :param crawling_list: The subcategories to crawl. NOT "stat" -> USE "stat.AP" etc...
    :type crawling_list: dict of lists.
    :param limit: Max number of results to return.
    :type limit: int.
    :param batchsize: Number of queries per request.
    :type batchsize: int.
    :param submission_range: The range of submission dates.
    :type submission_range: Tuple (start,end).
    :param update_range: The range of last-update dates.
    :type update_range: Tuple (start,end).

    :returns: The created folder
    """
    # Timestamp of the crawl start, used to name the working folder.
    ts_start = time.time()
    started_at = datetime.datetime.fromtimestamp(ts_start)
    timestamp = started_at.strftime('%Y-%m-%d_%H-%M-%S')

    # Create folder structure
    working_folder = base_directory + timestamp
    for folder in (working_folder, working_folder + "/temp_files"):
        os.makedirs(folder)

    # Setup logging
    logging.config.dictConfig(logging_confdict(working_folder, __name__))
    arxiv_logger = logging.getLogger(__name__)

    arxiv_logger.info("Starting new crawl for {}".format(str(crawling_list)))
    arxiv_logger.info("Created new folder: <<" + working_folder + ">>")

    # Load R-scripts; any failure here aborts the program.
    arxiv_logger.debug("Loading R-Scripts ...")
    try:
        with open('../r_scripts/arxiv.R', 'r') as f:
            r_source = f.read()
        arxiv_crawler = SignatureTranslatedAnonymousPackage(
            r_source, "arxiv_crawler")
    except Exception:
        arxiv_logger.exception("Error while loading R-Scripts.")
        sys.exit('Could not load R-Scripts!')
def xml2df(url):
    """Parse the XML at ``url`` in R and return it as a pandas DataFrame.

    The R helper parses the document, converts it to a list and row-binds
    the flattened entries into a data.frame, which is then converted with
    ``pandas2ri.ri2py_dataframe``.
    """
    from rpy2.robjects import pandas2ri
    from rpy2.robjects.packages import SignatureTranslatedAnonymousPackage

    r_source = """
    require(XML)
    require(plyr)

    getXML <- function(x) {
        xmlfile <- xmlTreeParse(x)
        temp = xmlToList(xmlfile, addAttributes = F)
        df <- ldply(temp, .fun=function(x) {data.frame(t(unlist(x)))})
        return(df)
    }
    """
    xml_package = SignatureTranslatedAnonymousPackage(r_source, "test")

    # Convert the R data.frame into a pandas one.
    r_frame = xml_package.getXML(url)
    return pandas2ri.ri2py_dataframe(r_frame)
def file_to_anonymous_package(
        file: str) -> SignatureTranslatedAnonymousPackage:
    """
    Read an R source file and wrap it as an anonymous rpy2 package.

    The package is named after the file's base name without its extension
    (``some/dir/file.R`` -> ``file``).
    Returns the R package as an object.
    The name of the package is accessible by package.__rname__ as str
    """
    base_name = os.path.basename(file)
    package_name, _extension = os.path.splitext(base_name)

    with open(file, "r") as source_file:
        r_source = source_file.read()

    return SignatureTranslatedAnonymousPackage(r_source, name=package_name)
def main():
    """Bin LCMS data from mzML files and run the R prediction code on it.

    Builds a (files) x (1 id column + rt*mz bins) array, fills one row per
    input file (first 1000 files only), converts it to an R data.frame and
    calls ``run_predictions_wrap`` from the sourced R script with
    ``lcms_run=2``.
    """
    filenames, outdir = parse_arguments()  # filenames will be a list
    os.chdir(outdir)  # change pwd to output directory
    start_time_overall = time.time()

    def minutes_elapsed():
        # Minutes since main() started.
        return (time.time() - start_time_overall) / 60.

    rt_grid_size = 50
    mz_grid_size = 50
    n_files = len(filenames)
    log_statement("Number of mzml patient files: {}".format(n_files))

    # Empty array to be filled with LCMS values for the R prediction;
    # dimension is (# mzml files) by (# rt/mz bins + 1 for patient ID).
    intensities = np.zeros((n_files, rt_grid_size * mz_grid_size), dtype=float)
    patient_ids = np.zeros((n_files, 1), dtype='a6')  # a6: 6-char string
    respD = np.hstack((patient_ids, intensities))

    # Fill the array; only the first 1000 files are processed.
    for filecount, filename in enumerate(filenames):
        if filecount >= 1000:
            break
        respD = fill_row_of_lcms_matrix(respD, rt_grid_size, mz_grid_size,
                                        filecount, filename)

    # Two spaces after the colon reproduce the separator that a two-argument
    # print statement inserts after a string ending in a space.
    print("\n Data to use in R:  {}".format(respD[0:5, 0:20]))
    log_statement("Time till beginning of R section: {} minutes".format(
        minutes_elapsed()))

    # Convert numpy array into a data.frame recognized by R.
    Rdf = robj.r['data.frame'](numpy2ri(respD))

    # Use this dataframe in the R prediction code.
    myRcode = """
    doR <- function(python_respD, lcms_run) {
        source("/srv/scratch/carolyn/Dengue_code/prediction_with_LCMS_from_python.R")
        run_predictions_wrap(python_respD, lcms_run)
    }
    """
    Rpack = SignatureTranslatedAnonymousPackage(myRcode, "Rpack")
    print(Rpack.doR(Rdf, lcms_run=2))  # run the function doR, found in Rpack

    log_statement("Total execution time: {} minutes".format(
        minutes_elapsed()))
def main():
    """Run the R breakout-detection function on the fuel data and print it.

    Reads the first column of ``fuel_data.csv`` as floats, loads
    ``breakout_function.R`` as an anonymous package and calls its ``Detect``
    function with the configured parameters.
    """
    # Read the first CSV column into a list of floats. Blank lines (which
    # csv.reader yields as empty rows) are skipped instead of raising
    # IndexError.
    with open('breakout_detection_wraper/fuel_data.csv', 'r') as csvfile:
        mydata = [float(row[0]) for row in csv.reader(csvfile) if row]

    # Parameters configuring the breakout detection algorithm
    minsize = 30
    method = 'multi'
    degree = 1

    # Read the R source verbatim. Joining readlines() with os.linesep would
    # insert an extra line break after every line, because each line already
    # ends with its own newline.
    with open('breakout_detection_wraper/breakout_function.R') as code:
        rcode = code.read()

    # Create the wrapper as an anonymous package signature
    wrapper = SignatureTranslatedAnonymousPackage(rcode, "breakout_function")

    # Execute the method from the wrapper
    result = wrapper.Detect(FloatVector(mydata), minsize, method, degree)

    # Print the result returned from the R function
    print(result)
def crossref_lookup(working_folder,
                    index,
                    authors,
                    titles,
                    submitted,
                    num_threads=1):
    """Look up DOIs for the given documents using worker threads.

    Every (index, author, title, date) tuple is queued for
    ``CrossrefAPIThread`` workers (``num_threads`` of them); their output is
    consumed by one ``ProcessingThread``. Author strings are truncated to
    their first 15 "|"-separated entries. Returns the list of items the
    processing thread put on its result queue.
    """
    # Load r-scripts
    print("\nLoading R-Scripts ...")
    with open('../r_scripts/doi_lookup.R', 'r') as f:
        r_source = f.read()
    doi_lookuper = SignatureTranslatedAnonymousPackage(r_source, "doi_lookuper")

    cr_input_queue = Queue.Queue()
    cr_to_process = Queue.Queue()
    process_to_result = Queue.Queue()

    doc_count = 0
    for idx, author, title, date in zip(index, authors, titles, submitted):
        # Keep at most the first 15 authors; splitting and re-joining leaves
        # shorter author strings unchanged.
        author = "|".join(author.split("|")[:15])
        cr_input_queue.put((idx, author, title, date))
        doc_count += 1

    process_thread = ProcessingThread(working_folder, cr_to_process,
                                      process_to_result, doc_count)

    print("\nStarting crossref crawl process...")
    crossref_threads = []
    for _ in range(num_threads):
        worker = CrossrefAPIThread(cr_input_queue, cr_to_process, doi_lookuper)
        worker.start()
        crossref_threads.append(worker)
    process_thread.start()

    # Set every worker's event first, then wait for all of them.
    for worker in crossref_threads:
        worker.event.set()
    for worker in crossref_threads:
        worker.join()

    process_thread.event.set()
    process_thread.join()

    # Drain whatever the processing thread produced.
    results = []
    while not process_to_result.empty():
        results.append(process_to_result.get())
    return results
def __init__(self):
    # Build every R-backed helper used by this object. Each
    # ``self.func(name, source)`` call pairs an R-side name with one of the
    # class's R source constants and stores the result as an attribute.
    # NOTE(review): ``self.func``, ``self.calc`` and ``self.pd_active`` are
    # defined elsewhere; presumably ``func`` returns a Python callable for the
    # R function -- confirm before relying on that.
    self.pd_active()
    self._dotty = self.func('py_dotted_data', self.DOTTED_DATA)
    self.pdata = self.func('py_packdata', self.PACKDATA)
    # The result of ``func`` is called immediately here, so ``available``
    # holds that call's return value rather than the function itself.
    self.available = self.func('py_avail',self.AVAIL)()
    self.installed = self.calc('installed.packages')
    self._histlines = self.func('py_draw_histlines', self.HISTLINES)
    self._lines = self.func('py_draw_lines', self.LINES)
    self._hist = self.func('py_draw_hist', self.HIST)
    self._pareto = self.func('py_draw_pareto', self.PARETO)
    self._sleaf = self.func('py_draw_sleaf', self.SLEAF)
    self._csv = self.func('py_r_csvread', self.CSV)
    # NOTE(review): the next two reuse the name 'py_r_csvread' with different
    # sources (TAPPLY, CLOSEST); looks like a copy-paste of the line above --
    # confirm whether ``func`` tolerates duplicate names.
    self._tapply = self.func('py_r_csvread', self.TAPPLY)
    self._closest = self.func('py_r_csvread', self.CLOSEST)
    self._jupyter_opt = self.func('py_r_jupyter', self.JUPYTER_OPT)
    self._readtab = self.func('py_r_readtab', self.READTAB)
    self._mul = self.func('py_r_mul', self.MUL)
    self._exp = self.func('py_r_exp', self.EXP)
    self._div = self.func('py_r_div', self.DIV)
    self._add = self.func('py_r_add', self.ADD)
    self._sub = self.func('py_r_sub', self.SUB)
    self._str = self.func('py_r_str', self.STR)
    self._cond = self.func('py_r_cond', self.COND)
    self._lmplot = self.func('py_r_lmplot', self.LMPLOT)
    self._plot = self.func('py_r_plot', self.PLOT)
    self._splot = self.func('py_r_splot', self.SPLOT)
    self._samps = self.func('py_sample_size', self.SAMP_SIZE)
    self._sample = self.func('py_sample', self.SAMPLE)
    self._randomsample = self.func('py_randomsample', self.RANDOMSAMPLE)
    self._column_ext = self.func('py_column_extract', self.COLUMN_EXTRACT)
    self._serror = self.func('py_serror_samp', self.SERROR_SAMP)
    self._pretty = self.func('py_pretty_frame', self.PRETTY_FRAME )
    self._show_row = self.func('py_show_row', self.SHOW_ROW)
    # NOTE(review): 'py_show_row' is reused here with SOME_NUMBERS -- same
    # duplicate-name question as above.
    self._some_nums = self.func('py_show_row', self.SOME_NUMBERS)
    self._packs = self.func('py_packs', self.PACKS)
    self._help = self.func('py_help', self.HELP)
    self._contents = self.func('py_contents', self.PACK_CONTENTS)
    # Remaining R helpers are loaded together as one anonymous package.
    self.anon = SignatureTranslatedAnonymousPackage(self.ANON_PACK, "anon_pack")
def individual(ind, individual):
    """Render the detail page for one ERV record, generating its images.

    The record is looked up by ``repName`` (``ind``) and ``genoStart``
    (``individual``). Unless the chromosome is "chrUn", two PNGs are produced
    by the R functions ``localization`` and ``zoom`` from
    ``cariofunctions.R``; for "chrUn" a fixed placeholder image is used and
    no zoom image is set.

    NOTE(review): ``fetchone()`` returns None when nothing matches, in which
    case the subscripting below raises TypeError -- confirm whether callers
    guarantee a match.
    """
    cur = con.cursor()
    cur.execute("select * FROM ERV WHERE repName=? AND genoStart=?",
                (ind, individual))
    # Only one result is expected, so a single fetch is enough.
    ind = cur.fetchone()

    # Read the R script and expose its functions as attributes of ``imgs``.
    with open('cariofunctions.R', 'r') as f:
        r_source = f.read()
    imgs = STAP(r_source, "imgs")

    # karyoploteR recognises chromosome names like chr1, chr2, ...; the first
    # five characters of the stored name (minus underscores) give that form.
    c = ind["genoName"][:5].replace("_", "")
    start = int(ind["genoStart"])
    end = int(ind["genoEnd"])

    if c == "chrUn":
        # Unknown chromosome (some database entries): show an alternative
        # image and no zoomed view.
        pngs = "static/img/un.png"
        zpngs = ""
    else:
        # Name the output images and place them in the static folder.
        pngs = "static/img/" + c + str(start) + ".png"
        zpngs = "static/img/z" + c + str(start) + ".png"
        # Call the R functions from the script to draw both images.
        imgs.localization(c, start, end, pngs)
        imgs.zoom(c, start, end, zpngs)

    return render_template('individual.html', ind=ind, pngs=pngs, zpngs=zpngs)
def filter_pan_id(pan_id_file_name, length_table_file,
                  filtered_pan_id_file_name):
    '''
    Keep only the pan ids whose length-table value (column V2) exceeds 20.

    input 1: pan_id_file_name
    input 2: length_table_file
    output 1: filtered_pan_id_file_name (tab-separated, no header)
    '''
    r_source = '''
    filter_pan_id=function(pan_id_file_name,length_table_file,filtered_pan_id_file_name){
        require(dplyr)
        pan_id=read.table(pan_id_file_name)
        length_table=read.table(length_table_file)
        length_table_filter=length_table %>%
            filter(V2>20)
        pan_id_filter=merge(pan_id,length_table_filter,by.x = 2,by.y = 1,all.y = T)
        pan_id_filter=pan_id_filter[,-3]
        pan_id_filter=pan_id_filter[,c(2,1)]
        write.table(pan_id_filter,filtered_pan_id_file_name,sep = "\t",quote = F,row.names = F,col.names = F)
    }
    '''
    r_package = SignatureTranslatedAnonymousPackage(r_source,
                                                    "R_filter_pan_id")
    run_filter = r_package.filter_pan_id
    run_filter(pan_id_file_name, length_table_file, filtered_pan_id_file_name)
def MGG_unpresent_Augustus_locate_in_pav_orthofinder(
        MGG_unpresent_Augustus_list_file_name, pav_orthofinder_file_name,
        gene_protein_mapping_table_file_name,
        pav_MGG_unpresent_Augustus_file_name,
        orthofinder_unassianed_tsv_file_name,
        MGG_unpresent_Augustus_unassianed_list_file_name):
    '''
    Locate the listed genes in an OrthoFinder PAV table (done in R).

    The embedded R function:
      * reads the gene list and the gene/protein mapping with ``read.table``
        and the PAV table with ``read_xlsx`` (dropping its first column);
      * merges list and mapping on their first columns and keeps the PAV rows
        whose ``protein_id`` occurs in column ``V2`` of that merge;
      * moves column 158 to the front and writes the rows with ``WriteXLS``;
      * intersects the kept ``protein_id`` values with the ``70-15_protein``
        column of the unassigned TSV and writes that list as plain text.

    input 1: MGG_unpresent_Augustus_list_file_name
    input 2: pav_orthofinder_file_name
    input 3: gene_protein_mapping_table_file_name
    input 4: orthofinder_unassianed_tsv_file_name
    output 1: pav_MGG_unpresent_Augustus_file_name
    output 2: MGG_unpresent_Augustus_unassianed_list_file_name
    '''
    r_source = '''
MGG_unpresent_Augustus_locate_in_pav_orthofinder=function(MGG_unpresent_Augustus_list_file_name,pav_orthofinder_file_name,gene_protein_mapping_table_file_name,pav_MGG_unpresent_Augustus_file_name,orthofinder_unassianed_tsv_file_name,MGG_unpresent_Augustus_unassianed_list_file_name){
    require(readxl)
    require(WriteXLS)
    require(dplyr)
    MGG_unpresent_Augustus_list=read.table(MGG_unpresent_Augustus_list_file_name,stringsAsFactors = F)
    pav_orthofinde=read_xlsx(pav_orthofinder_file_name)
    pav_orthofinde=pav_orthofinde[,-1]
    gene_protein_mapping_table=read.table(gene_protein_mapping_table_file_name)
    pav_orthofinde_protein=merge(MGG_unpresent_Augustus_list,gene_protein_mapping_table,by.x = 1,by.y = 1)
    pav_MGG_unpresent_Augustus =pav_orthofinde %>% filter(protein_id %in% pav_orthofinde_protein$V2)
    pav_MGG_unpresent_Augustus=pav_MGG_unpresent_Augustus[,c(158,1:157)]
    WriteXLS::WriteXLS(pav_MGG_unpresent_Augustus,pav_MGG_unpresent_Augustus_file_name)
    orthofinder_unassianed=read.table(orthofinder_unassianed_tsv_file_name,sep = "\t",header = T,check.names = F)
    MGG_unpresent_Augustus_unassianed_list=intersect(pav_MGG_unpresent_Augustus$protein_id,orthofinder_unassianed$`70-15_protein`)
    write.table(MGG_unpresent_Augustus_unassianed_list,MGG_unpresent_Augustus_unassianed_list_file_name,quote = F,row.names = F,col.names = F)
}
'''
    # Wrap the R source as an anonymous package and run its only function.
    r_package = SignatureTranslatedAnonymousPackage(
        r_source, "R_MGG_unpresent_Augustus_locate_in_pav_orthofinder")
    file_args = (
        MGG_unpresent_Augustus_list_file_name,
        pav_orthofinder_file_name,
        gene_protein_mapping_table_file_name,
        pav_MGG_unpresent_Augustus_file_name,
        orthofinder_unassianed_tsv_file_name,
        MGG_unpresent_Augustus_unassianed_list_file_name,
    )
    r_package.MGG_unpresent_Augustus_locate_in_pav_orthofinder(*file_args)
def fit_pogs(self, X, y, tau = 0.5):
    """
    Using POGS for quantile regression

    The R routine ``qr_pogs`` (taken from ``qr_pogs().qr_pogs_def``) is
    called with *X*, *y* and *tau*; only that call is timed.

    NOTE: an ``"Intercept"`` column of ones is added to *X* in place, so the
    caller's frame is modified.

    :param X: independent variables (must support column assignment and
        expose ``.columns``)
    :param y: response variable
    :param tau: quantile
    :return: dict with ``"Coefficients"`` (column name -> fitted value, taken
        positionally from the R result) and ``"Time (s)"`` (wall-clock
        seconds spent in the R call)
    """
    X["Intercept"] = 1
    qr_pogs_model = qr_pogs()
    pb = SignatureTranslatedAnonymousPackage(qr_pogs_model.qr_pogs_def, "powerpack")
    t0 = time.time()
    ret = pb.qr_pogs(X=X, y=y, tau=tau)
    t1 = time.time()
    # Materialise the column names once instead of rebuilding the list on
    # every loop iteration.
    columns = list(X.columns.values)
    coefficients = {column: ret[i] for i, column in enumerate(columns)}
    return {"Coefficients": coefficients, "Time (s)": t1 - t0}
def matchit(outcome, treatment, data, method='nearest', distance='glm', replace=False):
    """Estimate an average treatment effect with R's MatchIt package.

    *data* is written to ``data.csv`` in the current working directory and
    read back by R.  ``matchit`` is run twice with ``estimand="ATE"``: once
    on the data as given and once with the treatment column flipped
    (``1 - treatment``).  For each run the outcome differences between the
    rows named in ``match.matrix`` and their matches are collected; the
    returned value is the mean of all those differences.

    :param outcome: name of the outcome column
    :param treatment: name of the treatment column
    :param data: frame holding outcome, treatment and covariates; every other
        column is used as a covariate in the matching formula
    :param method: passed to ``matchit(method = ...)``
    :param distance: accepted but currently not forwarded to R
    :param replace: whether matching is done with replacement
    :return: mean of the per-row effect estimates
    """
    replace_flag = 'TRUE' if replace else 'FALSE'
    data.to_csv('data.csv', index=False)

    # Formula "<treatment> ~ cov1+cov2+..." over every non-outcome,
    # non-treatment column.
    covariates = [str(cov) for cov in data.columns
                  if cov != outcome and cov != treatment]
    formula_cov = treatment + ' ~ ' + '+'.join(covariates)

    string = """
    library('MatchIt')
    data <- read.csv('data.csv')
    r <- matchit( %s,estimand="ATE", method = "%s", data = data, replace = %s)
    matrix <- r$match.matrix[,]
    names <- as.numeric(names(r$match.matrix[,]))
    mtch <- data[as.numeric(names(r$match.matrix[,])),]
    hh <- data[as.numeric(names(r$match.matrix[,])),'%s']- data[as.numeric(r$match.matrix[,]),'%s']

    data2 <- data
    data2$%s <- 1 - data2$%s
    r2 <- matchit( %s, estimand="ATE", method = "%s", data = data2, replace = %s)
    matrix2 <- r2$match.matrix[,]
    names2 <- as.numeric(names(r2$match.matrix[,]))
    mtch2 <- data2[as.numeric(names(r2$match.matrix[,])),]
    hh2 <- data2[as.numeric(r2$match.matrix[,]),'%s'] - data2[as.numeric(names(r2$match.matrix[,])),'%s']
    """ % (formula_cov, method, replace_flag, outcome, outcome,
           treatment, treatment,
           formula_cov, method, replace_flag, outcome, outcome)

    # Evaluating the package runs the script; its top-level variables become
    # attributes of the package object.
    psnn = SignatureTranslatedAnonymousPackage(string, "powerpack")
    t_hat = pd.DataFrame(np.hstack((np.array(psnn.hh), np.array(psnn.hh2))),
                         index=list(psnn.names.astype(int)) + list(psnn.names2.astype(int)),
                         columns=['CATE'])
    ate = np.mean(t_hat['CATE'])
    return ate
def Bing_cust(lam1, lam2, lam3):
    """Build an R package exposing ``rbingham(n, A)`` with fixed eigenvalues.

    The generated R function takes the eigenvectors of ``A`` from
    ``eigen(A)`` but uses ``c(lam1, lam2, lam3)`` as the eigenvalues, then
    draws ``n`` samples by rejection sampling and returns them multiplied by
    the eigenvector matrix (``tcrossprod(x, V)``).

    The three values are embedded in the R source with a round-trip float
    representation; the previous ``%f`` formatting silently truncated them to
    six decimal places.

    :param lam1: first eigenvalue (anything ``float()`` accepts)
    :param lam2: second eigenvalue
    :param lam3: third eigenvalue
    :return: the anonymous rpy2 package containing ``rbingham``
    """
    string_rbing1 = """
    rbingham <- function(n, A) {
      p <- ncol(A)  ## dimensionality of A
      eig <- eigen(A)
      V <- eig$vectors  ## eigenvectors
      lam <- c(%s,%s,%s)
      lam <- lam - lam[p]
      lam <- lam[-p]
      ### f.rbing part
      lam <- sort(lam, decreasing = TRUE)  ## sort the eigenvalues in desceding order
      nsamp <- 0
      X <- NULL
      lam.full <- c(lam, 0)
      qa <- length(lam.full)
      mu <- numeric(qa)
      sigacginv <- 1 + 2 * lam.full
      SigACG <- sqrt( 1 / ( 1 + 2 * lam.full ) )
      Ntry <- 0
      while (nsamp < n) {
        x.samp <- FALSE
        while ( !x.samp ) {
          yp <- rnorm(qa, mu, SigACG)
          y <- yp / sqrt( sum( yp^2 ) )
          lratio <-  - sum( y^2 * lam.full ) - qa/2 * log(qa) + 0.5 * (qa - 1) + qa/2 * log( sum(y^2 * sigacginv ) )
          if ( log(runif(1) ) < lratio) {
            X <- c(X, y)
            x.samp <- TRUE
            nsamp <- nsamp + 1
          }
          Ntry <- Ntry + 1
        }
      }
      x <- matrix(X, byrow = TRUE, ncol = qa)
      ## the avtry is the estimate of the M in rejection sampling
      ## 1/M is the probability of acceptance
      ## the x contains the simulated values
      tcrossprod(x, V) ## simulated data
    }
    """ % tuple(repr(float(lam)) for lam in (lam1, lam2, lam3))  # 200,0.05
    powerpack1 = SignatureTranslatedAnonymousPackage(string_rbing1, "powerpack")
    return powerpack1
def __init__(self, Rlibpath, search_terms, sites, language):
    """Prepare the R session used for job ad classification.

    Stores the search terms, sites and language, fixes the random forest
    parameters (threshold 0.3, split ratio 0.7), points R at the local
    library path *Rlibpath*, installs whichever of the needed R packages are
    missing, imports them all and finally loads the helper R functions.
    """
    import_r = robjects.packages.importr

    self._RFmodel = None
    self._language = language
    self._search_terms = search_terms
    self._sites = sites

    # Random forest model parameters.
    self._threshold = 0.3
    self._splitratio = 0.7

    # Base R assets; the CRAN mirror index was chosen arbitrarily.
    self._utils = import_r("utils")
    self._utils.chooseCRANmirror(ind=5)
    self._base = import_r("base")

    # Local library path.
    self._base._libPaths(Rlibpath)

    # Locale change needed for r_repr() output.
    robjects.r['Sys.setlocale']("LC_CTYPE", "C")

    # tm           - Framework for text mining.
    # SnowballC    - Stemming.
    # textcat      - Determining language of text.
    # randomForest - Random forest.
    # caTools      - Splitting data into training and test sets intelligently.
    # stringr      - String manipulation
    needed_packages = ["tm", "SnowballC", "textcat", "randomForest",
                       "caTools", "stringr"]

    # Install whatever is not available yet.
    to_install = []
    for package in needed_packages:
        if not robjects.packages.isinstalled(package):
            to_install.append(package)
    if to_install:
        self._utils.install_packages(StrVector(to_install))

    # Import every package, keyed by its name.
    self._loaded_packages = {}
    for package in needed_packages:
        self._loaded_packages[package] = import_r(package)

    # Load the R helper functions.
    self._R_functions = STAP(self.__R_functions_str, "R_functions")
import random

# based on http://stackoverflow.com/questions/15419740/calling-custom-functions-from-python-using-rpy2
# and http://rpy.sourceforge.net/rpy2/doc-dev/html/robjects_rpackages.html#importing-arbitrary-r-code-as-a-package
# A class cannot be imported with "import package.module.Class as X"; it has
# to be a from-import.
from rpy2.robjects.packages import SignatureTranslatedAnonymousPackage as STAP

# Load the R source and expose its functions as attributes of ``ces``.
with open('electricityDoubleCES.R', 'r') as f:
    string = f.read()
ces = STAP(string, "ces")

# Flat baseline prices and a randomly perturbed alternative, 24 values each.
p1 = [30] * 24
p2 = [random.gauss(mu=30, sigma=5) for x in range(24)]
Theta = dict(
    theta = -0.075,
    alpha = 0.2,
    sigma = 50,
    gamma = 0.01
)
# 24 load values; list(range(...)) keeps the concatenation valid on both
# Python 2 and Python 3.
loads = (list(range(600, 950+1, 50)) + [1000] * 5
         + list(range(1100, 800-1, -100)) + [750] + [700] * 3
         + [650, 650, 600])

# rpy2 translates the dot in R names to an underscore, so the R function
# ``double.ces`` is reached as ``ces.double_ces``.
print(ces.double_ces(p1, Theta, loads, 30))  # should return baseline loads
print(ces.double_ces(p2, Theta, loads, 30))  # example with random prices
# change in welfare from p1 to p2 (both terms must go through ``ces``)
print(ces.cs(1000, p2, Theta, loads, 30) - ces.cs(1000, p1, Theta, loads, 30))
from rpy2.robjects.packages import SignatureTranslatedAnonymousPackage, importr

# Import R's "base" package.
base = importr('base')

# Read the R learning functions, echo the source, then wrap it as an
# anonymous package (the name is rebound from the source text to the package).
with open("fonctions_apprentissage.r", "r", encoding="utf-8") as apprentissageRopen:
    apprentissage = apprentissageRopen.read()
print(apprentissage)
apprentissage = SignatureTranslatedAnonymousPackage(apprentissage, "apprentissage")

# Sample folder handed to the R ``apprentissage`` function.
path_sample = "/media/sebastien/Bayer/ScriptsSEB/scripts/GUI/EBimage/AnalyseImagesV2/Samples/5583"

# Show what the package exposes, then the result of the R call.
print(dir(apprentissage))
print(apprentissage.apprentissage(path_sample))
def main(): filenames, outdir = parse_arguments() #filenames will be a list os.chdir(outdir) #change pwd to output directory #print "filenames", filenames # will need to get intensity_2D_binned into R data.frame # intensity_2D_binned was created in my_cython_functions.pyx as follows: # cdef np.ndarray[np.float_t, ndim=2] my2Da # my2Da = np.zeros((rt_grid_size, mz_grid_size)) """ #create fake binned array nrow = 5 ncol = 10 counter = 0 binned = np.zeros((nrow, ncol), dtype="float64") for row in xrange(nrow): for col in xrange(ncol): binned[row, col] = counter counter += 1 ### Option 1 ### #turn each binned array into one row of what will be an R data.frame # then add ID to each row and combine these binned arrays nele = nrow*ncol rbinned1 = np.reshape(binned, nele) rbinned1c = np.hstack(( np.array(["ID1001"]), rbinned1 )) #concatenate also works rbinned2 = np.reshape(binned, nele) rbinned2c = np.hstack(( np.array(["ID1002"]), rbinned2 )) stacked = np.vstack((rbinned1c, rbinned2c)) #print stacked #print stacked.shape #2 by 51 rrdf1 = robj.r['data.frame'](numpy2ri(stacked)) #print rrdf1 ### Option 2 ### #build empty array that is 5 (# mzml files) by 51 (# rt/mz bins + 1 for patient ID) # fill each row with the binned data floatD = np.zeros((5,nrow*ncol), dtype="float64") strD = np.zeros((5,1), dtype='a6') #a6 is the dtype for a 6 character string respD = np.hstack((strD, floatD)) print respD.shape for filecount, filename in enumerate(filenames): if filecount<2: respD = build_row_of_lcms_matrix( binned, respD, nrow, ncol, filecount, filename) #print respD df2 = robj.r['data.frame'](numpy2ri(respD)) print df2 # now see if we can get R to use this dataframe myRcode = """ doR <- function(python_respD, lcms_run) { source("/srv/scratch/carolyn/Dengue_code/prediction_with_LCMS_from_python.R") run_predictions_wrap(python_respD, lcms_run) } """ #Rpack = SignatureTranslatedAnonymousPackage(myRcode, "Rpack") #print Rpack._rpy2r.keys() #to reveal the functions within powerpack 
#3print Rpack.doR(df2, 1) #to run the function found in powerpack # now see if we can get R to use this dataframe myRcode = """ square <- function(rdf) { myv = rdf$X2 + rdf$X3 return(myv) } doit <- function(input) { source("/srv/scratch/carolyn/Dengue_code/Rtest_rpy.R") run_test_wrap(input) } """ print "wwwwah" powerpack = SignatureTranslatedAnonymousPackage(myRcode, "powerpack") #print powerpack._rpy2r.keys() #to reveal the functions within powerpack #print powerpack.square(df2) #to run the function "square" found in powerpack print powerpack.doit(df2)
def run(self):
    """Run the R ``apprentissage`` function on the calibration folder.

    Loads ``fonctions_apprentissage.r`` as an anonymous R package, calls its
    ``apprentissage`` function on the parent folder of
    ``self.dicoFoldersCallibration["leaf"]`` and splits the textual R result
    into ``result`` and ``pathRdataFile``.  When ``result`` is ``"1"`` the
    user is asked whether an existing file may be overwritten.  Any exception
    is passed to ``self.displayError``.
    """
    print("RUN")
    try:
        warning = ""    # warning text (not used by the active code below)
        val = 0         # error counter (not used by the active code below)
        txtInfo = ""
        # import R's "base" package
        base = importr('base')

        # Read the R source, echo it, then wrap it as a package; the name
        # ``apprentissage`` is rebound from the source text to the package.
        with open("fonctions_apprentissage.r", "r", encoding="utf-8") as apprentissageRopen:
            apprentissage = "".join(apprentissageRopen.readlines())
        print(apprentissage)
        apprentissage = SignatureTranslatedAnonymousPackage(apprentissage, "apprentissage")

        #path_sample = "/media/sebastien/Bayer/ScriptsSEB/scripts/GUI/EBimage/AnalyseImagesV2/Samples/5583"
        # Parent directory of the "leaf" folder (last path component dropped).
        path_sample = "/".join(str(self.dicoFoldersCallibration["leaf"]).split("/")[:-1])
        print(path_sample)
        print(dir(apprentissage))
        # The R result is parsed from its textual representation: quotes,
        # "c(" and ")" are stripped and the remainder is split on commas into
        # exactly two fields.
        result , pathRdataFile= apprentissage.apprentissage(path_sample).r_repr().replace('"','').replace("c(","").replace(")","").split(",")
        print(result, pathRdataFile)
        if result == "1":
            reply = QMessageBox.question(parent=self, title='Attention', text='File will be overwritten.\nDo you still want to proceed?', buttons=QMessageBox.Yes | QMessageBox.No, defaultButton=QMessageBox.No)
            # NOTE(review): the nesting of this if/else was reconstructed from
            # collapsed source -- confirm that the else belongs to the reply
            # check rather than to ``result == "1"``.
            if reply == QMessageBox.Yes:
                print("OK")
                self.callibrationFileOpenLineEdit.setText(pathRdataFile)
            else:
                print("BAD")

        # NOTE: a long block of disabled (commented-out) legacy code followed
        # here. It greyed out the UI controls so a job could not be relaunched,
        # created or cleaned the output folder, reordered the matrix file
        # according to the order-label file, and wrote out a generated R
        # script. None of it was executed.

    # if anything went wrong: report it through the error dialog
    except Exception as e:
        self.displayError(error = e)
class JobAdClassification:
    """Classification of job ads using R and rpy2.

    This class provides training of a machine learning model for
    recommendations for new job ads, and determination of languages of
    job ads.

    Arguments
    ---------
    Rlibpath : str
        Path to local R libraries.
    search_terms : list[str]
        All search terms used in job ad collections. Needed to include all
        factor levels in the machine learning model.
    sites : list[str]
        All job sites used in job ad collections. Needed to include all
        factor levels in the model.
    language : str
        Language of job ads / machine learning model. Currently only Finnish
        and English supported.
    """

    #Functions which were easier to implement in pure R than using rpy2.
    __R_functions_str = """
        cleanJobAds <- function(class_data, search_terms, sites) {
            # Cleans and transforms job ads.
            # More precisely:
            # - Joins title and description columns
            # - Removes rows with empty column(s), removes extra whitespaces
            # - Removes duplicates
            # - Adds all factor levels to search terms and sites
            #
            # Returns a data frame with only site, search
            #
            # Arguments:
            # class_data - dataframe with columns for site, title, description,
            #              searchterm and relevant.
            # search_terms - Character vector of all search terms used, needed for
            #                factor levels.
            # sites - Character vector of all job ad sites, needed for
            #         factor levels.

            class_data$description <- paste(class_data$title, class_data$description)
            class_data$title <- NULL

            #get rid of rows with empty column(s)
            for (col in colnames(class_data)) {
                class_data <- class_data[!(class_data[col] == ""),]
                class_data <- class_data[!is.na(class_data[col]),]
            }
            class_data <- unique(class_data)

            #get rid of extra whitespace in description
            class_data$description <- str_trim(class_data$description)
            class_data$description <- gsub("\\\\s+", " ", class_data$description)

            #assign proper factor levels
            class_data$site <- as.factor(class_data$site)
            levels(class_data$site) <- c(levels(class_data$site),
                                         sites[!(sites %in% levels(class_data$site))])
            class_data$searchterm <- as.factor(class_data$searchterm)
            levels(class_data$searchterm) <- c(levels(class_data$searchterm),
                                               search_terms[!(search_terms %in% levels(class_data$searchterm))])
            return(class_data)
        }

        createJoinDTM <- function(class_data, lang) {
            # Transforms and parses the description column for words.
            # Words are cleaned and stemmed, and finally added as columns
            # to the dataframe.
            #
            # Returns dataframe with columns for all parsed words in the description
            # column. The description column itself is removed.
            #
            # Arguments:
            # class_data - Dataframe of cleaned job ads using R_function cleanJobAds.
            # lang - Language of job ads, needed for stemming and removing
            #        stopwords.

            #create corpus from descriptions
            corpus <- Corpus(VectorSource(class_data$description))

            #cleaning and stemming operations applied to corpus
            corpus <- tm_map(corpus, tolower)
            corpus <- tm_map(corpus, PlainTextDocument) #need to convert after tolower
            corpus <- tm_map(corpus, removePunctuation)
            corpus <- tm_map(corpus, removeWords, stopwords(lang))
            corpus <- tm_map(corpus, stemDocument, lang)

            #create document term matrix and remove sparse terms
            dtm <- DocumentTermMatrix(corpus)
            dtm <- removeSparseTerms(dtm, 0.98)
            dtm <- as.data.frame(as.matrix(dtm))
            class_data <- cbind(class_data, dtm)
            class_data$description <- NULL
            colnames(class_data) <- make.names(colnames(class_data))
            return (class_data)
        }

        RFmodel <- function(train_data, cutoff) {
            # Trains random forest binary classification model using the provided
            # cutoffs.
            #
            # Returns model.
            #
            # Arguments:
            # train_data - Dataframe containing parsed words from job ads.
            # cutoff - Threshold for determining whether relevant or not.

            train_data$relevant <- as.factor(train_data$relevant)
            RFmodel <- randomForest(relevant ~ ., data=train_data, cutoff = cutoff)
            return(RFmodel)}

        RFpred <- function(RFmodel, test_data) {
            # Classifies job ads as relevant or not using provided model.
            #
            # Returns factor of classifications.
            #
            # Arguments:
            # RFmodel - Model to use.
            # test_data - Dataframe containing parsed words from job ads as columns.
            #
            return(predict(RFmodel, newdata=test_data))}

        splitTerms <- function(terms_data, bool) {
            #Helper function for splitting data into training and testing sets.
            return (terms_data == bool)
        }

        model_eval <- function(predictions, actual, thold, printb=0) {
            # Calculates and optionally prints characteristics of model.
            # Returns the following in a column vector:
            # accuracy, sensitivity, RMSE,
            # true positives, true negatives,
            # false positives, false negatives,
            # fscore
            #
            predictions <- as.numeric(predictions)
            actual <- as.numeric(actual)
            if (thold != -1) {
                preds <- predictions >= thold
            }
            #if factor levels are 2,1 instead of 0,1 (will fail if
            #actual levels are 0,1 but there are no 0 predictions)
            if (max(predictions) == 2 || min(predictions) == 1) {
                preds <- predictions-1
            }
            TP <- sum(actual + preds == 2)
            TN <- sum(actual + preds == 0)
            FP <- sum(actual - preds == -1)
            FN <- sum(actual - preds == 1)
            #accuracy
            acc <- (TP + TN) / (TP + TN + FP + FN)
            #sensitivity
            sens <- (TP / (TP + FN))
            #fscore
            fscore <- 2*TP/(2*TP+FP+FN)
            #error measure
            err <- sum((actual - preds)^2)
            if (printb == 1) {
                cat("Model characteristics:", "\n")
                cat("Accuracy", acc, "\n")
                cat("Sensitivity", sens, "\n")
                cat("Fscore", fscore, "\n")
                cat("Error (RMSE)", err, "\n")
            }
            return(c(acc, sens, err, TP, TN, FP, FN, fscore))
        }

        prepNewAds <- function(RFmodel, new_ads) {
            #Prepares new ads for classification by model.
            #Looks for words used by model and discards
            #words not in model.
            model_columns <- as.character(attr(RFmodel$terms, "variables"))
            new_ads <- new_ads[(names(new_ads) %in% model_columns)]
            for (col in model_columns[!(model_columns %in% names(new_ads))]) {
                new_ads[col] <- rep(0, nrow(new_ads))
            }
            return(new_ads)
        }

        saveFile <- function(object, filename) {
            save(object, file=filename)
        }
        """

    #columns needed for training model
    _train_columns = ["site", "searchterm", "title", "description", "relevant"]
    #columns needed for classifying new job ads
    _class_columns = ["id", "site", "searchterm", "title", "description"]

    def __init__(self, Rlibpath, search_terms, sites, language):
        """Set up the R session: library path, packages and helper functions.

        Missing R packages are installed through ``utils::install.packages``
        before all needed packages are imported.
        """
        self._RFmodel = None
        self._language = language
        self._search_terms = search_terms
        self._sites = sites

        #random forest model parameters
        self._threshold = 0.3
        self._splitratio = 0.7

        #base R assets
        self._utils = robjects.packages.importr("utils")
        self._utils.chooseCRANmirror(ind=5) #randomly chosen mirror
        self._base = robjects.packages.importr("base")

        #local library path
        self._base._libPaths(Rlibpath)

        #change locale to use utf-8 for r_repr()
        robjects.r['Sys.setlocale']("LC_CTYPE", "C")

        # tm           - Framework for text mining.
        # SnowballC    - Stemming.
        # textcat      - Determining language of text.
        #
        # randomForest - Random forest.
        # caTools      - Splitting data into training and test sets intelligently.
        # stringr      - String manipulation
        needed_packages = ["tm", "SnowballC", "textcat", "randomForest",
                           "caTools", "stringr"]

        #install packages
        to_install = [package for package in needed_packages
                      if not robjects.packages.isinstalled(package)]
        if to_install:
            self._utils.install_packages(StrVector(to_install))

        #load packages
        self._loaded_packages = [robjects.packages.importr(package)
                                 for package in needed_packages]
        self._loaded_packages = dict(zip(needed_packages, self._loaded_packages))

        #load R functions
        self._R_functions = STAP(self.__R_functions_str, "R_functions")

    def _remove_diacritics(self, string):
        """Removes all Swedish (Finnish) diacritics from a string.

        Values that are not ``str`` are returned unchanged.

        Arguments
        ----------
        string : str
            String to remove diacritics from.

        Returns
        ----------
        clean_string : str
            String without diacritics.
        """
        if isinstance(string, str):
            diacr = ["Ä", "ä", "Ö", "ö", "Å", "å"]
            replc = ["A", "a", "O", "o", "A", "a"]
            for accented, plain in zip(diacr, replc):
                string = string.replace(accented, plain)
        return string

    def _create_R_dataframe(self, job_ads, include_columns):
        """Converts job ads to R dataframe.

        The ``relevant`` column becomes an integer vector; every other column
        becomes a string vector wrapped in R's ``I()``.

        Arguments
        ----------
        job_ads : list[:class:`JobAd`]
            List of :class:`JobAd` instances.
        include_columns : list[str]
            Defines which columns are included in the dataframe.

        Returns
        ----------
        dataf : :class:`robjects.DataFrame`
            :class:`robjects.DataFrame` representing job ads.

        Raises
        ----------
        Exception
            If ``job_ads`` is empty.
        """
        if not job_ads:
            raise Exception("No job ads to convert to R dataframe.")

        #modify structure to type {column:[rows]}
        job_ads_dataf = {}
        for column in include_columns:
            values = [self._remove_diacritics(ad[column]) for ad in job_ads]
            if column == "relevant":
                job_ads_dataf[column] = IntVector(values)
            else:
                job_ads_dataf[column] = self._base.I(StrVector(values))

        return robjects.DataFrame(job_ads_dataf)

    def train_model(self, class_ads):
        """Trains a random forest model for classification of job ad relevance.
        Model is stored in the :class:`JobAdClassification` instance.

        Unless the split ratio is 1.0, part of the data is held out and the
        characteristics of the trained model on it are printed by the R
        helper ``model_eval``.

        Arguments
        ----------
        class_ads : list[:class:`JobAd`]
            List of :class:`JobAd` instances used to train model. Each
            instance should have site, searchterm, title, description and
            relevant defined.
        """
        ##parameters for training
        #typical value
        splitratio = self._splitratio
        #gave best F-score during parameter sweeping
        threshold = self._threshold

        #convert to dataframe and clean ads
        dataf = self._create_R_dataframe(class_ads, self._train_columns)
        dataf = self._R_functions.cleanJobAds(dataf, StrVector(self._search_terms),
                                              StrVector(self._sites))
        dataf = self._R_functions.createJoinDTM(dataf, self._language.lower())

        #create training and testing data sets
        if splitratio != 1.0:
            split = robjects.r['sample.split'](dataf.rx2('relevant'), splitratio)
            train = robjects.r['subset'](dataf, self._R_functions.splitTerms(split, 'TRUE'))
            test = robjects.r['subset'](dataf, self._R_functions.splitTerms(split, 'FALSE'))
        else:
            train = dataf

        #train model
        self._RFmodel = self._R_functions.RFmodel(train, FloatVector([1-threshold, threshold]))

        #test on testing set; model_eval is called for its printed report
        #only, its return value is not needed
        if splitratio != 1.0:
            pred = self._R_functions.RFpred(self._RFmodel, test)
            self._R_functions.model_eval(pred, test.rx2('relevant'), -1, 1)

    def save_model(self, filename):
        """Saves :class:`JobAdClassification` instance model to file for later use.

        Arguments
        ----------
        filename : str
            Name of file to save model in.
        """
        self._R_functions.saveFile(self._RFmodel, filename)

    def load_model(self, filename):
        """Loads random forest classification model from file. Model is stored in
        :class:`JobAdClassification` instance.

        Arguments
        ----------
        filename : str
            Name of file to load model from.
        """
        self._RFmodel = robjects.r['get'](robjects.r['load'](filename))

    def recommend_ads(self, job_ads):
        """Provides recommendations for ads using instance model.

        Arguments
        ----------
        job_ads : list[:class:`JobAd`]
            Each instance should have id, site, searchterm, title and
            description defined.

        Returns
        ----------
        results : list[:class:`JobAd`]
            Each instance has id and recommendation defined.
        """
        #convert to dataframe and clean ads
        dataf = self._create_R_dataframe(job_ads, self._class_columns)
        ids = dataf.rx2('id')
        dataf = self._R_functions.cleanJobAds(dataf, StrVector(self._search_terms),
                                              StrVector(self._sites))
        dataf = self._R_functions.createJoinDTM(dataf, self._language.lower())
        dataf = self._R_functions.prepNewAds(self._RFmodel, dataf)

        #classify ads
        pred = self._R_functions.RFpred(self._RFmodel, dataf)

        #combine predictions with ids; the prediction is shifted down by one
        #to give the recommendation value
        results = [JobAd.create({"id": ids[i], "recommendation": int(pred[i])-1})
                   for i in range(0, robjects.r['length'](ids)[0])]
        return results

    def _determine_lang(self, title, description):
        """Tries to determine which language a job ad is using the textcat package.
        Only differentiates between Finnish and English; returns English if
        another language is recognized.

        Arguments
        ----------
        title : str
            Title of job ad.
        description : str
            Description of job ad.

        Returns
        ----------
        language : str
            Determined language of job ad.
        """
        textcat = self._loaded_packages["textcat"].textcat
        language_both = textcat(" ".join([title, description])).r_repr().replace("\"", "")
        language_title = textcat(title).r_repr().replace("\"", "")
        language_descrip = textcat(description).r_repr().replace("\"", "")
        guesses = (language_both, language_title, language_descrip)

        #first direct hit wins: combined text, then title, then description
        for guess in guesses:
            if guess == "english" or guess == "finnish":
                return guess[0].upper() + guess[1:]

        #English job titles with Finnish text is sometimes mistaken
        #as danish, frisian or middle_frisian
        false_finnish = ["danish", "frisian", "middle_frisian"]
        if any(guess in false_finnish for guess in guesses):
            return "Finnish"
        return "English"

    def det_lang_ads(self, job_ads):
        """Attempts to determine language of job ads.

        Arguments
        ----------
        job_ads : list[:class:`JobAd`]
            List of :class:`JobAd` instances. Each instance should have id,
            title and description defined.

        Returns
        ----------
        results : list[dict]
            One plain dictionary per ad with the keys ``"id"`` and
            ``"language"``.
        """
        results = [{"id": ad["id"],
                    "language": self._determine_lang(ad["title"], ad["description"])}
                   for ad in job_ads]
        return results