def __init__(self,
             dataset: pd.DataFrame,
             predictset: pd.DataFrame = None,
             njobs: int = -1,
             test_size: float = 0.2,
             maxRuns: int = 1000,
             printFileStatistics: str = None,
             PreStdScaler: bool = True):
    """
    Initialisation of the Amadeus Framework.

    parameters:
     - dataset: a pandas data-frame containing the data to model.
     - predictset: a pandas data-frame, stored as-is on the instance
                [OPTIONAL, DEFAULT=None]. Presumably the data to predict
                on -- confirm against the callers.
     - njobs: integer handed to HPCTools.get_num_procs to determine the
                number of worker processes. According to the original
                notes: a positive value is the number of processes, a
                negative value means "number of physical cores times the
                absolute value of njobs". (default = -1)
                TODO: fix this for multi-cpu nodes.
     - test_size: fraction of the data to use as test-data for
                train_test_split. (default = 0.2, i.e. 20%)
     - maxRuns: the maximum number of runs allowed for averaging a model.
                (default = 1000)
     - printFileStatistics: filename to print statistics data.
                (default = None, i.e. standard out)
     - PreStdScaler: boolean indicating if the full data needs to be pulled
                through a standard scaler before train-test splitting.
                (default = True, noted by the author as the only way to get
                a useful pipeline for prediction)
    """
    # Imported locally, as in the rest of this code base.
    from HPCTools import get_num_procs

    # Identification of the framework.
    self.name = "Amadeus"
    self.longname = "Artificial intelligence and Machine learning frAmework for DEsigning Useful materialS"

    # Run configuration.
    self.num_workers = get_num_procs(njobs)
    self.test_split = test_size
    self.maxRuns = maxRuns
    self.printFileStatistics = printFileStatistics

    # Data handed in by the caller.
    self.dataset = dataset
    self.predictset = predictset
    self.preScale = PreStdScaler

    # Model dependent containers; they are filled later on, once the
    # models are known.
    self.Pipeline = {}  # one pipeline per model
    self.ModelFrame = {}  # depends on the model
    self.PredictFrame = {}  # depends on the model
    self.AverageModel = {}
    self.ModelList = None
def setAverageCoefficients(self, EnsembleData: TModelResults, setCI: bool):
    """
    Use the ensemble data to create an "average" Lasso model, and set the
    averaged "coefficients" in the current model.

    The intercept, coefficients and alpha of every ensemble member are read
    from ``EnsembleData.modelCoef`` and averaged. A new ``Lasso`` estimator
    is created with the mean alpha, and its ``intercept_`` and ``coef_``
    are overwritten with the mean values.

    parameters:
     - EnsembleData : the ensemble results. ``NData`` is the number of
                ensemble members; ``modelCoef[i]`` provides the entries
                'intercept_', 'coef_' and 'alpha', the value being stored
                at index [1] of each entry. 'coef_' must be 2D (its
                ``shape[1]`` is the number of coefficients) and 'alpha' is
                a string of which the 4th whitespace separated token is
                the numerical value.
     - setCI : if True, bootstrap confidence intervals (95%, BCa) of the
                intercept and of every coefficient are stored in self.CI
                under 'intercept_lo', 'intercept_hi', 'coef_lo', 'coef_hi'.

    Sets: self.coefUsage, self.bestAlpha, self.model, self.isAverage,
    self.hasCI, self.Quality and (optionally) entries of self.CI.
    """
    from sklearn.linear_model import Lasso
    import multiprocessing as mp
    from HPCTools import get_num_procs

    n_models = EnsembleData.NData
    n_coef = EnsembleData.modelCoef[0]['coef_'][1].shape[1]

    # 1. Calculate the average coefficients
    # 1.1. transform them to arrays
    intercept = np.zeros(n_models)
    coef = np.zeros((n_models, n_coef))
    alphas = np.zeros(n_models)
    for i in range(n_models):
        mcf = EnsembleData.modelCoef[i]
        # .item() extracts the single value explicitly; assigning a size-1
        # array to a scalar slot is deprecated since NumPy 1.25. It still
        # raises if more than one value is present.
        intercept[i] = np.asarray(mcf['intercept_'][1]).item()
        coef[i, :] = np.asarray(mcf['coef_'][1]).ravel()
        alphas[i] = float(mcf['alpha'][1].split()[3])

    # The %-fraction of non-zero versions of each coefficient
    self.coefUsage = 100.0 * (np.count_nonzero(coef, axis=0) / n_models)

    self.bestAlpha = np.mean(alphas, axis=0)
    # axis is the varying direction, so 0 means we average a column by
    # varying the row (i.e. over the ensemble members)
    mean_intercept = np.mean(intercept, axis=0)
    mean_coef = np.mean(coef, axis=0)
    print("MEANS=> Int=", mean_intercept, " COEF=", mean_coef)

    # 2. Set the model coefficients to these averaged values
    # The former ``normalize=False`` argument is dropped: False was the
    # default, and the parameter no longer exists in scikit-learn >= 1.2
    # (passing it raises a TypeError).
    self.model = Lasso(
        alpha=self.bestAlpha,
        fit_intercept=True,
        precompute=True  # precompute the Gram matrix
    )
    # sklearn refuses to predict with an estimator that was never fitted,
    # so fit once on throw-away data and overwrite intercept_ and coef_
    # afterwards. The dummy data has the same number of features as the
    # averaged model, otherwise the feature-count validation in predict
    # would reject the real data.
    dummy_features = np.eye(n_coef + 1, n_coef)
    dummy_targets = np.arange(n_coef + 1, dtype=float)
    self.model.fit(dummy_features, dummy_targets)
    self.model.intercept_ = mean_intercept
    self.model.coef_ = mean_coef
    self.isAverage = True
    self.hasCI = False

    if setCI:
        # 3. Calculate Confidence Interval using bootstrapping
        # & 4. Store the CI data
        ## For the intercept
        boot = TBootstrap(data=intercept, Func=np.mean)
        boot.NPbootstrap(n_iter=2000, Jackknife=True)
        avgm, avgp = boot.ConfidenceInterval(
            CItype="BCa", alpha=0.05, n_samples=2000)  # 95% confidence interval
        self.CI["intercept_lo"] = avgm
        self.CI["intercept_hi"] = avgp
        print("===BOOT INTERCEPT:", avgm, avgp)

        ## For the coefficients
        # Parallelisation at the level of a column.
        alpha = 0.05  # 95% confidence interval
        print("col-range=", n_coef)
        for col in range(n_coef):
            print("col", col, " coef[:,col]=", coef[:, col], " type=",
                  type(coef[:, col]))
        # 1. create our process pool
        pool = mp.Pool(processes=get_num_procs(-1))
        try:
            # 2. set drones to work
            drones = [
                pool.apply_async(Bootstrap_1Col,
                                 args=(col, coef[:, col], alpha))
                for col in range(n_coef)
            ]
            # 3. as we can not assume the cols to be produced in the
            #    correct order --> make it a dict
            ciDict = {}
            for drone in drones:
                col, avgm, avgp = drone.get()
                ciDict[col] = [avgm, avgp]
        finally:
            # 4. always shut the pool down, also when a drone raised
            pool.close()
            pool.join()
        # 5. and put them in the correct order in the lists
        self.CI["coef_lo"] = [ciDict[col][0] for col in range(n_coef)]
        self.CI["coef_hi"] = [ciDict[col][1] for col in range(n_coef)]
        self.hasCI = True

    # store the resulting coefficients in our wrapper tracker...and we are done
    self.setCoefficients()
    self.Quality = TModelQualityData(EData=EnsembleData)
def setAverageCoefficients(self, EnsembleData: TModelResults, setCI: bool):
    """
    Use the ensemble data to create an "average" model, and set the
    averaged "coefficients" in the current model (self.model).

    The intercept and coefficients of every ensemble member are read from
    ``EnsembleData.modelCoef``, averaged, and written to
    ``self.model.intercept_`` and ``self.model.coef_``.

    parameters:
     - EnsembleData : the ensemble results. ``NData`` is the number of
                ensemble members; ``modelCoef[i]`` provides the entries
                'intercept_' and 'coef_', the value being stored at index
                [1] of each entry. 'coef_' must be 2D (its ``shape[1]`` is
                the number of coefficients).
     - setCI : if True, bootstrap confidence intervals (95%, BCa) of the
                intercept and of every coefficient are stored in self.CI
                under 'intercept_lo', 'intercept_hi', 'coef_lo', 'coef_hi'.

    Sets: self.isAverage, self.hasCI, self.Quality and (optionally)
    entries of self.CI.
    """
    import multiprocessing as mp
    from HPCTools import get_num_procs

    n_models = EnsembleData.NData
    n_coef = EnsembleData.modelCoef[0]['coef_'][1].shape[1]

    # 1. Calculate the average coefficients
    # 1.1. transform them to arrays
    intercept = np.zeros(n_models)
    coef = np.zeros((n_models, n_coef))
    for i in range(n_models):
        mcf = EnsembleData.modelCoef[i]
        # .item() extracts the single value explicitly; assigning a size-1
        # array to a scalar slot is deprecated since NumPy 1.25. It still
        # raises if more than one value is present.
        intercept[i] = np.asarray(mcf['intercept_'][1]).item()
        coef[i, :] = np.asarray(mcf['coef_'][1]).ravel()
        print(i, ")", coef[i, :])

    # axis is the varying direction, so 0 means we average a column by
    # varying the row (i.e. over the ensemble members)
    mean_intercept = np.mean(intercept, axis=0)
    mean_coef = np.mean(coef, axis=0)
    print("MEANS=> Int=", mean_intercept, " COEF=", mean_coef)

    # 2. Set the model coefficients to these averaged values
    self.model.intercept_ = mean_intercept
    self.model.coef_ = mean_coef
    self.isAverage = True
    self.hasCI = False

    if setCI:
        # 3. Calculate Confidence Interval using bootstrapping
        # & 4. Store the CI data
        ## For the intercept
        boot = TBootstrap(data=intercept, Func=np.mean)
        boot.NPbootstrap(n_iter=2000, Jackknife=True)
        avgm, avgp = boot.ConfidenceInterval(
            CItype="BCa", alpha=0.05, n_samples=2000)  # 95% confidence interval
        self.CI["intercept_lo"] = avgm
        self.CI["intercept_hi"] = avgp

        ## For the coefficients
        # Parallelisation at the level of a column.
        alpha = 0.05  # 95% confidence interval
        # 1. create our process pool
        pool = mp.Pool(processes=get_num_procs(-1))
        try:
            # 2. set drones to work
            drones = [
                pool.apply_async(self._BootstrapAvg_1Col,
                                 args=(col, coef[:, col], alpha))
                for col in range(n_coef)
            ]
            # 3. as we can not assume the cols to be produced in the
            #    correct order --> make it a dict
            ciDict = {}
            for drone in drones:
                col, avgm, avgp = drone.get()
                ciDict[col] = [avgm, avgp]
        finally:
            # 4. always shut the pool down, also when a drone raised
            pool.close()
            pool.join()
        # 5. and put them in the correct order in the lists
        self.CI["coef_lo"] = [ciDict[col][0] for col in range(n_coef)]
        self.CI["coef_hi"] = [ciDict[col][1] for col in range(n_coef)]
        self.hasCI = True

    # store the resulting coefficients in our wrapper tracker...and we are done
    self.setCoefficients()
    self.Quality = TModelQualityData(EData=EnsembleData)
def setAverageCoefficients(self, EnsembleData: TModelResults, setCI: bool):
    """
    Use the ensemble data to create an "average" LS-SVM model, and set the
    averaged "coefficients" in the current model.

    In an LS-SVM every coefficient belongs to a support vector (a data
    point), and each ensemble member only contains a subset of them. The
    coefficients are therefore aligned on the data-point index before
    averaging; a data point missing from a member contributes a 0.

    NOTE: We assume
      1. that the support vectors (/features) were indexed with a sequence
         of INTEGER values starting at 0.
      2. the same sequence and order are used for all subsets drawn in the
         ensemble.
      3. that the largest value of the sequence is the size of the set of
         support vectors (-1, because counting starts at 0).

    parameters:
     - EnsembleData : the ensemble results. ``NData`` is the number of
                ensemble members; ``modelCoef[i]`` provides the entries
                'intercept_', 'coef_', 'support_', 'data_pt_index', 'sigma'
                and 'gamma', the value being stored at index [1] of each
                entry. 'sigma' and 'gamma' are strings of which the 4th
                whitespace separated token is the numerical value.
     - setCI : if True, bootstrap confidence intervals (95%, BCa) of the
                intercept and of every coefficient are stored in self.CI
                under 'intercept_lo', 'intercept_hi', 'coef_lo', 'coef_hi'.

    Sets: self.coefUsage, self.coefInEnsemble, self.bestGamma,
    self.bestSigma, self.model, self.isAverage, self.hasCI, self.Quality
    and (optionally) entries of self.CI.

    Raises ValueError if the ensemble contains no data-point indices.
    """
    from LSSVMRegression import LSSVMRegression
    import multiprocessing as mp
    from HPCTools import get_num_procs

    n_models = EnsembleData.NData

    # 0. Find out how many coefficients/data-points there are.
    #    ``unplaced`` doubles as the set of data points whose support
    #    vector still has to be copied into the sorted array.
    unplaced = set()
    for i in range(n_models):
        # flatten because nd-array
        unplaced.update(
            EnsembleData.modelCoef[i]['data_pt_index'][1].flatten())
    if not unplaced:
        raise ValueError(
            "setAverageCoefficients: the ensemble contains no data-point indices."
        )
    # max+1 instead of the set size: covers the case where some support
    # vectors are missing from the ensemble altogether
    SizeSet = max(unplaced) + 1
    # size of a support vector
    SizeSV = len(EnsembleData.modelCoef[0]['support_'][1][0])

    # 1. Calculate the average coefficients
    # 1.1. transform them to arrays
    presence = np.zeros(SizeSet)  # in how many members each data point occurs
    intercept = np.zeros(n_models)
    coef = np.zeros((n_models, SizeSet))
    support_vectors_sorted = np.zeros((SizeSet, SizeSV))
    sigmas = np.zeros(n_models)
    gammas = np.zeros(n_models)
    for i in range(n_models):
        mcf = EnsembleData.modelCoef[i]
        # .item() extracts the single value explicitly; assigning a size-1
        # array to a scalar slot is deprecated since NumPy 1.25. It still
        # raises if more than one value is present.
        intercept[i] = np.asarray(mcf['intercept_'][1]).item()
        sigmas[i] = float(mcf['sigma'][1].split()[3])
        gammas[i] = float(mcf['gamma'][1].split()[3])
        # coefficients are linked to "support vectors", so we need to make
        # sure we average the coefficients of the same support vector over
        # the different ensemble members
        coefar = np.asarray(mcf['coef_'][1]).flatten()
        idxar = mcf['data_pt_index'][1].flatten()
        if unplaced:
            SVar = mcf['support_'][1]
        for j in range(mcf['coef_'][1].shape[1]):
            idx = idxar[j]
            coef[i, idx] = coefar[j]
            presence[idx] += 1
            if idx in unplaced:
                # first time this data point is encountered: store its
                # support vector, and make sure this happens only once
                support_vectors_sorted[idx] = SVar[j]
                unplaced.remove(idx)

    # The %-fraction of non-zero versions of each coefficient
    self.coefUsage = 100.0 * (np.count_nonzero(coef, axis=0) / n_models)
    # The %-fraction of the ensemble in which the support vector is present
    # (upper bound for self.coefUsage)
    self.coefInEnsemble = 100.0 * (presence / n_models)

    self.bestGamma = np.mean(gammas, axis=0)
    self.bestSigma = np.mean(sigmas, axis=0)
    # axis is the varying direction, so 0 means we average a column by
    # varying the row (i.e. over the ensemble members)
    mean_intercept = np.mean(intercept, axis=0)
    mean_coef = np.mean(coef, axis=0)

    # 2. Set the model coefficients to these averaged values
    #    LS-SVM is under our full control, so the attributes can be set
    #    directly.
    ## --> FIRST: create the "model"
    self.model = LSSVMRegression(
        gamma=self.bestGamma,  # the first hyper-param of LS-SVM, for all kernels
        kernel=self.kernel,  # the kernel to be used, from the original init
        c=self.bestSigma,  # the scale factor in case of a poly kernel
        d=self.degree,  # maximum degree for a poly kernel
        sigma=self.bestSigma,  # the scale factor of the rbf kernel
    )
    ## --> now we set the average coefficients and support vectors
    self.model.set_attributes(intercept_=mean_intercept,
                              coef_=mean_coef,
                              support_=support_vectors_sorted)
    # make sure we know it is an average model
    self.isAverage = True
    self.hasCI = False

    if setCI:
        # 3. Calculate Confidence Interval using bootstrapping
        # & 4. Store the CI data
        ## For the intercept
        boot = TBootstrap(data=intercept, Func=np.mean)
        boot.NPbootstrap(n_iter=2000, Jackknife=True)
        avgm, avgp = boot.ConfidenceInterval(
            CItype="BCa", alpha=0.05, n_samples=2000)  # 95% confidence interval
        self.CI["intercept_lo"] = avgm
        self.CI["intercept_hi"] = avgp
        print("===BOOT INTERCEPT:", avgm, avgp)

        ## For the coefficients
        # Parallelisation at the level of a column.
        alpha = 0.05  # 95% confidence interval
        # 1. create our process pool
        pool = mp.Pool(processes=get_num_procs(-1))
        try:
            # 2. set drones to work
            drones = [
                pool.apply_async(Bootstrap_1Col,
                                 args=(col, coef[:, col], alpha))
                for col in range(SizeSet)
            ]
            # 3. as we can not assume the cols to be produced in the
            #    correct order --> make it a dict
            ciDict = {}
            for drone in drones:
                col, avgm, avgp = drone.get()
                ciDict[col] = [avgm, avgp]
        finally:
            # 4. always shut the pool down, also when a drone raised
            pool.close()
            pool.join()
        # 5. and put them in the correct order in the lists
        self.CI["coef_lo"] = [ciDict[col][0] for col in range(SizeSet)]
        self.CI["coef_hi"] = [ciDict[col][1] for col in range(SizeSet)]
        self.hasCI = True

    # store the resulting coefficients in our wrapper tracker...and we are done
    self.setCoefficients()
    self.Quality = TModelQualityData(EData=EnsembleData)
def RunPostProcess_MLpaper(basedata: str,
                           baseresult: str,
                           datasizes: list,
                           predictionData: pd.DataFrame,
                           NumDim: int,
                           NumSKDim: int,
                           theoryModel: list,
                           modelFunction,
                           fitFunction,
                           heatmap: bool = True,
                           n_procs: int = 1):
    """
    Post-process the per-datasize result files into plot-ready data files
    and (optionally) heatmaps.

    Files written in the current working directory (overwritten if present):
     - PlotResults_TRAINTEST.dat         : one line per datasize, as produced
                                           by getOneLineTrainTest.
     - PlotResults_RMSEonPredict1K.dat   : RMSE/MAE on the prediction set of
                                           the theory, average, full-set,
                                           best and worst models.
     - PlotResults_Model_coefficients.dat: intercept and coefficients of the
                                           average, full-set, best-RMSE and
                                           worst-RMSE models.

    parameters:
     - basedata : base-string of the datapoint-files; the file name is
                  basedata + str(datasize) + ".dat"
     - baseresult: base-string of the files containing the results per set;
                  the file name is baseresult + str(datasize) + ".dat"
     - datasizes: list of ints giving the size of the full datasets
     - predictionData: pandas-Frame containing the feature/target data of a
                  set of datapoints to predict. The last column is used as
                  the target, all other columns as features.
     - NumDim : the number of dimensions (number of heatmaps per datasize)
     - NumSKDim : the number of features in the sk-learn model
     - theoryModel: list of floats with the intercept [index 0], and
                  coefficients [indices 1:] of the artificial model. An
                  empty list disables the theory reference (reported as 0).
     - modelFunction: the "perfect" version of the function used to generate
                  the data; called as modelFunction(x=, intercept=, slope=)
     - fitFunction: the function used in ML to create the fit (handcoded,
                  not an sklearn type function); called as
                  fitFunction(x=, intercept=, slope=)
     - heatmap: bool indicating if heatmaps need to be generated...
                  DON'T USE FOR EXP DATA
     - n_procs: number of parallel processes to use (when calculating
                  bootstrap CI), passed through get_num_procs. Default = 1

    Layout of a row in a result file (as used below):
       0: run index          1: RMSE train        2: RMSE test
       3: MAE train          4: MAE test          5: avg RMSE LoO train
       6: 2sig RMSE LoO      7: avg RMSE 5-fold   8: 2sig RMSE 5-fold
       9: intercept         10..: coefficients
    """
    import math

    from HPCTools import get_num_procs

    def _parse_result_row(text):
        # One row of a result file: an integer index followed by floats.
        fields = text.split()
        return [int(fields[0])] + [float(v) for v in fields[1:]]

    def _format_model(intercept, slope):
        # Intercept followed by all coefficients, 7 decimals each.
        return f'{intercept:.7f}' + " " + (' '.join(['%.7f'] * len(slope))) % tuple(slope)

    print("START POST-PROCESS")
    print("===================")
    print("A. READING/COLLECTING DATA")
    print("---------------------------")
    # transform dataframe into array
    predictSet = np.array(predictionData.rename_axis('ID').values)

    # the datapoints (currently only parsed, which acts as a consistency
    # check of the datapoint-files)
    allDataSetsFeatures = {}
    allDataSetsTargets = {}
    for ds in datasizes:
        dfn = basedata + str(ds) + ".dat"  # reconstruct filename
        with open(dfn, "r") as dfile:
            dscheck = int(dfile.readline().replace("#", " "))
            if ds == dscheck:
                features = []
                targets = []
                for _ in range(ds):
                    data = dfile.readline().split()
                    # all but the last column are features...
                    features.append([float(v) for v in data[0:-1]])
                    # ...and the last column is the target
                    targets.append(float(data[-1]))
                allDataSetsFeatures[ds] = features
                allDataSetsTargets[ds] = targets
            else:
                print("ERROR: INCONSISTENT DATASIZES IN ", dfn, " ", ds,
                      " vs ", dscheck)

    # the RMSE's etc
    fullSetResults = {}
    ttsplitResults = {}
    for ds in datasizes:
        dfn = baseresult + str(ds) + ".dat"  # reconstruct filename
        with open(dfn, "r") as dfile:
            # header: "# <number of rows> <number of columns>"
            nruns, _ncols = tuple(
                int(i) for i in dfile.readline().replace("#", " ").split())
            nruns -= 1  # the first row holds the full-set result
            fullSetResults[ds] = _parse_result_row(dfile.readline())
            ttsplitResults[ds] = [
                _parse_result_row(dfile.readline()) for _ in range(nruns)
            ]

    print("B. Generating RMSE-curves ")
    print("---------------------------")
    ############################################################################
    ########## TRAIN-TEST RESULTS  #############################################
    ############################################################################
    headerstr = "".join([
        "# datasize RMSE TRAIN: avg CIlo CIhi min max ",
        "TEST: avg CIlo CIhi min max ",
        "MAE TRAIN: avg CIlo CIhi min max ",
        "MAE TEST : avg CIlo CIhi min max ",
        "avg-RMSE-LoO: avg CIlo CIhi min max ",
        "avg-RMSE-5CV: avg CIlo CIhi min max ",
        " RMSE-full-set \n",
    ])
    pltdat = "PlotResults_TRAINTEST.dat"

    # Parallelisation for sections performing bootstraps.
    # Parallelisation only at the highest level of a datasize, not a column,
    # this to keep overhead low, and deal with slowdowns due to a large
    # number of datasizes.
    # 1. create our process pool
    pool = mp.Pool(processes=get_num_procs(n_procs))
    try:
        # 2. set drones to work
        drones = [
            pool.apply_async(getOneLineTrainTest,
                             args=(ds, ttsplitResults,
                                   str(fullSetResults[ds][1])))
            for ds in datasizes
        ]
        # 3. as we can not assume the lines to be produced in the correct
        #    order, and the numbering is not linear --> make it a dict
        lineDict = {}
        for drone in drones:
            ds, line = drone.get()
            lineDict[ds] = line
    finally:
        # 4. always shut the pool down, also when a drone raised
        pool.close()
        pool.join()
    # 5. and now do the writing in an orderly fashion
    #    ("w" truncates an existing file)
    with open(pltdat, "w") as plttrainf:
        plttrainf.write(headerstr)
        for ds in datasizes:
            plttrainf.write(lineDict[ds])

    ############################################################################
    ########## ON PREDICTED DATA            ####################################
    ########## MODELCOEFFICIENTS            ####################################
    ############################################################################
    # How well do we predict the datapoints of the prediction set
    headerstr = (
        "# datasize RMSE: Theory Avg-model Full-Model Best-RMSE Worst-RMSE | MAE: Theory Avg-model Full-Model Best-MAE Worst-MAE \n "
    )
    pltdat = "PlotResults_RMSEonPredict1K.dat"
    # location to print model coefficients
    headermodel = (
        "datasetsize ) Avg: Intercept Coeff1..n | Full: Intercept Coeff1..n | best RMSE: Intercept Coeff1..n | worst RMSE: Intercept Coeff1..n \n "
    )
    modeldat = "PlotResults_Model_coefficients.dat"

    # The features as a list of rows (every row holds the features of one
    # datapoint) and the targets; both are independent of the datasize.
    x = list(predictSet[:, 0:-1])
    targets = predictSet[:, -1]

    with open(pltdat, "w") as plttrainf, open(modeldat, "w") as pltmodel:
        plttrainf.write(headerstr)
        pltmodel.write(headermodel)
        # loop over all datasetsizes
        for ds in datasizes:
            # needs to be a numpy array to do column slicing
            data = np.array(ttsplitResults[ds])

            # What is the error introduced due to the noise on our
            # theoretical model (this should be the best)
            if len(theoryModel) > 0:
                pred = modelFunction(x=x,
                                     intercept=theoryModel[0],
                                     slope=theoryModel[1:])
                rmseTheory = np.sqrt(mean_squared_error(targets, pred))
                maeTheory = mean_absolute_error(targets, pred)
            else:
                rmseTheory = 0
                maeTheory = 0

            # How well is the averaged model doing
            # (iterating the transpose gives the columns)
            a = [column.mean() for column in data[:, 10:].T]
            b = data[:, 9].mean()
            pred = fitFunction(x=x, intercept=b, slope=a)
            rmseAvg = np.sqrt(mean_squared_error(targets, pred))
            maeAvg = mean_absolute_error(targets, pred)
            linemodel = f'{ds:5}' + " ) " + _format_model(b, a) + " | "

            # How well does a model which used the full data-set
            # (train+test) perform?
            a = fullSetResults[ds][10:]
            b = fullSetResults[ds][9]
            pred = fitFunction(x=x, intercept=b, slope=a)
            rmseFull = np.sqrt(mean_squared_error(targets, pred))
            maeFull = mean_absolute_error(targets, pred)
            linemodel = linemodel + _format_model(b, a) + " | "

            # How well is the "best RMSE of test" model doing
            # (argmin/argmax return the first occurrence)
            run = int(np.argmin(data[:, 2]))
            a = data[run, 10:]
            b = data[run, 9]
            pred = fitFunction(x=x, intercept=b, slope=a)
            rmseBestRMSE = np.sqrt(mean_squared_error(targets, pred))
            linemodel = linemodel + _format_model(b, a) + " | "

            # How well is the "worst RMSE of test" model doing
            run = int(np.argmax(data[:, 2]))
            a = data[run, 10:]
            b = data[run, 9]
            pred = fitFunction(x=x, intercept=b, slope=a)
            rmseWorstRMSE = np.sqrt(mean_squared_error(targets, pred))
            linemodel = linemodel + _format_model(b, a) + " \n "

            # How well is the "best MAE of test" model doing
            run = int(np.argmin(data[:, 4]))
            pred = fitFunction(x=x,
                               intercept=data[run, 9],
                               slope=data[run, 10:])
            maeBestMAE = mean_absolute_error(targets, pred)

            # How well is the "worst MAE of test" model doing
            run = int(np.argmax(data[:, 4]))
            pred = fitFunction(x=x,
                               intercept=data[run, 9],
                               slope=data[run, 10:])
            maeWorstMAE = mean_absolute_error(targets, pred)

            # first column is the datasize
            line = (f'{ds:5}' + " " + f'{rmseTheory:.7f}' + " " +
                    f'{rmseAvg:.7f}' + " " + f'{rmseFull:.7f}' + " " +
                    f'{rmseBestRMSE:.7f}' + " " + f'{rmseWorstRMSE:.7f}' +
                    " | " + f'{maeTheory:.7f}' + " " + f'{maeAvg:.7f}' +
                    " " + f'{maeFull:.7f}' + " " + f'{maeBestMAE:.7f}' +
                    " " + f'{maeWorstMAE:.7f}' + " \n ")
            plttrainf.write(line)
            pltmodel.write(linemodel)

    ############################################################################
    ########## HEATMAPS  #######################################################
    ############################################################################
    print("C. Generating HEATMAPs ")
    print("---------------------------")
    if heatmap:
        width = 1.0
        dx = width * 0.01
        nx = int(width / dx) + 1
        ymin = np.amin(predictSet[:, -1])
        ymax = np.amax(predictSet[:, -1])
        height = ymax - ymin
        dy = dx
        ny = int(height / dy) + 1
        xval = [ix * dx for ix in range(nx)]
        NFeat = NumSKDim
        for dim in range(NumDim):
            dimi = NFeat - dim
            for ds in datasizes:
                grid = np.zeros((ny, nx))
                # needs to be a numpy array to do column slicing
                data = np.array(ttsplitResults[ds])
                num_rows, num_cols = data.shape
                for run in range(num_rows):
                    yr = fitFunction(x=xval,
                                     intercept=data[run, num_cols - NFeat - 1],
                                     slope=data[run, num_cols - dimi:])
                    # position in the grid, still as floats
                    yi = ((yr - ymin) / height) * (ny - 1)
                    for ix in range(nx):
                        # floor instead of int(): int() truncates towards
                        # zero, which would count values just below ymin
                        # (index between -1 and 0) in the bottom row.
                        yii = math.floor(yi[ix])
                        if 0 <= yii < ny:
                            grid[yii, ix] += 1.0
                maxval = np.amax(grid)
                if maxval == 0:
                    print("WARNING: MAXVAL=0...NOTING IN HEATMAP--> NOT GOOD")
                    maxval = 1
                grid = grid / maxval
                print("for ", ds, " the maxval= ", maxval)
                plotname = "Heatmap_Dim" + str(dimi) + "_" + str(ds)
                plotGrid(grid,
                         xmin=0.0,
                         xmax=1.0,
                         ymin=ymin,
                         ymax=ymax,
                         plotname=plotname)