def __init__(self,
                 dataset: pd.DataFrame,
                 predictset: pd.DataFrame = None,
                 njobs: int = -1,
                 test_size: float = 0.2,
                 maxRuns: int = 1000,
                 printFileStatistics: str = None,
                 PreStdScaler: bool = True):
        """
        Initialise the Amadeus framework.

        Only stores the settings; the model dependent containers are created
        empty and are expected to be filled later.

        parameters:
            - dataset: a pandas data-frame containing the data to model.
            - predictset: a pandas data-frame kept alongside the dataset,
                     presumably the data to predict (TODO confirm).
                     [OPTIONAL, DEFAULT=None]
            - njobs: value handed to HPCTools.get_num_procs to obtain the number
                     of worker processes. As documented for this framework: a
                     positive integer is the number of processes to use for
                     parallelisation, a negative value means the number of
                     physical cores times the absolute value of njobs.
                     (default = -1) TODO: fix this for multi-cpu nodes.
            - test_size: fraction of the data to use as test-data for
                     train_test_split. (default = 0.2, i.e. 20%)
            - maxRuns: the maximum number of runs allowed for averaging a
                     model. (default = 1000)
            - printFileStatistics: filename to print statistics data.
                     (default = None, i.e. standard out)
            - PreStdScaler: boolean indicating if the full data needs to be
                     pulled through a standard scaler before train-test
                     splitting. Default=True, as this is the only way to get a
                     useful pipeline for prediction.
        """
        from HPCTools import get_num_procs

        # identification of the framework
        self.name = "Amadeus"
        self.longname = "Artificial intelligence and Machine learning frAmework for DEsigning Useful materialS"

        # run settings
        self.num_workers = get_num_procs(njobs)
        self.maxRuns = maxRuns
        self.test_split = test_size
        self.preScale = PreStdScaler
        self.printFileStatistics = printFileStatistics

        # the data itself
        self.predictset = predictset
        self.dataset = dataset

        # model dependent objects: created empty here, one entry per model later on
        self.ModelList = None
        self.AverageModel = {}
        self.Pipeline = {}  #one pipeline per model
        self.PredictFrame = {}  #this frame depends on the model
        self.ModelFrame = {}  #this frame depends on the model
    def setAverageCoefficients(self, EnsembleData: TModelResults, setCI: bool):
        """
        Use the ensemble data to create an "average" Lasso model, and set the
        "coefficients" in the current model.

        For every run of the ensemble the intercept, the coefficients and the
        alpha hyper-parameter are collected. Their means define the averaged
        model, which is stored in self.model.

        parameters:
            - EnsembleData: the ensemble results. Used here are NData (the
                  number of runs) and modelCoef[i], a mapping with the keys
                  'intercept_', 'coef_' and 'alpha', of which the entry at
                  index 1 holds the value. 'coef_' needs to be 2D (shape[1] is
                  taken as the number of coefficients), the intercept needs to
                  contain exactly one value, and 'alpha' is a whitespace
                  separated string with the numerical value as 4th token.
            - setCI: if True, the 95% BCa bootstrap confidence interval of the
                  intercept is calculated, and Bootstrap_1Col is run in
                  parallel for every coefficient column (presumably the same
                  bootstrap per column -- TODO confirm). Results end up in
                  self.CI.

        sets:
            self.coefUsage, self.bestAlpha, self.model, self.isAverage,
            self.hasCI, self.Quality and, when setCI is True, the keys
            "intercept_lo", "intercept_hi", "coef_lo" and "coef_hi" of self.CI.

        --> needs to include hyper parameters...how do we deal with multi-preference?
        """
        from sklearn.linear_model import Lasso
        import multiprocessing as mp
        from HPCTools import get_num_procs

        n_models = EnsembleData.NData
        n_coef = EnsembleData.modelCoef[0]['coef_'][1].shape[1]

        # 1. Calculate the average coefficients
        # 1.1. transform them to arrays
        self.coefUsage = np.zeros(n_coef)

        intercept = np.zeros(n_models)
        coef = np.zeros((n_models, n_coef))
        alphas = np.zeros(n_models)
        for i in range(n_models):
            mcf = EnsembleData.modelCoef[i]
            # .item() extracts the single value explicitly: assigning a size-1
            # array to one element is deprecated in NumPy >= 1.25. It still
            # raises a ValueError if there is more than one value.
            intercept[i] = np.asarray(mcf['intercept_'][1]).item()
            coef[i, :] = np.asarray(mcf['coef_'][1]).ravel()
            alphas[i] = float(mcf['alpha'][1].split()[3])

        for j in range(n_coef):
            self.coefUsage[j] = 100.0 * (
                np.count_nonzero(coef[:, j]) / n_models
            )  # The %-fraction of non-zero versions of each coefficient

        # axis=0: average over the runs (rows)
        self.bestAlpha = np.mean(alphas, axis=0)
        mean_intercept = np.mean(intercept, axis=0)
        mean_coef = np.mean(coef, axis=0)
        print("MEANS=> Int=", mean_intercept, "  COEF=", mean_coef)

        # 2. Set the model coefficients to these averaged values
        # NOTE: the 'normalize' argument is no longer passed. It was removed
        # from sklearn (1.2) and its old default already was False, so the
        # resulting model is the same on older versions.
        self.model = Lasso(
            alpha=self.bestAlpha,
            fit_intercept=True,
            precompute=True  #use a precomputed Gram matrix
        )
        # sklearn only defines intercept_ and coef_ after a fit. So, to prevent
        # an exception when predicting, fit once on throw-away dummy data and
        # overwrite both attributes with the averaged values afterwards.
        ftt2 = [0, 1, 2, 3, 4, 5, 6]
        ftt = [[1, 2, 3], [0, 0.1, 0.0], [0.1, 0.1, 0.1], [3, 2, 2],
               [5.2, 3, 4], [0.0, 0.2, 0.3], [-0.1, -0.2, -0.3]]
        self.model.fit(ftt, ftt2)

        self.model.intercept_ = mean_intercept
        self.model.coef_ = mean_coef
        self.isAverage = True
        self.hasCI = False

        if setCI:
            # 3. Calculate Confidence Interval using the bootstrap
            # & 4. Store the CI data
            ## For the intercept
            boot = TBootstrap(data=intercept, Func=np.mean)
            boot.NPbootstrap(n_iter=2000, Jackknife=True)
            avgm, avgp = boot.ConfidenceInterval(
                CItype="BCa", alpha=0.05,
                n_samples=2000)  #95%confidence interval
            self.CI["intercept_lo"] = avgm
            self.CI["intercept_hi"] = avgp
            print("===BOOT INTERCEPT:", avgm, avgp)

            ## For the coefficients
            # Parallelisation at the level of a column.
            alpha = 0.05  #95%confidence interval
            ciDict = dict()
            # The context manager makes sure the worker processes are cleaned
            # up, also when one of the drones raises an exception. All results
            # need to be collected INSIDE the with-block.
            with mp.Pool(processes=get_num_procs(-1)) as pool:
                print("col-range=", n_coef)
                for col in range(n_coef):
                    print("col", col, " coef[:,col]=", coef[:, col], " type=",
                          type(coef[:, col]))

                drones = [
                    pool.apply_async(Bootstrap_1Col,
                                     args=(col, coef[:, col], alpha))
                    for col in range(n_coef)
                ]
                # the column index is returned with the result, so the order
                # in which the drones finish does not matter
                for drone in drones:
                    col, avgm, avgp = drone.get()
                    ciDict[col] = [avgm, avgp]
                # wait until all processes are finished
                pool.close()
                pool.join()

            # put the results in the correct column order
            self.CI["coef_lo"] = [ciDict[col][0] for col in range(n_coef)]
            self.CI["coef_hi"] = [ciDict[col][1] for col in range(n_coef)]
            self.hasCI = True

        #store the resulting coefficients in our wrapper tracker...and we are done
        self.setCoefficients()
        self.Quality = TModelQualityData(EData=EnsembleData)
    def setAverageCoefficients(self, EnsembleData: TModelResults, setCI: bool):
        """
        Use the ensemble data to create an "average" model, and set the
        "coefficients" in the current model (self.model, which needs to exist
        already).

        parameters:
            - EnsembleData: the ensemble results. Used here are NData (the
                  number of runs) and modelCoef[i], a mapping with the keys
                  'intercept_' and 'coef_', of which the entry at index 1
                  holds the value. 'coef_' needs to be 2D (shape[1] is taken
                  as the number of coefficients) and the intercept needs to
                  contain exactly one value.
            - setCI: if True, the 95% BCa bootstrap confidence interval of the
                  intercept is calculated, and self._BootstrapAvg_1Col is run
                  in parallel for every coefficient column (presumably the
                  same bootstrap per column -- TODO confirm). Results end up
                  in self.CI.

        sets:
            self.model.intercept_, self.model.coef_, self.isAverage,
            self.hasCI, self.Quality and, when setCI is True, the keys
            "intercept_lo", "intercept_hi", "coef_lo" and "coef_hi" of self.CI.
        """
        import multiprocessing as mp
        from HPCTools import get_num_procs

        n_models = EnsembleData.NData
        n_coef = EnsembleData.modelCoef[0]['coef_'][1].shape[1]

        # 1. Calculate the average coefficients
        # 1.1. transform them to arrays
        intercept = np.zeros(n_models)
        coef = np.zeros((n_models, n_coef))
        for i in range(n_models):
            mcf = EnsembleData.modelCoef[i]
            # .item() extracts the single value explicitly: assigning a size-1
            # array to one element is deprecated in NumPy >= 1.25. It still
            # raises a ValueError if there is more than one value.
            intercept[i] = np.asarray(mcf['intercept_'][1]).item()
            coef[i, :] = np.asarray(mcf['coef_'][1]).ravel()
            print(i, ")", coef[i, :])

        # axis=0: average over the runs (rows)
        mean_intercept = np.mean(intercept, axis=0)
        mean_coef = np.mean(coef, axis=0)
        print("MEANS=> Int=", mean_intercept, "  COEF=", mean_coef)

        # 2. Set the model coefficients to these averaged values
        self.model.intercept_ = mean_intercept
        self.model.coef_ = mean_coef
        self.isAverage = True
        self.hasCI = False

        if setCI:
            # 3. Calculate Confidence Interval using the bootstrap
            # & 4. Store the CI data
            ## For the intercept
            boot = TBootstrap(data=intercept, Func=np.mean)
            boot.NPbootstrap(n_iter=2000, Jackknife=True)
            avgm, avgp = boot.ConfidenceInterval(
                CItype="BCa", alpha=0.05,
                n_samples=2000)  #95%confidence interval
            self.CI["intercept_lo"] = avgm
            self.CI["intercept_hi"] = avgp

            ## For the coefficients
            # Parallelisation at the level of a column.
            alpha = 0.05  #95%confidence interval
            ciDict = dict()
            # The context manager makes sure the worker processes are cleaned
            # up, also when one of the drones raises an exception. All results
            # need to be collected INSIDE the with-block.
            with mp.Pool(processes=get_num_procs(-1)) as pool:
                drones = [
                    pool.apply_async(self._BootstrapAvg_1Col,
                                     args=(col, coef[:, col], alpha))
                    for col in range(n_coef)
                ]
                # the column index is returned with the result, so the order
                # in which the drones finish does not matter
                for drone in drones:
                    col, avgm, avgp = drone.get()
                    ciDict[col] = [avgm, avgp]
                # wait until all processes are finished
                pool.close()
                pool.join()

            # put the results in the correct column order
            self.CI["coef_lo"] = [ciDict[col][0] for col in range(n_coef)]
            self.CI["coef_hi"] = [ciDict[col][1] for col in range(n_coef)]
            self.hasCI = True

        #store the resulting coefficients in our wrapper tracker...and we are done
        self.setCoefficients()
        self.Quality = TModelQualityData(EData=EnsembleData)
示例#4
0
    def setAverageCoefficients(self, EnsembleData: TModelResults, setCI: bool):
        """
        Use the ensemble data to create an "average" LS-SVM model, and set the
        "coefficients" in the current model.

        The coefficients of an LS-SVM belong to support vectors (data points),
        and every run of the ensemble only contains a subset of them. The
        coefficients are therefore collected per data-point index before
        averaging; a data point missing in a run contributes a zero for that
        run.

            NOTE: We assume
                1. that the support vectors(/features) were indexed with a
                   sequence of INTEGER values starting at 0.
                2. the same sequence and order are used for all subsets drawn
                   in the ensemble.
                3. that the largest value of the sequence is the size of the
                   set of support vectors (-1, because counting starts at 0)

        parameters:
            - EnsembleData: the ensemble results. Used here are NData (the
                  number of runs) and modelCoef[i], a mapping with the keys
                  'intercept_', 'coef_', 'support_', 'data_pt_index', 'sigma'
                  and 'gamma', of which the entry at index 1 holds the value.
                  'coef_' needs to be 2D, the intercept needs to contain
                  exactly one value, and 'sigma'/'gamma' are whitespace
                  separated strings with the numerical value as 4th token.
            - setCI: if True, the 95% BCa bootstrap confidence interval of the
                  intercept is calculated, and Bootstrap_1Col is run in
                  parallel for every coefficient column (presumably the same
                  bootstrap per column -- TODO confirm). Results end up in
                  self.CI.

        sets:
            self.coefUsage, self.coefInEnsemble, self.bestGamma,
            self.bestSigma, self.model, self.isAverage, self.hasCI,
            self.Quality and, when setCI is True, the keys "intercept_lo",
            "intercept_hi", "coef_lo" and "coef_hi" of self.CI.

        --> needs to include hyper parameters...how do we deal with multi-preference?
        """
        from LSSVMRegression import LSSVMRegression
        import multiprocessing as mp
        from HPCTools import get_num_procs

        n_models = EnsembleData.NData

        # 0. Find out how many coefficients/data-points there are
        FullSet = set()
        for i in range(n_models):
            FullSet.update(
                set(EnsembleData.modelCoef[i]['data_pt_index']
                    [1].flatten()))  #flatten because nd-array
        # largest index + 1, in case some support vectors are missing altogether
        SizeSet = max(FullSet) + 1
        #size of a support vector
        SVar = EnsembleData.modelCoef[0]['support_'][1]
        SizeSV = len(SVar[0])

        # 1. Calculate the average coefficients
        # 1.1. transform them to arrays
        self.coefUsage = np.zeros(SizeSet)
        self.coefInEnsemble = np.zeros(SizeSet)

        intercept = np.zeros(n_models)
        coef = np.zeros((n_models, SizeSet))
        support_vectors_sorted = np.zeros((SizeSet, SizeSV))
        sigmas = np.zeros(n_models)
        gammas = np.zeros(n_models)
        for i in range(n_models):
            mcf = EnsembleData.modelCoef[i]
            # .item() extracts the single value explicitly (as a copy):
            # assigning a size-1 array to one element is deprecated in
            # NumPy >= 1.25. It still raises a ValueError if there is more
            # than one value.
            intercept[i] = np.asarray(mcf['intercept_'][1]).item()
            sigmas[i] = float(mcf['sigma'][1].split()[3])
            gammas[i] = float(mcf['gamma'][1].split()[3])
            #coefficients are linked to "support-vectors" so we need to make sure we average the
            #  coef. of the same support vectors of different ensembles
            coefar = np.asarray(mcf['coef_'][1]).flatten()
            idxar = mcf['data_pt_index'][1].flatten()
            # FullSet now serves as the set of indices for which no support
            # vector was stored yet; only look at the vectors while needed
            if FullSet:
                SVar = mcf['support_'][1]
            for j in range(mcf['coef_'][1].shape[1]):
                idx = idxar[j]
                coef[i, idx] = coefar[j]
                self.coefInEnsemble[idx] += 1
                if idx in FullSet:  #only store each support vector once
                    support_vectors_sorted[idx] = SVar[j]
                    FullSet.remove(idx)

        for j in range(SizeSet):
            self.coefUsage[j] = 100.0 * (
                np.count_nonzero(coef[:, j]) / n_models
            )  # The %-fraction of non-zero versions of each coefficient
            self.coefInEnsemble[j] = 100.0 * (
                self.coefInEnsemble[j] / n_models
            )  # The %-fraction of the runs containing the support vector (upper bound for self.coefUsage)

        # axis=0: average over the runs (rows)
        self.bestGamma = np.mean(gammas, axis=0)
        self.bestSigma = np.mean(sigmas, axis=0)
        mean_intercept = np.mean(intercept, axis=0)
        mean_coef = np.mean(coef, axis=0)

        # 2. Set the model coefficients to these averaged values
        # LS-SVM is under our full control so we have some more
        # power to do what is needed
        ## --> FIRST: Create the "model"
        self.model = LSSVMRegression(
            gamma=self.bestGamma,  #the first hyper-param of LS-SVM, for all kernels
            kernel=self.kernel,  #the kernel to be used, which we still have from the original init
            c=self.bestSigma,  #the scale-factor in case of a poly kernel
            d=self.degree,  #maximum degree for poly kernel
            sigma=self.bestSigma,  #the scale factor of the rbf kernel
        )
        ## --> now we set the average coefficients and support vectors
        param = dict()
        param['intercept_'] = mean_intercept
        param['coef_'] = mean_coef
        param['support_'] = support_vectors_sorted
        self.model.set_attributes(**param)
        #make sure we know it is an average model
        self.isAverage = True
        self.hasCI = False

        if setCI:
            # 3. Calculate Confidence Interval using the bootstrap
            # & 4. Store the CI data
            ## For the intercept
            boot = TBootstrap(data=intercept, Func=np.mean)
            boot.NPbootstrap(n_iter=2000, Jackknife=True)
            avgm, avgp = boot.ConfidenceInterval(
                CItype="BCa", alpha=0.05,
                n_samples=2000)  #95%confidence interval
            self.CI["intercept_lo"] = avgm
            self.CI["intercept_hi"] = avgp
            print("===BOOT INTERCEPT:", avgm, avgp)

            ## For the coefficients
            # Parallelisation at the level of a column.
            alpha = 0.05  #95%confidence interval
            ciDict = dict()
            # The context manager makes sure the worker processes are cleaned
            # up, also when one of the drones raises an exception. All results
            # need to be collected INSIDE the with-block.
            with mp.Pool(processes=get_num_procs(-1)) as pool:
                drones = [
                    pool.apply_async(Bootstrap_1Col,
                                     args=(col, coef[:, col], alpha))
                    for col in range(SizeSet)
                ]
                # the column index is returned with the result, so the order
                # in which the drones finish does not matter
                for drone in drones:
                    col, avgm, avgp = drone.get()
                    ciDict[col] = [avgm, avgp]
                # wait until all processes are finished
                pool.close()
                pool.join()

            # put the results in the correct column order
            self.CI["coef_lo"] = [ciDict[col][0] for col in range(SizeSet)]
            self.CI["coef_hi"] = [ciDict[col][1] for col in range(SizeSet)]
            self.hasCI = True

        #store the resulting coefficients in our wrapper tracker...and we are done
        self.setCoefficients()
        self.Quality = TModelQualityData(EData=EnsembleData)
示例#5
0
def RunPostProcess_MLpaper(basedata: str,
                           baseresult: str,
                           datasizes: list,
                           predictionData: pd.DataFrame,
                           NumDim: int,
                           NumSKDim: int,
                           theoryModel: list,
                           modelFunction,
                           fitFunction,
                           heatmap: bool = True,
                           n_procs: int = 1):
    """
    - basedata : base-string of the datapoint-files (contains the model-coefficients)
    - baseresult: base string of the files containing the results per set
    - datasizes: list of ints giving the zise of the full datasets
    - predictionData: pandas-Frame containing the feature/target data of the artificial model for a set of datapoints to predict.
    - NumDim : the number of dimensions
    - NumSKDim : the number of features in the sk-learn model
    - theoryModel: list of floats with the intercept [index 0], and coefficients [indices 1: ] of the artificial model
    - modelFunction: is the "perfect" version of the function used to generate the data
    - fitFunction: is the function used in ML to create the fit. Note, this is handcoded, not sklearn type function
    - heatmap: bool indicating if heatmaps neet to be generated...DON'T USE FOR EXP DATA
    - n_procs: number of parallel processes to use (when calculating bootstrap CI). Default = 1
    """
    from HPCTools import get_num_procs

    print("START POST-PROCESS")
    print("===================")
    print("A. READING/COLLECTING DATA")
    print("---------------------------")
    nrDatasets = len(datasizes)
    #transform dataframe into array
    predictSet = np.array(predictionData.rename_axis('ID').values)

    #the datapoints
    allDataSetsFeatures = dict()
    allDataSetsTargets = dict()
    for ds in datasizes:
        dfn = basedata + str(ds) + ".dat"  #reconstruct filename
        dfile = open(dfn, "r")
        dscheck = int(dfile.readline().replace("#", " "))
        if (ds == dscheck):
            curlst_F = list()
            curlst_T = list()
            for dp in range(ds):
                data = dfile.readline().split()
                tmpl = list(float(x) for x in data[0:-1])
                curlst_F.append(
                    tmpl
                )  #from the first to the 1 but last column (remember the -1 is not included in a range, so it is the same as Fortran -2)
                curlst_T.append(float(data[-1]))  # take the last column
            allDataSetsFeatures[ds] = curlst_F
            allDataSetsTargets[ds] = curlst_T

        else:
            print("ERROR: INCONSISTENT DATASIZES IN ", dfn, " ", ds, " vs ",
                  dscheck)
        dfile.close()

    #the RMSE's etc
    fullSetResults = dict()
    ttsplitResults = dict()
    for ds in datasizes:
        dfn = baseresult + str(ds) + ".dat"  #reconstruct filename
        dfile = open(dfn, "r")
        nruns, ncols = tuple(
            int(i) for i in dfile.readline().replace("#", " ").split())
        nruns -= 1
        data = dfile.readline().split()
        curlst = list([int(data[0])])
        curlst.extend(list(float(i) for i in data[1:]))
        fullSetResults[ds] = curlst
        curlst = list()
        for dp in range(nruns):
            data = dfile.readline().split()
            clr = list([int(data[0])])
            clr.extend(list(float(i) for i in data[1:]))
            curlst.append(clr)

        ttsplitResults[ds] = curlst
        dfile.close()

    print("B. Generating RMSE-curves ")
    print("---------------------------")

    ############################################################################
    ##########  TRAIN-TEST RESULTS #############################################
    ############################################################################
    header = list()
    header.append("# datasize RMSE TRAIN: avg  CIlo  CIhi  min  max ")
    header.append("TEST: avg  CIlo  CIhi  min  max ")
    header.append("MAE TRAIN: avg  CIlo  CIhi  min  max ")
    header.append("MAE TEST : avg  CIlo  CIhi  min  max ")
    header.append("avg-RMSE-LoO: avg  CIlo  CIhi  min  max ")
    header.append("avg-RMSE-5CV: avg  CIlo  CIhi  min  max ")
    header.append(" RMSE-full-set \n")

    headerstr = ""
    headerstr = headerstr.join(
        header
    )  #needs to be assigned because join "only" returns the string...but it needs a real string to be possible to call

    pltdat = "PlotResults_TRAINTEST.dat"
    if os.path.exists(pltdat):
        os.remove(pltdat)  #clear the file before we start
    plttrainf = open(pltdat, "a+")
    plttrainf.write(headerstr)

    # Parallelisation for sections performing bootstraps.
    # Parallelization only at the highest level of a datasize, not a column,
    # this to keep overhead low, and deal with slowdowns due to large number of datasizes
    # 1. create our process pool
    pool = mp.Pool(processes=get_num_procs(n_procs))
    # 2. set drones to work
    drones = [
        pool.apply_async(getOneLineTrainTest,
                         args=(ds, ttsplitResults, str(fullSetResults[ds][1])))
        for ds in datasizes
    ]
    # 3. as we can not assume the lines to be produced in the correct order
    #    and numbering is non-linear or incremental--> make it a dict
    lineDict = dict()

    for drone in drones:
        ds, line = drone.get()
        lineDict[ds] = line
    # 4. wait untill all processes are finished
    pool.close()
    pool.join()
    # 5. and now do the writing in an orderly fashion
    for ds in datasizes:
        plttrainf.write(lineDict[ds])

#        line=str(ds)+"  " #first column is the datasize
#        data=np.array(ttsplitResults[ds]) #needs to be a numpy array to do fortran type slicing...
#
#        #0: index
#        #1: RMSE train
#        #2: RMSE test
#        #3: MAE train
#        #4: MAE test
#        #5: avg RMSE LoO train
#        #6: 2sig RMSE LoO train
#        #7: avg RMSE 5-fold train
#        #8: 2sig RMSE 5-fold train
#        #9: intercept b
#        #10: coef a1
#        #11..: coef a2..
#
#        collst=[1, 2, 3, 4, 5, 7]
#        for col in collst:
#            #RMSE train: avg, avg+2sig, avg-2sig, min, max
#            avg=data[:,col].mean() #numpy mean
#            boot=TBootstrap(data=data[:,col],Func=np.mean)
#            boot.NPbootstrap(n_iter=2000, Jackknife=True)
#            avgm, avgp = boot.ConfidenceInterval(CItype="BCa",alpha=0.05)#95%confidence interval
#            #sig2=2.0*data[:,col].std()
#            dmin=np.amin(data[:,col]) #numpy has no amin/amax for numpy nd arrays...
#            dmax=np.amax(data[:,col])
#            #avgm=avg-sig2
#            #avgp=avg+sig2
#            line=line+f'{avg:.7f}'+"  "+f'{avgm:.7f}'+"  "f'{avgp:.7f}' \
#                        +"  "+f'{dmin:.7f}'+"  "+f'{dmax:.7f}'+" | "
#
#        line=line+str(fullSetResults[ds][1])+"\n"
#        plttrainf.write(line)
    plttrainf.close()

    ############################################################################
    ##########  ON PREDICTED DATA 1000 POINTS ##################################
    ##########  MODELCOEFFICIENTS ##############################################
    ############################################################################

    #How well do we predict 1000 datapoints
    headerstr = (
        "# datasize RMSE: Theory    Avg-model    Full-Model     Best-RMSE    Worst-RMSE | MAE: Theory    Avg-model    Full-Model     Best-MAE    Worst-MAE \n "
    )
    pltdat = "PlotResults_RMSEonPredict1K.dat"
    if os.path.exists(pltdat):
        os.remove(pltdat)  #clear the file before we start
    plttrainf = open(pltdat, "a+")
    plttrainf.write(headerstr)
    #print(headerstr)

    #location to print model coefficients
    headermodel = (
        "datasetsize ) Avg: Intercept Coeff1..n | Full: Intercept Coeff1..n  | best RMSE: Intercept Coeff1..n  | worst RMSE: Intercept Coeff1..n   \n "
    )
    modeldat = "PlotResults_Model_coefficients.dat"
    if os.path.exists(modeldat):
        os.remove(modeldat)  #clear the file before we start
    pltmodel = open(modeldat, "a+")
    pltmodel.write(headermodel)
    #print(headermodel)

    #loop over all datasetsizes
    for ds in datasizes:
        line = f'{ds:5}' + "  "  #first column is the datasize
        linemodel = f'{ds:5}' + " ) "  #first column is the datasize
        data = np.array(
            ttsplitResults[ds]
        )  #needs to be a numpy array to do fortran type slicing...
        #0: run index
        #1: RMSE train
        #2: RMSE test
        #3: MAE train
        #4: MAE test

        #3 5: avg RMSE LoO train
        #4 6: 2sig RMSE LoO train
        #5 7: avg RMSE 5-fold train
        #6 8: 2sig RMSE 5-fold train
        #7 9: intercept b
        #8 10: coef a

        x = list(
            x for x in predictSet[:, 0:-1]
        )  #put the features in a list of lists, every row are the different features of 1 run

        #print("The X's:\n",x)
        #print("The X[1]:\n",x[0])

        #what is the error introduced due to the noise on our theoretical model (this should be the best)
        if len(theoryModel) > 0:
            a = theoryModel[1:]
            b = theoryModel[0]
            #pred=a*predictSet[:,0:-2] + b
            pred = modelFunction(x=x, intercept=b, slope=a)
            #pred=list(np.dot(a,x) + b for x in predictSet[:,0:-1] )
            rmseTheory = np.sqrt(mean_squared_error(predictSet[:, -1], pred))
            maeTheory = mean_absolute_error(predictSet[:, -1], pred)
        else:
            rmseTheory = 0
            maeTheory = 0

        #How well is the averaged model doing
        #a=data[:,8:].mean()
        a = list(column.mean() for column in data[:, 10:].T
                 )  #the for returns rows, by transposing it gives the columns
        b = data[:, 9].mean()
        #print("The AVG Intercept's:\n",b)
        #print("The AVG Coeffs's:\n",a)

        #pred=a*predictSet[:,0] + b
        pred = fitFunction(x=x, intercept=b, slope=a)
        #print("Prediction:\n",pred)

        #pred=list(np.dot(a,x) + b for x in predictSet[:,0:-1] )
        rmseAvg = np.sqrt(mean_squared_error(predictSet[:, -1], pred))
        maeAvg = mean_absolute_error(predictSet[:, -1], pred)
        a_coeffs = ('  '.join(['%.7f'] * len(a))) % tuple(a)
        linemodel = linemodel + f'{b:.7f}' + "   " + a_coeffs + " | "

        #        print(" Intercept=",b)
        #        print(" Coef     =",a)
        #        print(" x's      =",x)
        #        print(" Predict  =",pred)
        #        print(" PredTarg =",predictSet[:,-1])
        #        print("RMSE      =",rmseAvg)

        #How well does a model which used the full data-set (train+test) perform?
        a = fullSetResults[ds][10:]
        b = fullSetResults[ds][9]
        #pred=a*predictSet[:,0] + b
        pred = fitFunction(x=x, intercept=b, slope=a)
        #pred=list(np.dot(a,x) + b for x in predictSet[:,0:-1] )
        rmseFull = np.sqrt(mean_squared_error(predictSet[:, -1], pred))
        maeFull = mean_absolute_error(predictSet[:, -1], pred)
        a_coeffs = ('  '.join(['%.7f'] * len(a))) % tuple(a)
        linemodel = linemodel + f'{b:.7f}' + "   " + a_coeffs + " | "

        #How well is the "best RMSE of test" model doing
        #find the index of the best RMSE
        posRMSE = np.where(data[:, 2] == np.amin(
            data[:, 2]))  #index returns the first occurence
        a = data[posRMSE[0][0], 10:]
        b = data[posRMSE[0][0], 9]
        pred = fitFunction(x=x, intercept=b, slope=a)
        rmseBestRMSE = np.sqrt(mean_squared_error(predictSet[:, -1], pred))
        a_coeffs = ('  '.join(['%.7f'] * len(a))) % tuple(a)
        linemodel = linemodel + f'{b:.7f}' + "   " + a_coeffs + " | "
        #How well is the "worst RMSE of test" model doing
        #find the index of the worst RMSE
        posRMSE = np.where(data[:, 2] == np.amax(
            data[:, 2]))  #index returns the first occurence
        a = data[posRMSE[0][0], 10:]
        b = data[posRMSE[0][0], 9]
        pred = fitFunction(x=x, intercept=b, slope=a)
        rmseWorstRMSE = np.sqrt(mean_squared_error(predictSet[:, -1], pred))
        a_coeffs = ('  '.join(['%.7f'] * len(a))) % tuple(a)
        linemodel = linemodel + f'{b:.7f}' + "   " + a_coeffs + " \n "

        #How well is the "best MAE of test" model doing
        #find the index of the best MAE
        posMAE = np.where(data[:, 4] == np.amin(
            data[:, 4]))  #index returns the first occurence
        a = data[posMAE[0][0], 10:]
        b = data[posMAE[0][0], 9]
        pred = fitFunction(x=x, intercept=b, slope=a)
        maeBestMAE = mean_absolute_error(predictSet[:, -1], pred)
        #How well is the "worst MAE of test" model doing
        #find the index of the worst MAE
        posMAE = np.where(data[:, 4] == np.amax(
            data[:, 4]))  #index returns the first occurence
        a = data[posMAE[0][0], 10:]
        b = data[posMAE[0][0], 9]
        pred = fitFunction(x=x, intercept=b, slope=a)
        maeWorstMAE = mean_absolute_error(predictSet[:, -1], pred)

        line=line+f'{rmseTheory:.7f}'+"  "+f'{rmseAvg:.7f}'+"  "+f'{rmseFull:.7f}'+\
                "  "+f'{rmseBestRMSE:.7f}'+"  "+f'{rmseWorstRMSE:.7f}'+" | "+\
                f'{maeTheory:.7f}'+"  "+f'{maeAvg:.7f}'+"  "+f'{maeFull:.7f}'+\
                "  "+f'{maeBestMAE:.7f}'+"  "+f'{maeWorstMAE:.7f}'+" \n "

        plttrainf.write(line)
        pltmodel.write(linemodel)

    plttrainf.close()
    pltmodel.close()

    ############################################################################
    ##########  HEATMAPS #######################################################
    ############################################################################
    print("C. Generating HEATMAPs ")
    print("---------------------------")
    if heatmap:

        width = 1.0
        dx = width * 0.01
        nx = int(width / dx) + 1

        ymin = np.amin(predictSet[:, -1])  #theoryModel[0]      #intercept
        ymax = np.amax(predictSet[:, -1])  #ymin+height #intercept + slope's
        height = ymax - ymin
        #print("HEIGHT=",height,"  ===  ",ymax," - ", ymin)

        dy = dx
        ny = int(height / dy) + 1
        xval = list([0] * nx)
        for x in range(nx):
            xval[x] = x * dx

        NFeat = NumSKDim
        for dim in range(NumDim):
            dimi = NFeat - dim
            #print("DIM in NUMDIM=",dim,"of",NumDim," -> DIMI=",dimi,"   NFeature=",NFeat," theorymodel[1:]=",theoryModel[1:])
            for ds in datasizes:
                grid = np.zeros((ny, nx))
                data = np.array(
                    ttsplitResults[ds]
                )  #needs to be a numpy array to do fortran type slicing...
                num_rows, num_cols = data.shape
                #print("NUM_ROWS NUMCOLS=", num_rows, num_cols)
                for run in range(num_rows):
                    #a=data[run,num_cols-dimi]
                    #b=data[run,num_cols-NFeat-1]
                    #print("slope=",data[run,num_cols-dimi:])

                    yr = fitFunction(x=xval,
                                     intercept=data[run, num_cols - NFeat - 1],
                                     slope=data[run, num_cols - dimi:])
                    #print("YR=",yr)
                    yi = ((yr - ymin) / height) * (
                        ny - 1
                    )  #array operation-> transformation to int can not be done at array level

                    for x in range(nx):
                        #yr=a*xval[x]+b
                        #find position in grid
                        #yi=int(((yr-ymin)/height)*(ny-1))
                        yii = int(yi[x])
                        if (yii > -1) and (yii < ny):
                            grid[yii, x] += 1.0
                maxval = np.amax(grid)
                if (maxval == 0):
                    print("WARNING: MAXVAL=0...NOTING IN HEATMAP--> NOT GOOD")
                    maxval = 1
                grid = grid / maxval
                print("for ", ds, " the maxval= ", maxval)

                plotname = "Heatmap_Dim" + str(dimi) + "_" + str(ds)
                plotGrid(grid,
                         xmin=0.0,
                         xmax=1.0,
                         ymin=ymin,
                         ymax=ymax,
                         plotname=plotname)