def getValidationData(vargs, data, debug):
    """
    Structure data for validating the model. Modifies data['opts']

    When validation data is supplied this sets debug['validation'] to True,
    stores the number of validation points in data['opts']['nvaldata'] and
    checks the column counts against data['opts']['ninputs'] and
    data['opts']['noutputs'].

    Args:
        vargs: validation data (valxdata, valzdata); an empty sequence
               means that no validation data was supplied
        data: shared alamo data options
        debug: Additional options may be specified and will be applied
               to the .alm

    Returns:
        (xvaldata, zvaldata), where a 1-D input has been reshaped into a
        single column, or None when vargs is empty.
    """
    # len() instead of `!= ()` so that an empty list is also treated as
    # "no validation data" rather than failing on vargs[0].
    if len(vargs) == 0:
        return None

    debug["validation"] = True
    xvaldata = vargs[0]
    zvaldata = vargs[1]

    nvaldata = np.shape(xvaldata)[0]
    data["opts"]["nvaldata"] = nvaldata

    # A flat vector is interpreted as one column (single input / single
    # output); without this the column checks below fail with IndexError.
    if len(np.shape(xvaldata)) == 1:
        xvaldata = np.reshape(xvaldata, (nvaldata, 1))
    if len(np.shape(zvaldata)) == 1:
        zvaldata = np.reshape(zvaldata, (nvaldata, 1))

    # NOTE(review): readTraceFile uses almerror.AlamoError, which suggests
    # almerror may be a module - confirm that it is callable here.
    if np.shape(xvaldata)[1] != data["opts"]["ninputs"]:
        writethis("Number of input variables inconsistent between x and xval")
        almerror("p2")

    zshape = np.shape(zvaldata)
    if zshape[0] != nvaldata or zshape[1] != data["opts"]["noutputs"]:
        writethis("Problem with zval")
        almerror("p2")

    return xvaldata, zvaldata
def manageArguments(xdata, zdata, data, debug, kwargs):
    """
    Parse additional input options

    Hands kwargs to parseKwargs, builds a simulator wrapper when one is
    requested, and derives input bounds from the training data when the
    caller did not pass 'xmin'. Descriptions of the dictionaries data and
    debug are given in shared.py.

    Args:
        xdata: (numpy.array or list[real])
        zdata: (numpy.array or list[real])
        data: shared alamo data options
        debug: Additional options may be specified and will be applied
               to the .alm
    """
    parseKwargs(data, debug, kwargs)

    # A simulator wrapper is built when requested through debug or kwargs
    needs_wrapper = debug["simwrap"] or "simulator" in kwargs
    if needs_wrapper:
        buildSimWrapper(data, debug)

    # Response labels are used as keys of the output dictionary; this
    # matters for systematic testing vs. single model input.
    # NOTE(review): this branch is only reached when outkeys is already
    # truthy, so the assignment merely normalises it to True - confirm
    # whether outkeys was meant to be forced on for multiple outputs.
    if debug["outkeys"] and data["opts"]["noutputs"] > 1:
        writethis("outkeys set to TRUE for multiple outputs")
        debug["outkeys"] = True

    # Bounds supplied by the caller take precedence over the training data
    if "xmin" in kwargs:
        return
    constructXBounds(xdata, zdata, data, debug)
def _unwrapSingleValue(value):
    """Return the scalar held by a length-1 row, otherwise value itself."""
    if np.ndim(value) > 0 and np.size(value) == 1:
        return np.ravel(value)[0]
    return value


def constructXBounds(xdata, zdata, data, debug):
    """
    Construct xmin and xmax for alamo if none are given

    The bounds are written as space-separated strings (one entry per input,
    each followed by a blank) into data['set4']['xmin'] and
    data['set4']['xmax'].

    Args:
        xdata: (numpy.array or list[real])
        zdata: (numpy.array or list[real]); not used here
        data: shared alamo data options
        debug: Additional options may be specified and will be applied
               to the .alm
    """
    # Adjacent literals instead of a backslash continuation inside the
    # string, which leaked the continuation whitespace into the message.
    writethis(
        "min and max values of inputs are not provided, "
        "they will be calculated from the training data\n"
    )

    ninputs = data["opts"]["ninputs"]
    ndata = data["opts"]["ndata"]

    if ninputs > 1:
        columns = [
            [xdata[j][i] for j in range(ndata)] for i in range(ninputs)
        ]
    else:
        # Single input: rows may be bare scalars or length-1 rows. Unwrap
        # the latter so the bound is written as "0.5" and not "[0.5]".
        columns = [[_unwrapSingleValue(xdata[j]) for j in range(ndata)]]

    # The +/- bignum seeds are kept as the first candidates: they are
    # returned when there is no data or when no value lies inside them.
    # min/max return the first extreme element, like a strict </> scan.
    low_seed = debug["bignum"]
    high_seed = -1 * debug["bignum"]
    lows = [min([low_seed] + column, key=float) for column in columns]
    highs = [max([high_seed] + column, key=float) for column in columns]

    data["set4"]["xmax"] = "".join(str(value) + " " for value in highs)
    data["set4"]["xmin"] = "".join(str(value) + " " for value in lows)
def readTraceFile(vargs, data, debug):
    """
    Read the alamo trace file to read in the model and metrics

    The results are stored in data['results']. If the trace file cannot be
    opened and debug['mock'] is set, canned results are stored instead and
    the function returns early.

    Args:
        vargs: Validation data; only its length is inspected here
        data/debug: shared default options for .alm file

    Raises:
        almerror.AlamoError: the trace file cannot be read and
            debug['mock'] is not set
        ValueError: the trace file does not contain the expected header
    """
    trace_str = data["stropts"]["tracefname"]
    try:
        # Context manager so that the file handle is always released
        with open(trace_str) as trace_file:
            lf = trace_file.read()
    except (IOError, FileNotFoundError) as err:
        if debug["mock"]:
            data["results"].update({
                "clrtime": "0",
                "size": "6",
                "numolr": "16960",
                "othertime": "0.8799995E-01",
                "olrtime": "0.10800002",
                "miptime": "0",
                "version": "2018.4.3",
                "status": "0",
                "R2": "1",
                "numclr": "0",
                "nummip": "0",
                "ssr": "0.169E-21",
                "pymodel": "cam6alm",
                "totaltime": "0.1760001",
                "rmse": "0.255E-11",
                "madp": "0.814E-09",
                # Adjacent literals: no continuation residue in the model
                "model": (
                    " z1 = 3.9999999999884194856747 * x1^2 "
                    "- 3.9999999999873385725380 * x2^2 "
                    "- 2.0999999999876837186719 "
                    "* x1^4 + 3.9999999999879496392907 * x2^4 "
                    "+ 0.33333333333014281141260 "
                    "* x1^6 + 1.0000000000008837375276 * x1*x2"
                ),
                "nbas": "15",
            })
            if debug["expandoutput"]:
                data["results"]["ssrval"] = 0
                data["results"]["R2val"] = 0
                data["results"]["rmseval"] = 0
                data["results"]["madpval"] = 0
            return
        raise almerror.AlamoError(
            'Cannot read from trace file "{}": {}'.format(trace_str, err)
        ) from err

    # NOTE(review): a failed import is only reported; lambdify/parse_expr
    # will then be undefined when the models are processed below.
    try:
        from sympy.parsing.sympy_parser import parse_expr
        from sympy import symbols, lambdify
    except Exception:
        writethis("Cannot import sympy")

    lf2 = lf.split("\n")
    # ENGLE Allows for multiple writings to trace.trc file 5/30/19
    dict_out_str = (
        "#filename, NINPUTS, NOUTPUTS, INITIALPOINTS, OUTPUT, SET, "
        "INITIALIZER, SAMPLER, MODELER, BUILDER, GREEDYBUILD, "
        "BACKSTEPPER, GREEDYBACK, REGULARIZER, SOLVEMIP, SSEOLR, SSE, "
        "RMSE, R2, ModelSize, BIC, RIC, Cp, AICc, HQC, MSE, SSEp, MADp, "
        "OLRTime, numOLRs, OLRoneCalls, OLRoneFails, OLRgsiCalls, OLRgsiFails, "
        "OLRdgelCalls, OLRdgelFails, OLRclrCalls, OLRclrFails, OLRgmsCalls, "
        "OLRgmsFails, CLRTime, numCLRs, MIPTime, NumMIPs, LassoTime, "
        "Metric1Lasso, Metric2Lasso, LassoSuccess, LassoRed, nBasInitAct, "
        "nBas, SimTime, SimData, TotData, NdataConv, OtherTime, NumIters, "
        "IterConv, TimeConv, Step0Time, Step1Time, Step2Time, TotalTime, "
        "AlamoStatus, AlamoVersion, Model")
    # Index of the LAST header line, i.e. the most recent run in the file
    lf2_ind = len(lf2) - 1 - lf2[::-1].index(dict_out_str)
    tkeys = lf2[lf2_ind].split(",")

    # kl1: keys written to data['results']; kl2: matching trace columns
    # (the leading blank comes from splitting the header on ",")
    kl1 = [
        "ssr", "rmse", "R2", "size", "nbas", "totaltime", "olrtime",
        "miptime", "clrtime", "othertime", "version", "status", "madp",
        "numolr", "nummip", "numclr", "ninputs",
    ]
    kl2 = [
        " SSE", " RMSE", " R2", " ModelSize", " nBasInitAct", " TotalTime",
        " OLRTime", " MIPTime", " CLRTime", " OtherTime", " AlamoVersion",
        " AlamoStatus", " MADp", " numOLRs", " NumMIPs", " numCLRs",
        " NINPUTS",
    ]

    # Construct results for training data (&val if provided)
    wlparam = data["opts"]["noutputs"]

    # initialize multiple output expanded dictionary
    # NOTE(review): these dictionaries are only created when expandoutput
    # is set, yet the outkeys branches below index into them regardless.
    if debug["expandoutput"]:
        data["results"]["f(model)"] = {}
        data["results"]["model"] = {}
        for key in kl1:
            data["results"][key] = {}
        if len(vargs) > 0:
            for key in ["ssrval", "R2val", "rmseval", "madpval"]:
                data["results"][key] = {}

    if len(vargs) > 0:
        wlparam = 2 * wlparam
    else:
        wlparam = wlparam + 1

    for ln in range(1, wlparam):
        lf3 = lf2[lf2_ind + ln].split(",")

        # Reapply the saved labels for the output
        model = lf3[tkeys.index(" Model")]
        for i in range(data["opts"]["ninputs"]):
            label = data["labs"]["xlinks"][i][0]
            model = model.replace(
                str(label), str(data["labs"]["xlinks"][i][1]))
        for i in range(data["opts"]["noutputs"]):
            label = data["labs"]["zlinks"][i][0]
            model = model.replace(
                str(label), str(data["labs"]["zlinks"][i][1]))

        # determine which output label to write:
        # with outkeys the output label is used as key, otherwise not
        if debug["outkeys"]:
            olab = model.split("=")[0].replace(" ", "")
            data["results"]["model"][olab] = model
            # Record tokenized model for each output
            data["results"]["f(model)"][olab] = lambdify(
                [symbols(data["labs"]["savexlabels"])],
                parse_expr(model.split("=")[1].replace("^", "**")),
                "numpy",
            )
        else:
            data["results"]["model"] = model
            data["results"]["f(model)"] = lambdify(
                [symbols(data["labs"]["savexlabels"])],
                parse_expr(model.split("=")[1].replace("^", "**")),
                "numpy",
            )

        if debug["expandoutput"]:
            if debug["outkeys"]:
                for key, column in zip(kl1, kl2):
                    data["results"][key][olab] = lf3[tkeys.index(column)]
                # Check for validation set
                # NOTE(review): the validation row is always taken from
                # lf2_ind + 2, independent of ln - confirm for noutputs > 1
                if len(vargs) > 0:
                    lf3 = lf2[lf2_ind + 2].split(",")
                    data["results"]["ssrval"][olab] = lf3[
                        tkeys.index(" SSE")]
                    data["results"]["R2val"][olab] = lf3[tkeys.index(" R2")]
                    data["results"]["rmseval"][olab] = lf3[
                        tkeys.index(" RMSE")]
                    data["results"]["madpval"][olab] = lf3[
                        tkeys.index(" MADp")]
            else:
                for key, column in zip(kl1, kl2):
                    data["results"][key] = lf3[tkeys.index(column)]
                # Check for validation set
                if len(vargs) > 0:
                    lf3 = lf2[lf2_ind + 2].split(",")
                    data["results"]["ssrval"] = lf3[tkeys.index(" SSE")]
                    data["results"]["R2val"] = lf3[tkeys.index(" R2")]
                    data["results"]["rmseval"] = lf3[tkeys.index(" RMSE")]
                    data["results"]["madpval"] = lf3[tkeys.index(" MADp")]
        else:
            if debug["outkeys"]:
                data["results"]["ssr"][olab] = lf3[tkeys.index(kl2[0])]
            else:
                data["results"]["ssr"] = lf3[tkeys.index(kl2[0])]
def _runAlamo(data, debug, redirect=True):
    """Invoke the ALAMO executable on the current .alm file."""
    command = debug["almloc"] + " " + str(data["stropts"]["almname"])
    if redirect:
        command = command + " > logscratch"
    os.system(command)


def _lmoFoldBounds(ndata, numOfFolds, fold):
    """Return (start, stop) row indices of one validation fold.

    The rows are partitioned into numOfFolds contiguous, non-overlapping
    folds; the first ndata % numOfFolds folds hold one extra row.
    """
    base, extra = divmod(ndata, numOfFolds)
    start = base * fold + min(fold, extra)
    stop = base * (fold + 1) + min(fold + 1, extra)
    return start, stop


def _recordFoldQ2(q2, data, debug):
    """Append the validation R2 of the fold just read to the q2 store."""
    if debug["outkeys"] and debug["expandoutput"]:
        for k in data["results"]["R2"].keys():
            q2.setdefault(k, []).append(float(data["results"]["R2val"][k]))
    else:
        q2.append(float(data["results"]["R2val"]))


def _storeMeanQ2(q2, data, debug, label):
    """Store the mean fold R2 as data['results']['Q2'] and print it."""
    if debug["outkeys"] and debug["expandoutput"]:
        data["results"]["Q2"] = {}
        for k in q2.keys():
            Q2 = np.mean(q2[k])
            data["results"]["Q2"][k] = Q2
            print("%s: Running cross validation %s, evaluated Q2:%f"
                  % (k, label, Q2))
    else:
        Q2 = np.mean(q2)
        data["results"]["Q2"] = Q2
        print("Running cross validation %s, evaluated Q2:%f" % (label, Q2))


def _reportPredictivity(prefix, Q2, R2):
    """Print the assessment of the R2/Q2 pair of one model."""
    diff = float(R2) - float(Q2)
    if Q2 < 0.5:
        print("%sQ2 suggests this is not a predictive model, "
              "Q2: %f, R2: %s" % (prefix, Q2, R2))
    elif diff < 0.3:
        print("%sThe difference of R2-Q2 is %f. This is an acceptable "
              "difference for predictability, Q2: %f, R2: %s"
              % (prefix, diff, Q2, R2))
    else:
        print("%sThe difference of R2-Q2 is %f. The surrogate model is "
              "not able to predict the data reliably, Q2: %f, R2: %s"
              % (prefix, diff, Q2, R2))


def alamo(xdata, zdata, **kwargs):
    """
    [almmodel] = doalamo(xdata,zdata, xvaldata, zvaldata,addopt=vals)

    Writes the .alm file, calls ALAMO (unless debug['mock'] is set), reads
    the trace file and returns the collected results. With 'loo' or 'lmo'
    a cross validation is run first and its mean validation R2 is stored
    as 'Q2'.

    Args:
        xdata: (numpy.array or list[real])
        zdata: (numpy.array or list[real])
        kwargs: Additional options may be specified and will be applied
                to the .alm. Example -  monomialpower=(1,2,3,4)

          -  xlabels       : labels given to input variables
          -  zlabels       : labels given to outputs
          -  xval          : validation data for alamo
          -  zval          : response validation data for alamo
          -  modeler       : modeler value used in alamo
          -  solvemip      : force alamo to solve mip if gams is available
          -  linfcns       : 0-1 option to include linear transformations
          -  expfcns       : 0-1 option to include exponential
                             transformations
          -  logfcns       : 0-1 option to include logarithmic
                             transformations
          -  sinfcns       : 0-1 option to include sine transformations
          -  cosfcns       : 0-1 option to include cosine transformations
          -  monomialpower : list of monomial powers
          -  multi2power   : list of binomial powers
          -  multi3power   : list of trinomials
          -  ratiopower    : list of ratio powers
          -  screener      : screening method
          -  almname       : specify a name for the .alm file
          -  savescratch   : saves .alm and .lst
          -  savetrace     : saves trace file
          -  expandoutput  : add a key to the output dictionary for the
                             output (must be on for inputs(outputs?#Engle)>1)
          -  almopt        : direct text appending; the option almopt=FILE
                             will append a file to the end of the .alm and
                             can be used to facilitate direct access to the
                             .alm (no current checks)
          -  loo           : leave one out evaluation
          -  lmo           : leave many out evaluation

    Returns:
        dict: An ALAMO model with the following keys

          -  'model'    : algebraic form of model
          -  'f(model)' : a callable lambda function; the syntax depends
                          on expandoutput:
                          almmodel['f(model)']['out'](inputs,sep,by,comma)
                          almmodel['f(model)'](inputs,sep,by,comma)
          -  'ssr'      : SSE on training set provided
          -  'R2'       : R2 on training set provided
          -  'ssrval'   : SSE on testing set if provided
          -  'R2val'    : R2 on testing set if provided
          -  'Q2'       : mean validation R2 of the folds (loo/lmo only)
          -  'pymodel'  : base name of the .alm file followed by 'alm'

    Raises:
        Exception: 'lmo' asks for more folds than there are data points
    """
    data, debug = alamopy.data, alamopy.debug

    # patched together validation data check
    if "xval" in kwargs.keys():
        vargs = (kwargs["xval"], kwargs["zval"])
    else:
        vargs = ()

    xdata, zdata, xvaldata, zvaldata = setupData(data, debug, xdata, zdata,
                                                 vargs, kwargs)
    manageArguments(xdata, zdata, data, debug, kwargs)

    data["results"] = {}
    writeCustomALAMOOptions(kwargs)  # New Custom Options MENGLE

    # Cross Validation
    if debug["loo"]:
        q2 = {} if debug["outkeys"] and debug["expandoutput"] else []

        # Every fit leaves exactly one point out; the user's settings are
        # saved here and restored after the loop.
        data["opts"]["ndata"] = data["opts"]["ndata"] - 1
        kwargValidation = debug["validation"]
        kwargSaveTrace = debug["savetrace"]
        if kwargValidation:
            kwargNvaldata = data["opts"]["nvaldata"]
        data["opts"]["nvaldata"] = 1
        debug["validation"] = True
        debug["savetrace"] = False

        for i in range(len(xdata)):
            cvxdata = [x for y, x in enumerate(xdata) if y != i]
            cvzdata = [x for y, x in enumerate(zdata) if y != i]
            alamopy.almwriter(
                data, debug,
                (cvxdata, cvzdata, [xdata[i][:]], [zdata[i][:]]),
                kwargs)

            # Calling ALAMO
            if not debug["mock"]:
                _runAlamo(data, debug)

            data["results"] = {}
            readTraceFile([xdata[i][:], zdata[i][:]], data, debug)
            _recordFoldQ2(q2, data, debug)
            cleanFiles(data, debug)

        _storeMeanQ2(q2, data, debug, "LOO")

        del data["opts"]["nvaldata"]
        debug["validation"] = kwargValidation
        debug["savetrace"] = kwargSaveTrace
        if kwargValidation:
            data["opts"]["nvaldata"] = kwargNvaldata
        data["opts"]["ndata"] = data["opts"]["ndata"] + 1

    elif debug["lmo"] > 0:
        numOfFolds = debug["lmo"]
        # Checked before any shared state is changed, so that a bad fold
        # count does not leave debug in cross-validation mode.
        if numOfFolds > len(xdata):
            raise Exception("Number of Cross validation folds exceeds "
                            "the number of data points")

        q2 = {} if debug["outkeys"] and debug["expandoutput"] else []

        kwargNdata = data["opts"]["ndata"]
        kwargValidation = debug["validation"]
        kwargSaveTrace = debug["savetrace"]
        if kwargValidation:
            kwargNvaldata = data["opts"]["nvaldata"]
        debug["validation"] = True
        debug["savetrace"] = False

        for i in range(numOfFolds):
            # Contiguous, non-overlapping validation fold
            start, stop = _lmoFoldBounds(len(xdata), numOfFolds, i)
            cvvalxdata = xdata[start:stop]
            cvvalzdata = zdata[start:stop]
            if i == 0:
                cvxdata = xdata[stop:]
                cvzdata = zdata[stop:]
            else:
                cvxdata = np.concatenate([xdata[0:start], xdata[stop:]])
                cvzdata = np.concatenate([zdata[0:start], zdata[stop:]])

            data["opts"]["nvaldata"] = len(cvvalxdata)
            data["opts"]["ndata"] = len(cvxdata)

            alamopy.almwriter(
                data, debug,
                (cvxdata, cvzdata, cvvalxdata, cvvalzdata),
                kwargs)

            # Calling ALAMO
            if not debug["mock"]:
                _runAlamo(data, debug)

            data["results"] = {}
            expandOutput(xdata, zdata, [cvvalxdata, cvvalzdata], data, debug)
            readTraceFile([cvvalxdata, cvvalzdata], data, debug)
            _recordFoldQ2(q2, data, debug)
            cleanFiles(data, debug)

        _storeMeanQ2(q2, data, debug, "LMO")

        del data["opts"]["nvaldata"]
        debug["validation"] = kwargValidation
        debug["savetrace"] = kwargSaveTrace
        if kwargValidation:
            data["opts"]["nvaldata"] = kwargNvaldata
        data["opts"]["ndata"] = kwargNdata

    # Write alamo file
    if debug["validation"]:
        alamopy.almwriter(data, debug, (xdata, zdata, xvaldata, zvaldata),
                          kwargs)
    else:
        alamopy.almwriter(data, debug, (xdata, zdata), kwargs)

    # Call alamo from the terminal
    if not debug["mock"]:
        if debug["showalm"]:
            _runAlamo(data, debug, redirect=False)
        else:
            writethis("Calling ALAMO now:\n")
            _runAlamo(data, debug)

    # Check to see if additional data was sampled and add it
    if "sampler" in kwargs.keys():
        xdata, zdata = checkForSampledData(data, debug)

    # calculate additional statistics
    expandOutput(xdata, zdata, vargs, data, debug)

    # Open the trace file and pull appropriate results
    readTraceFile(vargs, data, debug)

    # write python file of regressed model
    alamopy.almpywriter(data)
    if debug["cvfun"]:
        alamopy.almcvwriter(data)

    # add the name of the generated model module to the results dict
    data["results"]["pymodel"] = (
        data["stropts"]["almname"].split(".")[0] + "alm")

    # Compare the training R2 with the cross-validated Q2
    if debug["loo"] or debug["lmo"] > 0:
        if debug["outkeys"]:
            for k in data["results"]["R2"].keys():
                R2 = data["results"]["R2"][k]
                if k in data["results"]["Q2"].keys():
                    Q2 = data["results"]["Q2"][k]
                    _reportPredictivity("%s: " % k, Q2, R2)
        else:
            _reportPredictivity("", data["results"]["Q2"],
                                data["results"]["R2"])

    cleanFiles(data, debug, pywrite=True, **kwargs)

    return data["results"]