def getModelFileName(config, modelname):
    """Return the pickle path for a model: <modelsDir>/<prefix>-<modelname>.p

    The prefix is "regressor" or "classifier" depending on the configured
    problem type (stays None for any other problem type, matching the
    original behavior).
    """
    problem = config['problem']
    if isRegression(problem):
        prefix = "regressor"
    elif isClassification(problem):
        prefix = "classifier"
    else:
        prefix = None
    basename = "{0}-{1}.p".format(prefix, modelname)
    return setFile(getModelsDir(config), basename)
def testModel(modelname, estimator, X_test, config):
    """Run a fitted estimator on X_test and collect its predictions.

    Returns a dict with keys:
      good  -- False if any prediction step failed
      label -- predicted classes (classification only)
      prob  -- positive-class probabilities (classification only)
      pred  -- predicted values (regression only)
    """
    info("Testing a {0} estimator".format(modelname), ind=0)
    info("X data is {0}".format(getDim(X_test)), ind=2)

    problemType = config['problem']
    results = {"good": True, "label": None, "prob": None, "pred": None}

    # Accept either a raw estimator or the dict produced by the tuning step.
    if isinstance(estimator, dict):
        estimator = estimator['estimator']
    if estimator is None:
        error("The {0} estimator is NULL".format(modelname))
        results['good'] = False
        return results

    if isClassification(problemType):
        info("Predicting classification labels/classes for {0}".format(modelname), ind=4)
        try:
            results['label'] = estimator.predict(X_test)
        except:
            results['good'] = False
            error("There is a problem getting labels for {0}".format(modelname), ind=4)

        info("Predicting classification probabilities for {0}".format(modelname), ind=4)
        try:
            proba = estimator.predict_proba(X_test)
            # Column 1 is the positive class for binary problems.
            results['prob'] = proba[:, 1]
        except:
            results['good'] = False
            error("There is a problem getting probabilities for {0}".format(modelname), ind=4)

    if isRegression(problemType):
        info("Predicting regression score/output for {0}".format(modelname), ind=4)
        try:
            results['pred'] = estimator.predict(X_test)
        except:
            results['good'] = False
            error("There is a problem getting prediction for {0}".format(modelname), ind=4)

    if results['good']:
        info("Everything looks good for the {0} estimator".format(modelname), ind=4)
    else:
        info("There is a problem with the {0} estimator".format(modelname), ind=4)
    return results
def getParamDist(config, modelname, nfeatures = None):
    """Return a hyperparameter distribution dict for randomized search.

    Currently only the tree-based models (rf, extratrees, dtree, gbm) have
    distributions; any other model name returns None.
    """
    info("Getting parameter distributions for {0}".format(modelname), ind=2)
    param_dist = None
    epsilon = 0.000001
    problemType = config['problem']

    # Parameters common to all tree-based models.
    treeParams = {"max_depth": [2, 4, 6, None],
                  "max_features": ['auto', 'sqrt', 'log2', None],
                  "min_impurity_decrease": sp_randfloat(0.0, 1-epsilon),
                  "min_samples_leaf": sp_randint(1, 10)}

    ###########################################################################
    ## rf, extratrees, dtree, gbm
    ###########################################################################
    if modelname in ["rf", "extratrees", "dtree", "gbm"]:
        param_dist = treeParams
        if modelname == "rf" or modelname == "extratrees":
            param_dist["bootstrap"] = [True, False]
        if modelname == "gbm":
            param_dist["learning_rate"] = sp_randfloat(0.01, 0.5)
        # FIX: original read `if modelname = [...]` -- an assignment inside a
        # condition (SyntaxError); a membership test was clearly intended.
        if modelname in ["rf", "extratrees", "dtree"]:
            if isClassification(problemType):
                param_dist["criterion"] = ["gini", "entropy"]
            if isRegression(problemType):
                param_dist["criterion"] = ["mae", "mse"]
        # FIX: these gbm-specific criterion/loss choices were unguarded in the
        # original and clobbered the criterion set for rf/extratrees/dtree.
        if modelname == "gbm":
            if isClassification(problemType):
                param_dist["criterion"] = ['mse', 'friedman_mse']
                param_dist["loss"] = ['deviance', 'exponential']
            if isRegression(problemType):
                param_dist["criterion"] = ["friedman_mse"]
                param_dist["loss"] = ['ls']

    # FIX: the original never returned, so callers always received None.
    return param_dist
def _modelsForLevel(level, models0, models1, models2, models3):
    # Cumulative tiers: level 0 -> tier 0 only, level 1 -> tiers 0-1, etc.;
    # any other level value gets every tier (matches the original else branch).
    if level == 0:
        count = 1
    elif level == 1:
        count = 2
    elif level == 2:
        count = 3
    else:
        count = 4
    merged = []
    for tier in [models0, models1, models2, models3][:count]:
        merged += tier
    return merged


def getModels(config, level):
    """Return the list of model names to run for the given search level.

    Higher levels add progressively more (and more expensive/obscure) models.
    Returns an empty list for an unrecognized problem type.
    """
    info("Getting Models For Level {0}".format(level), ind=0)
    problemType = config['problem']
    # FIX: initialize so an unrecognized problem type returns [] instead of
    # raising NameError at the info() call below.
    models = []

    if isClassification(problemType):
        models0 = ["xgboost", "logistic"]
        models1 = ["rf", "nn", "svmnulinear", "gbm"]
        models2 = ["extratrees", "sgd", "nb", "lda", "kneighbors",
                   "svmepslinear"]
        models3 = ["passagg", "gaussproc", "qda", "nbbern", "nbmulti",
                   "dtree", "rneighbors", "svmlin", "svmnu", "adaboost",
                   "svmnupoly", "svmepspoly", "svmnusigmoid", "svmepssigmoid",
                   "svmnurbf", "svmepsrbf"]
        models = _modelsForLevel(level, models0, models1, models2, models3)

    if isRegression(problemType):
        models0 = ["xgboost", "linear"]
        # FIX: "xgboost" and "linear" were repeated here from models0, so
        # levels >= 1 trained those models twice.
        models1 = ["rf", "nn", "svm", "gbm"]
        models2 = ["extratrees", "sgd", "earth", "kernelridge", "kneighbors"]
        models3 = ["passagg", "gaussproc", "lasso", "ridge", "elasticnet",
                   "bayesridge", "huber", "theilsen", "ransac", "dtree",
                   "rneighbors", "svmlin", "svmnu", "adaboost"]
        models = _modelsForLevel(level, models0, models1, models2, models3)

    info("Using the following models: {0}".format(models), 2)
    return models
def plotResults(perfs, y_truth, config):
    """Produce all performance plots for the models in *perfs*.

    If the configured extension is pdf and multipage is enabled, every plot
    goes into a single results.pdf; otherwise plots are saved individually.
    Models with empty performance dicts are removed from *perfs* (mutates
    the caller's dict) and skipped.
    """
    info("Making Performance Plots", ind=0)
    outdir = getPlotsDir(config)
    performanceConfig = config['performance']
    ext = performanceConfig['ext']
    isPdf = ext == 'pdf'
    isMultipage = performanceConfig['multipage']
    useMultipagePdf = isMultipage and isPdf

    if useMultipagePdf:
        pdfname = setFile(outdir, 'results.pdf')
        info("Saving all performance plots to {0}".format(pdfname), ind=2)
        pp = PdfPages(pdfname)
    else:
        info("Saving all performance plots individually as {0}".format(ext), ind=2)
        pp = None

    # Drop models that produced no performance data.
    for badname in [m for m in perfs.keys() if len(perfs[m]) == 0]:
        info("Not plotting {0}".format(badname))
        del perfs[badname]

    problem = config['problem']
    if isClassification(problem):
        for plotter in (plotKappa, plotPrecision, plotRecall, plotLogLoss,
                        plotAccuracy, plotPrecisionRecall, plotROC):
            plotter(perfs, outdir, ext, pp)
        plotConfusionMatrix(perfs, config, outdir, ext, pp)
    if isRegression(problem):
        for plotter in (plotMAE, plotMSE, plotExplainedVariance, plotR2,
                        plotResiduals):
            plotter(perfs, outdir, ext, pp)

    if useMultipagePdf:
        info("Closing multipage pdf", ind=2)
        # NOTE(review): savefig() here writes the *current* figure to the pdf;
        # if every plot helper already saved itself this may add a stray final
        # page -- confirm against the plot helpers.
        pp.savefig()
        pp.close()
def getModelPerformance(y_truth, testResults, config):
    """Compute performance metrics for one model's test results.

    Dispatches to the classifier or regression metric helper based on the
    configured problem type. Returns {} when metrics cannot be computed or
    the problem type is unrecognized.
    """
    info("Getting model performance", ind=0)
    problemType = config['problem']
    # FIX: initialize results so an unrecognized problem type returns {}
    # instead of raising NameError at the final return.
    results = {}
    if isClassification(problemType):
        try:
            results = getClassifierPerformance(y_truth, testResults)
        # FIX: narrowed from a bare except (which also swallowed
        # KeyboardInterrupt/SystemExit).
        except Exception:
            error("There was a problem getting classification performance data",
                  ind=4)
            results = {}
    if isRegression(problemType):
        try:
            results = getRegressionPerformance(y_truth, testResults)
        except Exception:
            error("There was a problem getting regression performance data",
                  ind=4)
            results = {}
    return results
def formatData(trainData, testData, config):
    """Format train/test frames for modeling (mutates both in place).

    Steps: validate the target column, infer/reuse the problem type,
    binarize the target for classification, fill NA values per the
    configured strategies, and drop unneeded columns.  Returns the
    (trainData, testData) pair.

    FIX: everything after the return in the original was unreachable dead
    code and syntactically broken (a string literal split across lines and a
    final `return pddata` referencing an undefined name); it has been
    removed, along with a leftover `print featureNAstrategy` debug statement.
    """
    info('Formatting training data of size ' + getDim(trainData), ind=0)
    info('Formatting testing data of size ' + getDim(testData), ind=0)

    ## Config info
    targetConfig = config['target']
    targetcol = targetConfig['colname']
    positiveTarget = targetConfig['positive']
    targetNAstrategy = targetConfig['NAstrategy']
    featureConfig = config['feature']
    featureNAstrategy = featureConfig['NAstrategy']

    if not isColumn(trainData, targetcol):
        raise ValueError("Target column", targetcol, "is not a valid column.")

    # 1) Get problem type (reuse a previously cached one when present)
    targetData = trainData[targetcol]
    if config.get('problem'):
        problemType = config['problem']
    else:
        problemType = getProblemType(targetData)
        config['problem'] = problemType

    # 2) format target based on what we want
    info('Formatting target', ind=1)
    if isClassification(problemType):
        convertToBinaryInt(trainData, targetcol, positiveTarget)
        # Test data may legitimately lack the target column.
        if isColumn(testData, targetcol):
            convertToBinaryInt(testData, targetcol, positiveTarget)
    if isRegression(problemType):
        info('Not formatting target since it is regression', ind=1)

    # 3) replace NA
    info('Replace NA in data', ind=1)
    replaceTargetNA(trainData, targetcol, targetNAstrategy)
    replaceFeatureNA(trainData, targetcol, featureNAstrategy)
    if isColumn(testData, targetcol):
        replaceTargetNA(testData, targetcol, targetNAstrategy)
        replaceFeatureNA(testData, targetcol, featureNAstrategy)

    # 4) drop columns we don't need
    dropData(trainData, config)
    dropData(testData, config)

    return trainData, testData
def formatData(pddf, config):
    """Format a single data frame for modeling and return it.

    Pipeline: validate target -> infer problem type -> binarize target
    (classification) -> fill NA -> analyze/drop columns -> label + one-hot
    encode categoricals -> re-fill any remaining NA.

    NOTE(review): this definition shadows the earlier
    formatData(trainData, testData, config) in this module, so only this
    version is callable -- confirm the earlier one is meant to be dead.
    """
    info('Formatting data of size ' + getDim(pddf), ind=0)

    ## Config info
    targetConfig = config['target']
    targetcol = targetConfig['colname']
    positiveTarget = targetConfig['positive']
    targetNAstrategy = targetConfig['NAstrategy']
    featureConfig = config['feature']
    featureNAstrategy = featureConfig['NAstrategy']

    if not isColumn(pddf, targetcol):
        raise ValueError("Target column", targetcol, "is not a valid column.")

    # 1) Get problem type (reuse a cached one from config when present)
    targetData = pddf[targetcol]
    if config.get('problem'):
        problemType = config['problem']
    else:
        problemType = getProblemType(targetData)
        config['problem'] = problemType

    # 2) format target based on what we want
    info('Formatting target', ind=2)
    if isClassification(problemType):
        convertToBinaryInt(pddf, targetcol, positiveTarget)
    if isRegression(problemType):
        info('Not formatting target since it is regression', ind=1)

    # 3) replace NA
    info('Replace NA in data', ind=2)
    replaceTargetNA(pddf, targetcol, targetNAstrategy)
    replaceFeatureNA(pddf, targetcol, featureNAstrategy)

    # 4) remove low variance data (log only -- no removal implemented yet)
    info('Remove low variance in data', ind=2)

    # 5) drop columns we don't need
    info('Analyze data for possible drops', ind=2)
    analyzeColumns(pddf, config)
    dropData(pddf, config)
    info('Post column data the data is now ' + getDim(pddf), ind=2)

    # 6) label and one-hot encode data
    info('Label encode training data to numeric', ind=2)
    pddf, encodedCatData, labelEncoders = getEncodedData(pddf)
    info('Hot encode training data to sparse data frame', ind=1)
    encodedData = getHotEncodedData(encodedCatData, labelEncoders)
    info('Join training data together', ind=2)
    pddf = pddf.join(encodedData)
    info('Post formatting the data is now ' + getDim(pddf), ind=2)

    # 7) replace low variance (log only -- classification path unimplemented)
    info('Remove low variance features in data', ind=2)
    if isClassification(problemType):
        info('Classification is To do!', ind=4)
    if isRegression(problemType):
        info('Not removing any features since it is regression', ind=1)

    # 8) replace NA (if any remain, e.g. introduced by the join)
    info('Replace NA (if any) in data', ind=2)
    replaceTargetNA(pddf, targetcol, targetNAstrategy)
    replaceFeatureNA(pddf, targetcol, featureNAstrategy)
    if sum(pddf.isnull().any()) > 0:
        error("There are still NA entries in the dataset!", ind=4)

    info('Finished formatting data. Data is now ' + getDim(pddf), ind=2)
    return pddf
def tuneModel(modelname, estimator, params, X_train, y_train, config):
    """Hyperparameter-tune an estimator with grid or randomized CV search.

    Returns {"estimator": best_estimator, "params": best_params,
    "cv": cv_results}; all values are None when estimator/params are missing.
    """
    info("Tuning a {0} estimator".format(modelname), ind=0)
    if estimator is None or params is None:
        error("There is no estimator with parameters information.", ind=2)
        return {"estimator": None, "params": None, "cv": None}

    problemType = config['problem']
    try:
        modelData = getModelData(config, modelname)
    except Exception:
        error("There is no model parameter data for the {0} estimator".format(modelname))
        # FIX: modelData was left unbound here, causing a NameError at
        # modelData.get('iter') below.
        modelData = {}

    # FIX: initialize so an unrecognized problem type raises the explicit
    # ValueError below instead of a NameError.
    scorers = []
    scorer = None
    if isClassification(problemType):
        scorers = ["accuracy", "average_precision", "f1", "f1_micro",
                   "f1_macro", "f1_weighted", "f1_samples", "neg_log_loss",
                   "precision", "recall", "roc_auc"]
        scorer = "roc_auc"
    if isClustering(problemType):
        scorers = ["adjusted_mutual_info_score", "adjusted_rand_score",
                   "completeness_score", "fowlkes_mallows_score",
                   "homogeneity_score", "mutual_info_score",
                   "normalized_mutual_info_score", "v_measure_score"]
        scorer = "adjusted_mutual_info_score"
    if isRegression(problemType):
        scorers = ["explained_variance", "neg_mean_absolute_error",
                   "neg_mean_squared_error", "neg_mean_squared_log_error",
                   "neg_median_absolute_error", "r2"]
        scorer = "neg_mean_absolute_error"
    if scorer not in scorers:
        raise ValueError("Scorer {0} is not allowed".format(scorer))

    searchType = "random"
    if searchType == "grid":
        param_grid = params['grid']
        tuneEstimator = GridSearchCV(estimator, param_grid=param_grid, cv=2,
                                     scoring=scorer, verbose=1)
    elif searchType == "random":
        n_iter_search = modelData.get('iter')
        if n_iter_search is None:
            n_iter_search = 10
        param_dist = params['dist']
        # FIX: pass scoring=scorer -- the original omitted it, so the random
        # search silently used the estimator's default score instead of the
        # scorer chosen above (unlike the grid path).
        tuneEstimator = RandomizedSearchCV(estimator,
                                           param_distributions=param_dist,
                                           cv=2, n_iter=n_iter_search,
                                           scoring=scorer, verbose=1,
                                           n_jobs=-1,
                                           return_train_score=True)
    else:
        raise ValueError("Search type {0} is not allowed".format(searchType))

    info("Running {0} parameter search".format(searchType), ind=2)
    tuneEstimator.fit(X_train, y_train)

    bestEstimator = tuneEstimator.best_estimator_
    bestScore = tuneEstimator.best_score_
    bestParams = tuneEstimator.best_params_
    cvResults = tuneEstimator.cv_results_
    cvScores = cvResults['mean_test_score']
    fitTimes = cvResults['mean_fit_time']
    info("Tested {0} Parameter Sets".format(len(fitTimes)), ind=4)
    info("CV Fit Time Info (Mean,Std): ({0} , {1})".format(round(fitTimes.mean(),1), round(fitTimes.std(),1)), ind=4)
    info("Best Score : {0}".format(round(bestScore, 3)), ind=4)
    info("CV Test Scores (Mean,Std) : ({0} , {1})".format(round(cvScores.mean(),1), round(cvScores.std(),1)), ind=4)
    info("Best Parameters", ind=4)
    # iteritems: this module is Python 2 (print statement used elsewhere).
    for paramName, paramVal in bestParams.iteritems():
        info("Param: {0} = {1}".format(paramName, paramVal), ind=6)

    return {"estimator": bestEstimator, "params": bestParams, "cv": cvResults}
def getModel(config, modelname):
    """Build and return the model for *modelname* under the configured problem type.

    Looks the name up in a per-problem-type table of lazy factories, wraps
    the estimator via classifier(), and returns wrapper.get().  Raises
    ValueError for any name without a factory (including "perceptron" in
    regression, which the original also left unimplemented).
    """
    info("Getting {0} Model".format(modelname), ind=0)
    problemType = config['problem']
    modelData = getModelData(config, modelname)
    modelParams = modelData.get('params')

    def make(estimatorClass):
        # Zero-argument factory: the estimator is only instantiated if this
        # model name is actually selected (mirrors the original if-chain).
        return lambda: classifier(modelname, estimatorClass(), modelParams)

    factories = {}

    ###########################################################################
    # Classification
    ###########################################################################
    if isClassification(problemType):
        factories = {
            "logistic":     make(LogisticRegression),
            "sgd":          make(SGDClassifier),
            "passagg":      make(PassiveAggressiveClassifier),
            "mlp":          make(MLPClassifier),
            "xgboost":      make(XGBClassifier),
            "gaussproc":    make(GaussianProcessClassifier),
            "lda":          make(LinearDiscriminantAnalysis),
            "qda":          make(QuadraticDiscriminantAnalysis),
            "nb":           make(GaussianNB),
            "nbbern":       make(BernoulliNB),
            "nbmulti":      make(MultinomialNB),
            "dtree":        make(DecisionTreeClassifier),
            "kneighbors":   make(KNeighborsClassifier),
            "rneighbors":   make(RadiusNeighborsClassifier),
            "svmlin":       make(LinearSVC),
            # Kernel-specific names map to the same class; presumably
            # modelParams selects the kernel -- confirm in the model config.
            "svmnupoly":    make(NuSVC),
            "svmnulinear":  make(NuSVC),
            "svmnusigmoid": make(NuSVC),
            "svmnurbf":     make(NuSVC),
            "svmepspoly":   make(SVC),
            "svmepslinear": make(SVC),
            "svmepssigmoid": make(SVC),
            "svmepsrbf":    make(SVC),
            "rf":           make(RandomForestClassifier),
            "extratrees":   make(ExtraTreesClassifier),
            "adaboost":     make(AdaBoostClassifier),
            "gbm":          make(GradientBoostingClassifier),
            "tpot":         make(TPOTClassifier),
            # External (non-sklearn) classifier with its own constructor.
            "lightning":    lambda: external.extlightning.createLightningClassifier(modelParams),
        }

    ###########################################################################
    # Regression
    ###########################################################################
    if isRegression(problemType):
        factories = {
            "linear":       make(LinearRegression),
            "ridge":        make(Ridge),
            "lasso":        make(Lasso),
            "elasticnet":   make(ElasticNet),
            "omp":          make(OrthogonalMatchingPursuit),
            "bayesridge":   make(BayesianRidge),
            "ard":          make(ARDRegression),
            "sgd":          make(SGDRegressor),
            "passagg":      make(PassiveAggressiveRegressor),
            "huber":        make(HuberRegressor),
            "theilsen":     make(TheilSenRegressor),
            "ransac":       make(RANSACRegressor),
            "mlp":          make(MLPRegressor),
            "xgboost":      make(XGBRegressor),
            "gaussproc":    make(GaussianProcessRegressor),
            "dtree":        make(DecisionTreeRegressor),
            "kneighbors":   make(KNeighborsRegressor),
            "rneighbors":   make(RadiusNeighborsRegressor),
            "svmlin":       make(LinearSVR),
            "svmnupoly":    make(NuSVR),
            "svmnulinear":  make(NuSVR),
            "svmnusigmoid": make(NuSVR),
            "svmnurbf":     make(NuSVR),
            "svmepspoly":   make(SVR),
            "svmepslinear": make(SVR),
            "svmepssigmoid": make(SVR),
            "svmepsrbf":    make(SVR),
            "rf":           make(RandomForestRegressor),
            "extratrees":   make(ExtraTreesRegressor),
            "adaboost":     make(AdaBoostRegressor),
            "gbm":          make(GradientBoostingRegressor),
            "isotonic":     make(IsotonicRegression),
            "earth":        make(Earth),
            "symbolic":     make(SymbolicRegressor),
            "tpot":         make(TPOTRegressor),
        }

    factory = factories.get(modelname)
    retval = factory() if factory is not None else None
    if retval is None:
        raise ValueError(
            "No model with name {0} was created".format(modelname))

    model = retval.get()
    return model