def Execute(context):
    timeStamp = TimeStamp()
    connectionSQL = ConnectionSQL(context)

    # Default model parameter values
    criterion = 'gini'
    max_features = None
    max_depth = 14
    min_samples_split = 6
    min_samples_leaf = 6
    n_jobs = -1  # Not used by the decision tree itself

    # Model parameter ranges - used when searching for optimal parameter settings
    criterionList = ['gini', 'entropy']
    max_featuresList = [7, 8, 9, 10, 11, 12, 13, 14, None]
    max_depthList = [6, 8, 10, 11, 12, 13, 14, None]
    min_samples_splitList = [3, 5, 6, 7, 8, 9]
    min_samples_leafList = [3, 4, 5, 6, 7]
    parameterRangeDict = {'criterion': criterionList,
                          'max_features': max_featuresList,
                          'max_depth': max_depthList,
                          'min_samples_split': min_samples_splitList,
                          'min_samples_leaf': min_samples_leafList}

    timeStamp1 = TimeStamp('Loading X, Y')
    # 0=DataID, 1=P_A, 2=P_B, 3=P_C, 4=P_D, 5=P_E, 6=P_F, 7=P_G, 8=State, ... 27=Cost
    # Load training data
    train_X, train_Y, train_DataID = connectionSQL.GetFeaturesAndResultsFromCache(featuresColumns=__featuresColumns, dataType="Train", modelName=__modelName, preProcessDataFrame=__PreProcessDataFrame, viewName=__viewName)
    train_DataID = None
    # Load cross-validation data
    cross_X, cross_Y, cross_DataID = connectionSQL.GetFeaturesAndResultsFromCache(featuresColumns=__featuresColumns, dataType="Cross", modelName=__modelName, preProcessDataFrame=__PreProcessDataFrame, viewName=__viewName)
    cross_DataID = None
    print 'Elapsed=' + timeStamp1.Elaspse
    print ''

    if (__optimiseParameters):
        timeStamp1 = TimeStamp('Optimise parameters')
        # Package parameters
        dataDict = {'train_X': train_X, 'train_Y': train_Y, 'cross_X': cross_X, 'cross_Y': cross_Y}
        # Optimal parameter selection
        #parameterDictMax = __OptimalParamaterSelection(dataDict=dataDict, accuracyFunct=__Accuracy, parameterRangeDict=parameterRangeDict)
        optimiseModelParameters = OptimiseModelParameters(modelName=__modelName)
        optimiseModelParameters.Percent = 0.2  # Sample 20% of the parameter grid
        parameterDictMax, accuracyMax = optimiseModelParameters.ExecuteMoneCarlo(dataDict=dataDict, accuracyFunct=__Accuracy, parameterRangeDict=parameterRangeDict)
        criterion = parameterDictMax['criterion']
        max_features = parameterDictMax['max_features']
        max_depth = parameterDictMax['max_depth']
        min_samples_split = parameterDictMax['min_samples_split']
        min_samples_leaf = parameterDictMax['min_samples_leaf']
        print 'Elapsed=' + timeStamp1.Elaspse
        print ''

    timeStamp1 = TimeStamp('Fit model')
    print __modelName + ' criterion=' + criterion + ' max_features=' + str(max_features) + ' max_depth=' + str(max_depth) + ' min_samples_split=' + str(min_samples_split) + ' min_samples_leaf=' + str(min_samples_leaf)
    clf = DecisionTreeClassifierMultiClass(splitter='best', random_state=1, criterion=criterion, max_features=max_features, max_depth=max_depth, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf)
    model = clf.fit(train_X, train_Y)  # All features must be float.
    print 'Elapsed=' + timeStamp1.Elaspse
    print ''

    # Training error reports
    SharedLibrary.TrainingError(dataType='Training', modelName=__modelName, clf=clf, model=model, dfX=train_X, dfY=train_Y)
    SharedLibrary.TrainingError(dataType='CrossVal', modelName=__modelName, clf=clf, model=model, dfX=cross_X, dfY=cross_Y)

    # Cross-validation report
    SharedLibrary.CrossValidation(modelName=__modelName, clf=clf, dfX=train_X, dfY=train_Y, n_fold=5)

    # UPDATE DRV_Predict with predictions for all data.
    timeStamp1 = TimeStamp('Update_DRV_Predict')
    all_X, all_Y, all_DataID = connectionSQL.GetFeaturesAndResultsFromCache(featuresColumns=__featuresColumns, dataType="ALL", modelName=__modelName, preProcessDataFrame=__PreProcessDataFrame, viewName=__viewName)
    all_Y = None
    connectionSQL.Update_DRV_Predict(modelName=__modelName, model=model, all_X=all_X, all_DataID=all_DataID)
    print 'Elapsed=' + timeStamp1.Elaspse
    print ''

    # Feature analysis summary: keep only features contributing more than 1% importance
    featureImportances = enumerate(clf.feature_importances_)
    featureImportanceHistogram = np.array([(importance, train_X.columns[i]) for (i, importance) in featureImportances if importance > 0.01])
    print __modelName + ' Elapsed=' + timeStamp.Elaspse
    return __modelName, featureImportanceHistogram
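# For reference, a minimal sketch of the Monte Carlo-style parameter search that
# OptimiseModelParameters.ExecuteMoneCarlo appears to perform: enumerate the full
# parameter grid, evaluate only a random sample of it (Percent of all combinations),
# and keep the combination with the best cross-validation accuracy. This is an
# illustrative assumption, not the actual implementation; the function name and the
# accuracyFunct signature are hypothetical.
import itertools
import random

def MonteCarloSearchSketch(dataDict, accuracyFunct, parameterRangeDict, percent=0.2):
    names = list(parameterRangeDict.keys())
    # Full Cartesian product of all parameter value lists
    grid = list(itertools.product(*[parameterRangeDict[name] for name in names]))
    sampleSize = max(1, int(len(grid) * percent))
    parameterDictMax, accuracyMax = {}, 0.0
    # Evaluate a random sample of the grid and keep the best-scoring combination
    for combination in random.sample(grid, sampleSize):
        parameterDict = dict(zip(names, combination))
        accuracy = accuracyFunct(dataDict=dataDict, parameterDict=parameterDict)
        if accuracy > accuracyMax:
            parameterDictMax, accuracyMax = parameterDict, accuracy
    return parameterDictMax, accuracyMax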
def Execute(context):
    connectionSQL = ConnectionSQL(context)

    # Default model parameter values
    penalty = None
    alpha = 0.0001
    fit_intercept = True
    n_iter = 20
    shuffle = False
    eta0 = 1

    # Model parameter ranges - used when searching for optimal parameter settings
    penaltyList = ['l1', 'l2', 'elasticnet', None]
    alphaList = [0.1, 0.001, 0.0001, 0.00001]
    fit_interceptList = [True, False]
    n_iterList = [5, 10, 20, 30, 40, 100]
    shuffleList = [False]
    eta0List = [1]
    parameterRangeDict = {'penalty': penaltyList,
                          'alpha': alphaList,
                          'fit_intercept': fit_interceptList,
                          'n_iter': n_iterList,
                          'shuffle': shuffleList,
                          'eta0': eta0List}

    # 0=DataID, 1=Actual, 2=Base, ...
    # Load training data
    train_X, train_Y, train_DataID = __GetFeaturesAndResults(context=context, connectionSQL=connectionSQL, dataType="Train", preProcessDataFrame=__PreProcessDataFrame)
    train_DataID = None
    # Load cross-validation data
    cross_X, cross_Y, cross_DataID = __GetFeaturesAndResults(context=context, connectionSQL=connectionSQL, dataType="Cross", preProcessDataFrame=__PreProcessDataFrame)
    cross_DataID = None

    if (__optimiseParameters):
        parameterDictMax = {}
        accuracyMax = 0.0
        # Package parameters
        dataDict = {'train_X': train_X, 'train_Y': train_Y, 'cross_X': cross_X, 'cross_Y': cross_Y}
        optimiseModelParameters = OptimiseModelParameters(modelName=__modelName)
        optimiseModelParameters.Percent = 0.20  # Sample 20% of the parameter grid
        parameterDictMax, accuracyMax = optimiseModelParameters.ExecuteMoneCarlo(dataDict=dataDict, accuracyFunct=__Accuracy, parameterRangeDict=parameterRangeDict)
        penalty = parameterDictMax['penalty']
        alpha = parameterDictMax['alpha']
        fit_intercept = parameterDictMax['fit_intercept']
        n_iter = parameterDictMax['n_iter']
        shuffle = parameterDictMax['shuffle']
        eta0 = parameterDictMax['eta0']

    print __modelName + ' Classifier'
    clf = Perceptron(penalty=penalty, alpha=alpha, fit_intercept=fit_intercept, n_iter=n_iter, shuffle=shuffle, random_state=1, eta0=eta0, warm_start=False)
    model = clf.fit(train_X, train_Y)  # All features must be float.
    print ''

    # Training error reports
    SharedLibrary.TrainingError(dataType='Training', modelName=__modelName, clf=clf, model=model, dfX=train_X, dfY=train_Y)
    SharedLibrary.TrainingError(dataType='CrossVal', modelName=__modelName, clf=clf, model=model, dfX=cross_X, dfY=cross_Y)

    # Cross-validation report
    SharedLibrary.CrossValidation(modelName=__modelName, clf=clf, dfX=train_X, dfY=train_Y, n_fold=5)

    # Report the learned per-feature weights, normalised so they sum to 1
    weights = clf.coef_[0]  # ndarray
    weights = weights / sum(weights)
    modelNames = train_X.keys()
    weightsDict = {}
    for index in range(len(modelNames)):
        weightsDict[modelNames[index]] = weights[index]
    weightsString = ''
    prefix = ''
    for key, value in weightsDict.iteritems():
        weightsString += prefix + key + '=' + str(value)
        prefix = ' '
    print 'Weights ' + weightsString
    print ''

    # UPDATE DRV_Predict with predictions for all data.
    all_X, all_Y, all_DataID = __GetFeaturesAndResults(context=context, connectionSQL=connectionSQL, dataType="ALL", preProcessDataFrame=__PreProcessDataFrame)
    all_Y = None
    connectionSQL.Update_DRV_Predict(modelName=__modelName, model=model, all_X=all_X, all_DataID=all_DataID)
    return __modelName
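# Each model module supplies an __Accuracy callback that drives the optimiser.
# A minimal sketch of what the Perceptron version might look like, assuming the
# callback receives the packaged data plus one candidate parameter dict and returns
# accuracy on the held-out cross-validation set (the exact signature is an
# assumption; the function name here is hypothetical):
def __AccuracySketch(dataDict, parameterDict):
    # Build a classifier from one candidate parameter combination
    clf = Perceptron(penalty=parameterDict['penalty'],
                     alpha=parameterDict['alpha'],
                     fit_intercept=parameterDict['fit_intercept'],
                     n_iter=parameterDict['n_iter'],
                     shuffle=parameterDict['shuffle'],
                     eta0=parameterDict['eta0'],
                     random_state=1)
    clf.fit(dataDict['train_X'], dataDict['train_Y'])
    # score() returns mean accuracy on the cross-validation set
    return clf.score(dataDict['cross_X'], dataDict['cross_Y'])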
def Execute(context):
    connectionSQL = ConnectionSQL(context)

    # Default model parameter values
    kernel = 'linear'
    degree = 3

    # Model parameter ranges - used when searching for optimal parameter settings
    kernelList = ['linear', 'poly', 'rbf', 'sigmoid']
    parameterRangeDict = {'kernel': kernelList}

    # 0=DataID, 1=Actual, 2=Pclass, ...
    # Load training data
    train_X, train_Y, train_DataID = connectionSQL.GetFeaturesAndResults(featuresColumns=__featuresColumns, dataType="Train", preProcessDataFrame=__PreProcessDataFrame)
    train_DataID = None
    # Load cross-validation data
    cross_X, cross_Y, cross_DataID = connectionSQL.GetFeaturesAndResults(featuresColumns=__featuresColumns, dataType="Cross", preProcessDataFrame=__PreProcessDataFrame)
    cross_DataID = None

    if (__optimiseParameters):
        parameterDictMax = {}
        accuracyMax = 0.0
        # Package parameters
        dataDict = {'train_X': train_X, 'train_Y': train_Y, 'cross_X': cross_X, 'cross_Y': cross_Y}
        # Optimal parameter selection: exhaustive search, one kernel at a time.
        # degree only affects the 'poly' kernel, so it is only varied there.
        kernelSearchSpace = [('linear', [3]),
                             ('poly', [2, 3, 4]),
                             ('rbf', [3]),
                             ('sigmoid', [3])]
        for kernelName, degreeList in kernelSearchSpace:
            parameterRangeDict = {'kernel': [kernelName], 'degree': degreeList}
            optimiseModelParameters = OptimiseModelParameters(modelName=__modelName)
            optimiseModelParameters.Percent = 0.2  # 20%
            parameterDict, accuracy = optimiseModelParameters.ExecuteExhaustive(dataDict=dataDict, accuracyFunct=__Accuracy, parameterRangeDict=parameterRangeDict)
            if (accuracy > accuracyMax):
                parameterDictMax = parameterDict
                accuracyMax = accuracy
        kernel = parameterDictMax['kernel']
        degree = parameterDictMax['degree']

    print __modelName + ' Classifier'
    clf = svm.SVC(probability=True, verbose=False, random_state=1, kernel=kernel, degree=degree)
    model = clf.fit(train_X, train_Y)  # All features must be float.
    print ''

    # Training error reports
    SharedLibrary.TrainingError(dataType='Training', modelName=__modelName, clf=clf, model=model, dfX=train_X, dfY=train_Y)
    SharedLibrary.TrainingError(dataType='CrossVal', modelName=__modelName, clf=clf, model=model, dfX=cross_X, dfY=cross_Y)

    # Cross-validation report
    SharedLibrary.CrossValidation(modelName=__modelName, clf=clf, dfX=train_X, dfY=train_Y, n_fold=5)

    # UPDATE DRV_Predict with predictions for all data.
    all_X, all_Y, all_DataID = connectionSQL.GetFeaturesAndResults(featuresColumns=__featuresColumns, dataType="ALL", preProcessDataFrame=__PreProcessDataFrame)
    all_Y = None
    connectionSQL.Update_DRV_Predict(modelName=__modelName, model=model, all_X=all_X, all_DataID=all_DataID)
    return __modelName
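# By contrast with the Monte Carlo sketch above, ExecuteExhaustive presumably
# evaluates every combination in the parameter grid. A minimal sketch under that
# assumption (the function name and accuracyFunct signature are hypothetical):
import itertools

def ExhaustiveSearchSketch(dataDict, accuracyFunct, parameterRangeDict):
    names = list(parameterRangeDict.keys())
    parameterDictMax, accuracyMax = {}, 0.0
    # Walk the full Cartesian product of all parameter value lists
    for combination in itertools.product(*[parameterRangeDict[name] for name in names]):
        parameterDict = dict(zip(names, combination))
        accuracy = accuracyFunct(dataDict=dataDict, parameterDict=parameterDict)
        if accuracy > accuracyMax:
            parameterDictMax, accuracyMax = parameterDict, accuracy
    return parameterDictMax, accuracyMax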
def Execute(context):
    connectionSQL = ConnectionSQL(context)

    # Default model parameter values
    n_estimators = 50
    max_features = 8  # None
    max_depth = 12
    min_samples_split = 5
    min_samples_leaf = 4
    bootstrap = False
    n_jobs = -1       # Use all available CPUs
    random_state = 1  # Use None or 0 to randomise results

    # Model parameter ranges - used when searching for optimal parameter settings
    n_estimatorsList = [20, 30, 40, 50, 70, 80, 100, 110]
    max_featuresList = [7, 8, 9, 10, 11, 12, 13, 14, None]
    #max_featuresList = ['auto']
    max_depthList = [6, 8, 10, 11, 12, 13, 14]
    min_samples_splitList = [3, 5, 6, 7, 8, 9]
    min_samples_leafList = [3, 4, 5, 6, 7]
    bootstrapList = [False]
    parameterRangeDict = {'n_estimators': n_estimatorsList,
                          'max_features': max_featuresList,
                          'max_depth': max_depthList,
                          'min_samples_split': min_samples_splitList,
                          'min_samples_leaf': min_samples_leafList,
                          'bootstrap': bootstrapList}

    # 0=DataID, 1=Actual, 2=Pclass, ...
    # Load training data
    train_X, train_Y, train_DataID = connectionSQL.GetFeaturesAndResults(featuresColumns=__featuresColumns, dataType="Train", preProcessDataFrame=__PreProcessDataFrame)
    #train_DataID = None
    # Load cross-validation data
    cross_X, cross_Y, cross_DataID = connectionSQL.GetFeaturesAndResults(featuresColumns=__featuresColumns, dataType="Cross", preProcessDataFrame=__PreProcessDataFrame)
    #cross_DataID = None

    if (__optimiseParameters):
        # Package parameters
        dataDict = {'train_X': train_X, 'train_Y': train_Y, 'cross_X': cross_X, 'cross_Y': cross_Y}
        # Optimal parameter selection
        #parameterDictMax = __OptimalParamaterSelection(dataDict=dataDict, accuracyFunct=__Accuracy, parameterRangeDict=parameterRangeDict)
        optimiseModelParameters = OptimiseModelParameters(modelName=__modelName)
        optimiseModelParameters.Percent = 0.05  # Sample 5% of the parameter grid
        parameterDictMax, accuracyMax = optimiseModelParameters.ExecuteMoneCarlo(dataDict=dataDict, accuracyFunct=__Accuracy, parameterRangeDict=parameterRangeDict)
        bootstrap = parameterDictMax['bootstrap']
        min_samples_leaf = parameterDictMax['min_samples_leaf']
        n_estimators = parameterDictMax['n_estimators']
        max_features = parameterDictMax['max_features']
        min_samples_split = parameterDictMax['min_samples_split']
        max_depth = parameterDictMax['max_depth']

    print __modelName + ' bootstrap=' + str(bootstrap) + ' min_samples_leaf=' + str(min_samples_leaf) + ' n_estimators=' + str(n_estimators) + ' max_features=' + str(max_features) + ' min_samples_split=' + str(min_samples_split) + ' max_depth=' + str(max_depth)
    oob_score = bootstrap  # oob_score is only available when bootstrap=True
    clf = RandomForestClassifier(n_jobs=-1, oob_score=oob_score, random_state=1, bootstrap=bootstrap, min_samples_leaf=min_samples_leaf, n_estimators=n_estimators, max_features=max_features, min_samples_split=min_samples_split, max_depth=max_depth)
    model = clf.fit(train_X, train_Y)  # All features must be float.
    print ''

    # Persist trained classifier to disk
    #clfUNC = __modelName + '.clf'
    #joblib.dump(clf, clfUNC)
    #clf = joblib.load(clfUNC)

    # Training error reports
    SharedLibrary.TrainingError(dataType='Training', modelName=__modelName, clf=clf, model=model, dfX=train_X, dfY=train_Y)
    SharedLibrary.TrainingError(dataType='CrossVal', modelName=__modelName, clf=clf, model=model, dfX=cross_X, dfY=cross_Y)

    # Cross-validation report
    SharedLibrary.CrossValidation(modelName=__modelName, clf=clf, dfX=train_X, dfY=train_Y, n_fold=5)

    # UPDATE DRV_Predict with predictions for all data.
    all_X, all_Y, all_DataID = connectionSQL.GetFeaturesAndResults(featuresColumns=__featuresColumns, dataType="ALL", preProcessDataFrame=__PreProcessDataFrame)
    all_Y = None
    connectionSQL.Update_DRV_Predict(modelName=__modelName, model=model, all_X=all_X, all_DataID=all_DataID)

    # Feature analysis summary: keep only features contributing more than 0.5% importance
    featureImportances = enumerate(clf.feature_importances_)
    featureImportanceHistogram = np.array([(importance, train_X.columns[i]) for (i, importance) in featureImportances if importance > 0.005])
    return __modelName, featureImportanceHistogram
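# A hedged usage sketch: how a caller might rank and print the feature-importance
# summary returned above. The rows are (importance, columnName) pairs; np.array
# coerces both entries to strings, hence the float() cast when sorting. The
# function name is hypothetical, and 'context' stands for whatever the pipeline
# normally passes in.
def PrintFeatureImportances(context):
    modelName, featureImportanceHistogram = Execute(context)
    # Sort rows by importance, highest first
    rows = sorted(featureImportanceHistogram, key=lambda row: float(row[0]), reverse=True)
    for importance, columnName in rows:
        print modelName + ' ' + columnName + ' importance=' + str(importance)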