def main():
    """Score the test set with the persisted model and write the submission CSV.

    Loads ``data/test.csv``, engineers the ``price * ingredients_count``
    interaction feature the trained pipeline expects, predicts with the model
    saved by the training step, and writes ``Recipe_code``/``predicted_score``
    pairs to ``out/predict_output.csv``.
    """
    path, target = dataut.getVars()
    # Load Test Set
    X_test = pd.read_csv(path + '/data/test.csv')
    # Load original merged set (only used to carry Recipe_code into the output)
    data = dataut.getData(path)
    # Model persisted by getBestParameters() during training
    loaded_model = joblib.load(path + '/meta/finalized_model.sav')
    # Interaction feature the fitted pipeline was trained with
    X_test['priceXing_count'] = X_test['price'] * X_test['ingredients_count']
    # Predict Score on Test Set
    predictions = loaded_model.predict(X_test)
    # .copy() so the column assignment below acts on a real frame, not a view
    # of `data` (avoids SettingWithCopyWarning / silently-lost writes).
    result_df = data.loc[data.Recipe_code.isin(X_test.Recipe_code)].copy()
    # NOTE(review): this assumes the filtered rows of `data` line up 1:1 and in
    # the same order as X_test's rows — confirm, otherwise the predictions are
    # attached to the wrong recipes.
    result_df['predicted_score'] = predictions
    # Save the submission dataframe with the new column
    submission_df = result_df[['Recipe_code', 'predicted_score']]
    submission_df.to_csv(path + '/out/predict_output.csv')
def getPipeline():
    """Return the untuned modelling Pipeline: feature engineering + GBR.

    Returns:
        sklearn Pipeline with a ``FeatureAdd`` step ('fadd') followed by a
        default ``GradientBoostingRegressor`` ('gbr').
    """
    # NOTE(review): the previous implementation also called
    # dataut.getVars() and dataut.preProcessData(path) here, but the results
    # were never used — that (potentially expensive) data load is dropped.
    # Confirm preProcessData has no required side effects before relying on this.
    return Pipeline([('fadd', FeatureAdd()),
                     ('gbr', GradientBoostingRegressor())])
def main():
    """Run the grid search and persist the best parameter set.

    Writes a human-readable summary to ``meta/best_params_model.txt`` and a
    pickled dict (via joblib) to ``meta/bestParams.pkl``.
    """
    # Get Pipeline components
    pipeline = modelut.getPipeline()
    # Get parameter options for Pipeline components
    parameters = modelut.getParameters()
    # Get best set of parameters and evaluate validation set accuracy
    bestParameters = getBestParameters(pipeline, parameters)
    path, dependentVar = dataut.getVars()
    # Save best parameter set. `with` guarantees the handles are flushed and
    # closed (the previous version leaked both file objects).
    with open(path + "/meta/best_params_model.txt", 'w') as res:
        res.write('best parameters set:\n')
        for paramName in sorted(parameters.keys()):
            res.write('\t %s: %r\n' % (paramName, bestParameters[paramName]))
    with open(path + "/meta/bestParams.pkl", "wb") as fh:
        joblib.dump(bestParameters, fh)
def getPipeline():
    """Rebuild the tuned Pipeline from the persisted best-parameter set.

    Returns:
        (pipe, parameters): the Pipeline configured with the best GBR
        hyperparameters, plus an empty dict — GridSearchCV downstream
        requires a parameter grid even when there is nothing left to search.
    """
    path, dependentVar = dataut.getVars()
    # Load best set of parameters found by the earlier grid search;
    # `with` ensures the pickle handle is closed.
    with open(path + "/meta/bestParams.pkl", "rb") as fh:
        bestParameters = joblib.load(fh)
    # BUG FIX: the hyperparameter dict was previously passed POSITIONALLY to
    # GradientBoostingRegressor, where the first positional parameter is
    # `loss` — so the tuned values were never applied. Pass real keyword args.
    gbr = GradientBoostingRegressor(
        n_estimators=bestParameters['gbr__n_estimators'],
        max_features=bestParameters['gbr__max_features'],
        max_depth=bestParameters['gbr__max_depth'],
        learning_rate=bestParameters['gbr__learning_rate'],
    )
    # Create sklearn Pipeline
    pipe = Pipeline([('fadd', modelut.FeatureAdd()), ('gbr', gbr)])
    # We create this empty dict as it is required for the syntax of GridSearchCV
    parameters = {}
    # Return sklearn Pipeline and empty dict
    return pipe, parameters
def getBestParameters(pipeline, parameters):
    """Grid-search the pipeline, persist the fitted search, and run the
    validation / feedback loop; returns the best parameter dict.

    NOTE(review): several column names here ("cuisine", "pred_cuisine") look
    copied from a different (cuisine-classification) project while the data is
    recipe scores — verify the feedut helpers expect these names.
    """
    path, dependentVar = dataut.getVars()
    X = dataut.preProcessData(path)
    y = X.loc[:, dependentVar]
    # create and fit a GBR model
    # NOTE(review): GridSearchCV is accessed via dataut here, not sklearn
    # directly — presumably re-exported; confirm.
    grid = dataut.GridSearchCV(pipeline, parameters)
    grid.fit(X, y)
    # summarize the results of the grid search
    print(grid.best_score_)
    print(grid.best_estimator_.get_params())
    bestParameters = grid.best_estimator_.get_params()
    # persist the fitted search object to disk (consumed by the predict script)
    filename = path + '/meta/finalized_model.sav'
    joblib.dump(grid, filename)
    # Display best set of parameters
    #print ('best parameters set:')
    #for paramName in sorted(parameters.keys()):
    #    print ('\t %s: %r' % (paramName, bestParameters[paramName]))
    # Evaluate performance of the grid-search regressor on the Validation Set
    X_valid = pd.read_csv(path + '/data/validation.csv')
    # extract target BEFORE dependentVar is rebound below
    y_valid = X_valid.loc[:, dependentVar]
    constant = 'Recipe_code'
    dependentVar = 'score'  # rebinds the name used above — order matters
    # same interaction feature used at training time
    X_valid['priceXing_count'] = X_valid['price'] * X_valid['ingredients_count']
    X_valid = X_valid.drop([dependentVar, constant], 1)
    # Make predictions on validation set and calculate best set of parameters
    bestParameters, predictions = feedut.validate(parameters, grid, X_valid,
                                                  y_valid)
    # Initialize DataFrame for feedback loop
    valdf = pd.DataFrame(index=X_valid.index.values)
    # Add ingredients column
    valdf = valdf.join(X_valid)
    # Add correct cuisine
    valdf["cuisine"] = y_valid
    # Add predictions column
    valdf["pred_cuisine"] = predictions
    # Add check column. This column would be false for incorrect predictions
    valdf["check"] = valdf.pred_cuisine == valdf.cuisine
    # Store DataFrame for feedback
    valdf.to_csv(path + "/out/feedback.csv")
    # Create joint DataFrame to incorporate feedback data. As of now, this will
    # only have the ingredients and cuisine columns from the training set
    ultimateTraindf = pd.DataFrame(index=X.index.values)
    ultimateTraindf = ultimateTraindf.join(X)
    ultimateTraindf["cuisine"] = y
    # Calculate best set of parameters after retraining with feedback data.
    # Make predictions on validation set
    bestParameters, predictions = feedut.feedback(pipeline, parameters,
                                                  ultimateTraindf)
    # Dead code preserved as a string literal by the original author:
    """validation_R2 = r2_score(y_valid,grid.fit(X_valid,y_valid).predict(X_valid))
    print("............................................")
    print("............................................")
    print("our validation set R2 score is %.2f%%"%(validation_R2*100))
    X_valid['pred_score'] = grid.predict(X_valid)
    X_valid['difference'] = X_valid['score']-X_valid['pred_score']
    X_valid = X_valid[['score','pred_score','difference']]
    X_valid.to_csv(path+'/out/validation_predict.csv')
    """
    return bestParameters