def find_xgb_best_parameters(test_size=0.2, n_iter_search=20, X=None, y=None): if X is None or y is None: X, y = pr_kaggle.load_data(cat2vectors=True) Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=test_size, random_state=36) param_dist = { "n_estimators": [50, 100, 250, 500], "max_depth": [10, 5, 15], "learning_rate": [0.01, 0.1, 0.0333], "subsample": [0.5, 1.0, 0.80], #"gamma": [0,0.01], #"min_child_weight": [0.5, 1], "colsample_bytree": [1.0, 0.5, 0.8, 0.9] } start = time() clf = xgb.XGBClassifier() random_search = RandomizedSearchCV(clf, param_distributions=param_dist, n_iter=n_iter_search, n_jobs=1) print Xtrain.shape random_search.fit(Xtrain, ytrain) print( "RandomizedSearchCV took %.2f seconds for %d candidates" " parameter settings." % ((time() - start), n_iter_search)) report(random_search.grid_scores_) print 'training', random_search.score(Xtrain, ytrain) print 'testing', random_search.score(Xtest, ytest) return random_search
def trainClassifier(X, y, testTweetsAll, ensembleTweets, ensembleSentiment): # split our data into training and test datasets xTrain, xTest, yTrain, yTest = train_test_split(X, y, test_size=0.33, random_state=8) classifier = RandomForestClassifier(n_estimators=20, n_jobs=-1) # for simplicity's sake, we could train a single random forest: # classifier.fit(xTrain, yTrain) # print classifier.score(xTest, yTest) # for more fun, we will optimize the hyperparameters for our random forest using RandomizedSearchCV parametersToTry = { 'max_features': ['sqrt', 'log2', None, .01, .1, .2, .3], 'criterion': ['gini', 'entropy'], 'min_samples_leaf': [1], 'min_samples_split': scipy.stats.randint(2, 30), 'bootstrap': [True, False] } # RandomizedSearchCV will optimize our hyperparameters for us in a way that is much more efficient and comprehensive than GridSearchCV. # run on all cores, fail gracefully if a combination of hyperparameters fails to converge, try 10 different combinations of hyperparameters, train on all the training data when finished, and use a third of the dataset for cross-validation while searching for the best hyperparameters searchCV = RandomizedSearchCV(classifier, parametersToTry, n_jobs=-1, error_score=0, n_iter=10, refit=True, cv=3) print 'shape of this training data set:' print xTrain.shape searchCV.fit(xTrain, yTrain) print 'the best hyperparameters from this search are:' print searchCV.best_params_ print 'best score from hyperparameter search is: ' + str( searchCV.best_score_) print 'score on the holdout portion of the training set: ' + str( searchCV.score(xTest, yTest)) print 'score on the ensemble data: ' + str( searchCV.score(ensembleTweets, ensembleSentiment)) + '\n\n' testPredictions = searchCV.predict_proba(testTweetsAll) ensemblePredictions = searchCV.predict_proba(ensembleTweets) def singlePrediction(predictions): cleanedPredictions = [] for predictionRow in predictions: cleanedPredictions.append(predictionRow[1]) return cleanedPredictions # the 
classifier gives us a predicted probability for both the 0 and the 1 case. Given that they're mutually exclusive, we can simplify down to a single number (the predicted probability of the 1 case) testPredictions = singlePrediction(testPredictions) ensemblePredictions = singlePrediction(ensemblePredictions) return testPredictions, ensemblePredictions
def trainClassifier(X, y, testTweetsAll, ensembleTweets, ensembleSentiment): # split our data into training and test datasets xTrain, xTest, yTrain, yTest = train_test_split( X, y, test_size=0.33, random_state=8) classifier = RandomForestClassifier(n_estimators=20, n_jobs=-1) # for simplicity's sake, we could train a single random forest: # classifier.fit(xTrain, yTrain) # print classifier.score(xTest, yTest) # for more fun, we will optimize the hyperparameters for our random forest using RandomizedSearchCV parametersToTry = { 'max_features': ['sqrt','log2',None,.01,.1,.2,.3], 'criterion': ['gini','entropy'], 'min_samples_leaf': [1], 'min_samples_split': scipy.stats.randint(2,30), 'bootstrap': [True,False] } # RandomizedSearchCV will optimize our hyperparameters for us in a way that is much more efficient and comprehensive than GridSearchCV. # run on all cores, fail gracefully if a combination of hyperparameters fails to converge, try 10 different combinations of hyperparameters, train on all the training data when finished, and use a third of the dataset for cross-validation while searching for the best hyperparameters searchCV = RandomizedSearchCV(classifier, parametersToTry, n_jobs=-1, error_score=0, n_iter=10, refit=True, cv=3) print 'shape of this training data set:' print xTrain.shape searchCV.fit(xTrain, yTrain) print 'the best hyperparameters from this search are:' print searchCV.best_params_ print 'best score from hyperparameter search is: ' + str(searchCV.best_score_) print 'score on the holdout portion of the training set: ' + str( searchCV.score(xTest, yTest) ) print 'score on the ensemble data: ' + str( searchCV.score(ensembleTweets, ensembleSentiment) ) + '\n\n' testPredictions = searchCV.predict_proba(testTweetsAll) ensemblePredictions = searchCV.predict_proba(ensembleTweets) def singlePrediction(predictions): cleanedPredictions = [] for predictionRow in predictions: cleanedPredictions.append(predictionRow[1]) return cleanedPredictions # the 
classifier gives us a predicted probability for both the 0 and the 1 case. Given that they're mutually exclusive, we can simplify down to a single number (the predicted probability of the 1 case) testPredictions = singlePrediction(testPredictions) ensemblePredictions = singlePrediction(ensemblePredictions) return testPredictions, ensemblePredictions
def parametr_tuning_random(model, params, scores, X_train, Y_train, X_test,
                           Y_test, n_iter_search=10):
    """Run one randomized search per scoring name and log the results.

    For every entry of ``scores`` a 5-fold ``RandomizedSearchCV`` over
    ``params`` is fitted on the training data.  The best parameters, the
    score on the test data (times 100), the per-candidate scores and a
    classification report for the test predictions are passed to ``log``.
    Nothing is returned.
    """
    for score in scores:
        log("# Tuning hyper-parameters for %s: " % score)
        log("", False)

        search = RandomizedSearchCV(
            model, params, n_iter=n_iter_search, cv=5, scoring=score)
        search.fit(X_train, Y_train)

        log("Best parameters set found on development set:")
        log(str(search.best_params_), False)

        log("random search score _ TEST set:")
        test_score_percent = search.score(X_test, Y_test) * 100
        log(str(test_score_percent), False)
        log("", False)

        log("random search scores on development set:")
        log(str(search.grid_scores_), False)
        log("", False)

        log("Detailed classification report:")
        log("", False)
        predicted = search.predict(X_test)
        log(classification_report(Y_test, predicted), False)
        log("", False)
def create_svm(pd, pl, qd, ql):
    """Fit a randomized search over ``C`` for a linear SVM and return it.

    ``pd`` / ``pl`` are the data and labels the search is fitted on;
    ``qd`` / ``ql`` are the data and labels used for the printed score.
    ``C`` is drawn from ``expon(scale=100)`` for 10 candidates, evaluated
    with 4 parallel jobs.
    """
    c_distribution = {'C': expon(scale=100)}
    svm = RandomizedSearchCV(
        LinearSVC(),
        c_distribution,
        n_jobs=4,
        n_iter=10,
        verbose=10,
    )

    print("Training Linear SVM Randomly")
    svm.fit(pd, pl)

    held_out_score = svm.score(qd, ql)
    print("SVM Score: " + str(held_out_score))
    return svm
def hyperparameter_tuning(model, params, X, y):
    """Tune ``model`` with a randomized search and return the fitted search.

    Parameters
    ----------
    model : estimator handed to ``RandomizedSearchCV``.
    params : parameter lists / distributions to sample from.
    X, y : data and targets the search is fitted on.

    Returns
    -------
    The fitted ``RandomizedSearchCV`` object, so callers can reach
    ``best_estimator_`` / ``best_params_``.  Earlier versions returned
    ``None``, which discarded the tuned model.
    """
    # tune the hyperparameters via a randomized search
    grid = RandomizedSearchCV(model, params)
    start = time.time()
    grid.fit(X, y)
    print("[INFO] randomized search took {:.2f} seconds".format(
        time.time() - start))

    # NOTE: this scores the best model on the same data it was fitted on,
    # not on held-out data, so the figure is an optimistic estimate.
    acc = grid.score(X, y)
    print("[INFO] grid search accuracy: {:.2f}%".format(acc * 100))
    print("[INFO] randomized search best parameters: {}".format(
        grid.best_params_))
    return grid
def main():
    """Tune ``PasswordEstimator`` with a randomized search and log the result.

    Reads ``data/<DATA_FILE>``, keeps the rows that have a password and whose
    ``done`` column equals ``'1'``, splits them into a training and a test
    set, runs a ``RandomizedSearchCV`` with ``BUDGET`` iterations on the
    training set, and logs the best parameters and the test-set score.
    """
    logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
    logger = logging.getLogger(__name__)

    # Load the data and extract annotated data
    df = pd.read_csv('data/' + DATA_FILE)
    data = df[(df.password.notnull()) & (df.done == '1')]
    Passwords = namedtuple('Passwords', 'data target')
    pwds = Passwords(data=data.tip, target=data.password)

    # Split into training and test set
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        pwds.data, pwds.target, test_size=TEST_SET_PERCENT, random_state=0)

    parameters = {
        'before_cutoff': list(range(0, 6)),
        'after_cutoff': list(range(0, 6)),
        'before_exponential_factor': np.logspace(-1, 1, 10),
        'after_exponential_factor': np.logspace(-1, 1, 10),
        'after_factor': np.logspace(-2, 1, 20),
        'min_password_length': list(range(1, 10)),
        'margin_cutoff': list(range(0, 3)),
        'margin_factor': np.logspace(-2, 0, 10),
        'margin_exponential_factor': np.logspace(-1, 1, 10),
        'eol_factor': np.logspace(-2, 0, 10),
        'bol_factor': np.logspace(-2, 0, 10),
    }

    # Leave one core free, but never ask for zero workers: on a single-core
    # machine cpu_count() - 1 is 0, which joblib rejects.
    n_jobs = max(1, multiprocessing.cpu_count() - 1)

    # Use randomized search and 2-fold cross validation
    clf = RandomizedSearchCV(
        PasswordEstimator(),
        parameters,
        cv=2,
        scoring='accuracy',
        n_jobs=n_jobs,
        n_iter=BUDGET)
    clf = clf.fit(X_train, y_train)
    logger.info(clf.best_params_)

    test_score = clf.score(X_test, y_test)
    logger.info(test_score)
from tempfile import NamedTemporaryFile

# Load the rain amounts (scaled by 0.1) and the matching day-of-year values.
rain = .1 * np.load('rain.npy')
# Negative readings are replaced by a small positive amount (0.025).
rain[rain < 0] = .05/2
dates = np.load('doy.npy').astype(int)

# Features: day-of-year and the sign of that day's rain; the target is the
# sign of the following day's rain.
x = np.vstack((dates[:-1], np.sign(rain[:-1]))).T
y = np.sign(rain[1:])
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=37)

# Randomized search over a decision tree's depth, leaf size and criterion.
clf = tree.DecisionTreeClassifier(random_state=37)
params = {
    "max_depth": [2, None],
    "min_samples_leaf": sp_randint(1, 5),
    "criterion": ["gini", "entropy"],
}
rscv = RandomizedSearchCV(clf, params)
rscv.fit(x_train,y_train)

# Export the best tree as Graphviz dot text and render it to a PNG that is
# kept on disk (delete=False).
sio = io.StringIO()
tree.export_graphviz(
    rscv.best_estimator_,
    out_file=sio,
    feature_names=['day-of-year','yest'],
)
dec_tree = pydot.graph_from_dot_data(sio.getvalue())

with NamedTemporaryFile(prefix='rain', suffix='.png', delete=False) as f:
    dec_tree.write_png(f.name)

print("Written figure to", f.name)
print("Best Train Score", rscv.best_score_)
print("Test Score", rscv.score(x_test, y_test))
print("Best params", rscv.best_params_)
submission['Class_9'][pred == 9] = np.ones(len(pred == 9)) submission = submission.drop('label', axis=1) submission.to_csv('submission_svm.csv', index_label='id') # classifiers = [ SVC(kernel="linear", C=0.025), SVC(gamma=2, C=1), RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1), ] clf = SVC(gamma=2, C=1) clf.fit(X_train, y_train) score = clf.score(X_test, y_test.astype(str)) svm_pred = clf.predict(X_test) # need to convert the results to str # http://stackoverflow.com/questions/19820369/unable-to-solve-an-error-while-running-gridsearch confusion_matrix(y_test.astype(str), svm_pred.astype(str)) # very poor fit. it mostly predicts as class 2 # http://qiita.com/sotetsuk/items/16ffd76978085bfd7628 ## チューニングパラメータ tuned_parameters = [{ 'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000] }, { 'kernel': ['linear'], 'C': [1, 10, 100, 1000]
from tempfile import NamedTemporaryFile rain = .1 * np.load('rain.npy') rain[rain < 0] = .05/2 dates = np.load('doy.npy').astype(int) x = np.vstack((dates[:-1], np.sign(rain[:-1]))) x = x.T y = np.sign(rain[1:]) x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=37) clf = tree.DecisionTreeClassifier(random_state=37) params = {"max_depth": [2, None], "min_samples_leaf": sp_randint(1, 5), "criterion": ["gini", "entropy"]} rscv = RandomizedSearchCV(clf, params) rscv.fit(x_train,y_train) sio = StringIO.StringIO() tree.export_graphviz(rscv.best_estimator_, out_file=sio, feature_names=['day-of-year','yest']) dec_tree = pydot.graph_from_dot_data(sio.getvalue()) with NamedTemporaryFile(prefix='rain', suffix='.png', delete=False) as f: dec_tree.write_png(f.name) print "Written figure to", f.name print "Best Train Score", rscv.best_score_ print "Test Score", rscv.score(x_test, y_test) print "Best params", rscv.best_params_
def main():
    """Fit a LinearSVC -> KNeighborsClassifier pipeline and write predictions.

    Reads ``Data/train.csv`` and ``Data/test.csv``, encodes and standardises
    the features, runs a randomized hyper-parameter search on a train /
    validation split of the training file, prints the search report and the
    validation score, and writes the test-set predictions to
    ``pipelinearsvcknn.csv``.

    Fixes relative to the earlier version:
    * the test set now uses the same sex encoding as the training set
      (it was inverted, so the model saw flipped features at prediction);
    * the test set is standardised with the scaler fitted on the training
      data instead of a second scaler fitted on the test data;
    * the input and output files are closed deterministically.
    """
    # ---- training data -------------------------------------------------
    # Load the training csv file, skipping the header row and dropping the
    # first column of every row.
    with open('Data/train.csv', 'rb') as train_file:
        csv_file_object = csv.reader(train_file)
        next(csv_file_object)
        train_data = np.array([row[1:] for row in csv_file_object])

    # Convert the string categories to numeric codes.
    # Sex: male = -1, female = 1
    train_data[train_data[0::, 3] == 'male', 3] = -1
    train_data[train_data[0::, 3] == 'female', 3] = 1
    # Embarked: C = -1, S = 0, Q = 1 (column 10 is removed again below)
    train_data[train_data[0::, 10] == 'C', 10] = -1
    train_data[train_data[0::, 10] == 'S', 10] = 0
    train_data[train_data[0::, 10] == 'Q', 10] = 1
    # Survived
    # NOTE(review): these two lines test column 3 (sex, stored as strings)
    # against the integers 1 and 0, so they do not look like they can
    # recode column 0 as the heading suggests. Left unchanged because the
    # predictions are written out using the labels as they stand - confirm
    # the intent before touching them.
    train_data[train_data[0::, 3] == 1, 0] = 1
    train_data[train_data[0::, 3] == 0, 0] = -1

    imp = preprocessing.Imputer(missing_values=0, strategy='median', axis=0)

    # Remove the name, ticket, cabin and embarked columns.
    train_data = np.delete(train_data, [2, 7, 9, 10], 1)
    train_data[train_data == ''] = '0'
    # NOTE(review): the transformed array is discarded, so no imputation is
    # actually applied. Left as is on purpose: with missing_values=0 the
    # imputer would also replace legitimate zeros (including the label
    # column) - decide on a proper missing-value marker first.
    imp.fit_transform(train_data)

    # Split the data between train set and validation set; column 0 is the
    # label, the remaining columns are the features.
    x_train, x_test, y_train, y_test = train_test_split(
        train_data[0::, 1::], train_data[0::, 0],
        test_size=0.2, random_state=0)

    # Standardise the data with statistics from the training split only.
    scaler = preprocessing.StandardScaler().fit(x_train)
    x_train_std = scaler.transform(x_train)
    x_test_std = scaler.transform(x_test)

    # ---- test data -----------------------------------------------------
    # Load the test csv file; the first column is kept as the row id.
    ids = []
    test_data = []
    with open('Data/test.csv', 'rb') as test_file:
        test_file_object = csv.reader(test_file)
        next(test_file_object)
        for row in test_file_object:
            ids.append(row[0])
            test_data.append(row[1:])
    test_data = np.array(test_data)

    # Same encoding as the training data: male = -1, female = 1.
    test_data[test_data[0::, 2] == 'male', 2] = -1
    test_data[test_data[0::, 2] == 'female', 2] = 1
    # Embarked: C = -1, S = 0, Q = 1 (column 9 is removed again below).
    # Note this is not ideal: the codes impose an order on the categories.
    test_data[test_data[0::, 9] == 'C', 9] = -1
    test_data[test_data[0::, 9] == 'S', 9] = 0
    test_data[test_data[0::, 9] == 'Q', 9] = 1

    # Remove the name, ticket, cabin and embarked columns.
    test_data = np.delete(test_data, [1, 6, 8, 9], 1)
    test_data[test_data == ''] = '0'
    # NOTE(review): result discarded, same as for the training data above.
    imp.fit_transform(test_data)

    # Standardise with the scaler fitted on the training data so both sets
    # live in the same feature space.
    test_data_std = scaler.transform(test_data)

    # ---- model search --------------------------------------------------
    start = time()
    print('Training estimators')
    estimators = [('linearsvc', LinearSVC()),
                  ('KNeighborsClassifier', KNeighborsClassifier())]
    clf = Pipeline(estimators)

    # specify parameters and distributions to sample from
    param_dist = {
        "linearsvc__C": sp_randint(1, 1000),
        "linearsvc__loss": ["l1", "l2"],
        "linearsvc__dual": [True],
        "KNeighborsClassifier__n_neighbors": sp_randint(5, 100),
        "KNeighborsClassifier__weights": ["uniform", "distance"],
        "KNeighborsClassifier__algorithm": ["ball_tree", "kd_tree", "brute"],
        "KNeighborsClassifier__leaf_size": sp_randint(3, 100),
    }

    # run randomized search
    n_iter_search = 2000
    random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                       n_iter=n_iter_search, n_jobs=4,
                                       verbose=1)
    random_search.fit(x_train_std, y_train)

    print('Reporting')
    print("RandomizedSearchCV took %.2f seconds for %d candidates"
          " parameter settings." % ((time() - start), n_iter_search))
    report(random_search.grid_scores_)

    score = random_search.score(x_test_std, y_test)
    print('Test score')
    print(score)

    print('Predicting')
    output = random_search.predict(test_data_std)

    with open("pipelinearsvcknn.csv", "wb") as out_file:
        open_file_object = csv.writer(out_file)
        open_file_object.writerow(["PassengerId", "Survived"])
        open_file_object.writerows(zip(ids, output))
"svc__gamma": sp_randint(1, 10), "svc__coef0": sp_randint(1, 10), "svc__shrinking": [True, False] } #Start with data with age # run randomized search n_iter_search = 20 random_search = RandomizedSearchCV(clf, param_distributions=param_dist, n_iter=n_iter_search,n_jobs=4, verbose=1) random_search.fit(x_train_std,y_train) print 'Reporting' print("RandomizedSearchCV took %.2f seconds for %d candidates" " parameter settings." % ((time() - start), n_iter_search)) report(random_search.grid_scores_) score=random_search.score(x_test_std,y_test) print 'Test score' print score print 'Predicting' output = random_search.predict(test_data_std) #Finally with data without age # run randomized search <<<<<<< HEAD n_iter_search = 20 ======= n_iter_search = 2000 >>>>>>> 5b0499dbec7ef19b9617d4339731063de092e370 random_search = RandomizedSearchCV(clf, param_distributions=param_dist, n_iter=n_iter_search,n_jobs=4, verbose=1)
def main():
    """Fit a LinearSVC -> KNeighborsClassifier pipeline and write predictions.

    Reads ``Data/train.csv`` and ``Data/test.csv``, encodes and standardises
    the features, runs a randomized hyper-parameter search on a train /
    validation split of the training file, prints the search report and the
    validation score, and writes the test-set predictions to
    ``pipelinearsvcknn.csv``.

    Fixes relative to the earlier version:
    * the test set now uses the same sex encoding as the training set
      (it was inverted, so the model saw flipped features at prediction);
    * the test set is standardised with the scaler fitted on the training
      data instead of a second scaler fitted on the test data;
    * the input and output files are closed deterministically.
    """
    # ---- training data -------------------------------------------------
    # Load the training csv file, skipping the header row and dropping the
    # first column of every row.
    with open('Data/train.csv', 'rb') as train_file:
        csv_file_object = csv.reader(train_file)
        next(csv_file_object)
        train_data = np.array([row[1:] for row in csv_file_object])

    # Convert the string categories to numeric codes.
    # Sex: male = -1, female = 1
    train_data[train_data[0::, 3] == 'male', 3] = -1
    train_data[train_data[0::, 3] == 'female', 3] = 1
    # Embarked: C = -1, S = 0, Q = 1 (column 10 is removed again below)
    train_data[train_data[0::, 10] == 'C', 10] = -1
    train_data[train_data[0::, 10] == 'S', 10] = 0
    train_data[train_data[0::, 10] == 'Q', 10] = 1
    # Survived
    # NOTE(review): these two lines test column 3 (sex, stored as strings)
    # against the integers 1 and 0, so they do not look like they can
    # recode column 0 as the heading suggests. Left unchanged because the
    # predictions are written out using the labels as they stand - confirm
    # the intent before touching them.
    train_data[train_data[0::, 3] == 1, 0] = 1
    train_data[train_data[0::, 3] == 0, 0] = -1

    imp = preprocessing.Imputer(missing_values=0, strategy='median', axis=0)

    # Remove the name, ticket, cabin and embarked columns.
    train_data = np.delete(train_data, [2, 7, 9, 10], 1)
    train_data[train_data == ''] = '0'
    # NOTE(review): the transformed array is discarded, so no imputation is
    # actually applied. Left as is on purpose: with missing_values=0 the
    # imputer would also replace legitimate zeros (including the label
    # column) - decide on a proper missing-value marker first.
    imp.fit_transform(train_data)

    # Split the data between train set and validation set; column 0 is the
    # label, the remaining columns are the features.
    x_train, x_test, y_train, y_test = train_test_split(
        train_data[0::, 1::], train_data[0::, 0],
        test_size=0.2, random_state=0)

    # Standardise the data with statistics from the training split only.
    scaler = preprocessing.StandardScaler().fit(x_train)
    x_train_std = scaler.transform(x_train)
    x_test_std = scaler.transform(x_test)

    # ---- test data -----------------------------------------------------
    # Load the test csv file; the first column is kept as the row id.
    ids = []
    test_data = []
    with open('Data/test.csv', 'rb') as test_file:
        test_file_object = csv.reader(test_file)
        next(test_file_object)
        for row in test_file_object:
            ids.append(row[0])
            test_data.append(row[1:])
    test_data = np.array(test_data)

    # Same encoding as the training data: male = -1, female = 1.
    test_data[test_data[0::, 2] == 'male', 2] = -1
    test_data[test_data[0::, 2] == 'female', 2] = 1
    # Embarked: C = -1, S = 0, Q = 1 (column 9 is removed again below).
    # Note this is not ideal: the codes impose an order on the categories.
    test_data[test_data[0::, 9] == 'C', 9] = -1
    test_data[test_data[0::, 9] == 'S', 9] = 0
    test_data[test_data[0::, 9] == 'Q', 9] = 1

    # Remove the name, ticket, cabin and embarked columns.
    test_data = np.delete(test_data, [1, 6, 8, 9], 1)
    test_data[test_data == ''] = '0'
    # NOTE(review): result discarded, same as for the training data above.
    imp.fit_transform(test_data)

    # Standardise with the scaler fitted on the training data so both sets
    # live in the same feature space.
    test_data_std = scaler.transform(test_data)

    # ---- model search --------------------------------------------------
    start = time()
    print('Training estimators')
    estimators = [('linearsvc', LinearSVC()),
                  ('KNeighborsClassifier', KNeighborsClassifier())]
    clf = Pipeline(estimators)

    # specify parameters and distributions to sample from
    param_dist = {
        "linearsvc__C": sp_randint(1, 1000),
        "linearsvc__loss": ["l1", "l2"],
        "linearsvc__dual": [True],
        "KNeighborsClassifier__n_neighbors": sp_randint(5, 100),
        "KNeighborsClassifier__weights": ["uniform", "distance"],
        "KNeighborsClassifier__algorithm": ["ball_tree", "kd_tree", "brute"],
        "KNeighborsClassifier__leaf_size": sp_randint(3, 100),
    }

    # run randomized search
    n_iter_search = 2000
    random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                       n_iter=n_iter_search, n_jobs=4,
                                       verbose=1)
    random_search.fit(x_train_std, y_train)

    print('Reporting')
    print("RandomizedSearchCV took %.2f seconds for %d candidates"
          " parameter settings." % ((time() - start), n_iter_search))
    report(random_search.grid_scores_)

    score = random_search.score(x_test_std, y_test)
    print('Test score')
    print(score)

    print('Predicting')
    output = random_search.predict(test_data_std)

    with open("pipelinearsvcknn.csv", "wb") as out_file:
        open_file_object = csv.writer(out_file)
        open_file_object.writerow(["PassengerId", "Survived"])
        open_file_object.writerows(zip(ids, output))
'max_depth': sp_randint(4, 200), 'learning_rate': sp_uniform(loc=0e0,scale=1e0), 'objective':['multi:softprob'], 'nthread': [8], 'missing': [np.nan], 'reg_alpha': [0.01,0.017782794,0.031622777,0.056234133,\ 0.1,0.17782794,0.31622777,0.56234133,1.,1.77827941,\ 3.16227766,5.62341325,10.,\ 17.7827941,31.6227766,56.2341325,100.], 'colsample_bytree': sp_uniform(loc=0.2e0,scale=0.8e0), 'subsample': np.arange(0.6,1.0,step=0.05), 'n_estimators': sp_randint(100,700), } print "Randomized XGBoost" # In[ ]: for i in range(2): print "Loop %i/20" % i search_GB = RandomizedSearchCV(GB, param_grid, scoring='log_loss', n_jobs=-1, n_iter=n_iter, cv=cv, verbose=True) search_GB.fit(X_train, y_train) log_model = search_GB.score(X_val, y_val) print "Log loss = %s" % log_model X_test = get_test() save_submission('XGBoost', log_model, search_GB.predict_proba(X_test))
def model_pred(X,Y, hyperparams , maximize='accuracy' , model_type='logreg' , n_iter_search = 30, n_cv_sets = 10 , limits = [-3 , 1 , 0.5] ): X_train, X_test , Y_train , Y_test= train_test_split(X,Y, test_size = 0.3) param_dist = hyperparams if model_type=='linreg' : model = linear_model.ElasticNet() elif model_type=='lasso' : model = linear_model.Lasso() elif model_type=='randomforest' : model = RandomForestRegressor() elif model_type=='GBT' : model = GradientBoostingRegressor() elif model_type == 'NN': model = MLPRegressor() #how to decide the score random_search = RandomizedSearchCV(model, param_distributions=param_dist,n_iter=n_iter_search, cv = n_cv_sets, )#scoring= #scorer(estimator, X, y) start = time() random_search.fit(X_train, Y_train) print("RandomizedSearchCV took %.2f seconds for %d candidates" " parameter settings." % ((time() - start), n_iter_search)) report(random_search.grid_scores_) #random_search.fit(X_train, Y_train) Y_test_pred= random_search.predict(X_test) print 'score test set' , random_search.score(X_test, Y_test) print '===params best model' ,random_search.best_params_ print ' diff pred and ground ' , np.mean(abs(Y_test_pred -Y_test )) model =random_search.best_estimator_ Y_sub_pred=cross_validation.cross_val_predict(model, X, y=Y, cv=n_cv_sets, n_jobs=2) #Y_sub_pred=(model.predict(X)) best_score=random_search.best_score_ print ' score on train set' , best_score if model_type=='linreg' : print zip(X_train.columns , model.coef_) elif model_type=='lasso' : model = linear_model.Lasso() elif model_type=='randomforest' : print zip(X_train.columns , model.feature_importances_) elif model_type=='GBT' : print zip(X_train.columns , model.feature_importances_) elif model_type == 'NN': model = MLPRegressor() return Y_sub_pred , best_score
# In[ ]:


# Best cross-validated score found by the search.
print(RR_model.best_score_)


# In[ ]:


# Parameter setting that achieved the best score.
print(RR_model.best_params_)


# In[ ]:


y_prob = RR_model.predict_proba(X_test)[:,1] # This will give you positive class prediction probabilities
y_pred = np.where(y_prob > 0.5, 1, 0) # This will threshold the probabilities to give class predictions.
# Score against the true labels. Scoring against y_pred only compared the
# model with its own predictions and said nothing about its quality.
RR_model.score(X_test, y_test)


# In[ ]:


confusion_matrix=metrics.confusion_matrix(y_test,y_pred)
confusion_matrix


# In[ ]:


# NOTE: despite its name, this variable holds the text classification
# report, not an ROC AUC value.
auc_roc=metrics.classification_report(y_test,y_pred)
auc_roc


# In[ ]:
"max_features": sp_randint(1, 11), "min_samples_split": sp_randint(1, 11), "min_samples_leaf": sp_randint(1, 11), "bootstrap": [True, False], "criterion": ["gini", "entropy"], "n_estimators": sp_randint(100, 600) } # In[4]: search_GB = RandomizedSearchCV(model, param_grid, scoring='log_loss', n_jobs=-1, n_iter=n_iter, cv=cv, verbose=True) search_GB.fit(X_train, y_train.flatten()) # In[5]: log_model = search_GB.score(X_val, y_val.flatten()) print "Log loss = %s" % log_model X_test = get_test() y_pred = search_GB.predict_proba(X_test) save_submission(model_name, log_model, y_pred) # In[7]: model_name