def buildRandomForest(self, X_train, X_test, y_train, cv=3, n_iter=5, save=False):
    """Tune a random forest with a randomized search and predict with it.

    Args:
        X_train: Training features passed to the search's ``fit``.
        X_test: Features predicted after the search has been fitted.
        y_train: Training labels; candidates are scored with ``'f1'``.
        cv: Cross-validation setting forwarded to ``RandomizedSearchCV``.
        n_iter: Number of parameter settings sampled by the search.
        save: When truthy, dump the fitted search object to
            ``rf_optimized.pkl`` (relative to the working directory).

    Returns:
        Tuple ``(predicted_y_train, predicted_y_test)``.
    """
    rf = RandomForestClassifier(random_state=9)

    # Candidate values the randomized search samples from.
    param_distributions = {
        'n_estimators': range(1, 50, 1),
        'max_depth': range(1, 70, 1),
        'max_features': range(6, 15, 1),
        'min_samples_split': [2, 3, 4],
        'min_samples_leaf': [1, 2, 3, 4],
        'n_jobs': [-1],
    }

    rf_optimized = RandomizedSearchCV(
        estimator=rf,
        param_distributions=param_distributions,
        n_iter=n_iter,
        scoring='f1',
        cv=cv,
        random_state=1,
    )
    rf_optimized.fit(X_train, y_train)

    if save:
        joblib.dump(value=rf_optimized, filename="rf_optimized.pkl", compress=1)

    # Single-argument print calls behave the same on Python 2 and Python 3.
    print("Best parameter: %s" % (rf_optimized.best_params_,))
    print("Best average cross validated F1 score: %0.4f" % rf_optimized.best_score_)
    print("--------------------------------------------")

    # Predictions come from the search object fitted above.
    predicted_y_train = rf_optimized.predict(X_train)
    predicted_y_test = rf_optimized.predict(X_test)
    return predicted_y_train, predicted_y_test
def build_sample(regressor, name):
    """Tune ``regressor`` on repeated 90/10 splits and write test predictions.

    The module-level ``sample_X`` / ``sample_y`` data are split ten times
    (seeded with ``i * SEED``); on every split the randomized search is
    refitted and the MAPE on the held-out part is printed.  The search fitted
    on the last split then predicts ``sample_t`` and the predictions are
    stored through ``store_csv`` as ``<name>.csv``.

    Args:
        regressor: Estimator handed to ``RandomizedSearchCV``.
        name: Label used in the log lines and as the CSV file stem.

    Returns:
        The predictions for ``sample_t``.
    """
    # Parameters and distributions to sample from.
    param_dist = {
        "max_depth": [3, None],
        "max_features": sp_randint(1, 11),
        "min_samples_split": sp_randint(1, 11),
        "min_samples_leaf": sp_randint(1, 11),
    }

    n_iter_search = 20
    random_search = RandomizedSearchCV(regressor,
                                       param_distributions=param_dist,
                                       n_iter=n_iter_search)

    start = time()

    # Repeat the hold-out procedure to get a more stable error estimate.
    n = 10
    for i in range(n):
        # Randomly hold out 10% of the data as validation set.
        X_train, X_cv, y_train, y_cv = cross_validation.train_test_split(
            sample_X, sample_y, test_size=.10, random_state=i * SEED)

        random_search.fit(X_train, y_train)

        # Predict the held-out part (not the training part).
        preds = random_search.predict(X_cv)

        mape_r = mape(y_cv, preds)
        print("MAPE of (fold %d/%d) of %s is : %f" % (i + 1, n, name, mape_r))

    print("RandomizedSearchCV took %.2f seconds for %d candidates"
          " parameter settings." % ((time() - start), n_iter_search))
    report(random_search.grid_scores_)

    # Predict the test sample with the search fitted on the last split.
    predict_res = random_search.predict(sample_t)
    preds_on_test = DataFrame(list(zip(sample_id, predict_res)),
                              columns=["ID", "CODIS"])
    # astype() returns a new Series; assign it back so the cast takes effect.
    preds_on_test['ID'] = preds_on_test['ID'].astype(int)

    store_csv(preds_on_test, name + ".csv")
    return predict_res
class CVSearcher(SearcherBase):
    '''Hyper-parameter searcher based on plain cross validation.

    The cross validation used here is not specific to time series.
    '''

    def __init__(self, sklearn_model_class, params, scoring=None, method=None,
                 n_randomized_search=200, cv=5):
        super(CVSearcher, self).__init__(
            sklearn_model_class,
            params,
            method=method,
            n_randomized_search=n_randomized_search,
            cv=cv,
            scoring=scoring,
        )

    def fit(self, X, Y):
        '''Run the configured search on (X, Y) and return ``self``.

        ``method == 'Grid'`` selects ``GridSearchCV``; ``'Randomized'`` or
        ``None`` selects ``RandomizedSearchCV``.  Any other value raises
        ``ValueError``.
        '''
        wants_grid = self.method == 'Grid'
        wants_random = self.method == 'Randomized' or self.method is None
        if not (wants_grid or wants_random):
            raise ValueError('CVSearcher only support GridSearch and RandomizedSearch')

        # Settings shared by both search flavours.
        shared = {
            'estimator': self.ml_class(),
            'scoring': self.scoring,
            'cv': self.cv,
            'refit': True,
        }
        if wants_grid:
            searcher = GridSearchCV(param_grid=self.search_space, **shared)
        else:
            searcher = RandomizedSearchCV(
                param_distributions=self.search_space,
                n_iter=self.n_randomized_search,
                **shared
            )

        self.__searcher = searcher
        searcher.fit(X, Y)
        print("Best: %s" % (searcher.best_estimator_))
        return self

    def predict(self, X):
        '''Predict ``X`` with the fitted search object.'''
        fitted = self.__searcher
        return fitted.predict(X)

    def get_scores(self):
        '''Return the ``grid_scores_`` of the fitted search object.'''
        fitted = self.__searcher
        return fitted.grid_scores_
def Decision_tree(Xtrain, Ytrain, Xtest):
    """Tune a decision-tree regressor with a randomized search and predict.

    The randomized search uses cross validation to pick the best parameters
    for the estimator.  In contrast to GridSearchCV, not all parameter values
    are tried out; a fixed number of settings is sampled from the specified
    candidates.  That number is ``n_iter``, read here from the module-level
    ``args[1]``.

    The fitted search object is pickled to ``finalized_DC.sav``.

    Args:
        Xtrain: Training features.
        Ytrain: Training targets.
        Xtest: Features to predict.

    Returns:
        Predictions of the refitted best model for ``Xtest``.
    """
    tuned_parameters = {
        'splitter': ['best', 'random'],
        "max_features": ["log2", "sqrt"],
        'min_samples_split': np.arange(30, 60, 5),
        'min_samples_leaf': np.arange(7, 14),
        'max_depth': np.arange(700, 1389, 10),
    }

    Multreg = RandomizedSearchCV(DecisionTreeRegressor(random_state=0),
                                 param_distributions=tuned_parameters,
                                 cv=10,
                                 n_iter=int(args[1]),
                                 n_jobs=-1,
                                 random_state=0)

    # Fit the search, then predict the unseen test set.
    Multreg.fit(Xtrain, Ytrain)
    YMultreg = Multreg.predict(Xtest)

    # Save the model to disk; the context manager closes the handle even if
    # pickling fails.
    filename = 'finalized_DC.sav'
    with open(filename, 'wb') as model_file:
        pickle.dump(Multreg, model_file)

    return YMultreg
def parametr_tuning_random(model, params, scores, X_train, Y_train, X_test, Y_test, n_iter_search=10):
    """Run one randomized search per scoring metric and log the results.

    For every entry of ``scores`` a 5-fold ``RandomizedSearchCV`` over
    ``params`` is fitted on the training data.  The best parameters, the
    score on the test set (times 100), the per-candidate scores and a
    classification report for the test set are written through ``log``.
    """
    def log_blank():
        log("", False)

    for metric in scores:
        log("# Tuning hyper-parameters for %s: " % metric)
        log_blank()

        search = RandomizedSearchCV(model, params, n_iter=n_iter_search,
                                    cv=5, scoring=metric)
        search.fit(X_train, Y_train)

        log("Best parameters set found on development set:")
        log(str(search.best_params_), False)

        log("random search score _ TEST set:")
        test_score_pct = search.score(X_test, Y_test) * 100
        log(str(test_score_pct), False)
        log_blank()

        log("random search scores on development set:")
        log(str(search.grid_scores_), False)
        log_blank()

        log("Detailed classification report:")
        log_blank()
        predicted = search.predict(X_test)
        log(classification_report(Y_test, predicted), False)
        log_blank()
def main():
    """Tune a feature-selection + XGBoost pipeline and report test accuracy.

    Reads the CSV named by ``args.dataset``, holds out 33% of the rows, and
    searches over the pipeline parameters built from the ``args`` ranges.
    A randomized search is tried first; if it fails, an exhaustive grid
    search over the same candidates is used instead.  Accuracy, a
    classification report and the best parameters are printed.
    """
    data = pd.read_csv(args.dataset)
    X = data.drop(['Id', 'Class'], axis=1)
    Y = data.loc[:, 'Class']
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=0.33, random_state=42)

    estimator = [('reduce_dim', SelectFromModel(RandomForestClassifier())),
                 ('classifier', XGBClassifier())]
    clf = Pipeline(estimator)

    # Express each threshold as a multiple of the median importance.  A list
    # (not a lazy ``map`` object) is needed so the candidates can be indexed
    # and reused on Python 3 as well.
    threshold = [
        str(value) + '*median'
        for value in np.arange(args.threshold[0], args.threshold[1], args.threshold[2])
    ]

    params = {}
    params['reduce_dim__estimator__n_estimators'] = list(
        np.arange(args.components[0], args.components[1], args.components[2]))
    params['reduce_dim__threshold'] = threshold
    params['classifier__n_estimators'] = list(
        np.arange(args.num_tree[0], args.num_tree[1], args.num_tree[2]))
    params['classifier__max_depth'] = list(
        np.arange(args.depths[0], args.depths[1], args.depths[2]))
    params['classifier__learning_rate'] = list(
        np.arange(args.lr[0], args.lr[1], args.lr[2]))
    params['classifier__subsample'] = list(
        np.arange(args.subsample[0], args.subsample[1], args.subsample[2]))
    params['classifier__colsample_bytree'] = list(
        np.arange(args.colsample[0], args.colsample[1], args.colsample[2]))

    # Cross validation for the parameter search.
    try:
        grid_search = RandomizedSearchCV(clf, param_distributions=params,
                                         n_iter=args.iter, cv=5, n_jobs=-1)
        grid_search.fit(X_train, y_train)
    except Exception:
        # Fall back to the exhaustive search.  ``Exception`` (rather than a
        # bare ``except``) lets Ctrl-C and SystemExit propagate.
        grid_search = GridSearchCV(clf, param_grid=params, cv=5, n_jobs=-1)
        grid_search.fit(X_train, y_train)

    # Each grid score entry is (parameters, mean score, per-fold scores).
    best_parameters, score, _ = max(grid_search.grid_scores_, key=lambda x: x[1])

    # Predict once and reuse the result for both reports.
    y_pred = grid_search.predict(X_test)
    result = accuracy_score(y_test, y_pred)
    print("Predict Accuracy: " + str(result))
    print("XGboost using raw pixel features:\n%s\n"
          % (metrics.classification_report(y_test, y_pred)))
    print(best_parameters)
def K_NN(Xtrain, Ytrain, Xtest):
    """Tune a k-nearest-neighbours regressor with a randomized search.

    The number of sampled settings is read from the module-level ``args[1]``.
    The fitted search object is pickled to ``finalized_KNN.sav``.

    Args:
        Xtrain: Training features.
        Ytrain: Training targets.
        Xtest: Features to predict.

    Returns:
        Predictions of the refitted best model for ``Xtest``.
    """
    KNNoptparam = {
        "n_neighbors": np.arange(20, 200, 10),
        "weights": ['uniform', 'distance'],
        "algorithm": ['auto', 'ball_tree', 'kd_tree', 'brute'],
        # "leaf_size": np.arange(30, 150, 15),  (disabled in the original)
        "p": [2, 3],
    }

    # Randomized search parameter optimization.
    RF1 = RandomizedSearchCV(KNeighborsRegressor(),
                             param_distributions=KNNoptparam,
                             cv=10,
                             n_iter=int(args[1]),
                             n_jobs=-1,
                             random_state=0)
    RF1.fit(Xtrain, Ytrain)

    # Predict using unseen data.
    KNN_predict = RF1.predict(Xtest)

    # Save the model to disk; the context manager guarantees the handle is
    # closed (the original left it open).
    filename = 'finalized_KNN.sav'
    with open(filename, 'wb') as model_file:
        pickle.dump(RF1, model_file)

    return KNN_predict
def _print_search_summary(clf, Xn, yn):
    """Print best estimator, best CV score and the confusion matrix on (Xn, yn)."""
    print("Best parameters set found on development set:")
    print("")
    print(clf.best_estimator_)
    print(clf.best_score_)
    print("")
    print(confusion_matrix(yn, clf.predict(Xn)))


def parameter_tuning(Xn, yn, scale=1):
    """Tune SVC hyper-parameters for the RBF and/or linear kernel.

    Which searches run is controlled by the module-level flags ``RBF``
    (randomized search, 30 candidates) and ``LINEAR`` (exhaustive grid).
    Both use a shuffled stratified 3-fold split and accuracy scoring, and
    print a summary that includes the confusion matrix on the same data the
    search was fitted on.

    Args:
        Xn: Feature matrix.
        yn: Class labels.
        scale: ``1`` standardises with mean centring, ``2`` standardises
            without centring, any other value leaves ``Xn`` untouched.
    """
    print(Xn.shape)
    print(yn.shape)

    # Feature scaling.
    if scale == 1:
        Xn = preprocessing.scale(Xn, with_mean=True)
        print('NORMALIZING')
    elif scale == 2:
        Xn = preprocessing.scale(Xn, with_mean=False)
        print('NORMALIZING')

    rbf_space = {'kernel': ['rbf'],
                 'C': np.logspace(-2, 7, 10),
                 'gamma': np.logspace(-4, 2, 7)}
    linear_grid = [{'kernel': ['linear'], 'C': np.logspace(-3, 4, 8)}]

    cv = cross_validation.StratifiedKFold(yn, shuffle=True, n_folds=3,
                                          random_state=42)

    if RBF:
        clf = RandomizedSearchCV(estimator=SVC(C=1, cache_size=1000),
                                 param_distributions=rbf_space,
                                 cv=cv, scoring='accuracy', n_iter=30,
                                 verbose=1, n_jobs=2).fit(Xn, yn)
        _print_search_summary(clf, Xn, yn)

    if LINEAR:
        clf = GridSearchCV(estimator=SVC(C=1, cache_size=1000),
                           param_grid=linear_grid,
                           cv=cv, scoring='accuracy',
                           verbose=1, n_jobs=2).fit(Xn, yn)
        _print_search_summary(clf, Xn, yn)
def svm_tuning(features_train, labels_train, features_test, labels_test,
               kernel="rbf", C=None, gamma=None, randomized=False, i=50):
    """Search for the best ``C`` (and ``gamma``), then train a final model.

    A 75/25 split of the training data is used: the search runs 5-fold CV on
    the 75% part and the 25% part is used to print ``measures.avgF1``.  The
    final model is then trained on all of ``features_train`` with the best
    parameters and evaluated on ``features_test``.

    Args:
        features_train, labels_train: Training data.
        features_test, labels_test: Data used for the final evaluation.
        kernel: ``"linear"``, ``"rbf"`` or ``"logistic"``.
        C: Candidate values for ``C``; ``None`` or an empty list selects the
            default ``1..49``.
        gamma: Candidate values for ``gamma`` (rbf only); ``None`` or an
            empty list selects the default ``1..49``.
        randomized: Use ``RandomizedSearchCV`` instead of ``GridSearchCV``.
        i: Number of sampled settings for the randomized search.

    Returns:
        ``[bestC, bestGamma]`` for the rbf kernel, otherwise ``bestC``.

    Raises:
        ValueError: If ``kernel`` is not one of the supported names.
    """
    # Validate up front; previously an unknown kernel only surfaced later as
    # an unbound local variable.
    if kernel not in ("linear", "rbf", "logistic"):
        raise ValueError(
            "unsupported kernel %r; expected 'linear', 'rbf' or 'logistic'" % (kernel,))

    # Only an empty *list* (or None) triggers the defaults, so arrays and
    # distribution objects passed by callers are left untouched.
    if C is None or (isinstance(C, list) and not C):
        C = list(range(1, 50))
    if gamma is None or (isinstance(gamma, list) and not gamma):
        gamma = list(range(1, 50))

    # Split the dataset.
    X_train, X_test, y_train, y_test = train_test_split(
        features_train, labels_train, test_size=0.25, random_state=0)

    # Factories, so that only the selected estimator is instantiated.
    estimator_factories = {
        "linear": lambda: svm.LinearSVC(class_weight="balanced"),
        "rbf": lambda: svm.SVC(kernel="rbf", cache_size=1000,
                               class_weight="balanced"),
        "logistic": lambda: linear_model.LogisticRegression(),
    }
    estimator = estimator_factories[kernel]()

    tuned_parameters = {"C": C}
    if kernel == "rbf":
        tuned_parameters["gamma"] = gamma

    if randomized:
        clf = RandomizedSearchCV(estimator,
                                 param_distributions=tuned_parameters,
                                 cv=5, scoring="accuracy", n_iter=i)
    else:
        clf = GridSearchCV(estimator, [tuned_parameters],
                           cv=5, scoring="accuracy")

    clf.fit(X_train, y_train)

    for params, mean_score, scores in clf.grid_scores_:
        print("%0.3f (+/-%0.03f) for %r" % (mean_score, scores.std() * 2, params))

    print("Best parameters set found on development set:")
    print(clf.best_params_)

    y_true, y_pred = y_test, clf.predict(X_test)
    print(measures.avgF1(np.array(y_true), y_pred, 0, 1))

    print("FINAL C:")
    bestC = clf.best_params_["C"]

    # Retrain on the full training data with the selected parameters.
    if kernel == "linear":
        model = SVM.train(features_train, labels_train, c=bestC, k="linear")
    elif kernel == "rbf":
        bestGamma = clf.best_params_["gamma"]
        model = SVM.train(features_train, labels_train, c=bestC,
                          g=bestGamma, k="rbf")
    else:  # "logistic"
        model = LogisticRegression.train(features_train, labels_train, c=bestC)

    prediction = SVM.predict(features_test, model)
    print(measures.avgF1(labels_test, prediction, 0, 1))
    print(" ")

    if kernel == "rbf":
        return [bestC, bestGamma]
    return bestC
def buildRandomForest(self, X_train, X_test, y_train, cv=3, n_iter=5, save=False):
    """Tune a random forest with a randomized search and predict with it.

    Args:
        X_train: Training features passed to the search's ``fit``.
        X_test: Features predicted after the search has been fitted.
        y_train: Training labels; candidates are scored with ``'f1'``.
        cv: Cross-validation setting forwarded to ``RandomizedSearchCV``.
        n_iter: Number of parameter settings sampled by the search.
        save: When truthy, dump the fitted search object to
            ``rf_optimized.pkl`` (relative to the working directory).

    Returns:
        Tuple ``(predicted_y_train, predicted_y_test)``.
    """
    rf = RandomForestClassifier(random_state=9)

    # Candidate values the randomized search samples from.
    param_distributions = {
        'n_estimators': range(1, 50, 1),
        'max_depth': range(1, 70, 1),
        'max_features': range(6, 15, 1),
        'min_samples_split': [2, 3, 4],
        'min_samples_leaf': [1, 2, 3, 4],
        'n_jobs': [-1],
    }

    rf_optimized = RandomizedSearchCV(
        estimator=rf,
        param_distributions=param_distributions,
        n_iter=n_iter,
        scoring='f1',
        cv=cv,
        random_state=1,
    )
    rf_optimized.fit(X_train, y_train)

    if save:
        joblib.dump(value=rf_optimized, filename="rf_optimized.pkl", compress=1)

    # Single-argument print calls behave the same on Python 2 and Python 3.
    print("Best parameter: %s" % (rf_optimized.best_params_,))
    print("Best average cross validated F1 score: %0.4f" % rf_optimized.best_score_)
    print("--------------------------------------------")

    # Predictions come from the search object fitted above.
    predicted_y_train = rf_optimized.predict(X_train)
    predicted_y_test = rf_optimized.predict(X_test)
    return predicted_y_train, predicted_y_test
def best_RandomForest(self, df=pd.DataFrame(), flag_interactions=False,
                      flag_clean_features=False, impute_func=None,
                      fill_test_func=None):
    """Randomized search for random-forest parameters; report test accuracy.

    Args:
        df: Training frame.  When empty (the default) or ``None``,
            ``self.df`` is used.
        flag_interactions: Forwarded to ``ready_for_model_train`` and
            ``ready_for_model_test``.
        flag_clean_features: Forwarded to ``ready_for_model_train``.
        impute_func: When truthy, ``self.impute_data`` is applied to the
            training frame and ``self.df_X_realtest`` first.
        fill_test_func: Forwarded to ``self.impute_data``.

    Side effects:
        Sets ``self.X`` and ``self.y``; may replace ``self.df_X_realtest``.

    Returns:
        The best parameter dict found by the search.
    """
    # Only fall back to the stored frame when the caller supplied none; the
    # argument used to be overwritten unconditionally.
    if df is None or df.empty:
        df = self.df

    if impute_func:
        print('imputing data...')
        df, self.df_X_realtest = self.impute_data(
            df, self.df_X_realtest, impute_func, fill_test_func)

    print('get X, y from training set')
    (self.X, self.y) = self.ready_for_model_train(
        df,
        flag_interactions=flag_interactions,
        flag_clean_features=flag_clean_features)

    clf = RandomForestClassifier(bootstrap=False)
    grid = {'n_estimators': sp_randint(170, 350),
            'min_samples_leaf': sp_randint(1, 12),
            'max_features': sp_randint(2, 50),
            'max_depth': sp_randint(5, 30),
            'criterion': ['entropy', 'gini']}
    clf_rfc = RandomizedSearchCV(clf, n_jobs=4, n_iter=25, cv=6,
                                 param_distributions=grid,
                                 scoring='accuracy')

    print('Finding the best parameters...')
    clf_rfc.fit(self.X, self.y.ravel())

    print('preparing X, y from test set...')
    X_test, y_test = self.ready_for_model_test(
        self.df_X_realtest, flag_interactions)
    y_hat = clf_rfc.predict(X_test)

    print('Best Params: \n')
    for k, v in clf_rfc.best_params_.items():
        print(k, v)
    print("Accuracy with Random Forest = %4.4f" % accuracy_score(y_test, y_hat))
    return clf_rfc.best_params_
def best_XGboost(self, df=pd.DataFrame(), flag_interactions=False,
                 flag_clean_features=False, impute_func=None,
                 fill_test_func=None):
    """Randomized search for XGBoost parameters; report test accuracy.

    Args:
        df: Training frame.  When empty (the default) or ``None``,
            ``self.df`` is used.
        flag_interactions: Forwarded to ``ready_for_model_train`` and
            ``ready_for_model_test``.
        flag_clean_features: Forwarded to ``ready_for_model_train``.
        impute_func: When truthy, ``self.impute_data`` is applied to the
            training frame and ``self.df_X_realtest`` first.
        fill_test_func: Forwarded to ``self.impute_data``.

    Side effects:
        Sets ``self.X`` and ``self.y``; may replace ``self.df_X_realtest``.

    Returns:
        The best parameter dict found by the search.
    """
    # Only fall back to the stored frame when the caller supplied none; the
    # argument used to be overwritten unconditionally.
    if df is None or df.empty:
        df = self.df

    if impute_func:
        print('imputing data...')
        df, self.df_X_realtest = self.impute_data(
            df, self.df_X_realtest, impute_func, fill_test_func)

    print('get X, y from training set')
    (self.X, self.y) = self.ready_for_model_train(
        df,
        flag_interactions=flag_interactions,
        flag_clean_features=flag_clean_features)

    clf = XGBClassifier()
    grid = {'n_estimators': sp_randint(100, 600),
            'learning_rate': [0.01, 0.03, 0.05, 0.1, 0.3, 0.5],
            'max_depth': sp_randint(5, 30),
            'min_child_weight': sp_randint(1, 5)}
    clf_xgb = RandomizedSearchCV(clf, n_jobs=3, n_iter=15, cv=4,
                                 param_distributions=grid,
                                 scoring='accuracy')

    print('Finding the best parameters...')
    clf_xgb.fit(self.X, self.y.ravel())

    print('preparing X, y from test set...')
    X_test, y_test = self.ready_for_model_test(
        self.df_X_realtest, flag_interactions)
    y_hat = clf_xgb.predict(X_test)

    print('Best Params: \n')
    for k, v in clf_xgb.best_params_.items():
        print(k, v)
    # The message used to say "Random Forest" (copy-paste from the sibling).
    print("Accuracy with XGBoost = %4.4f" % accuracy_score(y_test, y_hat))
    return clf_xgb.best_params_
def run_grid_search(m, parameters, params, name, Xtrain, Ytrain, Xtest, Ytest):
    """Fit a randomized search for ``m``, print its score and write a submission.

    The search uses 3-fold CV with default sampling settings.  The test-set
    predictions are scored with ``mse`` and written, together with the
    module-level ``ids``, to ``./submissions/<name>.csv``.  ``params`` is
    accepted but not used here.
    """
    banner = '=' * 80
    print(banner)
    print("Training %s Model" % name)
    print(banner)

    started = time()
    search = RandomizedSearchCV(m, parameters, cv=3, n_jobs=4, verbose=3,
                                error_score=0)
    search.fit(Xtrain, Ytrain)
    predictions = search.predict(Xtest)

    elapsed = float(time() - started)
    print("\tDone in %1.2f seconds" % elapsed)
    print("\tScore: %1.2f\n" % mse(predictions, Ytest))
    print("Best Parameters" + str(search.best_params_))

    print("Writing Solution")
    submission_path = './submissions/' + name + '.csv'
    frame = pd.DataFrame(data={'id': ids, 'quality': predictions})
    frame.to_csv(submission_path, index=False)
def optimize_svr(X_total_train, Y_train, X_total_test, Y_test, n_iter_search):
    """Randomized search over polynomial-kernel SVR settings.

    Args:
        X_total_train: Training features.
        Y_train: Training targets.
        X_total_test: Features used for the final evaluation.
        Y_test: Targets used for the final evaluation.
        n_iter_search: Number of parameter settings sampled by the search.

    Returns:
        Tuple ``(mse, random_search)``: the mean squared error of the
        refitted best model on the test data and the fitted search object.
    """
    svr = SVR()

    # One candidate per decade from 1e-1 down to 1e-10.  The original list
    # repeated 1e-8 and skipped 1e-9, which looks like a typo; the number of
    # candidates is unchanged.
    params = {
        'C': [1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7, 1e-8, 1e-9, 1e-10],
        'degree': [1, 2, 3],
        'kernel': ['poly'],
    }

    random_search = RandomizedSearchCV(svr, param_distributions=params,
                                       n_iter=n_iter_search)
    random_search.fit(X_total_train, Y_train)

    result = random_search.predict(X_total_test)
    # MSE is symmetric in its two arguments, so the (prediction, truth)
    # order used here does not affect the value.
    mse = metrics.mean_squared_error(result, Y_test)
    return mse, random_search
def Random_forest(Xtrain, Ytrain, Xtest):
    """Tune a random-forest regressor with a randomized search and predict.

    The number of sampled settings is read from the module-level ``args[1]``.
    The fitted search object is pickled to ``finalized_RF.sav``.

    Args:
        Xtrain: Training features.
        Ytrain: Training targets.
        Xtest: Features to predict.

    Returns:
        Predictions of the refitted best model for ``Xtest``.
    """
    grid = {
        "n_estimators": np.arange(100, 1200, 50),
        "max_features": ["log2", "sqrt", "auto"],
        "max_depth": np.arange(20, 200, 10),
        "min_samples_leaf": np.arange(3, 50, 5),
    }

    # Randomized search parameter optimization.
    RF = RandomizedSearchCV(RandomForestRegressor(random_state=0, oob_score=0),
                            param_distributions=grid,
                            cv=15,
                            n_iter=int(args[1]),
                            n_jobs=-1,
                            random_state=0)
    RF.fit(Xtrain, Ytrain)

    # Predict using unseen data.
    RF_predict = RF.predict(Xtest)

    # Save the model to disk; the context manager guarantees the handle is
    # closed (the original left it open).
    filename = 'finalized_RF.sav'
    with open(filename, 'wb') as model_file:
        pickle.dump(RF, model_file)

    return RF_predict
def best_RandomForest(self, df=pd.DataFrame()):
    """Search random-forest parameters on a train/test split of ``df``.

    An empty ``df`` (the default) is replaced by ``self.df``.  The frame is
    split with ``self.split_df`` (stored on ``self.df_train`` and
    ``self.df_test``), a 15-candidate randomized search scored by accuracy is
    fitted on the training part, and the accuracy on the test part is
    printed.

    Returns:
        The best parameter dict found by the search.
    """
    if df.empty:
        df = self.df

    self.df_train, self.df_test = self.split_df(df)
    X_train, y_train = self.ready_for_model_train(self.df_train)
    X_test, y_test = self.ready_for_model_test(self.df_test)

    # 'min_samples_leaf' is intentionally not part of the search space.
    search_space = dict(
        n_estimators=sp_randint(250, 400),
        max_features=sp_randint(5, 50),
        max_depth=sp_randint(5, 30),
    )
    forest = RandomForestClassifier(bootstrap=False)
    search = RandomizedSearchCV(
        forest,
        n_jobs=4,
        n_iter=15,
        param_distributions=search_space,
        scoring='accuracy',
    )

    print("Finding the best parameters..")
    search.fit(X_train, y_train.ravel())

    print("Getting predicts for..")
    predicted = search.predict(X_test)

    print('Best Params: \n')
    for param_name, param_value in search.best_params_.items():
        print(param_name, param_value)

    test_accuracy = accuracy_score(y_test.ravel(), predicted)
    print("Accuracy with Random Forest = %4.4f" % test_accuracy)
    return search.best_params_
def EXT_tree(Xtrain, Ytrain, Xtest):
    """Tune an extra-trees regressor with a randomized search and predict.

    The number of sampled settings is read from the module-level ``args[1]``.
    The fitted search object is pickled to ``finalized_EXT.sav``.

    Args:
        Xtrain: Training features.
        Ytrain: Training targets.
        Xtest: Features to predict.

    Returns:
        Predictions of the refitted best model for ``Xtest``.
    """
    grid2 = {
        "n_estimators": np.arange(100, 1200, 50),
        "max_features": ["log2", "sqrt", "auto"],
        "max_depth": np.arange(20, 200, 10),
        "min_samples_leaf": np.arange(3, 50, 5),
    }

    RF3 = RandomizedSearchCV(ExtraTreesRegressor(random_state=0, oob_score=0),
                             param_distributions=grid2,
                             cv=15,
                             n_iter=int(args[1]),
                             n_jobs=-1,
                             random_state=0)

    # Model fitting.
    RF3.fit(Xtrain, Ytrain)

    # Predict using unseen data.
    EXT_predict = RF3.predict(Xtest)

    # Save the model to disk; the context manager guarantees the handle is
    # closed (the original left it open).
    filename = 'finalized_EXT.sav'
    with open(filename, 'wb') as model_file:
        pickle.dump(RF3, model_file)

    return EXT_predict
def best_RandomForest(self,df=pd.DataFrame()):
    """Run a randomized random-forest search and return the best parameters.

    Uses ``self.df`` when ``df`` is empty, splits it via ``self.split_df``
    (keeping both parts on ``self``), fits the search on the training part
    and prints the accuracy measured on the test part.
    """
    frame = self.df if df.empty else df
    self.df_train, self.df_test = self.split_df(frame)

    train_X, train_y = self.ready_for_model_train(self.df_train)
    test_X, test_y = self.ready_for_model_test(self.df_test)

    # Integer ranges sampled for each candidate ('min_samples_leaf' is
    # deliberately left out).
    distributions = {}
    distributions['n_estimators'] = sp_randint(250, 400)
    distributions['max_features'] = sp_randint(5, 50)
    distributions['max_depth'] = sp_randint(5, 30)

    tuner = RandomizedSearchCV(RandomForestClassifier(bootstrap=False),
                               n_jobs=4, n_iter=15,
                               param_distributions=distributions,
                               scoring='accuracy')

    print("Finding the best parameters..")
    tuner.fit(train_X, train_y.ravel())

    print("Getting predicts for..")
    y_hat = tuner.predict(test_X)

    print('Best Params: \n')
    winners = tuner.best_params_
    for key, value in winners.items():
        print(key, value)

    print("Accuracy with Random Forest = %4.4f" % accuracy_score(test_y.ravel(), y_hat))
    return tuner.best_params_
def make_prediction(pipe, X_train, y_train, X_test):
    """
    Evaluate 100 sampled parameter settings of the pipeline with 5-fold
    cross-validation, keep the best one (refitted on the training data) and
    predict the test data with it.

    :param pipe: main pipeline, output of prepare_pipeline()
    :param X_train: training dataset, output of prepare_dataset(raw_train)
    :param y_train: target column of the training set
    :param X_test: testing dataset, output of prepare_dataset(raw_test)
    :return: pandas dataframe with two features: PassengerId and Survived
        (prediction for the test set)
    """
    # Search space for the 'svc' step of the pipeline.
    search_space = {
        'svc__C': stats.uniform(loc=0, scale=10),
        'svc__decision_function_shape': [None, 'ovo', 'ovr'],
        'svc__shrinking': [True, False],
    }

    searcher = RandomizedSearchCV(pipe, search_space, cv=5,
                                  scoring='accuracy', n_iter=100)
    searcher.fit(X_train, y_train)

    best_cv_accuracy = searcher.best_score_ * 100
    print('Estimated accuracy: {:.1f} %'.format(best_cv_accuracy))

    passenger_ids = X_test.index
    survived = searcher.predict(X_test)
    return pd.DataFrame({'PassengerId': passenger_ids, 'Survived': survived})
# Load the concatenated training and test tables.
train = pd.read_csv("./Desktop/schiz/concat_train/trainconcat.csv")
test = pd.read_csv("./Desktop/schiz/concat_test/testconcat.csv")

# Columns 1..410 (by position) hold the features.  ``.iloc`` replaces the
# removed ``.ix`` indexer and selects the same positional slice as the
# ``features`` name list below.
train_features = train.iloc[:, 1:411]  # train data features
train_label = train["Class"]  # train data labels

# Mean-centre and range-scale the features.  The statistics are kept so that
# exactly the same transformation can be applied to the test features.
feature_mean = train_features.mean()
feature_range = train_features.max() - train_features.min()
train_features = (train_features - feature_mean) / feature_range

features = list(train.columns[1:411])  # list of train feature names
label = list(train["Class"])
print("Preprocessing data")

# Randomized search over the regularisation strength of an L2-penalised
# logistic regression.
param_distributions = {'C': expon()}
svc = LogisticRegression(penalty='l2', C=1.0, fit_intercept=True,
                         solver='liblinear')
clf = RandomizedSearchCV(svc, param_distributions=param_distributions,
                         n_iter=10000)
clf.fit(train_features, label)

# 2-fold ROC-AUC estimate of the whole search procedure.
scores = cross_validation.cross_val_score(clf, train_features, label,
                                          cv=2, scoring='roc_auc')
print(scores)

print("Training Logistic Regression")
# Scale the test features with the training statistics; the model was fitted
# on scaled inputs, so predicting on raw values would be inconsistent.
test_feature = (test[features] - feature_mean) / feature_range

print("Make predictions on the test set")
# NOTE(review): ``predict`` returns class labels although the column is named
# "probability" -- confirm whether ``predict_proba`` was intended.
test_probs = clf.predict(test_feature)
submission = pd.DataFrame({"id": test["Id"], "probability": test_probs})
submission.to_csv("rf_xgboost_submission.csv", index=False)

# The scores above were computed with scoring='roc_auc', not accuracy.
print("ROC AUC: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
def XGB_Regressor(Train_DS, y, Actual_DS, Sample_DS, Parms_DS_XGB, Grid, Ensemble):
    """Fit XGBoost regressors in up to three modes and predict ``Actual_DS``.

    The training data is shuffled first (``random_state=21``).

    * ``Grid`` truthy: a 500-candidate randomized search (10-fold CV, scored
      with ``gini_scorer``).  The ranked candidates returned by ``report``
      are written to CSV and replace ``Parms_DS_XGB``; the refitted search
      predicts ``Actual_DS`` and a submission file is written.
    * ``Ensemble`` truthy: 20 models are trained from rows 0..19 of
      ``Parms_DS_XGB`` and their predictions are stored column-wise in a CSV.
    * Neither: one model with fixed parameters is cross-validated, fitted
      and used to write the submission file.

    Args:
        Train_DS: Training features.
        y: Training targets.
        Actual_DS: Features to predict.
        Sample_DS: Sample submission; supplies ``Id`` values and the output
            column names.
        Parms_DS_XGB: Parameter table used by the ensemble mode.
        Grid: Enables the randomized search.
        Ensemble: Enables the ensemble mode.

    Returns:
        The predictions of the last model that was fitted.
    """
    print("***************Starting xgb Regressor (sklearn)***************")

    n_iter_search = 500
    Train_DS, y = shuffle(Train_DS, y, random_state=21)

    if Grid:
        # Used for checking the best performance of the model over sampled
        # hyper-parameters.
        print("Starting model fit with Grid Search")

        # Parameters and distributions to sample from.
        param_dist = {
            "n_estimators": [10],
            "max_depth": sp_randint(1, 25),
            "min_child_weight": sp_randint(1, 25),
            "subsample": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
            "colsample_bytree": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
            "silent": [True],
            "gamma": [0.5, 0.6, 0.7, 0.8, 0.9, 1, 2],
        }

        clf = xgb.XGBRegressor(nthread=4)
        clf = RandomizedSearchCV(clf, param_distributions=param_dist,
                                 n_iter=n_iter_search, scoring=gini_scorer,
                                 cv=10)

        start = time()
        clf.fit(Train_DS, y)
        print("RandomizedSearchCV took %.2f seconds for %d candidates"
              " parameter settings." % ((time() - start), n_iter_search))

        Parms_DS_Out = report(clf.grid_scores_, n_top=n_iter_search)
        Parms_DS_Out.to_csv(file_path + "Parms_DS_XGB_1001.csv")
        # Make the fresh ranking available to the ensemble step below.
        Parms_DS_XGB = Parms_DS_Out

        print("Best estimator found by grid search:")
        print(clf.best_estimator_)

        # Predict with the refitted best model and write the submission.
        pred_Actual = clf.predict(Actual_DS)
        print("Actual Model predicted")
        preds = pd.DataFrame(pred_Actual, index=Sample_DS.Id.values,
                             columns=Sample_DS.columns[1:])
        preds.to_csv(file_path + "output/Submission_Roshan_XGB_1.csv",
                     index_label="Id")

    if Ensemble:
        print("Starting ensembling")
        Ensemble_DS = pd.DataFrame()

        for i in range(20):
            clf = xgb.XGBRegressor(
                n_estimators=2000,
                max_depth=Parms_DS_XGB["max_depth"][i],
                learning_rate=0.01,
                nthread=4,
                min_child_weight=Parms_DS_XGB["min_child_weight"][i],
                subsample=Parms_DS_XGB["subsample"][i],
                colsample_bytree=Parms_DS_XGB["colsample_bytree"][i],
                silent=True,
                gamma=Parms_DS_XGB["gamma"][i],
            )
            clf.fit(Train_DS, y)
            # Called for whatever it reports itself; the returned score is
            # not used here.
            Nfold_Cross_Valid(Train_DS, y, clf)

            pred_Actual = clf.predict(Actual_DS)
            Ensemble_DS[i] = pred_Actual
            print(" %d - Model Completed..." % (i + 1))

        Ensemble_DS.to_csv(file_path + "Ensemble_DS_XGB_1.csv")

    # Truthiness (rather than ``== False``) also routes ``None`` flags into
    # the baseline branch, so ``pred_Actual`` is always bound at the return.
    if not Grid and not Ensemble:
        # Experiment log of earlier settings (n_estimators / max_depth /
        # learning_rate / min_child_weight / subsample / colsample_bytree /
        # gamma):
        #   CV:0.38604935169439381, LB:0.382479; CV:0.38614992702270973 (with
        #     std scaler): 1000 / 7 / 0.01 / 5 / 0.8 / 0.8 / 1  (nthread=2)
        #   CV:0.0.38540501304758473: 1000 / 8 / 0.01 / 5 / 0.8 / 0.8 / 1
        #   CV:0.38672594800194787: 2000 / 6 / 0.01 / 15 / 1 / 0.5 / 0.8
        #     (this one used to be constructed and immediately overwritten)
        #   CV : 0.38594904255042506): 1000 / 5 / 0.02 / 1 / 1 / 0.9 / 1
        #   CV : 0.38335661759105549 , 0.3877 in 2000 iter:
        #     2000 / 5 / 0.01 / 19 / 1 / 0.3 / 0.6
        #   CV : 0.3850 in 1000 , 0.3877 in 2000 iter: the model below
        clf = xgb.XGBRegressor(
            n_estimators=2000,
            max_depth=5,
            learning_rate=0.01,
            nthread=4,
            min_child_weight=20,
            subsample=0.8,
            colsample_bytree=0.4,
            silent=True,
            gamma=0.6,
        )

        Nfold_Cross_Valid(Train_DS, y, clf)
        clf.fit(Train_DS, y)

        # Predict with the fitted model and write the submission.
        pred_Actual = clf.predict(Actual_DS)
        print("Actual Model predicted")
        preds = pd.DataFrame(pred_Actual, index=Sample_DS.Id.values,
                             columns=Sample_DS.columns[1:])
        preds.to_csv(file_path + "output/Submission_Roshan_XGB_1.csv",
                     index_label="Id")

    print("***************Ending xgb Regressor (sklearn)***************")
    return pred_Actual
#y_test = y_test[:100]

# Randomized search over an RBF-kernel SVC; C and gamma are drawn from
# exponential distributions.
tuned_parameters = {
    'kernel': ['rbf'],
    'gamma': expon(scale=.1),
    'C': expon(scale=100)
}
clf = RandomizedSearchCV(SVC(C=1), tuned_parameters, cv=5,
                         scoring='accuracy', n_jobs=-1)
clf.fit(X_train, y_train)
print(clf.best_estimator_)

pred = clf.predict(X_test)
pred_label = le.inverse_transform(pred)

# One row per prediction, indexed from 1.
submission = DataFrame({'label': pred_label},
                       columns=['label'],
                       index=np.arange(1, len(pred) + 1))

# One-hot indicator columns: 1.0 where the encoded prediction equals the
# class id, 0.0 elsewhere.  Building each column in a single assignment
# avoids the chained ``submission[col][mask] = ...`` writes, which sized
# their right-hand side with ``len(mask)`` instead of the number of matches.
for class_id in range(1, 5):
    submission['Class_%d' % class_id] = (pred == class_id).astype(float)

submission['Class_5'] = np.zeros(len(pred))
# PCA followed by an RBF-kernel SVM.
pipe = Pipeline(steps=[('pca', pca), ('rbfSVM', rbfSVM)])

param_dist = {
    "pca__n_components": sp_randint(10, 700),
    "rbfSVM__C": scipy.stats.expon(scale=10),
    "rbfSVM__kernel": ["rbf"],
    "rbfSVM__gamma": scipy.stats.expon(scale=0.01)
}

n_iter_search = 500
random_search = RandomizedSearchCV(pipe,
                                   param_distributions=param_dist,
                                   n_iter=n_iter_search,
                                   cv=cv, verbose=6, n_jobs=4)
random_search.fit(X_train, Y_train)

predicted_held_out = random_search.predict(X_test)
# NOTE(review): predictions are passed first, so rows of ``mmat`` index the
# predicted class (scikit-learn's signature is (y_true, y_pred)) -- confirm
# this orientation is intended.
mmat = confusion_matrix(predicted_held_out, Y_test)
print(mmat)

# Map class labels to integer codes before computing kappa.
class_map = dict(zip(set(input_kmers_counts["class"]), range(0, 4)))
kappa([class_map[x] for x in Y_test],
      [class_map[x] for x in predicted_held_out])

# We determine whether the variance of the number of components for the best CV
# Sort a copy so the search object's own score list is left untouched.
all_scores = sorted(random_search.grid_scores_,
                    key=lambda x: x.mean_validation_score)

# Pickles are binary data; write them through a binary-mode handle.
with open("random_search_scores_1000iter_5mers.bdat", "wb") as f:
    cPickle.dump(all_scores, f)

# We generate a pandas data.frame with the results
import pandas
def RFC_Classifier(Train_DS, y, Actual_DS, Sample_DS, grid):
    """Train a random-forest classifier and write a submission file.

    With ``grid`` truthy, a randomized search (1000 candidates, 10-fold CV,
    scored with ``kappa_scorer``) is run over an SVD -> scaler -> random
    forest pipeline.  Otherwise a calibrated 500-tree forest is
    cross-validated and fitted directly.

    Args:
        Train_DS: Training features.
        y: Training labels.
        Actual_DS: Features to predict.
        Sample_DS: Sample submission; supplies ``id`` values and the output
            column names.
        grid: Selects the randomized-search mode.

    Returns:
        Predictions for ``Actual_DS``.
    """
    print("***************Starting RFC Classifier***************")

    if grid:
        # Use SVD (similar to PCA).
        svd = TruncatedSVD(algorithm='randomized', n_iter=5,
                           random_state=None, tol=0.0)

        # Initialize the standard scaler.
        scl = StandardScaler(copy=True, with_mean=True, with_std=True)

        # Used for checking the best performance of the model over sampled
        # hyper-parameters.
        print("Starting model fit with Grid/Random Search")

        RFC_model = RandomForestClassifier(n_estimators=500, n_jobs=-1)

        # Create the pipeline.
        clf = pipeline.Pipeline([('svd', svd),
                                 ('scl', scl),
                                 ('RFC', RFC_model)])

        # Parameters and distributions to sample from.  Parameters of a
        # pipeline step must be addressed as ``<step>__<param>``; the forest
        # settings used to lack the ``RFC__`` prefix, which makes
        # ``Pipeline.set_params`` reject them.
        param_dist = {
            "svd__n_components": [200, 300, 400, 500, 600, 700],
            "RFC__max_depth": [1, 2, 3, 4, 5, None],
            "RFC__max_features": sp_randint(1, 40),
            "RFC__min_samples_split": sp_randint(1, 20),
            "RFC__min_samples_leaf": sp_randint(1, 20),
            "RFC__bootstrap": [True, False],
        }

        # Run the randomized search.
        n_iter_search = 1000
        clf = RandomizedSearchCV(clf, param_distributions=param_dist,
                                 n_iter=n_iter_search,
                                 scoring=kappa_scorer, cv=10)

        start = time()
        clf.fit(Train_DS, y)
        print("RandomizedSearchCV took %.2f seconds for %d candidates"
              " parameter settings." % ((time() - start), n_iter_search))
        report(clf.grid_scores_)

        print("Best estimator found by grid search:")
        print(clf.best_estimator_)
        print(clf.grid_scores_)
        print(clf.best_score_)
        print(clf.best_params_)
        print(clf.scorer_)
    else:
        # The SVD / scaling steps are disabled for the direct fit.
        clf = RandomForestClassifier(n_jobs=-1, n_estimators=500,
                                     min_samples_split=1)
        clf = CalibratedClassifierCV(base_estimator=clf, method='sigmoid')

        # Called for whatever it reports itself; the returned score is not
        # used here.
        Nfold_Cross_Valid(Train_DS, y, clf)
        clf.fit(Train_DS, y)

    # Predict with the fitted model and write the submission.
    pred_Actual = clf.predict(Actual_DS)
    print("Actual Model predicted")

    preds = pd.DataFrame(pred_Actual, index=Sample_DS.id.values,
                         columns=Sample_DS.columns[1:])
    preds.to_csv(file_path + 'output/Submission_Roshan_RFC.csv',
                 index_label='id')

    print("***************Ending RFC Classifier***************")
    return pred_Actual
#rs.fit(a_in, a_out)

# Refuse to train on inconsistent data.
if len(X_train) != len(y_train):
    sys.stderr.write("Number of samples and number of labels do not match.")
    # Exit with a non-zero status so callers can detect the failure
    # (the bare ``exit()`` reported success).
    sys.exit(1)

for t in xrange(N):
    # Retry the fit until it completes without a RuntimeError.
    crash = True
    while crash:
        try:
            rs.fit(X_train, y_train)
            crash = False
        except RuntimeError:
            sys.stderr.write("--------------------- [Crashed by RunTimeERROR. restarting] --------------------- \n")
            crash = True

    sys.stderr.write("Best Parameters: %s, score: %s\n" % (str(rs.best_params_), str(rs.best_score_)))

    # Keep the first component of every predicted row.
    y_ = rs.predict(X_valid)
    y = [o[0] for o in y_]

    # File stem of the third command-line argument.
    input = sys.argv[3].split("/")[-1].split(".")[0]

    y_out = {}
    y_out['estimated_output'] = y
    y_out['best_params'] = rs.best_params_
    y_out['best_score'] = rs.best_score_

    # Append one line per run.
    with open("nn_output_headlines_30_d2v_conv_300_m5.txt", "a") as f:
        f.write(str(y_out)+'\n')
# The first ``temp`` rows form the validation set, the rest the training set.
# Column 0 holds the label, the remaining columns the features.
cv_data = train_data[0:temp, :]
train_data2 = train_data[temp:, :]

forest = RandomForestClassifier(n_estimators=25)

# Run the randomized search.
n_iter_search = 30
random_search = RandomizedSearchCV(forest,
                                   param_distributions=param_dist,
                                   n_iter=n_iter_search,
                                   cv=6)
start = time()
random_search.fit(train_data2[:, 1:], train_data2[:, 0])
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
report(random_search.grid_scores_)

train_output = random_search.predict(train_data2[:, 1:])
cv_output = random_search.predict(cv_data[:, 1:])

# Accuracy = share of rows whose prediction equals the label in column 0.
train_accuracy = (len(train_data2[train_output == train_data2[:, 0]])
                  / float(len(train_data2)))
cv_accuracy = (len(cv_data[cv_output == cv_data[:, 0]])
               / float(len(cv_data)))
print("Training set accuracy: %.3f CV set accuracy: %.3f"
      % (train_accuracy, cv_accuracy))

# Analyzing important features.
forest = random_search.best_estimator_
feature_importance = forest.feature_importances_
# Make importances relative to the maximum importance.
feature_importance = 100.0 * (feature_importance / feature_importance.max())
feature_list = df.columns.values
def run_prediction_random_gs_split(X_file, y_file, data_str,
                                   n_iter_search=400, n_jobs=15):
    """Tune a LinearSVR pipeline with a randomized search and evaluate it.

    The data is split 80/20 (``random_state=0``). The search runs 5-fold CV
    on the training part; the refit best pipeline is then scored on the
    held-out part (MAE and R^2) and a real-vs-predicted scatter plot is saved.

    :param X_file: path to a ``.npy`` file with the feature matrix
    :param y_file: path to a ``.npy`` file with the targets
    :param data_str: label used in output file names and the plot title
    :param n_iter_search: number of sampled parameter settings (default 400)
    :param n_jobs: number of parallel jobs for the search (default 15)
    :return: tuple ``(model_out_file, scatter_file, gs_text_file, gs_file,
        best_estimator)``; ``model_out_file`` is always ``''`` because
        pickling the fitted search is disabled
    """
    import os
    import pickle

    import numpy as np
    from scipy.stats import expon
    from scipy.stats import randint as sp_randint
    from sklearn.cross_validation import train_test_split
    from sklearn.feature_selection import SelectPercentile, f_regression
    from sklearn.feature_selection import VarianceThreshold
    from sklearn.grid_search import RandomizedSearchCV
    from sklearn.metrics import mean_absolute_error, r2_score
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import Imputer, StandardScaler
    from sklearn.svm import LinearSVR

    # fixme copied function
    def _pred_real_scatter(y_test, y_test_predicted, title_str, in_data_name):
        """Save a real-vs-predicted scatter plot as a PDF and return its path."""
        import pylab as plt
        from matplotlib.backends.backend_pdf import PdfPages

        # Draw on a fresh figure so repeated calls in one process do not pile
        # their points onto the same axes, and close it afterwards.
        plt.figure()
        try:
            plt.scatter(y_test, y_test_predicted)
            plt.plot([10, 80], [10, 80], 'k')
            plt.xlabel('real')
            plt.ylabel('predicted')
            ax = plt.gca()
            ax.set_aspect('equal')
            plt.title(title_str)
            plt.tight_layout()

            scatter_file = os.path.join(os.getcwd(), 'scatter_' + in_data_name + '.pdf')
            pp = PdfPages(scatter_file)
            try:
                pp.savefig()
            finally:
                pp.close()
        finally:
            plt.close()
        return scatter_file

    X = np.load(X_file)
    y = np.load(y_file)

    # fixme add squared values to X
    # X = np.hstack([X, np.square(X)])

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

    # Pipeline stages: impute -> variance threshold -> standardize ->
    # univariate feature selection -> linear SVR.
    pipe = Pipeline([
        ('fill_missing', Imputer()),
        ('var_thr', VarianceThreshold()),
        ('normalize', StandardScaler()),
        ('selection', SelectPercentile(f_regression)),
        ('regression_model', LinearSVR()),
    ])

    param_dist = {
        'selection__percentile': sp_randint(10, 100),
        'regression_model__C': expon(scale=100),
        'regression_model__epsilon': sp_randint(0, 100),
        'regression_model__loss': ['epsilon_insensitive', 'squared_epsilon_insensitive'],
    }

    gs = RandomizedSearchCV(pipe, param_distributions=param_dist, cv=5,
                            scoring='mean_absolute_error', n_jobs=n_jobs,
                            n_iter=n_iter_search)
    gs.fit(X_train, y_train)
    best_estimator = gs.best_estimator_
    grid_scores = gs.grid_scores_

    # Pickle streams are bytes, so the file must be opened in binary mode.
    gs_file = os.path.join(os.getcwd(), 'gs_' + data_str + '.pkl')
    with open(gs_file, 'wb') as f:
        pickle.dump(grid_scores, f)

    # Human-readable ranking of all candidates, best first.
    sorted_grid_score = sorted(grid_scores, key=lambda x: x.mean_validation_score, reverse=True)
    score_str = [str(n) + ': ' + str(g) for n, g in enumerate(sorted_grid_score)]
    gs_text_file = os.path.join(os.getcwd(), 'gs_txt_' + data_str + '.txt')
    with open(gs_text_file, 'w') as f:
        f.write('\n'.join(score_str))

    # fixme pickle crashes -- the fitted search is not persisted
    model_out_file = ''

    # Evaluate the refit best pipeline on the held-out split.
    y_predicted = gs.predict(X_test)
    cv_scores = mean_absolute_error(y_test, y_predicted)
    cv_scores_r2 = r2_score(y_test, y_predicted)

    title_str = '{}\n mae: {:.3f}\n r2: {:.3f}'.format(
        data_str, cv_scores, cv_scores_r2)
    scatter_file = _pred_real_scatter(y_test, y_predicted, title_str, data_str)

    return model_out_file, scatter_file, gs_text_file, gs_file, best_estimator
# run randomized search
n_iter_search = 40
random_search = RandomizedSearchCV(clf, param_distributions=QL_SVM_param_dist,
                                   n_iter=n_iter_search, cv=skf)

start = time()
random_search.fit(K_train, y_train)
print("Quasi_linear kernel SVM RandomSearch took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
# Format the estimator into the message itself: `print("..."), estimator`
# shows the estimator only under the Python 2 print statement and silently
# discards it when print is a function.
print("Random_search Best estimator is :\n%s" % (random_search.best_estimator_,))
report(random_search.grid_scores_, n_top=5)

# print the classification_report
# predict() delegates to the estimator refit with the best found parameters.
y_pred = random_search.predict(K_test)
print(classification_report(y_test, y_pred))
# Blank separator line (a bare `print()` prints "()" as a Python 2 statement).
print("")

# run grid search
# NOTE(review): the grid search is fit on K_X / Y but evaluated on K_test;
# confirm K_X does not contain the test samples.
grid_search = GridSearchCV(clf, param_grid=QL_SVM_param_dist, cv=skf)
start = time()
grid_search.fit(K_X, Y)

print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
      % (time() - start, len(grid_search.grid_scores_)))
print("Grid_search Best estimator is :\n%s" % (grid_search.best_estimator_,))
report(grid_search.grid_scores_, n_top=10)

# print the classification_report
y_pred = grid_search.predict(K_test)
print(classification_report(y_test, y_pred))
#Start with data with age # run randomized search n_iter_search = 20 random_search = RandomizedSearchCV(clf, param_distributions=param_dist, n_iter=n_iter_search,n_jobs=4, verbose=1) random_search.fit(x_train_std,y_train) print 'Reporting' print("RandomizedSearchCV took %.2f seconds for %d candidates" " parameter settings." % ((time() - start), n_iter_search)) report(random_search.grid_scores_) score=random_search.score(x_test_std,y_test) print 'Test score' print score print 'Predicting' output = random_search.predict(test_data_std) #Finally with data without age # run randomized search <<<<<<< HEAD n_iter_search = 20 ======= n_iter_search = 2000 >>>>>>> 5b0499dbec7ef19b9617d4339731063de092e370 random_search = RandomizedSearchCV(clf, param_distributions=param_dist, n_iter=n_iter_search,n_jobs=4, verbose=1) random_search.fit(x_train_std_noage,y_train_noage) print 'Reporting noage' print("RandomizedSearchCV noage took %.2f seconds for %d candidates"
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Feature loaders feeding an RBF-kernel SVM.
pipeline = Pipeline([
    ('data', FeatureUnion([('audio', AudioLoader()),
                           ('vad', VADLoader())])),
    ('svm', SVC(kernel='rbf', gamma=1e-5, C=20)),
])

paramdist = {
    'svm__C': np.logspace(0, 2, 50),
    'data__vad__stacksize': scipy.stats.randint(11, 51),
    'data__audio__stacksize': scipy.stats.randint(11, 51),
}

# NOTE(review): cv=1 requests a single fold, which k-fold splitters normally
# reject -- confirm this value is intended.
clf = RandomizedSearchCV(pipeline, paramdist, n_iter=500, verbose=1, cv=1, n_jobs=35)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Persist the winning parameters.
with open(
        path.join(data.BASEDIR,
                  'transcriber_rand_params_{0}.pkl'.format(monkey)),
        'wb') as fid:
    pickle.dump(clf.best_params_, fid, -1)

# Persist the held-out labels and predictions. pickle.dump needs the target
# file; without it the call raises TypeError and leaves an empty file behind.
with open(
        path.join(data.BASEDIR,
                  'transcriber_rand_results_{0}.pkl'.format(monkey)),
        'wb') as fid:
    pickle.dump((y_test, y_pred, labels), fid, -1)

print(monkey)
print(classification_report(y_test, y_pred, target_names=labels))
# Search space for the SVM. The original literal repeated the 'C' and 'kernel'
# keys; in a dict literal the last occurrence wins, so the effective space
# was always the RBF kernel only. It is written out explicitly here.
# NOTE(review): if 'linear' (or 'poly' with 'degree') was meant to be searched
# as well, add it to the 'kernel' list below.
tuned_parameters = {
    'C': [1, 10, 100, 500, 1000],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    'kernel': ['rbf'],
}

from sklearn.grid_search import RandomizedSearchCV

# 20 sampled candidates, 10-fold CV, selected by accuracy.
model_svm = RandomizedSearchCV(svm_model, tuned_parameters, cv=10, scoring='accuracy', n_iter=20)
model_svm.fit(X_train, y_train)
print(model_svm.best_score_)
print(model_svm.best_params_)

# Evaluate the refit best model on the held-out set.
y_pred = model_svm.predict(X_test)
print(metrics.accuracy_score(y_pred, y_test))

confusion_matrix = metrics.confusion_matrix(y_test, y_pred)
print(confusion_matrix)

class_report = metrics.classification_report(y_test, y_pred)
print(class_report)

auc_roc = metrics.roc_auc_score(y_test, y_pred)

from sklearn.metrics import roc_curve, auc

# ROC AUC computed from the hard predictions, not from decision scores.
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred)
roc_auc = auc(false_positive_rate, true_positive_rate)
print(roc_auc)

import matplotlib.pyplot as plt
def _clf_2_search_space(clf):
    """Return ``(banner, params)`` for the classifier type of *clf*.

    The type checks keep their original order and each parameter space is
    built only for the matching type. Raises ``TypeError`` for a classifier
    type that has no search space defined.
    """
    if isinstance(clf, tree.DecisionTreeClassifier):
        return ' Optimizing tree.DecisionTreeClassifier ...', {
            'criterion': ['gini', 'entropy'],
            'max_depth': [
                4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 20, 30, 40, 50, 70, 90, 120,
                150
            ]
        }
    if isinstance(clf, LogisticRegression):
        return ' Optimizing Logistic Reg ...', {
            'C': [0.001, 0.01, 0.1, 1, 10, 15, 20, 30, 40, 100, 1000],
            'penalty': ['l1', 'l2']
        }
    if isinstance(clf, RandomForestClassifier):
        # NOTE(review): recent scikit-learn releases reject
        # min_samples_split=1; confirm against the installed version.
        return ' Optimizing Random Forest ...', {
            "max_depth": [3, 5, None],
            "max_features": [1, 2, 3, 4, 5, 7, 9],
            "min_samples_split": [1, 2, 3, 4, 5, 7, 9],
            "min_samples_leaf": [1, 2, 3, 4, 5, 7, 9],
            "bootstrap": [True, False],
            "criterion": ["gini", "entropy"]
        }
    if isinstance(clf, svm.SVC):
        C_range = 10.0**np.arange(-4, 4)
        gamma_range = 10.0**np.arange(-4, 4)
        return ' Optimizing SVM ...', {
            'C': C_range.tolist(),
            'gamma': gamma_range.tolist(),
            'kernel': ['rbf', 'linear', 'poly', 'sigmoid']
        }
    if isinstance(clf, MultinomialNB):
        return ' Optimizing MultinomialNB ...', {
            'alpha': [1, 0.1, 0.01, 0.001, 0.0001, 0.00001],
            'fit_prior': [True, False]
        }
    if isinstance(clf, AdaBoostClassifier):
        return ' Optimizing Ada Boost ...', {
            'learning_rate': stats.expon(scale=1.0),
            'n_estimators': stats.randint(low=20, high=100)
        }
    if isinstance(clf, KNeighborsClassifier):
        return ' Optimizing KNN Neighbors ...', {
            'n_neighbors': list(range(2, 10)),
            'weights': ['uniform', 'distance']
        }
    raise TypeError(
        "No hyper-parameter search space defined for %s" % type(clf).__name__)


def get_trained_clf_2(df, category, X_train_counts, Y, count_vect, clf,
                      clf_name):
    """Tune *clf* with a randomized search and summarize the result.

    :param df: unused by the active code (kept for the call signature)
    :param category: unused by the active code
    :param X_train_counts: training feature matrix
    :param Y: training labels
    :param count_vect: unused by the active code
    :param clf: classifier instance; its type selects the search space
    :param clf_name: display name stored in the result
    :return: dict with "Model name", "Best Score" (mean recall from a 5-fold
        cross-validation of the whole search), "Model" (the fitted search)
        and one entry per tuned parameter holding its best value
    :raises TypeError: if the type of *clf* is not supported
    """
    banner, params = _clf_2_search_space(clf)
    print('=' * 100)
    print(banner)

    # Every classifier uses the same search settings; n_iter=10 is also the
    # RandomizedSearchCV default that the branches without an explicit n_iter
    # relied on.
    gs_clf = RandomizedSearchCV(estimator=clf,
                                param_distributions=params,
                                cv=5,
                                n_jobs=-1,
                                n_iter=10)

    start = time.time()
    ## train classifier for recall and precision measurements
    gs_clf.fit(X_train_counts, Y)
    print(" Optimization process took %g s" % (time.time() - start))

    ## get validation score for given classifier
    # The complete search is re-run inside each of the 5 outer folds.
    print(' Cross validation for ' + clf_name)
    scores = cross_val_score(gs_clf, X_train_counts, Y, scoring='recall', cv=5)

    ## Print accuracy
    # Predictions are made on the training data itself, so this accuracy is
    # not a held-out estimate.
    predictions = gs_clf.predict(X_train_counts)
    print('\n Best score: ', np.mean(scores))
    print(' Prediction accuracy score: ', accuracy_score(Y, predictions))
    print(' Confusion matrix:')
    display(
        pandas.crosstab(pandas.Series(Y),
                        predictions,
                        rownames=['True'],
                        colnames=['Predicted'],
                        margins=True))
    print('\n')

    # The false-negative / false-positive based recall and precision
    # measurements, and their "Recall" / "Precision" result entries, are
    # currently disabled.

    ## Persist classifier and it's scores to dict
    results_dict = {}
    results_dict["Model name"] = clf_name
    results_dict["Best Score"] = np.mean(scores)
    results_dict["Model"] = gs_clf
    for param_name in sorted(params):
        results_dict[param_name] = gs_clf.best_params_[param_name]

    return results_dict
# Fit a 1000-component PCA on the training features and project them.
pca_transformer = PCA(n_components=1000)
pca_transformer.fit(x_train)
# Bare expression: the summed explained-variance ratio is computed but only
# shown when run interactively.
pca_transformer.explained_variance_ratio_.sum()
x_train = pca_transformer.transform(x_train)
# Standardize the training target (zero mean, unit variance).
scaler_y = StandardScaler(copy=True, with_mean=True, with_std=True)
y_train = scaler_y.fit_transform(y_train.values.reshape(-1, 1))
# `grid` is a search object created outside this fragment.
grid.fit(x_train, y_train.ravel())
# Bare expression: candidate scores sorted by mean validation score.
pd.DataFrame(grid.grid_scores_).sort_values("mean_validation_score")
# Apply the feature transforms to the test set; `poli` is defined outside
# this fragment.
x_test = poli.transform(x_test)
x_test = pca_transformer.transform(x_test)
# NOTE(review): predictions are exponentiated (np.e**...) while the model was
# fit on a StandardScaler-transformed target, and they are compared below
# against the untransformed y_test -- confirm the intended inverse transform.
pd.DataFrame([ np.e**grid.predict(x_test), scaler_y.transform(y_test.values.reshape(-1, 1)) ])
mean_squared_error(y_test, np.e**grid.predict(x_test))
# The numbers below are presumably scores recorded from earlier runs -- TODO confirm.
# 2004,805,126
# 716,852,668
# 759,107,470
# 2057,570,962
# 1260,684,066
# 1689,874,386
# 1608,326,518
# 5405,998,897
# 715,551,980
# 778,085,588
# 804,713,150
# 938,380,884
# Randomized hyper-parameter search over a gradient-boosting classifier.
gbdt = GradientBoostingClassifier(verbose=1)
searchcv = RandomizedSearchCV(estimator=gbdt, param_distributions=param_dist, n_iter=200, verbose=1)
searchcv.fit(Xtrain, ytrain)

# Bare expressions: only displayed when run interactively.
searchcv.best_score_
searchcv.best_estimator_
searchcv.best_params_

# ---------------------- predict
titanic_test = pd.read_csv("test_processed.csv", index_col="PassengerId")
Xtest = titanic_test[feature_names]

predictions = searchcv.predict(Xtest)
submission = pd.DataFrame({
    "PassengerId": titanic_test.index,
    "Survived": predictions
})
submission.to_csv("submit_gbdt.csv", index=False)

import pickle

# Load a previously pickled model; `with` closes the file even if
# unpickling fails.
# NOTE(review): pickle.load executes arbitrary code from the file -- only
# load 'gbdt.pkl' from a trusted source.
with open('gbdt.pkl', 'rb') as inf:
    gbdt = pickle.load(inf)

# Feature importances (rounded to 4 decimals) paired with names, largest first.
sorted(zip(map(lambda x: round(x, 4), gbdt.feature_importances_), feature_names), reverse=True)
def main(fold_num=0):
    """Train and evaluate a linear sentence classifier for one CV fold.

    Builds hashed text n-gram, concept (CUI plus graph ancestors) and
    positional features, tunes an averaged SGD hinge classifier with a
    randomized search, and pickles the raw decision scores, the predicted
    labels and the gold labels for the fold's test split.

    :param fold_num: fold index used in the input and output file names
    """
    # Pickles are byte streams: read them in binary mode and close the files.
    with open("fold_%s_train_ids.pickle" % fold_num, "rb") as f:
        train_ids = pickle.load(f)
    with open("fold_%s_test_ids.pickle" % fold_num, "rb") as f:
        test_ids = pickle.load(f)

    # get the data
    with open('cui_data_sent5.csv', 'r') as f:
        data = list(csv.DictReader(f))

    # A row with several '|'-separated labels becomes one row per label.
    out = []
    for row in data:
        for label in row['label'].split('|'):
            new_row = row.copy()
            new_row['label'] = label
            out.append(new_row)
    data = out

    # cui graph
    with open('graph_subset.pck', 'rb') as f:
        cui_graph = pickle.load(f)

    # de-unicode all this
    for row in data:
        row["sent"] = unidecode.unidecode(row["sent"])

    def cui2int(cui):
        """Drop the first character of a CUI and parse the rest as an int."""
        return int(cui[1:])

    def cui_ancestors(cui):
        """Return the graph ancestors of *cui* as a list of ints."""
        return list(nx.ancestors(cui_graph, cui2int(cui)))

    # Generate text features
    vec = HashingVectorizer(ngram_range=(1, 3), stop_words='english')
    X_text = vec.transform((row['sent'] for row in data))

    # Generate concept features as a CSR matrix, one row per sentence.
    indptr = [0]
    indices = []
    csr_data = []
    for row in data:
        ancestors = [cui2int(row['cui'])]  # remember the index cui!
        try:
            ancestors = ancestors + cui_ancestors(row['cui'])
        except Exception:
            # Best effort: if the ancestor lookup fails, keep only the CUI
            # itself. Unlike a bare `except`, this does not swallow
            # KeyboardInterrupt / SystemExit.
            pass
        for ancestor in ancestors:
            indices.append(ancestor)
            csr_data.append(1)
        indptr.append(len(indices))

    X_cuis = csr_matrix((csr_data, indices, indptr),
                        shape=(len(data), 10000000),
                        dtype=np.int64)

    # and positional features: position * 4 selects one of five buckets
    X_pos = np.zeros(shape=(len(data), 5))
    for i, row in enumerate(data):
        X_pos[(i, int(float(row['position']) * 4))] = 5

    # and answers
    y = np.array([row["label"] for row in data])

    # combine primary and secondary outcome for now
    y[y == 'secondary_outcome'] = 'outcome'
    y[y == 'primary_outcome'] = 'outcome'

    X = hstack([X_text, X_cuis, X_pos], format='csr')

    X_train = X[train_ids, :]
    X_test = X[test_ids, :]
    y_train = y[train_ids]
    y_test = y[test_ids]

    K = 5
    targets = ['population', 'interventions', 'outcome']

    # bcw -- making comparable to CNN approach (macro F1 over the targets;
    # an F2 scorer favouring recall was used previously)
    f_scorer = make_scorer(fbeta_score,
                           beta=1,
                           labels=targets,
                           average='macro')

    # generate hyperparameter search space: every weight triple in 1..49
    weight_space = range(1, 50)
    class_weights = [
        dict(zip(targets, (w1, w2, w3)))
        for w1 in weight_space
        for w2 in weight_space
        for w3 in weight_space
    ]

    parameters = {
        'alpha': np.logspace(-1, -20, 50),
        'class_weight': class_weights
    }

    # The keyword is `class_weight`; the misspelt `class_weights` makes the
    # constructor raise TypeError before the search can start.
    clf = SGDClassifier(average=True, loss="hinge", class_weight="balanced")

    # do the random grid search thing
    grid_search = RandomizedSearchCV(clf,
                                     param_distributions=parameters,
                                     n_iter=38,
                                     verbose=3,
                                     n_jobs=19,
                                     scoring=f_scorer,
                                     cv=K)
    grid_search.fit(X_train, y_train)

    # Raw decision scores for the test split.
    y_hat = grid_search.decision_function(X_test)
    with open("lm_raw_predictions_%s.pickle" % fold_num, 'wb') as outf:
        pickle.dump(y_hat, outf)

    # Hard label predictions and the matching gold labels.
    y_hat = grid_search.predict(X_test)
    with open("lm_predictions_%s.pickle" % fold_num, 'wb') as outf:
        pickle.dump(y_hat, outf)

    with open("lm_y_%s.pickle" % fold_num, 'wb') as outf:
        pickle.dump(y_test, outf)
def _write_metric(path, value):
    """Write ``str(value)`` plus a newline to *path* (UTF-8) and close the file."""
    with codecs.open(path, 'w', "utf-8") as handle:
        handle.write(str(value) + "\n")


def run(args):
    """Tune, persist and evaluate an ExtraTrees regressor.

    Reads the training data, optionally scales it and selects features with
    RandomizedLasso, tunes an ExtraTreesRegressor by randomized search (MAE
    scoring, shuffled K-fold), saves the models and best parameters, and then
    writes MAE / RMSE, a mean-prediction baseline and the predictions for
    every test set found under ``args.test_data`` / ``args.test_labels``.

    :param args: parsed options; the code reads ``training_data``,
        ``training_labels``, ``delimiter``, ``scale``, ``select_features``,
        ``iterations``, ``folds``, ``seed``, ``output_folder``,
        ``models_dir``, ``test_data`` and ``test_labels``
    """
    X_train = np.nan_to_num(
        np.genfromtxt(args.training_data, delimiter=args.delimiter))
    y_train = np.clip(np.genfromtxt(args.training_labels), 0, 1)

    # `scaler` / `sel_est` stay None when the matching option is off, so the
    # persistence step below can skip them instead of raising NameError.
    scaler = None
    sel_est = None

    X_trains = X_train
    if args.scale:
        print("Scaling features (mean removal divided by std)...")
        scaler = StandardScaler().fit(X_train)
        X_trains = scaler.transform(X_train)

    # create output folders
    outF = args.output_folder + "/" + os.path.basename(
        args.training_data) + "--FS_" + str(
            args.select_features) + "--i_" + str(args.iterations)
    buildDir(outF)
    maskF = outF + "/masks/"
    buildDir(maskF)
    paramF = outF + "/parameters/"
    buildDir(paramF)

    # initializes numpy random seed
    np.random.seed(args.seed)

    # performs feature selection
    featsel_str = ".all-feats"
    if args.select_features:
        print("Performing feature selection ...")
        # initializes selection estimator
        sel_est = RandomizedLasso(alpha="bic",
                                  verbose=True,
                                  max_iter=1000,
                                  n_jobs=8,
                                  random_state=args.seed,
                                  n_resampling=1000)
        sel_est.fit(X_trains, y_train)
        X_trains = sel_est.transform(X_trains)

        selected_mask = sel_est.get_support()
        selected_features = sel_est.get_support(indices=True)

        sel_feats_path = os.sep.join(
            [maskF, os.path.basename(args.training_data)])

        # saves indices
        np.savetxt(sel_feats_path + ".idx", selected_features, fmt="%d")
        # saves mask
        np.save(sel_feats_path + ".mask", selected_mask)
        # NOTE(review): this suffix is assumed to belong to the
        # feature-selection branch (the ".all-feats" default would otherwise
        # never be used) -- confirm.
        featsel_str = ".randcv"

    estimator = ExtraTreesRegressor(random_state=args.seed, n_jobs=1)

    # MAE is an error, so lower is better.
    mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)

    # performs parameter optimization using random search
    print("Performing parameter optimization ... ")
    param_distributions = {
        "n_estimators": [5, 10, 50, 100, 200, 500],
        "max_depth": [3, 2, 1, None],
        "max_features": ["auto", "sqrt", "log2",
                         int(X_trains.shape[1] / 2.0)],
        "min_samples_split": sp_randint(1, 11),
        "min_samples_leaf": sp_randint(1, 11),
        "bootstrap": [True, False],
    }

    search = RandomizedSearchCV(estimator,
                                param_distributions,
                                n_iter=args.iterations,
                                scoring=mae_scorer,
                                n_jobs=8,
                                refit=True,
                                cv=KFold(X_train.shape[0],
                                         args.folds,
                                         shuffle=True,
                                         random_state=args.seed),
                                verbose=1,
                                random_state=args.seed)

    # fits model using best parameters found
    search.fit(X_trains, y_train)

    # ................SHAHAB ........................
    # Train a separate regressor with the best parameters and persist it.
    # It uses random_state=42, not args.seed, and is NOT the model evaluated
    # below (that is `search`).
    estimator2 = ExtraTreesRegressor(verbose=1,
                                     random_state=42,
                                     n_jobs=8,
                                     **search.best_params_)
    estimator2.fit(X_trains, y_train)
    from sklearn.externals import joblib
    print("koooonnn %s" % args.models_dir)
    joblib.dump(estimator2, args.models_dir + "/XRT.pkl")
    if scaler is not None:
        joblib.dump(scaler, args.models_dir + "/scaler.pkl")
    if sel_est is not None:
        joblib.dump(sel_est, args.models_dir + "/sel_est.pkl")
    # ................SHAHAB ........................

    # Two spaces on purpose: matches the output of the former
    # `print "Best parameters: ", params` statement.
    print("Best parameters:  %s" % (search.best_params_,))

    # saves parameters on yaml file
    param_path = os.sep.join([paramF, os.path.basename(
        args.training_data)]) + featsel_str + ".params.yaml"
    with codecs.open(param_path, "w", "utf-8") as param_file:
        yaml.dump(search.best_params_, stream=param_file)

    testF = os.sep.join([outF, "/test/"])
    buildDir(testF)

    # Training-label mean, used as the constant baseline prediction.
    m = y_train.mean()

    # evaluates model on the different test sets
    test_features = sorted(glob.glob(args.test_data + os.sep + "*"))
    test_labels = sorted(glob.glob(args.test_labels + os.sep + "*"))
    for test_feature, test_label in zip(test_features, test_labels):
        print("Evaluating on %s" % test_label)
        X_test = np.nan_to_num(
            np.genfromtxt(test_feature, delimiter=args.delimiter))
        y_test = np.clip(np.genfromtxt(test_label), 0, 1)

        # Apply the same preprocessing as for the training data.
        X_tests = X_test
        if args.scale:
            X_tests = scaler.transform(X_test)
        if args.select_features:
            X_tests = sel_est.transform(X_tests)

        # gets predictions on test set, clipped to the label range [0, 1]
        y_pred = np.clip(search.predict(X_tests), 0, 1)

        # evaluates on test set
        mae = mean_absolute_error(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        print("Test MAE = %2.8f" % mae)
        print("Test RMSE = %2.8f" % rmse)
        print("Prediction range: [%2.4f, %2.4f]" % (y_pred.min(),
                                                    y_pred.max()))

        # saves evaluation
        testFX = testF + "/" + os.path.basename(test_label)
        buildDir(testFX)
        buildDir(testFX + "/evaluation/")
        eval_path = os.sep.join([testFX, "evaluation", os.path.basename(
            args.training_data)]) + featsel_str + "--" + os.path.basename(
                test_label)
        _write_metric(eval_path + ".mae", mae)
        _write_metric(eval_path + ".rmse", rmse)

        # baseline on test set: always predict the training mean
        mu = m * np.ones(y_test.shape[0])
        maeB = mean_absolute_error(y_test, mu)
        rmseB = np.sqrt(mean_squared_error(y_test, mu))
        print("Test MAE Baseline= %2.8f" % maeB)
        print("Test RMSE Baseline= %2.8f" % rmseB)
        _write_metric(eval_path + ".mae.Base", maeB)
        _write_metric(eval_path + ".rmse.Base", rmseB)

        # saves predictions
        buildDir(testFX + "/predictions/")
        preds_path = os.sep.join([testFX, "predictions", os.path.basename(
            args.training_data)]) + featsel_str + "--" + os.path.basename(
                test_label) + ".preds"
        np.savetxt(preds_path, y_pred, fmt="%2.15f")
# Randomized search over a random-forest regressor (3-fold CV, 100 samples).
Rforest = RandomForestRegressor()
grid_search = RandomizedSearchCV(
    Rforest,
    cv=3,
    param_distributions=paramDist,
    n_iter=100,
    n_jobs=4,
    scoring='mean_squared_error',
)
grid_search.fit(Hold_out, y_test)

scoresGrid = grid_search.grid_scores_
print(grid_search.best_score_)
print(grid_search.best_estimator_)
report(scoresGrid)

# Predictions are mapped back with expm1 (the inverse of log1p).
finalpred = np.expm1(grid_search.predict(Ypredict))

# Pair every non-NaN id with its prediction, one (id, cost) row each.
pred = np.vstack([
    np.array(mat[~np.isnan(mat['id'])]['id'], dtype=np.int16),
    finalpred,
]).T
pred = pd.DataFrame(pred, columns=['id', 'cost'])
pred['id'] = pred['id'].astype(np.int16)

# Time-stamp the output file name so runs do not overwrite each other.
ts = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
pred.to_csv('pred06-stack' + ts + '.csv', index=False)

"""
p1 = pd.read_csv('pred04-stack.csv')
p2 = pd.read_csv('pred05-stack.csv')
pred = pd.concat([p1['id'],(p1['cost']+p2['cost'])/2],axis=1)
pred.to_csv('pred05-stack05-04.csv',index=False)
"""
max_depth=max_depth_dist) gbdt = GradientBoostingClassifier(verbose=1) searchcv = RandomizedSearchCV(estimator=gbdt, param_distributions=param_dist,n_iter=200,verbose=1) searchcv.fit(Xtrain,ytrain) searchcv.best_score_ searchcv.best_estimator_ searchcv.best_params_ # ---------------------- predict titanic_test = pd.read_csv("test_processed.csv",index_col="PassengerId") Xtest = titanic_test[feature_names] predictions = searchcv.predict(Xtest) submission = pd.DataFrame({ "PassengerId": titanic_test.index, "Survived": predictions }) submission.to_csv("submit_gbdt.csv", index=False) import pickle inf = open('gbdt.pkl', 'rb') gbdt = pickle.load(inf) inf.close()
# NOTE(review): the accumulators and the division by `trials` suggest this
# fragment sits inside a per-trial loop whose header is outside this view;
# the statements are laid out flat here -- confirm the nesting.

# First fifth of the rows is the validation set. Floor division keeps `temp`
# an integer, so it stays a valid slice bound under true division too.
temp = np.size(train_data, 0) // 5
cv_data = train_data[0:temp:, ::]
train_data2 = train_data[temp::, ::]

# A fixed-parameter forest (n_estimators=100, min_samples_leaf=7,
# min_samples_split=7, max_features=3) was used here before the search.
random_search = RandomizedSearchCV(forest, param_distributions=param_dist,
                                   n_iter=n_iter_search)
# Column 0 holds the label, the remaining columns are the features.
random_search.fit(train_data2[::, 1::], train_data2[::, 0])

print("Predicting...")
output_cv = random_search.predict(cv_data[::, 1::]).astype(int)
output_train = random_search.predict(train_data2[::, 1::]).astype(int)
print("Done...")

if len(train_data2) != len(output_train):
    print("something wrong")
else:
    # Accumulate the fraction of correct predictions on each set.
    temp_cv_acc += len(output_cv[output_cv == cv_data[::, 0]]) / float(len(output_cv))
    temp_train_acc += len(output_train[train_data2[::, 0] == output_train]) / float(len(output_train))

# One output line, as the two comma-joined print statements produced.
print("RF Training Accuracy: %s CV Accuracy: %s"
      % (temp_train_acc / trials, temp_cv_acc / trials))

# real test data
def _load_passenger_rows(csv_path):
    """Read a CSV with a header row.

    Returns ``(first_column_values, remaining_columns)`` where the second
    item is a numpy string array with one row per data line.
    """
    with open(csv_path, 'rb') as handle:
        reader = csv.reader(handle)
        next(reader)  # skip the header line
        rows = list(reader)
    first_column = [row[0] for row in rows]
    return first_column, np.array([row[1:] for row in rows])


def main():
    """Train a LinearSVC -> KNeighbors pipeline and write a submission CSV.

    Loads ``Data/train.csv`` and ``Data/test.csv``, encodes the categorical
    columns, standardizes the features, tunes the pipeline with a randomized
    search, reports the held-out score and writes the test predictions to
    ``pipelinearsvcknn.csv``.
    """
    _, train_data = _load_passenger_rows('Data/train.csv')

    # Encode strings as numbers. Sex: male = -1, female = 1.
    train_data[train_data[0::, 3] == 'male', 3] = -1
    train_data[train_data[0::, 3] == 'female', 3] = 1
    # Embarked: C = -1, S = 0, Q = 1 (the column is dropped again below).
    train_data[train_data[0::, 10] == 'C', 10] = -1
    train_data[train_data[0::, 10] == 'S', 10] = 0
    train_data[train_data[0::, 10] == 'Q', 10] = 1
    # The former "Survived" remapping compared the (string) sex column with
    # the integers 1 and 0. That never matched any row, and on NumPy versions
    # where the comparison collapses to a scalar False it overwrote the label
    # of row 0. It has been removed; labels keep their values from the file.

    # Missing values are marked with '0' below; the imputer is configured to
    # replace 0 by the column median.
    imp = preprocessing.Imputer(missing_values=0, strategy='median', axis=0)

    # remove the name data, cabin and ticket (and embarked)
    train_data = np.delete(train_data, [2, 7, 9, 10], 1)
    train_data[train_data == ''] = '0'
    # NOTE(review): the imputed result is discarded, so no imputation is
    # actually applied. Simply assigning it back would also "impute" genuine
    # zeros, including the label column -- decide which columns to impute.
    imp.fit_transform(train_data)

    # Split into a training set and a validation set.
    x_train, x_test, y_train, y_test = train_test_split(
        train_data[0::, 1::], train_data[0::, 0],
        test_size=0.2, random_state=0)

    # Standardise data
    scaler = preprocessing.StandardScaler().fit(x_train)
    x_train_std = scaler.transform(x_train)
    x_test_std = scaler.transform(x_test)

    # The first test column is the id written to the submission file.
    ids, test_data = _load_passenger_rows('Data/test.csv')

    # Same encoding as for the training data (the test set previously used
    # the opposite sign for sex, so the model saw inverted values).
    test_data[test_data[0::, 2] == 'male', 2] = -1
    test_data[test_data[0::, 2] == 'female', 2] = 1
    test_data[test_data[0::, 9] == 'C', 9] = -1
    test_data[test_data[0::, 9] == 'S', 9] = 0
    test_data[test_data[0::, 9] == 'Q', 9] = 1

    # remove the name data, cabin and ticket (and embarked)
    test_data = np.delete(test_data, [1, 6, 8, 9], 1)
    test_data[test_data == ''] = '0'
    # NOTE(review): as above, the imputed result is discarded.
    imp.fit_transform(test_data)

    # Standardize with the scaler fitted on the training data so that test
    # features are on the scale the model was trained on.
    test_data_std = scaler.transform(test_data)

    # The data is now ready to go. So lets train then test!
    start = time()
    print('Training estimators')
    estimators = [('linearsvc', LinearSVC()),
                  ('KNeighborsClassifier', KNeighborsClassifier())]
    clf = Pipeline(estimators)

    # specify parameters and distributions to sample from
    param_dist = {"linearsvc__C": sp_randint(1, 1000),
                  "linearsvc__loss": ["l1", "l2"],
                  "linearsvc__dual": [True],
                  "KNeighborsClassifier__n_neighbors": sp_randint(5, 100),
                  "KNeighborsClassifier__weights": ["uniform", "distance"],
                  "KNeighborsClassifier__algorithm": ["ball_tree", "kd_tree", "brute"],
                  "KNeighborsClassifier__leaf_size": sp_randint(3, 100),
                  }

    # run randomized search
    n_iter_search = 2000
    random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                       n_iter=n_iter_search, n_jobs=4, verbose=1)
    random_search.fit(x_train_std, y_train)

    print('Reporting')
    print("RandomizedSearchCV took %.2f seconds for %d candidates"
          " parameter settings." % ((time() - start), n_iter_search))
    report(random_search.grid_scores_)

    score = random_search.score(x_test_std, y_test)
    print('Test score')
    print(score)

    print('Predicting')
    output = random_search.predict(test_data_std)

    # Close the file explicitly so the submission is flushed to disk.
    with open("pipelinearsvcknn.csv", "wb") as out_file:
        writer = csv.writer(out_file)
        writer.writerow(["PassengerId", "Survived"])
        writer.writerows(zip(ids, output))
def main(date, modelType, iterations):
    """
    Determines the optimal hyperparameters for a given machine learning model
    for a set of training data.

    For each requested model a RandomizedSearchCV is fitted on the training
    data, training/testing accuracy, the confusion matrix and the selected
    hyperparameters are printed, and the best estimator is handed to
    FileIO.saveModel.

    :param date: Date the training and testing data was collected (YYYY_MMDD)
    :param modelType: (string) type of machine learning model to train; either
                      a key of MODELS, or ALL to train every model in MODELS
    :param iterations: (int) number of iterations for hyperparameter searching
    :return: (None)
    """
    # Make sure that the model is a valid choice
    if modelType not in MODELS and modelType != ALL:
        # Single-argument, parenthesised print behaves the same as a
        # Python 2 print statement and as the Python 3 print function.
        print("Invalid model type: %s" % (modelType,))
        return

    # Allow for training more than one model at a time
    if modelType == ALL:
        modelsToTrain = list(MODELS)
    else:
        modelsToTrain = [modelType]

    # Load the training and testing data into memory
    trainX, trainY = FileIO.loadTrainingData(date)
    testX, testY = FileIO.loadTestingData(date)
    # Replace NaN/inf entries with finite numbers before fitting
    trainX = np.nan_to_num(trainX)
    testX = np.nan_to_num(testX)

    # A separate loop name keeps the `modelType` argument intact.
    for currentType in modelsToTrain:
        # Train the desired ML model
        name, clfType = MODELS[currentType]
        searchSpace = PARAMETERS[currentType]
        print("Training the %s" % (name,))
        clf = RandomizedSearchCV(clfType(),
                                 param_distributions=searchSpace,
                                 n_iter=iterations,
                                 n_jobs=4)
        clf.fit(trainX, trainY)

        # Perform some very basic accuracy testing
        trainResult = clf.predict(trainX)
        testResult = clf.predict(testX)
        trainingAccuracy = accuracy_score(trainY, trainResult)
        testingAccuracy = accuracy_score(testY, testResult)
        confusionMatrix = confusion_matrix(testY, testResult)

        print("Training Accuracy: %s" % (trainingAccuracy,))
        print("Testing Accuracy: %s" % (testingAccuracy,))
        print("Confusion Matrix:")
        print(confusionMatrix)
        print(" ")
        print("Hyperparameters:")
        # Only report the parameters that were actually searched over.
        bestParams = clf.best_estimator_.get_params()
        for param in searchSpace:
            print("%s: %s" % (param, bestParams[param]))
        print(" ")

        # Save the model to disk
        FileIO.saveModel(clf.best_estimator_, currentType, date)
parameter_space_bal = { 'kernel': ['linear', 'rbf', 'poly'], 'gamma': ['auto', 1e-3, 1e-4], 'C': [0.01, .1, 1, 10, 100, 1000], 'class_weight': [None]} print("Building balanced SVM") SVM_bal = RandomizedSearchCV(SVC(C=1), parameter_space_bal, cv=10, scoring='recall_weighted', iid=True) print("fitting balanced SVM") SVM_bal.fit(xbaltrain, ybaltrain) print("Hyperparameters for balanced SVM found:") print(SVM_bal.best_params_) print("getting predictions for balanced SVM") y_pred_svm_bal = SVM_bal.predict(xtest) print("\n\n results for SVM") winfault.clf_scoring(ytest, y_pred_svm_bal, labels) print("========================================================") print("------Building models using Imbalanced training data------") print("========================================================") parameter_space = { 'kernel': ['linear', 'rbf', 'poly'], 'gamma': ['auto', 1e-3, 1e-4], 'C': [0.01, .1, 1, 10, 100, 1000], 'class_weight': [ {0: 0.01}, {1: 1}, {1: 2}, {1: 10}, {1: 50}, 'balanced']} print("Building Imbalanced SVM") SVM = RandomizedSearchCV(SVC(C=1), parameter_space, cv=10,
#2 svc = SVC() svc_param_dist = {"C": uniform(), "gamma": uniform(), "kernel": ['linear', 'rbf'], "class_weight": [{1: 1}, {1: 2}, {1: 5}, {1: 10}], "probability": [True] } #params = [{'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], 'gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7], # 'kernel': ['linear'], 'class_weight': [{1: 1}, {1: 5}, {1: 2}, {1: 3}, {1: 10}]}] #clf2 = GridSearchCV(svc, param_grid=params, scoring='roc_auc', verbose=True, cv=5, n_jobs=-1) clf2 = RandomizedSearchCV(svc, param_distributions=svc_param_dist, n_iter=100) clf2.fit(X_train_2, y_train_2) clf_2_x_val_predictions = clf2.predict(X_test) class_rep_2 = classification_report(y_test, clf_2_x_val_predictions) print clf2.best_params_ print class_rep_2 #3 gbc = GradientBoostingClassifier() forest_param_dist = {"max_depth": [3,4,5,6,7], "max_features": sp_randint(1, 11), "min_samples_split": sp_randint(1, 11), "min_samples_leaf": sp_randint(1, 11), "subsample": uniform(), "learning_rate": uniform(), "n_estimators": sp_randint(1, 351)} clf3 = RandomizedSearchCV(gbc, param_distributions=forest_param_dist, n_iter=100)
cv_data = train_data[0:temp:, ::] train_data2 = train_data[temp::, ::] # # forest = RandomForestClassifier(n_estimators= 100, bootstrap = True, min_samples_leaf = 7, min_samples_split = 7, # criterion = 'gini', max_features = 3, max_depth= None) # forest = forest.fit(train_data2[::,1::], train_data2[::, 0]) random_search = RandomizedSearchCV(forest, param_distributions=param_dist, n_iter=n_iter_search) random_search.fit(train_data2[::, 1::], train_data2[::, 0]) print "Predicting..." # output_cv = forest.predict(cv_data[::,1::]).astype(int) # output_train = forest.predict(train_data2[::,1::]).astype(int) output_cv = random_search.predict(cv_data[::, 1::]).astype(int) output_train = random_search.predict(train_data2[::, 1::]).astype(int) print "Done..." if (len(train_data2) != len(output_train)): print "something wrong" else: temp_cv_acc += len(output_cv[output_cv == cv_data[::, 0]]) / float( len(output_cv)) temp_train_acc += len( output_train[train_data2[::, 0] == output_train]) / float( len(output_train)) print "RF Training Accuracy:", temp_train_acc / trials, print "CV Accuracy:", temp_cv_acc / trials # real test data test_data = test_df.values
# Kernel matrix of X against itself; it is the input of the grid search below.
K_X = Quasi_linear_kernel(X,X)
# The SVC receives kernel matrices instead of raw features.
clf = svm.SVC(kernel='precomputed')
# y_pred = clf.predict(K_test)

# run randomized search
n_iter_search = 40
random_search = RandomizedSearchCV(clf, param_distributions=QL_SVM_param_dist,
                                   n_iter=n_iter_search,cv=skf)
start = time()
random_search.fit(K_train, y_train)
print("Quasi_linear kernel SVM RandomSearch took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
# Written for the Python 2 print statement: the message and then the
# estimator are printed. Under the Python 3 print function the expression
# after the comma would not be printed.
print("Random_search Best estimator is :\n"), random_search.best_estimator_
report(random_search.grid_scores_,n_top=5)

# print the classification_report
y_test, y_pred = y_test, random_search.predict(K_test) #Call predict on the estimator with the best found parameters.
print(classification_report(y_test, y_pred))
print()

# run grid search
# NOTE(review): QL_SVM_param_dist is reused here as param_grid -- confirm it
# contains only explicit value lists, not sampling distributions.
grid_search = GridSearchCV(clf, param_grid=QL_SVM_param_dist, cv=skf)
start = time()
# NOTE(review): this search is fitted on K_X / Y while the randomized search
# above used K_train / y_train, and both predict on K_test -- confirm that
# K_test has the right shape for a model fitted on K_X.
grid_search.fit(K_X, Y)
print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
      % (time() - start, len(grid_search.grid_scores_)))
# Same Python 2 print-statement form as above.
print("Grid_search Best estimator is :\n"), grid_search.best_estimator_
report(grid_search.grid_scores_,n_top=10)

# print the classification_report
y_test, y_pred = y_test, grid_search.predict(K_test)
print(classification_report(y_test, y_pred))
sys.stderr.write("\n:>> Model selected: %s\n" % (rs.best_params_)) except: try: num_lines = sum(1 for line in open("svr_%s_%s_H%s_%s_m%s.out" % (corpus, representation, dimensions, op, min_count), "r")) except IOError: num_lines = 0 y_out = {} y_out['estimated_output'] = range(0,len(y)) y_out['best_params'] = "Non converged model..." y_out['learned_model'] = "Nonconverged_model_%d" % num_lines y_out['performance'] = 0.0 with open("svr_%s_%s_H%s_%s_m%s.out" % (corpus, representation, dimensions, op, min_count), "a") as f: f.write(str(y_out)+'\n') continue f_x = rs.predict(X).tolist() sys.stderr.write("\n:>> R2: %s\n" % (r2_score(y, f_x))) try: num_lines = sum(1 for line in open("svr_%s_%s_H%s_%s_m%s.out" % (corpus, representation, dimensions, op, min_count), "r")) except IOError: num_lines = 0 y_out = {} if args.t: y_out['estimated_output'] = map(detener, f_x) else: y_out['estimated_output'] = f_x y_out['best_params'] = rs.best_params_ y_out['learned_model'] = {'file': "/almac/ignacio/data/svr_models/%s_%s_%s_%s_H%s_%s_m%s.model" % (svr_, corpus, num_lines, representation, dimensions, op, min_count) } if args.t:
param_dist = { "pca__n_components": sp_randint(10, 700), "rbfSVM__C": scipy.stats.expon(scale=10), "rbfSVM__kernel": ["rbf"], "rbfSVM__gamma": scipy.stats.expon(scale=0.01) } n_iter_search = 500 random_search = RandomizedSearchCV(pipe, param_distributions=param_dist, n_iter=n_iter_search, cv=cv, verbose=6, n_jobs=4) random_search.fit(X_train, Y_train) predicted_held_out = random_search.predict(X_test) mmat = confusion_matrix(predicted_held_out, Y_test) print mmat class_map = dict(zip(set(input_kmers_counts["class"]), range(0, 4))) kappa([class_map[x] for x in Y_test], [class_map[x] for x in predicted_held_out]) # We determine whether the variance of the number of components for the best CV all_scores = random_search.grid_scores_ all_scores.sort(key=lambda x: x.mean_validation_score) with open("random_search_scores_1000iter_5mers.bdat", "w") as f: cPickle.dump(all_scores, f) # We try the best parameters on indepedent samples X_train, X_test, Y_train, Y_test = train_test_split( normalized_counts[kmer_colums],
forest = RandomForestClassifier(n_estimators=25) # run randomized search n_iter_search = 30 random_search = RandomizedSearchCV(forest, param_distributions=param_dist, n_iter=n_iter_search, cv=6) start = time() random_search.fit(train_data2[::, 1::], train_data2[::, 0]) print( "RandomizedSearchCV took %.2f seconds for %d candidates" " parameter settings." % ((time() - start), n_iter_search)) report(random_search.grid_scores_) train_output = random_search.predict(train_data2[::, 1::]) cv_output = random_search.predict(cv_data[::, 1::]) print "Training set accuracy: %.3f CV set accuracy: %.3f"\ %(len(train_data2[train_output == train_data2[::,0]])/float(len(train_data2)), (len(cv_data[cv_output == cv_data[::,0]])/float(len(cv_data)))) # Analyzing important features forest = random_search.best_estimator_ feature_importance = forest.feature_importances_ # make importances relative to max importance feature_importance = 100.0 * (feature_importance / feature_importance.max()) feature_list = df.columns.values df.drop(feature_list[feature_importance < 10], inplace=True, axis=1) test_df.drop(feature_list[feature_importance < 10], inplace=True, axis=1)
"max_features": list(range(1,X_train.shape[1]+1)), "min_samples_split": list(range(1, 10)), "min_samples_leaf": list(range(1, 10))} random_search = RandomizedSearchCV(clf, param_distributions=param_grid, n_jobs=3, n_iter=10000, verbose=1) print("Finding best hyperparameters for randomized tree") random_search.fit(X_train,Y) #random_search.fit(X_train_train,Y_train) # Deal with results print("Best parameters are:") print(random_search.best_params_) #score = random_search.score(X_train_test,Y_test) #print("Score = {}".format(score)) best_clf = random_search.best_estimator_ Y_test = random_search.predict(X_test) result = np.hstack([np.expand_dims(ID_test,axis=1),np.expand_dims(Y_test,axis=1)]) # Write results to file with open("predict.csv","w") as outfile: outfile.write("PassengerId,Survived\n") for i in range(len(result)): outfile.write("{},{}\n".format(result[i,0],result[i,1])) fig = plt.figure(figsize=(16,9)) gs = gridspec.GridSpec(1,1,left=0.05,right=0.98,bottom=0.17,top=0.98) ax = fig.add_subplot(gs[0,0],xlabel="Features",ylabel="Importance") xpos = np.arange(len(labels)) width = 0.9 ax.bar(xpos,best_clf.feature_importances_,width=width) ax.set_xticks(xpos+(width/2.))
def parameter_tuning(Xn, yn, scale=1):
    """Search SVC hyperparameters on (Xn, yn) and print the best model found.

    Depending on the module-level flags RBF and LINEAR, an RBF-kernel SVC is
    tuned with RandomizedSearchCV (30 sampled candidates) and/or a
    linear-kernel SVC is tuned with GridSearchCV. Both use the same
    3-fold stratified, shuffled split and 'accuracy' scoring. For each
    search the best estimator, its best score and the confusion matrix of
    its predictions on the data it was fitted on are printed.

    :param Xn: feature matrix
    :param yn: target labels, also used to build the stratified folds
    :param scale: 1 scales Xn with preprocessing.scale(with_mean=True),
                  2 with with_mean=False; any other value leaves Xn as is
    :return: None
    """
    def _print_search_summary(search):
        # Report the winning configuration of one fitted search.
        print("Best parameters set found on development set:")
        # print("") yields one empty line under both the Python 2 print
        # statement and the Python 3 print function. The previous mix of
        # bare `print` and `print()` emitted "()" or nothing, depending on
        # the interpreter.
        print("")
        print(search.best_estimator_)
        print(search.best_score_)
        print("")
        # Confusion matrix on the fitting data itself (not held-out data).
        print(confusion_matrix(yn, search.predict(Xn)))

    # Show the shapes of the incoming data
    print(Xn.shape)
    print(yn.shape)

    # FEATURE SCALING
    if scale == 1:
        Xn = preprocessing.scale(Xn, with_mean=True)
        print('NORMALIZING')
    elif scale == 2:
        Xn = preprocessing.scale(Xn, with_mean=False)
        print('NORMALIZING')

    # Search space sampled by the randomized RBF search
    rbf_parameters = {
        'kernel': ['rbf'],
        'C': np.logspace(-2, 7, 10),
        'gamma': np.logspace(-4, 2, 7),
    }
    # Grid enumerated exhaustively by the linear search
    linear_parameters = [{'kernel': ['linear'], 'C': np.logspace(-3, 4, 8)}]

    cv = cross_validation.StratifiedKFold(yn, shuffle=True, n_folds=3,
                                          random_state=42)

    if RBF:
        clf = RandomizedSearchCV(estimator=SVC(C=1, cache_size=1000),
                                 param_distributions=rbf_parameters,
                                 cv=cv, scoring='accuracy', n_iter=30,
                                 verbose=1, n_jobs=2).fit(Xn, yn)
        _print_search_summary(clf)

    if LINEAR:
        clf = GridSearchCV(estimator=SVC(C=1, cache_size=1000),
                           param_grid=linear_parameters,
                           cv=cv, scoring='accuracy',
                           verbose=1, n_jobs=2).fit(Xn, yn)
        _print_search_summary(clf)
clf = RandomizedSearchCV(RandomForestClassifier(), param_distributions=param_grid, n_iter=n_iter_search, cv=3, scoring='accuracy', n_jobs=-1) clf.fit(X_train, y_train) print("Best parameters set found on development set:") print(clf.best_params_) print() #print("Detailed classification report:") #print() print("The model is trained on the full development set.") print("The scores are computed on the full evaluation set.") print() y_true, y_pred = y_test, clf.predict(X_test) print(classification_report(y_true, y_pred)) print() print(confusion_matrix(y_true, y_pred)) print(best_score ,clf.best_score_) if i == 1: break else: best_score = clf.best_score_ # remove some features rfecv = RFECV(estimator=clf.best_estimator_, step=1, cv=2, scoring='accuracy') rfecv.fit(X_train, y_train) print("Optimal number of features : %d" % rfecv.n_features_) X_train = rfecv.transform(X_train)
def main():
    """Tune a LinearSVC -> KNeighborsClassifier pipeline and write predictions.

    Reads 'Data/train.csv' and 'Data/test.csv', recodes the categorical
    columns as numbers, drops the free-text columns, standardises the
    features, runs a RandomizedSearchCV over the pipeline, prints the search
    report and the hold-out score, and writes the test-set predictions to
    'pipelinearsvcknn.csv' as (PassengerId, Survived) rows.
    """
    # Load the training csv file. The first line is a header and is skipped;
    # the first field of every row is dropped.
    train_data = []
    with open('Data/train.csv', 'rb') as train_file:
        csv_file_object = csv.reader(train_file)
        next(csv_file_object)
        for row in csv_file_object:
            train_data.append(row[1:])
    train_data = np.array(train_data)  # array of strings

    # Convert the string categories to numeric codes.
    # Sex: male = -1, female = 1
    train_data[train_data[0::, 3] == 'male', 3] = -1
    train_data[train_data[0::, 3] == 'female', 3] = 1
    # Embarked: C = -1, S = 0, Q = 1 (this column is deleted further down)
    train_data[train_data[0::, 10] == 'C', 10] = -1
    train_data[train_data[0::, 10] == 'S', 10] = 0
    train_data[train_data[0::, 10] == 'Q', 10] = 1
    # Survived
    # NOTE(review): column 3 holds strings at this point, so these integer
    # comparisons presumably never match a row by value (the exact effect
    # depends on the NumPy version). The intended label remapping is unclear;
    # confirm before changing.
    train_data[train_data[0::, 3] == 1, 0] = 1
    train_data[train_data[0::, 3] == 0, 0] = -1

    #I need to fill in the gaps of the data and make it complete.
    #So where there is no price, I will assume price on median of that class
    #Where there is no age I will give median of all ages
    imp = preprocessing.Imputer(missing_values=0, strategy='median', axis=0)
    #All the ages with no data make the median of the data
    #train_data[train_data[0::,4] == '',4] = np.median(train_data[train_data[0::,4]\
    #                                           != '',4].astype(np.float))
    #All missing embarks just make them embark from most common place
    #train_data[train_data[0::,10] == '',10] = np.round(np.mean(train_data[train_data[0::,10]\
    #                                                   != '',10].astype(np.float)))

    # Remove the name, ticket, cabin and embarked columns.
    train_data = np.delete(train_data, [2, 7, 9, 10], 1)
    train_data[train_data == ''] = '0'
    # NOTE(review): the array returned by fit_transform is not assigned, so
    # train_data itself is not replaced by the imputed values.
    imp.fit_transform(train_data)

    # Split into a training part and a 20% hold-out part; column 0 is the
    # target, the remaining columns are the features.
    x_train, x_test, y_train, y_test = train_test_split(train_data[0::, 1::],
                                                        train_data[0::, 0],
                                                        test_size=0.2,
                                                        random_state=0)
    # Standardise with statistics taken from the training part only.
    scaler = preprocessing.StandardScaler().fit(x_train)
    x_train_std = scaler.transform(x_train)
    x_test_std = scaler.transform(x_test)

    # Load the test csv file the same way, keeping the first field of each
    # row as the passenger id for the submission.
    test_data = []
    ids = []
    with open('Data/test.csv', 'rb') as test_file:
        test_file_object = csv.reader(test_file)
        next(test_file_object)
        for row in test_file_object:
            ids.append(row[0])
            test_data.append(row[1:])
    test_data = np.array(test_data)

    # Sex must use the SAME codes as the training data (male = -1,
    # female = 1). The codes used to be swapped here, so the fitted model
    # saw this feature with the opposite sign at prediction time.
    test_data[test_data[0::, 2] == 'male', 2] = -1
    test_data[test_data[0::, 2] == 'female', 2] = 1
    # Embarked: C = -1, S = 0, Q = 1 (this column is deleted further down).
    # Note this ordinal coding is not ideal: the numeric distances between
    # the codes carry no meaning.
    test_data[test_data[0::, 9] == 'C', 9] = -1
    test_data[test_data[0::, 9] == 'S', 9] = 0
    test_data[test_data[0::, 9] == 'Q', 9] = 1
    #All the ages with no data make the median of the data
    #test_data[test_data[0::,3] == '',3] = np.median(test_data[test_data[0::,3]\
    #                                           != '',3].astype(np.float))
    #All missing embarks just make them embark from most common place
    #test_data[test_data[0::,9] == '',9] = np.round(np.mean(test_data[test_data[0::,9]\
    #                                                   != '',9].astype(np.float)))
    #All the missing prices assume median of their respective class
    #for i in xrange(np.size(test_data[0::,0])):
    #    if test_data[i,7] == '':
    #        test_data[i,7] = np.median(test_data[(test_data[0::,7] != '') &\
    #                                             (test_data[0::,0] == test_data[i,0])\
    #            ,7].astype(np.float))

    # Remove the name, ticket, cabin and embarked columns.
    test_data = np.delete(test_data, [1, 6, 8, 9], 1)
    test_data[test_data == ''] = '0'
    # Impute missing values
    # NOTE(review): as above, the returned array is not assigned.
    imp.fit_transform(test_data)
    # Standardise
    # NOTE(review): the test set is standardised with its own statistics
    # rather than with `scaler` fitted on the training part -- confirm this
    # is intended.
    scaler_test = preprocessing.StandardScaler().fit(test_data)
    test_data_std = scaler_test.transform(test_data)

    #The data is now ready to go. So lets train then test!
    start = time()
    print('Training estimators')
    # NOTE(review): LinearSVC is a non-final pipeline step here, which
    # requires it to act as a transformer -- confirm that the installed
    # scikit-learn supports this.
    estimators = [('linearsvc', LinearSVC()),
                  ('KNeighborsClassifier', KNeighborsClassifier())]
    clf = Pipeline(estimators)

    # specify parameters and distributions to sample from
    param_dist = {
        "linearsvc__C": sp_randint(1, 1000),
        "linearsvc__loss": ["l1", "l2"],
        "linearsvc__dual": [True],
        "KNeighborsClassifier__n_neighbors": sp_randint(5, 100),
        "KNeighborsClassifier__weights": ["uniform", "distance"],
        "KNeighborsClassifier__algorithm": ["ball_tree", "kd_tree", "brute"],
        "KNeighborsClassifier__leaf_size": sp_randint(3, 100),
    }

    # run randomized search
    n_iter_search = 2000
    random_search = RandomizedSearchCV(clf,
                                       param_distributions=param_dist,
                                       n_iter=n_iter_search,
                                       n_jobs=4,
                                       verbose=1)
    random_search.fit(x_train_std, y_train)

    print('Reporting')
    print("RandomizedSearchCV took %.2f seconds for %d candidates"
          " parameter settings." % ((time() - start), n_iter_search))
    report(random_search.grid_scores_)

    # Score of the refitted best pipeline on the 20% hold-out part.
    score = random_search.score(x_test_std, y_test)
    print('Test score')
    print(score)

    print('Predicting')
    output = random_search.predict(test_data_std)
    # The context manager guarantees the submission file is flushed and
    # closed once all rows are written.
    with open("pipelinearsvcknn.csv", "wb") as out_file:
        open_file_object = csv.writer(out_file)
        open_file_object.writerow(["PassengerId", "Survived"])
        open_file_object.writerows(zip(ids, output))
# Report the best cross-validated score and the parameters that achieved it
print(rand.best_score_)
print(rand.best_params_)


# ### Making predictions for new data

# define X_new as the ingredient text
X_new = new.ingredients_str

# print the best model found by RandomizedSearchCV
# (a bare expression: it is only displayed when run as a notebook cell)
rand.best_estimator_

# RandomizedSearchCV/GridSearchCV automatically refit the best model with the entire dataset, and can be used to make predictions
new_pred_class_rand = rand.predict(X_new)
# bare expression again: shows the predictions in a notebook, no effect in a script
new_pred_class_rand

# create a submission file (score: 0.75342)
# one row per id, with 'id' as the index column and 'cuisine' as the prediction
pd.DataFrame({'id':new.id, 'cuisine':new_pred_class_rand}).set_index('id').to_csv('sub3.csv')


# ## Part 5: Adding features to a document-term matrix (using SciPy)
#
# - So far, we've trained models on either the **document-term matrix** or the **manually created features**, but not both.
# - To train a model on both types of features, we need to **combine them into a single feature matrix**.
# - Because one of the matrices is **sparse** and the other is **dense**, the easiest way to combine them is by using SciPy.

# create a document-term matrix from all of the training data
X_dtm = vect.fit_transform(X)
# In[ ]: print(model_svm.grid_scores_) # In[ ]: print(model_svm.best_params_) # In[ ]: y_pred= model_svm.predict(X_test) print(metrics.accuracy_score(y_pred,y_test)) # In[ ]: confusion_matrix=metrics.confusion_matrix(y_test,y_pred) confusion_matrix # In[ ]: auc_roc=metrics.classification_report(y_test,y_pred) auc_roc # In[ ]:
param_distributions=param_dister, n_iter=n_iter_search, n_jobs=2) start = time() random_search.fit(X, y) print("RandomizedSearchCV took %.2f s for %d candidates" " parameter settings." % ((time() - start), n_iter_search)) report(random_search.grid_scores_) # Load the testing data test_mat = genfromtxt(TRAINING_INPUT_DIRECTORY + '/testing_matrix.csv', delimiter=',') test_y = test_mat[:, 0] test_x = test_mat[:, 1:] y_true, y_pred = test_y, random_search.predict(test_x) print("Raw metirc result :") print(classification_report(y_true, y_pred)) print('Accuracy : ' + str(accuracy_score(y_true, y_pred)) + '\n') mod_y_pred = list(map(lambda x: x if x == 1 else -1, y_pred)) mod_y_true = list(map(lambda x: x if x == 1 else -1, y_true)) print("More reasonable metirc result : ") print(classification_report(mod_y_true, mod_y_pred)) print('Accuracy : ' + str(accuracy_score(mod_y_true, mod_y_pred)) + '\n')