def train(model_id, train_x, train_y, valid_x, valid_y, test_x):
    """Tune a random forest with a randomized search and save its predictions.

    The training rows are shuffled, 50 sampled random-forest configurations
    are compared with 10-fold cross-validated ROC AUC, and the fitted search
    object is used to predict the probability of column 1 for the validation
    and test sets.  Both prediction vectors are written with
    ``data.saveData``.

    Args:
        model_id: Identifier embedded in the two output file names.
        train_x, train_y: Training features and labels.
        valid_x, valid_y: Hold-out features and labels used for the printed
            ROC AUC.
        test_x: Features of the set to predict.

    Returns:
        None.  Results are printed and written to ``./valid_results`` and
        ``./results``.

    Note:
        ``valid_id`` and ``test_id`` are not parameters; they are read from
        the enclosing module scope and must be defined there.
    """
    train_x, train_y = shuffle(train_x, train_y)

    param_dist = {
        "n_estimators": sp_randint(100, 300),
        "criterion": ["gini"],
        # Search dimensions that are currently switched off:
        # "max_depth": sp_randint(3, 10000),
        # "min_samples_split": sp_randint(1, 300),
        # "min_samples_leaf": sp_randint(1, 300),
        "max_features": sp_randint(10, 26),
        "bootstrap": [True, False],
        'random_state': sp_randint(1, 1000000),
    }
    rf = RandomForestClassifier(n_jobs=8)
    clf = RandomizedSearchCV(rf, param_distributions=param_dist,
                             n_iter=50, cv=10, scoring='roc_auc')
    clf.fit(train_x, train_y)

    valid_predictions = clf.predict_proba(valid_x)[:, 1]
    test_predictions = clf.predict_proba(test_x)[:, 1]

    # The value printed under the 'loss:' label is a ROC AUC (higher is better).
    loss = roc_auc_score(valid_y, valid_predictions)
    print('loss:')
    print(loss)
    print(clf.best_estimator_)

    data.saveData(valid_id, valid_predictions,
                  "./valid_results/valid_" + str(model_id) + ".csv")
    data.saveData(test_id, test_predictions,
                  "./results/results_" + str(model_id) + ".csv")
def trainClassifier(X, y, testTweetsAll, ensembleTweets, ensembleSentiment):
    """Tune a random forest on ``X``/``y`` and predict two other data sets.

    A third of ``X``/``y`` is held out (fixed seed 8).  The remainder is used
    by a 10-candidate ``RandomizedSearchCV`` with 3-fold cross-validation,
    which refits the best configuration on all of its training data.

    Returns:
        ``(testPredictions, ensemblePredictions)``: two lists holding, for
        every row of ``testTweetsAll`` and ``ensembleTweets``, the value in
        column 1 of ``predict_proba``.
    """
    fit_X, holdout_X, fit_y, holdout_y = train_test_split(
        X, y, test_size=0.33, random_state=8)

    forest = RandomForestClassifier(n_estimators=20, n_jobs=-1)

    # Hyperparameter space sampled by the search below.
    search_space = {
        'max_features': ['sqrt', 'log2', None, .01, .1, .2, .3],
        'criterion': ['gini', 'entropy'],
        'min_samples_leaf': [1],
        'min_samples_split': scipy.stats.randint(2, 30),
        'bootstrap': [True, False],
    }

    # All cores, a failing candidate scores 0 instead of aborting the search,
    # ten sampled candidates, 3-fold CV, refit on the full training part.
    search = RandomizedSearchCV(forest, search_space, n_jobs=-1,
                                error_score=0, n_iter=10, refit=True, cv=3)

    print('shape of this training data set:')
    print(fit_X.shape)

    search.fit(fit_X, fit_y)

    print('the best hyperparameters from this search are:')
    print(search.best_params_)
    print('best score from hyperparameter search is: ' +
          str(search.best_score_))
    print('score on the holdout portion of the training set: ' +
          str(search.score(holdout_X, holdout_y)))
    print('score on the ensemble data: ' +
          str(search.score(ensembleTweets, ensembleSentiment)) + '\n\n')

    test_proba = search.predict_proba(testTweetsAll)
    ensemble_proba = search.predict_proba(ensembleTweets)

    # predict_proba returns one column per class; only column 1 is kept.
    testPredictions = [row[1] for row in test_proba]
    ensemblePredictions = [row[1] for row in ensemble_proba]
    return testPredictions, ensemblePredictions
def random_search(): from time import time from scipy.stats import randint as sp_randint from sklearn.grid_search import RandomizedSearchCV crimes = np.load(DATA_FILE) param_dist = { 'n_estimators': sp_randint(1, 150), "criterion": ["gini", "entropy"], 'max_depth': sp_randint(1, 40), "min_samples_split": sp_randint(2, 15), "min_samples_leaf": sp_randint(1, 10), "max_features": ['auto', 'sqrt', 'log2', None] } model = RandomForestClassifier(min_weight_fraction_leaf=0.0, max_leaf_nodes=None, bootstrap=True, oob_score=False, n_jobs=4, random_state=42, verbose=0, warm_start=False, class_weight=None) n_iter_search = 40 random_searcher = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=n_iter_search, random_state=42) start = time() random_searcher.fit(crimes['features_train'], crimes['labels_train'].ravel()) print( "RandomizedSearchCV took %.2f seconds for %d candidates" " parameter settings." % ((time() - start), n_iter_search)) report(random_searcher.grid_scores_) loss_train = log_loss( crimes['labels_train'], random_searcher.predict_proba(crimes['features_train'])) loss_val = log_loss(crimes['labels_val'], random_searcher.predict_proba(crimes['features_val'])) loss_all = log_loss(crimes['labels'], random_searcher.predict_proba(crimes['features'])) print 'loss_all: ', loss_all print 'loss_train: ', loss_train print 'loss_val: ', loss_val return loss_val
def trainClassifier(X, y, testTweetsAll, ensembleTweets, ensembleSentiment):
    """Search random-forest hyperparameters and score two extra data sets.

    ``X``/``y`` are split 67/33 with a fixed seed.  The larger part feeds a
    ``RandomizedSearchCV`` (10 candidates, 3-fold CV, refit enabled); the
    smaller part and the ensemble data are only used for printed scores.

    Returns:
        A pair of lists: column 1 of ``predict_proba`` for every row of
        ``testTweetsAll`` and of ``ensembleTweets`` respectively.
    """

    def positive_class(probabilities):
        # Keep only column 1 of each per-class probability row.
        return [row[1] for row in probabilities]

    train_part, holdout_part, train_target, holdout_target = train_test_split(
        X, y, test_size=0.33, random_state=8)

    base_forest = RandomForestClassifier(n_estimators=20, n_jobs=-1)

    candidates = {
        'max_features': ['sqrt', 'log2', None, .01, .1, .2, .3],
        'criterion': ['gini', 'entropy'],
        'min_samples_leaf': [1],
        'min_samples_split': scipy.stats.randint(2, 30),
        'bootstrap': [True, False],
    }

    # Runs on all cores; a candidate that errors is scored 0 rather than
    # stopping the search; the winner is refitted on all of train_part.
    searchCV = RandomizedSearchCV(
        base_forest,
        candidates,
        n_jobs=-1,
        error_score=0,
        n_iter=10,
        refit=True,
        cv=3,
    )

    print('shape of this training data set:')
    print(train_part.shape)

    searchCV.fit(train_part, train_target)

    print('the best hyperparameters from this search are:')
    print(searchCV.best_params_)

    best_line = 'best score from hyperparameter search is: ' + str(searchCV.best_score_)
    print(best_line)
    holdout_line = 'score on the holdout portion of the training set: ' + str(
        searchCV.score(holdout_part, holdout_target))
    print(holdout_line)
    ensemble_line = 'score on the ensemble data: ' + str(
        searchCV.score(ensembleTweets, ensembleSentiment)) + '\n\n'
    print(ensemble_line)

    raw_test = searchCV.predict_proba(testTweetsAll)
    raw_ensemble = searchCV.predict_proba(ensembleTweets)

    return positive_class(raw_test), positive_class(raw_ensemble)
def random_search(): from time import time from scipy.stats import uniform as sp_uniform, randint as sp_randint from sklearn.grid_search import RandomizedSearchCV from sklearn.cross_validation import ShuffleSplit crimes = np.load(DATA_FILE) # features_train = crimes['features_train'] all_labels = sorted(list(set(np.unique(crimes['labels_train'])) | set(np.unique(crimes['labels_val'])))) batch_size = 64 labels_train = create_labels(crimes['labels_train'], all_labels) labels_vals = create_labels(crimes['labels_val'], all_labels) labels_full = create_labels(crimes['labels'], all_labels) param_dist = {'layers': sp_randint(1, 3), "hidden_units": [64, 128, 256], 'input_dropout': sp_uniform(0, 0.5), "hidden_dropout": sp_uniform(0, 0.75), "learning_rate": sp_uniform(0.01, 0.1), "weight_decay": sp_uniform(0, 0.01) } model = NeuralNetworkClassifier(n_classes=len(all_labels), batch_size=batch_size, valid_set=(crimes['features_val'], labels_vals)) n_iter_search = 40 np.random.seed(42) random_searcher = RandomizedSearchCV(model, param_distributions=param_dist, scoring=None, n_iter=n_iter_search, random_state=42, error_score=100, verbose=5, cv=ShuffleSplit(n=crimes['features_train'].shape[0], n_iter=1, test_size=0)) start = time() random_searcher.fit(crimes['features_train'], labels_train.ravel()) print("RandomizedSearchCV took %.2f seconds for %d candidates" " parameter settings." % ((time() - start), n_iter_search)) report(random_searcher.grid_scores_) loss_train = log_loss(labels_train, random_searcher.predict_proba(crimes['features_train'])) loss_val = log_loss(labels_vals, random_searcher.predict_proba(crimes['features_val'])) loss_all = log_loss(labels_full, random_searcher.predict_proba(crimes['features'])) print 'loss_all: ', loss_all print 'loss_train: ', loss_train print 'loss_val: ', loss_val return loss_val
def random_search(): from time import time from scipy.stats import randint as sp_randint from sklearn.grid_search import RandomizedSearchCV crimes = np.load(DATA_FILE) param_dist = {'n_estimators': sp_randint(1, 150), "criterion": ["gini", "entropy"], 'max_depth': sp_randint(1, 40), "min_samples_split": sp_randint(2, 15), "min_samples_leaf": sp_randint(1, 10), "max_features": ['auto', 'sqrt', 'log2', None] } model = RandomForestClassifier(min_weight_fraction_leaf=0.0, max_leaf_nodes=None, bootstrap=True, oob_score=False, n_jobs=4, random_state=42, verbose=0, warm_start=False, class_weight=None) n_iter_search = 40 random_searcher = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=n_iter_search, random_state=42) start = time() random_searcher.fit(crimes['features_train'], crimes['labels_train'].ravel()) print("RandomizedSearchCV took %.2f seconds for %d candidates" " parameter settings." % ((time() - start), n_iter_search)) report(random_searcher.grid_scores_) loss_train = log_loss(crimes['labels_train'], random_searcher.predict_proba(crimes['features_train'])) loss_val = log_loss(crimes['labels_val'], random_searcher.predict_proba(crimes['features_val'])) loss_all = log_loss(crimes['labels'], random_searcher.predict_proba(crimes['features'])) print 'loss_all: ', loss_all print 'loss_train: ', loss_train print 'loss_val: ', loss_val return loss_val
def train(model_id,train_x,train_y,valid_x,valid_y,test_x):
    """Fit a small randomized random-forest search and save good models.

    Two sampled configurations are compared with 9-fold cross-validation.
    The refitted search predicts class probabilities for the validation and
    test sets; ``test(valid_y, valid_predictions, True)`` supplies the loss.
    When the loss is below 10.438 the model is printed and both probability
    matrices are written with ``data.saveData``.

    Returns None.
    """
    train_x, train_y = shuffle(train_x, train_y)
    random_state = random.randint(0, 1000000)
    print('random state: {state}'.format(state=random_state))

    # Distributions the search samples from.
    search_space = {
        "n_estimators": sp_randint(20, 40),
        "criterion": ["gini", "entropy"],
        "max_depth": sp_randint(3, 10000),
        "min_samples_split": sp_randint(1, 30),
        "min_samples_leaf": sp_randint(1, 30),
        "max_features": sp_randint(1, 93),
        "bootstrap": [True, False],
        'random_state': sp_randint(1, 1000000),
    }

    forest = RandomForestClassifier(n_jobs=8)
    search = RandomizedSearchCV(forest, param_distributions=search_space,
                                n_iter=2, cv=9, n_jobs=3)
    search.fit(train_x, train_y)

    valid_predictions = search.predict_proba(valid_x)
    test_predictions = search.predict_proba(test_x)

    loss = test(valid_y, valid_predictions, True)
    if loss >= 10.438:
        return

    output = [loss, search.best_estimator_]
    # Python 2 idiom: the trailing comma keeps the next print on the same line.
    print("model[\""+str(model_id)+"\"]="),
    print(output)

    # NOTE(review): the source lost its indentation; the two saves are assumed
    # to belong to the loss-threshold branch - confirm against the original.
    data.saveData(valid_predictions, "../valid_results/valid_"+str(model_id)+".csv")
    data.saveData(test_predictions, "../results/results_"+str(model_id)+".csv")
def randomsearch_xgboost(df):
    """Tune an ``XGBClassifier`` by randomized search and predict ``test_X``.

    Twenty candidates are scored with 10-fold cross-validated log loss.

    Args:
        df: Not used by this function.  The data are read from the
            module-level names ``train_X``, ``train_y`` and ``test_X``.

    Returns:
        Column 1 of ``predict_proba(test_X)`` from the refitted search.
    """
    # scipy's uniform(loc, scale) covers [0.25, 1.0] for both fractions.
    search_space = {
        'max_depth': sp.stats.randint(1, 11),
        'subsample': sp.stats.uniform(0.25, 0.75),
        'colsample_bytree': sp.stats.uniform(0.25, 0.75),
    }

    search = RandomizedSearchCV(
        XGBClassifier(),
        search_space,
        cv=10,
        n_iter=20,
        scoring="log_loss",
        n_jobs=1,
        verbose=2,
    )
    # The first row of the transposed label array is used as the target.
    search.fit(train_X, train_y.transpose()[0])

    return search.predict_proba(test_X)[:, 1]
def randomsearch_xgboost(df):
    """Run a randomized hyperparameter search for XGBoost.

    Samples 20 combinations of tree depth, row subsample and column
    subsample, ranks them with 10-fold log loss, and predicts ``test_X``
    with the search object.

    Args:
        df: Accepted but never read; ``train_X``, ``train_y`` and ``test_X``
            come from module scope.

    Returns:
        The second probability column for every row of ``test_X``.
    """
    depth_choices = sp.stats.randint(1, 11)
    row_fraction = sp.stats.uniform(0.25, 0.75)
    column_fraction = sp.stats.uniform(0.25, 0.75)
    distributions = dict(max_depth=depth_choices,
                         subsample=row_fraction,
                         colsample_bytree=column_fraction)

    booster = XGBClassifier()
    tuner = RandomizedSearchCV(booster, distributions, cv=10, n_iter=20,
                               scoring="log_loss", n_jobs=1, verbose=2)

    target = train_y.transpose()[0]
    tuner.fit(train_X, target)

    probabilities = tuner.predict_proba(test_X)
    return probabilities[:, 1]
def discriminative_straight(train_questions, test_questions):
    """Train an extra-trees answer ranker and print MRR and accuracy.

    Features for both question sets are built with ``get_ensemble_dataset``
    from a tf-idf model over 'statement' and a word2vec model over 'clause'.
    Test rows are grouped in consecutive blocks of four candidates; ``corr``
    holds the index of the correct candidate within each block.

    Prints, in order, the mean reciprocal rank of the correct candidate and
    the fraction of blocks whose top-scored candidate is the correct one.

    Returns None.
    """
    tfidf, docs = get_tfidf('statement', reference)
    w2v, _ = get_w2v('clause', reference, size=100, sg=0, iter=80, alpha=0.025)
    transformer = partial(tfidf_transformer, tfidf, False)
    X, y, _ = get_ensemble_dataset(train_questions, docs, w2v, transformer)
    test_X, test_y, corr = get_ensemble_dataset(test_questions, docs, w2v,
                                                transformer)

    # A RandomizedSearchCV over GradientBoostingClassifier used to be built
    # here and was overwritten before being fitted; only the model that is
    # actually trained is kept.
    rf = ExtraTreesClassifier(500, n_jobs=-1)
    rf.fit(X, y)
    y_prob = rf.predict_proba(test_X)

    # One row per question, one column per candidate answer.
    scores = y_prob[:, 1].reshape(-1, 4)
    correct = np.asarray(corr, dtype=int)
    y_pred = scores.argmax(axis=1)

    # argsort alone yields sorted *positions*, not ranks.  Applying it twice
    # to the negated scores gives each candidate's 0-based rank when ordered
    # from most to least probable.
    ranks = (-scores).argsort(axis=1).argsort(axis=1)
    reciprocal_ranks = 1.0 / (ranks[np.arange(len(correct)), correct] + 1)

    print(np.mean(reciprocal_ranks))
    print((y_pred == correct).mean())
# Tune a linear SVC with a randomized search on the training table, report
# cross-validated ROC AUC, and write test-set probabilities to a CSV.
# `train` must already be loaded before this section runs.
test = pd.read_csv("./Desktop/schiz/concat_test/testconcat.csv")

features = list(train.columns[1:411])  # names of the training feature columns
train_features = train.iloc[:, 1:411]  # train data features (same columns)
train_label = train["Class"]  # train data labels
label = list(train["Class"])

# Mean/range normalisation.  The statistics come from the training data and
# are reused for the test data below, so the model is fitted and applied on
# the same scale.  A constant column has zero range and yields NaN/inf.
feature_mean = train_features.mean()
feature_range = train_features.max() - train_features.min()
train_features = (train_features - feature_mean) / feature_range
test_features = (test[features] - feature_mean) / feature_range

print("Preprocessing data")
tuned_parameters = param_distributions = {
    'C': expon(),
    'gamma': expon(),
    'kernel': ['linear'],
}
svc = SVC(C=0.000001, class_weight='auto', coef0=0.0, degree=3,
          kernel="linear", probability=True, random_state=None,
          shrinking=True, tol=0.000001, verbose=False)
clf = RandomizedSearchCV(svc, param_distributions=param_distributions,
                         n_iter=10000)
# This fit is what the prediction below relies on; cross_val_score works on
# clones of `clf` and repeats the whole search inside every fold.
clf.fit(train_features, label)
scores = cross_validation.cross_val_score(clf, train_features, label, cv=2,
                                          scoring='roc_auc')
print(scores)

print("Training Support Vector Machine")
print("Make predictions on the test set")
test_probs = clf.predict_proba(test_features)[:, 1]
submission = pd.DataFrame({"id": test["Id"], "probability": test_probs})
submission.to_csv("rf_xgboost_submission.csv", index=False)
# The figures printed under "Accuracy" are the ROC AUC scores computed above.
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
def XGB_Classifier(Train_DS, y, Actual_DS, Sample_DS, Grid):
    """Run either a hyperparameter search or an N-fold check for XGBoost.

    With a truthy ``Grid`` an 800-candidate ``RandomizedSearchCV`` (log loss,
    3-fold) is fitted and its report is written to
    ``file_path + "Parms_DS_XGB_4.csv"``.  Otherwise a 75-tree classifier is
    passed to ``Nfold_Cross_Valid``.  Both paths end with ``sys.exit(0)``, so
    the fit/predict/save code at the bottom is currently never reached.

    ``Sample_DS`` is not used.  ``file_path``, ``report``,
    ``Nfold_Cross_Valid`` and ``get_best_five`` come from module scope.
    """
    print("***************Starting XGB Classifier***************")
    t0 = time()

    if not Grid:
        ##------------------------------------------------------------------##
        # CV: 0.78526434774405007 (full set)
        # CV: 0.824999 (100k set - with Age set up, all dummy)
        estimator = xgb.XGBClassifier(n_estimators=75, nthread=8)
        # Alternative configuration kept for reference:
        # xgb.XGBClassifier(max_depth=6, learning_rate=0.25, n_estimators=43,
        #                   objective='multi:softprob', subsample=0.6,
        #                   colsample_bytree=0.6, seed=0)
        ##------------------------------------------------------------------##
        Nfold_score = Nfold_Cross_Valid(Train_DS, y, estimator)
        sys.exit(0)
    else:
        # Used for checking the best performance of the model over the
        # hyperparameter space below.
        print("Starting model fit with Grid Search")

        one_to_fifteen = list(range(1, 16))
        tenths = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
        search_space = {
            "n_estimators": [50],
            "max_depth": one_to_fifteen + [17, 19, 20, 40, 80],
            "min_child_weight": one_to_fifteen + [20, 40, 80],
            "subsample": tenths + [1],
            "colsample_bytree": tenths + [1],
            "silent": [True],
            "gamma": [2, 1] + tenths,
        }

        # Randomized search over the space above.
        n_iter_search = 800
        estimator = xgb.XGBClassifier(nthread=8)
        estimator = RandomizedSearchCV(estimator,
                                       param_distributions=search_space,
                                       n_iter=n_iter_search,
                                       scoring="log_loss",
                                       cv=3)
        start = time()
        estimator.fit(np.array(Train_DS), np.array(y))
        print("GridSearchCV completed")

        Parms_DS_Out = report(estimator.grid_scores_, n_top=n_iter_search)
        Parms_DS_Out.to_csv(file_path + "Parms_DS_XGB_4.csv")

        print("Best estimator found by grid search:")
        print(estimator.best_estimator_)
        sys.exit(0)

    # Everything below is unreachable while both branches above exit.
    X_train = np.array(Train_DS)
    Y_train = np.array(y)
    estimator.fit(X_train, Y_train)

    # Predict the actual data set.
    X_Actual = np.array(Actual_DS)
    pred_Actual = estimator.predict_proba(X_Actual)
    pred_Actual = get_best_five(pred_Actual, type_val=False)
    print("Actual Model predicted")

    # Save the predictions for the actual data set.
    pred_Actual.to_csv(file_path + "output/Submission_Roshan_xgb_1.csv",
                       index_label="id")

    print("***************Ending XGB Classifier***************")
    return pred_Actual
input_shape=(None, num_features), hidden_num_units=200, # number of units in hidden layer #!200-600 output_nonlinearity=lasagne.nonlinearities.softmax, # output layer output_num_units=num_classes, # 10 target values dropout_p=0.2, #!dropout 0.2-0.7 # optimization method: update=nesterov_momentum, update_learning_rate=0.01, #!0.001-0.01 update_momentum=0.9, #!0.6-0.9 regression= False, # flag to indicate we're dealing with regression problem max_epochs=500, # we want to train this many epochs verbose=1, ) random_search = RandomizedSearchCV( net1, { 'hidden_num_units': sp_randint(200, 600), "dropout_p": sp_rand(0.2, 0.7), "update_learning_rate": sp_rand(0.001, 0.01), "update_momentum": sp_rand(0.6, 0.9), }) random_search.fit(X, y) print random_search.grid_scores_ preds = random_search.predict_proba(X_test)[:, 1] submission = pd.DataFrame(preds, index=ids, columns=['target']) submission.to_csv('Keras_BTB.csv')
def random_search(): from time import time from scipy.stats import uniform as sp_uniform, randint as sp_randint from sklearn.grid_search import RandomizedSearchCV from sklearn.cross_validation import ShuffleSplit crimes = np.load(DATA_FILE) # features_train = crimes['features_train'] all_labels = sorted( list( set(np.unique(crimes['labels_train'])) | set(np.unique(crimes['labels_val'])))) batch_size = 64 labels_train = create_labels(crimes['labels_train'], all_labels) labels_vals = create_labels(crimes['labels_val'], all_labels) labels_full = create_labels(crimes['labels'], all_labels) param_dist = { 'layers': sp_randint(1, 3), "hidden_units": [64, 128, 256], 'input_dropout': sp_uniform(0, 0.5), "hidden_dropout": sp_uniform(0, 0.75), "learning_rate": sp_uniform(0.01, 0.1), "weight_decay": sp_uniform(0, 0.01) } model = NeuralNetworkClassifier(n_classes=len(all_labels), batch_size=batch_size, valid_set=(crimes['features_val'], labels_vals)) n_iter_search = 40 np.random.seed(42) random_searcher = RandomizedSearchCV( model, param_distributions=param_dist, scoring=None, n_iter=n_iter_search, random_state=42, error_score=100, verbose=5, cv=ShuffleSplit(n=crimes['features_train'].shape[0], n_iter=1, test_size=0)) start = time() random_searcher.fit(crimes['features_train'], labels_train.ravel()) print( "RandomizedSearchCV took %.2f seconds for %d candidates" " parameter settings." % ((time() - start), n_iter_search)) report(random_searcher.grid_scores_) loss_train = log_loss( labels_train, random_searcher.predict_proba(crimes['features_train'])) loss_val = log_loss(labels_vals, random_searcher.predict_proba(crimes['features_val'])) loss_all = log_loss(labels_full, random_searcher.predict_proba(crimes['features'])) print 'loss_all: ', loss_all print 'loss_train: ', loss_train print 'loss_val: ', loss_val return loss_val
class Model(object):
    """Question-relevance ranker built on averaged word2vec features.

    Each (original question, related question) pair becomes a 400-dimensional
    vector: the element-wise product and absolute difference of the averaged
    word vectors of the two subjects, followed by the same for the two
    bodies.  A randomized search over random forests is fitted on those
    vectors, and ``evaluate`` writes a prediction file that is scored with
    ``eval_reranker``.
    """

    def __init__(self):
        """Load the embedding and phrase models and build the unfitted search."""
        # Training parameters
        self.w2v_dim = 100
        # 4 * w2v_dim: (product, |difference|) for subject and for body.
        self.num_feature = 400
        self.batch_size = 16
        self.num_epoch = 1
        self.w2v_model = Word2Vec.load('./data/word2vec/w2v.model')
        self.index2word_set = set(self.w2v_model.index2word)
        self.bigram = Phrases.load('./data/bigram.dat')
        self.trigram = Phrases.load('./data/trigram.dat')
        print('Build model...')
        param_dist = {
            "n_estimators": sp_randint(20, 250),
            "criterion": ["gini", "entropy"],
            "max_depth": sp_randint(10, 300),
            # Starts at 2: scikit-learn rejects an integer min_samples_split
            # below 2, and a one-sample node cannot be split anyway.
            "min_samples_split": sp_randint(2, 30),
            "min_samples_leaf": sp_randint(1, 30),
            "max_features": sp_randint(1, 200),
            "bootstrap": [True, False],
            'random_state': sp_randint(1, 1000000),
        }
        clf = RandomForestClassifier(n_jobs=8)
        self.model = RandomizedSearchCV(clf, param_distributions=param_dist,
                                        n_iter=10, cv=9, n_jobs=8)
        print('Model has been built!')

    def getWordVectorFeatures(self, text):
        """Return the averaged word vector of whitespace-separated ``text``."""
        words = text.split()
        return self.wordVectorAvg(words, self.w2v_dim)

    def wordVectorAvg(self, words, num_features):
        """Average the vectors of the known words in ``words``.

        Words missing from ``self.index2word_set`` are skipped.

        Returns:
            A float32 column vector of shape ``(num_features, 1)``; all zeros
            when none of the words is known.
        """
        featureVec = np.zeros((num_features, 1), dtype="float32")
        nwords = 0
        for word in words:
            if word in self.index2word_set:
                nwords += 1
                featureVec = np.add(featureVec,
                                    self.w2v_model[word].reshape(-1, 1))
        if nwords != 0:
            featureVec = np.divide(featureVec, nwords)
        return featureVec

    def getFeature(self, ori_q, rel_q):
        """Build the feature row for one (original, related) question pair.

        Args:
            ori_q: Sequence whose items 0 and 1 are the subject and body text
                of the original question.
            rel_q: The same for the related question.

        Returns:
            An array of shape ``(1, 4 * w2v_dim)``.

        The inputs are not modified.  Preprocessing used to be written back
        into ``ori_q``/``rel_q``, which re-processed the original question
        for every related question and, through a duplicated index, never
        preprocessed the related question's body.
        """
        def vectorise(text):
            cleaned = preprocess(text, no_stopwords=True,
                                 bigram=self.bigram, trigram=self.trigram)
            return self.getWordVectorFeatures(cleaned)

        word2vec_q_subject = vectorise(ori_q[0])
        word2vec_q_body = vectorise(ori_q[1])
        word2vec_rel_q_subject = vectorise(rel_q[0])
        word2vec_rel_q_body = vectorise(rel_q[1])

        subject = np.concatenate(
            (word2vec_q_subject * word2vec_rel_q_subject,
             np.abs(word2vec_q_subject - word2vec_rel_q_subject)), axis=0)
        body = np.concatenate(
            (word2vec_q_body * word2vec_rel_q_body,
             np.abs(word2vec_q_body - word2vec_rel_q_body)), axis=0)
        return np.concatenate((subject, body), axis=0).T

    def prepareData(self, data):
        """Turn reader output into feature, target and metadata collections.

        Each item of ``data`` is a sequence laid out as
        ``[org_meta, org_question, rel_meta_1, rel_question_1, ...]``.

        Returns:
            ``(X, y, meta)``: ``X`` has one feature row per related question,
            ``y`` holds 2 for 'PerfectMatch', 1 for 'Relevant' and 0
            otherwise, and ``meta`` holds ``[ORGQ_ID, RELQ_ID, 'true'|'false']``
            where 'false' means the label was 'Irrelevant'.
        """
        # Floor division keeps the row count an integer on Python 3 as well.
        size = sum(len(samples) // 2 - 1 for samples in data)
        X = np.zeros((size, self.num_feature), dtype=np.float32)
        y = np.zeros((size,), dtype=np.float32)
        meta = []
        c = 0
        pbar = ProgressBar(widgets=[Percentage(), Bar(), ETA()],
                           maxval=len(data)).start()
        for i, samples in enumerate(data):
            ori_q_id = samples[0]['ORGQ_ID']
            ori_q = samples[1]
            for j in range(2, len(samples), 2):
                rel_q_id = samples[j]['RELQ_ID']
                rel_q = samples[j + 1]
                label = samples[j]['RELQ_RELEVANCE2ORGQ']
                target = 0
                if label == 'PerfectMatch':
                    target = 2
                elif label == 'Relevant':
                    target = 1
                label = 'false' if label == 'Irrelevant' else 'true'
                X[c, :] = self.getFeature(ori_q, rel_q)
                y[c] = target
                meta.append([ori_q_id, rel_q_id, label])
                c += 1
            pbar.update(i)
        return X, y, meta

    def loadData(self):
        """Load the TRAIN, DEV and TEST splits into ``self.X_*``/``y_*``/``meta_*``."""
        reader = Reader()
        print('loading data')
        self.X_train, self.y_train, self.meta_train = self.prepareData(
            reader.getData(TRAIN))
        print('train data has been loaded!')
        self.X_valid, self.y_valid, self.meta_valid = self.prepareData(
            reader.getData(DEV))
        print('valid data has been loaded!')
        self.X_test, self.y_test, self.meta_test = self.prepareData(
            reader.getData(TEST))
        print('test data has been loaded!')

    def evaluate(self):
        """Score the fitted model on the validation split.

        Writes one line per validation pair to ``./tmp/dev.pred``, runs
        ``eval_reranker`` on that file and appends the result to
        ``valid_map.txt``.

        Returns:
            The value returned by ``eval_reranker``.
        """
        print('evaluating...')
        y_pred = self.model.predict_proba(self.X_valid)
        with open('./tmp/dev.pred', 'w') as f:
            for i in range(len(self.meta_valid)):
                # Probability columns 1 and 2 are the two relevant targets.
                prob_of_true = y_pred[i][1] + y_pred[i][2]
                label = 'true' if prob_of_true > 0.5 else 'false'
                f.write("%s %s 0 %20.16f %s\n" % (self.meta_valid[i][0],
                                                  self.meta_valid[i][1],
                                                  prob_of_true, label))
        map_score = eval_reranker(
            res_fname='./data/eval/SemEval2016-Task3-CQA-QL-dev.xml.subtaskB.relevancy',
            pred_fname='./tmp/dev.pred')
        with open('valid_map.txt', 'a') as f:
            f.write(str(map_score) + '\n')
        print('=========================================')
        return map_score

    def train(self):
        """Fit the search ``num_epoch`` times, evaluating after every fit.

        The MAP log and the four loss/accuracy log files are truncated first.
        Returns None.
        """
        log_files = ('valid_map.txt', './train_loss.txt', './valid_loss.txt',
                     './train_acc.txt', './valid_acc.txt')
        for path in log_files:
            # Opening in 'w' mode empties logs left over from earlier runs.
            with open(path, 'w'):
                pass

        print("Training...")
        max_map = 0.0
        for i in range(self.num_epoch):
            self.model.fit(self.X_train, self.y_train)
            map_score = self.evaluate()
            print('MAP on valid data: %16.16f\n' % (map_score))
            if map_score > max_map:
                max_map = map_score
        print('Training completed!')
input_shape=(None, num_features), hidden_num_units=200, # number of units in hidden layer #!200-600 output_nonlinearity=lasagne.nonlinearities.softmax, # output layer output_num_units=num_classes, # 10 target values dropout_p=0.2, #!dropout 0.2-0.7 # optimization method: update=nesterov_momentum, update_learning_rate=0.01,#!0.001-0.01 update_momentum=0.9,#!0.6-0.9 regression=False, # flag to indicate we're dealing with regression problem max_epochs=500, # we want to train this many epochs verbose=1, ) random_search = RandomizedSearchCV(net1, {'hidden_num_units': sp_randint(200, 600), "dropout_p": sp_rand(0.2,0.7), "update_learning_rate": sp_rand(0.001, 0.01), "update_momentum": sp_rand(0.6, 0.9), }) random_search.fit(X, y) print random_search.grid_scores_ preds = random_search.predict_proba(X_test)[:, 1] submission = pd.DataFrame(preds, index=ids, columns=['target']) submission.to_csv('Keras_BTB.csv')
# Randomized search over `grid_params` for `xgb_clf`: six candidates ranked by
# ROC AUC with 3-fold cross-validation, seeded for repeatability.
rand_gridsearch = RandomizedSearchCV(
    xgb_clf,
    param_distributions=grid_params,
    n_iter=6,
    scoring='roc_auc',
    cv=3,  # changed to 3-fold CV
    verbose=True,
    random_state=42,
)
rand_gridsearch.fit(X, y)

# Show the winning parameters between two separator lines, then every score.
print('=======================================================')
print(rand_gridsearch.best_params_)
print('=======================================================')
for s in rand_gridsearch.grid_scores_:
    print(s)

best_xgb = rand_gridsearch.best_estimator_

# Column 1 of the predicted probabilities becomes the submission target.
y_pred = rand_gridsearch.predict_proba(X_test)
submission = pd.DataFrame(y_pred[:, 1], index=test.index, columns=['target'])
submission.index.name = 'ID'
submission.to_csv('B_XGB_GridSearch_2015_10_13.csv')

# Persist the best model and the encoders.
sklearn.externals.joblib.dump(best_xgb, './models/XGB_2015_10_13.pkl')
sklearn.externals.joblib.dump(encoders, './models/XGB2_encoders_2015_10_13.pkl')
def hyperparaTuning(data, testSet, expName, mode=2, storedPath=util.getResourcePath() + '/Pickle Files/Models/First Layer/'):
    """Tune one first-layer model with a randomized search and pickle it.

    Args:
        data: Training frame containing a ' Label' column.
        testSet: Test frame containing a ' Label' column.
        expName: Experiment name; printed and passed to ``util.pklSaver``.
        mode: 1 selects ExtraTrees, 2 selects LightGBM, any other value
            selects KNN.
        storedPath: Directory prefix; the model name and '/' are appended.

    Prints the best parameters, the test ROC AUC and the fit time in minutes,
    then saves the fitted search object.  Returns None.
    """
    # Candidate values for every algorithm.
    etTree_params = {
        "n_estimators": [150, 250, 350],
        "max_features": [None, 'sqrt', 'log2'],
        "min_samples_leaf": [64, 128, 256],
    }
    lightGBM_params = {
        "learning_rate": [0.06, 0.08, 0.1],
        "num_leaves": [15, 31, 63],
        "max_bin": [63, 127, 255],
        "feature_fraction": [0.6, 0.8, 0.9],
    }
    knn_params = {
        "n_neighbors": np.arange(5, 47, 2),
        "weights": ["uniform", "distance"],
        "metric": ["euclidean", "manhattan", "chebyshev"],
    }

    # One unfitted estimator per algorithm.
    et_model = ExtraTreesClassifier()
    lgbm_model = lgbm.LGBMClassifier(objective='binary')
    knn_model = KNeighborsClassifier()

    # Separate the features from the ' Label' column.
    y_train = data[' Label'].values
    y_test = testSet[' Label'].values
    trainData = data.drop(' Label', axis=1)
    testData = testSet.drop(' Label', axis=1)

    if mode == 1:
        # Extremely randomized trees
        modelName, params, model = 'ExtraTrees', etTree_params, et_model
    elif mode == 2:
        # LightGBM
        modelName, params, model = 'LightGBM', lightGBM_params, lgbm_model
    else:
        # KNN: the data are standardised for this model only; the scaler is
        # fitted on the training features and reused for the test features.
        modelName, params, model = 'KNN', knn_params, knn_model
        scaling = StandardScaler()
        trainData = scaling.fit_transform(trainData)
        testData = scaling.transform(testData)

    # Cross-validated randomized search, timed in minutes.
    grid = RandomizedSearchCV(model, params, verbose=1, cv=5, n_jobs=1)
    start = time.time()
    grid.fit(trainData, y_train)
    end = time.time()
    runningTime = (end - start) / 60

    # Evaluate the refitted best model on the test data.
    preds = grid.predict_proba(testData)
    auc = roc_auc_score(y_test, preds[:, 1])

    print("Experiment: ", expName)
    print("Randomized search best parameters: {}".format(grid.best_params_))
    print("AUC of the best model: ", auc)
    print("Running time: ", runningTime)

    # Save the fitted search object.
    util.pklSaver(grid, expName, path=storedPath + modelName + '/')
# Sanity check: the class probabilities of a single row add up to 1.
new_pred_prob_knn[0, :].sum()


# ### Model 2: Naive Bayes model using only text features

# Best model selected by RandomizedSearchCV.
rand.best_estimator_

# The ingredient text is the only input of this model.
X_new = new.ingredients_str

# Class-membership probabilities for the new data, and their shape.
new_pred_prob_rand = rand.predict_proba(X_new)
new_pred_prob_rand.shape

# Probabilities for the first row only.
new_pred_prob_rand[0, :]


# ### Ensembling models 1 and 2

# Mean of both models' probabilities for the first row.
(new_pred_prob_knn[0, :] + new_pred_prob_rand[0, :]) / 2

# Mean of both models' probabilities for every row, one column per class.
new_pred_prob = pd.DataFrame(
    (new_pred_prob_knn + new_pred_prob_rand) / 2,
    columns=knn.classes_,
)
def XGB_Classifier(Train_DS, y, Actual_DS, Sample_DS, Grid):
    """Search XGBoost hyperparameters, or fit the tuned model and predict.

    With a truthy ``Grid`` an 800-candidate ``RandomizedSearchCV`` (log loss,
    3-fold) is fitted, its report is written to
    ``file_path + 'Parms_DS_XGB_4.csv'`` and the process exits.  Otherwise the
    hand-tuned 135-tree model is fitted on ``Train_DS``/``y`` and used to
    predict ``Actual_DS``.

    Output depends on the module-level ``raw_output`` flag: when it equals
    False the probabilities go through ``get_best_five`` and are saved;
    otherwise the raw probabilities are saved, indexed by the module-level
    ``Actual_DS1``.

    Returns:
        ``pred_Actual``: the ``get_best_five`` result, or the raw probability
        array when ``raw_output`` is not False.  ``Sample_DS`` is not used.
    """
    print("***************Starting XGB Classifier***************")
    t0 = time()

    if Grid:
        # Used for checking the best performance of the model over the
        # hyperparameter space below.
        print("Starting model fit with Grid Search")

        one_to_fifteen = list(range(1, 16))
        tenths = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
        search_space = {
            'n_estimators': [50],
            'max_depth': one_to_fifteen + [17, 19, 20],
            'min_child_weight': one_to_fifteen + [20],
            'subsample': tenths + [1],
            'colsample_bytree': tenths + [1],
            'silent': [True],
            'gamma': [2, 1] + tenths,
        }

        # Randomized search over the space above.
        n_iter_search = 800
        model = xgb.XGBClassifier(nthread=8)
        model = RandomizedSearchCV(model, param_distributions=search_space,
                                   n_iter=n_iter_search, scoring='log_loss',
                                   cv=3)
        start = time()
        model.fit(np.array(Train_DS), np.array(y))
        print("GridSearchCV completed")

        Parms_DS_Out = report(model.grid_scores_, n_top=n_iter_search)
        Parms_DS_Out.to_csv(file_path + 'Parms_DS_XGB_4.csv')

        print("Best estimator found by grid search:")
        print(model.best_estimator_)
        sys.exit(0)
    else:
        ##------------------------------------------------------------------##
        # best lb is with n_estimators = 500 , using 1000 it is less
        # CV: 0.78526434774405007 (full set)
        # CV: 0.824999 (100k set - with Age set up, all dummy)
        # CV: 0.830194 (with 50 K) - n_estimators = 75 - with Age Bkt and
        #     Session (Action_Type dummy) features
        # CV: 0.830842 (with 50 K) - n_estimators = 75 - with Age Bkt and
        #     Session & Session 3 - features *********
        model = xgb.XGBClassifier(n_estimators=500, nthread=8)

        # LB : n_estimators = 100 , 0.88040
        # LB : n_estimators = 125 , 0.88059 ***best, session 1,2,3,4
        # LB : n_estimators = 150 , 0.88045
        # LB : n_estimators = 060 , 0.87996
        # LB : n_estimators = 080 , 0.88029
        # LB: n_estimators = 125 , 0.88148 ***best, session 1,2,3,4 and year > 2012
        # LB: n_estimators = 125 , 0.88080, session 1,2,3,4 and year > 2013
        # LB: n_estimators = 125 , 0.88010, session 1,2,3,4 and year > 2011
        # CV: 0.83062 (with 50 K) - n_estimators = 125 - with Age Bkt and
        #     Session,2,3,4 - features *********
        # This configuration replaces the 500-tree model created just above.
        tuned_settings = dict(
            base_score=0.5, colsample_bylevel=1, colsample_bytree=0.8,
            gamma=0.6, learning_rate=0.1, max_delta_step=0, max_depth=6,
            min_child_weight=12, missing=None, n_estimators=135, nthread=8,
            objective='multi:softprob', reg_alpha=0, reg_lambda=1,
            scale_pos_weight=1, silent=True, subsample=0.7)
        model = xgb.XGBClassifier(**tuned_settings)
        ##-----------------------------------

    # Only reached on the non-Grid path, because the Grid path exits above.
    X_train = np.array(Train_DS)
    Y_train = np.array(y)
    model.fit(X_train, Y_train)

    # Predict the actual data set.
    X_Actual = np.array(Actual_DS)
    pred_Actual = model.predict_proba(X_Actual)
    print("Actual Model predicted")

    if raw_output == False:
        # Reduce the probabilities with get_best_five and save the result.
        pred_Actual = get_best_five(pred_Actual, type_val=False)
        pred_Actual.to_csv(file_path + 'output/Submission_Roshan_xgb_135.csv',
                           index_label='id')
    else:
        # Save the raw probabilities, indexed by id.
        print(pd.DataFrame(pred_Actual).head())
        pred = pd.DataFrame(pred_Actual)
        pred['id'] = Actual_DS1
        pred = pred.set_index('id')
        pred.to_csv(file_path + 'output/Submission_Roshan_xgb_raw_150_2012.csv',
                    index_label='id')

    print("***************Ending XGB Classifier***************")
    return pred_Actual
def XGB_Classifier(Train_DS, y, Actual_DS, Sample_DS, Grid):
    """Fit an XGBoost model and write positive-class probabilities for ``Actual_DS``.

    ``Grid`` truthy: run a randomized hyper-parameter search and predict with
    the fitted search object.  ``Grid`` falsy: cross-validate one fixed
    configuration through ``Nfold_Cross_Valid``, then fit a sigmoid-calibrated
    2000-tree model and predict with that.

    ``Sample_DS`` supplies the output index (``ID``) and the column labels
    (every column after the first).  The CSV is written below the
    module-level ``file_path``.

    Returns:
        Column 1 of ``predict_proba(Actual_DS)``.
    """
    print("***************Starting XGB Classifier***************")
    run_started = time()  # recorded but never read in this function

    # NOTE(review): nesting was reconstructed from whitespace-collapsed text;
    # the predict/save tail is assumed to sit outside both branches because the
    # search branch does not exit - confirm against the original file.
    if not Grid:
        # Best on grid :::: CV:
        #   clf = xgb.XGBClassifier(n_estimators=500,max_depth=4,learning_rate=0.1,nthread=2,min_child_weight=11,
        #                           subsample=0.8,colsample_bytree=0.7,silent=True, gamma = 0.6)
        # Parameters below are "from Kaggle"; only the tree count differs
        # between the cross-validated model and the final one.
        shared_params = dict(
            max_depth=9,
            learning_rate=0.01,
            nthread=2,
            min_child_weight=6,
            subsample=0.7,
            colsample_bytree=0.5,
            silent=True,
            gamma=4,
        )
        cv_model = xgb.XGBClassifier(n_estimators=500, **shared_params)
        Nfold_score = Nfold_Cross_Valid(Train_DS, y, cv_model)

        # Disabled alternatives kept for reference:
        #   clf = xgb.XGBClassifier(n_estimators=2000,max_depth=4,learning_rate=0.1,nthread=2,min_child_weight=11,
        #                           subsample=0.8,colsample_bytree=0.7,silent=True, gamma = 0.6)
        #   from Kaggle (https://www.kaggle.com/c/springleaf-marketing-response/forums/t/16808/time-window-variables-features)
        #   clf = xgb.XGBClassifier(n_estimators=2000,max_depth=10,learning_rate=0.005,nthread=2,min_child_weight=11,
        #                           subsample=0.8,colsample_bytree=0.4,silent=True, gamma = 0.6)
        final_base = xgb.XGBClassifier(n_estimators=2000, **shared_params)
        model = CalibratedClassifierCV(base_estimator=final_base, method='sigmoid')
        model.fit(Train_DS, y)
    else:
        # used for checking the best performance for the model using hyper parameters
        print("Starting model fit with Grid Search")
        tenths = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
        search_space = {
            'n_estimators': [100],
            'max_depth': list(range(1, 16)),
            'min_child_weight': list(range(1, 16)),
            'subsample': list(tenths),
            'colsample_bytree': list(tenths),
            'silent': [True],
            'gamma': [2, 1] + tenths[:-1],
        }
        # Disabled exhaustive variant:
        #   clf = GridSearchCV(xgb.XGBClassifier(),param_grid, scoring='roc_auc', verbose=1,cv=10)

        # run randomized search
        n_candidates = 3000
        model = RandomizedSearchCV(
            xgb.XGBClassifier(nthread=-1),
            param_distributions=search_space,
            n_iter=n_candidates,
            scoring='roc_auc',
            cv=10,
        )
        search_started = time()  # recorded but never read
        model.fit(Train_DS, y)
        print("GridSearchCV completed")
        report(model.grid_scores_)
        print("Best estimator found by grid search:")
        print(model.best_estimator_)

    # Predict actual model
    pred_Actual = model.predict_proba(Actual_DS)[:, 1]
    print("Actual Model predicted")

    # Get the predictions for actual data set
    submission = pd.DataFrame(
        pred_Actual,
        index=Sample_DS.ID.values,
        columns=Sample_DS.columns[1:],
    )
    submission.to_csv(file_path + 'output/Submission_Roshan_xgb_filter_2.csv', index_label='ID')

    print("***************Ending XGB Classifier***************")
    return pred_Actual
'max_depth': sp_randint(4, 200), 'learning_rate': sp_uniform(loc=0e0,scale=1e0), 'objective':['multi:softprob'], 'nthread': [8], 'missing': [np.nan], 'reg_alpha': [0.01,0.017782794,0.031622777,0.056234133,\ 0.1,0.17782794,0.31622777,0.56234133,1.,1.77827941,\ 3.16227766,5.62341325,10.,\ 17.7827941,31.6227766,56.2341325,100.], 'colsample_bytree': sp_uniform(loc=0.2e0,scale=0.8e0), 'subsample': np.arange(0.6,1.0,step=0.05), 'n_estimators': sp_randint(100,700), } print "Randomized XGBoost" # In[ ]: for i in range(2): print "Loop %i/20" % i search_GB = RandomizedSearchCV(GB, param_grid, scoring='log_loss', n_jobs=-1, n_iter=n_iter, cv=cv, verbose=True) search_GB.fit(X_train, y_train) log_model = search_GB.score(X_val, y_val) print "Log loss = %s" % log_model X_test = get_test() save_submission('XGBoost', log_model, search_GB.predict_proba(X_test))
class Model(object):
    """Random-forest relevance model over averaged word2vec question features.

    Every (original question, related question) pair becomes one
    ``num_feature``-wide row: the element-wise product and the absolute
    difference of the averaged word vectors, computed once for the subjects
    and once for the bodies.  A ``RandomizedSearchCV`` over
    ``RandomForestClassifier`` is fitted on those rows and evaluated with
    ``eval_reranker``.
    """

    def __init__(self):
        """Load the embedding / phrase models from disk and build the search object."""
        # Training parameters.
        self.w2v_dim = 100
        # 4 * w2v_dim: (product + abs-difference) for subject and for body.
        self.num_feature = 400
        self.batch_size = 16
        self.num_epoch = 1

        #self.w2v_model=Word2Vec.load_word2vec_format('./data/word2vec/GoogleNews-vectors-negative300.bin', binary=True)
        self.w2v_model = Word2Vec.load('./data/word2vec/w2v.model')
        self.index2word_set = set(self.w2v_model.index2word)
        self.bigram = Phrases.load('./data/bigram.dat')
        self.trigram = Phrases.load('./data/trigram.dat')

        print('Build model...')
        param_dist = {
            "n_estimators": sp_randint(20, 250),
            "criterion": ["gini", "entropy"],
            "max_depth": sp_randint(10, 300),
            # Lower bound is 2: current scikit-learn rejects an integer
            # min_samples_split of 1, which would abort the whole search.
            "min_samples_split": sp_randint(2, 30),
            "min_samples_leaf": sp_randint(1, 30),
            "max_features": sp_randint(1, 200),
            "bootstrap": [True, False],
            'random_state': sp_randint(1, 1000000),
        }
        # build a classifier
        clf = RandomForestClassifier(n_jobs=8)
        # run randomized search
        self.model = RandomizedSearchCV(clf,
                                        param_distributions=param_dist,
                                        n_iter=10,
                                        cv=9,
                                        n_jobs=8)
        print('Model has been built!')

    def _clean(self, text):
        """Run the shared ``preprocess`` helper with this model's phrase detectors."""
        return preprocess(text,
                          no_stopwords=True,
                          bigram=self.bigram,
                          trigram=self.trigram)

    def getWordVectorFeatures(self, text):
        """Return the averaged word vector of whitespace-split ``text`` as a column."""
        words = text.split()
        return self.wordVectorAvg(words, self.w2v_dim)

    def wordVectorAvg(self, words, num_features):
        """Average the vectors of the in-vocabulary ``words``.

        Returns a ``(num_features, 1)`` float32 array; it stays all-zero when
        none of the words is in the vocabulary.
        """
        featureVec = np.zeros((num_features, 1), dtype="float32")
        nwords = 0
        for word in words:
            if word in self.index2word_set:
                nwords = nwords + 1
                featureVec = np.add(featureVec,
                                    self.w2v_model[word].reshape(-1, 1))
        if nwords != 0:
            featureVec = np.divide(featureVec, nwords)
        return featureVec

    def getFeature(self, ori_q, rel_q):
        """Build the ``(1, 4 * w2v_dim)`` feature row for one question pair.

        ``ori_q`` and ``rel_q`` are indexable as ``[subject, body]``.

        Two defects of the previous version are fixed here:

        * the related question's *body* (``rel_q[1]``) is now preprocessed;
          before, ``rel_q[0]`` was preprocessed twice and the body not at all;
        * the inputs are no longer overwritten.  ``prepareData`` passes the
          same ``ori_q`` for every related question, so the in-place update
          made the original question get re-preprocessed on every pair.
        """
        ori_subject = self._clean(ori_q[0])
        ori_body = self._clean(ori_q[1])
        rel_subject = self._clean(rel_q[0])
        rel_body = self._clean(rel_q[1])

        word2vec_q_subject = self.getWordVectorFeatures(ori_subject)
        word2vec_q_body = self.getWordVectorFeatures(ori_body)
        word2vec_rel_q_subject = self.getWordVectorFeatures(rel_subject)
        word2vec_rel_q_body = self.getWordVectorFeatures(rel_body)

        subject = np.concatenate(
            (word2vec_q_subject * word2vec_rel_q_subject,
             np.abs(word2vec_q_subject - word2vec_rel_q_subject)),
            axis=0)
        body = np.concatenate(
            (word2vec_q_body * word2vec_rel_q_body,
             np.abs(word2vec_q_body - word2vec_rel_q_body)),
            axis=0)
        return np.concatenate((subject, body), axis=0).T

    def prepareData(self, data):
        """Turn reader output into ``(X, y, meta)``.

        Each entry of ``data`` is laid out as
        ``[orig_meta, orig_q, rel_meta_1, rel_q_1, rel_meta_2, rel_q_2, ...]``.
        ``y`` holds 2 for ``'PerfectMatch'``, 1 for ``'Relevant'`` and 0
        otherwise; ``meta`` rows are ``[ORGQ_ID, RELQ_ID, 'true'|'false']``.
        """
        # Floor division keeps the row count an int on Python 3 as well;
        # np.zeros does not accept a float dimension.
        size = sum((len(samples) // 2) - 1 for samples in data)
        X = np.zeros((size, self.num_feature), dtype=np.float32)
        y = np.zeros((size, ), dtype=np.float32)
        meta = []
        c = 0
        pbar = ProgressBar(widgets=[Percentage(), Bar(), ETA()],
                           maxval=len(data)).start()
        for i, samples in enumerate(data):
            ori_q_id = samples[0]['ORGQ_ID']
            ori_q = samples[1]
            for j in range(2, len(samples), 2):
                rel_q_id = samples[j]['RELQ_ID']
                rel_q = samples[j + 1]
                label = samples[j]['RELQ_RELEVANCE2ORGQ']
                target = 0
                if label == 'PerfectMatch':
                    target = 2
                elif label == 'Relevant':
                    target = 1
                label = 'false' if label == 'Irrelevant' else 'true'
                X[c, :] = self.getFeature(ori_q, rel_q)
                y[c] = target
                meta.append([ori_q_id, rel_q_id, label])
                c += 1
            pbar.update(i)
        return X, y, meta

    def loadData(self):
        """Read the TRAIN, DEV and TEST splits and store their feature matrices."""
        reader = Reader()
        print('loading data')
        self.X_train, self.y_train, self.meta_train = self.prepareData(
            reader.getData(TRAIN))
        print('train data has been loaded!')
        self.X_valid, self.y_valid, self.meta_valid = self.prepareData(
            reader.getData(DEV))
        print('valid data has been loaded!')
        self.X_test, self.y_test, self.meta_test = self.prepareData(
            reader.getData(TEST))
        print('test data has been loaded!')

    def evaluate(self):
        """Score the validation split, write ``./tmp/dev.pred`` and return the MAP.

        A pair counts as ``'true'`` when the summed probability of classes 1
        and 2 exceeds 0.5.  The score returned by ``eval_reranker`` is also
        appended to ``valid_map.txt``.
        """
        print('evaluating...')
        y_pred = self.model.predict_proba(self.X_valid)
        with open('./tmp/dev.pred', 'w') as pred_file:
            for i, row in enumerate(self.meta_valid):
                prob_of_true = y_pred[i][1] + y_pred[i][2]
                label = 'true' if prob_of_true > 0.5 else 'false'
                pred_file.write("%s %s 0 %20.16f %s\n" %
                                (row[0], row[1], prob_of_true, label))
        map_score = eval_reranker(
            res_fname='./data/eval/SemEval2016-Task3-CQA-QL-dev.xml.subtaskB.relevancy',
            pred_fname='./tmp/dev.pred')
        with open('valid_map.txt', 'a') as map_file:
            map_file.write(str(map_score) + '\n')
        print('=========================================')
        return map_score

    def train(self):
        """Fit the randomized search ``num_epoch`` times, evaluating after each fit."""
        # Start every run with empty log files.  Only valid_map.txt is written
        # to afterwards (by evaluate); the loss/acc files are just truncated.
        for log_path in ('valid_map.txt', './train_loss.txt',
                         './valid_loss.txt', './train_acc.txt',
                         './valid_acc.txt'):
            with open(log_path, 'w'):
                pass

        print("Training...")
        max_map = 0.0
        for i in range(self.num_epoch):
            self.model.fit(self.X_train, self.y_train)
            map_score = self.evaluate()
            print('MAP on valid data: %16.16f\n' % (map_score))
            if map_score > max_map:
                max_map = map_score
                #self.model.save_weights("./tmp/weights.hdf5")
        print('Training completed!')
def xgb_model(read_csv=True): print 'xgb_model randomcv yr > 2013' train_df, test_df = get_train_test_data(cache=read_csv, include_sessions=False) train_df = train_df[train_df['tfa_year'] > 2013] cols = [i for i in train_df.columns if i not in EXCLUDE_COLS] X = train_df[cols] y = train_df['country_destination'] #start classifier bst = xgb.XGBClassifier(nthread=4) # bst = xgb.XGBClassifier(max_depth=2, nthread=4, # n_estimators=50,subsample=0.4,learning_rate=0.0.05) train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=10000) # bst.fit(test_X, test_y) param_dist = {"max_depth": [2,4,6], "learning_rate": [0.05, 0.1, 0.15, 0.2], "n_estimators": [30, 50, 70], # "min_samples_leaf": sp_randint(1, 11), # "min_samples_split": sp_randint(1,11), 'subsample': [0.4, 0.5, 0.6] # 'max_features': [20, 50, 100] } n_iter_search = 20 random_search = RandomizedSearchCV(bst, param_distributions=param_dist, n_iter=n_iter_search, scoring=ndcg_scorer) start = time() random_search.fit(test_X, test_y) print("RandomizedSearchCV took %.2f seconds for %d candidates" " parameter settings." 
% ((time() - start), n_iter_search)) report(random_search.grid_scores_) # bagging # clfbag = BaggingClassifier(bst, n_estimators=5, max_samples=5000) # clfbag.fit(train_X, train_y) # y_pred = clfbag.predict_proba(test_X) # print 'predicted prob' # score = ndcg_score(test_y, y_pred) # apply learned model # bst.fit(X.values, y) # py.test.set_trace() test_data = test_df[[i for i in test_df.columns if i not in EXCLUDE_COLS]] y_pred = random_search.predict_proba(test_data) py.test.set_trace() # kaggle_test = pd.read_csv('test.csv') sub = create_kaggle_submission(y_pred, test_df['id'], 0.841) py.test.set_trace() # print 'created kaggle sub' # kf = KFold(len(X), n_folds=10, random_state=42) # score = cross_val_score(bst, X, y, cv=kf, scoring=ndcg_scorer) # param_dist = {"max_depth": [3, None], # "max_features": sp_randint(1, 11), # "min_samples_split": sp_randint(1, 11), # "min_samples_leaf": sp_randint(1, 11), # "bootstrap": [True, False], # "criterion": ["gini", "entropy"]} # n_iter_search = 20 # random_search = RandomizedSearchCV(clf, param_distributions=param_dist, # n_iter=n_iter_search, scoring=ndcg_scorer) # start = time() # random_search.fit(X, y) # print("RandomizedSearchCV took %.2f seconds for %d candidates" # " parameter settings." % ((time() - start), n_iter_search)) # report(random_search.grid_scores_) # py.test.set_trace() # sub = create_kaggle_submission(y_pred, test_df['id'], np.mean(score)) # py.test.set_trace() #end classifier """ trying cross valid
def XGB_Classifier(Train_DS, y, Actual_DS, Sample_DS, Grid):
    """Tune or cross-validate an XGBoost classifier.

    ``Grid`` truthy: run a randomized hyper-parameter search, write its report
    to ``Parms_DS_XGB_4.csv`` and exit.  ``Grid`` falsy: run
    ``Nfold_Cross_Valid`` on one fixed configuration and exit.

    As written, both paths end in ``sys.exit(0)``, so the fit / predict /
    save code at the bottom and the ``return`` are never reached.
    """
    print("***************Starting XGB Classifier***************")
    run_started = time()  # recorded but never read in this function

    if Grid:
        # used for checking the best performance for the model using hyper parameters
        print("Starting model fit with Grid Search")
        tenths = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
        search_space = {
            'n_estimators': [25],
            'max_depth': list(range(1, 16)) + [17, 19, 20, 40, 80, 100, 200],
            'min_child_weight': list(range(1, 16)) + [20, 40, 80, 100],
            'subsample': list(tenths),
            'colsample_bytree': list(tenths),
            'silent': [True],
            'gamma': [2, 1] + tenths[:-1],
        }

        # run randomized search
        n_candidates = 800
        searcher = RandomizedSearchCV(
            xgb.XGBClassifier(nthread=8),
            param_distributions=search_space,
            n_iter=n_candidates,
            scoring='log_loss',
            cv=3,
        )
        search_started = time()  # recorded but never read
        searcher.fit(np.array(Train_DS), np.array(y))
        print("GridSearchCV completed")

        ranked = report(searcher.grid_scores_, n_top=n_candidates)
        ranked.to_csv(file_path + 'Parms_DS_XGB_4.csv')

        print("Best estimator found by grid search:")
        print(searcher.best_estimator_)
        sys.exit(0)

    # ---------------------------------------------------------------------- #
    # best from grid Search, best n_est=175
    # CV:0.936880 , 20 K , n_estimators =100 , features = 343 (without FN and Upc and using eucledean for DD)*** current best
    model = xgb.XGBClassifier(
        n_estimators=100,
        max_depth=100,
        learning_rate=0.1,
        nthread=8,
        min_child_weight=1,
        subsample=0.6,
        colsample_bytree=0.9,
        silent=True,
        gamma=2,
    )
    # ---------------------------------------------------------------------- #
    # CV: 0.955185 , 20 K , n_estimators =100 , features = 343 (without FN and Upc)
    # CV: 0.935217 , 20 K , n_estimators =100 , features = 343 (without FN and Upc and using eucledean for DD)
    # CV: 0.927019 , 20 K , n_estimators =100 , features = 343 (without FN and Upc and using cos_sim for DD) *****not used ovefitting
    # CV: 0.922370 , 20 K , n_estimators =100 , features = 343 (without FN and Upc and using eucl + cos_sim for DD) *****not used ovefitting
    # ...................................................................... #
    # CV: 0.942477 , 20 K , n_estimators =100 , features = 343 (without FN and Upc and using eucledean for DD)
    # Disabled default model: xgb.XGBClassifier(n_estimators=100,nthread=8)
    # ---------------------------------------------------------------------- #

    Nfold_score = Nfold_Cross_Valid(Train_DS, y, model)
    sys.exit(0)

    # NOTE(review): everything below is unreachable while the sys.exit(0)
    # above is active - presumably a cross-validation-only toggle; confirm
    # before removing either side.
    model.fit(np.array(Train_DS), np.array(y))

    # Predict actual model
    pred_Actual = model.predict_proba(np.array(Actual_DS))
    print("Actual Model predicted")

    # Get the predictions for actual data set
    submission = pd.DataFrame(
        pred_Actual,
        index=Sample_DS.VisitNumber.values,
        columns=Sample_DS.columns[1:],
    )
    submission.to_csv(file_path + 'output/Submission_Roshan_xgb_6_withFNnumber.csv', index_label='VisitNumber')

    print("***************Ending XGB Classifier***************")
    return pred_Actual
def xgb_Classifier(Train_DS, y, Actual_DS, Sample_DS, grid):
    """Fit an XGBoost model and write positive-class probabilities per bidder.

    ``grid`` truthy: run a randomized hyper-parameter search and predict with
    the fitted search object.  ``grid`` falsy: cross-validate one fixed
    configuration with ``Nfold_Cross_Valid``, fit that same estimator on the
    full training data and predict with it.

    ``Sample_DS`` supplies the output index (``bidder_id``) and the column
    labels (every column after the first).  The CSV is written below the
    module-level ``file_path``.

    Returns:
        Column 1 of ``predict_proba(Actual_DS)``.
    """
    print("***************Starting XGB Classifier***************")
    run_started = time()  # recorded but never read in this function

    # Disabled preprocessing experiments: np.log(1 + x) on both data sets, and
    # a StandardScaler fitted on Train_DS and applied to both.

    # NOTE(review): nesting was reconstructed from whitespace-collapsed text;
    # the predict/save tail is assumed to sit outside both branches because the
    # search branch does not exit - confirm against the original file.
    if not grid:
        # Earlier configurations and search results, kept for reference:
        #   starting model: n_estimators=200,max_depth=10,learning_rate=0.01,nthread=2,min_child_weight=4,
        #                   subsample=0.9,colsample_bytree=0.8,silent=True, gamma = 1
        #   Model with rank: 1 , Mean validation score: 0.921 (std: 0.024)
        #     n_estimators=200,max_depth=5,learning_rate=0.1,nthread=2,min_child_weight=1,
        #     subsample=0.5,colsample_bytree=0.9,silent=True, gamma = 0.6
        #   Model with rank: 1  Mean validation score: 0.919 (std: 0.031)
        #     {'colsample_bytree': 0.8, 'silent': True, 'min_child_weight': 4, 'n_estimators': 200, 'subsample': 1, 'max_depth': 3, 'gamma': 0.8}
        #   Model with rank: 2  Mean validation score: 0.918 (std: 0.032)
        #     {'colsample_bytree': 0.8, 'silent': True, 'min_child_weight': 4, 'n_estimators': 200, 'subsample': 1, 'max_depth': 3, 'gamma': 1}
        #   Model with rank: 3  Mean validation score: 0.918 (std: 0.028)
        #     {'colsample_bytree': 0.7, 'silent': True, 'min_child_weight': 1, 'n_estimators': 200, 'subsample': 0.6, 'max_depth': 9, 'gamma': 0.6}
        #   Best estimator found by grid search:
        #     XGBClassifier(base_score=0.5, colsample_bytree=0.8, gamma=0.8,
        #                   learning_rate=0.1, max_delta_step=0, max_depth=3,
        #                   min_child_weight=4, n_estimators=200, nthread=-1,
        #                   objective='binary:logistic', seed=0, silent=True, subsample=1)
        #   Cv = .91278 LB : 0.89863
        #     n_estimators=1000,max_depth=3,learning_rate=0.1,nthread=2,min_child_weight=4,
        #     subsample=1,colsample_bytree=0.8,silent=True, gamma = 1
        fixed_params = dict(
            n_estimators=1000,
            max_depth=6,
            learning_rate=0.1,
            nthread=2,
            min_child_weight=1,
            subsample=0.9,
            colsample_bytree=1,
            silent=True,
            gamma=0.7,
        )
        model = xgb.XGBClassifier(**fixed_params)
        # Disabled: clf = CalibratedClassifierCV(base_estimator=clf, method='sigmoid')
        Nfold_score = Nfold_Cross_Valid(Train_DS, y, model)
        model.fit(Train_DS, y)
    else:
        # used for checking the best performance for the model using hyper parameters
        print("Starting model fit with Grid Search")
        # A smaller grid was used earlier (n_estimators 50, max_depth
        # [6, 1, 3, 5, 8, 10], min_child_weight [1, 4, 7, 10],
        # gamma [1, 0.5, 0.6, 0.7, 0.8, 0.9]); it is disabled.
        tenths = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
        search_space = {
            'n_estimators': [500],
            'max_depth': list(range(1, 16)),
            'min_child_weight': list(range(1, 16)),
            'subsample': list(tenths),
            'colsample_bytree': list(tenths),
            'silent': [True],
            'gamma': [2, 1] + tenths[:-1],
        }
        # Disabled exhaustive variant:
        #   clf = GridSearchCV(xgb.XGBClassifier(),param_grid, scoring='roc_auc', verbose=1,cv=10)

        # run randomized search
        n_candidates = 3000
        model = RandomizedSearchCV(
            xgb.XGBClassifier(nthread=-1),
            param_distributions=search_space,
            n_iter=n_candidates,
            scoring='roc_auc',
            cv=10,
        )
        search_started = time()  # recorded but never read
        model.fit(Train_DS, y)
        print("GridSearchCV completed")
        report(model.grid_scores_)
        print("Best estimator found by grid search:")
        print(model.best_estimator_)

    # Predict actual model
    pred_Actual = model.predict_proba(Actual_DS)[:, 1]
    print("Actual Model predicted")

    # Get the predictions for actual data set
    submission = pd.DataFrame(
        pred_Actual,
        index=Sample_DS.bidder_id.values,
        columns=Sample_DS.columns[1:],
    )
    submission.to_csv(file_path + 'output/Submission_Roshan_xgb_305.csv', index_label='bidder_id')

    print("***************Ending XGB Classifier***************")
    return pred_Actual
# Inspect the fitted search object: per-candidate scores, best score, best parameters.
print(RR_model.grid_scores_)


# In[ ]:

print(RR_model.best_score_)


# In[ ]:

print(RR_model.best_params_)


# In[ ]:

y_prob = RR_model.predict_proba(X_test)[:, 1]  # This will give you positive class prediction probabilities
y_pred = np.where(y_prob > 0.5, 1, 0)  # This will threshold the probabilities to give class predictions.
# Score against the true labels.  Passing y_pred here, as before, compared the
# model with its own thresholded output and said nothing about test quality.
RR_model.score(X_test, y_test)


# In[ ]:

# NOTE: this name shadows any imported ``confusion_matrix`` function.
confusion_matrix = metrics.confusion_matrix(y_test, y_pred)
confusion_matrix


# In[ ]:

# Despite the name, this holds the text classification report
# (per-class precision / recall / f1), not an ROC AUC value.
auc_roc = metrics.classification_report(y_test, y_pred)
auc_roc
"max_features": sp_randint(1, 11), "min_samples_split": sp_randint(1, 11), "min_samples_leaf": sp_randint(1, 11), "bootstrap": [True, False], "criterion": ["gini", "entropy"], "n_estimators": sp_randint(100, 600) } # In[4]: search_GB = RandomizedSearchCV(model, param_grid, scoring='log_loss', n_jobs=-1, n_iter=n_iter, cv=cv, verbose=True) search_GB.fit(X_train, y_train.flatten()) # In[5]: log_model = search_GB.score(X_val, y_val.flatten()) print "Log loss = %s" % log_model X_test = get_test() y_pred = search_GB.predict_proba(X_test) save_submission(model_name, log_model, y_pred) # In[7]: model_name