def test_predict_sklearn_pickle(self):
    """Check that a pickled GPU-trained classifier gives matching GPU and CPU margins."""
    pickle_path = "model.pkl"
    features, labels = build_dataset()

    trained = XGBClassifier(
        tree_method='gpu_hist',
        predictor='gpu_predictor',
        verbosity=2,
        objective='binary:logistic',
        n_estimators=10,
    )
    trained.fit(features, labels)

    # Round-trip through a pickle file; the in-memory model is deleted so the
    # predictions below can only come from the restored object.
    save_pickle(trained, pickle_path)
    del trained
    restored = load_pickle(pickle_path)
    os.remove(pickle_path)

    margins_gpu = restored.predict(features, output_margin=True)

    # Switch the underlying booster to the CPU predictor and predict again.
    restored.get_booster().set_param({'predictor': 'cpu_predictor'})
    margins_cpu = restored.predict(features, output_margin=True)

    np.testing.assert_allclose(margins_cpu, margins_gpu, rtol=1e-5)
def feature_selection(model, X_train, X_test, y_train, y_test, eval_metric='auc'):
    """Select features using the importance threshold with the best test ROC AUC.

    Every non-zero feature importance of the already fitted ``model`` is tried
    as a ``SelectFromModel`` threshold. For each one a fresh ``XGBClassifier``
    is trained on the reduced training set and scored on the reduced test set.
    The selector built from the best threshold is pickled to
    ``'feature.select'`` and used to transform both data sets.

    Args:
        model: fitted estimator exposing ``feature_importances_``.
        X_train, X_test: feature matrices.
        y_train, y_test: labels; the AUC scoring assumes a binary target.
        eval_metric: forwarded to ``XGBClassifier.fit``.

    Returns:
        Tuple ``(X_train_transformed, X_test_transformed)``.

    Raises:
        ValueError: if the model has no feature with non-zero importance.
    """
    # Only features with > 0 importance are candidate thresholds.
    thresholds = [thres for thres in sorted(model.feature_importances_) if thres != 0]
    if not thresholds:
        # The original failed here too (max() of an empty dict), just less clearly.
        raise ValueError('model has no feature with non-zero importance')

    roc_scores = {}
    for thresh in thresholds:
        # Select features using the threshold.
        selection = SelectFromModel(model, threshold=thresh, prefit=True)
        select_X_train = selection.transform(X_train)

        # Train a model on the reduced feature set.
        selection_model = XGBClassifier()
        selection_model.fit(select_X_train, y_train, eval_metric=eval_metric)

        # ROC AUC needs a ranking score: use the positive-class probability,
        # not the hard 0/1 labels returned by predict().
        select_X_test = selection.transform(X_test)
        y_score = selection_model.predict_proba(select_X_test)[:, 1]
        roc_scores[selection.threshold] = roc_auc_score(y_test, y_score)

    best_thresh = max(roc_scores, key=roc_scores.get)
    fs = SelectFromModel(model, threshold=best_thresh, prefit=True)
    pickle_model(fs, 'feature.select')

    X_train_trans_ = fs.transform(X_train)
    X_test_trans_ = fs.transform(X_test)
    # Single-argument call form works on both Python 2 and Python 3.
    print('total features kept: {}'.format(X_train_trans_.shape[1]))
    return X_train_trans_, X_test_trans_
def test_predict_sklearn_pickle(self):
    """Fit a GPU XGBClassifier, round-trip it through a pickle file and time one prediction."""
    X, y = makeXy()
    Xtest = makeXtest()

    from xgboost import XGBClassifier
    params = {
        'tree_method': 'gpu_hist',
        'predictor': 'gpu_predictor',
        'silent': 0,
        'objective': 'binary:logistic',
    }
    fitted = XGBClassifier(**params)
    fitted.fit(X, y)
    print(fitted)

    # Pickle the model, delete the in-memory copy, then load it back.
    pickle_name = "model.pkl"
    save_obj(fitted, pickle_name)
    del fitted
    reloaded = load_obj(pickle_name)
    os.remove(pickle_name)

    # Continue as before, now using the reloaded model.
    print("Before model.predict")
    sys.stdout.flush()
    started = time.time()
    gpu_pred = reloaded.predict(Xtest, output_margin=True)
    print(gpu_pred)
    nonzero_count = np.count_nonzero(gpu_pred)
    print("E non-zeroes: %d:" % (nonzero_count))
    print("E GPU Time to predict = %g" % (time.time() - started))
def cv(X_train, y_train, features_inner):
    """Run 5-fold stratified cross-validation of a default XGBClassifier.

    Prints F1, precision and recall for every fold, followed by the mean of
    each metric over all folds. ``features_inner`` is accepted but not used
    by this function.
    """
    def as_frame(values, columns):
        # Rebuild a DataFrame so each fold gets a fresh 0..n-1 index.
        return pd.DataFrame(values, columns=columns)

    label_columns = ["tred_cutoff"]
    f1_per_fold, precision_per_fold, recall_per_fold = [], [], []

    splitter = StratifiedKFold(n_splits=5, shuffle=True)
    for fit_rows, eval_rows in splitter.split(X_train, y_train):
        fold_model = XGBClassifier()

        fit_X = as_frame(X_train.values[fit_rows], X_train.columns)
        fit_y = as_frame(y_train.values[fit_rows], label_columns)
        eval_X = as_frame(X_train.values[eval_rows], X_train.columns)
        eval_y = as_frame(y_train.values[eval_rows], label_columns)

        fold_model.fit(fit_X, fit_y)
        guesses = fold_model.predict(eval_X)

        fold_f1 = f1_score(eval_y, guesses)
        fold_precision = precision_score(eval_y, guesses)
        fold_recall = recall_score(eval_y, guesses)

        print("\tscores f1", fold_f1)
        print("\tscores p", fold_precision)
        print("\tscores r", fold_recall)

        f1_per_fold.append(fold_f1)
        precision_per_fold.append(fold_precision)
        recall_per_fold.append(fold_recall)

    print("mean scores f1", np.mean(f1_per_fold))
    print("mean scores p", np.mean(precision_per_fold))
    print("mean scores r", np.mean(recall_per_fold))
def XGB_model(train, y):
    """Fit an XGBClassifier fold by fold and predict the module-level ``test`` set.

    The same estimator is refitted on the training part of each of 5
    unshuffled folds, so the returned predictions come from the model fitted
    on the last fold's training rows.

    Args:
        train: pandas DataFrame of training features (indexed with ``iloc``).
        y: pandas Series/DataFrame of labels aligned with ``train``.

    Returns:
        Predictions for ``test``.
    """
    # NOTE(review): ``test`` is not a parameter; it must exist as a global in
    # the calling module. Confirm that this is intended.
    model = XGBClassifier(n_estimators=150, learning_rate=0.01)

    # ``sklearn.cross_validation`` is gone from current scikit-learn; prefer
    # ``model_selection`` and fall back only on very old installations.
    try:
        from sklearn.model_selection import KFold
    except ImportError:
        KFold = None

    if KFold is not None:
        folds = KFold(n_splits=5).split(train)
    else:
        from sklearn import cross_validation
        folds = cross_validation.KFold(len(train), n_folds=5, random_state=7)

    for traincv, testcv in folds:
        model.fit(train.iloc[traincv], y.iloc[traincv])

    # Only the last fit is live, so predicting once after the loop is enough.
    y_XGB = model.predict(test)
    return y_XGB
def _encode_categoricals(frame):
    """Replace the raw string columns of *frame* with numeric codes, in place.

    ``countrycode`` is mapped through ``ord``; ``browserid`` and ``devid`` go
    through ``myfunc``, with missing values encoded as the placeholder strings
    "unknown" and "none" respectively.
    """
    frame['countrycode'] = frame['countrycode'].apply(lambda x: ord(x))
    frame['browserid'] = frame['browserid'].apply(
        lambda x: myfunc(x) if np.all(pd.notnull(x)) else myfunc("unknown"))
    frame['devid'] = frame['devid'].apply(
        lambda x: myfunc(x) if np.all(pd.notnull(x)) else myfunc("none"))
    return frame


def main():
    """Train an XGBoost click model and write test-set probabilities to CSV.

    Reads ``train.csv`` / ``test.csv``, encodes the categorical columns,
    reports hold-out accuracy, then fits a model on all training rows and
    writes the positive-class probability per test ``ID`` to
    ``predictions.csv``.
    """
    # Set seed for reproducibility
    np.random.seed(0)

    print("Loading data...")
    # Load the data from the CSV files
    training_data = pd.read_csv('/home/vipin/Videos/train.csv', header=0)
    prediction_data = pd.read_csv('/home/vipin/Videos/test.csv', header=0)

    # Both frames get the exact same encoding.
    _encode_categoricals(training_data)
    _encode_categoricals(prediction_data)

    features = ['siteid', 'offerid', 'category', 'merchant', 'countrycode', 'browserid', 'devid']
    target = "click"
    X = training_data[features]
    x_prediction = prediction_data[features]
    Y = training_data[target]
    ids = prediction_data["ID"]

    seed = 7
    test_size = 0.33
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=test_size, random_state=seed)

    print("Training...")
    # Accuracy is measured with a model that never saw the hold-out rows;
    # scoring a model fitted on all of X would just report training accuracy.
    holdout_model = XGBClassifier()
    holdout_model.fit(X_train, y_train)
    accuracy = accuracy_score(y_test, holdout_model.predict(X_test))

    # The model used for the submission is trained on all labelled data.
    model = XGBClassifier()
    model.fit(X, Y)

    print("Predicting...")
    y_prediction = model.predict_proba(x_prediction)
    results = y_prediction[:, 1]
    results_df = pd.DataFrame(data={'probability': results})
    joined = pd.DataFrame(ids).join(results_df)

    print("Accuracy: %.2f%%" % (accuracy * 100.0))

    print("Writing predictions to predictions.csv")
    # Save the predictions out to a CSV file
    joined.to_csv("/home/vipin/Videos/predictions.csv", index=False)
def test_xgboost():
    """Check that TPOT's xgboost operator matches a directly fitted XGBClassifier."""
    tpot_output = TPOT()._xgradient_boosting(
        training_testing_data,
        n_estimators=100,
        learning_rate=0,
        max_depth=3,
    )
    testing_rows = tpot_output[tpot_output['group'] == 'testing']

    # NOTE(review): TPOT is called with learning_rate=0 while the reference
    # uses 0.0001; presumably TPOT clamps the value. Confirm against TPOT.
    reference = XGBClassifier(n_estimators=100, learning_rate=0.0001, max_depth=3, seed=42)
    reference.fit(training_features, training_classes)
    expected = reference.predict(testing_features)

    assert np.array_equal(testing_rows['guess'].values, expected)
def test_on_data(X, y):
    """Compare XGBoost and scikit-learn gradient boosting on one 50/50 split.

    Each classifier is fitted on the training half and its accuracy printed
    for the held-out half and for the full data set.

    Args:
        X: feature matrix.
        y: labels aligned with ``X``.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        X, y, train_size=0.5, random_state=2333)
    # Single-argument print calls produce the same text on Python 2 and 3,
    # unlike the original print statements which only parse on Python 2.
    print("train set: {}, test set: {}".format(len(x_train), len(x_test)))

    # Same evaluation for both libraries; xgboost first, then sklearn's GBRT.
    for label, factory in (("xgb", XGBClassifier), ("sklearn", GradientBoostingClassifier)):
        cls = factory()
        cls.fit(x_train, y_train)

        # on test
        pred = cls.predict(x_test)
        print("{} accuracy score test {}".format(label, accuracy_score(y_test, pred)))

        # on all
        pred = cls.predict(X)
        print("{} accuracy score all {}".format(label, accuracy_score(y, pred)))
def runner():
    """Cross-validate on the Model's frame, then fit on all of it and score its test split."""
    m = Model()
    label_column = "tred_cutoff"
    X = m.df.drop(label_column, axis=1)
    Y = m.df[label_column]

    cv(X, Y, m.features + m.features_2)

    final_model = XGBClassifier()
    final_model.fit(X, Y)
    y_pred = final_model.predict(m.X_test)

    # Compute every metric first, then report, keeping the original order.
    report = [
        ("test f1", f1_score(m.y_test, y_pred)),
        ("test precision", precision_score(m.y_test, y_pred)),
        ("test recall", recall_score(m.y_test, y_pred)),
    ]
    for caption, value in report:
        print(caption, value)
def trainXGB(data_subset):
    """Train and evaluate XGBoost on one data subset for every configured parameter set.

    Reads the module-level ``data`` and ``params`` and writes one result
    record per parameter set to the module-level file handle ``f``.
    Parameter sets whose objective does not match the subset (binary versus
    non-binary) are skipped.

    Args:
        data_subset: key into ``data``; the value ``'binary'`` selects the
            binary-classification subset.
    """
    f.write('\nTraining XGB:' + '\n')
    subset = data[data_subset]
    X_train = subset['X_train']
    X_test = subset['X_test']
    y_train = subset['y_train']
    y_test = subset['y_test']

    for p in params['xgboost']:
        if data_subset != 'binary' and p['objective'] == 'binary:logistic':
            print("Skip using non-binary data with XGB binary:logistic objective")
            continue
        if data_subset == 'binary' and p['objective'] != 'binary:logistic':
            print("Skip using binary data with XGB multi:* objective")
            continue

        header = "@ subset: {0}, params: {1}".format(data_subset, p)
        f.write('\n' + header + '\n')

        objective = p['objective']
        max_depth = p['max_depth']
        n_estimators = p.get('n_estimators', 100)

        model = XGBClassifier(objective=objective, max_depth=max_depth,
                              n_estimators=n_estimators)

        start = time.time()
        model.fit(X_train, y_train)
        elapsed_train = time.time() - start

        # Restart the clock so the prediction time excludes the training time.
        start = time.time()
        y_pred = model.predict(X_test).astype(int)
        elapsed_predict = time.time() - start

        accuracy = accuracy_score(y_test, y_pred)
        precision, recall, fscore, support = precision_recall_fscore_support(
            y_test, y_pred, pos_label=2, average='weighted')

        print("\n{5}\nXGB with {0} objective, {6} max_depth, {7} n_estimators on data subset {1} trained in {2} seconds and predicted in {3} seconds with an accuracy of {4}\n".format(objective, data_subset, elapsed_train, elapsed_predict, accuracy, header, max_depth, n_estimators))

        # Every field is comma-separated; the original fused the prediction
        # time and the accuracy into a single token.
        f.write(', '.join(str(value) for value in (
            elapsed_train, elapsed_predict, accuracy,
            precision, recall, fscore, support)))
def get_thresh(model, train, test, label_test, label_train):
    """Return the feature-importance threshold with the best test ROC AUC.

    Each distinct feature importance of the fitted ``model`` is used as a
    ``SelectFromModel`` threshold. A fresh ``XGBClassifier`` is trained on the
    selected training columns and scored on the selected test columns.

    Args:
        model: fitted ``XGBClassifier`` (subclasses are accepted).
        train, test: feature matrices; ``test`` may not be longer than ``train``.
        label_test, label_train: single-column labels; the AUC scoring assumes
            a binary target.

    Returns:
        The best threshold, or ``0`` if no threshold scores above 0.

    Raises:
        TypeError: on inconsistent sizes, a non-XGBClassifier model, or
            multi-column labels (exception type kept for existing callers).
    """
    if (len(test) > len(train)) or (len(label_test) > len(label_train)):
        raise TypeError('Invalid train and test size')
    # isinstance avoids building a throwaway estimator just to compare types.
    if not isinstance(model, XGBClassifier):
        raise TypeError('Invalid model passed')
    if (pd.DataFrame(label_train).shape[1] > 1) or (pd.DataFrame(label_test).shape[1] > 1):
        raise TypeError('Multiple columns in label, Invalid shape.')

    max_score = 0
    thrsh = 0
    # np.unique sorts ascending like np.sort but drops repeated importances,
    # which would select the same columns and only repeat the same fit.
    for thresh in np.unique(model.feature_importances_):
        selection = feature_selection.SelectFromModel(model, threshold=thresh, prefit=True)
        select_X_train = selection.transform(train)

        selection_model = XGBClassifier()
        selection_model.fit(select_X_train, label_train)

        # ROC AUC needs a ranking score: use the positive-class probability
        # rather than the hard labels returned by predict().
        select_X_test = selection.transform(test)
        y_score = selection_model.predict_proba(select_X_test)[:, 1]
        scr = metrics.roc_auc_score(label_test, y_score)

        if scr > max_score:
            max_score = scr
            thrsh = thresh
    return thrsh
# Random Forest
from sklearn.ensemble import RandomForestClassifier
model_rf = RandomForestClassifier()
model_rf.fit(X_train, Y_train)
predicted_rf = model_rf.predict(X_test)
print("RandomForest",metrics.accuracy_score(Y_test, predicted_rf),"\n")

# AdaBoost
from sklearn.ensemble import AdaBoostClassifier
model_ab = AdaBoostClassifier()
model_ab.fit(X_train, Y_train)
predicted_ab = model_ab.predict(X_test)
print("AdaBoost",metrics.accuracy_score(Y_test, predicted_ab),"\n")

# K-NN
from sklearn.neighbors import KNeighborsClassifier
model_knn = KNeighborsClassifier()
model_knn.fit(X_train, Y_train)
predicted_knn = model_knn.predict(X_test)
print("K-NN",metrics.accuracy_score(Y_test, predicted_knn),"\n")

# The remaining models only run for the matching value of cond01.
if cond01 == 1:
    # XGBoost
    from xgboost import XGBClassifier
    model_xgb = XGBClassifier()
    model_xgb.fit(X_train, Y_train)
    predicted_xgb = model_xgb.predict(X_test)
    print("XGBoost",metrics.accuracy_score(Y_test, predicted_xgb),"\n")

if cond01 == 2:
    # Logistic Regression
    from sklearn.linear_model import LogisticRegression
    model_lr = LogisticRegression()
    model_lr.fit(X_train, Y_train)
    predicted_lr = model_lr.predict(X_test)
    print("LogisticRegression",metrics.accuracy_score(Y_test, predicted_lr),"\n")
    #aa = model_lr.coef_

if cond01 == 3:
    # Gaussian Naive Bayes
    from sklearn.naive_bayes import GaussianNB
    model_nb = GaussianNB()
    model_nb.fit(X_train, Y_train)
    predicted_nb = model_nb.predict(X_test)
    print("Gaussian Naive Bayes",metrics.accuracy_score(Y_test, predicted_nb),"\n")

if cond01 == 4:
    # GradientBoosting
    from sklearn.ensemble import GradientBoostingClassifier
    model_gb = GradientBoostingClassifier()
    model_gb.fit(X_train, Y_train)
class MLQRMine(object):
    """Bundle a CSV dataset with a set of exploratory machine-learning helpers.

    The CSV is expected to carry a title/identifier in its first column and
    the dependent variable (DV) in its last column; the columns in between
    are treated as independent variables (IVs).
    """

    def __init__(self):
        self._seed = randint(1, 9)
        self._csvfile = ""
        self._titles = None
        self._dataset = None
        self._X = None
        self._y = None
        self._X_original = None
        self._y_original = None
        self._dataset_original = None
        self._model = Sequential()
        self._sc = StandardScaler()
        self._vnum = 0  # Number of variables
        self._classifier = XGBClassifier()
        self._epochs = 10
        self._samplesize = 0
        self._clusters = None

    @property
    def seed(self):
        return self._seed

    @property
    def csvfile(self):
        return self._csvfile

    @property
    def dataset(self):
        return self._dataset

    @property
    def model(self):
        return self._model

    @property
    def epochs(self):
        return self._epochs

    @property
    def X(self):
        return self._X

    @property
    def y(self):
        return self._y

    @property
    def titles(self):
        return self._titles

    @property
    def head(self):
        # Returns the dataset's bound ``head`` method (not its result), so
        # callers use ``obj.head()`` / ``obj.head(n)``.
        return self._dataset.head

    # Getters should be before setters*
    @epochs.setter
    def epochs(self, epochs):
        self._epochs = epochs

    @seed.setter
    def seed(self, seed):
        self._seed = seed

    @csvfile.setter
    def csvfile(self, csvfile):
        self._csvfile = csvfile

    @titles.setter
    def titles(self, titles):
        self._titles = titles

    # Functions
    def read_csv(self):
        """Load ``csvfile`` into the dataset, restricted to ``titles`` when set."""
        # Inside the method body ``read_csv`` resolves to the module-level
        # (pandas) function, not to this method.
        if self._titles is not None:
            self._dataset = read_csv(self._csvfile, usecols=self._titles)
        else:
            self._dataset = read_csv(self._csvfile)

    def mark_missing(self):
        """Drop rows containing empty strings or NaN, keeping the original frame."""
        self._dataset_original = self._dataset
        # ``numpy.nan`` is the canonical spelling; the ``numpy.NaN`` alias was
        # removed in NumPy 2.0.
        self._dataset = self._dataset.replace('', numpy.nan)
        self._dataset.dropna(inplace=True)

    def restore_mark_missing(self):
        """Undo ``mark_missing`` by restoring the saved original frame."""
        self._dataset = self._dataset_original

    def get_shape(self):
        """Return the ``(rows, columns)`` shape of the current dataset."""
        return self._dataset.shape

    def read_xy(self):
        """Split the dataset into the IV matrix ``X`` and the DV vector ``y``.

        The actual number of IVs is ``vnum - 2`` because the first column is
        the title and the last one is the DV. More details on numpy array
        slicing:
        https://stackoverflow.com/questions/34007632/how-to-remove-a-column-in-a-numpy-array/34008274
        """
        (self._samplesize, vnum) = self._dataset.shape
        # Last column in the csv should be the DV and first one is title
        # (so get the number of variables).
        self._vnum = vnum - 2
        # Slice into IVs and DV; the first column (title) is ignored.
        values = self._dataset.values
        self._X = values[:, 1:vnum - 1]
        self._y = values[:, vnum - 1]

    def oversample(self):
        """Randomly oversample the minority class, keeping the original X and y."""
        self._X_original = self._X
        self._y_original = self._y
        ros = RandomOverSampler(random_state=0)
        # imbalanced-learn renamed ``fit_sample`` to ``fit_resample``; use the
        # new name when present and fall back for old installations.
        resample = getattr(ros, "fit_resample", None) or ros.fit_sample
        X, y = resample(self._X, self._y)
        self._X = X
        self._y = y

    def restore_oversample(self):
        """Undo ``oversample`` by restoring the saved X and y."""
        self._X = self._X_original
        self._y = self._y_original

    def prepare_data(self, oversample=False):
        """Read the CSV, drop incomplete rows, split X/y and optionally oversample."""
        self.read_csv()
        self.mark_missing()
        self.read_xy()
        if oversample:
            self.oversample()

    def get_nnet_predictions(self):
        """Train the neural network and return rounded predictions.

        Predictions are made on the pre-oversampling data when ``oversample``
        was used, otherwise on the current ``X``.
        """
        # Build and compile the network only once; a second call would
        # otherwise stack another three layers on the same model.
        if not self._model.layers:
            self._model.add(Dense(12, input_dim=self._vnum, kernel_initializer='uniform', activation='relu'))
            self._model.add(Dense(8, kernel_initializer='uniform', activation='relu'))
            self._model.add(Dense(1, kernel_initializer='uniform', activation='sigmoid'))
            # Compile model
            self._model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        # Fit the model
        self._model.fit(self._X, self._y, epochs=self._epochs, batch_size=10, verbose=2)
        # ``_X_original`` is only set by oversample(); without it the original
        # code passed None to predict().
        inputs = self._X_original if self._X_original is not None else self._X
        predictions = self._model.predict(inputs)
        # Round predictions
        rounded = [round(x[0]) for x in predictions]
        return rounded

    def get_nnet_scores(self):
        """Return the network's evaluation result on the current X and y."""
        return self._model.evaluate(self._X, self._y)

    def svm_confusion_matrix(self):
        """Fit a linear SVM on a 75/25 split and return the test confusion matrix."""
        X_train, X_test, y_train, y_test = train_test_split(self._X, self._y, test_size=0.25, random_state=0)
        X_train = self._sc.fit_transform(X_train)
        X_test = self._sc.transform(X_test)
        classifier = SVC(kernel='linear', random_state=0)
        classifier.fit(X_train, y_train)
        y_pred = classifier.predict(X_test)
        return confusion_matrix(y_test, y_pred)

    # https://stackoverflow.com/questions/45419203/python-numpy-extracting-a-row-from-an-array
    def knn_search(self, n=3, r=3):
        """Return the indices of the ``n`` nearest neighbours of record ``r`` (1-based)."""
        kdt = KDTree(self._X, leaf_size=2, metric='euclidean')
        dist, ind = kdt.query(self._X[r - 1:r, :], k=n)
        return ind

    def get_kmeans(self, c=5):
        """Cluster X into ``c`` k-means clusters, print each cluster, return the labels."""
        kmeans = KMeans(n_clusters=c, init='k-means++', random_state=42)
        y_kmeans = kmeans.fit_predict(self._X)
        self._clusters = y_kmeans
        self.get_centroids(c)
        return y_kmeans

    def get_centroids(self, c=1):
        """Print the size, members and column means of clusters ``0 .. c-1``."""
        # NOTE(review): rows are looked up in ``_dataset`` by position, so this
        # presumably expects ``_X`` not to have been oversampled - confirm.
        for x in range(0, c):
            print("Cluster: ", x)
            cluster_list = [position for position, cluster in enumerate(self._clusters)
                            if cluster == x]
            print("Cluster Length: ", len(cluster_list))
            print("Cluster Members")
            print(self._dataset.iloc[cluster_list, :])
            print("Mean")
            print(self._dataset.iloc[cluster_list, :].mean(axis=0))

    def encode_categorical(self):
        """One-hot encode column 1 of X and drop the first encoded column.

        TODO: This is not working yet. Use the ColumnTransformer instead of
        categorical_features.
        """
        # labelencoder_X_1 = LabelEncoder()
        # self._X[:, 1] = labelencoder_X_1.fit_transform(self._X[:, 1])
        # labelencoder_X_2 = LabelEncoder()
        # self._X[:, 2] = labelencoder_X_2.fit_transform(self._X[:, 2])
        onehotencoder = OneHotEncoder(categorical_features=[1])
        X = onehotencoder.fit_transform(self._X).toarray()
        X = X[:, 1:]
        print(X)
        return X

    def get_association(self):
        """Fit the XGBoost classifier on a 75/25 split and return the test confusion matrix."""
        X_train, X_test, y_train, y_test = train_test_split(self._X, self._y, test_size=0.25, random_state=0)
        self._classifier.fit(X_train, y_train)
        # Predicting the Test set results
        y_pred = self._classifier.predict(X_test)
        return confusion_matrix(y_test, y_pred)

    def get_apriori(self):
        """Return association rules (lift >= 1) mined from the encoded X."""
        frequent_itemsets = apriori(self.encode_categorical(), min_support=0.07, use_colnames=True)
        rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
        return rules

    def get_pca(self, n=3):
        """Print a PCA of the standardised X and the projection matrix W.

        Args:
            n: number of principal components in W. It is clamped to the
                range ``1 .. number of features``. (The original only built
                W for n in 2..5 and silently used two components otherwise.)
        """
        # https://plot.ly/~notebook_demo/264/about-the-author-some-of-sebastian-rasc/#/
        X_std = StandardScaler().fit_transform(self._X)
        (recs, factors) = X_std.shape
        cov_mat = numpy.cov(X_std.T)
        print('Covariance matrix: \n%s' % cov_mat)
        eig_vals, eig_vecs = numpy.linalg.eig(cov_mat)
        print('Eigenvectors \n%s' % eig_vecs)
        print('\nEigenvalues \n%s' % eig_vals)
        # Make a list of (eigenvalue, eigenvector) tuples
        eig_pairs = [(numpy.abs(eig_vals[i]), eig_vecs[:, i]) for i in range(len(eig_vals))]
        # Sort from high to low on the eigenvalue only; comparing whole tuples
        # would fall through to comparing the vectors when two values tie.
        eig_pairs.sort(key=lambda pair: pair[0], reverse=True)
        # Visually confirm that the list is correctly sorted by decreasing eigenvalues
        print('Eigenvalues in descending order:')
        for i in eig_pairs:
            print(i[0])
        # Variance explained
        tot = sum(eig_vals)
        var_exp = [(i / tot) * 100 for i in sorted(eig_vals, reverse=True)]
        cum_var_exp = numpy.cumsum(var_exp)
        print("Variance explained: ", var_exp)
        print("Cumulative: ", cum_var_exp)
        # Adjust according to the number of features available.
        n = max(1, min(n, len(eig_vals)))
        matrix_w = numpy.hstack(
            [eig_pairs[i][1].reshape(factors, 1) for i in range(n)])
        print('Matrix W:\n', matrix_w)
# Report the outcome of the decision-tree search (dt_cv is built earlier in the script).
print("Best Parameters:",dt_cv.best_params_)
print("Best Score:",dt_cv.best_score_)

"""#### Random Forest"""

# Labels are flattened with np.ravel before fitting.
rf = RandomForestClassifier(n_estimators=200)
rf.fit(X_train,np.ravel(y_train))
print(classification_report(y_test,rf.predict(X_test)))

"""#### XGBoost"""

# NOTE(review): unlike the other models, y_train is passed here without np.ravel.
xgb = XGBClassifier(n_estimators=200)
xgb.fit(X_train,y_train)
print(classification_report(y_test,xgb.predict(X_test)))

# SVM grid search: RBF kernel over gamma and C, linear kernel over C only,
# selecting the best candidate by precision.
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]}, {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
svm_cv = GridSearchCV( SVC(), tuned_parameters, scoring='precision' )
svm_cv.fit(X_train, np.ravel(y_train))
print(classification_report(y_test,svm_cv.predict(X_test)))

# Start of a Keras network: a 22-feature input followed by two 16-unit swish layers.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Flatten(input_shape=(22,)))
model.add(tf.keras.layers.Dense(16,activation="swish"))
model.add(tf.keras.layers.Dense(16,activation="swish"))
# Encoding categorical data from sklearn.preprocessing import LabelEncoder, OneHotEncoder labelencoder_X_1 = LabelEncoder() X[:, 1] = labelencoder_X_1.fit_transform(X[:, 1]) labelencoder_X_2 = LabelEncoder() X[:, 2] = labelencoder_X_2.fit_transform(X[:, 2]) onehotencoder = OneHotEncoder(categorical_features = [1]) X = onehotencoder.fit_transform(X).toarray() X = X[:, 1:] # Splitting the dataset into the Training set and Test set from sklearn.model_selection import train_test_split X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0) # Fitting XGBoost to the Training set from xgboost import XGBClassifier classifier = XGBClassifier() classifier.fit(X_train, y_train) # Predicting the Test set results y_pred = classifier.predict(X_test) # Making the Confusion Matrix from sklearn.metrics import confusion_matrix cm = confusion_matrix(y_test, y_pred) # Applying k-Fold Cross Validation from sklearn.model_selection import cross_val_score accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 10) accuracies.mean() accuracies.std()
# Persist the "what to buy" features computed earlier in the script.
features_to_csv(file_what_to_buy_features, what_to_buy.values())

# Reuse the cached "buy or not" features when the file exists, otherwise
# extract them from the grouped clicks and cache them for the next run.
if os.path.isfile(file_buy_or_not_features):
    buy_or_not = np.load(file_buy_or_not_features)
else:
    buy_or_not = extract_buy_or_not(clicks_grouped_by_session_id, what_to_buy)
    np.save(file_buy_or_not_features, buy_or_not)

# Build the training target from the buys file.
buys = read_buys(file_buys, usecols=effective_columns_names)
_, buys_grouped_by_session_id_keys = df_group_by_session_id(buys)
buys_result = extract_buys(clicks_grouped_by_session_id_keys, buys_grouped_by_session_id_keys)

# Hold out 20% of the sessions for validation (no fixed random_state).
buy_or_not_train, buy_or_not_val, buys_result_train, buys_result_val = train_test_split(buy_or_not, buys_result, test_size=0.2)
classifier = XGBClassifier(n_estimators=500, subsample=0.8, colsample_bytree=0.5, max_depth=4, min_child_weight=3)
classifier.fit(buy_or_not_train, buys_result_train)

# Score the validation split and write the scores out.
predictions_val = classifier.predict(buy_or_not_val)
scores = metrics(buys_result_val, predictions_val)
write_metrics(file_scores, scores)

# Run the same feature extraction on the test clicks and write the predictions.
test = read_clicks(file_test, usecols=effective_columns_names)
test_grouped_by_session_id, _ = df_group_by_session_id(test)
what_to_buy_test = extract_what_to_buy(test_grouped_by_session_id)
buy_or_not_test = extract_buy_or_not(test_grouped_by_session_id, what_to_buy_test)
predictions_test = classifier.predict(buy_or_not_test)
write_predictions(file_result, predictions_test)
model = XGBClassifier(reg_lambda=1, reg_alpha=3, max_depth=3, n_estimators=200)
model.fit(X_train, Y_train)


def accuracy(X, Y):
    """Return the accuracy of the module-level ``model`` on ``X`` against ``Y``."""
    rounded = [round(value) for value in model.predict(X)]
    return accuracy_score(Y, rounded)


out_of_sample = accuracy(X_test, Y_test)
in_sample = accuracy(X_train, Y_train)
print('Accuracy in sample: {:04.2f}%'.format(in_sample * 100))
print('Accuracy out of sample: {:04.2f}%'.format(out_of_sample * 100))

# The validation set is optional.
if parse.validation_set is not None:
    validate = accuracy(X_validate, Y_validate)
    print('Accuracy for validation: {:04.2f}%'.format(validate * 100))

# Optionally dump (prediction, truth) pairs as two-column CSV files.
if parse.save is True:
    test_prediction = model.predict(X_test)
    test_outcome = column_stack((test_prediction, Y_test))
    savetxt('./predictions/xgb_test_outcome.csv', test_outcome, delimiter=',')
    if parse.validation_set is not None:
        validation_prediction = model.predict(X_validate)
        validation_outcome = column_stack((validation_prediction, Y_validate))
        savetxt('./predictions/xgb_validation_outcome.csv', validation_outcome, delimiter=',')
def gen_train(): global x_train global y_train x_train, y_train = [], [] for i in threads: for thread in threads[i]: x_train.append(thread) y_train.append(i) x_train, y_train = np.array(x_train), np.array(y_train) while len_threads < 729: print('#', len_threads) gen_train() model.fit(x_train, y_train) y_pred = model.predict(X) accuracy = accuracy_score(Y, y_pred) print('Accuracy: {}'.format(accuracy)) #print(y_pred) print(dict(collections.Counter(y_pred))) find = 1 if len(threads[0]) > len(threads[1]) else 0 num_proc = 6 threads_arr = {} for i in range(num_proc): threads_arr[i] = [] pi_id = 0 for i in range(len(y_pred)): if y_pred[i] == 0:
precision = [] specificity = [] recall = [] f_measure = [] auc = [] kf = StratifiedKFold(n_splits=10) #%% ## timeit -n1 -r1 for train_index, val_index in kf.split(X_train_cv, y_train_cv): X_train, X_val = X_train_cv[train_index], X_train_cv[val_index] y_train, y_val = y_train_cv[train_index], y_train_cv[val_index] clf.fit(X_train, y_train, eval_metric='auc') clf_pred_proba = clf.predict_proba(X_val) clf_pred = clf.predict(X_val) acc, prec, rec, spec, f_m = calcula_scores(y_val, clf_pred) auc.append(roc_auc_score(y_val, clf_pred_proba[:, 1])) accuracy.append(acc) precision.append(prec) specificity.append(spec) recall.append(rec) f_measure.append(f_m) print("XGBoost AUC: \n\tMédia: {:.3f}\n\tDesvio: {:.3f}".format( statistics.mean(auc), statistics.stdev(auc))) print("XGBoost Accuracy: \n\tMédia: {:.3f}\n\tDesvio: {:.3f}".format( statistics.mean(accuracy), statistics.stdev(accuracy))) print("XGBoost Precision: \n\tMédia: {:.3f}\n\tDesvio: {:.3f}".format(
# Loading the dataset
dataset = pd.read_csv("creditcard.csv")

# Train/test split: every column but the last is a feature, 'Class' is the label.
X_train, X_test, Y_train, Y_test = train_test_split(
    dataset.iloc[:, :-1].values, dataset['Class'].values, test_size=0.12221)

xbg = XGBClassifier(
    max_depth=5,
    learning_rate=0.1,
    n_estimators=1500,
    n_jobs=6,
)
xbg.fit(X_train, Y_train)

Y_test_pred = xbg.predict(X_test)
Y_train_pred = xbg.predict(X_train)

# Count the correct predictions over the arrays' real lengths. The original
# looped over hard-coded sizes (250000 / 34807), which only match one specific
# dataset and split.
count_train = int((Y_train == Y_train_pred).sum())
count_test = int((Y_test == Y_test_pred).sum())
# float() keeps this a true division on Python 2 as well.
print("Train Percentage: %.7f\nTest Percentage: %.7f" %
      (count_train / float(len(Y_train)), count_test / float(len(Y_test))))

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm_train = confusion_matrix(Y_train, Y_train_pred)
from xgboost import XGBClassifier

# Hyper-parameters of the boosted-tree classifier, gathered in one mapping.
xgb_params = {
    'learning_rate': 0.01,
    'n_estimators': 1000,
    'max_depth': 10,
    'min_child_weight': 4,
    'gamma': 0.1,
    'subsample': 0.3,
    'colsample_bytree': 0.8,
    'objective': 'binary:logistic',
    'nthread': 4,
    'scale_pos_weight': 3,
    'seed': 27,
    'reg_alpha': 0.001,
    'n_jobs': 4,
}
xgb = XGBClassifier(**xgb_params)
xgb.fit(X_train, y_train)

y_pred6 = xgb.predict(X_test)
accuracy = accuracy_score(y_test, y_pred6)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm_xg = confusion_matrix(y_test, y_pred6)
print(cm_xg)
# With a learning_rate of 0.01 and some other parameters we achieved a better
# accuracy score compared to other models.
# coding: utf-8
import numpy as np
import pandas as pd
import os
import sys
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Initialize dataframe: every column except 'status' is a feature, and the
# first remaining column is skipped; 'status' is the label.
df = pd.read_csv("parkinsons.data")
features = df.loc[:, df.columns != 'status'].values[:, 1:]
labels = df.loc[:, 'status'].values
#print(labels[labels==1].shape[0], labels[labels==0].shape[0])
y = labels

# Split first. The rows chosen depend only on the sample count and the seed,
# so this is the same split as before.
x_train_raw, x_test_raw, y_train, y_test = train_test_split(
    features, y, test_size=0.2, random_state=7)

# Fit MinMaxScaler on the training rows only and reuse it for the test rows;
# fitting it on the full dataset leaks test-set minima/maxima into training.
scaler = MinMaxScaler((-1, 1))
x_train = scaler.fit_transform(x_train_raw)
x_test = scaler.transform(x_test_raw)

# Train model
model = XGBClassifier()
model.fit(x_train, y_train)

# Predict and report accuracy as a percentage
y_pred = model.predict(x_test)
print(accuracy_score(y_test, y_pred)*100)
left_index=True, right_index=True) X_ = pd.merge(hbsDF, X_, left_index=True, right_index=True) return X_ X_ = process(X) idx = X_.dropna( ).index #some entry being nan in heartbeat rate, only 4 of them anyway, i didn't bother much #one pass evaluation for model, with oversampling X_train, X_test, y_train, y_test = train_test_split(X_.iloc[idx], y[idx]) ros = RandomOverSampler(random_state=0) X_resampled, y_resampled = ros.fit_resample(X_train, y_train) clf.fit(X_resampled, y_resampled, eval_metric=f1_score) pred = clf.predict(X_test.values) scores = f1_score(y_test, pred, average='micro') print(scores) #cross validation, got 0.7x for cv=3, 5, 10. #did not apply oversampling, #adding a pipeline in cross validation is doable but i am too lazy from sklearn.model_selection import cross_val_score scores = cross_val_score(clf, X_.iloc[idx], y[idx], cv=3, scoring='f1_micro', verbose=1000) print(scores)
# ``sklearn.cross_validation`` and ``sklearn.grid_search`` no longer exist in
# current scikit-learn; both APIs live in ``sklearn.model_selection``. The old
# modules are kept as a fallback for very old installations.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score

# 5-fold CV means for both models (values are only shown interactively).
cross_val_score(rfc, X_train, y_train, cv=5).mean()
cross_val_score(xgbc, X_train, y_train, cv=5).mean()

# Random forest submission
rfc.fit(X_train, y_train)
rfc_y_predict = rfc.predict(X_test)
rfc_submission = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': rfc_y_predict})
rfc_submission.to_csv('rfc_submission.csv', index=False)

# XGBoost submission
xgbc.fit(X_train, y_train)
xgbc_y_predict = xgbc.predict(X_test)
xgbc_submission = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': xgbc_y_predict})
xgbc_submission.to_csv('xgbc_submission.csv', index=False)

try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    from sklearn.grid_search import GridSearchCV

# Grid search over tree depth, number of trees and learning rate.
params = {'max_depth': range(2, 7), 'n_estimators': range(100, 1100, 200), 'learning_rate': [0.05, 0.1, 0.25, 0.5, 1.0]}
xgbc_best = XGBClassifier()
gs = GridSearchCV(xgbc_best, params, n_jobs=-1, cv=5, verbose=1)
gs.fit(X_train, y_train)
xgbc_best_y_predict = gs.predict(X_test)
# Each section below rebinds `classifier`; only the last one fitted (the
# RBF-kernel SVC) is still bound when the grid search is configured.
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
classifier.fit(X_train, y_train)

from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
classifier.fit(X_train, y_train)

from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(criterion='entropy',random_state=0)
classifier.fit(X_train,y_train)

from sklearn.svm import SVC
classifier = SVC(kernel= 'rbf', random_state=0)
classifier.fit(X_train,y_train)

"""
# Predicting the Test set results
y_pred = classifier.predict(X_test)
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=10)
print("Accuracy: {:.2f} %".format(accuracies.mean()))
print("Standar Deviation: {:.2f} %".format(accuracies.std() * 100))
"""

# Applying GridSearch to find the best model and the best parameters:
# a linear kernel over C, and an RBF kernel over C and gamma.
from sklearn.model_selection import GridSearchCV
parameters=[{'C': [0.25, 0.5, 0.75, 1], 'kernel': ['linear']}, {'C': [0.25, 0.5, 0.75, 1], 'kernel': ['rbf'], 'gamma':[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]}]
grid_search=GridSearchCV(estimator=classifier, param_grid=parameters, scoring='accuracy', cv=10, n_jobs=-1)
feature_imp # Visualize import matplotlib.pyplot as plt import seaborn as sns # Creating a bar plot sns.barplot(x=feature_imp, y=feature_imp.index) # Add labels to your graph plt.xlabel('Feature Importance Score') plt.ylabel('Features') plt.title("Visualizing Important Features") plt.legend() plt.show() # prediction on test set y_pred = model.predict(X_test) #Import scikit-learn metrics module for accuracy calculation from sklearn import metrics # Model Accuracy, how often is the classifier correct? print("Accuracy:", metrics.accuracy_score(y_test, y_pred)) submission = test.copy() submission['Cover_Type'] = model.predict(submission[features_selected]) #submission.info(memory_usage='deep') submission[['Id', 'Cover_Type']].to_csv('submission{0}.csv'.format(10), index=False)
#split dataset and make binary dependent full_data = hstack((title_S, hasNamedEntity_S, hasNumbers_S, hasSubTitle_S, title_lengths_S, polarity_S, subjectivity_S)) X_train_final, X_test_final, y_train_final, y_test_final = train_test_split( full_data, dat[target], test_size=0.15, random_state=123) y_train_dich_final = [0 if i <= cutoff else 1 for i in y_train_final] y_test_dich_final = [0 if i <= cutoff else 1 for i in y_test_final] #fit model model = XGBClassifier(objective='binary:logistic', booster='gbtree', learning_rate=0.3, max_depth=20, subsample=0.7) model.fit(X_train_final, y_train_dich_final) # get predictions preds = model.predict(X_test_final) probs = model.predict_proba(X_test_final) # compute metrics and print ROC curve print(accuracy_score(y_test_dich_final, preds)) print(confusion_matrix(y_test_dich_final, preds)) auc = roc_auc_score(y_test_dich_final, probs[:, 1]) fpr, tpr, thresholds = roc_curve(y_test_dich_final, probs[:, 1]) print(auc) plot_roc_curve(fpr, tpr, auc) ########################################################### ########################################################### ################ FEATURE IMPORTANCE ################### ########################################################### ###########################################################
print("# train data points: {}".format(X_train.shape[0]))
print("# val data points : {}".format(X_val.shape[0]))


def gen_test_labels(model, test_data, name="output"):
    """Predict labels for ``test_data`` and write them to ``data/<name>.csv``.

    The file has a header row ``ID,Label``; ``ID`` is the zero-based row
    position of the sample and ``Label`` is the rounded prediction.

    Args:
        model: fitted estimator exposing ``predict``.
        test_data: samples to label.
        name: file name without extension, created inside ``data/``.
    """
    predictions = [round(value) for value in model.predict(test_data)]
    # The csv module asks for newline='' so that it controls line endings
    # itself; without it every row is followed by a blank line on Windows.
    with open('data/' + name + '.csv', 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=['ID', 'Label'])
        writer.writeheader()
        writer.writerows(
            {'ID': str(i), 'Label': str(pred)}
            for i, pred in enumerate(predictions))


# Model evaluated on the train / validation split.
model = XGBClassifier(n_estimators=2000, learning_rate=0.01, max_depth=2)
model.fit(X_train, y_train)

y_pred = model.predict(X_train)
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(y_train, predictions)
print("Train Accuracy: %.2f%%" % (accuracy * 100.0))

# Reported as "Test Accuracy" but measured on the validation split.
y_pred = model.predict(X_val)
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(y_val, predictions)
print("Test Accuracy: %.2f%%" % (accuracy * 100.0))

# Second model, fitted on `train_data` / `train_label` with more trees.
model = XGBClassifier(n_estimators=2500, learning_rate=0.01, max_depth=2)
model.fit(train_data, train_label)
# Project the validation and test sets with the already-fitted reducer.
pca_xtest = pd.DataFrame(dim_red.transform(xtest))
pca_df_test = pd.DataFrame(dim_red.transform(df_test_final))

# Check the shape of the datasets
print('Training Data(pca_xtrain):',pca_xtrain.shape)  #(3367, 6)
print('Validation Data(pca_xtest):', pca_xtest.shape)  #(842, 6)
print('Testing Data(pca_df_test):', pca_df_test.shape)  #(4209, 6)
print('Training_pred Label(ytrain):', ytrain.shape)  #(3367,)
print('Validation_pred Label(ytest):', ytest.shape)  #(842,)

# Applying XGBoost
# Create an XGBoost object
XGB_model = XGBClassifier()
# Fit the XGBoost model on the training set
XGB_model.fit(pca_xtrain,ytrain)
# For parameters to work with, refer the link below, thanks to AV
# https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/

# prediction on validation set
ypred = XGB_model.predict(pca_xtest)

# Prediction on test data.
# The return value is not stored, so in a plain script this line has no
# visible effect (a notebook would display it).
XGB_model.predict(pca_df_test)

# Metrics
# NOTE(review): these are regression metrics (MSE, R^2) applied to the
# output of a classifier - confirm that is intended.
# mean squared error of validation predictions vs. validation labels
print(mean_squared_error(ytest,ypred))
# R-Squared score
print(r2_score(ytest,ypred))
# Optimize model parameters.
# I run this code in Google Colab to make the execution much faster and use
# the best params in the next code.
param_grid = {
    'min_child_weight': [1, 5, 10],
    'gamma': [0.5, 1, 1.5, 2, 5],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'max_depth': [3, 4, 5]
}
my_model2 = GridSearchCV(my_model, param_grid)
my_model2.fit(X_Train, y_Train)
print(my_model2.best_params_)

from sklearn.metrics import confusion_matrix, accuracy_score

# Fit and evaluate a model with hard-coded hyper-parameters (presumably the
# ones reported by the search above - they are not read from
# `my_model2.best_params_`).
my_model3 = XGBClassifier(min_child_weight=1,
                          gamma=2,
                          subsample=0.6,
                          colsample_bytree=0.6,
                          max_depth=3)
my_model3.fit(X_Train, y_Train)
y_pred = my_model3.predict(X_val)

# Get error rate
print("Error rate of XGBoost: ", 1 - accuracy_score(y_val, y_pred))

# Get confusion matrix. scikit-learn's signature is (y_true, y_pred): rows
# are true classes, columns are predicted classes. The original call passed
# the arguments in the opposite order, which transposes the matrix.
confusion_matrix(y_val, y_pred)
xgb2 = XGBClassifier(learning_rate=0.3, n_estimators=700, max_depth=2, n_jobs=-1, colsample_bytree=0.1, random_state=4218)

# Let's look at the error to compare with the baseline; then we will update
# the model and get predictions.
eval_set = [(x_val, y_val)]
eval_metric = ["aucpr","error"]
# `%time` is an IPython magic: this line only runs inside IPython/Jupyter.
%time xgb2.fit(x_train, y_train, eval_metric=eval_metric, eval_set=eval_set, verbose=2)

# It looks like we will keep our parameters. The error was lower with our
# tuning and we can move forward with predictions.
# Fit again, this time without eval_set / eval_metric.
xgb2.fit(x_train, y_train)

# Let's get the tuned predictions for the train and validation sets: the
# predicted probabilities (column 1) used for the AUC.
xgb2predprob_train = xgb2.predict_proba(x_train)[:, 1]
xgb2predprob_val = xgb2.predict_proba(x_val)[:, 1]

# The hard class predictions, used for the F1 scores and for recall and
# precision if we want them.
xgb2preds_train = xgb2.predict(x_train)
xgb2preds_val = xgb2.predict(x_val)

# Results from the tuned model (train first, then validation, for each metric).
print ('F1 Score',f1_score(y_train, xgb2preds_train))
print ('F1 Score',f1_score(y_val, xgb2preds_val))
print ('ROC AUC Score',roc_auc_score(y_train,xgb2predprob_train))
print ('ROC AUC Score',roc_auc_score(y_val,xgb2predprob_val))

# Let's look at the confusion matrices and classification reports to help us
# figure out what we may have missed.
print ('Training Confusion Matrix',confusion_matrix(y_train, xgb2preds_train))
print ('Val Confusion Matrix',confusion_matrix(y_val, xgb2preds_val))
print ('Training Classification report',classification_report(y_train, xgb2preds_train))
print ('Val Classification Report',classification_report(y_val, xgb2preds_val))

# Overall, the tuned F1 scores on the validation set were:
# LR = 79.07% vs. RF = 91.56% vs. XGB = 93.67% (when using the eval method;
# otherwise it was 89.4364% with an error of 0.0625).
# The AUC scores on the validation set were:
# LR = 78.43% vs. RF = 92.92% vs. XGB = 91.56%.
# Overall, the XGBoost model performed the best: it had the highest F1 score
# even though its AUC score was lower than RF's.
def XGBoostCheck():
    """Train an XGBoost classifier on PCA-reduced samples and report hit rates.

    Steps, as implemented below:

    1. Shuffle the stock ids returned by
       ``utility.GetShangHaiStockIdStrArray()`` and keep the first 100.
    2. Collect positive (label 1) and negative (label 0) samples yielded by
       ``GetTrainningData`` for 2015-06-01 .. 2017-06-01.
    3. Reduce the samples to 75 PCA components, shuffle rows together with
       their labels, and fit the classifier.
    4. For every item yielded for 2017-06-02 .. 2018-06-01, print the share
       of positive samples predicted as 1 and of negative samples predicted
       as 0.

    Returns nothing; all results are printed.
    """
    xgbModel = XGBClassifier()
    ids = utility.GetShangHaiStockIdStrArray()
    np.random.shuffle(ids)
    ids = ids[:100]

    testCheckStartDate = datetime.strptime("2015-06-01", "%Y-%m-%d")
    testCheckEndDate = datetime.strptime("2017-06-1", "%Y-%m-%d")

    trainDatas = []
    labelDatas = []
    for trainData in GetTrainningData(ids, testCheckStartDate, testCheckEndDate, posNegEq=True):
        # Each item is indexed as (positive samples, negative samples, tag).
        print(trainData[2])
        posTrainData = trainData[0]
        negTrainData = trainData[1]
        if len(posTrainData) > 0:
            trainDatas += posTrainData
            labelDatas += np.ones(len(posTrainData)).tolist()
        if len(negTrainData) > 0:
            trainDatas += negTrainData
            labelDatas += np.zeros(len(negTrainData)).tolist()

    pca = PCA(n_components = 75)
    trainDatas = np.array(trainDatas)
    print(trainDatas.shape)
    trainDatas = pca.fit_transform(trainDatas)
    print(trainDatas.shape)
    labelDatas = np.array(labelDatas)
    labelDatas = np.reshape(labelDatas, (-1, 1))
    print(labelDatas.shape)

    # Glue the labels on as the last column so one shuffle keeps every row
    # aligned with its label.
    trianDataTmp = np.concatenate((trainDatas, labelDatas), axis=1)
    np.random.shuffle(trianDataTmp)
    # Features are every column but the last; the label is the last column.
    # The original sliced [:, 0:74] / [:, 74], which dropped one PCA
    # component and trained against the 75th component instead of the label.
    xgbModel.fit(trianDataTmp[:, :-1], trianDataTmp[:, -1])

    testTestCheckStartDate = datetime.strptime("2017-06-2", "%Y-%m-%d")
    testTestCheckEndDate = datetime.strptime("2018-06-01", "%Y-%m-%d")
    for trainData in GetTrainningData(ids, testTestCheckStartDate, testTestCheckEndDate):
        print("Predict:", trainData[2])
        posTrainData = trainData[0]
        negTrainData = trainData[1]
        # Transform only non-empty groups: PCA.transform is not called on an
        # empty sample list.
        if len(posTrainData) > 0:
            retLabels = xgbModel.predict(pca.transform(posTrainData))
            print(trainData[2], "Pos Count:", len(posTrainData), "pos predict rate:",
                  np.count_nonzero(retLabels == 1) / len(posTrainData))
        if len(negTrainData) > 0:
            retLabels = xgbModel.predict(pca.transform(negTrainData))
            print(trainData[2], "Neg Count:", len(negTrainData), "neg predict rate:",
                  np.count_nonzero(retLabels == 0) / len(negTrainData))
def main():
    """Train an XGBoost click model and write test-set probabilities to CSV.

    Reads the train/test CSV files, encodes the ``countrycode``,
    ``browserid`` and ``devid`` columns, fits ``XGBClassifier`` on the whole
    training table, prints an accuracy figure and saves ``ID`` plus the
    class-1 probability for every prediction row.
    """

    def encode_categoricals(frame):
        # countrycode -> code point of the value; browserid / devid ->
        # myfunc(value), with a fixed placeholder standing in for nulls.
        frame['countrycode'] = frame['countrycode'].apply(lambda code: ord(code))
        for column, placeholder in (('browserid', "unknown"), ('devid', "none")):
            frame[column] = frame[column].apply(
                lambda value, placeholder=placeholder:
                    myfunc(value) if np.all(pd.notnull(value)) else myfunc(placeholder))

    # Set seed for reproducibility
    np.random.seed(0)

    print("Loading data...")
    training_data = pd.read_csv('/home/vipin/Videos/train.csv', header=0)
    prediction_data = pd.read_csv('/home/vipin/Videos/test.csv', header=0)
    encode_categoricals(training_data)
    encode_categoricals(prediction_data)

    features = ['siteid', 'offerid', 'category', 'merchant',
                'countrycode', 'browserid', 'devid']
    X, Y = training_data[features], training_data["click"]
    x_prediction = prediction_data[features]
    ids = prediction_data["ID"]

    model = XGBClassifier()  # linear_model.LogisticRegression(n_jobs=-1)
    print("Training...")
    # The model is trained on the full training table.
    model.fit(X, Y)

    print("Predicting...")
    # NOTE(review): the hold-out below is carved out of rows the model was
    # already fitted on, so the printed accuracy is not an out-of-sample
    # estimate.
    _, X_test, _, y_test = train_test_split(X, Y, test_size=0.33, random_state=7)

    # Column 1 of predict_proba is kept as the probability to report.
    probabilities = model.predict_proba(x_prediction)[:, 1]
    joined = pd.DataFrame(ids).join(pd.DataFrame(data={'probability': probabilities}))

    accuracy = accuracy_score(y_test, model.predict(X_test))
    print("Accuracy: %.2f%%" % (accuracy * 100.0))

    print("Writing predictions to predictions.csv")
    # Save the predictions out to a CSV file
    joined.to_csv("/home/vipin/Videos/predictions.csv", index=False)
# Grid-search max_depth over 2..11 for the existing `model`.
max_depth=range(2, 12, 1)
param_grid = dict(max_depth=max_depth)
grid = GridSearchCV(estimator=model, param_grid=param_grid)
grid.fit(x2,y)
print(grid.best_score_)
print(grid.best_params_)

# Build the model using n_estimators as 20 and max_depth as 2
# NOTE(review): the objective is 'reg:logistic' on a classifier - confirm
# that 'binary:logistic' was not intended.
xgb1 = XGBClassifier(objective ='reg:logistic', learning_rate = 0.1, max_depth = 2, n_estimators = 20)
xgb1.fit(x2_train,y_train)

# Training accuracy: share of training rows whose prediction equals the label.
train_pred =xgb1.predict(x2_train)
import numpy as np
train_acc = np.mean(train_pred==y_train)
print(train_acc)

# Test accuracy, computed the same way.
test_pred=xgb1.predict(x2_test)
test_acc=np.mean(test_pred==y_test)
print(test_acc)

# Variable importance plot
from xgboost import plot_importance
plot_importance(xgb1)
def run_monthly(client, MonthGAP=1):
    """Evaluate a month-by-month, expanding-window prediction for each org.

    The first batch of an organisation is the initial training set. Every
    later non-empty batch is used as a test set: hyper-parameters are tuned
    one at a time with ``GridSearchCV`` on the current training data, stored
    through ``client``, a fresh ``XGBClassifier`` is trained and scored, and
    the batch is then appended to the training data. The overall accuracy
    across all test batches is printed at the end.

    Args:
        client: database client; ``client[org]['model']`` must support
            ``update(filter, update, upsert=True)``.
        MonthGAP: forwarded to ``load_data_monthly``; also stored as ``gap``
            with the tuned parameters.
    """
    data_dict, pullinfo_list_dict = load_data_monthly(
        ayonel_numerical_attr=ayonel_numerical_attr,
        ayonel_boolean_attr=ayonel_boolean_attr,
        ayonel_categorical_attr_handler=ayonel_categorical_attr_handler,
        MonthGAP=MonthGAP)

    for org, repo in [('dimagi', 'xxx')]:
        print(org + ",")
        batch_iter = data_dict[org]
        train_batch = next(batch_iter)
        train_X = np.array(train_batch[0])
        train_y = np.array(train_batch[1])

        predict_result = []
        actual_result = []
        round_no = 1  # was named `round`, which shadowed the builtin

        for batch in batch_iter:
            if len(batch[0]) == 0:
                # The test batch has no data; move on to the next batch.
                continue
            test_X = np.array(batch[0])
            test_y = np.array(batch[1])

            # Built per batch on purpose: iandfrange() is defined elsewhere
            # and may return a one-shot iterator.
            parameters = [
                ("criterion", ["gini", "entropy"]),
                ("max_features", ["auto", "sqrt", "log2"]),
                ("min_weight_fraction_leaf", iandfrange(0, 0.501, 0.05)),
                ("oob_score", [True, False]),
            ]
            tuned_params = {}  # parameters tuned so far
            # Tune one parameter at a time, keeping the earlier winners fixed.
            for param_name, candidates in parameters:
                estimator_rf = RandomForestClassifier(random_state=RANDOM_SEED, **tuned_params)
                clf = GridSearchCV(estimator=estimator_rf,
                                   param_grid={param_name: candidates},
                                   scoring="accuracy",
                                   cv=3)
                clf.fit(train_X, train_y)
                tuned_params = dict(tuned_params, **clf.best_params_)

            print(tuned_params)
            # Store the tuned parameters in the database.
            client[org]['model'].update(
                {
                    'round': round_no,
                    'model': 'randomforest',
                    'gap': MonthGAP
                }, {'$set': tuned_params},
                upsert=True)

            # NOTE(review): the parameters above are tuned on (and recorded
            # as) a random forest but are passed to XGBClassifier here -
            # confirm which estimator is intended.
            best_est = XGBClassifier(seed=RANDOM_SEED, **tuned_params)
            train(best_est, train_X, train_y)
            print(best_est.score(test_X, test_y))

            actual_result += test_y.tolist()  # ground truth
            predict_result += best_est.predict(test_X).tolist()  # predictions

            # Grow the training window with the batch just evaluated.
            train_X = np.concatenate((train_X, test_X))
            train_y = np.concatenate((train_y, test_y))
            round_no += 1

        # Overall accuracy; skipped when no test batch contained data (the
        # original divided by zero in that case).
        if actual_result:
            acc_num = sum(1 for actual, predicted in zip(actual_result, predict_result)
                          if actual == predicted)
            print(acc_num / len(actual_result))
# NOTE(review): `f` and `c` are defined above this chunk; the two statements
# below look like the tail of a loop over column names - re-indent to match
# the enclosing statement if so.
print(f)
c.append(f)
# Give train and test the same column names.
train.columns=c
test.columns=c


# In[14]:

#Part 2
model = XGBClassifier()
model.fit(train, label_train)


# In[15]:

label_pred=model.predict(test)


# In[16]:


# In[17]:

def get_thresh(model,train,test,label_test,label_train):
    """Validate the inputs of the threshold search.

    Only the argument checks are visible in this part of the file.

    Raises:
        TypeError: if the test data or labels are longer than the training
            ones, if ``model`` is not an ``XGBClassifier``, or if either
            label set has more than one column.
    """
    if (len(test)>len(train)) or (len(label_test)>len(label_train)):
        raise TypeError('Invalid train and test size')
    # NOTE(review): `model1` is not used in the visible part of the function.
    model1 = XGBClassifier()
    # isinstance() avoids building a throwaway classifier just to read its
    # type, and also accepts subclasses of XGBClassifier.
    if not isinstance(model, XGBClassifier):
        raise TypeError('Invalid model passed')
    if (pd.DataFrame(label_train).shape[1]>1) or (pd.DataFrame(label_test).shape[1]>1):
        raise TypeError('Multiple columns in label, Invalid shape.')
# NOTE(review): `l`, `result`, `param`, `X` and `my_X_tfidf` come from outside
# this chunk; the statements down to `result.append(...)` look like the body
# of a loop over type indicators - re-indent under its header if so.
# Column `l` of the binarised personality matrix is the target.
Y = list_personality[:, l]
print( "Training dataset-Binarized personality list for type indicator-------> ", Y)

# split data into train and test sets
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split( X, Y, test_size=test_size, random_state=seed)

# fit model on training data
model = XGBClassifier(**param)
model.fit(X_train, y_train)

# make predictions for my data
y_pred = model.predict(my_X_tfidf)
print("Predicted value-----> ", y_pred)
# Only the first prediction is kept.
result.append(y_pred[0])
# print("* %s prediction: %s" % (type_indicators[l], y_pred))

print("RESULT IS----", result)
print("TRANSLATED RESULT IS------", translate_back(result))
# x= translate_back(result)
# score=0
# for p in check:
#     if p in x:
#         score=score+10
# print("MY SCORE----------",score)
# # r+=[(score,use)]
# print("SORTED TWITTER IDs BASED ON THE SCORE")
from xgboost import XGBClassifier
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed
# in 0.20; prefer `model_selection` and fall back only on old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectFromModel

# load data
dataset = loadtxt('pima-indians-diabetes.csv', delimiter=",")
# split data into X (first 8 columns) and y (column 8)
X = dataset[:,0:8]
Y = dataset[:,8]
# split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=7)

# fit model on all training data
model = XGBClassifier()
model.fit(X_train, y_train)

# make predictions for test data and evaluate
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

# Fit a model using each feature importance as a selection threshold,
# from the smallest importance to the largest.
thresholds = sort(model.feature_importances_)
for thresh in thresholds:
    # select features using threshold; prefit=True reuses the fitted model
    selection = SelectFromModel(model, threshold=thresh, prefit=True)
    select_X_train = selection.transform(X_train)
    # train model on the reduced feature set
    selection_model = XGBClassifier()
    selection_model.fit(select_X_train, y_train)
    # eval model
    select_X_test = selection.transform(X_test)
    y_pred = selection_model.predict(select_X_test)
""" from data_loader import dataLoader from xgboost import XGBClassifier from sklearn.metrics import accuracy_score if __name__ == '__main__': # Get data X_train, X_val, y_train, y_val = dataLoader(test=False, optimize_set=True) y_train, y_val = y_train.values.ravel(), y_val.values.ravel() # Define baseline XGB classifier model with default parameters defualt_xgb = XGBClassifier() defualt_xgb.fit(X_train, y_train) default_predictions = defualt_xgb.predict(X_val) print('Default XGB model test accuracy', accuracy_score(y_val, default_predictions)) # Using early_stopping_rounds to determine best n_estimators number tuned_xgb = XGBClassifier(n_estimators=10, learning_rate=0.5) tuned_xgb.fit(X_train, y_train, early_stopping_rounds=5, eval_set=[(X_val, y_val)], verbose=False) tuned_params = tuned_xgb.get_params() print('') print('Best n_estimators', tuned_params['n_estimators']) print('Best learning rate', tuned_params['learning_rate']) tuned_predictions = tuned_xgb.predict(X_val)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Plain NumPy views of the frames. `.values` replaces `.as_matrix()`, which
# was deprecated and later removed from pandas; both give the same array.
numpy_X_train = X_train.values
numpy_y_train = y_train.values
numpy_X_test = X_test.values
numpy_y_test = y_test.values

# fit model to training data
#model = XGBClassifier(max_depth=5, n_estimators=250)
model = XGBClassifier()
model.fit(numpy_X_train, numpy_y_train)
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(model)

# make predictions for test data
y_pred = model.predict(numpy_X_test)
predictions = [round(value) for value in y_pred]

# evaluate predictions
accuracy = accuracy_score(numpy_y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
# Bind the result to its own name: assigning it to `confusion_matrix`
# shadowed the scikit-learn function, so any later call to it would fail.
xgb_confusion_matrix = confusion_matrix(numpy_y_test, y_pred)
print(xgb_confusion_matrix)

########################################################################
# SVM
########################################################################
print('Beginning support vector machine...')
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed
# in 0.20; prefer `model_selection` and fall back only on old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score
from matplotlib import pyplot

# load data
dataset = loadtxt('pima-indians-diabetes.csv', delimiter=",")
# split data into X (first 8 columns) and y (column 8)
X = dataset[:,0:8]
Y = dataset[:,8]
# split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=7)

# fit model on training data, tracking error and log loss on both sets.
# The order of eval_set fixes the result keys used below:
# 'validation_0' is the training set, 'validation_1' the test set.
model = XGBClassifier()
eval_set = [(X_train, y_train), (X_test, y_test)]
model.fit(X_train, y_train, eval_metric=["error", "logloss"], eval_set=eval_set, verbose=True)

# make predictions for test data
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]

# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

# retrieve performance metrics
results = model.evals_result()
epochs = len(results['validation_0']['error'])
x_axis = range(0, epochs)

# plot log loss
fig, ax = pyplot.subplots()
ax.plot(x_axis, results['validation_0']['logloss'], label='Train')
ax.plot(x_axis, results['validation_1']['logloss'], label='Test')
ax.legend()
pyplot.ylabel('Log Loss')
pyplot.title('XGBoost Log Loss')
from __future__ import division import pandas as pd from xgboost import XGBClassifier from sklearn.cross_validation import train_test_split from sklearn.ensemble import RandomForestClassifier import pickle print 'reading data...' df = pd.read_csv("../featurized/recall_train_booking_2.csv", sep='\t') X = df[['srch_dest_ct', 'hotel_market_ct', 'dest_pct', 'market_pct']] is_booked = df.pop('is_booked') Xtrain, Xtest, ytrain, ytest = train_test_split(X, is_booked) # print 'training rf...' # rf = RandomForestClassifier(max_depth=3, n_estimators=100, n_jobs=-1).fit(Xtrain, ytrain) # rf_pred = rf.predict(ytest) # print sum(rf_pred==ytest)/len(ytest) print 'training xgb...' xgbm = XGBClassifier(max_depth=3, n_estimators=100, learning_rate=0.01).fit(Xtrain, ytrain) xgbm_pred = xgbm.predict(Xtest) print sum(xgbm_pred==ytest)/len(ytest) with open('../models/xgbm1.pkl', 'w') as f: pickle.dump(xgbm, f)
def _load_feature_pickle(featureName, dataName):
    """Load ``./learn/data/<featureName>_<dataName>.pkl`` and return its content."""
    # NOTE: unpickling executes code stored in the file; only use this on
    # trusted, locally produced feature dumps.
    with open("./learn/data/"+featureName+"_"+dataName+".pkl","rb") as fin:
        return pickle.load(fin)


def stacking(featureNames,dataNames):
    """Run a three-layer stacking experiment and append accuracies to a log.

    Each entry of ``dataNames`` is one class; each entry of ``featureNames``
    is one feature family stored in its own pickle file per class.

    * 1st layer: load every feature family per class, concatenate them
      column-wise and split each class into train/test parts of equal size
      across classes (70 % / 30 % of the smallest class).
    * 2nd layer: for every (feature family, model) pair, fill a block of
      class probabilities - out-of-fold predictions for the training rows,
      predictions of a model refitted on all training rows for the test rows.
    * 3rd layer: fit an ``XGBClassifier`` on the 2nd-layer vectors.

    All accuracies are printed; the test accuracies are appended as one
    space-separated line to ``./learn/stack/result.txt``.

    Args:
        featureNames: names of the feature families.
        dataNames: names of the data sets; their index is the class label.
    """
    rs = np.random.randint(100000)

    # check data set info
    print("*** data load ***")
    print(featureNames,dataNames)
    ratio = 0.7
    Xs = [_load_feature_pickle(featureNames[0], dataName) for dataName in dataNames]
    minDataSize = min(len(X) for X in Xs)
    trainSize = int(minDataSize * ratio)
    testSize = minDataSize-trainSize
    clfs = [RandomForestClassifier(),XGBClassifier(),SVC(probability=True,C=1),
            ExtraTreesClassifier(),LogisticRegression()]
    nClasses = len(dataNames)
    nModels = len(clfs)

    # Column boundaries of each feature family inside the stacked matrix.
    featureCount = [0]
    for featureName in featureNames:
        X = _load_feature_pickle(featureName, dataNames[0])
        featureCount.append(featureCount[-1]+len(X[0]))
    featureSlices = [slice(lo, hi) for lo, hi in zip(featureCount, featureCount[1:])]
    print("train,test = {0},{1}".format(trainSize,testSize))
    print("featureCount boundary = {0}".format(featureCount))
    print("models = {0}".format(clfs))

    # generate 1st layer feature vector
    print("\n*** 1st layer ***")
    X1_tr = []
    X1_te = []
    Y1_tr = []
    Y1_te = []
    for i,dataName in enumerate(dataNames):
        Xf = []
        for featureName in featureNames:
            X = np.array(_load_feature_pickle(featureName, dataName))
            print("first layer from {0}-{1} : {2}".format(dataName,featureName,X.shape))
            Xf.append(X)
        Xf = np.hstack(Xf)
        X_tr,X_te = train_test_split(Xf ,train_size=trainSize, test_size=testSize,random_state=rs)
        X1_tr.append(X_tr)
        X1_te.append(X_te)
        # The position of the data set in dataNames is its class label.
        Y1_tr += [i]*trainSize
        Y1_te += [i]*testSize
    X1_tr = np.vstack(X1_tr)
    X1_te = np.vstack(X1_te)
    Y1_tr = np.array(Y1_tr)
    Y1_te = np.array(Y1_te)
    print("train vector : {0} label {1}".format(X1_tr.shape,Y1_tr.shape))
    print("test vector : {0} label {1}".format(X1_te.shape,Y1_te.shape))

    # generate 2nd layer feature vector
    print("\n*** 2nd layer ***")
    featureLength = len(featureNames)*nClasses*nModels
    X2_tr = np.zeros((trainSize*nClasses,featureLength))
    X2_te = np.zeros((testSize*nClasses,featureLength))
    Y2_tr = Y1_tr
    Y2_te = Y1_te
    nfold = 5
    print("{0}-class * {1}-models * {2}-features -> length = {3}".format(nClasses,nModels,len(featureNames),featureLength))
    print("{0}-fold * {1}-models * {2}-features -> #train = {3}".format(nfold,nModels,len(featureNames),nfold*nModels*len(featureNames)))

    totacc = [0]*len(featureNames)
    skf = StratifiedKFold(Y1_tr,n_folds =nfold,shuffle=True,random_state=rs)
    for i,(trind,valind) in enumerate(skf):
        Xtrall = X1_tr[trind]
        Xvalall = X1_tr[valind]
        Ytr = Y1_tr[trind]
        Yval = Y1_tr[valind]
        for fi,cols in enumerate(featureSlices):
            Xtr = Xtrall[:,cols]
            Xval = Xvalall[:,cols]
            for ci,clf in enumerate(clfs):
                clf.fit(Xtr,Ytr)
                proba = clf.predict_proba(Xval)
                # Each (feature, model) pair owns a block of nClasses columns.
                # One indexed assignment stores the out-of-fold probabilities
                # of every validation row at once.
                pos = fi*nClasses*nModels+nClasses*ci
                posend = pos+nClasses
                X2_tr[valind,pos:posend] = proba
                Yvalp = clf.predict(Xval)
                acc = accuracy_score(Yval,Yvalp)
                print("{0}th fold : {1}th feature : {2}th model : validation acc = {3}".format(i,fi,ci,acc))
                totacc[fi] += acc
    for fi,acc in enumerate(totacc):
        print("{0} th feature {1} : ave acc = {2}".format(fi,featureNames[fi],acc/nfold/nModels))

    # `with` closes the log even if a model below raises.
    with open("./learn/stack/result.txt","a") as fout:
        # Refit on all training rows to fill the test-side 2nd-layer vectors.
        for fi,cols in enumerate(featureSlices):
            Xtr = X1_tr[:,cols]
            Xte = X1_te[:,cols]
            for ci,clf in enumerate(clfs):
                clf.fit(Xtr,Y1_tr)
                proba = clf.predict_proba(Xte)
                pos = fi*nClasses*nModels+nClasses*ci
                posend = pos+nClasses
                X2_te[:,pos:posend] = proba
                Ypr = clf.predict(Xte)
                acc = accuracy_score(Y1_te,Ypr)
                print("{0}th feature : {1}th model : test acc = {2}".format(fi,ci,acc))
                fout.write("{0} ".format(acc))

        # Baseline: every model on all features at once.
        for ci,clf in enumerate(clfs):
            clf.fit(X1_tr,Y1_tr)
            Ypr = clf.predict(X1_te)
            acc = accuracy_score(Y1_te,Ypr)
            print("all feature : {0}th model : test acc = {1}".format(ci,acc))
            fout.write("{0} ".format(acc))
        print("train vector {0}".format(X2_tr.shape))
        print("test vector {0}".format(X2_te.shape))

        # 3rd layer
        print("\n*** 3rd layer ***")
        clf = XGBClassifier()
        clf.fit(X2_tr,Y2_tr)
        Y2_tr_pr = clf.predict(X2_tr)
        Y2_te_pr = clf.predict(X2_te)
        train_acc = accuracy_score(Y2_tr,Y2_tr_pr)
        test_acc = accuracy_score(Y2_te,Y2_te_pr)
        print("final acc (train,test) = {0},{1}".format(train_acc,test_acc))
        fout.write("{0} ".format(test_acc))
        fout.write("\n")
class XGBoost(Model):
    """``Model`` wrapper around ``xgboost.XGBClassifier``.

    Every hyper-parameter given to the constructor is stored on the instance
    under the same name and forwarded unchanged to ``XGBClassifier`` by
    :meth:`create_model`.
    """

    # Names of the constructor arguments forwarded to XGBClassifier.
    _HYPERPARAMETERS = (
        "objective", "max_depth", "learning_rate", "n_estimators",
        "verbosity", "booster", "tree_method", "n_jobs", "gamma",
        "min_child_weight", "max_delta_step", "subsample",
        "colsample_bytree", "colsample_bylevel", "colsample_bynode",
        "reg_alpha", "reg_lambda", "scale_pos_weight", "base_score",
        "random_state", "missing", "num_parallel_tree",
        "monotone_constraints", "interaction_constraints",
        "importance_type", "gpu_id", "validate_parameters",
    )

    def __init__(self,
                 objective="binary:logistic",
                 max_depth=None,
                 learning_rate=None,
                 n_estimators=100,
                 verbosity=None,
                 booster=None,
                 tree_method=None,
                 n_jobs=None,
                 gamma=None,
                 min_child_weight=None,
                 max_delta_step=None,
                 subsample=None,
                 colsample_bytree=None,
                 colsample_bylevel=None,
                 colsample_bynode=None,
                 reg_alpha=None,
                 reg_lambda=None,
                 scale_pos_weight=None,
                 base_score=None,
                 random_state=None,
                 missing=np.nan,
                 num_parallel_tree=None,
                 monotone_constraints=None,
                 interaction_constraints=None,
                 importance_type="gain",
                 gpu_id=None,
                 validate_parameters=None,
                 metrics=None,
                 path='algorithms/.output',
                 name='xgboost'):
        """Store the hyper-parameters and build the underlying classifier.

        ``metrics`` is accepted for interface compatibility but is not used
        by this class; its default is ``None`` rather than a shared mutable
        list. ``path`` and ``name`` are passed on to ``Model.__init__``.
        """
        self.objective = objective
        self.max_depth = max_depth
        self.learning_rate = learning_rate
        self.n_estimators = n_estimators
        self.verbosity = verbosity
        self.booster = booster
        self.tree_method = tree_method
        self.n_jobs = n_jobs
        self.gamma = gamma
        self.min_child_weight = min_child_weight
        self.max_delta_step = max_delta_step
        self.subsample = subsample
        self.colsample_bytree = colsample_bytree
        self.colsample_bylevel = colsample_bylevel
        self.colsample_bynode = colsample_bynode
        self.reg_alpha = reg_alpha
        self.reg_lambda = reg_lambda
        self.scale_pos_weight = scale_pos_weight
        self.base_score = base_score
        self.random_state = random_state
        self.missing = missing
        self.num_parallel_tree = num_parallel_tree
        self.monotone_constraints = monotone_constraints
        self.interaction_constraints = interaction_constraints
        self.importance_type = importance_type
        self.gpu_id = gpu_id
        self.validate_parameters = validate_parameters
        self.path = path
        self.name = name
        # The classifier is built before the base initialiser runs, as in
        # the original ordering.
        self.create_model()
        super().__init__(self.path, self.name)

    def create_model(self):
        """(Re)build ``self.model`` from the stored hyper-parameters."""
        self.model = XGBClassifier(
            **{param: getattr(self, param) for param in self._HYPERPARAMETERS})

    def train(self, X_train, y_train, epochs=None):
        """Fit the classifier on ``(X_train, y_train)``.

        ``epochs`` is ignored; it is kept (now optional) so existing callers
        that pass it keep working. Returns ``None``.
        """
        self.model.fit(X_train, y_train)
        return None

    def evaluate(self, X_test, y_test):
        """Return ``[RMSE, accuracy]`` of the predictions for ``X_test``.

        The list is also stored as ``self.scores``.
        """
        yhat = self.predict(X_test)
        self.scores = [
            sqrt(mean_squared_error(y_test, yhat)),
            accuracy_score(y_test, yhat)
        ]
        return self.scores

    def predict(self, X_new):
        """Predict labels for ``X_new``; the result is cached in ``self.yhat``."""
        self.yhat = self.model.predict(X_new)
        return self.yhat

    def save(self, model_name):
        """Pickle the underlying classifier to the file ``model_name``."""
        with open(model_name, 'wb') as f:
            pickle.dump(self.model, f)

    def load(self, model_name):
        """Replace ``self.model`` with the object pickled in ``model_name``."""
        # NOTE: unpickling executes code stored in the file; only load model
        # files from a trusted source.
        with open(model_name, 'rb') as f:
            self.model = pickle.load(f)
# NOTE(review): the line below is the tail of a call that starts above this
# chunk (presumably the train/validation split) - confirm against the caller.
x_train, y_train, test_size=test_size, random_state=seed)
# print(x_train_part.shape)

# Set the number of boosting iterations
num_round = 2

# bst = XGBClassifier(**params)
bst = XGBClassifier(max_depth=2, learning_rate=1, n_estimators=num_round, silent=True, objective='binary:logistic')
bst.fit(x_train_part, y_train_part)

# Check the model's performance on the validation set.
# The result is stored in `train_accuracy` even though it is measured on the
# validation data; the name is reused for the training accuracy below.
validate_preds = bst.predict(x_validate)
validate_predictions = [round(value) for value in validate_preds]
train_accuracy = accuracy_score(y_validate, validate_predictions)
print('Validation Accuracy:%.2f%%' % (train_accuracy * 100.0))

# Check the model's classification performance on the full training set
train_preds = bst.predict(x_train)
train_predictions = [round(value) for value in train_preds]
train_accuracy = accuracy_score(y_train, train_predictions)
print('Train Accuracy:%.2f%%' % (train_accuracy * 100.0))

# Once trained, the model can be used to predict on the test data
# make prediction
preds = bst.predict(x_test)
predictions = [round(value) for value in preds]
test_accuracy = accuracy_score(y_test, predictions)
# cPickle only exists on Python 2; on Python 3 the plain `pickle` module
# already uses the C implementation.
try:
    import cPickle as pickle
except ImportError:
    import pickle
import numpy as np
import os

DATA_DIR = "../../data/student-alcohol"

# The last column of the merged CSV is the target, everything else features.
dataset = np.loadtxt(os.path.join(DATA_DIR, "merged-data.csv"), delimiter=";")
X = dataset[:, 0:-1]
y = dataset[:, -1]

Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3, random_state=42)

clf = XGBClassifier()
# NOTE(review): the test split doubles as the early-stopping evaluation set,
# so the scores printed below are computed on data that steered training.
clf.fit(Xtrain, ytrain,
        early_stopping_rounds=10,
        eval_metric="logloss",
        eval_set=[(Xtest, ytest)],
        verbose=True)

y_ = clf.predict(Xtest)

print("Accuracy: {:.3f}".format(accuracy_score(ytest, y_)))
print()
print("Confusion Matrix")
print(confusion_matrix(ytest, y_))
print()
print("Classification Report")
print(classification_report(ytest, y_))

# Persist the fitted classifier next to the data.
with open(os.path.join(DATA_DIR, "model.pkl"), "wb") as fclf:
    pickle.dump(clf, fclf)
# Train the model
xgb.fit(train_x, train_y)

from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, roc_curve, auc


def model_metrics(model, x, y, pos_label=2):
    """Score ``model`` on ``(x, y)``.

    Returns a dict with the accuracy plus macro-averaged F1, precision and
    recall of ``model.predict(x)`` against ``y``. ``pos_label`` is accepted
    but not used.
    """
    predicted = model.predict(x)
    result = {'accuracy_score': accuracy_score(y, predicted)}
    # The remaining scores share the same call shape, so drive them from a
    # table; insertion order keeps the keys in their original order.
    macro_scorers = (
        ('f1_score_macro', f1_score),
        ('precision', precision_score),
        ('recall', recall_score),
    )
    for key, scorer in macro_scorers:
        result[key] = scorer(y, predicted, average="macro")
    return result


# Model evaluation results
print("TRAIN")
print(model_metrics(xgb, train_x, train_y))
print("TEST")
print(model_metrics(xgb, test_x, test_y))

# Model prediction
xgb.predict(test_x)
# Grid over three random-forest parameters; `n_estimators`, `max_features`,
# `clf` and `kf` are defined above this chunk.
min_samples_leaf = np.array([25, 50, 75, 100])
param_grid = dict(n_estimators=n_estimators, max_features=max_features, min_samples_leaf=min_samples_leaf)
grid = GridSearchCV(estimator=clf, param_grid=param_grid, cv=kf)
gres = grid.fit(X_train, y_train)
print("Best", gres.best_score_)
print("params", gres.best_params_)

# Final forest with hard-coded parameters (they are not read from
# `gres.best_params_`).
clf = RandomForestClassifier(n_estimators=50, max_features=5, min_samples_leaf=50)
clf.fit(X_train, y_train)

# %% [markdown]
# ### xgBoost

# %%
from xgboost import XGBClassifier

model = XGBClassifier()
model.fit(X_train, y_train)

# make predictions for test data
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]

# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
# In[113]: X_sub = X[features] test_sub = test[features] # In[114]: assert (X_sub.shape[1] == test_sub.shape[1]), 'Mismatch in number of features' # In[115]: model.fit(X_sub, y) # In[119]: prediction = model.predict(test_sub) # In[120]: plt.hist(prediction) # In[124]: sub['Loan_ID'] = test_original.Loan_ID sub['Loan_Status'] = prediction # In[125]: sub.to_csv('../submissions/xgb_submission.csv', index=False) # In[ ]: