Example #1
    def test_predict_sklearn_pickle(self):
        x, y = build_dataset()

        kwargs = {'tree_method': 'gpu_hist',
                  'predictor': 'gpu_predictor',
                  'verbosity': 2,
                  'objective': 'binary:logistic',
                  'n_estimators': 10}

        model = XGBClassifier(**kwargs)
        model.fit(x, y)

        save_pickle(model, "model.pkl")
        del model

        # load model
        model: xgb.XGBClassifier = load_pickle("model.pkl")
        os.remove("model.pkl")

        gpu_pred = model.predict(x, output_margin=True)

        # Switch to CPU predictor
        bst = model.get_booster()
        bst.set_param({'predictor': 'cpu_predictor'})
        cpu_pred = model.predict(x, output_margin=True)
        np.testing.assert_allclose(cpu_pred, gpu_pred, rtol=1e-5)
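Note: build_dataset, save_pickle and load_pickle are helper functions assumed by this test and not shown here; a minimal sketch of the two pickle helpers (hypothetical, matching the names used above) could be:

import pickle

def save_pickle(obj, path):
    # Serialize any Python object (here the fitted XGBClassifier) to disk.
    with open(path, "wb") as fd:
        pickle.dump(obj, fd)

def load_pickle(path):
    # Restore the object exactly as it was pickled.
    with open(path, "rb") as fd:
        return pickle.load(fd)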
Example #2
File: predict.py Project: jmc856/Webpage
def feature_selection(model, X_train, X_test, y_train, y_test, eval_metric='auc'):
    thresholds = [thres for thres in sorted(model.feature_importances_) if thres != 0]  # Use feat. with >0 importance

    roc_scores = {}
    for thresh in thresholds:  # select features using threshold

        selection = SelectFromModel(model, threshold=thresh, prefit=True)
        select_X_train = selection.transform(X_train)

        selection_model = XGBClassifier()  # train model
        selection_model.fit(select_X_train, y_train, eval_metric=eval_metric)

        select_X_test = selection.transform(X_test)  # eval model
        y_pred = selection_model.predict(select_X_test)

        roc = roc_auc_score(y_test, y_pred)
        roc_scores[selection.threshold] = roc

    best_thresh = max(roc_scores, key=roc_scores.get)

    fs = SelectFromModel(model, threshold=best_thresh, prefit=True)
    pickle_model(fs, 'feature.select')
    X_train_trans_ = fs.transform(X_train)
    X_test_trans_ = fs.transform(X_test)
    print('total features kept: {}'.format(X_train_trans_.shape[1]))

    return X_train_trans_, X_test_trans_
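pickle_model is an external helper that persists the fitted SelectFromModel; a hedged sketch of how the saved 'feature.select' selector might later be restored and applied before scoring new data (the loading side is an assumption, not shown in the project):

import pickle

# Restore the selector saved by pickle_model(fs, 'feature.select') and keep
# only the columns that were selected during training.
with open('feature.select', 'rb') as fd:
    fs = pickle.load(fd)
X_new_selected = fs.transform(X_new)  # X_new: new data to score (assumed)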
Example #3
    def test_predict_sklearn_pickle(self):
        X,y = makeXy()
        Xtest = makeXtest()

        from xgboost import XGBClassifier
        kwargs={}
        kwargs['tree_method'] = 'gpu_hist'
        kwargs['predictor'] = 'gpu_predictor'
        kwargs['silent'] = 0
        kwargs['objective'] = 'binary:logistic'

        model = XGBClassifier(**kwargs)
        model.fit(X,y)
        print(model)

        # pickle model
        save_obj(model,"model.pkl")
        # delete model
        del model
        # load model
        model = load_obj("model.pkl")
        os.remove("model.pkl")

        # continue as before
        print("Before model.predict")
        sys.stdout.flush()
        tmp = time.time()
        gpu_pred = model.predict(Xtest, output_margin=True)
        print(gpu_pred)
        print("E non-zeroes: %d:" % (np.count_nonzero(gpu_pred)))
        print("E GPU Time to predict = %g" % (time.time() - tmp))
Example #4
def cv(X_train, y_train, features_inner):

    kfold = StratifiedKFold(n_splits=5, shuffle=True)

    scores_f = []
    scores_p = []
    scores_r = []

    for train, test in kfold.split(X_train, y_train):

        model = XGBClassifier()
        X_train_cv = pd.DataFrame(X_train.values[train], columns=X_train.columns)
        y_train_cv = pd.DataFrame(y_train.values[train], columns=["tred_cutoff"])
        X_test_cv = pd.DataFrame(X_train.values[test], columns=X_train.columns)
        y_test_cv = pd.DataFrame(y_train.values[test], columns=["tred_cutoff"])
        model.fit(X_train_cv, y_train_cv)

        y_pred = model.predict(X_test_cv)

        s_f = f1_score(y_test_cv, y_pred)
        s_p = precision_score(y_test_cv, y_pred)
        s_r = recall_score(y_test_cv, y_pred)
        print("\tscores f1", (s_f))
        print("\tscores p", (s_p))
        print("\tscores r", (s_r))
        scores_f.append(s_f)
        scores_p.append(s_p)
        scores_r.append(s_r)

    print("mean scores f1", np.mean(scores_f))
    print("mean scores p", np.mean(scores_p))
    print("mean scores r", np.mean(scores_r))
Example #5
def XGB_model(train, y):
    model = XGBClassifier(n_estimators=150, learning_rate=0.01)
    from sklearn import cross_validation
    cv = cross_validation.KFold(len(train), n_folds=5, random_state=7)
    for traincv, testcv in cv:
        model.fit(train.iloc[traincv], y.iloc[traincv])
    y_XGB = model.predict(test)  # note: `test` is expected to be a globally defined test set
    return y_XGB
Example #6
def main():
    # Set seed for reproducibility
    np.random.seed(0)

    print("Loading data...")
    # Load the data from the CSV files
    
    training_data = pd.read_csv('/home/vipin/Videos/train.csv', header=0)
    prediction_data = pd.read_csv('/home/vipin/Videos/test.csv', header=0)
     
     
    training_data['countrycode']=training_data['countrycode'].apply(lambda x:ord(x))
    training_data['browserid']=training_data['browserid'].apply(lambda x: myfunc (x) if np.all(pd.notnull(x)) else myfunc("unknown") )
    training_data['devid']=training_data['devid'].apply(lambda x: myfunc (x) if np.all(pd.notnull(x)) else myfunc("none"))
    
    
    #pd.to_csv('/home/vipin/Videos/train11.csv', sep=',', encoding='utf-8')
    #exit(0)
    prediction_data['countrycode']=prediction_data['countrycode'].apply(lambda x:ord(x))
    prediction_data['browserid']=prediction_data['browserid'].apply(lambda x:myfunc (x) if np.all(pd.notnull(x)) else myfunc("unknown") )
    prediction_data['devid']=prediction_data['devid'].apply(lambda x:myfunc (x) if np.all(pd.notnull(x)) else myfunc("none") )
    
    
    features=['siteid','offerid','category','merchant','countrycode','browserid','devid']
    target="click"
    X = training_data[features]
    x_prediction = prediction_data[features]
    Y= training_data[target]
    ids = prediction_data["ID"]
    model = XGBClassifier()

    #linear_model.LogisticRegression(n_jobs=-1)

    print("Training...")
    # Your model is trained on the training_data
    model.fit(X, Y)

    print("Predicting...")
    
    seed =7
    test_size=0.33
    X_train,X_test,y_train,y_test=train_test_split(X,Y,test_size=test_size,random_state=seed)
    y_prediction = model.predict_proba(x_prediction)
    results = y_prediction[:, 1]
    results_df = pd.DataFrame(data={'probability':results})
    joined = pd.DataFrame(ids).join(results_df)
        
    y_pred=model.predict(X_test)
    accuracy=accuracy_score(y_test,y_pred)
    

    print("Accuracy: %.2f%%" % (accuracy * 100.0))
    print("Writing predictions to predictions.csv")
        # Save the predictions out to a CSV file
    joined.to_csv("/home/vipin/Videos/predictions.csv", index=False)
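myfunc is not defined in this snippet; a plausible stand-in (an assumption, not the original helper) that maps a categorical string such as a browser or device name to a stable integer code could be:

def myfunc(value):
    # Encode a string label as the sum of its character ordinals.
    # Missing values were already replaced with "unknown"/"none" upstream.
    return sum(ord(ch) for ch in str(value))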
Example #7
File: tests.py Project: booleancandy/tpot
def test_xgboost():
    """Ensure that the TPOT xgboost method outputs the same as the xgboost classfier method"""

    tpot_obj = TPOT()
    result = tpot_obj._xgradient_boosting(training_testing_data, n_estimators=100, learning_rate=0, max_depth=3)
    result = result[result['group'] == 'testing']

    xgb = XGBClassifier(n_estimators=100, learning_rate=0.0001, max_depth=3, seed=42)
    xgb.fit(training_features, training_classes)

    assert np.array_equal(result['guess'].values, xgb.predict(testing_features))
Example #8
def test_on_data(X, y):

    x_train, x_test, y_train, y_test = train_test_split(X, y, train_size=0.5, random_state=2333)
    print "train set: {}, test set: {}".format(len(x_train), len(x_test))
    cls = XGBClassifier()
    cls.fit(x_train, y_train)
    # on test
    pred = cls.predict(x_test)
    print "xgb accuracy score test", accuracy_score(y_test, pred)

    # on all
    pred = cls.predict(X)
    print "xgb accuracy score all", accuracy_score(y, pred)

    # compare to gbrt in sklearn
    cls = GradientBoostingClassifier()
    cls.fit(x_train, y_train)
    # on test
    pred = cls.predict(x_test)
    print "sklearn accuracy score test", accuracy_score(y_test, pred)

    # on all
    pred = cls.predict(X)
    print "sklearn accuracy score all", accuracy_score(y, pred)
Example #9
def runner ():
    m = Model()
    X = m.df.drop("tred_cutoff", axis=1)
    Y = m.df["tred_cutoff"]
    features_inner = m.features + m.features_2
    cv(X, Y, features_inner)

    model = XGBClassifier()
    model.fit(X, Y)

    y_pred = model.predict(m.X_test)
    s_f = f1_score(m.y_test, y_pred)
    s_p = precision_score(m.y_test, y_pred)
    s_r = recall_score(m.y_test, y_pred)
    print("test f1", s_f)
    print("test precision", s_p)
    print("test recall", s_r)
Example #10
  def trainXGB(data_subset):
    f.write('\nTraining XGB:'+'\n')

    X_train = data[data_subset]['X_train']
    X_test = data[data_subset]['X_test']
    y_train = data[data_subset]['y_train']
    y_test = data[data_subset]['y_test']

    for p in params['xgboost']:
      if data_subset != 'binary' and p['objective'] == 'binary:logistic':
        print("Skip using non-binary data with XGB binary:logistic objective")
        continue
      if data_subset == 'binary' and p['objective'] != 'binary:logistic':
        print("Skip using binary data with XGB multi:* objective")
        continue

      header = "@ subset: {0}, params: {1}".format(data_subset, p)
      f.write('\n'+header+'\n')

      objective = p['objective']
      max_depth = p['max_depth']
      try:
        n_estimators= p['n_estimators']
      except KeyError as e:
        n_estimators= 100

      model = XGBClassifier(objective=objective, max_depth=max_depth,
        n_estimators=n_estimators)

      start = time.time()
      model.fit(X_train, y_train)
      elapsed_train = time.time() - start

      y_pred = model.predict(X_test).astype(int)
      elapsed_predict = time.time() - start

      accuracy = accuracy_score(y_test, y_pred)
      precision, recall, fscore, support = precision_recall_fscore_support(y_test, y_pred, pos_label=2, average='weighted')

      print("\n{5}\nXGB with {0} objective, {6} max_depth, {7} n_estimators on data subset {1} trained in {2} seconds and predicted in {3} seconds with an accuracy of {4}\n".format(objective, data_subset, elapsed_train, elapsed_predict, accuracy, header, max_depth, n_estimators))

      f.write(str(elapsed_train) + ', ' + str(elapsed_predict) + ', ' + str(accuracy) + ', ' + str(precision) + ', ' + str(recall) + ', ' + str(fscore) + ', ' + str(support))
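params['xgboost'] is built outside trainXGB; a hedged sketch of the structure the loop expects (each entry needs at least 'objective' and 'max_depth', while 'n_estimators' is optional thanks to the KeyError fallback):

params = {
    'xgboost': [
        # Binary objective: only run against the 'binary' data subset.
        {'objective': 'binary:logistic', 'max_depth': 3, 'n_estimators': 200},
        # Multi-class objective: skipped for the 'binary' subset; falls back to 100 estimators.
        {'objective': 'multi:softmax', 'max_depth': 6},
    ]
}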
Example #11
def get_thresh(model,train,test,label_test,label_train):
    if (len(test)>len(train)) or (len(label_test)>len(label_train)):
        raise TypeError('Invalid train and test size')
    model1 = XGBClassifier()
    if type(model)!=type(XGBClassifier()):
        raise TypeError('Invalid model passed')
    if (pd.DataFrame(label_train).shape[1]>1) or (pd.DataFrame(label_test).shape[1]>1):
        raise TypeError('Multiple columns in label, Invalid shape.')
    max_score=0
    thrsh=0
    thresholds = np.sort(model.feature_importances_)
    for thresh in thresholds:
        selection = feature_selection.SelectFromModel(model, threshold=thresh,prefit=True)
        select_X_train = selection.transform(train)
        selection_model = XGBClassifier()
        selection_model.fit(select_X_train, label_train)
        select_X_test = selection.transform(test)
        y_pred = selection_model.predict(select_X_test)
        scr=metrics.roc_auc_score(label_test,y_pred)
        if(scr>max_score):
            max_score=scr
            thrsh=thresh
    return thrsh
Example #12
    from sklearn.ensemble import RandomForestClassifier # Random Forest
    model_rf = RandomForestClassifier(); model_rf.fit(X_train, Y_train)
    predicted_rf = model_rf.predict(X_test) ; print("RandomForest",metrics.accuracy_score(Y_test, predicted_rf),"\n")
    
    from sklearn.ensemble import AdaBoostClassifier # AdaBoost
    model_ab = AdaBoostClassifier(); model_ab.fit(X_train, Y_train)
    predicted_ab = model_ab.predict(X_test) ; print("AdaBoost",metrics.accuracy_score(Y_test, predicted_ab),"\n")
    
    from sklearn.neighbors import KNeighborsClassifier #K-NN
    model_knn = KNeighborsClassifier(); model_knn.fit(X_train, Y_train)
    predicted_knn = model_knn.predict(X_test) ; print("K-NN",metrics.accuracy_score(Y_test, predicted_knn),"\n")

if cond01 == 1:
    from xgboost import XGBClassifier  # XGBoost
    model_xgb = XGBClassifier(); model_xgb.fit(X_train, Y_train)
    predicted_xgb = model_xgb.predict(X_test); print("XGBoost",metrics.accuracy_score(Y_test, predicted_xgb),"\n")

if cond01 == 2:    
    from sklearn.linear_model import LogisticRegression # Logistic Regression
    model_lr = LogisticRegression(); model_lr.fit(X_train, Y_train)
    predicted_lr = model_lr.predict(X_test) ; print("LogisticRegression",metrics.accuracy_score(Y_test, predicted_lr),"\n")
    #aa = model_lr.coef_    
    
if cond01 == 3:
    from sklearn.naive_bayes import GaussianNB # Gaussian Naive Bayes
    model_nb = GaussianNB(); model_nb.fit(X_train, Y_train)
    predicted_nb = model_nb.predict(X_test) ; print("Gaussian Naive Bayes",metrics.accuracy_score(Y_test, predicted_nb),"\n")

if cond01 == 4:
    from sklearn.ensemble import GradientBoostingClassifier # GradientBoosting
    model_gb = GradientBoostingClassifier(); model_gb.fit(X_train, Y_train)
Example #13
class MLQRMine(object):

    def __init__(self):
        self._seed = randint(1, 9)
        self._csvfile = ""
        self._titles = None
        self._dataset = None
        self._X = None
        self._y = None
        self._X_original = None
        self._y_original = None
        self._dataset_original = None
        self._model = Sequential()
        self._sc = StandardScaler()
        self._vnum = 0  # Number of variables
        self._classifier = XGBClassifier()
        self._epochs = 10
        self._samplesize = 0
        self._clusters = None

    @property
    def seed(self):
        return self._seed

    @property
    def csvfile(self):
        return self._csvfile

    @property
    def dataset(self):
        return self._dataset

    @property
    def model(self):
        return self._model

    @property
    def epochs(self):
        return self._epochs

    @property
    def X(self):
        return self._X

    @property
    def y(self):
        return self._y

    @property
    def titles(self):
        return self._titles

    @property
    def head(self):
        return self._dataset.head

    # Getters should be before setters*
    @epochs.setter
    def epochs(self, epochs):
        self._epochs = epochs

    @seed.setter
    def seed(self, seed):
        self._seed = seed

    @csvfile.setter
    def csvfile(self, csvfile):
        self._csvfile = csvfile

    @titles.setter
    def titles(self, titles):
        self._titles = titles

    # Functions
    def read_csv(self):
        if self._titles is not None:
            self._dataset = read_csv(self._csvfile, usecols=self._titles)
        else:
            self._dataset = read_csv(self._csvfile)

    def mark_missing(self):
        self._dataset_original = self._dataset
        self._dataset = self._dataset.replace('', numpy.NaN)
        self._dataset.dropna(inplace=True)

    def restore_mark_missing(self):
        self._dataset = self._dataset_original

    def get_shape(self):
        return self._dataset.shape

    """
    The actual number of IVs is vnum -2 as first is the title and the last is the DV
    To seperate DV, use vnum -1 to indicate last column
    More details on np array splicing here: 
    https://stackoverflow.com/questions/34007632/how-to-remove-a-column-in-a-numpy-array/34008274
    """

    def read_xy(self):
        (self._samplesize, vnum) = self._dataset.shape
        # Last column in the csv should be the DV and first one is title (So get the number of variables)
        self._vnum = vnum - 2
        # splice into IVs and DV
        values = self._dataset.values
        # self._X = values[:, 0:self._vnum]
        # First column ignored - (To be used for title)
        self._X = values[:, 1:vnum - 1]
        self._y = values[:, vnum - 1]

    def oversample(self):
        self._X_original = self._X
        self._y_original = self._y
        ros = RandomOverSampler(random_state=0)
        X, y = ros.fit_sample(self._X, self._y)
        self._X = X
        self._y = y

    def restore_oversample(self):
        self._X = self._X_original
        self._y = self._y_original

    def prepare_data(self, oversample=False):
        self.read_csv()
        self.mark_missing()
        self.read_xy()
        if oversample:
            self.oversample()

    def get_nnet_predictions(self):
        self._model.add(Dense(12, input_dim=self._vnum, kernel_initializer='uniform', activation='relu'))
        self._model.add(Dense(8, kernel_initializer='uniform', activation='relu'))
        self._model.add(Dense(1, kernel_initializer='uniform', activation='sigmoid'))
        # Compile model
        self._model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        # Fit the model
        self._model.fit(self._X, self._y, epochs=self._epochs, batch_size=10, verbose=2)

        # calculate predictions
        predictions = self._model.predict(self._X_original)
        # round predictions
        rounded = [round(x[0]) for x in predictions]
        return rounded

    def get_nnet_scores(self):
        return self._model.evaluate(self._X, self._y)

    def svm_confusion_matrix(self):
        X_train, X_test, y_train, y_test = train_test_split(self._X, self._y, test_size=0.25, random_state=0)
        X_train = self._sc.fit_transform(X_train)
        X_test = self._sc.transform(X_test)
        classifier = SVC(kernel='linear', random_state=0)
        classifier.fit(X_train, y_train)
        y_pred = classifier.predict(X_test)
        return confusion_matrix(y_test, y_pred)

    # def knn_search(self, K=3, r=3):
    #     """ find K nearest neighbours of data among D """
    #     D = self._X
    #     x = self._X[[r-1], :]
    #
    #     print("KNN: ", x)
    #     (recs, vs) = D.shape
    #
    #     print(recs)
    #     #ndata = D.shape[0]
    #     #K = K if K < ndata else ndata
    #     K = K if K < recs else recs
    #
    #     print(K)
    #     # euclidean distances from the other points
    #     sqd = sqrt(((D - x[:, :recs]) ** 2).sum(axis=0))
    #     idx = argsort(sqd)  # sorting
    #     # return the indexes of K nearest neighbours
    #     print(idx[:K])
    #
    #     return idx[:K]

    # https://stackoverflow.com/questions/45419203/python-numpy-extracting-a-row-from-an-array
    def knn_search(self, n=3, r=3):
        kdt = KDTree(self._X, leaf_size=2, metric='euclidean')
        dist, ind = kdt.query(self._X[r - 1:r, :], k=n)
        return ind

    def get_kmeans(self, c=5):
        kmeans = KMeans(n_clusters=c, init='k-means++', random_state=42)
        y_kmeans = kmeans.fit_predict(self._X)
        self._clusters = y_kmeans
        self.get_centroids(c)
        return y_kmeans

    def get_centroids(self, c=1):
        for x in range(0, c):
            print("Cluster: ", x)
            ct = 0
            cluster_list = []
            for cluster in self._clusters:
                if cluster == x:
                    cluster_list.append(ct)
                ct += 1
            print("Cluster Length: ", len(cluster_list))
            print("Cluster Members")
            print(self._dataset.iloc[cluster_list, :])
            print("Mean")
            print(self._dataset.iloc[cluster_list, :].mean(axis=0))


    """
    TODO: This is not working yet.
    use the ColumnTransformer instead of categorical_features
    """

    def encode_categorical(self):
        # labelencoder_X_1 = LabelEncoder()
        # self._X[:, 1] = labelencoder_X_1.fit_transform(self._X[:, 1])
        # labelencoder_X_2 = LabelEncoder()
        # self._X[:, 2] = labelencoder_X_2.fit_transform(self._X[:, 2])
        onehotencoder = OneHotEncoder(categorical_features=[1])
        X = onehotencoder.fit_transform(self._X).toarray()
        X = X[:, 1:]
        print(X)
        return X

    def get_association(self):
        X_train, X_test, y_train, y_test = train_test_split(self._X, self._y, test_size=0.25, random_state=0)
        self._classifier.fit(X_train, y_train)

        # Predicting the Test set results
        y_pred = self._classifier.predict(X_test)
        return confusion_matrix(y_test, y_pred)

    def get_apriori(self):
        frequent_itemsets = apriori(self.encode_categorical(), min_support=0.07, use_colnames=True)
        rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
        return rules

    def get_pca(self, n=3):
        # https://plot.ly/~notebook_demo/264/about-the-author-some-of-sebastian-rasc/#/
        X_std = StandardScaler().fit_transform(self._X)
        (recs, factors) = X_std.shape
        print('Covariance matrix: \n%s' % numpy.cov(X_std.T))

        cov_mat = numpy.cov(X_std.T)

        eig_vals, eig_vecs = numpy.linalg.eig(cov_mat)

        print('Eigenvectors \n%s' % eig_vecs)
        print('\nEigenvalues \n%s' % eig_vals)

        # Make a list of (eigenvalue, eigenvector) tuples
        eig_pairs = [(numpy.abs(eig_vals[i]), eig_vecs[:, i]) for i in range(len(eig_vals))]

        # Sort the (eigenvalue, eigenvector) tuples from high to low
        eig_pairs.sort()
        eig_pairs.reverse()

        # Visually confirm that the list is correctly sorted by decreasing eigenvalues
        print('Eigenvalues in descending order:')
        for i in eig_pairs:
            print(i[0])

        # variance explained
        tot = sum(eig_vals)
        var_exp = [(i / tot) * 100 for i in sorted(eig_vals, reverse=True)]
        cum_var_exp = numpy.cumsum(var_exp)
        print("Variance explained: ", var_exp)
        print("Cumulative: ", cum_var_exp)

        if len(eig_vals) < n:
            n = len(eig_vals)

        # Adjust according to number of features chosen (default n=2)
        matrix_w = numpy.hstack((eig_pairs[0][1].reshape(factors, 1),
                                 eig_pairs[1][1].reshape(factors, 1)))

        if n == 3:
            matrix_w = numpy.hstack((eig_pairs[0][1].reshape(factors, 1),
                                     eig_pairs[1][1].reshape(factors, 1),
                                     eig_pairs[2][1].reshape(factors, 1)))

        if n == 4:
            matrix_w = numpy.hstack((eig_pairs[0][1].reshape(factors, 1),
                                     eig_pairs[1][1].reshape(factors, 1),
                                     eig_pairs[2][1].reshape(factors, 1),
                                     eig_pairs[3][1].reshape(factors, 1)))
        if n == 5:
            matrix_w = numpy.hstack((eig_pairs[0][1].reshape(factors, 1),
                                     eig_pairs[1][1].reshape(factors, 1),
                                     eig_pairs[2][1].reshape(factors, 1),
                                     eig_pairs[3][1].reshape(factors, 1),
                                     eig_pairs[4][1].reshape(factors, 1)))

        print('Matrix W:\n', matrix_w)
Example #14
print("Best Parameters:",dt_cv.best_params_)
print("Best Score:",dt_cv.best_score_)

"""#### Random Forest"""

rf = RandomForestClassifier(n_estimators=200)
rf.fit(X_train,np.ravel(y_train))

print(classification_report(y_test,rf.predict(X_test)))

"""#### XGBoost"""

xgb = XGBClassifier(n_estimators=200)
xgb.fit(X_train,y_train)
print(classification_report(y_test,xgb.predict(X_test)))

tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
svm_cv = GridSearchCV(
        SVC(), tuned_parameters, scoring='precision'
    )
svm_cv.fit(X_train, np.ravel(y_train))

print(classification_report(y_test,svm_cv.predict(X_test)))

model = tf.keras.Sequential()
model.add(tf.keras.layers.Flatten(input_shape=(22,)))
model.add(tf.keras.layers.Dense(16,activation="swish"))
model.add(tf.keras.layers.Dense(16,activation="swish"))
Example #15
# Encoding categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_X_1 = LabelEncoder()
X[:, 1] = labelencoder_X_1.fit_transform(X[:, 1])
labelencoder_X_2 = LabelEncoder()
X[:, 2] = labelencoder_X_2.fit_transform(X[:, 2])
onehotencoder = OneHotEncoder(categorical_features = [1])
X = onehotencoder.fit_transform(X).toarray()
X = X[:, 1:]

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

# Fitting XGBoost to the Training set
from xgboost import XGBClassifier
classifier = XGBClassifier()
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

# Applying k-Fold Cross Validation
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 10)
accuracies.mean()
accuracies.std()
Example #16
        features_to_csv(file_what_to_buy_features, what_to_buy.values())

    if os.path.isfile(file_buy_or_not_features):
        buy_or_not = np.load(file_buy_or_not_features)
    else:
        buy_or_not = extract_buy_or_not(clicks_grouped_by_session_id, what_to_buy)
        np.save(file_buy_or_not_features, buy_or_not)

    buys = read_buys(file_buys, usecols=effective_columns_names)
    _, buys_grouped_by_session_id_keys = df_group_by_session_id(buys)
    buys_result = extract_buys(clicks_grouped_by_session_id_keys, buys_grouped_by_session_id_keys)

    buy_or_not_train, buy_or_not_val, buys_result_train, buys_result_val = train_test_split(buy_or_not, buys_result,
                                                                                            test_size=0.2)

    classifier = XGBClassifier(n_estimators=500, subsample=0.8, colsample_bytree=0.5, max_depth=4, min_child_weight=3)
    classifier.fit(buy_or_not_train, buys_result_train)

    predictions_val = classifier.predict(buy_or_not_val)
    scores = metrics(buys_result_val, predictions_val)
    write_metrics(file_scores, scores)

    test = read_clicks(file_test, usecols=effective_columns_names)
    test_grouped_by_session_id, _ = df_group_by_session_id(test)

    what_to_buy_test = extract_what_to_buy(test_grouped_by_session_id)
    buy_or_not_test = extract_buy_or_not(test_grouped_by_session_id, what_to_buy_test)

    predictions_test = classifier.predict(buy_or_not_test)
    write_predictions(file_result, predictions_test)
Example #17
model = XGBClassifier(reg_lambda=1, reg_alpha=3, max_depth=3, n_estimators=200)
model.fit(X_train, Y_train)


# compute accuracy
def accuracy(X, Y):
    Y_pred = model.predict(X)
    Y_pred_round = [round(value) for value in Y_pred]
    return accuracy_score(Y, Y_pred_round)


out_of_sample = accuracy(X_test, Y_test)
in_sample = accuracy(X_train, Y_train)
print('Accuracy in sample: {:04.2f}%'.format(in_sample * 100))
print('Accuracy out of sample: {:04.2f}%'.format(out_of_sample * 100))

if parse.validation_set is not None:
    validate = accuracy(X_validate, Y_validate)
    print('Accuracy for validation: {:04.2f}%'.format(validate * 100))

if parse.save is True:
    test_prediction = model.predict(X_test)
    test_outcome = column_stack((test_prediction, Y_test))
    savetxt('./predictions/xgb_test_outcome.csv', test_outcome, delimiter=',')
    if parse.validation_set is not None:
        validation_prediction = model.predict(X_validate)
        validation_outcome = column_stack((validation_prediction, Y_validate))
        savetxt('./predictions/xgb_validation_outcome.csv',
                validation_outcome,
                delimiter=',')
Example #18
    def gen_train():
        global x_train
        global y_train
        x_train, y_train = [], []
        for i in threads:
            for thread in threads[i]:
                x_train.append(thread)
                y_train.append(i)
        x_train, y_train = np.array(x_train), np.array(y_train)

    while len_threads < 729:
        print('#', len_threads)
        gen_train()

        model.fit(x_train, y_train)
        y_pred = model.predict(X)
        accuracy = accuracy_score(Y, y_pred)
        print('Accuracy: {}'.format(accuracy))
        #print(y_pred)
        print(dict(collections.Counter(y_pred)))

        find = 1 if len(threads[0]) > len(threads[1]) else 0

        num_proc = 6
        threads_arr = {}
        for i in range(num_proc):
            threads_arr[i] = []

        pi_id = 0
        for i in range(len(y_pred)):
            if y_pred[i] == 0:
Example #19
accuracy = []
precision = []
specificity = []
recall = []
f_measure = []
auc = []

kf = StratifiedKFold(n_splits=10)
#%% ## timeit -n1 -r1
for train_index, val_index in kf.split(X_train_cv, y_train_cv):
    X_train, X_val = X_train_cv[train_index], X_train_cv[val_index]
    y_train, y_val = y_train_cv[train_index], y_train_cv[val_index]

    clf.fit(X_train, y_train, eval_metric='auc')

    clf_pred_proba = clf.predict_proba(X_val)
    clf_pred = clf.predict(X_val)

    acc, prec, rec, spec, f_m = calcula_scores(y_val, clf_pred)

    auc.append(roc_auc_score(y_val, clf_pred_proba[:, 1]))
    accuracy.append(acc)
    precision.append(prec)
    specificity.append(spec)
    recall.append(rec)
    f_measure.append(f_m)

print("XGBoost AUC: \n\tMédia: {:.3f}\n\tDesvio: {:.3f}".format(
    statistics.mean(auc), statistics.stdev(auc)))
print("XGBoost Accuracy: \n\tMédia: {:.3f}\n\tDesvio: {:.3f}".format(
    statistics.mean(accuracy), statistics.stdev(accuracy)))
print("XGBoost Precision: \n\tMédia: {:.3f}\n\tDesvio: {:.3f}".format(
Example #20
#Loading the dataset
dataset = pd.read_csv("creditcard.csv")

#Test train Split
X_train, X_test, Y_train, Y_test = train_test_split(
    dataset.iloc[:, :-1].values, dataset['Class'].values, test_size=0.12221)

xbg = XGBClassifier(
    max_depth=5,
    learning_rate=0.1,
    n_estimators=1500,
    n_jobs=6,
)
xbg.fit(X_train, Y_train)

Y_test_pred = xbg.predict(X_test)
Y_train_pred = xbg.predict(X_train)

count_train, count_test = 0, 0
for i in range(len(Y_train)):
    if (Y_train[i] == Y_train_pred[i]): count_train += 1

for i in range(len(Y_test)):
    if (Y_test[i] == Y_test_pred[i]): count_test += 1

print("Train Percentage: %.7f\nTest Percentage: %.7f" %
      (count_train / len(Y_train), count_test / len(Y_test)))

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm_train = confusion_matrix(Y_train, Y_train_pred)
Example #21
from xgboost import XGBClassifier
xgb = XGBClassifier(
    learning_rate=0.01,
    n_estimators=1000,
    max_depth=10,
    min_child_weight=4,
    gamma=0.1,
    subsample=0.3,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=3,
    seed=27,
    reg_alpha=0.001,
    n_jobs=4,
)
xgb.fit(X_train, y_train)

y_pred6 = xgb.predict(X_test)

accuracy = accuracy_score(y_test, y_pred6)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm_xg = confusion_matrix(y_test, y_pred6)
print(cm_xg)

#with learning_rate of 0.01 and some other parameters we achieved a better accuracy score compared to other models.
Example #22
# coding: utf-8
import numpy as np
import pandas as pd
import os, sys
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

#initialize dataframe
df=pd.read_csv("parkinsons.data")
features = df.loc[:, df.columns!='status'].values[:, 1:]
labels = df.loc[:, 'status'].values
#print(labels[labels==1].shape[0], labels[labels==0].shape[0])

#use MinMaxScaler
scaler = MinMaxScaler((-1, 1))
x = scaler.fit_transform(features)
y = labels

# split model to train and test
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=7)

# train model
model=XGBClassifier()
model.fit(x_train, y_train)

# predict
y_pred = model.predict(x_test)
print(accuracy_score(y_test, y_pred)*100)
Example #23
                  left_index=True,
                  right_index=True)
    X_ = pd.merge(hbsDF, X_, left_index=True, right_index=True)
    return X_


X_ = process(X)
idx = X_.dropna(
).index  #some entry being nan in heartbeat rate, only 4 of them anyway, i didn't bother much

#one pass evaluation for model, with oversampling
X_train, X_test, y_train, y_test = train_test_split(X_.iloc[idx], y[idx])
ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
clf.fit(X_resampled, y_resampled, eval_metric=f1_score)
pred = clf.predict(X_test.values)
scores = f1_score(y_test, pred, average='micro')
print(scores)

#cross validation, got 0.7x for cv=3, 5, 10.
#did not apply oversampling,
#adding a pipeline in cross validation is doable but i am too lazy
from sklearn.model_selection import cross_val_score
scores = cross_val_score(clf,
                         X_.iloc[idx],
                         y[idx],
                         cv=3,
                         scoring='f1_micro',
                         verbose=1000)
print(scores)
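The comment above notes that oversampling could be folded into the cross-validation with a pipeline; a hedged sketch using imblearn's sampler-aware Pipeline (assuming clf is the classifier fitted earlier), so each fold is oversampled only on its own training split:

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import cross_val_score

pipe = Pipeline([('oversample', RandomOverSampler(random_state=0)),
                 ('model', clf)])
# The sampler is applied inside each fold's fit, never to the held-out part.
scores = cross_val_score(pipe, X_.iloc[idx], y[idx], cv=3, scoring='f1_micro')
print(scores)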
Example #24
from sklearn.cross_validation import cross_val_score

cross_val_score(rfc, X_train, y_train, cv=5).mean()

cross_val_score(xgbc, X_train, y_train, cv=5).mean()

rfc.fit(X_train,y_train)
rfc_y_predict = rfc.predict(X_test)

rfc_submission = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': rfc_y_predict})
rfc_submission.to_csv('rfc_submission.csv', index=False)

xgbc.fit(X_train, y_train)

xgbc_y_predict = xgbc.predict(X_test)
xgbc_submission = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': xgbc_y_predict})
xgbc_submission.to_csv('xgbc_submission.csv', index=False)

from sklearn.grid_search import GridSearchCV

params = {'max_depth':range(2, 7), 'n_estimators':range(100, 1100, 200), 'learning_rate':[0.05, 0.1, 0.25, 0.5, 1.0]}

xgbc_best = XGBClassifier()

gs = GridSearchCV(xgbc_best, params, n_jobs=-1, cv=5, verbose=1)

gs.fit(X_train, y_train)

xgbc_best_y_predict = gs.predict(X_test)
Example #25
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
classifier.fit(X_train, y_train)
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
classifier.fit(X_train, y_train)
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(criterion='entropy',random_state=0)
classifier.fit(X_train,y_train)
from sklearn.svm import SVC
classifier = SVC(kernel= 'rbf', random_state=0)
classifier.fit(X_train,y_train)
"""

# Predicting the Test set results
y_pred = classifier.predict(X_test)

from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=10)
print("Accuracy: {:.2f} %".format(accuracies.mean()))
print("Standar Deviation: {:.2f} %".format(accuracies.std() * 100))
"""
#Applying GridSearch to find the best model and the best parameters
from sklearn.model_selection import GridSearchCV
parameters=[{'C': [0.25, 0.5, 0.75, 1], 'kernel': ['linear']},
            {'C': [0.25, 0.5, 0.75, 1], 'kernel': ['rbf'], 'gamma':[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]}]
grid_search=GridSearchCV(estimator=classifier,
                         param_grid=parameters,
                         scoring='accuracy',
                         cv=10,
                         n_jobs=-1)
Example #26
feature_imp

# Visualize

import matplotlib.pyplot as plt
import seaborn as sns
# Creating a bar plot
sns.barplot(x=feature_imp, y=feature_imp.index)
# Add labels to your graph
plt.xlabel('Feature Importance Score')
plt.ylabel('Features')
plt.title("Visualizing Important Features")
plt.legend()
plt.show()

# prediction on test set
y_pred = model.predict(X_test)
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics
# Model Accuracy, how often is the classifier correct?
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

submission = test.copy()

submission['Cover_Type'] = model.predict(submission[features_selected])

#submission.info(memory_usage='deep')

submission[['Id', 'Cover_Type']].to_csv('submission{0}.csv'.format(10),
                                        index=False)
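feature_imp is computed before this excerpt; judging from the bar plot above, it is presumably a pandas Series of the model's feature importances indexed by feature name, e.g. (an assumption):

import pandas as pd

# Hypothetical reconstruction: importances from the fitted model, indexed by
# the selected feature names and sorted so the plot shows the largest first.
feature_imp = pd.Series(model.feature_importances_,
                        index=features_selected).sort_values(ascending=False)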
Example #27
#split dataset and make binary dependent
full_data = hstack((title_S, hasNamedEntity_S, hasNumbers_S, hasSubTitle_S,
                    title_lengths_S, polarity_S, subjectivity_S))
X_train_final, X_test_final, y_train_final, y_test_final = train_test_split(
    full_data, dat[target], test_size=0.15, random_state=123)
y_train_dich_final = [0 if i <= cutoff else 1 for i in y_train_final]
y_test_dich_final = [0 if i <= cutoff else 1 for i in y_test_final]
#fit model
model = XGBClassifier(objective='binary:logistic',
                      booster='gbtree',
                      learning_rate=0.3,
                      max_depth=20,
                      subsample=0.7)
model.fit(X_train_final, y_train_dich_final)
# get predictions
preds = model.predict(X_test_final)
probs = model.predict_proba(X_test_final)
# compute metrics and print ROC curve
print(accuracy_score(y_test_dich_final, preds))
print(confusion_matrix(y_test_dich_final, preds))
auc = roc_auc_score(y_test_dich_final, probs[:, 1])
fpr, tpr, thresholds = roc_curve(y_test_dich_final, probs[:, 1])
print(auc)
plot_roc_curve(fpr, tpr, auc)

###########################################################
###########################################################
################   FEATURE IMPORTANCE   ###################
###########################################################
###########################################################
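plot_roc_curve is not defined in this excerpt; a minimal hand-rolled sketch consistent with the plot_roc_curve(fpr, tpr, auc) call above (an assumption about the local helper, not sklearn's function of the same name):

import matplotlib.pyplot as plt

def plot_roc_curve(fpr, tpr, auc):
    # ROC curve with the chance diagonal; the AUC goes in the legend.
    plt.plot(fpr, tpr, label='ROC curve (AUC = {:.3f})'.format(auc))
    plt.plot([0, 1], [0, 1], linestyle='--', color='grey')
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.legend(loc='lower right')
    plt.show()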
Example #28
print("# train data points: {}".format(X_train.shape[0]))
print("# val data points : {}".format(X_val.shape[0]))


def gen_test_labels(model, test_data, name="output"):
    y_pred = model.predict(test_data)
    predictions = [round(value) for value in y_pred]
    with open('data/' + name + '.csv', 'w') as csvfile:
        fieldnames = ['ID', 'Label']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for i, pred in enumerate(predictions):
            writer.writerow({'ID': str(i), 'Label': str(pred)})


model = XGBClassifier(n_estimators=2000, learning_rate=0.01, max_depth=2)
model.fit(X_train, y_train)

y_pred = model.predict(X_train)
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(y_train, predictions)
print("Train Accuracy: %.2f%%" % (accuracy * 100.0))

y_pred = model.predict(X_val)
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(y_val, predictions)
print("Test Accuracy: %.2f%%" % (accuracy * 100.0))

model = XGBClassifier(n_estimators=2500, learning_rate=0.01, max_depth=2)
model.fit(train_data, train_label)
Example #29
pca_xtest = pd.DataFrame(dim_red.transform(xtest))
pca_df_test = pd.DataFrame(dim_red.transform(df_test_final))

#Check the shape of the datasets
print('Training Data(pca_xtrain):',pca_xtrain.shape) #(3367, 6)
print('Validation Data(pca_xtest):', pca_xtest.shape) #(842, 6)
print('Testing Data(pca_df_test):', pca_df_test.shape) #(4209, 6)
print('Training_pred Label(ytrain):', ytrain.shape) #(3367,)
print('Validation_pred Label(ytest):', ytest.shape) #(842,)

#Applying XGBoost
#Create an XGBoost object
XGB_model = XGBClassifier() 
#Fit the XGBoost model on the training set
XGB_model.fit(pca_xtrain,ytrain)
#For parameters to work with, refer the link below, thanks to AV
#https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/

#prediction on validation set
ypred = XGB_model.predict(pca_xtest)

#Prediction on test data
XGB_model.predict(pca_df_test)

#Metrics
#check the accuracy of (prediction Vs test labels)
print(mean_squared_error(ytest,ypred))
#R-Squared error
print(r2_score(ytest,ypred))

Example #30
# Optimize model parameters
# I run this code in google colab to make the execution much faster and use the best params in the next code
param_grid = {
    'min_child_weight': [1, 5, 10],
    'gamma': [0.5, 1, 1.5, 2, 5],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'max_depth': [3, 4, 5]
}
my_model2 = GridSearchCV(my_model, param_grid)
my_model2.fit(X_Train, y_Train)
print(my_model2.best_params_)

from sklearn.metrics import confusion_matrix, accuracy_score

# fit and Evaluate model
my_model3 = XGBClassifier(min_child_weight=1,
                          gamma=2,
                          subsample=0.6,
                          colsample_bytree=0.6,
                          max_depth=3)
my_model3.fit(X_Train, y_Train)
y_pred = my_model3.predict(X_val)

# Get error rate
print("Error rate of XGBoost: ", 1 - accuracy_score(y_val, y_pred))

# Get confusion matrix
confusion_matrix(y_pred, y_val)
Example #31
xgb2 = XGBClassifier(learning_rate=0.3, n_estimators=700, max_depth=2, n_jobs=-1, colsample_bytree=0.1, random_state=4218)

# Let's look at the error to compare with the baseline, and then we will update the model to get predictions.
eval_set = [(x_val, y_val)]
eval_metric = ["aucpr","error"]
%time xgb2.fit(x_train, y_train, eval_metric=eval_metric, eval_set=eval_set, verbose=2)

# It looks like we will keep our parameters. The error was lower with our tuning, so we can move forward with predictions.
xgb2.fit(x_train, y_train)

# Lets get the tuned predictions for the train and validations sets. The predicted probabilities for predicting the class and getting our AUC and f1 score.
xgb2predprob_train = xgb2.predict_proba(x_train)[:, 1]
xgb2predprob_val = xgb2.predict_proba(x_val)[:, 1]

# The decision predictions to help us classify and get the f1 scores and see what the recall and precision are if we want them.
xgb2preds_train = xgb2.predict(x_train)
xgb2preds_val = xgb2.predict(x_val)

# Results from the tuned model.
print ('F1 Score',f1_score(y_train, xgb2preds_train))
print ('F1 Score',f1_score(y_val, xgb2preds_val))
print ('ROC AUC Score',roc_auc_score(y_train,xgb2predprob_train))
print ('ROC AUC Score',roc_auc_score(y_val,xgb2predprob_val))

# Lets look at our confusion matrix here to understand the  classification reports to help us figure out what we may have missed.
print ('Training Confusion Matrix',confusion_matrix(y_train, xgb2preds_train))
print ('Val Confusion Matrix',confusion_matrix(y_val, xgb2preds_val))
print ('Training Classification report',classification_report(y_train, xgb2preds_train))
print ('Val Classification Report',classification_report(y_val, xgb2preds_val))

# Overall, we saw that the validation set tuned f1 score for the models was: LR = 79.07% vs. RF = 91.56% vs. XGB = 93.67%
# (when using the eval method; otherwise it was 89.4364% with an error of 0.0625). The AUC score for how the models
# performed on the validation set was: LR = 78.43% vs. RF = 92.92% vs. XGB = 91.56%. Overall, the XGBoost model performed
# the best: it had the higher f1 score even though it had a lower AUC score than RF.
Example #32
def XGBoostCheck():
    xgbModel = XGBClassifier()
    
    ids = utility.GetShangHaiStockIdStrArray()
    np.random.shuffle(ids)
    ids = ids[:100]
    testCheckStartDate = datetime.strptime("2015-06-01", "%Y-%m-%d")
    testCheckEndDate = datetime.strptime("2017-06-1", "%Y-%m-%d")
    trainDatas = []
    labelDatas = []
    
    count = 0
    for trainData in GetTrainningData(ids, testCheckStartDate, testCheckEndDate, posNegEq=True):
        print(trainData[2])
        
        posTrainData = trainData[0]
        negTrainData = trainData[1]
        
        if len(posTrainData) > 0:
            trainDatas += posTrainData
            labelDatas += np.ones(len(posTrainData)).tolist()
        
        if len(negTrainData) > 0:
            trainDatas += negTrainData
            labelDatas += np.zeros(len(negTrainData)).tolist()
            
#         count += 1
#         if count == 20:
#             break
    
    pca = PCA(n_components = 75)       
    trainDatas = np.array(trainDatas)
    print(trainDatas.shape)
    trainDatas = pca.fit_transform(trainDatas)
    print(trainDatas.shape)
    labelDatas = np.array(labelDatas)
    labelDatas = np.reshape(labelDatas, (-1, 1))
    print(labelDatas.shape)
    
    trianDataTmp = np.concatenate((trainDatas, labelDatas), axis=1)
    np.random.shuffle(trianDataTmp)
    
    
#     labelDatas = np.reshape(labelDatas, (-1, 1))
    xgbModel.fit(trianDataTmp[:, :-1], trianDataTmp[:, -1])  # all 75 PCA components as features, last column is the label
    
    testTestCheckStartDate = datetime.strptime("2017-06-2", "%Y-%m-%d")
    testTestCheckEndDate = datetime.strptime("2018-06-01", "%Y-%m-%d")
    for trainData in GetTrainningData(ids, testTestCheckStartDate, testTestCheckEndDate):
        print("Predict:", trainData[2])
        
        posTrainData = trainData[0]
        posTrainData = pca.transform( posTrainData )
        negTrainData = trainData[1]
        negTrainData = pca.transform( negTrainData )
        
        if len(posTrainData) > 0:
            retLabels = xgbModel.predict(posTrainData)
            try:
                print(trainData[2], "Pos Count:", len(posTrainData), "pos predict rate:", len(np.where(retLabels == 1)[0]) / len(posTrainData))
            except:
                print(trainData[2], "Pos Count:", len(posTrainData))
            
        
        if len(negTrainData) > 0:
            retLabels = xgbModel.predict(negTrainData)
            try:
                print(trainData[2], "Neg Count:", len(negTrainData), "neg predict rate:", len(np.where(retLabels == 0)[0]) / len(negTrainData))
            except:
                print(trainData[2], "Neg Count:", len(negTrainData))
Example #33
def main():
    # Set seed for reproducibility
    np.random.seed(0)

    print("Loading data...")
    # Load the data from the CSV files

    training_data = pd.read_csv('/home/vipin/Videos/train.csv', header=0)
    prediction_data = pd.read_csv('/home/vipin/Videos/test.csv', header=0)

    training_data['countrycode'] = training_data['countrycode'].apply(
        lambda x: ord(x))
    training_data['browserid'] = training_data['browserid'].apply(
        lambda x: myfunc(x) if np.all(pd.notnull(x)) else myfunc("unknown"))
    training_data['devid'] = training_data['devid'].apply(
        lambda x: myfunc(x) if np.all(pd.notnull(x)) else myfunc("none"))

    #pd.to_csv('/home/vipin/Videos/train11.csv', sep=',', encoding='utf-8')
    #exit(0)
    prediction_data['countrycode'] = prediction_data['countrycode'].apply(
        lambda x: ord(x))
    prediction_data['browserid'] = prediction_data['browserid'].apply(
        lambda x: myfunc(x) if np.all(pd.notnull(x)) else myfunc("unknown"))
    prediction_data['devid'] = prediction_data['devid'].apply(
        lambda x: myfunc(x) if np.all(pd.notnull(x)) else myfunc("none"))

    features = [
        'siteid', 'offerid', 'category', 'merchant', 'countrycode',
        'browserid', 'devid'
    ]
    target = "click"
    X = training_data[features]
    x_prediction = prediction_data[features]
    Y = training_data[target]
    ids = prediction_data["ID"]
    model = XGBClassifier()

    #linear_model.LogisticRegression(n_jobs=-1)

    print("Training...")
    # Your model is trained on the training_data
    model.fit(X, Y)

    print("Predicting...")

    seed = 7
    test_size = 0.33
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        Y,
                                                        test_size=test_size,
                                                        random_state=seed)
    y_prediction = model.predict_proba(x_prediction)
    results = y_prediction[:, 1]
    results_df = pd.DataFrame(data={'probability': results})
    joined = pd.DataFrame(ids).join(results_df)

    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    print("Accuracy: %.2f%%" % (accuracy * 100.0))
    print("Writing predictions to predictions.csv")
    # Save the predictions out to a CSV file
    joined.to_csv("/home/vipin/Videos/predictions.csv", index=False)
Example #34
max_depth = range(2, 12, 1)
param_grid = dict(max_depth=max_depth)
grid = GridSearchCV(estimator=model, param_grid=param_grid)
grid.fit(x2,y)
print(grid.best_score_)
print(grid.best_params_)



###Build the model using n_estimators as 20 and max_depth as 2

xgb1 = XGBClassifier(objective ='reg:logistic', learning_rate = 0.1,
                max_depth = 2, n_estimators = 20)

xgb1.fit(x2_train,y_train)
train_pred =xgb1.predict(x2_train)

import numpy as np
train_acc = np.mean(train_pred==y_train) 
print(train_acc)


test_pred=xgb1.predict(x2_test)
test_acc=np.mean(test_pred==y_test)
print(test_acc)


#Variable importance plot

from xgboost import plot_importance
plot_importance(xgb1)
Example #35
def run_monthly(client, MonthGAP=1):
    data_dict, pullinfo_list_dict = load_data_monthly(
        ayonel_numerical_attr=ayonel_numerical_attr,
        ayonel_boolean_attr=ayonel_boolean_attr,
        ayonel_categorical_attr_handler=ayonel_categorical_attr_handler,
        MonthGAP=MonthGAP)

    for org, repo in [('dimagi', 'xxx')]:
        print(org + ",")
        pullinfo_list = pullinfo_list_dict[org]
        batch_iter = data_dict[org]
        train_batch = batch_iter.__next__()
        train_X = np.array(train_batch[0])
        train_y = np.array(train_batch[1])
        cursor = train_y.size  # cursor marking where the first PR to be predicted starts
        predict_result = []
        predict_result_prob = []
        actual_result = []
        mean_accuracy = 0
        round = 1
        for batch in batch_iter:
            if len(batch[0]) == 0:  # no data in this test batch, move on to the next one
                continue
            test_X = np.array(batch[0])
            test_y = np.array(batch[1])
            parameters = [
                ("criterion", ["gini", "entropy"]),
                ("max_features", ["auto", "sqrt", "log2"]),
                ("min_weight_fraction_leaf", iandfrange(0, 0.501, 0.05)),
                ("oob_score", [True, False]),
            ]
            tuned_params = {}  # parameters tuned so far
            for k, v in enumerate(parameters):
                tuning_param = {}
                tuning_param[v[0]] = v[1]
                estimator_rf = RandomForestClassifier(random_state=RANDOM_SEED,
                                                      **tuned_params)
                clf = GridSearchCV(estimator=estimator_rf,
                                   param_grid=tuning_param,
                                   scoring="accuracy",
                                   cv=3)
                clf.fit(train_X, train_y)
                tuned_params = dict(tuned_params, **clf.best_params_)

            print(tuned_params)
            # store the tuned parameters in the database
            client[org]['model'].update(
                {
                    'round': round,
                    'model': 'randomforest',
                    'gap': MonthGAP
                }, {'$set': tuned_params},
                upsert=True)

            best_est = XGBClassifier(seed=RANDOM_SEED, **tuned_params)

            train(best_est, train_X, train_y)

            print(best_est.score(test_X, test_y))

            actual_result += test_y.tolist()  # actual results
            predict_result += best_est.predict(test_X).tolist()  # predicted results
            predict_result_prob += [
                x[0] for x in best_est.predict_proba(test_X).tolist()
            ]
            mean_accuracy += best_est.score(test_X, test_y)
            train_X = np.concatenate((train_X, test_X))
            train_y = np.concatenate((train_y, test_y))
            round += 1

        acc_num = 0
        for i in range(len(actual_result)):
            if actual_result[i] == predict_result[i]:
                acc_num += 1
        print(acc_num / len(actual_result))
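iandfrange is not defined in this excerpt; given the call iandfrange(0, 0.501, 0.05) used for min_weight_fraction_leaf, a plausible float-range helper (an assumption) is:

def iandfrange(start, stop, step):
    # Float analogue of range(): start, start+step, ... while strictly below stop.
    values = []
    while start < stop:
        values.append(round(start, 10))  # round to limit floating-point drift
        start += step
    return values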
Example #36
        print(f)
    c.append(f)
train.columns=c
test.columns=c


# In[14]:

#Part 2
model = XGBClassifier()
model.fit(train, label_train)


# In[15]:

label_pred=model.predict(test)



# In[16]:

# In[17]:

def get_thresh(model,train,test,label_test,label_train):
    if (len(test)>len(train)) or (len(label_test)>len(label_train)):
        raise TypeError('Invalid train and test size')
    model1 = XGBClassifier()
    if type(model)!=type(XGBClassifier()):
        raise TypeError('Invalid model passed')
    if (pd.DataFrame(label_train).shape[1]>1) or (pd.DataFrame(label_test).shape[1]>1):
        raise TypeError('Multiple columns in label, Invalid shape.')
Example #37
        Y = list_personality[:, l]
        print(
            "Training dataset-Binarized personality list for type indicator-------> ",
            Y)
        # split data into train and test sets
        seed = 7
        test_size = 0.33
        X_train, X_test, y_train, y_test = train_test_split(
            X, Y, test_size=test_size, random_state=seed)

        # fit model on training data
        model = XGBClassifier(**param)
        model.fit(X_train, y_train)

        # make predictions for my  data
        y_pred = model.predict(my_X_tfidf)
        print("Predicted value-----> ", y_pred)
        result.append(y_pred[0])
    # print("* %s prediction: %s" % (type_indicators[l], y_pred))
    print("RESULT IS----", result)
    print("TRANSLATED RESULT IS------", translate_back(result))

#     x= translate_back(result)
#     score=0
#     for p in check:
#         if p in x:
#             score=score+10
#     print("MY SCORE----------",score)
#
#     r+=[(score,use)]
# print("SORTED TWITTER IDs BASED ON THE SCORE")
Example #38
from numpy import loadtxt, sort
from xgboost import XGBClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectFromModel
# load data
dataset = loadtxt('pima-indians-diabetes.csv', delimiter=",")
# split data into X and y
X = dataset[:,0:8]
Y = dataset[:,8]
# split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=7)
# fit model on all training data
model = XGBClassifier()
model.fit(X_train, y_train)
# make predictions for test data and evaluate
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
# Fit model using each importance as a threshold
thresholds = sort(model.feature_importances_)
for thresh in thresholds:
	# select features using threshold
	selection = SelectFromModel(model, threshold=thresh, prefit=True)
	select_X_train = selection.transform(X_train)
	# train model
	selection_model = XGBClassifier()
	selection_model.fit(select_X_train, y_train)
	# eval model
	select_X_test = selection.transform(X_test)
	y_pred = selection_model.predict(select_X_test)
Example #39
"""

from data_loader import dataLoader
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

if __name__ == '__main__':

    # Get data
    X_train, X_val, y_train, y_val = dataLoader(test=False, optimize_set=True)
    y_train, y_val = y_train.values.ravel(), y_val.values.ravel()

    # Define baseline XGB classifier model with default parameters
    defualt_xgb = XGBClassifier()
    defualt_xgb.fit(X_train, y_train)
    default_predictions = defualt_xgb.predict(X_val)
    print('Default XGB model test accuracy',
          accuracy_score(y_val, default_predictions))

    # Using early_stopping_rounds to determine best n_estimators number
    tuned_xgb = XGBClassifier(n_estimators=10, learning_rate=0.5)
    tuned_xgb.fit(X_train,
                  y_train,
                  early_stopping_rounds=5,
                  eval_set=[(X_val, y_val)],
                  verbose=False)
    tuned_params = tuned_xgb.get_params()
    print('')
    print('Best n_estimators', tuned_params['n_estimators'])
    print('Best learning rate', tuned_params['learning_rate'])
    tuned_predictions = tuned_xgb.predict(X_val)
Example #40
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

numpy_X_train = X_train.as_matrix()
numpy_y_train = y_train.as_matrix()
numpy_X_test = X_test.as_matrix()
numpy_y_test = y_test.as_matrix()

# fit model to training data
#model = XGBClassifier(max_depth=5, n_estimators=250)
model = XGBClassifier()
model.fit(numpy_X_train, numpy_y_train)
print(model)

# make predictions for test data
y_pred = model.predict(numpy_X_test)
predictions = [round(value) for value in y_pred]
# evaluate predictions
accuracy = accuracy_score(numpy_y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

cm = confusion_matrix(numpy_y_test, y_pred)
print(cm)



########################################################################
#	SVM
########################################################################

print('Beginning support vector machine...')
Example #41
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score
from matplotlib import pyplot
# load data
dataset = loadtxt('pima-indians-diabetes.csv', delimiter=",")
# split data into X and y
X = dataset[:,0:8]
Y = dataset[:,8]
# split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=7)
# fit model no training data
model = XGBClassifier()
eval_set = [(X_train, y_train), (X_test, y_test)]
model.fit(X_train, y_train, eval_metric=["error", "logloss"], eval_set=eval_set, verbose=True)
# make predictions for test data
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
# retrieve performance metrics
results = model.evals_result()
epochs = len(results['validation_0']['error'])
x_axis = range(0, epochs)
# plot log loss
fig, ax = pyplot.subplots()
ax.plot(x_axis, results['validation_0']['logloss'], label='Train')
ax.plot(x_axis, results['validation_1']['logloss'], label='Test')
ax.legend()
pyplot.ylabel('Log Loss')
pyplot.title('XGBoost Log Loss')
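
The fit above also requested the "error" metric, and those values are already in results; a minimal sketch of plotting the classification error on a second figure, mirroring the log-loss plot:

# plot classification error recorded during training
fig, ax = pyplot.subplots()
ax.plot(x_axis, results['validation_0']['error'], label='Train')
ax.plot(x_axis, results['validation_1']['error'], label='Test')
ax.legend()
pyplot.ylabel('Classification Error')
pyplot.title('XGBoost Classification Error')
pyplot.show()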
예제 #42
0
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import pickle

print('reading data...')
df = pd.read_csv("../featurized/recall_train_booking_2.csv", sep='\t')
X = df[['srch_dest_ct', 'hotel_market_ct', 'dest_pct', 'market_pct']]
is_booked = df.pop('is_booked')

Xtrain, Xtest, ytrain, ytest = train_test_split(X, is_booked)

# print 'training rf...'
# rf = RandomForestClassifier(max_depth=3, n_estimators=100, n_jobs=-1).fit(Xtrain, ytrain)
# rf_pred = rf.predict(ytest)
# print sum(rf_pred==ytest)/len(ytest)


print('training xgb...')
xgbm = XGBClassifier(max_depth=3, n_estimators=100, learning_rate=0.01).fit(Xtrain, ytrain)
xgbm_pred = xgbm.predict(Xtest)
print(sum(xgbm_pred == ytest) / len(ytest))

# pickle needs a binary-mode file handle
with open('../models/xgbm1.pkl', 'wb') as f:
    pickle.dump(xgbm, f)
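
The pickled model can be loaded back the same way for later scoring; a minimal sketch, reusing Xtest and ytest from above:

# reload the pickled booster and confirm it reproduces the held-out accuracy
with open('../models/xgbm1.pkl', 'rb') as f:
    xgbm_loaded = pickle.load(f)
print(sum(xgbm_loaded.predict(Xtest) == ytest) / len(ytest))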
예제 #43
0
def stacking(featureNames,dataNames):

    rs = np.random.randint(100000)

    # check data set info
    print("*** data load ***")
    print(featureNames,dataNames)
    Xs = []
    ratio = 0.7
    for dataName in dataNames:
        fin = open("./learn/data/"+featureNames[0]+"_"+dataName+".pkl","rb")
        Xs.append(pickle.load(fin))
        fin.close()
    minDataSize = min([len(X) for X in Xs])
    trainSize = int(minDataSize * ratio)
    testSize = minDataSize-trainSize
    clfs = [RandomForestClassifier(),XGBClassifier(),SVC(probability=True,C=1),
        ExtraTreesClassifier(),LogisticRegression()]

    featureCount = [0]
    for featureName in featureNames:
        fin = open("./learn/data/"+featureName+"_"+dataNames[0]+".pkl","rb")
        X = pickle.load(fin)
        fin.close()
        featureCount.append(featureCount[-1]+len(X[0]))

    print("train,test = {0},{1}".format(trainSize,testSize))
    print("featureCount boundary = {0}".format(featureCount))
    print("models = {0}".format(clfs))

    # generate 1st layer feature vector
    print("\n*** 1st layer ***")
    X1_tr = []
    X1_te = []
    Y1_tr = []
    Y1_te = []
    featCount = [0]
    for i,dataName in enumerate(dataNames):
        Xf = []
        for featureName in featureNames:
            fin = open("./learn/data/"+featureName+"_"+dataName+".pkl","rb")
            X = np.array(pickle.load(fin))
            fin.close()
            print("first layer from {0}-{1} : {2}".format(dataName,featureName,X.shape))
            Xf.append(X)
        Xf = np.hstack(Xf)
        X_tr,X_te = train_test_split(Xf ,train_size=trainSize, test_size=testSize,random_state=rs)
#        print(X_tr.shape,X_te.shape)
        X1_tr.append(X_tr)
        X1_te.append(X_te)
        Y1_tr += [i]*trainSize
        Y1_te += [i]*testSize
    X1_tr = np.vstack(X1_tr)
    X1_te = np.vstack(X1_te)
    Y1_tr = np.array(Y1_tr)
    Y1_te = np.array(Y1_te)

    print("train vector : {0} label {1}".format(X1_tr.shape,Y1_tr.shape))
    print("test vector : {0} label {1}".format(X1_te.shape,Y1_te.shape))

    # generate 2nd layer feature vector
    print("\n*** 2nd layer ***")
    featureLength = len(featureNames)*len(dataNames)*len(clfs)
    featurePerModel = len(featureCount)*len(dataNames)
    X2_tr = np.zeros((trainSize*len(dataNames),featureLength))
    X2_te = np.zeros((testSize*len(dataNames),featureLength))
    Y2_tr = Y1_tr
    Y2_te = Y1_te
    nfold = 5

    print("{0}-class * {1}-models * {2}-features -> length = {3}".format(len(dataNames),len(clfs),len(featureNames),featureLength))
    print("{0}-fold * {1}-models * {2}-features -> #train = {3}".format(nfold,len(clfs),len(featureNames),nfold*len(clfs)*len(featureNames)))

    totacc = [0]*len(featureNames)
    skf = StratifiedKFold(n_splits=nfold, shuffle=True, random_state=rs)
    i = 0
    for trind, valind in skf.split(X1_tr, Y1_tr):
        Xtrall = X1_tr[trind]
        Xvalall = X1_tr[valind]
        Ytr = Y1_tr[trind]
        Yval = Y1_tr[valind]
        for fi in range(len(featureCount)-1):
            Xtr = Xtrall[:,featureCount[fi]:featureCount[fi+1]]
            Xval = Xvalall[:,featureCount[fi]:featureCount[fi+1]]

            for ci,clf in enumerate(clfs):
                clf.fit(Xtr,Ytr)
                proba = clf.predict_proba(Xval)

#                           print(X2_tr.shape)
#               print(X2_tr[valind].shape)
                for pi,ind in enumerate(valind):
                    pos = fi*len(dataNames)*len(clfs)+len(dataNames)*ci
                    posend = pos+len(dataNames)
                    X2_tr[ind,pos:posend] = proba[pi]
#               (X2_tr[valind])[:,len(dataNames)*fi:len(dataNames)*(fi+1)] = proba

                Yvalp = clf.predict(Xval)
                acc = accuracy_score(Yval,Yvalp)
                print("{0}th fold : {1}th feature : {2}th model : validation acc = {3}".format(i,fi,ci,acc))
                totacc[fi] += acc
        i+=1
    for fi,acc in enumerate(totacc):
        print("{0} th feature {1} : ave acc = {2}".format(fi,featureNames[fi],acc/nfold/len(clfs)))

    fout = open("./learn/stack/result.txt","a")

    for fi in range(len(featureCount)-1):
        Xtr = X1_tr[:,featureCount[fi]:featureCount[fi+1]]
        Xte = X1_te[:,featureCount[fi]:featureCount[fi+1]]
        Ytr = Y1_tr
        Yte = Y1_te
        for ci,clf in enumerate(clfs):
            clf.fit(Xtr,Ytr)
            proba = clf.predict_proba(Xte)
            pos = fi*len(dataNames)*len(clfs)+len(dataNames)*ci
            posend = pos+len(dataNames)
            X2_te[:,pos:posend] = proba

            Ypr = clf.predict(Xte)
            acc = accuracy_score(Yte,Ypr)
            print("{0}th feature : {1}th model : test acc = {2}".format(fi,ci,acc))
            fout.write("{0} ".format(acc))
    Xtr = X1_tr
    Xte = X1_te
    Ytr = Y1_tr
    Yte = Y1_te
    for ci,clf in enumerate(clfs):
        clf.fit(Xtr,Ytr)
        Ypr = clf.predict(Xte)
        acc = accuracy_score(Yte,Ypr)
        print("all feature : {0}th model : test acc = {1}".format(ci,acc))
        fout.write("{0} ".format(acc))

    print("train vector {0}".format(X2_tr.shape))
    print("test vector {0}".format(X2_te.shape))

    # 3rd layer
    print("\n*** 3rd layer ***")
    clf = XGBClassifier()
    clf.fit(X2_tr,Y2_tr)
    Y2_tr_pr = clf.predict(X2_tr)
    Y2_te_pr = clf.predict(X2_te)
    train_acc = accuracy_score(Y2_tr,Y2_tr_pr)
    test_acc = accuracy_score(Y2_te,Y2_te_pr)
    print("final acc (train,test) = {0},{1}".format(train_acc,test_acc))
    fout.write("{0} ".format(test_acc))
    fout.write("\n")
    fout.close()
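
A usage sketch for the function above; the feature and class names are purely hypothetical, and each combination expects a matching ./learn/data/<feature>_<data>.pkl file to exist:

# hypothetical feature extractors and class labels; adjust to the pickles actually on disk
stacking(["mfcc", "chroma"], ["classical", "jazz", "rock"])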
예제 #44
0
class XGBoost(Model):
    def __init__(self, objective="binary:logistic", max_depth=None, learning_rate=None, n_estimators=100, verbosity=None, booster=None, tree_method=None, n_jobs=None, gamma=None, min_child_weight=None, max_delta_step=None, subsample=None, colsample_bytree=None, colsample_bylevel=None, colsample_bynode=None, reg_alpha=None, reg_lambda=None, scale_pos_weight=None, base_score=None, random_state=None, missing=np.nan, num_parallel_tree=None, monotone_constraints=None, interaction_constraints=None, importance_type="gain", gpu_id=None, validate_parameters=None, metrics=[], path='algorithms/.output', name='xgboost'):
        
        self.objective = objective
        self.max_depth = max_depth
        self.learning_rate = learning_rate
        self.n_estimators = n_estimators
        self.verbosity = verbosity
        self.booster = booster
        self.tree_method = tree_method
        self.n_jobs = n_jobs
        self.gamma = gamma
        self.min_child_weight = min_child_weight
        self.max_delta_step = max_delta_step
        self.subsample = subsample
        self.colsample_bytree = colsample_bytree
        self.colsample_bylevel = colsample_bylevel
        self.colsample_bynode = colsample_bynode
        self.reg_alpha = reg_alpha
        self.reg_lambda = reg_lambda
        self.scale_pos_weight = scale_pos_weight
        self.base_score = base_score
        self.random_state = random_state
        self.missing = missing
        self.num_parallel_tree = num_parallel_tree
        self.monotone_constraints = monotone_constraints
        self.interaction_constraints = interaction_constraints
        self.importance_type = importance_type
        self.gpu_id = gpu_id
        self.validate_parameters = validate_parameters
        self.path = path
        self.name = name

        self.create_model()

        super().__init__(self.path, self.name)
    

    def create_model(self):
        self.model = XGBClassifier(objective=self.objective,
                              max_depth=self.max_depth,
                              learning_rate=self.learning_rate,
                              n_estimators=self.n_estimators,
                              verbosity=self.verbosity,
                              booster=self.booster,
                              tree_method=self.tree_method,
                              n_jobs=self.n_jobs,
                              gamma=self.gamma,
                              min_child_weight=self.min_child_weight,
                              max_delta_step=self.max_delta_step,
                              subsample=self.subsample,
                              colsample_bytree=self.colsample_bytree,
                              colsample_bylevel=self.colsample_bylevel,
                              colsample_bynode=self.colsample_bynode,
                              reg_alpha=self.reg_alpha,
                              reg_lambda=self.reg_lambda,
                              scale_pos_weight=self.scale_pos_weight,
                              base_score=self.base_score,
                              random_state=self.random_state,
                              missing=self.missing,
                              num_parallel_tree=self.num_parallel_tree,
                              monotone_constraints=self.monotone_constraints,
                              interaction_constraints=self.interaction_constraints,
                              importance_type=self.importance_type,
                              gpu_id=self.gpu_id,
                              validate_parameters=self.validate_parameters)


    def train(self, X_train, y_train, epochs):
        # XGBoost ignores the epochs argument; boosting rounds come from n_estimators
        self.model.fit(X_train, y_train)
        return None


    def evaluate(self, X_test, y_test):
        yhat = self.predict(X_test)
        self.scores = [
            sqrt(mean_squared_error(y_test, yhat)),
            accuracy_score(y_test, yhat)
        ]
        return self.scores
    

    def predict(self, X_new):
        self.yhat = self.model.predict(X_new)
        return self.yhat
        

    def save(self, model_name):
        with open(model_name, 'wb') as f:
            pickle.dump(self.model, f)
    
    
    def load(self, model_name):
        with open(model_name, 'rb') as f:
            self.model = pickle.load(f)
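
A minimal usage sketch of the wrapper class above on synthetic data; the import path for the class (and its Model base) is an assumption about the surrounding project layout:

# hypothetical import path for the wrapper defined above
# from algorithms.xgboost_model import XGBoost
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=500, n_features=10, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=0)

clf = XGBoost(n_estimators=50, max_depth=3)
clf.train(X_tr, y_tr, epochs=None)   # epochs is ignored by this wrapper
print(clf.evaluate(X_te, y_te))      # [rmse, accuracy]
clf.save('xgboost_demo.pkl')
clf.load('xgboost_demo.pkl')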
예제 #45
0
x_train_part, x_validate, y_train_part, y_validate = train_test_split(
    x_train, y_train, test_size=test_size, random_state=seed)
# print(x_train_part.shape)

# set the number of boosting rounds
num_round = 2

# bst = XGBClassifier(**params)
bst = XGBClassifier(max_depth=2,
                    learning_rate=1,
                    n_estimators=num_round,
                    silent=True,
                    objective='binary:logistic')
bst.fit(x_train_part, y_train_part)

# check model performance on the validation set
validate_preds = bst.predict(x_validate)
validate_predictions = [round(value) for value in validate_preds]
validate_accuracy = accuracy_score(y_validate, validate_predictions)
print('Validation Accuracy:%.2f%%' % (validate_accuracy * 100.0))

# check classification performance on the full training set
train_preds = bst.predict(x_train)
train_predictions = [round(value) for value in train_preds]
train_accuracy = accuracy_score(y_train, train_predictions)
print('Train Accuracy:%.2f%%' % (train_accuracy * 100.0))

# once trained, the model can be used to predict on the test data
# make prediction
preds = bst.predict(x_test)
predictions = [round(value) for value in preds]
test_accuracy = accuracy_score(y_test, predictions)
print('Test Accuracy:%.2f%%' % (test_accuracy * 100.0))
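
Instead of rounding hard 0/1 predictions, the same booster can emit class probabilities and the decision threshold can be set explicitly; a minimal sketch (the 0.5 cut-off is purely illustrative):

# score the test set with class probabilities and an explicit threshold
proba = bst.predict_proba(x_test)[:, 1]
threshold = 0.5  # illustrative cut-off; tune on the validation set if needed
custom_predictions = [1 if p > threshold else 0 for p in proba]
print('Test Accuracy at threshold %.2f: %.2f%%' %
      (threshold, accuracy_score(y_test, custom_predictions) * 100.0))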
예제 #46
0
import pickle
import numpy as np
import os

from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

DATA_DIR = "../../data/student-alcohol"

dataset = np.loadtxt(os.path.join(DATA_DIR, "merged-data.csv"), 
                     delimiter=";")
X = dataset[:, 0:-1]
y = dataset[:, -1]

Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3, 
                                                random_state=42)

clf = XGBClassifier()
clf.fit(Xtrain, ytrain, early_stopping_rounds=10, eval_metric="logloss",
        eval_set=[(Xtest, ytest)], verbose=True)

y_ = clf.predict(Xtest)

print("Accuracy: {:.3f}".format(accuracy_score(ytest, y_)))
print()
print("Confusion Matrix")
print(confusion_matrix(ytest, y_))
print()
print("Classification Report")
print(classification_report(ytest, y_))

with open(os.path.join(DATA_DIR, "model.pkl"), "wb") as fclf:
    pickle.dump(clf, fclf)
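
Because the fit above used early stopping against a single eval set, the chosen round and the recorded logloss curve can be inspected after training; a minimal sketch, assuming the attributes the sklearn wrapper exposes after an early-stopped fit:

# inspect what early stopping picked and how logloss evolved
print("Best iteration: {}".format(clf.best_iteration))
history = clf.evals_result()
print("Final validation logloss: {:.4f}".format(history["validation_0"]["logloss"][-1]))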
예제 #47
0
# train the model
xgb.fit(train_x, train_y)

from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, roc_curve, auc


def model_metrics(model, x, y, pos_label=2):
    """
    评估函数
    """
    yhat = model.predict(x)
    result = {
        'accuracy_score': accuracy_score(y, yhat),
        'f1_score_macro': f1_score(y, yhat, average="macro"),
        'precision': precision_score(y, yhat, average="macro"),
        'recall': recall_score(y, yhat, average="macro")
    }
    return result


# model evaluation results
print("TRAIN")
print(model_metrics(xgb, train_x, train_y))

print("TEST")
print(model_metrics(xgb, test_x, test_y))

# model predictions
xgb.predict(test_x)
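
The roc_curve and auc imports above are never used; a minimal sketch of extending the evaluation with AUC, assuming a binary target and a model that exposes predict_proba (pos_label mirrors the default used by model_metrics):

def model_auc(model, x, y, pos_label=2):
    # probability scores for the positive class; assumes a binary problem
    scores = model.predict_proba(x)[:, 1]
    fpr, tpr, _ = roc_curve(y, scores, pos_label=pos_label)
    return auc(fpr, tpr)

print("TRAIN AUC", model_auc(xgb, train_x, train_y))
print("TEST AUC", model_auc(xgb, test_x, test_y))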
예제 #48
0
min_samples_leaf = np.array([25, 50, 75, 100])

param_grid = dict(n_estimators=n_estimators,
                  max_features=max_features,
                  min_samples_leaf=min_samples_leaf)
grid = GridSearchCV(estimator=clf, param_grid=param_grid, cv=kf)
gres = grid.fit(X_train, y_train)

print("Best", gres.best_score_)
print("params", gres.best_params_)

clf = RandomForestClassifier(n_estimators=50,
                             max_features=5,
                             min_samples_leaf=50)
clf.fit(X_train, y_train)

# %% [markdown]
# ### xgBoost

# %%
from xgboost import XGBClassifier

model = XGBClassifier()
model.fit(X_train, y_train)
# make predictions for test data
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
예제 #49
0
파일: xgb.py 프로젝트: sk48880/av-practice
# In[113]:

X_sub = X[features]
test_sub = test[features]

# In[114]:

assert (X_sub.shape[1] == test_sub.shape[1]), 'Mismatch in number of features'

# In[115]:

model.fit(X_sub, y)

# In[119]:

prediction = model.predict(test_sub)

# In[120]:

plt.hist(prediction)

# In[124]:

sub['Loan_ID'] = test_original.Loan_ID
sub['Loan_Status'] = prediction

# In[125]:

sub.to_csv('../submissions/xgb_submission.csv', index=False)

# In[ ]:
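
A quick sanity check of the written submission; reading the file back confirms its shape and first rows before uploading:

import pandas as pd

check = pd.read_csv('../submissions/xgb_submission.csv')
print(check.shape)
print(check.head())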