Example #1
File: ada7.py Project: daxiongshu/bnp
def kfold_cv(X_train, y_train,idx,k):

    kf = StratifiedKFold(y_train,n_folds=k)
    xx=[]
    count=0
    for train_index, test_index in kf:
        count+=1
        X_train_cv, X_test_cv = X_train[train_index,:],X_train[test_index,:]
        gc.collect()
        y_train_cv, y_test_cv = y_train[train_index],y_train[test_index]
        y_pred=np.zeros(X_test_cv.shape[0])
        m=0
         
        for j in range(m):
            clf=xgb_classifier(eta=0.05,min_child_weight=20,col=0.5,subsample=0.7,depth=7,num_round=400,seed=j*77,gamma=0.1)
            y_pred+=clf.train_predict(X_train_cv,(y_train_cv),X_test_cv,y_test=(y_test_cv))
            yqq=y_pred*(1.0/(j+1))

            print(j, llfun(y_test_cv, yqq))

        #y_pred/=m;
        clf=XGBClassifier(max_depth=10,colsample_bytree=0.8,learning_rate=0.02,n_estimators=500,nthread=-1)
        #clf=RandomForestClassifier(n_jobs=-1,n_estimators=100,max_depth=100)
        clf.fit(X_train_cv,(y_train_cv),eval_metric="logloss",eval_set=[(X_test_cv, y_test_cv)])
        y_pred=clf.predict_proba(X_test_cv).T[1]
        print(y_pred.shape)
        xx.append(llfun(y_test_cv,(y_pred)))
        ypred=y_pred
        yreal=y_test_cv
        idx=idx[test_index]
        print(xx[-1])#,y_pred.shape
        break

    print(xx, 'average:', np.mean(xx), 'std', np.std(xx))
    return ypred,yreal,idx#np.mean(xx)
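This function leans on a project-specific llfun helper that is not shown; a minimal sketch, assuming it computes the mean binary log loss of predicted probabilities, could be:

import numpy as np

def llfun(act, pred, eps=1e-15):
    # Hypothetical helper: mean binary log loss for 0/1 labels `act`
    # and predicted probabilities `pred`.
    pred = np.clip(pred, eps, 1 - eps)
    return -np.mean(act * np.log(pred) + (1 - act) * np.log(1 - pred))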
    def test_predict_sklearn_pickle(self):
        X,y = makeXy()
        Xtest = makeXtest()

        from xgboost import XGBClassifier
        kwargs={}
        kwargs['tree_method'] = 'gpu_hist'
        kwargs['predictor'] = 'gpu_predictor'
        kwargs['silent'] = 0
        kwargs['objective'] = 'binary:logistic'

        model = XGBClassifier(**kwargs)
        model.fit(X,y)
        print(model)

        # pickle model
        save_obj(model,"model.pkl")
        # delete model
        del model
        # load model
        model = load_obj("model.pkl")
        os.remove("model.pkl")

        # continue as before
        print("Before model.predict")
        sys.stdout.flush()
        tmp = time.time()
        gpu_pred = model.predict(Xtest, output_margin=True)
        print(gpu_pred)
        print("E non-zeroes: %d:" % (np.count_nonzero(gpu_pred)))
        print("E GPU Time to predict = %g" % (time.time() - tmp))
Example #3
    def test_predict_sklearn_pickle(self):
        x, y = build_dataset()

        kwargs = {'tree_method': 'gpu_hist',
                  'predictor': 'gpu_predictor',
                  'verbosity': 2,
                  'objective': 'binary:logistic',
                  'n_estimators': 10}

        model = XGBClassifier(**kwargs)
        model.fit(x, y)

        save_pickle(model, "model.pkl")
        del model

        # load model
        model: xgb.XGBClassifier = load_pickle("model.pkl")
        os.remove("model.pkl")

        gpu_pred = model.predict(x, output_margin=True)

        # Switch to CPU predictor
        bst = model.get_booster()
        bst.set_param({'predictor': 'cpu_predictor'})
        cpu_pred = model.predict(x, output_margin=True)
        np.testing.assert_allclose(cpu_pred, gpu_pred, rtol=1e-5)
Example #4
 def xgboost_classifier(self):
     cls = XGBClassifier()
     print('xgboost cross validation score', cross_val_score(cls, self.x_data, self.y_data))
     start_time = time.time()
     cls.fit(self.x_train, self.y_train)
     print('score', cls.score(self.x_test, self.y_test))
     print('time cost', time.time() - start_time)
Example #5
File: predict.py Project: jmc856/Webpage
def feature_selection(model, X_train, X_test, y_train, y_test, eval_metric='auc'):
    thresholds = [thres for thres in sorted(model.feature_importances_) if thres != 0]  # Use feat. with >0 importance

    roc_scores = {}
    for thresh in thresholds:  # select features using threshold

        selection = SelectFromModel(model, threshold=thresh, prefit=True)
        select_X_train = selection.transform(X_train)

        selection_model = XGBClassifier()  # train model
        selection_model.fit(select_X_train, y_train, eval_metric=eval_metric)

        select_X_test = selection.transform(X_test)  # eval model
        y_pred = selection_model.predict(select_X_test)

        roc = roc_auc_score(y_test, y_pred)
        roc_scores[selection.threshold] = roc

    best_thresh = max(roc_scores, key=roc_scores.get)

    fs = SelectFromModel(model, threshold=best_thresh, prefit=True)
    pickle_model(fs, 'feature.select')
    X_train_trans_ = fs.transform(X_train)
    X_test_trans_ = fs.transform(X_test)
    print('total features kept: {}'.format(X_train_trans_.shape[1]))

    return X_train_trans_, X_test_trans_
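pickle_model is another project helper that is not included here; a minimal sketch, assuming it simply pickles the fitted selector to the given file name, could be:

import pickle

def pickle_model(obj, filename):
    # Hypothetical persistence helper assumed by feature_selection() above.
    with open(filename, "wb") as f:
        pickle.dump(obj, f)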
Example #6
def xgboostcv(max_depth,
              learning_rate,
              n_estimators,
              subsample,
              colsample_bytree,
              gamma,
              min_child_weight,
              silent=True,
              nthread=-1,
              seed=1234):

    clf = XGBClassifier(max_depth=int(max_depth),
                        learning_rate=learning_rate,
                        n_estimators=int(n_estimators),
                        silent=silent,
                        nthread=nthread,
                        subsample=subsample,
                        colsample_bytree=colsample_bytree,
                        gamma=gamma,
                        min_child_weight = min_child_weight,
                        seed=seed,
                        objective="binary:logistic")

    clf.fit(x0, y0, eval_metric="logloss", eval_set=[(x1, y1)],early_stopping_rounds=25)
    ll = -log_loss(y1, clf.predict_proba(x1))
    return ll
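The shape of this function (keyword hyper-parameters, fixed seed, a single scalar return value) suggests it is meant as an objective for a Bayesian optimizer; a hedged usage sketch with the bayes_opt package, assuming x0, y0, x1, y1 are pre-built train/validation splits and the bounds are illustrative only:

from bayes_opt import BayesianOptimization

# xgboostcv returns -log_loss, so maximizing it minimizes the validation log loss.
optimizer = BayesianOptimization(
    f=xgboostcv,
    pbounds={
        'max_depth': (3, 10),
        'learning_rate': (0.01, 0.3),
        'n_estimators': (100, 500),
        'subsample': (0.5, 1.0),
        'colsample_bytree': (0.5, 1.0),
        'gamma': (0, 5),
        'min_child_weight': (1, 20),
    },
)
optimizer.maximize(init_points=5, n_iter=25)
print(optimizer.max)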
def get_xgb_feature_importance_plot(best_param_, experiment_, 
                                    png_folder,
                                    png_fname,
                                    score_threshold=0.8):

    # 1. train the classifier on the full training data
    train_X, train_y = experiment_.get_train_data()
    clf = XGBClassifier()
    try:
        del best_param_['model_type']
    except:
        pass
    clf.set_params(**best_param_)
    clf.fit(train_X, train_y)
    index2feature = clf.booster().get_fscore()
    fis = pd.DataFrame({'name':index2feature.keys(),
                        'score':index2feature.values()})
    fis = fis.sort('score', ascending=False)
    if len(fis.index) > 20:
        score_threshold = fis['score'][fis['score'] > 0.0].quantile(score_threshold)
        #where_str = 'score > %f & score > %f' % (score_threshold, 0.0)
        where_str = 'score >= %f' % (score_threshold)
        fis = fis.query(where_str)

    # 2. plot
    #gs = GridSpec(2,2)
    #ax1 = plt.subplot(gs[:,0])
    #ax2 = plt.subplot(gs[0,1])
    #ax3 = plt.subplot(gs[1,1])

    # 3.1 feature importance
    sns.barplot(x = 'score', y = 'name',
                data = fis,
                #ax=ax1,
                color="blue")
    #plt.title("Feature_Importance", fontsize=10)
    plt.ylabel("Feature", fontsize=10)
    plt.xlabel("Feature_Importance : f-Score", fontsize=10)

    """
    # 3.2 PDF
    confidence_score = clf.oob_decision_function_[:,1]
    sns.distplot(confidence_score, kde=False, rug=False, ax=ax2)
    ax2.set_title("PDF")

    # 3.3 CDF
    num_bins = min(best_param_.get('n_estimators',1), 100)
    counts, bin_edges = np.histogram(confidence_score, bins=num_bins, normed=True)
    cdf = np.cumsum(counts)
    ax3.plot(bin_edges[1:], cdf / cdf.max())
    ax3.set_title("CDF")
    ax3.set_xlabel("Oob_Decision_Function:Confidence_Score", fontsize=10)
    """

    png_fname = os.path.join(Config.get_string('data.path'), 'graph', png_fname)
    plt.tight_layout()
    plt.savefig(png_fname)#, bbox_inches='tight', pad_inches=1)
    plt.close()

    return True
Example #8
def cv(X_train, y_train, features_inner):

    kfold = StratifiedKFold(n_splits=5, shuffle=True)

    scores_f = []
    scores_p = []
    scores_r = []

    for train, test in kfold.split(X_train, y_train):

        model = XGBClassifier()
        X_train_cv = pd.DataFrame(X_train.values[train], columns=X_train.columns)
        y_train_cv = pd.DataFrame(y_train.values[train], columns=["tred_cutoff"])
        X_test_cv = pd.DataFrame(X_train.values[test], columns=X_train.columns)
        y_test_cv = pd.DataFrame(y_train.values[test], columns=["tred_cutoff"])
        model.fit(X_train_cv, y_train_cv)

        y_pred = model.predict(X_test_cv)

        s_f = f1_score(y_test_cv, y_pred)
        s_p = precision_score(y_test_cv, y_pred)
        s_r = recall_score(y_test_cv, y_pred)
        print("\tscores f1", (s_f))
        print("\tscores p", (s_p))
        print("\tscores r", (s_r))
        scores_f.append(s_f)
        scores_p.append(s_p)
        scores_r.append(s_r)

    print("mean scores f1", np.mean(scores_f))
    print("mean scores p", np.mean(scores_p))
    print("mean scores r", np.mean(scores_r))
Example #9
def XGB_model(train,y):
	model=XGBClassifier(n_estimators=150, learning_rate=0.01)
	from sklearn import cross_validation
	cv = cross_validation.KFold(len(train), n_folds=5,random_state=7)
	for traincv,testcv in cv:
	    model.fit(train.iloc[traincv],y.iloc[traincv])
	y_XGB=model.predict(test)
	return y_XGB
Example #10
def main():
    # Set seed for reproducibility
    np.random.seed(0)

    print("Loading data...")
    # Load the data from the CSV files
    
    training_data = pd.read_csv('/home/vipin/Videos/train.csv', header=0)
    prediction_data = pd.read_csv('/home/vipin/Videos/test.csv', header=0)
     
     
    training_data['countrycode']=training_data['countrycode'].apply(lambda x:ord(x))
    training_data['browserid']=training_data['browserid'].apply(lambda x: myfunc (x) if np.all(pd.notnull(x)) else myfunc("unknown") )
    training_data['devid']=training_data['devid'].apply(lambda x: myfunc (x) if np.all(pd.notnull(x)) else myfunc("none"))
    
    
    #pd.to_csv('/home/vipin/Videos/train11.csv', sep=',', encoding='utf-8')
    #exit(0)
    prediction_data['countrycode']=prediction_data['countrycode'].apply(lambda x:ord(x))
    prediction_data['browserid']=prediction_data['browserid'].apply(lambda x:myfunc (x) if np.all(pd.notnull(x)) else myfunc("unknown") )
    prediction_data['devid']=prediction_data['devid'].apply(lambda x:myfunc (x) if np.all(pd.notnull(x)) else myfunc("none") )
    
    
    features=['siteid','offerid','category','merchant','countrycode','browserid','devid']
    target="click"
    X = training_data[features]
    x_prediction = prediction_data[features]
    Y= training_data[target]
    ids = prediction_data["ID"]
    model = XGBClassifier()
            
            
    #linear_model.LogisticRegression(n_jobs=-1)
        
    print("Training...")
            # Your model is trained on the training_data
    model.fit(X, Y)
        
    print("Predicting...")
    
    seed =7
    test_size=0.33
    X_train,X_test,y_train,y_test=train_test_split(X,Y,test_size=test_size,random_state=seed)
    y_prediction = model.predict_proba(x_prediction)
    results = y_prediction[:, 1]
    results_df = pd.DataFrame(data={'probability':results})
    joined = pd.DataFrame(ids).join(results_df)
        
    y_pred=model.predict(X_test)
    accuracy=accuracy_score(y_test,y_pred)
    

    print("Accuracy: %.2f%%" % (accuracy * 100.0))
    print("Writing predictions to predictions.csv")
        # Save the predictions out to a CSV file
    joined.to_csv("/home/vipin/Videos/predictions.csv", index=False)
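Both frames are encoded with a myfunc helper whose definition is not included; a minimal stand-in, assuming it only needs to map a browser or device string to a deterministic integer, could be:

def myfunc(value):
    # Hypothetical stand-in for the undefined encoder: turn a string such as a
    # browser or device name into a stable integer feature.
    return sum(ord(ch) for ch in str(value))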
Example #11
File: tests.py Project: booleancandy/tpot
def test_xgboost():
    """Ensure that the TPOT xgboost method outputs the same as the xgboost classfier method"""

    tpot_obj = TPOT()
    result = tpot_obj._xgradient_boosting(training_testing_data, n_estimators=100, learning_rate=0, max_depth=3)
    result = result[result['group'] == 'testing']

    xgb = XGBClassifier(n_estimators=100, learning_rate=0.0001, max_depth=3, seed=42)
    xgb.fit(training_features, training_classes)

    assert np.array_equal(result['guess'].values, xgb.predict(testing_features))
Example #12
File: predict.py Project: jmc856/Webpage
def update_model(current_year):
    print('Creating model...\nDate: {}'.format(datetime.now().strftime('%Y-%m-%d_%H:%M:%S')))

    managers = tuple(unique_managers(current_year))

    sql = "select * from (select week, year, manager1_name, manager2_name, team1_points, team1_projected, team2_points, team2_projected, type \
         from scoreboard_all WHERE team1_points > 0 and week<=13 \
        UNION select week, year, manager2_name AS manager1_name, manager1_name as manager2_name, team2_points AS team1_points, \
        team2_projected AS team1_projected, team1_points as team2_points, team1_projected AS team2_projected, type FROM scoreboard_all \
        where team1_points>0 and week<=13) order by year, week, type;"

    ff1 = download_data(os.path.join(os.getcwd(), 'data/fantasy_football.db'), sql)

    data_features = custom_features(ff1)
    data_features = data_features[(data_features.manager1_name.isin(managers)) & (data_features.manager2_name.isin(managers))]
    X, y, managers, league_type = dummy_and_interaction(data_features)
    # feats = X.columns.tolist()
    sc = StandardScaler()
    X_std = sc.fit_transform(X)
    pickle_model(sc, 'standard.scaler')

    # Select best features
    X_train, X_test, y_train, y_test = train_test_split(X_std, y, test_size=0.25, random_state=None)

    model = XGBClassifier()
    model.fit(X_train, y_train)
    # imports = model.feature_importances_.tolist()
    # g = zip(feats, imports)
    # feat_importance = sorted(g, key=lambda x: x[1], reverse=True)
    # print feat_importance
    X_train_trans, X_test_trans = feature_selection(model, X_train, X_test, y_train, y_test, eval_metric='auc')

    # Select best params
    model = XGBClassifier()
    learning_rate = [0.001, 0.01, 0.1, 0.2, 0.3]
    n_estimators = [50, 100, 150, 200, 250, 300]
    param_grid = dict(n_estimators=n_estimators, learning_rate=learning_rate)

    grid_search = GridSearchCV(model, param_grid, scoring="log_loss", cv=10, verbose=1)
    result = grid_search.fit(X_train_trans, y_train)

    print("Best: {0} using {1}".format(result.best_score_, result.best_params_))
    print('Best params: ', result.best_params_)
    best_est = result.best_estimator_
    validation = best_est.predict_proba(X_train_trans)
    print("Roc AUC Train: ", roc_auc_score(y_train, validation[:, 1], average='macro'))

    probs = best_est.predict_proba(X_test_trans)
    print("Roc AUC Validation: ", roc_auc_score(y_test, probs[:, 1], average='macro'))

    pickle_model(best_est, 'fantasy.predict')
Example #13
def xgboostcv(max_depth,
              learning_rate,
              n_estimators,
              gamma,
              min_child_weight,
              max_delta_step,
              subsample,
              colsample_bytree,
              silent=True,
              nthread=-1,
              seed=1234):

    clf = XGBClassifier(max_depth=int(max_depth),
                        learning_rate=learning_rate,
                        n_estimators=int(n_estimators),
                        silent=silent,
                        nthread=nthread,
                        gamma=gamma,
                        min_child_weight=min_child_weight,
                        max_delta_step=max_delta_step,
                        subsample=subsample,
                        colsample_bytree=colsample_bytree,
                        seed=seed,
                        objective="binary:logistic")

    # Run Kfolds on the data model to stop over-fitting
    X_train, X_valid, y_train, y_valid = train_test_split(train,
                                                          train_labels,
                                                          test_size=0.1,
                                                          random_state=seed)
    xgb_model = clf.fit(X_train, y_train, eval_metric="auc", eval_set=[(X_valid, y_valid)], early_stopping_rounds=20)
    y_pred = xgb_model.predict_proba(X_valid)[:,1]

    return auc(y_valid, y_pred)
Example #14
def train_model_xgb_meta(train_x, train_y, xgb_features):
    train_ind = StratifiedShuffleSplit(train_y, random_state=1, test_size=0.2)

    for train_index, test_index in train_ind:
        x_train = train_x.ix[train_index, :]
        y_train = train_y.ix[train_index]

        x_eval = train_x.ix[test_index, :]
        y_eval = train_y.ix[test_index]


    #Classifier
    xgb = XGBClassifier(max_depth=xgb_features['max_depth'], learning_rate=xgb_features['learning_rate'], n_estimators=int(xgb_features['n_estimators']), objective='binary:logistic',
                        subsample=xgb_features['subsample'], colsample_bytree=xgb_features['colsample_bytree'], min_child_weight=xgb_features['min_child_weight'])
    # gives 0.458

    #  bag_clf = BaggingClassifier(xgb, max_samples=10, warm_start=True, verbose=10)
    #  x_train = pd.DataFrame(x_train, dtype=float)
    #  bag_clf.fit(x_train, y_train)
    xgb = xgb.fit(x_train, y_train, verbose=True, eval_metric='logloss',  eval_set=[(x_eval, y_eval)], early_stopping_rounds=10)

    #  cv_score = cross_val_score(xgb, x_train, y_train, cv=4, n_jobs=1, pre_dispatch=1, verbose=10, scoring='log_loss')
    #  print(cv_score)
    #  print(np.mean(cv_score))

    #  predictions = pd.Series(xgb.predict_proba(x_train, ntree_limit=xgb.best_iteration)[:, 1], name='PredictedProb')

    return xgb  #  , predictions
Example #15
def runner ():
    m = Model()
    X = m.df.drop("tred_cutoff", axis=1)
    Y = m.df["tred_cutoff"]
    features_inner = m.features + m.features_2
    cv(X, Y, features_inner)

    model = XGBClassifier()
    model.fit(X, Y)

    y_pred = model.predict(m.X_test)
    s_f = f1_score(m.y_test, y_pred)
    s_p = precision_score(m.y_test, y_pred)
    s_r = recall_score(m.y_test, y_pred)
    print("test f1", s_f)
    print("test precision", s_p)
    print("test recall", s_r)
def main():
    titanic = pandas.read_csv('dataset/titanic.csv')

    x_set = titanic[['pclass', 'age', 'sex']]
    y_set = titanic['survived']
    x_set.fillna(x_set['age'].mean(), inplace=True)
    x_train, x_test, y_train, y_test = utils.prepare_train_and_test_sets(x_set, y_set)

    dict_vectorizer = DictVectorizer(sparse=False)
    x_train = dict_vectorizer.fit_transform(x_train.to_dict(orient='records'))
    x_test = dict_vectorizer.transform(x_test.to_dict(orient='records'))

    decision_tree_classifier = DecisionTreeClassifier()
    utils.get_trained_result(decision_tree_classifier, x_test, x_train, y_test, y_train)

    xgb_classifier = XGBClassifier()
    xgb_classifier.fit(x_train, y_train)
    utils.get_trained_result(xgb_classifier, x_test, x_train, y_test, y_train)
Example #17
  def trainXGB(data_subset):
    f.write('\nTraining XGB:'+'\n')

    X_train = data[data_subset]['X_train']
    X_test = data[data_subset]['X_test']
    y_train = data[data_subset]['y_train']
    y_test = data[data_subset]['y_test']

    for p in params['xgboost']:
      if data_subset != 'binary' and p['objective'] == 'binary:logistic':
        print("Skip using non-binary data with XGB binary:logistic objective")
        continue
      if data_subset == 'binary' and p['objective'] != 'binary:logistic':
        print("Skip using binary data with XGB multi:* objective")
        continue

      header = "@ subset: {0}, params: {1}".format(data_subset, p)
      f.write('\n'+header+'\n')

      objective = p['objective']
      max_depth = p['max_depth']
      try:
        n_estimators= p['n_estimators']
      except KeyError as e:
        n_estimators= 100

      model = XGBClassifier(objective=objective, max_depth=max_depth,
        n_estimators=n_estimators)

      start = time.time()
      model.fit(X_train, y_train)
      elapsed_train = time.time() - start

      y_pred = model.predict(X_test).astype(int)
      elapsed_predict = time.time() - start

      accuracy = accuracy_score(y_test, y_pred)
      precision, recall, fscore, support = precision_recall_fscore_support(y_test, y_pred, pos_label=2, average='weighted')

      print("\n{5}\nXGB with {0} objective, {6} max_depth, {7} n_estimators on data subset {1} trained in {2} seconds and predicted in {3} seconds with an accuracy of {4}\n".format(objective, data_subset, elapsed_train, elapsed_predict, accuracy, header, max_depth, n_estimators))

      f.write(str(elapsed_train) + ', ' + str(elapsed_predict) + ', ' + str(accuracy) + ', ' + str(precision) + ', ' + str(recall) + ', ' + str(fscore) + ', ' + str(support))
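The loop expects params['xgboost'] to be a list of parameter dictionaries; a hedged illustration of the structure it appears to assume (keys taken from the accesses above, values illustrative only):

params = {
    'xgboost': [
        {'objective': 'binary:logistic', 'max_depth': 6, 'n_estimators': 200},
        {'objective': 'multi:softmax', 'max_depth': 4},  # n_estimators falls back to 100
    ],
}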
Example #18
def get_thresh(model,train,test,label_test,label_train):
    if (len(test)>len(train)) or (len(label_test)>len(label_train)):
        raise TypeError('Invalid train and test size')
    model1 = XGBClassifier()
    if type(model)!=type(XGBClassifier()):
        raise TypeError('Invalid model passed')
    if (pd.DataFrame(label_train).shape[1]>1) or (pd.DataFrame(label_test).shape[1]>1):
    	raise TypeError('Multiple columns in label, Invalid shape.')
    max_score=0
    thrsh=0
    thresholds = np.sort(model.feature_importances_)
    for thresh in thresholds:
        selection = feature_selection.SelectFromModel(model, threshold=thresh,prefit=True)
        select_X_train = selection.transform(train)
        selection_model = XGBClassifier()
        selection_model.fit(select_X_train, label_train)
        select_X_test = selection.transform(test)
        y_pred = selection_model.predict(select_X_test)
        scr=metrics.roc_auc_score(label_test,y_pred)
        if(scr>max_score):
            max_score=scr
            thrsh=thresh
    return thrsh
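get_thresh expects an already-fitted model whose feature_importances_ drive the threshold search; a hedged usage sketch, assuming X_train, X_test, y_train, y_test are existing splits:

base_model = XGBClassifier()
base_model.fit(X_train, y_train)  # fit first so feature_importances_ is populated
best_threshold = get_thresh(base_model, X_train, X_test, y_test, y_train)
print('best importance threshold:', best_threshold)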
Example #19
def test_on_data(X, y):

    x_train, x_test, y_train, y_test = train_test_split(X, y, train_size=0.5, random_state=2333)
    print "train set: {}, test set: {}".format(len(x_train), len(x_test))
    cls = XGBClassifier()
    cls.fit(x_train, y_train)
    # on test
    pred = cls.predict(x_test)
    print "xgb accuracy score test", accuracy_score(y_test, pred)

    # on all
    pred = cls.predict(X)
    print "xgb accuracy score all", accuracy_score(y, pred)

    # compare to gbrt in sklearn
    cls = GradientBoostingClassifier()
    cls.fit(x_train, y_train)
    # on test
    pred = cls.predict(x_test)
    print "sklearn accuracy score test", accuracy_score(y_test, pred)

    # on all
    pred = cls.predict(X)
    print "sklearn accuracy score all", accuracy_score(y, pred)
Example #20
def train(imgfile='img/segmentation', modelfile='segmentation.pkl'):
    
    filelabel = getFiles(imgfile)
    row = 120
    col=40
    data = [z for z in (Img(x[1], row, col, x[0]).imgmap for x in filelabel) if z is not None]
    data = [x for x in sum(data, []) if x[0] is not None]
    label = np.array([CHARACTER.get(x[0]) for x in data])
    feature = np.array([np.array(x[1]) for x in data])
    from xgboost import XGBClassifier
    xgb = XGBClassifier(objective='multi:softmax',reg_alpha=1.0,reg_lambda=0.0,subsample=0.7,n_estimators=100,learning_rate=0.3)
    model = xgb.fit(feature,label,eval_set=[(feature,label)],eval_metric='mlogloss')
    import pickle
    fn = modelfile
    with open(fn, 'wb') as f:                     # open file in binary write mode for pickle
        pickle.dump(model, f)
Example #21
def train_model_xgb(train_x, train_y, xgb_features):

    train_ind = StratifiedShuffleSplit(train_y, random_state=1, test_size=0.1)

    for train_index, test_index in train_ind:
        x_train = train_x.ix[train_index, :]
        y_train = train_y.ix[train_index]

        x_eval = train_x.ix[test_index, :]
        y_eval = train_y.ix[test_index]

    #Classifier
    xgb = XGBClassifier(max_depth=xgb_features['max_depth'], learning_rate=xgb_features['learning_rate'], n_estimators=int(xgb_features['n_estimators']), objective='binary:logistic',
                        subsample=xgb_features['subsample'], colsample_bytree=xgb_features['colsample_bytree'], min_child_weight=xgb_features['min_child_weight'])
    # gives 0.458
    xgb = xgb.fit(x_train, y_train, verbose=True, eval_metric='logloss',  eval_set=[(x_eval, y_eval)], early_stopping_rounds=10)

    predictions = pd.Series(xgb.predict_proba(x_train, ntree_limit=xgb.best_iteration)[:, 1], name='PredictedProb')

    return xgb, predictions
Example #22
    new_test_data_handle = test_datas.drop(to_drop, axis=1)

    return new_test_data_handle


train_data = pd.read_csv('../../data/train.csv')
test_data = pd.read_csv('../../data/test.csv')
pro_datas, target = pro_train_data(train_data)
pre_datas = pro_test_data(test_data)

X_train, X_test, y_train, y_test = train_test_split(pro_datas,
                                                    target,
                                                    test_size=0.20,
                                                    random_state=RANDOM_STATE)

xgcmodel = XGBClassifier(n_estimators=48,
                         max_depth=4,
                         n_jobs=-1,
                         random_state=RANDOM_STATE)
xgcmodel.fit(X_train, y_train)

train_predictions = xgcmodel.predict(X_test)
print(classification_report(y_test, train_predictions, digits=4))

predictions = pd.DataFrame(xgcmodel.predict(pre_datas), columns=['Survived'])

predictions = pd.concat([test_data['PassengerId'], predictions],
                        axis=1,
                        join='inner')

predictions.to_csv('predictions.csv', index=False)
Example #23
from sklearn.metrics import accuracy_score
# load data
dataset = loadtxt('pima-indians-diabetes.csv', delimiter=",")
# split data into X and y
X = dataset[:, 0:8]
Y = dataset[:, 8]
# split data into train and test sets
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    Y,
                                                    test_size=test_size,
                                                    random_state=seed)
# fit model on training data
model = XGBClassifier()
eval_set = [(X_test, y_test)]
# specify a window of the number of epochs over which no improvement is observed.
# This is specified in the early stopping rounds parameter.
model.fit(X_train,
          y_train,
          early_stopping_rounds=10,
          eval_metric="logloss",
          eval_set=eval_set,
          verbose=True)
# make predictions for test data
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
Example #24
## train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.2,
                                                    shuffle=True,
                                                    random_state=66)

## modeling
model = XGBClassifier(
    n_estimators=1000,  # number of boosting rounds (one verbose line each), comparable to epochs
    learning_rate=0.1)

model.fit(x_train,
          y_train,
          verbose=True,
          eval_metric='rmse',
          eval_set=[(x_train, y_train), (x_test, y_test)])
# eval_metric options: rmse, mae, logloss, error (an error of 0.2 means accuracy 0.8), auc (an accuracy/precision-style ranking metric)

results = model.evals_result()
# print("eval's result : ", results)
y_pred = model.predict(x_test)

acc = accuracy_score(y_pred, y_test)
print("acc: ", acc)
# r2 = r2_score(y_pred, y_test)
# print("r2: %.2f" %(r2 * 100.0))

import pickle
pickle.dump(model, open("./model/xgb_save/cancer.pickle.dat", "wb"))
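A hedged sketch of loading the pickled model back and re-checking accuracy, reusing the path, imports, and split from the snippet above:

with open("./model/xgb_save/cancer.pickle.dat", "rb") as f:
    loaded = pickle.load(f)
print("reloaded acc:", accuracy_score(loaded.predict(x_test), y_test))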
Example #25
     training_indices, testing_indices = next(iter(StratifiedShuffleSplit(input_data['class'].values,
                                                                          n_iter=1,
                                                                          train_size=0.75,
                                                                          test_size=0.25,
                                                                          random_state=dataset_repeat)))
 
     training_features = input_data.loc[training_indices].drop('class', axis=1).values
     training_classes = input_data.loc[training_indices, 'class'].values
 
     testing_features = input_data.loc[testing_indices].drop('class', axis=1).values
     testing_classes = input_data.loc[testing_indices, 'class'].values
 
     # Create and fit the model on the training data
     try:
         clf = XGBClassifier(learning_rate=learning_rate, n_estimators=n_estimators, max_depth=max_depth)
         clf.fit(training_features, training_classes)
         testing_score = clf.score(testing_features, testing_classes)
     except:
         continue
 
     param_string = ''
     param_string += 'learning_rate={},'.format(learning_rate)
     param_string += 'n_estimators={},'.format(n_estimators)
     param_string += 'max_depth={}'.format(max_depth)
 
     out_text = '\t'.join([dataset.split('/')[-1][:-7],
                           'XGBClassifier',
                           param_string,
                           str(testing_score)])
 
     print(out_text)
Example #26
def getvalues_and_recommend():
    userid = 2552
    shop1 = request.form['shop1']
    rate1 = float(request.form['rate1'])
    shop2 = request.form['shop2']
    rate2 = float(request.form['rate2'])
    shop3 = request.form['shop3']
    rate3 = float(request.form['rate3'])
    shop4 = request.form['shop4']
    rate4 = float(request.form['rate4'])
    shop5 = request.form['shop5']
    rate5 = float(request.form['rate5'])
    shop6 = request.form['shop6']
    rate6 = float(request.form['rate6'])
    shop7 = request.form['shop7']
    rate7 = float(request.form['rate7'])
    shop8 = request.form['shop8']
    rate8 = float(request.form['rate8'])
    shop9 = request.form['shop9']
    rate9 = float(request.form['rate9'])
    shop10 = request.form['shop10']
    rate10 = float(request.form['rate10'])

    #creating a new spark session
    newspark = SparkSession.builder.appName('hybrid_rec').getOrCreate()
    #reading in prepped dataset for model-based collaborative filtering recommendation
    mbcf = newspark.read.csv('mbcf.csv', header=True, inferSchema=True)
    #making a copy for each new user input
    mbcf_try = mbcf
    vals = [(shop1,rate1,userid),(shop2,rate2,userid),(shop3,rate3,userid),(shop4,rate4,userid),(shop5,rate5,userid),(shop6,rate6,userid),(shop7,rate7,userid),(shop8,rate8,userid),(shop9,rate9,userid),(shop10,rate10,userid)]
    #pyspark's convention to adding new rows to the end of an existing spark dataframe-1
    newRows = newspark.createDataFrame(vals,mbcf_try.columns)
    #pyspark's convention to adding new rows to the end of an existing spark dataframe-2
    mbcf_try = mbcf_try.union(newRows)
    #converting df to pandas df for easier manipulation later on...
    mbcf_try_pd = mbcf_try.toPandas()
    #getting a look again at the outlets and ratings provided by userid2552 so we know which outlets to exclude in recommending outlets to userid2552 later on...
    user_item_2552 = mbcf_try_pd[mbcf_try_pd['userids']==2552]
    #as part of ALS requirements for the feature columns to be in numerical format, am converting both shops and userids to the double precision format just in case (even though userids is already in a float format)
    indexer_try = [StringIndexer(inputCol=column, outputCol=column+"_index") for column in list(set(mbcf_try.columns)-set(['ratings']))]
    pipeline_try = PL(stages=indexer_try)
    transformed_try = pipeline_try.fit(mbcf_try).transform(mbcf_try)
    #rank=300 and regParam=0.1 was a pair of tuned best params while retuning als with train test split stratified for userids...
    als = ALS(rank=300, regParam=0.1, maxIter=20, seed=42, userCol='userids_index',itemCol='shops_index', ratingCol='ratings',coldStartStrategy='drop')
    #training the dataset containing the new user's ratings...
    als_model_rec = als.fit(transformed_try)
    #making recommendations for model-based collaborative filtering alone first, passing in all 981 outlets so as to ensure as much overlap between collaborative filtering and content-based filtering in the outlets that they generate rating predictions for
    recs=als_model_rec.recommendForAllUsers(981).toPandas()
    nrecs=recs.recommendations.apply(pd.Series) \
                .merge(recs, right_index = True, left_index = True) \
                .drop(["recommendations"], axis = 1) \
                .melt(id_vars = ['userids_index'], value_name = "recommendation") \
                .drop("variable", axis = 1) \
                .dropna()
    nrecs=nrecs.sort_values('userids_index')
    nrecs=pd.concat([nrecs['recommendation'].apply(pd.Series), nrecs['userids_index']], axis = 1)
    nrecs.columns = [

            'Shop_index',
            'Rating',
            'UserID_index'

         ]
    md=transformed_try.select(transformed_try['userids'],transformed_try['userids_index'],transformed_try['shops'],transformed_try['shops_index'])
    md=md.toPandas()
    dict1=dict(zip(md['userids_index'],md['userids']))
    dict2=dict(zip(md['shops_index'],md['shops']))
    nrecs['UserID']=nrecs['UserID_index'].map(dict1)
    nrecs['shops']=nrecs['Shop_index'].map(dict2)
    nrecs=nrecs.sort_values('UserID')
    nrecs.reset_index(drop=True, inplace=True)
    new=nrecs[['UserID','shops','Rating']]
    new['recommendations'] = list(zip(new.shops, new.Rating))
    res=new[['UserID','recommendations']]
    res_new=res['recommendations'].groupby([res.UserID]).apply(list).reset_index()

    #creating a new df for userid2552's collaborative filtering-derived recommendations
    collab_rec_2552 = pd.DataFrame(dict(res_new[res_new["UserID"]==2552]['recommendations'].tolist()[0]),index=[0]).T.sort_values(0,ascending=False)

    #creating a list of outlets userid2552 has rated earlier on
    rated_2552 = mbcf_try_pd[mbcf_try_pd['userids']==2552]['shops'].tolist()

    #filtering out those 10 outlets userid2552 has rated initially from the collaborative filtering recommendation list...
    collab_rankedrecs_2552 = collab_rec_2552.loc[[shop for shop in collab_rec_2552.index if shop not in rated_2552],0]

    #organizing the above series column into a df of recommendations and collaborative filtering rating predictions
    collab_2552_df = pd.DataFrame({'recommendations':collab_rankedrecs_2552.index,'collab_filter_predicted_ratings':collab_rankedrecs_2552})

    #reading in the previously prepped df meant for content-based filtering here for content-based filtering recommendations..
    content_f = pd.read_csv('content_based_df_nouser.csv')

    #merging userid2552's info with the df meant for content-based filtering so that rcontent-based filtering can make recommendations via rating predictions for userid 2552 later on...
    content_2552 = pd.merge(content_f,user_item_2552,how='left',on='shops')

    #getting dummies for categorical columns...
    content_2552_wdummies = pd.get_dummies(content_2552, columns=['shops','category_alias'], drop_first=False)

    #setting feature and target
    X = content_2552_wdummies.drop(['ratings'], axis=1)
    y = content_2552_wdummies['ratings']

    #collating dummified columns
    shops_cats_list = [col for col in content_2552_wdummies.columns if (col.startswith('shops')) or (col.startswith('category'))]

    #extending with review_count and rating
    shops_cats_list.extend(['review_count','rating','userids'])

    #as tfidf can only work on one column of texts at a time, am separating features as below...
    X1 = X['reviews']
    X2 = X[shops_cats_list]

    #Assigning a new variable name to X1 for processing.
    rev = X1

    #creating customized stop words' list
    cust_stop_words = [word for word in stop_words.ENGLISH_STOP_WORDS]

    #adding on to the above list based on preliminary word cloud EDA
    cust_stop_words.extend(["wa","ha","just","ve","did","got","quite"])

    #preprocessing text in reviews by defining a function to do so
    lemm = WordNetLemmatizer()

    def text_processer(raw_text):
        # Function to convert a raw string of text to a string of words
        # The input is a single string (a raw unprocessed text), and
        # the output is a single string (a preprocessed text)

        # 1. Remove http urls.
        review_text = re.sub("\(http.+\)", " ", raw_text)

        # 2. Remove non-letters.
        letters_only = re.sub("[^a-zA-Z]", " ", review_text)

        # 3. Convert to lower case, split into individual words.
        words = letters_only.lower().split()

        # 4. Lemmatize words.
        lemmed_words = [lemm.lemmatize(i) for i in words]

        # 5. Remove stop words.

        meaningful_words = [w for w in lemmed_words if not w in cust_stop_words]

        # 6. Join the words back into one string separated by space,
        # and return the result.
        return(" ".join(meaningful_words))

    #showing how the processed reviews look like
    rev_processed = pd.Series([text_processer(text) for text in rev])

    #using tfidf vectorizer to convert the reviews into term frequency columns...
    tvec_naive = TfidfVectorizer(stop_words = cust_stop_words)  #instantiating TfidfVectorizer with customized stop words

    X1_tvec_naive = tvec_naive.fit_transform(rev_processed).todense()   #fitting tvec and transforming the processed reviews
    X1_tvec_naive_df = pd.DataFrame(X1_tvec_naive, columns = tvec_naive.get_feature_names())  #converting it into a dataframe for easy lookup.

    #combining tvec-df with the rest of the features for rating prediction for userid 2552 later on...
    X_legit = pd.concat([X1_tvec_naive_df,X2], axis=1)

    #adding back the ratings column so that it can be dropped selectively below
    X_legit['ratings'] = y

    #creating X_train manually for userid 2552
    X_train_2552 = X_legit[X_legit['userids']==2552].drop(['ratings','userids'],axis=1)

    #creating y_train manually for userid 2552
    y_train_2552 = X_legit[X_legit['userids']==2552]['ratings']

    #creating X_test manually for userid 2552 which contains all outlets that have not been rated by userid 2552
    X_test_2552 = X_legit[X_legit['userids']!=2552].drop(['ratings','userids'],axis=1)

    #instantiate scaler since not all of the features are of the same scale, eg. review_count and rating
    ss= StandardScaler()

    #fitting the train and transforming both the train and test sets
    X_train_2552_sc = ss.fit_transform(X_train_2552)
    X_test_2552_sc = ss.transform(X_test_2552)

    #learning rate, max depth, and n_estimators were retrieved from a tuned xgb model (notebook on future plan for xgb) saved in the folder but in order to use random_state which was not used during tuning, I am just instantiating a new xgb instance with the 3 tuned hyperparams set accordingly...
    xgb = XGBClassifier(learning_rate=0.5, max_depth=9, n_estimators=200, random_state=42)

    #training the loaded model on the dataset containing the new user, userid 2552's ratings.
    xgb.fit(X_train_2552_sc, y_train_2552)

    #stacking X_test_2552 as first step in regenerating the shops column for predictions
    trial = X_test_2552.stack()

    #creating loop to re-generate original X_test_2552 order of shops
    index_lst = []
    outlets_lst = []
    for n in range(len(trial.index)):
        if trial.index[n][1].startswith('shops_') and trial[n]!=0:
            index_lst.append(str(trial.index[n][0]))
            outlets_lst.append(trial.index[n][1])
    index_lst = [int(x) for x in index_lst]
    reconstructed_X_test_2552 = pd.DataFrame({'shops':outlets_lst}, index=index_lst)

    #generating content-based filtering rating predictions for userid 2552
    rating_predictions = xgb.predict(X_test_2552_sc)

    #adding new column of rating predictions into the reconstructed X_test_2552
    reconstructed_X_test_2552['predicted_ratings']=rating_predictions

    #giving the reconstructed df a more easily understood name for distinction from the collaborative filtering df dealt with above
    content_2552_df = reconstructed_X_test_2552

    #trimming off the shops' prefixes so that they can eventually be merged with the collaborative filtering df
    content_2552_df['shops'] = content_2552_df['shops'].apply(lambda x: x[6:])

    #renaming the column of rating predictions to distinguish from collaborative filtering's prediction column later on when both dfs are merged.
    content_2552_df.rename(columns={'predicted_ratings':'content_filter_predicted_ratings'},inplace=True)

    #renaming collaborative filtering df's recommendations' column so that it can be merged with the content-based filtering df.
    collab_2552_df.rename(columns={'recommendations':'shops'},inplace=True)

    #reseting the index in the collaborative filtering df so that the index is numerical again
    collab_2552_df.reset_index(drop=True,inplace=True)

    #merging both content-based filtering and collaborating filtering df to prepare to make hybrid recommendations for userid 2552
    content_collab_2552_df = pd.merge(content_2552_df,collab_2552_df,how='inner',on='shops')

    #as mentioned in the previous sub-notebook on this hybrid recommender's evaluation, the following are the content-based and collaborative filtering's ratings' weights
    con_wt = 0.97 / (0.97 + 1.0)
    collab_wt = 1.0 / (0.97 + 1.0)

    #feature engineering to add hybrid recommender's rating predictions into the combined df by multiplying the respective rating predictions by weights based on both models' f1 scores derived from prior evaluation and summing them up to yield hybrid predictions
    content_collab_2552_df['final_weighted_rating_predictions'] = (content_collab_2552_df['content_filter_predicted_ratings']*con_wt) + (content_collab_2552_df['collab_filter_predicted_ratings']*collab_wt)

    #top 5 coffee-drinking outlet recommendations for userid 2552 (me!) based on my ratings given rather randomly to 10 of the outlets earlier on...
    #recommendations_top_5 = content_collab_2552_df.sort_values('final_weighted_rating_predictions',ascending=False).head()
    top_5_recs = content_collab_2552_df[['shops','final_weighted_rating_predictions']].sort_values('final_weighted_rating_predictions',ascending=False).head()
    top_5_recs.reset_index(drop=True,inplace=True)
    first = top_5_recs.loc[0,'shops']
    second = top_5_recs.loc[1,'shops']
    third = top_5_recs.loc[2,'shops']
    fourth = top_5_recs.loc[3,'shops']
    fifth = top_5_recs.loc[4,'shops']

    return render_template('outcome.html', first=first, second=second, third=third, fourth=fourth, fifth=fifth, shop1=shop1, rate1=rate1, shop2=shop2, rate2=rate2, shop3=shop3, rate3=rate3, shop4=shop4, rate4=rate4, shop5=shop5, rate5=rate5, shop6=shop6, rate6=rate6, shop7=shop7, rate7=rate7, shop8=shop8, rate8=rate8, shop9=shop9, rate9=rate9, shop10=shop10, rate10=rate10, url_alias=url_alias)
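This route depends on a number of imports that sit outside the excerpt; a hedged reconstruction of the ones it appears to assume:

# Hypothetical import block for the route above
import re
import pandas as pd
from flask import request, render_template
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline as PL
from pyspark.ml.feature import StringIndexer
from pyspark.ml.recommendation import ALS
from sklearn.feature_extraction import stop_words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from nltk.stem import WordNetLemmatizer
from xgboost import XGBClassifier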
def test_run():
    #read data
    data = pd.read_csv('/Desktop/creditcard.csv')

    #get some correlations
    corr_matrix = data.corr()
    print('Correlations')
    print(corr_matrix["Class"].sort_values(ascending=False))

    #select predictor variables and drop missing data
    df = data.loc[:, data.columns != 'Class']
    df.dropna()

    #assign target variable (Class in this case)
    target = pd.DataFrame(data, columns=["Class"])

    X = df
    y = target

    #solve dataset imbalances on dependent variable using SMOTEENN algorithm
    #sme = SMOTEENN(random_state=42)
    #X, y = sme.fit_sample(X, y)

    # Standardize features
    scaler = StandardScaler()
    X_std = scaler.fit_transform(X)

    X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
        X_std, y, test_size=0.3, random_state=42)

    #Create model
    clf = XGBClassifier(max_depth=6,
                        min_child_weight=1,
                        eta=0.1,
                        silent=1,
                        objective='multi:softmax',
                        num_class=2)

    # Train model
    model = clf.fit(X_train, Y_train.values.ravel())

    # make predictions for test data
    y_pred = model.predict(X_test)
    predictions = [round(value) for value in y_pred]
    #print(predictions)

    print('Classification Report')
    print(classification_report(Y_test, predictions))

    #confusion matrix
    print('Confusion Matrix')
    print(confusion_matrix(Y_test, predictions))

    #k fold validation
    kfold = StratifiedKFold(n_splits=10, random_state=7)
    results = cross_val_score(clf, X_std, y, cv=kfold)
    print("Stratified K-Fold Accuracy: %.2f%% (%.2f%%)" %
          (results.mean() * 100, results.std() * 100))

    # plot feature importance
    plot_importance(model)
    plt.show()

    # save model
    filename = '/Desktop/Credit_model.pkl'
    pickle.dump(model, open(filename, 'wb'))

    # predict values in original data, to see how our model's predictions compare with real values
    b = model.predict(X_std)

    # send predictions to csv, after merging them with original data
    df2 = pd.DataFrame(data={"predicted": b})
    pd.set_option('display.max_colwidth', -1)

    data['Predicted'] = df2

    data.to_csv(r"/Desktop/predicted.csv")
Example #28
def fit_xgboost(params, X, y):
    clf = XGBClassifier(**params)
    clf.fit(X, y)
    return clf
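A hedged usage sketch for the helper above (parameter values illustrative, X and y assumed to be an existing feature matrix and label vector):

params = {'n_estimators': 200, 'max_depth': 4, 'learning_rate': 0.1}
clf = fit_xgboost(params, X, y)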
Example #29
X = dataset.iloc[:, :7]

#Converting words to integer values
# def convert_to_int(word):
#     word_dict = {'one':1, 'two':2, 'three':3, 'four':4, 'five':5, 'six':6, 'seven':7, 'eight':8,
#                 'nine':9, 'ten':10, 'eleven':11, 'twelve':12, 'zero':0, 0: 0}
#     return word_dict[word]

# X['experience'] = X['experience'].apply(lambda x : convert_to_int(x))

y = dataset.iloc[:, -1]

#Splitting Training and Test Set
#Since we have a very small dataset, we will train our model with all available data.

# from sklearn.linear_model import LinearRegression
# regressor = LinearRegression()
from xgboost import XGBClassifier
regressor = XGBClassifier()


#Fitting model with trainig data
regressor.fit(X, y)

# Saving model to disk
pickle.dump(regressor, open('model.pkl','wb'))

# Loading model to compare the results
# model = pickle.load(open('model.pkl','rb'))
# print(model.predict([[2, 9, 6]]))
Example #30
                    n_jobs=4,
                    eta=0.02,
                    gamma=0,
                    max_depth=8,
                    subsample=0.8715623,
                    colsample_bytree=0.9497036,
                    colsample_bylevel=0.8,
                    min_child_weight=39.3259775,
                    reg_alpha=0.041545473,
                    reg_lambda=0.0735294,
                    random_state=42,
                    n_estimators=10000)
#%%
clf.fit(train_x,
        train_y,
        eval_set=[(train_x, train_y), (valid_x, valid_y)],
        eval_metric='mlogloss',
        verbose=10,
        early_stopping_rounds=30)
pickle.dump(clf, open("xgb3.pickle", "wb"))
#%%
pred_valid_label = list(clf.predict(valid_x))
print('Accuracy_score %.6f' % accuracy_score(valid_y, pred_valid_label))
feats = [f for f in train.columns if f not in ['acc_id', 'label']]
importance_data = pd.DataFrame()
importance_data["feature"] = feats
importance_data["importance"] = clf.feature_importances_
#%%
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
# class order: 2month, month, retained, week
cm = confusion_matrix(valid_y, pred_valid_label)
Example #31
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0)

# Training XGBoost on the Training set
from xgboost import XGBClassifier

classifier = XGBClassifier()
classifier.fit(X_train, y_train)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix, accuracy_score

y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

# Applying k-Fold Cross Validation
from sklearn.model_selection import cross_val_score

accuracies = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=10)
print("Accuracy: {:.2f} %".format(accuracies.mean() * 100))
print("Standard Deviation: {:.2f} %".format(accuracies.std() * 100))
Example #32
# Divide each dataset into independent variables and the dependent variable
Train_X = Train.drop('is_click', axis = 1).copy()
Train_Y = Train['is_click'].copy()
Test_X = Test.drop('is_click', axis = 1).copy()
Test_Y = Test['is_click'].copy()


##########################################
# Fitting the XGBoost to the training set
##########################################

from xgboost import XGBClassifier
classifier = XGBClassifier()

XGB_Model = classifier.fit(Train_X,Train_Y)

########################
# Predict on testset
########################

y_pred = XGB_Model.predict(Test_X)


# Making the confusion matrix 
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(Test_Y,y_pred)

cm

#############################
def XGBoosting(trainData, trainLable):
	clf = XGBClassifier()
	clf.fit(trainData, trainLable)
	return clf
def build_model():
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=100)
    model = XGBClassifier()
    result = model.fit(X_train, y_train)

    return result
Example #35
end_time2 = timeit.default_timer() # record the end time


g_best_model = model.best_estimator_.named_steps["anyway"]
g_feature_importance = model.best_estimator_.named_steps["anyway"].feature_importances_
print("g_feature_importance:\r\n",g_feature_importance)

######## sort the feature importances obtained with the best params, then find the threshold at the maximum score
thresholds = np.sort(g_feature_importance) # ascending by default
temp_array =[]
for thresh in thresholds:
    selection = SelectFromModel(g_best_model, threshold=thresh, prefit=True)
    select_x_train = selection.transform(x_train)
    selection_model = XGBClassifier()
    selection_model.fit(select_x_train, y_train)
    select_x_test = selection.transform(x_test)
    y_predict = selection_model.predict(select_x_test)
    score = accuracy_score(y_test, y_predict)
    # print('Thresh=%.6f, n=%d, R2:%.6f' 
    #         %(thresh, select_x_train.shape[1], score))
    temp_array.append([thresh, score])

# sort temp_array in ascending order of score,
# then apply the thresh from the last (maximum-score) entry
# print("temp_array:\r\n", temp_array)
temp_array.sort(key=lambda x: x[1])
# print("temp_array:\r\n", temp_array)

feature_thresh = temp_array[-1][0]
print("feature_thresh:",feature_thresh)
Example #36
y_pred_proba_LR = classifier_LR.predict_proba(X_test)[::,1]
fpr2, tpr2, _ = metrics.roc_curve(ytest,  y_pred_proba_RF)
auc2 = metrics.roc_auc_score(ytest, y_pred_proba_RF)
print(auc2)

# ROC
plt.figure(figsize=(10,7))
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr2,tpr2,label="Logistic Regression, auc="+str(round(auc2,2)))

## XGboost

#Running the Model
from xgboost import XGBClassifier
classifier_XGB=XGBClassifier()
classifier_XGB.fit(X_train,ytrain)

# Predicting the results
y_pred=classifier_XGB.predict(X_test)

#Making the Confusion Matrix
confusion_matrix = pd.crosstab(ytest, y_pred, rownames=['Actual'], colnames=['Predicted'], margins = True)
print(confusion_matrix)
sns.heatmap(confusion_matrix,annot=True,fmt='d',linewidths=.9)

#Model Performance

# Model Accuracy
from sklearn import metrics
print("Accuracy:",metrics.accuracy_score(ytest, y_pred))
 'subsample': 1,
 'colsample_bytree': 1,
 'colsample_bylevel': 1,
 'learning_rate': 0.0536444221653737,
 'gamma': 8.491520978228445,
 'max_depth': 3,
 'min_child_weight': 1,
 'max_delta_weight': 12,
 'rate_drop': 0.9445947559908133}


# In[15]:


xgb_model = XGBClassifier(**xgboost_params)
xgb_model.fit(x_pci_train, y_pci_train)

y_pci_pred = xgb_model.predict(x_pci_test)
predictions = [round(value) for value in y_pci_pred]
accuracy = accuracy_score(y_pci_test, predictions)
print(1-accuracy)


# # LGBM 

# In[16]:


import lightgbm
import lightgbm as lgb
from lightgbm import LGBMClassifier
                     max_iter=2000,
                     momentum=0.87)
clf7 = LogisticRegression(solver='saga')
clf8 = KNeighborsClassifier(n_neighbors=3)
clf9 = KNeighborsClassifier(n_neighbors=5)
clfA = KNeighborsClassifier(n_neighbors=7)
clfB = KNeighborsClassifier(n_neighbors=9)
clfC = GaussianNB()
clfD = LinearDiscriminantAnalysis()
clfE = AdaBoostClassifier(n_estimators=500)
clfF = XGBClassifier(n_estimators=500, objective='binary:logistic', gamma=7)
'''
new_X_train, new_X_test = construct_metafeatures(
    [clf1, clf2, clf3, clf4, clf5, clf6, clf7, clf8, clf9,
    clfA, clfB, clfC, clfD, clfE], 
    X_train, X_test, y_train)
'''

n_fold = 10
train1, test1 = Stacking(clf1, X_train, y_train, X_test, n_fold)
#train2, test2 = Stacking(clf2, X_train, y_train, X_test, n_fold)

print(train1.shape, test1.shape)
exit()
new_X_train = np.concatenate((train1, train2), axis=1)
new_X_test = np.concatenate((test1, test2), axis=1)

clfF.fit(new_X_train, y_train)
pred = clfF.predict(new_X_test)
scr = accuracy_score(y_test, pred)
print('Meta classifier score: {:.4f}'.format(scr))
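Stacking (like construct_metafeatures) is a project helper that is not shown; a minimal hedged sketch of the usual out-of-fold stacking routine it appears to implement, assuming numpy-array inputs:

import numpy as np
from sklearn.model_selection import KFold

def Stacking(model, X_train, y_train, X_test, n_fold):
    # Hypothetical reconstruction: out-of-fold predictions as train meta-features,
    # fold-averaged predictions as test meta-features, for one base model.
    folds = KFold(n_splits=n_fold, shuffle=True, random_state=0)
    train_meta = np.zeros((len(X_train), 1))
    test_meta = np.zeros((len(X_test), 1))
    for tr_idx, val_idx in folds.split(X_train):
        model.fit(X_train[tr_idx], y_train[tr_idx])
        train_meta[val_idx, 0] = model.predict(X_train[val_idx])
        test_meta[:, 0] += model.predict(X_test) / n_fold
    return train_meta, test_meta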
Example #39
        "name": i,
        "id": i
    } for i in ['index', value.lower()]], describe_df[['index',
                                                       value.lower()
                                                       ]].to_dict('records')


iris = datasets.load_iris()
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=42)
bst = XGBClassifier(max_depth=1, silent=True, objective='multi:softprob')
bst.fit(X_train, y_train)
preds = bst.predict(X_test)
with open("model.txt", "rb") as f:
    model = pickle.loads(f.read())
confusion_matrix = confusion_matrix(y_test, preds)

feature_importance_graphs = list()
for importance_type in [
        'weight', 'gain', 'cover', 'total_gain', 'total_cover'
]:
    curr_importances = bst.get_booster().get_score(
        importance_type=importance_type)
    curr_importances = {
        k: v
        for k, v in sorted(curr_importances.items(), key=lambda item: item[1])
    }
Example #40
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    train_size=0.8,
                                                    random_state=1)

model = XGBClassifier(n_estimators=1000,
                      learning_rate=0.1,
                      n_jobs=-1,
                      objective='multi:softmax')
# model = MultiOutputClassifier(xgb)
# model.fit(x_train, y_train, verbose=True,  eval_metric= "error",
#                 eval_set=[(x_train, y_train), (x_test, y_test)])
model.fit(x_train,
          y_train,
          verbose=True,
          eval_metric=["mlogloss", "merror"],
          eval_set=[(x_train, y_train), (x_test, y_test)],
          early_stopping_rounds=20)

# rmse, mae, logloss, error, auc

result = model.evals_result()
print(result)

y_pred = model.predict(x_test)

r2 = r2_score(y_pred, y_test)
print(f"r2: {r2}")

epochs = len(result['validation_0']['mlogloss'])
x_axis = range(0, epochs)
Example #41
                      colsample_bylevel=1,
                      gamma=0,
                      colsample_bytree=1,
                      max_delta_step=0,
                      min_child_weight=1,
                      missing=None,
                      reg_alpha=0,
                      reg_lambda=1,
                      scale_pos_weight=1,
                      seed=0,
                      subsample=1)

model.feature_names = feature_names

print(model)
model.fit(X_train, np.ravel(Y_train))

#save model
model_save_name = save_directory + '/antgc_' + signal + '_bdt'
model._Booster.dump_model(model_save_name + '.xgb')
model._Booster.save_model(model_save_name + '_bin.xgb')
pk.dump(model, open(model_save_name + '.pickle', 'wb'))
print('Saved model ' + model_save_name + ' (*.xgb, *.pickle)')

# save train and test sets
train_save_file = save_directory + '/train_set' + signal + '.txt'
test_save_file = save_directory + '/test_set' + signal + '.txt'
train_save = np.append(X_train, Y_train, axis=1)
test_save = np.append(X_test, Y_test, axis=1)
np.savetxt(train_save_file, train_save, delimiter=",")
np.savetxt(test_save_file, test_save, delimiter=",")
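A hedged sketch of reloading the binary booster saved above, reusing the same model_save_name:

import xgboost as xgb

bst = xgb.Booster()
bst.load_model(model_save_name + '_bin.xgb')  # reload the binary model written above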
Example #42
    learning_rate = [0.001, 0.01, 0.1, 0.2]
    xgb_params = dict(n_estimators=n_estimators, learning_rate=learning_rate)

    kfold_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=10)
    grid_search = GridSearchCV(model, xgb_params, scoring="neg_log_loss", n_jobs=-1, cv=kfold_cv)
    grid_result = grid_search.fit(X_train, label_encoded_y_train)
    # summarize results
    print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
    means = grid_result.cv_results_['mean_test_score']
    stds = grid_result.cv_results_['std_test_score']
    params = grid_result.cv_results_['params']
    for mean, stdev, param in zip(means, stds, params):
        print("%f (%f) with: %r" % (mean, stdev, param))'''

    xgb = XGBClassifier(learning_rate=0.01, n_estimators=100)
    xgb.fit(X_train, y_train)
    xgb_preds = xgb.predict(X_test)

    print(classification_report(y_test, xgb_preds))

    conf_matrix_xgb = metrics.plot_confusion_matrix(xgb, X_test, y_test, cmap=plt.cm.Blues)
    conf_matrix_xgb.ax_.set_title("XGBoost Confusion Matrix")

    '''
        LOGISTIC REGRESSION
    '''
    '''clf_lr_10 = LogisticRegression(solver='liblinear', random_state=0)
    clf_lr_10.fit(X_train_10, y_train_10)'''

    lr = LogisticRegression(solver='liblinear', random_state=10)
    lr.fit(X_train, y_train)
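To keep the comparison with XGBoost symmetric, the logistic regression can be evaluated with the same report and confusion-matrix plot used above (a sketch reusing the helpers already imported in this script):
    lr_preds = lr.predict(X_test)
    print(classification_report(y_test, lr_preds))

    conf_matrix_lr = metrics.plot_confusion_matrix(lr, X_test, y_test, cmap=plt.cm.Blues)
    conf_matrix_lr.ax_.set_title("Logistic Regression Confusion Matrix")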
from xgboost import XGBClassifier

# Recall cv/hyper parameter tuning data sets
#hyper_train_features_df, hyper_tuning_features_df, hyper_train_labels_df, hyper_tuning_labels_df

# note: random_seed / logging_level / nan_mode are CatBoost arguments, not XGBoost ones;
# the XGBoost equivalents are random_state / verbosity, and NaNs are handled natively.
xgb_model = XGBClassifier(eval_metric='logloss',
                          random_state=11,
                          verbosity=0)

evaluation_set = [(hyper_train_features_df, hyper_train_labels_df),
                  (hyper_tuning_features_df, hyper_tuning_labels_df)]

xgb_model.fit(hyper_train_features_df,
              hyper_train_labels_df,
              eval_set=evaluation_set)

print("XGboost On Test Data")
print(
    classification_report(test_labels_df, xgb_model.predict(test_features_df)))

#~~~~~~~~~~~~~~~~~~~~~~~~~~
# 5. Using a Neural Network
#~~~~~~~~~~~~~~~~~~~~~~~~~~

# Set up a neural network with one input layer of 768 input neurons and an output layer of 2 neurons
nnmodel = Sequential()

# Add layers - defining number of input and output nodes appropriately
# Note the softmax activation function on the output layer to convert output to a probability
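A minimal sketch of the layers the comments describe: 768 inputs mapped straight to a 2-neuron softmax output (the import path and the compile settings below are assumptions, not taken from the original):
from tensorflow.keras.layers import Dense  # assumed import path

# single Dense layer: 768 inputs -> 2 softmax outputs, as described above
nnmodel.add(Dense(2, input_dim=768, activation='softmax'))
nnmodel.compile(loss='categorical_crossentropy',  # assumed loss/optimizer
                optimizer='adam',
                metrics=['accuracy'])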
예제 #44
0
target = df['TARGET']
del df['TARGET']
id = df_test['ID']

from src.transfomations import remove_correlated
_, to_remove = remove_correlated(df, 0.99)

df_test.drop(to_remove, axis=1, inplace=True)
variance_threshold = VarianceThreshold(threshold=0.001)
df = variance_threshold.fit_transform(df)

df_test = variance_threshold.transform(df_test)  # reuse the thresholds fitted on the training data

m2_xgb = XGBClassifier(n_estimators=110, nthread=1, max_depth=4, scale_pos_weight=.8)
m2_xgb.fit(df, target, eval_metric='auc')

param_dist = {
    "n_estimators": [80, 100, 110, 130],
    "max_depth": [3, 4, 5],
    "scale_pos_weight": [0.8, 1, 1.2],
    "learning_rate": [0.1, 0.05, 0.02],
}

randomizedSearch = RandomizedSearchCV(m2_xgb, n_iter=20, param_distributions=param_dist, verbose=2)
randomizedSearch.fit(df, target)

best = randomizedSearch.best_estimator_
print(randomizedSearch.best_params_)
scores = cross_validation.cross_val_score(best, df, target,
                                          cv=5, scoring='roc_auc')
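The cross-validated AUC scores computed above can then be summarised, e.g.:
print('CV ROC AUC: mean %.5f, per fold %s' % (scores.mean(), scores))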
# plot decision tree
from numpy import loadtxt
from xgboost import XGBClassifier
from xgboost import plot_tree
from matplotlib import pyplot
# load data
dataset = loadtxt('pima-indians-diabetes.csv', delimiter=",")
# split data into X and y
X = dataset[:,0:8]
y = dataset[:,8]
# fit model on training data
model = XGBClassifier()
model.fit(X, y)
# plot single tree
plot_tree(model)
pyplot.show()
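plot_tree draws the first tree by default and needs the graphviz package installed; a specific tree and layout direction can be selected, e.g.:
# plot the fifth booster tree, laid out left-to-right
plot_tree(model, num_trees=4, rankdir='LR')
pyplot.show()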
예제 #46
0
log_model = LogisticRegression(C=1,
                               penalty="l1",
                               solver="liblinear",
                               random_state=7).fit(X_train, y_train)
model = SelectFromModel(log_model, prefit=True)
X_new = model.transform(X_train)

selected_features = pd.DataFrame(model.inverse_transform(X_new),
                                 index=X_train.index,
                                 columns=X_train.columns)
sel_col = selected_features.columns[selected_features.var() != 0]
#print(sel_col)

#clf = XGBClassifier()
clf = XGBClassifier(n_estimators=100, learning_rate=0.3)
clf.fit(X_train, y_train)

#prediction on the test set
y_pred = clf.predict(X_val)

# round float and convert to int
y_pred = y_pred.round(0)
y_pred = y_pred.astype(int)

# Calculating F1 Score
f1 = f1_score(y_val, y_pred, average='macro')
print("F1 score of the model is :", f1)

submission = clf.predict(final_test)
submission2 = submission.round(0)
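sel_col is computed above but never used in this excerpt; a sketch of retraining on only the selected columns (assuming X_train and X_val are DataFrames sharing those columns):
clf_sel = XGBClassifier(n_estimators=100, learning_rate=0.3)
clf_sel.fit(X_train[sel_col], y_train)

y_pred_sel = clf_sel.predict(X_val[sel_col])
print("F1 score on selected features:", f1_score(y_val, y_pred_sel, average='macro'))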
예제 #47
0
#RandomForest
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(X_train3, y_train3)
rfc_y_predict = rfc.predict(X_test3)

print('rfc accuracy:', rfc.score(X_test3, y_test3))
print(
    classification_report(y_test3,
                          rfc_y_predict,
                          target_names=['died', 'survived']))

#GradientBoosting
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier()
gbc.fit(X_train3, y_train3)
gbc_y_predict = gbc.predict(X_test3)

print('gbc accuracy:', gbc.score(X_test3, y_test3))
print(
    classification_report(y_test3,
                          gbc_y_predict,
                          target_names=['died', 'survived']))

#xgboost
from xgboost import XGBClassifier
xgbc = XGBClassifier()
xgbc.fit(X_train3, y_train3)
print('xgbc accuracy:', xgbc.score(X_test3, y_test3))
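For parity with the two models above, the XGBoost predictions can be reported the same way:
xgbc_y_predict = xgbc.predict(X_test3)
print(
    classification_report(y_test3,
                          xgbc_y_predict,
                          target_names=['died', 'survived']))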
예제 #48
0
x = dataset.data
y = dataset.target

x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    train_size=0.8,
                                                    random_state=1)

model = XGBClassifier(n_estimators=1000, learning_rate=0.1)

# model.fit(x_train, y_train, verbose=True,  eval_metric= "error",
#                 eval_set=[(x_train, y_train), (x_test, y_test)])
model.fit(x_train,
          y_train,
          verbose=True,
          eval_metric="rmse",
          eval_set=[(x_train, y_train), (x_test, y_test)],
          early_stopping_rounds=20)
# rmse, mae, logloss, error, auc

result = model.evals_result()
# print(result)

y_pred = model.predict(x_test)

r2 = r2_score(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)
print(f"acc : {acc}")

# pickle.dump(model, open("./model/xgbsave/cancer.pickle.dat", "wb"))
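The commented-out line above hints at persisting the model; a minimal sketch of saving and reloading it with pickle (assumes the ./model/xgbsave directory exists):
import pickle

with open("./model/xgbsave/cancer.pickle.dat", "wb") as f:
    pickle.dump(model, f)

with open("./model/xgbsave/cancer.pickle.dat", "rb") as f:
    reloaded = pickle.load(f)
print("reloaded acc :", accuracy_score(y_test, reloaded.predict(x_test)))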
예제 #49
0
def main():
    args = parse_args()
    config = parse_config(args.config_file)
    if config is None:
        print('No configuration file is defined. '
              'Define one with `--config-file`.')
        sys.exit(1)

    # read dataset
    files = config['files']
    if 'filepath' in config:
        files = [config['filepath'] + f for f in files]
    kwargs = config['pandas_kwargs']

    print('Reading ', end='')
    entries = 0
    for f in files:
        rootfile = ROOT.TFile(f)
        tree = rootfile.Get(kwargs['key'])
        entries += tree.GetEntries()
    maxslices = args.max_slices
    chunksize = kwargs['chunksize']
    total = (maxslices
             if maxslices is not None and maxslices < (entries / chunksize)
             else (entries / chunksize))
    print(total * chunksize, 'events.')
    df = pd.concat([
        df for df in tqdm(
            islice(
                read_root(files, flatten=True, **kwargs), maxslices),
            total=total)])

    # rename the tagging particle branches
    df.rename(columns=dict(zip(df.columns,
        [c.replace(config['tagging_particle_prefix'], 'tp').replace('-', '_')
            for c in df.columns])),
        inplace=True)
    df['event_id'] = df.runNumber.apply(str) + '_' + df.eventNumber.apply(str)
    if 'invert_target' in config and config['invert_target']:
        df['target'] = np.sign(df.B_ID) != np.sign(df.tp_ID)
    else:
        df['target'] = np.sign(df.B_ID) == np.sign(df.tp_ID)

    # read features and selections
    try:
        if 'inclusive_mva_features' in config:
            mva_features = ['tp_' + f for f in config['inclusive_mva_features']]
        else:
            mva_features = ['tp_' + f.split(' ')[0] for f in config['selections']]
    except Exception:
        raise ValueError('Tried to parse features for the BDT.'
                         ' Either provide well-formatted `selections` or'
                         ' define an `inclusive_mva_features` set.')

    # build BDT model and train the classifier n_cv x 3 times
    xgb_kwargs = config['xgb_kwargs']
    n_jobs = config['n_jobs']

    bootstrap_scores = []
    bootstrap_d2s = []
    nfold = (args.bootstrap_folds
             if args.bootstrap_folds is not None
             else config['n_cv'])
    print('Starting bootstrapping.')
    pbar = tqdm(total=nfold * 3)
    for _ in range(nfold):
        # yield 3-fold split for CV
        df_sets = [df.iloc[indices] for indices in NSplit(df)]

        cv_scores = []
        for i in range(3):
            df1, df2, df3 = (df_sets[i % 3].copy(),
                             df_sets[(i + 1) % 3].copy(),
                             df_sets[(i + 2) % 3].copy())
            model = XGBClassifier(nthread=n_jobs, **xgb_kwargs)
            sample_weight = (df1.target
                             if 'training_weights' in config
                                and config['training_weights']
                             else None)
            model.fit(df1[mva_features], df1.target,
                      sample_weight=df1.SigYield_sw)

            df2['probas'] = model.predict_proba(df2[mva_features])[:, 1]
            df2.reset_index(inplace=True, drop=True)
            df2_max = df2.iloc[df2.groupby('event_id')['probas'].idxmax()].copy()
            df3['probas'] = model.predict_proba(df3[mva_features])[:, 1]
            df3.reset_index(inplace=True, drop=True)
            df3_max = df3.iloc[df3.groupby('event_id')['probas'].idxmax()].copy()

            # calibrate
            calibrator = PolynomialLogisticRegression(power=4,
                                                      solver='lbfgs',
                                                      n_jobs=n_jobs)
            calibrator.fit(df2_max.probas.reshape(-1, 1), df2_max.target,
                           sample_weight=df2_max.SigYield_sw)

            df3_max['calib_probas'] = calibrator.predict_proba(df3_max.probas)[:, 1]

            score = tagging_power_score(df3_max.calib_probas,
                                        tot_event_number=get_event_number(df3_max),
                                        sample_weight=df3_max.SigYield_sw)
            bootstrap_scores.append(score)
            bootstrap_d2s.append(d2_score(df3_max.calib_probas,
                                          sample_weight=df3_max.SigYield_sw))
            pbar.update(1)

    pbar.close()
    print(dedent("""\
          Final {}-fold bootstrap performance
             D2 = {:<6}%
          ε_eff = {:<6}%""")
          .format(nfold,
                  100 * ufloat(np.mean(bootstrap_d2s),
                               np.std(bootstrap_d2s)),
                  100 * ufloat(np.mean(noms(bootstrap_scores)),
                               np.std(noms(bootstrap_scores)))))
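The excerpt defines main() but ends before it is called; the usual entry point would be:
if __name__ == '__main__':
    main()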
예제 #50
0
col_y = cols[-1]  # y variable name

# plt.scatter(x=iris[:,0], y=iris[:,1], s=100, c=iris[:,4], marker='o')
# '(slice(None, None, None), 0)' is an invalid key
y = iris[col_y]
import numpy as np

y = np.array(y)
plt.scatter(x=iris.iloc[:, 0], y=iris.iloc[:, 1], c=y, marker='o')  # iris is a DataFrame, so use .iloc for positional indexing

# Step 2: create the training/test datasets
train_set, test_set = train_test_split(iris, test_size=0.25)

# Step 3: build the model using the training data
xgb = XGBClassifier()
model = xgb.fit(train_set[col_x], train_set[col_y])
model

# Step 4: generate predictions using the test data
y_pred = model.predict(test_set[col_x])
y_true = test_set[col_y]

y_pred2 = model.predict_proba(test_set[col_x])
y_pred2.shape  # (38, 3)
y_pred2
'''
array([[2.1746019e-03, 9.9590498e-01, 1.9204022e-03],
       [9.9528944e-01, 3.9060446e-03, 8.0451195e-04],
       [9.9526840e-01, 3.9059622e-03, 8.2559639e-04],
       ...
'''
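A natural Step 5 is to score the predictions produced above; a minimal sketch (the metric imports are assumptions):
# Step 5: evaluate the predictions
from sklearn.metrics import accuracy_score, confusion_matrix

print('accuracy:', accuracy_score(y_true, y_pred))
print(confusion_matrix(y_true, y_pred))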
예제 #51
0
파일: xgb10.py 프로젝트: daxiongshu/bnp
X=np.hstack([train[good+goodx].as_matrix(),train1.as_matrix()])
Xt=np.hstack([test[good+goodx].as_matrix(),test1.as_matrix()])

from sklearn.feature_extraction import DictVectorizer
vec = DictVectorizer()
names_categorical = []
cand=['v40','v63','v109']
for name in train.columns.values :
    if train[name].value_counts().shape[0]<1000 or name in cand:# and name not in good:
        train[name] = map(str, train[name])
        test[name] = map(str, test[name])
        names_categorical.append(name)
        print name,train[name].value_counts().shape[0] 
X_sparse = vec.fit_transform(train[names_categorical].T.to_dict().values())
Xt_sparse = vec.transform(test[names_categorical].T.to_dict().values())

idx=np.array(train.index)
del train
gc.collect()
X=sparse.hstack([X,X_sparse],format='csr')#.toarray()
Xt=sparse.hstack([Xt,Xt_sparse],format='csr')
print X.shape,y.shape,Xt.shape
clf=XGBClassifier(max_depth=11,colsample_bytree=0.5,learning_rate=0.01,n_estimators=1200,nthread=-1)
clf.fit(X,y)
idx=np.array(test.index)#id_test
yp=clf.predict_proba(Xt).T[1]
s=pd.DataFrame({idname:idx,'PredictedProb':yp})
s.to_csv('xgb10.csv',index=False)

                                                    random_state=55)
print(x_train.shape, x_test.shape)  #(56000, 154) (14000, 154)
print(y_train.shape, y_test.shape)  #(56000,) (14000,)
x_train, x_val, y_train, y_val = train_test_split(x_train,
                                                  y_train,
                                                  train_size=0.8,
                                                  random_state=55)

# print(x_train.max)
# print(x_train.min)

#2. Modeling
model = XGBClassifier(n_jobs=-1, use_label_encoder=False)

#3. Compile, train
model.fit(x_train, y_train)

#4. Evaluate, predict
# XGBClassifier has no Keras-style evaluate(); use score() for accuracy
acc = model.score(x_test, y_test)
print('acc : ', acc)

y_pred = model.predict(x_test[:10])
# print(y_pred)
print(y_test[:10])
print(np.argmax(y_test[:10], axis=-1))

#DNN
#(784, )
# loss :  [0.09116600453853607, 0.9779000282287598]
# [7 2 1 0 4 1 4 9 5 9]
예제 #53
0
파일: Stacking.py 프로젝트: Ewen2015/Kaggle
print('We have %d classes and %d models TOTAL so in resulting arrays \
we expect to see %d columns.' % (n_classes, len(models_1) + len(models_2), 
                                 n_classes * (len(models_1) + len(models_2))))

# Create empty arrays
S_train_all = np.zeros((X_train.shape[0], 0))
S_test_all = np.zeros((X_test.shape[0], 0))

# Load results
for name in sorted(glob('*.npy')):
    print('Loading: %s' % name)
    S = np.load(name)
    S_train_all = np.c_[S_train_all, S[0]]
    S_test_all = np.c_[S_test_all, S[1]]
    
print('\nS_train_all shape:', S_train_all.shape)
print('S_test_all shape: ', S_test_all.shape)

# Initialize 2nd level model
model = XGBClassifier(random_state=0, n_jobs=-1, learning_rate=0.1, 
                      n_estimators=100, max_depth=3)
    
# Fit 2nd level model
model = model.fit(S_train_all, y_train)

# Predict
y_pred = model.predict_proba(S_test_all)

# Final prediction score
print('Final prediction score: %.8f' % log_loss(y_test, y_pred))
예제 #54
0
x_test = x_test.reshape(-1, x_test.shape[1] * x_test.shape[2])

pca = PCA(n_components=713)
x2_train = pca.fit_transform(x_train)
x2_test = pca.transform(x_test)  # reuse the components fitted on the training data

x_train, x_test, y_train, y_test = train_test_split(x2_train,
                                                    y_train,
                                                    train_size=0.8,
                                                    random_state=77)

start = time.time()
#2. Model
model = XGBClassifier(n_jobs=-1, use_label_encoder=False)

#3. Train
model.fit(x_train, y_train, eval_metric='logloss')

#4. Evaluate, predict
acc = model.score(x_test, y_test)
print('acc : ', acc)

sec = time.time() - start
times = str(datetime.timedelta(seconds=sec)).split(".")
times = times[0]

print("작업 시간 : ", times)

# acc :  0.9579166666666666
# elapsed time :  892.8674330711365
예제 #55
0
y22 = np.argmax(y_pred_model2,axis=1)
y_test22 = np.argmax(y_test , axis = 1)

count = 0
for i in range(y22.shape[0]):
    if y22[i] == y_test22[i]:
        count+=1
        
print('Accuracy for model 2 : ' + str((count / y22.shape[0]) * 100))


X_train2,X_test2,y_train2,y_test2 = train_test_split(feature_all,y,test_size = 0.3,random_state=20)

########################### MODEL 3 ###########################
model3 = XGBClassifier()
model3.fit(X_train2,y_train2)
# evals_result() is only populated when an eval_set is passed to fit(), so it is omitted here
score = cross_val_score(model3, X_train2, y_train2, cv=5)
y_pred3 = model3.predict(X_test2)

count = 0
for i in range(y_pred3.shape[0]):
    if y_pred3[i] == y_test2[i]:
        count+=1   
        
print('Accuracy for model 3 : ' + str((count / y_pred3.shape[0]) * 100))


########################### TESTING ###########################
test_file_path = sys.argv[2]
X,sr = librosa.load(test_file_path, sr = None)
예제 #56
0
# In[97]:

xg_class = XGBClassifier(learning_rate=0.3,
                         max_delta_step=0,
                         max_depth=3,
                         min_child_weight=1,
                         missing=None,
                         n_estimators=7,
                         n_jobs=1,
                         nthread=None,
                         objective='binary:logistic',
                         random_state=0)

# In[98]:

xg_class.fit(xtrain, ytrain.values.ravel())
print(xg_class)

# In[99]:

x.dtypes

# In[100]:

test_sub1 = test_sub_m.copy()
test_sub1['Parch'] = test_sub1['Parch'].astype('int64')
test_sub1['male'] = test_sub1['male'].astype('int64')
test_sub1['female'] = test_sub1['female'].astype('int64')
test_sub1['S'] = test_sub1['S'].astype('int64')
test_sub1['PC1'] = test_sub1['PC1'].astype('int64')
test_sub1['PC2'] = test_sub1['PC2'].astype('int64')
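The casts above prepare the submission features; a hedged sketch of predicting on them (this assumes test_sub1 carries exactly the columns xg_class was trained on, which the excerpt does not show):
# hypothetical continuation: predict survival for the prepared test rows
test_preds = xg_class.predict(test_sub1)
print(test_preds[:10])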
     
     trainingSet = train.iloc[itr]
     validationSet = train.iloc[icv]
     
     gbm = XGBClassifier(max_depth=4,
                         learning_rate = 0.01,
                         n_estimators=3000,
                         subsample=0.8,
                         colsample_bytree=0.5,
                         objective="binary:logistic",
                         silent = False,
                         min_child_weight=5,                       
                         nthread=-1)
                         
     gbm.fit(trainingSet[feature_names], np.array(trainingSet["TARGET"]),
             eval_metric="auc",
             eval_set=[(trainingSet[feature_names], np.array(trainingSet["TARGET"])), (validationSet[feature_names], np.array(validationSet["TARGET"]))],
                      early_stopping_rounds=200,verbose=20)    
                       
     ll = gbm.best_score
     best_iter = gbm.best_iteration
     cv.append(ll)
     biter.append(best_iter)
     print "---auc : %0.6f\n" %ll
     print "---best_iter: %d\n" %best_iter
     gc.collect()
 
 gbm = XGBClassifier(max_depth=4,
                         learning_rate = 0.01,
                         n_estimators=370,
                         subsample=0.8,
                         colsample_bytree=0.5,
예제 #58
0
test['title_count']=test.apply(title_count,axis=1)
test['genres_count']=test.apply(genres_count,axis=1)
test['dow_count'] = test.apply(dow_count,axis=1)
test['tod_count']=test.apply(tod_count,axis=1)
testdf= test.copy()
test.drop(['titles','genres','ID','dow','tod','cities'],axis=1,inplace=True)
print("now predicting")
# pca.transform(test)
# sc.transform(test)
#rf = RandomForestClassifier(n_estimators=460,max_depth=12, max_features=8,class_weight='balanced')
#xgb
model2 = XGBClassifier(max_depth=5, n_estimators=460, learning_rate=0.05,scale_pos_weight = 1,min_child_weight = 2,gamma = 0.0,subsample =0.5, colsample_bytree = 0.5,max_delta_step=1)
#model = VotingClassifier(estimators=[('rf',rf),('xgb',xgb)],voting='soft')
sfm = SelectFromModel(model2,threshold = 0.013)
X_traindf2 = pd.DataFrame(sfm.fit_transform(X_traindf,Y_traindf))
test2 = pd.DataFrame(sfm.transform(test))
print("now grid serach")
model = XGBClassifier(max_depth=5, n_estimators=460, learning_rate=0.05,scale_pos_weight = 1,min_child_weight = 2,gamma = 0.0,subsample =0.5, colsample_bytree = 0.5,max_delta_step=1)

model.fit(X_traindf2, Y_traindf)
# print("best_params: " + str(rf.best_params_))
probabilities = model.predict_proba(test2)
print(probabilities)
probabilities1 = pd.DataFrame(probabilities,columns=['neg','segment'])
probabilities1 = probabilities1.drop('neg',axis=1)
answer = pd.concat([pd.DataFrame(testdf['ID']),probabilities1],axis=1)
answer.to_csv('segmentspredanswer78.csv',index=False)
# print("best_score: " + str(cv_rfc.best_score_))
# print("best_params: "+str(cv_rfc.best_params_))
#
예제 #59
0
target = df['TARGET']
del df['TARGET']
# del df['ID']
id = df_test['ID']
# del df_test['ID']

pca = PCA(n_components=250)
train_pcaed = pca.fit_transform(df, target)

random_forest = RandomForestClassifier(n_estimators=30, max_depth=5, max_features=20)
random_forest.fit(train_pcaed, target)
forested = random_forest.predict_proba(train_pcaed)
# pipe = Pipeline(steps=[('pca', pca), ('random_forest', random_forest)])

m2_xgb = XGBClassifier(n_estimators=110, nthread=1, max_depth=4)
m2_xgb.fit(train_pcaed, target)
m2_xgbed = m2_xgb.predict_proba(train_pcaed)

logistic_regression = LogisticRegression(penalty='l1', solver='liblinear')
logistic_regression.fit(train_pcaed, target)
logistic_regressioned = logistic_regression.predict_proba(train_pcaed)

combined = np.concatenate([forested, m2_xgbed, logistic_regressioned], axis=1)


log_reg = LogisticRegression()
log_reg.fit(combined, target)

scores = cross_validation.cross_val_score(log_reg, combined, target,
                                              cv=5, scoring='roc_auc')
print(scores.mean(), scores)
예제 #60
0
# x, y = load_boston(return_X_y=True)
datasets = load_wine()
x = datasets.data
y = datasets['target']

x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.2,
                                                    shuffle=True,
                                                    random_state=66)

# 2. Model
model = XGBClassifier(n_estimators=100, learning_rate=0.01, n_jobs=8)

# 3. Train
model.fit(x_train,
          y_train,
          verbose=1,
          eval_metric='mlogloss',
          eval_set=[(x_train, y_train), (x_test, y_test)])

aaa = model.score(x_test, y_test)
print('aaa :', aaa)

y_pred = model.predict(x_test)
r2 = r2_score(y_test, y_pred)
print('r2 :', r2)

print('====================================')
results = model.evals_result()
print(results)