예제 #1
0
파일: rf.py 프로젝트: hujiewang/facebookIV
def train(model_id, train_x, train_y, valid_x, valid_y, test_x):
    """Tune a random forest by randomized search and save its predictions.

    The training data is shuffled, a ``RandomizedSearchCV`` (50 candidates,
    10-fold CV, ROC AUC scoring) is fitted on it, and the second column of
    ``predict_proba`` is taken for the validation and test sets.  The
    validation ROC AUC and the best estimator are printed, then both
    prediction vectors are written with ``data.saveData`` to CSV files whose
    names contain ``model_id``.

    ``valid_id`` and ``test_id`` are not parameters; they are read from the
    enclosing scope.  Nothing is returned.
    """
    train_x, train_y = shuffle(train_x, train_y)

    # NOTE(review): this value is drawn but never used below; the forest's
    # seed is sampled from the search space instead.
    random_state = random.randint(0, 1000000)

    forest = RandomForestClassifier(n_jobs=8)

    # Depth / split / leaf limits are not searched; they keep their defaults.
    search_space = {
        "n_estimators": sp_randint(100, 300),
        "criterion": ["gini"],
        "max_features": sp_randint(10, 26),
        "bootstrap": [True, False],
        'random_state': sp_randint(1, 1000000),
    }

    searcher = RandomizedSearchCV(
        forest,
        param_distributions=search_space,
        n_iter=50,
        cv=10,
        scoring='roc_auc',
    )
    searcher.fit(train_x, train_y)

    valid_predictions = searcher.predict_proba(valid_x)[:, 1]
    test_predictions = searcher.predict_proba(test_x)[:, 1]

    # Printed under the heading 'loss:', but the value is a ROC AUC score.
    auc = roc_auc_score(valid_y, valid_predictions)
    print('loss:')
    print(auc)
    print(searcher.best_estimator_)

    data.saveData(valid_id, valid_predictions,
                  "./valid_results/valid_" + str(model_id) + ".csv")
    data.saveData(test_id, test_predictions,
                  "./results/results_" + str(model_id) + ".csv")
예제 #2
0
def trainClassifier(X, y, testTweetsAll, ensembleTweets, ensembleSentiment):
    """Tune a random forest on (X, y) and return class-1 probabilities.

    A third of (X, y) is held out, a randomized hyperparameter search is
    fitted on the rest, and several scores are printed.  The refitted best
    model then predicts probabilities for ``testTweetsAll`` and
    ``ensembleTweets``.

    Returns a pair of lists ``(testPredictions, ensemblePredictions)``, each
    holding the second ``predict_proba`` column (index 1) for every row.
    """
    # Hold out 33% of the data; the fixed seed keeps the split repeatable.
    xTrain, xTest, yTrain, yTest = train_test_split(
        X, y, test_size=0.33, random_state=8)

    baseForest = RandomForestClassifier(n_estimators=20, n_jobs=-1)

    searchSpace = {
        'max_features': ['sqrt', 'log2', None, .01, .1, .2, .3],
        'criterion': ['gini', 'entropy'],
        'min_samples_leaf': [1],
        'min_samples_split': scipy.stats.randint(2, 30),
        'bootstrap': [True, False]
    }

    # 10 sampled candidates, 3-fold CV, all cores; a candidate that raises
    # is scored 0 instead of aborting, and the winner is refitted on xTrain.
    tuner = RandomizedSearchCV(
        baseForest,
        searchSpace,
        n_jobs=-1,
        error_score=0,
        n_iter=10,
        refit=True,
        cv=3,
    )

    print('shape of this training data set:')
    print(xTrain.shape)
    tuner.fit(xTrain, yTrain)
    print('the best hyperparameters from this search are:')
    print(tuner.best_params_)
    print('best score from hyperparameter search is: ' +
          str(tuner.best_score_))
    print('score on the holdout portion of the training set: ' +
          str(tuner.score(xTest, yTest)))
    print('score on the ensemble data: ' +
          str(tuner.score(ensembleTweets, ensembleSentiment)) + '\n\n')

    testProbabilities = tuner.predict_proba(testTweetsAll)
    ensembleProbabilities = tuner.predict_proba(ensembleTweets)

    # predict_proba yields one column per class; keep only column 1.
    testPredictions = [row[1] for row in testProbabilities]
    ensemblePredictions = [row[1] for row in ensembleProbabilities]

    return testPredictions, ensemblePredictions
예제 #3
0
def random_search():
    """Run a 40-candidate randomized search over random-forest settings.

    Loads the arrays stored in ``DATA_FILE``, fits the search on the
    training split, reports the search scores through ``report`` and prints
    the log loss on the full, training and validation data.

    Returns the validation log loss.
    """
    from time import time
    from scipy.stats import randint as sp_randint
    from sklearn.grid_search import RandomizedSearchCV

    crimes = np.load(DATA_FILE)

    # scipy's randint excludes the upper bound.
    search_space = {
        'n_estimators': sp_randint(1, 150),
        "criterion": ["gini", "entropy"],
        'max_depth': sp_randint(1, 40),
        "min_samples_split": sp_randint(2, 15),
        "min_samples_leaf": sp_randint(1, 10),
        "max_features": ['auto', 'sqrt', 'log2', None]
    }

    forest = RandomForestClassifier(
        min_weight_fraction_leaf=0.0,
        max_leaf_nodes=None,
        bootstrap=True,
        oob_score=False,
        n_jobs=4,
        random_state=42,
        verbose=0,
        warm_start=False,
        class_weight=None,
    )

    n_candidates = 40
    searcher = RandomizedSearchCV(
        forest,
        param_distributions=search_space,
        n_iter=n_candidates,
        random_state=42,
    )

    started = time()
    searcher.fit(crimes['features_train'], crimes['labels_train'].ravel())
    print("RandomizedSearchCV took %.2f seconds for %d candidates"
          " parameter settings." % ((time() - started), n_candidates))
    report(searcher.grid_scores_)

    def split_loss(labels_key, features_key):
        # Log loss of the refitted best model on one stored split.
        return log_loss(crimes[labels_key],
                        searcher.predict_proba(crimes[features_key]))

    loss_train = split_loss('labels_train', 'features_train')
    loss_val = split_loss('labels_val', 'features_val')
    loss_all = split_loss('labels', 'features')

    # '%s %s' reproduces the Python 2 print statement's space separator.
    print('%s %s' % ('loss_all: ', loss_all))
    print('%s %s' % ('loss_train: ', loss_train))
    print('%s %s' % ('loss_val: ', loss_val))

    return loss_val
예제 #4
0
def trainClassifier(X, y, testTweetsAll, ensembleTweets, ensembleSentiment):
    """Search random-forest hyperparameters and predict class-1 probabilities.

    Splits (X, y) 67/33, runs a randomized search on the larger part, prints
    the search results plus the scores on the held-out part and on the
    ensemble data, and finally predicts probabilities for ``testTweetsAll``
    and ``ensembleTweets``.

    Returns ``(testPredictions, ensemblePredictions)``: two lists containing
    the value at index 1 of every ``predict_proba`` row.
    """
    xTrain, xTest, yTrain, yTest = train_test_split(X,
                                                    y,
                                                    test_size=0.33,
                                                    random_state=8)

    forest = RandomForestClassifier(n_estimators=20, n_jobs=-1)

    candidateSettings = {
        'max_features': ['sqrt', 'log2', None, .01, .1, .2, .3],
        'criterion': ['gini', 'entropy'],
        'min_samples_leaf': [1],
        'min_samples_split': scipy.stats.randint(2, 30),
        'bootstrap': [True, False]
    }

    # Try 10 random combinations with 3-fold CV on all cores; failing
    # combinations score 0, and the best one is refitted on all of xTrain.
    search = RandomizedSearchCV(forest,
                                candidateSettings,
                                n_jobs=-1,
                                error_score=0,
                                n_iter=10,
                                refit=True,
                                cv=3)

    print('shape of this training data set:')
    print(xTrain.shape)
    search.fit(xTrain, yTrain)
    print('the best hyperparameters from this search are:')
    print(search.best_params_)

    bestScore = str(search.best_score_)
    print('best score from hyperparameter search is: ' + bestScore)
    holdoutScore = str(search.score(xTest, yTest))
    print('score on the holdout portion of the training set: ' + holdoutScore)
    ensembleScore = str(search.score(ensembleTweets, ensembleSentiment))
    print('score on the ensemble data: ' + ensembleScore + '\n\n')

    rawTest = search.predict_proba(testTweetsAll)
    rawEnsemble = search.predict_proba(ensembleTweets)

    # Each row holds one probability per class; only index 1 is returned.
    testPredictions = [probabilities[1] for probabilities in rawTest]
    ensemblePredictions = [probabilities[1] for probabilities in rawEnsemble]

    return testPredictions, ensemblePredictions
예제 #5
0
def random_search():
    """Randomized search over ``NeuralNetworkClassifier`` hyperparameters.

    Loads the arrays in ``DATA_FILE``, converts the labels with
    ``create_labels``, fits a 40-candidate ``RandomizedSearchCV`` on the
    training split, reports the search scores and prints the log loss on
    the full, training and validation data.

    Returns the validation log loss.
    """
    from time import time
    from scipy.stats import uniform as sp_uniform, randint as sp_randint
    from sklearn.grid_search import RandomizedSearchCV
    from sklearn.cross_validation import ShuffleSplit

    crimes = np.load(DATA_FILE)

    # Every label that occurs in either the training or validation split.
    train_label_set = set(np.unique(crimes['labels_train']))
    val_label_set = set(np.unique(crimes['labels_val']))
    all_labels = sorted(train_label_set | val_label_set)
    batch_size = 64

    labels_train = create_labels(crimes['labels_train'], all_labels)
    labels_vals = create_labels(crimes['labels_val'], all_labels)
    labels_full = create_labels(crimes['labels'], all_labels)

    # scipy conventions: randint(low, high) excludes ``high``;
    # uniform(loc, scale) samples from [loc, loc + scale].
    search_space = {
        'layers': sp_randint(1, 3),
        "hidden_units": [64, 128, 256],
        'input_dropout': sp_uniform(0, 0.5),
        "hidden_dropout": sp_uniform(0, 0.75),
        "learning_rate": sp_uniform(0.01, 0.1),
        "weight_decay": sp_uniform(0, 0.01),
    }

    network = NeuralNetworkClassifier(
        n_classes=len(all_labels),
        batch_size=batch_size,
        valid_set=(crimes['features_val'], labels_vals),
    )

    n_candidates = 40
    np.random.seed(42)

    # NOTE(review): a single shuffle split with test_size=0 is passed as cv
    # and error_score is 100 - confirm both are intended.
    single_split = ShuffleSplit(n=crimes['features_train'].shape[0],
                                n_iter=1,
                                test_size=0)
    searcher = RandomizedSearchCV(
        network,
        param_distributions=search_space,
        scoring=None,
        n_iter=n_candidates,
        random_state=42,
        error_score=100,
        verbose=5,
        cv=single_split,
    )

    started = time()
    searcher.fit(crimes['features_train'], labels_train.ravel())
    print("RandomizedSearchCV took %.2f seconds for %d candidates"
          " parameter settings." % ((time() - started), n_candidates))
    report(searcher.grid_scores_)

    loss_train = log_loss(labels_train,
                          searcher.predict_proba(crimes['features_train']))
    loss_val = log_loss(labels_vals,
                        searcher.predict_proba(crimes['features_val']))
    loss_all = log_loss(labels_full,
                        searcher.predict_proba(crimes['features']))

    # '%s %s' reproduces the Python 2 print statement's space separator.
    print('%s %s' % ('loss_all: ', loss_all))
    print('%s %s' % ('loss_train: ', loss_train))
    print('%s %s' % ('loss_val: ', loss_val))

    return loss_val
예제 #6
0
def random_search():
    """Tune a ``RandomForestClassifier`` with a 40-candidate random search.

    The data comes from ``np.load(DATA_FILE)``.  After fitting on the
    training split the search scores are passed to ``report`` and the log
    loss on the full, training and validation data is printed.

    Returns the log loss on the validation split.
    """
    from time import time
    from scipy.stats import randint as sp_randint
    from sklearn.grid_search import RandomizedSearchCV

    crimes = np.load(DATA_FILE)

    # Upper bounds of sp_randint are exclusive.
    distributions = {
        'n_estimators': sp_randint(1, 150),
        "criterion": ["gini", "entropy"],
        'max_depth': sp_randint(1, 40),
        "min_samples_split": sp_randint(2, 15),
        "min_samples_leaf": sp_randint(1, 10),
        "max_features": ['auto', 'sqrt', 'log2', None],
    }

    base_model = RandomForestClassifier(min_weight_fraction_leaf=0.0,
                                        max_leaf_nodes=None,
                                        bootstrap=True,
                                        oob_score=False,
                                        n_jobs=4,
                                        random_state=42,
                                        verbose=0,
                                        warm_start=False,
                                        class_weight=None)

    n_iter_search = 40
    search = RandomizedSearchCV(base_model,
                                param_distributions=distributions,
                                n_iter=n_iter_search,
                                random_state=42)

    t_begin = time()
    search.fit(crimes['features_train'], crimes['labels_train'].ravel())

    timing_message = ("RandomizedSearchCV took %.2f seconds for %d candidates"
                      " parameter settings." % ((time() - t_begin), n_iter_search))
    print(timing_message)
    report(search.grid_scores_)

    loss_train = log_loss(crimes['labels_train'],
                          search.predict_proba(crimes['features_train']))
    loss_val = log_loss(crimes['labels_val'],
                        search.predict_proba(crimes['features_val']))
    loss_all = log_loss(crimes['labels'],
                        search.predict_proba(crimes['features']))

    # Same text a two-item Python 2 print statement would emit.
    for caption, value in (('loss_all: ', loss_all),
                           ('loss_train: ', loss_train),
                           ('loss_val: ', loss_val)):
        print('%s %s' % (caption, value))

    return loss_val
예제 #7
0
파일: rf.py 프로젝트: hujiewang/otto
def train(model_id, train_x, train_y, valid_x, valid_y, test_x):
    """Fit a small random-forest search and save predictions if good enough.

    The shuffled training data is used to fit a ``RandomizedSearchCV``
    (2 candidates, 9-fold CV).  The validation predictions are scored with
    ``test(valid_y, valid_predictions, True)``; only when that loss is below
    10.438 are the loss and best estimator printed and the validation and
    test probabilities written with ``data.saveData``.

    Nothing is returned.
    """
    train_x, train_y = shuffle(train_x, train_y)

    # Only printed; the forests get their seeds from the search space.
    seed = random.randint(0, 1000000)
    print('random state: {state}'.format(state=seed))

    forest = RandomForestClassifier(n_jobs=8)

    # Distributions the candidates are sampled from.
    search_space = {
        "n_estimators": sp_randint(20, 40),
        "criterion": ["gini", "entropy"],
        "max_depth": sp_randint(3, 10000),
        "min_samples_split": sp_randint(1, 30),
        "min_samples_leaf": sp_randint(1, 30),
        "max_features": sp_randint(1, 93),
        "bootstrap": [True, False],
        'random_state': sp_randint(1, 1000000),
    }

    searcher = RandomizedSearchCV(
        forest,
        param_distributions=search_space,
        n_iter=2,
        cv=9,
        n_jobs=3,
    )
    searcher.fit(train_x, train_y)

    valid_predictions = searcher.predict_proba(valid_x)
    test_predictions = searcher.predict_proba(test_x)

    loss = test(valid_y, valid_predictions, True)
    # Written as a negated '<' so a NaN loss is skipped as well.
    if not (loss < 10.438):
        return

    output = [loss, searcher.best_estimator_]
    # Python 2 print with a trailing comma: no newline before the next print.
    print("model[\""+str(model_id)+"\"]="),
    print(output)

    data.saveData(valid_predictions,
                  "../valid_results/valid_" + str(model_id) + ".csv")
    data.saveData(test_predictions,
                  "../results/results_" + str(model_id) + ".csv")
 def randomsearch_xgboost(df):
     """Tune an ``XGBClassifier`` by random search and predict on ``test_X``.

     ``df`` is not used; the data is read from the enclosing-scope names
     ``train_X``, ``train_y`` and ``test_X``.  Returns column 1 of the
     ``predict_proba`` output for ``test_X``.
     """
     # randint(1, 11) draws 1..10; uniform(0.25, 0.75) is loc/scale, i.e.
     # values in [0.25, 1.0].
     search_space = {
         'max_depth': sp.stats.randint(1, 11),
         'subsample': sp.stats.uniform(0.25, 0.75),
         'colsample_bytree': sp.stats.uniform(0.25, 0.75),
     }
     searcher = RandomizedSearchCV(
         XGBClassifier(),
         search_space,
         cv=10,
         n_iter=20,
         scoring="log_loss",
         n_jobs=1,
         verbose=2,
     )
     # Row 0 of the transposed train_y is used as the target vector.
     searcher.fit(train_X, train_y.transpose()[0])
     return searcher.predict_proba(test_X)[:, 1]
예제 #9
0
 def randomsearch_xgboost(df):
     """Random-search an ``XGBClassifier`` and return class-1 probabilities.

     The ``df`` argument is ignored.  Training and test data are taken from
     ``train_X``, ``train_y`` and ``test_X`` in the enclosing scope.  The
     search samples 20 candidates with 10-fold CV and "log_loss" scoring.
     """
     depth_dist = sp.stats.randint(1, 11)  # upper bound is exclusive
     # uniform(loc, scale): samples lie in [0.25, 1.0].
     row_fraction_dist = sp.stats.uniform(0.25, 0.75)
     column_fraction_dist = sp.stats.uniform(0.25, 0.75)

     booster = XGBClassifier()
     rs = RandomizedSearchCV(booster,
                             {'max_depth': depth_dist,
                              'subsample': row_fraction_dist,
                              'colsample_bytree': column_fraction_dist},
                             cv=10,
                             n_iter=20,
                             scoring="log_loss",
                             n_jobs=1,
                             verbose=2)
     target = train_y.transpose()[0]
     rs.fit(train_X, target)
     probabilities = rs.predict_proba(test_X)
     return probabilities[:, 1]
예제 #10
0
파일: baseline.py 프로젝트: reactiv/thesis
def discriminative_straight(train_questions, test_questions):
    """Train an extra-trees answer scorer and print MRR and accuracy.

    Features for both question sets are built with ``get_ensemble_dataset``
    from a TF-IDF model and a word2vec model fitted on ``reference``.  An
    ``ExtraTreesClassifier`` with 500 trees is fitted on the training
    features; its class-1 probabilities on the test features are grouped
    four rows per question.

    Prints the mean reciprocal rank of the correct candidate, then the
    fraction of questions whose top-scored candidate is the correct one.
    Returns nothing.

    Changes from the previous version: the reciprocal rank now uses the
    real rank of the correct candidate (previously the raw ``argsort``
    output was indexed as if it were a rank), and a stray estimator
    construction plus a randomized search that was overwritten before use
    were removed.
    """
    tfidf, docs = get_tfidf('statement', reference)
    w2v, _ = get_w2v('clause', reference, size=100, sg=0, iter=80, alpha=0.025)
    transformer = partial(tfidf_transformer, tfidf, False)
    X, y, _ = get_ensemble_dataset(train_questions, docs, w2v, transformer)
    test_X, test_y, corr = get_ensemble_dataset(test_questions, docs, w2v,
                                                transformer)

    # Scores noted by the original author (what they measure is not recorded):
    #0.537878787879
    #0.321678321678
    rf = ExtraTreesClassifier(500, n_jobs=-1)
    rf.fit(X, y)
    y_prob = rf.predict_proba(test_X)

    # One row per question, one column per candidate (four each).  Floor
    # division keeps the row count an integer on Python 2 and 3.
    scores = y_prob[:, 1].reshape(len(test_y) // 4, 4)
    y_pred = scores.argmax(axis=1)

    # argsort of an argsort gives each column's position in the ordering;
    # negating the scores makes position 0 the highest-scored candidate.
    ranks = (-scores).argsort(axis=1).argsort(axis=1)
    rr = [1.0 / (ranks[i, answer] + 1) for i, answer in enumerate(corr)]

    print(np.mean(rr))
    print((y_pred == np.array(corr)).mean())
예제 #11
0
# Script: tune a linear SVC by randomized search on normalised features and
# write class-1 probabilities for the test set to a CSV file.
# ``train`` must already be defined before this point.
test  = pd.read_csv("./Desktop/schiz/concat_test/testconcat.csv")
train_features = train.ix[:,1:411] #train data features
train_label = train["Class"] #train data labels

# Mean-centre and range-scale the training features.  The statistics are
# kept so the test set can be transformed in exactly the same way.
feature_mean = train_features.mean()
feature_range = train_features.max() - train_features.min()
train_features = (train_features - feature_mean) / feature_range
features = list(train.columns[1:411]) #list of train features
label = list(train["Class"])

# Fix: the model is fitted on normalised features, so the test features
# must be normalised too (previously raw test values were passed to
# predict_proba).  Training statistics are used for the test set.
test_features = (test[features] - feature_mean) / feature_range
print("Preprocessing data")
tuned_parameters = param_distributions = {'C': expon(), 'gamma': expon(),'kernel': ['linear']}
svc = SVC(C=0.000001, class_weight='auto', coef0=0.0, degree=3,kernel="linear",probability=True,random_state=None, shrinking=True, tol=0.000001, verbose=False)

# Announce training before it starts (this message used to be printed
# after the fit had already finished).
print("Training Support Vector Machine")
clf =RandomizedSearchCV(svc, param_distributions=param_distributions, n_iter=10000)
clf.fit(train_features, label)

# NOTE(review): this cross-validates the whole 10000-candidate search
# again on each of the two folds, which is very expensive.
scores = cross_validation.cross_val_score(clf,train_features,label,cv=2,scoring='roc_auc')
print(scores)

#def get_score(clf, train_features, train_label):
#    X_train, X_test, y_train, y_test = cross_validation.train_test_split(train_features, train_label, test_size=0.12, random_state=0)
#    clf.fit(X_train, y_train)
#    print clf.score(X_test, y_test) 

print("Make predictions on the test set")
test_probs = clf.predict_proba(test_features)[:,1]
submission = pd.DataFrame({"id": test["Id"], "probability": test_probs})
submission.to_csv("rf_xgboost_submission.csv", index=False)
# NOTE(review): labelled "Accuracy", but ``scores`` holds ROC AUC values.
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

예제 #12
0
def XGB_Classifier(Train_DS, y, Actual_DS, Sample_DS, Grid):
    """Either random-search XGBoost settings or cross-validate a fixed model.

    With ``Grid`` true, an 800-candidate ``RandomizedSearchCV`` is fitted,
    its scores are written through ``report(...).to_csv`` and the best
    estimator is printed.  Otherwise a 75-tree ``XGBClassifier`` is passed
    to ``Nfold_Cross_Valid``.

    NOTE(review): both branches end in ``sys.exit(0)``, so the prediction
    and submission code after them does not run as long as ``sys.exit``
    raises.  ``Sample_DS`` is never used.
    """
    print("***************Starting XGB Classifier***************")
    t0 = time()  # never read afterwards

    if Grid:
        # used for checking the best performance for the model using hyper parameters
        print("Starting model fit with Grid Search")

        one_to_fifteen = list(range(1, 16))
        tenths = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
        search_space = {
            "n_estimators": [50],
            "max_depth": one_to_fifteen + [17, 19, 20, 40, 80],
            "min_child_weight": one_to_fifteen + [20, 40, 80],
            "subsample": list(tenths),
            "colsample_bytree": list(tenths),
            "silent": [True],
            "gamma": [2, 1, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
        }

        # Randomized (not exhaustive) search over the lists above.
        n_candidates = 800
        booster = xgb.XGBClassifier(nthread=8)
        clf = RandomizedSearchCV(booster,
                                 param_distributions=search_space,
                                 n_iter=n_candidates,
                                 scoring="log_loss",
                                 cv=3)
        start = time()  # never read afterwards
        clf.fit(np.array(Train_DS), np.array(y))

        print("GridSearchCV completed")
        Parms_DS_Out = report(clf.grid_scores_, n_top=n_candidates)
        Parms_DS_Out.to_csv(file_path + "Parms_DS_XGB_4.csv")

        print("Best estimator found by grid search:")
        print(clf.best_estimator_)
        sys.exit(0)
    else:
        # Scores noted by the original author:
        # CV: 0.78526434774405007 (full set)
        # CV: 0.824999 (100k set - with Age set up, all dummy)
        clf = xgb.XGBClassifier(n_estimators=75, nthread=8)

        Nfold_score = Nfold_Cross_Valid(Train_DS, y, clf)

        sys.exit(0)

        # Not reached while the exit above is in place.
        features = np.array(Train_DS)
        targets = np.array(y)
        clf.fit(features, targets)

    # Predict on the actual data set and keep the output of get_best_five.
    pred_Actual = clf.predict_proba(np.array(Actual_DS))
    pred_Actual = get_best_five(pred_Actual, type_val=False)
    print("Actual Model predicted")

    # Write the predictions for the actual data set.
    pred_Actual.to_csv(file_path + "output/Submission_Roshan_xgb_1.csv", index_label="id")

    print("***************Ending XGB Classifier***************")
    return pred_Actual
        input_shape=(None, num_features),
        hidden_num_units=200,  # number of units in hidden layer #!200-600
        output_nonlinearity=lasagne.nonlinearities.softmax,  # output layer
        output_num_units=num_classes,  # 10 target values
        dropout_p=0.2,
        #!dropout 0.2-0.7

        # optimization method:
        update=nesterov_momentum,
        update_learning_rate=0.01,  #!0.001-0.01
        update_momentum=0.9,  #!0.6-0.9
        regression=
        False,  # flag to indicate we're dealing with regression problem
        max_epochs=500,  # we want to train this many epochs
        verbose=1,
    )

    random_search = RandomizedSearchCV(
        net1, {
            'hidden_num_units': sp_randint(200, 600),
            "dropout_p": sp_rand(0.2, 0.7),
            "update_learning_rate": sp_rand(0.001, 0.01),
            "update_momentum": sp_rand(0.6, 0.9),
        })
    random_search.fit(X, y)
    print random_search.grid_scores_

    preds = random_search.predict_proba(X_test)[:, 1]
    submission = pd.DataFrame(preds, index=ids, columns=['target'])
    submission.to_csv('Keras_BTB.csv')
예제 #14
0
def random_search():
    """Search ``NeuralNetworkClassifier`` hyperparameters at random.

    Reads the arrays from ``DATA_FILE``, encodes the labels with
    ``create_labels``, fits 40 sampled candidates on the training split,
    hands the search scores to ``report`` and prints the log loss on the
    full, training and validation data.

    Returns the validation log loss.
    """
    from time import time
    from scipy.stats import uniform as sp_uniform, randint as sp_randint
    from sklearn.grid_search import RandomizedSearchCV
    from sklearn.cross_validation import ShuffleSplit

    crimes = np.load(DATA_FILE)

    # Sorted union of the labels seen in the training and validation splits.
    all_labels = sorted(
        set(np.unique(crimes['labels_train']))
        | set(np.unique(crimes['labels_val'])))
    batch_size = 64

    labels_train = create_labels(crimes['labels_train'], all_labels)
    labels_vals = create_labels(crimes['labels_val'], all_labels)
    labels_full = create_labels(crimes['labels'], all_labels)

    # sp_randint(1, 3) yields 1 or 2; sp_uniform(a, b) samples [a, a + b].
    param_dist = dict([
        ('layers', sp_randint(1, 3)),
        ("hidden_units", [64, 128, 256]),
        ('input_dropout', sp_uniform(0, 0.5)),
        ("hidden_dropout", sp_uniform(0, 0.75)),
        ("learning_rate", sp_uniform(0.01, 0.1)),
        ("weight_decay", sp_uniform(0, 0.01)),
    ])

    validation_pair = (crimes['features_val'], labels_vals)
    model = NeuralNetworkClassifier(n_classes=len(all_labels),
                                    batch_size=batch_size,
                                    valid_set=validation_pair)

    n_iter_search = 40
    np.random.seed(42)

    # NOTE(review): cv is one shuffle split with test_size=0 and
    # error_score is 100 - confirm both are intended.
    n_train_rows = crimes['features_train'].shape[0]
    random_searcher = RandomizedSearchCV(
        model,
        param_distributions=param_dist,
        scoring=None,
        n_iter=n_iter_search,
        random_state=42,
        error_score=100,
        verbose=5,
        cv=ShuffleSplit(n=n_train_rows, n_iter=1, test_size=0))

    t_begin = time()
    random_searcher.fit(crimes['features_train'], labels_train.ravel())

    print("RandomizedSearchCV took %.2f seconds for %d candidates"
          " parameter settings." % ((time() - t_begin), n_iter_search))
    report(random_searcher.grid_scores_)

    predict = random_searcher.predict_proba
    loss_train = log_loss(labels_train, predict(crimes['features_train']))
    loss_val = log_loss(labels_vals, predict(crimes['features_val']))
    loss_all = log_loss(labels_full, predict(crimes['features']))

    # Same text a two-item Python 2 print statement would emit.
    for caption, value in (('loss_all: ', loss_all),
                           ('loss_train: ', loss_train),
                           ('loss_val: ', loss_val)):
        print('%s %s' % (caption, value))

    return loss_val
예제 #15
0
class Model(object):
    """Related-question ranker: averaged word vectors fed to a tuned forest.

    Each (original question, related question) pair becomes a 400-value
    feature row built from averaged word2vec vectors of the subject and
    body texts.  A ``RandomizedSearchCV`` over ``RandomForestClassifier``
    is fitted on those rows and evaluated with ``eval_reranker``.
    """

    def __init__(self):
        """Load the word2vec and phrase models from disk and build the search."""
        # Training parameters.
        self.w2v_dim=100
        # Four blocks of w2v_dim values: subject product, subject |diff|,
        # body product, body |diff| (see getFeature).
        self.num_feature=400
        self.batch_size=16
        self.num_epoch=1

        self.w2v_model=Word2Vec.load('./data/word2vec/w2v.model')
        self.index2word_set = set(self.w2v_model.index2word)
        self.bigram=Phrases.load('./data/bigram.dat')
        self.trigram=Phrases.load('./data/trigram.dat')

        print('Build model...')

        param_dist = {
            "n_estimators":sp_randint(20,250),
            "criterion": ["gini", "entropy"],
            "max_depth": sp_randint(10, 300),
            "min_samples_split": sp_randint(1, 30),
            "min_samples_leaf": sp_randint(1, 30),
            "max_features": sp_randint(1, 200),
            "bootstrap": [True, False],
            'random_state':sp_randint(1, 1000000),
        }
        # build a classifier
        clf = RandomForestClassifier(n_jobs=8)
        # run randomized search
        self.model=RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=10,cv=9,n_jobs=8)

        print('Model has been built!')

    def getWordVectorFeatures(self, text):
        """Return the averaged word vector of the whitespace-split ``text``."""
        words = text.split()
        return self.wordVectorAvg(words, self.w2v_dim)

    def wordVectorAvg(self, words, num_features):
        """Average the vectors of the in-vocabulary ``words``.

        Returns a ``(num_features, 1)`` float32 array; it is all zeros when
        none of the words is in ``self.index2word_set``.
        """
        featureVec = np.zeros((num_features,1),dtype="float32")

        nwords = 0
        for word in words:
            if word in self.index2word_set:
                nwords += 1
                featureVec = np.add(featureVec, self.w2v_model[word].reshape(-1,1))

        # Skip the division when nothing was found (avoids dividing by zero).
        if nwords:
            featureVec = np.divide(featureVec, nwords)
        return featureVec

    def _cleanText(self, text):
        """Run ``preprocess`` on ``text`` with this model's phrase detectors."""
        return preprocess(text,no_stopwords=True,bigram=self.bigram,trigram=self.trigram)

    def getFeature(self, ori_q,rel_q):
        """Build the feature row for one question pair.

        ``ori_q`` and ``rel_q`` are indexed as ``[subject, body]``.  The
        result has shape ``(1, 4 * self.w2v_dim)``: element-wise product
        and absolute difference of the subject vectors, followed by the
        same two blocks for the body vectors.

        Fixes over the previous version: the related question's body
        (``rel_q[1]``) is now preprocessed (its subject used to be
        preprocessed twice instead), and the inputs are no longer modified
        in place, so an original question reused across several related
        questions is preprocessed from its raw text every time.
        """
        word2vec_q_subject=self.getWordVectorFeatures(self._cleanText(ori_q[0]))
        word2vec_q_body=self.getWordVectorFeatures(self._cleanText(ori_q[1]))

        word2vec_rel_q_subject=self.getWordVectorFeatures(self._cleanText(rel_q[0]))
        word2vec_rel_q_body=self.getWordVectorFeatures(self._cleanText(rel_q[1]))

        subject=np.concatenate((word2vec_q_subject*word2vec_rel_q_subject,
                                np.abs(word2vec_q_subject-word2vec_rel_q_subject)),axis=0)

        body=np.concatenate((word2vec_q_body*word2vec_rel_q_body,
                                np.abs(word2vec_q_body-word2vec_rel_q_body)),axis=0)

        return np.concatenate((subject, body,),axis=0).T

    def prepareData(self,data):
        """Turn reader output into ``(X, y, meta)``.

        Every item of ``data`` is a sequence laid out as
        ``[orig_info, orig_question, rel_info, rel_question, ...]``.  One
        row is produced per related question: ``X`` holds the features,
        ``y`` the target (2 for 'PerfectMatch', 1 for 'Relevant', else 0)
        and ``meta`` the list ``[ORGQ_ID, RELQ_ID, 'true' | 'false']``.
        """
        # Floor division keeps the row count an integer on Python 2 and 3.
        size = sum(len(samples) // 2 - 1 for samples in data)
        X=np.zeros((size,self.num_feature),dtype=np.float32)
        y=np.zeros((size,),dtype=np.float32)
        meta=[]

        c=0
        pbar = ProgressBar(widgets=[Percentage(), Bar(), ETA()], maxval=len(data)).start()
        for i, samples in enumerate(data):
            ori_q_id=samples[0]['ORGQ_ID']
            ori_q=samples[1]

            # Related questions come in (info, question) pairs from index 2.
            for j in range(2,len(samples),2):
                rel_q_id=samples[j]['RELQ_ID']
                rel_q=samples[j+1]
                label=samples[j]['RELQ_RELEVANCE2ORGQ']
                target=0
                if label=='PerfectMatch':
                    target=2
                elif label=='Relevant':
                    target=1

                label='false' if label=='Irrelevant' else 'true'

                X[c,:] = self.getFeature(ori_q,rel_q)
                y[c]=target
                meta.append([ori_q_id,rel_q_id,label])
                c+=1
            pbar.update(i)
        return X,y,meta

    def loadData(self):
        """Load the TRAIN, DEV and TEST sets through ``Reader`` into attributes."""
        reader = Reader()
        print('loading data')
        self.X_train, self.y_train, self.meta_train=self.prepareData(reader.getData(TRAIN))
        print('train data has been loaded!')
        self.X_valid, self.y_valid, self.meta_valid=self.prepareData(reader.getData(DEV))
        print('valid data has been loaded!')
        self.X_test, self.y_test, self.meta_test=self.prepareData(reader.getData(TEST))
        print('test data has been loaded!')

    def evaluate(self):
        """Score the validation set and return the value from ``eval_reranker``.

        Writes one prediction line per validation pair to './tmp/dev.pred',
        passes that file to ``eval_reranker`` and appends the returned value
        to 'valid_map.txt'.
        """
        print('evaluating...')

        y_pred = self.model.predict_proba(self.X_valid)
        # ``with`` closes the file even if a write fails.
        with open('./tmp/dev.pred', 'w') as f:
            for i, meta in enumerate(self.meta_valid):
                # Sum of columns 1 and 2 (targets 1 and 2 in prepareData).
                prob_of_true =y_pred[i][1]+y_pred[i][2]
                label='true' if prob_of_true>0.5 else 'false'
                f.write( "%s %s 0 %20.16f %s\n" %(meta[0], meta[1], prob_of_true, label))

        map_score=eval_reranker(res_fname='./data/eval/SemEval2016-Task3-CQA-QL-dev.xml.subtaskB.relevancy',
                                pred_fname='./tmp/dev.pred')
        with open('valid_map.txt', 'a') as f:
            f.write(str(map_score)+'\n')
        print('=========================================')
        return map_score

    def train(self):
        """Fit the search ``self.num_epoch`` times, evaluating after each fit."""
        # Start every run with empty log files.
        for path in ('valid_map.txt', './train_loss.txt', './valid_loss.txt',
                     './train_acc.txt', './valid_acc.txt'):
            with open(path, 'w'):
                pass

        print("Training...")
        # Best score so far; it is tracked but nothing is saved from it.
        max_map=0.0
        for _ in range(self.num_epoch):
            self.model.fit(self.X_train, self.y_train)

            map_score=self.evaluate()
            print('MAP on valid data: %16.16f\n'%(map_score))
            if map_score>max_map:
                max_map=map_score

        print('Training completed!')
        input_shape=(None, num_features),
        hidden_num_units=200,  # number of units in hidden layer #!200-600
        output_nonlinearity=lasagne.nonlinearities.softmax,  # output layer
        output_num_units=num_classes, # 10 target values
        dropout_p=0.2,
        #!dropout 0.2-0.7

        # optimization method:
        update=nesterov_momentum,
        update_learning_rate=0.01,#!0.001-0.01
        update_momentum=0.9,#!0.6-0.9

        regression=False,  # flag to indicate we're dealing with regression problem
        max_epochs=500,  # we want to train this many epochs
        verbose=1,


    )

    random_search = RandomizedSearchCV(net1, {'hidden_num_units': sp_randint(200, 600),
                                              "dropout_p": sp_rand(0.2,0.7),
                                              "update_learning_rate": sp_rand(0.001, 0.01),
                                              "update_momentum": sp_rand(0.6, 0.9),
                                               })
    random_search.fit(X, y)
    print random_search.grid_scores_

    preds = random_search.predict_proba(X_test)[:, 1]
    submission = pd.DataFrame(preds, index=ids, columns=['target'])
    submission.to_csv('Keras_BTB.csv')
예제 #17
0
# Script: random-search the XGBoost classifier, write a submission file with
# the class-1 probabilities and persist the best model and the encoders.
rand_gridsearch = RandomizedSearchCV(
    xgb_clf,
    param_distributions=grid_params,
    n_iter=6,
    scoring='roc_auc',
    cv=3,  # changed to 3-fold CV
    verbose=True,
    random_state=42,
)

rand_gridsearch.fit(X, y)

print('=======================================================')
print(rand_gridsearch.best_params_)
print('=======================================================')

# One line per evaluated candidate.
for s in rand_gridsearch.grid_scores_:
    print(s)

best_xgb = rand_gridsearch.best_estimator_

y_pred = rand_gridsearch.predict_proba(X_test)

# Column 1 of predict_proba becomes the 'target' column, indexed like ``test``.
submission = pd.DataFrame(y_pred[:, 1], index=test.index, columns=['target'])
submission.index.name = 'ID'
submission.to_csv('B_XGB_GridSearch_2015_10_13.csv')

# Persist the refitted best estimator and the encoders.
sklearn.externals.joblib.dump(best_xgb, './models/XGB_2015_10_13.pkl')
sklearn.externals.joblib.dump(encoders, './models/XGB2_encoders_2015_10_13.pkl')





def hyperparaTuning(data,
                    testSet,
                    expName,
                    mode=2,
                    storedPath=util.getResourcePath() +
                    '/Pickle Files/Models/First Layer/'):
    """Tune one classifier with a randomized search, report its AUC and save it.

    Args:
        data: training frame; must contain a ' Label' column (note the
            leading space), which is used as the target.
        testSet: evaluation frame with the same ' Label' column.
        expName: experiment name; printed and passed to ``util.pklSaver``.
        mode: 1 selects ExtraTrees, 2 selects LightGBM, any other value
            selects KNN.
        storedPath: directory prefix under which the fitted search object
            is saved, in a sub-directory named after the model.

    Returns:
        None. Results are printed and the fitted search object is pickled.
    """
    label_column = ' Label'

    # Search space per algorithm, keyed by the model name used on disk.
    search_spaces = {
        'ExtraTrees': {
            "n_estimators": [150, 250, 350],
            "max_features": [None, 'sqrt', 'log2'],
            "min_samples_leaf": [64, 128, 256]
        },
        'LightGBM': {
            "learning_rate": [0.06, 0.08, 0.1],
            "num_leaves": [15, 31, 63],
            "max_bin": [63, 127, 255],
            "feature_fraction": [0.6, 0.8, 0.9]
        },
        'KNN': {
            "n_neighbors": np.arange(5, 47, 2),
            "weights": ["uniform", "distance"],
            "metric": ["euclidean", "manhattan", "chebyshev"]
        },
    }

    # One untrained estimator per algorithm.
    estimators = {
        'ExtraTrees': ExtraTreesClassifier(),
        'LightGBM': lgbm.LGBMClassifier(objective='binary'),
        'KNN': KNeighborsClassifier(),
    }

    # Split features from the target for both frames.
    y_train = data[label_column].values
    trainData = data.drop(label_column, axis=1)
    y_test = testSet[label_column].values
    testData = testSet.drop(label_column, axis=1)

    # Anything that is not mode 1 or mode 2 falls through to KNN.
    modelName = ('ExtraTrees' if mode == 1
                 else 'LightGBM' if mode == 2
                 else 'KNN')

    if modelName == 'KNN':
        # KNN is the only model that gets standardised inputs; the scaler
        # is fitted on the training features and reused for the test set.
        scaler = StandardScaler()
        trainData = scaler.fit_transform(trainData)
        testData = scaler.transform(testData)

    # Cross-validated randomized search over the chosen space.
    grid = RandomizedSearchCV(estimators[modelName],
                              search_spaces[modelName],
                              verbose=1,
                              cv=5,
                              n_jobs=1)
    started = time.time()
    grid.fit(trainData, y_train)
    runningTime = (time.time() - started) / 60  # minutes

    # Score the refitted best model on the held-out set (column 1 of the
    # predicted probabilities).
    probabilities = grid.predict_proba(testData)
    auc = roc_auc_score(y_test, probabilities[:, 1])

    print("Experiment: ", expName)
    print("Randomized search best parameters: {}".format(grid.best_params_))
    print("AUC of the best model: ", auc)
    print("Running time: ", runningTime)

    # Persist the whole search object, not just the best estimator.
    util.pklSaver(grid, expName, path=storedPath + modelName + '/')
예제 #19
0
# Notebook-style cells: the bare expressions below only display a value in an
# interactive session. `new_pred_prob_knn`, `rand`, `new`, `knn` and `pd` are
# not defined in this excerpt.

# sanity check: the predicted probabilities of the first row sum to 1
new_pred_prob_knn[0, :].sum()


# ### Model 2: Naive Bayes model using only text features

# show the best model found by RandomizedSearchCV
rand.best_estimator_


# define X_new as the ingredient text
X_new = new.ingredients_str


# calculate predicted probabilities of class membership for the new data
new_pred_prob_rand = rand.predict_proba(X_new)
new_pred_prob_rand.shape


# show the predicted probabilities for the first row only
new_pred_prob_rand[0, :]


# ### Ensembling models 1 and 2

# calculate the mean of the two models' predicted probabilities for the first row
(new_pred_prob_knn[0, :] + new_pred_prob_rand[0, :]) / 2


# calculate the mean of the predicted probabilities for all rows,
# labelling the columns with the classes known to the KNN model
new_pred_prob = pd.DataFrame((new_pred_prob_knn + new_pred_prob_rand) / 2, columns=knn.classes_)
예제 #20
0
def XGB_Classifier(Train_DS, y, Actual_DS, Sample_DS, Grid):
    """Fit an XGBoost classifier and export class probabilities for ``Actual_DS``.

    Two modes, selected by ``Grid``:

    * truthy -- run a randomized hyper-parameter search, write the search
      report to ``Parms_DS_XGB_4.csv`` and terminate the process with
      ``sys.exit(0)``; nothing is predicted or returned in this mode.
    * falsy -- fit a classifier with fixed hyper-parameters, predict
      probabilities for ``Actual_DS`` and write a submission CSV.

    Besides its arguments the function reads the module-level names
    ``file_path``, ``raw_output`` and ``Actual_DS1`` and calls the helpers
    ``report`` and ``get_best_five``; none of them are defined here.

    Args:
        Train_DS: training features; converted with ``np.array``.
        y: training labels; converted with ``np.array``.
        Actual_DS: features to predict on; converted with ``np.array``.
        Sample_DS: unused; kept so existing callers keep working.
        Grid: whether to run the hyper-parameter search instead of training.

    Returns:
        The output of ``get_best_five`` when ``raw_output`` is ``False``,
        otherwise the raw ``predict_proba`` result.
    """
    print("***************Starting XGB Classifier***************")

    if Grid:
        # Hyper-parameter exploration only: this branch never reaches the
        # prediction code below because it ends with sys.exit(0).
        print("Starting model fit with Grid Search")

        param_grid = {
            'n_estimators': [50],
            'max_depth':
            [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 17, 19, 20],
            'min_child_weight':
            [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 20],
            'subsample': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
            'colsample_bytree':
            [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
            'silent': [True],
            'gamma': [2, 1, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
        }

        # NOTE(review): the 'log_loss' scorer name and the grid_scores_
        # attribute belong to old scikit-learn releases (later versions use
        # 'neg_log_loss' and cv_results_) -- confirm the pinned version
        # before upgrading.
        n_iter_search = 800
        clf = RandomizedSearchCV(xgb.XGBClassifier(nthread=8),
                                 param_distributions=param_grid,
                                 n_iter=n_iter_search,
                                 scoring='log_loss',
                                 cv=3)
        clf.fit(np.array(Train_DS), np.array(y))

        print("GridSearchCV completed")
        Parms_DS_Out = report(clf.grid_scores_, n_top=n_iter_search)
        Parms_DS_Out.to_csv(file_path + 'Parms_DS_XGB_4.csv')

        print("Best estimator found by grid search:")
        print(clf.best_estimator_)
        sys.exit(0)
    else:
        # Author's experiment notes (cross-validation / leaderboard scores):
        #   best lb is with n_estimators = 500, using 1000 it is less
        #   CV: 0.78526434774405007 (full set)
        #   CV: 0.824999 (100k set - with Age set up, all dummy)
        #   CV: 0.830194 (50K) - n_estimators = 75 - Age Bkt and Session (Action_Type dummy)
        #   CV: 0.830842 (50K) - n_estimators = 75 - Age Bkt and Session & Session 3
        #   CV: 0.83062  (50K) - n_estimators = 125 - Age Bkt and Session,2,3,4
        #   LB: n_estimators = 060, 0.87996
        #   LB: n_estimators = 080, 0.88029
        #   LB: n_estimators = 100, 0.88040
        #   LB: n_estimators = 125, 0.88059 (session 1,2,3,4)
        #   LB: n_estimators = 150, 0.88045
        #   LB: n_estimators = 125, 0.88148 (session 1,2,3,4 and year > 2012) -- best
        #   LB: n_estimators = 125, 0.88080 (session 1,2,3,4 and year > 2013)
        #   LB: n_estimators = 125, 0.88010 (session 1,2,3,4 and year > 2011)
        #
        # Only this configuration is trained; an earlier throw-away
        # XGBClassifier(n_estimators=500) that was built and immediately
        # overwritten has been removed.
        clf = xgb.XGBClassifier(base_score=0.5,
                                colsample_bylevel=1,
                                colsample_bytree=0.8,
                                gamma=0.6,
                                learning_rate=0.1,
                                max_delta_step=0,
                                max_depth=6,
                                min_child_weight=12,
                                missing=None,
                                n_estimators=135,
                                nthread=8,
                                objective='multi:softprob',
                                reg_alpha=0,
                                reg_lambda=1,
                                scale_pos_weight=1,
                                silent=True,
                                subsample=0.7)

        clf.fit(np.array(Train_DS), np.array(y))

    # Predict class probabilities for the data to be submitted.
    pred_Actual = clf.predict_proba(np.array(Actual_DS))
    print("Actual Model predicted")

    if raw_output == False:
        # Reduce the probabilities with get_best_five and write the result.
        pred_Actual = get_best_five(pred_Actual, type_val=False)

        pred_Actual.to_csv(file_path + 'output/Submission_Roshan_xgb_135.csv',
                           index_label='id')

    else:
        # Write the raw probability matrix, indexed by the ids held in the
        # module-level Actual_DS1.
        print(pd.DataFrame(pred_Actual).head())
        pred = pd.DataFrame(pred_Actual)
        pred['id'] = Actual_DS1
        pred = pred.set_index('id')

        pred.to_csv(file_path +
                    'output/Submission_Roshan_xgb_raw_150_2012.csv',
                    index_label='id')

    print("***************Ending XGB Classifier***************")
    return pred_Actual
예제 #21
0
def XGB_Classifier(Train_DS, y, Actual_DS, Sample_DS, Grid):
    """Train an XGBoost model and write positive-class probabilities to CSV.

    With a truthy ``Grid`` a randomized hyper-parameter search is fitted and
    its refitted best model is used for prediction. Otherwise a fixed
    configuration is cross-validated through ``Nfold_Cross_Valid`` and a
    larger, sigmoid-calibrated model is fitted for prediction.

    Reads the module-level ``file_path`` and calls ``report`` and
    ``Nfold_Cross_Valid``, which are not defined in this function.

    Args:
        Train_DS: training features.
        y: training labels.
        Actual_DS: features to predict on.
        Sample_DS: sample submission frame; its ``ID`` column supplies the
            output index and its columns after the first supply the header.
        Grid: whether to run the hyper-parameter search.

    Returns:
        Column 1 of ``predict_proba`` for ``Actual_DS``.
    """
    print("***************Starting XGB Classifier***************")
    began = time()

    if not Grid:
        # Hyper-parameters marked "from Kaggle" by the author; both models
        # below share everything except the number of trees.
        shared_params = dict(max_depth=9,
                             learning_rate=0.01,
                             nthread=2,
                             min_child_weight=6,
                             subsample=0.7,
                             colsample_bytree=0.5,
                             silent=True,
                             gamma=4)

        # The 500-tree model is only used for the N-fold cross-validation run.
        cv_model = xgb.XGBClassifier(n_estimators=500, **shared_params)
        Nfold_score = Nfold_Cross_Valid(Train_DS, y, cv_model)

        # The 2000-tree model is calibrated and fitted for the submission.
        final_model = xgb.XGBClassifier(n_estimators=2000, **shared_params)
        clf = CalibratedClassifierCV(base_estimator=final_model,
                                     method='sigmoid')
        clf.fit(Train_DS, y)

    else:
        # Used for checking the best performance of the model over the
        # hyper-parameter space.
        print("Starting model fit with Grid Search")

        one_to_fifteen = list(range(1, 16))
        fractions = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
        param_grid = {
            'n_estimators': [100],
            'max_depth': list(one_to_fifteen),
            'min_child_weight': list(one_to_fifteen),
            'subsample': list(fractions),
            'colsample_bytree': list(fractions),
            'silent': [True],
            'gamma': [2, 1, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
        }

        # Randomized search over the space above.
        n_iter_search = 3000
        base_model = xgb.XGBClassifier(nthread=-1)
        clf = RandomizedSearchCV(base_model,
                                 param_distributions=param_grid,
                                 n_iter=n_iter_search,
                                 scoring='roc_auc',
                                 cv=10)

        search_began = time()
        clf.fit(Train_DS, y)

        print("GridSearchCV completed")
        report(clf.grid_scores_)

        print("Best estimator found by grid search:")
        print(clf.best_estimator_)

    # Predict with whichever model was fitted above; keep column 1 only.
    probabilities = clf.predict_proba(Actual_DS)
    pred_Actual = probabilities[:, 1]
    print("Actual Model predicted")

    # Write the submission file, shaped after the sample submission.
    submission_path = file_path+'output/Submission_Roshan_xgb_filter_2.csv'
    preds = pd.DataFrame(pred_Actual,
                         index=Sample_DS.ID.values,
                         columns=Sample_DS.columns[1:])
    preds.to_csv(submission_path, index_label='ID')

    print("***************Ending XGB Classifier***************")
    return pred_Actual
              'max_depth': sp_randint(4, 200),
              'learning_rate': sp_uniform(loc=0e0,scale=1e0),
              'objective':['multi:softprob'],
              'nthread': [8],
              'missing': [np.nan],
              'reg_alpha': [0.01,0.017782794,0.031622777,0.056234133,\
                            0.1,0.17782794,0.31622777,0.56234133,1.,1.77827941,\
                            3.16227766,5.62341325,10.,\
                            17.7827941,31.6227766,56.2341325,100.],
              'colsample_bytree': sp_uniform(loc=0.2e0,scale=0.8e0),
              'subsample': np.arange(0.6,1.0,step=0.05),
              'n_estimators': sp_randint(100,700),
}

# Repeated randomized search for the XGBoost model `GB`; every round fits a
# fresh search, scores it on the validation split and saves a submission.
# `GB`, `param_grid`, `n_iter`, `cv`, `X_train`, `y_train`, `X_val`, `y_val`,
# `get_test` and `save_submission` are not defined in this excerpt.
print "Randomized XGBoost"
# In[ ]:
# NOTE(review): the loop runs 2 rounds but the progress message says "/20" --
# confirm which count is intended.
for i in range(2):
    print "Loop %i/20" % i
    search_GB = RandomizedSearchCV(GB,
                                   param_grid,
                                   scoring='log_loss',
                                   n_jobs=-1,
                                   n_iter=n_iter,
                                   cv=cv,
                                   verbose=True)
    search_GB.fit(X_train, y_train)
    # Score of the fitted search on the validation data; presumably this uses
    # the 'log_loss' scorer configured above -- TODO confirm sign convention.
    log_model = search_GB.score(X_val, y_val)
    print "Log loss = %s" % log_model
    # The test set is reloaded on every round.
    X_test = get_test()
    save_submission('XGBoost', log_model, search_GB.predict_proba(X_test))
예제 #23
0
class Model(object):
    """Question re-ranker: a random forest tuned by randomized search.

    Each (original question, related question) pair is turned into a
    fixed-length vector built from averaged word vectors of the two
    subjects and the two bodies. A ``RandomizedSearchCV`` over a
    ``RandomForestClassifier`` is fitted on those vectors and evaluated
    with ``eval_reranker`` on the validation split.
    """

    def __init__(self):
        """Load the embedding and phrase models and configure the search."""
        # Training parameters.
        self.w2v_dim = 100
        # Subject and body each contribute an element-wise product and an
        # absolute difference of two w2v_dim vectors: 4 * w2v_dim values.
        self.num_feature = 4 * self.w2v_dim
        self.batch_size = 16
        self.num_epoch = 1

        self.w2v_model = Word2Vec.load('./data/word2vec/w2v.model')
        # Set of known words for fast membership tests.
        self.index2word_set = set(self.w2v_model.index2word)
        self.bigram = Phrases.load('./data/bigram.dat')
        self.trigram = Phrases.load('./data/trigram.dat')

        print('Build model...')

        param_dist = {
            "n_estimators": sp_randint(20, 250),
            "criterion": ["gini", "entropy"],
            "max_depth": sp_randint(10, 300),
            "min_samples_split": sp_randint(1, 30),
            "min_samples_leaf": sp_randint(1, 30),
            "max_features": sp_randint(1, 200),
            "bootstrap": [True, False],
            'random_state': sp_randint(1, 1000000),
        }
        # Base classifier wrapped in a randomized hyper-parameter search.
        clf = RandomForestClassifier(n_jobs=8)
        self.model = RandomizedSearchCV(clf,
                                        param_distributions=param_dist,
                                        n_iter=10,
                                        cv=9,
                                        n_jobs=8)

        print('Model has been built!')

    def getWordVectorFeatures(self, text):
        """Return the averaged word vector of whitespace-split ``text``."""
        return self.wordVectorAvg(text.split(), self.w2v_dim)

    def wordVectorAvg(self, words, num_features):
        """Average the vectors of the known words in ``words``.

        Args:
            words: iterable of tokens.
            num_features: dimensionality of the word vectors.

        Returns:
            A ``(num_features, 1)`` array; all zeros when no token is in
            the vocabulary.
        """
        featureVec = np.zeros((num_features, 1), dtype="float32")

        nwords = 0
        for word in words:
            if word in self.index2word_set:
                nwords += 1
                featureVec = np.add(featureVec,
                                    self.w2v_model[word].reshape(-1, 1))

        if nwords != 0:
            featureVec = np.divide(featureVec, nwords)
        return featureVec

    def _clean(self, text):
        """Run ``preprocess`` with this model's phrase detectors."""
        return preprocess(text,
                          no_stopwords=True,
                          bigram=self.bigram,
                          trigram=self.trigram)

    def getFeature(self, ori_q, rel_q):
        """Build the feature row for one (original, related) question pair.

        Args:
            ori_q: original question; index 0 is the subject text and
                index 1 the body text.
            rel_q: related question with the same layout.

        Returns:
            A ``(1, 4 * w2v_dim)`` array: product and absolute difference
            of the subject vectors, then the same for the body vectors.

        Both fields of both questions are preprocessed exactly once. The
        previous code preprocessed ``rel_q[0]`` twice and never touched
        ``rel_q[1]``, and it overwrote its arguments, so an original
        question shared across several pairs was re-preprocessed on every
        call. The inputs are now left unmodified.
        """
        q_subject = self.getWordVectorFeatures(self._clean(ori_q[0]))
        q_body = self.getWordVectorFeatures(self._clean(ori_q[1]))

        rel_subject = self.getWordVectorFeatures(self._clean(rel_q[0]))
        rel_body = self.getWordVectorFeatures(self._clean(rel_q[1]))

        subject = np.concatenate(
            (q_subject * rel_subject, np.abs(q_subject - rel_subject)),
            axis=0)
        body = np.concatenate(
            (q_body * rel_body, np.abs(q_body - rel_body)),
            axis=0)

        return np.concatenate((subject, body), axis=0).T

    def prepareData(self, data):
        """Convert reader output into feature matrix, targets and metadata.

        Args:
            data: sequence of sample lists. In each list, item 0 is a
                mapping holding 'ORGQ_ID', item 1 is the original question,
                and the remaining items alternate between a mapping with
                'RELQ_ID' / 'RELQ_RELEVANCE2ORGQ' and the related question.

        Returns:
            ``(X, y, meta)`` where ``X`` has one row per related question,
            ``y`` holds 2 for 'PerfectMatch', 1 for 'Relevant' and 0
            otherwise, and ``meta`` lists
            ``[original id, related id, 'true' | 'false']`` per row.
        """
        # Integer division: np.zeros rejects the float that "/" yields on
        # Python 3.
        size = sum(len(samples) // 2 - 1 for samples in data)
        X = np.zeros((size, self.num_feature), dtype=np.float32)
        y = np.zeros((size, ), dtype=np.float32)
        meta = []

        row = 0
        pbar = ProgressBar(widgets=[Percentage(), Bar(),
                                    ETA()],
                           maxval=len(data)).start()
        for i, samples in enumerate(data):
            ori_q_id = samples[0]['ORGQ_ID']
            ori_q = samples[1]

            for j in range(2, len(samples), 2):
                rel_q_id = samples[j]['RELQ_ID']
                rel_q = samples[j + 1]
                relevance = samples[j]['RELQ_RELEVANCE2ORGQ']

                if relevance == 'PerfectMatch':
                    target = 2
                elif relevance == 'Relevant':
                    target = 1
                else:
                    target = 0

                # Binary label used by the evaluation file format.
                label = 'false' if relevance == 'Irrelevant' else 'true'

                X[row, :] = self.getFeature(ori_q, rel_q)
                y[row] = target
                meta.append([ori_q_id, rel_q_id, label])
                row += 1
            pbar.update(i)
        return X, y, meta

    def loadData(self):
        """Read the train, dev and test splits and store them on ``self``."""
        reader = Reader()
        print('loading data')
        self.X_train, self.y_train, self.meta_train = self.prepareData(
            reader.getData(TRAIN))
        print('train data has been loaded!')
        self.X_valid, self.y_valid, self.meta_valid = self.prepareData(
            reader.getData(DEV))
        print('valid data has been loaded!')
        self.X_test, self.y_test, self.meta_test = self.prepareData(
            reader.getData(TEST))
        print('test data has been loaded!')

    def evaluate(self):
        """Score the fitted model on the validation split.

        Writes one prediction line per validation pair to
        './tmp/dev.pred', runs ``eval_reranker`` against the dev relevancy
        file and appends the score to 'valid_map.txt'.

        Returns:
            The value returned by ``eval_reranker``.
        """
        print('evaluating...')

        y_pred = self.model.predict_proba(self.X_valid)
        with open('./tmp/dev.pred', 'w') as pred_file:
            for i, meta in enumerate(self.meta_valid):
                # Probability of being relevant = classes at columns 1 and 2
                # (assumes all three targets occurred in training --
                # TODO confirm).
                prob_of_true = y_pred[i][1] + y_pred[i][2]
                label = 'true' if prob_of_true > 0.5 else 'false'
                pred_file.write("%s %s 0 %20.16f %s\n" %
                                (meta[0], meta[1], prob_of_true, label))

        map_score = eval_reranker(
            res_fname=
            './data/eval/SemEval2016-Task3-CQA-QL-dev.xml.subtaskB.relevancy',
            pred_fname='./tmp/dev.pred')
        with open('valid_map.txt', 'a') as map_file:
            map_file.write(str(map_score) + '\n')
        print('=========================================')
        return map_score

    def train(self):
        """Fit the randomized search and report the validation score.

        The log files are truncated first; every pass of the loop refits
        the whole search on the training split and then evaluates it.
        """
        # Start every run with empty log files.
        for log_name in ('valid_map.txt', './train_loss.txt',
                         './valid_loss.txt', './train_acc.txt',
                         './valid_acc.txt'):
            with open(log_name, 'w'):
                pass

        print("Training...")
        max_map = 0.0
        for _ in range(self.num_epoch):
            self.model.fit(self.X_train, self.y_train)

            map_score = self.evaluate()
            print('MAP on valid data: %16.16f\n' % (map_score))
            if map_score > max_map:
                max_map = map_score

        print('Training completed!')
예제 #24
0
def xgb_model(read_csv=True):
	print 'xgb_model randomcv yr > 2013'
	train_df, test_df = get_train_test_data(cache=read_csv, include_sessions=False)

	train_df = train_df[train_df['tfa_year'] > 2013]
	cols = [i for i in train_df.columns if i not in EXCLUDE_COLS]
	X = train_df[cols]
	y = train_df['country_destination']

	#start classifier
	bst = xgb.XGBClassifier(nthread=4)
	# bst = xgb.XGBClassifier(max_depth=2, nthread=4,
	# 	n_estimators=50,subsample=0.4,learning_rate=0.0.05)
	train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=10000)
	# bst.fit(test_X, test_y)



	param_dist = {"max_depth": [2,4,6],
			  "learning_rate": [0.05, 0.1, 0.15, 0.2],
			  "n_estimators": [30, 50, 70],
			  # "min_samples_leaf": sp_randint(1, 11),
			  # "min_samples_split": sp_randint(1,11),
			  'subsample': [0.4, 0.5, 0.6]
			  # 'max_features': [20, 50, 100]
			  }
	n_iter_search = 20
	random_search = RandomizedSearchCV(bst, param_distributions=param_dist,
								   n_iter=n_iter_search, scoring=ndcg_scorer)
	start = time()
	random_search.fit(test_X, test_y)
	print("RandomizedSearchCV took %.2f seconds for %d candidates"
		  " parameter settings." % ((time() - start), n_iter_search))
	report(random_search.grid_scores_)


	# bagging
	# clfbag = BaggingClassifier(bst, n_estimators=5, max_samples=5000)
	# clfbag.fit(train_X, train_y)
	# y_pred = clfbag.predict_proba(test_X)
	# print 'predicted prob'

	# score = ndcg_score(test_y, y_pred)


	# apply learned model


	# bst.fit(X.values, y)
	# py.test.set_trace()

	test_data = test_df[[i for i in test_df.columns if i not in EXCLUDE_COLS]]
	y_pred = random_search.predict_proba(test_data)
	py.test.set_trace()
	# kaggle_test = pd.read_csv('test.csv')


	sub = create_kaggle_submission(y_pred, test_df['id'], 0.841)
	py.test.set_trace()
	# print 'created kaggle sub'

	# kf = KFold(len(X), n_folds=10, random_state=42)
	# score = cross_val_score(bst, X, y, cv=kf, scoring=ndcg_scorer)

	# param_dist = {"max_depth": [3, None],
	# 		  "max_features": sp_randint(1, 11),
	# 		  "min_samples_split": sp_randint(1, 11),
	# 		  "min_samples_leaf": sp_randint(1, 11),
	# 		  "bootstrap": [True, False],
	# 		  "criterion": ["gini", "entropy"]}
	# n_iter_search = 20
	# random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
	# 							   n_iter=n_iter_search, scoring=ndcg_scorer)
	# start = time()
	# random_search.fit(X, y)
	# print("RandomizedSearchCV took %.2f seconds for %d candidates"
	# 	  " parameter settings." % ((time() - start), n_iter_search))
	# report(random_search.grid_scores_)

	# py.test.set_trace()

	# sub = create_kaggle_submission(y_pred, test_df['id'], np.mean(score))

	# py.test.set_trace()

	#end classifier

	""" trying cross valid
예제 #25
0
def XGB_Classifier(Train_DS, y, Actual_DS, Sample_DS, Grid):
    """Explore or cross-validate an XGBoost classifier.

    With a truthy ``Grid`` a randomized hyper-parameter search is fitted, its
    report is written to ``Parms_DS_XGB_4.csv`` and the process exits.
    Otherwise a fixed configuration is passed to ``Nfold_Cross_Valid`` and
    the process exits as well.

    NOTE(review): both branches call ``sys.exit(0)``, so as written the
    fit / predict / CSV code after them never runs and the function never
    returns -- confirm this is an intentional experiment toggle.

    Reads the module-level ``file_path`` and calls ``report`` and
    ``Nfold_Cross_Valid``, which are not defined in this function.

    Args:
        Train_DS: training features.
        y: training labels.
        Actual_DS: features to predict on (only used by the unreachable tail).
        Sample_DS: sample submission frame (only used by the unreachable tail).
        Grid: whether to run the hyper-parameter search.

    Returns:
        ``pred_Actual`` (the ``predict_proba`` output) if the tail is ever
        reached.
    """

    print("***************Starting XGB Classifier***************")
    t0 = time()

    if Grid:
        # Used for checking the best performance of the model over the
        # hyper-parameter space.
        print("Starting model fit with Grid Search")

        param_grid = {'n_estimators': [25],
                      'max_depth': [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,17,19,20,40,80,100,200],
                      'min_child_weight': [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,20,40,80,100],
                      'subsample': [0.1,0.2,0.3,0.4,0.5,0.6, 0.7,0.8, 0.9,1],
                      'colsample_bytree': [0.1,0.2,0.3,0.4,0.5,0.6, 0.7,0.8, 0.9,1],
                      'silent':[True],
                      'gamma':[2,1,0.1,0.2,0.3,0.4,0.5,0.6, 0.7,0.8, 0.9]
                     }

        # Run the randomized search: 800 sampled candidates, 3-fold CV.
        n_iter_search = 800
        clf = xgb.XGBClassifier(nthread=8)
        clf = RandomizedSearchCV(clf, param_distributions=param_grid,
                                           n_iter=n_iter_search, scoring = 'log_loss',cv=3)
        start = time()
        clf.fit(np.array(Train_DS), np.array(y))

        print("GridSearchCV completed")
        # Persist the score report of every candidate.
        Parms_DS_Out = report(clf.grid_scores_,n_top=n_iter_search)
        Parms_DS_Out.to_csv(file_path+'Parms_DS_XGB_4.csv')

        print("Best estimator found by grid search:")
        print(clf.best_estimator_)
        # Search mode stops here; nothing below runs.
        sys.exit(0)
    else:
        ##----------------------------------------------------------------------------------------------------------------##
        # Best from grid search, best n_est=175
        # CV:0.936880  , 20 K , n_estimators =100 , features = 343 (without FN and Upc and using euclidean for DD)*** current best
        clf = xgb.XGBClassifier(n_estimators=100,max_depth=100,learning_rate=0.1,nthread=8,min_child_weight=1,
                             subsample=0.6,colsample_bytree=0.9,silent=True, gamma = 2 )

        ##----------------------------------------------------------------------------------------------------------------##
        # CV: 0.955185 , 20 K , n_estimators =100 , features = 343 (without FN and Upc)
        # CV: 0.935217 , 20 K , n_estimators =100 , features = 343 (without FN and Upc and using euclidean for DD)
        # CV: 0.927019 , 20 K , n_estimators =100 , features = 343 (without FN and Upc and using cos_sim for DD) ***** not used, overfitting
        # CV: 0.922370 , 20 K , n_estimators =100 , features = 343 (without FN and Upc and using eucl + cos_sim for DD) ***** not used, overfitting

        ##................................................................................................................##
        # CV: 0.942477 , 20 K , n_estimators =100 , features = 343 (without FN and Upc and using euclidean for DD)
        #clf = xgb.XGBClassifier(n_estimators=100,nthread=8)

        ##----------------------------------------------------------------------------------------------------------------##

        # Cross-validate the fixed configuration; the result is not used here.
        Nfold_score = Nfold_Cross_Valid(Train_DS, y, clf)

        # Cross-validation mode stops here; the fit below is unreachable.
        sys.exit(0)

        X_train = np.array(Train_DS)
        Y_train = np.array(y)

        clf.fit(X_train, Y_train)

    X_Actual = np.array(Actual_DS)

    # Predict class probabilities for the data to be submitted.
    pred_Actual = clf.predict_proba(X_Actual)
    print("Actual Model predicted")

    # Write the predictions, indexed by VisitNumber and using the sample
    # submission's columns after the first as header.
    preds = pd.DataFrame(pred_Actual, index=Sample_DS.VisitNumber.values, columns=Sample_DS.columns[1:])
    preds.to_csv(file_path+'output/Submission_Roshan_xgb_6_withFNnumber.csv', index_label='VisitNumber')

    print("***************Ending XGB Classifier***************")
    return pred_Actual
예제 #26
0
def xgb_Classifier(Train_DS, y, Actual_DS, Sample_DS, grid):
    """Fit an XGBoost classifier, predict on ``Actual_DS`` and write a CSV.

    Args:
        Train_DS: training features passed to ``fit``.
        y: training labels passed to ``fit``.
        Actual_DS: features passed to ``predict_proba``.
        Sample_DS: frame whose ``bidder_id`` values index the output and
            whose columns after the first name the output column(s).
        grid: truthy -> tune with ``RandomizedSearchCV`` and predict with the
            fitted search object; falsy -> cross-validate and fit one fixed
            ``XGBClassifier`` configuration.

    Returns:
        Column 1 of ``clf.predict_proba(Actual_DS)``.

    Side effects:
        Prints progress messages and writes
        ``file_path + 'output/Submission_Roshan_xgb_305.csv'``.
    """
    print("***************Starting XGB Classifier***************")
    t0 = time()  # captured at entry; nothing below reads it

    # Preprocessing experiments that are currently switched off:
    #   * log transform: np.log(1 + X) applied to Train_DS and Actual_DS
    #   * scaling: StandardScaler fitted on (Train_DS, y), applied to both sets

    if not grid:
        # ---- fixed-configuration path --------------------------------------
        # History of configurations tried before the active one:
        #   starting model:
        #     n_estimators=200, max_depth=10, learning_rate=0.01, nthread=2,
        #     min_child_weight=4, subsample=0.9, colsample_bytree=0.8,
        #     silent=True, gamma=1
        #   rank 1, mean validation score 0.921 (std 0.024):
        #     n_estimators=200, max_depth=5, learning_rate=0.1, nthread=2,
        #     min_child_weight=1, subsample=0.5, colsample_bytree=0.9,
        #     silent=True, gamma=0.6
        #   search report (all n_estimators=200, silent=True):
        #     rank 1, 0.919 (std 0.031): colsample_bytree=0.8,
        #       min_child_weight=4, subsample=1, max_depth=3, gamma=0.8
        #     rank 2, 0.918 (std 0.032): colsample_bytree=0.8,
        #       min_child_weight=4, subsample=1, max_depth=3, gamma=1
        #     rank 3, 0.918 (std 0.028): colsample_bytree=0.7,
        #       min_child_weight=1, subsample=0.6, max_depth=9, gamma=0.6
        #   best estimator reported by that search:
        #     XGBClassifier(base_score=0.5, colsample_bytree=0.8, gamma=0.8,
        #       learning_rate=0.1, max_delta_step=0, max_depth=3,
        #       min_child_weight=4, n_estimators=200, nthread=-1,
        #       objective='binary:logistic', seed=0, silent=True, subsample=1)
        #   CV = .91278, LB = 0.89863:
        #     n_estimators=1000, max_depth=3, learning_rate=0.1, nthread=2,
        #     min_child_weight=4, subsample=1, colsample_bytree=0.8,
        #     silent=True, gamma=1
        fixed_params = {
            'n_estimators': 1000,
            'max_depth': 6,
            'learning_rate': 0.1,
            'nthread': 2,
            'min_child_weight': 1,
            'subsample': 0.9,
            'colsample_bytree': 1,
            'silent': True,
            'gamma': 0.7,
        }
        clf = xgb.XGBClassifier(**fixed_params)

        # Optional wrapper, currently disabled:
        #   CalibratedClassifierCV(base_estimator=clf, method='sigmoid')

        # Cross-validate first (helper defined elsewhere in the file; its
        # return value is not used here), then fit on the full training set.
        Nfold_Cross_Valid(Train_DS, y, clf)
        clf.fit(Train_DS, y)
    else:
        # ---- hyper-parameter search path -----------------------------------
        print("Starting model fit with Grid Search")

        # An earlier, smaller search space (n_estimators=[50],
        # max_depth=[6, 1, 3, 5, 8, 10], min_child_weight=[1, 4, 7, 10],
        # gamma=[1, 0.5 .. 0.9]) and an exhaustive
        # GridSearchCV(scoring='roc_auc', verbose=1, cv=10) were used before
        # and are no longer active.
        one_to_fifteen = list(range(1, 16))
        tenths = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
        search_space = {
            'n_estimators': [500],
            'max_depth': list(one_to_fifteen),
            'min_child_weight': list(one_to_fifteen),
            'subsample': list(tenths),
            'colsample_bytree': list(tenths),
            'silent': [True],
            'gamma': [2, 1] + tenths[:-1],
        }

        # Randomized search: 3000 sampled candidates, 10-fold CV, ROC AUC.
        base_model = xgb.XGBClassifier(nthread=-1)
        clf = RandomizedSearchCV(
            base_model,
            param_distributions=search_space,
            n_iter=3000,
            scoring='roc_auc',
            cv=10,
        )

        start = time()  # captured before the search; not read afterwards
        clf.fit(Train_DS, y)

        print("GridSearchCV completed")
        report(clf.grid_scores_)

        print("Best estimator found by grid search:")
        print(clf.best_estimator_)

    # Predict with whichever estimator was fitted above and keep column 1 of
    # the probability matrix.
    pred_Actual = clf.predict_proba(Actual_DS)[:, 1]
    print("Actual Model predicted")

    # Assemble the submission frame and write it out.
    row_ids = Sample_DS.bidder_id.values
    out_columns = Sample_DS.columns[1:]
    submission = pd.DataFrame(pred_Actual, index=row_ids, columns=out_columns)
    submission.to_csv(file_path + 'output/Submission_Roshan_xgb_305.csv',
                      index_label='bidder_id')

    print("***************Ending XGB Classifier***************")
    return pred_Actual
예제 #27
0
# Inspect the fitted search object (RR_model is created earlier in the
# notebook; presumably a GridSearchCV/RandomizedSearchCV -- verify there).
print(RR_model.grid_scores_)

# In[ ]:


# Best cross-validated score found by the search.
print(RR_model.best_score_)

# In[ ]:


# Parameter combination that produced the best score.
print(RR_model.best_params_)

# In[ ]:


y_prob = RR_model.predict_proba(X_test)[:,1] # This will give you positive class prediction probabilities
y_pred = np.where(y_prob > 0.5, 1, 0) # This will threshold the probabilities to give class predictions.
# Score against the true held-out labels. Passing y_pred here compared the
# model with its own predictions, which yields a meaninglessly high score.
RR_model.score(X_test, y_test)

# In[ ]:


# Confusion matrix of true labels vs. thresholded predictions.
confusion_matrix=metrics.confusion_matrix(y_test,y_pred)
confusion_matrix

# In[ ]:


# NOTE: despite its name, this holds the text classification report
# (metrics.classification_report), not an ROC AUC value.
auc_roc=metrics.classification_report(y_test,y_pred)
auc_roc
    "max_features": sp_randint(1, 11),
    "min_samples_split": sp_randint(1, 11),
    "min_samples_leaf": sp_randint(1, 11),
    "bootstrap": [True, False],
    "criterion": ["gini", "entropy"],
    "n_estimators": sp_randint(100, 600)
}

# In[4]:

# Randomized hyper-parameter search over param_grid for `model`.
# model, param_grid, n_iter and cv are defined in earlier notebook cells.
search_GB = RandomizedSearchCV(model,
                               param_grid,
                               scoring='log_loss',
                               n_jobs=-1,
                               n_iter=n_iter,
                               cv=cv,
                               verbose=True)
# Labels are flattened before being handed to fit.
search_GB.fit(X_train, y_train.flatten())

# In[5]:

# Validation score from the fitted search, using the 'log_loss' scorer above.
log_model = search_GB.score(X_val, y_val.flatten())
# Parenthesised single-argument print: same output on Python 2, and valid
# syntax on Python 3 (the bare print statement is not).
print("Log loss = %s" % log_model)
# Predict class probabilities for the test set and save them as a submission.
X_test = get_test()
y_pred = search_GB.predict_proba(X_test)
save_submission(model_name, log_model, y_pred)

# In[7]:

model_name