Example #1
def find_xgb_best_parameters(test_size=0.2, n_iter_search=20, X=None, y=None):
    if X is None or y is None:
        X, y = pr_kaggle.load_data(cat2vectors=True)
    Xtrain, Xtest, ytrain, ytest = train_test_split(X,
                                                    y,
                                                    test_size=test_size,
                                                    random_state=36)
    param_dist = {
        "n_estimators": [50, 100, 250, 500],
        "max_depth": [10, 5, 15],
        "learning_rate": [0.01, 0.1, 0.0333],
        "subsample": [0.5, 1.0, 0.80],
        #"gamma": [0,0.01],
        #"min_child_weight": [0.5, 1],
        "colsample_bytree": [1.0, 0.5, 0.8, 0.9]
    }
    start = time()
    clf = xgb.XGBClassifier()
    random_search = RandomizedSearchCV(clf,
                                       param_distributions=param_dist,
                                       n_iter=n_iter_search,
                                       n_jobs=1)

    print(Xtrain.shape)
    random_search.fit(Xtrain, ytrain)
    print(
        "RandomizedSearchCV took %.2f seconds for %d candidates"
        " parameter settings." % ((time() - start), n_iter_search))
    # note: grid_scores_ was removed in scikit-learn 0.20; newer versions expose cv_results_ instead
    report(random_search.grid_scores_)

    print('training', random_search.score(Xtrain, ytrain))
    print('testing', random_search.score(Xtest, ytest))
    return random_search
    return random_search
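Example #1 (and several later examples) calls a report helper that is not defined in the snippet. A minimal sketch of such a helper, written against the newer cv_results_ dict (the snippets pass grid_scores_, which was removed in scikit-learn 0.20):

import numpy as np

def report(results, n_top=3):
    # print the top-ranked parameter settings from a fitted search object's cv_results_
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results["rank_test_score"] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                results["mean_test_score"][candidate],
                results["std_test_score"][candidate]))
            print("Parameters: {0}".format(results["params"][candidate]))
            print("")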
Example #2
def trainClassifier(X, y, testTweetsAll, ensembleTweets, ensembleSentiment):

    # split our data into training and test datasets
    xTrain, xTest, yTrain, yTest = train_test_split(X,
                                                    y,
                                                    test_size=0.33,
                                                    random_state=8)

    classifier = RandomForestClassifier(n_estimators=20, n_jobs=-1)

    # for simplicity's sake, we could train a single random forest:
    # classifier.fit(xTrain, yTrain)
    # print classifier.score(xTest, yTest)

    # for more fun, we will optimize the hyperparameters for our random forest using RandomizedSearchCV
    parametersToTry = {
        'max_features': ['sqrt', 'log2', None, .01, .1, .2, .3],
        'criterion': ['gini', 'entropy'],
        'min_samples_leaf': [1],
        'min_samples_split': scipy.stats.randint(2, 30),
        'bootstrap': [True, False]
    }

    # RandomizedSearchCV will optimize our hyperparameters for us in a way that is much more efficient and comprehensive than GridSearchCV.
    # Settings below: run on all cores, fail gracefully if a combination of hyperparameters
    # does not converge, try 10 different combinations of hyperparameters, refit on all the
    # training data when finished, and use 3-fold cross-validation while searching for the
    # best hyperparameters.
    searchCV = RandomizedSearchCV(classifier,
                                  parametersToTry,
                                  n_jobs=-1,
                                  error_score=0,
                                  n_iter=10,
                                  refit=True,
                                  cv=3)

    print('shape of this training data set:')
    print(xTrain.shape)
    searchCV.fit(xTrain, yTrain)
    print('the best hyperparameters from this search are:')
    print(searchCV.best_params_)
    print('best score from hyperparameter search is: ' + str(
        searchCV.best_score_))
    print('score on the holdout portion of the training set: ' + str(
        searchCV.score(xTest, yTest)))
    print('score on the ensemble data: ' + str(
        searchCV.score(ensembleTweets, ensembleSentiment)) + '\n\n')

    testPredictions = searchCV.predict_proba(testTweetsAll)
    ensemblePredictions = searchCV.predict_proba(ensembleTweets)

    def singlePrediction(predictions):
        cleanedPredictions = []
        for predictionRow in predictions:
            cleanedPredictions.append(predictionRow[1])
        return cleanedPredictions

    # the classifier gives us a predicted probability for both the 0 and the 1 case. Given that they're mutually exclusive, we can simplify down to a single number (the predicted probability of the 1 case)
    testPredictions = singlePrediction(testPredictions)
    ensemblePredictions = singlePrediction(ensemblePredictions)

    return testPredictions, ensemblePredictions
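Inside trainClassifier, the singlePrediction loop can also be written as a single slice, since predict_proba returns one column per class and column 1 is the positive class. A minimal equivalent sketch:

# equivalent to singlePrediction: keep only the positive-class probability
testPredictions = searchCV.predict_proba(testTweetsAll)[:, 1]
ensemblePredictions = searchCV.predict_proba(ensembleTweets)[:, 1]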
Example #3
def trainClassifier(X, y, testTweetsAll, ensembleTweets, ensembleSentiment):

    # split our data into training and test datasets
    xTrain, xTest, yTrain, yTest = train_test_split(
        X, y, test_size=0.33, random_state=8)


    classifier = RandomForestClassifier(n_estimators=20, n_jobs=-1)

    # for simplicity's sake, we could train a single random forest:
    # classifier.fit(xTrain, yTrain)
    # print classifier.score(xTest, yTest)


    # for more fun, we will optimize the hyperparameters for our random forest using RandomizedSearchCV
    parametersToTry = {
        'max_features': ['sqrt','log2',None,.01,.1,.2,.3],
        'criterion': ['gini','entropy'],
        'min_samples_leaf': [1],
        'min_samples_split': scipy.stats.randint(2,30),
        'bootstrap': [True,False]
    }

    # RandomizedSearchCV will optimize our hyperparameters for us in a way that is much more efficient and comprehensive than GridSearchCV.
    # Settings below: run on all cores, fail gracefully if a combination of hyperparameters
    # does not converge, try 10 different combinations of hyperparameters, refit on all the
    # training data when finished, and use 3-fold cross-validation while searching for the
    # best hyperparameters.
    searchCV = RandomizedSearchCV(classifier, parametersToTry, n_jobs=-1, error_score=0, n_iter=10, refit=True, cv=3)


    print('shape of this training data set:')
    print(xTrain.shape)
    searchCV.fit(xTrain, yTrain)
    print('the best hyperparameters from this search are:')
    print(searchCV.best_params_)
    print('best score from hyperparameter search is: ' + str(searchCV.best_score_))
    print('score on the holdout portion of the training set: ' + str(searchCV.score(xTest, yTest)))
    print('score on the ensemble data: ' + str(searchCV.score(ensembleTweets, ensembleSentiment)) + '\n\n')


    testPredictions = searchCV.predict_proba(testTweetsAll)
    ensemblePredictions = searchCV.predict_proba(ensembleTweets)


    def singlePrediction(predictions):
        cleanedPredictions = []
        for predictionRow in predictions:
            cleanedPredictions.append(predictionRow[1])
        return cleanedPredictions

    # the classifier gives us a predicted probability for both the 0 and the 1 case. Given that they're mutually exclusive, we can simplify down to a single number (the predicted probability of the 1 case)
    testPredictions = singlePrediction(testPredictions)
    ensemblePredictions = singlePrediction(ensemblePredictions)

    return testPredictions, ensemblePredictions
Example #4
def parametr_tuning_random(model,
                           params,
                           scores,
                           X_train,
                           Y_train,
                           X_test,
                           Y_test,
                           n_iter_search=10):
    """
    """
    for score in scores:
        log("# Tuning hyper-parameters for %s: " % score)
        log("", False)
        rnd_tune = RandomizedSearchCV(model,
                                      params,
                                      n_iter=n_iter_search,
                                      cv=5,
                                      scoring=score)
        rnd_tune.fit(X_train, Y_train)

        log("Best parameters set found on development set:")
        log(str(rnd_tune.best_params_), False)
        log("random search score _ TEST set:")
        log(str(rnd_tune.score(X_test, Y_test) * 100), False)
        log("", False)
        log("random search scores on development set:")
        log(str(rnd_tune.grid_scores_), False)
        log("", False)
        log("Detailed classification report:")
        log("", False)
        y_true, y_pred = Y_test, rnd_tune.predict(X_test)
        log(classification_report(y_true, y_pred), False)
        log("", False)
Example #5
def create_svm(pd, pl, qd, ql):
    lsvc = LinearSVC()
    params = {'C': expon(scale=100)}
    svm = RandomizedSearchCV(lsvc, params, n_jobs=4, n_iter=10, verbose=10)
    print("Training Linear SVM Randomly")
    svm.fit(pd, pl)
    print("SVM Score: " + str(svm.score(qd, ql)))
    return svm
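Example #5 samples C from a continuous exponential distribution rather than a fixed grid. A small usage sketch, assuming scikit-learn's built-in digits dataset purely for illustration:

from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

# toy data for illustration; create_svm itself assumes LinearSVC,
# RandomizedSearchCV and scipy.stats.expon are already imported
X, y = load_digits(return_X_y=True)
train_data, test_data, train_labels, test_labels = train_test_split(
    X, y, test_size=0.25, random_state=0)
svm = create_svm(train_data, train_labels, test_data, test_labels)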
Example #6
def hyperparameter_tuning(model, params, X, y):
    # tune the hyperparameters via a randomized search
    grid = RandomizedSearchCV(model, params)
    start = time.time()
    grid.fit(X, y)

    # evaluate the best model found by the randomized search
    # (note: scored here on the same data it was fit on)
    print("[INFO] randomized search took {:.2f} seconds".format(time.time() -
                                                                start))
    acc = grid.score(X, y)
    print("[INFO] grid search accuracy: {:.2f}%".format(acc * 100))
    print("[INFO] randomized search best parameters: {}".format(
        grid.best_params_))
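Note that hyperparameter_tuning reports accuracy on the same X, y it was fit on, so the number is optimistic. A small usage sketch with a held-out split, assuming a k-nearest-neighbors model purely for illustration (X and y are an existing feature matrix and label vector):

from scipy.stats import randint
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

trainX, testX, trainY, testY = train_test_split(X, y, test_size=0.25, random_state=42)
params = {"n_neighbors": randint(1, 30), "metric": ["euclidean", "manhattan"]}
hyperparameter_tuning(KNeighborsClassifier(), params, trainX, trainY)
# a fairer estimate would refit the best parameters and score on (testX, testY)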
Example #7
def main():
    logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
    logger = logging.getLogger(__name__)

    # Load the data and extract annotated data
    df = pd.read_csv('data/' + DATA_FILE)
    data = df[(df.password.notnull()) & (df.done == '1')]

    Passwords = namedtuple('Passwords', 'data target')
    pwds = Passwords(data=data.tip, target=data.password)

    # Split into training and test set
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        pwds.data, pwds.target, test_size=TEST_SET_PERCENT, random_state=0)

    parameters = {
        'before_cutoff': list(range(0, 6)),
        'after_cutoff': list(range(0, 6)),
        'before_exponential_factor': np.logspace(-1, 1, 10),
        'after_exponential_factor': np.logspace(-1, 1, 10),
        'after_factor': np.logspace(-2, 1, 20),
        'min_password_length': list(range(1, 10)),
        'margin_cutoff': list(range(0, 3)),
        'margin_factor': np.logspace(-2, 0, 10),
        'margin_exponential_factor': np.logspace(-1, 1, 10),
        'eol_factor': np.logspace(-2, 0, 10),
        'bol_factor': np.logspace(-2, 0, 10)
    }

    # Use randomized search and k-fold cross-validation
    clf = RandomizedSearchCV(
        PasswordEstimator(), parameters, cv=2, scoring='accuracy',
        n_jobs=multiprocessing.cpu_count() - 1, n_iter=BUDGET)

    clf = clf.fit(X_train, y_train)

    logger.info(clf.best_params_)

    test_score = clf.score(X_test, y_test)
    logger.info(test_score)
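PasswordEstimator is a project-specific class; RandomizedSearchCV only requires that it follow the standard scikit-learn estimator interface, with every tuned key exposed as an __init__ keyword plus fit and predict. A hypothetical skeleton, with placeholder defaults purely for illustration:

from sklearn.base import BaseEstimator

class PasswordEstimator(BaseEstimator):
    # hypothetical skeleton: the real estimator lives in the project this snippet comes from
    def __init__(self, before_cutoff=2, after_cutoff=2,
                 before_exponential_factor=1.0, after_exponential_factor=1.0,
                 after_factor=0.1, min_password_length=4,
                 margin_cutoff=1, margin_factor=0.1,
                 margin_exponential_factor=1.0,
                 eol_factor=0.1, bol_factor=0.1):
        self.before_cutoff = before_cutoff
        self.after_cutoff = after_cutoff
        self.before_exponential_factor = before_exponential_factor
        self.after_exponential_factor = after_exponential_factor
        self.after_factor = after_factor
        self.min_password_length = min_password_length
        self.margin_cutoff = margin_cutoff
        self.margin_factor = margin_factor
        self.margin_exponential_factor = margin_exponential_factor
        self.eol_factor = eol_factor
        self.bol_factor = bol_factor

    def fit(self, X, y):
        # nothing is learned in this sketch; a real implementation would fit here
        return self

    def predict(self, X):
        # placeholder: echo the input tips back as the "extracted passwords"
        return list(X)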
Example #8
import io
from tempfile import NamedTemporaryFile

import numpy as np
import pydot
from scipy.stats import randint as sp_randint
from sklearn import tree
from sklearn.model_selection import RandomizedSearchCV, train_test_split

rain = .1 * np.load('rain.npy')
rain[rain < 0] = .05/2
dates = np.load('doy.npy').astype(int)
x = np.vstack((dates[:-1], np.sign(rain[:-1])))
x = x.T

y = np.sign(rain[1:])

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=37)

clf = tree.DecisionTreeClassifier(random_state=37)
params = {"max_depth": [2, None],
              "min_samples_leaf": sp_randint(1, 5),
              "criterion": ["gini", "entropy"]}
rscv = RandomizedSearchCV(clf, params)
rscv.fit(x_train,y_train)

sio = io.StringIO()
tree.export_graphviz(rscv.best_estimator_, out_file=sio, feature_names=['day-of-year','yest'])
dec_tree = pydot.graph_from_dot_data(sio.getvalue())[0]  # graph_from_dot_data returns a list in pydot >= 1.2

with NamedTemporaryFile(prefix='rain', suffix='.png', delete=False) as f:
    dec_tree.write_png(f.name)
    print("Written figure to", f.name)

print("Best Train Score", rscv.best_score_)
print("Test Score", rscv.score(x_test, y_test))
print("Best params", rscv.best_params_)
Example #9
submission.loc[pred == 9, 'Class_9'] = 1  # flag rows predicted as class 9 (avoids chained assignment)

submission = submission.drop('label', axis=1)
submission.to_csv('submission_svm.csv', index_label='id')

#

classifiers = [
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
]

clf = SVC(gamma=2, C=1)
clf.fit(X_train, y_train)
score = clf.score(X_test, y_test.astype(str))
svm_pred = clf.predict(X_test)
# need to convert the results to str
# http://stackoverflow.com/questions/19820369/unable-to-solve-an-error-while-running-gridsearch
confusion_matrix(y_test.astype(str), svm_pred.astype(str))
# very poor fit. it mostly predicts as class 2

# http://qiita.com/sotetsuk/items/16ffd76978085bfd7628
## チューニングパラメータ
tuned_parameters = [{
    'kernel': ['rbf'],
    'gamma': [1e-3, 1e-4],
    'C': [1, 10, 100, 1000]
}, {
    'kernel': ['linear'],
    'C': [1, 10, 100, 1000]
}]
Example #10
import io
from tempfile import NamedTemporaryFile

import numpy as np
import pydot
from scipy.stats import randint as sp_randint
from sklearn import tree
from sklearn.model_selection import RandomizedSearchCV, train_test_split

rain = .1 * np.load('rain.npy')
rain[rain < 0] = .05/2
dates = np.load('doy.npy').astype(int)
x = np.vstack((dates[:-1], np.sign(rain[:-1])))
x = x.T

y = np.sign(rain[1:])

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=37)

clf = tree.DecisionTreeClassifier(random_state=37)
params = {"max_depth": [2, None],
              "min_samples_leaf": sp_randint(1, 5),
              "criterion": ["gini", "entropy"]}
rscv = RandomizedSearchCV(clf, params)
rscv.fit(x_train,y_train)

sio = io.StringIO()
tree.export_graphviz(rscv.best_estimator_, out_file=sio, feature_names=['day-of-year','yest'])
dec_tree = pydot.graph_from_dot_data(sio.getvalue())[0]  # graph_from_dot_data returns a list in pydot >= 1.2

with NamedTemporaryFile(prefix='rain', suffix='.png', delete=False) as f:
    dec_tree.write_png(f.name)
    print "Written figure to", f.name

print "Best Train Score", rscv.best_score_
print "Test Score", rscv.score(x_test, y_test)
print "Best params", rscv.best_params_
Example #11
def main():
    csv_file_object = csv.reader(open('Data/train.csv'))  #Load in the training csv file
    header = next(csv_file_object)  #Skip the first line as it is a header
    train_data = []  #Create a variable called 'train_data'
    for row in csv_file_object:  #Skip through each row in the csv file
        train_data.append(row[1:])  #adding each row to the data variable
    train_data = np.array(train_data)  #Then convert from a list to an array

    #I need to convert all strings to integer classifiers:
    #Male = -1, female = 1:
    train_data[train_data[0::, 3] == 'male', 3] = -1
    train_data[train_data[0::, 3] == 'female', 3] = 1
    #embark C = -1, S = 0, Q = 1
    train_data[train_data[0::, 10] == 'C', 10] = -1
    train_data[train_data[0::, 10] == 'S', 10] = 0
    train_data[train_data[0::, 10] == 'Q', 10] = 1
    #Survived
    train_data[train_data[0::, 3] == 1, 0] = 1
    train_data[train_data[0::, 3] == 0, 0] = -1

    #I need to fill in the gaps of the data and make it complete.
    #So where there is no price, I will assume price on median of that class
    #Where there is no age I will give median of all ages

    imp = preprocessing.Imputer(missing_values=0, strategy='median', axis=0)

    #All the ages with no data make the median of the data
    #train_data[train_data[0::,4] == '',4] = np.median(train_data[train_data[0::,4]\
    #                                          != '',4].astype(np.float))
    #All missing ebmbarks just make them embark from most common place
    #train_data[train_data[0::,10] == '',10] = np.round(np.mean(train_data[train_data[0::,10]\
    #                                                   != '',10].astype(np.float)))

    train_data = np.delete(train_data, [2, 7, 9, 10],
                           1)  #remove the name data, cabin and ticket
    train_data[train_data == ''] = '0'
    imp.fit_transform(train_data)
    #I need to do the same with the test data now so that the columns are in the same
    #format as the training data

    #We finally split the data between train set and validation set
    x_train, x_test, y_train, y_test = train_test_split(train_data[0::, 1::],
                                                        train_data[0::, 0],
                                                        test_size=0.2,
                                                        random_state=0)

    #Standardise data
    scaler = preprocessing.StandardScaler().fit(x_train)
    x_train_std = scaler.transform(x_train)
    x_test_std = scaler.transform(x_test)

    test_file_object = csv.reader(open('Data/test.csv'))  #Load in the test csv file
    header = next(test_file_object)  #Skip the first line as it is a header
    test_data = []  #Create a variable called 'test_data'
    ids = []
    for row in test_file_object:  #Skip through each row in the csv file
        ids.append(row[0])
        test_data.append(row[1:])  #adding each row to the data variable
    test_data = np.array(test_data)  #Then convert from a list to an array

    #I need to convert all strings to integer classifiers:
    #Male = 1, female = -1:
    test_data[test_data[0::, 2] == 'male', 2] = 1
    test_data[test_data[0::, 2] == 'female', 2] = -1
    #embark C = -1, S = 0, Q = 1
    test_data[
        test_data[0::, 9] == 'C',
        9] = -1  #Note: this ordinal encoding is not ideal; the categories have no natural numeric order
    test_data[test_data[0::, 9] == 'S', 9] = 0
    test_data[test_data[0::, 9] == 'Q', 9] = 1

    #All the ages with no data make the median of the data
    #test_data[test_data[0::,3] == '',3] = np.median(test_data[test_data[0::,3]\
    #                                           != '',3].astype(np.float))
    #All missing ebmbarks just make them embark from most common place
    #test_data[test_data[0::,9] == '',9] = np.round(np.mean(test_data[test_data[0::,9]\
    #                                                   != '',9].astype(np.float)))
    #All the missing prices assume median of their respectice class
    #for i in xrange(np.size(test_data[0::,0])):
    #    if test_data[i,7] == '':
    #        test_data[i,7] = np.median(test_data[(test_data[0::,7] != '') &\
    #                                             (test_data[0::,0] == test_data[i,0])\
    #            ,7].astype(np.float))

    test_data = np.delete(test_data, [1, 6, 8, 9],
                          1)  #remove the name data, cabin and ticket
    test_data[test_data == ''] = '0'
    #Impute missing values
    imp.fit_transform(test_data)

    #Standardize
    scaler_test = preprocessing.StandardScaler().fit(test_data)
    test_data_std = scaler_test.transform(test_data)
    #The data is now ready to go. So let's train then test!

    start = time()
    print('Training estimators')
    estimators = [('linearsvc', LinearSVC()),
                  ('KNeighborsClassifier', KNeighborsClassifier())]
    clf = Pipeline(estimators)
    # specify parameters and distributions to sample from
    param_dist = {
        "linearsvc__C": sp_randint(1, 1000),
        "linearsvc__loss": ["l1", "l2"],
        "linearsvc__dual": [True],
        "KNeighborsClassifier__n_neighbors": sp_randint(5, 100),
        "KNeighborsClassifier__weights": ["uniform", "distance"],
        "KNeighborsClassifier__algorithm": ["ball_tree", "kd_tree", "brute"],
        "KNeighborsClassifier__leaf_size": sp_randint(3, 100),
    }

    # run randomized search
    n_iter_search = 2000
    random_search = RandomizedSearchCV(clf,
                                       param_distributions=param_dist,
                                       n_iter=n_iter_search,
                                       n_jobs=4,
                                       verbose=1)
    random_search.fit(x_train_std, y_train)

    print('Reporting')
    print(
        "RandomizedSearchCV took %.2f seconds for %d candidates"
        " parameter settings." % ((time() - start), n_iter_search))
    report(random_search.grid_scores_)
    score = random_search.score(x_test_std, y_test)
    print('Test score')
    print(score)
    print('Predicting')
    output = random_search.predict(test_data_std)

    open_file_object = csv.writer(open("pipelinearsvcknn.csv", "wb"))
    open_file_object.writerow(["PassengerId", "Survived"])
    open_file_object.writerows(zip(ids, output))
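The param_dist keys in this example follow scikit-learn's Pipeline naming convention: the step name from the estimators list, two underscores, then that step's parameter. A minimal illustration of the same convention on a simpler pipeline:

from scipy.stats import randint as sp_randint
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

pipe = Pipeline([('scaler', StandardScaler()),
                 ('knn', KNeighborsClassifier())])
param_dist = {
    "knn__n_neighbors": sp_randint(5, 100),   # '<step>__<parameter>' routes each value to its step
    "knn__weights": ["uniform", "distance"],
}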
Example #12
              "svc__gamma": sp_randint(1, 10),
              "svc__coef0": sp_randint(1, 10),
              "svc__shrinking": [True, False]
              }
#Start with data with age
# run randomized search
n_iter_search = 20
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search,n_jobs=4, verbose=1)
random_search.fit(x_train_std,y_train)

print('Reporting')
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
report(random_search.grid_scores_)
score=random_search.score(x_test_std,y_test)
print('Test score')
print(score)
print('Predicting')
output = random_search.predict(test_data_std)


#Finally with data without age
# run randomized search
n_iter_search = 20
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search,n_jobs=4, verbose=1)
Example #13
def main():
    csv_file_object = csv.reader(open('Data/train.csv')) #Load in the training csv file
    header = next(csv_file_object) #Skip the first line as it is a header
    train_data=[] #Create a variable called 'train_data'
    for row in csv_file_object: #Skip through each row in the csv file
        train_data.append(row[1:]) #adding each row to the data variable
    train_data = np.array(train_data) #Then convert from a list to an array
    
    #I need to convert all strings to integer classifiers:
    #Male = -1, female = 1:
    train_data[train_data[0::,3]=='male',3] = -1
    train_data[train_data[0::,3]=='female',3] = 1
    #embark C = -1, S = 0, Q = 1
    train_data[train_data[0::,10] =='C',10] = -1
    train_data[train_data[0::,10] =='S',10] = 0
    train_data[train_data[0::,10] =='Q',10] = 1
    #Survived
    train_data[train_data[0::,3]==1,0] = 1
    train_data[train_data[0::,3]==0,0] = -1
    
    #I need to fill in the gaps of the data and make it complete.
    #So where there is no price, I will assume price on median of that class
    #Where there is no age I will give median of all ages
    
    imp = preprocessing.Imputer(missing_values=0, strategy='median', axis=0)
    
    #All the ages with no data make the median of the data
    #train_data[train_data[0::,4] == '',4] = np.median(train_data[train_data[0::,4]\
    #                                          != '',4].astype(np.float))
    #All missing ebmbarks just make them embark from most common place
    #train_data[train_data[0::,10] == '',10] = np.round(np.mean(train_data[train_data[0::,10]\
    #                                                   != '',10].astype(np.float)))
    
    train_data = np.delete(train_data,[2,7,9,10],1) #remove the name data, cabin and ticket
    train_data[train_data=='']='0'
    imp.fit_transform(train_data)
    #I need to do the same with the test data now so that the columns are in the same
    #format as the training data
    
    
    
    #We finally split the data between train set and validation set
    x_train, x_test, y_train, y_test=train_test_split(
        train_data[0::,1::],train_data[0::,0], test_size=0.2, random_state=0)
    
    #Standardise data
    scaler = preprocessing.StandardScaler().fit(x_train)
    x_train_std=scaler.transform(x_train)
    x_test_std=scaler.transform(x_test)
    
    
    test_file_object = csv.reader(open('Data/test.csv')) #Load in the test csv file
    header = next(test_file_object) #Skip the first line as it is a header
    test_data=[] #Create a variable called 'test_data'
    ids = []
    for row in test_file_object: #Skip through each row in the csv file
        ids.append(row[0])
        test_data.append(row[1:]) #adding each row to the data variable
    test_data = np.array(test_data) #Then convert from a list to an array
    
    #I need to convert all strings to integer classifiers:
    #Male = 1, female = -1:
    test_data[test_data[0::,2]=='male',2] = 1
    test_data[test_data[0::,2]=='female',2] = -1
    #embark C = -1, S = 0, Q = 1
    test_data[test_data[0::,9] =='C',9] = -1 #Note: this ordinal encoding is not ideal; the categories have no natural numeric order
    test_data[test_data[0::,9] =='S',9] = 0
    test_data[test_data[0::,9] =='Q',9] = 1
    
    #All the ages with no data make the median of the data
    #test_data[test_data[0::,3] == '',3] = np.median(test_data[test_data[0::,3]\
    #                                           != '',3].astype(np.float))
    #All missing ebmbarks just make them embark from most common place
    #test_data[test_data[0::,9] == '',9] = np.round(np.mean(test_data[test_data[0::,9]\
    #                                                   != '',9].astype(np.float)))
    #All the missing prices assume median of their respectice class
    #for i in xrange(np.size(test_data[0::,0])):
    #    if test_data[i,7] == '':
    #        test_data[i,7] = np.median(test_data[(test_data[0::,7] != '') &\
    #                                             (test_data[0::,0] == test_data[i,0])\
    #            ,7].astype(np.float))
    
    test_data = np.delete(test_data,[1,6,8,9],1) #remove the name data, cabin and ticket
    test_data[test_data=='']='0'
    #Impute missing values
    imp.fit_transform(test_data)
    
    #Standardize
    scaler_test = preprocessing.StandardScaler().fit(test_data)
    test_data_std=scaler_test.transform(test_data)
    #The data is now ready to go. So let's train then test!
    
    start = time()
    print('Training estimators')
    estimators = [('linearsvc', LinearSVC()), ('KNeighborsClassifier', KNeighborsClassifier())]
    clf = Pipeline(estimators)
    # specify parameters and distributions to sample from
    param_dist = {"linearsvc__C": sp_randint(1, 1000),
                  "linearsvc__loss": ["l1", "l2"],
                  "linearsvc__dual": [True],
                  "KNeighborsClassifier__n_neighbors": sp_randint(5, 100),
                  "KNeighborsClassifier__weights": ["uniform", "distance"],
                  "KNeighborsClassifier__algorithm": ["ball_tree", "kd_tree", "brute"],
                  "KNeighborsClassifier__leaf_size": sp_randint(3, 100),
                  
                  }
    
    # run randomized search
    n_iter_search = 2000
    random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                       n_iter=n_iter_search,n_jobs=4, verbose=1)
    random_search.fit(x_train_std,y_train)
    
    print('Reporting')
    print("RandomizedSearchCV took %.2f seconds for %d candidates"
          " parameter settings." % ((time() - start), n_iter_search))
    report(random_search.grid_scores_)
    score=random_search.score(x_test_std,y_test)
    print('Test score')
    print(score)
    print('Predicting')
    output = random_search.predict(test_data_std)
    
    open_file_object = csv.writer(open("pipelinearsvcknn.csv", "wb"))
    open_file_object.writerow(["PassengerId","Survived"])
    open_file_object.writerows(zip(ids, output))
Example #14
              'max_depth': sp_randint(4, 200),
              'learning_rate': sp_uniform(loc=0e0,scale=1e0),
              'objective':['multi:softprob'],
              'nthread': [8],
              'missing': [np.nan],
              'reg_alpha': [0.01,0.017782794,0.031622777,0.056234133,\
                            0.1,0.17782794,0.31622777,0.56234133,1.,1.77827941,\
                            3.16227766,5.62341325,10.,\
                            17.7827941,31.6227766,56.2341325,100.],
              'colsample_bytree': sp_uniform(loc=0.2e0,scale=0.8e0),
              'subsample': np.arange(0.6,1.0,step=0.05),
              'n_estimators': sp_randint(100,700),
}

print "Randomized XGBoost"
# In[ ]:
for i in range(2):
    print "Loop %i/20" % i
    search_GB = RandomizedSearchCV(GB,
                                   param_grid,
                                   scoring='log_loss',
                                   n_jobs=-1,
                                   n_iter=n_iter,
                                   cv=cv,
                                   verbose=True)
    search_GB.fit(X_train, y_train)
    log_model = search_GB.score(X_val, y_val)
    print "Log loss = %s" % log_model
    X_test = get_test()
    save_submission('XGBoost', log_model, search_GB.predict_proba(X_test))
Example #15
def model_pred(X, Y,
               hyperparams,
               maximize='accuracy',
               model_type='logreg',
               n_iter_search=30,
               n_cv_sets=10,
               limits=[-3, 1, 0.5]):



    X_train, X_test , Y_train , Y_test= train_test_split(X,Y, test_size = 0.3)

    param_dist = hyperparams
    
    if model_type=='linreg' :
        model = linear_model.ElasticNet()
    elif model_type=='lasso' :
        model = linear_model.Lasso()
    elif model_type=='randomforest' :
        model = RandomForestRegressor()
    elif model_type=='GBT' :
        model = GradientBoostingRegressor()
    elif model_type == 'NN':
        model = MLPRegressor()

    #how to decide the score
    random_search = RandomizedSearchCV(model, param_distributions=param_dist,n_iter=n_iter_search, cv = n_cv_sets, )#scoring=
    #scorer(estimator, X, y)

    start = time()
    random_search.fit(X_train, Y_train)
    print("RandomizedSearchCV took %.2f seconds for %d candidates"
          " parameter settings." % ((time() - start), n_iter_search))
    report(random_search.grid_scores_)
    #random_search.fit(X_train, Y_train)
    Y_test_pred= random_search.predict(X_test)
    print('score test set', random_search.score(X_test, Y_test))




    print('===params best model', random_search.best_params_)



    print(' diff pred and ground ', np.mean(abs(Y_test_pred - Y_test)))
    


    model =random_search.best_estimator_    
    Y_sub_pred=cross_validation.cross_val_predict(model, X, y=Y, cv=n_cv_sets, n_jobs=2)
    #Y_sub_pred=(model.predict(X))
    best_score=random_search.best_score_ 
    print(' score on train set', best_score)

    if model_type=='linreg':
        print(list(zip(X_train.columns, model.coef_)))
    elif model_type=='lasso':
        model = linear_model.Lasso()
    elif model_type=='randomforest':
        print(list(zip(X_train.columns, model.feature_importances_)))
    elif model_type=='GBT':
        print(list(zip(X_train.columns, model.feature_importances_)))
    elif model_type == 'NN':
        model = MLPRegressor()


    return Y_sub_pred , best_score
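A brief usage sketch for model_pred, assuming a random-forest run and a hyperparameter dictionary in the same style as the other examples (X is assumed to be a pandas DataFrame, since the function prints X_train.columns, and Y the matching target):

from scipy.stats import randint as sp_randint

rf_hyperparams = {
    'n_estimators': sp_randint(50, 500),
    'max_depth': sp_randint(3, 20),
    'max_features': ['sqrt', 'log2', None],
}
Y_sub_pred, best_score = model_pred(X, Y,
                                    rf_hyperparams,
                                    model_type='randomforest',
                                    n_iter_search=30,
                                    n_cv_sets=10)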
Example #16
# In[ ]:


print(RR_model.best_score_)

# In[ ]:


print(RR_model.best_params_)

# In[ ]:


y_prob = RR_model.predict_proba(X_test)[:,1] # This will give you positive class prediction probabilities  
y_pred = np.where(y_prob > 0.5, 1, 0) # This will threshold the probabilities to give class predictions.
RR_model.score(X_test, y_test)  # accuracy against the true test labels, not the model's own predictions

# In[ ]:


confusion_matrix=metrics.confusion_matrix(y_test,y_pred)
confusion_matrix

# In[ ]:


class_report = metrics.classification_report(y_test, y_pred)
class_report

# In[ ]:
    "max_features": sp_randint(1, 11),
    "min_samples_split": sp_randint(1, 11),
    "min_samples_leaf": sp_randint(1, 11),
    "bootstrap": [True, False],
    "criterion": ["gini", "entropy"],
    "n_estimators": sp_randint(100, 600)
}

# In[4]:

search_GB = RandomizedSearchCV(model,
                               param_grid,
                               scoring='log_loss',
                               n_jobs=-1,
                               n_iter=n_iter,
                               cv=cv,
                               verbose=True)
search_GB.fit(X_train, y_train.flatten())

# In[5]:

log_model = search_GB.score(X_val, y_val.flatten())
print "Log loss = %s" % log_model
X_test = get_test()
y_pred = search_GB.predict_proba(X_test)
save_submission(model_name, log_model, y_pred)

# In[7]:

model_name