Example #1
def tune_train_test(X, y):
    # Nested (outer/inner) cross-validation (Batuwitage, PhD thesis, p. 55)
    outer_kfold = 5
    inner_kfold = 5
    gmeans = np.zeros(shape=(outer_kfold, 3))  # column: gmean, SE, SP

    outer_iter_idx = 0
    skf = cv.StratifiedKFold(
        y, n_folds=outer_kfold
    )  # yields n_folds stratified splits; each holds test and train indices in a 1:(n_folds-1) ratio
    for tr_idx, te_idx in skf:  # the outer loop: StratifiedKFold
        print 'Outer loop iteration', outer_iter_idx + 1, '--------------------'
        X_tr, X_te = X[tr_idx], X[te_idx]
        y_tr, y_te = y[tr_idx], y[te_idx]

        # Inner loop: A _coarse_ grid-search k-fold CV, (Batuwitage, phdthesis, p55)
        coarse_param_space = {'C': [2**i for i in range(-15,0)], \
                              'gamma': [2**i for i in range(1,16)]}

        coarse_clf = svm.SVC(kernel='rbf')
        coarse_grid_search = GridSearchCV(coarse_clf,
                                          param_grid=coarse_param_space,
                                          cv=inner_kfold)
        coarse_grid_search.fit(X_tr,
                               y_tr)  # Run fit with all sets of parameters.

        # best_params_ holds the winning combination; get_params()['estimator__C']
        # would only return the template estimator's default C.
        primary_C = coarse_grid_search.best_params_['C']
        primary_gamma = coarse_grid_search.best_params_['gamma']

        # Inner loop: a _narrow_ grid search around the coarse optimum, in fractional log2 steps
        narrow_param_space = {'C': [primary_C * 2**i for i in [j / 100. for j in range(-75, 100, 25)]], \
                              'gamma': [primary_gamma * 2**i for i in [j / 100. for j in range(-75, 100, 25)]]}

        narrow_clf = svm.SVC(kernel='rbf')
        narrow_grid_search = GridSearchCV(narrow_clf,
                                          param_grid=narrow_param_space,
                                          cv=inner_kfold)
        narrow_grid_search.fit(X_tr,
                               y_tr)  # Run fit with all sets of parameters.

        secondary_C = narrow_grid_search.best_params_['C']
        secondary_gamma = narrow_grid_search.best_params_['gamma']

        # Train a new SVM on the complete training split using secondary_C and secondary_gamma
        clf = svm.SVC(kernel='rbf', C=secondary_C, gamma=secondary_gamma)
        clf.fit(X_tr, y_tr)

        # Evaluate the resulting model on the held-out outer test partition.
        y_pred = clf.predict(X_te)
        cm = confusion_matrix(y_te, y_pred)
        gmeans[outer_iter_idx, :] = util.get_gmean(cm)
        outer_iter_idx += 1

    return (np.mean(gmeans, axis=0), np.std(gmeans, axis=0))
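
`util.get_gmean` is external to this snippet. A minimal sketch of what it plausibly computes, assuming a binary 2x2 confusion matrix and the (gmean, SE, SP) column layout used above:

import numpy as np

def get_gmean(cm):
    # cm: 2x2 confusion matrix, rows = true class, columns = predicted class
    tn, fp, fn, tp = cm.ravel()
    se = tp / float(tp + fn)  # sensitivity (positive-class recall)
    sp = tn / float(tn + fp)  # specificity (negative-class recall)
    return np.array([np.sqrt(se * sp), se, sp])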
Example #2
def train_svm_classifier(debug=False, n_jobs=-1):
    """train SVM classifier"""
    print('Start to load data')
    tr_mat, tr_labels, te_mat, te_labels = generate_tr_te_data()
    print('Data loaded')
    print('Training a linear SVM model')
    estimator = LinearSVC()
    k_fold = KFold(tr_mat.shape[0], n_folds=5)
    Cs = np.logspace(-10, 0, 10)
    clf = GridSearchCV(estimator=estimator, cv=k_fold, n_jobs=n_jobs,
                       param_grid=dict(C=Cs), verbose=10)
    clf.fit(tr_mat, tr_labels)

    if debug:
        # debug with learning curve
        title = 'Learning Curves (SVM, linear kernel, $C=%.6f$)'\
                % clf.best_estimator_.C
        estimator = LinearSVC(C=clf.best_estimator_.C)
        plot_learning_curve(estimator, title, tr_mat, tr_labels, cv=k_fold,
                            n_jobs=n_jobs)
        plt.savefig('learning_curve.png')

    print('Testing linear SVM model')
    print('Evaluate on test set')
    clf = LinearSVC(C=clf.best_estimator_.C)
    clf.fit(tr_mat, tr_labels)
    score = clf.score(te_mat, te_labels)
    print('score: {}'.format(score))
    print(clf.get_params())

    joblib.dump(clf, config.model_path)
    print('Classifier saved to {}'.format(config.model_path))
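
`plot_learning_curve` is not defined in this snippet; here is a minimal sketch modeled on the scikit-learn documentation helper, assuming the era-appropriate `sklearn.learning_curve` module (the same vintage as the `KFold`/`GridSearchCV` APIs used above):

import matplotlib.pyplot as plt
from sklearn.learning_curve import learning_curve

def plot_learning_curve(estimator, title, X, y, cv=None, n_jobs=1):
    # Mean train/validation scores over increasing training-set sizes.
    sizes, tr_scores, te_scores = learning_curve(estimator, X, y, cv=cv, n_jobs=n_jobs)
    plt.figure()
    plt.title(title)
    plt.xlabel('Training examples')
    plt.ylabel('Score')
    plt.plot(sizes, tr_scores.mean(axis=1), 'o-', label='Training score')
    plt.plot(sizes, te_scores.mean(axis=1), 'o-', label='Cross-validation score')
    plt.legend(loc='best')
    return plt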
Example #3
def optimize(svr, X, y):  # X, y added: the search must be fit before best parameters exist
    parameters = {
        'kernel': ('rbf', 'linear', 'poly'),
        'C': tuple(range(1, 10))
    }
    clf = GridSearchCV(svr, parameters)
    clf.fit(X, y)
    print 'Best Parameters:', clf.best_params_  # get_params() would only print constructor arguments
    return clf
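
A hypothetical call to the corrected function above (the iris data and SVC choice are illustrative assumptions, not from the original):

from sklearn import datasets
from sklearn.svm import SVC

iris = datasets.load_iris()
clf = optimize(SVC(), iris.data, iris.target)
print clf.best_score_  # mean cross-validated score of the winning combination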
Example #5
def tune_nuSVC(X_tr, y_tr):
    clf = NuSVC()
    
    param_space = { 'nu': [0.3, 0.5, 0.8], 'kernel': ['rbf'], 'degree': [3, 5], 'gamma': [0.0, 0.5] }
    
    grid_search = GridSearchCV(clf, param_grid=param_space, cv=10)
    grid_search.fit(X_tr, y_tr)  # Run fit with all sets of parameters.
    
    return grid_search.best_estimator_  # get_params()['estimator'] would return the unfitted template
Example #6
		def eval(input_x, input_y, test_x, test, label, write_folder=None):  # note: shadows the built-in eval
			tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
              		  			'C': [1, 10, 100, 1000]},
                				{'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
			grid_clf = GridSearchCV(SVR(C=1,epsilon=0.2), tuned_parameters)
			grid_clf.fit(input_x,input_y)
			print "params : \t"
			print grid_clf.get_params()
			result = grid_clf.predict(test_x)
			#py_weka = python_weka(input_x,input_y,label)
			#py_weka.train()
			#result = py_weka.predict(test_x)
			#py_weka.close()
			#clf = SVR(C=1.0, epsilon=0.2)
			#clf.fit(input_x,input_y)
			#result =  clf.predict(test_x)
			score_index = 0
			produce_set = []
			for i in test:
				produce_set.append([])
				score_list = []
				index_list = []
				for j in i.thread:
					for k in j.sentences:
						k.predict_score = result[score_index]
						score_index += 1
						score_list.append(k.predict_score)
						index_list.append(k.index)
				sorted_index_array = sorted_index(score_list)
				sen_length = 0
				for j in range(len(index_list)):
					if sen_length < float(len(index_list))*0.3:
						produce_set[-1].append(index_list[sorted_index_array[len(index_list)-j-1]])
						sen_length += 1
					else:
						break
			score =  weightRecall(test,produce_set,write_folder)
			print score
			rouge_eval = rouge(test,produce_set)
			rouge_score =  rouge_eval.eval()['rouge_l_f_score']
			print rouge_score
			return score,rouge_score
Example #7
    def set_model_parameters(self, model_name, verbose=3, file_path=""):
        if self.model is not None:
            model_name = self.model.__class__.__name__

        if model_name == "LinearSVC":
            model_to_set = LinearSVC()
            parameters = {"C": [1, 2, 4, 8], "loss": ["l1", "l2"]}
        elif model_name == "SVC":
            model_to_set = OneVsRestClassifier(SVC(kernel="poly"))
            parameters = {
                "estimator__C": [1, 2, 4, 8],
                "estimator__kernel": ["poly", "rbf"],
                "estimator__degree": [1, 2, 3, 4]
            }
        elif model_name == "LogisticRegression":
            model_to_set = LogisticRegression()
            parameters = {"penalty": ["l1", "l2"], "C": [1, 2, 4, 8]}
        else:
            raise ValueError("Invalid model name.")

        # Perform Grid Search with 10-fold cross-validation to estimate the parameters
        # cv_generator = StratifiedKFold(self.label_encoder.transform(self.train_labels), n_folds=7)
        cv_generator = KFold(len(self.train_labels), n_folds=10, shuffle=True)
        model_tunning = GridSearchCV(model_to_set,
                                     param_grid=parameters,
                                     scoring="f1",  # a bare metric function is not a valid scorer; use a string or make_scorer
                                     n_jobs=1,
                                     cv=cv_generator,
                                     verbose=verbose)

        # Perform parameter setting
        model_tunning.fit(self.train_feats,
                          self.label_encoder.transform(self.train_labels))

        if verbose > 0:
            print "Best model:"
            print model_tunning.best_estimator_
            print "Best parameters:"
            print model_tunning.best_params_
            print "Best score {}:".format(
                model_tunning.get_params()["score_func"])
            print model_tunning.best_score_

        if file_path != "":
            file_name = file_path + model_name + "AI_Semi.bin"
            if verbose > 0:
                print "Saving best model {}...".format(file_name)
            tunned_model_file = open(file_name, "wb")
            cPickle.dump(model_tunning.best_estimator_, tunned_model_file)
            tunned_model_file.close()

        self.model = model_tunning.best_estimator_

        return self.model
Example #8
def tune_NearestNeigbours(X_tr, y_tr):
    print 'tune_NearestNeigbours(X_tr, y_tr):'
    clf = KNeighborsClassifier(algorithm='ball_tree')

    param_space = {'n_neighbors': [3, 5, 10], \
                   'leaf_size': [20, 30, 50], \
                   'metric': ['minkowski']}  # minkowski with the default p=2 is the Euclidean distance

    grid_search = GridSearchCV(clf, param_grid=param_space, cv=10)
    grid_search.fit(X_tr, y_tr)  # Run fit with all sets of parameters.

    return grid_search.best_estimator_  # get_params()['estimator'] would return the unfitted template
Example #9
def tune_GradientBoosting(X_tr, y_tr):
    print 'tune_GradientBoosting(X_tr, y_tr):'
    clf = GradientBoostingClassifier()
    
    param_space = {'n_estimators': [50, 100, 200], \
                   'learning_rate': [0.1, 0.3], \
                   'max_features': [0.3, 0.5]}
    
    grid_search = GridSearchCV(clf, param_grid=param_space, cv=10)
    grid_search.fit(X_tr, y_tr)  # Run fit with all sets of parameters.
    
    return grid_search.best_estimator_  # get_params()['estimator'] would return the unfitted template
Example #10
def tune_XRandomizedTrees(X_tr, y_tr):
    print 'tune_XRandomizedTrees(X_tr, y_tr):'
    clf = ExtraTreesClassifier()

    param_space = {'n_estimators': [10, 30, 70], \
                   'criterion': ['gini', 'entropy'], \
                   'max_features': [0.3, 0.5]}

    grid_search = GridSearchCV(clf, param_grid=param_space, cv=10)
    grid_search.fit(X_tr, y_tr)  # Run fit with all sets of parameters.

    return grid_search.best_estimator_  # get_params()['estimator'] would return the unfitted template
Example #14
def tune_SGD(X_tr, y_tr):
    print 'tune_SGD(X_tr, y_tr):'
    clf = SGDClassifier()
    
    param_space = {'loss': ['hinge', 'log', 'modified_huber'], \
                   'penalty': ['l2', 'l1'], \
                   'alpha' : [0.0001, 0.001]}
    
    grid_search = GridSearchCV(clf, param_grid=param_space, cv=10)
    grid_search.fit(X_tr, y_tr)  # Run fit with all sets of parameters.
    
    return grid_search.best_estimator_  # get_params()['estimator'] would return the unfitted template
Example #16
def tune_nuSVC(X_tr, y_tr):
    print 'tune_nuSVC(X_tr, y_tr):'
    clf = NuSVC()

    param_space = {'nu': [0.3, 0.5, 0.8], \
                   'kernel': ['poly', 'rbf', 'sigmoid'], \
                   'degree': [3, 5], \
                   'gamma': [0.0, 0.5]}

    grid_search = GridSearchCV(clf, param_grid=param_space, cv=10)
    grid_search.fit(X_tr, y_tr)  # Run fit with all sets of parameters.

    return grid_search.best_estimator_  # get_params()['estimator'] would return the unfitted template
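
A recurring pitfall in the tuning helpers above, worth spelling out: `grid_search.get_params()['estimator']` returns the estimator template handed to `GridSearchCV`, still carrying its constructor defaults. The refit winner lives in `best_estimator_`, and the winning values in `best_params_`. A quick way to see the difference with the NuSVC search above:

grid_search.fit(X_tr, y_tr)
print grid_search.get_params()['estimator'].nu  # template default (0.5 for NuSVC), not the tuned value
print grid_search.best_params_['nu']            # the value that actually won the search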
Example #17
def runDecisionTreewithAdaboost(Y, Xtrain, Xtest, isBlind):

    # Note time to run this setup
    run_start = time.time()

    # Reduce feature based on importance
    (Xtrain, Xtest) = reduceFeatureswithExtraTrees(Y, Xtrain, Xtest)

    model_start = time.time()

    # Specify parameters for GridSearch
    param_grid = {
        'base_estimator__criterion': ["gini", "entropy"],
        'base_estimator__max_depth': [8, 10],
        'base_estimator__max_features': ['sqrt', 0.25],
        'n_estimators': [25, 30],
        'learning_rate': [0.8, 1.0]
    }

    dtc = DecisionTreeClassifier(random_state=11,
                                 max_features="auto",
                                 class_weight="balanced")
    abc = AdaBoostClassifier(base_estimator=dtc, algorithm="SAMME.R")

    # run grid search
    abc_tuned = GridSearchCV(abc, param_grid=param_grid, scoring='f1_weighted')

    # Fit the model
    abc_tuned.fit(Xtrain, Y)

    model_end = time.time()
    logger.info('Time to run Gridsearch with AdaBoost(DecisionTree): %0.3fs' %
                (model_end - model_start))

    logger.info('Model params = %s' % abc_tuned.get_params())
    logger.info('AUC per class = %s' %
                getAUCByClass(abc_tuned, Xtrain, Y, classes=[1, 2, 3, 4]))
    logger.info('F1 Score per class = %s' %
                getF1ScoreByClass(abc_tuned, Xtrain, Y, classes=[1, 2, 3, 4]))

    # Create submission file
    createSubmission(abc_tuned, Xtest, isBlind)

    # Note the end time
    run_end = time.time()
    logger.info('Time to run analysis(AdaBoost): %0.3fs' %
                (run_end - run_start))
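
`reduceFeatureswithExtraTrees` is defined elsewhere in this project; a minimal sketch of what such a helper might do, assuming `SelectFromModel`-style importance filtering (the estimator settings are illustrative, not from the original):

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

def reduceFeatureswithExtraTrees(Y, Xtrain, Xtest):
    # Fit extra-trees on the training data and keep only the features whose
    # importance clears the selector's default (mean-importance) threshold.
    selector = SelectFromModel(ExtraTreesClassifier(n_estimators=100, random_state=11))
    selector.fit(Xtrain, Y)
    return selector.transform(Xtrain), selector.transform(Xtest)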
Example #18
def find_params(model, data, labels, param_grid={}, test_frac=0.6, seed=500):
    '''
    Use a grid search to determine the optimum parameters for the given model.
    '''

    # train_test_split returns four arrays, not two
    X_train, X_test, y_train, y_test = \
        train_test_split(data, labels, test_size=test_frac, random_state=seed)

    clf = GridSearchCV(model, param_grid)

    clf.fit(X_train, y_train)  # fit needs the labels as well

    score = clf.score(X_test, y_test)  # score requires held-out data to evaluate

    pars = clf.best_params_  # get_params() would return constructor arguments

    return pars, score
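
A hypothetical call, assuming preprocessed arrays X (features) and y (labels) are available (the estimator and grid are illustrative):

from sklearn.svm import SVC

pars, score = find_params(SVC(), X, y, param_grid={'C': [0.1, 1, 10]})
print pars, score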
Example #20
param_grid = {
    'scaler': [None, preprocessing.MinMaxScaler()],
    'selector__k': [10, 15, 'all']
}

clf_gnb_grd = GridSearchCV(pipe, param_grid, scoring='f1')
#ptest_classifier(clf_gnb_grd, my_dataset, features_list_all)
tester.dump_classifier_and_data(clf_gnb_grd, my_dataset, features_list_eng_all)
tester.main()

## Features and the Feature score of the final model
print 
print "*********************************************************************************************************************************"
print "GaussianNB parameters"
print clf_gnb_grd.get_params()
print "*********************************************************************************************************************************"

'''
print
print "*********************************************************************************************************************************"
print "Features and the Feature score of the final model" 
gt_kbest_scores = sorted(zip(selector.scores_, features_list_eng_all[:10]), reverse=True)
print gt_kbest_scores
print "*********************************************************************************************************************************"

'''



# # Performance Tuning and Validation of DecisionTree model
Example #21
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)  # fit the scaler on the training split only, then apply it to the test split

cv_1 = StratifiedKFold(y_test, n_folds=10, random_state=12)

C_range = np.logspace(-5, 15, 11, base=2)
gamma_range = np.logspace(-15, 3, 10, base=2)
param_grid = dict(gamma=gamma_range, C=C_range)
    
# Find the best parameters for SVM.
grid = GridSearchCV(SVC(kernel='rbf'), param_grid=param_grid, cv=cv_1, scoring='accuracy', n_jobs=320, verbose=2)
print("GridSearchCV line... done")
grid.fit(X_test, y_test)
print("grid.fit() line... done")

print(grid.get_params())

print("The best parameters are %s with a score of %0.2f" % (grid.best_params_, grid.best_score_))

print(grid.best_estimator_)
clf = grid.best_estimator_ 

cv_2 = StratifiedKFold(y_train, n_folds=10, random_state=12)

scores = cross_val_score(clf, X_train, y_train, cv=cv_2, scoring='accuracy', n_jobs=n_jobs_)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
print(scores)

predicted_test_cross = cross_val_predict(clf, X_train, y_train, cv=cv_2, n_jobs=n_jobs_)
print(np.count_nonzero(predicted_test_cross == 1, axis=0))
Example #22
        'epsilon': [0.0],
        'gamma': [0.01],
        'kernel': ['rbf']
    },
]

# {'kernel': 'rbf', 'C': 1, 'verbose': False, 'degree': 3, 'epsilon': 0.0, 'shrinking': True, 'max_iter': -1,
# 'tol': 0.001, 'cache_size': 200, 'coef0': 0.0, 'gamma': 0.01}
# ()

model = GridSearchCV(SVR(), param_grid, n_jobs=2, cv=5)
model.fit(features_train, sval_train)
model = model.best_estimator_

print('Model params:')
print(model.get_params())
print()

# cross validation results

from sklearn.cross_validation import cross_val_score

scores = cross_val_score(model, features_train, sval_train, cv=5)
scores_mse = cross_val_score(model,
                             features_train,
                             sval_train,
                             cv=5,
                             scoring='mean_squared_error')

print('Cross validation:')
Example #23
def skSVM(X, y, scoring, tuned_parameters, data_parameters, cv_parameters):
    '''DESCRIPTION: Takes the preprocessed data set along with the relevant parameters, selects the best model on the training set via a grid search, retrains that model on the training set, and evaluates it on the test set. This covers splitting the data into training and testing partitions, gridding, generating cross-validation metrics for all score types, training, testing, and reporting to the terminal.
    PRECONDITIONS: X. The independent-variable data set: a preprocessed numpy float array (per the process function in the guiTK module). No empty values allowed.
                   y. The dependent-variable data set: a preprocessed numpy float array with the same number of elements as rows in X. No empty values allowed. Binary categories (0 and 1) only, with at least one of each.
                   scoring. One of the standard scikit-learn scoring strings (e.g. accuracy, precision, ...).
                   tuned_parameters. The standard scikit-learn tuned_parameters list of dictionaries (e.g. kernel, ...).
                   data_parameters. A dictionary of the parameters used for the data; the relevant ones here control how the data file is split. Generated by process.
                   cv_parameters. A dictionary of all options required to implement the cross validations. Generated by process.
    POSTCONDITIONS: Prints results to the terminal, including the best estimator, cross-validation results on the training data, and prediction metrics from the testing data.
    SIDE EFFECTS: None.
    RETURN: None; the function only prints. Future work will make it initialize and store variables in a class.
    '''


    # Users can optionally stratify the data, i.e. guarantee that the pass/fail classes appear in the same proportions in both the testing and training sets. Uses cross-validation splitters instead of a simple train/test split function.
    # First we create the split indices, then we apply them to new numpy arrays.
    if data_parameters['stratify']:
        split = StratifiedKFold(y=y, n_folds=int(1./data_parameters['testSize']), shuffle=True, random_state=data_parameters['random'])
    else:
        split = KFold(n=len(y), n_folds=int(1./data_parameters['testSize']), shuffle=True, random_state=data_parameters['random'])

    for train, test in split:
        X_train = X[train]
        X_test = X[test]
        y_train = y[train]
        y_test = y[test]
        break

    print "Starting the gridding process."
    print

    # We must guarantee a valid number of folds: at least two, but no higher than the minority-class count or one fold per five samples
    numLabel1 = np.count_nonzero(y_train)
    numLabel0 = len(y_train) - numLabel1
    numFiveFolds = len(y_train) // 5  # was len(y_train) % 5, which yields a remainder, not a fold count
    n_folds = max(2, min(cv_parameters['folds'], numLabel0, numLabel1, numFiveFolds))

    # Build a cross validation data sorting method to pass into the gridding process based on data_parameters (from the GUI input)
    if cv_parameters['cvType'] == 'skf':
        cv = StratifiedKFold(y=y_train, n_folds=n_folds, shuffle=True, random_state=data_parameters['random'])
    elif cv_parameters['cvType'] == 'kf':
        cv = KFold(n=len(y_train), n_folds=n_folds, shuffle=True, random_state=data_parameters['random'])
    elif cv_parameters['cvType'] == 'sss':
        cv = StratifiedShuffleSplit(y=y_train, n_iter=cv_parameters['nIter'], test_size=cv_parameters['testSize'], random_state=data_parameters['random'])
    elif cv_parameters['cvType'] == 'ss':
        cv = ShuffleSplit(n=len(y_train), n_iter=cv_parameters['nIter'], test_size=cv_parameters['testSize'], random_state=data_parameters['random'])
    elif cv_parameters['cvType'] == 'lolo':
        cv = LeaveOneOut(n=len(y_train))
    elif cv_parameters['cvType'] == 'lplo':
        cv = LeavePOut(n=len(y_train), p=cv_parameters['p'])

    # Now grid based on the scoring, tuned parameters and CV class above
    grid = GridSearchCV(estimator=SVC(), param_grid=tuned_parameters, scoring=scoring, cv=cv)
    grid.fit(X_train, y_train)

    # Retrain the best CLF from the gridding process with the entire training data set and cross validate against all scoring types
    clf = grid.best_estimator_
    clf.fit(X_train, y_train)
    accuracy_scores = cross_val_score(clf, X_train, y_train, scoring='accuracy', cv=cv)
    precision_scores = cross_val_score(clf, X_train, y_train, scoring='precision', cv=cv)
    recall_scores = cross_val_score(clf, X_train, y_train, scoring='recall', cv=cv)
    f1_scores = cross_val_score(clf, X_train, y_train, scoring='f1', cv=cv)
    y_test_pred = clf.predict(X_test)
    y_train_pred = clf.predict(X_train)

    # Print the relevant results to the terminal for 
    # 1) The best estimator parameters
    # 2) Cross validation results for the training data and best estimator
    # 3) Prediction metrics on the test data
    print
    print "ESTIMATOR SELECTED FOR OPTIMAL {}:".format(grid.get_params(deep=True)['scoring'].upper())
    print "Parameters:"
    pprint.pprint(grid.best_params_, width=1)
    print
    print "Cross-validation scores on training data:"
    print "  Accuracy:  {:.1f} +/- {:.1f}%".format(accuracy_scores.mean() * 100, accuracy_scores.std() * 100)
    print "  Precision: {:.1f} +/- {:.1f}%".format(precision_scores.mean() * 100, precision_scores.std() * 100)
    print "  Recall:    {:.1f} +/- {:.1f}%".format(recall_scores.mean() * 100, recall_scores.std() * 100)
    print "  F1:        {:.1f} +/- {:.1f}%".format(f1_scores.mean() * 100, f1_scores.std() * 100)
    print
    print "Trained estimator scores on testing data:"
    print "  Accuracy:  {:.1f}%".format(accuracy_score(y_test, y_test_pred)*100)
    print "  Precision: {:.1f}%".format(precision_score(y_test, y_test_pred, average='weighted')*100)
    print "  Recall:    {:.1f}%".format(recall_score(y_test, y_test_pred, average='weighted')*100)
    print "  F1:        {:.1f}%".format(f1_score(y_test, y_test_pred, average='weighted')*100)
    print
    print "Trained estimator classification report on testing data:"
    print
    print classification_report(y_test, y_test_pred, target_names=['Fail', 'Pass'])
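
A hypothetical invocation, with illustrative dictionaries whose keys match what the function reads above (values are assumptions, not from the original):

tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100]},
                    {'kernel': ['linear'], 'C': [1, 10, 100]}]
data_parameters = {'stratify': True, 'testSize': 0.2, 'random': 42}
cv_parameters = {'cvType': 'skf', 'folds': 5, 'nIter': 10, 'testSize': 0.2, 'p': 2}
skSVM(X, y, 'f1', tuned_parameters, data_parameters, cv_parameters)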
Example #24
from self import preprocess
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.grid_search import GridSearchCV

features_train, features_test, labels_train, labels_test, mail = preprocess()
grid = {'C': [0.1, 1, 10, 100], 'gamma': [0.001, 0.01, 0.1, 1, 10]}
clf = GridSearchCV(SVC(kernel="rbf"), grid)
clf.fit(features_train, labels_train)
x = clf.predict(features_test)
print clf.predict(mail)
print clf.get_params()
print f1_score(labels_test, x, average="micro")
print confusion_matrix(labels_test, x)
Example #25
the size of the original data set.
"""
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.1, random_state=0)

print "X_train shape =", X_train.shape, "  y_train shape=", y_train.shape
#print "X_test shape =", X==.shape, "  y_test shape=", y_test.shape
print

"""
The following lines train the SVM using our extracted training dataset and
is parameterized based on the gridding results. Then the trained SVM is
used to carry out predictions on the test data set. The percentage 
of accuracy predictions is printed
"""
clf = svm.SVC(kernel='rbf', C=10, gamma=0.00001, degree=3, coef0=0.0).fit(X_train, y_train)
print "clf.get_params(deep=True) =", clf.get_params(deep=True)
print "clf.score(X_test, y_test) = {0}%".format(int((clf.score(X_test, y_test) * 10000))/100.)
print "clf.predict(X_test) = ", clf.predict(X_test)
print "clf.decision_function(X_test) = ", clf.decision_function(X_test)
print "======================="
print "clf.score(X_train, y_train) = {0}%".format(int((clf.score(X_train, y_train) * 10000))/100.)
print "clf.predict(X_train) = ", clf.predict(X_train)
print "clf.decision_function(X_train) = ", clf.decision_function(X_train)
print "======================="
print
print
print "#####################################"
"""
http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC
"""
print "clf.support_ = ", clf.support_
Example #26
class BaseMethod(object):
    def __init__(self,
                 docs_train,
                 y_train,
                 extra={},
                 useCrossValidation=False,
                 vect_options={}):

        if sys.flags.debug:
            self.options = {}
        else:
            self.options = {
                'vect__ngram_range': [(1, 1)],  # (2, 2), (3,3)],
                # 'vect__stop_words': ('english', None),
                'vect__preprocessor':
                (None, pr.no_prep, pr.no_usernames, pr.remove_noise,
                 pr.placeholders, pr.all, pr.remove_all, pr.reduced_attached,
                 pr.no_url_usernames_reduced_attached),
                'vect__use_idf': (True, False),
                'vect__max_df': (0.5, ),
                'vect__smooth_idf': (True, False),
                'vect__sublinear_tf': (True, False)
            }

        self.train(docs_train, y_train, extra, useCrossValidation,
                   vect_options)

    def train(self,
              docs_train,
              y_train,
              extra={},
              useCrossValidation=False,
              vect_options={}):

        options = dict(self.options.items() + extra.items())
        cv = StratifiedKFold(y_train,
                             n_folds=10) if useCrossValidation else None

        pipeline = Pipeline([
            ('vect',
             TfidfVectorizer(charset_error='ignore',
                             tokenizer=t.tokenize,
                             **vect_options)),
            ('clf', self.clf),
        ])

        useGrid = sys.flags.optimize

        if useGrid:
            self.grid = GridSearchCV(pipeline,
                                     options,
                                     cv=cv,
                                     refit=True,
                                     n_jobs=-1,
                                     verbose=1)
        else:
            self.grid = pipeline

        cache_key = str(self.grid) + str(docs_train)
        cached = cache.get(cache_key)

        if cached and sys.flags.debug == 0:
            logging.debug("# Fetched cached version of %s " %
                          self.clf.__class__.__name__)
            self.best_estimator = cached['est']
            self.best_score = cached['scr']
            self.best_params = cached['parm']

        else:
            logging.debug("# Training new instance of %s " %
                          self.clf.__class__.__name__)

            self.grid.fit(docs_train, y_train)

            if useGrid:
                self.best_estimator = self.grid.best_estimator_
                self.best_params = self.grid.best_params_
                self.best_score = self.grid.best_score_
            else:
                self.best_estimator = self.grid
                self.best_params = self.grid.get_params(False)
                self.best_score = 1

            # Cache the result whichever branch ran; previously only the
            # non-grid branch saved, so grid-search results were never cached.
            logging.debug("Saving to cache for %s " %
                          self.clf.__class__.__name__)
            cache.save(
                cache_key, {
                    "est": self.best_estimator,
                    "scr": self.best_score,
                    "parm": self.best_params
                })

        self.steps = self.best_estimator.named_steps

        logging.debug("# Best params for  %s :" % self.clf.__class__.__name__)
        logging.debug(self.best_params)

        logging.debug("# Best score for  %s :" % self.clf.__class__.__name__)
        logging.debug(self.best_score)

        return self.grid

    def predict(self, arg_input):
        orig = arg_input
        if isinstance(arg_input, basestring):
            orig = [orig]

        predictions = self.best_estimator.predict(orig)
        if isinstance(arg_input, basestring):
            return predictions[0]

        return predictions

    def __str__(self):
        return "%s" % self.__class__.__name__
Example #27
gs = gs.fit(X_train, y_train)
print(gs.best_score_)
print(gs.best_params_)



pipe_neighbors=Pipeline([('scl',StandardScaler()),('clf',KNeighborsClassifier())])
metrics       = ['minkowski', 'euclidean', 'manhattan']
weights       = ['uniform', 'distance']  # 10.0**np.arange(-5,4)
numNeighbors  = np.arange(5, 10)
# Pipeline step parameters need the step-name prefix, and the KNN metric
# argument is 'metric', not 'metric_params'.
param_grid    = dict(clf__metric=metrics, clf__weights=weights, clf__n_neighbors=numNeighbors)
neighbors_range = range(1, 21)
# param_grid = dict(kneighborsclassifier__n_neighbors=neighbors_range)
grid = GridSearchCV(estimator=pipe_neighbors,
					param_grid=param_grid,
					scoring='accuracy',
					cv=10,
					n_jobs=1)
print grid.get_params().keys()		
# pipe = Pipeline.make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=3))
# cross_val_score(pipe, X, y, cv=5, scoring='accuracy').mean()

# using GridSearchCV with Pipeline
# neighbors_range = range(1, 21)
# param_grid = [{'kneighborsclassifier__n_neighbors':neighbors_range}]
# grid = GridSearchCV(pipe_neighbors, param_grid, cv=5, scoring='accuracy')

grid = grid.fit(X_train, y_train)
print(grid.best_score_)
print(grid.best_params_)
Example #28
svm_pipe = Pipeline([('scaler', StandardScaler()), ('selector', SelectKBest()),
                     ('svm', svm.SVC())])  # renamed so the svm module is not shadowed
param_grid = ([{
    'svm__C': [1, 50, 100, 1000],
    'svm__gamma': [0.5, 0.1],
    'svm__degree': [1, 2],
    'svm__kernel': ['rbf', 'poly'],
    'selector__k': range(1, len(total_features))
}])

svm_clf = GridSearchCV(svm_pipe, param_grid, scoring='recall',
                       cv=5).fit(features, labels).best_estimator_
tester.test_classifier(svm_clf, my_dataset, total_features)

best_features = []
for i, feature in enumerate(svm_clf.get_params()['selector'].scores_):
    best_features.append([total_features[1:][i], feature])
pd_feature = pd.DataFrame(best_features,
                          index=np.arange(1,
                                          len(best_features) + 1),
                          columns=['Feature', 'Score'])

best_features = ['poi'] + pd_feature.nlargest(13, 'Score')['Feature'].tolist()
pd_feature.nlargest(13, 'Score')
### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, best_features, sort_keys=True)
labels, features = targetFeatureSplit(data)

### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
Example #29
# testing subsets
X = run1meta_features.values
y = cputime.values
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=seed)

RF = RandomForestRegressor(random_state=seed)
# Run a grid search with Random Forests over n_estimators
param_grid_RF = {'n_estimators': np.array([16, 32, 64, 128, 256, 512])}

np.set_printoptions(suppress=True)
print(param_grid_RF)

RF_tuned = GridSearchCV(RF, param_grid_RF, verbose=3)

RF_tuned.fit(X_train, y_train)
print(RF_tuned.get_params())

y_RF_tuned_pred = RF_tuned.predict(X_test)

mse_RF_tuned = mean_squared_error(y_test,y_RF_tuned_pred)
r2_RF_tuned = RF_tuned.score(X_test, y_test)

# Print some metrics of the quality of the RF modeling
print('MSE (tuned RF) =', mse_RF_tuned)
print('R2 score (tuned RF) =',r2_RF_tuned)

RFbest = RF_tuned.best_estimator_

print(run1meta_features.keys())
print(RFbest.feature_importances_)
Example #30
        # 'clf__criterion': ['gini', 'entropy'],
        # 'clf__max_depth': [3, 6, 8, 11, 15, 20]

        # KNC
        'clf__n_neighbors': [2, 4, 6, 10],
        'clf__weights': ['distance', 'uniform'],
        'clf__algorithm': ['kd_tree', 'ball_tree', 'auto', 'brute'],
    },
    scoring='recall')

grid_search.fit(X_train, y_train)

clf = pipeline.set_params(**grid_search.best_params_)
pipeline.fit(X_train, y_train)

print(grid_search.best_params_)
print(dir(grid_search))

report = classification_report(y_test, clf.predict(X_test))
print report

# dump classifier and dta
dump_classifier_and_data(clf, my_dataset, features_list)

# Getting the feature Scores
k = grid_search.get_params(
    True)['estimator__feature_selection__transformer_list'][0][1]
features_scores = zip(features_list[1:], k.scores_)
for f, s in sorted(features_scores, key=lambda x: x[1], reverse=True):
    print('%s: %s' % (f, s))
Example #31
    plt.plot(ln_x_test,
             y_predict,
             colors[t],
             lw=t + 3,
             label=u'%s estimate, $R^2$=%.3f' % (titles[t], model.best_score_))
# Show the figure
plt.legend(loc='upper left')
plt.grid(True)
plt.title(u"Boston housing price prediction")
plt.show()

# Model training ====> a single Lasso model (degree-1 features) <optimal parameters for the degree-1 case>
model = Pipeline([('ss', StandardScaler()),
                  ('poly',
                   PolynomialFeatures(degree=1,
                                      include_bias=True,
                                      interaction_only=True)),
                  ('linear',
                   LassoCV(alphas=np.logspace(-3, 1, 20),
                           fit_intercept=False))])
# Fit the model
model.fit(x_train, y_train)

# Model evaluation
# Print the results
print("Coefficients:", zip(names, model.get_params()['linear'].coef_))
print("Intercept:", model.get_params()['linear'].intercept_)

# Coefficients: [('CRIM', 21.135499741068376), ('ZN', -0.0), ('INDUS', -0.0), ('CHAS', -0.0), ('NOX', 0.19539929236955278), ('RM', -0.0), ('AGE', 1.5662356175920531), ('DIS', -0.38131114313786807), ('RAD', -0.69604251661926086), ('TAX', 0.0), ('PTRATIO', -0.0), ('B', -1.5063986238529539), ('LSTAT', 0.0)]
# Intercept: 0.0