Example #1
def train(param_search=False):
    data = load_files(download())
    y = [data.target_names[t] for t in data.target]

    # The random state on the LR estimator is fixed to the most arbitrary value
    # that I could come up with. It is biased toward the middle number keys on
    # my keyboard.
    clf = make_pipeline(TfidfVectorizer(min_df=2, dtype=float,
                                        sublinear_tf=True,
                                        ngram_range=(1, 2),
                                        strip_accents='unicode'),
                        LogisticRegression(random_state=623, C=5000))

    if param_search:
        params = {'tfidfvectorizer__ngram_range': [(1, 1), (1, 2)],
                  'logisticregression__C': [1000, 5000, 10000]}

        print("Starting parameter search for review sentiment classification")
        # We ignore the original folds in the data, preferring a simple 5-fold
        # CV instead; this is intended to get a working model, not results for
        # publication.
        gs = GridSearchCV(clf, params, cv=5, refit=True, n_jobs=-1, verbose=2)
        gs.fit(data.data, y)

        print("Parameters found:")
        pprint(gs.best_params_)
        print("Cross-validation accuracy: %.3f" % gs.best_score_)

        return gs.best_estimator_

    else:
        print("Training logistic regression for movie review polarity")
        return clf.fit(data.data, y)
def test_grid_search_precomputed_kernel():
    """Test that grid search works when the input features are given in the
    form of a precomputed kernel matrix """
    X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0)

    # compute the training kernel matrix corresponding to the linear kernel
    K_train = np.dot(X_[:180], X_[:180].T)
    y_train = y_[:180]

    clf = SVC(kernel='precomputed')
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]})
    cv.fit(K_train, y_train)

    assert_true(cv.best_score_ >= 0)

    # compute the test kernel matrix
    K_test = np.dot(X_[180:], X_[:180].T)
    y_test = y_[180:]

    y_pred = cv.predict(K_test)

    assert_true(np.mean(y_pred == y_test) >= 0)

    # test error is raised when the precomputed kernel is not array-like
    # or sparse
    assert_raises(ValueError, cv.fit, K_train.tolist(), y_train)
Example #3
	def do_cross_validation(self, param_grid, svmtype, score_func, inputdata_train, outputdata_train, inputdata_test, outputdata_test):
		""" Fitting of classifier used for cross validation """

		if svmtype == 'ln':
			svm_clf = LinearSVC()
		if svmtype == 'rbf':
			svm_clf = SVC()
		#clf_cv = GridSearchCV(SVC(), param_grid, score_func=score_func,  n_jobs=-1 )
		#clf_cv = GridSearchCV( LinearSVC(), param_grid, score_func=score_func,  n_jobs=-1 )
		
		clf_cv = GridSearchCV(svm_clf, param_grid, score_func=score_func,  n_jobs=-1 )

		clf_cv.fit(inputdata_train, outputdata_train)
		y_pred_cv = clf_cv.predict(inputdata_test)

		f1 = metrics.f1_score(outputdata_test, y_pred_cv, pos_label=0)
		dict_param = clf_cv.best_params_
		c = dict_param['C']	

		if svmtype == 'rbf':
			gamma1 = dict_param['gamma']
		else:
			gamma1 = 0


		return(f1, gamma1, c)
Example #4
def search_parameters(data_file):
    with open(data_file, 'r') as f:
            data = pickle.load(f)
            labels = data['labels']
            features = data['features']
    # Split the dataset in two equal parts
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    features, labels, test_size=0.5, random_state=0)
    scores = [
    ('error_rate', zero_one_score),]
    
    #classifier = svm.LinearSVC()
    classifier = MultinomialNB()
    
    tuned_parameters = {'alpha' :(0.001, 0.01,0.1,0.5,1,1.5,2,5,10) }
    #tuned_parameters = {'C' :(0.00001, 0.001, 0.01, 0.1,0.5,1,1.5,2,5,10,20,50,100,500,1000)}
    for score_name, score_func in scores:
        print "# Tuning hyper-parameters for %s" % score_name
        print
    
        clf = GridSearchCV(classifier, tuned_parameters, score_func=score_func, cv=5)
        clf.fit(X_train, y_train)
    
        print "Best parameters set found on development set:"
        best_parameters, score,_ = max(clf.grid_scores_, key=lambda x: x[1])
        for param_name in sorted(tuned_parameters.keys()):
            print "%s: %r" % (param_name, best_parameters[param_name])
def learn(tuned_parameters,model):

	# produceFeature(trainfile)
	dataset = genfromtxt(open('Data/'+trainfile,'r'), delimiter=',',dtype='f8')[0:]
	target = [x[0] for x in dataset]
	train = [x[1:] for x in dataset]
	# print train[1:10]
	# print target
	# print len(train)

	# produceFeature(testfile)
	test = genfromtxt(open('Data/'+testfile,'r'),delimiter=',',dtype='f8')[0:]
	test_target = [x[1:] for x in test]


	# X, y = digits.data, digits.target
	trainnp = np.asarray(train)
	targetnp = np.asarray(target)


	# turn the data in a (samples, feature) matrix:
	X, y = trainnp, targetnp
	# X = digits.images.reshape((n_samples, -1))
	# y = digits.target

	# Split the dataset in two equal parts
	X_train, X_test, y_train, y_test = train_test_split(
	    X, y, test_size=0.5, random_state=0)



	scores = ['precision', 'recall']

	for score in scores:
	    print("# Tuning hyper-parameters for %s" % score)
	    print()

	    clf = GridSearchCV(model, tuned_parameters, cv=5,
	                       scoring='%s_weighted' % score)
	    clf.fit(X_train, y_train)

	    print("Best parameters set found on development set:")
	    print()
	    print(clf.best_params_)
	    print()
	    print("Grid scores on development set:")
	    print()
	    for params, mean_score, scores in clf.grid_scores_:
	        print("%0.3f (+/-%0.03f) for %r"
	              % (mean_score, scores.std() * 2, params))
	    print()

	    print("Detailed classification report:")
	    print()
	    print("The model is trained on the full development set.")
	    print("The scores are computed on the full evaluation set.")
	    print()
	    y_true, y_pred = y_test, clf.predict(X_test)
	    print(classification_report(y_true, y_pred))
	    print()
Example #6
def train_classifier(data, labels):
    
    nIter = 50
    alphaVals = [10**i for i in range(3,5)]
    params = { "loss": ["log"],
        "penalty": ['l1', 'l2'],
        "n_iter": [nIter],
        "alpha": alphaVals
    }
    params_log = { 
        "penalty": ['l2'] ,
        "C": [10**i for i in range(-3,-1)]
    }
    #sgd = SGDClassifier()
    sgd = LogisticRegression()
    clf = GridSearchCV(sgd, params_log)
    #data = data.tocsr()[:, 0:13]
    train, val, t_labs, val_labs = train_test_split(data,labels, train_size=.2, random_state=44)
    s = time.time()
    clf.fit(train, t_labs)
    print "Elapsed Training Time for ", len(params_log['C']), 'regularization vals: ', time.time() - s
    print clf.best_params_ 
    

    print "The Validation Score: ", clf.score(val, val_labs)
    probs =  clf.predict_proba(val)
    print "The log loss for the validation set is"
    print log_loss(val_labs, probs[:,1])
    return clf
Example #7
def test_ovo_gridsearch():
    ovo = OneVsOneClassifier(LinearSVC(random_state=0))
    Cs = [0.1, 0.5, 0.8]
    cv = GridSearchCV(ovo, {'estimator__C': Cs})
    cv.fit(iris.data, iris.target)
    best_C = cv.best_estimator_.estimators_[0].C
    assert_true(best_C in Cs)
Example #8
File: knn.py Project: kbai/uss
def Gridsearch_impl(X,Y,clf,param,cv):

    grid_search = GridSearchCV(clf,param,verbose=10,cv=cv,n_jobs=10)
    start = time()
    grid_search.fit(X,Y)
#    print(grid_search.grid_scores_)
    best = report(grid_search.grid_scores_)
Example #9
def getOptCandGamma(cv_train, cv_label):
    print "Finding optimal C and gamma for SVM with RBF Kernel"
    C_range = 10.0 ** np.arange(-2, 9)
    gamma_range = 10.0 ** np.arange(-5, 4)
    param_grid = dict(gamma=gamma_range, C=C_range)
    cv = StratifiedKFold(y=cv_label, n_folds=40)

    # Use the svm.SVC() as the cost function to evaluate parameter choices
    # NOTE: Perhaps we should run computations in parallel if needed. Does it
    # do that already within the class?
    grid = GridSearchCV(svm.SVC(), param_grid=param_grid, cv=cv)
    grid.fit(cv_train, cv_label)

    score_dict = grid.grid_scores_
    scores = [x[1] for x in score_dict]
    scores = np.array(scores).reshape(len(C_range), len(gamma_range))
    pl.figure(figsize=(8,6))
    pl.subplots_adjust(left=0.05, right=0.95, bottom=0.15, top=0.95)
    pl.imshow(scores, interpolation='nearest', cmap=pl.cm.spectral)
    pl.xlabel('gamma')
    pl.ylabel('C')
    pl.colorbar()
    pl.xticks(np.arange(len(gamma_range)), gamma_range, rotation=45)
    pl.yticks(np.arange(len(C_range)), C_range)
    pl.show()

    print "The best classifier is: ", grid.best_estimator_
Example #10
def run_gridsearch(X, y, clf, param_grid, cv=5):
    """Run a grid search for best Decision Tree parameters.

    Args
    ----
    X -- features
    y -- targets (classes)
    clf -- scikit-learn Decision Tree
    param_grid -- [dict] parameter settings to test
    cv -- fold of cross-validation, default 5

    Returns
    -------
    top_params -- [dict] from report()
    """
    grid_search = GridSearchCV(clf,
                               param_grid=param_grid,
                               cv=cv,scoring = 'recall')
    start = time()
    grid_search.fit(X, y)

    print(("\nGridSearchCV took {:.2f} "
           "seconds for {:d} candidate "
           "parameter settings.").format(time() - start,
                len(grid_search.grid_scores_)))

    top_params = report(grid_search.grid_scores_, 3)
    return top_params
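The report() helper used in this and the neighbouring examples is not defined in these snippets. A minimal sketch of what it might look like for the legacy grid_scores_ format (the name, signature and output are assumptions, modelled on the old scikit-learn search examples):

import numpy as np

def report(grid_scores, n_top=3):
    """Print the n_top best results from GridSearchCV.grid_scores_ and return the best parameters."""
    # grid_scores_ is a list of (parameters, mean_validation_score, cv_validation_scores) tuples
    top_scores = sorted(grid_scores, key=lambda x: x.mean_validation_score, reverse=True)[:n_top]
    for i, score in enumerate(top_scores):
        print("Model with rank: {0}".format(i + 1))
        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
            score.mean_validation_score, np.std(score.cv_validation_scores)))
        print("Parameters: {0}".format(score.parameters))
        print("")
    return top_scores[0].parameters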
Example #11
def model_search(estimator, tuned_params, scores, X_train, y_train, X_test, y_test):  
    
    cv = ShuffleSplit(len(X_train), n_iter=3, test_size=0.30, random_state=0)

    for score in scores:
        print"# Tuning hyper-parameters for %s" % score
        print

        clf = GridSearchCV(estimator, tuned_params, cv=cv,
                           scoring='%s' % score)
        clf.fit(X_train, y_train)

        print"Best parameters set found on development set:"
        print
        print clf.best_params_
        print
        print "Grid scores on development set:"
        print
        for params, mean_score, scores in clf.grid_scores_:
            print "%0.3f (+/-%0.03f) for %r" % (mean_score, scores.std() * 2, params)
        print

        print "Detailed classification report:"
        print
        print "The model is trained on the full development set."
        print "The scores are computed on the full evaluation set."
        print
        y_true, y_pred = y_test, clf.predict(X_test)
        print classification_report(y_true, y_pred)
        print
Example #12
def grid_search(dataset_loader_train, model, grid_search):
    with timer(logger.info, "Loading data"):
        X, y = dataset_loader_train()

    grid_search_kwargs = {
        'refit': False,
        }
    grid_search_kwargs.update(grid_search)

    cv = grid_search_kwargs.get('cv', None)
    if callable(cv):
        grid_search_kwargs['cv'] = apply_kwargs(cv, n=len(y), y=y)

    if not (hasattr(model, 'score') or 'scoring' in grid_search_kwargs):
        raise ValueError(
            "Your model doesn't seem to implement a 'score' method.  You may "
            "want to pass a 'scoring' argument to 'grid_search' instead."
            )

    with timer(logger.info, "Running grid search"):
        gs = GridSearchCV(model, **grid_search_kwargs)
        gs.fit(X, y)

    scores = sorted(gs.grid_scores_, key=lambda x: -x.mean_validation_score)
    logger.info("\n{}".format(pformat(scores)))
    return scores
Example #13
def dogridsearch(X,Y,param_space,clf,cv):
    grid_search = GridSearchCV(clf, param_space, verbose=10, cv=cv, n_jobs=-1)
    start = time()
    grid_search.fit(X,Y)
    print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
          % (time() - start, len(grid_search.grid_scores_)))
    best = report(grid_search.grid_scores_)
def estimateParameters(X_train, X_test, y_train, y_test):

    tuned_parameters = [{'kernel': ['rbf'], \
                         'gamma': [1e-3, 1e-4], \
                         'C': [1, 10, 100, 1000]}, \
                        {'kernel': ['linear'], \
                         'C': [1, 10, 100, 1000]}]

    scores = ['precision', 'recall']
    for score in scores:

        print("# Tuning hyper-parameters for %s\n" % score)

        clf = GridSearchCV(SVC(C=1), tuned_parameters, cv=5, scoring=score)
        clf.fit(X_train, y_train)

        print("Best parameters set found on development set:\n")
        print(clf.best_estimator_)

        print("\nGrid scores on development set:\n")
        for params, mean_score, scores in clf.grid_scores_:
            print("%.3f (+/-%.03f) for %r" % (mean_score, scores.std() / 2, params))

        print("\nDetailed classification report:")
        print("The model is trained on the full development set.")
        print("The scores are computed on the full evaluation set.")
        y_true, y_pred = y_test, clf.predict(X_test)
        print(classification_report(y_true, y_pred))
        print()
Example #15
def test_nntools_functional_grid_search(mnist, monkeypatch):
    # Make sure that we can satisfy the grid search interface.
    from nolearn.nntools import NeuralNet

    nn = NeuralNet(
        layers=[],
        X_tensor_type=T.matrix,
        )

    param_grid = {
        'more_params': [{'hidden_num_units': 100}, {'hidden_num_units': 200}],
        'update_momentum': [0.9, 0.98],
        }
    X, y = mnist

    vars_hist = []

    def fit(self, X, y):
        vars_hist.append(vars(self).copy())
        return self

    with patch.object(NeuralNet, 'fit', autospec=True) as mock_fit:
        mock_fit.side_effect = fit
        with patch('nolearn.nntools.NeuralNet.score') as score:
            score.return_value = 0.3
            gs = GridSearchCV(nn, param_grid, cv=2, refit=False, verbose=4)
            gs.fit(X, y)

    assert [entry['update_momentum'] for entry in vars_hist] == [
        0.9, 0.9, 0.98, 0.98] * 2
    assert [entry['more_params'] for entry in vars_hist] == (
        [{'hidden_num_units': 100}] * 4 +
        [{'hidden_num_units': 200}] * 4
        )
Example #16
def separable_demo():
    """ Generate a linearly-separable dataset D, train a linear SVM on
    D, then output the resulting decision boundary on a figure.
    """
    from sklearn.datasets import make_blobs
    X, y = make_blobs(n_samples=200, n_features=2, 
                      centers=((0,0), (4, 4)),
                      cluster_std=1.0)
    plot_data(X, y)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
    svc = svm.SVC(class_weight='auto')
    param_grid = {'kernel': ['linear'],
                  'C': [1e0, 1e1, 1e2, 1e3, 1e4]}
    strat_2fold = StratifiedKFold(y_train, k=2)
    print "    Parameters to be chosen through cross validation:"
    for name, vals in param_grid.iteritems():
        if name != 'kernel':
            print "        {0}: {1}".format(name, vals)
    clf = GridSearchCV(svc, param_grid, n_jobs=1, cv=strat_2fold)
    clf.fit(X_train, y_train)
    print "== Best Parameters:", clf.best_params_
    y_pred = clf.predict(X_test)
    acc = len(np.where(y_pred == y_test)[0]) / float(len(y_pred))
    print "== Accuracy:", acc
    print classification_report(y_test, y_pred)
    plot_svm(clf.best_estimator_, X, y, X_test, y_test, 
             title="SVM Decision Boundary, Linear Kernel ({0} accuracy, C={1})".format(acc, clf.best_params_['C']))
Example #17
    def score_nestedCV(self, G1, model, param_grid, effect, nested):
        k_fold = cross_validation.KFold(n=self.Y.shape[0], n_folds=self.n_folds, indices=True)
        i_fold=0
        scores = sp.zeros(self.n_folds)
        params = list()

        for train, test in k_fold:
            (trainData, trainY) = self._packData(G1, train, effect)
            (testData, testY) = self._packData(G1, test, effect)

            if nested:
                clf = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs = self.n_jobs_grid,
                                   cv=self.n_folds_params, scoring=self.scoring, verbose=self.verbose)

                clf.fit(trainData, trainY.flatten())

                params.append(clf.best_params_)

                scores[i_fold] = clf.score(testData, testY.flatten(), method_scorer=False)
            else:

                model.fit(trainData, trainY.flatten())
                scores[i_fold] = SCORERS[self.scoring](model, testData, testY.flatten())
            i_fold+=1

        return scores,params
Example #18
def classification_level_SGDReg_pipeline(classifications_DF):
   X = classifications_DF.iloc[:,3:89]
   #assign the target (session length) to y and convert to int
   y_actual = classifications_DF.iloc[:,2:3].astype(float)

   #scaling the data for feature selection
   X_scaled = preprocessing.scale(X)

   X_scaled_train, X_scaled_test, y_actual_train, y_actual_test = train_test_split(X_scaled, y_actual, test_size=0.5, random_state=0)

   pca_selection = PCA(n_components=2)

   X_features = pca_selection.fit(X_scaled_train).transform(X_scaled_train)

   SGDReg = SGDRegressor(alpha=0.0001)

   # Do grid search over k, n_components and SVR parameters:
   pipeline = Pipeline([('pca', pca_selection),('SGDReg',SGDReg)])

   tuned_params = dict(pca__n_components=[5,30,40,50],
                     SGDReg__alpha=[0.1,0.01,0.001,0.0001,0.00001],
                     SGDReg__l1_ratio=[.05, .15, .5, .7, .9, .95, .99, 1],
                     SGDReg__penalty=['l2','l1','elasticnet'])

   grid_search = GridSearchCV(pipeline, param_grid=tuned_params,scoring='mean_squared_error',cv=3,verbose=10)
   grid_search.fit(X_scaled_train, y_actual_train['session_length'].values)
   print(grid_search.best_estimator_)
   y_true, y_pred = y_actual_test['session_length'].values,grid_search.best_estimator_.predict(X_scaled_test)
   print "Mean squared error:"+str(mean_squared_error(y_true,y_pred))
   pd.DataFrame(y_true, y_pred).to_csv("SGDReg_pred_true.csv")
def make_grid_search(pipeline, parameters, model_name, params):
    print model_name
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=4, verbose=3,
                               #loss_func=f1_score,
                               scoring="f1",
                               iid=False,
                               refit=True)
    #model_name = "ExtraTree_min_sample2_10trees_gridcv_desc_log"

    print("Performing grid search...")
    print("pipeline:", pipeline) # [name for name, _ in pipeline.steps])
    print("parameters:")
    pprint(parameters)
    t0 = time()
    grid_search.fit(features, salaries_enc)
    print("done in %0.3fs" % (time() - t0))
    print()

    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_params_
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

    best_estimator = pipeline.set_params(**best_parameters)
    params = params + " ", grid_search.cv_scores_
    dio.save_model(best_estimator, model_name, mae_cv=grid_search.best_score_, parameters=params)
    print grid_search.cv_scores_
    prediction = grid_search.predict(validation_features)
    dio.save_prediction(model_name, prediction, "valid_classes")
Example #20
def MyGridSearch(X, y):
    kfold = cross_validation.KFold(len(X), 5)
    for train, test in kfold:
        #X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.5, random_state=0)
        #parameters = {'kernel': ('linear', 'rbf'), 'C': [1.5, 10]}
        #parameters = {'kernel': ['rbf'], 'gamma': [1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7, 1e-8, 1e-9], 'epsilon': [0.1],
        #              'C': [1, 5, 10, 50, 100, 500, 1000, 5000, 10000]}
        #parameters = {'kernel': ['poly'], 'gamma': [1e-2, 1e-3, 1e-4], 'epsilon': [0.1], 'degree': [3],
        #              'C': [50, 100, 500, 1000]}
        parameters = {'kernel': ['rbf'], 'gamma': [1e-5], 'epsilon': [0.2],
                      'C': [100000]}
        #parameters = [{'C': sp.stats.expon(scale=100), 'gamma': sp.stats.expon(scale=.1),
        #               'kernel': ['rbf'], 'class_weight': ['auto', None]}]
        model = svm.SVR()

        grid = GridSearchCV(model, parameters)
        #grid = RandomizedSearchCV(model, parameters)
        grid.fit(X[train], y[train])
        #print grid
        predictions = grid.predict(X[test])
        print grid.best_score_
        if grid.best_score_ > 0.98:
            return grid
    #print grid.best_estimator_.coef_
    return grid
Example #21
def classification_level_RandForest_pipeline(classifications_DF):
   X = classifications_DF.iloc[:,3:89]
   #assign the target (session length) to y and convert to int
   y_actual = classifications_DF.iloc[:,2:3].astype(float)

   #scaling the data for feature selection
   X_scaled = preprocessing.scale(X)

   X_scaled_train, X_scaled_test, y_actual_train, y_actual_test = train_test_split(X_scaled, y_actual, test_size=0.3, random_state=0)

   # Maybe some original features were good, too?
   selectKbest = SelectKBest(k=1,score_func=f_regression)

   # Build estimator from PCA and Univariate selection:
   X_features = selectKbest.fit(X_scaled_train,y_actual_train).transform(X_scaled_train)
   
   randomForestReg = RandomForestRegressor(n_estimators=1, criterion='mse')

   # Do grid search over k, n_components and SVR parameters:
   pipeline = Pipeline([('selectKbest', selectKbest),('randomForestReg',randomForestReg)])

   tuned_params = dict(selectKbest__k=[5,10,20,30,40,50,80],
                       randomForestReg__n_estimators=[1,2,4,8,16,32,64],
                       randomForestReg__min_samples_split=[2,3,5,10,20])

   grid_search = GridSearchCV(pipeline, param_grid=tuned_params,scoring='mean_squared_error',cv=3,verbose=10)
   grid_search.fit(X_scaled_train, y_actual_train['session_length'].values)
   print(grid_search.best_estimator_)
   y_true, y_pred = y_actual_test['session_length'].values,grid_search.best_estimator_.predict(X_scaled_test)
   print "Mean squared error:"+str(mean_squared_error(y_true,y_pred))
   pd.DataFrame(y_true, y_pred).to_csv("randomForestReg_pred_true.csv")
Example #22
def run_support_vector_regressor(
    training_features, training_labels, test_features, test_labels, passed_parameters=None
):

    estimator = svm.SVR()

    # set up parameters for the classifier
    if passed_parameters == None:
        parameters = {"kernel": ["linear"]}
    else:
        parameters = passed_parameters

    # create cross validation iterator
    cv = ShuffleSplit(training_features.shape[0], n_iter=5, test_size=0.2, random_state=0)

    # set up tuning algorithm
    regressor = GridSearchCV(estimator=estimator, cv=cv, param_grid=parameters)

    # fit the classifier
    regressor.fit(training_features, training_labels)

    test_prediction = regressor.predict(test_features)
    test_accuracy = regressor.score(test_features, test_labels)

    time_2 = time.time()

    return test_prediction, test_accuracy
def fit_predict_model(city_data):
    """Find and tune the optimal model. Make a prediction on housing data."""

    # Get the features and labels from the Boston housing data
    X, y = city_data.data, city_data.target

    # Setup a Decision Tree Regressor
    regressor = DecisionTreeRegressor()

    parameters = {'max_depth':(1,2,3,4,5,6,7,8,9,10)}

    ###################################
    ### Step 4. YOUR CODE GOES HERE ###
    ###################################

    # 1. Find the best performance metric
    # should be the same as your performance_metric procedure
    # http://scikit-learn.org/stable/modules/generated/sklearn.metrics.make_scorer.html
    dtr_scorer = make_scorer(mean_squared_error, greater_is_better=False)

    # 2. Use grid search to fine-tune the Decision Tree Regressor and find the best model
    # http://scikit-learn.org/stable/modules/generated/sklearn.grid_search.GridSearchCV.html#sklearn.grid_search.GridSearchCV
    reg = GridSearchCV(regressor, parameters, scoring=dtr_scorer, cv=6)

    # Fit the learner to the training data
    print "Final Model: "
    print reg.fit(X, y)
    print "Best estimator choosen by GridSearchCV: ", reg.best_estimator_
    
    # Use the model to predict the output of a particular sample
    x = [11.95, 0.00, 18.100, 0, 0.6590, 5.6090, 90.00, 1.385, 24, 680.0, 20.20, 332.09, 12.13]
    y = reg.predict(x)
    print "House: " + str(x)
    print "Prediction: " + str(y)
Example #24
    def run_linear_experiment(self, rocs_filename, iterations=10):
        """
        Run a classification experiment by running several iterations.
        In each iteration data is randomized, a linear svm classifier
        is trained and evaluated using cross-validation over the
        cost parameter in the range np.logspace(-3, 3, 7). The best
        classifier is used for testing and a ROC curve is computed
        and saved as property and locally.

        :param rocs_filename: the file to save all rocs computed
        :param iterations: number of runs (training/testing)
        """
        for i in xrange(iterations):
            print "[*] Iteration {0}".format(i)
            print "[*] Randomizing dataset..."
            self.randomize_dataset()
            clf = GridSearchCV(svm.LinearSVC(), {'C': np.logspace(-3, 3, 7)})
            print "[*] Training..."
            clf.fit(self.X_train, self.Y_train)
            out = clf.best_estimator_.decision_function(self.X_test)
            print "[*] Testing..."
            roc = eval.compute_roc(np.float32(out.flatten()),
                                   np.float32(self.Y_test))
            self.rocs.append(roc)
            print "[*] ROC saved."
        pz.save(self.rocs, rocs_filename)
Example #25
    def run_linear_open_experiment(self, iterations=10, save=False):
        """
        Train a classifier on test data, obtain the best combination of
        parameters through a grid search cross-validation and test the
        classifier using a open-world split of the dataset. The results
        from the number of iterations are saved as pz files.

        :param iterations: number of runs (training/testing)
        :param save: save predictions and labels if True
        """
        self.true_labels = np.array([])
        self.predictions = np.array([])
        for i in xrange(iterations):
            self.randomize_dataset_open_world()
            clf = GridSearchCV(svm.LinearSVC(), {'C': np.logspace(-3, 3, 7)})
            clf.fit(self.X_train, self.Y_train)
            out = clf.best_estimator_.decision_function(self.X_test)
            classes = clf.best_estimator_.classes_
            for scores in out:
                m = np.max(scores)
                if (abs(m/scores[:][:]) < 0.5).any():
                    self.predictions = np.append(self.predictions, 99)
                else:
                    p = classes[np.where(scores==m)]
                    self.predictions = np.append(self.predictions, p)
            self.true_labels = np.append(self.true_labels, self.Y_test)

        if save:
            pz.save(self.predictions, "mca_predictions_open.pz")
            pz.save(self.true_labels, "mca_true_labels_open.pz")
Example #26
def test_krr_regP():
    
    dim = 5
    n = 1000
    ntest = 1001    

    pref = np.random.random(size=dim) - 0.5

    #pref /= np.sqrt(pref.dot(pref))

    Xtrain = np.random.random((n, dim)) + 1.0
    ytrain = Xtrain.dot(pref) + np.random.normal(scale=0.05, size=n) + 10.0

    Xtest = np.random.random((ntest, dim)) + 1.0
    yref = Xtest.dot(pref) + 10.0

    krr = kRidgeRegression(kernel=Linear(), eta=1.0)

    gs = GridSearchCV(krr, {'eta' : [0, 1E-16, 1E-14, 1E-12, 1E-10, 1E-8, 1E-6, 1E-4, 1E-2, 1]})

    gs.fit(Xtrain, ytrain)

    krr = gs.best_estimator_

    ytest = krr.transform(Xtest).flatten()
    print krr.beta.shape
    print krr.Ku.shape

    print krr.score(Xtest, yref)
Example #27
def grid_search_model(clf_factory, X, Y,save_file="read/best_param.txt"):
    u"""最適なパラメータを調べる
        Args:
            clf_factory:機械学習モデル
            X:特徴量
            Y:ラベル
        Returns:
            clf:最も良かったモデル
    """
    stopwords=load_stopwords_old()
    cv = ShuffleSplit(
        n=len(X), n_iter=10, test_size=0.3, random_state=0)
    param_grid = dict(vect__ngram_range=[(1, 1), (1, 2), (1, 3)],
                      vect__min_df=[1, 2],
                      vect__stop_words=[None, stopwords],
                      vect__smooth_idf=[False, True],
                      vect__use_idf=[False, True],
                      vect__sublinear_tf=[False, True],
                      vect__binary=[False, True],
                      )
    grid_search = GridSearchCV(clf_factory(),
                              param_grid=param_grid,
                              cv=cv,
                              verbose=10)
    grid_search.fit(X, Y)
    clf = grid_search.best_estimator_
    write_to_text(grid_search.best_params_,save_file)

    return clf
Example #28
def run_random_forest(training_features, training_labels, test_features, test_labels, passed_parameters=None):

    estimator = ensemble.RandomForestRegressor(random_state=0, n_estimators=25)

    # set up parameters for the classifier
    if passed_parameters == None:
        parameters = {"max_depth": None}
    else:
        parameters = passed_parameters

    # create cross validation iterator
    cv = ShuffleSplit(training_features.shape[0], n_iter=5, test_size=0.2, random_state=0)

    # set up tuning algorithm
    regressor = GridSearchCV(estimator=estimator, cv=cv, param_grid=parameters)

    # fit the classifier
    regressor.fit(training_features, training_labels)

    test_prediction = regressor.predict(test_features)
    test_accuracy = regressor.score(test_features, test_labels)

    time_2 = time.time()

    return test_prediction, test_accuracy
Example #29
def test_krr_regbeta():
    
    dim = 5
    n = 1000
    ntest = 1001    

    pref = np.random.random(size=dim) - 0.5

    #pref /= np.sqrt(pref.dot(pref))

    Xtrain = np.random.random((n, dim)) 
    ytrain = Xtrain.dot(pref) + np.random.normal(scale=0.05, size=n) + 10.0

    Xtest = np.random.random((ntest, dim)) 
    yref = Xtest.dot(pref) + 10.0

    krr = kRidgeRegression(kernel=Linear(), eta=1.0, regularize_beta=True)
    gs = GridSearchCV(krr, {'eta' : [1E-6, 1E-4, 1E-2, 1, 1E2, 1E4, 1E6]})
    gs.fit(Xtrain, ytrain)

    krr = gs.best_estimator_

    ytest = krr.transform(Xtest).flatten()

    print krr.score(Xtest, yref)
Example #30
def grid_search(X, y):
    '''
    cross validated grid search using Ridge Regressor and Random
    Forest Regressor
    '''

    nids = df_subset.index
    titles = df_subset['title']

    pars = {'alpha': [0.8, 0.6, 0.5, 0.45, 0.4, 0.2, 0.1,
                      0.08, 0.07, 0.06, 0.05, 0.04, 0.03, 0.02]}

    gs = GridSearchCV(Ridge(), pars, cv=5)
    gs.fit(X, y)

    ridge = gs.best_estimator_
    dill.dump(ridge, open('ridge.pkl', 'wb'))

    pars = {'max_depth': [5, 8, 10, 20, 50, 100],
            'min_samples_split': [2, 3, 5, 10, 20]}

    gs = GridSearchCV(RFR(n_estimators=100, random_state=42, n_jobs=2),
                      pars, cv=5)
    gs.fit(X, y)
    rfr = gs.best_estimator_
    dill.dump(rfr, open('rfr.pkl', 'wb'))
    return ridge, rfr
Example #31
    def use_pipeline_with_fs(self):

        #####################
        #Build a vectorizer / classifier pipeline that filters out tokens that are too rare or too frequent
        #####################

        pipeline = Pipeline([
            ('vect',
             TfidfVectorizer(stop_words=stopwords, min_df=3, max_df=0.90)),
            ("selector", SelectPercentile()),
            ('clf', RandomForestClassifier()),
        ])

        # Build a grid search to find the best parameter
        # Fit the pipeline on the training set using grid search for the parameters
        parameters = {
            'vect__ngram_range': [(1, 1), (1, 2), (1, 3)],
            'vect__use_idf': (True, False),
            'clf__n_estimators': (10, 50, 100),
            'clf__criterion': ("gini", "entropy"),
            'clf__max_depth': (None, 2, 4),
            'clf__min_samples_split': (2, 4, 6),
            'selector__score_func': (chi2, f_classif),
            'selector__percentile': (85, 95, 100),
        }

        #################
        # Exhaustive search over specified parameter values for an estimator, use cv to generate data to be used
        # implements the usual estimator API: when “fitting” it on a dataset all the possible combinations of parameter values are evaluated and the best combination is retained.
        #################

        cv = StratifiedShuffleSplit(y_train,
                                    n_iter=5,
                                    test_size=0.2,
                                    random_state=42)
        grid_search = GridSearchCV(pipeline,
                                   param_grid=parameters,
                                   cv=cv,
                                   n_jobs=-1)
        clf_gs = grid_search.fit(docs_train, y_train)

        ###############
        # print the cross-validated scores for the each parameters set explored by the grid search
        ###############

        best_parameters, score, _ = max(clf_gs.grid_scores_,
                                        key=lambda x: x[1])
        for param_name in sorted(parameters.keys()):
            print("%s: %r" % (param_name, best_parameters[param_name]))

        print("Score for gridsearch is %0.2f" % score)

        #y_predicted = clf_gs.predict(docs_test)

        ###############
        # run the classifier again with the best parameters
        # in order to get 'clf' for get_important_feature function!
        ###############

        ngram_range = best_parameters['vect__ngram_range']
        use_idf = best_parameters['vect__use_idf']
        score_func = best_parameters['selector__score_func']
        percentile = best_parameters['selector__percentile']

        # vectorisation

        count_vect = CountVectorizer(stop_words=stopwords,
                                     min_df=3,
                                     max_df=0.90,
                                     ngram_range=ngram_range)
        X_CV = count_vect.fit_transform(docs_train)

        # print number of unique words (n_features)
        print("Shape of train data is " + str(X_CV.shape))

        # tfidf transformation

        tfidf_transformer = TfidfTransformer(use_idf=use_idf)
        X_tfidf = tfidf_transformer.fit_transform(X_CV)

        #################
        # feature selection
        #################

        selector = SelectPercentile(score_func=score_func,
                                    percentile=percentile)

        combined_features = Pipeline([("vect", count_vect),
                                      ("tfidf", tfidf_transformer),
                                      ("feat_select", selector)])

        X_features = combined_features.fit_transform(docs_train, y_train)
        X_test_features = combined_features.transform(docs_test)

        print("Shape of train data after feature selection is " +
              str(X_features.shape))
        print("Shape of test data after feature selection is " +
              str(X_test_features.shape))

        # run classifier on selected features

        clf = RandomForestClassifier().fit(X_features, y_train)

        # get the features which are selected and write to file

        feature_boolean = selector.get_support(indices=False)

        f = open(path_to_store_feature_selection_boolean_file, 'w')

        for fb in feature_boolean:
            f.write(str(fb) + '\n')

        f.close()

        ##################
        # get cross validation score
        ##################

        scores = cross_val_score(clf,
                                 X_features,
                                 y_train,
                                 cv=10,
                                 scoring='f1_weighted')
        print("Cross validation score: " + str(scores))

        # Get average performance of classifier on training data using 10-fold CV, along with standard deviation

        print("Cross validation accuracy: %0.2f (+/- %0.2f)" %
              (scores.mean(), scores.std() * 2))

        #################
        # run classifier on test data
        #################

        y_predicted = clf.predict(X_test_features)

        # print the mean accuracy on the given test data and labels

        print("Classifier score on test data is: %0.2f " %
              clf.score(X_test_features, y_test))

        # Print and plot the confusion matrix

        print(metrics.classification_report(y_test, y_predicted))
        cm = metrics.confusion_matrix(y_test, y_predicted)
        print(cm)

        # import matplotlib.pyplot as plt
        # plt.matshow(cm)
        # plt.show()

        return clf, count_vect
''' IX. Model tuning
    3. Tune max_depth, min_child_samples and min_split_gain to settle the overall model structure '''

param_test1 = {
    'max_depth': list(range(3, 7)),
    'min_child_samples': [1, 3, 5, 10],
    'min_split_gain': [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.7, 1, 2],
}

gsearch1 = GridSearchCV(lgbm_model,
                        param_grid=param_test1,
                        scoring='roc_auc',
                        cv=5)

starttime = datetime.datetime.now()
gsearch1.fit(X.loc[:, chosen_final_feature], Y)
endtime = datetime.datetime.now()
print('First gridsearch took {0} seconds'.format((endtime - starttime).seconds))

gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_
'''You can read off the parameter setting with the highest AUC directly, or pick a combination that performs well and is stable based on observation and experience'''
#lgbm_model.set_params(max_depth=gsearch1.best_params_['max_depth'])
#lgbm_model.set_params(min_child_samples=gsearch1.best_params_['min_child_samples'])
#lgbm_model.set_params(min_split_gain=gsearch1.best_params_['min_split_gain'])

lgbm_model.set_params(max_depth=5)
lgbm_model.set_params(min_child_samples=1)
lgbm_model.set_params(min_split_gain=0.3)
'''After the parameters are chosen, re-calibrate best_n_estimators'''
lgbm_model.set_params(n_estimators=500)
lgbm_param_temp = lgbm_model.get_params()
Example #33
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info:
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

# Example starting point. Try investigating other evaluation techniques!

feature_train, feature_test, labels_train, labels_test = \
 train_test_split( features, labels, test_size=0.3, random_state=42)

# from sklearn.naive_bayes import GaussianNB
#Naive bayes
gnb_clf = GaussianNB()
parameters = {}
algo = GridSearchCV(gnb_clf, parameters)
print '\nGaussianNB:'
algo.fit(feature_train, labels_train)
test_classifier(algo.best_estimator_, my_dataset, features_list)

# Testing the response of classifier without using 'person_to_poi_rate' feature in our features_list
print "\n Testing of classifer by removing 'person_to_poi_rate' feature from our features_list"
test_classifier(algo.best_estimator_, my_dataset, ['poi', \
    'exercised_stock_options', 'total_stock_value', \
    'bonus', 'salary', 'total'] )
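The header comment mentions stratified shuffle split cross-validation; a hedged sketch of plugging it straight into the grid search instead of a single train/test split (the names follow this snippet, and the old sklearn.cross_validation API is assumed):

from sklearn.cross_validation import StratifiedShuffleSplit

# evaluate every candidate on repeated stratified random splits rather than one hold-out set
sss = StratifiedShuffleSplit(labels_train, n_iter=100, test_size=0.3, random_state=42)
algo = GridSearchCV(gnb_clf, parameters, cv=sss)
algo.fit(feature_train, labels_train)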

#Decision Tree
print '\nDecision Tree:'
dt_clf = tree.DecisionTreeClassifier()
parameters = {'criterion': ['gini', 'entropy'], \
   'min_samples_split': [2, 5, 10, 20], \
   'max_depth': [None, 2, 5, 10], \
   'splitter': ['random', 'best'], \
Example #34
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)
print "done in %0.3fs" % (time() - t0)

###############################################################################
# Train a SVM classification model

print "Fitting the classifier to the training set"
t0 = time()
param_grid = {
    'C': [1e3, 5e3, 1e4, 5e4, 1e5],
    'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
}
# for sklearn version 0.16 or prior, the class_weight parameter value is 'auto'
clf = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid)
clf = clf.fit(X_train_pca, y_train)
print "done in %0.3fs" % (time() - t0)
print "Best estimator found by grid search:"
print clf.best_estimator_

###############################################################################
# Quantitative evaluation of the model quality on the test set

print "Predicting the people names on the testing set"
t0 = time()
y_pred = clf.predict(X_test_pca)
print "done in %0.3fs" % (time() - t0)

print classification_report(y_test, y_pred, target_names=target_names)
print confusion_matrix(y_test, y_pred, labels=range(n_classes))
    def grid_search(self, X_train, y_train, param_grid, eval_func, seed=42):

        gsearch = GridSearchCV(self.model, param_grid,verbose=10,cv=10)
        gsearch.fit(X_train,y_train)

        print(gsearch.best_params_)
Example #36
print "样本数据量:%d, 特征个数:%d" % x.shape
print "target样本数据量:%d" % y.shape[0]

x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=28)

parameters = {
    'kernel': ['rbf', 'linear'],
    'C': [0.1, 0.5],
    'gamma': [0.0001, 0.0005]
}
model = GridSearchCV(SVR(), param_grid=parameters, cv=3)
model.fit(x_train, y_train)

print "最优参数列表:", model.best_params_
print "最优模型:", model.best_estimator_
print "最优准确率:", model.best_score_

print "训练集准确率:%.2f%%" % (model.score(x_train, y_train) * 100)
print "测试集准确率:%.2f%%" % (model.score(x_test, y_test) * 100)

## Plot the results
colors = ['g-', 'b-']
ln_x_test = range(len(x_test))
y_predict = model.predict(x_test)

plt.figure(figsize=(16, 8), facecolor='w')
plt.plot(ln_x_test, y_test, 'r-', lw=2, label=u'true values')
    "pca__n_components": [0.25, 0.5, 0.75, 1],
    "decision__max_depth": np.linspace(1, 20, 20).astype(np.int8)
}, {
    "decision__max_depth": np.linspace(1, 20, 20).astype(np.int8)
}, {
    "decision__max_depth": np.linspace(1, 20, 20).astype(np.int8)
}]
# Get the data
x_train2, x_test2, y_train2, y_test2 = x_train1, x_test1, y_train1, y_test1

for t in range(3):
    pipe = pipes[t]

    gscv = GridSearchCV(pipe, param_grid=parameters[t])

    gscv.fit(x_train2, y_train2)

    print(t, "score值:", gscv.best_score_, "最优参数列表:", gscv.best_params_)

# Check the accuracy using the best parameters
mms_best = MinMaxScaler()
decision3 = DecisionTreeRegressor(criterion='mse', max_depth=4)

x_train3, x_test3, y_train3, y_test3 = x_train1, x_test1, y_train1, y_test1
x_train3 = mms_best.fit_transform(x_train3, y_train3)
x_test3 = mms_best.transform(x_test3)
decision3.fit(x_train3, y_train3)

print("正确率:", decision3.score(x_test3, y_test3))

# Inspect the error rate for each tree depth
Example #38
train_predict(clf_C,X_train_100,y_train_100, X_test,y_test)
train_predict(clf_C,X_train_200,y_train_200, X_test,y_test)
train_predict(clf_C,X_train_300,y_train_300, X_test,y_test)

# AdaBoost Model tuning 
# Create the parameters list you wish to tune
parameters = {'n_estimators':[20,30,40,50,60,70]}

# initialize the classifier
clf = clf_A

# Make an f1 scoring function using 'make_scorer'
f1_scorer = make_scorer(f1_score, pos_label = 'yes')

# Perform grid search on the classifier using the f1_scorer as the  scoring method
grid_obj = GridSearchCV(clf, param_grid = parameters, scoring = f1_scorer)

# Fit the grid search object to the training data and find the optimal parameters
grid_obj.fit(X_train,y_train)

# Get the best tuned estimator
clf = grid_obj.best_estimator_

# Report the final F1 score for training and testing after parameter tuning
print
print "Tuned model has a training F1 score of {:.4f}.".format(predict_labels(clf,X_train,y_train))
print "Tuned model has a testing F1 score of {:.4f}.".format(predict_labels(clf,X_test,y_test))
print "Tuned model has an optimal parameter: ", grid_obj.best_params_
print "Features importances array is :", clf.feature_importances_
print "Key Features for identifying 'Pass/Fail' are:", X_all.columns[clf.feature_importances_>0.1]
Example #39
    X = d[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Spouse']].values

    # check CV score for max depth = 3
    ctree = tree.DecisionTreeClassifier(max_depth=3)
    np.mean(cross_val_score(ctree, X, y, cv=5, scoring='roc_auc'))

    # check CV score for max depth = 10
    ctree = tree.DecisionTreeClassifier(max_depth=10)
    np.mean(cross_val_score(ctree, X, y, cv=5, scoring='roc_auc'))

    # Conduct a grid search for the best tree depth
    ctree = tree.DecisionTreeClassifier(random_state=1, min_samples_leaf=20)
    depth_range = range(1, 20)
    param_grid = dict(max_depth=depth_range)
    grid = GridSearchCV(ctree, param_grid, cv=5, scoring='roc_auc')
    grid.fit(X, y)

    # Check out the scores of the grid search
    grid_mean_scores = [result[1] for result in grid.grid_scores_]

    print(grid_mean_scores)

    # Plot the results of the grid search
    plt.figure()
    plt.plot(depth_range, grid_mean_scores)
    plt.hold(True)
    plt.grid(True)
    plt.plot(grid.best_params_['max_depth'], grid.best_score_, 'ro', markersize=12, markeredgewidth=1.5,
             markerfacecolor='None', markeredgecolor='r')

    # Get the best estimator
Example #40
    spm = SPMFeature(patch_file=patches, method=method, all_x=all_x, img_size=600)
    svm = SVC(kernel='linear', probability = True,random_state=42)
    clf = Pipeline([('spm', spm),('svm',svm)])
    
    params = {
            "svm__C": [0.01, 1, 100],
            "spm__clusters": [256, 512, 1024]
            }

    
    print "SEARCHING SPM+SVM"
    # perform a grid search over the parameter
    # Without a score method there is no way to search the parameters
    start = time.time()
    gs = GridSearchCV(clf, params, cv=2, n_jobs = -1, verbose = 1)
    gs.fit(x_train, y_train)
 
    # print diagnostic information to the user and grab the
    # best model
    print "\ndone in %0.3fs" % (time.time() - start)
    print "best score: %0.3f" % (gs.best_score_)
    print "SPM + SVM PARAMETERS"
    bestParams = gs.best_estimator_.get_params()

    # loop over the parameters and print each of them out
    # so they can be manually set
    for p in sorted(params.keys()):
        print "\t %s: %f" % (p, bestParams[p])
        
    best = gs.best_estimator_
    
Example #41
if run_gs:
    parameter_grid = {
        'max_depth': [4, 6, 8],
        'n_estimators': [50, 10],
        'max_features': ['sqrt', 'auto', 'log2'],
        'min_samples_split': [1.0, 3, 10],
        'min_samples_leaf': [1, 3, 10],
        'bootstrap': [True, False]
    }

    forest = RandomForestClassifier()
    cross_validation = StratifiedKFold(targets, n_folds=5)

    grid_search = GridSearchCV(forest, scoring='accuracy', param_grid=parameter_grid, cv=cross_validation)

    grid_search.fit(train, targets)
    model = grid_search
    parameters = grid_search.best_params_

    print('Best Score: {}'.format(grid_search.best_score_))
    print('Best Parameters: {}'.format(grid_search.best_params_))
else:
    parameters = {'bootstrap': False, 'min_samples_leaf': 3, 'n_estimators': 50,
                  'min_samples_split': 10, 'max_features': 'sqrt', 'max_depth': 6}

    model = RandomForestClassifier(**parameters)
    model.fit(train, targets)


print compute_score(model, train, targets, scoring='accuracy')
    un_train_data, un_test_data, un_train_labels, un_test_labels = train_test_split(UnProc_Data, All_labs,
                                                                                    test_size=0.4, random_state=0)

    # Create feature vectors
    vectorizer = TfidfVectorizer()
    train_vectors = vectorizer.fit_transform(train_data)  # Processed feature vectors
    test_vectors = vectorizer.transform(test_data)
    un_train_vectors = vectorizer.fit_transform(un_train_data)  # Unprocessed feature vectors
    un_test_vectors = vectorizer.transform(un_test_data)

    # Perform classification with Optimized SVM
    classifier_rbf = GridSearchCV(svm.SVC(), tuned_parameters, cv=cv,
                                  scoring=make_scorer(f1_score, pos_label='pos', average='weighted'), n_jobs=7)

    t0 = time.time()
    classifier_rbf.fit(train_vectors, train_labels)
    t1 = time.time()
    prediction_rbf = classifier_rbf.predict(test_vectors)
    t2 = time.time()
    time_rbf_train = t1 - t0
    time_rbf_predict = t2 - t1

    # Print results in a nice table
    print("Results for Optimized SVM - processed data")
    print("Training time: %fs; Prediction time: %fs" % (time_rbf_train, time_rbf_predict))
    print(classification_report(test_labels, prediction_rbf))
    print
    print sklearn.metrics.confusion_matrix(test_labels, prediction_rbf)

    t0 = time.time()
    classifier_rbf.fit(un_train_vectors, un_train_labels)
Example #43
y_predprob = gbm0.predict_proba(X)[:,1]
print ("Accuracy : %.4g" % metrics.accuracy_score(y.values, y_pred))
print ("AUC Score (Train): %f" % metrics.roc_auc_score(y, y_predprob))
# The fit is acceptable; next we improve the model's generalization ability by tuning parameters


####### (1) Learning rate and number of iterations (n_estimators)
####### Generally, start by choosing a relatively small learning rate and search for the best number of iterations.
####### Here we set the initial learning rate to 0.1 and search for the optimal number of iterations
##################
param_test1 = {'n_estimators':list(range(20,81,10))}
# In Python 3, range() returns an iterator object, so list() is needed to convert it to a list
gsearch1 = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.1, min_samples_split=300,
                                                               min_samples_leaf=20,max_depth=8,max_features='sqrt', subsample=0.8,random_state=10),
                        param_grid = param_test1, scoring='roc_auc',iid=False,cv=5)
gsearch1.fit(X,y)
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_
# The output shows that the best number of iterations is 60


####### (2) Grid-search the maximum tree depth max_depth and min_samples_split (the minimum number of samples required to split an internal node), adding n_estimators=60 to the parameter set
##################
param_test2 = {'max_depth':list(range(3,14,2)), 'min_samples_split':list(range(100,801,200))}
gsearch2 = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.1, n_estimators=60, min_samples_leaf=20,
                                                               max_features='sqrt', subsample=0.8, random_state=10),
                        param_grid = param_test2, scoring='roc_auc',iid=False, cv=5)
gsearch2.fit(X,y)
gsearch2.grid_scores_, gsearch2.best_params_, gsearch2.best_score_
# The output shows that the best maximum tree depth is 7 and the best min_samples_split is 300
# A tree depth of 7 is a fairly reasonable value, so we can fix it; min_samples_split, however, cannot be fixed yet, because it still interacts with the other tree parameters.
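Since min_samples_split interacts with the other tree parameters, a natural next step is to tune it together with min_samples_leaf while max_depth=7 stays fixed. A hedged sketch of what that third search could look like (the ranges below are illustrative assumptions, not values from the original run):

param_test3 = {'min_samples_split': list(range(800, 1900, 200)),
               'min_samples_leaf': list(range(60, 101, 10))}
gsearch3 = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.1, n_estimators=60, max_depth=7,
                                                               max_features='sqrt', subsample=0.8, random_state=10),
                        param_grid = param_test3, scoring='roc_auc', iid=False, cv=5)
gsearch3.fit(X, y)
gsearch3.grid_scores_, gsearch3.best_params_, gsearch3.best_score_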
Example #44
    params = {
        #'PCA__n_components': [2],
        'SKB__k': [5, 6, 7, 8, 9, 10, 11, 12],
        'SKB__score_func': [f_classif]
    }
    params.update(clf_step_params)

    sss = StratifiedShuffleSplit(labels_train,
                                 n_iter=20,
                                 test_size=0.5,
                                 random_state=0)

    gscv = GridSearchCV(pipe, params, verbose=0, scoring='f1_weighted', cv=sss)

    gscv.fit(features_train, labels_train)

    pred = gscv.predict(features_test)

    clf = gscv.best_estimator_

    # Get the selected features
#    pipe.fit(features_train, labels_train)
#    selected_features = gscv.best_estimator_.named_steps['SKB'].get_support(indices=True)
#    feature_scores = gscv.best_estimator_.named_steps['SKB'].scores_
#    sfs = []
#    for sf in selected_features:
#        sfs.append((features_list[sf + 1], feature_scores[sf]))
#    print len(sfs), "best parameters with scores:"
#    for f, s in sfs: print f, "{0:.3f}".format(s)
#
Example #45
# Split dataset in train, test
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

# Tuning hyperparameters for logistic regression
pipe_logistic = Pipeline([('scl', StandardScaler()),
                          ('clf', LogisticRegression(penalty='l2'))])

param_grid = {'clf__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

gs = GridSearchCV(estimator=pipe_logistic,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=10,
                  n_jobs=-1)

gs.fit(x_train, y_train)
clf = gs.best_estimator_

gs = gs.fit(x_train, y_train)
print(gs.best_score_)
print(gs.best_params_)

clf.fit(x_train, y_train)

print('Train accuracy %.3f' % clf.score(x_train, y_train))

print('Test accuracy %.3f' % clf.score(x_test, y_test))

# Tuning hyperparameters for svc via grid search
pipe_svc = Pipeline([('scl', StandardScaler()),
                     ('clf', SVC(random_state=1, probability=True))])
Example #46
gbc = GradientBoostingClassifier()
xgbc = XGBClassifier()

#
# print cross_val_score(gbc,X_train,y_train,cv=5).mean()
# print cross_val_score(xgbc,X_train,y_train,cv=5).mean()

params = {
    'max_depth': range(2, 7),
    'n_estimators': range(100, 1200, 200),
    'learning_rate': [0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1]
}
begin = datetime.datetime.now()
gs_gbc = GridSearchCV(gbc, params, n_jobs=-1, cv=5, verbose=1)
gs_gbc.fit(X_train, y_train)
print gs_gbc.best_score_
print gs_gbc.best_params_
print datetime.datetime.now() - begin
# 0.838383838384
# {'n_estimators': 900, 'learning_rate': 0.01, 'max_depth': 4}
# 0:09:58.525028
gbc_y_pre = gs_gbc.predict(X_test)
gbc_submission = pd.DataFrame({
    'PassengerID': test['PassengerId'],
    'Survived': gbc_y_pre
})
gbc_submission.to_csv('./gbc_submission.csv', index=False)

# gs_xgbc=GridSearchCV(xgbc,params,n_jobs=-1,cv=5,verbose=1)
# gs_xgbc.fit(X_train,y_train)
Example #47
                          msg_train,
                          label_train,
                          cv=5)
plt.show()
params = {
    'tfidf__use_idf': (True, False),
    'bow__analyzer': (split_into_lemmas, split_into_tokens),
}

grid = GridSearchCV(
    pipeline,  # pipeline from above
    params,  # parameters to tune via cross validation
    refit=True,  # fit using all available data at the end, on the best found param combination
    scoring='accuracy',  # what score are we optimizing?
    cv=StratifiedKFold(label_train, n_folds=5),  # what type of cross validation to use
)
nb_detector = grid.fit(msg_train, label_train)
print nb_detector.grid_scores_
# print nb_detector.predict(["#Wedding / Special Occasion Wear ANN BALON Designer Chain Link Maxi Evening Dress http://goo.gl/hZHDGq "])[0]
predictions = nb_detector.predict(msg_test)
confusion_matrix(label_test, predictions)
print classification_report(label_test, predictions)
conf = sklearn.metrics.confusion_matrix(label_test, predictions)
plt.imshow(conf, cmap='Accent', interpolation='nearest')
plt.colorbar()
# plt.imshow(np.random.random((5,5)), interpolation='nearest')
plt.xticks(np.arange(0, 2), ['no', 'yes'])
plt.yticks(np.arange(0, 2), ['no', 'yes'])
plt.show()
    axarr[idx[0], idx[1]].contourf(xx, yy, Z, alpha=0.3)
    axarr[idx[0], idx[1]].scatter(
        X_train_std[y_train==0, 0], X_train_std[y_train==0, 1],
        c='blue', marker='^', s=50
    )
    axarr[idx[0], idx[1]].scatter(
        X_train_std[y_train==1, 0], X_train_std[y_train==1, 1],
        c='red', marker='o', s=50
    )
    axarr[idx[0], idx[1]].set_title(tt)
plt.text(
    -3.5, -4.5, s='Sepal width [standardized]',
    ha='center', va='center', fontsize=12
)
plt.text(
    -10.5, 4.5, s='Petal length [standardized]',
    ha='center', va='center', fontsize=12, rotation=90
)
plt.show()

from sklearn.grid_search import GridSearchCV
params = {'decisiontreeclassifier__max_depth': [1,2],
            'pipeline-1__clf__C': [0.001, 0.1, 100.0]} # mv_clf.get_params()
grid = GridSearchCV(
    estimator=mv_clf, param_grid=params, cv=10, scoring='roc_auc'
)
grid.fit(X_train, y_train)
for params, mean_score, scores in grid.grid_scores_:
    print('%0.3f+/-%0.2f %r' % (mean_score, scores.std() / 2, params))
print('Best parameters: %s' % grid.best_params_)
print('Accuracy: %.2f' % grid.best_score_)
Example #49
                         cv=3)
elif conf["model"] == "KNN":
    param_grid = {'n_neighbors': range(5, 30, 2)}
    model = GridSearchCV(KNeighborsClassifier(), param_grid, cv=3)
elif conf["model"] == "SVM":
    param_grid = {
        'C': [1, 10, 100, 1000],
        'kernel': ['linear']
    }, {
        'C': [1, 10, 100, 1000],
        'gamma': [0.001, 0.0001],
        'kernel': ['rbf']
    }
    model = GridSearchCV(SVC(probability=True), param_grid, cv=3)

model.fit(trainData, trainLabels)
print("[INFO] best hyperparameters: {}".format(model.best_params_))

# open the results file for writing and initialize the total number of accurate
# rank-1 and rank-5 predictions
print("[INFO] evaluating...")
f = open(conf["results_path"] + conf["model"] + ".txt", "w")
rank1 = 0
rank5 = 0

# loop over the testing data
for (label, features) in zip(testLabels, testData):
    # predict the probability of each class label and grab the top-5 labels
    # (based on probabiltiy)
    preds = model.predict_proba(np.atleast_2d(features))[0]
    preds = np.argsort(preds)[::-1][:5]
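    # A hedged sketch of the rank-1 / rank-5 bookkeeping described in the comment
    # before the loop (the original snippet is truncated here; it assumes the values
    # in testLabels are integer class indices aligned with the predict_proba columns):
    if label == preds[0]:
        rank1 += 1
    if label in preds:
        rank5 += 1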
Пример #50
0
def main():
    model_name = 'Random Forest'

    parser = argparse.ArgumentParser(usage=model_name)

    parser.add_argument("train_feature",
                        help="Input file of training features and target")
    parser.add_argument("test_feature", help="Input file of test features")
    parser.add_argument("test_pred",
                        help="Output file of predicted test target")
    parser.add_argument("--prob",
                        action='store_true',
                        help='Predict probability of class 1')
    parser.add_argument("--cores",
                        type=int,
                        default=-1,
                        help='Number of cores to use')

    args = parser.parse_args()

    print(model_name)

    # Read training data and test data
    print('Read training data and test data')
    df_train_feature_target = pd.read_csv(args.train_feature, dtype=np.float32)
    df_test_feature = pd.read_csv(args.test_feature, dtype=np.float32)

    train_X = df_train_feature_target.values[:, :-1]
    train_y = df_train_feature_target.values[:, -1]
    test_X = df_test_feature.values

    # Model specification and parameter range
    model = RandomForestClassifier(n_jobs=-1)
    parameters = [{'n_estimators': [200, 100, 50, 25, 10]}]

    # Cross validation search
    print('Cross validation search')
    clf = GridSearchCV(model,
                       parameters,
                       cv=5,
                       scoring='roc_auc',
                       n_jobs=args.cores,
                       pre_dispatch=args.cores,
                       verbose=3)
    clf.fit(train_X, train_y)

    # Make predictions with the best model
    print('Make predictions with the best model')
    train_pred = clf.predict(train_X)
    train_pred_prob = clf.predict_proba(train_X)[:, 1]
    test_pred = clf.predict(test_X)
    test_pred_prob = clf.predict_proba(test_X)[:, 1]

    # Write out the prediction result
    print('Write out the prediction result')
    pd.Series(test_pred_prob  if args.prob else test_pred, name='Prob' if args.prob else 'Pred') \
        .to_csv(args.test_pred, index=False, header=True)

    # Report the result
    print('Report the result')
    print('Best Score: ', clf.best_score_)
    print('Best Parameter: ', clf.best_params_)
    print('Parameter Scores: ', clf.grid_scores_)
    print('Model: ', clf)
    print('Accuracy: ', accuracy_score(train_y, train_pred))
    print('F1:       ', f1_score(train_y, train_pred))
    print('ROC AUC:  ', roc_auc_score(train_y, train_pred_prob))
    print(args.test_pred + '~~' + str(clf.best_score_))
Пример #51
0
def test_no_refit():
    # Test that grid search can be used for model selection only
    clf = MockClassifier()
    grid_search = GridSearchCV(clf, {'foo_param': [1, 2, 3]}, refit=False)
    grid_search.fit(X, y)
    assert_true(hasattr(grid_search, "best_params_"))
Пример #52
0
    label = train_data['y']
    #Assign training data without y as features
    features = train_data.drop(['y'], axis=1)
    #prepare the training and test splits
    X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=0.4, random_state=0)
    #best parameters, reached after a fair amount of trial and error
    parameters = {'max_depth': [3], 'learning_rate': [0.1], 'n_estimators': [50],
                  'silent': [False], 'objective': ['reg:linear'], 'booster': ['dart'],
                  'gamma': [0], 'min_child_weight': [15], 'max_delta_step': [0],
                  'subsample': [1], 'colsample_bytree': [0.7], 'colsample_bylevel': [0.5],
                  'reg_alpha': [0.0001], 'reg_lambda': [1], 'scale_pos_weight': [1],
                  'base_score': [0.5], 'random_state': [2018]}
    #constructor parameter passed to XGBRegressor so that training runs on the GPU
    params = {'tree_method': 'gpu_hist'}
    #The scoring method indicated by problem evaluation metric on the competition page
    scorer = make_scorer(r2_score)

    #Initialize the grid search with an XGBRegressor, the parameter grid above, and the R^2 scorer
    grid_obj = GridSearchCV(XGBRegressor(**params), param_grid=parameters, scoring=scorer)
    #Begin training
    grid_fit = grid_obj.fit(X_train, y_train)
    #Extract the best estimator, in case multiple hyperparameter combinations were searched
    best_estimator = grid_fit.best_estimator_
    #Then we predict on the test data split that we got from the dataset features
    best_predictions = best_estimator.predict(X_test)
    #Here we register our score
    score = grid_fit.score(X_test, y_test)
    #We print the best parameters that got us our best estimator
    print("best_params_:", grid_fit.best_params_)
    #We print the score that we got
    print("score:", score)

    #test output
    test_output_predictions = best_estimator.predict(test_data)
    #Here we store the IDs from the ID column
    test_data = test_data[['ID']]
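    #A hedged sketch of the submission step that would typically follow (the snippet
    #is truncated here; the 'y' column name and output file name are assumptions):
    test_data = test_data.assign(y=test_output_predictions)
    test_data.to_csv('submission.csv', index=False)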
Пример #53
0
X_train, X_test, y_train, y_test = train_test_split(rf_data.iloc[:, 0:42],
                                                    rf_data.iloc[:, [42]],
                                                    test_size=0.33,
                                                    random_state=42)

# Train the model - start

# First, grid-search over n_estimators
param_test1 = {'n_estimators': list(range(450, 550, 10))}
gsearch1 = GridSearchCV(estimator=RandomForestRegressor(max_features="log2",
                                                        min_samples_leaf=2,
                                                        oob_score=True),
                        param_grid=param_test1,
                        scoring=None,
                        cv=5)
gsearch1.fit(X_train.iloc[:, 0:18], y_train)
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

# Next, grid-search over the maximum tree depth (max_depth) and the minimum number of samples required to split an internal node (min_samples_split).
param_test2 = {
    'max_depth': list(range(80, 100, 2)),
    'min_samples_split': list(range(2, 101, 2))
}
gsearch2 = GridSearchCV(estimator=RandomForestRegressor(n_estimators=50,
                                                        max_features="log2",
                                                        min_samples_leaf=2,
                                                        oob_score=True),
                        param_grid=param_test2,
                        scoring=None,
                        iid=False,
                        cv=5)
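# The original snippet ends here; presumably the next step mirrors gsearch1 above
# (a hedged sketch, not part of the original):
gsearch2.fit(X_train.iloc[:, 0:18], y_train)
gsearch2.grid_scores_, gsearch2.best_params_, gsearch2.best_score_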
Пример #54
0
parameter_candidates = [
    {
        'C': [1, 10, 100, 1000],
        'kernel': ['linear']
    },
    {
        'C': [1, 10, 100, 1000],
        'gamma': [0.001, 0.0001],
        'kernel': ['rbf']
    },
]

clf = GridSearchCV(estimator=svm.SVC(),
                   param_grid=parameter_candidates,
                   n_jobs=-1)
clf.fit(x_train, y_train)
prediction = clf.predict(x_test)

print('Best score for training data:', clf.best_score_)
print('Best `C`:', clf.best_estimator_.C)
print('Best kernel:', clf.best_estimator_.kernel)
print('Best `gamma`:', clf.best_estimator_.gamma)

#clf = cluster.KMeans(init='k-means++', n_clusters=5, random_state=42)
#clf.fit(x_train)
#prediction = clf.fit_predict(x_train)

fig, ax = plt.subplots(2, 2, figsize=(8, 4))

print("Start Down Scale")
from sklearn.decomposition import PCA
Пример #55
0
# In[49]:

param = {
    'max_depth':
    [7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24],
    'min_samples_split': [4, 8, 12, 16, 20, 25],
    'criterion': ['gini', 'entropy']
}
grid_search_params = {
    'estimator': DecisionTreeClassifier(),
    'param_grid': param,  # the parameters defined above that we want to optimize
    'cv': ps,  # use the custom split strategy defined earlier
    'n_jobs': -1  # number of jobs to run in parallel; -1 means use all CPUs
}
gridsearch = GridSearchCV(**grid_search_params)
gridsearch.fit(train_val_features, train_val_labels)

# In[86]:

import pandas as pd
cv_result = pd.DataFrame.from_dict(gridsearch.grid_scores_)
criterion, max_depth, min_samples_split, score = [], [], [], []
for i in range(len(cv_result)):
    criterion.append(cv_result['parameters'][i]['criterion'])
    max_depth.append(cv_result['parameters'][i]['max_depth'])
    min_samples_split.append(cv_result['parameters'][i]['min_samples_split'])
    #    score.append(str(cv_result['cv_validation_scores'][i]).split('[')[1].split(']')[0])
    score.append(cv_result['cv_validation_scores'][i])
df = pd.DataFrame({
    'criterion': criterion,
    'max_depth': max_depth,
kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng)
for train_index, test_index in kf:
    xgb_model = xgb.XGBClassifier().fit(X[train_index], y[train_index])
    predictions = xgb_model.predict(X[test_index])
    actuals = y[test_index]
    print(confusion_matrix(actuals, predictions))

print("Boston Housing: regression")
boston = load_boston()
y = boston['target']
X = boston['data']
kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng)
for train_index, test_index in kf:
    xgb_model = xgb.XGBRegressor().fit(X[train_index], y[train_index])
    predictions = xgb_model.predict(X[test_index])
    actuals = y[test_index]
    print(mean_squared_error(actuals, predictions))

print("Parameter optimization")
y = boston['target']
X = boston['data']
xgb_model = xgb.XGBRegressor()
clf = GridSearchCV(xgb_model, {
    'max_depth': [2, 4, 6],
    'n_estimators': [50, 100, 200]
},
                   verbose=1)
clf.fit(X, y)
print(clf.best_score_)
print(clf.best_params_)
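# Since refit=True by default, the best model is retrained on the full data and is
# available directly (a small hedged addition, not part of the original snippet):
best_xgb = clf.best_estimator_
print(best_xgb.predict(X[:5]))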
Пример #57
0
    features.append(FeatureExtractor.getFeatures(image))

features = np.array(features)
labels = np.array(image_labels)
selector = SelectKBest(chi2, k='all')
scaler = StandardScaler().fit(features)
features = scaler.transform(features)

if calculate_best_params:
    C_range = 10.**np.arange(-10, 10)
    gamma_range = 10.**np.arange(-10, 10)
    param_grid = dict(gamma=gamma_range, C=C_range)
    grid = GridSearchCV(SVC(),
                        param_grid=param_grid,
                        cv=StratifiedKFold(labels, 5))
    grid.fit(features, labels)

    print "Best classifier is :", grid.best_estimator_

classifier = SVC(kernel="rbf",
                 gamma=0.1,
                 C=100.0,
                 probability=True,
                 class_weight=None,
                 coef0=0.0,
                 degree=3,
                 shrinking=True,
                 tol=0.001,
                 verbose=False)
classifier.fit(features, labels)
joblib.dump((classifier, training_names, normalization_parameter, features,
Пример #58
0
param_test1 = {'max_depth': [5], 'min_child_weight': [1]}
gsearch1 = GridSearchCV(estimator=XGBClassifier(learning_rate=0.05,
                                                n_estimators=160,
                                                max_depth=5,
                                                min_child_weight=1,
                                                gamma=0,
                                                subsample=0.8,
                                                colsample_bytree=0.8,
                                                objective='multi:softprob',
                                                scale_pos_weight=1,
                                                seed=123),
                        param_grid=param_test1,
                        scoring='neg_log_loss',
                        iid=False,
                        cv=5)
gsearch1.fit(train, y)
print gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

param_test2 = {
    'max_depth': [4, 5, 6],
}
gsearch2 = GridSearchCV(estimator=XGBClassifier(learning_rate=0.05,
                                                n_estimators=150,
                                                max_depth=5,
                                                min_child_weight=1,
                                                gamma=0,
                                                subsample=0.8,
                                                colsample_bytree=0.8,
                                                objective='multi:softprob',
                                                scale_pos_weight=1,
                                                seed=123),
Пример #59
0
    #regressorAtual = names[min_index]

    #if (regressorAtual == 'gbm'):
    print "procurando os melhores parametros para GBM"
    for train_index_skf, test_index_skf in skf.split(X_data_train):
        X_train_skf, X_test_skf = \
            X_data_train[train_index_skf], X_data_train[test_index_skf]

        Y_train_skf, Y_test_skf = \
            values[train_index_skf], values[test_index_skf]
        print "Fazendo o GridSearsh para o Gradien Boosting regressor...."
        clf3 = GridSearchCV(gbmObj,
                            param_grid=parameters_gbm,
                            scoring='neg_mean_absolute_error',
                            n_jobs=n_cores)
        clf3.fit(X_train_skf, Y_train_skf)
        print "Finalizado o GridSearsh para a Gradien Boosting regressor."
        print "MAE do Gradien Boosting obtido para o conjunto de treino: " + str(
            -clf3.best_score_) + "com os parametros: " + str(clf3.best_params_)
        gbmPreditcTestKfold = clf3.predict(X_test_skf)
        MAE_GBM = mean_absolute_error(Y_test_skf, gbmPreditcTestKfold)
        print "MAE_GBM obtido para o conjunto de teste: " + str(MAE_GBM)
        if (MAE_GBM < bestScore_gbm):
            bestScore_gbm = MAE_GBM
            gbm_n_estimators_best = clf3.best_params_['n_estimators']
            gbm_max_features_best = clf3.best_params_['max_features']
            gbm_max_depth_best = clf3.best_params_['max_depth']
            gbm_learning_rate_best = clf3.best_params_['learning_rate']
        print "melhor GBM parametros ate o momento: " + str(bestScore_gbm) + " n_estimators: " + str(
            gbm_n_estimators_best) \
              + " max_features: " + str(gbm_max_features_best) + " max_depth: " + str(gbm_max_depth_best) + \
Пример #60
0
    # The last column describes the targets
    explanatory_variable_columns.remove(len(df.columns.values) - 1)

    y = [1 if e == 'ad.' else 0 for e in response_variable_column]
    X = df[list(explanatory_variable_columns)]
    X.replace(to_replace=r' *\?', value=-1, regex=True, inplace=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    pipeline = Pipeline([('clf', DecisionTreeClassifier(criterion='entropy'))])
    parameters = {
        'clf__max_depth': (150, 155, 160),
        'clf__min_samples_split': (1, 2, 3),
        'clf__min_samples_leaf': (1, 2, 3)
    }

    grid_search = GridSearchCV(pipeline,
                               parameters,
                               n_jobs=-1,
                               verbose=1,
                               scoring='f1')
    grid_search.fit(X_train, y_train)
    print('Best score: %0.3f' % grid_search.best_score_)
    print('Best parameters set:')
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print('\t%s: %r' % (param_name, best_parameters[param_name]))

    predictions = grid_search.predict(X_test)
    print(classification_report(y_test, predictions))
    print(grid_search.score(X_test, y_test))