Example #1
0
    def setUp(self):
        """Run a synced 3-fold grid search over an SVC on random data.

        Stores the first event emitted by the syncer in
        ``self.grid_search_event`` for the tests to inspect.
        """
        syncer_obj = SyncerTest(
            NewOrExistingProject(
                "grid search test",
                "srinidhi",
                "Grid search cross validation - 3 folds"),
            DefaultExperiment(), NewExperimentRun("Abc"),
            ThriftConfig(None, None))

        # Random integer features/targets; shapes only matter for the sync
        # plumbing, not for model quality.
        features = pd.DataFrame(np.random.randint(0, 100, size=(2000, 4)),
                                columns=list('ABCD'))
        labels = pd.DataFrame(np.random.randint(0, 100, size=(2000, 1)),
                              columns=['output'])

        # Tag the feature frame so the syncer can identify it, then drop
        # any events buffered so far.
        syncer_obj.add_tag(features, "digits-dataset")
        syncer_obj.clear_buffer()

        search_space = [{
            'kernel': ['rbf'],
            'gamma': [1e-3, 1e-4],
            'C': [10, 100]
        }]
        model = GridSearchCV(SVC(), search_space, cv=3)

        # fit_sync expects a 1-D target, hence the ravel.
        model.fit_sync(features, labels.values.ravel())

        events = syncer_obj.sync()
        self.grid_search_event = events[0]
Example #2
0
Parameter-tuning
"""
# Hyperparameter grid for the decision-tree classifier defined earlier
# (decision_tree_classifier comes from outside this excerpt).
parameter_grid = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [1, 2, 3, 4, 5],
    'max_features': [1, 2, 3, 4]
}

# NOTE(review): StratifiedKFold(labels, n_folds=...) is the pre-0.18
# scikit-learn cross_validation API — confirm the pinned sklearn version.
cross_validation = StratifiedKFold(all_classes, n_folds=10)

grid_search = GridSearchCV(decision_tree_classifier,
                           param_grid=parameter_grid,
                           cv=cross_validation)

# fit_sync — presumably a ModelDB-instrumented fit that also logs the run;
# verify against the syncer/client library in use.
grid_search.fit_sync(all_inputs, all_classes)
print('Best score: {}'.format(grid_search.best_score_))
print('Best parameters: {}'.format(grid_search.best_params_))

# Get the best estimator
decision_tree_classifier = grid_search.best_estimator_

random_forest_classifier = RandomForestClassifier()

# Second grid, for the random forest; the corresponding search runs
# after this excerpt.
parameter_grid = {
    'n_estimators': [5, 10, 25, 50],
    'criterion': ['gini', 'entropy'],
    'max_features': [1, 2, 3, 4],
    'warm_start': [True, False]
}
Example #3
0
#Uses GridSearch and Pipeline objects in scikit, adapted from http://scikit-learn.org/stable/auto_examples/model_selection/grid_search_text_feature_extraction.html
# Register this run with ModelDB-style tracking under a named project.
name = "grid search cross validation"
author = "srinidhi"
description = "digits dataset"
syncer_obj = Syncer(NewOrExistingProject(name, author, description),
                    DefaultExperiment(), NewExperimentRun("Abc"))

# Use only the first 1000 digit samples for this part of the example.
digits = datasets.load_digits()
x = digits.data[:1000]
y = digits.target[:1000]

# Pipeline-step parameters: keys are '<step>__<param>' per sklearn convention.
parameters = {
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    'clf__alpha': (0.00001, 0.000001),
    'clf__penalty': ('l2', 'elasticnet')
}

pipeline = Pipeline([
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier()),
])

# cv=None -> sklearn's default cross-validation strategy.
clf = GridSearchCV(pipeline,
                   parameters,
                   cv=None,
                   scoring='%s_weighted' % 'precision')

# fit_sync / sync — presumably the syncer-instrumented fit and flush;
# verify against the project's client library.
clf.fit_sync(x, y)
syncer_obj.sync()
#    NewExperimentRun("my_experiment_id"))

# Loading the Digits dataset
digits = datasets.load_digits()

# To apply an classifier on this data, we need to flatten the image, to
# turn the data in a (samples, feature) matrix:
n_samples = len(digits.images)
X = digits.images.reshape((n_samples, -1))
y = digits.target

# Split the dataset in two equal parts
# NOTE(review): train_test_split_sync lives on a `cross_validation` object
# defined outside this excerpt — confirm it is the syncer wrapper, not
# sklearn's module.
x_train, x_test, y_train, y_test = cross_validation.train_test_split_sync(
    X, y, test_size=0.5, random_state=0)

# Set the parameters by cross-validation
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

clf = GridSearchCV(SVC(C=1), tuned_parameters, cv=5)
clf.fit_sync(x_train, y_train)

print("The model is trained on the full development set.")
print("The scores are computed on the full evaluation set.")
# Evaluate on the held-out half and log accuracy through the syncer.
y_pred = clf.predict_sync(x_test)
mean_error = SyncableMetrics.compute_metrics(
    clf, accuracy_score, y_test, y_pred, x_test, '', '')

syncer_obj.sync()