def setUp(self):
    """Run a grid-search experiment against the test syncer and stash its event.

    Builds a SyncerTest, fits a 3-fold GridSearchCV over an RBF-kernel SVC
    on randomly generated integer data, then stores the first synced event
    on ``self.grid_search_event`` for the test methods to inspect.
    """
    syncer = SyncerTest(
        NewOrExistingProject(
            "grid search test",
            "srinidhi",
            "Grid search cross validation - 3 folds"),
        DefaultExperiment(),
        NewExperimentRun("Abc"),
        ThriftConfig(None, None))
    # Random features and labels. Keep this generation order: the two
    # np.random calls must happen features-first to preserve RNG state.
    features = pd.DataFrame(
        np.random.randint(0, 100, size=(2000, 4)), columns=list('ABCD'))
    labels = pd.DataFrame(
        np.random.randint(0, 100, size=(2000, 1)), columns=['output'])
    # Tag the feature dataframe so the syncer can identify it later.
    syncer.add_tag(features, "digits-dataset")
    syncer.clear_buffer()
    search_space = [{
        'kernel': ['rbf'],
        'gamma': [1e-3, 1e-4],
        'C': [10, 100]
    }]
    grid_clf = GridSearchCV(SVC(), search_space, cv=3)
    target = labels.values.ravel()
    # fit_sync: instrumented fit that records the run with the syncer —
    # presumably monkey-patched onto GridSearchCV elsewhere; TODO confirm.
    grid_clf.fit_sync(features, target)
    synced_events = syncer.sync()
    self.grid_search_event = synced_events[0]
Parameter-tuning """
# NOTE(review): the `"""` above closes a docstring opened before this chunk.

# Hyperparameter grid for the decision-tree search; the classifier it tunes
# (`decision_tree_classifier`) is defined before this chunk.
parameter_grid = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [1, 2, 3, 4, 5],
    'max_features': [1, 2, 3, 4]
}
# Stratified 10-fold CV. This is the pre-0.18 scikit-learn API (labels
# passed to the constructor, `n_folds` keyword) — matches the old
# `cross_validation` module used elsewhere in this file.
cross_validation = StratifiedKFold(all_classes, n_folds=10)
grid_search = GridSearchCV(decision_tree_classifier,
                           param_grid=parameter_grid,
                           cv=cross_validation)
# fit_sync: looks like an instrumented variant of fit that also logs the
# run to a syncer — presumably monkey-patched elsewhere; TODO confirm.
grid_search.fit_sync(all_inputs, all_classes)
print('Best score: {}'.format(grid_search.best_score_))
print('Best parameters: {}'.format(grid_search.best_params_))
# Get the best estimator; rebinding the name replaces the untuned tree.
decision_tree_classifier = grid_search.best_estimator_

# Second search: a random forest over its own grid. The GridSearchCV that
# consumes this grid is set up after this chunk.
random_forest_classifier = RandomForestClassifier()
parameter_grid = {
    'n_estimators': [5, 10, 25, 50],
    'criterion': ['gini', 'entropy'],
    'max_features': [1, 2, 3, 4],
    'warm_start': [True, False]
}
# Grid search over a scikit-learn Pipeline, adapted from
# http://scikit-learn.org/stable/auto_examples/model_selection/grid_search_text_feature_extraction.html
name = "grid search cross validation"
author = "srinidhi"
description = "digits dataset"
syncer_obj = Syncer(
    NewOrExistingProject(name, author, description),
    DefaultExperiment(),
    NewExperimentRun("Abc"))

# First 1000 samples of the digits dataset.
digits = datasets.load_digits()
x = digits.data[:1000]
y = digits.target[:1000]

# TF-IDF transform feeding an SGD classifier.
pipeline = Pipeline([
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier()),
])

# Search space spans both pipeline stages (stage__param naming).
parameters = {
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    'clf__alpha': (0.00001, 0.000001),
    'clf__penalty': ('l2', 'elasticnet')
}

clf = GridSearchCV(pipeline, parameters, cv=None,
                   scoring='%s_weighted' % 'precision')
# fit_sync: instrumented fit that records the run with the syncer —
# presumably monkey-patched onto GridSearchCV elsewhere; TODO confirm.
clf.fit_sync(x, y)
syncer_obj.sync()
# NewExperimentRun("my_experiment_id"))  (residue of a commented-out
# syncer constructor variant that starts before this chunk)

# Load the digits dataset.
digits = datasets.load_digits()

# Flatten each image so the data forms a (samples, features) matrix,
# as classifiers expect.
n_samples = len(digits.images)
X = digits.images.reshape((n_samples, -1))
y = digits.target

# Hold out half the data for evaluation; train_test_split_sync is the
# instrumented split helper — presumably logs to the syncer; TODO confirm.
x_train, x_test, y_train, y_test = cross_validation.train_test_split_sync(
    X, y, test_size=0.5, random_state=0)

# Cross-validated hyperparameter candidates: RBF and linear SVC kernels.
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

clf = GridSearchCV(SVC(C=1), tuned_parameters, cv=5)
clf.fit_sync(x_train, y_train)

print("The model is trained on the full development set.")
print("The scores are computed on the full evaluation set.")

# Score the held-out half and record accuracy via the metrics wrapper.
y_pred = clf.predict_sync(x_test)
mean_error = SyncableMetrics.compute_metrics(
    clf, accuracy_score, y_test, y_pred, x_test, '', '')
syncer_obj.sync()