def run_pipeline_anova_workflow():
    name = "pipeline scikit example"
    author = "srinidhi"
    description = "anova filter pipeline"
    syncer_obj = Syncer(
        NewOrExistingProject(name, author, description),
        DefaultExperiment(),
        NewExperimentRun("Abc"))

    # Import some data to play with
    X, y = samples_generator.make_classification(
        n_informative=5, n_redundant=0, random_state=42)
    x_train, x_test, y_train, y_test = cross_validation.train_test_split_sync(
        X, y, test_size=0.3, random_state=0)

    syncer_obj.add_tag(X, "samples generated data")
    syncer_obj.add_tag(x_train, "training data")
    syncer_obj.add_tag(x_test, "testing data")

    # ANOVA SVM-C
    # 1) ANOVA filter: take the 5 best-ranked features
    anova_filter = SelectKBest(f_regression, k=5)
    syncer_obj.add_tag(anova_filter, "Anova filter, with k=5")

    # 2) SVM
    clf = svm.SVC(kernel='linear')
    syncer_obj.add_tag(clf, "SVC with linear kernel")

    anova_svm = Pipeline([('anova', anova_filter), ('svc', clf)])
    syncer_obj.add_tag(anova_svm, "Pipeline with anova_filter and SVC")

    # Fit the pipeline on the training set
    anova_svm.fit_sync(x_train, y_train)
    y_pred = anova_svm.predict(x_test)

    # Compute metrics for the model on the testing set
    f1 = SyncableMetrics.compute_metrics(
        anova_svm, f1_score, y_test, y_pred, x_test,
        "predictionCol", 'label_col')
    precision = SyncableMetrics.compute_metrics(
        anova_svm, precision_score, y_test, y_pred, x_test,
        "predictionCol", 'label_col')
    syncer_obj.sync()
    return syncer_obj, f1, precision, x_train, x_test
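
# These snippets assume ModelDB's sklearn_native client plus the usual
# scikit-learn imports. A representative (non-exhaustive) preamble sketch;
# the modeldb module paths are assumptions based on the ModelDB Python
# client and should be checked against the installed version:
import pandas as pd
from sklearn import svm, linear_model, preprocessing, datasets
from sklearn.datasets import samples_generator
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.pipeline import Pipeline
from sklearn.metrics import (f1_score, precision_score, recall_score,
                             accuracy_score, mean_squared_error, log_loss)
from modeldb.sklearn_native.ModelDbSyncer import *
from modeldb.sklearn_native import SyncableMetrics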
def run_linear_model_workflow():
    """
    Sample workflow using OneHotEncoder and LinearRegression.
    """
    syncer_obj = Syncer.create_syncer("test1", "test_user",
                                      "pandas-linear-regression")
    data, target = load_pandas_dataset()
    syncer_obj.add_tag(data, "occupation dataset")

    # One-hot encode the occupation column of the data.
    # .values.reshape(-1, 1) produces the 2-D array the encoder expects
    # (Series.reshape has been removed from pandas).
    hot_enc = preprocessing.OneHotEncoder()
    syncer_obj.add_tag(hot_enc, "Hot encoding occupation column")
    hot_enc.fit_sync(data['occupation'].values.reshape(-1, 1))
    hot_enc_rows = hot_enc.transform_sync(
        data['occupation'].values.reshape(-1, 1))
    hot_enc_df = pd.DataFrame(hot_enc_rows.toarray())

    # Drop the column now that it is encoded
    dropped_data = data.drop_sync('occupation', axis=1)

    # Join the hot-encoded rows with the rest of the data
    data = dropped_data.join(hot_enc_df)

    x_train, x_test, y_train, y_test = cross_validation.train_test_split_sync(
        data, target, test_size=0.3, random_state=1)
    syncer_obj.add_tag(x_train, "training data - 70%")
    syncer_obj.add_tag(x_test, "testing data - 30%")

    model = linear_model.LinearRegression()
    syncer_obj.add_tag(model, "Basic linear reg")
    model.fit_sync(x_train, y_train)
    y_pred = model.predict_sync(x_test)
    mean_error = SyncableMetrics.compute_metrics(
        model, mean_squared_error, y_test, y_pred, x_test, "", 'affairs')

    # Sync all the events to the database
    syncer_obj.sync()
    # Certain variables are returned so they can be used in the unit tests
    # below.
    return syncer_obj, x_test, mean_error, dropped_data
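
# A minimal usage sketch, assuming a ModelDB server is running and the
# load_pandas_dataset helper (defined elsewhere in this example) is on the
# path:
syncer, x_test, mean_error, dropped_data = run_linear_model_workflow()
print("Test-set mean squared error:", mean_error)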
orig = pd.read_csv_sync(DATA_PATH + 'adult_with_colnames.csv', index_col=0)
[train, test] = cross_validation.train_test_split_sync(orig, test_size=0.3,
                                                       random_state=501)

# One-hot encode the categorical columns on the training set,
# fitting the binarizers on this first pass
[lb, train] = oneHotEncoding(None, "workclass", train)
[lb2, train] = oneHotEncoding(None, "sex", train)
train = train.drop(["workclass", "sex"], axis=1)
new_cols = [
    col for col in train.columns
    if "workclass_" in col or "sex_" in col
]

logreg = linear_model.LogisticRegression(C=10)
features = ['capital-gain', 'capital-loss', 'age'] + new_cols
logreg.fit_sync(train[features], train.income)

# Reuse the fitted binarizers so the test set is encoded consistently
[lb, test] = oneHotEncoding(lb, "workclass", test)
[lb2, test] = oneHotEncoding(lb2, "sex", test)
test = test.drop(["workclass", "sex"], axis=1)

test_pred = logreg.predict_sync(test[features])
test_proba = logreg.predict_proba(test[features])
accuracy = SyncableMetrics.compute_metrics(logreg, accuracy_score,
                                           test.income, test_pred,
                                           test[features], "predictionCol",
                                           'income_level')
syncer_obj.sync()
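
# oneHotEncoding is a helper defined elsewhere in this example. A minimal
# sketch of the behavior the calls above assume (a hypothetical
# reconstruction, not the original code): fit a LabelBinarizer on the first
# call (lb is None), reuse the fitted one afterwards, and append one
# "<col>_<value>" indicator column per category.
import numpy as np
from sklearn.preprocessing import LabelBinarizer

def oneHotEncoding(lb, col, df):
    if lb is None:
        lb = LabelBinarizer()
        lb.fit(df[col])
    binarized = lb.transform(df[col])
    if len(lb.classes_) == 2:
        # Binary columns come back as a single 0/1 column; expand to two
        binarized = np.hstack([1 - binarized, binarized])
    for i, cls in enumerate(lb.classes_):
        df[col + "_" + str(cls)] = binarized[:, i]
    return [lb, df]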
""" Cross Validation """ # Create the classifier decision_tree_classifier = DecisionTreeClassifier() syncer_obj.add_tag(decision_tree_classifier, "decision tree") # Train the classifier on the training set decision_tree_classifier.fit_sync(training_inputs, training_classes) # Validate the classifier on the testing set using classification accuracy #decision_tree_classifier.score(testing_inputs, testing_classes) # NOTE: score is equivalent to sklearn.metrics.accuracy_score. SyncableMetrics.compute_metrics( decision_tree_classifier, accuracy_score, testing_classes, decision_tree_classifier.predict(testing_inputs), training_inputs, "", "") # cross_val_score returns a list of the scores, which we can visualize # to get a reasonable estimate of our classifier's performance cv_scores = cross_validation.cross_val_score_sync(decision_tree_classifier, all_inputs, all_classes, cv=10) """ Parameter-tuning """ parameter_grid = { 'criterion': ['gini', 'entropy'], 'splitter': ['best', 'random'], 'max_depth': [1, 2, 3, 4, 5],
def run_otto_workflow():
    name = "test1"
    author = "author"
    description = "kaggle-otto-script"
    # Creating a new project
    syncer_obj = Syncer(NewOrExistingProject(name, author, description),
                        NewOrExistingExperiment("expName", "expDesc"),
                        NewExperimentRun("otto test"))

    # Import data
    # Note: This dataset is not included in the repo because of Kaggle
    # restrictions. It can be downloaded from
    # https://www.kaggle.com/c/otto-group-product-classification-challenge/data
    X = pd.read_csv_sync(DATA_PATH + 'otto-train.csv')
    syncer_obj.add_tag(X, "original otto csv data")
    X = X.drop_sync('id', axis=1)
    syncer_obj.add_tag(X, "dropped id column")

    # Extract the target and encode it to make it manageable by the ML
    # algorithm
    y = X.target.values
    y = LabelEncoder().fit_transform_sync(y)

    # Remove the target from train, else it's too easy ...
    X = X.drop_sync('target', axis=1)
    syncer_obj.add_tag(X, "data with dropped id and target columns")

    # Split train / test
    x_train, x_test, y_train, y_test = cross_validation.train_test_split_sync(
        X, y, test_size=0.20, random_state=36)
    syncer_obj.add_tag(x_test, "testing data")
    syncer_obj.add_tag(x_train, "training data")

    # First, train and apply a random forest WITHOUT calibration.
    # We use a BaggingClassifier to make 5 predictions and average them,
    # because that's what CalibratedClassifierCV does behind the scenes,
    # and we want to compare things fairly, i.e. be sure that averaging
    # several models is not what explains a performance difference between
    # no calibration and calibration.
    clf = RandomForestClassifier(n_estimators=50, n_jobs=-1)
    clfbag = BaggingClassifier(clf, n_estimators=5)
    clfbag.fit_sync(x_train, y_train)
    y_preds = clfbag.predict_proba_sync(x_test)
    SyncableMetrics.compute_metrics(clfbag, log_loss, y_test, y_preds,
                                    x_test, "", "", eps=1e-15, normalize=True)
    # print("loss WITHOUT calibration : ", log_loss(
    #     ytest, ypreds, eps=1e-15, normalize=True))

    # Now, train and apply a random forest WITH calibration.
    # In our case, 'isotonic' worked better than the default 'sigmoid', but
    # this is not always so; depending on the case, you have to test both.
    clf = RandomForestClassifier(n_estimators=50, n_jobs=-1)
    calibrated_clf = CalibratedClassifierCV(clf, method='isotonic', cv=5)
    calibrated_clf.fit_sync(x_train, y_train)
    y_preds = calibrated_clf.predict_proba_sync(x_test)
    SyncableMetrics.compute_metrics(calibrated_clf, log_loss, y_test, y_preds,
                                    x_test, "", "", eps=1e-15, normalize=True)
    # print("loss WITH calibration : ", log_loss(
    #     ytest, ypreds, eps=1e-15, normalize=True))

    print(" ")
    print("Conclusion: in our case, calibration improved "
          "performance a lot (reduced loss)!")

    syncer_obj.sync()
    return syncer_obj, x_train, x_test
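
# Usage sketch: assumes otto-train.csv has been downloaded from Kaggle into
# DATA_PATH (see the note above) and a ModelDB server is reachable.
syncer, x_train, x_test = run_otto_workflow()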
# NewExperimentRun("my_experiment_id")) # Loading the Digits dataset digits = datasets.load_digits() # To apply an classifier on this data, we need to flatten the image, to # turn the data in a (samples, feature) matrix: n_samples = len(digits.images) X = digits.images.reshape((n_samples, -1)) y = digits.target # Split the dataset in two equal parts x_train, x_test, y_train, y_test = cross_validation.train_test_split_sync( X, y, test_size=0.5, random_state=0) # Set the parameters by cross-validation tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]}, {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}] clf = GridSearchCV(SVC(C=1), tuned_parameters, cv=5) clf.fit_sync(x_train, y_train) print("The model is trained on the full development set.") print("The scores are computed on the full evaluation set.") y_pred = clf.predict_sync(x_test) mean_error = SyncableMetrics.compute_metrics( clf, accuracy_score, y_test, y_pred, x_test, '', '') syncer_obj.sync()
df['income_level'] = df['income_level'].replace(['<=50K'], [0.0])
df['income_level'] = df['income_level'].replace(['>50K'], [1.0])

# Call the LabelEncoder on any columns that are object types
for coltype, colname in zip(df.dtypes, df.columns):
    if coltype == 'object':
        le.fit_sync(df[colname])
        transformed_vals = le.transform_sync(df[colname])
        new_df[colname + "_index"] = transformed_vals
    else:
        new_df[colname] = df[colname]

lr = linear_model.LogisticRegression()
x_train, x_test, y_train, y_test = cross_validation.train_test_split_sync(
    new_df, new_df['income_level'], test_size=0.3, random_state=0)

# We don't want to include our label (income_level) when fitting
partial_training = x_train[x_train.columns[:-1]]
partial_testing = x_test[x_test.columns[:-1]]

lr.fit_sync(partial_training, y_train)
y_pred = lr.predict_sync(partial_testing)
SyncableMetrics.compute_metrics(
    lr, precision_score, y_test, y_pred, partial_testing,
    "predictionCol", 'income_level')
SyncableMetrics.compute_metrics(
    lr, recall_score, y_test, y_pred, partial_testing,
    "predictionCol", 'income_level')
syncer_obj.sync()
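
# le and new_df are created before this fragment; a minimal sketch of the
# assumed preamble (names as used above):
le = preprocessing.LabelEncoder()
new_df = pd.DataFrame()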
# modeldb start
df = pd.read_csv_sync(DATA_PATH + 'credit-default.csv', skiprows=[0])
# modeldb end

target = df['default payment next month']
df = df[["LIMIT_BAL", "SEX", "EDUCATION", "MARRIAGE", "AGE"]]

x_train, x_test, y_train, y_test = cross_validation.train_test_split_sync(
    df, target, test_size=0.3)

lr = linear_model.LogisticRegression(C=2)

# modeldb start
lr.fit_sync(x_train, y_train)
# modeldb end

# modeldb start
y_pred = lr.predict_sync(x_test)
# modeldb end

# modeldb start
score = SyncableMetrics.compute_metrics(lr, accuracy_score, y_test, y_pred,
                                        x_train, "features",
                                        'default payment next month')
# modeldb end

# modeldb start
syncer_obj.sync()
# modeldb end
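
# syncer_obj is created before this fragment; a minimal sketch using the
# create_syncer convenience shown in run_linear_model_workflow (project and
# user names here are placeholders):
syncer_obj = Syncer.create_syncer("credit default example", "test_user",
                                  "logistic regression on credit data")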
# To apply a classifier to this data, we need to flatten the images, turning
# the data into a (samples, features) matrix:
n_samples = len(digits.images)
X = digits.images.reshape((n_samples, -1))
y = digits.target

# Split the dataset into two equal parts
x_train, x_test, y_train, y_test = cross_validation.train_test_split_sync(
    X, y, test_size=0.5, random_state=0)

# Set the parameters by cross-validation
tuned_parameters = [{
    'kernel': ['rbf'],
    'gamma': [1e-3, 1e-4],
    'C': [1, 10, 100, 1000]
}, {
    'kernel': ['linear'],
    'C': [1, 10, 100, 1000]
}]

clf = GridSearchCV(SVC(C=1), tuned_parameters, cv=5)
clf.fit_sync(x_train, y_train)

print("The model is trained on the full development set.")
print("The scores are computed on the full evaluation set.")

y_pred = clf.predict_sync(x_test)
mean_error = SyncableMetrics.compute_metrics(clf, precision_score, y_test,
                                             y_pred, x_test, '', '')
syncer_obj.sync()
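
# Note: precision_score defaults to binary averaging, so the 10-class digits
# labels need an explicit strategy. A hedged variant, forwarding the keyword
# through compute_metrics the way the log_loss example above forwards
# eps/normalize:
mean_error = SyncableMetrics.compute_metrics(clf, precision_score, y_test,
                                             y_pred, x_test, '', '',
                                             average='weighted')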
    'hidden_layer_sizes': [(1, ), (1, ), (1, 1, 1, )]
}

mlp = MLPClassifier(verbose=10, learning_rate='adaptive')
clf = GridSearchCV(mlp, params, verbose=10, n_jobs=1, cv=2)
clf.fit_sync(x_train, y_train)

print('Finished with grid search with best mean cross-validated score:',
      clf.best_score_)
print('Best params appeared to be', clf.best_params_)

joblib.dump(clf, PATH)

y_pred = clf.predict_sync(x_test)
score = SyncableMetrics.compute_metrics(clf, accuracy_score, y_test, y_pred,
                                        x_train, "", "")

clf = clf.best_estimator_
print('Test accuracy:', clf.score(x_test, y_test))

# datasets = {
#     "train": Dataset("/path/to/train", {"num_cols": 15, "dist": "random"}),
#     "test": Dataset("/path/to/test", {"num_cols": 15, "dist": "gaussian"})
# }
# model = "model_obj"
# model_type = "NN"
# mdb_model1 = Model(model_type, model, "./model.pkl")
# model_config1 = ModelConfig(model_type, {"l1": 10})
# model_metrics1 = ModelMetrics({"accuracy": 0.8})
# syncer_obj.sync_datasets(datasets)
# syncer_obj.sync_model("train", model_config1, mdb_model1)
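
# A short sketch for restoring the persisted grid-search object later,
# assuming the same joblib import used for the dump above:
restored = joblib.load(PATH)
print('Restored test accuracy:',
      restored.best_estimator_.score(x_test, y_test))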