def run_otto_workflow():
    """Run the Kaggle Otto classification workflow and sync it.

    Loads ``otto-train.csv`` from ``DATA_PATH``, drops the ``id`` and
    ``target`` columns, label-encodes the target, and makes an 80/20
    train/test split. It then fits a bagged random forest (no calibration)
    and an isotonic-calibrated random forest, passing the log loss function
    and each model's test-set probabilities to
    ``SyncableMetrics.compute_metrics``.

    NOTE(review): the ``*_sync`` methods and ``add_tag`` calls appear to be
    tracking-enabled variants supplied by the syncer library — confirm
    against that library's documentation.

    Returns:
        A tuple ``(syncer_obj, x_train, x_test)``: the syncer used for the
        run plus the training and testing feature sets.
    """
    name = "test1"
    author = "author"
    description = "kaggle-otto-script"

    # Creating a new project
    syncer_obj = Syncer(
        NewOrExistingProject(name, author, description),
        NewOrExistingExperiment("expName", "expDesc"),
        NewExperimentRun("otto test"))

    # Import Data
    # Note: This dataset is not included in the repo because of Kaggle
    # restrictions.
    # It can be downloaded from
    # https://www.kaggle.com/c/otto-group-product-classification-challenge/data
    X = pd.read_csv_sync(DATA_PATH + 'otto-train.csv')
    syncer_obj.add_tag(X, "original otto csv data")
    X = X.drop_sync('id', axis=1)
    syncer_obj.add_tag(X, "dropped id column")

    # Extract target
    # Encode it to make it manageable by ML algo
    y = X.target.values
    y = LabelEncoder().fit_transform_sync(y)

    # Remove target from train, else it's too easy ...
    X = X.drop_sync('target', axis=1)
    syncer_obj.add_tag(X, "data with dropped id and target columns")

    # Split Train / Test
    x_train, x_test, y_train, y_test = cross_validation.train_test_split_sync(
        X, y, test_size=0.20, random_state=36)
    syncer_obj.add_tag(x_test, "testing data")
    syncer_obj.add_tag(x_train, "training data")

    # First, we train and apply a Random Forest WITHOUT calibration.
    # We use a BaggingClassifier to make 5 predictions and average them,
    # because that is what CalibratedClassifierCV does behind the scenes,
    # and we want to compare things fairly, i.e. be sure that averaging
    # several models is not what explains a performance difference between
    # no calibration and calibration.
    clf = RandomForestClassifier(n_estimators=50, n_jobs=-1)
    clfbag = BaggingClassifier(clf, n_estimators=5)
    clfbag.fit_sync(x_train, y_train)
    y_preds = clfbag.predict_proba_sync(x_test)
    # The loss is handed to SyncableMetrics instead of being printed; to
    # inspect it directly, evaluate
    # log_loss(y_test, y_preds, eps=1e-15, normalize=True).
    SyncableMetrics.compute_metrics(
        clfbag, log_loss, y_test, y_preds, x_test, "", "",
        eps=1e-15, normalize=True)

    # Now, we train and apply a Random Forest WITH calibration.
    # In our case, 'isotonic' worked better than the default 'sigmoid'.
    # This is not always the case. Depending on the case, you have to test
    # the two possibilities.
    clf = RandomForestClassifier(n_estimators=50, n_jobs=-1)
    calibrated_clf = CalibratedClassifierCV(clf, method='isotonic', cv=5)
    calibrated_clf.fit_sync(x_train, y_train)
    y_preds = calibrated_clf.predict_proba_sync(x_test)
    # Same metric call as above, now for the calibrated model.
    SyncableMetrics.compute_metrics(
        calibrated_clf, log_loss, y_test, y_preds, x_test, "", "",
        eps=1e-15, normalize=True)

    print(" ")
    # The adjacent literals are concatenated by the parser, so the space
    # between "improved" and "performance" must be written explicitly.
    # NOTE(review): this message is printed unconditionally; the two losses
    # are not compared here.
    print("Conclusion : in our case, calibration improved "
          "performance a lot ! (reduced loss)")

    syncer_obj.sync()
    return syncer_obj, x_train, x_test