def run_random_forest_pipeline():
    """Grid-search a random forest, log the run through ``KH``, and save test predictions.

    The function loads the train/validation/test splits from ``XYLOADER()``,
    wraps a ``RandomForestClassifier`` in a single-step ``Pipeline``, tunes it
    with a 3-fold ``GridSearchCV``, records the training milestones and the
    best parameters/estimator/score via ``KH.record_metric``, scores the
    fitted search on the validation split, and finally hands the test-set
    predictions to ``KH.save_test_predictions``.

    Returns:
        The fitted ``GridSearchCV`` instance.
    """
    ###### DATA LOADING
    # NOTE(review): XYLOADER is not defined inside this function; presumably a
    # module-level alias for one of the loader functions -- confirm.
    xy = XYLOADER()  # CAN CHANGE
    X = xy["X_train"]
    y = xy["y_train"]
    X_val = xy["X_val"]
    y_val = xy["y_val"]
    X_test = xy["X_test"]
    output_index = xy["X_test_index"]
    print("LOADED DATA")

    ###### PIPELINE/CV VARIABLES
    ###### DO NOT CHANGE BEFORE
    clf = RandomForestClassifier()
    fl = X.shape[1]  # use for n_components
    cv_grid = {
        "clf__n_estimators": [100],
        "clf__criterion": ["gini"],
        # scikit-learn requires an integer min_samples_split to be >= 2. The
        # former np.linspace(1, 20, 3).astype(int) grid evaluated to
        # [1, 10, 20], so every fit for the first candidate was invalid.
        "clf__min_samples_split": [2, 10, 20],
        # Evaluates to [1, 10, 20]; 1 is a valid min_samples_leaf.
        "clf__min_samples_leaf": np.linspace(1, 20, 3).astype(int),
    }
    num_folds = 3

    ####### START PREDICTIONS
    print("TRAINING ESTIMATOR")
    pred_pipe = Pipeline(steps=[("clf", clf)])
    ###### DO NOT CHANGE AFTER

    estimator = GridSearchCV(pred_pipe, cv_grid, cv=num_folds)

    # DO NOT NEED TO CHANGE BEYOND THIS LINE
    KH.start_pipeline()
    # NOTE(review): preprocess_called is not defined inside this function;
    # presumably a module-level value naming the preprocessing used -- confirm.
    KH.record_metric("validation", "start", "prepross", preprocess_called, "", "")
    KH.record_metric("validation", "start", estimator, "training", "", "")
    estimator.fit(X, y)
    KH.record_metric("validation", "end", estimator, "training", "", "")
    KH.record_metric("validation", "end", estimator, "best_params", str(estimator.best_params_), "NA")
    KH.record_metric("validation", "end", estimator, "best_estimator", str(estimator.best_estimator_), "NA")
    KH.record_metric("validation", "end", estimator, "best_score", str(estimator.best_score_), "NA")

    # Score the refit best estimator on the held-out validation split.
    validation_score = str(estimator.score(X_val, y_val))
    KH.record_metric("validation", "end", estimator, "validation score", validation_score, "")

    # Predict the test split and persist the converted predictions.
    preds = estimator.predict(X_test)
    predictions = pd.DataFrame({"VisitNumber": output_index, "TripType": preds})
    KH.save_test_predictions(utils.convert_predictions(predictions), estimator, "predictions")
    KH.end_pipeline()
    return estimator
def run_knn_pipeline():
    """Tune a k-nearest-neighbours classifier and save its test predictions.

    Data comes from ``loader.XY4(KH)``. A one-step ``Pipeline`` around
    ``KNeighborsClassifier`` is searched with a 3-fold ``GridSearchCV`` over
    the distance metric and the neighbour count. Training milestones, the best
    parameters/estimator/score and the validation score are reported through
    ``KH.record_metric``; test predictions go to ``KH.save_test_predictions``.

    Returns:
        The fitted ``GridSearchCV`` instance.
    """
    # ---- data loading (the loader choice is the part meant to be swapped) ----
    xy = loader.XY4(KH)
    split_keys = ("X_train", "y_train", "X_val", "y_val", "X_test", "X_test_index")
    X, y, X_val, y_val, X_test, output_index = (xy[key] for key in split_keys)
    print("LOADED DATA")

    # ---- pipeline / cross-validation settings ----
    n_features = X.shape[1]  # kept available for an n_components setting
    param_grid = {
        "clf__metric": ["euclidean", "manhattan"],
        "clf__n_neighbors": [10, 100, 1000],
    }
    fold_count = 3

    print("TRAINING ESTIMATOR")
    knn_pipeline = Pipeline(steps=[("clf", KNeighborsClassifier())])
    search = GridSearchCV(knn_pipeline, param_grid, cv=fold_count)

    # ---- fit and report ----
    # NOTE(review): KH.start_pipeline() is not called here although
    # KH.end_pipeline() is; possibly loader.XY4(KH) starts the run -- confirm.
    KH.record_metric("validation", "start", search, "training", "", "")
    search.fit(X, y)
    KH.record_metric("validation", "end", search, "training", "", "")

    # Each fitted attribute is read immediately before its own record call.
    for label in ("best_params", "best_estimator", "best_score"):
        KH.record_metric("validation", "end", search, label, str(getattr(search, label + "_")), "NA")

    val_score_text = str(search.score(X_val, y_val))
    KH.record_metric("validation", "end", search, "validation score", val_score_text, "")

    # ---- test-set predictions ----
    test_preds = search.predict(X_test)
    prediction_frame = pd.DataFrame({"VisitNumber": output_index, "TripType": test_preds})
    KH.save_test_predictions(utils.convert_predictions(prediction_frame), search, "predictions")
    KH.end_pipeline()
    return search