Exemplo n.º 1
0
def run_random_forest_pipeline():
    """Grid-search a random forest, log metrics through ``KH``, save predictions.

    Loads the train/validation/test splits from ``XYLOADER()``, runs a 3-fold
    ``GridSearchCV`` over a single-step ``Pipeline`` wrapping a
    ``RandomForestClassifier``, records the search results and the held-out
    validation score with ``KH.record_metric``, and hands the converted
    test-set predictions to ``KH.save_test_predictions``.

    ``XYLOADER``, ``KH``, ``utils`` and ``preprocess_called`` are not defined
    in this function; they are expected to exist at module level.

    Returns:
        The fitted ``GridSearchCV`` instance.
    """
    ###### DATA LOADING
    xy = XYLOADER()  # CAN CHANGE

    X = xy["X_train"]
    y = xy["y_train"]
    X_val = xy["X_val"]
    y_val = xy["y_val"]
    X_test = xy["X_test"]
    output_index = xy["X_test_index"]
    print("LOADED DATA")

    ###### PIPELINE/CV VARIABLES
    ###### DO NOT CHANGE BEFORE
    clf = RandomForestClassifier()
    cv_grid = {
        "clf__n_estimators": [100],
        "clf__criterion": ["gini"],
        # An integer min_samples_split must be >= 2 (a node needs two samples
        # to be split); starting the range at 1 produced an invalid candidate
        # whose fits all failed. The grid is now [2, 11, 20].
        "clf__min_samples_split": np.linspace(2, 20, 3).astype(int),
        # min_samples_leaf == 1 is legal, so this range still starts at 1.
        "clf__min_samples_leaf": np.linspace(1, 20, 3).astype(int),
    }
    num_folds = 3

    ####### START PREDICTIONS
    print("TRAINING ESTIMATOR")
    pred_pipe = Pipeline(steps=[("clf", clf)])

    ###### DO NOT CHANGE AFTER
    estimator = GridSearchCV(pred_pipe, cv_grid, cv=num_folds)

    # DO NOT NEED TO CHANGE BEYOND THIS LINE
    KH.start_pipeline()
    KH.record_metric("validation", "start", "prepross", preprocess_called, "", "")
    KH.record_metric("validation", "start", estimator, "training", "", "")
    estimator.fit(X, y)
    KH.record_metric("validation", "end", estimator, "training", "", "")
    KH.record_metric("validation", "end", estimator, "best_params", str(estimator.best_params_), "NA")
    KH.record_metric("validation", "end", estimator, "best_estimator", str(estimator.best_estimator_), "NA")
    KH.record_metric("validation", "end", estimator, "best_score", str(estimator.best_score_), "NA")
    # Score the refit best estimator on the held-out validation split.
    validation_score = str(estimator.score(X_val, y_val))
    KH.record_metric("validation", "end", estimator, "validation score", validation_score, "")

    preds = estimator.predict(X_test)
    predictions = pd.DataFrame({"VisitNumber": output_index, "TripType": preds})
    KH.save_test_predictions(utils.convert_predictions(predictions), estimator, "predictions")
    KH.end_pipeline()

    return estimator
Exemplo n.º 2
0
def run_knn_pipeline():
    """Tune a k-nearest-neighbours classifier by grid search and export predictions.

    Loads the data splits through ``loader.XY4(KH)``, runs a 3-fold
    ``GridSearchCV`` over the distance metric and the neighbour count,
    reports the search results and the held-out validation score through
    ``KH.record_metric``, and passes the converted test-set predictions to
    ``KH.save_test_predictions``.

    ``KH``, ``loader`` and ``utils`` are resolved from the enclosing module.
    NOTE(review): ``KH.end_pipeline()`` is called here without a visible
    ``KH.start_pipeline()``; presumably ``loader.XY4(KH)`` starts it - confirm.

    Returns:
        The fitted ``GridSearchCV`` object.
    """
    # --- Data loading (the loader call is the part intended to be swapped).
    data = loader.XY4(KH)
    X, y = data["X_train"], data["y_train"]
    X_val, y_val = data["X_val"], data["y_val"]
    X_test = data["X_test"]
    test_index = data["X_test_index"]
    print("LOADED DATA")

    # --- Model and search space.
    knn = KNeighborsClassifier()
    n_features = X.shape[1]  # not used below; available for n_components-style grids
    param_grid = {
        "clf__metric": ["euclidean", "manhattan"],
        "clf__n_neighbors": [10, 100, 1000],
    }
    n_splits = 3

    print("TRAINING ESTIMATOR")
    estimator = GridSearchCV(Pipeline(steps=[("clf", knn)]), param_grid, cv=n_splits)

    # --- Fit, bracketing the training run with start/end metrics.
    KH.record_metric("validation", "start", estimator, "training", "", "")
    estimator.fit(X, y)
    KH.record_metric("validation", "end", estimator, "training", "", "")

    # Log each search result; the fitted attribute is the label plus "_".
    for label in ("best_params", "best_estimator", "best_score"):
        KH.record_metric("validation", "end", estimator, label, str(getattr(estimator, label + "_")), "NA")

    # Score the refit best estimator on the held-out validation split.
    KH.record_metric(
        "validation", "end", estimator, "validation score", str(estimator.score(X_val, y_val)), ""
    )

    # --- Test-set predictions.
    test_preds = estimator.predict(X_test)
    submission = pd.DataFrame({"VisitNumber": test_index, "TripType": test_preds})
    KH.save_test_predictions(utils.convert_predictions(submission), estimator, "predictions")
    KH.end_pipeline()

    return estimator