Exemplo n.º 1
0
def run_model(model_A, model_B, data_set_name, mode, cutoff_filename=""):
    """
    Runs the most recent update model for each day
    in the dataset and returns the result to a
    csv file in the python directory.
    """

    # Load list of folder names which each contain a day
    if data_set_name == "InitialTrainingSet_rev1":
        days_list = fn.folder_names_init_set()
    elif data_set_name == "PublicLeaderboardSet":
        days_list = fn.folder_names_test_set()
    else:
        days_list = []
        print "Problem with data set name!"

    # Fin_X contains the final predictions for model X
    fin_A = fd.FlightPredictions()

    # If we have a second model to compare against, to check for
    # improvements or anything like that load it into B
    if model_B != None:
        fin_B = fd.FlightPredictions()

    print "Using mode: {}".format(mode)
    print "Using data from {}".format(data_set_name)

    # Loop through all of the days in the data set
    for i, d in enumerate(days_list):
        if model_B == None:
            print "Running model '{}' on day {} (day {} of {}):".format(model_A, d, i + 1, len(days_list))
        else:
            print "Running models '{}', '{}' on day {} (day {} of {}):".format(model_A,
                model_B, d, i + 1, len(days_list))

        # Initialize all the information about each day.
        # Extended means we include the flight history events file
        # day = efd.ExtendedFlightDay(d, data_set_name, mode, cutoff_filename)
        day = fd.FlightDay(d, data_set_name, mode, cutoff_filename)
        # day = ad.ASDIDay(d, data_set_name, mode, cutoff_filename)
        # day = ead.ExtendedASDIDay(d, data_set_name, mode, cutoff_filename)

        # Compute the predicitons for the day
        fin_A = return_predictions(model_A, day, fin_A)

        if model_B != None:
            fin_B = return_predictions(model_B, day, fin_B)

        print "\tDay {} has finished".format(d)
        print ""

    print "All days in {} are done!".format(data_set_name)

    if "leaderboard" in mode:
        # In leaderboard mode we just write the predictions to a csv file
        # for submission to kaggle
        fin_A.flight_predictions = fin_A.flight_predictions.sort(columns='flight_history_id')
        fin_A.flight_predictions.to_csv('test.csv', index=False)
        print "Predictions written to csv file in Python folder."
        if model_B != None:
            print "Warning: we have disregarded the output of '{}'!".format(model_B)

    elif "training" in mode:
        # In training mode we can calculate the root mean squared error as we
        # know the true values
        score_A = rmse.calculate_rmse_score(fin_A.flight_predictions, fin_A.test_data)

        if model_B != None:
            score_B = rmse.calculate_rmse_score(fin_B.flight_predictions, fin_B.test_data)
        else:
            score_B = None

        scores = {str(model_A): score_A,
                  str(model_B): score_B}

        # Write the scores to the score log for record keeping
        # See if we are making improvements
        log_predictions(day, model_A, model_B, scores, "scores.log")

        return scores

    else:
        print "Not an option!"
def r_forest():

    [X_train, y_train, ind_train] = load_and_format_data('all_combined_no_dates')
    [X_test, y_test, ind_test] = load_and_format_data('all_combined_no_dates')

    print len(X_train)


    # y_train_runway = y_train['actual_runway_arrival_minutes_after_midnight']
    # y_train_gate   = y_train['actual_gate_arrival_minutes_after_midnight']


    forest = RandomForestRegressor(n_estimators=2, random_state=None, n_jobs=-1)
    # forest = ExtraTreesRegressor(n_estimators=200, random_state=None, n_jobs=-1)
    # forest = GradientBoostingRegressor(n_estimators=200,
    #     learn_rate=0.1, max_depth=5, random_state=None, loss='ls')


    # forest.fit(X_train, y_train_runway)
    # y_pred_runway = forest.predict(X_pred)

    # forest.fit(X_train, y_train_gate)
    # y_pred_gate = forest.predict(X_pred)

    # forest.fit(X_train, y_train)
    # y_pred = forest.predict(X_pred)


    # y_pred_runway = y_pred[:,0]
    # y_pred_gate = y_pred[:,1]


    # pred = fd.FlightPredictions()

    # pred.flight_predictions = pred.flight_predictions.reindex(range(len(ind_pred)))

    # pred.flight_predictions['flight_history_id']     = ind_pred
    # pred.flight_predictions['actual_runway_arrival'] = y_pred_runway
    # pred.flight_predictions['actual_gate_arrival']   = y_pred_gate

    # pred.flight_predictions = pred.flight_predictions.sort(columns='flight_history_id')

    # pred.flight_predictions.to_csv('test_rand_forest.csv', index=False)

    score = []
    kfold = cross_validation.KFold(n=len(X_train), k=2, indices=False, shuffle=True)

    for i, (traincv, testcv) in enumerate(kfold):

            print i

            pred = fd.FlightPredictions()
            y_pred = []; ind_pred = []; y_pred_runway = []; y_pred_gate = []

            print "Starting training...",
            forest.fit(X_train[traincv], y_train[traincv])
            print "done"
            print "Starting prediction...",
            y_pred = forest.predict(X_train[testcv])
            print "done"

            ind_pred = ind_train[testcv].values

            y_pred_runway = y_pred[:,0]
            y_pred_gate = y_pred[:,1]

            pred.flight_predictions = \
                pred.flight_predictions.reindex(range(len(ind_pred)))
            pred.test_data = \
                pred.test_data.reindex(range(len(ind_pred)))

            pred.flight_predictions['flight_history_id']     = ind_pred
            pred.flight_predictions['actual_runway_arrival'] = y_pred_runway
            pred.flight_predictions['actual_gate_arrival']   = y_pred_gate

            pred.test_data['flight_history_id']     = ind_pred
            pred.test_data['actual_runway_arrival'] = \
                y_train['actual_runway_arrival_minutes_after_midnight'][testcv].values
            pred.test_data['actual_gate_arrival']   = \
                y_train['actual_gate_arrival_minutes_after_midnight'][testcv].values

            score.append(rmse.calculate_rmse_score(pred.flight_predictions, pred.test_data))

    print score

    print np.mean(score)
    print np.std(score)
Exemplo n.º 3
0
def r_forest():

    [X_train, y_train,
     ind_train] = load_and_format_data('all_combined_no_dates')
    [X_test, y_test, ind_test] = load_and_format_data('all_combined_no_dates')

    print len(X_train)

    # y_train_runway = y_train['actual_runway_arrival_minutes_after_midnight']
    # y_train_gate   = y_train['actual_gate_arrival_minutes_after_midnight']

    forest = RandomForestRegressor(n_estimators=2,
                                   random_state=None,
                                   n_jobs=-1)
    # forest = ExtraTreesRegressor(n_estimators=200, random_state=None, n_jobs=-1)
    # forest = GradientBoostingRegressor(n_estimators=200,
    #     learn_rate=0.1, max_depth=5, random_state=None, loss='ls')

    # forest.fit(X_train, y_train_runway)
    # y_pred_runway = forest.predict(X_pred)

    # forest.fit(X_train, y_train_gate)
    # y_pred_gate = forest.predict(X_pred)

    # forest.fit(X_train, y_train)
    # y_pred = forest.predict(X_pred)

    # y_pred_runway = y_pred[:,0]
    # y_pred_gate = y_pred[:,1]

    # pred = fd.FlightPredictions()

    # pred.flight_predictions = pred.flight_predictions.reindex(range(len(ind_pred)))

    # pred.flight_predictions['flight_history_id']     = ind_pred
    # pred.flight_predictions['actual_runway_arrival'] = y_pred_runway
    # pred.flight_predictions['actual_gate_arrival']   = y_pred_gate

    # pred.flight_predictions = pred.flight_predictions.sort(columns='flight_history_id')

    # pred.flight_predictions.to_csv('test_rand_forest.csv', index=False)

    score = []
    kfold = cross_validation.KFold(n=len(X_train),
                                   k=2,
                                   indices=False,
                                   shuffle=True)

    for i, (traincv, testcv) in enumerate(kfold):

        print i

        pred = fd.FlightPredictions()
        y_pred = []
        ind_pred = []
        y_pred_runway = []
        y_pred_gate = []

        print "Starting training...",
        forest.fit(X_train[traincv], y_train[traincv])
        print "done"
        print "Starting prediction...",
        y_pred = forest.predict(X_train[testcv])
        print "done"

        ind_pred = ind_train[testcv].values

        y_pred_runway = y_pred[:, 0]
        y_pred_gate = y_pred[:, 1]

        pred.flight_predictions = \
            pred.flight_predictions.reindex(range(len(ind_pred)))
        pred.test_data = \
            pred.test_data.reindex(range(len(ind_pred)))

        pred.flight_predictions['flight_history_id'] = ind_pred
        pred.flight_predictions['actual_runway_arrival'] = y_pred_runway
        pred.flight_predictions['actual_gate_arrival'] = y_pred_gate

        pred.test_data['flight_history_id'] = ind_pred
        pred.test_data['actual_runway_arrival'] = \
            y_train['actual_runway_arrival_minutes_after_midnight'][testcv].values
        pred.test_data['actual_gate_arrival']   = \
            y_train['actual_gate_arrival_minutes_after_midnight'][testcv].values

        score.append(
            rmse.calculate_rmse_score(pred.flight_predictions, pred.test_data))

    print score

    print np.mean(score)
    print np.std(score)