示例#1
0
def asdi_test():
    """
    Build an ExtendedASDIDay for the first folder of the initial set.

    The constructed object is not kept or returned; the call is made
    only for whatever work the constructor itself performs.
    """
    from models import extended_asdiday as ead

    day_folders = fn.folder_names_init_set()

    ead.ExtendedASDIDay(
        day_folders[0],
        "InitialTrainingSet_rev1",
        "training",
        "cutoff_time_list_my_cutoff.csv",
    )
示例#2
0
def asdi_test():
    """
    Instantiate ExtendedASDIDay on the first initial-set day folder.

    Nothing is returned and the instance is discarded straight away.
    """
    from models import extended_asdiday as ead

    folders = fn.folder_names_init_set()
    constructor_args = (
        "InitialTrainingSet_rev1",          # data set name
        "training",                         # mode
        "cutoff_time_list_my_cutoff.csv",   # cutoff file
    )

    ead.ExtendedASDIDay(folders[0], *constructor_args)
    def __init__(self, data):
        data_set_name = "InitialTrainingSet_rev1"
        self.flight_history = pd.DataFrame(None)
        self.parsed_fhe = pd.DataFrame(None)

        if data == "flight_history":
            print "AllTrainingData Initializing: using data {}".format(data)
            for f in fn.folder_names_init_set():
                print "\tLoading flight_history.csv folder {}...".format(f),
                temp = \
                    pd.read_csv("../Data/" + data_set_name + \
                    "/" + f + "/" + "FlightHistory/flighthistory.csv",
                    converters = dut.get_flight_history_date_converter())
                self.flight_history = pd.concat([self.flight_history, temp])
                print "done"

        if data == "parsed_fhe":
            print "AllTrainingData Initializing: using data {}".format(data)
            for f in fn.folder_names_init_set():
                print "\tLoading parsed_fhe.csv file {}...".format(f),
                temp = \
                    pd.read_csv('output_csv/parsed_fhe_' + f + '_' + "all" + '_filtered.csv',
                    # might have to fix to work with test data?
                    na_values=["MISSING"], keep_default_na=False,
                    parse_dates=[9,10,11,12,13,14,15,16,17,18,27,28,29,30,31,32,33,34,35,36,37,38,43,47])
                self.parsed_fhe = pd.concat([self.parsed_fhe, temp])
                print "done"

        if data == "parsed_fhe_test":
            print "AllTrainingData Initializing: using data {}".format(data)
            for f in fn.folder_names_init_set():
                print "\tLoading parsed_fhe.csv file {}...".format(f),
                temp = \
                    pd.read_csv('output_csv/parsed_fhe_' + f + '_' + "test" + '_filtered.csv',
                    na_values=["MISSING"], keep_default_na=False,
                    parse_dates=[9,10,11,12,13,14,15,16,17,18,27,28,29,30,31,32,33,34,35,36,37,38,43,47])
                self.parsed_fhe = pd.concat([self.parsed_fhe, temp])
                print "done"

        if data == "parsed_fhe_no_dates":
            print "AllTrainingData Initializing: using data {}".format(data)
            for f in fn.folder_names_init_set():
                print "\tLoading parsed_fhe.csv file {}...".format(f),
                temp = \
                    pd.read_csv('output_csv/parsed_fhe_' + f + '_' + "all" + '_filtered.csv',
                    # might have to fix to work with test data?
                    na_values=["MISSING"], keep_default_na=False)
                self.parsed_fhe = pd.concat([self.parsed_fhe, temp])
                print "done"

        if data == "parsed_fhe_test_no_dates":
            print "AllTrainingData Initializing: using data {}".format(data)
            for f in fn.folder_names_test_set():
                print "\tLoading parsed_fhe_test.csv file {}...".format(f),
                temp = \
                    pd.read_csv('output_csv/parsed_fhe_' + f + '_' + "test" + '_filtered.csv',
                    # might have to fix to work with test data?
                    na_values=["MISSING"], keep_default_na=False)
                self.parsed_fhe = pd.concat([self.parsed_fhe, temp])
                print "done"

        if data == "parsed_fhe_test_no_dates_with_best":
            print "AllTrainingData Initializing: using data {}".format(data)
            for f in fn.folder_names_test_set():
                print "\tLoading parsed_fhe_test.csv file {}...".format(f),
                temp = \
                    pd.read_csv('output_csv/parsed_fhe_' + f + '_' + "test" + \
                    '_filtered_with_dates_with_best_prediction.csv',
                    # might have to fix to work with test data?
                    na_values=["MISSING"], keep_default_na=False)
                self.parsed_fhe = pd.concat([self.parsed_fhe, temp])
                print "done"
示例#4
0
def run_model(model_A, model_B, data_set_name, mode, cutoff_filename=""):
    """
    Runs the most recent update model for each day
    in the dataset and returns the result to a
    csv file in the python directory.
    """

    # Load list of folder names which each contain a day
    if data_set_name == "InitialTrainingSet_rev1":
        days_list = fn.folder_names_init_set()
    elif data_set_name == "PublicLeaderboardSet":
        days_list = fn.folder_names_test_set()
    else:
        days_list = []
        print "Problem with data set name!"

    # Fin_X contains the final predictions for model X
    fin_A = fd.FlightPredictions()

    # If we have a second model to compare against, to check for
    # improvements or anything like that load it into B
    if model_B != None:
        fin_B = fd.FlightPredictions()

    print "Using mode: {}".format(mode)
    print "Using data from {}".format(data_set_name)

    # Loop through all of the days in the data set
    for i, d in enumerate(days_list):
        if model_B == None:
            print "Running model '{}' on day {} (day {} of {}):".format(model_A, d, i + 1, len(days_list))
        else:
            print "Running models '{}', '{}' on day {} (day {} of {}):".format(model_A,
                model_B, d, i + 1, len(days_list))

        # Initialize all the information about each day.
        # Extended means we include the flight history events file
        # day = efd.ExtendedFlightDay(d, data_set_name, mode, cutoff_filename)
        day = fd.FlightDay(d, data_set_name, mode, cutoff_filename)
        # day = ad.ASDIDay(d, data_set_name, mode, cutoff_filename)
        # day = ead.ExtendedASDIDay(d, data_set_name, mode, cutoff_filename)

        # Compute the predicitons for the day
        fin_A = return_predictions(model_A, day, fin_A)

        if model_B != None:
            fin_B = return_predictions(model_B, day, fin_B)

        print "\tDay {} has finished".format(d)
        print ""

    print "All days in {} are done!".format(data_set_name)

    if "leaderboard" in mode:
        # In leaderboard mode we just write the predictions to a csv file
        # for submission to kaggle
        fin_A.flight_predictions = fin_A.flight_predictions.sort(columns='flight_history_id')
        fin_A.flight_predictions.to_csv('test.csv', index=False)
        print "Predictions written to csv file in Python folder."
        if model_B != None:
            print "Warning: we have disregarded the output of '{}'!".format(model_B)

    elif "training" in mode:
        # In training mode we can calculate the root mean squared error as we
        # know the true values
        score_A = rmse.calculate_rmse_score(fin_A.flight_predictions, fin_A.test_data)

        if model_B != None:
            score_B = rmse.calculate_rmse_score(fin_B.flight_predictions, fin_B.test_data)
        else:
            score_B = None

        scores = {str(model_A): score_A,
                  str(model_B): score_B}

        # Write the scores to the score log for record keeping
        # See if we are making improvements
        log_predictions(day, model_A, model_B, scores, "scores.log")

        return scores

    else:
        print "Not an option!"