def asdi_test():
    """Smoke test: build an ExtendedASDIDay for the first training day."""
    from models import extended_asdiday as ead
    fold = fn.folder_names_init_set()
    data_set_name = "InitialTrainingSet_rev1"
    cutoff_file = "cutoff_time_list_my_cutoff.csv"
    ead.ExtendedASDIDay(fold[0], data_set_name, "training", cutoff_file)
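
# fn.folder_names_init_set() is defined elsewhere in the project. Below is a
# minimal sketch of what such a helper might look like, assuming the per-day
# folders live under ../Data/<data_set_name>/ as the flight_history paths
# further down suggest. The listdir approach and the hardcoded root are
# assumptions for illustration, not the project's actual implementation.
def folder_names_init_set_sketch():
    """Illustrative only: list the per-day folder names in the training set."""
    import os
    root = os.path.join("..", "Data", "InitialTrainingSet_rev1")
    return sorted(d for d in os.listdir(root)
                  if os.path.isdir(os.path.join(root, d)))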
def __init__(self, data):
    """
    Load one of several pre-processed views of the training data.
    `data` selects which csv files are read and which attribute
    (flight_history or parsed_fhe) they are concatenated into.
    """
    data_set_name = "InitialTrainingSet_rev1"
    self.flight_history = pd.DataFrame(None)
    self.parsed_fhe = pd.DataFrame(None)
    # Column indices that hold timestamps in the parsed flight history
    # events files
    date_columns = [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 27, 28, 29, 30,
                    31, 32, 33, 34, 35, 36, 37, 38, 43, 47]
    print "AllTrainingData Initializing: using data {}".format(data)
    if data == "flight_history":
        for f in fn.folder_names_init_set():
            print "\tLoading flight_history.csv folder {}...".format(f),
            temp = pd.read_csv("../Data/" + data_set_name + "/" + f +
                               "/FlightHistory/flighthistory.csv",
                               converters=dut.get_flight_history_date_converter())
            self.flight_history = pd.concat([self.flight_history, temp])
            print "done"
    elif data == "parsed_fhe":
        for f in fn.folder_names_init_set():
            print "\tLoading parsed_fhe.csv file {}...".format(f),
            # might have to fix to work with test data?
            temp = pd.read_csv('output_csv/parsed_fhe_' + f + '_all_filtered.csv',
                               na_values=["MISSING"], keep_default_na=False,
                               parse_dates=date_columns)
            self.parsed_fhe = pd.concat([self.parsed_fhe, temp])
            print "done"
    elif data == "parsed_fhe_test":
        for f in fn.folder_names_init_set():
            print "\tLoading parsed_fhe.csv file {}...".format(f),
            temp = pd.read_csv('output_csv/parsed_fhe_' + f + '_test_filtered.csv',
                               na_values=["MISSING"], keep_default_na=False,
                               parse_dates=date_columns)
            self.parsed_fhe = pd.concat([self.parsed_fhe, temp])
            print "done"
    elif data == "parsed_fhe_no_dates":
        for f in fn.folder_names_init_set():
            print "\tLoading parsed_fhe.csv file {}...".format(f),
            # might have to fix to work with test data?
            temp = pd.read_csv('output_csv/parsed_fhe_' + f + '_all_filtered.csv',
                               na_values=["MISSING"], keep_default_na=False)
            self.parsed_fhe = pd.concat([self.parsed_fhe, temp])
            print "done"
    elif data == "parsed_fhe_test_no_dates":
        for f in fn.folder_names_test_set():
            print "\tLoading parsed_fhe_test.csv file {}...".format(f),
            # might have to fix to work with test data?
            temp = pd.read_csv('output_csv/parsed_fhe_' + f + '_test_filtered.csv',
                               na_values=["MISSING"], keep_default_na=False)
            self.parsed_fhe = pd.concat([self.parsed_fhe, temp])
            print "done"
    elif data == "parsed_fhe_test_no_dates_with_best":
        for f in fn.folder_names_test_set():
            print "\tLoading parsed_fhe_test.csv file {}...".format(f),
            # might have to fix to work with test data?
            temp = pd.read_csv('output_csv/parsed_fhe_' + f +
                               '_test_filtered_with_dates_with_best_prediction.csv',
                               na_values=["MISSING"], keep_default_na=False)
            self.parsed_fhe = pd.concat([self.parsed_fhe, temp])
            print "done"
    else:
        print "Unknown data option: {}".format(data)
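
# Hedged usage sketch for the loader above. The enclosing class name,
# AllTrainingData, is inferred from the loader's own log messages and is an
# assumption if the class is actually defined under another name.
def all_training_data_example():
    """Illustrative only: pull every parsed flight history event into memory."""
    events = AllTrainingData("parsed_fhe")  # one concatenated DataFrame
    print "Loaded {} parsed event rows".format(len(events.parsed_fhe))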
def run_model(model_A, model_B, data_set_name, mode, cutoff_filename=""):
    """
    Runs the most recent update model for each day in the data set and
    writes the result to a csv file in the python directory.
    """
    # Load the list of folder names, each of which contains one day
    if data_set_name == "InitialTrainingSet_rev1":
        days_list = fn.folder_names_init_set()
    elif data_set_name == "PublicLeaderboardSet":
        days_list = fn.folder_names_test_set()
    else:
        days_list = []
        print "Problem with data set name!"
    # fin_X contains the final predictions for model X
    fin_A = fd.FlightPredictions()
    # If we have a second model to compare against, e.g. to check for
    # improvements, give it its own predictions object
    if model_B is not None:
        fin_B = fd.FlightPredictions()
    print "Using mode: {}".format(mode)
    print "Using data from {}".format(data_set_name)
    # Loop through all of the days in the data set
    for i, d in enumerate(days_list):
        if model_B is None:
            print "Running model '{}' on day {} (day {} of {}):".format(
                model_A, d, i + 1, len(days_list))
        else:
            print "Running models '{}', '{}' on day {} (day {} of {}):".format(
                model_A, model_B, d, i + 1, len(days_list))
        # Initialize all the information about each day.
        # "Extended" means we include the flight history events file.
        # day = efd.ExtendedFlightDay(d, data_set_name, mode, cutoff_filename)
        day = fd.FlightDay(d, data_set_name, mode, cutoff_filename)
        # day = ad.ASDIDay(d, data_set_name, mode, cutoff_filename)
        # day = ead.ExtendedASDIDay(d, data_set_name, mode, cutoff_filename)
        # Compute the predictions for the day
        fin_A = return_predictions(model_A, day, fin_A)
        if model_B is not None:
            fin_B = return_predictions(model_B, day, fin_B)
        print "\tDay {} has finished".format(d)
        print ""
    print "All days in {} are done!".format(data_set_name)
    if "leaderboard" in mode:
        # In leaderboard mode we just write the predictions to a csv file
        # for submission to Kaggle. (DataFrame.sort(columns=...) is the old
        # pandas API; newer pandas spells this sort_values.)
        fin_A.flight_predictions = fin_A.flight_predictions.sort(
            columns='flight_history_id')
        fin_A.flight_predictions.to_csv('test.csv', index=False)
        print "Predictions written to csv file in Python folder."
        if model_B is not None:
            print "Warning: we have disregarded the output of '{}'!".format(model_B)
    elif "training" in mode:
        # In training mode we can calculate the root mean squared error,
        # since we know the true values
        score_A = rmse.calculate_rmse_score(fin_A.flight_predictions,
                                            fin_A.test_data)
        if model_B is not None:
            score_B = rmse.calculate_rmse_score(fin_B.flight_predictions,
                                                fin_B.test_data)
        else:
            score_B = None
        scores = {str(model_A): score_A, str(model_B): score_B}
        # Write the scores to the score log for record keeping, to see if
        # we are making improvements (`day` here is the last day processed)
        log_predictions(day, model_A, model_B, scores, "scores.log")
        return scores
    else:
        print "Not an option!"
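
# Hedged usage sketch for run_model above. The model identifier passed in is
# a placeholder: the real names are whatever return_predictions (defined
# elsewhere) dispatches on. The cutoff file name is the one used in asdi_test.
def run_model_example():
    """Illustrative only: score a model on training data, then write a submission."""
    cutoff_file = "cutoff_time_list_my_cutoff.csv"
    # Training mode returns a {model_name: rmse} dict and appends to scores.log
    scores = run_model("most_recent_update", None,
                       "InitialTrainingSet_rev1", "training", cutoff_file)
    print "Training scores: {}".format(scores)
    # Leaderboard mode writes test.csv for submission instead of scoring
    run_model("most_recent_update", None,
              "PublicLeaderboardSet", "leaderboard", cutoff_file)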