def test_get_day_boundaries(self):
    """utilities.get_day_boundaries: rejects non-UTC input and returns the
    24h window [cutoff day 09:00 UTC, next day 09:00 UTC) for offset -9."""
    # A non-UTC tzinfo (UTC+1 here) must be rejected with an AssertionError.
    self.assertRaises(
        AssertionError, utilities.get_day_boundaries,
        datetime(2012, 10, 15, tzinfo=tzoffset(None, 3600)))

    # Midday 2012-10-05 falls in the window starting 09:00 that same day.
    start_time, end_time = utilities.get_day_boundaries(
        datetime(2012, 10, 5, 12, 0, tzinfo=tzutc()), -9)
    self.assertEqual(start_time, datetime(2012, 10, 5, 9, 0, tzinfo=tzutc()))
    self.assertEqual(end_time, datetime(2012, 10, 6, 9, 0, tzinfo=tzutc()))

    # 01:00 is before 09:00, so it belongs to the previous day's window.
    start_time, end_time = utilities.get_day_boundaries(
        datetime(2012, 10, 5, 1, 0, tzinfo=tzutc()), -9)
    self.assertEqual(start_time, datetime(2012, 10, 4, 9, 0, tzinfo=tzutc()))
    self.assertEqual(end_time, datetime(2012, 10, 5, 9, 0, tzinfo=tzutc()))

    # 05:00 on the 6th also maps back into the window that began on the 5th.
    start_time, end_time = utilities.get_day_boundaries(
        datetime(2012, 10, 6, 5, 0, tzinfo=tzutc()), -9)
    self.assertEqual(start_time, datetime(2012, 10, 5, 9, 0, tzinfo=tzutc()))
    self.assertEqual(end_time, datetime(2012, 10, 6, 9, 0, tzinfo=tzutc()))
def raw_data_to_training_days(raw_data_path, training_days_path, cutoff_times):
    """Split the raw competition files into one directory per training day.

    Flight-history rows are routed to per-day files first; the flight ids
    collected there then filter the event and ASDI files, and the flight-plan
    ids from the ASDI flight-plan pass filter the plan child tables. Weather
    is processed last, one day at a time.
    """
    print(os.path.join(raw_data_path, "FlightHistory", "flighthistory.csv"))
    days_flight_ids = flighthistory.process_flight_history_to_train_day_files(
        input_path=os.path.join(raw_data_path, "FlightHistory",
                                "flighthistory.csv"),
        output_path=training_days_path,
        output_folder_name="FlightHistory",
        output_file_name="flighthistory.csv",
        cutoff_times=cutoff_times,
        start_hours_offset=-9)

    print("Flight History Events")
    utilities.split_file_based_on_times_filter_on_ids_streaming(
        os.path.join(raw_data_path, "FlightHistory", "flighthistoryevents.csv"),
        training_days_path, "FlightHistory", "flighthistoryevents.csv",
        "flight_history_id", days_flight_ids)

    # This pass both filters by flight id and collects the surviving
    # flight-plan ids, which key every remaining ASDI child table.
    print("ASDI Flight Plan")
    days_flight_plan_ids = utilities.split_file_based_on_times_filter_on_ids_streaming(
        os.path.join(raw_data_path, "ASDI", "asdiflightplan.csv"),
        training_days_path, "ASDI", "asdiflightplan.csv",
        "flighthistoryid", days_flight_ids,
        ids_to_track_column_name="asdiflightplanid")

    print("ASDI Position")
    utilities.split_file_based_on_times_filter_on_ids_streaming(
        os.path.join(raw_data_path, "ASDI", "asdiposition.csv"),
        training_days_path, "ASDI", "asdiposition.csv",
        "flighthistoryid", days_flight_ids)

    # The flight-plan child tables are all filtered the same way.
    plan_child_tables = (
        ("ASDI Airway", "asdiairway.csv"),
        ("ASDI FPFix", "asdifpfix.csv"),
        ("ASDI FPCenter", "asdifpcenter.csv"),
        ("ASDI FPSector", "asdifpsector.csv"),
        ("ASDI FPWaypoint", "asdifpwaypoint.csv"),
    )
    for label, file_name in plan_child_tables:
        print(label)
        utilities.split_file_based_on_times_filter_on_ids_streaming(
            os.path.join(raw_data_path, "ASDI", file_name),
            training_days_path, "ASDI", file_name,
            "asdiflightplanid", days_flight_plan_ids)

    # Weather is organised by day directly, one output directory per cutoff.
    for ct in cutoff_times:
        print(ct)
        day_output_path = os.path.join(training_days_path,
                                       utilities.get_day_str(ct, -9))
        day_beginning, day_end = utilities.get_day_boundaries(ct, -9)
        if not os.path.exists(day_output_path):
            os.makedirs(day_output_path)
        weather.process_one_day(raw_data_path, day_output_path,
                                day_beginning, day_end, "train")
def raw_data_to_training_days(raw_data_path, training_days_path, cutoff_times):
    """Split the raw competition files into one directory per training day.

    Flight-history rows are routed to per-day files first; the flight ids
    collected there then filter the event and ASDI files, and the flight-plan
    ids from the ASDI flight-plan pass filter the plan child tables. Weather
    is processed last, one day at a time.
    """
    print(os.path.join(raw_data_path, "FlightHistory", "flighthistory.csv"))
    days_flight_ids = flighthistory.process_flight_history_to_train_day_files(
        input_path=os.path.join(raw_data_path, "FlightHistory",
                                "flighthistory.csv"),
        output_path=training_days_path,
        output_folder_name="FlightHistory",
        output_file_name="flighthistory.csv",
        cutoff_times=cutoff_times,
        start_hours_offset=-9)

    print("Flight History Events")
    utilities.split_file_based_on_times_filter_on_ids_streaming(
        os.path.join(raw_data_path, "FlightHistory", "flighthistoryevents.csv"),
        training_days_path, "FlightHistory", "flighthistoryevents.csv",
        "flight_history_id", days_flight_ids)

    # This pass both filters by flight id and collects the surviving
    # flight-plan ids, which key every remaining ASDI child table.
    print("ASDI Flight Plan")
    days_flight_plan_ids = utilities.split_file_based_on_times_filter_on_ids_streaming(
        os.path.join(raw_data_path, "ASDI", "asdiflightplan.csv"),
        training_days_path, "ASDI", "asdiflightplan.csv",
        "flighthistoryid", days_flight_ids,
        ids_to_track_column_name="asdiflightplanid")

    print("ASDI Position")
    utilities.split_file_based_on_times_filter_on_ids_streaming(
        os.path.join(raw_data_path, "ASDI", "asdiposition.csv"),
        training_days_path, "ASDI", "asdiposition.csv",
        "flighthistoryid", days_flight_ids)

    print("ASDI Airway")
    utilities.split_file_based_on_times_filter_on_ids_streaming(
        os.path.join(raw_data_path, "ASDI", "asdiairway.csv"),
        training_days_path, "ASDI", "asdiairway.csv",
        "asdiflightplanid", days_flight_plan_ids)

    print("ASDI FPFix")
    utilities.split_file_based_on_times_filter_on_ids_streaming(
        os.path.join(raw_data_path, "ASDI", "asdifpfix.csv"),
        training_days_path, "ASDI", "asdifpfix.csv",
        "asdiflightplanid", days_flight_plan_ids)

    print("ASDI FPCenter")
    utilities.split_file_based_on_times_filter_on_ids_streaming(
        os.path.join(raw_data_path, "ASDI", "asdifpcenter.csv"),
        training_days_path, "ASDI", "asdifpcenter.csv",
        "asdiflightplanid", days_flight_plan_ids)

    print("ASDI FPSector")
    utilities.split_file_based_on_times_filter_on_ids_streaming(
        os.path.join(raw_data_path, "ASDI", "asdifpsector.csv"),
        training_days_path, "ASDI", "asdifpsector.csv",
        "asdiflightplanid", days_flight_plan_ids)

    print("ASDI FPWaypoint")
    utilities.split_file_based_on_times_filter_on_ids_streaming(
        os.path.join(raw_data_path, "ASDI", "asdifpwaypoint.csv"),
        training_days_path, "ASDI", "asdifpwaypoint.csv",
        "asdiflightplanid", days_flight_plan_ids)

    for ct in cutoff_times:
        print(ct)
        # BUG FIX: this referenced the undefined name `output_path`
        # (a NameError at runtime); the intended target is the function's
        # own `training_days_path` parameter, as in the sibling copy of
        # this function elsewhere in the project.
        day_output_path = os.path.join(training_days_path,
                                       utilities.get_day_str(ct, -9))
        day_beginning, day_end = utilities.get_day_boundaries(ct, -9)
        if not os.path.exists(day_output_path):
            os.makedirs(day_output_path)
        weather.process_one_day(raw_data_path, day_output_path,
                                day_beginning, day_end, "train")
def test_get_day_boundaries(self):
    """Exercise utilities.get_day_boundaries with a -9 hour day-start offset."""
    # Non-UTC input (UTC+1) must raise an AssertionError.
    self.assertRaises(
        AssertionError, utilities.get_day_boundaries,
        datetime(2012, 10, 15, tzinfo=tzoffset(None, 3600)))

    # (query time, expected window start, expected window end)
    cases = [
        (datetime(2012, 10, 5, 12, 0, tzinfo=tzutc()),
         datetime(2012, 10, 5, 9, 0, tzinfo=tzutc()),
         datetime(2012, 10, 6, 9, 0, tzinfo=tzutc())),
        (datetime(2012, 10, 5, 1, 0, tzinfo=tzutc()),
         datetime(2012, 10, 4, 9, 0, tzinfo=tzutc()),
         datetime(2012, 10, 5, 9, 0, tzinfo=tzutc())),
        (datetime(2012, 10, 6, 5, 0, tzinfo=tzutc()),
         datetime(2012, 10, 5, 9, 0, tzinfo=tzutc()),
         datetime(2012, 10, 6, 9, 0, tzinfo=tzutc())),
    ]
    for query_time, expected_start, expected_end in cases:
        start_time, end_time = utilities.get_day_boundaries(query_time, -9)
        self.assertEqual(start_time, expected_start)
        self.assertEqual(end_time, expected_end)
def process_flight_history_to_train_day_files(
        input_path, output_path, output_folder_name, output_file_name,
        cutoff_times, start_hours_offset=-9):
    """Stream the raw flight-history CSV into one file per training day.

    For every cutoff time a writer is opened under its day directory; input
    rows are bucketed by their departure-day string, buffered, and flushed
    every 100k input rows. Returns a dict mapping each cutoff time to the
    set of flight_history_id values written for that day.
    """
    # NOTE(review): never read afterwards — looks like dead state.
    file_started_for_day = {cutoff_time: False for cutoff_time in cutoff_times}
    i = 0    # count of completed 100k-row chunks (used only for progress logs)
    cnt = 0  # relevant rows seen in the current chunk
    departure_date_columns = get_flight_history_departure_date_columns()
    arrival_date_columns = get_flight_history_arrival_date_columns()

    reader = utilities.HeaderCsvReader(open(input_path))
    header_out = reader.get_header()
    for col in get_flight_history_columns_to_delete():
        header_out.remove(col)

    # Per-day output state: id sets, open handles, csv writers, and a map
    # from day string back to its cutoff time for row routing.
    day_flight_history_ids = {cutoff_time: set() for cutoff_time in cutoff_times}
    day_str_to_cutoff_time = {}
    file_handles = {}
    writers = {}
    for cutoff_time in cutoff_times:
        day_output_path = utilities.get_full_output_path(
            output_path, output_folder_name, cutoff_time)
        file_output_path = os.path.join(day_output_path, output_file_name)
        file_handles[cutoff_time] = open(file_output_path, "w")
        writers[cutoff_time] = csv.writer(file_handles[cutoff_time],
                                          dialect=utilities.CsvDialect())
        writers[cutoff_time].writerow(header_out)
        day_str_to_cutoff_time[utilities.get_day_str(cutoff_time)] = cutoff_time

    i_row_mod = 0  # rows consumed in the current chunk
    buffer_dict = {cutoff_time: [] for cutoff_time in cutoff_times}
    # NOTE(review): result unused; relies on the last loop value of
    # cutoff_time. Presumably kept for its UTC-validation assertion — confirm.
    start_time, end_time = utilities.get_day_boundaries(cutoff_time,
                                                        start_hours_offset)

    codes_file = os.path.join(os.environ["DataPath"], "GEFlight",
                              "Reference", "usairporticaocodes.txt")
    us_icao_codes = get_us_airport_icao_codes(codes_file)

    for row in reader:
        i_row_mod += 1
        if not is_flight_in_or_out_of_us(row, us_icao_codes):
            continue
        parse_flight_history_dates(row, departure_date_columns,
                                   arrival_date_columns)
        row_day_str = get_departure_day_str(row, start_hours_offset)
        if row_day_str not in day_str_to_cutoff_time:
            continue  # departure is on no requested training day
        cutoff_time = day_str_to_cutoff_time[row_day_str]
        cnt += 1
        buffer_dict[cutoff_time].append([row[col] for col in header_out])
        day_flight_history_ids[cutoff_time].add(row["flight_history_id"])
        if i_row_mod < 100000:
            continue
        # Chunk boundary: report progress and flush every day's buffer.
        i += 1
        print("%s: %d00k records processed, %d with relevant flights in this chunk"
              % (output_file_name, i, cnt))
        cnt = 0
        for cutoff_time in cutoff_times:
            writers[cutoff_time].writerows(buffer_dict[cutoff_time])
            file_handles[cutoff_time].flush()
        i_row_mod = 0
        buffer_dict = {cutoff_time: [] for cutoff_time in cutoff_times}

    # Drain the final (possibly partial) chunk and close all outputs.
    for cutoff_time in cutoff_times:
        writers[cutoff_time].writerows(buffer_dict[cutoff_time])
        file_handles[cutoff_time].close()
    return day_flight_history_ids
def training_day_to_test_day(training_day_path, test_day_path, solution_path,
                             cutoff_time):
    """Derive a test day (plus its solution file) from one training day.

    Time-stamped files are truncated at ``cutoff_time``; ASDI child tables,
    which are keyed by flight-plan id rather than time, are then restricted
    to the plans that survived the cutoff.
    """
    flight_history_csv = os.path.join(training_day_path, "FlightHistory",
                                      "flighthistory.csv")

    flighthistory.write_flight_history_test_day_file(
        flight_history_csv,
        os.path.join(
            utilities.get_output_subdirectory(test_day_path, "FlightHistory"),
            "flighthistory.csv"),
        cutoff_time)

    flighthistory.write_flight_history_test_day_and_solution_test_flights_only(
        flight_history_csv,
        os.path.join(test_day_path, "test_flights.csv"),
        os.path.join(solution_path,
                     utilities.get_day_str(cutoff_time) + "_solution.csv"),
        cutoff_time)

    utilities.filter_file_based_on_cutoff_time_streaming(
        os.path.join(training_day_path, "FlightHistory",
                     "flighthistoryevents.csv"),
        os.path.join(
            utilities.get_output_subdirectory(test_day_path, "FlightHistory"),
            "flighthistoryevents.csv"),
        "date_time_recorded", utilities.parse_datetime_format3, cutoff_time)

    utilities.filter_file_based_on_cutoff_time_streaming(
        os.path.join(training_day_path, "ASDI", "asdiposition.csv"),
        os.path.join(utilities.get_output_subdirectory(test_day_path, "ASDI"),
                     "asdiposition.csv"),
        "received", utilities.parse_datetime_format1, cutoff_time)

    # Also collects the ids of flight plans that survive the cutoff.
    flight_plan_ids = utilities.filter_file_based_on_cutoff_time_streaming(
        os.path.join(training_day_path, "ASDI", "asdiflightplan.csv"),
        os.path.join(utilities.get_output_subdirectory(test_day_path, "ASDI"),
                     "asdiflightplan.csv"),
        "updatetimeutc", utilities.parse_datetime_format2, cutoff_time,
        ids_to_track_column_name="asdiflightplanid")

    # Flight-plan child tables: keep only rows whose plan survived.
    for child_file in ("asdiairway.csv", "asdifpfix.csv", "asdifpcenter.csv",
                       "asdifpsector.csv", "asdifpwaypoint.csv"):
        utilities.filter_file_based_on_ids_streaming(
            os.path.join(training_day_path, "ASDI", child_file),
            os.path.join(
                utilities.get_output_subdirectory(test_day_path, "ASDI"),
                child_file),
            "asdiflightplanid", flight_plan_ids)

    # NOTE(review): every other call site passes an explicit -9 offset to
    # get_day_boundaries; here it is omitted — confirm the default matches.
    # day_end is unused: the weather pass ends at the cutoff, not day end.
    day_beginning, day_end = utilities.get_day_boundaries(cutoff_time)
    weather.process_one_day(
        training_day_path, test_day_path, day_beginning, cutoff_time, "test",
        cutoff_time=cutoff_time)
def process_flight_history_to_train_day_files(input_path, output_path,
                                              output_folder_name,
                                              output_file_name, cutoff_times,
                                              start_hours_offset=-9):
    """Stream the raw flight-history CSV into one file per training day.

    Parameters
    ----------
    input_path : path to the raw flighthistory.csv
    output_path : root directory holding one folder per training day
    output_folder_name / output_file_name : location of each day's output
    cutoff_times : tz-aware datetimes identifying the training days
    start_hours_offset : hours relative to midnight at which a "day" begins

    Rows are bucketed by departure-day string, buffered, and flushed every
    100k input rows. Returns a dict mapping each cutoff time to the set of
    flight_history_id values written for that day.
    """
    # (Removed an unused `file_started_for_day` dict that was built here
    # and never read.)
    chunk_no = 0  # completed 100k-row chunks, for progress logging
    cnt = 0       # relevant rows in the current chunk
    departure_date_columns = get_flight_history_departure_date_columns()
    arrival_date_columns = get_flight_history_arrival_date_columns()

    reader = utilities.HeaderCsvReader(open(input_path))
    header_out = reader.get_header()
    for col in get_flight_history_columns_to_delete():
        header_out.remove(col)

    # Per-day output state: id sets, open handles, csv writers, and a map
    # from day string back to its cutoff time for row routing.
    day_flight_history_ids = {
        cutoff_time: set() for cutoff_time in cutoff_times
    }
    day_str_to_cutoff_time = {}
    file_handles = {}
    writers = {}
    for cutoff_time in cutoff_times:
        day_output_path = utilities.get_full_output_path(
            output_path, output_folder_name, cutoff_time)
        file_output_path = os.path.join(day_output_path, output_file_name)
        file_handles[cutoff_time] = open(file_output_path, "w")
        writers[cutoff_time] = csv.writer(file_handles[cutoff_time],
                                          dialect=utilities.CsvDialect())
        writers[cutoff_time].writerow(header_out)
        day_str_to_cutoff_time[utilities.get_day_str(
            cutoff_time)] = cutoff_time

    i_row_mod = 0  # rows consumed in the current chunk
    buffer_dict = {cutoff_time: [] for cutoff_time in cutoff_times}
    # Result unused, but get_day_boundaries asserts its input is UTC
    # (see test_get_day_boundaries), so this acts as a sanity check on the
    # last cutoff time. Retained to preserve that validation side effect.
    start_time, end_time = utilities.get_day_boundaries(
        cutoff_time, start_hours_offset)

    codes_file = os.path.join(os.environ["DataPath"], "GEFlight",
                              "Reference", "usairporticaocodes.txt")
    us_icao_codes = get_us_airport_icao_codes(codes_file)

    for row in reader:
        i_row_mod += 1
        if not is_flight_in_or_out_of_us(row, us_icao_codes):
            continue
        parse_flight_history_dates(row, departure_date_columns,
                                   arrival_date_columns)
        row_day_str = get_departure_day_str(row, start_hours_offset)
        if row_day_str not in day_str_to_cutoff_time:
            continue  # departure is on no requested training day
        cutoff_time = day_str_to_cutoff_time[row_day_str]
        cnt += 1
        buffer_dict[cutoff_time].append([row[col] for col in header_out])
        day_flight_history_ids[cutoff_time].add(row["flight_history_id"])
        if i_row_mod < 100000:
            continue
        # Chunk boundary: report progress and flush every day's buffer.
        chunk_no += 1
        print(
            "%s: %d00k records processed, %d with relevant flights in this chunk"
            % (output_file_name, chunk_no, cnt))
        cnt = 0
        for cutoff_time in cutoff_times:
            writers[cutoff_time].writerows(buffer_dict[cutoff_time])
            file_handles[cutoff_time].flush()
        i_row_mod = 0
        buffer_dict = {cutoff_time: [] for cutoff_time in cutoff_times}

    # Drain the final (possibly partial) chunk and close all outputs.
    for cutoff_time in cutoff_times:
        writers[cutoff_time].writerows(buffer_dict[cutoff_time])
        file_handles[cutoff_time].close()
    return day_flight_history_ids
import os
import pandas
from datetime import datetime, timedelta
from dateutil import parser, tz
from geflight.transform import utilities
import pytz
import random
import weather

# Source (raw leaderboard dump) and destination (per-day training folders).
raw_data_path = os.path.join(os.environ["DataPath"], "GEFlight",
                             "RawPublicLeaderboard")
output_path = os.path.join(os.environ["DataPath"], "GEFlight", "Release 2",
                           "PublicLeaderboardTrainDays")

# Fourteen daily cutoff times starting 2012-11-26 20:00 UTC.
start_day = datetime(2012, 11, 26, 20, 0, tzinfo=tz.tzutc())
cutoff_times = [start_day + timedelta(day, 0) for day in range(14)]

# Process the weather files for each training day into its own directory.
for ct in cutoff_times:
    print(ct)
    day_output_path = os.path.join(output_path, utilities.get_day_str(ct, -9))
    day_beginning, day_end = utilities.get_day_boundaries(ct, -9)
    if not os.path.exists(day_output_path):
        os.makedirs(day_output_path)
    weather.process_one_day(raw_data_path, day_output_path, day_beginning,
                            day_end, "train")