Exemplo n.º 1
0
    def test_get_day_boundaries(self):
        """get_day_boundaries rejects non-UTC datetimes and maps a UTC
        timestamp to its enclosing offset-adjusted (09:00-to-09:00) day window.
        """
        # A non-UTC timezone offset must trigger the function's assertion.
        self.assertRaises(AssertionError, utilities.get_day_boundaries, datetime(2012, 10, 15, tzinfo=tzoffset(None, 3600)))

        # Midday falls inside the 09:00-to-09:00 UTC window of the same day.
        start_time, end_time = utilities.get_day_boundaries(datetime(2012, 10, 5, 12, 0, tzinfo=tzutc()), -9)
        self.assertEqual(start_time, datetime(2012, 10, 5, 9, 0, tzinfo=tzutc()))
        self.assertEqual(end_time, datetime(2012, 10, 6, 9, 0, tzinfo=tzutc()))

        # 01:00 precedes the 09:00 boundary, so it belongs to the previous
        # day's window.  (Hour was written `01`, a Python-2-only octal-style
        # literal that is a syntax error in Python 3; value unchanged.)
        start_time, end_time = utilities.get_day_boundaries(datetime(2012, 10, 5, 1, 0, tzinfo=tzutc()), -9)
        self.assertEqual(start_time, datetime(2012, 10, 4, 9, 0, tzinfo=tzutc()))
        self.assertEqual(end_time, datetime(2012, 10, 5, 9, 0, tzinfo=tzutc()))

        start_time, end_time = utilities.get_day_boundaries(datetime(2012, 10, 6, 5, 0, tzinfo=tzutc()), -9)
        self.assertEqual(start_time, datetime(2012, 10, 5, 9, 0, tzinfo=tzutc()))
        self.assertEqual(end_time, datetime(2012, 10, 6, 9, 0, tzinfo=tzutc()))
def raw_data_to_training_days(raw_data_path, training_days_path, cutoff_times):
    """
    Build per-day training datasets under training_days_path from the raw
    data in raw_data_path, one day per cutoff time in cutoff_times.

    Splits flight history, its events, and the ASDI tables into per-day
    files (day windows start -9 hours relative to each cutoff), then adds
    the weather data for each day via weather.process_one_day.
    """
    print(os.path.join(raw_data_path, "FlightHistory", "flighthistory.csv"))

    # Flight history is split first: it yields the per-day flight ids used
    # to filter every other table below.
    days_flight_ids = flighthistory.process_flight_history_to_train_day_files(
        input_path = os.path.join(raw_data_path, "FlightHistory", "flighthistory.csv"),
        output_path = training_days_path,
        output_folder_name = "FlightHistory",
        output_file_name = "flighthistory.csv",
        cutoff_times = cutoff_times,
        start_hours_offset = -9)

    print("Flight History Events")
    utilities.split_file_based_on_times_filter_on_ids_streaming(
        os.path.join(raw_data_path, "FlightHistory", "flighthistoryevents.csv"),
        training_days_path,
        "FlightHistory",
        "flighthistoryevents.csv",
        "flight_history_id",
        days_flight_ids)

    # The flight-plan split also returns the per-day flight-plan ids used to
    # filter the plan-detail tables (airway, fix, center, sector, waypoint).
    print("ASDI Flight Plan")
    days_flight_plan_ids = utilities.split_file_based_on_times_filter_on_ids_streaming(os.path.join(raw_data_path, "ASDI", "asdiflightplan.csv"), training_days_path,
        "ASDI", "asdiflightplan.csv", "flighthistoryid", days_flight_ids, ids_to_track_column_name="asdiflightplanid")

    print("ASDI Position")
    utilities.split_file_based_on_times_filter_on_ids_streaming(os.path.join(raw_data_path, "ASDI", "asdiposition.csv"), training_days_path,
        "ASDI", "asdiposition.csv", "flighthistoryid", days_flight_ids)

    print("ASDI Airway")
    utilities.split_file_based_on_times_filter_on_ids_streaming(os.path.join(raw_data_path, "ASDI", "asdiairway.csv"), training_days_path,
        "ASDI", "asdiairway.csv", "asdiflightplanid", days_flight_plan_ids)

    print("ASDI FPFix")
    utilities.split_file_based_on_times_filter_on_ids_streaming(os.path.join(raw_data_path, "ASDI", "asdifpfix.csv"), training_days_path,
        "ASDI", "asdifpfix.csv", "asdiflightplanid", days_flight_plan_ids)

    print("ASDI FPCenter")
    utilities.split_file_based_on_times_filter_on_ids_streaming(os.path.join(raw_data_path, "ASDI", "asdifpcenter.csv"), training_days_path,
        "ASDI", "asdifpcenter.csv", "asdiflightplanid", days_flight_plan_ids)

    print("ASDI FPSector")
    utilities.split_file_based_on_times_filter_on_ids_streaming(os.path.join(raw_data_path, "ASDI", "asdifpsector.csv"), training_days_path,
        "ASDI", "asdifpsector.csv", "asdiflightplanid", days_flight_plan_ids)

    print("ASDI FPWaypoint")
    utilities.split_file_based_on_times_filter_on_ids_streaming(os.path.join(raw_data_path, "ASDI", "asdifpwaypoint.csv"), training_days_path,
        "ASDI", "asdifpwaypoint.csv", "asdiflightplanid", days_flight_plan_ids)

    for ct in cutoff_times:
        # Fixed: `print ct` was Python-2-only statement syntax, inconsistent
        # with the print() calls used everywhere else in this function.
        print(ct)
        day_output_path = os.path.join(training_days_path, utilities.get_day_str(ct, -9))
        day_beginning, day_end = utilities.get_day_boundaries(ct, -9)

        if not os.path.exists(day_output_path):
            os.makedirs(day_output_path)
        weather.process_one_day(raw_data_path, day_output_path, day_beginning, day_end, "train")
def raw_data_to_training_days(raw_data_path, training_days_path, cutoff_times):
    """
    Build per-day training datasets under training_days_path from the raw
    data in raw_data_path, one day per cutoff time in cutoff_times.

    Splits flight history, its events, and the ASDI tables into per-day
    files (day windows start -9 hours relative to each cutoff), then adds
    the weather data for each day via weather.process_one_day.
    """
    print(os.path.join(raw_data_path, "FlightHistory", "flighthistory.csv"))

    # Flight history is split first: it yields the per-day flight ids used
    # to filter every other table below.
    days_flight_ids = flighthistory.process_flight_history_to_train_day_files(
        input_path = os.path.join(raw_data_path, "FlightHistory", "flighthistory.csv"),
        output_path = training_days_path,
        output_folder_name = "FlightHistory",
        output_file_name = "flighthistory.csv",
        cutoff_times = cutoff_times,
        start_hours_offset = -9)

    print("Flight History Events")
    utilities.split_file_based_on_times_filter_on_ids_streaming(
        os.path.join(raw_data_path, "FlightHistory", "flighthistoryevents.csv"),
        training_days_path,
        "FlightHistory",
        "flighthistoryevents.csv",
        "flight_history_id",
        days_flight_ids)

    # The flight-plan split also returns the per-day flight-plan ids used to
    # filter the plan-detail tables (airway, fix, center, sector, waypoint).
    print("ASDI Flight Plan")
    days_flight_plan_ids = utilities.split_file_based_on_times_filter_on_ids_streaming(os.path.join(raw_data_path, "ASDI", "asdiflightplan.csv"), training_days_path,
        "ASDI", "asdiflightplan.csv", "flighthistoryid", days_flight_ids, ids_to_track_column_name="asdiflightplanid")

    print("ASDI Position")
    utilities.split_file_based_on_times_filter_on_ids_streaming(os.path.join(raw_data_path, "ASDI", "asdiposition.csv"), training_days_path,
        "ASDI", "asdiposition.csv", "flighthistoryid", days_flight_ids)

    print("ASDI Airway")
    utilities.split_file_based_on_times_filter_on_ids_streaming(os.path.join(raw_data_path, "ASDI", "asdiairway.csv"), training_days_path,
        "ASDI", "asdiairway.csv", "asdiflightplanid", days_flight_plan_ids)

    print("ASDI FPFix")
    utilities.split_file_based_on_times_filter_on_ids_streaming(os.path.join(raw_data_path, "ASDI", "asdifpfix.csv"), training_days_path,
        "ASDI", "asdifpfix.csv", "asdiflightplanid", days_flight_plan_ids)

    print("ASDI FPCenter")
    utilities.split_file_based_on_times_filter_on_ids_streaming(os.path.join(raw_data_path, "ASDI", "asdifpcenter.csv"), training_days_path,
        "ASDI", "asdifpcenter.csv", "asdiflightplanid", days_flight_plan_ids)

    print("ASDI FPSector")
    utilities.split_file_based_on_times_filter_on_ids_streaming(os.path.join(raw_data_path, "ASDI", "asdifpsector.csv"), training_days_path,
        "ASDI", "asdifpsector.csv", "asdiflightplanid", days_flight_plan_ids)

    print("ASDI FPWaypoint")
    utilities.split_file_based_on_times_filter_on_ids_streaming(os.path.join(raw_data_path, "ASDI", "asdifpwaypoint.csv"), training_days_path,
        "ASDI", "asdifpwaypoint.csv", "asdiflightplanid", days_flight_plan_ids)

    for ct in cutoff_times:
        # Fixed: `print ct` was Python-2-only statement syntax, inconsistent
        # with the print() calls used everywhere else in this function.
        print(ct)
        # Fixed: was os.path.join(output_path, ...) — `output_path` is not
        # defined in this function (NameError, or silent pickup of an
        # unrelated global). The parameter is training_days_path, matching
        # the other destinations above.
        day_output_path = os.path.join(training_days_path, utilities.get_day_str(ct, -9))
        day_beginning, day_end = utilities.get_day_boundaries(ct, -9)

        if not os.path.exists(day_output_path):
            os.makedirs(day_output_path)
        weather.process_one_day(raw_data_path, day_output_path, day_beginning, day_end, "train")
Exemplo n.º 4
0
    def test_get_day_boundaries(self):
        """get_day_boundaries rejects non-UTC datetimes and maps a UTC
        timestamp to its enclosing offset-adjusted (09:00-to-09:00) day window.
        """
        # A non-UTC timezone offset must trigger the function's assertion.
        self.assertRaises(AssertionError, utilities.get_day_boundaries,
                          datetime(2012, 10, 15, tzinfo=tzoffset(None, 3600)))

        # Midday falls inside the 09:00-to-09:00 UTC window of the same day.
        start_time, end_time = utilities.get_day_boundaries(
            datetime(2012, 10, 5, 12, 0, tzinfo=tzutc()), -9)
        self.assertEqual(start_time,
                         datetime(2012, 10, 5, 9, 0, tzinfo=tzutc()))
        self.assertEqual(end_time, datetime(2012, 10, 6, 9, 0,
                                            tzinfo=tzutc()))

        # 01:00 precedes the 09:00 boundary, so it belongs to the previous
        # day's window.  (Hour was written `01`, a Python-2-only octal-style
        # literal that is a syntax error in Python 3; value unchanged.)
        start_time, end_time = utilities.get_day_boundaries(
            datetime(2012, 10, 5, 1, 0, tzinfo=tzutc()), -9)
        self.assertEqual(start_time,
                         datetime(2012, 10, 4, 9, 0, tzinfo=tzutc()))
        self.assertEqual(end_time, datetime(2012, 10, 5, 9, 0,
                                            tzinfo=tzutc()))

        start_time, end_time = utilities.get_day_boundaries(
            datetime(2012, 10, 6, 5, 0, tzinfo=tzutc()), -9)
        self.assertEqual(start_time,
                         datetime(2012, 10, 5, 9, 0, tzinfo=tzutc()))
        self.assertEqual(end_time, datetime(2012, 10, 6, 9, 0,
                                            tzinfo=tzutc()))
Exemplo n.º 5
0
def process_flight_history_to_train_day_files(
    input_path,
    output_path,
    output_folder_name,
    output_file_name,
    cutoff_times,
    start_hours_offset = -9):
    """
    Stream the raw flight-history CSV at input_path and split its rows into
    one CSV per cutoff day under
    output_path/<day>/<output_folder_name>/<output_file_name>, keeping only
    flights into or out of the US.

    Returns a dict mapping each cutoff_time to the set of flight_history_id
    values written for that day.
    """

    # NOTE(review): file_started_for_day is never read below — apparent dead code.
    file_started_for_day = {cutoff_time: False for cutoff_time in cutoff_times}

    # i counts 100k-row chunks (progress output); cnt counts kept rows per chunk.
    i=0
    cnt=0

    departure_date_columns = get_flight_history_departure_date_columns()
    arrival_date_columns = get_flight_history_arrival_date_columns()

    # Output columns are the input header minus the columns slated for deletion.
    reader = utilities.HeaderCsvReader(open(input_path))
    header_out = reader.get_header()
    for col in get_flight_history_columns_to_delete():
        header_out.remove(col)

    # Per-cutoff-day accumulators: ids seen, open file handles, CSV writers.
    day_flight_history_ids = {cutoff_time:set() for cutoff_time in cutoff_times}
    day_str_to_cutoff_time = {}
    file_handles = {}
    writers = {}
    for cutoff_time in cutoff_times:
        day_output_path = utilities.get_full_output_path(output_path, output_folder_name, cutoff_time)
        file_output_path = os.path.join(day_output_path, output_file_name)
        file_handles[cutoff_time] = open(file_output_path, "w")
        writers[cutoff_time] = csv.writer(file_handles[cutoff_time], dialect=utilities.CsvDialect())
        writers[cutoff_time].writerow(header_out)
        # NOTE(review): get_day_str is called with an explicit hour offset
        # elsewhere in this file; here it relies on the callee's default —
        # confirm the two agree.
        day_str_to_cutoff_time[utilities.get_day_str(cutoff_time)] = cutoff_time

    # Rows are buffered per cutoff day and flushed every 100k input rows.
    i_row_mod = 0
    buffer_dict = {cutoff_time: [] for cutoff_time in cutoff_times}

    # NOTE(review): uses cutoff_time leaked from the loop above, and the
    # results are never used below — apparent dead code.
    start_time, end_time = utilities.get_day_boundaries(cutoff_time, start_hours_offset)

    codes_file = os.path.join(os.environ["DataPath"], "GEFlight", "Reference", "usairporticaocodes.txt")
    us_icao_codes = get_us_airport_icao_codes(codes_file)

    for row in reader:
        i_row_mod += 1
        if not is_flight_in_or_out_of_us(row, us_icao_codes):
            continue
        parse_flight_history_dates(row, departure_date_columns, arrival_date_columns)
        row_day_str = get_departure_day_str(row, start_hours_offset)
        if row_day_str not in day_str_to_cutoff_time:
            continue  # departure day is not one of the requested cutoff days
        cutoff_time = day_str_to_cutoff_time[row_day_str]
        cnt += 1
        buffer_dict[cutoff_time].append([row[col] for col in header_out])
        day_flight_history_ids[cutoff_time].add(row["flight_history_id"])
        if i_row_mod < 100000:
            continue
        # Chunk boundary: report progress, then flush every per-day buffer.
        i+=1
        print("%s: %d00k records processed, %d with relevant flights in this chunk" % (output_file_name, i, cnt))
        cnt=0
        for cutoff_time in cutoff_times:
            writers[cutoff_time].writerows(buffer_dict[cutoff_time])
            file_handles[cutoff_time].flush()

        i_row_mod = 0
        buffer_dict = {cutoff_time: [] for cutoff_time in cutoff_times}

    # Write any remaining buffered rows and close the per-day files.
    for cutoff_time in cutoff_times:
        writers[cutoff_time].writerows(buffer_dict[cutoff_time])
        file_handles[cutoff_time].close()

    return day_flight_history_ids
Exemplo n.º 6
0
def training_day_to_test_day(training_day_path, test_day_path, solution_path, cutoff_time): 
    """
    Derive one test day from a training day by truncating every table at
    cutoff_time, writing the result under test_day_path and the solution
    file for the test flights under solution_path.
    """
    flighthistory.write_flight_history_test_day_file(
        os.path.join(training_day_path, "FlightHistory", "flighthistory.csv"),
        os.path.join(utilities.get_output_subdirectory(test_day_path, "FlightHistory"), "flighthistory.csv"),
        cutoff_time)

    flighthistory.write_flight_history_test_day_and_solution_test_flights_only(
        os.path.join(training_day_path, "FlightHistory", "flighthistory.csv"),
        os.path.join(test_day_path, "test_flights.csv"),
        os.path.join(solution_path, utilities.get_day_str(cutoff_time) + "_solution.csv"),
        cutoff_time)

    # Truncate the time-stamped event and position tables at the cutoff.
    utilities.filter_file_based_on_cutoff_time_streaming(os.path.join(training_day_path, "FlightHistory", "flighthistoryevents.csv"), 
        os.path.join(utilities.get_output_subdirectory(test_day_path, "FlightHistory"), "flighthistoryevents.csv"),
        "date_time_recorded",
        utilities.parse_datetime_format3,
        cutoff_time)

    utilities.filter_file_based_on_cutoff_time_streaming(os.path.join(training_day_path, "ASDI", "asdiposition.csv"), 
        os.path.join(utilities.get_output_subdirectory(test_day_path, "ASDI"), "asdiposition.csv"),
        "received",
        utilities.parse_datetime_format1,
        cutoff_time)

    # Collect the flight-plan ids that survive the cutoff, so the plan-detail
    # tables below can be filtered to match.
    flight_plan_ids = utilities.filter_file_based_on_cutoff_time_streaming(os.path.join(training_day_path, "ASDI", "asdiflightplan.csv"), 
        os.path.join(utilities.get_output_subdirectory(test_day_path, "ASDI"), "asdiflightplan.csv"),
        "updatetimeutc",
        utilities.parse_datetime_format2,
        cutoff_time,
        ids_to_track_column_name = "asdiflightplanid")

    utilities.filter_file_based_on_ids_streaming(os.path.join(training_day_path, "ASDI", "asdiairway.csv"), 
        os.path.join(utilities.get_output_subdirectory(test_day_path, "ASDI"), "asdiairway.csv"),
        "asdiflightplanid",
        flight_plan_ids)

    utilities.filter_file_based_on_ids_streaming(os.path.join(training_day_path, "ASDI", "asdifpfix.csv"), 
        os.path.join(utilities.get_output_subdirectory(test_day_path, "ASDI"), "asdifpfix.csv"),
        "asdiflightplanid",
        flight_plan_ids)

    utilities.filter_file_based_on_ids_streaming(os.path.join(training_day_path, "ASDI", "asdifpcenter.csv"), 
        os.path.join(utilities.get_output_subdirectory(test_day_path, "ASDI"), "asdifpcenter.csv"),
        "asdiflightplanid",
        flight_plan_ids)

    utilities.filter_file_based_on_ids_streaming(os.path.join(training_day_path, "ASDI", "asdifpsector.csv"), 
        os.path.join(utilities.get_output_subdirectory(test_day_path, "ASDI"), "asdifpsector.csv"),
        "asdiflightplanid",
        flight_plan_ids)

    utilities.filter_file_based_on_ids_streaming(os.path.join(training_day_path, "ASDI", "asdifpwaypoint.csv"), 
        os.path.join(utilities.get_output_subdirectory(test_day_path, "ASDI"), "asdifpwaypoint.csv"),
        "asdiflightplanid",
        flight_plan_ids)

    # NOTE(review): called without the -9 hour offset other call sites in this
    # file pass — relies on the callee's default; confirm they agree. day_end
    # is unused: the weather window below ends at cutoff_time, not day_end.
    day_beginning, day_end = utilities.get_day_boundaries(cutoff_time)

    weather.process_one_day(
        training_day_path, 
        test_day_path, 
        day_beginning, 
        cutoff_time, 
        "test", 
        cutoff_time = cutoff_time)
def process_flight_history_to_train_day_files(input_path,
                                              output_path,
                                              output_folder_name,
                                              output_file_name,
                                              cutoff_times,
                                              start_hours_offset=-9):
    """
    Stream the raw flight-history CSV at input_path and split its rows into
    one CSV per cutoff day under
    output_path/<day>/<output_folder_name>/<output_file_name>, keeping only
    flights into or out of the US.

    Returns a dict mapping each cutoff_time to the set of flight_history_id
    values written for that day.
    """

    # NOTE(review): file_started_for_day is never read below — apparent dead code.
    file_started_for_day = {cutoff_time: False for cutoff_time in cutoff_times}

    # i counts 100k-row chunks (progress output); cnt counts kept rows per chunk.
    i = 0
    cnt = 0

    departure_date_columns = get_flight_history_departure_date_columns()
    arrival_date_columns = get_flight_history_arrival_date_columns()

    # Output columns are the input header minus the columns slated for deletion.
    reader = utilities.HeaderCsvReader(open(input_path))
    header_out = reader.get_header()
    for col in get_flight_history_columns_to_delete():
        header_out.remove(col)

    # Per-cutoff-day accumulators: ids seen, open file handles, CSV writers.
    day_flight_history_ids = {
        cutoff_time: set()
        for cutoff_time in cutoff_times
    }
    day_str_to_cutoff_time = {}
    file_handles = {}
    writers = {}
    for cutoff_time in cutoff_times:
        day_output_path = utilities.get_full_output_path(
            output_path, output_folder_name, cutoff_time)
        file_output_path = os.path.join(day_output_path, output_file_name)
        file_handles[cutoff_time] = open(file_output_path, "w")
        writers[cutoff_time] = csv.writer(file_handles[cutoff_time],
                                          dialect=utilities.CsvDialect())
        writers[cutoff_time].writerow(header_out)
        # NOTE(review): get_day_str is called with an explicit hour offset
        # elsewhere in this file; here it relies on the callee's default —
        # confirm the two agree.
        day_str_to_cutoff_time[utilities.get_day_str(
            cutoff_time)] = cutoff_time

    # Rows are buffered per cutoff day and flushed every 100k input rows.
    i_row_mod = 0
    buffer_dict = {cutoff_time: [] for cutoff_time in cutoff_times}

    # NOTE(review): uses cutoff_time leaked from the loop above, and the
    # results are never used below — apparent dead code.
    start_time, end_time = utilities.get_day_boundaries(
        cutoff_time, start_hours_offset)

    codes_file = os.path.join(os.environ["DataPath"], "GEFlight", "Reference",
                              "usairporticaocodes.txt")
    us_icao_codes = get_us_airport_icao_codes(codes_file)

    for row in reader:
        i_row_mod += 1
        if not is_flight_in_or_out_of_us(row, us_icao_codes):
            continue
        parse_flight_history_dates(row, departure_date_columns,
                                   arrival_date_columns)
        row_day_str = get_departure_day_str(row, start_hours_offset)
        if row_day_str not in day_str_to_cutoff_time:
            continue  # departure day is not one of the requested cutoff days
        cutoff_time = day_str_to_cutoff_time[row_day_str]
        cnt += 1
        buffer_dict[cutoff_time].append([row[col] for col in header_out])
        day_flight_history_ids[cutoff_time].add(row["flight_history_id"])
        if i_row_mod < 100000:
            continue
        # Chunk boundary: report progress, then flush every per-day buffer.
        i += 1
        print(
            "%s: %d00k records processed, %d with relevant flights in this chunk"
            % (output_file_name, i, cnt))
        cnt = 0
        for cutoff_time in cutoff_times:
            writers[cutoff_time].writerows(buffer_dict[cutoff_time])
            file_handles[cutoff_time].flush()

        i_row_mod = 0
        buffer_dict = {cutoff_time: [] for cutoff_time in cutoff_times}

    # Write any remaining buffered rows and close the per-day files.
    for cutoff_time in cutoff_times:
        writers[cutoff_time].writerows(buffer_dict[cutoff_time])
        file_handles[cutoff_time].close()

    return day_flight_history_ids
def training_day_to_test_day(training_day_path, test_day_path, solution_path, cutoff_time): 
    """
    Derive one test day from a training day by truncating every table at
    cutoff_time, writing the result under test_day_path and the solution
    file for the test flights under solution_path.
    """
    flighthistory.write_flight_history_test_day_file(
        os.path.join(training_day_path, "FlightHistory", "flighthistory.csv"),
        os.path.join(utilities.get_output_subdirectory(test_day_path, "FlightHistory"), "flighthistory.csv"),
        cutoff_time)

    flighthistory.write_flight_history_test_day_and_solution_test_flights_only(
        os.path.join(training_day_path, "FlightHistory", "flighthistory.csv"),
        os.path.join(test_day_path, "test_flights.csv"),
        os.path.join(solution_path, utilities.get_day_str(cutoff_time) + "_solution.csv"),
        cutoff_time)

    # Truncate the time-stamped event and position tables at the cutoff.
    utilities.filter_file_based_on_cutoff_time_streaming(os.path.join(training_day_path, "FlightHistory", "flighthistoryevents.csv"), 
        os.path.join(utilities.get_output_subdirectory(test_day_path, "FlightHistory"), "flighthistoryevents.csv"),
        "date_time_recorded",
        utilities.parse_datetime_format3,
        cutoff_time)

    utilities.filter_file_based_on_cutoff_time_streaming(os.path.join(training_day_path, "ASDI", "asdiposition.csv"), 
        os.path.join(utilities.get_output_subdirectory(test_day_path, "ASDI"), "asdiposition.csv"),
        "received",
        utilities.parse_datetime_format1,
        cutoff_time)

    # Collect the flight-plan ids that survive the cutoff, so the plan-detail
    # tables below can be filtered to match.
    flight_plan_ids = utilities.filter_file_based_on_cutoff_time_streaming(os.path.join(training_day_path, "ASDI", "asdiflightplan.csv"), 
        os.path.join(utilities.get_output_subdirectory(test_day_path, "ASDI"), "asdiflightplan.csv"),
        "updatetimeutc",
        utilities.parse_datetime_format2,
        cutoff_time,
        ids_to_track_column_name = "asdiflightplanid")

    utilities.filter_file_based_on_ids_streaming(os.path.join(training_day_path, "ASDI", "asdiairway.csv"), 
        os.path.join(utilities.get_output_subdirectory(test_day_path, "ASDI"), "asdiairway.csv"),
        "asdiflightplanid",
        flight_plan_ids)

    utilities.filter_file_based_on_ids_streaming(os.path.join(training_day_path, "ASDI", "asdifpfix.csv"), 
        os.path.join(utilities.get_output_subdirectory(test_day_path, "ASDI"), "asdifpfix.csv"),
        "asdiflightplanid",
        flight_plan_ids)

    utilities.filter_file_based_on_ids_streaming(os.path.join(training_day_path, "ASDI", "asdifpcenter.csv"), 
        os.path.join(utilities.get_output_subdirectory(test_day_path, "ASDI"), "asdifpcenter.csv"),
        "asdiflightplanid",
        flight_plan_ids)

    utilities.filter_file_based_on_ids_streaming(os.path.join(training_day_path, "ASDI", "asdifpsector.csv"), 
        os.path.join(utilities.get_output_subdirectory(test_day_path, "ASDI"), "asdifpsector.csv"),
        "asdiflightplanid",
        flight_plan_ids)

    utilities.filter_file_based_on_ids_streaming(os.path.join(training_day_path, "ASDI", "asdifpwaypoint.csv"), 
        os.path.join(utilities.get_output_subdirectory(test_day_path, "ASDI"), "asdifpwaypoint.csv"),
        "asdiflightplanid",
        flight_plan_ids)

    # NOTE(review): called without the -9 hour offset other call sites in this
    # file pass — relies on the callee's default; confirm they agree. day_end
    # is unused: the weather window below ends at cutoff_time, not day_end.
    day_beginning, day_end = utilities.get_day_boundaries(cutoff_time)

    weather.process_one_day(
        training_day_path, 
        test_day_path, 
        day_beginning, 
        cutoff_time, 
        "test", 
        cutoff_time = cutoff_time)
import os
import pandas
from datetime import datetime, timedelta
from dateutil import parser, tz
from geflight.transform import utilities
import pytz
import random

import weather

# Script body: build weather data for each public-leaderboard training day.
raw_data_path = os.path.join(os.environ["DataPath"], "GEFlight", "RawPublicLeaderboard")
output_path = os.path.join(os.environ["DataPath"], "GEFlight", "Release 2", "PublicLeaderboardTrainDays")

# One cutoff per day over a 14-day window starting 2012-11-26 20:00 UTC.
start_day = datetime(2012, 11, 26, 20, 0, tzinfo=tz.tzutc())
cutoff_times = [start_day]
for i in range(1, 14):
    cutoff_times.append(start_day + timedelta(i, 0))

for ct in cutoff_times:
    # Fixed: `print ct` was Python-2-only statement syntax; the call form
    # behaves identically on Python 2 and 3.
    print(ct)
    day_output_path = os.path.join(output_path, utilities.get_day_str(ct, -9))
    day_beginning, day_end = utilities.get_day_boundaries(ct, -9)

    if not os.path.exists(day_output_path):
        os.makedirs(day_output_path)
    weather.process_one_day(raw_data_path, day_output_path, day_beginning, day_end, "train")