def raw_data_to_training_days(raw_data_path, training_days_path, cutoff_times):
    #df_flight_history = flighthistory.get_df_flight_history_from_raw_format(os.path.join(raw_data_path, "FlightHistory", "flighthistory.csv"))
 
    print(os.path.join(raw_data_path, "FlightHistory", "flighthistory.csv"))

    days_flight_ids = flighthistory.process_flight_history_to_train_day_files(
        input_path = os.path.join(raw_data_path, "FlightHistory", "flighthistory.csv"),
        output_path = training_days_path,
        output_folder_name = "FlightHistory",
        output_file_name = "flighthistory.csv",
        cutoff_times = cutoff_times,
        start_hours_offset = -9)

    print("Flight History Events")
    utilities.split_file_based_on_times_filter_on_ids_streaming(
        os.path.join(raw_data_path, "FlightHistory", "flighthistoryevents.csv"),
        training_days_path,
        "FlightHistory",
        "flighthistoryevents.csv",
        "flight_history_id",
        days_flight_ids)
    
    print("ASDI Flight Plan")
    days_flight_plan_ids = utilities.split_file_based_on_times_filter_on_ids_streaming(os.path.join(raw_data_path, "ASDI", "asdiflightplan.csv"), training_days_path,
        "ASDI", "asdiflightplan.csv", "flighthistoryid", days_flight_ids, ids_to_track_column_name="asdiflightplanid")

    print("ASDI Position")
    utilities.split_file_based_on_times_filter_on_ids_streaming(os.path.join(raw_data_path, "ASDI", "asdiposition.csv"), training_days_path,
        "ASDI", "asdiposition.csv", "flighthistoryid", days_flight_ids)

    print("ASDI Airway")
    utilities.split_file_based_on_times_filter_on_ids_streaming(os.path.join(raw_data_path, "ASDI", "asdiairway.csv"), training_days_path,
        "ASDI", "asdiairway.csv", "asdiflightplanid", days_flight_plan_ids)

    print("ASDI FPFix")
    utilities.split_file_based_on_times_filter_on_ids_streaming(os.path.join(raw_data_path, "ASDI", "asdifpfix.csv"), training_days_path,
        "ASDI", "asdifpfix.csv", "asdiflightplanid", days_flight_plan_ids)

    print("ASDI FPCenter")
    utilities.split_file_based_on_times_filter_on_ids_streaming(os.path.join(raw_data_path, "ASDI", "asdifpcenter.csv"), training_days_path,
        "ASDI", "asdifpcenter.csv", "asdiflightplanid", days_flight_plan_ids)

    print("ASDI FPSector")
    utilities.split_file_based_on_times_filter_on_ids_streaming(os.path.join(raw_data_path, "ASDI", "asdifpsector.csv"), training_days_path,
        "ASDI", "asdifpsector.csv", "asdiflightplanid", days_flight_plan_ids)

    print("ASDI FPWaypoint")
    utilities.split_file_based_on_times_filter_on_ids_streaming(os.path.join(raw_data_path, "ASDI", "asdifpwaypoint.csv"), training_days_path,
        "ASDI", "asdifpwaypoint.csv", "asdiflightplanid", days_flight_plan_ids)

    for ct in cutoff_times:
        print(ct)
        day_output_path = os.path.join(training_days_path, utilities.get_day_str(ct, -9))
        day_beginning, day_end = utilities.get_day_boundaries(ct, -9)

        if not os.path.exists(day_output_path):
            os.makedirs(day_output_path)
        weather.process_one_day(raw_data_path, day_output_path, day_beginning, day_end, "train")
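The streaming splitter used repeatedly above, utilities.split_file_based_on_times_filter_on_ids_streaming, is not reproduced in these listings. Below is a minimal sketch of the behaviour its call sites imply (one pass over a large CSV, routing rows to per-day files by id membership); the argument names, directory layout, and return value are inferred, not the actual library code.

# Sketch only -- inferred from the call sites above, not the real utilities code.
import csv
import os

from geflight.transform import utilities


def split_file_based_on_times_filter_on_ids_streaming_sketch(
        input_path, output_path, output_folder_name, output_file_name,
        id_column_name, days_ids, ids_to_track_column_name=None):
    # days_ids maps each cutoff time to the set of ids belonging to that day
    # (e.g. the per-day flight_history_id sets returned above).
    handles = {}
    writers = {}
    tracked_ids = {cutoff_time: set() for cutoff_time in days_ids}
    reader = csv.DictReader(open(input_path))
    for cutoff_time in days_ids:
        day_dir = os.path.join(output_path, utilities.get_day_str(cutoff_time),
                               output_folder_name)  # assumed layout
        if not os.path.exists(day_dir):
            os.makedirs(day_dir)
        handles[cutoff_time] = open(os.path.join(day_dir, output_file_name), "w")
        writers[cutoff_time] = csv.DictWriter(handles[cutoff_time], reader.fieldnames)
        writers[cutoff_time].writeheader()
    for row in reader:
        for cutoff_time, ids in days_ids.items():
            if row[id_column_name] in ids:
                writers[cutoff_time].writerow(row)
                if ids_to_track_column_name is not None:
                    tracked_ids[cutoff_time].add(row[ids_to_track_column_name])
    for handle in handles.values():
        handle.close()
    # When an id column to track is given (e.g. "asdiflightplanid"), the per-day
    # sets of those ids are returned so dependent tables can be filtered next.
    return tracked_ids if ids_to_track_column_name is not None else None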
Example #4
def get_departure_day_str(row, start_hours_offset):
    """
    Returns the date_str for the specific day that a flighthistory row belongs to
    based on the departure date for the flight

    Sample return value: "2012_11_15"
    """
    departure_time = get_departure_time(row)
    if departure_time=="MISSING":
        return ""
    return utilities.get_day_str(departure_time, start_hours_offset)
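get_departure_time is referenced here but not included in these listings. A hypothetical sketch, assuming it simply returns the first non-"MISSING" value among the flight-history departure columns (the column priority below is a guess):

# Hypothetical helper -- not part of the original listings; column order assumed.
def get_departure_time_sketch(row):
    for col in ["actual_runway_departure", "scheduled_runway_departure",
                "published_departure", "scheduled_gate_departure"]:
        if row.get(col, "MISSING") != "MISSING":
            return row[col]
    return "MISSING"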
def write_flight_history_test_day_and_solution_test_flights_only(
        input_path, test_output_path, solution_path, cutoff_time):
    diverted_or_redirected_flight_ids = get_diverted_or_redirected_flights(
        input_path)

    codes_file = os.path.join(os.environ["DataPath"], "GEFlight", "Reference",
                              "usairporticaocodes.txt")
    us_icao_codes = get_us_airport_icao_codes(codes_file)
    midnight_time = datetime.datetime(cutoff_time.year,
                                      cutoff_time.month,
                                      cutoff_time.day,
                                      tzinfo=tz.tzutc())

    df = get_df_flight_history_from_train_format(input_path)

    original_length = len(df)

    df = df.select(lambda i: flight_history_row_in_test_set(
        df.irow(i), cutoff_time, us_icao_codes,
        diverted_or_redirected_flight_ids))

    df_test = df[[
        "flight_history_id", "departure_airport_code", "arrival_airport_code",
        "published_departure", "published_arrival", "scheduled_gate_departure",
        "scheduled_gate_arrival", "scheduled_runway_departure",
        "scheduled_runway_arrival"
    ]]

    df_test.to_csv(test_output_path, index=False)

    df_solution = df[[
        "flight_history_id", "actual_runway_arrival", "actual_gate_arrival"
    ]]

    for i in df_solution.index:
        df_solution["actual_runway_arrival"][i] = utilities.minutes_difference(
            df_solution["actual_runway_arrival"][i], midnight_time)
        df_solution["actual_gate_arrival"][i] = utilities.minutes_difference(
            df_solution["actual_gate_arrival"][i], midnight_time)

    df_solution.to_csv(solution_path, index=False)

    print("%s, %s: %d rows kept out of %d original lines" %
          (utilities.get_day_str(cutoff_time), "test_flights.csv",
           len(df_test), original_length))

    return df_test, df_solution
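The solution columns are converted to minutes after midnight UTC on the cutoff day via utilities.minutes_difference, which is not shown here. A sketch of the assumed arithmetic (handling of "MISSING" values is omitted):

# Assumed behaviour of utilities.minutes_difference, e.g.
# minutes_difference(actual_gate_arrival, midnight_time).
def minutes_difference_sketch(datetime1, datetime2):
    return (datetime1 - datetime2).total_seconds() / 60.0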
Example #7
def write_flight_history_test_day_file(input_path, output_path, cutoff_time):
    df = get_df_flight_history_from_train_format(input_path)

    cols_to_mask = get_flight_history_date_columns_to_hide()
    rows_modified = 0

    for i in range(len(df)):
        row_modified = False
        for col in cols_to_mask:
            if df[col][i] == "MISSING":
                continue
            if df[col][i] <= cutoff_time:
                continue
            df[col][i] = "HIDDEN"
            row_modified = True
        if row_modified:
            rows_modified += 1

    df.to_csv(output_path, index=False)

    print("%s, %s: %d rows modified out of %d original lines" % (utilities.get_day_str(cutoff_time), "flighthistory.csv", rows_modified, len(df)))
Example #10
    def test_get_day_str(self):
        cutoff_time = datetime(2012, 10, 25, 8, 00, tzinfo=tzutc())
        self.assertEqual("2012_10_24", utilities.get_day_str(cutoff_time))

        cutoff_time = datetime(2012, 10, 25, 8, 00, tzinfo=tzutc())
        self.assertEqual("2012_10_25", utilities.get_day_str(cutoff_time, 0))
Example #11
def process_flight_history_to_train_day_files(
    input_path,
    output_path,
    output_folder_name,
    output_file_name,
    cutoff_times,
    start_hours_offset = -9):
    """
    
    """

    file_started_for_day = {cutoff_time: False for cutoff_time in cutoff_times}

    i = 0
    cnt = 0

    departure_date_columns = get_flight_history_departure_date_columns()
    arrival_date_columns = get_flight_history_arrival_date_columns()

    reader = utilities.HeaderCsvReader(open(input_path))
    header_out = reader.get_header()
    for col in get_flight_history_columns_to_delete():
        header_out.remove(col)

    day_flight_history_ids = {cutoff_time:set() for cutoff_time in cutoff_times}
    day_str_to_cutoff_time = {}
    file_handles = {}
    writers = {}
    for cutoff_time in cutoff_times:
        day_output_path = utilities.get_full_output_path(output_path, output_folder_name, cutoff_time)
        file_output_path = os.path.join(day_output_path, output_file_name)
        file_handles[cutoff_time] = open(file_output_path, "w")
        writers[cutoff_time] = csv.writer(file_handles[cutoff_time], dialect=utilities.CsvDialect())
        writers[cutoff_time].writerow(header_out)
        day_str_to_cutoff_time[utilities.get_day_str(cutoff_time)] = cutoff_time

    i_row_mod = 0
    buffer_dict = {cutoff_time: [] for cutoff_time in cutoff_times}

    start_time, end_time = utilities.get_day_boundaries(cutoff_time, start_hours_offset)

    codes_file = os.path.join(os.environ["DataPath"], "GEFlight", "Reference", "usairporticaocodes.txt")
    us_icao_codes = get_us_airport_icao_codes(codes_file)

    for row in reader:
        i_row_mod += 1
        if not is_flight_in_or_out_of_us(row, us_icao_codes):
            continue
        parse_flight_history_dates(row, departure_date_columns, arrival_date_columns)
        row_day_str = get_departure_day_str(row, start_hours_offset)
        if row_day_str not in day_str_to_cutoff_time:
            continue
        cutoff_time = day_str_to_cutoff_time[row_day_str]
        cnt += 1
        buffer_dict[cutoff_time].append([row[col] for col in header_out])
        day_flight_history_ids[cutoff_time].add(row["flight_history_id"])
        if i_row_mod < 100000:
            continue
        i += 1
        print("%s: %d00k records processed, %d with relevant flights in this chunk" % (output_file_name, i, cnt))
        cnt = 0
        for cutoff_time in cutoff_times:
            writers[cutoff_time].writerows(buffer_dict[cutoff_time])
            file_handles[cutoff_time].flush()

        i_row_mod = 0
        buffer_dict = {cutoff_time: [] for cutoff_time in cutoff_times}

    for cutoff_time in cutoff_times:
        writers[cutoff_time].writerows(buffer_dict[cutoff_time])
        file_handles[cutoff_time].close()

    return day_flight_history_ids
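utilities.HeaderCsvReader and utilities.get_full_output_path are not reproduced in these listings. The sketches below show the shape this loop assumes (an iterator of dict rows plus access to a mutable copy of the header, and a per-day output directory created on demand), not the real implementations.

# Assumed shapes only -- the real utilities module is not shown here.
import csv
import os

from geflight.transform import utilities


class HeaderCsvReaderSketch(object):
    # Reads the header row up front, then yields each data row as a dict keyed
    # by column name, similar to csv.DictReader plus get_header().
    def __init__(self, file_handle):
        self.reader = csv.reader(file_handle)
        self.header = next(self.reader)

    def get_header(self):
        return list(self.header)  # a copy, so callers can remove columns

    def __iter__(self):
        for fields in self.reader:
            yield dict(zip(self.header, fields))


def get_full_output_path_sketch(output_path, output_folder_name, cutoff_time):
    # Assumed layout: <output_path>/<YYYY_MM_DD>/<output_folder_name>/
    day_dir = os.path.join(output_path, utilities.get_day_str(cutoff_time),
                           output_folder_name)
    if not os.path.exists(day_dir):
        os.makedirs(day_dir)
    return day_dir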
Example #12
def training_day_to_test_day(training_day_path, test_day_path, solution_path, cutoff_time): 
    flighthistory.write_flight_history_test_day_file(
        os.path.join(training_day_path, "FlightHistory", "flighthistory.csv"),
        os.path.join(utilities.get_output_subdirectory(test_day_path, "FlightHistory"), "flighthistory.csv"),
        cutoff_time)

    flighthistory.write_flight_history_test_day_and_solution_test_flights_only(
        os.path.join(training_day_path, "FlightHistory", "flighthistory.csv"),
        os.path.join(test_day_path, "test_flights.csv"),
        os.path.join(solution_path, utilities.get_day_str(cutoff_time) + "_solution.csv"),
        cutoff_time)

    utilities.filter_file_based_on_cutoff_time_streaming(os.path.join(training_day_path, "FlightHistory", "flighthistoryevents.csv"), 
        os.path.join(utilities.get_output_subdirectory(test_day_path, "FlightHistory"), "flighthistoryevents.csv"),
        "date_time_recorded",
        utilities.parse_datetime_format3,
        cutoff_time)

    utilities.filter_file_based_on_cutoff_time_streaming(os.path.join(training_day_path, "ASDI", "asdiposition.csv"), 
        os.path.join(utilities.get_output_subdirectory(test_day_path, "ASDI"), "asdiposition.csv"),
        "received",
        utilities.parse_datetime_format1,
        cutoff_time)

    flight_plan_ids = utilities.filter_file_based_on_cutoff_time_streaming(os.path.join(training_day_path, "ASDI", "asdiflightplan.csv"), 
        os.path.join(utilities.get_output_subdirectory(test_day_path, "ASDI"), "asdiflightplan.csv"),
        "updatetimeutc",
        utilities.parse_datetime_format2,
        cutoff_time,
        ids_to_track_column_name = "asdiflightplanid")

    utilities.filter_file_based_on_ids_streaming(os.path.join(training_day_path, "ASDI", "asdiairway.csv"), 
        os.path.join(utilities.get_output_subdirectory(test_day_path, "ASDI"), "asdiairway.csv"),
        "asdiflightplanid",
        flight_plan_ids)

    utilities.filter_file_based_on_ids_streaming(os.path.join(training_day_path, "ASDI", "asdifpfix.csv"), 
        os.path.join(utilities.get_output_subdirectory(test_day_path, "ASDI"), "asdifpfix.csv"),
        "asdiflightplanid",
        flight_plan_ids)

    utilities.filter_file_based_on_ids_streaming(os.path.join(training_day_path, "ASDI", "asdifpcenter.csv"), 
        os.path.join(utilities.get_output_subdirectory(test_day_path, "ASDI"), "asdifpcenter.csv"),
        "asdiflightplanid",
        flight_plan_ids)

    utilities.filter_file_based_on_ids_streaming(os.path.join(training_day_path, "ASDI", "asdifpsector.csv"), 
        os.path.join(utilities.get_output_subdirectory(test_day_path, "ASDI"), "asdifpsector.csv"),
        "asdiflightplanid",
        flight_plan_ids)

    utilities.filter_file_based_on_ids_streaming(os.path.join(training_day_path, "ASDI", "asdifpwaypoint.csv"), 
        os.path.join(utilities.get_output_subdirectory(test_day_path, "ASDI"), "asdifpwaypoint.csv"),
        "asdiflightplanid",
        flight_plan_ids)

    day_beginning, day_end = utilities.get_day_boundaries(cutoff_time)

    weather.process_one_day(
        training_day_path, 
        test_day_path, 
        day_beginning, 
        cutoff_time, 
        "test", 
        cutoff_time = cutoff_time)
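The three streaming helpers used above (filter_file_based_on_cutoff_time_streaming, filter_file_based_on_ids_streaming and get_output_subdirectory) also live in the unshown utilities module. Minimal sketches of the behaviour the call sites imply; argument names and return values are inferred, and handling of missing or unparseable timestamps is omitted.

# Sketches only -- inferred from the call sites above, not the real utilities code.
import csv
import os


def filter_file_based_on_cutoff_time_streaming_sketch(
        input_path, output_path, time_column_name, parse_datetime,
        cutoff_time, ids_to_track_column_name=None):
    # Keep rows whose timestamp column (parsed with the supplied parser, e.g.
    # parse_datetime_format1/2/3) is at or before the cutoff; optionally
    # collect an id column from the surviving rows.
    kept_ids = set()
    reader = csv.DictReader(open(input_path))
    writer = csv.DictWriter(open(output_path, "w"), reader.fieldnames)
    writer.writeheader()
    for row in reader:
        if parse_datetime(row[time_column_name]) > cutoff_time:
            continue
        writer.writerow(row)
        if ids_to_track_column_name is not None:
            kept_ids.add(row[ids_to_track_column_name])
    return kept_ids if ids_to_track_column_name is not None else None


def filter_file_based_on_ids_streaming_sketch(
        input_path, output_path, id_column_name, ids_to_keep):
    # Keep only rows whose id column appears in the previously collected set.
    reader = csv.DictReader(open(input_path))
    writer = csv.DictWriter(open(output_path, "w"), reader.fieldnames)
    writer.writeheader()
    for row in reader:
        if row[id_column_name] in ids_to_keep:
            writer.writerow(row)


def get_output_subdirectory_sketch(day_path, folder_name):
    # Assumed: returns <day_path>/<folder_name>, creating it if needed.
    subdirectory = os.path.join(day_path, folder_name)
    if not os.path.exists(subdirectory):
        os.makedirs(subdirectory)
    return subdirectory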
Example #13
import os
import pandas
from datetime import datetime, timedelta
from dateutil import parser, tz
from geflight.transform import utilities
import pytz
import random

import weather

raw_data_path = os.path.join(os.environ["DataPath"], "GEFlight", "RawPublicLeaderboard")
output_path = os.path.join(os.environ["DataPath"], "GEFlight", "Release 2", "PublicLeaderboardTrainDays")

start_day = datetime(2012,11,26,20,00, tzinfo=tz.tzutc())
cutoff_times = [start_day]
for i in range(1,14):
    cutoff_times.append(start_day + timedelta(i, 0))

for ct in cutoff_times:
    print(ct)
    day_output_path = os.path.join(output_path, utilities.get_day_str(ct, -9))
    day_beginning, day_end = utilities.get_day_boundaries(ct, -9)

    if not os.path.exists(day_output_path):
        os.makedirs(day_output_path)
    weather.process_one_day(raw_data_path, day_output_path, day_beginning, day_end, "train")
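For reference, the cutoff construction near the top of this script yields 14 consecutive daily cutoffs at 20:00 UTC, 2012-11-26 through 2012-12-09; the loop is equivalent to:

# Equivalent construction of the same 14 cutoff times.
from datetime import datetime, timedelta
from dateutil import tz

cutoff_times = [datetime(2012, 11, 26, 20, 0, tzinfo=tz.tzutc()) + timedelta(days=i)
                for i in range(14)]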