Exemplo n.º 1
0
    def test_get_full_output_path(self):
        """Check the path returned by utilities.get_full_output_path.

        The expected result is ``<root>/2012_10_25/FlightHistory`` for a
        cutoff time of 2012-10-25 08:00 UTC, and the directory must exist
        on disk after the call.
        """
        # Imported locally so this method does not depend on file-level imports.
        import shutil
        import tempfile

        # mkdtemp() creates a uniquely named directory in one step.
        # os.tempnam() only produced a name (open to races) and is not
        # available on Python 3.
        original_path = tempfile.mkdtemp()
        try:
            cutoff_time = datetime(2012, 10, 25, 8, 0, tzinfo=tzutc())
            output_path = utilities.get_full_output_path(original_path, "FlightHistory", cutoff_time, 0)
            self.assertEqual(os.path.join(original_path, "2012_10_25", "FlightHistory"), output_path)
            self.assertTrue(os.path.exists(output_path))
        finally:
            # Remove the scratch tree even when an assertion above fails.
            shutil.rmtree(original_path)
Exemplo n.º 2
0
    def test_get_full_output_path(self):
        """Verify utilities.get_full_output_path returns and creates the day folder.

        For a cutoff time of 2012-10-25 08:00 UTC the returned path is
        expected to be ``<root>/2012_10_25/FlightHistory`` and to exist.
        """
        # Local imports keep the method self-contained.
        import shutil
        import tempfile

        # tempfile.mkdtemp() securely creates the scratch directory;
        # os.tempnam() is race-prone and was removed in Python 3.
        original_path = tempfile.mkdtemp()
        try:
            cutoff_time = datetime(2012, 10, 25, 8, 0, tzinfo=tzutc())
            output_path = utilities.get_full_output_path(original_path,
                                                         "FlightHistory",
                                                         cutoff_time, 0)
            self.assertEqual(
                os.path.join(original_path, "2012_10_25", "FlightHistory"),
                output_path)
            self.assertTrue(os.path.exists(output_path))
        finally:
            # Always clean up, including when one of the assertions fails.
            shutil.rmtree(original_path)
Exemplo n.º 3
0
def process_flight_history_to_train_day_files(
    input_path,
    output_path,
    output_folder_name,
    output_file_name,
    cutoff_times,
    start_hours_offset=-9):
    """
    Split a flight-history CSV into one output CSV per cutoff time.

    The input is read through ``utilities.HeaderCsvReader``. Columns named by
    ``get_flight_history_columns_to_delete()`` are dropped from the header.
    For every cutoff time an output file called ``output_file_name`` is
    created inside the directory returned by
    ``utilities.get_full_output_path(output_path, output_folder_name,
    cutoff_time)`` and the reduced header is written to it.

    A row is kept only when ``is_flight_in_or_out_of_us`` accepts it and the
    day string from ``get_departure_day_str(row, start_hours_offset)``
    matches ``utilities.get_day_str`` of one of the cutoff times. Kept rows
    are buffered and written out every 100000 input rows, at which point a
    progress line is printed.

    Args:
        input_path: path of the flight-history CSV to read.
        output_path: root directory handed to ``utilities.get_full_output_path``.
        output_folder_name: folder name handed to the same helper.
        output_file_name: name of the CSV created in each day directory.
        cutoff_times: re-iterable collection of hashable cutoff times
            (it is iterated several times and its items are used as dict keys).
        start_hours_offset: forwarded to ``get_departure_day_str``.

    Returns:
        dict mapping each cutoff time to the set of ``flight_history_id``
        values of the rows written for it.

    Raises:
        KeyError: if the ``DataPath`` environment variable is not set.
    """
    # The progress message is phrased in units of 100k rows ("%d00k").
    rows_per_chunk = 100000

    departure_date_columns = get_flight_history_departure_date_columns()
    arrival_date_columns = get_flight_history_arrival_date_columns()

    day_flight_history_ids = {cutoff_time: set() for cutoff_time in cutoff_times}
    day_str_to_cutoff_time = {}
    file_handles = {}
    writers = {}

    # The context manager closes the input file on every exit path.
    with open(input_path) as input_file:
        reader = utilities.HeaderCsvReader(input_file)
        header_out = reader.get_header()
        for col in get_flight_history_columns_to_delete():
            header_out.remove(col)

        try:
            for cutoff_time in cutoff_times:
                day_output_path = utilities.get_full_output_path(output_path, output_folder_name, cutoff_time)
                file_output_path = os.path.join(day_output_path, output_file_name)
                file_handles[cutoff_time] = open(file_output_path, "w")
                writers[cutoff_time] = csv.writer(file_handles[cutoff_time], dialect=utilities.CsvDialect())
                writers[cutoff_time].writerow(header_out)
                day_str_to_cutoff_time[utilities.get_day_str(cutoff_time)] = cutoff_time

            codes_file = os.path.join(os.environ["DataPath"], "GEFlight", "Reference", "usairporticaocodes.txt")
            us_icao_codes = get_us_airport_icao_codes(codes_file)

            chunks_done = 0
            rows_in_chunk = 0
            relevant_in_chunk = 0
            buffer_dict = {cutoff_time: [] for cutoff_time in cutoff_times}

            for row in reader:
                rows_in_chunk += 1

                if is_flight_in_or_out_of_us(row, us_icao_codes):
                    parse_flight_history_dates(row, departure_date_columns, arrival_date_columns)
                    row_day_str = get_departure_day_str(row, start_hours_offset)
                    if row_day_str in day_str_to_cutoff_time:
                        row_cutoff = day_str_to_cutoff_time[row_day_str]
                        relevant_in_chunk += 1
                        buffer_dict[row_cutoff].append([row[col] for col in header_out])
                        day_flight_history_ids[row_cutoff].add(row["flight_history_id"])

                # The chunk check runs for every input row, relevant or not,
                # so the reported row count matches the rows actually read.
                if rows_in_chunk < rows_per_chunk:
                    continue

                chunks_done += 1
                print("%s: %d00k records processed, %d with relevant flights in this chunk" % (output_file_name, chunks_done, relevant_in_chunk))
                for cutoff_time in cutoff_times:
                    writers[cutoff_time].writerows(buffer_dict[cutoff_time])
                    file_handles[cutoff_time].flush()

                rows_in_chunk = 0
                relevant_in_chunk = 0
                buffer_dict = {cutoff_time: [] for cutoff_time in cutoff_times}

            # Write whatever is left from the final, partial chunk.
            for cutoff_time in cutoff_times:
                writers[cutoff_time].writerows(buffer_dict[cutoff_time])
        finally:
            # Close every output file that was opened, even after an error.
            for handle in file_handles.values():
                handle.close()

    return day_flight_history_ids
def process_flight_history_to_train_day_files(input_path,
                                              output_path,
                                              output_folder_name,
                                              output_file_name,
                                              cutoff_times,
                                              start_hours_offset=-9):
    """
    Write the relevant rows of a flight-history CSV to one CSV per cutoff time.

    The input is read with ``utilities.HeaderCsvReader``; the columns listed
    by ``get_flight_history_columns_to_delete()`` are removed from the
    header. Each cutoff time gets an output file named ``output_file_name``
    in the directory returned by ``utilities.get_full_output_path(
    output_path, output_folder_name, cutoff_time)``, starting with the
    reduced header.

    Rows rejected by ``is_flight_in_or_out_of_us`` are skipped, as are rows
    whose ``get_departure_day_str(row, start_hours_offset)`` does not equal
    ``utilities.get_day_str`` of any cutoff time. Accepted rows are buffered
    and flushed to disk after every 100000 input rows, together with a
    printed progress line.

    Args:
        input_path: path of the flight-history CSV to read.
        output_path: root directory passed to ``utilities.get_full_output_path``.
        output_folder_name: folder name passed to the same helper.
        output_file_name: name of the CSV created in each day directory.
        cutoff_times: re-iterable collection of hashable cutoff times
            (iterated several times and used as dict keys).
        start_hours_offset: forwarded to ``get_departure_day_str``.

    Returns:
        dict mapping each cutoff time to the set of ``flight_history_id``
        values of the rows written for it.

    Raises:
        KeyError: if the ``DataPath`` environment variable is not set.
    """
    # The progress message is phrased in units of 100k rows ("%d00k").
    rows_per_chunk = 100000

    departure_date_columns = get_flight_history_departure_date_columns()
    arrival_date_columns = get_flight_history_arrival_date_columns()

    day_flight_history_ids = {
        cutoff_time: set()
        for cutoff_time in cutoff_times
    }
    day_str_to_cutoff_time = {}
    file_handles = {}
    writers = {}

    # The context manager closes the input file on every exit path.
    with open(input_path) as input_file:
        reader = utilities.HeaderCsvReader(input_file)
        header_out = reader.get_header()
        for col in get_flight_history_columns_to_delete():
            header_out.remove(col)

        try:
            for cutoff_time in cutoff_times:
                day_output_path = utilities.get_full_output_path(
                    output_path, output_folder_name, cutoff_time)
                file_output_path = os.path.join(day_output_path,
                                                output_file_name)
                file_handles[cutoff_time] = open(file_output_path, "w")
                writers[cutoff_time] = csv.writer(
                    file_handles[cutoff_time],
                    dialect=utilities.CsvDialect())
                writers[cutoff_time].writerow(header_out)
                day_str_to_cutoff_time[utilities.get_day_str(
                    cutoff_time)] = cutoff_time

            codes_file = os.path.join(os.environ["DataPath"], "GEFlight",
                                      "Reference", "usairporticaocodes.txt")
            us_icao_codes = get_us_airport_icao_codes(codes_file)

            chunks_done = 0
            rows_in_chunk = 0
            relevant_in_chunk = 0
            buffer_dict = {cutoff_time: [] for cutoff_time in cutoff_times}

            for row in reader:
                rows_in_chunk += 1

                if is_flight_in_or_out_of_us(row, us_icao_codes):
                    parse_flight_history_dates(row, departure_date_columns,
                                               arrival_date_columns)
                    row_day_str = get_departure_day_str(
                        row, start_hours_offset)
                    if row_day_str in day_str_to_cutoff_time:
                        row_cutoff = day_str_to_cutoff_time[row_day_str]
                        relevant_in_chunk += 1
                        buffer_dict[row_cutoff].append(
                            [row[col] for col in header_out])
                        day_flight_history_ids[row_cutoff].add(
                            row["flight_history_id"])

                # The chunk check runs for every input row, relevant or not,
                # so the reported row count matches the rows actually read.
                if rows_in_chunk < rows_per_chunk:
                    continue

                chunks_done += 1
                print(
                    "%s: %d00k records processed, %d with relevant flights in this chunk"
                    % (output_file_name, chunks_done, relevant_in_chunk))
                for cutoff_time in cutoff_times:
                    writers[cutoff_time].writerows(buffer_dict[cutoff_time])
                    file_handles[cutoff_time].flush()

                rows_in_chunk = 0
                relevant_in_chunk = 0
                buffer_dict = {cutoff_time: [] for cutoff_time in cutoff_times}

            # Write whatever is left from the final, partial chunk.
            for cutoff_time in cutoff_times:
                writers[cutoff_time].writerows(buffer_dict[cutoff_time])
        finally:
            # Close every output file that was opened, even after an error.
            for handle in file_handles.values():
                handle.close()

    return day_flight_history_ids