Example #1
    def split_file_based_on_times_filter_on_ids_streaming(self):
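        # End-to-end check: write a synthetic CSV, split it into per-cutoff-day
        # folders, then verify both the returned tracked-id sets and the contents
        # of the two per-day output files.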
        # os.tempnam() was removed in Python 3; tempfile.mkdtemp() (stdlib tempfile
        # module, assumed imported at module level) creates the temp directory directly.
        input_dir = tempfile.mkdtemp()
        input_file = os.path.join(input_dir, "raw.csv")
        data = [(str(x), str(y), z) for x, y, z in zip(
            range(200010), itertools.cycle(range(5)),
            itertools.cycle(["Andrew", "Ben", "Chris", "David", "Anthony"]))]
        f = open(input_file, "w", newline="")  # newline="" keeps csv.writer from emitting blank rows
        w = csv.writer(f)
        w.writerow(["id1", "id2", "name"])
        w.writerows(data)
        f.close()

        ct1 = datetime(2012, 10, 25, 15, 00, tzinfo=tzutc())
        ct2 = datetime(2012, 10, 26, 15, 00, tzinfo=tzutc())

        cutoff_times = [ct1, ct2]
        ids_dict = {
            x: y
            for x, y in
            zip(cutoff_times,
                [set(["1", "3", "200005"]),
                 set(["2", "100001", "200006"])])
        }
        output_path = tempfile.mkdtemp()
        ids_back = utilities.split_file_based_on_times_filter_on_ids_streaming(
            input_file,
            output_path,
            "MyTestFolder",
            "test_output.csv",
            "id1",
            ids_dict,
            ids_to_track_column_name="id2",
            start_hours_offset=-9)
        self.assertEqual(set(["1", "3", "0"]), ids_back[ct1])
        self.assertEqual(set(["2", "1", "1"]), ids_back[ct2])

        f1path = os.path.join(output_path, "2012_10_25", "MyTestFolder",
                              "test_output.csv")
        f2path = os.path.join(output_path, "2012_10_26", "MyTestFolder",
                              "test_output.csv")

        f1 = open(f1path)
        f2 = open(f2path)

        f1data = list(csv.reader(f1))
        f2data = list(csv.reader(f2))

        self.assertEqual([["id1", "id2", "name"], ["1", "1", "Ben"],
                          ["3", "3", "David"], ["200005", "0", "Andrew"]],
                         f1data)

        self.assertEqual([["id1", "id2", "name"], ["2", "2", "Chris"],
                          ["100001", "1", "Ben"], ["200006", "1", "Ben"]],
                         f2data)

        # Clean up
        f1.close()
        f2.close()

        os.remove(f1path)
        os.remove(f2path)
        os.remove(input_file)
        for x in [
                os.path.join(output_path, "2012_10_25", "MyTestFolder"),
                os.path.join(output_path, "2012_10_26", "MyTestFolder"),
                os.path.join(output_path, "2012_10_25"),
                os.path.join(output_path, "2012_10_26"), output_path, input_dir
        ]:
            os.rmdir(x)
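
# ---------------------------------------------------------------------------
# Standalone usage sketch (not part of the original source): parameter roles are
# inferred from the test above, the file/folder/column names are placeholders,
# and "utilities" is assumed to be importable exactly as in the test.
# ---------------------------------------------------------------------------
def _example_split_usage(input_csv, output_dir, cutoff):
    # cutoff: a timezone-aware datetime, as in the test above
    ids_per_day = {cutoff: {"1", "3"}}  # cutoff time -> ids to keep for that day
    tracked = utilities.split_file_based_on_times_filter_on_ids_streaming(
        input_csv,             # source CSV containing the id column below
        output_dir,            # per-day folders (e.g. 2012_10_25/MyFolder/) are created here
        "MyFolder",            # folder created inside each day directory
        "out.csv",             # output file written inside that folder
        "id1",                 # column matched against the sets in ids_per_day
        ids_per_day,
        ids_to_track_column_name="id2",  # optional: collect this column's values per day
        start_hours_offset=-9)           # hour offset used when locating each day folder
    # tracked maps each cutoff time to the set of "id2" values from the rows kept for it
    return tracked
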
def raw_data_to_training_days(raw_data_path, training_days_path, cutoff_times):
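    # Split each raw CSV into per-training-day folders: flight history first (which
    # yields the per-day flight ids), then the event and ASDI tables filtered on
    # those ids (or on the derived flight-plan ids), and finally the weather files.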
    # df_flight_history = flighthistory.get_df_flight_history_from_raw_format(os.path.join(raw_data_path, "FlightHistory", "flighthistory.csv"))

    print(os.path.join(raw_data_path, "FlightHistory", "flighthistory.csv"))

    days_flight_ids = flighthistory.process_flight_history_to_train_day_files(
        input_path=os.path.join(raw_data_path, "FlightHistory", "flighthistory.csv"),
        output_path=training_days_path,
        output_folder_name="FlightHistory",
        output_file_name="flighthistory.csv",
        cutoff_times=cutoff_times,
        start_hours_offset=-9,
    )

    print ("Flight History Events")
    utilities.split_file_based_on_times_filter_on_ids_streaming(
        os.path.join(raw_data_path, "FlightHistory", "flighthistoryevents.csv"),
        training_days_path,
        "FlightHistory",
        "flighthistoryevents.csv",
        "flight_history_id",
        days_flight_ids,
    )

    print ("ASDI Flight Plan")
    days_flight_plan_ids = utilities.split_file_based_on_times_filter_on_ids_streaming(
        os.path.join(raw_data_path, "ASDI", "asdiflightplan.csv"),
        training_days_path,
        "ASDI",
        "asdiflightplan.csv",
        "flighthistoryid",
        days_flight_ids,
        ids_to_track_column_name="asdiflightplanid",
    )

    print ("ASDI Position")
    utilities.split_file_based_on_times_filter_on_ids_streaming(
        os.path.join(raw_data_path, "ASDI", "asdiposition.csv"),
        training_days_path,
        "ASDI",
        "asdiposition.csv",
        "flighthistoryid",
        days_flight_ids,
    )

    print ("ASDI Airway")
    utilities.split_file_based_on_times_filter_on_ids_streaming(
        os.path.join(raw_data_path, "ASDI", "asdiairway.csv"),
        training_days_path,
        "ASDI",
        "asdiairway.csv",
        "asdiflightplanid",
        days_flight_plan_ids,
    )

    print ("ASDI FPFix")
    utilities.split_file_based_on_times_filter_on_ids_streaming(
        os.path.join(raw_data_path, "ASDI", "asdifpfix.csv"),
        training_days_path,
        "ASDI",
        "asdifpfix.csv",
        "asdiflightplanid",
        days_flight_plan_ids,
    )

    print ("ASDI FPCenter")
    utilities.split_file_based_on_times_filter_on_ids_streaming(
        os.path.join(raw_data_path, "ASDI", "asdifpcenter.csv"),
        training_days_path,
        "ASDI",
        "asdifpcenter.csv",
        "asdiflightplanid",
        days_flight_plan_ids,
    )

    print ("ASDI FPSector")
    utilities.split_file_based_on_times_filter_on_ids_streaming(
        os.path.join(raw_data_path, "ASDI", "asdifpsector.csv"),
        training_days_path,
        "ASDI",
        "asdifpsector.csv",
        "asdiflightplanid",
        days_flight_plan_ids,
    )

    print ("ASDI FPWaypoint")
    utilities.split_file_based_on_times_filter_on_ids_streaming(
        os.path.join(raw_data_path, "ASDI", "asdifpwaypoint.csv"),
        training_days_path,
        "ASDI",
        "asdifpwaypoint.csv",
        "asdiflightplanid",
        days_flight_plan_ids,
    )

    for ct in cutoff_times:
        print(ct)
        # Each training day goes under <training_days_path>/<YYYY_MM_DD>, computed with
        # the same -9 hour offset used for the flight history split above.
        day_output_path = os.path.join(training_days_path, utilities.get_day_str(ct, -9))
        day_beginning, day_end = utilities.get_day_boundaries(ct, -9)

        if not os.path.exists(day_output_path):
            os.makedirs(day_output_path)
        weather.process_one_day(raw_data_path, day_output_path, day_beginning, day_end, "train")
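
# Minimal driver sketch (assumption, not part of the original source): cutoff_times
# are timezone-aware UTC datetimes, one per training day, in the same format the
# unit test above uses; the two paths here are placeholders.
if __name__ == "__main__":
    from datetime import datetime
    from dateutil.tz import tzutc

    example_cutoff_times = [
        datetime(2012, 11, 26, 15, 0, tzinfo=tzutc()),
        datetime(2012, 11, 27, 15, 0, tzinfo=tzutc()),
    ]
    raw_data_to_training_days("RawData", "TrainingDays", example_cutoff_times)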