Example #1
def predict_traffic_from_scats(_df):
    print("*** scats predictions ***")

    df = _df.copy()

    # derive hour-of-day and day-of-week features from the arrival times
    df["hour"] = df["arrival_time"].apply(lambda t: get_dt(t, "%H:%M:%S").hour)
    df["dow"] = df.apply(apply_dow,
                         ["start_date", "start_time", "arrival_time"])

    pca_coord = vaex.ml.PCA(features=["lat", "lon"],
                            n_components=2,
                            prefix="pca")
    df = pca_coord.fit_transform(df)

    cycl_transform_hour = vaex.ml.CycleTransformer(features=["hour"], n=24)
    df = cycl_transform_hour.fit_transform(df)

    cycl_transform_dow = vaex.ml.CycleTransformer(features=["dow"], n=7)
    df = cycl_transform_dow.fit_transform(df)

    # load the scats ml model
    scats_model = load(scats_model_path)

    # get the predictions from scats data
    df = scats_model.transform(df)
    print(f"made predictions, time: {duration()}")

    return df[_df.get_column_names() + ["p_avg_vol"]]
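
The CycleTransformer steps above encode hour and day-of-week as points on a circle, so that hour 23 and hour 0 end up as near neighbours rather than 23 units apart. A minimal numpy sketch of the same idea (the vaex transformer emits equivalent cos/sin feature pairs):

import numpy as np

def cycle_encode(values, n):
    # project values in [0, n) onto the unit circle
    radians = 2 * np.pi * np.asarray(values) / n
    return np.cos(radians), np.sin(radians)

x, y = cycle_encode([0, 23], n=24)
# hours 0 and 23 sit close together on the circle:
print(np.hypot(x[0] - x[1], y[0] - y[1]))  # ~0.26, not 23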
Example #2
def transform_data(df):
    # cast to int64 in case the column arrived as strings
    df["direction"] = df["direction"].astype("int64")

    # flag delayed arrivals using the is_delay helper
    df["is_delayed"] = df["arrival"].apply(is_delay)

    # transform the features into more machine-learning-friendly variables
    pca_coord = vaex.ml.PCA(features=["lat", "lon"],
                            n_components=2,
                            prefix="pca")
    df = pca_coord.fit_transform(df)

    cycl_transform_angle = vaex.ml.CycleTransformer(
        features=["direction_angle"], n=360)
    df = cycl_transform_angle.fit_transform(df)

    # transform timestamp
    df["t_dow"] = df["timestamp"].apply(
        lambda t: get_dt(t, "%Y-%m-%d %H:%M:%S").weekday())
    df["t_hour"] = df["timestamp"].apply(
        lambda t: get_dt(t, "%Y-%m-%d %H:%M:%S").hour)
    df["t_minute"] = df["timestamp"].apply(
        lambda t: get_dt(t, "%Y-%m-%d %H:%M:%S").minute)
    df["t_second"] = df["timestamp"].apply(
        lambda t: get_dt(t, "%Y-%m-%d %H:%M:%S").second)

    # transform arrival
    df["arr_minute"] = df["arrival_time"].apply(
        lambda t: get_dt(t, "%H:%M:%S").minute)
    df["arr_second"] = df["arrival_time"].apply(
        lambda t: get_dt(t, "%H:%M:%S").second)

    # arr_dow and arr_hour are created upstream (see create_model)
    cycl_transform_dow = vaex.ml.CycleTransformer(
        features=["t_dow", "arr_dow"], n=7)
    df = cycl_transform_dow.fit_transform(df)

    cycl_transform_hour = vaex.ml.CycleTransformer(
        features=["t_hour", "arr_hour"], n=24)
    df = cycl_transform_hour.fit_transform(df)

    cycl_transform_minute = vaex.ml.CycleTransformer(
        features=["t_minute", "t_second", "arr_minute", "arr_second"], n=60)
    df = cycl_transform_minute.fit_transform(df)

    label_encoder = vaex.ml.LabelEncoder(features=["route_id"],
                                         prefix="label_encode_")
    df = label_encoder.fit_transform(df)

    standard_scaler = vaex.ml.StandardScaler(
        features=["arrival_mean", "p_mean_vol"])
    df = standard_scaler.fit_transform(df)

    minmax_scaler = vaex.ml.MinMaxScaler(
        features=["shape_dist_traveled", "shape_dist_between"])
    df = minmax_scaler.fit_transform(df)

    print(f"dataWrangling done, ready to create model, time: {duration()}s")
    return df
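
Each fit_transform above only adds virtual columns, so the whole fitted chain (PCA, cyclic encodings, label encoder, scalers) can be captured as a state and replayed on unseen data, which is what make_prediction in Example #4 does with state_set. A minimal sketch, with a hypothetical file name:

# capture every fitted transformer / virtual column on the training frame
state = df.state_get()

# replay the identical transformations on a fresh batch
new_df = vaex.open("new_batch.hdf5")  # hypothetical path
new_df.state_set(state)  # now carries the pca*, *_x/*_y and scaled columns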
Example #3
def process_data():
    if not os.path.exists(stop_time_data_path):
        create_stop_time_data()

    print("*** processing data ***")
    df = vaex.open(gtfs_final_hdf5_path)

    # compute direction and day of week from realtime data
    df["direction"] = df["trip_id"].apply(lambda t: dir_f_trip(t))
    df["dow"] = df["start_date"].apply(lambda t: get_dt(t, "%Y%m%d").weekday())

    # store these columns in memory
    df.materialize("direction", inplace=True)
    df.materialize("dow", inplace=True)

    # 500 marks an error value (None isn't supported), so drop those rows
    df = df[df["direction"] != 500]

    # drop trip_id; rows are de-duplicated on the remaining columns below
    df.drop("trip_id", inplace=True)

    # important: these columns (plus service_days below) replace trip_id
    # as the join key, removing the dependency on trip_id
    cols = ["route_id", "stop_sequence", "stop_id", "start_time", "direction"]
    df = vaex_mjoin(df.shallow_copy(),
                    vaex.open(stop_time_data_path),
                    cols,
                    cols,
                    how="inner",
                    allow_duplication=True)

    # filter to keep only trips that ran on that day of week
    df["keep_trip"] = df.apply(
        lambda sd, dow: sd.replace("[", "").replace("]", "").replace(" ", "").
        split(",")[dow], ["service_days", "dow"])
    df = df[df.keep_trip == "True"]

    # drop redundant columns
    df.drop(["service_days", "dow", "keep_trip"], inplace=True)

    df = vaex.from_pandas(df.to_pandas_df().drop_duplicates(
        subset=[i for i in df.get_column_names() if i != "trip_id"]))
    # df = vx_dedupe(df, columns=[i for i in df.get_column_names() if i != "trip_id"])

    print(f"merged stop_time & gtfsr data, time: {duration()}")

    df = predict_traffic_from_scats(df)

    df.export_hdf5(gtfsr_processed_path)
    print(f"finished processing data, {duration()}")
Example #4
def make_prediction(data):
    st_df = MlConfig.st_df  # stop_time_data
    hm_df = MlConfig.hm_df  # historical means dataset
    model = MlConfig.state_model  # GTFSR vaex model state

    empty = ("", "")

    if not "start_time" in data or not "start_date" in data:
        return empty

    formatted_data = {
        "route_id": [str(data["route_id"])],
        "direction": [int(data["direction"])],
        "stop_sequence": [int(data["stop_sequence"])],
        "stop_id": [str(data["stop_id"])],
        "start_time": [str(data["start_time"])],
        "start_date": [int(data["start_date"])],
        "timestamp": [str(data["timestamp"])],
        "arrival": [int(data["arrival"] / 60)],
    }

    live_df = vaex.from_dict(formatted_data)

    live_df["arr_dow"] = live_df.start_date.apply(
        lambda d: get_dt(d, "%Y%m%d").weekday())
    live_df.materialize("arr_dow", inplace=True)

    # print(live_df.dtypes, "\n", st_df.dtypes, "\n", hm_df.dtypes, "\n")

    temp_df = st_df[
        (st_df["route_id"] == live_df[["route_id"]][0][0])
        & (st_df["stop_sequence"] == live_df[["stop_sequence"]][0][0])
        & (st_df["stop_id"] == live_df[["stop_id"]][0][0])
        & (st_df["start_time"] == live_df[["start_time"]][0][0])
        & (st_df["direction"] == live_df[["direction"]][0][0])].copy()

    if len(temp_df) < 1:
        return empty

    # join stop time data; the pre-filter above speeds this up by copying
    # only the relevant rows
    cols = ["route_id", "stop_sequence", "stop_id", "start_time", "direction"]
    live_df = vaex_mjoin(live_df,
                         temp_df,
                         cols,
                         cols,
                         how="inner",
                         allow_duplication=True)

    live_df["keep_trip"] = live_df.apply(
        lambda sd, dow: sd.replace("[", "").replace("]", "").replace(" ", "").
        split(",")[dow],
        ["service_days", "arr_dow"],
    )
    live_df = live_df[live_df.keep_trip == "True"]
    live_df.drop(["service_days", "keep_trip"], inplace=True)

    if len(live_df) < 1:
        return empty

    live_df["arr_hour"] = live_df["arrival_time"].apply(
        lambda t: get_dt(t, "%H:%M:%S").hour)
    live_df.materialize("arr_hour", inplace=True)

    # join the historical means to our dataset
    temp_df = hm_df[(hm_df["route_id"] == data["route_id"])
                    & (hm_df["stop_id"] == data["stop_id"])
                    & (hm_df["arr_dow"] == live_df[["arr_dow"]][0][0])
                    & (hm_df["arr_hour"] == live_df[["arr_hour"]][0][0])
                    & (hm_df["direction"] == int(data["direction"]))
                    & (hm_df["stop_sequence"] == live_df[["stop_sequence"
                                                          ]][0][0])].copy()

    if len(temp_df) < 1:
        return empty

    cols = [
        "route_id", "stop_id", "arr_dow", "arr_hour", "direction",
        "stop_sequence"
    ]
    live_df = vaex_mjoin(
        live_df,
        temp_df,
        cols,
        cols,
        how="inner",
    )

    if len(live_df) < 1:
        return empty

    # assert same type
    live_df["direction"] = live_df["direction"].astype("int64")
    live_df["shape_dist_traveled"] = live_df["shape_dist_traveled"].astype(
        "float64")
    live_df["lat"] = live_df["lat"].astype("float64")
    live_df["lon"] = live_df["lon"].astype("float64")
    live_df["direction_angle"] = live_df["direction_angle"].astype("float64")
    live_df["shape_dist_between"] = live_df["shape_dist_between"].astype(
        "float64")

    # materialize virtual columns to match the model state
    for col in live_df.get_column_names():
        if col not in live_df.get_column_names(virtual=False):
            live_df.materialize(col, inplace=True)
    try:
        live_df.state_set(model)

        if len(live_df) > 0:
            pred = live_df[["p_arrival_lgbm"]][0][0]
            # the prediction is in minutes; also return it rounded, in seconds
            return round(pred) * 60, pred
    except Exception:
        return empty
    return empty
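
On success make_prediction returns a pair (predicted delay rounded to whole minutes and expressed in seconds, raw model prediction in minutes); on a missing field or an empty join it returns ("", ""). A hypothetical call, with illustrative field values only:

sample = {
    "route_id": "60-1-d12-1",        # hypothetical GTFS route
    "direction": 1,
    "stop_sequence": 5,
    "stop_id": "8220DB000002",       # hypothetical stop
    "start_time": "08:30:00",
    "start_date": 20210301,
    "timestamp": "2021-03-01 08:35:00",
    "arrival": 120,                  # current delay in seconds
}

delay_seconds, delay_minutes = make_prediction(sample)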
Example #5
def create_model():
    if not os.path.exists(gtfsr_model_df_path):
        df = vaex.open(gtfsr_processed_path)
        df = df.sample(frac=1)  # shuffle the rows

        # remove outliers: drop all delays over 20 minutes in either direction
        outlier = 60 * 20
        df = df[(df["arrival"] >= -outlier)
                & (df["arrival"] <= outlier)
                & (df["departure"] >= -outlier)
                & (df["departure"] <= outlier)]

        df["arr_dow"] = df.apply(apply_dow,
                                 ["start_date", "start_time", "arrival_time"])
        df["arr_hour"] = df["arrival_time"].apply(
            lambda t: get_dt(t, "%H:%M:%S").hour)
        df["arrival"] = df["arrival"].apply(lambda t: 0 if t == 0 else t / 60)

        cols = [
            "route_id", "stop_id", "arr_dow", "arr_hour", "direction",
            "stop_sequence"
        ]

        # create the historical means dataset if it does not exist yet
        if not os.path.exists(gtfsr_historical_means_path):
            print("*** creating gtfsr historical means dataset ***")
            # a table of historical mean delays (arrival) and traffic volumes
            # (p_avg_vol), keyed by the identifier columns above

            means = (df.to_pandas_df()
                     .groupby(cols)
                     .agg({"arrival": "mean", "p_avg_vol": "mean"})
                     .rename(columns={"arrival": "arrival_mean",
                                      "p_avg_vol": "p_mean_vol"})
                     .reset_index())
            vaex.from_pandas(means).export_hdf5(gtfsr_historical_means_path)

        print("*** joining hist means ***")

        # join the arrival means to our dataset
        df = vaex_mjoin(df,
                        vaex.open(gtfsr_historical_means_path),
                        cols,
                        cols,
                        how="left")

        df = df[[
            "start_date",
            "start_time",
            "stop_sequence",
            "arrival",
            "timestamp",
            "stop_id",
            "arrival_time",
            "shape_dist_traveled",
            "direction",
            "route_id",
            "lat",
            "lon",
            "direction_angle",
            "shape_dist_between",
            "arr_dow",
            "arr_hour",
            "arrival_mean",
            "p_mean_vol",
        ]]

        df.export_hdf5(gtfsr_model_df_path)

    print("*** Start training ***")
    # open the model-ready dataset
    df = vaex.open(gtfsr_model_df_path)

    # transform our data
    df = transform_data(df)

    # train the model
    train_gtfsr(df)
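
The historical-means step is a plain pandas groupby/agg; a standalone sketch of the same aggregation on toy data (all values illustrative):

import pandas as pd

toy = pd.DataFrame({
    "route_id": ["A", "A", "B"],
    "stop_id": [1, 1, 2],
    "arr_dow": [0, 0, 0],
    "arr_hour": [8, 8, 9],
    "direction": [1, 1, 0],
    "stop_sequence": [3, 3, 7],
    "arrival": [2.0, 4.0, -1.0],    # delay in minutes
    "p_avg_vol": [100.0, 120.0, 80.0],
})

means = (toy.groupby(["route_id", "stop_id", "arr_dow", "arr_hour",
                      "direction", "stop_sequence"])
         .agg({"arrival": "mean", "p_avg_vol": "mean"})
         .rename(columns={"arrival": "arrival_mean",
                          "p_avg_vol": "p_mean_vol"})
         .reset_index())
# route A, stop 1 -> arrival_mean 3.0, p_mean_vol 110.0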