import os

import numpy as np
import vaex
import vaex.ml

# NOTE: project-level helpers (load, get_dt, apply_dow, is_delay, dir_f_trip,
# vaex_mjoin, duration, create_stop_time_data, train_gtfsr, MlConfig) and the
# *_path constants are assumed to be defined elsewhere in this repo.


def predict_traffic_from_scats(_df):
    print("*** scats predictions ***")
    df = _df.copy()
    df["hour"] = df["arrival_time"].apply(lambda t: get_dt(t, "%H:%M:%S").hour)
    df["dow"] = df.apply(apply_dow, ["start_date", "start_time", "arrival_time"])

    pca_coord = vaex.ml.PCA(features=["lat", "lon"], n_components=2, prefix="pca")
    df = pca_coord.fit_transform(df)

    cycl_transform_hour = vaex.ml.CycleTransformer(features=["hour"], n=24)
    df = cycl_transform_hour.fit_transform(df)

    cycl_transform_dow = vaex.ml.CycleTransformer(features=["dow"], n=7)
    df = cycl_transform_dow.fit_transform(df)

    # load the scats ml model
    scats_model = load(scats_model_path)

    # get the predictions from scats data
    df = scats_model.transform(df)
    print(f"made predictions, time: {duration()}")
    return df[_df.get_column_names() + ["p_avg_vol"]]
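
# A minimal sketch of the sin/cos encoding that vaex.ml.CycleTransformer is
# understood to apply above: a cyclic feature (hour, day of week) is mapped
# onto the unit circle, so hour 23 lands next to hour 0, which a raw 0-23
# integer feature cannot express.
def cycle_encode(values, n):
    """Encode a cyclic feature with period n as (cos, sin) coordinates."""
    values = np.asarray(values, dtype="float64")
    angle = 2 * np.pi * values / n
    return np.cos(angle), np.sin(angle)


# cycle_encode([0, 23], n=24) -> x ~ (1.000, 0.966), y ~ (0.000, -0.259):
# hours 0 and 23 are neighbours on the circle.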
def transform_data(df):
    # strong type casting in case the column arrives as str
    df["direction"] = df["direction"].astype("int64")
    df["is_delayed"] = df["arrival"].apply(is_delay)

    # transform the features into more machine-learning-friendly variables
    pca_coord = vaex.ml.PCA(features=["lat", "lon"], n_components=2, prefix="pca")
    df = pca_coord.fit_transform(df)

    cycl_transform_angle = vaex.ml.CycleTransformer(
        features=["direction_angle"], n=360)
    df = cycl_transform_angle.fit_transform(df)

    # transform timestamp
    df["t_dow"] = df["timestamp"].apply(
        lambda t: get_dt(t, "%Y-%m-%d %H:%M:%S").weekday())
    df["t_hour"] = df["timestamp"].apply(
        lambda t: get_dt(t, "%Y-%m-%d %H:%M:%S").hour)
    df["t_minute"] = df["timestamp"].apply(
        lambda t: get_dt(t, "%Y-%m-%d %H:%M:%S").minute)
    df["t_second"] = df["timestamp"].apply(
        lambda t: get_dt(t, "%Y-%m-%d %H:%M:%S").second)

    # transform arrival
    df["arr_minute"] = df["arrival_time"].apply(
        lambda t: get_dt(t, "%H:%M:%S").minute)
    df["arr_second"] = df["arrival_time"].apply(
        lambda t: get_dt(t, "%H:%M:%S").second)

    cycl_transform_dow = vaex.ml.CycleTransformer(
        features=["t_dow", "arr_dow"], n=7)
    df = cycl_transform_dow.fit_transform(df)

    cycl_transform_hour = vaex.ml.CycleTransformer(
        features=["t_hour", "arr_hour"], n=24)
    df = cycl_transform_hour.fit_transform(df)

    cycl_transform_minute = vaex.ml.CycleTransformer(
        features=["t_minute", "t_second", "arr_minute", "arr_second"], n=60)
    df = cycl_transform_minute.fit_transform(df)

    label_encoder = vaex.ml.LabelEncoder(features=["route_id"],
                                         prefix="label_encode_")
    df = label_encoder.fit_transform(df)

    standard_scaler = vaex.ml.StandardScaler(
        features=["arrival_mean", "p_mean_vol"])
    df = standard_scaler.fit_transform(df)

    minmax_scaler = vaex.ml.MinMaxScaler(
        features=["shape_dist_traveled", "shape_dist_between"])
    df = minmax_scaler.fit_transform(df)

    print(f"dataWrangling done, ready to create model, time: {duration()}s")
    return df
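
# A minimal, self-contained sketch (synthetic data, assuming vaex.ml's default
# `_x`/`_y` output naming) of the pattern transform_data() relies on: each
# fit_transform adds virtual columns to the dataframe, and the whole fitted
# chain is captured in the dataframe state, which make_prediction() later
# replays on live rows via state_set().
def _demo_state_roundtrip():
    train = vaex.from_arrays(hour=np.array([0, 6, 12, 23]),
                             route_id=np.array(["a", "b", "a", "c"]))
    train = vaex.ml.CycleTransformer(features=["hour"], n=24).fit_transform(train)
    train = vaex.ml.LabelEncoder(features=["route_id"],
                                 prefix="label_encode_").fit_transform(train)
    state = train.state_get()

    live = vaex.from_arrays(hour=np.array([23]), route_id=np.array(["a"]))
    live.state_set(state)  # replays the fitted transformations on the new row
    return live[["hour_x", "hour_y", "label_encode_route_id"]]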
def process_data():
    if not os.path.exists(stop_time_data_path):
        create_stop_time_data()

    print("*** processing data ***")
    df = vaex.open(gtfs_final_hdf5_path)

    # compute direction and day of week from realtime data
    df["direction"] = df["trip_id"].apply(dir_f_trip)
    df["dow"] = df["start_date"].apply(lambda t: get_dt(t, "%Y%m%d").weekday())

    # store these columns in memory
    df.materialize("direction", inplace=True)
    df.materialize("dow", inplace=True)

    # 500 marks an erroneous direction to remove, since None isn't supported
    df = df[df["direction"] != 500]

    # drop trip_id to remove duplicates
    df.drop("trip_id", inplace=True)

    # important: we join on these columns (and filter on service_days below)
    # so that we are no longer dependent on trip_id
    cols = ["route_id", "stop_sequence", "stop_id", "start_time", "direction"]
    df = vaex_mjoin(df.shallow_copy(),
                    vaex.open(stop_time_data_path),
                    cols,
                    cols,
                    how="inner",
                    allow_duplication=True)

    # filter to keep only trips that ran on that day of the week
    df["keep_trip"] = df.apply(
        lambda sd, dow: sd.replace("[", "").replace("]", "").replace(" ", "")
        .split(",")[dow], ["service_days", "dow"])
    df = df[df.keep_trip == "True"]

    # drop redundant columns
    df.drop(["service_days", "dow", "keep_trip"], inplace=True)
    df = vaex.from_pandas(df.to_pandas_df().drop_duplicates(
        subset=[i for i in df.get_column_names() if i != "trip_id"]))
    # df = vx_dedupe(df, columns=[i for i in df.get_column_names() if i != "trip_id"])
    print(f"merged stop_time & gtfsr data, time: {duration()}")

    df = predict_traffic_from_scats(df)
    df.export_hdf5(gtfsr_processed_path)
    print(f"finished processing data, {duration()}")
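
# A minimal sketch of the keep_trip filter used above (and again in
# make_prediction()), assuming service_days holds the string form of a
# 7-element boolean list, Monday..Sunday.
def _keep_trip(service_days, dow):
    """Return True if the trip runs on weekday `dow` (0 = Monday)."""
    flags = service_days.replace("[", "").replace("]", "").replace(" ", "").split(",")
    return flags[dow] == "True"


# _keep_trip("[True, True, True, True, True, False, False]", 5) -> False
# (the trip has no Saturday service)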
def make_prediction(data):
    st_df = MlConfig.st_df          # stop_time_data
    hm_df = MlConfig.hm_df          # historical means dataset
    model = MlConfig.state_model    # GTFSR vaex model state
    empty = ("", "")

    if "start_time" not in data or "start_date" not in data:
        return empty

    formatted_data = {
        "route_id": [str(data["route_id"])],
        "direction": [int(data["direction"])],
        "stop_sequence": [int(data["stop_sequence"])],
        "stop_id": [str(data["stop_id"])],
        "start_time": [str(data["start_time"])],
        "start_date": [int(data["start_date"])],
        "timestamp": [str(data["timestamp"])],
        "arrival": [int(data["arrival"] / 60)],
    }

    live_df = vaex.from_dict(formatted_data)
    live_df["arr_dow"] = live_df.start_date.apply(
        lambda d: get_dt(d, "%Y%m%d").weekday())
    live_df.materialize("arr_dow", inplace=True)
    # print(live_df.dtypes, "\n", st_df.dtypes, "\n", hm_df.dtypes, "\n")

    # filtering before the join improves speed by only copying relevant rows
    temp_df = st_df[(st_df["route_id"] == live_df[["route_id"]][0][0])
                    & (st_df["stop_sequence"] == live_df[["stop_sequence"]][0][0])
                    & (st_df["stop_id"] == live_df[["stop_id"]][0][0])
                    & (st_df["start_time"] == live_df[["start_time"]][0][0])
                    & (st_df["direction"] == live_df[["direction"]][0][0])].copy()
    if len(temp_df) < 1:
        return empty

    # join stop time data
    cols = ["route_id", "stop_sequence", "stop_id", "start_time", "direction"]
    live_df = vaex_mjoin(live_df, temp_df, cols, cols, how="inner",
                         allow_duplication=True)

    # keep only trips that run on this day of the week
    live_df["keep_trip"] = live_df.apply(
        lambda sd, dow: sd.replace("[", "").replace("]", "").replace(" ", "")
        .split(",")[dow], ["service_days", "arr_dow"])
    live_df = live_df[live_df.keep_trip == "True"]
    live_df.drop(["service_days", "keep_trip"], inplace=True)
    if len(live_df) < 1:
        return empty

    live_df["arr_hour"] = live_df["arrival_time"].apply(
        lambda t: get_dt(t, "%H:%M:%S").hour)
    live_df.materialize("arr_hour", inplace=True)

    # join the historical means to our dataset
    temp_df = hm_df[(hm_df["route_id"] == data["route_id"])
                    & (hm_df["stop_id"] == data["stop_id"])
                    & (hm_df["arr_dow"] == live_df[["arr_dow"]][0][0])
                    & (hm_df["arr_hour"] == live_df[["arr_hour"]][0][0])
                    & (hm_df["direction"] == int(data["direction"]))
                    & (hm_df["stop_sequence"] == live_df[["stop_sequence"]][0][0])].copy()
    if len(temp_df) < 1:
        return empty

    cols = ["route_id", "stop_id", "arr_dow", "arr_hour", "direction",
            "stop_sequence"]
    live_df = vaex_mjoin(live_df, temp_df, cols, cols, how="inner")
    if len(live_df) < 1:
        return empty

    # assert the same types as the training data
    live_df["direction"] = live_df["direction"].astype("int64")
    live_df["shape_dist_traveled"] = live_df["shape_dist_traveled"].astype("float64")
    live_df["lat"] = live_df["lat"].astype("float64")
    live_df["lon"] = live_df["lon"].astype("float64")
    live_df["direction_angle"] = live_df["direction_angle"].astype("float64")
    live_df["shape_dist_between"] = live_df["shape_dist_between"].astype("float64")

    # materialize virtual columns to match the model state
    for col in live_df.get_column_names():
        if col not in live_df.get_column_names(virtual=False):
            live_df.materialize(col, inplace=True)

    try:
        live_df.state_set(model)
        if len(live_df) > 0:
            # (predicted delay in seconds, predicted delay in minutes)
            prediction = live_df[["p_arrival_lgbm"]][0][0]
            return round(prediction) * 60, prediction
    except Exception:
        return empty
    return empty
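
# Hypothetical usage of make_prediction(): the keys mirror what the function
# reads from a GTFS-R trip update entity, but every value below is made up,
# and MlConfig must already be initialised with st_df, hm_df and state_model.
def _example_prediction():
    entity = {
        "route_id": "60-39A-b12-1",     # made-up route id
        "direction": 1,
        "stop_sequence": 12,
        "stop_id": "8250DB000768",      # made-up stop id
        "start_time": "09:30:00",
        "start_date": 20210301,
        "timestamp": "2021-03-01 09:42:10",
        "arrival": 120,                 # reported delay in seconds
    }
    # returns (predicted delay in seconds, predicted delay in minutes),
    # or ("", "") when no matching stop time / historical mean is found
    return make_prediction(entity)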
def create_model():
    if not os.path.exists(gtfsr_model_df_path):
        df = vaex.open(gtfsr_processed_path)
        df = df.sample(frac=1)

        # remove outliers from the dataset: all delays over 20 minutes
        outlier = 60 * 20
        df = df[(df["arrival"] >= -outlier) & (df["arrival"] <= outlier)
                & (df["departure"] >= -outlier) & (df["departure"] <= outlier)]

        df["arr_dow"] = df.apply(apply_dow,
                                 ["start_date", "start_time", "arrival_time"])
        df["arr_hour"] = df["arrival_time"].apply(
            lambda t: get_dt(t, "%H:%M:%S").hour)
        # convert the delay from seconds to minutes
        df["arrival"] = df["arrival"].apply(lambda t: 0 if t == 0 else t / 60)

        cols = ["route_id", "stop_id", "arr_dow", "arr_hour", "direction",
                "stop_sequence"]

        # if the arrival historical means dataset is not created, we create it
        if not os.path.exists(gtfsr_historical_means_path):
            print("*** creating gtfsr historical means dataset ***")
            # dataset of historical means keyed on route_id, stop_id, arrival
            # day of week/hour, direction and stop_sequence
            vaex.from_pandas(
                df.to_pandas_df().groupby(cols).agg({
                    "arrival": "mean",
                    "p_avg_vol": "mean"
                }).rename(columns={
                    "arrival": "arrival_mean",
                    "p_avg_vol": "p_mean_vol"
                }).reset_index()).export_hdf5(gtfsr_historical_means_path)

        print("*** joining hist means ***")
        # join the arrival means to our dataset
        df = vaex_mjoin(df,
                        vaex.open(gtfsr_historical_means_path),
                        cols,
                        cols,
                        how="left")
        df = df[[
            "start_date", "start_time", "stop_sequence", "arrival",
            "timestamp", "stop_id", "arrival_time", "shape_dist_traveled",
            "direction", "route_id", "lat", "lon", "direction_angle",
            "shape_dist_between", "arr_dow", "arr_hour", "arrival_mean",
            "p_mean_vol",
        ]]
        df.export_hdf5(gtfsr_model_df_path)

    print("*** Start training ***")
    # open the model-ready dataframe
    df = vaex.open(gtfsr_model_df_path)
    # transform our data
    df = transform_data(df)
    # train our model
    train_gtfsr(df)
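
# A minimal sketch (synthetic data) of the historical-means aggregation that
# create_model() performs above: mean delay and mean predicted traffic volume
# per (route, stop, day of week, hour, direction, stop sequence) key.
import pandas as pd


def _demo_historical_means():
    raw = pd.DataFrame({
        "route_id": ["A", "A", "A", "B"],
        "stop_id": ["s1", "s1", "s1", "s2"],
        "arr_dow": [0, 0, 0, 1],
        "arr_hour": [9, 9, 9, 17],
        "direction": [1, 1, 1, 0],
        "stop_sequence": [3, 3, 3, 7],
        "arrival": [1.0, 3.0, 2.0, -0.5],          # delay in minutes
        "p_avg_vol": [220.0, 240.0, 230.0, 90.0],  # predicted SCATS volume
    })
    cols = ["route_id", "stop_id", "arr_dow", "arr_hour", "direction",
            "stop_sequence"]
    return (raw.groupby(cols)
            .agg({"arrival": "mean", "p_avg_vol": "mean"})
            .rename(columns={"arrival": "arrival_mean",
                             "p_avg_vol": "p_mean_vol"})
            .reset_index())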