def roll_multi_result(df: pd.DataFrame, apply_func: callable, window: int, return_col_num: int, **kwargs):
    """Rolling window over a 2-D DataFrame for functions that return several columns.

    Each window is handed to ``apply_func`` as a raw numpy ndarray (no pandas
    overhead per window).

    :param df: input frame ([pd.DataFrame, pd.Series])
    :param apply_func: called once per window; note the first N columns of the
        array it receives are the original index values (N = index depth)
    :param window: rolling window length
    :param return_col_num: number of columns ``apply_func`` returns
    :param kwargs: forwarded to ``apply_func``
    :return: ndarray of shape ``(len(df), return_col_num)``; the first
        ``window - 1`` rows remain NaN
    """
    # Fold the index into the value matrix so apply_func can see it.
    flat = df.reset_index().values
    n_rows, n_cols = flat.shape
    row_step, col_step = flat.strides
    # Zero-copy view: one extra leading axis enumerating the windows.
    windows = stride(
        flat, (n_rows - (window - 1), window, n_cols), (row_step, row_step, col_step)
    )
    out = np.full((n_rows, return_col_num), np.nan)
    # First full window lands at row index (window - 1).
    for pos, chunk in enumerate(windows, window - 1):
        # chunk: leading column(s) are index values, the rest are data.
        out[pos] = apply_func(chunk, **kwargs)
    return out
def roll(df: pd.DataFrame, w: int, **kwargs):
    """Rolling window over a DataFrame that keeps all columns together.

    Example:
    >>> roll(pd.DataFrame(np.random.randn(10,3), index=list('ABCDEFGHIJ')), 3).apply(print)

    or, pipe-style:
    >>> pd.DataFrame(np.random.randn(10,3), index=list('ABCDEFGHIJ')).pipe(roll, 3).apply(lambda x: print(x[2]))

    :param df: pandas DataFrame
    :param w: window size (integers only)
    :return: a groupby object; each group is one window, keyed by the index
        label of the row that starts it
    :raises ValueError: if the window is longer than the frame
    """
    if w > len(df):
        raise ValueError("Window size exceeds number of rows !")
    raw = df.values
    n_rows, n_cols = raw.shape
    row_step, col_step = raw.strides
    # Zero-copy stack of overlapping windows.
    windows = stride(raw, (n_rows - (w - 1), w, n_cols), (row_step, row_step, col_step))
    frames = {
        label: pd.DataFrame(win, columns=df.columns)
        for label, win in zip(df.index, windows)
    }
    return pd.concat(frames).groupby(level=0, **kwargs)
def roll(df, w, **kwargs):
    """Roll a multi-column DataFrame into overlapping windows.

    :param df: pandas DataFrame
    :param w: window size (integers only)
    :return: a groupby object; each group is one window of ``w`` rows, keyed
        by the index label of the row that starts it
    :raises ValueError: if the window is longer than the frame
    """
    # Guard up front (consistent with the sibling roll()): without it,
    # w > len(df) reaches as_strided with a negative shape dimension and
    # fails with an unhelpful low-level numpy error.
    if w > len(df):
        raise ValueError("Window size exceeds number of rows !")
    v = df.values
    d0, d1 = v.shape
    s0, s1 = v.strides
    # Zero-copy view stacking the d0-(w-1) overlapping windows.
    a = stride(v, (d0 - (w - 1), w, d1), (s0, s0, s1))
    rolled_df = pd.concat({
        row: pd.DataFrame(values, columns=df.columns)
        for row, values in zip(df.index, a)
    })
    return rolled_df.groupby(level=0, **kwargs)
def makeS(full, train, var):
    """Broadcast one training-set column across as many columns as *full* has rows.

    Given "train" (a subset of "full" used to fit a model) and a variable name,
    build a virtual matrix with ``len(train)`` rows and ``len(full)`` columns in
    which every column repeats the selected training column. Because the column
    stride is 0, the result is a zero-copy view costing no more memory than the
    single underlying column.
    """
    width = len(full)
    col = train[var].values
    # Column stride of 0 makes every column alias the same data.
    return stride(col, shape=(len(col), width), strides=(col.itemsize, 0))
def roll(df, w, **kwargs):
    # Produces an iterable of window-sized DataFrames (for window-type
    # functions), grouped by the index label starting each window.
    # When no full window fits, returns an empty ndarray instead.
    raw = df.values
    height, width = raw.shape
    step_r, step_c = raw.strides
    slabs = stride(raw, (height - (w - 1), w, width), (step_r, step_r, step_c))
    if len(slabs) == 0:
        return np.array([])
    stacked = pd.concat({
        key: pd.DataFrame(chunk, columns=df.columns)
        for key, chunk in zip(df.index, slabs)
    })
    return stacked.groupby(level=0, **kwargs)
def roll(df, w, **kwargs):
    """Helper applying a rolling calculation across multiple columns at once."""
    mat = df.values
    rows, cols = mat.shape
    stride_row, stride_col = mat.strides
    # View the data as (n_windows, w, cols) without copying.
    view = stride(mat, (rows - (w - 1), w, cols), (stride_row, stride_row, stride_col))
    pieces = {}
    for label, slab in zip(df.index, view):
        pieces[label] = pd.DataFrame(slab, columns=df.columns)
    return pd.concat(pieces).groupby(level=0, **kwargs)
def roll(df, w, **kwargs):
    """Roll a DataFrame into overlapping windows.

    Input:
        df -> DataFrame to be rolled
        w  -> length of the rolling window
    """
    data = df.values
    n, m = data.shape
    sr, sc = data.strides
    shaped = stride(data, (n - (w - 1), w, m), (sr, sr, sc))
    keyed = dict(
        zip(df.index, (pd.DataFrame(block, columns=df.columns) for block in shaped))
    )
    return pd.concat(keyed).groupby(level=0, **kwargs)
def _transform_all_data(self) -> pd.DataFrame:
    """Load raw NGSIM trajectory data and transform it for SMARTS.

    Reads the whitespace-separated NGSIM file, converts feet to meters,
    applies map offsets, smooths positions with a moving average, and derives
    per-frame headings and discretized speeds from the smoothed positions.

    :return: the transformed DataFrame (possibly filtered to the map width).
    """
    self._log.debug("transforming NGSIM data")
    df = pd.read_csv(
        self._path,
        sep=r"\s+",
        header=None,
        # Column order follows the NGSIM export format; x/y names are swapped
        # up front when the dataset spec requests it.
        names=(
            "vehicle_id",
            "frame_id",  # 1 frame per .1s
            "total_frames",
            "global_time",  # msecs
            # front center in feet from left lane edge
            "position_x" if not self._swap_xy else "position_y",
            # front center in feet from entry edge
            "position_y" if not self._swap_xy else "position_x",
            "global_x" if not self._swap_xy else "global_y",  # front center in feet
            "global_y" if not self._swap_xy else "global_x",  # front center in feet
            "length",  # feet
            "width",  # feet
            "type",  # 1 = motorcycle, 2 = auto, 3 = truck
            "speed",  # feet / sec
            "acceleration",  # feet / sec^2
            "lane_id",  # lower is further left
            "preceding_vehicle_id",
            "following_vehicle_id",
            "spacing",  # feet
            "headway",  # secs
        ),
    )
    # Rebase timestamps so the earliest frame is sim_time 0.
    df["sim_time"] = df["global_time"] - min(df["global_time"])

    # offset of the map from the data...
    x_offset = self._dataset_spec.get("x_offset_px", 0) / self.scale
    y_offset = self._dataset_spec.get("y_offset_px", 0) / self.scale

    # Unit conversion: NGSIM distances/speeds are in feet.
    df["length"] *= METERS_PER_FOOT
    df["width"] *= METERS_PER_FOOT
    df["speed"] *= METERS_PER_FOOT
    df["acceleration"] *= METERS_PER_FOOT
    df["spacing"] *= METERS_PER_FOOT
    df["position_y"] *= METERS_PER_FOOT
    # SMARTS uses center not front
    df["position_x"] = (
        df["position_x"] * METERS_PER_FOOT - 0.5 * df["length"] - x_offset
    )
    # NOTE(review): this overwrites position_x from position_y when a
    # y_offset is configured (discarding the conversion just computed above).
    # Looks like it may have been intended as
    # df["position_y"] = df["position_y"] - y_offset — confirm against the
    # dataset specs that actually set y_offset_px.
    if y_offset:
        df["position_x"] = df["position_y"] - y_offset
    if self._flip_y:
        max_y = self._dataset_spec["map_net"]["max_y"]
        # Mirror y about the map height (max_y is in px; scale converts).
        df["position_y"] = (max_y / self.scale) - df["position_y"]

    # Use moving average to smooth positions...
    df.sort_values("sim_time", inplace=True)  # just in case it wasn't already...
    k = 15  # kernel size for positions
    for vehicle_id in set(df["vehicle_id"]):
        same_car = df["vehicle_id"] == vehicle_id
        # Centered-ish smoothing: k-wide trailing mean, then shifted back by
        # k-1 so the window ends at the current row's future, not its past.
        df.loc[same_car, "position_x"] = (
            df.loc[same_car, "position_x"]
            .rolling(window=k)
            .mean()
            .shift(1 - k)
            .values
        )
        df.loc[same_car, "position_y"] = (
            df.loc[same_car, "position_y"]
            .rolling(window=k)
            .mean()
            .shift(1 - k)
            .values
        )
        # and compute heading with (smaller) rolling window (=3) too..
        v = df.loc[same_car, ["position_x", "position_y"]].shift(1).values
        d0, d1 = v.shape
        s0, s1 = v.strides
        # Zero-copy (d0-2, 3, d1) view: one 3-frame window per output row.
        headings = [
            self._cal_heading(values)
            for values in stride(v, (d0 - 2, 3, d1), (s0, s0, s1))
        ]
        # Pad the tail so the column matches the group length.
        # NOTE(review): headings[-1] raises IndexError for a vehicle with
        # fewer than 3 frames — confirm such tracks cannot occur upstream.
        df.loc[same_car, "heading_rad"] = headings + [headings[-1], headings[-1]]
        # ... and new speeds (based on these smoothed positions)
        # (This also overcomes problem that NGSIM speeds are "instantaneous"
        # and so don't match with dPos/dt, which can affect some models.)
        speeds = [
            self._cal_speed(values)
            for values in stride(v, (d0 - 2, 3, d1), (s0, s0, s1))
        ]
        df.loc[same_car, "speed_discrete"] = speeds + [None, None]

    map_width = self._dataset_spec["map_net"].get("width")
    if map_width:
        # Drop rows whose (scaled) x position falls outside the map.
        valid_x = (df["position_x"] * self.scale).between(0, map_width)
        df = df[valid_x]
    return df
def _transform_all_data(self) -> pd.DataFrame:
    """Load raw NGSIM (or NGSIM2) trajectory data and transform it for SMARTS.

    Reads the whitespace-separated file, converts feet to meters, applies map
    margins, smooths positions with a moving average, derives headings (via a
    configurable window) and discretized speeds, then shifts positions from
    the vehicle front to its centerpoint.

    :return: the transformed DataFrame (possibly filtered to the map width).
    """
    self._log.debug("transforming NGSIM data")
    # Column order follows the NGSIM export format; x/y names are swapped up
    # front when the dataset spec requests it.
    cols = (
        "vehicle_id",
        "frame_id",  # 1 frame per .1s
        "total_frames",
        "global_time",  # msecs
        # front center in feet from left lane edge
        "position_x" if not self._swap_xy else "position_y",
        # front center in feet from entry edge
        "position_y" if not self._swap_xy else "position_x",
        "global_x" if not self._swap_xy else "global_y",  # front center in feet
        "global_y" if not self._swap_xy else "global_x",  # front center in feet
        "length",  # feet
        "width",  # feet
        "type",  # 1 = motorcycle, 2 = auto, 3 = truck
        "speed",  # feet / sec
        "acceleration",  # feet / sec^2
        "lane_id",  # lower is further left
        "preceding_vehicle_id",
        "following_vehicle_id",
        "spacing",  # feet
        "headway",  # secs
    )
    if self._dataset_spec.get("source") == "NGSIM2":
        # NGSIM2 exports carry extra routing columns inserted before
        # "spacing"/"headway" (presumably matching that format's layout —
        # TODO confirm against an NGSIM2 data dictionary).
        extra_cols = (
            "origin_zone",
            "destination_zone",
            "intersection",
            "section",
            "direction",
            "movement",
        )
        cols = cols[:16] + extra_cols + cols[16:]
    df = pd.read_csv(self._path, sep=r"\s+", header=None, names=cols)
    # Rebase timestamps so the earliest frame is sim_time 0.
    df["sim_time"] = df["global_time"] - min(df["global_time"])

    # offset of the map from the data...
    x_margin = self._dataset_spec.get("x_margin_px", 0) / self.scale
    y_margin = self._dataset_spec.get("y_margin_px", 0) / self.scale

    # Unit conversion: NGSIM distances/speeds are in feet.
    df["length"] *= METERS_PER_FOOT
    df["width"] *= METERS_PER_FOOT
    df["speed"] *= METERS_PER_FOOT
    df["acceleration"] *= METERS_PER_FOOT
    df["spacing"] *= METERS_PER_FOOT
    df["position_x"] *= METERS_PER_FOOT
    df["position_y"] *= METERS_PER_FOOT
    if x_margin:
        df["position_x"] = df["position_x"] - x_margin
    # NOTE(review): this assigns position_x from position_y when a y_margin
    # is configured (clobbering the x_margin adjustment above). Looks like it
    # may have been intended as
    # df["position_y"] = df["position_y"] - y_margin — confirm against the
    # dataset specs that actually set y_margin_px.
    if y_margin:
        df["position_x"] = df["position_y"] - y_margin
    if self._flip_y:
        max_y = self._dataset_spec["map_net"]["max_y"]
        # Mirror y about the map height (max_y is in px; scale converts).
        df["position_y"] = (max_y / self.scale) - df["position_y"]

    # Use moving average to smooth positions...
    df.sort_values("sim_time", inplace=True)  # just in case it wasn't already...
    k = 15  # kernel size for positions
    for vehicle_id in set(df["vehicle_id"]):
        same_car = df["vehicle_id"] == vehicle_id
        # Centered-ish smoothing: k-wide trailing mean, then shifted back by
        # k-1 so the window ends at the current row's future, not its past.
        df.loc[same_car, "position_x"] = (df.loc[same_car, "position_x"].rolling(
            window=k).mean().shift(1 - k).values)
        df.loc[same_car, "position_y"] = (df.loc[same_car, "position_y"].rolling(
            window=k).mean().shift(1 - k).values)
        # and compute heading with (smaller) rolling window (=3) too..
        shift = int(self._heading_window / 2)
        # pad = number of tail rows that get no window of their own.
        pad = self._heading_window - shift - 1
        v = df.loc[same_car, ["position_x", "position_y", "speed"]].values
        # Prepend NaN rows so the first rows still get a (partial) window.
        v = np.insert(v, 0, [[np.nan, np.nan, np.nan]] * shift, axis=0)
        # sliding_window_view with a 2-D window shape yields one
        # (1, heading_window, 3) block per output row.
        headings = [
            self._cal_heading(values)
            for values in sliding_window_view(v, (self._heading_window, 3))
        ]
        # Repeat the last heading to fill the padded tail rows.
        # NOTE(review): headings[-1] raises IndexError for a vehicle with
        # fewer frames than the window — confirm such tracks cannot occur.
        df.loc[same_car, "heading_rad"] = headings + [headings[-1]] * pad
        # ... and new speeds (based on these smoothed positions)
        # (This also overcomes problem that NGSIM speeds are "instantaneous"
        # and so don't match with dPos/dt, which can affect some models.)
        v = df.loc[same_car, ["position_x", "position_y"]].shift(1).values
        d0, d1 = v.shape
        s0, s1 = v.strides
        # Zero-copy (d0-2, 3, d1) view: one 3-frame window per output row.
        speeds = [
            self._cal_speed(values)
            for values in stride(v, (d0 - 2, 3, d1), (s0, s0, s1))
        ]
        df.loc[same_car, "speed_discrete"] = speeds + [None, None]

    # since SMARTS' positions are the vehicle centerpoints, but NGSIM's are at the front,
    # now adjust the vehicle position to its centerpoint based on its angle (+y = 0 rad)
    df["position_x"] = df["position_x"] - 0.5 * df["length"] * np.cos(
        df["heading_rad"] + 0.5 * math.pi)
    df["position_y"] = df["position_y"] - 0.5 * df["length"] * np.sin(
        df["heading_rad"] + 0.5 * math.pi)

    map_width = self._dataset_spec["map_net"].get("width")
    if map_width:
        # Keep rows whose (scaled) x keeps the whole vehicle inside the map.
        valid_x = (df["position_x"] * self.scale).between(
            df["length"] / 2, map_width - df["length"] / 2)
        df = df[valid_x]
    return df