import pandas as pd

# These accessors memoize dataframes in a module-level `cache` dict and rely
# on helpers defined elsewhere in the repo: get_path_preprocessed,
# get_path_originals, convert_to_datetime, check_mode_and_t, check_t and
# split_dataset_X_y.


def distances():
    if cache['preprocessed']['distances'] is None:
        filepath = get_path_preprocessed('', '', 'distances.csv.gz')
        cache['preprocessed']['distances'] = convert_to_datetime(
            pd.read_csv(filepath, engine='c'))
    return cache['preprocessed']['distances']

def sensors_original():
    if cache['originals']['sensors'] is None:
        filepath = get_path_originals('sensors.csv.gz')
        cache['originals']['sensors'] = convert_to_datetime(
            pd.read_csv(filepath, engine='c'))
    return cache['originals']['sensors']

def base_dataset(mode='local', t='train'):
    check_mode_and_t(mode, t)
    if cache['preprocessed'][mode][t]['base_dataset'] is None:
        filepath = get_path_preprocessed(mode, t, 'base_dataset.csv.gz')
        cache['preprocessed'][mode][t]['base_dataset'] = convert_to_datetime(
            pd.read_csv(filepath, engine='c'))
    return cache['preprocessed'][mode][t]['base_dataset']

def weather():
    if cache['preprocessed']['weather'] is None:
        filepath = get_path_preprocessed('', '',
                                         'base_structure_df_weather.csv.gz')
        cache['preprocessed']['weather'] = convert_to_datetime(
            pd.read_csv(filepath, engine='c'))
    return cache['preprocessed']['weather']

def weather_original(t='train'):
    check_t(t)
    if cache['originals'][t]['weather'] is None:
        # The 2019 file doubles as the weather source for the second test set.
        filename = 'weather_2019.csv.gz' if t == 'test2' else f'weather_{t}.csv.gz'
        filepath = get_path_originals(filename)
        cache['originals'][t]['weather'] = convert_to_datetime(
            pd.read_csv(filepath, engine='c'))
    return cache['originals'][t]['weather']

def speed_test_masked():
    if cache['preprocessed']['local']['test']['speeds_masked'] is None:
        filepath = get_path_preprocessed('local', 'test',
                                         'speeds_test_masked.csv.gz')
        cache['preprocessed']['local']['test']['speeds_masked'] = \
            convert_to_datetime(
                pd.read_csv(filepath, engine='c', index_col=0))
    return cache['preprocessed']['local']['test']['speeds_masked']

def distances_original():
    if cache['originals']['distances'] is None:
        filepath = get_path_originals('distances.csv.gz')
        cache['originals']['distances'] = convert_to_datetime(
            pd.read_csv(filepath, engine='c', sep='|',
                        names=['KEY_KM', 'STATIONS']))
    return cache['originals']['distances']

def dataset(mode='local', t='train', onehot=True, drop_index_columns=True,
            export=False):
    check_mode_and_t(mode, t)
    if mode == 'full' and t == 'test':
        export = True
    if cache['preprocessed'][mode][t]['dataset'] is None:
        filepath = get_path_preprocessed(mode, t, 'merged_dataset.csv.gz')
        cache['preprocessed'][mode][t]['dataset'] = convert_to_datetime(
            pd.read_csv(filepath, engine='c'))
        # Sort by timestamp (to replicate their split).
        cache['preprocessed'][mode][t]['dataset'].sort_values(
            'DATETIME_UTC_y_0', inplace=True)
    return split_dataset_X_y(cache['preprocessed'][mode][t]['dataset'],
                             onehot, drop_index_columns, export)

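# Typical use (assuming split_dataset_X_y returns an (X, y) pair):
#     X, y = dataset(mode='local', t='train')
#
# A minimal sketch (hypothetical helper, not part of the original module) of
# the cache layout the accessors above assume, inferred only from the keys
# they touch; the real initialization elsewhere in the repo may differ.
def _empty_cache():
    modes = ['local', 'full']           # values accepted by dataset(mode=...)
    ts = ['train', 'test', 'test2']     # 'test2' appears in weather_original
    return {
        'originals': {
            'sensors': None,
            'distances': None,
            **{t: {'weather': None} for t in ts},
        },
        'preprocessed': {
            'distances': None,
            'weather': None,
            **{m: {t: {'base_dataset': None, 'dataset': None,
                       'speeds_masked': None}
                   for t in ts} for m in modes},
        },
    }
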
def join_to(self, df, one_hot=False):
    """Join this feature to the specified dataframe.

    The default implementation joins on the columns the two dataframes
    have in common. Override to provide custom join logic.
    """
    feature_df = convert_to_datetime(self.read_feature(one_hot=one_hot))
    return pd.merge(df, feature_df, how='left')

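# Toy illustration (invented data) of that default: with how='left' and no
# `on=` argument, pandas merges on every column the two frames share, so rows
# without a matching feature row keep NaN in the feature columns.
def _demo_default_join():
    left = pd.DataFrame({'KEY': ['A', 'B'], 'x': [1, 2]})
    feat = pd.DataFrame({'KEY': ['A'], 'my_feature': [0.5]})
    return pd.merge(left, feat, how='left')  # row 'B' gets NaN for my_feature
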
def join_to(self, df, one_hot=False):
    feature_df = convert_to_datetime(self.read_feature(one_hot=one_hot))
    # The hourly feature is keyed by time of day, so keep only '%H:%M:%S'.
    feature_df.DATETIME_UTC_SPEED_SENSOR_HOUR = \
        feature_df.DATETIME_UTC_SPEED_SENSOR_HOUR.dt.strftime('%H:%M:%S')

    stats = ['avg_speed_sensor_hour', 'avg_speed_sd_sensor_hour',
             'avg_speed_min_sensor_hour', 'avg_speed_max_sensor_hour',
             'avg_n_vehicles_sensor_hour']

    # Attach the hourly sensor averages once per prediction horizon y_0..y_3,
    # suffixing the stat columns accordingly.
    for i in range(4):
        feature_df_y_i = feature_df.rename(
            columns={s: '{}_y_{}'.format(s, i) for s in stats})
        time_col = 'DATETIME_UTC_y_{}_m'.format(i)
        df[time_col] = pd.to_datetime(
            df['DATETIME_UTC_y_{}'.format(i)]).dt.strftime('%H:%M:%S')
        df = df.merge(feature_df_y_i,
                      left_on=['KEY', 'KM', time_col],
                      right_on=['KEY', 'KM', 'DATETIME_UTC_SPEED_SENSOR_HOUR'],
                      how='left')
        df = df.drop([time_col, 'DATETIME_UTC_SPEED_SENSOR_HOUR'], axis=1)
    return df

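# A minimal sketch (invented data) of the matching logic above: each
# prediction timestamp is reduced to its time of day ('%H:%M:%S') and matched
# against the hourly per-sensor profile.
def _demo_hourly_match():
    df = pd.DataFrame({'KEY': ['A'], 'KM': [10],
                       'DATETIME_UTC_y_0': ['2019-01-07 08:00:00']})
    hourly = pd.DataFrame({'KEY': ['A'], 'KM': [10],
                           'DATETIME_UTC_SPEED_SENSOR_HOUR': ['08:00:00'],
                           'avg_speed_sensor_hour': [92.4]})
    df['tod'] = pd.to_datetime(df['DATETIME_UTC_y_0']).dt.strftime('%H:%M:%S')
    return df.merge(
        hourly, left_on=['KEY', 'KM', 'tod'],
        right_on=['KEY', 'KM', 'DATETIME_UTC_SPEED_SENSOR_HOUR'],
        how='left').drop(columns=['tod', 'DATETIME_UTC_SPEED_SENSOR_HOUR'])
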
def join_to(self, df, one_hot=False):
    # Note: unlike the variants above, one_hot is accepted here but not
    # forwarded to read_feature.
    f = convert_to_datetime(self.read_feature())
    return pd.merge(df, f, how='left')