def distances():
    """Return the preprocessed distances dataframe, reading it on first use."""
    slot = cache['preprocessed']
    if slot['distances'] is None:
        # Lazily load, convert datetime columns, and memoize.
        path = get_path_preprocessed('', '', 'distances.csv.gz')
        slot['distances'] = convert_to_datetime(pd.read_csv(path, engine='c'))
    return slot['distances']
def sensors_original():
    """Return the original sensors dataframe, loading it lazily into the cache."""
    originals = cache['originals']
    if originals['sensors'] is None:
        path = get_path_originals('sensors.csv.gz')
        originals['sensors'] = convert_to_datetime(pd.read_csv(path, engine='c'))
    return originals['sensors']
def base_dataset(mode='local', t='train'):
    """Return the cached base dataset for (mode, t), reading it on first access."""
    check_mode_and_t(mode, t)
    entry = cache['preprocessed'][mode][t]
    if entry['base_dataset'] is None:
        path = get_path_preprocessed(mode, t, 'base_dataset.csv.gz')
        entry['base_dataset'] = convert_to_datetime(
            pd.read_csv(path, engine='c'))
    return entry['base_dataset']
def weather():
    """Return the preprocessed weather dataframe, loading it on first use."""
    slot = cache['preprocessed']
    if slot['weather'] is None:
        path = get_path_preprocessed('', '',
                                     'base_structure_df_weather.csv.gz')
        slot['weather'] = convert_to_datetime(pd.read_csv(path, engine='c'))
    return slot['weather']
def weather_original(t='train'):
    """Return the original weather dataframe for split *t*, cached after first read.

    Bug fix: the per-split filename was the plain string 'weather_{t}.csv.gz',
    so the ``{t}`` placeholder was never substituted and a literal,
    non-existent filename was requested; it is now an f-string.
    """
    check_t(t)
    if cache['originals'][t]['weather'] is None:
        # test2 reuses the 2019 weather file; every other split has its own.
        filename = 'weather_2019.csv.gz' if t == 'test2' else f'weather_{t}.csv.gz'
        filepath = get_path_originals(filename)
        cache['originals'][t]['weather'] = convert_to_datetime(
            pd.read_csv(filepath, engine='c'))

    return cache['originals'][t]['weather']
def speed_test_masked():
    """Return the masked local-test speeds dataframe, loading it lazily."""
    bucket = cache['preprocessed']['local']['test']
    if bucket['speeds_masked'] is None:
        path = get_path_preprocessed('local', 'test',
                                     'speeds_test_masked.csv.gz')
        # First column of the CSV is the index.
        bucket['speeds_masked'] = convert_to_datetime(
            pd.read_csv(path, engine='c', index_col=0))
    return bucket['speeds_masked']
def distances_original():
    """Return the original distances dataframe, loading it on first access."""
    originals = cache['originals']
    if originals['distances'] is None:
        path = get_path_originals('distances.csv.gz')
        # Pipe-separated file with no header row.
        frame = pd.read_csv(path,
                            engine='c',
                            sep='|',
                            names=['KEY_KM', 'STATIONS'])
        originals['distances'] = convert_to_datetime(frame)
    return originals['distances']
def dataset(mode='local',
            t='train',
            onehot=True,
            drop_index_columns=True,
            export=False):
    """Load (and cache) the merged dataset, then split it into X/y."""
    check_mode_and_t(mode, t)
    # Full-mode test predictions are always exported.
    if mode == 'full' and t == 'test':
        export = True
    entry = cache['preprocessed'][mode][t]
    if entry['dataset'] is None:
        path = get_path_preprocessed(mode, t, 'merged_dataset.csv.gz')
        frame = convert_to_datetime(pd.read_csv(path, engine='c'))
        # SORT BY TIMESTAMP (to replicate their split)
        frame.sort_values('DATETIME_UTC_y_0', inplace=True)
        entry['dataset'] = frame
    return split_dataset_X_y(entry['dataset'], onehot, drop_index_columns,
                             export)
 def join_to(self, df, one_hot=False):
     """Join this feature onto ``df``.

     Default implementation: left-join on the columns the two dataframes
     share. Subclasses override this to provide custom join logic.
     """
     feature_df = convert_to_datetime(self.read_feature(one_hot=one_hot))
     return df.merge(feature_df, how='left')
# --- Example 10 ---
    def join_to(self, df, one_hot=False):
        """Join the per-sensor hourly averages onto ``df`` for each of the
        four prediction horizons y_0..y_3.

        For every horizon ``i`` the feature columns are suffixed with
        ``_y_i`` and matched on (KEY, KM, time-of-day of DATETIME_UTC_y_i).
        The original implementation repeated the same rename/merge/drop
        stanza four times; this version folds it into one loop.
        """
        feature_df = convert_to_datetime(self.read_feature(one_hot=one_hot))
        # Reduce the feature timestamp to its time-of-day string so it can
        # be matched against the hour of each target datetime.
        feature_df.DATETIME_UTC_SPEED_SENSOR_HOUR = \
            feature_df.DATETIME_UTC_SPEED_SENSOR_HOUR.dt.strftime('%H:%M:%S')

        base_cols = [
            'avg_speed_sensor_hour',
            'avg_speed_sd_sensor_hour',
            'avg_speed_min_sensor_hour',
            'avg_speed_max_sensor_hour',
            'avg_n_vehicles_sensor_hour',
        ]
        for i in range(4):
            suffixed = feature_df.rename(
                columns={c: f'{c}_y_{i}' for c in base_cols})
            time_col = f'DATETIME_UTC_y_{i}_m'
            # Time-of-day of the i-th target timestamp, as a join key.
            df[time_col] = pd.to_datetime(
                df[f'DATETIME_UTC_y_{i}']).dt.strftime('%H:%M:%S')
            df = df.merge(suffixed,
                          left_on=['KEY', 'KM', time_col],
                          right_on=['KEY', 'KM',
                                    'DATETIME_UTC_SPEED_SENSOR_HOUR'],
                          how='left')
            # Drop the helper key columns once the merge is done.
            df = df.drop([time_col, 'DATETIME_UTC_SPEED_SENSOR_HOUR'],
                         axis=1)

        return df
# --- Example 11 ---
 def join_to(self, df, one_hot=False):
     """Left-join this feature onto ``df`` on their shared columns."""
     # NOTE(review): one_hot is accepted but not forwarded to
     # read_feature() — confirm this is intentional.
     feature_df = convert_to_datetime(self.read_feature())
     return df.merge(feature_df, how='left')