Example #1
def create_base_structure_hours():
    """
    Create the base structure: a pandas DataFrame with the columns
    KEY | DATETIME_UTC | KM
    It is useful for joining with other dataframes and contains all the
    DATETIME_UTC values present in the train and test speeds.csv files.
    """
    start = time()

    # define the base path where to save the base_structure
    _BASE_PATH = 'resources/dataset/preprocessed'

    # check if the folder exists; if not, create it
    utils.check_folder(_BASE_PATH)

    speeds_train = data.speeds('train')
    speeds_test = data.speeds('test')

    # create all the datetimes between min train and max test datetime
    min_train_datetime = sorted(
        pd.to_datetime(
            speeds_train['DATETIME_UTC']).unique())[0].astype('int') // 10**9
    max_test_datetime = sorted(
        pd.to_datetime(
            speeds_test['DATETIME_UTC']).unique())[-1].astype('int') // 10**9
    # note that np.arange excludes the stop value, so max_test_datetime itself is left out
    range_datetimes = np.arange(min_train_datetime, max_test_datetime, 60 * 60)
    datetime_df = pd.DataFrame(pd.to_datetime(range_datetimes, unit='s'),
                               columns=['DATETIME_UTC'])

    key_2_train = speeds_train.KEY_2.unique()
    key_2_test = speeds_test.KEY_2.unique()

    # get all the unique key_2 in train and test
    key_2_full = sorted(set(key_2_test) | set(key_2_train))

    temp = pd.DataFrame(list(map(lambda x: x.split('_'), key_2_full)),
                        columns=['KEY', 'KM'])

    # add a dummy column so that the merge produces a cartesian product
    temp['dummy'] = 0
    datetime_df['dummy'] = 0

    print('Doing cartesian product... it will take a while!')
    base_structure = pd.merge(datetime_df, temp).drop(['dummy'], axis=1)
    print('Done\n')

    print('sorting values...')
    base_structure = base_structure.sort_values(['DATETIME_UTC', 'KEY',
                                                 'KM']).reset_index(drop=True)
    print('Done\n')

    # save the base structure
    print('Saving base structure to {}/base_structure_hours.csv'.format(_BASE_PATH))
    base_structure.to_csv(f'{_BASE_PATH}/base_structure_hours.csv',
                          index=False)
    print('Done\n')

    print(f'PROCEDURE ENDED SUCCESSFULLY IN: {round(time() - start, 4)} s')
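The dummy-column merge above is the classic cross-join trick: both frames get a constant key, so every row matches every row. On pandas 1.2+ the same product can be requested directly with how='cross'; a minimal sketch on toy frames (not the project's real files):

import pandas as pd

# toy stand-ins for datetime_df and temp above
dates = pd.DataFrame({'DATETIME_UTC': pd.date_range('2019-01-01', periods=3, freq='h')})
sensors = pd.DataFrame({'KEY': ['A', 'A', 'B'], 'KM': [10, 20, 5]})

# every (datetime, sensor) combination: 3 x 3 = 9 rows
base = dates.merge(sensors, how='cross')
print(base.shape)  # (9, 3)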
Example #2

    def extract_feature(self):
        print('Loading datasets')
        speeds = None

        if self.mode == 'local':
            tr = data.speeds_original('train')
            te = data.speed_test_masked()
            speeds = pd.concat([tr, te])
            del tr
            del te

        elif self.mode == 'full':
            tr = data.speeds(mode='full')
            te = data.speeds_original('test2')
            speeds = pd.concat([tr, te])
            del tr
            del te

        sensors = data.sensors()
        print('Done')

        df = pd.merge(speeds.dropna(),
                      sensors,
                      left_on=[KEY, KM],
                      right_on=[KEY, KM])
        df[DATETIME] = pd.to_datetime(df.DATETIME_UTC)

        return df[['ROAD_TYPE', 'SPEED_AVG']].groupby('ROAD_TYPE').mean().reset_index()\
            .rename(columns={'SPEED_AVG': 'avg_speed_roadtype'})
Example #3

    def extract_feature(self):
        speeds = None

        if self.mode == 'local':
            tr = data.speeds_original('train')
            te = data.speed_test_masked()
            speeds = pd.concat([tr, te])
            del tr
            del te

        elif self.mode == 'full':
            tr = data.speeds(mode='full')
            te = data.speeds_original('test2')
            speeds = pd.concat([tr, te])
            del tr
            del te

        print('Extracting min and max timestamps...')
        min_datetime = speeds.DATETIME_UTC.min()
        max_datetime = speeds.DATETIME_UTC.max()
        print('Done')
        df = pd.DataFrame(
            pd.date_range(min_datetime, max_datetime,
                          freq='15min').to_series()).reset_index()
        df[DATETIME] = pd.to_datetime(df['index'])
        df = df[[DATETIME]]
        df['WEEK_DAY'] = pd.to_datetime(df[DATETIME]).dt.weekday
        df['IS_WEEKEND'] = df.WEEK_DAY.map(lambda x: 1 if x in [5, 6] else 0)
        return df.rename(columns={'DATETIME_UTC': 'DATETIME_UTC_y_0'})
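A quick toy check of the weekday/weekend logic above (pandas numbers weekdays Monday=0, so 5 and 6 are Saturday and Sunday); the date range below is an assumption for illustration:

import pandas as pd

# Friday 2019-01-04 through Monday 2019-01-07 on a 15-minute grid
df = pd.DataFrame({'DATETIME_UTC': pd.date_range('2019-01-04', '2019-01-07', freq='15min')})
df['WEEK_DAY'] = df['DATETIME_UTC'].dt.weekday               # Monday=0 ... Sunday=6
df['IS_WEEKEND'] = df['WEEK_DAY'].isin([5, 6]).astype(int)   # flag Saturday/Sunday
print(df.groupby('WEEK_DAY')['IS_WEEKEND'].first())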
Example #4
    def extract_feature(self):
        df = None

        if self.mode == 'local':
            tr = data.speeds_original('train')
            te = data.speed_test_masked()
            df = pd.concat([tr, te])
            del tr
            del te

        elif self.mode == 'full':
            tr = data.speeds(mode='full')
            te = data.speeds_original('test2')
            df = pd.concat([tr, te])
            del tr
            del te

        df.DATETIME_UTC = df.DATETIME_UTC.dt.strftime('%H:%M:%S')
        return df[['KEY', 'KM', 'DATETIME_UTC', 'SPEED_AVG', 'SPEED_SD', 'SPEED_MIN', 'SPEED_MAX', 'N_VEHICLES']].groupby(['KEY', 'KM', 'DATETIME_UTC']).mean().reset_index()\
            .rename(columns={'DATETIME_UTC': 'DATETIME_UTC_SPEED_SENSOR_HOUR',
                            'SPEED_AVG': 'avg_speed_sensor_hour',
                            'SPEED_SD': 'avg_speed_sd_sensor_hour',
                            'SPEED_MIN': 'avg_speed_min_sensor_hour',
                            'SPEED_MAX': 'avg_speed_max_sensor_hour',
                            'N_VEHICLES': 'avg_n_vehicles_sensor_hour'})
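The key step above is formatting DATETIME_UTC as HH:MM:SS, so that grouping collapses the date and averages each sensor over its time-of-day slot. A minimal sketch with made-up readings:

import pandas as pd

speeds = pd.DataFrame({
    'KEY': [1, 1, 1, 1],
    'KM': [10, 10, 10, 10],
    'DATETIME_UTC': pd.to_datetime(['2019-01-01 08:00', '2019-01-02 08:00',
                                    '2019-01-01 09:00', '2019-01-02 09:00']),
    'SPEED_AVG': [80.0, 90.0, 60.0, 70.0],
})
# keep only the time of day, then average per (sensor, slot)
speeds['DATETIME_UTC'] = speeds['DATETIME_UTC'].dt.strftime('%H:%M:%S')
profile = speeds.groupby(['KEY', 'KM', 'DATETIME_UTC'], as_index=False)['SPEED_AVG'].mean()
print(profile)  # 08:00:00 -> 85.0, 09:00:00 -> 65.0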
Example #5
    def extract_feature(self):
        speeds = None

        if self.mode == 'local':
            tr = data.speeds_original('train')
            te = data.speed_test_masked()
            speeds = pd.concat([tr, te])
            del tr
            del te

        elif self.mode == 'full':
            tr = data.speeds(mode='full')
            te = data.speeds_original('test2')
            speeds = pd.concat([tr, te])
            del tr
            del te

        feature_cols = ["DATETIME_UTC", "KEY", "KM", "N_VEHICLES"]
        speeds = speeds.loc[:, feature_cols]
        speeds["N_VEHICLES"] = speeds.N_VEHICLES.fillna(0).astype(int)
        # day of week: 0 = Monday ... 6 = Sunday
        speeds["day"] = speeds.DATETIME_UTC.dt.weekday
        speeds = speeds[['KEY', 'KM', 'N_VEHICLES',
                         'day']].groupby(['KEY', 'KM',
                                          'day']).mean().reset_index()

        return speeds.rename(
            columns={'N_VEHICLES': 'avg_n_vehicles_sensor_per_day'})
Example #6

    def extract_feature(self):
        s = None
        if self.mode == 'local':
            tr = data.speeds_original('train').drop(['KEY_2'], axis=1)
            te = data.speed_test_masked().drop(['KEY_2'], axis=1)
            s = pd.concat([tr, te])
            del tr
            del te

        elif self.mode == 'full':
            tr = data.speeds(mode='full').drop(['KEY_2'], axis=1)
            te = data.speeds_original('test2').drop(['KEY_2'], axis=1)
            s = pd.concat([tr, te])
            del tr
            del te

        f = s[['KEY', 'DATETIME_UTC', 'KM']].copy()
        s = s.rename(columns={'DATETIME_UTC': 'DATETIME_UTC_drop'})
        for i in tqdm(range(1, self.n_days_before + 1)):
            colname = 'DATETIME_UTC_{}_D'.format(i)
            f[colname] = f.DATETIME_UTC - pd.Timedelta(days=i)
            f = pd.merge(f, s, how='left', left_on=['KEY', 'KM', colname], \
                        right_on=['KEY', 'KM', 'DATETIME_UTC_drop']) \
                        .drop([colname, 'DATETIME_UTC_drop'], axis=1)
            f = f.rename(
                columns={
                    'SPEED_AVG': 'SPEED_AVG_{}_DAY_BEFORE'.format(i),
                    'SPEED_SD': 'SPEED_SD_{}_DAY_BEFORE'.format(i),
                    'SPEED_MIN': 'SPEED_MIN_{}_DAY_BEFORE'.format(i),
                    'SPEED_MAX': 'SPEED_MAX_{}_DAY_BEFORE'.format(i),
                    'N_VEHICLES': 'N_VEHICLES_{}_DAY_BEFORE'.format(i)
                })
        return f.rename(columns={'DATETIME_UTC': 'DATETIME_UTC_y_0'})
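The loop above builds day-lagged features by shifting the timestamp back and left-merging the frame against itself, so missing days simply yield NaN. A stripped-down sketch of a single lag step on toy data:

import pandas as pd

s = pd.DataFrame({
    'KEY': [1, 1],
    'KM': [10, 10],
    'DATETIME_UTC': pd.to_datetime(['2019-01-01 08:00', '2019-01-02 08:00']),
    'SPEED_AVG': [80.0, 75.0],
})

f = s[['KEY', 'KM', 'DATETIME_UTC']].copy()
f['LOOKUP'] = f['DATETIME_UTC'] - pd.Timedelta(days=1)  # same slot, one day earlier
lagged = s.rename(columns={'DATETIME_UTC': 'LOOKUP',
                           'SPEED_AVG': 'SPEED_AVG_1_DAY_BEFORE'})
f = f.merge(lagged, on=['KEY', 'KM', 'LOOKUP'], how='left').drop('LOOKUP', axis=1)
print(f)  # the 2019-01-02 row picks up 80.0 from the day before; the first row gets NaN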
Example #7

def avg_speed_for_roadtype() -> pd.DataFrame:
    print('Loading datasets')
    speeds = data.speeds()
    sensors = data.sensors()
    print('Done')

    df = pd.merge(speeds.dropna(),
                  sensors,
                  left_on=[KEY, KM],
                  right_on=[KEY, KM])
    df[DATETIME] = pd.to_datetime(df.DATETIME_UTC)

    return df[['ROAD_TYPE', 'SPEED_AVG']].groupby('ROAD_TYPE').mean()
Example #8
    def extract_feature(self):

        if self.mode == 'local':
            tr = data.speeds_original('train')
            te = data.speed_test_masked()
            f = pd.concat([tr, te])
        elif self.mode == 'full':
            tr = data.speeds(mode='full')
            te = data.speeds_original('test2')
            f = pd.concat([tr, te])
        del tr
        del te

        etr = data.events(self.mode, 'train')
        ete = data.events(self.mode, 'test')
        ef = pd.concat([etr, ete])
        del etr
        del ete

        m = pd.merge(ef, f, left_on=['KEY', 'DATETIME_UTC'], right_on=['KEY', 'DATETIME_UTC'])
        m = m[(m.KM >= m.KM_START) & (m.KM <= m.KM_END)]

        # hypothesized completion: distance of the sensor from the event start,
        # mirroring the distance_start = KM - KM_START computation used elsewhere
        # in this codebase (the original line was left incomplete)
        m['start_event_distance'] = m.KM - m.KM_START
        return m
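The event/sensor match above is an equi-join on KEY and timestamp followed by a kilometre-range filter, since an event only affects the sensors between KM_START and KM_END. A self-contained toy illustration of that filter:

import pandas as pd

events = pd.DataFrame({'KEY': [1],
                       'DATETIME_UTC': pd.to_datetime(['2019-01-01 08:00']),
                       'KM_START': [10], 'KM_END': [20]})
speeds = pd.DataFrame({'KEY': [1, 1, 1],
                       'DATETIME_UTC': pd.to_datetime(['2019-01-01 08:00'] * 3),
                       'KM': [5, 15, 25], 'SPEED_AVG': [90.0, 40.0, 95.0]})

m = events.merge(speeds, on=['KEY', 'DATETIME_UTC'])
m = m[(m.KM >= m.KM_START) & (m.KM <= m.KM_END)]  # keep sensors inside the event span
print(m[['KM', 'SPEED_AVG']])  # only the KM 15 sensor survives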
Example #9
    def extract_feature(self):
        df = None

        if self.mode == 'local':
            tr = data.speeds_original('train')
            te = data.speed_test_masked()
            df = pd.concat([tr, te])
            del tr
            del te

        elif self.mode == 'full':
            tr = data.speeds(mode='full')
            te = data.speeds_original('test2')
            df = pd.concat([tr, te])
            del tr
            del te

        f = df[['KEY', 'SPEED_AVG', 'SPEED_SD', 'SPEED_MIN', 'SPEED_MAX', 'N_VEHICLES']].groupby(['KEY']).mean().reset_index()\
                .rename(columns={'SPEED_AVG': 'avg_speed_street',\
                                'SPEED_SD': 'avg_speed_sd_street', \
                                'SPEED_MIN': 'avg_speed_min_street', \
                                'SPEED_MAX': 'avg_speed_max_street', \
                                'N_VEHICLES': 'avg_n_vehicles_street'})
        return f
Example #10

    def extract_feature(self):
        speeds = None

        if self.mode == 'local':
            tr = data.speeds_original('train')
            te = data.speed_test_masked()
            speeds = pd.concat([tr, te])
            del tr
            del te

        elif self.mode == 'full':
            tr = data.speeds(mode='full')
            te = data.speeds_original('test2')
            speeds = pd.concat([tr, te])
            del tr
            del te

        etr = data.events(self.mode, 'train')
        ete = data.events(self.mode, 'test')
        ef = pd.concat([etr, ete])
        del etr
        del ete

        t = ef[['START_DATETIME_UTC', 'END_DATETIME_UTC', 'KEY', 'KM_START', \
        'KM_END', 'DATETIME_UTC', 'EVENT_TYPE', 'EVENT_DETAIL']]
        t = t.loc[t.groupby(['START_DATETIME_UTC', 'END_DATETIME_UTC', 'KEY'],
                            as_index=False).DATETIME_UTC.idxmin()]
        t['DATETIME_UTC-1'] = t.DATETIME_UTC - pd.Timedelta(minutes=15)
        t = t.drop(['START_DATETIME_UTC', 'END_DATETIME_UTC', 'DATETIME_UTC'],
                   axis=1)
        speeds = speeds[['KEY', 'KM', 'DATETIME_UTC', 'SPEED_AVG']]

        final = pd.merge(t,
                         speeds,
                         left_on=['KEY', 'DATETIME_UTC-1'],
                         right_on=['KEY', 'DATETIME_UTC'])
        final = final.rename(columns={
            'SPEED_AVG': 'speed_avg-1',
            'KM': 'KM-1'
        })
        final = final.drop(['DATETIME_UTC'], axis=1)
        final = final[(final['KM-1'] >= final.KM_START)
                      & (final['KM-1'] <= final.KM_END)]

        ds = []
        for ts in range(4):
            m_ = t.copy()
            print(len(m_))
            quarters_delta = ts + 1
            m_['DATETIME_UTC_{}'.format(
                quarters_delta)] = m_['DATETIME_UTC-1'] + pd.Timedelta(
                    minutes=15 * quarters_delta)
            m_ = pd.merge(m_, speeds, \
                        left_on=['KEY', 'DATETIME_UTC_{}'.format(quarters_delta)], \
                        right_on=['KEY', 'DATETIME_UTC'], how='left')
            m_ = m_.rename(columns={'SPEED_AVG': 'speed_avg_{}'.format(quarters_delta), \
                                    'KM': 'KM_{}'.format(quarters_delta)})
            m_ = m_.drop(['DATETIME_UTC'], axis=1)
            m_ = m_[(m_['KM_{}'.format(quarters_delta)] >= m_.KM_START)
                    & (m_['KM_{}'.format(quarters_delta)] <= m_.KM_END)]
            m_ = m_.rename(columns={'KM_{}'.format(quarters_delta): 'KM-1'})
            m_ = m_.drop(['DATETIME_UTC_{}'.format(quarters_delta)], axis=1)
            print(len(m_))
            ds.append(m_)

        final = final.drop(['KM-1'], axis=1)
        for i in range(len(ds)):
            df = ds[i]
            j = i + 1
            print('shape before {}'.format(len(final)))
            final = pd.merge(final, df)
            print('shape after {}'.format(len(final)))

        final = final[[
            'EVENT_TYPE', 'speed_avg-1', 'speed_avg_1', 'speed_avg_2',
            'speed_avg_3', 'speed_avg_4'
        ]]
        final['diff-1-step'] = final['speed_avg_1'] - final['speed_avg-1']
        final['diff-2-step'] = final['speed_avg_2'] - final['speed_avg-1']
        final['diff-3-step'] = final['speed_avg_3'] - final['speed_avg-1']
        final['diff-4-step'] = final['speed_avg_4'] - final['speed_avg-1']
        final = final.drop([
            'speed_avg_1', 'speed_avg-1', 'speed_avg_2', 'speed_avg_3',
            'speed_avg_4'
        ],
                           axis=1)
        return final.groupby(['EVENT_TYPE'], as_index=False).mean()
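The groupby(...).DATETIME_UTC.idxmin() selection near the top of this example picks the row label of the earliest time step of each event; .loc then retrieves those rows. A toy illustration of the trick:

import pandas as pd

ef = pd.DataFrame({
    'KEY': [1, 1, 2],
    'START_DATETIME_UTC': pd.to_datetime(['2019-01-01 08:00', '2019-01-01 08:00',
                                          '2019-01-01 09:00']),
    'DATETIME_UTC': pd.to_datetime(['2019-01-01 08:15', '2019-01-01 08:00',
                                    '2019-01-01 09:00']),
})
# per group, idxmin returns the index label of the smallest DATETIME_UTC
first_rows = ef.loc[ef.groupby(['KEY', 'START_DATETIME_UTC']).DATETIME_UTC.idxmin()]
print(first_rows)  # the first time-step row of each event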
Example #11

import os
import sys

# make the project root (the current working directory) importable
sys.path.append(os.getcwd())

from src.utils import *
from src.utility import merge_speed_events
import src.data as data
import src.utility as utils
from src.utils import resources_path
from src.preprocessing.other_features import avg_speed_for_roadtype_event
from tqdm import tqdm

if __name__ == '__main__':
    for t in ['train', 'test', '2019']:
        print('Reading datasets...')
        X_df = data.base_dataset(mode=t)
        speeds = data.speeds(mode=t)
        print('Done')

        speeds[DATETIME] = pd.to_datetime(speeds[DATETIME])
        print('Inferring...')
        window_len = sum(X_df.columns.str.match('^SPEED_AVG_-.*$') * 1)
        for i in tqdm(range(1, window_len + 1)):
            time = 'DATETIME_UTC_-' + str(i)
            speed_avg = 'SPEED_AVG_-' + str(i)
            speed_max = 'SPEED_MAX_-' + str(i)
            speed_min = 'SPEED_MIN_-' + str(i)
            speed_std = 'SPEED_SD_-' + str(i)
            n_cars = 'N_VEHICLES_-' + str(i)
            X_df[time] = pd.to_datetime(X_df[time])

            # hypothetical completion -- the snippet is truncated here in the
            # source; presumably the raw datetime column is dropped once used
            X_df.drop(time, axis=1, inplace=True)
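The window length above is inferred by counting the columns whose names match a regex. That step can be checked in isolation on a hand-made column index:

import pandas as pd

cols = pd.Index(['KEY', 'SPEED_AVG_-1', 'SPEED_AVG_-2', 'DATETIME_UTC_-1'])
window_len = cols.str.match(r'^SPEED_AVG_-.*$').sum()  # boolean matches summed
print(window_len)  # 2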
Example #12
def create_base_dataset(mode,
                        steps_behind_event,
                        steps_after_event=3,
                        validation_split=0.2):
    """
    Create the dataframe containing the road measurements for every timestamp,
    together with additional information about sensors, events and weather.
    """
    print(
        f'Creating base dataset for {mode.upper()} with timewindows ({steps_behind_event}, {steps_after_event})'
    )

    # load dataframes to be joined
    # - sensors
    sensors = data.sensors()
    weather = data.weather()

    for t in ['train', 'test']:
        print()
        print('Creating dataset', t.upper())
        # - speeds
        # if speed_imputed:
        #     s = data.speeds(mode).merge(sensors, how='left')
        # else:
        print('Merging speeds and events...')
        e = data.events(mode, t)

        if mode == 'local':
            speeds = data.speeds_original(t)
        elif mode == 'full':
            speeds = data.speeds(mode=mode, t=t)

        print('Done')
        print_memory_usage()

        # create the time windows for each event
        print('Creating time windows for events...')

        # find the starting time of each event
        ev_agg = e.astype({
            'KEY': 'int'
        }).groupby('index').agg({
            'step_duration': 'first',
            'EVENT_DETAIL': 'first',
            'EVENT_TYPE': 'first',
            'KM_END': 'first',
            'KM_START': 'first',
            'KEY': 'first',
            'KEY_2': 'first',
            'KM_EVENT': 'first',
            'START_DATETIME_UTC': 'min',
        }).rename(columns={'step_duration': 'event_duration'})

        ev_agg['timewind_start'] = ev_agg.START_DATETIME_UTC - pd.to_timedelta(
            15 * steps_behind_event, unit='m')
        ev_agg['timewind_end'] = ev_agg.START_DATETIME_UTC + pd.to_timedelta(
            15 * steps_after_event, unit='m')

        # add speeds info
        ev_agg = merge_speed_events(speeds, ev_agg)

        # expand different sensors
        base_df = pd.DataFrame({col:np.repeat(ev_agg[col], ev_agg['sensors'].str.len()) \
                           for col in ev_agg.columns.drop('sensors')} \
            ).assign(**{'KM': np.concatenate(ev_agg['sensors'].values)})
        # expand timestamps
        base_df = utility.expand_timestamps(base_df, col_ts_start='timewind_start', col_ts_end='timewind_end')\
                    .drop(['timewind_start','timewind_end','step_duration'], axis=1) \
                    .rename(columns={'index':'event_index'}) \
                    .sort_values('event_index')
        base_df['DATETIME_UTC'] = pd.to_datetime(base_df['DATETIME_UTC'],
                                                 unit='s')

        joined_df = base_df.drop('KEY_2', axis=1).merge(
            speeds.astype({'KEY': 'int'}),
            how='left',
            on=['KEY', 'KM', 'DATETIME_UTC'])

        # add other dataframes
        # - weather
        joined_df = joined_df.merge(weather, how='left')
        # - sensors
        joined_df = joined_df.merge(sensors, how='left')

        print('Aggregating events in samples...')
        joined_df = joined_df.sort_values(['KEY','KM','DATETIME_UTC']) \
            .groupby(['event_index','KEY','KM'], as_index=False).agg({
            'KM_START':'first',
            'KM_END':'first',
            'DATETIME_UTC':list,
            'event_duration':'first',
            'SPEED_AVG':list, #[list, lambda x: x[0:event_beginning_step].dropna().mean()],
            'SPEED_SD':list,
            'SPEED_MAX':list,
            'SPEED_MIN':list,
            'N_VEHICLES':list,
            'EMERGENCY_LANE':'first',
            'LANES':'first',
            'ROAD_TYPE':'first',
            'EVENT_DETAIL':lambda x: x.values[steps_behind_event],
            'EVENT_TYPE':lambda x: x.values[steps_behind_event],
            'WEATHER': list,
            'DISTANCE': list,
            'TEMPERATURE': list,
            'MIN_TEMPERATURE': list,
            'MAX_TEMPERATURE': list
        })
        # set sensor distance from event start and end
        joined_df['distance_start'] = joined_df['KM'] - joined_df['KM_START']
        joined_df['distance_end'] = joined_df['KM'] - joined_df['KM_END']
        joined_df.drop(['KM_END', 'KM_START'], axis=1, inplace=True)

        # split the last m measures into separate columns
        def split_prediction_fields(row, event_beginning_step):
            return pd.Series((
                row.DATETIME_UTC[:event_beginning_step],
                row.DATETIME_UTC[event_beginning_step:],
                row.SPEED_AVG[:event_beginning_step],
                row.SPEED_AVG[event_beginning_step:],
                row.SPEED_SD[:event_beginning_step],
                row.SPEED_MAX[:event_beginning_step],
                row.SPEED_MIN[:event_beginning_step],
                row.N_VEHICLES[:event_beginning_step],
                row.WEATHER[:event_beginning_step],
                row.DISTANCE[:event_beginning_step],
                row.TEMPERATURE[:event_beginning_step],
                row.MIN_TEMPERATURE[:event_beginning_step],
                row.MAX_TEMPERATURE[:event_beginning_step],
            ))

        print('Splitting time steps into separate columns...')

        columns_to_split = [
            'DATETIME_UTC', 'DATETIME_UTC_y', 'SPEED_AVG', 'SPEED_AVG_Y',
            'SPEED_SD', 'SPEED_MAX', 'SPEED_MIN', 'N_VEHICLES', 'WEATHER',
            'DISTANCE', 'TEMPERATURE', 'MIN_TEMPERATURE', 'MAX_TEMPERATURE'
        ]
        joined_df[columns_to_split] = joined_df.apply(
            split_prediction_fields,
            axis=1,
            event_beginning_step=steps_behind_event)

        for col_name in columns_to_split:
            if col_name.upper().endswith('_Y'):
                new_cols = [
                    '{}_{}'.format(col_name, i)
                    for i in range(0, steps_after_event + 1)
                ]
            else:
                new_cols = [
                    '{}_{}'.format(col_name, i)
                    for i in range(-steps_behind_event, 0)
                ]

            joined_df[new_cols] = pd.DataFrame(
                joined_df[col_name].values.tolist(), index=joined_df.index)

        # remove the residual list columns
        joined_df = joined_df.drop(columns_to_split, axis=1)

        # drop the rows for which all speeds are NaNs
        print('Dataset shape:', joined_df.shape)
        #print('Dropping not available speeds...')
        #joined_df.dropna(how='all', subset=[f'SPEED_AVG_{i}' for i in range(-steps_behind_event, 0)], inplace=True)
        #print('Dataset shape reduced to:', joined_df.shape)

        # set some of the target speeds to NaN if the event is shorter than 4 time steps
        joined_df.loc[joined_df['event_duration'] == 3,
                      'SPEED_AVG_Y_3'] = np.nan
        joined_df.loc[joined_df['event_duration'] == 2,
                      ['SPEED_AVG_Y_2', 'SPEED_AVG_Y_3']] = np.nan
        joined_df.loc[
            joined_df['event_duration'] == 1,
            ['SPEED_AVG_Y_1', 'SPEED_AVG_Y_2', 'SPEED_AVG_Y_3']] = np.nan
        joined_df.drop('event_duration', axis=1, inplace=True)

        # cast some columns to int
        joined_df = joined_df.astype({
            'EMERGENCY_LANE': 'int',
            'LANES': 'int',
            'ROAD_TYPE': 'int',
            'EVENT_DETAIL': 'int',
            'KEY': 'int',
            'KM': 'int',
            'event_index': 'int'
        })
        """
        if mode == 'train':
            # take random validation rows

            # random_indices = random.shuffle(joined_df.index)
            # validation_indices = random_indices[0: int(len(random_indices) * validation_split)]
            # train_df = joined_df.drop(validation_indices)
            # valid_df = joined_df.loc[validation_indices]
        """

        # save the base dataset
        filepath = data.get_path_preprocessed(mode, t, 'base_dataset.csv.gz')

        print('Saving base dataframe to {}'.format(filepath))
        joined_df.to_csv(filepath, index=False, compression='gzip')
        del joined_df
        print('Done')
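The per-step columns in this function come from expanding list-valued cells with pd.DataFrame(col.values.tolist(), index=...). A minimal sketch of that expansion, assuming every list has the same length:

import pandas as pd

# two samples, each holding the 3 speed readings preceding the event
df = pd.DataFrame({'SPEED_AVG': [[80.0, 78.0, 75.0], [60.0, 61.0, 59.0]]})

steps_behind_event = 3
new_cols = ['SPEED_AVG_{}'.format(i) for i in range(-steps_behind_event, 0)]
df[new_cols] = pd.DataFrame(df['SPEED_AVG'].values.tolist(), index=df.index)
df = df.drop('SPEED_AVG', axis=1)
print(df.columns.tolist())  # ['SPEED_AVG_-3', 'SPEED_AVG_-2', 'SPEED_AVG_-1']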