def prepare_data_for_train(df, metadata, input_days, window, verbose=True):
    """Build (inputs, target) training arrays from hourly consumption series.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have ``series_id``, ``consumption`` and ``is_holiday`` columns,
        hourly-sampled per series (24 rows per day) — presumably; confirm
        against the data loader.
    metadata : unused here; kept for signature parity with sibling versions.
    input_days : int
        Number of past days fed to the model as input.
    window : str
        A key of ``WINDOW_TO_PRED_DAYS``; ``'hourly'`` keeps hourly samples,
        anything else aggregates to daily totals.
    verbose : bool
        Show a tqdm progress bar over series ids.

    Returns
    -------
    (x, future_consumption)
        ``x`` is a dict of float32 arrays. The ``'clock'`` key (fractional
        hour-of-day feature) is only present for ``window == 'hourly'``.

    Notes
    -----
    The previous implementation raised ``NameError`` on any non-hourly window
    because ``clock_values`` was only defined on the hourly branch; it also
    computed an unused ``std_value``. Both issues are fixed here.
    """
    pred_days = WINDOW_TO_PRED_DAYS[window]  # kept for parity with siblings
    past_consumption, future_consumption = [], []
    is_day_off, cluster_features_v2 = [], []
    clock = []
    if verbose:
        iterator = tqdm_notebook(df.series_id.unique(), desc='Preparing data')
    else:
        iterator = df.series_id.unique()
    for series_id in iterator:
        sub_df = df[df.series_id == series_id]
        consumption = sub_df.consumption.values
        days_off = sub_df.is_holiday.values
        if window != 'hourly':
            # Collapse hourly readings into daily totals; holiday flag is
            # constant within a day, so take every 24th value.
            consumption = group_sum(consumption, 24)
            days_off = days_off[::24]
            step = input_days
            clock_values = None  # no intra-day clock feature for daily data
        else:
            # Fractional hour-of-day in [0, 1], tiled over every full day.
            clock_values = np.concatenate(
                [np.linspace(0, 1, 24)] * (len(sub_df) // 24), axis=0)
            step = input_days * 24
        for start_idx in range(len(consumption) - step - 1):
            is_day_off.append(days_off[start_idx:start_idx + step])
            past_consumption_values = consumption[start_idx:start_idx + step]
            # Normalize each sample by its own mean so series of very
            # different magnitude can be mixed in one batch.
            mean_value = np.mean(past_consumption_values)
            past_consumption.append(past_consumption_values / mean_value)
            future_consumption.append(
                consumption[start_idx + step] / mean_value)
            cluster_features_v2.append(get_cluster_features_v2(series_id))
            if clock_values is not None:
                clock.append(clock_values[start_idx:start_idx + step])
    past_consumption = np.array(past_consumption, dtype=np.float32)
    past_consumption = np.expand_dims(past_consumption, 2)
    future_consumption = np.array(future_consumption, dtype=np.float32)
    is_day_off = np.array(is_day_off, dtype=np.float32)
    is_day_off = np.expand_dims(is_day_off, 2)
    cluster_features_v2 = np.array(cluster_features_v2, dtype=np.float32)
    cluster_features_v2 = np.expand_dims(cluster_features_v2, 1)
    # Repeat the per-series features along the time axis so they align with
    # the sequence inputs.
    cluster_features_v2 = np.repeat(
        cluster_features_v2, is_day_off.shape[1], axis=1)
    x = {
        'past_consumption': past_consumption,
        'is_day_off': is_day_off,
        'cluster_features_v2': cluster_features_v2,
    }
    if clock:
        clock = np.array(clock, dtype=np.float32)
        clock = np.expand_dims(clock, 2)
        x['clock'] = clock
    return x, future_consumption
def _prepare_cluster_features_v2(series_id):
    """Return the v2 cluster features of *series_id* as a (1, n) float32 array."""
    features = np.asarray(get_cluster_features_v2(series_id), dtype=np.float32)
    # Prepend a batch axis so the result can be fed directly to the model.
    return features[np.newaxis]
def prepare_data_for_train(df, metadata, input_days, window, only_working_days,
                           verbose=True):
    """Build training samples, keeping only targets matching a day-off filter.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have ``series_id``, ``consumption``, ``is_holiday`` and
        ``weekday`` columns, hourly-sampled per series — presumably; confirm
        against the data loader.
    metadata : unused here; kept for signature parity with sibling versions.
    input_days : int
        Number of past days fed to the model.
    window : str
        A key of ``WINDOW_TO_PRED_DAYS``; ``'hourly'`` keeps hourly samples.
    only_working_days : bool
        If True keep only samples whose first target day is a working day;
        if False keep only samples whose first target day is a day off.
    verbose : bool
        Show a tqdm progress bar over series ids.

    Returns
    -------
    (x, future_consumption) where ``x`` is a dict of float32 arrays.
    """
    pred_days = WINDOW_TO_PRED_DAYS[window]
    past_consumption, future_consumption = [], []
    is_day_off, cluster_features_v2 = [], []
    weekday = []
    if verbose:
        iterator = tqdm_notebook(df.series_id.unique(), desc='Preparing data')
    else:
        iterator = df.series_id.unique()
    for series_id in iterator:
        sub_df = df[df.series_id == series_id]
        consumption = sub_df.consumption.values
        days_off = sub_df.is_holiday.values
        weekdays = sub_df.weekday.values
        if window != 'hourly':
            # Collapse hourly readings into daily totals; the holiday flag is
            # constant within a day, so take every 24th value.
            consumption = group_sum(consumption, 24)
            days_off = days_off[::24]
            step = 1
            past_samples = input_days
            # Reuse pred_days instead of a second WINDOW_TO_PRED_DAYS lookup.
            future_samples = pred_days
        else:
            step = 24
            past_samples = input_days * 24
            future_samples = 24
        for start_idx in range(
                0, len(consumption) - future_samples - past_samples + step,
                step):
            # Skip the sample unless the first target day matches the filter:
            # only_working_days=True keeps non-off targets and vice versa.
            # (Equivalent to the original pair of `continue` conditions.)
            if bool(days_off[start_idx + past_samples]) == bool(
                    only_working_days):
                continue
            is_day_off.append(days_off[start_idx:start_idx + past_samples])
            weekday.append(weekdays[start_idx:start_idx + past_samples])
            past_consumption_values = consumption[start_idx:start_idx +
                                                  past_samples]
            mean_value = np.mean(past_consumption_values)
            # Correct the mean for the mix of working/non-working days in the
            # input window; semantics of the second argument are defined by
            # normalization_factor elsewhere in the project.
            mean_value *= normalization_factor(
                days_off[start_idx:start_idx + past_samples],
                1 - int(only_working_days))
            past_consumption.append(past_consumption_values / mean_value)
            future_consumption.append(
                consumption[start_idx + past_samples:start_idx +
                            past_samples + future_samples] / mean_value)
            cluster_features_v2.append(get_cluster_features_v2(series_id))
    past_consumption = np.array(past_consumption, dtype=np.float32)
    past_consumption = np.expand_dims(past_consumption, 2)
    future_consumption = np.array(future_consumption, dtype=np.float32)
    future_consumption = np.expand_dims(future_consumption, 2)
    is_day_off = np.array(is_day_off, dtype=np.float32)
    is_day_off = np.expand_dims(is_day_off, 2)
    cluster_features_v2 = np.array(cluster_features_v2, dtype=np.float32)
    cluster_features_v2 = np.expand_dims(cluster_features_v2, 1)
    # Repeat per-series features along the time axis to match the sequences.
    cluster_features_v2 = np.repeat(
        cluster_features_v2, is_day_off.shape[1], axis=1)
    weekday = np.array(weekday, dtype=np.float32)
    weekday = np.expand_dims(weekday, 2)
    weekday /= 7.  # scale weekday indices (0-6) into [0, 1)
    x = {
        'past_consumption': past_consumption,
        'is_day_off': is_day_off,
        'cluster_features_v2': cluster_features_v2,
        'weekday': weekday,
    }
    return x, future_consumption
def prepare_data_for_train(df, input_days, window, verbose=True):
    """
    Build training samples with separate past/future day-off and weekday
    inputs, normalized per sample.

    Parameters: ``df`` needs ``series_id``, ``consumption``, ``is_holiday``
    and ``weekday`` columns, hourly-sampled per series (assumed; confirm
    against the data loader). ``input_days`` is the number of past days used
    as input; ``window`` is a key of ``WINDOW_TO_PRED_DAYS`` and selects
    hourly/daily/weekly target handling.

    Returns
    --------
    ::

        x = {
            'past_consumption': past_consumption,
            'cluster_features_v2': cluster_features_v2,
            'past_weekday': past_weekday,
            'future_weekday': future_weekday,
            'past_day_off': past_day_off,
            'future_day_off': future_day_off,
        }
        return x, future_consumption
    """
    pred_days = WINDOW_TO_PRED_DAYS[window]
    past_consumption, future_consumption = [], []
    past_day_off, future_day_off = [], []
    past_weekday, future_weekday = [], []
    cluster_features_v2 = []
    if verbose:
        iterator = tqdm_notebook(df.series_id.unique(), desc='Preparing data')
    else:
        iterator = df.series_id.unique()
    for series_id in iterator:
        sub_df = df[df.series_id == series_id]
        consumption = sub_df.consumption.values
        days_off = sub_df.is_holiday.values
        weekdays = sub_df.weekday.values
        if window != 'hourly':
            # Aggregate hourly readings to daily totals; per-day flags are
            # constant within the day, so take every 24th value.
            consumption = group_sum(consumption, 24)
            days_off = days_off[::24]
            weekdays = weekdays[::24]
            step = 1
            past_samples = input_days
            future_samples = pred_days
        else:
            # Hourly: slide one day (24 samples) at a time.
            step = 24
            past_samples = input_days * 24
            future_samples = 24
        # Slide a window of past_samples + future_samples over the series.
        for start_idx in range(
                0, len(consumption) - future_samples - past_samples + step,
                step):
            past_idx = start_idx + past_samples
            past_day_off.append(days_off[start_idx:past_idx])
            past_weekday.append(weekdays[start_idx:past_idx])
            future_idx = past_idx + future_samples
            if window == 'weekly':
                # Weekly targets span 2 weeks: fold the day-off flags into a
                # (2, 7) matrix and keep one weekday per week (stride 7).
                future_day_off.append(
                    np.reshape(days_off[past_idx:future_idx], (2, -1)))
                future_weekday.append(weekdays[past_idx:future_idx:7])
            else:
                future_day_off.append(days_off[past_idx:future_idx])
                future_weekday.append(weekdays[past_idx:future_idx])
            past_consumption_values = consumption[start_idx:past_idx]
            mean_value = np.mean(past_consumption_values)
            # Adjust the per-sample mean for the working/non-working day mix
            # of input vs target (semantics defined by normalization_factor).
            mean_value *= normalization_factor(past_day_off[-1],
                                               future_day_off[-1])
            if window == 'weekly':
                # Weekly targets are sums over 7 days, so scale the
                # normalizer accordingly.
                mean_value *= 7
            past_consumption.append(past_consumption_values / mean_value)
            if window == 'weekly':
                future_consumption.append(
                    group_sum(consumption[past_idx:future_idx], 7) /
                    mean_value)
            else:
                future_consumption.append(
                    consumption[past_idx:future_idx] / mean_value)
            cluster_features_v2.append(get_cluster_features_v2(series_id))
    # TODO: I should do refinement on weekly predictions
    past_consumption = np.array(past_consumption, dtype=np.float32)
    past_consumption = np.expand_dims(past_consumption, 2)
    future_consumption = np.array(future_consumption, dtype=np.float32)
    future_consumption = np.expand_dims(future_consumption, 2)
    past_day_off = np.array(past_day_off, dtype=np.float32)
    past_day_off = np.expand_dims(past_day_off, 2)
    future_day_off = np.array(future_day_off, dtype=np.float32)
    if window != 'weekly':
        # Weekly entries are already 2-D (2, 7); others need a feature axis.
        future_day_off = np.expand_dims(future_day_off, 2)
    # One-hot encode weekday indices sample by sample.
    past_weekday = [[_weekday_ohe(weekday) for weekday in week]
                    for week in past_weekday]
    past_weekday = np.array(past_weekday, dtype=np.float32)
    future_weekday = [[_weekday_ohe(weekday) for weekday in week]
                      for week in future_weekday]
    future_weekday = np.array(future_weekday, dtype=np.float32)
    cluster_features_v2 = np.array(cluster_features_v2, dtype=np.float32)
    x = {
        'past_consumption': past_consumption,
        'cluster_features_v2': cluster_features_v2,
        'past_weekday': past_weekday,
        'future_weekday': future_weekday,
        'past_day_off': past_day_off,
        'future_day_off': future_day_off,
    }
    return x, future_consumption
def prepare_data_for_train(df, metadata, input_days, window, verbose=True):
    """Build training samples with metadata, cluster and trend side inputs.

    ``df`` needs ``series_id``, ``consumption`` and ``is_holiday`` columns,
    hourly-sampled per series (assumed; confirm against the data loader).
    ``metadata`` is forwarded to the ``_get_metadata_*`` helpers defined
    elsewhere. ``window`` is a key of ``WINDOW_TO_PRED_DAYS``.

    Returns ``(x, future_consumption)`` where ``x`` is a dict of float32
    arrays (see the keys at the bottom of the function).
    """
    pred_days = WINDOW_TO_PRED_DAYS[window]
    past_consumption, future_consumption = [], []
    is_day_off, data_trend, metadata_ohe = [], [], []
    metadata_days_off = []
    cluster_id_ohe, cluster_features_v2 = [], []
    if verbose:
        iterator = tqdm_notebook(df.series_id.unique(), desc='Preparing data')
    else:
        iterator = df.series_id.unique()
    for series_id in iterator:
        sub_df = df[df.series_id == series_id]
        consumption = sub_df.consumption.values
        if window != 'hourly':
            # Aggregate hourly readings to daily totals.
            consumption = group_sum(consumption, 24)
        # One holiday flag per day (the flag is constant within a day).
        series_is_day_off = [
            int(value) for value in sub_df.is_holiday.values[::24]
        ]
        for start_idx in range(
                len(series_is_day_off) - input_days - pred_days + 1):
            # Day-off flags cover both the input and the prediction window.
            is_day_off.append(series_is_day_off[start_idx:start_idx +
                                                input_days + pred_days])
            val_idx = start_idx + input_days
            # NOTE: `x` here is the per-sample input matrix; it is reused
            # below as the name of the returned dict.
            if window == 'hourly':
                # (input_days, 24): one row per day, one column per hour.
                x = np.reshape(consumption[start_idx * 24:val_idx * 24],
                               newshape=(-1, 24))
                y = consumption[val_idx * 24:(val_idx + pred_days) * 24]
            else:
                x = consumption[start_idx:val_idx]
                x = np.expand_dims(x, axis=1)
                y = consumption[val_idx:val_idx + pred_days]
                if window == 'weekly':
                    # NOTE(review): repeat counts (2 vs 7) presumably match
                    # the model's expected channel width — confirm.
                    x = np.repeat(x, 2, axis=1)
                    y = group_sum(y, 7)
                else:
                    x = np.repeat(x, 7, axis=1)
            # Normalize input and target by the target mean.
            y_mean = np.mean(y)
            past_consumption.append(x / y_mean)
            future_consumption.append(y / y_mean)
            # Data trend
            if window == 'hourly':
                _data_trend = group_sum(
                    consumption[start_idx * 24:val_idx * 24], 24)
            else:
                # .copy() so the in-place division below cannot mutate the
                # shared `consumption` array.
                _data_trend = consumption[start_idx:val_idx].copy()
            _data_trend /= np.mean(_data_trend)
            data_trend.append(_data_trend)
            metadata_ohe.append(_get_metadata_ohe(metadata, series_id))
            metadata_days_off.append(
                _get_metadata_days_off(metadata, series_id))
            cluster_id_ohe.append(get_cluster_ohe(series_id))
            cluster_features_v2.append(get_cluster_features_v2(series_id))
    past_consumption = np.array(past_consumption, dtype=np.float32)
    # (batch, days, channels) -> (batch, channels, days).
    past_consumption = np.transpose(past_consumption, axes=(0, 2, 1))
    future_consumption = np.array(future_consumption, dtype=np.float32)
    is_day_off = np.array(is_day_off, dtype=np.float32)
    # Encode working days as -1 instead of 0 (symmetric around zero).
    is_day_off[is_day_off == 0] = -1
    data_trend = np.array(data_trend, dtype=np.float32)
    metadata_ohe = np.array(metadata_ohe, dtype=np.float32)
    metadata_days_off = np.array(metadata_days_off, dtype=np.float32)
    cluster_id_ohe = np.array(cluster_id_ohe, dtype=np.float32)
    cluster_features_v2 = np.array(cluster_features_v2, dtype=np.float32)
    x = {
        'past_consumption': past_consumption,
        'is_day_off': is_day_off,
        'data_trend': data_trend,
        'metadata_ohe': metadata_ohe,
        'metadata_days_off': metadata_days_off,
        'cluster_id_ohe': cluster_id_ohe,
        'cluster_features_v2': cluster_features_v2,
    }
    return x, future_consumption
def test_that_all_series_id_have_v2_features(all_series_ids):
    """Smoke test: every known series id yields v2 cluster features.

    The original name contained a mojibake character ('ẗhat'); renamed to
    plain ASCII. pytest discovery is unaffected (still 'test_'-prefixed).
    """
    for series_id in all_series_ids:
        # Any missing id should raise inside get_cluster_features_v2.
        get_cluster_features_v2(series_id)