def _prepare_data_trend(window, df):
    """Return the normalized daily-consumption trend as a batch of one.

    Daily totals are divided by their mean, cast to float32 and given a
    leading batch axis. `window` is accepted for interface parity with the
    other feature builders but is not used here.
    """
    daily_totals = group_sum(df.consumption.values, 24)
    daily_totals /= np.mean(daily_totals)
    trend = np.array(daily_totals, dtype=np.float32)
    return np.expand_dims(trend, axis=0)
def _daily_predict(self, series_id, consumption, weekdays, dates):
    """Predict the next 7 daily consumption values for one series.

    Looks up per-pattern linear weights in ``self.train_data['daily']``,
    keyed by the day-off pattern of the most recent input days plus the
    day-off flag of the day being predicted. Falls back to shorter key
    suffixes when the full pattern was never seen in training.

    Parameters
    ----------
    series_id : series identifier, forwarded to the day-off helpers
    consumption : hourly consumption history (aggregated to days here)
    weekdays : weekday per timestamp; only the tail/last value is used
    dates : date per timestamp; only the tail/last value is used

    Returns
    -------
    np.array with 7 predictions, one per day offset.

    Raises
    ------
    KeyError : when even the shortest key suffix is missing from the
        training data for some offset.
    """
    # Day-off pattern of the last `_input_days` days forms the base key.
    is_day_off = self._get_is_day_off(weekdays, series_id, dates)
    is_day_off = is_day_off[-self._input_days:]
    org_key = ''.join([str(i) for i in is_day_off])
    pred = []
    for offset in range(7):
        # Advance weekday/date from the last known point to the target day.
        weekday = weekdays[-1]
        date = dates[-1]
        for _ in range(offset + 1):
            weekday = _get_next_weekday(weekday)
            # NOTE(review): the date is advanced by `offset + 1` on each of
            # the `offset + 1` loop iterations, while the weekday advances
            # one step per iteration — they drift apart for offset > 0.
            # Looks like a bug; confirm `_get_next_date` semantics.
            date = _get_next_date(date, offset + 1)
        key = org_key + str(self._is_day_off(weekday, series_id, date))
        # Back off to ever-shorter key suffixes until one exists in the
        # training data; an empty key means the pattern was never seen.
        while 1:
            if key in self.train_data['daily'][offset]:
                break
            else:
                # print(key, 'not found')
                key = key[1:]
                if not len(key):
                    msg = 'Key not found: %s\tWindow: %s\tOffset: %s' % (
                        org_key, 'daily', offset)
                    raise KeyError(msg)
        # Use as many trailing days of history as the (possibly shortened)
        # key covers; the key has one extra char for the predicted day.
        x = consumption[-(len(key) - 1) * 24:]
        x = group_sum(x, 24)
        x = np.expand_dims(x, axis=0)
        weights = self.train_data['daily'][offset][key]['weights']
        # print(consumption.shape, x.shape, weights.shape, key)
        pred.append(x.dot(weights)[0])
    return np.array(pred)
def _prepare_past_consumption(window, df): past_consumption = df.consumption.values.copy() if window != 'hourly': past_consumption = group_sum(past_consumption, 24) mean_consumption = np.mean(past_consumption) past_consumption /= mean_consumption past_consumption = np.expand_dims(past_consumption, axis=1) past_consumption = np.array(past_consumption, dtype=np.float32) past_consumption = np.expand_dims(past_consumption, axis=0) return past_consumption, mean_consumption
def prepare_data_for_train(df, metadata, input_days, window, verbose=True):
    """Build training arrays (clock-feature variant) from the raw frame.

    Slides a window of `input_days` over each series and emits normalized
    past consumption, the next step's consumption as the target, day-off
    flags, per-series cluster features, and an hour-of-day "clock" signal.

    NOTE(review): `clock_values` is only assigned in the hourly branch, so
    any window other than 'hourly' raises NameError at the first
    `clock.append(...)` — confirm this variant is hourly-only.
    `pred_days`, `metadata` and `std_value` are computed/accepted but never
    used.
    """
    pred_days = WINDOW_TO_PRED_DAYS[window]
    past_consumption, future_consumption = [], []
    is_day_off, cluster_features_v2 = [], []
    clock = []
    if verbose:
        iterator = tqdm_notebook(df.series_id.unique(), desc='Preparing data')
    else:
        iterator = df.series_id.unique()
    for series_id in iterator:
        sub_df = df[df.series_id == series_id]
        consumption = sub_df.consumption.values
        days_off = sub_df.is_holiday.values
        if window != 'hourly':
            # Aggregate hourly consumption to daily totals; keep one
            # day-off flag per day.
            consumption = group_sum(consumption, 24)
            days_off = days_off[::24]
            step = input_days
        else:
            # Fractional hour-of-day feature, tiled over each full day.
            clock_values = np.concatenate(
                [np.linspace(0, 1, 24)] * (len(sub_df) // 24), axis=0)
            step = input_days * 24
        for start_idx in range(len(consumption) - step - 1):
            is_day_off.append(days_off[start_idx:start_idx + step])
            past_consumption_values = consumption[start_idx:start_idx + step]
            mean_value, std_value = np.mean(past_consumption_values), np.std(
                past_consumption_values)
            # Normalize input and target by the input-window mean.
            past_consumption.append(past_consumption_values / mean_value)
            future_consumption.append(
                consumption[start_idx + step] / mean_value)
            cluster_features_v2.append(get_cluster_features_v2(series_id))
            clock.append(clock_values[start_idx:start_idx + step])
    past_consumption = np.array(past_consumption, dtype=np.float32)
    past_consumption = np.expand_dims(past_consumption, 2)
    future_consumption = np.array(future_consumption, dtype=np.float32)
    is_day_off = np.array(is_day_off, dtype=np.float32)
    is_day_off = np.expand_dims(is_day_off, 2)
    cluster_features_v2 = np.array(cluster_features_v2, dtype=np.float32)
    cluster_features_v2 = np.expand_dims(cluster_features_v2, 1)
    # Tile the static per-series features along the time axis.
    cluster_features_v2 = np.repeat(cluster_features_v2, is_day_off.shape[1],
                                    axis=1)
    clock = np.array(clock, dtype=np.float32)
    clock = np.expand_dims(clock, 2)
    x = {
        'past_consumption': past_consumption,
        'is_day_off': is_day_off,
        'cluster_features_v2': cluster_features_v2,
        'clock': clock,
    }
    return x, future_consumption
def _prepare_past_consumption(window, df):
    """Return (normalized history padded with zeros for the horizon, mean).

    Same normalization as the plain variant, but the series is extended
    with `WINDOW_TO_PRED_DAYS[window]` zero placeholders (times 24 for the
    hourly window) covering the prediction horizon.
    """
    history = df.consumption.values.copy()
    if window != 'hourly':
        history = group_sum(history, 24)
    mean_consumption = np.mean(history)
    history /= mean_consumption
    horizon = WINDOW_TO_PRED_DAYS[window]
    if window == 'hourly':
        horizon *= 24
    padded = np.concatenate([history, np.zeros(horizon)])
    # Add a trailing feature axis, cast, then add a leading batch axis.
    shaped = np.array(np.expand_dims(padded, axis=1), dtype=np.float32)
    return np.expand_dims(shaped, axis=0), mean_consumption
def _prepare_past_consumption(window, df): consumption = df.consumption.values if window == 'hourly': past_consumption = np.reshape(consumption, newshape=(-1, 24)) else: past_consumption = group_sum(consumption, 24) past_consumption = np.expand_dims(past_consumption, axis=1) if window == 'weekly': past_consumption = np.repeat(past_consumption, 2, axis=1) else: past_consumption = np.repeat(past_consumption, 7, axis=1) past_consumption = np.array(past_consumption, dtype=np.float32) past_consumption = np.expand_dims(past_consumption, axis=0) past_consumption = np.transpose(past_consumption, axes=(0, 2, 1)) return past_consumption
def _prepare_weekly_data(self, df):
    """Populate the weekly training store from the raw frame.

    For every series, every history length up to `self._input_days`, and
    every window start, records the normalized daily history against each
    of the two weekly target sums (offset 0 and 1), keyed by the day-off
    pattern of the history.
    """
    for series_id in tqdm_notebook(df.series_id.unique(),
                                   desc='Preparing data'):
        series_df = df[df.series_id == series_id]
        daily_consumption = group_sum(series_df.consumption.values, 24)
        day_off_flags = self._get_is_day_off_from_df(series_df)
        for history_len in range(1, 1 + self._input_days):
            for start in range(len(day_off_flags) - history_len - 14):
                end = start + history_len
                key = ''.join(str(flag) for flag in day_off_flags[start:end])
                history = np.expand_dims(daily_consumption[start:end], axis=0)
                # Next 14 days collapsed into two weekly sums.
                weekly_target = group_sum(daily_consumption[end:end + 14], 7)
                target_mean = np.mean(weekly_target)
                for offset in range(2):
                    self._add_train_data(history / target_mean,
                                         [weekly_target[offset] / target_mean],
                                         'weekly', offset, key)
def visualize_idx(idx, train, train_arrange, preds, metadata):
    """Plot one validation sample: actual consumption vs the prediction.

    Predictions are drawn in green over the tail of the date range; actual
    consumption is drawn in `batch_size`-sized segments colored orange for
    days off and blue for working days.

    NOTE(review): `batch_size` is only assigned for the 'hourly' and
    'daily' windows — a 'weekly' row raises NameError below; confirm
    weekly rows are never visualized.
    """
    row = train_arrange.loc[idx]
    df = train[train.series_id == row['series_id']]
    # Restrict to the train+validation span recorded for this sample.
    consumption = df.consumption.values[
        row['train_start_idx']:row['val_end_idx']]
    dates = df.timestamp.values[row['train_start_idx']:row['val_end_idx']]
    weekdays = df.weekday.values[row['train_start_idx']:row['val_end_idx']]
    if row['window'] == 'hourly':
        batch_size = 24
    elif row['window'] == 'daily':
        batch_size = 1
        # Daily window: collapse hourly data to one point per day.
        weekdays = weekdays[::24]
        dates = dates[::24]
        consumption = group_sum(consumption, 24)
    plt.plot(dates[-len(preds[idx]):], preds[idx], color='green', lw=3)
    plt.plot(dates[-len(preds[idx]):][::batch_size],
             preds[idx][::batch_size], 'o', color='green', lw=3)
    for i in range(len(dates) // batch_size):
        weekday = weekdays[i * batch_size]
        if _is_day_off(row['series_id'], weekday, metadata):
            color = 'orange'
        else:
            color = 'blue'
        # One segment per batch, overlapping one point with the next batch.
        plt.plot(dates[i * batch_size:(i + 1) * batch_size + 1],
                 consumption[i * batch_size:(i + 1) * batch_size + 1],
                 color=color)
        # Marker on the first point of the batch only.
        plt.plot(dates[i * batch_size:(i) * batch_size + 1],
                 consumption[i * batch_size:(i) * batch_size + 1], 'o',
                 color=color)
    plt.title('%i Nmae: %.3f' % (idx, row['nmae']))
def prepare_data_for_train(df, metadata, input_days, window, only_working_days,
                           verbose=True):
    """Build training arrays keeping only one target regime.

    Slides a window of `input_days` over each series and keeps a sample
    only when the first predicted day matches the requested regime:
    `only_working_days=True` keeps working-day targets, `False` keeps
    day-off targets.

    Parameters
    ----------
    df : frame with series_id, consumption, is_holiday, weekday columns
    metadata : accepted for interface compatibility; not used here
    input_days : number of past days fed to the model
    window : 'hourly', 'daily' or 'weekly' (see WINDOW_TO_PRED_DAYS)
    only_working_days : bool, which target regime to keep
    verbose : show a progress bar over series

    Returns
    -------
    (x, future_consumption) where x is a dict of float32 feature arrays
    keyed 'past_consumption', 'is_day_off', 'cluster_features_v2',
    'weekday'.
    """
    pred_days = WINDOW_TO_PRED_DAYS[window]
    past_consumption, future_consumption = [], []
    is_day_off, cluster_features_v2 = [], []
    weekday = []
    if verbose:
        iterator = tqdm_notebook(df.series_id.unique(), desc='Preparing data')
    else:
        iterator = df.series_id.unique()
    for series_id in iterator:
        sub_df = df[df.series_id == series_id]
        consumption = sub_df.consumption.values
        days_off = sub_df.is_holiday.values
        weekdays = sub_df.weekday.values
        if window != 'hourly':
            # Aggregate hourly values to daily totals, keeping one flag and
            # one weekday per day.
            # FIX: `weekdays` must be subsampled like `days_off`; the
            # original sliced the hourly-resolution weekday array with
            # day-unit indices (the sibling variant already does this).
            consumption = group_sum(consumption, 24)
            days_off = days_off[::24]
            weekdays = weekdays[::24]
            step = 1
            past_samples = input_days
            future_samples = pred_days
        else:
            step = 24
            past_samples = input_days * 24
            future_samples = 24
        for start_idx in range(
                0, len(consumption) - future_samples - past_samples + step,
                step):
            # Keep the sample only when the first predicted day matches the
            # requested regime (working vs day-off).
            if days_off[start_idx + past_samples] and only_working_days:
                continue
            if not days_off[start_idx + past_samples] and not only_working_days:
                continue
            is_day_off.append(days_off[start_idx:start_idx + past_samples])
            weekday.append(weekdays[start_idx:start_idx + past_samples])
            past_consumption_values = consumption[start_idx:start_idx +
                                                  past_samples]
            # Normalize by the input-window mean, corrected for the mix of
            # working days and days off inside the window.
            mean_value = np.mean(past_consumption_values)
            mean_value *= normalization_factor(
                days_off[start_idx:start_idx + past_samples],
                1 - int(only_working_days))
            past_consumption.append(past_consumption_values / mean_value)
            future_consumption.append(
                consumption[start_idx + past_samples:start_idx +
                            past_samples + future_samples] / mean_value)
            cluster_features_v2.append(get_cluster_features_v2(series_id))
    past_consumption = np.array(past_consumption, dtype=np.float32)
    past_consumption = np.expand_dims(past_consumption, 2)
    future_consumption = np.array(future_consumption, dtype=np.float32)
    future_consumption = np.expand_dims(future_consumption, 2)
    is_day_off = np.array(is_day_off, dtype=np.float32)
    is_day_off = np.expand_dims(is_day_off, 2)
    cluster_features_v2 = np.array(cluster_features_v2, dtype=np.float32)
    cluster_features_v2 = np.expand_dims(cluster_features_v2, 1)
    # Tile the static per-series features along the time axis.
    cluster_features_v2 = np.repeat(cluster_features_v2, is_day_off.shape[1],
                                    axis=1)
    weekday = np.array(weekday, dtype=np.float32)
    weekday = np.expand_dims(weekday, 2)
    weekday /= 7.  # scale the weekday index into [0, 1)
    x = {
        'past_consumption': past_consumption,
        'is_day_off': is_day_off,
        'cluster_features_v2': cluster_features_v2,
        'weekday': weekday,
    }
    return x, future_consumption
def prepare_data_for_train(df, input_days, window, verbose=True):
    """Build training arrays with one-hot weekdays and day-off flags.

    Slides a window of `input_days` (days, or days*24 hours for the hourly
    window) over each series. Inputs and targets are normalized by the
    input-window mean corrected by `normalization_factor`; for the weekly
    window the target is two weekly sums and the mean is additionally
    scaled by 7 so weekly sums stay comparable to daily values.

    Returns
    --------
    ::

        x = {
            'past_consumption': past_consumption,
            'cluster_features_v2': cluster_features_v2,
            'past_weekday': past_weekday,
            'future_weekday': future_weekday,
            'past_day_off': past_day_off,
            'future_day_off': future_day_off,
        }
        return x, future_consumption
    """
    pred_days = WINDOW_TO_PRED_DAYS[window]
    past_consumption, future_consumption = [], []
    past_day_off, future_day_off = [], []
    past_weekday, future_weekday = [], []
    cluster_features_v2 = []
    if verbose:
        iterator = tqdm_notebook(df.series_id.unique(), desc='Preparing data')
    else:
        iterator = df.series_id.unique()
    for series_id in iterator:
        sub_df = df[df.series_id == series_id]
        consumption = sub_df.consumption.values
        days_off = sub_df.is_holiday.values
        weekdays = sub_df.weekday.values
        if window != 'hourly':
            # Aggregate to daily resolution: totals, one flag and one
            # weekday per day.
            consumption = group_sum(consumption, 24)
            days_off = days_off[::24]
            weekdays = weekdays[::24]
            step = 1
            past_samples = input_days
            future_samples = pred_days
        else:
            step = 24
            past_samples = input_days * 24
            future_samples = 24
        for start_idx in range(
                0, len(consumption) - future_samples - past_samples + step,
                step):
            past_idx = start_idx + past_samples
            past_day_off.append(days_off[start_idx:past_idx])
            past_weekday.append(weekdays[start_idx:past_idx])
            future_idx = past_idx + future_samples
            if window == 'weekly':
                # Weekly targets: day-off flags grouped per week (2, 7) and
                # one weekday per week (every 7th day).
                future_day_off.append(
                    np.reshape(days_off[past_idx:future_idx], (2, -1)))
                future_weekday.append(weekdays[past_idx:future_idx:7])
            else:
                future_day_off.append(days_off[past_idx:future_idx])
                future_weekday.append(weekdays[past_idx:future_idx])
            past_consumption_values = consumption[start_idx:past_idx]
            # Normalize by the input-window mean, corrected for the day-off
            # mix of input and target windows.
            mean_value = np.mean(past_consumption_values)
            mean_value *= normalization_factor(past_day_off[-1],
                                               future_day_off[-1])
            if window == 'weekly':
                mean_value *= 7
            past_consumption.append(past_consumption_values / mean_value)
            if window == 'weekly':
                future_consumption.append(
                    group_sum(consumption[past_idx:future_idx], 7) /
                    mean_value)
            else:
                future_consumption.append(
                    consumption[past_idx:future_idx] / mean_value)
            cluster_features_v2.append(get_cluster_features_v2(series_id))
    # TODO: I should do refinement on weekly predictions
    past_consumption = np.array(past_consumption, dtype=np.float32)
    past_consumption = np.expand_dims(past_consumption, 2)
    future_consumption = np.array(future_consumption, dtype=np.float32)
    future_consumption = np.expand_dims(future_consumption, 2)
    past_day_off = np.array(past_day_off, dtype=np.float32)
    past_day_off = np.expand_dims(past_day_off, 2)
    future_day_off = np.array(future_day_off, dtype=np.float32)
    # Weekly flags are already (samples, 2, 7); others get a channel axis.
    if window != 'weekly':
        future_day_off = np.expand_dims(future_day_off, 2)
    # One-hot encode weekdays for both input and target windows.
    past_weekday = [[_weekday_ohe(weekday) for weekday in week]
                    for week in past_weekday]
    past_weekday = np.array(past_weekday, dtype=np.float32)
    future_weekday = [[_weekday_ohe(weekday) for weekday in week]
                      for week in future_weekday]
    future_weekday = np.array(future_weekday, dtype=np.float32)
    cluster_features_v2 = np.array(cluster_features_v2, dtype=np.float32)
    x = {
        'past_consumption': past_consumption,
        'cluster_features_v2': cluster_features_v2,
        'past_weekday': past_weekday,
        'future_weekday': future_weekday,
        'past_day_off': past_day_off,
        'future_day_off': future_day_off,
    }
    return x, future_consumption
def test_group_sum(x, group_size, output):
    """Check that group_sum reduces `x` in chunks of `group_size`.

    Uses np.array_equal, which also verifies the shapes match: a
    wrong-length result now fails with a clean AssertionError instead of a
    broadcast error, and an empty expected output no longer passes
    vacuously (the old `all(output == ...)` returned True on empty input).
    """
    assert np.array_equal(output, group_sum(x, group_size))
def prepare_data_for_train(df, metadata, input_days, window, verbose=True):
    """Build training arrays (metadata/cluster variant) from the raw frame.

    Slides a window of `input_days` days over each series and emits past
    consumption as a (time, channels) block, the normalized future
    consumption, day-off flags encoded as +/-1, a normalized per-window
    trend, and static metadata / cluster features.

    NOTE(review): both x and y are normalized by the mean of the *future*
    window (`y_mean`), so the inputs depend on the target scale —
    presumably intentional for this model, but verify the equivalent
    normalization is reproducible at prediction time.
    """
    pred_days = WINDOW_TO_PRED_DAYS[window]
    past_consumption, future_consumption = [], []
    is_day_off, data_trend, metadata_ohe = [], [], []
    metadata_days_off = []
    cluster_id_ohe, cluster_features_v2 = [], []
    if verbose:
        iterator = tqdm_notebook(df.series_id.unique(), desc='Preparing data')
    else:
        iterator = df.series_id.unique()
    for series_id in iterator:
        sub_df = df[df.series_id == series_id]
        consumption = sub_df.consumption.values
        if window != 'hourly':
            consumption = group_sum(consumption, 24)
        # One day-off flag per day, regardless of window.
        series_is_day_off = [
            int(value) for value in sub_df.is_holiday.values[::24]
        ]
        for start_idx in range(
                len(series_is_day_off) - input_days - pred_days + 1):
            # Flags cover both the input days and the predicted days.
            is_day_off.append(series_is_day_off[start_idx:start_idx +
                                                input_days + pred_days])
            val_idx = start_idx + input_days
            if window == 'hourly':
                # (days, 24) block of hourly values.
                x = np.reshape(consumption[start_idx * 24:val_idx * 24],
                               newshape=(-1, 24))
                y = consumption[val_idx * 24:(val_idx + pred_days) * 24]
            else:
                x = consumption[start_idx:val_idx]
                x = np.expand_dims(x, axis=1)
                y = consumption[val_idx:val_idx + pred_days]
            if window == 'weekly':
                # Two weekly target sums; input tiled to two channels.
                x = np.repeat(x, 2, axis=1)
                y = group_sum(y, 7)
            else:
                x = np.repeat(x, 7, axis=1)
            y_mean = np.mean(y)
            past_consumption.append(x / y_mean)
            future_consumption.append(y / y_mean)
            # Data trend
            if window == 'hourly':
                _data_trend = group_sum(
                    consumption[start_idx * 24:val_idx * 24], 24)
            else:
                _data_trend = consumption[start_idx:val_idx].copy()
            _data_trend /= np.mean(_data_trend)
            data_trend.append(_data_trend)
            metadata_ohe.append(_get_metadata_ohe(metadata, series_id))
            metadata_days_off.append(
                _get_metadata_days_off(metadata, series_id))
            cluster_id_ohe.append(get_cluster_ohe(series_id))
            cluster_features_v2.append(get_cluster_features_v2(series_id))
    past_consumption = np.array(past_consumption, dtype=np.float32)
    # Put channels before time: (samples, channels, time).
    past_consumption = np.transpose(past_consumption, axes=(0, 2, 1))
    future_consumption = np.array(future_consumption, dtype=np.float32)
    is_day_off = np.array(is_day_off, dtype=np.float32)
    # Encode working days as -1 instead of 0.
    is_day_off[is_day_off == 0] = -1
    data_trend = np.array(data_trend, dtype=np.float32)
    metadata_ohe = np.array(metadata_ohe, dtype=np.float32)
    metadata_days_off = np.array(metadata_days_off, dtype=np.float32)
    cluster_id_ohe = np.array(cluster_id_ohe, dtype=np.float32)
    cluster_features_v2 = np.array(cluster_features_v2, dtype=np.float32)
    x = {
        'past_consumption': past_consumption,
        'is_day_off': is_day_off,
        'data_trend': data_trend,
        'metadata_ohe': metadata_ohe,
        'metadata_days_off': metadata_days_off,
        'cluster_id_ohe': cluster_id_ohe,
        'cluster_features_v2': cluster_features_v2,
    }
    return x, future_consumption