def add_help_data(pickle_dir=Path('pickles')):
    """Add per-phase helper columns to every station's phase pickles.

    For each station directory returned by ``get_file_paths`` this reads
    the pickles ``phase1``..``phase3`` and writes ``h_phase1``..``h_phase3``
    containing two extra columns:

    - ``row_dif``:   difference between consecutive ``Value`` samples
    - ``phase_dif``: maximum pairwise absolute difference between the
      three phases' values at each timestamp

    Parameters
    ----------
    pickle_dir : Path
        Directory with one sub-directory per station (default ``pickles``).
    """
    file_paths = get_file_paths(pickle_dir)
    print(file_paths)
    for path in file_paths:
        print(path)
        path = pickle_dir / Path(path)
        df_phases = [pd.read_pickle(path / ("phase" + p)) for p in ('1', '2', '3')]
        print("Opened pickle")

        # Collect the three phases' values side by side for row-wise comparison.
        phase_values = pd.DataFrame()
        for i, df_p in enumerate(df_phases):
            # These columns are constant metadata and not needed downstream.
            df_p.drop(columns=['Unit', 'AliasName'], inplace=True)
            phase_values['p' + str(i + 1)] = df_p.Value

        for df_p in df_phases:
            df_p['row_dif'] = df_p.Value.diff()
        print("Created help values")

        # NOTE: a stray `np.diff(phase_values.values)` call whose result was
        # discarded has been removed here (dead code).

        # Maximum pairwise spread between the three phases at each timestamp.
        phase_values['max_dif'] = phase_values.apply(
            lambda row: max(abs(row['p1'] - row['p2']),
                            abs(row['p1'] - row['p3']),
                            abs(row['p2'] - row['p3'])),
            axis=1)
        print("Calculated help data")

        for df_p in df_phases:
            df_p['phase_dif'] = phase_values['max_dif']
        print("Assigned help data")

        for i, df_p in enumerate(df_phases):
            print(df_p)
            df_p.to_pickle(path / ("h_phase" + str(i + 1)))
def update_trafo(pickle_dir=Path('pickles')):
    """Add a ``trafo`` helper column to every station's h_phase pickles.

    Per phase, ``row_dif`` is rewritten as the value change per second.
    ``trafo`` is the minimum absolute per-second change across the three
    phases, but only on rows where all three phases move in the same
    direction; rows with disagreeing directions contribute 0.
    """
    station_dirs = get_file_paths(pickle_dir)
    print(station_dirs)
    for station in station_dirs:
        print(station)
        station_path = pickle_dir / Path(station)
        phase_frames = [pd.read_pickle(station_path / ("h_phase" + n))
                        for n in ('1', '2', '3')]
        print("Opened pickle")

        # One column per phase holding the per-second slope of Value.
        slopes = pd.DataFrame()
        for idx, frame in enumerate(phase_frames):
            elapsed = frame.Value.index.to_series().diff().dt.total_seconds()
            frame['row_dif'] = frame.Value.diff() / elapsed
            slopes[str(idx)] = frame['row_dif']

        all_rising = ((slopes['0'] >= 0)
                      & (slopes['1'] >= 0)
                      & (slopes['2'] >= 0))
        all_falling = ((slopes['0'] < 0)
                       & (slopes['1'] < 0)
                       & (slopes['2'] < 0))
        # `True ^ mask` negates the boolean mask: zero out whole rows where
        # the three phases do not all move in the same direction.
        slopes.loc[True ^ (all_rising | all_falling)] = 0
        slopes = slopes.abs()

        for frame in phase_frames:
            frame['trafo'] = slopes.min(axis=1)
        print("Assigned help data")

        for idx, frame in enumerate(phase_frames):
            frame.to_pickle(station_path / ("h_phase" + str(idx + 1)))
def create_mean_street_pickles(pickle_dir=Path('pickles')):
    """Build one pickle with the mean value over all stations.

    For each station the three phase pickles are outer-joined, resampled
    to 30-second means and averaged across phases; the resulting
    per-station series are then averaged across stations and written to
    ``pickle_dir / 'meanStationValues'``.

    Parameters
    ----------
    pickle_dir : Path
        Directory with one sub-directory per station (default ``pickles``).
    """
    station_avgs = pd.DataFrame()
    file_paths = get_file_paths(pickle_dir)
    print(file_paths)
    # NOTE: removed an unused local `day = pd.Timedelta('1d')`.
    for path in file_paths:
        station_name = path
        print(path)
        path = pickle_dir / Path(path)

        # Outer-join the three phases so differing timestamps are kept.
        df_phases = pd.DataFrame()
        for p, df_p in enumerate(
                [pd.read_pickle(path / ("phase" + n))[['Value']]
                 for n in ('1', '2', '3')]):
            df_phases = df_phases.join(
                other=df_p.rename(columns={'Value': 'ValueP' + str(p + 1)}),
                how='outer')

        # 30 s buckets align stations with different sampling instants.
        df_phases = df_phases.resample('30s').mean()
        df_phases[station_name] = df_phases.mean(axis=1)
        station_avgs = station_avgs.join(df_phases[[station_name]], how='outer')

    # Collapse the per-station columns into a single mean series.
    station_avgs = station_avgs.mean(axis=1)
    print(station_avgs)
    station_avgs.to_pickle(pickle_dir / 'meanStationValues')
def add_cross_station_data(pickle_dir=Path('pickles')):
    """Add a ``StationDif`` column to every station's h_phase pickles.

    ``StationDif`` is the phase's ``Value`` minus the all-station mean
    (``meanStationValues``, produced by ``create_mean_street_pickles``)
    at the matching 30-second bucket.

    Bug fix: the mean pickle was previously read from the undefined name
    ``pickle_directory`` instead of the ``pickle_dir`` parameter, raising
    ``NameError`` (or silently using a stale global).

    Parameters
    ----------
    pickle_dir : Path
        Directory with one sub-directory per station (default ``pickles``).
    """
    station_avgs = pd.read_pickle(pickle_dir / "meanStationValues")
    file_paths = get_file_paths(pickle_dir)
    for path in file_paths:
        print(path)
        path = pickle_dir / Path(path)
        df_phases = [pd.read_pickle(path / ("h_phase" + p)) for p in ('1', '2', '3')]
        for p, df_p in enumerate(df_phases):
            print(p)
            print(df_p)
            v1s = []
            for index, row in df_p.iterrows():
                # Round the timestamp down to its 30 s bucket to look up
                # the corresponding all-station mean.
                bucket = index - datetime.timedelta(
                    seconds=index.second % 30, microseconds=index.microsecond)
                v1s.append(row['Value'] - station_avgs.loc[bucket])
            df_p['StationDif'] = v1s
            print(df_p)
            df_p.to_pickle(path / ("h_phase" + str(p + 1)))
def add_new_seasonal_data(pickle_dir=Path('pickles')):
    """Recompute the ``SeasDif`` column of every station's h_phase pickles.

    ``SeasDif`` is the phase's ``Value`` minus the station's seasonal
    aggregation series (``<station>season_aggregation``) at the matching
    30-second bucket.

    Bug fix: the old ``df_p.drop(labels='SeasDif', ...)`` dropped along
    axis=0 (rows) — with ``errors='ignore'`` it silently did nothing.
    The intent is clearly to drop the stale *column* before recomputing.

    Parameters
    ----------
    pickle_dir : Path
        Directory with one sub-directory per station (default ``pickles``).
    """
    file_paths = get_file_paths(pickle_dir)
    for path in file_paths:
        station_season = pd.read_pickle(pickle_dir / (path + 'season_aggregation'))
        print(path)
        path = pickle_dir / Path(path)
        df_phases = [pd.read_pickle(path / ("h_phase" + p)) for p in ('1', '2', '3')]
        for p, df_p in enumerate(df_phases):
            # Drop any stale column from a previous run before recomputing.
            df_p.drop(columns='SeasDif', inplace=True, errors='ignore')
            print(p)
            v1s = []
            for index, row in df_p.iterrows():
                # Round the timestamp down to its 30 s bucket to look up
                # the seasonal reference value.
                bucket = index - datetime.timedelta(
                    seconds=index.second % 30, microseconds=index.microsecond)
                v1s.append(row['Value'] - station_season.loc[bucket])
            df_p['SeasDif'] = v1s
            print(df_p)
            df_p.to_pickle(path / ("h_phase" + str(p + 1)))
def drop_useless_labels(pickle_dir=Path('pickles')):
    """Remove the constant ``Unit`` and ``AliasName`` metadata columns
    from every station's h_phase pickles, writing each back in place.

    Parameters
    ----------
    pickle_dir : Path
        Directory with one sub-directory per station (default ``pickles``).
    """
    file_paths = get_file_paths(pickle_dir)
    print(file_paths)
    # NOTE: removed an unused local `day = pd.Timedelta('1d')`.
    for path in file_paths:
        path = pickle_dir / Path(path)
        df_phases_h = [pd.read_pickle(path / ("h_phase" + p)) for p in ('1', '2', '3')]
        for p, df_p in enumerate(df_phases_h):
            df_p.drop(columns=['Unit', 'AliasName'], inplace=True)
            df_p.to_pickle(path / ("h_phase" + str(p + 1)))
def add_seasonal_data(pickle_dir=Path('pickles')):
    """Add a ``SeasDif`` column to every station's h_phase pickles.

    For each station, builds a per-weekday seasonal profile from the raw
    phase pickles (median value per 30-second time-of-day slot across all
    days of that weekday), then sets ``SeasDif`` = ``Value`` minus the
    profile's median for the row's weekday and time slot.

    Parameters
    ----------
    pickle_dir : Path
        Directory with one sub-directory per station (default ``pickles``).
    """
    # NOTE(review): this accumulator is never used — candidate for removal.
    seasonal_data = pd.DataFrame()
    file_paths = get_file_paths(pickle_dir)
    print(file_paths)
    day = pd.Timedelta('1d')
    for path in file_paths:
        print(path)
        path = pickle_dir / Path(path)
        df_phases = list(
            map(lambda p: pd.read_pickle(path / ("phase" + p))[['Value']],
                ['1', '2', '3']))
        # weekday_dfs_phases[phase][weekday] -> DataFrame with one column
        # per calendar date, indexed by time-of-day (30 s slots).
        weekday_dfs_phases = [[None for x in range(7)] for y in range(3)]
        min_date = min(list(map(lambda df: df.index.min(), df_phases))).date()
        max_date = max(list(map(lambda df: df.index.max(), df_phases))).date()
        for p, df_p in enumerate(df_phases):
            for start_time in pd.date_range(min_date, max_date, freq='d'):
                end_time = start_time + day
                df_p_day = df_p.loc[start_time:end_time]
                # Median per 30 s slot for this single day; column named
                # after the date so days can be joined side by side.
                df_p_day_med = df_p_day.resample('30s').median().rename(
                    columns={'Value': str(start_time.date())})
                df_p_day_med.index = df_p_day_med.index.time
                weekday = start_time.date().weekday()
                if weekday_dfs_phases[p][weekday] is None:
                    weekday_df = df_p_day_med
                    weekday_dfs_phases[p][weekday] = weekday_df
                else:
                    weekday_df = weekday_dfs_phases[p][weekday]
                    weekday_df = weekday_df.join(df_p_day_med, how='outer')
                    weekday_dfs_phases[p][weekday] = weekday_df
        print("Split DF")
        # Median across all days of the same weekday -> seasonal profile.
        # NOTE(review): assumes every weekday occurred in the data range;
        # a missing weekday would leave None here and raise — confirm.
        for p, df_weekdays in enumerate(weekday_dfs_phases):
            for w, df in enumerate(df_weekdays):
                df['med'] = df.median(axis=1)
        df_phases_h = list(
            map(lambda p: pd.read_pickle(path / ("h_phase" + p)),
                ['1', '2', '3']))
        print(df_phases_h)
        for p, df_p in enumerate(df_phases_h):
            print(p)
            df_weekdays = weekday_dfs_phases[p]
            # Look up the profile median for this row's weekday and its
            # 30 s time-of-day slot (timestamp rounded down to :00/:30).
            df_p['SeasDif'] = df_p.apply(
                lambda row: (row['Value'] - df_weekdays[row.name.weekday()].loc[
                    (row.name - datetime.timedelta(
                        seconds=row.name.second % 30,
                        microseconds=row.name.microsecond)).time()]['med']),
                axis=1)
            print(df_p)
            df_p.to_pickle(path / ("h_phase" + str(p + 1)))
def add_time_gaps(pickle_dir=Path('pickles')):
    """Add a ``time_passed`` column (seconds since the previous sample)
    to every station's h_phase pickles, writing each back in place.

    Parameters
    ----------
    pickle_dir : Path
        Directory with one sub-directory per station (default ``pickles``).
    """
    file_paths = get_file_paths(pickle_dir)
    print(file_paths)
    # NOTE: removed an unused local `day = pd.Timedelta('1d')`.
    for path in file_paths:
        print(path)
        path = pickle_dir / Path(path)
        df_phases_h = [pd.read_pickle(path / ("h_phase" + p)) for p in ('1', '2', '3')]
        for p, df_p in enumerate(df_phases_h):
            # Gap to the previous row, from the DatetimeIndex itself.
            df_p['time_passed'] = df_p.index.to_series().diff().dt.total_seconds()
            df_p.to_pickle(path / ("h_phase" + str(p + 1)))
def create_season_pickle(pickle_dir=Path('pickles')):
    """Create a ``<station>season_aggregation`` pickle per station.

    Splits the station's ``<station>aggregation`` pickle into rest days
    (weekends and NRW public holidays of 2017) and work days, then for
    every row computes the mean over all samples with the same
    time-of-day within a +/- 3-week window ('windowed_means').  The two
    resulting series are concatenated and pickled.

    Parameters
    ----------
    pickle_dir : Path
        Directory holding the aggregation pickles (default ``pickles``).
    """
    file_paths = get_file_paths(pickle_dir)
    print(file_paths)
    for path in file_paths:
        print(path)
        station_name = path
        df_mean_season = pd.Series()
        df_mean_pickle = pd.read_pickle(pickle_dir / (str(path) + 'aggregation'))
        print('len mean_pickle: ' + str(len(df_mean_pickle)))
        print(df_mean_pickle)
        column_name = 'windowed_means'
        # NRW public holidays of 2017 count as rest days.
        # NOTE(review): hard-coded year — assumes the data covers 2017 only.
        holidays_nrw = list(holidays.DE(years=2017, state='NW').keys())
        print(holidays_nrw)
        # dayofweek >= 5 selects Saturday/Sunday.
        df_mean_pickle_restday = df_mean_pickle[(
            (df_mean_pickle.index.dayofweek >= 5)
            | (df_mean_pickle.index).isin(holidays_nrw))]
        # `True ^ mask` negates the boolean mask (work days = complement).
        df_mean_pickle_workday = df_mean_pickle[True ^ (
            (df_mean_pickle.index.dayofweek >= 5)
            | (df_mean_pickle.index).isin(holidays_nrw))]
        print('Split_dataframe')
        for i, df_mean_pickle_typeday in enumerate(
                [df_mean_pickle_restday, df_mean_pickle_workday]):
            df_mean_pickle_typeday = df_mean_pickle_typeday[[station_name
                                                             ]].dropna()
            v1s = []
            min_date = df_mean_pickle_typeday.index.min()
            max_date = df_mean_pickle_typeday.index.max()
            three_w_timedelta = pd.Timedelta('3w')
            # Track the previous window bounds purely for progress logging.
            old_window_min_date = min_date.date()
            old_window_max_date = max_date.date()
            print(min_date)
            for index, row in df_mean_pickle_typeday.iterrows():
                # +/- 3-week window around this row, clamped to the data range.
                window_min_date = max(min_date, index - three_w_timedelta)
                window_max_date = min(max_date, index + three_w_timedelta)
                window_slice = df_mean_pickle_typeday.loc[
                    window_min_date:window_max_date]
                # Keep only samples at the same time of day as this row.
                window_slice = window_slice.loc[
                    window_slice.index.time == index.time()]
                v1 = window_slice[station_name].mean()
                # Debug output only when the window bounds moved to a new day.
                if old_window_min_date != window_min_date.date(
                ) or old_window_max_date != window_max_date.date():
                    print(str(window_min_date) + ' -> ' + str(window_max_date))
                    old_window_min_date = window_min_date.date()
                    old_window_max_date = window_max_date.date()
                    print(window_slice)
                    print(v1)
                v1s.append(v1)
            df_mean_pickle_typeday[column_name] = v1s
            print('len v1s: ' + str(len(v1s)))
            print(df_mean_pickle_typeday[[column_name]])
            print(df_mean_season)
            # Accumulate rest-day and work-day results into one series.
            df_mean_season = pd.concat(
                [df_mean_season, df_mean_pickle_typeday[column_name]], sort=True)
            print('len mean_season: ' + str(df_mean_season.size))
            print(df_mean_season)
        df_mean_season.to_pickle(pickle_dir / (str(path) + 'season_aggregation'))