def test_hierarchical():

    raw = pd.read_csv(os.path.join(os.path.dirname(__file__), '..', 'examples',
                                   'testdata.csv'), index_col=0)

    results = pd.read_csv(os.path.join(os.path.dirname(__file__), '..',
                                       'examples', 'results',
                                       'testperiods_hierarchical.csv'),
                          index_col=[0, 1])

    starttime = time.time()

    aggregation = tsam.TimeSeriesAggregation(
        raw,
        noTypicalPeriods=8,
        hoursPerPeriod=24,
        clusterMethod='hierarchical',
        extremePeriodMethod='new_cluster_center',
        addPeakMin=['T'],
        addPeakMax=['Load'])

    typPeriods = aggregation.createTypicalPeriods()

    print('Clustering took ' + str(time.time() - starttime))

    np.testing.assert_array_almost_equal(typPeriods.values, results.values,
                                         decimal=4)
def test_hierarchical():

    raw = pd.read_csv(os.path.join(os.path.dirname(__file__), '..', 'examples',
                                   'testdata.csv'), index_col=0)

    orig_raw = pd.read_csv(os.path.join(os.path.dirname(__file__), '..',
                                        'examples', 'results',
                                        'testperiods_hierarchical.csv'),
                           index_col=[0, 1])

    starttime = time.time()

    aggregation = tsam.TimeSeriesAggregation(
        raw,
        noTypicalPeriods=8,
        hoursPerPeriod=24,
        clusterMethod='hierarchical',
        extremePeriodMethod='new_cluster_center',
        addPeakMin=['T'],
        addPeakMax=['Load'])

    typPeriods = aggregation.createTypicalPeriods()

    print('Clustering took ' + str(time.time() - starttime))

    # sort the typical days in order to avoid assertion errors caused by a
    # different cluster order
    sortedDaysOrig = orig_raw.sum(axis=0, level=0).sort_values('GHI').index
    sortedDaysTest = typPeriods.sum(axis=0, level=0).sort_values('GHI').index

    # rearrange their order
    orig = orig_raw[typPeriods.columns].unstack().loc[sortedDaysOrig, :].stack()
    test = typPeriods.unstack().loc[sortedDaysTest, :].stack()

    np.testing.assert_array_almost_equal(orig.values, test.values, decimal=4)
def test_samemean():

    raw = pd.read_csv(os.path.join(os.path.dirname(__file__), '..', 'examples',
                                   'testdata.csv'), index_col=0)

    # cast all columns to float to avoid a dtype warning
    for col in raw.columns:
        raw[col] = raw[col].astype(float)

    starttime = time.time()

    aggregation = tsam.TimeSeriesAggregation(
        raw,
        noTypicalPeriods=8,
        hoursPerPeriod=24,
        clusterMethod='k_means',
        sameMean=True,
    )

    typPeriods = aggregation.createTypicalPeriods()

    print('Clustering took ' + str(time.time() - starttime))

    # test that the normalized time series all have the same mean
    means = aggregation.normalizedTimeSeries.mean().values
    np.testing.assert_allclose(means, np.array([means[0]] * len(means)),
                               rtol=1e-5)

    # repredict the original data
    rearrangedData = aggregation.predictOriginalData()

    # test that the mean matches the mean of the raw time series; this should
    # always hold for k-means, independent of whether sameMean is True or False
    np.testing.assert_array_almost_equal(raw.mean(),
                                         rearrangedData[raw.columns].mean(),
                                         decimal=4)
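# Hedged sketch of the k-means claim in the comment above: each k-means center
# is the mean of its members, so weighting the centers by their occurrence
# counts reproduces the overall mean exactly. A toy one-dimensional example
# (not part of the original test suite):
import numpy as np

values = np.array([1.0, 2.0, 3.0, 10.0, 11.0, 12.0])
centers = np.array([values[:3].mean(), values[3:].mean()])  # two clusters
counts = np.array([3, 3])                                   # members per cluster
assert np.isclose((centers * counts).sum() / counts.sum(), values.mean())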
def test_k_medoids():
    # renamed from test_hierarchical: this test reads the k-medoids results
    # and clusters with clusterMethod='k_medoids'

    raw = pd.read_csv(os.path.join(os.path.dirname(__file__), '..', 'examples',
                                   'testdata.csv'), index_col=0)

    results = pd.read_csv(os.path.join(os.path.dirname(__file__), '..',
                                       'examples', 'results',
                                       'testperiods_kmedoids.csv'),
                          index_col=[0, 1])

    starttime = time.time()

    aggregation = tsam.TimeSeriesAggregation(
        raw,
        noTypicalPeriods=8,
        hoursPerPeriod=24 * 7,
        clusterMethod='k_medoids',
    )

    typPeriods = aggregation.createTypicalPeriods()

    print('Clustering took ' + str(time.time() - starttime))

    # sort the typical weeks by their total GHI so the comparison is
    # independent of the cluster numbering
    sortedTypPeriods = typPeriods.reindex(
        typPeriods['GHI'].unstack().sum(axis=1).sort_values().index, level=0)

    np.testing.assert_array_almost_equal(sortedTypPeriods.values,
                                         results.values, decimal=4)
def discretize_tsam(
    self,
    resolution=None,
    noTypicalPeriods=10,
    hoursPerPeriod=24,
    clusterMethod="hierarchical",
    evalSumPeriods=False,
    sortValues=False,
    sameMean=False,
    rescaleClusterPeriods=True,
    weightDict=None,
    extremePeriodMethod="None",
    solver="glpk",
    roundOutput=None,
    addPeakMin=None,
    addPeakMax=None,
    addMeanMin=None,
    addMeanMax=None,
):
    """Aggregate the time series into typical periods with tsam and return
    the repredicted data."""
    try:
        import tsam.timeseriesaggregation as tsam
    except ImportError:
        raise ImportError("tsam is required for discretize_tsam()")
    if not isinstance(self.index, DatetimeIndex):
        raise TypeError("To use tsam, the index of the series must be a "
                        "DatetimeIndex")
    timeSeries = self.copy()
    agg = tsam.TimeSeriesAggregation(
        timeSeries,
        resolution=resolution,
        noTypicalPeriods=noTypicalPeriods,
        hoursPerPeriod=hoursPerPeriod,
        clusterMethod=clusterMethod,
        evalSumPeriods=evalSumPeriods,
        sortValues=sortValues,
        sameMean=sameMean,
        rescaleClusterPeriods=rescaleClusterPeriods,
        weightDict=weightDict,
        extremePeriodMethod=extremePeriodMethod,
        solver=solver,
        roundOutput=roundOutput,
        addPeakMin=addPeakMin,
        addPeakMax=addPeakMax,
        addMeanMin=addMeanMin,
        addMeanMax=addMeanMax,
    )
    agg.createTypicalPeriods()
    results = agg.predictOriginalData()
    results = EnergyDataFrame(results)
    # keep a reference to the fitted aggregator for later inspection
    results.__dict__["agg"] = agg
    return results.__finalize__(self)
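# A minimal usage sketch for discretize_tsam, assuming the method lives on
# energy_pandas.EnergyDataFrame; the import path, the 'Load' column, and the
# random profile below are illustrative assumptions, not part of the source.
import numpy as np
import pandas as pd
from energy_pandas import EnergyDataFrame

idx = pd.date_range("2020-01-01", periods=8760, freq="H")
edf = EnergyDataFrame({"Load": np.random.rand(8760)}, index=idx)

# Aggregate to 10 typical days, then repredict the full year; the fitted tsam
# aggregator should remain reachable through the `agg` attribute set above.
repredicted = edf.discretize_tsam(noTypicalPeriods=10, hoursPerPeriod=24)
print(repredicted.agg.accuracyIndicators())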
def apply_time_segmentation(n, segments):
    logger.info(f"Aggregating time series to {segments} segments.")
    try:
        import tsam.timeseriesaggregation as tsam
    except ImportError:
        raise ModuleNotFoundError("Optional dependency 'tsam' not found. "
                                  "Install via 'pip install tsam'")

    # normalize all time series to a maximum of 1 before clustering
    p_max_pu_norm = n.generators_t.p_max_pu.max()
    p_max_pu = n.generators_t.p_max_pu / p_max_pu_norm

    load_norm = n.loads_t.p_set.max()
    load = n.loads_t.p_set / load_norm

    inflow_norm = n.storage_units_t.inflow.max()
    inflow = n.storage_units_t.inflow / inflow_norm

    raw = pd.concat([p_max_pu, load, inflow], axis=1, sort=False)

    solver_name = snakemake.config["solving"]["solver"]["name"]

    agg = tsam.TimeSeriesAggregation(
        raw,
        hoursPerPeriod=len(raw),
        noTypicalPeriods=1,
        noSegments=int(segments),
        segmentation=True,
        solver=solver_name,
    )

    segmented = agg.createTypicalPeriods()

    weightings = segmented.index.get_level_values("Segment Duration")
    offsets = np.insert(np.cumsum(weightings[:-1]), 0, 0)
    snapshots = [n.snapshots[0] + pd.Timedelta(f"{offset}h")
                 for offset in offsets]

    n.set_snapshots(pd.DatetimeIndex(snapshots, name="name"))
    n.snapshot_weightings = pd.Series(weightings, index=snapshots,
                                      name="weightings", dtype="float64")

    segmented.index = snapshots

    # rescale the segmented series back to their original magnitudes
    n.generators_t.p_max_pu = (segmented[n.generators_t.p_max_pu.columns]
                               * p_max_pu_norm)
    n.loads_t.p_set = segmented[n.loads_t.p_set.columns] * load_norm
    n.storage_units_t.inflow = (segmented[n.storage_units_t.inflow.columns]
                                * inflow_norm)

    return n
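# Hedged usage sketch for apply_time_segmentation: assumes a pypsa.Network
# with hourly snapshots (the file name below is illustrative), plus module-level
# `logger` and `snakemake` objects in PyPSA-Eur style; without a `snakemake`
# object supplying the solver name, the call raises a NameError. Note that
# n.snapshot_weightings is assigned as a Series, matching older PyPSA versions.
import pypsa

n = pypsa.Network("elec.nc")         # hypothetical input network
n = apply_time_segmentation(n, 100)  # collapse the year into 100 segments
print(n.snapshot_weightings.head())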
def tsam_cluster(timeseries_df, typical_periods=10, how='daily'):
    """
    Parameters
    ----------
    timeseries_df : pd.DataFrame
        DataFrame with the time series to cluster

    Returns
    -------
    timeseries : pd.DataFrame
        Clustered time series
    """
    if how == 'daily':
        hours = 24
    elif how == 'weekly':
        hours = 168
    else:
        raise ValueError("how must be 'daily' or 'weekly'")

    aggregation = tsam.TimeSeriesAggregation(
        timeseries_df,
        noTypicalPeriods=typical_periods,
        rescaleClusterPeriods=False,
        hoursPerPeriod=hours,
        clusterMethod='hierarchical')

    timeseries = aggregation.createTypicalPeriods()
    cluster_weights = aggregation.clusterPeriodNoOccur

    # get the medoids / clusterCenterIndices
    clusterCenterIndices = aggregation.clusterCenterIndices

    # get the first hour of each clusterCenterIndices period (days start at 0)
    start = []
    for i in clusterCenterIndices:
        start.append(i * hours)

    # collect all hours belonging to the clusterCenterIndices periods
    nrhours = []
    for j in start:
        nrhours.append(j)
        x = 1
        while x < hours:
            j = j + 1
            nrhours.append(j)
            x = x + 1

    # get the original DatetimeIndex
    dates = timeseries_df.iloc[nrhours].index

    return timeseries, cluster_weights, dates, hours
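# Hedged usage sketch for tsam_cluster: any hourly DataFrame covering whole
# days works; the synthetic one-column profile below is illustrative.
import numpy as np
import pandas as pd

idx = pd.date_range("2020-01-01", periods=24 * 365, freq="H")
demo = pd.DataFrame({"Load": np.random.rand(len(idx))}, index=idx)

typical, weights, dates, hours = tsam_cluster(demo, typical_periods=10,
                                              how="daily")
# `weights` maps each cluster number to the count of days it represents;
# `dates` is the DatetimeIndex of the medoid days within the raw data.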
def test_preprocess():

    raw = pd.read_csv(os.path.join(os.path.dirname(__file__), '..', 'examples',
                                   'testdata.csv'), index_col=0)

    raw_wind = raw.loc[:, 'Wind'].to_frame()

    aggregation_wind = tsam.TimeSeriesAggregation(raw_wind,
                                                  noTypicalPeriods=8,
                                                  hoursPerPeriod=24,
                                                  clusterMethod='hierarchical')

    aggregation_wind._preProcessTimeSeries()

    test = aggregation_wind.normalizedPeriodlyProfiles

    orig = pd.read_csv(os.path.join(os.path.dirname(__file__), '..', 'examples',
                                    'results', 'preprocessed_wind.csv'),
                       index_col=[0], header=[0, 1])

    np.testing.assert_array_almost_equal(test.values, orig.values, decimal=15)
system = System(lat, lon, model_name=modelname)
wind = Wind("Onshore turbine", lat=lat, lon=lon, installed=1, use_dowa=False)
pv = PhotoVoltaic("PV Full south", installed=1)
system.add_components([wind, pv])
system.fetch_input_data()
system.calculate_time_series()
source = "PVGIS-SARAH ERA-Interim 2005-2016 TMY"

# %% aggregation
raw = pd.DataFrame([wind.state.power, pv.state.power],
                   index=["Wind", "PV"]).transpose()

aggregation = tsam.TimeSeriesAggregation(
    raw,
    noTypicalPeriods=8,
    hoursPerPeriod=24,
    clusterMethod="hierarchical",
    solver="gurobi",
)

typPeriods_c = aggregation.createTypicalPeriods()
accuracy = aggregation.accuracyIndicators()

# order the typical days from lowest to highest total mean output and map each
# original day to the rank of its cluster
low_to_high = typPeriods_c.groupby(level=0).mean().sum(axis=1).sort_values().index
low_to_high_map = {k: i for i, k in enumerate(low_to_high)}

raw["cluster"] = raw.apply(
    lambda row: low_to_high_map.get(
        aggregation.clusterOrder[row.name.dayofyear - 1]),
    axis=1)
import tsam.timeseriesaggregation as tsam
import pandas as pd

raw = pd.read_csv('testdata.csv', index_col=0)

aggregation = tsam.TimeSeriesAggregation(raw,
                                         noTypicalPeriods=8,
                                         hoursPerPeriod=24,
                                         clusterMethod='hierarchical')

df = aggregation.createTypicalPeriods()
weights = aggregation.clusterPeriodNoOccur
aggregation.clusterOrder

timesteps = [i for i in range(0, len(df.index.get_level_values('TimeStep')))]

print(aggregation.clusterCenterIndices)

# get the index entries of every day whose day of year is in
# clusterCenterIndices
days = [d for d in raw.index.dayofyear
        if d in aggregation.clusterCenterIndices]

# select the dates based on this
dates = raw.iloc[days].index

# TODO: check the index start (1-based dayofyear vs. 0-based numpy indices;
# they should fit, otherwise the selected days are offset by one day)
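# Hedged sketch addressing the TODO above: clusterCenterIndices are 0-based
# period positions while DatetimeIndex.dayofyear is 1-based, and raw is hourly,
# so raw.iloc[days] actually picks single hours, not whole days. Selecting by
# hour position avoids both problems (assumes raw starts at hour 0 of its
# first day):
center_hours = [d * 24 + h
                for d in aggregation.clusterCenterIndices
                for h in range(24)]
dates = raw.index[center_hours]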
def test_cluster_order():

    raw = pd.read_csv(os.path.join(os.path.dirname(__file__), '..', 'examples',
                                   'testdata.csv'), index_col=0)

    raw_wind = raw.loc[:, 'Wind'].to_frame()

    orig_raw_predefClusterOrder = pd.read_csv(
        os.path.join(os.path.dirname(__file__), '..', 'examples', 'results',
                     'testperiods_predefClusterOrder.csv'),
        index_col=[0, 1])

    orig_raw_predefClusterOrderAndClusterCenters = pd.read_csv(
        os.path.join(os.path.dirname(__file__), '..', 'examples', 'results',
                     'testperiods_predefClusterOrderAndClusterCenters.csv'),
        index_col=[0, 1])

    starttime = time.time()

    aggregation_wind = tsam.TimeSeriesAggregation(raw_wind,
                                                  noTypicalPeriods=8,
                                                  hoursPerPeriod=24,
                                                  clusterMethod='hierarchical')

    typPeriods_wind = aggregation_wind.createTypicalPeriods()

    aggregation_predefClusterOrder = tsam.TimeSeriesAggregation(
        raw,
        noTypicalPeriods=8,
        hoursPerPeriod=24,
        clusterMethod='hierarchical',
        predefClusterOrder=aggregation_wind.clusterOrder)

    typPeriods_predefClusterOrder = \
        aggregation_predefClusterOrder.createTypicalPeriods()

    aggregation_predefClusterOrderAndClusterCenters = tsam.TimeSeriesAggregation(
        raw,
        noTypicalPeriods=8,
        hoursPerPeriod=24,
        clusterMethod='hierarchical',
        predefClusterOrder=aggregation_wind.clusterOrder,
        predefClusterCenterIndices=aggregation_wind.clusterCenterIndices)

    typPeriods_predefClusterOrderAndClusterCenters = \
        aggregation_predefClusterOrderAndClusterCenters.createTypicalPeriods()

    print('Clustering took ' + str(time.time() - starttime))

    # sort the typical days in order to avoid assertion errors caused by a
    # different cluster order
    sortedDaysOrig1 = orig_raw_predefClusterOrder.sum(
        axis=0, level=0).sort_values('GHI').index
    sortedDaysTest1 = typPeriods_predefClusterOrder.sum(
        axis=0, level=0).sort_values('GHI').index
    sortedDaysOrig2 = orig_raw_predefClusterOrderAndClusterCenters.sum(
        axis=0, level=0).sort_values('GHI').index
    sortedDaysTest2 = typPeriods_predefClusterOrderAndClusterCenters.sum(
        axis=0, level=0).sort_values('GHI').index

    # rearrange their order
    orig1 = orig_raw_predefClusterOrder[
        typPeriods_predefClusterOrder.columns].unstack().loc[
            sortedDaysOrig1, :].stack()
    test1 = typPeriods_predefClusterOrder.unstack().loc[
        sortedDaysTest1, :].stack()
    orig2 = orig_raw_predefClusterOrderAndClusterCenters[
        typPeriods_predefClusterOrderAndClusterCenters.columns].unstack().loc[
            sortedDaysOrig2, :].stack()
    test2 = typPeriods_predefClusterOrderAndClusterCenters.unstack().loc[
        sortedDaysTest2, :].stack()

    np.testing.assert_array_almost_equal(orig1.values,
                                         test1[orig1.columns].values,
                                         decimal=4)
    np.testing.assert_array_almost_equal(orig2.values,
                                         test2[orig2.columns].values,
                                         decimal=4)
def tsam_cluster(timeseries_df, typical_periods=10, how='daily',
                 extremePeriodMethod='None'):
    """
    Parameters
    ----------
    timeseries_df : pd.DataFrame
        DataFrame with the time series to cluster
    extremePeriodMethod : {'None', 'append', 'new_cluster_center',
        'replace_cluster_center'}, default: 'None'
        Method for integrating extreme periods into the typical period
        profiles.
        'None': no integration at all.
        'append': append the extreme periods to the cluster centers.
        'new_cluster_center': add the extreme period as an additional cluster
        center; every period is then rechecked to see whether it fits better
        to this new center or to its original cluster center.
        'replace_cluster_center': replace the center of the cluster the
        extreme period belongs to with the period profile of the extreme
        period (worst-case system design).

    Returns
    -------
    timeseries : pd.DataFrame
        Clustered time series
    """
    if how == 'daily':
        hours = 24
        period = ' days'
    if how == 'weekly':
        hours = 168
        period = ' weeks'

    print('Snapshot clustering to ' + str(typical_periods) + period +
          ' using extreme period method: ' + extremePeriodMethod)

    aggregation = tsam.TimeSeriesAggregation(
        timeseries_df,
        noTypicalPeriods=typical_periods,
        extremePeriodMethod=extremePeriodMethod,
        addPeakMin=['residual_load'],
        addPeakMax=['residual_load'],
        rescaleClusterPeriods=False,
        hoursPerPeriod=hours,
        clusterMethod='hierarchical')

    timeseries = aggregation.createTypicalPeriods()
    cluster_weights = aggregation.clusterPeriodNoOccur
    clusterOrder = aggregation.clusterOrder
    clusterCenterIndices = aggregation.clusterCenterIndices

    if extremePeriodMethod == 'new_cluster_center':
        for i in aggregation.extremePeriods.keys():
            clusterCenterIndices.insert(
                aggregation.extremePeriods[i]['newClusterNo'],
                aggregation.extremePeriods[i]['stepNo'])

    if extremePeriodMethod == 'append':
        for i in aggregation.extremePeriods.keys():
            clusterCenterIndices.insert(
                aggregation.extremePeriods[i]['clusterNo'],
                aggregation.extremePeriods[i]['stepNo'])

    # get the first hour of each clusterCenterIndices period (days start at 0)
    start = []
    for i in clusterCenterIndices:
        start.append(i * hours)

    # collect all hours belonging to the clusterCenterIndices periods
    nrhours = []
    for j in start:
        nrhours.append(j)
        x = 1
        while x < hours:
            j = j + 1
            nrhours.append(j)
            x = x + 1

    # get the original DatetimeIndex
    dates = timeseries_df.iloc[nrhours].index

    # get the list of representative days
    representative_day = []

    # cluster: medoid of the respective cluster
    dic_clusterCenterIndices = dict(enumerate(clusterCenterIndices))
    for i in clusterOrder:
        representative_day.append(dic_clusterCenterIndices[i])

    # get the list of the last hour of each representative day
    last_hour_datetime = []
    for i in representative_day:
        last_hour = i * hours + hours - 1
        last_hour_datetime.append(timeseries_df.index[last_hour])

    # create a dataframe (index = number of the day in a year / candidate)
    df_cluster = pd.DataFrame({
        'Cluster': clusterOrder,                             # cluster of the day
        'RepresentativeDay': representative_day,             # representative day of the cluster
        'last_hour_RepresentativeDay': last_hour_datetime})  # last hour of the cluster
    df_cluster.index = df_cluster.index + 1
    df_cluster.index.name = 'Candidate'

    # create a dataframe mapping each timestep (h) to its candidate day (i):
    # df_i_h
    nr_day = []
    x = len(timeseries_df.index) / hours + 1
    for i in range(1, int(x)):
        j = 1
        while j <= hours:
            nr_day.append(i)
            j = j + 1
    df_i_h = pd.DataFrame({'Timeseries': timeseries_df.index,
                           'Candidate_day': nr_day})
    df_i_h.set_index('Timeseries', inplace=True)

    return df_cluster, cluster_weights, dates, hours, df_i_h
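# Hedged usage sketch for the extreme-period variant above: it requires a
# 'residual_load' column (per the addPeakMin/addPeakMax arguments); the
# synthetic profile below is illustrative.
import numpy as np
import pandas as pd

idx = pd.date_range("2020-01-01", periods=24 * 365, freq="H")
demo = pd.DataFrame({"residual_load": np.random.rand(len(idx))}, index=idx)

df_cluster, weights, dates, hours, df_i_h = tsam_cluster(
    demo, typical_periods=10, how="daily",
    extremePeriodMethod="new_cluster_center")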
def tsam_cluster(timeseries_df, typical_periods=10, how='daily'):
    """
    Parameters
    ----------
    timeseries_df : pd.DataFrame
        DataFrame with the time series to cluster

    Returns
    -------
    timeseries : pd.DataFrame
        Clustered time series
    """
    if how == 'daily':
        hours = 24
    if how == 'weekly':
        hours = 168

    '''
    ###########################################################
    # URI: This adds peaks.
    # URI: We find the day with the most peaks and pick one of the LOADS with
    # such a peak.

    # get the indices of the maxima
    loading = timeseries_df.filter(regex='0$|1$|2$|3$|4$|5$|6$|7$|8$|9$',
                                   axis=1)
    loading_max = loading.idxmax()
    # get rid of the hours
    # loading_aux = loading.apply(lambda x: str(x.year) + "-" + str(x.month)
    #                             + "-" + str(x.day))
    loading_aux = loading_max.apply(lambda x: x.strftime("%y-%m-%d"))

    # find the day that is the peak for most of the loads.
    # Notice: we just pick the first date; there could be other days with the
    # same number of maxima. This comes from:
    # https://stackoverflow.com/questions/6987285/python-find-the-item-with-maximum-occurrences-in-a-list
    from collections import defaultdict
    from operator import itemgetter
    c = defaultdict(int)
    for i in loading_aux:
        c[i] += 1
    peakloadday = max(c.items(), key=itemgetter(1))
    for idx, val in enumerate(loading_aux):
        if val == peakloadday[0]:
            peakloadcol = loading_aux.index.values[idx]
            break

    # URI: We find the day with the most peaks and pick one of the WIND
    # generators with such a peak.
    wind = timeseries_df.filter(regex='wind$', axis=1)
    wind_max = wind.idxmax()
    wind_aux = wind_max.apply(lambda x: x.strftime("%y-%m-%d"))
    c = defaultdict(int)
    for i in wind_aux:
        c[i] += 1
    peakwindday = max(c.items(), key=itemgetter(1))
    for idx, val in enumerate(wind_aux):
        if val == peakwindday[0]:
            peakwindcol = wind_aux.index.values[idx]
            break

    # URI: We find the day with the most peaks and pick one of the SOLAR
    # generators with such a peak.
    solar = timeseries_df.filter(regex='solar$', axis=1)
    solar_max = solar.idxmax()
    solar_aux = solar_max.apply(lambda x: x.strftime("%y-%m-%d"))
    c = defaultdict(int)
    for i in solar_aux:
        c[i] += 1
    peaksolarday = max(c.items(), key=itemgetter(1))
    for idx, val in enumerate(solar_aux):
        if val == peaksolarday[0]:
            # This is not the column with the overall maximum, just the first
            # column whose maximum falls on the day with the most maxima.
            peaksolarcol = solar_aux.index.values[idx]
            break

    aggregation = tsam.TimeSeriesAggregation(
        timeseries_df,
        noTypicalPeriods=typical_periods,
        rescaleClusterPeriods=False,
        hoursPerPeriod=hours,
        clusterMethod='hierarchical',
        # 'None', 'append', 'new_cluster_center', 'replace_cluster_center'
        extremePeriodMethod='append',
        addPeakMax=[peakloadcol, peakwindcol, peaksolarcol])
    '''
    ##################################################################
    # URI: This is the original method=hierarchical
    aggregation = tsam.TimeSeriesAggregation(
        timeseries_df,
        noTypicalPeriods=typical_periods,
        rescaleClusterPeriods=False,
        hoursPerPeriod=hours,
        clusterMethod='hierarchical'  # averaging, k_means, k_medoids, hierarchical
    )

    timeseries = aggregation.createTypicalPeriods()
    # URI: Better to take the whole thing
    timeseries_new = aggregation.predictOriginalData()
    cluster_weights = aggregation.clusterPeriodNoOccur

    # get the medoids / clusterCenterIndices
    clusterCenterIndices = aggregation.clusterCenterIndices
    # URI: and add the peak periods
    # clusterOrder = aggregation.clusterOrder
    # for i in range(len(clusterCenterIndices), len(cluster_weights.keys())):
    #     clusterCenterIndices.append(np.where(clusterOrder == i)[0][0])

    # get the first hour of each clusterCenterIndices period (days start at 0)
    start = []
    for i in clusterCenterIndices:
        start.append(i * hours)

    # collect all hours belonging to the clusterCenterIndices periods
    nrhours = []
    for j in start:
        nrhours.append(j)
        x = 1
        while x < hours:
            j = j + 1
            nrhours.append(j)
            x = x + 1

    # get the original DatetimeIndex
    dates = timeseries_df.iloc[nrhours].index

    '''
    ######################################################
    # URI: This is the method=k_means
    aggregation = tsam.TimeSeriesAggregation(
        timeseries_df,
        noTypicalPeriods=typical_periods,
        rescaleClusterPeriods=False,
        hoursPerPeriod=hours,
        clusterMethod='k_means')  # averaging, k_means, k_medoids, hierarchical

    timeseries = aggregation.createTypicalPeriods()
    # URI: Better to take the whole thing
    timeseries_new = aggregation.predictOriginalData()
    cluster_weights = aggregation.clusterPeriodNoOccur

    # get a representative as clusterCenterIndices
    # clusterCenterIndices = aggregation.clusterCenterIndices
    clusterCenterIndices = []
    for i in cluster_weights.keys():
        clusterCenterIndices.append(np.argmax(aggregation.clusterOrder == i))

    # URI: and add the peak periods
    # clusterOrder = aggregation.clusterOrder
    # for i in range(len(clusterCenterIndices), len(cluster_weights.keys())):
    #     clusterCenterIndices.append(np.where(clusterOrder == i)[0][0])

    # get the first hour of each clusterCenterIndices period (days start at 0)
    start = []
    for i in clusterCenterIndices:
        start.append(i * hours)

    # collect all hours belonging to the clusterCenterIndices periods
    nrhours = []
    for j in start:
        nrhours.append(j)
        x = 1
        while x < hours:
            j = j + 1
            nrhours.append(j)
            x = x + 1

    # get the original DatetimeIndex
    dates = timeseries_df.iloc[nrhours].index
    '''
    ##########################################################
    # URI: This is to get the data as csv
    directory = '/home/raventos/Results/tsamdata/'
    timeseries_df.to_csv(directory + 'rawPeriods' +
                         str(typical_periods) + '.csv')
    timeseries.to_csv(directory + 'typPeriods' +
                      str(typical_periods) + '.csv')
    timeseries_new.to_csv(directory + 'predictedPeriods' +
                          str(typical_periods) + '.csv')
    aggregation.indexMatching().to_csv(directory + 'indexMatching' +
                                       str(typical_periods) + '.csv')
    np.savetxt(directory + 'clusterOrder' + str(typical_periods) + '.csv',
               aggregation.clusterOrder.astype(int), fmt='%i', delimiter=",")
    cCI = pd.DataFrame(clusterCenterIndices, columns=['index'])
    cCI.to_csv(directory + 'clusterCenterIndices' +
               str(typical_periods) + '.csv')
    noOccur = pd.DataFrame(cluster_weights, index=['noOccur'])
    noOccur.to_csv(directory + 'noOccurrances' +
                   str(typical_periods) + '.csv')
    aggregation.accuracyIndicators().to_csv(directory + 'indicators' +
                                            str(typical_periods) + '.csv')
    np.savetxt(directory + 'dates' + str(typical_periods) + '.csv',
               dates.strftime("%Y-%m-%d %X"), fmt="%s", delimiter=",")

    # URI: Just checking
    # print([peakloadcol, peakwindcol, peaksolarcol])
    # print(clusterCenterIndices)
    # print([peakloadday, peakwindday, peaksolarday])
    # print(dates)
    # print(cluster_weights)
    # print(aggregation.indexMatching())

    return timeseries_new, cluster_weights, dates, hours
def temporal_clustering(datapackage, n, path="/tmp", how="daily"):
    """
    Creates a new datapackage by aggregating the sequences inside the
    `sequence` folder of the specified datapackage into `n` clusters.

    Parameters
    ----------
    datapackage: string
        String of the meta data file datapackage.json
    n: integer
        Number of clusters
    path: string
        Path to the directory where the aggregated datapackage is stored
    how: string
        How to cluster: 'daily' or 'hourly'
    """
    if how == "weekly":
        raise NotImplementedError("Weekly clustering is not implemented!")

    p = Package(datapackage)

    cwd = os.getcwd()

    copied_package_name = (p.descriptor["name"] + "__temporal_cluster__" +
                           how + "_" + str(n))

    copy_path = os.path.join(path, p.descriptor["name"], copied_package_name)

    copied_root = copy_datapackage(datapackage, os.path.abspath(copy_path),
                                   subset="data")

    sequence_resources = [
        r for r in p.resources
        if re.match(r"^data/sequences/.*$", r.descriptor["path"])
    ]

    dfs = {
        r.name: pd.DataFrame(r.read(keyed="True"))
                  .set_index("timeindex")
                  .astype(float)
        for r in sequence_resources
    }
    sequences = pd.concat(dfs.values(), axis=1)

    if how == "daily":
        hoursPerPeriod = 24
    elif how == "hourly":
        hoursPerPeriod = 1
    elif how == "weekly":
        hoursPerPeriod = 24 * 7

    aggregation = tsam.TimeSeriesAggregation(
        sequences,
        noTypicalPeriods=n,
        rescaleClusterPeriods=False,
        hoursPerPeriod=hoursPerPeriod,
        clusterMethod="hierarchical",
    )

    # map each cluster center index to its number of occurrences (the loop
    # variable is named `c` to avoid shadowing the parameter `n`)
    cluster_weights = {
        aggregation.clusterCenterIndices[c]: w
        for c, w in aggregation.clusterPeriodNoOccur.items()
    }

    if how == "daily":
        temporal = pd.Series(
            {
                d: cluster_weights[d.dayofyear]
                for d in sequences.index
                if d.dayofyear in aggregation.clusterCenterIndices
            },
            name="weighting",
        )
        temporal.index.name = "timeindex"
    elif how == "hourly":
        temporal = pd.Series(
            {
                h: cluster_weights[sequences.index.get_loc(h)]
                for h in sequences.index
                if sequences.index.get_loc(h) in aggregation.clusterCenterIndices
            },
            name="weighting",
        )
        temporal.index.name = "timeindex"

    # write resources to the copied package (this should not interfere with
    # the meta data, as columns are not removed and are sorted when written)
    os.chdir(copied_root)

    for r in sequence_resources:
        write_sequences(r.name + ".csv", dfs[r.name].loc[temporal.index],
                        replace=True)

    # write the temporal information from the clustering
    temporal.to_csv(
        "data/temporal.csv",
        header=True,
        sep=";",
        date_format="%Y-%m-%dT%H:%M:%SZ",
    )

    # add meta data for the new temporal information
    r = Resource({"path": "data/temporal.csv"})
    r.infer()
    # TODO: Add meta-data description
    r.descriptor["description"] = \
        "Temporal selection based on hierarchical clustering..."

    # update the meta data of the copied package
    cp = Package("datapackage.json")
    cp.descriptor["name"] = copied_package_name
    cp.descriptor["resources"].append(r.descriptor)
    cp.commit()
    cp.save("datapackage.json")

    # change back to the 'old' working directory
    os.chdir(cwd)

    return copied_root
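# Hedged usage sketch for temporal_clustering: assumes a tabular datapackage
# with hourly sequences under data/sequences/; the file name below is
# illustrative.
clustered_root = temporal_clustering("datapackage.json", n=5, path="/tmp",
                                     how="daily")
print("aggregated datapackage written to", clustered_root)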