def create_slot_data():
    raw_filename = config['transactions_filename']
    raw_filepath = os.path.join(os.getcwd(), config["dir_names"]["res_folder_name"])
    slot_minutes = config['slot_mins']
    save_location = os.path.join(os.getcwd(), config["dir_names"]["preprocess_folder_name"])
    save_file_name = "Slot_" + str(slot_minutes) + "_min_trans_data.csv"

    if os.path.exists(os.path.join(save_location, save_file_name)):
        if config['verbose'] > 1:
            print(' ------------------- Slotted data already present : Slots of ' +
                  str(slot_minutes) + ' minutes -------------------')
        return save_file_name, save_location
    else:
        # get raw data
        raw_data = get_csv_data(filename=raw_filename, filepath=raw_filepath)
        # process the raw data file
        processed_data = get_processed_data(raw_data=raw_data)
        processed_data.columns = [
            'Start_day', 'Start_month', 'Start_year', 'Start_DOY',
            'Start_weekday', 'Start_time', 'Energy_required',
            'Connected_time', 'Charge_point'
        ]
        # create the slotted data from the processed data
        slotted_data = get_slotted_array(data=processed_data, slot_secs=slot_minutes * 60)

        # Cleaning the data. THIS IS FOR OUR ELAAD NL DATASET.
        # PLEASE CHANGE IF NEW DATA IS USED.
        if config['data_collector'] == 'ELaadNL':
            # keep only sessions that actually drew energy
            processed_data = slotted_data[slotted_data.Energy_required != 0]
            processed_data['Departure_time'] = (processed_data['Start_time'] +
                                                processed_data['Connected_time'])
            # no car stays connected for more than 24 hours
            processed_data = processed_data[processed_data.Departure_time <=
                                            processed_data.Start_time + 24]
            # drop the anomalous band of sessions with a connected time of ~18 hours
            weird_line = ((processed_data.Connected_time <= 18.01) &
                          (processed_data.Connected_time > 17.99))
            processed_data = processed_data[~weird_line]
            processed_data = processed_data.reset_index()
            if config['verbose'] > 1:
                print(' ------------------- Created slotted data: Data cleaning - ')
                print(' \t\t Energy required > 0')
                print(' \t\t Departure time < Start time + 24 hours')
                print(' \t\t Removed the weird line in data')

        # save the slotted data file
        processed_data.to_csv(os.path.join(save_location, save_file_name), index=False)
        return save_file_name, save_location
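# The ELaadNL cleaning rules above are easiest to see on a toy frame. A minimal,
# self-contained sketch (hypothetical helper, not part of the pipeline; the
# column names match the real data, the values are made up):
def _demo_elaadnl_cleaning():
    import pandas as pd
    df = pd.DataFrame({
        'Start_time': [8.0, 9.5, 22.0, 10.0, 7.0],
        'Connected_time': [2.0, 18.0, 30.0, 4.0, 18.0],
        'Energy_required': [5.0, 0.0, 7.0, 3.0, 6.0],
    })
    # keep only sessions that actually drew energy
    df = df[df.Energy_required != 0]
    # departure = start + connected; drop stays longer than 24 hours
    df['Departure_time'] = df['Start_time'] + df['Connected_time']
    df = df[df.Departure_time <= df.Start_time + 24]
    # drop the anomalous ~18 hour band (the "weird line")
    weird_line = (df.Connected_time <= 18.01) & (df.Connected_time > 17.99)
    return df[~weird_line].reset_index(drop=True)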
def __init__(self, Clustered_filename, Clustered_filepath, pole_col_name='Charge_point'):
    # Load the data and do some preprocessing. Two extra columns are added here:
    # 1) a weekend/weekday indicator
    # 2) an indicator of the session
    self._data = get_csv_data(filename=Clustered_filename, filepath=Clustered_filepath)
    self._pole_col_name = pole_col_name
    # 2 extra columns
    self.weekdays_divide()
    self._data['Indicator'] = 1
    self._each_pole_data = self.split_data()
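# A minimal sketch of what the two extra columns and the per-pole split might
# look like on a toy frame. This is a hypothetical helper: weekdays_divide()
# and split_data() are not shown in this file, so their exact behavior is an
# assumption; the sketch only illustrates the general shape (daytype flag,
# session indicator, one frame per charge point).
def _demo_pole_split():
    import pandas as pd
    df = pd.DataFrame({
        'Charge_point': ['CP1', 'CP1', 'CP2'],
        'Start_weekday': [1, 5, 6],  # 0-4 weekday, 5-6 weekend
    })
    df['Start_daytype'] = (df['Start_weekday'] >= 5).astype(int)  # weekend/weekday indicator
    df['Indicator'] = 1                                           # session indicator
    # split_data()-style result: one frame per charge point
    return {cp: grp.reset_index(drop=True) for cp, grp in df.groupby('Charge_point')}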
def get_trained_model(model, process="EP", lambda_mod="mean"):
    year = config['Year']
    SLOT = config['slot_mins']
    model = str(model)
    process = str(process)
    lambda_mod = str(lambda_mod)
    slot_sec = 60 * SLOT
    # 'Factor' is the column we build independent models over. It is usually of
    # the form year_month_daytype; we fit independent AM, MMc and MMe models for
    # each factor level.
    factor = 'Factor'

    # --------------------------------------- LOADING DATASET ---------------------------------------
    if model == "AM":
        file_loc = config['filenames']['slot_data_file_path']
        file_name = config['filenames']['slot_data_file_name']
        required_file_name = "Slot_" + str(SLOT) + "_min_trans_data.csv"
    else:
        file_loc = config['filenames']['processed_data_file_path']
        file_name = config['filenames']['processed_data_file_name']
        required_file_name = ('Processed_' + str(SLOT) + '_min_' + str(year) +
                              '_year_trans_data.csv')
    try:
        if config['verbose'] > 0:
            print(' ------------------- Training ', str(model), ' model -------------------')
        all_data = get_csv_data(filename=file_name, filepath=file_loc)
    except Exception:
        if config['verbose'] > 0:
            print(' ------------------- Required data file not found -------------------')
            print(' \t\t Please run SDG_preprocessing.py before this script')
            print(' \t\t Missing data file :', required_file_name)
        sys.exit("ERROR")

    # --------------------------------- FIT and RETURN MODEL DATASETS ---------------------------------
    # Three different types of models can be returned here:
    #   AM  = arrival model. The standard parameters are an exponential process
    #         with variability lambda = True.
    #   MMc = mixture model for connection times
    #   MMe = mixture model for required energy
    if model == 'AM':
        # --------------------------------- PREPARE TRAINING DATASETS ---------------------------------
        # Sort the data into the required order
        all_data = all_data.sort_values(by=['Start_year', 'Start_DOY', 'Start_time']).copy()
        y_train = [2015]
        n_poles_test = [1677]

        # ------------------------------------ poles selection
        # This is a pole selector. The available methods are onceeachn, topn and
        # continous; continous uses all the data.
        PS = poles_selector(alldata=all_data, year=y_train)
        PS.select_poles(by='continous')
        charge_points = PS._charge_points

        # ------------------------------------- Training dataset
        ts_d = all_data[all_data['Start_year'].isin(y_train)]
        if config['verbose'] > 0:
            print(' \t\t Training AM for year: ' + str(year))
            print(' \t\t Training AM for slot minutes: ' + str(SLOT))
            print(' \t\t Total number of poles: ' + str(len(np.unique(ts_d['Charge_point']))))
            print(' \t\t Number of poles used: ' + str(len(charge_points)))
        ts_d = ts_d[ts_d['Charge_point'].isin(charge_points)]
        n_poles_train = len(np.unique(ts_d['Charge_point']))
        ts_d = ts_d.reset_index()

        # --------------------------------- add the factor column to the dataset
        Start_times_slot = get_slotted_data(ts_d['Start_time'], slot_sec)
        ts_d['Start_time_slot'] = Start_times_slot
        weekday = ts_d['Start_weekday'].copy()
        weekday[ts_d['Start_weekday'] < 5] = 0
        weekday[ts_d['Start_weekday'] >= 5] = 1
        ts_d[factor] = create_factor_arr(year=ts_d['Start_year'],
                                         month=ts_d['Start_month'],
                                         daytype=weekday)

        # Prepare ts and x, the inputs for the exponential and Poisson processes.
        # The time series of start times is unrolled across day boundaries (24 h
        # is added at each change of day) so that it increases monotonically.
        ts = ts_d['Start_time'].copy()
        doy = ts_d['Start_DOY'].copy()
        seasonality = 24.00
        d = 1  # day counter
        if config['verbose'] > 0:
            print(' \t\t Preparing time series for modeling ... ')
        for i in range(1, ts.size):
            if doy[i] != doy[i - 1]:
                ts[i:] = ts[i:] + seasonality
                d = d + 1

        # prepare the X for generating TS
        x = ts_d[[factor]].copy()
        # preparing start times
        x['Start_time'] = ts_d['Start_time_slot'].copy()
        x['Start_DOY'] = ts_d['Start_DOY'].copy()
        if config['verbose'] > 0:
            print(' \t\t Training ... ')

        if process == "IAT":
            # EXPONENTIAL PROCESS MODEL
            # ('variablity_lambda' is the kwarg spelling defined by the library)
            ep = exponential_process(events=ts, x=x, variablity_lambda=True,
                                     log=True, normalize=True)
            ep.fit(lambda_mod=lambda_mod, combine=np.arange(1, 7), poly_deg=1,
                   alpha=0.125, max_poly_deg=30, verbose=config['verbose'])
            if config['verbose'] > 0:
                print(' \t\t Trained ... ')
            return ep
        if process == "AC":
            # POISSON PROCESS MODEL
            pp = poisson_process(events=ts, x=x, variablity_lambda=True)
            pp.fit(lambda_mod=lambda_mod, combine=None, verbose=config['verbose'])
            if config['verbose'] > 0:
                print(' \t\t Trained ... ')
            return pp

    # MMc and MMe are identical except for the connection-time / energy columns.
    if model == 'MMc':
        if config['verbose'] > 0:
            print(' \t\t Training MMc for year: ' + str(year))
            print(' \t\t Training MMc for slot minutes: ' + str(SLOT))
        # --------------------------------------- PREPARING DATASETS ---------------------------------------
        # Sort the data into the required order
        all_data = all_data.sort_values(by=['Start_year', 'Start_DOY', 'Start_time']).copy()
        ts_d = all_data.copy()
        useful_data = ts_d[['Start_time', 'Connected_time', 'Final_clusters',
                            'Final_Pole_clusters', 'Start_daytype', 'Factor']].copy()
        useful_data['Start_time_slot'] = np.floor(useful_data['Start_time']) + 1
        # GMM for the mixture
        # ('initilizations' is the kwarg spelling defined by mixture_models)
        normal_mm_EM = mixture_models(y=useful_data['Connected_time'],
                                      x=useful_data[['Factor', 'Start_time_slot']],
                                      initilizations=useful_data['Final_clusters'],
                                      combine=None)
        normal_mm_EM.fit(mix='normal', method='EM', verbose=config['verbose'])
        return normal_mm_EM

    if model == 'MMe':
        if config['verbose'] > 0:
            print(' \t\t Training MMe for year: ' + str(year))
            print(' \t\t Training MMe for slot minutes: ' + str(SLOT))
        # --------------------------------------- PREPARING DATASETS ---------------------------------------
        # Sort the data into the required order
        all_data = all_data.sort_values(by=['Start_year', 'Start_DOY', 'Start_time']).copy()
        ts_d = all_data.copy()
        useful_data = ts_d[['Start_time', 'Energy_required', 'Final_clusters',
                            'Final_Pole_clusters', 'Start_daytype', 'Factor']].copy()
        useful_data['Start_time_slot'] = np.floor(useful_data['Start_time']) + 1
        # GMM for the mixture
        normal_mm_EM = mixture_models(y=useful_data['Energy_required'],
                                      x=useful_data[['Factor', 'Start_time_slot']],
                                      initilizations=useful_data['Final_clusters'],
                                      combine=None)
        normal_mm_EM.fit(mix='normal', method='EM', verbose=config['verbose'])
        return normal_mm_EM
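# Hypothetical usage sketch, not part of the pipeline. It assumes
# SDG_preprocessing.py has already been run so that config['filenames'] points
# at the slotted and processed data files; the variable names are illustrative.
def _demo_train_all_models():
    am = get_trained_model(model='AM', process='IAT')   # exponential arrival process
    pp = get_trained_model(model='AM', process='AC')    # Poisson arrival counts
    mmc = get_trained_model(model='MMc')                # connection-time mixture
    mme = get_trained_model(model='MMe')                # required-energy mixture
    return am, pp, mmc, mme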
def create_processed_data():
    # Name and location of the final saved file
    save_name = ('Processed_' + str(config['slot_mins']) + '_min_' +
                 str(config['Year']) + '_year_trans_data.csv')
    save_loc = os.path.join(os.getcwd(), config["dir_names"]["preprocess_folder_name"])

    if os.path.exists(os.path.join(save_loc, save_name)):
        # if the data has already been generated, there is nothing to do
        if config['verbose'] > 0:
            print(' ------------------- Processed Data File Exists -------------------')
    else:
        if config['verbose'] > 0:
            print(' ------------------- Creating Processed Data File -------------------')
        # call the slotting script; this creates the slotted data from the transactions
        slot_file_name, slot_file_loc = create_slot_data()
        # call the session clustering script; this generates the session clusters
        ses_clust_file_name, ses_clust_file_path = sesssion_clustering(
            slot_file_path=slot_file_loc, slot_file_name=slot_file_name)
        # call the pole clustering script; this generates the clusters for pole types
        pole_clust_file_name, pole_clust_file_path = pole_clustering(
            ses_clust_file_path=ses_clust_file_path,
            ses_clust_file_name=ses_clust_file_name)

        # Get the session/pole clusters, combine the data, and save it.
        pole_clusts = get_csv_data(filename=pole_clust_file_name,
                                   filepath=pole_clust_file_path)
        session_clusts = get_csv_data(filename=ses_clust_file_name,
                                      filepath=ses_clust_file_path)
        fin_clust_data = session_clusts.join(pole_clusts.set_index('Charge_point'),
                                             on='Charge_point')
        fin_clust_data = fin_clust_data.drop(columns='index')
        fin_clust_data = fin_clust_data.reset_index(drop=True)

        wd = fin_clust_data[['Start_weekday']].copy()
        wd[wd < 5] = 0
        wd[wd >= 5] = 1
        fin_clust_data['Start_daytype'] = wd
        fin_clust_data['Factor'] = (fin_clust_data['Start_year'].map(str) + '_' +
                                    fin_clust_data['Start_month'].map(str) + '_' +
                                    fin_clust_data['Start_daytype'].map(str))
        fin_clust_data.to_csv(os.path.join(save_loc, save_name))
        if config['verbose'] > 0:
            print(' Final clustering data file saved as :',
                  os.path.join(save_loc, save_name))

        # update the file names in the configuration file
        config['filenames'] = {}
        config['filenames']['slot_data_file_name'] = slot_file_name
        config['filenames']['ses_data_file_name'] = ses_clust_file_name
        config['filenames']['pole_data_file_name'] = pole_clust_file_name
        config['filenames']['processed_data_file_name'] = save_name
        config['filenames']['slot_data_file_path'] = slot_file_loc
        config['filenames']['ses_data_file_path'] = ses_clust_file_path
        config['filenames']['pole_data_file_path'] = pole_clust_file_path
        with open('config.json', 'w') as f:
            json.dump(config, f)

    return save_name, save_loc
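# The Factor column built above encodes year_month_daytype as a string. A
# minimal, self-contained sketch of the construction (hypothetical helper with
# toy values; the column names match the pipeline):
def _demo_factor_column():
    import pandas as pd
    toy = pd.DataFrame({'Start_year': [2015, 2015],
                        'Start_month': [1, 6],
                        'Start_weekday': [2, 6]})
    toy['Start_daytype'] = (toy['Start_weekday'] >= 5).astype(int)
    toy['Factor'] = (toy['Start_year'].map(str) + '_' +
                     toy['Start_month'].map(str) + '_' +
                     toy['Start_daytype'].map(str))
    return toy['Factor'].tolist()  # ['2015_1_0', '2015_6_1']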
def sesssion_clustering(slot_file_name, slot_file_path):
    Year = config['Year']
    save_name = "Final_session_clustered_" + str(Year) + "_trans_data.csv"
    save_location = os.path.join(os.getcwd(),
                                 config["dir_names"]["preprocess_folder_name"],
                                 'session_cluster')
    monthclust_file = 'Monthly_clustered_' + str(Year) + '_trans_data.csv'
    monthclust_filepath = os.path.join(os.getcwd(),
                                       config["dir_names"]["preprocess_folder_name"],
                                       'session_cluster')

    # Check whether the final clustered file already exists
    if os.path.exists(os.path.join(save_location, save_name)):
        if config['verbose'] > 1:
            print(' ------------------- Annual clusters exist ------------------------')
            print(" \t\t Final Clustered Data Saved as: ",
                  os.path.join(save_location, save_name))
            print(' \t\t Clusters Created for year :' + str(Year))
        return save_name, save_location
    else:
        # ------------------------------------------ Monthly Clusters ------------------------------------------
        # Check whether the monthly clusters have been created. If not, create them.
        if os.path.exists(os.path.join(monthclust_filepath, monthclust_file)):
            if config['verbose'] > 1:
                print(' ------------------- Monthly Cluster File Exists --------------------')
        else:
            if config['verbose'] > 1:
                print(' ------------------- Creating Monthly Cluster File --------------------')
            from preprocess.clustering.monthly_cluster_data_points import main as monthly_ses_clust
            monthly_ses_clust(slot_file_name=slot_file_name,
                              slot_file_path=slot_file_path,
                              save_loc=monthclust_filepath,
                              save_name=monthclust_file)
            if config['verbose'] > 1:
                print(" Monthly clustered Data Saved as: ",
                      os.path.join(monthclust_filepath, monthclust_file))

        # ----------------------------------- Monthly Clusters data Loading -----------------------------------
        X = get_csv_data(filename=monthclust_file, filepath=monthclust_filepath)

        # ANNUAL CLUSTERS -------------------------------------------------------
        # Here we prepare an annual clustering: we calculate the means of the
        # monthly clusters and then cluster those means. Based on this new
        # clustering we add the Final_clusters column to the X_annual data set,
        # filling the noise with a -1 factor. We also plot the annual clustered
        # points.
        means = X.groupby(['Start_month', 'Clusters'], as_index=False).agg({
            'Start_time': ['mean'],
            'Departure_time': ['mean']
        })
        means.columns = ['Start_month', 'Clusters', 'Start_mean', 'Departure_mean']
        means_withoutnoise = means[means['Clusters'] >= 0].copy()
        mean_clusters = DBSCAN(eps=2, min_samples=6).fit(
            means_withoutnoise[['Start_mean', 'Departure_mean']])
        means_withoutnoise['Final_clusters'] = mean_clusters.labels_
        X_annual = pd.merge(X, means_withoutnoise,
                            on=['Start_month', 'Clusters'], how='outer').fillna(-1)

        # ------------------------------------------ NOISE REMOVAL ------------------------------------------
        # Now we fold the noise points into the clusters too: each noise point is
        # assigned to the cluster of its nearest clustered point. Measuring every
        # noise point against every clustered point would be very time consuming,
        # so only the convex-hull vertices of each cluster are used as candidates.
        if config['verbose'] > 1:
            print(' ------------------- Processing Noise Data Points ------------------- ')
        Final_clusters = X_annual[['Final_clusters']]
        noise_ind = np.where(X_annual[['Final_clusters']] < 0)[0]
        data_ind = np.where(X_annual[['Final_clusters']] >= 0)[0]
        boundary_points = X_annual.iloc[data_ind, :]
        boundary = []
        for i in np.unique(boundary_points[['Final_clusters']]):
            data = boundary_points[boundary_points['Final_clusters'] == i]
            data = np.array(data[['Start_time', 'Departure_time']]).reshape(-1, 2)
            hull = ConvexHull(data)
            # hull vertices of cluster i, tagged with the cluster label
            bp = np.insert(data[hull.vertices], 2, i, axis=1)
            boundary = np.append(boundary, bp)
        # boundary of the clusters: columns are (Start_time, Departure_time, cluster)
        boundary = boundary.reshape(-1, 3)
        dist_matrix = pairwise_distances(
            np.array(X_annual.iloc[noise_ind, :][['Start_time', 'Departure_time']]),
            boundary[:, :2])
        min_indexes = np.argmin(dist_matrix, axis=1)
        clusts = np.array([boundary[i, 2] for i in min_indexes])
        Final_clusters.iloc[noise_ind] = clusts.reshape(-1, 1)
        X_annual[['Final_clusters']] = Final_clusters.copy()

        # SAVING -------------------------------------------------------
        X_annual.to_csv(os.path.join(save_location, save_name), index=False)
        if config['verbose'] > 1:
            print(" Session clusters data saved as: ",
                  os.path.join(save_location, save_name))
            print(' \t\t Clusters Created for year :' + str(Year))
        if config['create_plots']:
            # create plots
            colors = np.array(X_annual[['Final_clusters']])
            plt.scatter(X_annual[['Start_time']], X_annual[['Departure_time']],
                        c=colors, cmap='Paired', s=0.2)
            plt.savefig(os.path.join(save_location,
                                     'Final_session_clust_' + str(Year) + '_plot.png'))
        return save_name, save_location
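# The noise-assignment step above in isolation: only the convex-hull vertices
# of each cluster serve as candidate neighbors, and each noise point takes the
# label of its nearest candidate. A self-contained sketch on synthetic data
# (hypothetical helper; same ConvexHull / pairwise_distances calls as above):
def _demo_hull_noise_assignment():
    import numpy as np
    from scipy.spatial import ConvexHull
    from sklearn.metrics import pairwise_distances

    rng = np.random.default_rng(0)
    clusters = {0: rng.normal([2, 2], 0.3, (50, 2)),
                1: rng.normal([8, 8], 0.3, (50, 2))}
    noise = rng.uniform(0, 10, (10, 2))

    # hull vertices of every cluster, tagged with the cluster label in column 2
    boundary = np.vstack([np.insert(pts[ConvexHull(pts).vertices], 2, lab, axis=1)
                          for lab, pts in clusters.items()])
    # the nearest boundary point decides the label of each noise point
    dist = pairwise_distances(noise, boundary[:, :2])
    return boundary[np.argmin(dist, axis=1), 2]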
def main(slot_file_name, slot_file_path, save_loc, save_name):
    Year = config['Year']
    save_location = save_loc
    # these are the parameters for DBSCAN; use them carefully
    ep = 1
    alpha = 0.05
    minpoints = 440

    if os.path.exists(os.path.join(save_location, save_name)):
        if config['verbose'] > 2:
            print(' ------------------- Monthly clusters exist ------------------------')
        return
    else:
        # ------------------------------------ Preprocess data for DBSCAN ------------------------------------
        # Prepare the data properly. We take the data from January to December,
        # cleaned so that charging times are realistic and under 24 hours; we also
        # make sure that no car stays more than 24 hours. Year is the only
        # parameter defined as input; all other parameters should be changed from
        # inside the code.
        X = get_csv_data(filename=slot_file_name, filepath=slot_file_path)
        temp = X[X.Start_year == Year]
        temp = temp[temp.Start_month < 13]
        processed_data = temp.reset_index()
        Data_for_dbscan = processed_data[['Start_time', 'Departure_time',
                                          'Start_month', 'index']]
        if config['verbose'] > 2:
            print(' ------------------- Total number of sessions for clustering: ',
                  len(Data_for_dbscan))
            print(' ------------------- Session clustering for year: ' + str(Year))
        # this can be used to pull a specific number of samples
        # Data_for_dbscan = Data_for_dbscan.head(3000)

        # ------------------------- Create a dbscanner and cluster the data -------------------------
        # dbscanner from the class created in my_dbscan. We do not normalize the
        # data, and we take ep = 1 with a minimum of 440 points in each cluster.
        db_temp = my_dbscan.mydbscan(epsilon=ep, min_points=minpoints, alpha=alpha)
        db_temp.data(data=Data_for_dbscan, norm=False)
        db_temp.create_clusters()

        # Save Clusters
        processed_data = processed_data.join(
            pd.concat(db_temp._monthly_clusters).set_index('index'), on='index')
        processed_data.to_csv(os.path.join(save_location, save_name), index=False)

        # Plot and save the plot for future reference
        if config['create_plots']:
            colors = np.array(processed_data[['Clusters']])
            plt.scatter(processed_data[['Start_time']], processed_data[['Departure_time']],
                        c=colors, cmap='Paired', s=0.2)
            plt.savefig(os.path.join(save_location,
                                     'Monthly_clust_' + str(Year) + '_plot.png'))
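# my_dbscan wraps per-month clustering of (Start_time, Departure_time); the core
# idea can be reproduced with scikit-learn's DBSCAN directly. A self-contained
# sketch on synthetic data (hypothetical helper, plain sklearn swapped in for
# my_dbscan; min_samples is lowered from the 440 used above so the toy data
# actually forms clusters, and noise keeps the label -1):
def _demo_monthly_dbscan():
    import numpy as np
    import pandas as pd
    from sklearn.cluster import DBSCAN

    rng = np.random.default_rng(1)
    toy = pd.DataFrame({'Start_month': np.repeat([1, 2], 100),
                        'Start_time': rng.uniform(0, 24, 200)})
    toy['Departure_time'] = toy['Start_time'] + rng.uniform(1, 8, 200)

    parts = []
    for _, grp in toy.groupby('Start_month'):
        labels = DBSCAN(eps=1, min_samples=5).fit(
            grp[['Start_time', 'Departure_time']]).labels_
        parts.append(grp.assign(Clusters=labels))
    return pd.concat(parts)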