Example #1
def create_slot_data():
    raw_filename = config['transactions_filename']
    raw_filepath = os.path.join(os.getcwd(),
                                config["dir_names"]["res_folder_name"])
    slot_minutes = config['slot_mins']
    save_location = os.path.join(os.getcwd(),
                                 config["dir_names"]["preprocess_folder_name"])
    save_file_name = "Slot_" + str(slot_minutes) + "_min_trans_data.csv"

    if os.path.exists(os.path.join(save_location, save_file_name)):
        if config['verbose'] > 1:
            print(
                ' ------------------- Slotted data already present : Slots of '
                + str(slot_minutes) + ' minutes -------------------')
        return save_file_name, save_location

    else:

        # get raw data
        raw_data = get_csv_data(filename=raw_filename, filepath=raw_filepath)

        # process the raw data file
        processed_data = get_processed_data(raw_data=raw_data)
        processed_data.columns = [
            'Start_day', 'Start_month', 'Start_year', 'Start_DOY',
            'Start_weekday', 'Start_time', 'Energy_required', 'Connected_time',
            'Charge_point'
        ]

        # create slotted data file from processed data file
        slotted_data = get_slotted_array(data=processed_data,
                                         slot_secs=slot_minutes * 60)

        # Cleaning the data. THIS IS FOR OUR ELAAD NL DATASET.
        # PLEASE CHANGE IF NEW DATA IS USED.
        if config['data_collector'] == 'ELaadNL':
            # keep only sessions that actually drew energy
            processed_data = slotted_data[slotted_data.Energy_required != 0].copy()
            processed_data['Departure_time'] = processed_data[
                'Start_time'] + processed_data['Connected_time']
            # no car stays connected for more than 24 hours
            processed_data = processed_data[processed_data.Departure_time <=
                                            processed_data.Start_time + 24]
            # drop the artifact line of sessions connected for ~18 hours
            weirdline = (processed_data.Connected_time <= 18.01) & \
                        (processed_data.Connected_time > 17.99)
            processed_data = processed_data[~weirdline]
            processed_data = processed_data.reset_index()

            if config['verbose'] > 1:
                print(
                    ' ------------------- Created slotted data: Data cleaning - '
                )
                print(' \t\t Energy required > 0')
                print(' \t\t Departure time < Start time + 24 hours')
                print(' \t\t Removed the weird line in data')
        else:
            # no dataset-specific cleaning: save the slotted data as-is
            processed_data = slotted_data

        # save slotted data file
        processed_data.to_csv(os.path.join(save_location, save_file_name),
                              index=False)

        return save_file_name, save_location
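The function above pulls everything from a module-level config dict (loaded from config.json, as Example #4 shows). A minimal sketch of the keys it touches, with hypothetical values:

config = {
    "transactions_filename": "transactions.csv",  # hypothetical file name
    "slot_mins": 30,
    "verbose": 2,
    "data_collector": "ELaadNL",
    "dir_names": {
        "res_folder_name": "res",                 # assumed folder names
        "preprocess_folder_name": "preprocessed",
    },
}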
Example #2
def __init__(self, Clustered_filename, Clustered_filepath, pole_col_name='Charge_point'):
    # Get the data and do some preprocessing. Here we also add 2 extra columns:
    # 1) a weekend/weekday indicator
    # 2) an indicator of the session
    self._data = get_csv_data(filename=Clustered_filename, filepath=Clustered_filepath)
    self._pole_col_name = pole_col_name
    # 2 extra columns
    self.weekdays_divide()
    self._data['Indicator'] = 1
    self._each_pole_data = self.split_data()
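weekdays_divide() and split_data() are not shown in this example. A hypothetical sketch of what they plausibly do, judging from the comments above:

# Hypothetical sketches -- the real methods are not part of this example.
def weekdays_divide(self):
    # 0 = weekday (Mon-Fri), 1 = weekend (Sat-Sun)
    self._data['Daytype'] = (self._data['Start_weekday'] >= 5).astype(int)

def split_data(self):
    # one DataFrame per charge point, keyed on the pole column
    return {pole: df for pole, df in self._data.groupby(self._pole_col_name)}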
Example #3
def get_trained_model(model, process="EP", lambda_mod="mean"):

    year = config['Year']
    SLOT = config['slot_mins']
    model = str(model)
    process = str(process)
    lambda_mod = str(lambda_mod)

    slot_sec = 60 * SLOT
    # 'Factor' is the column used to fit independent models per group; it is
    # usually of the form year_month_daytype, and independent AM, MMc and MMe
    # models are trained for each factor level
    factor = 'Factor'

    # --------------------------------------- LOADING DATASET ------------------------------------------------------
    if model == "AM":
        file_loc = config['filenames']['slot_data_file_path']
        file_name = config['filenames']['slot_data_file_name']
        required_file_name = "Slot_" + str(SLOT) + "_min_trans_data.csv"
    else:
        file_loc = config['filenames']['processed_data_file_path']
        file_name = config['filenames']['processed_data_file_name']
        required_file_name = 'Processed_' + str(SLOT) + '_min_' + str(
            year) + '_year_trans_data.csv'

    try:
        if config['verbose'] > 0:
            print(' ------------------- Training ', str(model),
                  ' model -------------------')
        all_data = get_csv_data(filename=file_name, filepath=file_loc)
    except Exception:
        if config['verbose'] > 0:
            print(
                ' ------------------- Required data file not found -------------------'
            )
            print(' \t\t Please run SDG_preprocessing.py before this script')
            print(' \t\t Missing data file :', required_file_name)
        sys.exit("ERROR")
    # --------------------------------------- FIT and RETURN MODEL DATASETS -----------------------------------------
    # three different types of models can be returned here:
    #   AM  = arrival model. The standard parameters are an exponential
    #         process with variability lambda = True.
    #   MMc = mixture model for connection times
    #   MMe = mixture model for required energy

    if model == 'AM':
        # --------------------------------------- PREPARE TRAINING DATASETS --------------------------------------------
        # Sorting data into required parameters
        all_data = all_data.sort_values(
            by=['Start_year', 'Start_DOY', 'Start_time']).copy()
        y_train = [2015]       # training year is hardcoded here
        n_poles_test = [1677]  # note: unused in the rest of this function

        # ------------------------------------  poles selection
        # Pole selector: the available methods are 'onceeachn', 'topn' and
        # 'continous' (use 'continous' to keep all data).
        PS = poles_selector(alldata=all_data, year=y_train)
        PS.select_poles(by='continous')
        charge_points = PS._charge_points
        # ------------------------------------- Training dataset
        ts_d = all_data[all_data['Start_year'].isin(y_train)]
        if config['verbose'] > 0:
            print(' \t\t Training AM for year: ' + str(year))
            print(' \t\t Training AM for slot minutes: ' + str(SLOT))
            print(' \t\t Total number of poles: ' +
                  str(len(np.unique(ts_d['Charge_point']))))
            print(' \t\t Number of poles used: ' + str(len(charge_points)))
        ts_d = ts_d[ts_d['Charge_point'].isin(charge_points)]
        n_poles_train = len(np.unique(ts_d['Charge_point']))
        ts_d = ts_d.reset_index()

        # --------------------------------- add the factor column to the dataset
        Start_times_slot = get_slotted_data(ts_d['Start_time'], slot_sec)
        ts_d['Start_time_slot'] = Start_times_slot
        weekday = ts_d['Start_weekday'].copy()
        weekday[ts_d['Start_weekday'] < 5] = 0   # 0 = weekday (Mon-Fri)
        weekday[ts_d['Start_weekday'] >= 5] = 1  # 1 = weekend (Sat-Sun)
        ts_d[factor] = create_factor_arr(year=ts_d['Start_year'],
                                         month=ts_d['Start_month'],
                                         daytype=weekday)

        # prepare ts and x, the inputs for the exponential and Poisson processes
        # prepare the time series: unroll clock times into one monotonic
        # timeline by adding 24 h (the seasonality) whenever the day-of-year changes
        ts = ts_d['Start_time'].copy()
        doy = ts_d['Start_DOY'].copy()
        seasonality = 24.00
        d = 1
        if config['verbose'] > 0:
            print(' \t\t Preparing time series for modeling ... ')
        for i in range(1, ts.size):
            if doy[i] != doy[i - 1]:
                ts[i:] = ts[i:] + seasonality
                d = d + 1

        # prepare the X for generating TS
        x = ts_d[[factor]].copy()
        # preparing start times
        x['Start_time'] = ts_d[['Start_time_slot']].copy()
        x['Start_DOY'] = ts_d[['Start_DOY']].copy()

        if config['verbose'] > 0: print(' \t\t Training ... ')
        if process == "IAT":
            # model:
            # EXPONENTIAL PROCESS MODEL
            ep = exponential_process(events=ts,
                                     x=x,
                                     variablity_lambda=True,
                                     log=True,
                                     normalize=True)
            ep.fit(lambda_mod=lambda_mod,
                   combine=np.arange(1, 7),
                   poly_deg=1,
                   alpha=0.125,
                   max_poly_deg=30,
                   verbose=config['verbose'])
            if config['verbose'] > 0: print(' \t\t Trained ... ')
            return ep
        if process == "AC":
            # POISSON PROCESS MODEL
            pp = poisson_process(events=ts, x=x, variablity_lambda=True)
            pp.fit(lambda_mod=lambda_mod,
                   combine=None,
                   verbose=config['verbose'])
            if config['verbose'] > 0: print(' \t\t Trained ... ')
            return pp

    # MMc and MMe are identical except for the target column:
    # connection time for MMc, required energy for MMe.
    if model == 'MMc':
        if config['verbose'] > 0:
            print(' \t\t Training MMc for year: ' + str(year))
            print(' \t\t Training MMc for slot minutes: ' + str(SLOT))
        # --------------------------------------- PREPARING DATASETS -----------------------------------------------
        # Sorting data into required parameters
        all_data = all_data.sort_values(
            by=['Start_year', 'Start_DOY', 'Start_time']).copy()
        ts_d = all_data.copy()

        useful_data = ts_d[[
            'Start_time', 'Connected_time', 'Final_clusters',
            'Final_Pole_clusters', 'Start_daytype', 'Factor'
        ]].copy()
        useful_data['Start_time_slot'] = np.floor(
            useful_data['Start_time']) + 1

        # model
        # GMM for the mixture
        normal_mm_EM = mixture_models(
            y=useful_data['Connected_time'],
            x=useful_data[['Factor', 'Start_time_slot']],
            initilizations=useful_data['Final_clusters'],
            combine=None)
        normal_mm_EM.fit(mix='normal', method='EM', verbose=config['verbose'])

        return normal_mm_EM

    if model == 'MMe':
        if config['verbose'] > 0:
            print(' \t\t Training MMe for year: ' + str(year))
            print(' \t\t Training MMe for slot minutes: ' + str(SLOT))
        # --------------------------------------- PREPARING DATASETS -----------------------------------------------
        # Sorting data into required parameters
        all_data = all_data.sort_values(
            by=['Start_year', 'Start_DOY', 'Start_time']).copy()
        ts_d = all_data.copy()

        useful_data = ts_d[[
            'Start_time', 'Energy_required', 'Final_clusters',
            'Final_Pole_clusters', 'Start_daytype', 'Factor'
        ]].copy()
        useful_data['Start_time_slot'] = np.floor(
            useful_data['Start_time']) + 1

        # model
        # GMM for the mixture
        normal_mm_EM = mixture_models(
            y=useful_data['Energy_required'],
            x=useful_data[['Factor', 'Start_time_slot']],
            initilizations=useful_data['Final_clusters'],
            combine=None)
        normal_mm_EM.fit(mix='normal', method='EM', verbose=config['verbose'])

        return normal_mm_EM
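A hedged usage sketch of the three branches above. Note that the default process="EP" matches neither the "IAT" nor the "AC" branch, so a call with model="AM" must pass one of those explicitly or the function returns None:

# usage sketch (assumes SDG_preprocessing.py has already populated config.json)
am_iat = get_trained_model(model="AM", process="IAT", lambda_mod="mean")  # exponential process
am_ac = get_trained_model(model="AM", process="AC")                       # Poisson process
mmc = get_trained_model(model="MMc")  # connection-time mixture model
mme = get_trained_model(model="MMe")  # required-energy mixture model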
Example #4
def create_processed_data():

    # Name and location of the final saved file
    save_name = 'Processed_' + str(config['slot_mins']) + '_min_' + str(
        config['Year']) + '_year_trans_data.csv'
    save_loc = os.path.join(os.getcwd(),
                            config["dir_names"]["preprocess_folder_name"])

    if os.path.exists(os.path.join(save_loc, save_name)):
        # if the data has already been generated, we don't need to regenerate it
        if config['verbose'] > 0:
            print(
                ' ------------------- Processed Data File Exists -------------------'
            )

    else:
        if config['verbose'] > 0:
            print(
                ' ------------------- Creating Processed Data File -------------------'
            )

        # call slotting script. this will create the slotted data that we need from transactions
        slot_file_name, slot_file_loc = create_slot_data()

        # call session clustering script. This will generate the session clusters
        ses_clust_file_name, ses_clust_file_path = sesssion_clustering(
            slot_file_path=slot_file_loc, slot_file_name=slot_file_name)

        # call pole clustering script. This will generate the clusters for pole types
        pole_clust_file_name, pole_clust_file_path = pole_clustering(
            ses_clust_file_path=ses_clust_file_path,
            ses_clust_file_name=ses_clust_file_name)

        # Getting the session/pole clusters - Combining the data, and saving it.
        pole_clusts = get_csv_data(filename=pole_clust_file_name,
                                   filepath=pole_clust_file_path)
        session_clusts = get_csv_data(filename=ses_clust_file_name,
                                      filepath=ses_clust_file_path)
        fin_clust_data = session_clusts.join(
            pole_clusts.set_index('Charge_point'), on='Charge_point')
        fin_clust_data = fin_clust_data.drop(columns='index')
        fin_clust_data = fin_clust_data.reset_index(drop=True)
        wd = fin_clust_data['Start_weekday'].copy()
        wd[wd < 5] = 0   # 0 = weekday
        wd[wd >= 5] = 1  # 1 = weekend
        fin_clust_data['Start_daytype'] = wd
        fin_clust_data['Factor'] = fin_clust_data['Start_year'].map(str) + '_' + \
                                    fin_clust_data['Start_month'].map(str) + '_' + \
                                    fin_clust_data['Start_daytype'].map(str)
        fin_clust_data.to_csv(os.path.join(save_loc, save_name))

        if config['verbose'] > 0:
            print(' Final clustering data file saved as :',
                  os.path.join(save_loc, save_name))

        # update the file names in the configuration file
        config['filenames'] = {}
        config['filenames']['slot_data_file_name'] = slot_file_name
        config['filenames']['ses_data_file_name'] = ses_clust_file_name
        config['filenames']['pole_data_file_name'] = pole_clust_file_name
        config['filenames']['processed_data_file_name'] = save_name
        config['filenames']['slot_data_file_path'] = slot_file_loc
        config['filenames']['ses_data_file_path'] = ses_clust_file_path
        config['filenames']['pole_data_file_path'] = pole_clust_file_path
        config['filenames']['processed_data_file_path'] = save_loc
        with open('config.json', 'w') as f:
            json.dump(config, f)

    return save_name, save_loc
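To make the Factor encoding concrete: it is the year_month_daytype string built above, so a session on Saturday 2015-03-14 comes out as "2015_3_1":

# minimal, self-contained illustration of the Factor encoding
import pandas as pd

df = pd.DataFrame({'Start_year': [2015], 'Start_month': [3], 'Start_weekday': [5]})
daytype = (df['Start_weekday'] >= 5).astype(int)  # 1 = weekend
factor = (df['Start_year'].map(str) + '_' + df['Start_month'].map(str)
          + '_' + daytype.map(str))
print(factor.iloc[0])  # 2015_3_1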
Example #5
def sesssion_clustering(slot_file_name, slot_file_path):

    Year = config['Year']
    save_name = "Final_session_clustered_" + str(Year) + "_trans_data.csv"
    save_location = os.path.join(os.getcwd(),
                                 config["dir_names"]["preprocess_folder_name"],
                                 'session_cluster')
    monthclust_file = 'Monthly_clustered_' + str(Year) + '_trans_data.csv'
    monthclust_filepath = os.path.join(
        os.getcwd(), config["dir_names"]["preprocess_folder_name"],
        'session_cluster')

    # Check if the final clustered file exists or not
    if os.path.exists(os.path.join(save_location, save_name)):
        if config['verbose'] > 1:
            print(
                ' ------------------- Annual clusters exist ------------------------'
            )
            print(" \t\t Final Clustered Data Saved as: ",
                  os.path.join(save_location, save_name))
            print(' \t\t Clusters Created for year :' + str(Year))
        return save_name, save_location
    else:

        # ------------------------------------------ Monthly Clusters --------------------------------------------------
        # Check if monthly clusters are created. If not, create them

        if os.path.exists(os.path.join(monthclust_filepath, monthclust_file)):
            if config['verbose'] > 1:
                print(
                    ' ------------------- Monthly Cluster File Exists --------------------'
                )
        else:
            if config['verbose'] > 1:
                print(
                    ' ------------------- Creating Monthly Cluster File --------------------'
                )
            from preprocess.clustering.monthly_cluster_data_points import main as monthly_ses_clust
            monthly_ses_clust(slot_file_name=slot_file_name,
                              slot_file_path=slot_file_path,
                              save_loc=monthclust_filepath,
                              save_name=monthclust_file)
            if config['verbose'] > 1:
                print(" Monthly clustered Data Saved as: ",
                      os.path.join(monthclust_filepath, monthclust_file))

        # ----------------------------------- Monthly Clusters data Loading  -------------------------------------------
        X = get_csv_data(filename=monthclust_file,
                         filepath=monthclust_filepath)

        # ------------------------------------------ ANNUAL CLUSTERS ---------------------------------------------------
        # Here we prepare an annual clustering: we compute the means of the
        # monthly clusters and then cluster those means. Based on this new
        # clustering we add the Final_clusters column to the X_annual data set,
        # filling noise with -1. We also plot the annual clustered points.
        means = X.groupby(['Start_month', 'Clusters'], as_index=False).agg({
            'Start_time': ['mean'],
            'Departure_time': ['mean']
        })
        means.columns = [
            'Start_month', 'Clusters', 'Start_mean', 'Departure_mean'
        ]
        means_withoutnoise = means[means['Clusters'] >= 0].copy()
        mean_clusters = DBSCAN(eps=2, min_samples=6).fit(
            means_withoutnoise[['Start_mean', 'Departure_mean']])
        means_withoutnoise['Final_clusters'] = mean_clusters.labels_
        # outer-merge the annual labels back onto the monthly data; sessions
        # whose monthly cluster received no annual label become noise (-1)
        X_annual = pd.merge(X,
                            means_withoutnoise,
                            on=['Start_month', 'Clusters'],
                            how='outer').fillna(-1)

        # ------------------------------------------ NOISE REMOVAL ----------------------------------------------------
        # Now we fold the noise into the clusters: each noise point is assigned
        # to the cluster of its nearest clustered boundary point. This can be
        # time consuming: we compute the distance from every noise point to
        # every cluster-boundary point and take the argmin.
        if config['verbose'] > 1:
            print(
                ' ------------------- Processing Noise Data Points ------------------- '
            )
        Final_clusters = X_annual[['Final_clusters']].copy()
        noise_ind = np.where(X_annual[['Final_clusters']] < 0)[0]
        data_ind = np.where(X_annual[['Final_clusters']] >= 0)[0]
        boundary_points = X_annual.iloc[data_ind, :]
        boundary = []
        for i in np.unique(boundary_points[['Final_clusters']]):
            data = boundary_points[boundary_points['Final_clusters'] == i]
            data = np.array(data[['Start_time',
                                  'Departure_time']]).reshape(-1, 2)
            hull = ConvexHull(data)
            # convex-hull vertices of cluster i, tagged with the cluster label
            bp = np.insert(data[hull.vertices], 2, i, axis=1)
            boundary = np.append(boundary, bp)

        # boundary of the clusters as rows of (Start_time, Departure_time, label)
        boundary = boundary.reshape(-1, 3)

        dist_matrix = pairwise_distances(
            np.array(
                X_annual.iloc[noise_ind, :][['Start_time', 'Departure_time']]),
            boundary[:, :2])

        min_indexes = np.argmin(dist_matrix, axis=1)
        clusts = np.array([boundary[i, 2] for i in min_indexes])
        Final_clusters.iloc[noise_ind] = clusts.reshape(-1, 1)

        X_annual[['Final_clusters']] = Final_clusters.copy()

        # SAVING -------------------------------------------------------
        X_annual.to_csv(os.path.join(save_location, save_name), index=False)
        if config['verbose'] > 1:
            print(" Session clusters data saved as: ",
                  os.path.join(save_location, save_name))
            print(' \t\t Clusters Created for year :' + str(Year))

        if config['create_plots']:
            # create plots
            colors = np.array(X_annual[['Final_clusters']])
            plt.scatter(X_annual[['Start_time']],
                        X_annual[['Departure_time']],
                        c=colors,
                        cmap='Paired',
                        s=0.2)
            plt.savefig(
                os.path.join(save_location,
                             'Final_session_clust_' + str(Year) + '_plot.png'))

        return save_name, save_location
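The noise-assignment step above is easy to lose in the pandas plumbing, so here is a self-contained toy version of the same ConvexHull-plus-pairwise_distances recipe on synthetic points:

import numpy as np
from scipy.spatial import ConvexHull
from sklearn.metrics import pairwise_distances

rng = np.random.default_rng(0)
clustered = rng.normal([0.0, 0.0], 0.5, (50, 2))  # one toy cluster, label 0
noise = rng.uniform(-3.0, 3.0, (5, 2))            # points DBSCAN marked as -1

hull = ConvexHull(clustered)
# hull vertices tagged with their cluster label: columns (x, y, label)
boundary = np.insert(clustered[hull.vertices], 2, 0, axis=1)

dist = pairwise_distances(noise, boundary[:, :2])
labels = boundary[np.argmin(dist, axis=1), 2]
print(labels)  # each noise point inherits its nearest boundary point's label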
def main(slot_file_name, slot_file_path, save_loc, save_name):

    slot_filename = slot_file_name
    slot_filepath = slot_file_path
    Year = config['Year']
    save_location = save_loc

    # these are the parameters for dbscan. Use them carefully
    ep = 1
    alpha = 0.05
    minpoints = 440

    if os.path.exists(os.path.join(save_location, save_name)):
        if config['verbose'] > 2:
            print(
                ' ------------------- Monthly clusters exist ------------------------'
            )
        return
    else:

        # ------------------------------------------Preprocess data for dbscan ----------------------------------------
        # Create the data properly. We take data from Jan to Dec. The data is
        # cleaned so that charging times are realistic and less than 24 hours,
        # and no car stays longer than 24 hours. Year is the only parameter
        # defined as input; all other parameters should be changed from inside
        # the code.
        X = get_csv_data(filename=slot_filename, filepath=slot_filepath)
        temp = X[X.Start_year == Year]
        temp = temp[temp.Start_month < 13]
        processed_data = temp.reset_index()
        temp2 = processed_data[[
            'Start_time', "Departure_time", "Start_month", 'index'
        ]]
        Data_for_dbscan = temp2
        if config['verbose'] > 2:
            print(
                ' ------------------- Total number of sessions for clustering: ',
                len(Data_for_dbscan))
            print(' ------------------- Session clustering for year: ' +
                  str(Year))
        # this can be used to pull a specific number of samples
        # Data_for_dbscan = Data_for_dbscan.head(3000)

        # --------------------------------Create a dbscanner and cluster data------------------------------------------
        # dbscanner from the class created in my_dbscan. We do not normalize the
        # data; epsilon and min_points take the values set at the top (ep = 1,
        # min_points = 440 per cluster).
        db_temp = my_dbscan.mydbscan(epsilon=ep,
                                     min_points=minpoints,
                                     alpha=alpha)
        db_temp.data(data=Data_for_dbscan, norm=False)
        db_temp.create_clusters()

        # Save Clusters
        processed_data = processed_data.join(pd.concat(
            db_temp._monthly_clusters).set_index('index'),
                                             on='index')
        processed_data.to_csv(os.path.join(save_location, save_name),
                              index=False)

        # Plot and save the figure for future reference
        if config['create_plots']:
            colors = np.array(processed_data[['Clusters']])
            plt.scatter(processed_data[['Start_time']],
                        processed_data[['Departure_time']],
                        c=colors,
                        cmap='Paired',
                        s=0.2)
            plt.savefig(
                os.path.join(save_location,
                             'Monthly_clust_' + str(Year) + '_plot.png'))
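my_dbscan.mydbscan is not shown in these examples; judging from how _monthly_clusters is concatenated above, it appears to run DBSCAN once per month over (Start_time, Departure_time). A rough, hypothetical stand-in (the role of the alpha parameter is not visible in this excerpt):

import pandas as pd
from sklearn.cluster import DBSCAN

def monthly_dbscan(data: pd.DataFrame, eps: float, min_points: int) -> pd.DataFrame:
    # assumption: each month is clustered independently, with -1 marking noise
    parts = []
    for _, grp in data.groupby('Start_month'):
        labels = DBSCAN(eps=eps, min_samples=min_points).fit(
            grp[['Start_time', 'Departure_time']]).labels_
        parts.append(grp.assign(Clusters=labels))
    return pd.concat(parts)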