def prepare(self, Y_high_filter, min_count_per_day, include_preprocess):
    """Preprocess and classify days in the training and test DataFrames.

    Cleans ``self.train_df`` / ``self.test_df``, optionally tags each day
    with a weather classification, splits the data into per-day frames,
    and builds the ordered-pair vectors used later by the clustering step.

    Parameters
    ----------
    Y_high_filter : float
        Upper cutoff passed to ``preprocess.data_preprocessing`` to filter
        out high Y values.
    min_count_per_day : int
        Days with fewer samples than this are dropped when breaking the
        data into daily frames.
    include_preprocess : bool
        When True, run the preprocessing/classification steps.

    Side effects: sets ``self.train_df``, ``self.test_df``,
    ``self.cut_results``, ``self.test_cut_results``,
    ``self.ordered_pair_list`` and ``self.test_ordered_pair_list``.
    """
    self.train_df = preprocess.data_preprocessing(
        self.train_df, self.xs, self.Y_tag, self.I_tag, self.cs_tag,
        Y_high_filter, self.print_info, include_preprocess)

    # NOTE(review): a guard `len(self.cs_tag) != 0 or len(self.ghi_tag) != 0`
    # was commented out here and replaced by an always-true branch; an
    # alternative classifier call (classify_weather_day_MHopwood) was also
    # left behind under a never-true branch. Both pieces of dead code have
    # been removed — behavior is unchanged.
    if include_preprocess:
        classification, k, MF = preprocess.classify_weather_day_GM_Tina(
            self.train_df, self.cs_tag, self.ghi_tag)
        self.train_df['day_type'] = classification

    # Cut train_df into daily DataFrames; also returns a filtered train_df
    # that drops a day if it has fewer than min_count_per_day samples.
    index_list, day_hour_list, self.cut_results, self.train_df = utilities.find_and_break_days_or_hours(
        self.train_df, True, min_count_per_day=min_count_per_day,
        frequency='days', print_info=self.print_info)

    middles_dates, hours_kpi = utilities.get_weighted_middle_of_day_and_calculate_float_since_noon(
        self.cut_results, self.Y_tag)

    # For each day, compile frequencies: output the number of times each
    # class is triggered.
    self.ordered_pair_list, freq_df = cluster.create_conglomerated_vectors_for_clustering_algorithm(
        self.cut_results, hours_kpi, day_hour_list, self.Y_tag, self.xs)

    # Same pipeline for the test set.
    self.test_df = preprocess.data_preprocessing(
        self.test_df, self.xs, self.Y_tag, self.I_tag, self.cs_tag,
        Y_high_filter, self.print_info, include_preprocess)

    # NOTE(review): the test branch gates on the tag lengths while the
    # train branch above gates on include_preprocess — confirm whether
    # this asymmetry is intentional.
    if len(self.cs_tag) != 0 or len(self.ghi_tag) != 0:
        test_classification, test_k, test_MF = preprocess.classify_weather_day_GM_Tina(
            self.test_df, self.cs_tag, self.ghi_tag)
        self.test_df['day_type'] = test_classification

    test_index_list, test_day_hour_list, self.test_cut_results, self.test_df = utilities.find_and_break_days_or_hours(
        self.test_df, True, min_count_per_day=min_count_per_day,
        frequency='days', print_info=self.print_info)

    test_middles_dates, test_hours_kpi = utilities.get_weighted_middle_of_day_and_calculate_float_since_noon(
        self.test_cut_results, self.Y_tag)

    # NOTE(review): leftover debug print — consider routing through
    # print_info / logging instead.
    print("TEST CUT RESULTS CREATE CONGLOMERATED")

    self.test_ordered_pair_list, test_freq_df = cluster.create_conglomerated_vectors_for_clustering_algorithm(
        self.test_cut_results, test_hours_kpi, test_day_hour_list,
        self.Y_tag, self.xs)
def process_test_data_through_models(test_kmeans_dfs, kmeans_saved_models,
                                     test_km_labels, xs):
    """Push each test day's DataFrame through its cluster's saved model.

    On input, ``test_kmeans_dfs`` is ordered by kmeans cluster (one frame
    per cluster). The frames are re-split into per-day frames, sorted
    chronologically, and each day is evaluated with the model selected by
    the corresponding entry of ``test_km_labels``.

    Parameters
    ----------
    test_kmeans_dfs : list of pd.DataFrame
        Test data grouped by cluster.
    kmeans_saved_models : list
        One fitted model per cluster; an entry equal to 0 means no model
        was trained for that cluster.
    test_km_labels : list of int
        For each date-ordered day, the index of the model to apply.
    xs : list of str
        Column names used as model inputs.

    Returns
    -------
    list
        Model outputs for every row, concatenated day by day in date order.

    Raises
    ------
    Exception
        If a cluster has test data but no trained model, or an empty
        per-day DataFrame is encountered.
    """
    per_day_groups = []
    for saved_model, cluster_df in zip(kmeans_saved_models, test_kmeans_dfs):
        # A cluster with data but no model means training data was too thin.
        if saved_model == 0 and len(cluster_df.index) != 0:
            raise Exception(
                "Input Error: PVPolyfit requires either less clusters or more training data."
            )
        if len(cluster_df.index) == 0:
            continue
        # Parse days from each cluster frame.
        _, _, dfs, _ = utilities.find_and_break_days_or_hours(
            cluster_df, False, min_count_per_day=0, frequency="days")
        per_day_groups.append(dfs)

    # Flatten and sort the per-day frames by their first timestamp.
    # (Replaces an O(n^2) pairwise-swap sort with a single stable sorted();
    # the resulting ascending order is the same.)
    fmt = "%m/%d/%Y %H:%M:%S %p"
    test_kmeans_dfs = sorted(
        (day_df for group in per_day_groups for day_df in group),
        key=lambda day_df: datetime.strptime(day_df.index[0], fmt))

    # Run each day through the model chosen by its label.
    kmeans_Y_lists = []
    for model_index, day_df in zip(test_km_labels, test_kmeans_dfs):
        if len(day_df) == 0:
            raise Exception("DataFrame of zero length has been detected")
        columns = [day_df[x].values for x in xs]
        Y_list = [
            kmeans_saved_models[model_index].output(
                [column[j] for column in columns])
            for j in range(len(columns[0]))
        ]
        kmeans_Y_lists.append(Y_list)

    return [y for day_outputs in kmeans_Y_lists for y in day_outputs]
def process_test_data_through_models(test_kmeans_dfs, kmeans_saved_models,
                                     test_km_labels, xs):
    """Push each test day's DataFrame through its cluster's saved model.

    NOTE(review): this is a duplicate definition that shadows the earlier
    function of the same name in this module — one of the two copies
    should be deleted.

    On input, ``test_kmeans_dfs`` is ordered by kmeans cluster; the frames
    are re-split into per-day frames, sorted chronologically, then each day
    is evaluated with the model selected by ``test_km_labels``.

    Parameters
    ----------
    test_kmeans_dfs : list of pd.DataFrame
        Test data grouped by cluster.
    kmeans_saved_models : list
        One fitted model per cluster; an entry equal to 0 means no model
        was trained for that cluster.
    test_km_labels : list of int
        For each date-ordered day, the index of the model to apply.
    xs : list of str
        Column names used as model inputs.

    Returns
    -------
    list
        Model outputs for every row, concatenated day by day in date order.

    Raises
    ------
    Exception
        If a cluster has test data but no trained model, or an empty
        per-day DataFrame is encountered.
    """
    new_dfs = []
    for i in range(len(test_kmeans_dfs)):
        # BUG FIX: was `len(test_kmeans_dfs[i] != 0)`, which takes len() of
        # an element-wise comparison result instead of checking that the
        # cluster frame is non-empty.
        if kmeans_saved_models[i] == 0 and len(test_kmeans_dfs[i]) != 0:
            raise Exception(
                "Input Error: PVPolyfit requires either less clusters or more training data."
            )
        if len(test_kmeans_dfs[i]) == 0:
            continue
        # Parse days from each cluster frame.
        _, _, dfs, _ = utilities.find_and_break_days_or_hours(
            test_kmeans_dfs[i], False, min_count_per_day=0, frequency='days')
        new_dfs.append(dfs)

    # Flatten list of lists.
    test_kmeans_dfs = [item for sublist in new_dfs for item in sublist]

    # Sort the per-day frames ascending by their first timestamp
    # (pairwise exchange sort, kept as in the original).
    for i in range(len(test_kmeans_dfs)):
        for j in range(len(test_kmeans_dfs)):
            if (datetime.strptime(test_kmeans_dfs[i].index[0],
                                  '%m/%d/%Y %H:%M:%S %p') <
                    datetime.strptime(test_kmeans_dfs[j].index[0],
                                      '%m/%d/%Y %H:%M:%S %p')):
                temp = test_kmeans_dfs[i]
                test_kmeans_dfs[i] = test_kmeans_dfs[j]
                test_kmeans_dfs[j] = temp

    # Iterate through the day frames and run the models.
    kmeans_Y_lists = []
    for i in range(len(test_kmeans_dfs)):
        # If a day frame is somehow empty, fail loudly.
        if len(test_kmeans_dfs[i]) == 0:
            raise Exception("DataFrame of zero length has been detected")
        temps = []
        for j in range(len(xs)):
            temps.append(array(test_kmeans_dfs[i][xs[j]].tolist()))
        model_index = test_km_labels[i]
        Y_list = []
        for j in range(len(temps[0])):
            # Assemble the j-th row's inputs across all feature columns.
            inputs_iter = [item[j] for item in temps]
            Y_val = (kmeans_saved_models[model_index]).output(inputs_iter)
            Y_list.append(Y_val)
        kmeans_Y_lists.append(Y_list)

    flattened_kmeans_Y_lists = [
        item for sublist in kmeans_Y_lists for item in sublist
    ]
    return flattened_kmeans_Y_lists
def run(self, num_clusters=6, num_iterations=1, degrees=None, kernel_type="polynomial"):
    """ Iterates through Degrees
        For each Degree, iterates n times
        Returns best model for each input day

        Parameters:
            num_clusters: int, default 6
                number of clusters used in clustering algorithm,
                synonymous with number of 'types of days'
            num_iterations: int, default 1
                number of times algorithm loops, indicates volatility of
                algorithm (usually very small, so default = 1)
            degrees: list of ints
                range of degrees that polynomial kernel iterates through
            kernel_type: str
                type of regression kernel to be used
                OPTIONS:    polynomial - a(AB)+

        Side effects: sets self.num_clusters, self.num_iterations,
        self.degrees, self.kernel_type, self.all_best_dfs,
        self.model_day_counts, and (per iteration) self.test_km_labels,
        self.cut_results, self.test_cut_results, self.kmeans_Y_lists.
        Returns the cluster labels of the best iteration of the LAST
        degree tried (see note at the return statement).
    """
    # Avoid a mutable default argument: build the degree range lazily.
    if degrees is None:
        degrees = list(range(1, 10))
    self.num_clusters = num_clusters
    self.num_iterations = num_iterations
    self.degrees = degrees
    self.kernel_type = kernel_type
    self.all_best_dfs = []
    self.model_day_counts = []
    for degree in self.degrees:
        # Per-degree accumulators, reset for each polynomial degree.
        P_se_list = []
        combined_P_list = []
        combined_day_counts = []
        combined_test_km_labels = []
        # 1. Run the code an n number of times
        for _ in range(self.num_iterations):
            # clusters and adds 'model_num' column to cut_results & test_cut_results
            train_kmeans_dfs, test_kmeans_dfs, self.test_km_labels, self.cut_results, self.test_cut_results, train_model_day_count, test_model_day_count = cluster.cluster_ordered_pairs_and_return_df_of_days_in_cluster(
                self.cut_results, self.test_cut_results,
                self.ordered_pair_list, self.test_ordered_pair_list,
                kmeans_num_clusters=self.num_clusters,
                print_info=self.print_info)
            # Fit one model per cluster at the current degree.
            saved_models = cluster.save_model_for_each_cluster(
                train_kmeans_dfs, degree, self.Y_tag, self.xs,
                self.kernel_type)
            # Evaluate the test days through their cluster's model.
            self.kmeans_Y_lists = kernel.process_test_data_through_models(
                test_kmeans_dfs, saved_models, self.test_km_labels, self.xs)
            # 2. For each iteration, save the modelled P and colors (based on model used)
            combined_P_list.append(self.kmeans_Y_lists)
            # NOTE(review): self.combined_test_cut_results is assumed to be
            # initialized elsewhere (e.g. in __init__) — it is never reset
            # here, so repeated run() calls keep appending to it; confirm
            # this is intended.
            self.combined_test_cut_results.append(self.test_cut_results)
            combined_test_km_labels.append(self.test_km_labels)
            combined_day_counts.append(
                [train_model_day_count, test_model_day_count])
            # Score this iteration by RMSE against the measured Y values.
            P_se_km = kernel.EvaluateModel(
                self.test_df[self.Y_tag].values,
                np.array(self.kmeans_Y_lists)).rmse()
            P_se_list.append(P_se_km)
        # 3. Gather the minimum and maximum for each index, save in two lists
        mins = []
        maxs = []
        for i in range(len(self.test_df.index)):
            # Sentinel bounds; assumes |predicted values| < 9999 —
            # TODO(review): confirm this holds for the data ranges used.
            _min = 9999
            _max = -9999
            for j in range(len(combined_P_list)):
                if combined_P_list[j][i] < _min:
                    _min = combined_P_list[j][i]
                if combined_P_list[j][i] > _max:
                    _max = combined_P_list[j][i]
            mins.append(_min)
            maxs.append(_max)
        # Pick the iteration with the lowest RMSE for this degree.
        best_index = np.argmin(P_se_list)
        best_model = combined_P_list[best_index]
        # Assemble per-day envelope: best prediction plus min/max across
        # all iterations, indexed like the test set.
        best_df = pd.DataFrame()
        best_df['Y'] = best_model
        best_df['mins'] = mins
        best_df['maxs'] = maxs
        best_df.index = self.test_df.index
        _, _, dfg, _ = utilities.find_and_break_days_or_hours(
            best_df, False, min_count_per_day=0, frequency='days',
            print_info=self.print_info)
        self.all_best_dfs.append(dfg)
        self.model_day_counts.append(combined_day_counts[best_index])
    # NOTE(review): best_index and combined_test_km_labels here are
    # leftovers from the LAST degree of the loop — only the final degree's
    # best iteration labels are returned; confirm this is the intent.
    return combined_test_km_labels[best_index]