Example #1
0
    def prepare(self, Y_high_filter, min_count_per_day, include_preprocess):
        """Preprocess the train and test DataFrames, classify weather-day
        types, cut them into daily frames, and build the per-day feature
        vectors used by the clustering algorithm.

        Parameters:
            Y_high_filter:
                upper filter value forwarded to preprocess.data_preprocessing
            min_count_per_day: int
                a day is discarded if it contains fewer samples than this
            include_preprocess: bool
                forwarded to preprocessing; also gates the weather-day
                classification of the training data
        """
        self.train_df = preprocess.data_preprocessing(
            self.train_df, self.xs, self.Y_tag, self.I_tag, self.cs_tag,
            Y_high_filter, self.print_info, include_preprocess)

        # NOTE(review): this branch used to be guarded by
        # `len(self.cs_tag) != 0 or len(self.ghi_tag) != 0` (the guard still
        # used for the test set below) but had been hard-coded to `if True:`;
        # the dead `if True:` wrapper and an unreachable `if False:` block
        # (which read self.cut_results before assignment) were removed here.
        if include_preprocess:
            classification, k, MF = preprocess.classify_weather_day_GM_Tina(
                self.train_df, self.cs_tag, self.ghi_tag)
            self.train_df['day_type'] = classification

        # Cut train_df into daily DataFrames; also returns a filtered
        # train_df with too-short days removed.
        index_list, day_hour_list, self.cut_results, self.train_df = utilities.find_and_break_days_or_hours(
            self.train_df,
            True,
            min_count_per_day=min_count_per_day,
            frequency='days',
            print_info=self.print_info)

        middles_dates, hours_kpi = utilities.get_weighted_middle_of_day_and_calculate_float_since_noon(
            self.cut_results, self.Y_tag)

        # For each day, compile frequencies: the number of times each class
        # is triggered, conglomerated into one vector per day.
        self.ordered_pair_list, freq_df = cluster.create_conglomerated_vectors_for_clustering_algorithm(
            self.cut_results, hours_kpi, day_hour_list, self.Y_tag, self.xs)

        # Repeat the same pipeline for the test set.
        self.test_df = preprocess.data_preprocessing(
            self.test_df, self.xs, self.Y_tag, self.I_tag, self.cs_tag,
            Y_high_filter, self.print_info, include_preprocess)

        if len(self.cs_tag) != 0 or len(self.ghi_tag) != 0:
            test_classification, test_k, test_MF = preprocess.classify_weather_day_GM_Tina(
                self.test_df, self.cs_tag, self.ghi_tag)
            self.test_df['day_type'] = test_classification

        test_index_list, test_day_hour_list, self.test_cut_results, self.test_df = utilities.find_and_break_days_or_hours(
            self.test_df,
            True,
            min_count_per_day=min_count_per_day,
            frequency='days',
            print_info=self.print_info)

        test_middles_dates, test_hours_kpi = utilities.get_weighted_middle_of_day_and_calculate_float_since_noon(
            self.test_cut_results, self.Y_tag)

        self.test_ordered_pair_list, test_freq_df = cluster.create_conglomerated_vectors_for_clustering_algorithm(
            self.test_cut_results, test_hours_kpi, test_day_hour_list,
            self.Y_tag, self.xs)
Example #2
0
def process_test_data_through_models(test_kmeans_dfs, kmeans_saved_models,
                                     test_km_labels, xs):
    """Push each test day's data through the model saved for its cluster.

    On input, test_kmeans_dfs is ordered by kmeans cluster number; it is
    re-cut into individual days, sorted chronologically, and each day is
    evaluated with the model indexed by its label in test_km_labels.

    Parameters:
        test_kmeans_dfs: list of DataFrames, one per kmeans cluster
        kmeans_saved_models: list of fitted models (0 marks an empty cluster)
        test_km_labels: per-day model indices, assumed aligned with the
            chronologically sorted day DataFrames
        xs: list of feature column names

    Returns:
        flat list of modelled Y values, one per test sample

    Raises:
        Exception: when a cluster has test data but no trained model, or a
            zero-length day DataFrame is encountered.
    """
    new_dfs = []
    for kmeans_saved_model, test_kmeans_df in zip(kmeans_saved_models,
                                                  test_kmeans_dfs):
        # Error case: test data landed in a cluster that has no model.
        if kmeans_saved_model == 0 and len(test_kmeans_df.index) != 0:
            raise Exception(
                "Input Error: PVPolyfit requires either less clusters or more training data."
            )

        if len(test_kmeans_df.index) == 0:
            continue

        # Parse individual days out of each cluster's DataFrame.
        _, _, dfs, _ = utilities.find_and_break_days_or_hours(
            test_kmeans_df, False, min_count_per_day=0, frequency="days")
        new_dfs.append(dfs)

    # flatten list of lists
    test_kmeans_dfs = [item for sublist in new_dfs for item in sublist]

    # Sort the day DataFrames chronologically by their first index entry.
    # (Replaces an accidental O(n^2) pairwise-swap sort.)
    # NOTE(review): "%H" (24-hour) combined with "%p" looks off — "%I" is
    # the usual companion of "%p"; kept as-is to preserve parsing behavior.
    index_format = "%m/%d/%Y %H:%M:%S %p"
    test_kmeans_dfs.sort(
        key=lambda df: datetime.strptime(df.index[0], index_format))

    # Iterate through the day DataFrames and run the matching models.
    kmeans_Y_lists = []
    for model_index, test_kmeans_df in zip(test_km_labels, test_kmeans_dfs):
        # Guard: a day DataFrame must contain samples.
        if len(test_kmeans_df) == 0:
            raise Exception("DataFrame of zero length has been detected")

        temps = [test_kmeans_df[x].values for x in xs]

        # Evaluate the model row by row: one feature vector per sample.
        Y_list = [
            kmeans_saved_models[model_index].output(
                [item[j] for item in temps]) for j in range(len(temps[0]))
        ]

        kmeans_Y_lists.append(Y_list)

    # Flatten per-day results into one list of predictions.
    flattened_kmeans_Y_lists = [
        item for sublist in kmeans_Y_lists for item in sublist
    ]

    return flattened_kmeans_Y_lists
Example #3
0
def process_test_data_through_models(test_kmeans_dfs, kmeans_saved_models,
                                     test_km_labels, xs):
    """Push each test day's data through the model saved for its cluster.

    On input, test_kmeans_dfs is ordered by kmeans cluster number; it is
    re-cut into individual days, sorted chronologically, and each day is
    evaluated with the model indexed by its label in test_km_labels.

    Parameters:
        test_kmeans_dfs: list of DataFrames, one per kmeans cluster
        kmeans_saved_models: list of fitted models (0 marks an empty cluster)
        test_km_labels: per-day model indices, assumed aligned with the
            chronologically sorted day DataFrames
        xs: list of feature column names

    Returns:
        flat list of modelled Y values, one per test sample

    Raises:
        Exception: when a cluster has test data but no trained model, or a
            zero-length day DataFrame is encountered.
    """
    new_dfs = []
    for saved_model, cluster_df in zip(kmeans_saved_models, test_kmeans_dfs):
        # Error case: test data landed in a cluster that has no model.
        # BUG FIX: was `len(test_kmeans_dfs[i] != 0)` — the misplaced
        # parenthesis applied `!= 0` to the DataFrame before taking len().
        if saved_model == 0 and len(cluster_df) != 0:
            raise Exception(
                "Input Error: PVPolyfit requires either less clusters or more training data."
            )

        if len(cluster_df) == 0:
            continue

        # Parse individual days out of each cluster's DataFrame.
        _, _, dfs, _ = utilities.find_and_break_days_or_hours(
            cluster_df, False, min_count_per_day=0, frequency='days')
        new_dfs.append(dfs)

    # flatten list of lists
    test_kmeans_dfs = [item for sublist in new_dfs for item in sublist]

    # Sort the day DataFrames chronologically by their first index entry.
    # (Replaces an accidental O(n^2) pairwise-swap sort.)
    # NOTE(review): '%H' (24-hour) combined with '%p' looks off — '%I' is
    # the usual companion of '%p'; kept as-is to preserve parsing behavior.
    index_format = '%m/%d/%Y %H:%M:%S %p'
    test_kmeans_dfs.sort(
        key=lambda df: datetime.strptime(df.index[0], index_format))

    # Iterate through the day DataFrames and run the matching models.
    kmeans_Y_lists = []
    for model_index, day_df in zip(test_km_labels, test_kmeans_dfs):
        # Guard: a day DataFrame must contain samples.
        if len(day_df) == 0:
            raise Exception("DataFrame of zero length has been detected")

        # One array of values per feature column.
        temps = [array(day_df[x].tolist()) for x in xs]

        # Evaluate the model row by row: one feature vector per sample.
        Y_list = []
        for j in range(len(temps[0])):
            inputs_iter = [item[j] for item in temps]
            Y_list.append(kmeans_saved_models[model_index].output(inputs_iter))

        kmeans_Y_lists.append(Y_list)

    # Flatten per-day results into one list of predictions.
    flattened_kmeans_Y_lists = [
        item for sublist in kmeans_Y_lists for item in sublist
    ]

    return flattened_kmeans_Y_lists
Example #4
0
    def run(self,
            num_clusters=6,
            num_iterations=1,
            degrees=None,
            kernel_type="polynomial"):
        """
        Iterates through Degrees
        For each Degree, iterates n times
        Returns best model for each input day

        Parameters:
            num_clusters: int, default 6
                number of clusters used in clustering algorithm, synonymous with number of 'types of days'

            num_iterations: int, default 1
                number of times algorithm loops, indicates volatility of algorithm (usually very small, so default = 1)

            degrees: list of ints
                range of degrees that polynomial kernel iterates through

            kernel_type: str
                type of regression kernel to be used
                OPTIONS: polynomial - a(AB)+

        Returns:
            the best iteration's test kmeans labels (for the last degree
            processed)
        """
        if degrees is None:
            degrees = list(range(1, 10))

        self.num_clusters = num_clusters
        self.num_iterations = num_iterations
        self.degrees = degrees
        self.kernel_type = kernel_type

        self.all_best_dfs = []
        self.model_day_counts = []
        best_test_km_labels = None

        for degree in self.degrees:
            P_se_list = []
            combined_P_list = []
            combined_day_counts = []
            combined_test_km_labels = []

            # 1. Run the code an n number of times
            for _ in range(self.num_iterations):
                # clusters and adds 'model_num' column to cut_results & test_cut_results
                train_kmeans_dfs, test_kmeans_dfs, self.test_km_labels, self.cut_results, self.test_cut_results, train_model_day_count, test_model_day_count = cluster.cluster_ordered_pairs_and_return_df_of_days_in_cluster(
                    self.cut_results,
                    self.test_cut_results,
                    self.ordered_pair_list,
                    self.test_ordered_pair_list,
                    kmeans_num_clusters=self.num_clusters,
                    print_info=self.print_info)

                saved_models = cluster.save_model_for_each_cluster(
                    train_kmeans_dfs, degree, self.Y_tag, self.xs,
                    self.kernel_type)

                self.kmeans_Y_lists = kernel.process_test_data_through_models(
                    test_kmeans_dfs, saved_models, self.test_km_labels,
                    self.xs)

                # 2. For each iteration, save the modelled P and colors (based on model used)
                combined_P_list.append(self.kmeans_Y_lists)
                self.combined_test_cut_results.append(self.test_cut_results)
                combined_test_km_labels.append(self.test_km_labels)
                combined_day_counts.append(
                    [train_model_day_count, test_model_day_count])
                P_se_km = kernel.EvaluateModel(
                    self.test_df[self.Y_tag].values,
                    np.array(self.kmeans_Y_lists)).rmse()
                P_se_list.append(P_se_km)

            # 3. Gather the minimum and maximum across iterations for each
            # test index (replaces 9999/-9999 sentinel scan).
            mins = [
                min(P[i] for P in combined_P_list)
                for i in range(len(self.test_df.index))
            ]
            maxs = [
                max(P[i] for P in combined_P_list)
                for i in range(len(self.test_df.index))
            ]

            # Pick the iteration with the lowest RMSE for this degree.
            best_index = np.argmin(P_se_list)
            best_model = combined_P_list[best_index]
            best_df = pd.DataFrame()

            best_df['Y'] = best_model
            best_df['mins'] = mins
            best_df['maxs'] = maxs

            best_df.index = self.test_df.index
            _, _, dfg, _ = utilities.find_and_break_days_or_hours(
                best_df,
                False,
                min_count_per_day=0,
                frequency='days',
                print_info=self.print_info)
            self.all_best_dfs.append(dfg)
            self.model_day_counts.append(combined_day_counts[best_index])
            best_test_km_labels = combined_test_km_labels[best_index]

        # BUG FIX: the return was previously inside the degree loop, so only
        # the first degree was ever evaluated despite the docstring promising
        # iteration over all degrees.
        return best_test_km_labels