# Assumed context for this snippet (not shown on the page): the method lives
# on a class exposing training_set_dir, training_set_id_range, scaler,
# test_size, model_save_dir and model_file_prefix, with module-level imports
# os, numpy as np, pandas as pd and sklearn.model_selection.train_test_split,
# plus project helpers get_ids_and_files_in_dir, MinMaxScaler,
# NormalDistributionScaler, binning_date_y and percentile_remove_outlier.
def model_train_predict_test(
            self,
            input_file_regx="(DBID)\((\d+)\)_INSTID\([1]\).csv",
            override=False):  # "^(\d+)\.csv"
        """
        :param override: if True, rerun the model prediction even if the
            expected output file already exists (default: False)
        :return: model file, model weights files, prediction file, discrepancy statistic bar plot file
        """
        # get training sets for lstm training
        print("Scanning files within select id range ...")
        print(input_file_regx)
        print(self.training_set_dir)
        ids, files = get_ids_and_files_in_dir(inputdir=self.training_set_dir,
                                              range=self.training_set_id_range,
                                              input_file_regx=input_file_regx)
        print("Scanning done! Selected enterprise ids are {}".format(ids))
        if not files:
            raise ValueError(
                "No files selected in current id range. Please check the input training set directory, "
                "input enterprise id range or file format which should be '[0-9]+.csv'"
            )

        # get train, test, validation data
        for id_index, id_file in enumerate(files):
            # store prediction result to prediction directory
            enter_file = self.training_set_dir + "/" + id_file
            print("Processing dataset - enterprise_id is: {}".format(
                ids[id_index]))
            print("Reading from file {}".format(enter_file))
            df = pd.read_csv(enter_file)
            df.index = range(len(df.index))
            # retrieve training X and Y columns. First column is customer_id
            select_col = ['X' + str(i)
                          for i in range(1, 1 + self.training_set_length)]
            select_col.append('Y')
            # .loc replaces the deprecated .ix indexer
            df_selected = df.loc[:, select_col]
            print(df_selected)
            # remove outlier records
            """
            df_selected = percentile_remove_outlier(df_selected, filter_start=0, filter_end=1+self.training_set_length)
            print(df_selected)
            """
            # scale the train columns
            print("Scaling...")
            if self.scaler == 'mm':
                df_scale, minVal, maxVal, bin_boundary = MinMaxScaler(
                    df_selected,
                    start_col_index=0,
                    end_col_index=self.training_set_length)
            elif self.scaler == 'norm':
                # this scaler yields no boundaries; let binning_date_y derive them
                bin_boundary = None
                df_scale, meanVal, stdVal = NormalDistributionScaler(
                    df_selected,
                    start_col_index=0,
                    end_col_index=self.training_set_length)
            else:
                raise ValueError("Argument scaler must be 'mm' or 'norm'!")
            # bin the date-valued y into classes
            df_bin, bin_boundary = binning_date_y(
                df_scale,
                y_col=self.training_set_length,
                n_group=5,
                bin_boundary=bin_boundary)
            print("Bin boundary is {}".format(bin_boundary))
            # get train and test dataset
            print("Randomly selecting training set and test set...")
            all_data_x = np.asarray(
                df_bin.iloc[:, 0:self.training_set_length]).reshape(
                    (len(df_bin.index), 1, self.training_set_length))
            all_data_y = np.asarray(df_bin.iloc[:, self.training_set_length])
            # convert y label to one-hot dummy label
            y_dummy_label = np.asarray(pd.get_dummies(all_data_y))
            # format train, test, validation data
            sub_x, val_x, sub_y, val_y = train_test_split(
                all_data_x, y_dummy_label, test_size=self.test_size)
            train_x, test_x, train_y, test_y = train_test_split(
                sub_x, sub_y, test_size=self.test_size)
            # create and fit the NN model
            model_save_path = os.path.join(
                self.model_save_dir,
                self.model_file_prefix + "-" + str(ids[id_index]) + ".h5")
            # check if model file exists
            if not os.path.exists(model_save_path) or override:
                self.NN_model_train(train_x,
                                    train_y,
                                    test_x,
                                    test_y,
                                    model_save_path=model_save_path)
            # generate prediction for training
            print("Predicting the output of validation set...")
            val_predict_class, val_predict_prob = self.NN_prediction(
                val_x, model_save_path=model_save_path)
            # compare predicted classes against the true validation labels
            total_sample_count = len(val_predict_class)
            val_label = np.asarray([list(x).index(1) for x in val_y])
            match_count = (np.asarray(val_predict_class) == val_label).sum()
            print("Precision using validation dataset is {}".format(
                float(match_count) / total_sample_count))  # 0.9178082191780822
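A minimal, self-contained sketch of the label round trip the precision check
above relies on (the names here are illustrative, not part of the class):
class labels are one-hot encoded with pd.get_dummies, recovered with
list(row).index(1), and compared against predictions.

import numpy as np
import pandas as pd

labels = np.array([0, 2, 1, 2, 0])            # binned class labels
one_hot = np.asarray(pd.get_dummies(labels))  # one column per class
recovered = np.asarray([list(row).index(1) for row in one_hot])
assert (recovered == labels).all()

predicted = np.array([0, 2, 2, 2, 0])         # e.g. output of NN_prediction
precision = float((predicted == recovered).sum()) / len(recovered)
print(precision)  # 0.8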
Example #2
    def model_train_predict_test(
        self,
        dataX,
        dataY,
        end,
        lookback,
        override=False,
    ):
        # remove outlier records
        """
        df_selected = percentile_remove_outlier(df_selected, filter_start=0, filter_end=1+self.training_set_length)
        print(df_selected)
        """
        # scale the feature channels
        print("Scaling...")
        if self.scaler == 'mm':
            # scale the last channel, keeping its min/max for reference
            copy = dataX[:, :, dataX.shape[2] - 1]
            dataX[:, :, dataX.shape[2] -
                  1], minVal, maxVal, _bin_boundary = MinMaxScaler(copy)
            # scale the remaining channels, discarding their statistics
            for i in range(dataX.shape[2] - 1):
                dataX[:, :, i], _, _, _ = MinMaxScaler(dataX[:, :, i])
        elif self.scaler == 'norm':
            # normal-distribution scaling is not implemented in this variant
            pass
            # target_collection, meanVal, stdVal = NormalDistributionScaler(target_collection, start_col_index=0, end_col_index=self.training_set_length)
        else:
            raise ValueError("Argument scaler must be 'mm' or 'norm'!")
        # bin the date-valued y into classes using fixed boundaries
        bin_boundary = [0, 50, 75, 90]
        dataY, bin_boundary = binning_date_y(dataY,
                                             y_col=self.training_set_length,
                                             n_group=5,
                                             bin_boundary=bin_boundary)
        print("Bin boundary is {}".format(bin_boundary))
        # get train and test dataset
        print("Randomly selecting training set and test set...")
        # convert y label to one-hot dummy label; bail out with the sentinel
        # value 1010 when only a single class is present
        if len(set(dataY)) == 1:
            return (1010, [1010, 1010])
        y_dummy_label = np.asarray(pd.get_dummies(dataY))
        # format train, test, validation data; retry the split until the
        # training fold contains more than one class (at most 10 attempts)
        def to_list(x):
            # map each one-hot row to a distinct scalar (class index + 1)
            return [sum(x[i][j] * (j + 1) for j in range(x.shape[1]))
                    for i in range(x.shape[0])]

        count = 0
        while True:
            x_sub, x_test, y_sub, y_test = train_test_split(
                dataX, y_dummy_label, test_size=self.test_size)
            x_train, x_val, y_train, y_val = train_test_split(
                x_sub, y_sub, test_size=self.test_size)
            if count == 10:
                return (1010, [1010, 1010])
            count += 1  # cap the number of resplit attempts
            if len(set(to_list(y_train))) > 1:
                break
        # create and fit the NN model
        model_save_path = os.path.join(self.model_save_dir,
                                       self.model_file_prefix + ".h5")
        # check if model file exists
        if not os.path.exists(model_save_path) or override:
            score = self.NN_model_train(x_train,
                                        y_train,
                                        x_val,
                                        y_val,
                                        model_save_path=model_save_path,
                                        end=end,
                                        lookback=lookback)
            print("Models and their parameters are stored in {}".format(
                model_save_path))
        else:
            score = [1010, 1010]  # sentinel: existing model, no training score
        # generate prediction for training
        print("Predicting the output of validation set...")
        val_predict_class, val_predict_prob = self.NN_prediction(
            x_test, model_save_path=model_save_path)
        # compare predicted classes against the true test labels
        total_sample_count = len(val_predict_class)
        y_test_label = np.asarray([list(x).index(1) for x in y_test])
        match_count = (np.asarray(val_predict_class) == y_test_label).sum()
        return float(match_count) / total_sample_count, score
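The retry loop above guards against a degenerate split in which the training
fold holds a single class. A hypothetical standalone version of that guard
(split_with_class_guard and max_tries are illustrative names; np.argmax over
one-hot rows is equivalent to the to_list() weighted-sum check):

import numpy as np
from sklearn.model_selection import train_test_split

def split_with_class_guard(X, y_one_hot, test_size=0.2, max_tries=10):
    """Resplit until the training fold contains more than one class."""
    for _ in range(max_tries):
        x_train, x_test, y_train, y_test = train_test_split(
            X, y_one_hot, test_size=test_size)
        if len(set(np.argmax(y_train, axis=1))) > 1:
            return x_train, x_test, y_train, y_test
    raise RuntimeError("could not obtain a multi-class training fold")

Note that train_test_split also accepts a stratify= argument, which preserves
class proportions in both folds and avoids the need for a retry loop entirely.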