示例#1
0
    def train(self, pred_path=None, loss_log_path=None, csv_log_path=None, boost_round_log_path=None,
              train_seed=None, cv_args=None, parameters=None, show_importance=False,
              save_cv_pred=True, save_cv_pred_train=False, save_final_pred=True, save_final_pred_train=False,
              save_csv_log=True, csv_idx=None, use_global_valid=False, return_pred_test=False,
              mode=None, param_name_list=None, param_value_list=None, use_custom_obj=False,
              file_name_params=None, append_info=None, loss_fuc=None):
        """Fit the model on every cross-validation split and aggregate the results.

        For each split yielded by the CV generator the model is fitted, then the
        test set, the fold's training subset, the full training set and the
        fold's validation subset are predicted, and the fold losses are printed
        and logged. After the loop the per-fold predictions and losses are
        averaged with ``utils.calculate_means`` and optionally saved.

        Args:
            pred_path: Path prefix for prediction files. It is concatenated with
                sub-directory names, so it must be a string whenever one of the
                prediction-saving flags is enabled.
            loss_log_path: Path prefix for loss logs. It is always concatenated
                with the model name, so it must be a string.
            csv_log_path: Forwarded to ``self.save_csv_log``.
            boost_round_log_path: Forwarded to the boost-round logging helpers.
            train_seed: Forwarded to the logging/saving helpers.
            cv_args: Mapping of cross-validation settings. Must contain
                ``'n_cv'`` and ``'cv_seed'``; may contain ``'cv_generator'``
                (``None`` or absent selects ``CrossValidation.random_split``).
                It is deep-copied, and every entry except ``'cv_generator'`` is
                passed to the generator as a keyword argument.
            parameters: Model parameters forwarded to ``self.fit`` or
                ``self.fit_with_round_log``. For the ``'xgb'`` and ``'lgb'``
                models a ``'num_boost_round'`` entry is written into this
                mapping while results are saved. Any ``'num_boost_round'``
                entry is removed again before returning (including one the
                caller supplied).
            show_importance: Call ``self.get_importance`` on every fitted model.
            save_cv_pred: Pass a per-fold path to ``self.predict`` for the test
                set prediction.
            save_cv_pred_train: Pass a per-fold path to ``self.get_pred_train``.
            save_final_pred: Call ``self.save_final_pred`` with the averaged
                test prediction.
            save_final_pred_train: Save the averaged train prediction with
                ``utils.save_pred_train_to_csv``.
            save_csv_log: Call ``self.save_csv_log`` with the averaged losses.
            csv_idx: Index forwarded to the log helpers; defaults to
                ``self.model_name``.
            use_global_valid: Also predict and score ``self.x_global_valid`` /
                ``self.y_global_valid`` on every fold.
            return_pred_test: Return the averaged test prediction.
            mode: ``'auto_train_boost_round'`` switches fitting to
                ``self.fit_with_round_log`` and saves the averaged per-round
                losses.
            param_name_list: Forwarded to the fitting/logging helpers.
            param_value_list: Forwarded to the fitting/logging helpers.
            use_custom_obj: Stored on ``self.use_custom_obj``.
            file_name_params: Forwarded to the saving helpers.
            append_info: Suffix forwarded to the helpers; defaults to
                ``'_c-<n_cv>'``.
            loss_fuc: Loss function forwarded to ``utils.print_loss`` and
                ``utils.print_global_valid_loss``.

        Returns:
            The averaged test prediction when ``return_pred_test`` is true,
            otherwise ``None``.
        """

        # Check if directories exist or not
        utils.check_dir_model(pred_path, loss_log_path)

        # Global Validation
        self.use_global_valid = use_global_valid

        # Use Custom Objective Function
        self.use_custom_obj = use_custom_obj

        # Work on a deep copy so the caller's CV arguments are never modified
        cv_args_copy = copy.deepcopy(cv_args)
        n_cv = cv_args_copy['n_cv']
        cv_seed = cv_args_copy['cv_seed']
        valid_rate = 1/n_cv

        # Append Information
        if append_info is None:
            append_info = '_c-' + str(n_cv)

        if csv_idx is None:
            csv_idx = self.model_name

        # Print Start Information and Get Model Name
        self.print_start_info()

        if use_global_valid:
            print('------------------------------------------------------')
            print('[W] Using Global Validation...')

        cv_count = 0
        pred_test_total = []
        pred_train_total = []
        loss_train_total = []
        loss_valid_total = []
        idx_round = []
        train_loss_round_total = []
        valid_loss_round_total = []
        global_valid_loss_round_total = []
        loss_global_valid_total = []

        # Get Cross Validation Generator; the key must not reach the generator
        # itself, so it is popped from the keyword arguments.
        cv_generator = cv_args_copy.pop('cv_generator', None)
        if cv_generator is None:
            cv_generator = CrossValidation.random_split
        print('------------------------------------------------------')
        # Callables such as functools.partial objects carry no __name__;
        # fall back to repr() instead of failing on a log line.
        print('[W] Using CV Generator: {}'.format(
            getattr(cv_generator, '__name__', repr(cv_generator))))

        # Training on Cross Validation Sets
        for x_train, y_train, x_valid, y_valid in cv_generator(x=self.x_train, y=self.y_train, **cv_args_copy):

            # CV Start Time
            cv_start_time = time.time()

            cv_count += 1

            # Fitting and Training Model
            if mode == 'auto_train_boost_round':
                round_log = self.fit_with_round_log(boost_round_log_path, cv_count, x_train, y_train, x_valid,
                                                    y_valid, parameters, param_name_list, param_value_list,
                                                    append_info=append_info)
                # With global validation the helper is expected to return one
                # extra element: the per-round global validation loss.
                if use_global_valid:
                    reg, idx_round_cv, train_loss_round_cv, valid_loss_round_cv, global_valid_loss_round_cv = \
                        round_log
                    global_valid_loss_round_total.append(global_valid_loss_round_cv)
                else:
                    reg, idx_round_cv, train_loss_round_cv, valid_loss_round_cv = round_log

                idx_round = idx_round_cv
                train_loss_round_total.append(train_loss_round_cv)
                valid_loss_round_total.append(valid_loss_round_cv)
            else:
                reg = self.fit(x_train, y_train, x_valid, y_valid, parameters)

            # Feature Importance
            if show_importance:
                self.get_importance(reg)

            # Prediction
            if save_cv_pred:
                cv_pred_path = pred_path + 'cv_results/' + self.model_name + '_cv_{}_'.format(cv_count)
            else:
                cv_pred_path = None
            pred_test = self.predict(reg, self.x_test, pred_path=cv_pred_path)

            # Save Train Prediction to CSV File
            if save_cv_pred_train:
                cv_pred_train_path = pred_path + 'cv_pred_train/' + self.model_name + '_cv_{}_'.format(cv_count)
            else:
                cv_pred_train_path = None
            # pred_train covers only this fold's training subset (used for the
            # fold loss); pred_train_all covers the full training set (used for
            # the averaged train prediction). Both receive the same path.
            pred_train = self.get_pred_train(reg, x_train, pred_path=cv_pred_train_path)
            pred_train_all = self.get_pred_train(reg, self.x_train, pred_path=cv_pred_train_path)

            # Predict Global Validation Set
            if use_global_valid:
                pred_global_valid = self.predict(reg, self.x_global_valid)
            else:
                pred_global_valid = np.array([])

            # Get Predictions of Validation Set
            pred_valid = self.predict(reg, x_valid)

            # Print LogLoss
            loss_train, loss_valid = utils.print_loss(pred_train, y_train, pred_valid, y_valid, loss_fuc)

            # Print Loss of Global Validation Set
            if use_global_valid:
                loss_global_valid = utils.print_global_valid_loss(pred_global_valid, self.y_global_valid, loss_fuc)
                loss_global_valid_total.append(loss_global_valid)

            # Save Losses to File
            utils.save_loss_log(loss_log_path + self.model_name + '_', cv_count, parameters, valid_rate, n_cv,
                                loss_train, loss_valid, train_seed, cv_seed)

            pred_test_total.append(pred_test)
            pred_train_total.append(pred_train_all)
            loss_train_total.append(loss_train)
            loss_valid_total.append(loss_valid)

            # CV End Time
            print('------------------------------------------------------')
            print('CV Done! Using Time: {}s'.format(time.time() - cv_start_time))

        print('======================================================')
        print('Calculating Final Result...')

        # Calculate Means of pred and losses
        pred_test_mean, pred_train_mean, loss_train_mean, loss_valid_mean = \
            utils.calculate_means(pred_test_total, pred_train_total, loss_train_total, loss_valid_total)

        # Save Logs of num_boost_round
        if mode == 'auto_train_boost_round':
            if use_global_valid:
                train_loss_round_mean, valid_loss_round_mean, global_valid_loss_round_mean = \
                    utils.calculate_boost_round_means(train_loss_round_total, valid_loss_round_total,
                                                      global_valid_loss_round_total=global_valid_loss_round_total)
                self.save_boost_round_log(boost_round_log_path, idx_round, train_loss_round_mean,
                                          valid_loss_round_mean, train_seed, cv_seed, csv_idx,
                                          parameters, param_name_list, param_value_list, append_info=append_info,
                                          global_valid_loss_round_mean=global_valid_loss_round_mean)
            else:
                train_loss_round_mean, valid_loss_round_mean = \
                    utils.calculate_boost_round_means(train_loss_round_total, valid_loss_round_total)
                self.save_boost_round_log(boost_round_log_path, idx_round, train_loss_round_mean,
                                          valid_loss_round_mean, train_seed, cv_seed, csv_idx,
                                          parameters, param_name_list, param_value_list, append_info=append_info)

        # Save 'num_boost_round' so that the helpers below see it in parameters
        if self.model_name in ['xgb', 'lgb']:
            parameters['num_boost_round'] = self.num_boost_round

        # Save Final Result
        if save_final_pred:
            self.save_final_pred(mode, save_final_pred, pred_test_mean, pred_path, parameters, csv_idx,
                                 train_seed, cv_seed, boost_round_log_path, param_name_list, param_value_list,
                                 file_name_params=file_name_params, append_info=append_info)

        # Save Final pred_train
        if save_final_pred_train:
            utils.save_pred_train_to_csv(pred_path + 'final_pred_train/' + self.model_name + '_',
                                         pred_train_mean, self.y_train)

        # Print Total Losses
        utils.print_total_loss(loss_train_mean, loss_valid_mean)

        # Save Final Losses to File
        utils.save_final_loss_log(loss_log_path + self.model_name + '_', parameters, valid_rate, n_cv,
                                  loss_train_mean, loss_valid_mean, train_seed, cv_seed)

        if use_global_valid:
            # Calculate Means of Global Validation Losses
            loss_global_valid_mean = utils.calculate_global_valid_means(loss_global_valid_total)

            # Save csv log
            # NOTE(review): the global validation mean is passed both in the
            # position where the other branch passes loss_valid_mean and as
            # loss_global_valid - confirm this is intended.
            if save_csv_log:
                self.save_csv_log(mode, csv_log_path, param_name_list, param_value_list, csv_idx, loss_train_mean,
                                  loss_global_valid_mean, train_seed, cv_seed, valid_rate, n_cv, parameters,
                                  boost_round_log_path=boost_round_log_path, file_name_params=file_name_params,
                                  append_info=append_info, loss_global_valid=loss_global_valid_mean)
        elif save_csv_log:
            # Save Loss Log to csv File
            self.save_csv_log(mode, csv_log_path, param_name_list, param_value_list, csv_idx, loss_train_mean,
                              loss_valid_mean, train_seed, cv_seed, valid_rate, n_cv, parameters,
                              boost_round_log_path=boost_round_log_path, file_name_params=file_name_params,
                              append_info=append_info)

        # Remove 'num_boost_round' of parameters. parameters defaults to None,
        # which does not support the membership test, so guard against it.
        if parameters is not None and 'num_boost_round' in parameters:
            parameters.pop('num_boost_round')

        # Return Final Result
        if return_pred_test:
            return pred_test_mean
        return None
    def train(self, pred_path=None, loss_log_path=None, csv_log_path=None, boost_round_log_path=None,
              train_seed=None, cv_args=None, parameters=None, show_importance=False, show_accuracy=False,
              save_cv_pred=True, save_cv_pred_train=False, save_final_pred=True, save_final_pred_train=False,
              save_csv_log=True, csv_idx=None, prescale=False, postscale=False, use_global_valid=False,
              return_pred_test=False, mode=None, param_name_list=None, param_value_list=None,
              use_custom_obj=False, use_scale_pos_weight=False, file_name_params=None, append_info=None):
        """Fit the model on every era-based cross-validation split and aggregate the results.

        For each split yielded by the CV generator the model is fitted (with
        sample weights), the test set, the fold's training subset, the full
        training set and the fold's validation subset are predicted, and the
        fold losses and accuracies are printed and logged. After the loop the
        per-fold predictions and losses are averaged with
        ``utils.calculate_means`` (optionally weighted by ``cv_weights``) and
        optionally saved.

        Args:
            pred_path: Path prefix for prediction files. It is concatenated with
                sub-directory names, so it must be a string whenever one of the
                prediction-saving flags is enabled.
            loss_log_path: Path prefix for loss logs. It is always concatenated
                with the model name, so it must be a string.
            csv_log_path: Forwarded to ``self.save_csv_log``.
            boost_round_log_path: Forwarded to the boost-round logging helpers.
            train_seed: Forwarded to the logging/saving helpers.
            cv_args: Cross-validation settings, unpacked by
                ``utils.get_cv_args``. The resulting mapping may contain
                ``'cv_generator'`` (``None`` or absent selects
                ``CrossValidation.era_k_fold``) and ``'cv_weights'`` (per-fold
                weights whose length must equal ``n_cv``); both are removed
                before the remaining entries are passed to the generator as
                keyword arguments. ``'era_list'`` and ``'window_size'`` are
                printed when present.
            parameters: Model parameters forwarded to ``self.fit`` or
                ``self.fit_with_round_log``. This mapping is modified in place:
                ``'metric'``/``'eval_metric'`` are removed when ``postscale``
                is set, ``'scale_pos_weight'`` is written when
                ``use_scale_pos_weight`` is set for ``'xgb'``, and
                ``'num_boost_round'`` is written for ``'xgb'``/``'lgb'`` while
                results are saved and removed again before returning.
            show_importance: Call ``self.get_importance`` on every fitted model.
            show_accuracy: Forwarded to the accuracy helpers in ``utils``.
            save_cv_pred: Pass a per-fold path to ``self.predict`` for the test
                set prediction.
            save_cv_pred_train: Pass a per-fold path to ``self.get_pred_train``.
            save_final_pred: Call ``self.save_final_pred`` with the averaged
                test prediction.
            save_final_pred_train: Save the averaged train prediction with
                ``utils.save_pred_train_to_csv``.
            save_csv_log: Call ``self.save_csv_log`` with the averaged losses.
            csv_idx: Index forwarded to the log helpers; defaults to
                ``self.model_name``.
            prescale: Pass the fold's training data through ``self.prescale``
                before fitting.
            postscale: Multiply the fold's test, train, validation (and global
                validation) predictions by the rate returned from
                ``self.get_postscale_rate`` for the fold's training labels.
            use_global_valid: Also predict and score the global validation set
                on every fold.
            return_pred_test: Return the averaged test prediction.
            mode: ``'auto_train_boost_round'`` switches fitting to
                ``self.fit_with_round_log`` and saves the averaged per-round
                losses.
            param_name_list: Forwarded to the fitting/logging helpers.
            param_value_list: Forwarded to the fitting/logging helpers.
            use_custom_obj: Stored on ``self.use_custom_obj``.
            use_scale_pos_weight: For the ``'xgb'`` model, set
                ``parameters['scale_pos_weight']`` to the fold's postscale rate.
            file_name_params: Forwarded to the saving helpers.
            append_info: Suffix forwarded to ``utils.get_cv_args`` and to the
                fitting/saving helpers.

        Returns:
            The averaged test prediction when ``return_pred_test`` is true,
            otherwise ``None``.

        Raises:
            ValueError: If ``cv_weights`` is given and its length differs from
                ``n_cv``.
        """

        # Check if directories exist or not
        utils.check_dir_model(pred_path, loss_log_path)
        utils.check_dir([pred_path, loss_log_path, csv_log_path, boost_round_log_path])

        # Global Validation
        self.use_global_valid = use_global_valid

        # Use Custom Objective Function
        self.use_custom_obj = use_custom_obj

        # Cross Validation Arguments
        # NOTE(review): append_info is passed in but is not part of the returned
        # tuple, so a None append_info stays None below - confirm the helpers
        # further down accept that.
        cv_args_copy, n_valid, n_cv, _n_era, cv_seed = utils.get_cv_args(cv_args, append_info)

        if csv_idx is None:
            csv_idx = self.model_name

        # Print Start Information and Get Model Name
        self.print_start_info()

        if use_global_valid:
            print('------------------------------------------------------')
            print('[W] Using Global Validation...')

        cv_count = 0
        pred_test_total = []
        pred_train_total = []
        loss_train_total = []
        loss_valid_total = []
        loss_train_w_total = []
        loss_valid_w_total = []
        idx_round = []
        train_loss_round_total = []
        valid_loss_round_total = []
        global_valid_loss_round_total = []
        pred_global_valid_total = []
        loss_global_valid_total = []
        loss_global_valid_w_total = []

        # Get Cross Validation Generator; the key must not reach the generator
        # itself, so it is popped from the keyword arguments.
        cv_generator = cv_args_copy.pop('cv_generator', None)
        if cv_generator is None:
            cv_generator = CrossValidation.era_k_fold
        print('------------------------------------------------------')
        # Callables such as functools.partial objects carry no __name__;
        # fall back to repr() instead of failing on a log line.
        print('[W] Using CV Generator: {}'.format(
            getattr(cv_generator, '__name__', repr(cv_generator))))

        if 'era_list' in cv_args_copy:
            print('Era List: ', cv_args_copy['era_list'])
        if 'window_size' in cv_args_copy:
            print('Window Size: ', cv_args_copy['window_size'])

        # Per-fold weights are consumed here (for the weighted means below),
        # not by the generator.
        cv_weights = cv_args_copy.pop('cv_weights', None)
        if cv_weights is not None and len(cv_weights) != n_cv:
            raise ValueError("The length of 'cv_weights'({}) should be equal to 'n_cv'({})!"
                             .format(len(cv_weights), n_cv))

        # Training on Cross Validation Sets
        for x_train, y_train, w_train, e_train, x_valid, y_valid, w_valid, e_valid, valid_era \
                in cv_generator(x=self.x_train, y=self.y_train,
                                w=self.w_train, e=self.e_train, **cv_args_copy):

            # CV Start Time
            cv_start_time = time.time()

            cv_count += 1

            # Get Positive Rate of Train Set and postscale Rate
            # (computed on the fold labels before any prescaling)
            positive_rate_train, postscale_rate = self.get_postscale_rate(y_train)
            positive_rate_valid, _ = self.get_postscale_rate(y_valid)

            # Remove Metric of Post Scale
            if postscale:
                self.postscale = True
                self.postscale_rate = postscale_rate
                parameters.pop('metric', None)
                parameters.pop('eval_metric', None)

            if use_scale_pos_weight and self.model_name == 'xgb':
                parameters['scale_pos_weight'] = postscale_rate

            print('------------------------------------------------------')
            print('Validation Set Era: ', valid_era)
            print('Number of Features: ', x_train.shape[1])
            print('------------------------------------------------------')
            print('Positive Rate of Train Set: {:.6f}'.format(positive_rate_train))
            print('Positive Rate of Valid Set: {:.6f}'.format(positive_rate_valid))
            print('------------------------------------------------------')

            # prescale
            if prescale:
                x_train, y_train, w_train, e_train = self.prescale(x_train, y_train, w_train, e_train)

            # Fitting and Training Model
            if mode == 'auto_train_boost_round':
                round_log = self.fit_with_round_log(
                    boost_round_log_path, cv_count, x_train, y_train, w_train, x_valid, y_valid,
                    w_valid, parameters, param_name_list, param_value_list, append_info=append_info)
                # With global validation the helper is expected to return one
                # extra element: the per-round global validation loss.
                if use_global_valid:
                    reg, idx_round_cv, train_loss_round_cv, \
                        valid_loss_round_cv, global_valid_loss_round_cv = round_log
                    global_valid_loss_round_total.append(global_valid_loss_round_cv)
                else:
                    reg, idx_round_cv, train_loss_round_cv, valid_loss_round_cv = round_log

                idx_round = idx_round_cv
                train_loss_round_total.append(train_loss_round_cv)
                valid_loss_round_total.append(valid_loss_round_cv)
            else:
                reg = self.fit(x_train, y_train, w_train, x_valid, y_valid, w_valid, parameters)

            # Feature Importance
            if show_importance:
                self.get_importance(reg)

            # Prediction
            if save_cv_pred:
                cv_pred_path = \
                    pred_path + 'cv_results/' + self.model_name + '_cv_{}_'.format(cv_count)
            else:
                cv_pred_path = None
            pred_test = self.predict(reg, self.x_test, pred_path=cv_pred_path)

            # Save Train Probabilities to CSV File
            if save_cv_pred_train:
                cv_pred_train_path = \
                    pred_path + 'cv_pred_train/' + self.model_name + '_cv_{}_'.format(cv_count)
            else:
                cv_pred_train_path = None
            # pred_train covers only this fold's training subset (used for the
            # fold loss); pred_train_all covers the full training set (used for
            # the averaged train prediction). Both receive the same path.
            pred_train = self.get_pred_train(reg, x_train, pred_path=cv_pred_train_path)
            pred_train_all = self.get_pred_train(reg, self.x_train, pred_path=cv_pred_train_path)

            # Predict Global Validation Set
            if use_global_valid:
                pred_global_valid = self.predict(reg, self.x_global_valid)
            else:
                pred_global_valid = np.array([])

            # Get Probabilities of Validation Set
            pred_valid = self.predict(reg, x_valid)

            # postscale
            # NOTE(review): pred_train_all is not rescaled here, so the averaged
            # train prediction is built from unscaled values - confirm intended.
            if postscale:
                print('------------------------------------------------------')
                print('[W] PostScaling Results...')
                print('PostScale Rate: {:.6f}'.format(postscale_rate))
                pred_test *= postscale_rate
                pred_train *= postscale_rate
                pred_valid *= postscale_rate
                if use_global_valid:
                    pred_global_valid *= postscale_rate

            # Print LogLoss
            print('------------------------------------------------------')
            print('Validation Set Era: ', valid_era)
            loss_train, loss_valid, loss_train_w, loss_valid_w = \
                utils.print_loss(pred_train, y_train, w_train, pred_valid, y_valid, w_valid)

            # Print and Get Accuracies of CV
            acc_train_cv, acc_valid_cv, acc_train_cv_era, acc_valid_cv_era = \
                utils.print_and_get_accuracy(pred_train, y_train, e_train,
                                             pred_valid, y_valid, e_valid, show_accuracy)

            # Print Loss and Accuracy of Global Validation Set
            if use_global_valid:
                loss_global_valid, loss_global_valid_w, acc_global_valid = \
                    utils.print_global_valid_loss_and_acc(
                        pred_global_valid, self.y_global_valid, self.w_global_valid)
                pred_global_valid_total.append(pred_global_valid)
                loss_global_valid_total.append(loss_global_valid)
                loss_global_valid_w_total.append(loss_global_valid_w)

            # Save Losses to File
            utils.save_loss_log(
                loss_log_path + self.model_name + '_', cv_count, parameters, n_valid, n_cv,
                valid_era, loss_train, loss_valid, loss_train_w, loss_valid_w, train_seed,
                cv_seed, acc_train_cv, acc_valid_cv, acc_train_cv_era, acc_valid_cv_era)

            pred_test_total.append(pred_test)
            pred_train_total.append(pred_train_all)
            loss_train_total.append(loss_train)
            loss_valid_total.append(loss_valid)
            loss_train_w_total.append(loss_train_w)
            loss_valid_w_total.append(loss_valid_w)

            # CV End Time
            print('------------------------------------------------------')
            print('CV Done! Using Time: {}s'.format(time.time() - cv_start_time))

        print('======================================================')
        print('Calculating Final Result...')

        # Calculate Means of pred and losses
        pred_test_mean, pred_train_mean, loss_train_mean, \
            loss_valid_mean, loss_train_w_mean, loss_valid_w_mean = \
            utils.calculate_means(pred_test_total, pred_train_total, loss_train_total, loss_valid_total,
                                  loss_train_w_total, loss_valid_w_total, weights=cv_weights)

        # Save 'num_boost_round' so that the helpers below see it in parameters
        if self.model_name in ['xgb', 'lgb']:
            parameters['num_boost_round'] = self.num_boost_round

        # Profit is a fixed placeholder that is only forwarded to the helpers
        profit = 0

        # Save Logs of num_boost_round
        if mode == 'auto_train_boost_round':
            if use_global_valid:
                train_loss_round_mean, valid_loss_round_mean, global_valid_loss_round_mean = \
                    utils.calculate_boost_round_means(
                        train_loss_round_total, valid_loss_round_total, weights=cv_weights,
                        global_valid_loss_round_total=global_valid_loss_round_total)
                self.save_boost_round_log(
                    boost_round_log_path, idx_round, train_loss_round_mean,
                    valid_loss_round_mean, train_seed, cv_seed, csv_idx,
                    parameters, param_name_list, param_value_list, append_info=append_info,
                    global_valid_loss_round_mean=global_valid_loss_round_mean, profit=profit)
            else:
                train_loss_round_mean, valid_loss_round_mean = \
                    utils.calculate_boost_round_means(
                        train_loss_round_total, valid_loss_round_total, weights=cv_weights)
                self.save_boost_round_log(
                    boost_round_log_path, idx_round, train_loss_round_mean,
                    valid_loss_round_mean, train_seed, cv_seed, csv_idx, parameters,
                    param_name_list, param_value_list, append_info=append_info, profit=profit)

        # Save Final Result
        if save_final_pred:
            self.save_final_pred(
                mode, pred_test_mean, pred_path, parameters, csv_idx, train_seed,
                cv_seed, boost_round_log_path, param_name_list, param_value_list,
                file_name_params=file_name_params, append_info=append_info)

        # Save Final pred_train
        if save_final_pred_train:
            utils.save_pred_train_to_csv(pred_path + 'final_pred_train/' + self.model_name + '_',
                                         pred_train_mean, self.y_train)

        # Print Total Losses
        utils.print_total_loss(loss_train_mean, loss_valid_mean, loss_train_w_mean,
                               loss_valid_w_mean, profit=profit)

        # Print and Get Accuracies of CV of All Train Set
        acc_train, acc_train_era = \
            utils.print_and_get_train_accuracy(pred_train_mean, self.y_train, self.e_train, show_accuracy)

        # Save Final Losses to File
        utils.save_final_loss_log(
            loss_log_path + self.model_name + '_', parameters, n_valid, n_cv,
            loss_train_mean, loss_valid_mean, loss_train_w_mean, loss_valid_w_mean,
            train_seed, cv_seed, acc_train, acc_train_era)

        if use_global_valid:
            # Calculate Means of Probabilities and Losses
            pred_global_valid_mean, loss_global_valid_mean, loss_global_valid_w_mean = \
                utils.calculate_global_valid_means(pred_global_valid_total, loss_global_valid_total,
                                                   loss_global_valid_w_total, weights=cv_weights)
            # Print Loss and Accuracy
            acc_total_global_valid = \
                utils.print_total_global_valid_loss_and_acc(
                    pred_global_valid_mean, self.y_global_valid,
                    loss_global_valid_mean, loss_global_valid_w_mean)
            # Save csv log
            if save_csv_log:
                self.save_csv_log(
                    mode, csv_log_path, param_name_list, param_value_list, csv_idx, loss_train_w_mean,
                    loss_valid_w_mean, acc_train, train_seed, cv_seed, n_valid, n_cv, parameters,
                    boost_round_log_path=boost_round_log_path, file_name_params=file_name_params,
                    append_info=append_info, loss_global_valid=loss_global_valid_w_mean,
                    acc_global_valid=acc_total_global_valid, profit=profit)
        elif save_csv_log:
            # Save Loss Log to csv File
            self.save_csv_log(
                mode, csv_log_path, param_name_list, param_value_list, csv_idx, loss_train_w_mean,
                loss_valid_w_mean, acc_train, train_seed, cv_seed, n_valid, n_cv, parameters,
                boost_round_log_path=boost_round_log_path, file_name_params=file_name_params,
                append_info=append_info, profit=profit)

        # Remove 'num_boost_round' of parameters. parameters defaults to None,
        # which does not support the membership test, so guard against it.
        if parameters is not None and 'num_boost_round' in parameters:
            parameters.pop('num_boost_round')

        # Return Final Result
        if return_pred_test:
            return pred_test_mean
        return None
示例#3
0
    def train(self,
              pred_path=None,
              loss_log_path=None,
              csv_log_path=None,
              boost_round_log_path=None,
              train_seed=None,
              cv_args=None,
              parameters=None,
              show_importance=False,
              show_accuracy=False,
              save_cv_pred=True,
              save_cv_prob_train=False,
              save_final_pred=True,
              save_final_prob_train=False,
              save_csv_log=True,
              csv_idx=None,
              prescale=False,
              postscale=False,
              use_global_valid=False,
              return_prob_test=False,
              mode=None,
              param_name_list=None,
              param_value_list=None,
              use_custom_obj=False,
              use_scale_pos_weight=False,
              file_name_params=None,
              append_info=None):

        # Check if directories exit or not
        utils.check_dir_model(pred_path, loss_log_path)

        # Global Validation
        self.use_global_valid = use_global_valid

        cv_args_copy = copy.deepcopy(cv_args)
        if 'n_valid' in cv_args:
            n_valid = cv_args_copy['n_valid']
        elif 'valid_rate' in cv_args:
            n_valid = cv_args_copy['valid_rate']
        else:
            n_valid = ''
        n_cv = cv_args_copy['n_cv']
        n_era = cv_args_copy['n_era']
        cv_seed = cv_args_copy['cv_seed']

        # Append Information
        if append_info is None:
            append_info = 'v-' + str(n_valid) + '_c-' + str(
                n_cv) + '_e-' + str(n_era)
            if 'window_size' in cv_args_copy:
                append_info += '_w-' + str(cv_args_copy['window_size'])

        if csv_idx is None:
            csv_idx = self.model_name

        # Build Network
        tf.reset_default_graph()
        train_graph = tf.Graph()

        with train_graph.as_default():

            # Inputs
            inputs, labels, weights, lr, keep_prob, is_training = self.input_tensor(
            )

            # Logits
            logits = self.model(inputs, self.unit_number, keep_prob,
                                is_training)
            logits = tf.identity(logits, name='logits')

            # Loss
            with tf.name_scope('Loss'):
                # cost_ = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=labels))
                cost_ = self.log_loss(logits, weights, labels)

            # Optimizer
            optimizer = tf.train.AdamOptimizer(lr).minimize(cost_)

        # Training
        self.print_start_info()

        if use_global_valid:
            print('------------------------------------------------------')
            print('[W] Using Global Validation...')

        with tf.Session(graph=train_graph) as sess:

            # Merge all the summaries
            merged = tf.summary.merge_all()

            start_time = time.time()
            cv_counter = 0

            prob_test_total = []
            prob_train_total = []
            loss_train_total = []
            loss_valid_total = []
            loss_train_w_total = []
            loss_valid_w_total = []
            idx_round = []
            train_loss_round_total = []
            valid_loss_round_total = []
            prob_global_valid_total = []
            loss_global_valid_total = []
            loss_global_valid_w_total = []

            # Get Cross Validation Generator
            if 'cv_generator' in cv_args_copy:
                cv_generator = cv_args_copy['cv_generator']
                if cv_generator is None:
                    cv_generator = CrossValidation.era_k_fold
                cv_args_copy.pop('cv_generator')
            else:
                cv_generator = CrossValidation.era_k_fold
            print('------------------------------------------------------')
            print('[W] Using CV Generator: {}'.format(
                getattr(cv_generator, '__name__')))

            if 'era_list' in cv_args_copy:
                print('Era List: ', cv_args_copy['era_list'])
            if 'window_size' in cv_args_copy:
                print('Window Size: ', cv_args_copy['window_size'])
            if 'cv_weights' in cv_args_copy:
                cv_weights = cv_args_copy['cv_weights']
                cv_args_copy.pop('cv_weights')
            else:
                cv_weights = None

            for x_train, y_train, w_train, e_train, x_valid, y_valid, w_valid, e_valid, valid_era \
                    in cv_generator(self.x_train, self.y_train, self.w_train, self.e_train, **cv_args_copy):

                # CV Start Time
                cv_start_time = time.time()

                cv_counter += 1

                # Get Positive Rate of Train Set and postscale Rate
                positive_rate_train, postscale_rate = self.get_postscale_rate(
                    y_train)
                positive_rate_valid, _ = self.get_postscale_rate(y_valid)

                print('------------------------------------------------------')
                print('Number of Features: ', x_train.shape[1])
                print('Validation Set Era: ', valid_era)
                print('------------------------------------------------------')
                print('Positive Rate of Train Set: ', positive_rate_train)
                print('Positive Rate of Valid Set: ', positive_rate_valid)
                print('------------------------------------------------------')

                # prescale
                if prescale:
                    x_train, y_train, w_train, e_train = self.prescale(
                        x_train, y_train, w_train, e_train)

                # Training
                if mode == 'auto_train_boost_round':
                    idx_round_cv, train_loss_round_cv, valid_loss_round_cv = \
                        self.train_with_round_log(boost_round_log_path, sess, cv_counter, x_train, y_train,
                                                  w_train, x_valid, y_valid, w_valid, optimizer, merged, cost_,
                                                  inputs, labels, weights, lr, keep_prob, is_training, start_time,
                                                  param_name_list, param_value_list, append_info=append_info)
                    idx_round = idx_round_cv
                    train_loss_round_total.append(train_loss_round_cv)
                    valid_loss_round_total.append(valid_loss_round_cv)
                else:
                    self.trainer(sess, cv_counter, x_train, y_train, w_train,
                                 x_valid, y_valid, w_valid, optimizer, merged,
                                 cost_, inputs, labels, weights, lr, keep_prob,
                                 is_training, start_time)

                # Save Model
                # print('Saving model...')
                # saver = tf.train.Saver()
                # saver.save(sess, self.save_path + 'model.' + self.version + '.ckpt')

                # Prediction
                print('------------------------------------------------------')
                print('Predicting Probabilities...')
                prob_train = self.get_prob(sess, logits, x_train,
                                           self.batch_size, inputs, keep_prob,
                                           is_training)
                prob_train_all = self.get_prob(sess, logits, self.x_train,
                                               self.batch_size, inputs,
                                               keep_prob, is_training)
                prob_valid = self.get_prob(sess, logits, x_valid,
                                           self.batch_size, inputs, keep_prob,
                                           is_training)
                prob_test = self.get_prob(sess, logits, self.x_test,
                                          self.batch_size, inputs, keep_prob,
                                          is_training)

                # Predict Global Validation Set
                if use_global_valid:
                    prob_global_valid = self.get_prob(sess, logits,
                                                      self.x_global_valid,
                                                      self.batch_size, inputs,
                                                      keep_prob, is_training)
                else:
                    prob_global_valid = np.array([])

                # postscale
                if postscale:
                    print(
                        '------------------------------------------------------'
                    )
                    print('[W] PostScaling Results...')
                    print('PostScale Rate: {:.6f}'.format(postscale_rate))
                    prob_test *= postscale_rate
                    prob_train *= postscale_rate
                    prob_valid *= postscale_rate
                    if use_global_valid:
                        prob_global_valid *= postscale_rate

                # Print Losses of CV
                loss_train, loss_valid, loss_train_w, loss_valid_w = \
                    utils.print_loss(prob_train, y_train, w_train, prob_valid, y_valid, w_valid)

                prob_test_total.append(prob_test)
                prob_train_total.append(prob_train_all)
                loss_train_total.append(loss_train)
                loss_valid_total.append(loss_valid)
                loss_train_w_total.append(loss_train_w)
                loss_valid_w_total.append(loss_valid_w)

                # Print and Get Accuracies of CV
                acc_train_cv, acc_valid_cv, acc_train_cv_era, acc_valid_cv_era = \
                    utils.print_and_get_accuracy(prob_train, y_train, e_train,
                                                 prob_valid, y_valid, e_valid, show_accuracy)

                # Print Loss and Accuracy of Global Validation Set
                if use_global_valid:
                    loss_global_valid, loss_global_valid_w, acc_global_valid = \
                        utils.print_global_valid_loss_and_acc(prob_global_valid, self.y_global_valid,
                                                              self.w_global_valid)
                    prob_global_valid_total.append(prob_global_valid)
                    loss_global_valid_total.append(loss_global_valid)
                    loss_global_valid_w_total.append(loss_global_valid_w)

                # Save losses
                utils.save_loss_log(loss_log_path + self.model_name + '_',
                                    cv_counter, self.parameters, n_valid, n_cv,
                                    valid_era, loss_train, loss_valid,
                                    loss_train_w, loss_valid_w, train_seed,
                                    cv_seed, acc_train_cv, acc_valid_cv,
                                    acc_train_cv_era, acc_valid_cv_era)

                if save_cv_pred:
                    utils.save_pred_to_csv(
                        pred_path + 'cv_results/' + self.model_name +
                        '_cv_{}_'.format(cv_counter), self.id_test, prob_test)

                # CV End Time
                print('------------------------------------------------------')
                print('CV Done! Using Time: {}s'.format(time.time() -
                                                        cv_start_time))

            # Final Result
            print('======================================================')
            print('Calculating Final Result...')

            # Calculate Means of prob and losses
            prob_test_mean, prob_train_mean, loss_train_mean, loss_valid_mean, loss_train_w_mean, loss_valid_w_mean = \
                utils.calculate_means(prob_test_total, prob_train_total, loss_train_total, loss_valid_total,
                                      loss_train_w_total, loss_valid_w_total, weights=cv_weights)

            # Save Logs of num_boost_round
            if mode == 'auto_train_boost_round':
                l = len(train_loss_round_total[0])
                for train_loss_cv in train_loss_round_total:
                    if l > len(train_loss_cv):
                        l = len(train_loss_cv)
                idx_round = idx_round[:l]
                train_loss_round_total = [
                    train_loss[:l] for train_loss in train_loss_round_total
                ]
                valid_loss_round_total = [
                    valid_loss[:l] for valid_loss in valid_loss_round_total
                ]
                train_loss_round_mean, valid_loss_round_mean = \
                    utils.calculate_boost_round_means(train_loss_round_total, valid_loss_round_total, weights=cv_weights)
                self.save_boost_round_log(boost_round_log_path,
                                          idx_round,
                                          train_loss_round_mean,
                                          valid_loss_round_mean,
                                          train_seed,
                                          cv_seed,
                                          csv_idx,
                                          parameters,
                                          param_name_list,
                                          param_value_list,
                                          append_info=append_info)

            # Save Final Result
            if save_final_pred:
                self.save_final_pred(mode,
                                     save_final_pred,
                                     prob_test_mean,
                                     pred_path,
                                     parameters,
                                     csv_idx,
                                     train_seed,
                                     cv_seed,
                                     boost_round_log_path,
                                     param_name_list,
                                     param_value_list,
                                     file_name_params=None,
                                     append_info=append_info)

            # Save Final prob_train
            if save_final_prob_train:
                utils.save_prob_train_to_csv(
                    pred_path + 'final_prob_train/' + self.model_name + '_',
                    prob_train_mean, self.y_train)

            # Print Total Losses
            utils.print_total_loss(loss_train_mean, loss_valid_mean,
                                   loss_train_w_mean, loss_valid_w_mean)

            # Print and Get Accuracies of CV of All Train Set
            acc_train, acc_train_era = \
                utils.print_and_get_train_accuracy(prob_train_mean, self.y_train, self.e_train, show_accuracy)

            # Save Final Losses to File
            utils.save_final_loss_log(loss_log_path + self.model_name + '_',
                                      self.parameters, n_valid, n_cv,
                                      loss_train_mean, loss_valid_mean,
                                      loss_train_w_mean, loss_valid_w_mean,
                                      train_seed, cv_seed, acc_train,
                                      acc_train_era)

            # Print Global Validation Information and Save
            if use_global_valid:
                # Calculate Means of Probabilities and Losses
                prob_global_valid_mean, loss_global_valid_mean, loss_global_valid_w_mean = \
                    utils.calculate_global_valid_means(prob_global_valid_total, loss_global_valid_total,
                                                       loss_global_valid_w_total, weights=cv_weights)
                # Print Loss and Accuracy
                acc_total_global_valid = \
                    utils.print_total_global_valid_loss_and_acc(prob_global_valid_mean, self.y_global_valid,
                                                                loss_global_valid_mean, loss_global_valid_w_mean)
                # Save csv log
                if save_csv_log:
                    self.save_csv_log(
                        mode,
                        csv_log_path,
                        param_name_list,
                        param_value_list,
                        csv_idx,
                        loss_train_w_mean,
                        loss_valid_w_mean,
                        acc_train,
                        train_seed,
                        cv_seed,
                        n_valid,
                        n_cv,
                        parameters,
                        boost_round_log_path=boost_round_log_path,
                        file_name_params=file_name_params,
                        append_info=append_info,
                        loss_global_valid=loss_global_valid_w_mean,
                        acc_global_valid=acc_total_global_valid)

            # Save Loss Log to csv File
            if save_csv_log:
                if not use_global_valid:
                    self.save_csv_log(mode,
                                      csv_log_path,
                                      param_name_list,
                                      param_value_list,
                                      csv_idx,
                                      loss_train_w_mean,
                                      loss_valid_w_mean,
                                      acc_train,
                                      train_seed,
                                      cv_seed,
                                      n_valid,
                                      n_cv,
                                      parameters,
                                      file_name_params=file_name_params,
                                      append_info=append_info)

            # Return Final Result
            if return_prob_test:
                return prob_test_mean
示例#4
0
    def train(self,
              pred_path=None,
              loss_log_path=None,
              csv_log_path=None,
              boost_round_log_path=None,
              train_seed=None,
              cv_args=None,
              parameters=None,
              show_importance=False,
              save_cv_pred=True,
              save_cv_pred_train=False,
              save_final_pred=True,
              save_final_pred_train=False,
              save_csv_log=True,
              csv_idx=None,
              use_global_valid=False,
              return_pred_test=False,
              mode=None,
              param_name_list=None,
              param_value_list=None,
              use_custom_obj=False,
              file_name_params=None,
              append_info=None,
              loss_fuc=None):
        """Train the network on every cross-validation split and aggregate results.

        A fresh TensorFlow graph is built (RMSE cost, Adam optimizer). For each
        split yielded by the CV generator the model is trained, predictions are
        made for the split's train/valid sets, the full train set and the test
        set, and the split losses are logged. After the loop the per-split
        predictions and losses are averaged with ``utils.calculate_means`` and
        written out according to the ``save_*`` flags.

        Args:
            pred_path: Path prefix for prediction csv files; ``'cv_results/'``
                and ``'final_pred_train/'`` are appended to it.
            loss_log_path: Path prefix for the loss log files.
            csv_log_path: Forwarded to ``self.save_csv_log``.
            boost_round_log_path: Forwarded to the round-log helpers.
            train_seed: Only forwarded to the logging/saving helpers here.
            cv_args: Dict that must contain ``'n_cv'`` and ``'cv_seed'``. An
                optional ``'cv_generator'`` entry selects the split generator
                (``CrossValidation.random_split`` when absent or ``None``).
                All remaining entries are passed to the generator as keyword
                arguments. Required despite the ``None`` default. The dict is
                deep-copied, so the caller's object is left untouched.
            parameters: Forwarded to the logging/saving helpers. When
                ``self.model_name`` is ``'xgb'`` or ``'lgb'`` a temporary
                ``'num_boost_round'`` entry is added and removed again before
                returning.
            show_importance: Unused by this method.
            save_cv_pred: Save the test prediction of every split.
            save_cv_pred_train: Unused by this method.
            save_final_pred: Save the averaged test prediction.
            save_final_pred_train: Save the averaged full-train prediction.
            save_csv_log: Append a summary row through ``self.save_csv_log``.
            csv_idx: Index used in the csv log; defaults to ``self.model_name``.
            use_global_valid: Also predict and score ``self.x_global_valid``.
            return_pred_test: Return the averaged test prediction.
            mode: ``'auto_train_boost_round'`` enables per-round loss logging
                via ``self.train_with_round_log``; any other value uses
                ``self.trainer``.
            param_name_list: Forwarded to the logging/saving helpers.
            param_value_list: Forwarded to the logging/saving helpers.
            use_custom_obj: Unused by this method.
            file_name_params: Forwarded to the saving helpers.
            append_info: Suffix for log names; defaults to
                ``'v-<valid_rate>_c-<n_cv>'``.
            loss_fuc: Loss function handed to ``utils.print_loss``.

        Returns:
            The averaged test prediction when ``return_pred_test`` is true,
            otherwise ``None``.
        """

        # Check if directories exist or not
        utils.check_dir_model(pred_path, loss_log_path)

        # Global Validation
        self.use_global_valid = use_global_valid

        # Work on a copy so popping 'cv_generator' below never alters the
        # caller's dict.
        cv_args_copy = copy.deepcopy(cv_args)
        n_cv = cv_args_copy['n_cv']
        cv_seed = cv_args_copy['cv_seed']
        valid_rate = 1 / float(n_cv)

        # Append Information
        if append_info is None:
            append_info = 'v-' + str(valid_rate) + '_c-' + str(n_cv)

        if csv_idx is None:
            csv_idx = self.model_name

        # Build Network
        tf.reset_default_graph()
        train_graph = tf.Graph()

        with train_graph.as_default():

            # Inputs
            inputs, labels, lr, keep_prob, is_training = self.input_tensor()

            # Logits
            logits = self.model(inputs, self.unit_number, keep_prob,
                                is_training)
            logits = tf.identity(logits, name='logits')

            # Loss (root mean squared error between logits and labels)
            with tf.name_scope('Loss'):
                cost_ = tf.sqrt(tf.reduce_mean(tf.square(logits - labels)))
                # cost_ = tf.reduce_mean(tf.square(logits - labels))

            # Optimizer
            optimizer = tf.train.AdamOptimizer(lr).minimize(cost_)

        # Training
        self.print_start_info()

        if use_global_valid:
            print('------------------------------------------------------')
            print('[W] Using Global Validation...')

        with tf.Session(graph=train_graph) as sess:

            # Merge all the summaries
            merged = tf.summary.merge_all()

            start_time = time.time()
            cv_count = 0
            pred_test_total = []
            pred_train_total = []
            loss_train_total = []
            loss_valid_total = []
            idx_round = []
            train_loss_round_total = []
            valid_loss_round_total = []
            # NOTE(review): nothing in this method appends to this list, yet
            # it is passed to utils.calculate_boost_round_means below when
            # global validation is on -- confirm that is intended.
            global_valid_loss_round_total = []
            pred_global_valid_total = []
            loss_global_valid_total = []

            # Get Cross Validation Generator
            if 'cv_generator' in cv_args_copy:
                cv_generator = cv_args_copy['cv_generator']
                if cv_generator is None:
                    cv_generator = CrossValidation.random_split
                # The generator itself must not be forwarded as a kwarg.
                cv_args_copy.pop('cv_generator')
            else:
                cv_generator = CrossValidation.random_split
            print('------------------------------------------------------')
            # Fall back to repr() so callables without __name__ (e.g.
            # functools.partial objects) do not crash this informational print.
            print('[W] Using CV Generator: {}'.format(
                getattr(cv_generator, '__name__', repr(cv_generator))))

            # Training on Cross Validation Sets
            for x_train, y_train, x_valid, y_valid in cv_generator(
                    x=self.x_train, y=self.y_train, **cv_args_copy):

                # CV Start Time
                cv_start_time = time.time()

                cv_count += 1

                # Training
                if mode == 'auto_train_boost_round':
                    idx_round_cv, train_loss_round_cv, valid_loss_round_cv = \
                        self.train_with_round_log(boost_round_log_path, sess, cv_count, x_train, y_train,
                                                  x_valid, y_valid, optimizer, merged, cost_,
                                                  inputs, labels, lr, keep_prob, is_training, start_time,
                                                  param_name_list, param_value_list, append_info=append_info)
                    # Only the round index of the last split is kept.
                    idx_round = idx_round_cv
                    train_loss_round_total.append(train_loss_round_cv)
                    valid_loss_round_total.append(valid_loss_round_cv)
                else:
                    self.trainer(sess, cv_count, x_train, y_train, x_valid,
                                 y_valid, optimizer, merged, cost_, inputs,
                                 labels, lr, keep_prob, is_training,
                                 start_time)

                # Save Model
                # print('Saving model...')
                # saver = tf.train.Saver()
                # saver.save(sess, self.save_path + 'model.' + self.version + '.ckpt')

                # Prediction
                print('------------------------------------------------------')
                print('Predicting Probabilities...')
                pred_train = self.get_pred(sess, logits, x_train,
                                           self.batch_size, inputs, keep_prob,
                                           is_training)
                pred_train_all = self.get_pred(sess, logits, self.x_train,
                                               self.batch_size, inputs,
                                               keep_prob, is_training)
                pred_valid = self.get_pred(sess, logits, x_valid,
                                           self.batch_size, inputs, keep_prob,
                                           is_training)
                pred_test = self.get_pred(sess, logits, self.x_test,
                                          self.batch_size, inputs, keep_prob,
                                          is_training)

                # Predict Global Validation Set
                if use_global_valid:
                    pred_global_valid = self.get_pred(sess, logits,
                                                      self.x_global_valid,
                                                      self.batch_size, inputs,
                                                      keep_prob, is_training)
                else:
                    pred_global_valid = np.array([])

                # Print Losses of CV
                loss_train, loss_valid = utils.print_loss(
                    pred_train, y_train, pred_valid, y_valid, loss_fuc)

                # Print Loss of Global Validation Set
                # NOTE(review): this uses self.rmse_loss while the CV losses
                # above use the caller-supplied loss_fuc -- confirm intended.
                if use_global_valid:
                    loss_global_valid = utils.print_global_valid_loss(
                        pred_global_valid, self.y_global_valid, self.rmse_loss)
                    pred_global_valid_total.append(pred_global_valid)
                    loss_global_valid_total.append(loss_global_valid)

                # Save Losses to File
                utils.save_loss_log(loss_log_path + self.model_name + '_',
                                    cv_count, parameters, valid_rate, n_cv,
                                    loss_train, loss_valid, train_seed,
                                    cv_seed)

                pred_test_total.append(pred_test)
                pred_train_total.append(pred_train_all)
                loss_train_total.append(loss_train)
                loss_valid_total.append(loss_valid)

                if save_cv_pred:
                    utils.save_pred_to_csv(
                        pred_path + 'cv_results/' + self.model_name +
                        '_cv_{}_'.format(cv_count), self.id_test, pred_test)

                # CV End Time
                print('------------------------------------------------------')
                print('CV Done! Using Time: {}s'.format(time.time() -
                                                        cv_start_time))

            print('======================================================')
            print('Calculating Final Result...')

            # Calculate Means of pred and losses
            pred_test_mean, pred_train_mean, loss_train_mean, loss_valid_mean = \
                utils.calculate_means(pred_test_total, pred_train_total, loss_train_total, loss_valid_total)

            # Save Logs of num_boost_round
            if mode == 'auto_train_boost_round':
                if use_global_valid:
                    train_loss_round_mean, valid_loss_round_mean, global_valid_loss_round_mean = \
                        utils.calculate_boost_round_means(train_loss_round_total, valid_loss_round_total,
                                                          global_valid_loss_round_total=global_valid_loss_round_total)
                    self.save_boost_round_log(boost_round_log_path,
                                              idx_round,
                                              train_loss_round_mean,
                                              valid_loss_round_mean,
                                              train_seed,
                                              cv_seed,
                                              csv_idx,
                                              parameters,
                                              param_name_list,
                                              param_value_list,
                                              append_info=append_info,
                                              global_valid_loss_round_mean=
                                              global_valid_loss_round_mean)
                else:
                    train_loss_round_mean, valid_loss_round_mean = \
                        utils.calculate_boost_round_means(train_loss_round_total, valid_loss_round_total)
                    self.save_boost_round_log(boost_round_log_path,
                                              idx_round,
                                              train_loss_round_mean,
                                              valid_loss_round_mean,
                                              train_seed,
                                              cv_seed,
                                              csv_idx,
                                              parameters,
                                              param_name_list,
                                              param_value_list,
                                              append_info=append_info)

            # Save 'num_boost_round' (temporary entry, removed again below)
            if self.model_name in ['xgb', 'lgb']:
                parameters['num_boost_round'] = self.num_boost_round

            # Save Final Result
            if save_final_pred:
                self.save_final_pred(mode,
                                     save_final_pred,
                                     pred_test_mean,
                                     pred_path,
                                     parameters,
                                     csv_idx,
                                     train_seed,
                                     cv_seed,
                                     boost_round_log_path,
                                     param_name_list,
                                     param_value_list,
                                     file_name_params=file_name_params,
                                     append_info=append_info)

            # Save Final pred_train
            if save_final_pred_train:
                utils.save_pred_train_to_csv(
                    pred_path + 'final_pred_train/' + self.model_name + '_',
                    pred_train_mean, self.y_train)

            # Print Total Losses
            utils.print_total_loss(loss_train_mean, loss_valid_mean)

            # Save Final Losses to File
            utils.save_final_loss_log(loss_log_path + self.model_name + '_',
                                      parameters, valid_rate, n_cv,
                                      loss_train_mean, loss_valid_mean,
                                      train_seed, cv_seed)

            # Print Global Validation Information and Save
            if use_global_valid:
                # Calculate Means of Predictions and Losses
                loss_global_valid_mean = utils.calculate_global_valid_means(
                    loss_global_valid_total)

                # Save csv log
                # NOTE(review): the global-validation mean is passed in the
                # positional slot where the non-global branch below passes
                # loss_valid_mean -- confirm this substitution is intended.
                if save_csv_log:
                    self.save_csv_log(
                        mode,
                        csv_log_path,
                        param_name_list,
                        param_value_list,
                        csv_idx,
                        loss_train_mean,
                        loss_global_valid_mean,
                        train_seed,
                        cv_seed,
                        valid_rate,
                        n_cv,
                        parameters,
                        boost_round_log_path=boost_round_log_path,
                        file_name_params=file_name_params,
                        append_info=append_info,
                        loss_global_valid=loss_global_valid_mean)

            # Save Loss Log to csv File
            if save_csv_log:
                if not use_global_valid:
                    self.save_csv_log(
                        mode,
                        csv_log_path,
                        param_name_list,
                        param_value_list,
                        csv_idx,
                        loss_train_mean,
                        loss_valid_mean,
                        train_seed,
                        cv_seed,
                        valid_rate,
                        n_cv,
                        parameters,
                        boost_round_log_path=boost_round_log_path,
                        file_name_params=file_name_params,
                        append_info=append_info)

            # Remove 'num_boost_round' of parameters.
            # `parameters` defaults to None; without the None guard the
            # membership test raises TypeError after all training work is
            # done and before the result can be returned.
            if parameters is not None and 'num_boost_round' in parameters:
                parameters.pop('num_boost_round')

            # Return Final Result
            if return_pred_test:
                return pred_test_mean