예제 #1
0
    def simple_validate(self, params_dict, X, y):
        """Score one parameter set on a single 90/10 hold-out split.

        The data is split with ``train_test_split`` (``test_size=0.10``,
        ``random_state=1024``), a model built by ``self.learner`` is fitted on
        the larger part, and the hold-out predictions are scored with
        ``dist_utils._auc``.

        Results are stored on the instance:

        * ``self.rmse_cv_mean_simple`` -- ``1 - score`` (an error to minimise,
          despite the ``rmse`` prefix).
        * ``self.rmse_cv_std_simple`` -- always ``0`` (there is one run only).

        :param params_dict: parameters handed to ``self.learner.create_model``.
        :param X: feature rows; the split parts must expose ``len()`` and a
            two-dimensional ``.shape``.
        :param y: targets aligned with ``X``.
        :return: ``self``.
        """
        self.logger.info("Simple CV with parms:{}".format(params_dict))
        start = time.time()
        if self.verbose:
            self.logger.info("=" * 50)
            self.logger.info("Task")
            self.logger.info("      %s" % str(self.__str__()))
            self.logger.info("Param")
            self._print_param_dict(params_dict)
            self.logger.info("Result")
            self.logger.info("      Run      RMSE        Shape")

        # Fixed seed keeps the hold-out identical across parameter sets.
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.10,
                                                            random_state=1024)

        model = self.learner.create_model(params_dict)
        # fit
        self.logger.info("Fit for train data")
        model = self.learner.fit(model, X_train, y_train)
        self.logger.info("Predict for validation data")
        # Flatten to 1-D so the prediction shape matches the scorer's input.
        y_pred = np.reshape(self.learner.predict(model, X_test),
                            (len(X_test), ))
        print(y_test.shape)
        print(y_pred.shape)
        auc_cv = dist_utils._auc(y_test, y_pred)
        # log
        self.logger.info("      {:>3}    {:>8}    {} x {}".format(
            1, np.round(auc_cv, 6), X_train.shape[0], X_train.shape[1]))

        # Release the model and predictions before the next trial.
        del (model)
        del (y_pred)
        gc.collect()

        self.rmse_cv_mean_simple = 1 - auc_cv
        self.rmse_cv_std_simple = 0
        end = time.time()
        _sec = end - start
        _min = int(_sec / 60.)
        if self.verbose:
            self.logger.info("AUC")
            # Report the values computed by THIS method; the non-"_simple"
            # attributes belong to the full cv() run and may not exist yet.
            self.logger.info("      Mean: %.6f" % self.rmse_cv_mean_simple)
            self.logger.info("      Std: %.6f" % self.rmse_cv_std_simple)
            self.logger.info("Time")
            if _min > 0:
                self.logger.info("      %d mins" % _min)
            else:
                self.logger.info("      %d secs" % _sec)
            self.logger.info("-" * 50)
        return self
예제 #2
0
def _obj(param_dict):
    """Objective function for one hyper-parameter trial.

    Trains a LightGBM model through ``lgb_modelfit_nocv`` on a slice of the
    module-level ``train_df``, scores a validation slice with
    ``dist_utils._auc`` and returns a result dictionary with ``loss``,
    ``attachments`` and ``status`` keys (the layout hyperopt's ``fmin``
    consumes -- presumably that is the caller; confirm).

    :param param_dict: raw trial parameters; integer-valued entries are
        normalised by ``_convert_int_param`` before training.
    :return: dict whose ``loss`` is ``1 - AUC`` so that minimising the loss
        maximises AUC, matching the ``1 - auc`` convention used by the
        cross-validation code.
    """
    global trial_counter

    trial_counter += 1

    param_dict = _convert_int_param(param_dict)
    print("==>Tunne with CV")
    bst = None
    start_time = time.time()
    tr_size = 10000000
    vl_size = 10000000
    # NOTE(review): both slices come from train_df (last tr_size rows for
    # training, first vl_size rows for validation). They overlap when
    # train_df has fewer than tr_size + vl_size rows, and the module-level
    # valid_df is never used here -- confirm this is intended.
    train_df1 = train_df[-tr_size:]
    valid_df1 = train_df[:vl_size]
    (bst, best_iteration) = lgb_modelfit_nocv(bst,
                                              param_dict,
                                              train_df1,
                                              valid_df1,
                                              predictors,
                                              target,
                                              objective='binary',
                                              metrics='auc',
                                              early_stopping_rounds=50,
                                              verbose_eval=True,
                                              num_boost_round=1000,
                                              do_free=False,
                                              categorical_features=categorical)

    print('[{}]: model training time'.format(time.time() - start_time))
    gc.collect()

    # Size the arrays from the actual slice: it is shorter than vl_size
    # whenever train_df holds fewer rows than that.
    n_valid = len(valid_df1)
    # Predict on the feature columns only; passing the whole frame would
    # also feed the target column to the model.
    y_pred = np.reshape(bst.predict(valid_df1[predictors]), (n_valid, ))
    y_true = np.reshape(valid_df1[target].values, (n_valid, ))
    auc = dist_utils._auc(y_true, y_pred)
    print("Predict for value")
    ret = {
        # The optimiser minimises "loss", so report the AUC error.
        "loss": 1 - auc,
        "attachments": {
            "std": 0.0,
        },
        "status": STATUS_OK,
    }
    return ret
예제 #3
0
    def train_cv(self):
        """Run 4-fold cross-validation over ``self.train_file`` and save the
        out-of-fold predictions.

        For every fold a fresh classifier is created and pre-trained, then the
        training file is streamed in chunks: chunks whose ``fold_num`` equals
        the current fold are held back for validation, all others are used for
        fitting (with an evaluation pass on every second training chunk).
        The held-back chunks are stacked, predicted and scored with
        ``dist_utils._auc``.

        After all folds, the per-fold predictions are written into one array
        of ``file_size`` entries and saved as
        ``<OUTPUT_DIR>/cv_pred_fmftrl_<Ver>.csv`` with a single ``predicted``
        column.

        NOTE(review): placing predictions assumes the ``idx`` yielded by
        ``self.get_data`` is the row offset of the chunk and that the
        validation rows of a fold are contiguous -- confirm against the
        loader.
        """
        # Local import: sparse row-stacking is only needed by this method.
        from scipy.sparse import vstack

        start_time = time.time()

        nfold = 4
        file_size = 40000000
        train_preds = []
        auc_cv = [0.0 for _ in range(nfold)]
        for fold in range(nfold):
            self.create_clf()
            print("Pretrain models")
            self.pretrain()
            print("Train with file={}".format(self.train_file))
            loader = DataPiper(self.train_file,logger)
            valid_datas = []
            loops = 0
            rcount = 0
            for (idx, fold_num, X, labels, weights) in self.get_data(loader,fold=nfold,file_size=file_size):
                print("fold_num={},fold={},nfold={}".format(fold_num,fold,nfold))
                if fold_num == fold:
                    # Hold this chunk back; it is predicted after training.
                    valid_datas.append((idx,fold_num,X,labels,weights))
                    print("Add valid_datas:len={}".format(len(valid_datas)))
                    continue

                loops += 1
                rcount += len(labels)
                # Evaluate on every second training chunk BEFORE fitting on
                # it, so the score is for data the model has not seen yet.
                if loops % 2 == 0:
                    self.do_thread_execute(evaluate_batch,self.clf,X,labels,weights,do_free=False)

                print("Training", rcount, time.time() - start_time)
                self.do_thread_execute(fit_batch,self.clf,X,labels,weights)

            print("Predict for the validation data")
            print("Valid_datas:len={}".format(len(valid_datas)))
            valid_start_idx = valid_datas[0][0]
            valid_labels = [d[3] for d in valid_datas]
            valid_ds = [d[2] for d in valid_datas]
            if len(valid_labels) > 1:
                valid_labels = np.concatenate(valid_labels, axis=0)
                # Stack chunks row-wise. scipy.sparse.hstack has no ``axis``
                # argument (and would join columns); vstack is the row-wise
                # equivalent. CSR is requested on the assumption that the
                # chunks are CSR feature matrices -- confirm.
                valid_ds = vstack(valid_ds, format="csr")
            else:
                valid_labels = valid_labels[0]
                valid_ds = valid_ds[0]
            y_pred = self.do_thread_execute(predict_batch,self.clf,valid_ds)
            num = len(valid_labels)
            y_pred = np.reshape(y_pred,(num,))
            print("y_pred.shape={}".format(y_pred.shape))
            print("valid_labels.shape={}".format(valid_labels.shape))
            valid_labels = np.reshape(valid_labels,(num,))
            train_preds.append((valid_start_idx,num,y_pred))
            auc_cv[fold] = dist_utils._auc(valid_labels, y_pred)
            logger.info("      {:>3}    {:>8}    {} x {}".format(
                fold+1, np.round(auc_cv[fold],6), valid_ds.shape[0], valid_ds.shape[1]))

            #clean up
            del(valid_datas)
            del(valid_ds)
            del(valid_labels)
            gc.collect()

        # Assemble the out-of-fold predictions. Previously the array was
        # re-created per fold and never filled, so the saved file was all
        # zeros.
        all_cv_preds = np.zeros(shape=(file_size,),dtype=np.float32)
        for (pred_start, num, pred) in train_preds:
            all_cv_preds[pred_start:pred_start + num] = pred

        # Save cv result data
        fname = "%s/cv_pred_%s_%s.csv"%(config.OUTPUT_DIR, "fmftrl",Ver)
        print("Save cv predictions:{}".format(fname))
        df = pd.DataFrame({"predicted": all_cv_preds})
        df.to_csv(fname, index=False, columns=["predicted"])
예제 #4
0
    def cv(self, params_dict, sample_len=0, do_save=False):
        """Cross-validate one parameter set over the loader's train/valid folds.

        For each fold yielded by ``self.data_loader._get_train_valid_data()``
        a model is created and fitted, the validation part is predicted and
        scored with ``dist_utils._auc``, and the predictions are stored at
        ``valid_ind`` in an out-of-fold array.

        Results are stored on the instance:

        * ``self.rmse_cv_mean`` -- ``1 - mean(score)`` over the folds that ran
          (an error to minimise, despite the ``rmse`` prefix).
        * ``self.rmse_cv_std`` -- standard deviation of the fold scores.

        :param params_dict: parameters handed to ``self.learner.create_model``.
        :param sample_len: when positive, only the last ``sample_len`` training
            rows of a fold are used; unless ``do_save`` is set, only the first
            fold is evaluated in that case.
        :param do_save: when true, write the out-of-fold predictions to
            ``cv_pred.<task>.csv`` and, if test data is available, the
            fold-averaged test predictions to ``cv_test_avg_pred.<task>.csv``
            under ``config.OUTPUT_DIR``.
        :return: ``self``.
        """
        self.logger.info("CV with parms:{}".format(params_dict))
        start = time.time()
        if self.verbose:
            self.logger.info("=" * 50)
            self.logger.info("Task")
            self.logger.info("      %s" % str(self.__str__()))
            self.logger.info("Param")
            self._print_param_dict(params_dict)
            self.logger.info("Result")
            self.logger.info("      Run      RMSE        Shape")

        auc_cv = np.zeros(self.n_fold)
        total_train = self.data_loader.len_train
        train_pred = np.zeros(total_train)
        cnt = 0  # number of folds actually evaluated
        y_pred_test = None  # running SUM of test predictions across folds
        n_test_preds = 0  # number of folds contributing to y_pred_test
        X_test, y_test, W_test = self.data_loader._get_test_data()
        folds = self.data_loader._get_train_valid_data()
        for i, (X_train, y_train, X_valid, y_valid, train_ind, valid_ind,
                W_train, W_valid) in enumerate(folds):
            cnt = i + 1
            print(X_train[:10])
            print(y_train[:10])
            cur_train_len = len(X_train)
            if sample_len > 0 and cur_train_len > sample_len:
                # Keep only the trailing sample_len rows of this fold.
                X_train = X_train[-sample_len:]
                y_train = y_train[-sample_len:]

            model = self.learner.create_model(params_dict)
            # fit
            model = self.learner.fit(model, X_train, y_train)
            self.logger.info("Predict for validation data")
            y_pred = np.reshape(self.learner.predict(model, X_valid),
                                (len(X_valid), ))
            train_pred[valid_ind] = y_pred
            if do_save and X_test is not None:
                self.logger.info("Predict for test data")
                fold_test_pred = np.reshape(
                    self.learner.predict(model, X_test), (len(X_test), ))
                if y_pred_test is None:
                    y_pred_test = fold_test_pred
                else:
                    y_pred_test += fold_test_pred
                n_test_preds += 1

            print(y_valid.shape)
            print(y_pred.shape)
            auc_cv[i] = dist_utils._auc(y_valid, y_pred)
            # log
            self.logger.info("      {:>3}    {:>8}    {} x {}".format(
                i + 1, np.round(auc_cv[i], 6), X_train.shape[0],
                X_train.shape[1]))

            del (model)
            del (y_pred)
            gc.collect()

            if sample_len > 0 and not do_save:  # with sample we only do some evaluation
                break

        # save
        if do_save:
            fname = "%s/cv_pred.%s.csv" % (config.OUTPUT_DIR, self.__str__())
            self.logger.info("Save cv predictions:{}".format(fname))
            df = pd.DataFrame({"predicted": train_pred})
            df.to_csv(fname, index=False, columns=["predicted"])

            if y_pred_test is not None:
                # The file is named "avg": turn the accumulated sum into the
                # mean over the contributing folds before saving.
                y_pred_test = y_pred_test / n_test_preds
                fname = "%s/cv_test_avg_pred.%s.csv" % (config.OUTPUT_DIR,
                                                        self.__str__())
                self.logger.info("Save cv test predictions:{}".format(fname))
                df = pd.DataFrame({
                    "click_id": y_test,
                    "predicted": y_pred_test
                })
                # NOTE(review): only "predicted" is written; "click_id" is
                # built but dropped here -- confirm that is what consumers of
                # this file expect.
                df.to_csv(fname, index=False, columns=["predicted"])

        # auc_error = 1 - auc; only the folds that actually ran are counted.
        self.rmse_cv_mean = 1 - np.mean(auc_cv[:cnt])
        self.rmse_cv_std = np.std(auc_cv[:cnt])
        end = time.time()
        _sec = end - start
        _min = int(_sec / 60.)
        if self.verbose:
            self.logger.info("AUC")
            self.logger.info("      Mean: %.6f" % self.rmse_cv_mean)
            self.logger.info("      Std: %.6f" % self.rmse_cv_std)
            self.logger.info("Time")
            if _min > 0:
                self.logger.info("      %d mins" % _min)
            else:
                self.logger.info("      %d secs" % _sec)
            self.logger.info("-" * 50)
        return self
예제 #5
0
    def cv(self, params_dict, sample_len=0, do_save=False):
        """Cross-validate one parameter set over chunked training data.

        For each of ``self.n_fold`` folds a model is created and the chunks
        yielded by ``self.data_loader.get_train_data()`` are streamed: chunks
        whose ``fold_num`` equals the current fold are held back for
        validation, the others are used for fitting. The held-back chunks are
        concatenated, predicted and scored with ``dist_utils._auc``.

        Results are stored on the instance:

        * ``self.rmse_cv_mean`` -- ``1 - mean(score)`` over the folds (an
          error to minimise, despite the ``rmse`` prefix).
        * ``self.rmse_cv_std`` -- standard deviation of the fold scores.

        :param params_dict: parameters handed to ``self.learner.create_model``.
        :param sample_len: when positive, only the single training chunk on
            which the running row count crosses ``sample_len`` is fitted; all
            other training chunks are skipped.
        :param do_save: when true, write the out-of-fold predictions to
            ``cv_pred.<task>.csv`` under ``config.OUTPUT_DIR``.
        :return: ``self``.
        """
        self.logger.info("CV with parms:{}".format(params_dict))
        start = time.time()
        if self.verbose:
            self.logger.info("=" * 50)
            self.logger.info("Task")
            self.logger.info("      %s" % str(self.__str__()))
            self.logger.info("Param")
            self._print_param_dict(params_dict)
            self.logger.info("Result")
            self.logger.info("      Run      RMSE        Shape")

        def fit_chunk(model, data, labels, weights):
            # Prefer a weighted fit; fall back to an unweighted one when the
            # learner rejects the weights. ``Exception`` (not a bare except)
            # so Ctrl-C / SystemExit are not swallowed and retried.
            try:
                return self.learner.fit(model, data, labels, weights)
            except Exception:
                return self.learner.fit(model, data, labels)

        auc_cv = np.zeros(self.n_fold)
        train_preds = []
        total_train = 0
        for fold in range(self.n_fold):
            valid_datas = []
            rcount = 0
            model = self.learner.create_model(params_dict)
            #Train with train data
            for (idx, fold_num, data, labels,
                 weights) in self.data_loader.get_train_data():
                print("fold_num={},fold={}".format(fold_num, fold))
                if fold_num == fold:
                    valid_datas.append((idx, fold_num, data, labels, weights))
                else:
                    cur_train_len = len(data)
                    rcount += cur_train_len
                    if sample_len > 0:
                        if rcount > sample_len and rcount - cur_train_len < sample_len:
                            # Train with sampled data
                            model = fit_chunk(model, data, labels, weights)
                        else:
                            # Keep iterating so later validation chunks are
                            # still collected.
                            continue
                    else:
                        model = fit_chunk(model, data, labels, weights)
                del (data)
                del (labels)
                del (weights)
                gc.collect()

            self.logger.info("Predict for validation data")
            print("valid_datas[0]={}".format(valid_datas[0]))
            # NOTE(review): idx of the first validation chunk is used as the
            # row offset of this fold's predictions -- assumes the fold's
            # validation rows are contiguous; confirm against the loader.
            valid_start_idx = valid_datas[0][0]
            valid_labels = []
            valid_weights = []
            valid_ds = []
            for d in valid_datas:
                valid_labels.append(d[3])
                valid_weights.append(d[4])
                valid_ds.append(d[2])
            valid_weights = np.concatenate(valid_weights, axis=0)
            valid_labels = np.concatenate(valid_labels, axis=0)
            valid_ds = np.concatenate(valid_ds, axis=0)
            num = len(valid_ds)
            y_pred = np.reshape(self.learner.predict(model, valid_ds), (num, ))
            total_train += num
            train_preds.append((valid_start_idx, num, y_pred))
            auc_cv[fold] = dist_utils._auc(valid_labels, y_pred)
            self.logger.info("      {:>3}    {:>8}    {} x {}".format(
                fold + 1, np.round(auc_cv[fold], 6), valid_ds.shape[0],
                valid_ds.shape[1]))

            #clean up
            del (valid_datas)
            del (valid_ds)
            del (valid_labels)
            del (valid_weights)
            gc.collect()

        #Aggregate valid data
        train_pred = np.zeros(total_train)
        # The loop variable must not be called ``start``: that name holds the
        # wall-clock timestamp used for the elapsed-time report below.
        for (pred_start, num, pred) in train_preds:
            train_pred[pred_start:pred_start + num] = pred

        # save
        if do_save:
            fname = "%s/cv_pred.%s.csv" % (config.OUTPUT_DIR, self.__str__())
            self.logger.info("Save cv predictions:{}".format(fname))
            df = pd.DataFrame({"predicted": train_pred})
            df.to_csv(fname, index=False, columns=["predicted"])

        # auc_error = 1 - auc
        self.rmse_cv_mean = 1 - np.mean(auc_cv)
        self.rmse_cv_std = np.std(auc_cv)
        end = time.time()
        _sec = end - start
        _min = int(_sec / 60.)
        if self.verbose:
            self.logger.info("AUC")
            self.logger.info("      Mean: %.6f" % self.rmse_cv_mean)
            self.logger.info("      Std: %.6f" % self.rmse_cv_std)
            self.logger.info("Time")
            if _min > 0:
                self.logger.info("      %d mins" % _min)
            else:
                self.logger.info("      %d secs" % _sec)
            self.logger.info("-" * 50)
        return self
예제 #6
0
    def simple_cv(self, params_dict, sample_len=0, do_save=False):
        """Quick hold-out validation on the first training chunk.

        Takes the first chunk yielded by ``self.data_loader.get_train_data()``,
        optionally caps it at ``sample_len`` rows, fits on the leading 90% and
        scores the trailing 10% with ``dist_utils._auc``.

        Results are stored on the instance:

        * ``self.rmse_cv_mean_simple`` -- ``1 - score`` (an error to minimise,
          despite the ``rmse`` prefix).
        * ``self.rmse_cv_std_simple`` -- always ``0`` (there is one run only).

        :param params_dict: parameters handed to ``self.learner.create_model``.
        :param sample_len: when positive, use at most this many rows of the
            chunk (train and validation together).
        :param do_save: unused; kept for signature parity with ``cv``.
        :return: ``self``.
        """
        self.logger.info("CV with parms:{}".format(params_dict))
        start = time.time()
        if self.verbose:
            self.logger.info("=" * 50)
            self.logger.info("Task")
            self.logger.info("      %s" % str(self.__str__()))
            self.logger.info("Param")
            self._print_param_dict(params_dict)
            self.logger.info("Result")
            self.logger.info("      Run      RMSE        Shape")

        for (idx, fold_num, data, labels,
             weights) in self.data_loader.get_train_data():
            # Train with sampled data
            len_data = len(data)
            if sample_len > 0 and len_data > sample_len:
                len_data = sample_len
            train_len = int(len_data * 0.9)

            # Trailing 10% is the hold-out; slice it before the training
            # names are rebound to the leading 90%.
            valid_data = data[train_len:len_data]
            valid_labels = labels[train_len:len_data]
            data = data[:train_len]
            labels = labels[:train_len]
            # Slice the weights themselves (the labels were previously used
            # here by mistake, which weighted every sample by its own label).
            weights = weights[:train_len]

            model = self.learner.create_model(params_dict)
            # Prefer a weighted fit; fall back to an unweighted one when the
            # learner rejects the weights. ``Exception`` (not a bare except)
            # so Ctrl-C / SystemExit are not swallowed and retried.
            try:
                model = self.learner.fit(model, data, labels, weights)
            except Exception:
                model = self.learner.fit(model, data, labels)
            del (data)
            del (labels)
            del (weights)
            gc.collect()

            self.logger.info("Predict for validation data")
            y_pred = np.reshape(self.learner.predict(model, valid_data),
                                (len(valid_data), ))
            valid_labels = np.reshape(valid_labels, (len(valid_labels), ))
            print(y_pred.shape)
            auc_cv = dist_utils._auc(valid_labels, y_pred)
            # log
            self.logger.info("      {:>3}    {:>8}    {} x {}".format(
                1, np.round(auc_cv, 6), valid_data.shape[0],
                valid_data.shape[1]))

            del (model)
            del (y_pred)
            del (valid_data)
            del (valid_labels)
            gc.collect()

            self.rmse_cv_mean_simple = 1 - auc_cv
            self.rmse_cv_std_simple = 0
            end = time.time()
            _sec = end - start
            _min = int(_sec / 60.)
            if self.verbose:
                self.logger.info("AUC")
                # Report the values computed by THIS method; the
                # non-"_simple" attributes belong to the full cv() run and
                # may not exist yet.
                self.logger.info("      Mean: %.6f" % self.rmse_cv_mean_simple)
                self.logger.info("      Std: %.6f" % self.rmse_cv_std_simple)
                self.logger.info("Time")
                if _min > 0:
                    self.logger.info("      %d mins" % _min)
                else:
                    self.logger.info("      %d secs" % _sec)
                self.logger.info("-" * 50)
            # Only the first chunk is evaluated.
            return self

        # The loader produced no chunks: nothing was evaluated, but still
        # honour the documented return value.
        return self