def simple_validate(self, params_dict, X, y):
    """Quick parameter check on a single 90/10 train/test split.

    Trains one model with ``params_dict`` on 90% of the data, computes AUC
    on the held-out 10% and stores ``self.rmse_cv_mean_simple = 1 - AUC``
    (lower is better) and ``self.rmse_cv_std_simple = 0`` (single split).

    :param params_dict: learner parameters for ``self.learner.create_model``
    :param X: feature matrix
    :param y: labels
    :return: self (fluent interface)
    """
    self.logger.info("Simple CV with parms:{}".format(params_dict))
    start = time.time()
    if self.verbose:
        self.logger.info("=" * 50)
        self.logger.info("Task")
        self.logger.info(" %s" % str(self.__str__()))
        self.logger.info("Param")
        self._print_param_dict(params_dict)
        self.logger.info("Result")
        self.logger.info(" Run RMSE Shape")
    # fixed seed so repeated calls validate on the same split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.10, random_state=1024)
    model = self.learner.create_model(params_dict)
    # fit
    self.logger.info("Fit for train data")
    model = self.learner.fit(model, X_train, y_train)
    self.logger.info("Predict for validation data")
    y_pred = np.reshape(self.learner.predict(model, X_test), (len(X_test), ))
    print(y_test.shape)
    print(y_pred.shape)
    auc_cv = dist_utils._auc(y_test, y_pred)
    # log
    self.logger.info(" {:>3} {:>8} {} x {}".format(
        1, np.round(auc_cv, 6), X_train.shape[0], X_train.shape[1]))
    del model
    del y_pred
    gc.collect()
    # store as an error (1 - AUC) to be consistent with the cv() methods
    self.rmse_cv_mean_simple = 1 - auc_cv
    self.rmse_cv_std_simple = 0
    end = time.time()
    _sec = end - start
    _min = int(_sec / 60.)
    if self.verbose:
        self.logger.info("AUC")
        # BUG FIX: report the *_simple metrics computed above; the original
        # logged self.rmse_cv_mean / self.rmse_cv_std, which belong to the
        # full CV run and may not even exist yet (AttributeError on a
        # fresh task object).
        self.logger.info(" Mean: %.6f" % self.rmse_cv_mean_simple)
        self.logger.info(" Std: %.6f" % self.rmse_cv_std_simple)
        self.logger.info("Time")
        if _min > 0:
            self.logger.info(" %d mins" % _min)
        else:
            self.logger.info(" %d secs" % _sec)
        self.logger.info("-" * 50)
    return self
def _obj(param_dict):
    """Hyperopt objective: train one LightGBM model and return its loss.

    Uses the module-level ``train_df`` / ``valid_df`` frames: the last
    ``tr_size`` rows of ``train_df`` for fitting and the first ``vl_size``
    rows of ``valid_df`` for early stopping / evaluation.

    :param param_dict: hyperopt-sampled parameters (integer params arrive as
        floats and are fixed up by ``_convert_int_param``)
    :return: hyperopt result dict with ``loss`` = 1 - AUC (lower is better)
    """
    global trial_counter
    global train_df, valid_df
    trial_counter += 1
    param_dict = _convert_int_param(param_dict)
    print("==>Tunne with CV")
    bst = None
    start_time = time.time()
    tr_size = 10000000
    vl_size = 10000000
    train_df1 = train_df[-tr_size:]
    # BUG FIX: validate on valid_df, not train_df -- the original sliced
    # train_df here, so the model was evaluated on rows it may have been
    # trained on (valid_df was declared global but never used).
    valid_df1 = valid_df[:vl_size]
    (bst, best_iteration) = lgb_modelfit_nocv(bst,
                                              param_dict,
                                              train_df1,
                                              valid_df1,
                                              predictors,
                                              target,
                                              objective='binary',
                                              metrics='auc',
                                              early_stopping_rounds=50,
                                              verbose_eval=True,
                                              num_boost_round=1000,
                                              do_free=False,
                                              categorical_features=categorical)
    print('[{}]: model training time'.format(time.time() - start_time))
    gc.collect()
    # use the actual row count (valid_df may hold fewer than vl_size rows)
    # and give y_true / y_pred matching 1-D shapes
    num = len(valid_df1)
    y_pred = np.reshape(bst.predict(valid_df1), (num, ))
    y_true = np.reshape(valid_df1[target].values, (num, ))
    auc = dist_utils._auc(y_true, y_pred)
    print("Predict for value")
    # BUG FIX: hyperopt *minimizes* loss; returning the raw AUC would search
    # for the worst parameters. Return 1 - AUC, as the cv() methods do.
    ret = {
        "loss": 1 - auc,
        "attachments": {
            "std": 0.0,
        },
        "status": STATUS_OK,
    }
    return ret
def train_cv(self):
    """Run 4-fold out-of-fold CV with the streaming DataPiper loader.

    For each fold a fresh classifier is created and pre-trained, then fitted
    on every chunk whose fold number differs from the held-out fold; held-out
    chunks are stacked, scored with AUC, and their predictions written into a
    single out-of-fold vector that is saved to CSV at the end.
    """
    start_time = time.time()
    nfold = 4
    train_preds = []
    auc_cv = [0.0 for _ in range(nfold)]
    file_size = 40000000
    # one slot per training row, filled with out-of-fold predictions below.
    # BUG FIX: the original re-allocated this inside the fold loop and never
    # wrote into it, so the saved CSV contained only zeros.
    all_cv_preds = np.zeros(shape=(file_size, ), dtype=np.float32)
    for fold in range(nfold):
        self.create_clf()
        print("Pretrain models")
        self.pretrain()
        print("Train with file={}".format(self.train_file))
        loader = DataPiper(self.train_file, logger)
        valid_datas = []
        loops = 0
        rcount = 0
        for (idx, fold_num, X, labels, weights) in self.get_data(
                loader, fold=nfold, file_size=file_size):
            print("fold_num={},fold={},nfold={}".format(fold_num, fold, nfold))
            if fold_num == fold:
                # held-out chunk for this fold; scored after training
                valid_datas.append((idx, fold_num, X, labels, weights))
                print("Add valid_datas:len={}".format(len(valid_datas)))
                continue
            loops += 1
            rcount += len(labels)
            # every other chunk: evaluate on it before fitting, as a rough
            # progress signal
            if loops % 2 == 0:
                self.do_thread_execute(
                    evaluate_batch, self.clf, X, labels, weights,
                    do_free=False)
                print("Training", rcount, time.time() - start_time)
            self.do_thread_execute(fit_batch, self.clf, X, labels, weights)
        print("Predict for the validation data")
        print("Valid_datas:len={}".format(len(valid_datas)))
        valid_start_idx = valid_datas[0][0]
        valid_labels = []
        valid_weights = []
        valid_ds = []
        for d in valid_datas:
            valid_labels.append(d[3])
            valid_weights.append(d[4])
            valid_ds.append(d[2])
        num = len(valid_labels)
        if num > 1:
            valid_weights = np.concatenate(valid_weights, axis=0)
            valid_labels = np.concatenate(valid_labels, axis=0)
            from scipy.sparse import vstack
            # BUG FIX: chunks are rows, so sparse stacking must be vstack;
            # the original called hstack(valid_ds, axis=0), which stacks the
            # wrong way and raises TypeError (hstack has no axis argument).
            valid_ds = vstack(valid_ds)
        else:
            valid_labels = valid_labels[0]
            valid_weights = valid_weights[0]
            valid_ds = valid_ds[0]
        y_pred = self.do_thread_execute(predict_batch, self.clf, valid_ds)
        num = len(valid_labels)
        y_pred = np.reshape(y_pred, (num, ))
        print("y_pred.shape={}".format(y_pred.shape))
        print("valid_labels.shape={}".format(valid_labels.shape))
        valid_labels = np.reshape(valid_labels, (num, ))
        train_preds.append((valid_start_idx, num, y_pred))
        # BUG FIX: actually record this fold's out-of-fold predictions so
        # the CSV written below contains them
        all_cv_preds[valid_start_idx:valid_start_idx + num] = y_pred
        auc_cv[fold] = dist_utils._auc(valid_labels, y_pred)
        logger.info(" {:>3} {:>8} {} x {}".format(
            fold + 1, np.round(auc_cv[fold], 6), valid_ds.shape[0],
            valid_ds.shape[1]))
        # clean up
        del valid_datas
        del valid_ds
        del valid_labels
        del valid_weights
        gc.collect()
    # Save cv result data
    fname = "%s/cv_pred_%s_%s.csv" % (config.OUTPUT_DIR, "fmftrl", Ver)
    print("Save cv predictions:{}".format(fname))
    df = pd.DataFrame({"predicted": all_cv_preds})
    df.to_csv(fname, index=False, columns=["predicted"])
def cv(self, params_dict, sample_len=0, do_save=False):
    """K-fold cross-validation over the in-memory data loader.

    Trains a fresh model per fold, stores its held-out-fold predictions in an
    out-of-fold vector, and (when ``do_save``) writes the out-of-fold and
    fold-averaged test predictions to CSV. Sets ``self.rmse_cv_mean`` /
    ``self.rmse_cv_std`` as 1 - AUC statistics (lower is better).

    :param params_dict: learner parameters for ``self.learner.create_model``
    :param sample_len: if > 0, cap each fold's training data at the last
        ``sample_len`` rows; without ``do_save`` only the first fold is run
        (quick-evaluation mode)
    :param do_save: write prediction CSVs
    :return: self (fluent interface)
    """
    self.logger.info("CV with parms:{}".format(params_dict))
    start = time.time()
    if self.verbose:
        self.logger.info("=" * 50)
        self.logger.info("Task")
        self.logger.info(" %s" % str(self.__str__()))
        self.logger.info("Param")
        self._print_param_dict(params_dict)
        self.logger.info("Result")
        self.logger.info(" Run RMSE Shape")
    auc_cv = np.zeros(self.n_fold)
    total_train = self.data_loader.len_train
    train_pred = np.zeros(total_train)
    i = -1
    cnt = 0
    y_pred_test = None
    X_test, y_test, W_test = self.data_loader._get_test_data()
    for (X_train, y_train, X_valid, y_valid, train_ind, valid_ind, W_train,
         W_valid) in self.data_loader._get_train_valid_data():
        i += 1
        cnt += 1
        print(X_train[:10])
        print(y_train[:10])
        cur_train_len = len(X_train)
        if sample_len > 0 and cur_train_len > sample_len:
            # keep the most recent rows -- assumes the loader yields
            # time-ordered data; TODO confirm
            X_train = X_train[-sample_len:]
            y_train = y_train[-sample_len:]
        model = self.learner.create_model(params_dict)
        # fit
        model = self.learner.fit(model, X_train, y_train)
        self.logger.info("Predict for validation data")
        y_pred = np.reshape(self.learner.predict(model, X_valid),
                            (len(X_valid), ))
        train_pred[valid_ind] = y_pred
        if do_save and X_test is not None:
            self.logger.info("Predict for test data")
            # accumulate per-fold test predictions; averaged after the loop
            if y_pred_test is None:
                y_pred_test = np.reshape(
                    self.learner.predict(model, X_test), (len(X_test), ))
            else:
                y_pred_test += np.reshape(
                    self.learner.predict(model, X_test), (len(X_test), ))
        print(y_valid.shape)
        print(y_pred.shape)
        auc_cv[i] = dist_utils._auc(y_valid, y_pred)
        # log
        self.logger.info(" {:>3} {:>8} {} x {}".format(
            i + 1, np.round(auc_cv[i], 6), X_train.shape[0],
            X_train.shape[1]))
        del model
        del y_pred
        gc.collect()
        if sample_len > 0 and do_save == False:
            # with sample we only do some evaluation
            break
    # save
    if do_save:
        fname = "%s/cv_pred.%s.csv" % (config.OUTPUT_DIR, self.__str__())
        self.logger.info("Save cv predictions:{}".format(fname))
        df = pd.DataFrame({"predicted": train_pred})
        df.to_csv(fname, index=False, columns=["predicted"])
        if y_pred_test is not None:
            # BUG FIX: the file is named *avg* but the original saved the
            # per-fold *sum*; divide by the number of folds actually run.
            y_pred_test /= cnt
            fname = "%s/cv_test_avg_pred.%s.csv" % (config.OUTPUT_DIR,
                                                    self.__str__())
            self.logger.info("Save cv test predictions:{}".format(fname))
            df = pd.DataFrame({
                "click_id": y_test,
                "predicted": y_pred_test
            })
            # NOTE(review): only "predicted" is written; click_id is dropped
            # by columns=[...] -- confirm whether it should be saved too.
            df.to_csv(fname, index=False, columns=["predicted"])
    # auc_error = 1 - auc; only the folds actually run (cnt) are counted
    self.rmse_cv_mean = 1 - np.mean(auc_cv[:cnt])
    self.rmse_cv_std = np.std(auc_cv[:cnt])
    end = time.time()
    _sec = end - start
    _min = int(_sec / 60.)
    if self.verbose:
        self.logger.info("AUC")
        self.logger.info(" Mean: %.6f" % self.rmse_cv_mean)
        self.logger.info(" Std: %.6f" % self.rmse_cv_std)
        self.logger.info("Time")
        if _min > 0:
            self.logger.info(" %d mins" % _min)
        else:
            self.logger.info(" %d secs" % _sec)
        self.logger.info("-" * 50)
    return self
def cv(self, params_dict, sample_len=0, do_save=False):
    """K-fold cross-validation over the chunked/streaming data loader.

    Chunks whose ``fold_num`` equals the current fold are held out; all other
    chunks feed incremental ``fit``. Held-out chunks are concatenated, scored
    with AUC, and their predictions assembled into one out-of-fold vector.
    Sets ``self.rmse_cv_mean`` / ``self.rmse_cv_std`` (1 - AUC statistics).

    :param params_dict: learner parameters for ``self.learner.create_model``
    :param sample_len: if > 0, restrict training (see NOTE in the loop)
    :param do_save: write out-of-fold predictions to CSV
    :return: self (fluent interface)
    """
    self.logger.info("CV with parms:{}".format(params_dict))
    start = time.time()
    if self.verbose:
        self.logger.info("=" * 50)
        self.logger.info("Task")
        self.logger.info(" %s" % str(self.__str__()))
        self.logger.info("Param")
        self._print_param_dict(params_dict)
        self.logger.info("Result")
        self.logger.info(" Run RMSE Shape")
    auc_cv = np.zeros(self.n_fold)
    train_preds = []
    total_train = 0
    for fold in range(self.n_fold):
        valid_datas = []
        rcount = 0
        model = self.learner.create_model(params_dict)
        # Train with train data
        for (idx, fold_num, data, labels,
             weights) in self.data_loader.get_train_data():
            print("fold_num={},fold={}".format(fold_num, fold))
            if fold_num == fold:
                valid_datas.append((idx, fold_num, data, labels, weights))
            else:
                cur_train_len = len(data)
                rcount += cur_train_len
                if sample_len > 0:
                    # NOTE(review): this trains only on the single chunk that
                    # crosses the sample_len row boundary and skips all other
                    # chunks -- confirm that is the intended sampling.
                    if rcount > sample_len and \
                            rcount - cur_train_len < sample_len:
                        # some learners' fit() does not accept sample
                        # weights; fall back to the unweighted call.
                        # (was a bare except:, which also swallowed
                        # KeyboardInterrupt/SystemExit)
                        try:
                            model = self.learner.fit(
                                model, data, labels, weights)
                        except Exception:
                            model = self.learner.fit(model, data, labels)
                    else:
                        # keep scanning so validation chunks are still
                        # collected
                        continue
                else:
                    try:
                        model = self.learner.fit(model, data, labels, weights)
                    except Exception:
                        model = self.learner.fit(model, data, labels)
                del data
                del labels
                del weights
                gc.collect()
        self.logger.info("Predict for validation data")
        print("valid_datas[0]={}".format(valid_datas[0]))
        valid_start_idx = valid_datas[0][0]
        valid_labels = []
        valid_weights = []
        valid_ds = []
        for d in valid_datas:
            valid_labels.append(d[3])
            valid_weights.append(d[4])
            valid_ds.append(d[2])
        valid_weights = np.concatenate(valid_weights, axis=0)
        valid_labels = np.concatenate(valid_labels, axis=0)
        valid_ds = np.concatenate(valid_ds, axis=0)
        num = len(valid_ds)
        y_pred = np.reshape(self.learner.predict(model, valid_ds), (num, ))
        total_train += num
        train_preds.append((valid_start_idx, num, y_pred))
        auc_cv[fold] = dist_utils._auc(valid_labels, y_pred)
        self.logger.info(" {:>3} {:>8} {} x {}".format(
            fold + 1, np.round(auc_cv[fold], 6), valid_ds.shape[0],
            valid_ds.shape[1]))
        # clean up
        del valid_datas
        del valid_ds
        del valid_labels
        del valid_weights
        gc.collect()
    # Aggregate the fold predictions into one out-of-fold vector.
    # BUG FIX: the original unpacked into a variable named `start`, which
    # clobbered the timing timestamp taken at the top of the method and made
    # the elapsed-time report below meaningless.
    train_pred = np.zeros(total_train)
    for (start_idx, num, pred) in train_preds:
        train_pred[start_idx:start_idx + num] = pred
    # save
    if do_save:
        fname = "%s/cv_pred.%s.csv" % (config.OUTPUT_DIR, self.__str__())
        self.logger.info("Save cv predictions:{}".format(fname))
        df = pd.DataFrame({"predicted": train_pred})
        df.to_csv(fname, index=False, columns=["predicted"])
    # auc_error = 1 - auc
    self.rmse_cv_mean = 1 - np.mean(auc_cv)
    self.rmse_cv_std = np.std(auc_cv)
    end = time.time()
    _sec = end - start
    _min = int(_sec / 60.)
    if self.verbose:
        self.logger.info("AUC")
        self.logger.info(" Mean: %.6f" % self.rmse_cv_mean)
        self.logger.info(" Std: %.6f" % self.rmse_cv_std)
        self.logger.info("Time")
        if _min > 0:
            self.logger.info(" %d mins" % _min)
        else:
            self.logger.info(" %d secs" % _sec)
        self.logger.info("-" * 50)
    return self
def simple_cv(self, params_dict, sample_len=0, do_save=False):
    """Simpler cross-validation for selecting the best parameters.

    For each chunk from the loader, trains on the first 90% of the
    (optionally capped) rows and validates on the last 10%, then stores
    ``self.rmse_cv_mean_simple = 1 - AUC`` and ``self.rmse_cv_std_simple = 0``.

    :param params_dict: learner parameters for ``self.learner.create_model``
    :param sample_len: if > 0, cap the rows considered per chunk
    :param do_save: unused here; kept for signature parity with cv()
    :return: self (fluent interface)
    """
    self.logger.info("CV with parms:{}".format(params_dict))
    start = time.time()
    if self.verbose:
        self.logger.info("=" * 50)
        self.logger.info("Task")
        self.logger.info(" %s" % str(self.__str__()))
        self.logger.info("Param")
        self._print_param_dict(params_dict)
        self.logger.info("Result")
        self.logger.info(" Run RMSE Shape")
    for (idx, fold_num, data, labels,
         weights) in self.data_loader.get_train_data():
        # Train with sampled data
        len_data = len(data)
        if sample_len > 0 and len_data > sample_len:
            len_data = sample_len
        train_len = int(len_data * 0.9)
        valid_data = data[train_len:len_data]
        valid_labels = labels[train_len:len_data]
        valid_weights = weights[train_len:len_data]
        data = data[:train_len]
        labels = labels[:train_len]
        # BUG FIX: the original assigned labels[:train_len] here, silently
        # training with the labels as sample weights.
        weights = weights[:train_len]
        model = self.learner.create_model(params_dict)
        # some learners' fit() does not accept sample weights; fall back to
        # the unweighted call (was a bare except:)
        try:
            model = self.learner.fit(model, data, labels, weights)
        except Exception:
            model = self.learner.fit(model, data, labels)
        del data
        del labels
        del weights
        gc.collect()
        self.logger.info("Predict for validation data")
        y_pred = np.reshape(self.learner.predict(model, valid_data),
                            (len(valid_data), ))
        valid_labels = np.reshape(valid_labels, (len(valid_labels), ))
        print(y_pred.shape)
        auc_cv = dist_utils._auc(valid_labels, y_pred)
        # log
        self.logger.info(" {:>3} {:>8} {} x {}".format(
            1, np.round(auc_cv, 6), valid_data.shape[0],
            valid_data.shape[1]))
        del model
        del y_pred
        del valid_data
        del valid_weights
        del valid_labels
        gc.collect()
        # overwritten on every chunk, so the last chunk's score wins --
        # NOTE(review): confirm whether a break after the first chunk was
        # intended.
        self.rmse_cv_mean_simple = 1 - auc_cv
        self.rmse_cv_std_simple = 0
    end = time.time()
    _sec = end - start
    _min = int(_sec / 60.)
    if self.verbose:
        self.logger.info("AUC")
        # BUG FIX: report the *_simple metrics this method computes; the
        # original logged self.rmse_cv_mean / self.rmse_cv_std from the
        # full CV run.
        self.logger.info(" Mean: %.6f" % self.rmse_cv_mean_simple)
        self.logger.info(" Std: %.6f" % self.rmse_cv_std_simple)
        self.logger.info("Time")
        if _min > 0:
            self.logger.info(" %d mins" % _min)
        else:
            self.logger.info(" %d secs" % _sec)
        self.logger.info("-" * 50)
    return self