def fit(self, X, y):
    """Train a bag of LightGBM boosters on repeated subsamples of the data.

    Hyperparameters are searched once through ``self._hyperopt`` on a
    subsample obtained from ``sample(X, y, int(len(X) * 0.4))``. Up to 350
    boosters are then trained, each on a fresh subsample that is split 70/30
    into train/validation sets, and each booster is appended to
    ``self.model``. The loop stops as soon as 20% or less of
    ``self.time_budget`` is left.

    Args:
        X: Feature table; must support ``len``.
        y: Labels matching ``X``.

    Returns:
        ``self``, to allow call chaining.
    """
    base_params = {
        "objective": "binary",
        "metric": "auc",
        "verbosity": -1,
        "seed": 1,
        "num_threads": 4,
    }

    # NOTE(review): ``sample`` is defined outside this block; the third
    # argument is presumably the number of rows to draw -- confirm.
    num = int(len(X) * 0.4)
    X_sample, y_sample = sample(X, y, num)
    tuned = self._hyperopt(X_sample, y_sample, base_params)

    for _ in range(350):
        remain_time = self.time_budget - (time.time() - self.start_time)
        log(f"Remain time: {self.time_budget - (time.time() - self.start_time)}")
        # Keep the last 20% of the budget in reserve.
        if remain_time / self.time_budget <= 0.2:
            break

        X_sample, y_sample = sample(X, y, num)
        # A different split seed per booster, drawn from the global RNG.
        split_seed = random.sample(range(0, 2000), 1)[0]
        X_train, X_val, y_train, y_val = train_test_split(
            X_sample,
            y_sample,
            test_size=0.3,
            random_state=split_seed,
        )

        train_data = lgb.Dataset(X_train, label=y_train)
        valid_data = lgb.Dataset(X_val, label=y_val)
        booster = lgb.train(
            {**base_params, **tuned},
            train_data,
            num_boost_round=500,
            valid_sets=[train_data, valid_data],
            early_stopping_rounds=10,
            verbose_eval=100,
        )
        self.model.append(booster)

    return self
def fit(self, X, y):
    """Tune hyperparameters on a subsample, then train one LightGBM booster.

    ``self._hyperopt`` is run on the result of ``sample(X, y, 30000)``. The
    final booster is trained on a fixed-seed 90/10 train/validation split of
    the full ``X``/``y`` with up to 500 boosting rounds and early stopping
    after 30 rounds, and stored in ``self.model``.

    Args:
        X: Feature table.
        y: Labels matching ``X``.

    Returns:
        ``self``, to allow call chaining.
    """
    base_params = {
        "objective": "binary",
        "metric": "auc",
        "verbosity": -1,
        "seed": 1,
        "num_threads": 4,
    }

    # Search on a subsample; ``sample`` is defined outside this block.
    X_small, y_small = sample(X, y, 30000)
    tuned = self._hyperopt(X_small, y_small, base_params)

    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.1, random_state=1)
    train_set = lgb.Dataset(X_train, label=y_train)
    valid_set = lgb.Dataset(X_val, label=y_val)

    merged_params = {**base_params, **tuned}
    self.model = lgb.train(
        merged_params,
        train_set,
        500,
        valid_set,
        early_stopping_rounds=30,
        verbose_eval=100,
    )
    return self
def class_sample(X, y, pos_num, neg_num, seed=2019):
    """Subsample the positive and negative rows of ``X`` separately.

    Rows where ``y == 1`` and rows where ``y != 1`` are each passed to the
    external ``sample`` helper with a fraction computed as the requested
    count divided by the class size. The two results are concatenated and
    passed through ``sample`` once more together with ``y``.

    Args:
        X: Feature table that can be indexed with a boolean mask.
        y: Labels aligned with ``X``; ``1`` marks the positive class.
        pos_num: Requested number of positive rows.
        neg_num: Requested number of negative rows.
        seed: Seed forwarded to every ``sample`` call.

    Returns:
        The ``(X, y)`` pair returned by the final ``sample`` call.
    """
    positive_count = float((y == 1).sum())
    negative_count = len(y) - positive_count

    # Both fractions are computed before any sampling takes place.
    pos_frac = pos_num / positive_count
    neg_frac = neg_num / negative_count

    # NOTE(review): ``sample`` is defined outside this block; here it is
    # called as (frame, fraction, seed[, labels]) -- confirm its contract.
    positives = sample(X[y == 1], pos_frac, seed)
    negatives = sample(X[y != 1], neg_frac, seed)

    combined = pd.concat([positives, negatives])
    combined, labels = sample(combined, 1, seed, y)
    return combined, labels
def fit(self, X, y, time_remain):
    """Train up to ``self.iter`` LightGBM boosters within a time budget.

    Every round negative-samples the data with the round number as seed,
    draws a subsample via ``sample(..., 30000)``, tunes hyperparameters,
    and trains a 100-round booster whose per-round learning rates come from
    ``get_log_lr``. Boosters go to ``self.models`` and their validation AUCs
    to ``self.auc``. The loop ends early when the remaining budget is no
    more than three times the average round duration.

    Afterwards ``self.model_sorted`` holds ``(model, auc)`` pairs sorted by
    descending AUC when ``CONSTANT.IF_SORT_VALID_AUC`` is truthy, otherwise
    ``(model, 0)`` pairs in training order.

    Args:
        X: Feature table; iterating it must yield column names.
        y: Labels matching ``X``.
        time_remain: Time budget, in the units of ``time.time()``.

    Returns:
        ``self``, to allow call chaining.
    """
    # Keep only the raw columns that are still present in X.
    self.raw_cols = list(set(self.raw_cols).intersection(list(X)))

    base_params = {
        "objective": "binary",
        "metric": "auc",
        "verbosity": -1,
        "seed": 1,
        "num_threads": 4,
    }

    budget = time_remain
    round_no = 1
    train_start = time.time()
    self.auc = []

    while round_no <= self.iter:
        round_start = time.time()
        print(round_no, budget)

        x_neg, y_neg = self._negative_sample(X, y, round_no)
        X_sample, y_sample = sample(x_neg, y_neg, 30000)
        hyperparams = self._hyperopt(X_sample, y_sample, base_params)
        X_train, X_val, y_train, y_val = train_test_split(
            X_sample, y_sample, test_size=0.2)

        train_data = lgb.Dataset(X_train, label=y_train)
        valid_data = lgb.Dataset(X_val, label=y_val)

        # The tuned learning rate is not passed as a plain parameter; it
        # only seeds the per-round schedule handed to ``learning_rates``.
        static_hyperparams = {
            key: hyperparams[key]
            for key in hyperparams
            if key != "learning_rate"
        }
        tuned_lr = hyperparams["learning_rate"]
        lr_schedule = get_log_lr(100, tuned_lr * 5, tuned_lr * 0.9)

        booster = lgb.train(
            {**base_params, **static_hyperparams},
            train_data,
            100,
            valid_data,
            learning_rates=lr_schedule,
            early_stopping_rounds=90,
            verbose_eval=100,
        )
        print_feature_importance(booster)
        self.models.append(booster)
        self.auc.append(booster.best_score["valid_0"]["auc"])

        # Average duration of the rounds completed so far.
        single_round = (time.time() - train_start) / round_no
        print(single_round)
        budget -= (time.time() - round_start)
        if budget <= single_round * 3:
            break
        round_no += 1

    print([m.best_iteration for m in self.models])
    print(self.auc)

    if CONSTANT.IF_SORT_VALID_AUC:
        self.model_sorted = sorted(
            zip(self.models, self.auc),
            key=lambda pair: pair[1],
            reverse=True,
        )
    else:
        self.model_sorted = [(booster, 0) for booster in self.models]
    return self
def main():
    """Build, print and plot a learning curve for the decision tree.

    The dataset named by ``get_filenames().dataset`` is parsed once. For each
    training proportion in ``np.arange(0.1, 0.99, 0.05)`` the data is split
    with ``sample``, a tree is built and pruned with ``MIN_GAIN``, and the
    share of correctly classified test records is recorded as "precision".
    Summary statistics are printed and the curve is saved to
    ``get_filenames().curve``.

    The ``print`` calls take a single pre-formatted string so that the output
    is identical under Python 2 and Python 3.
    """
    files = get_filenames()
    x, y = [], []

    # Materialise the records inside the ``with`` so the handle is closed
    # deterministically (the old ``file(...)`` call was never closed and the
    # ``file`` builtin does not exist on Python 3).
    with open(files.dataset) as dataset:
        data = list(parse(dataset))

    # generate the learning curve data
    for train_prop in np.arange(0.1, 0.99, 0.05):
        training_set, testing_set = sample(data, train_prop)
        if len(testing_set) == 0:
            # Precision is undefined without test records; skipping avoids a
            # ZeroDivisionError at high training proportions on small data.
            print('Training set sampling probability = %.2f: '
                  'empty test set, skipped' % (train_prop,))
            continue

        tree = build_tree(training_set).prune(MIN_GAIN)
        check = [record[RESULT_IDX] == plurality(tree.classify(record))
                 for record in testing_set]
        counter = Counter(check)
        precision = counter[True] / float(counter[True] + counter[False])

        print('Training set sampling probability = %.2f:' % (train_prop,))
        print('training data size = %d, test data size = %d, precision = %.4f'
              % (len(training_set), len(testing_set), precision))
        x.append(len(training_set))
        y.append(precision)

    # statistics
    ymean, ystd, ymin, ymax = np.mean(y), np.std(y), np.min(y), np.max(y)
    print('Mean of precision = %.4f' % (ymean,))
    print('Standard deviation of precision = %.4f' % (ystd,))
    print('Min = %.4f, max = %.4f' % (ymin, ymax))

    # Sort the points by training-set size before interpolating.
    xy = sorted(zip(x, y), key=lambda a: a[0])
    x, y = zip(*xy)

    # setup decorations
    plt.rc('font', family='serif')
    plt.yticks(np.arange(0.0, 1.0, 0.1))
    plt.ylim(0.0, 1.0)
    plt.grid(True)
    plt.title('Learning Curve')
    plt.xlabel('Training set size')
    plt.ylabel('Precision on test set')

    # plot smoothed learning curve
    xnew = np.linspace(np.min(x), np.max(x), 100)
    ynew = interp1d(x, y)(xnew)
    plt.plot(x, y, '.', xnew, ynew, '--')

    # annotation -- raw strings keep the LaTeX backslashes without relying
    # on invalid escape sequences such as "\m" and "\s".
    box = dict(boxstyle='square', fc="w", ec="k")
    txt = r'$\mu = %.4f$, $\sigma = %.4f$' % (ymean, ystd)
    txt += r', $min = %.4f$, $max = %.4f$' % (ymin, ymax)
    # NOTE(review): (170, 0.05) is a fixed data-coordinate position and
    # presumably suits one particular dataset size -- confirm.
    plt.text(170, 0.05, txt, bbox=box)

    plt.savefig(files.curve)
    print('Save learning curve to %s' % (files.curve,))
def fit(self, X, y):
    """Train ``self.iter * self.multiplier`` LightGBM boosters.

    ``self.multiplier`` is derived from ``self.train_time_budget`` and a cost
    estimate based on a fixed sample size of 40000 times the number of
    columns of ``X``; it is never lower than 1. Every round negative-samples
    the data, draws a subsample through ``sample``, tunes hyperparameters and
    trains a booster on a fixed-seed 85/15 split; boosters are appended to
    ``self.models``.

    Args:
        X: Feature table exposing ``shape``.
        y: Labels matching ``X``.

    Returns:
        ``self``, to allow call chaining.
    """
    base_params = {
        "objective": "binary",
        "metric": "auc",
        "verbosity": -1,
        "seed": 1,
        "num_threads": 4,
    }

    sample_size = 40000
    dims = sample_size * X.shape[1]
    print(dims)
    print(self.train_time_budget)
    print(X.shape)

    # More time budget relative to the estimated per-round cost means more
    # rounds; always run at least one multiple.
    estimated_cost = dims**0.65 * np.log(dims)
    scaled = int(self.train_time_budget * 1000 / estimated_cost)
    self.multiplier = max(scaled, 1)
    print(self.multiplier)

    total_rounds = self.iter * self.multiplier
    for _ in range(total_rounds):
        x_neg, y_neg = self._negative_sample(X, y)
        X_sample, y_sample = sample(x_neg, y_neg, sample_size)
        tuned = self._hyperopt(X_sample, y_sample, base_params)

        X_train, X_val, y_train, y_val = train_test_split(
            X_sample, y_sample, test_size=0.15, random_state=1)
        train_data = lgb.Dataset(X_train, label=y_train)
        valid_data = lgb.Dataset(X_val, label=y_val)

        booster = lgb.train(
            {**base_params, **tuned},
            train_data,
            400,
            valid_data,
            early_stopping_rounds=20,
            verbose_eval=0,
        )
        self.models.append(booster)

    return self
def fit(self, X, y):
    """Train one LightGBM booster whose round limits scale with data size.

    ``self.multiplier`` is computed from the number of cells in ``X`` and
    clamped to the range 1..50. Hyperparameters are tuned on the result of
    ``sample(X, y, len(X))``; the booster is trained on a fixed-seed 85/15
    split of ``X``/``y``. The boosting-round limit (500) and the
    early-stopping patience (20) are both multiplied by
    ``max(int(self.multiplier / 4), 1)``. The booster is stored in
    ``self.model``.

    Args:
        X: Feature table exposing ``shape`` and ``len``.
        y: Labels matching ``X``.

    Returns:
        ``self``, to allow call chaining.
    """
    base_params = {
        "objective": "binary",
        "metric": "auc",
        "verbosity": -1,
        "seed": 1,
        "num_threads": 4,
    }

    n_rows, n_cols = X.shape[0], X.shape[1]
    dims = n_rows * n_cols
    raw_multiplier = int(1000 * 100000 / (dims * np.log(dims)))
    self.multiplier = min(max(raw_multiplier, 1), 50)
    print(self.multiplier)

    X_sample, y_sample = sample(X, y, len(X))
    tuned = self._hyperopt(X_sample, y_sample, base_params)

    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.15, random_state=1)
    train_data = lgb.Dataset(X_train, label=y_train)
    valid_data = lgb.Dataset(X_val, label=y_val)

    # One shared scale factor for the round limit and the patience.
    round_scale = max(int(self.multiplier / 4), 1)
    self.model = lgb.train(
        {**base_params, **tuned},
        train_data,
        500 * round_scale,
        valid_data,
        early_stopping_rounds=20 * round_scale,
        verbose_eval=0,
    )
    return self
def clean_labels(X: pd.DataFrame,
                 y,
                 count_start,
                 pulearning=None,
                 strategy="cut",
                 round=0,
                 early_stop=False):
    """Iteratively drop rows that a noisy-label learner flags as mislabeled.

    Runs ``round + 1`` passes with seeds ``count_start .. count_start + round``.
    Each pass tunes LightGBM hyperparameters on a subsample, fits a
    ``LearningWithNoisyLabels`` model on the selected columns of the current
    ``X`` and removes the rows it flags. A pass that raises is reported with
    ``print`` and skipped, so cleaning is best-effort.

    Args:
        X: Feature frame. Only columns named ``c_<x>`` or ``n_<x>`` (exactly
            one underscore) are fed to the models; all columns are returned.
        y: Labels aligned with ``X``; ``1`` is treated as the positive class.
        count_start: Seed of the first pass.
        pulearning: Forwarded to ``LearningWithNoisyLabels``.
        strategy: Accepted for interface compatibility; every value removes
            the flagged rows (the original branches were identical).
        round: Number of additional passes after the first one. The name
            shadows the builtin but is part of the public signature.
        early_stop: When true, stop before pruning once the first-pass
            noise rates sum to at most 0.9.

    Returns:
        ``(X, y, rou_0 + rou_1, rou_0, rou_1)`` where ``rou_0`` and ``rou_1``
        are ``noise_matrix[1, 0]`` and ``noise_matrix[0, 1]`` from the first
        pass. They are NaN when the first pass failed.
    """
    from preprocess import sample

    count = count_start
    cols = [
        c for c in X
        if len(c.split("_")) == 2 and c.startswith(("c_", "n_"))
    ]
    print(cols)

    # Bind the noise rates up front: previously a failure in the first pass
    # left them undefined and the final ``return`` raised UnboundLocalError.
    rou_0 = rou_1 = float("nan")

    while count <= count_start + round:
        try:
            params = {
                "objective": "binary",
                "metric": "auc",
                "verbosity": -1,
                "seed": count,
                "num_threads": 4,
                "num_boost_round": 50,
            }
            X_sample, y_sample = sample(X[cols], y, 30000, random_state=count)
            hyperparams = _hyperopt(X_sample, y_sample, params,
                                    random_state=count)

            model = LearningWithNoisyLabels(
                lgb.LGBMClassifier(**hyperparams, **params),
                seed=count,
                cv_n_folds=5,
                prune_method="both",
                converge_latent_estimates=True,
                pulearning=pulearning)
            print(X.shape, len(y))

            # NOTE(review): this ``fit`` is unpacked into five values, so it
            # is presumably a project-specific variant -- confirm.
            (noisy, noise_matrix, inverse_noise_matrix, confident_joint,
             psx) = model.fit(X[cols].values, 1 * (y.values == 1),
                              thresholds=None)

            # Only the first pass defines the reported noise rates.
            if count == count_start:
                rou_0 = noise_matrix[1, 0]
                rou_1 = noise_matrix[0, 1]
                print(rou_0, rou_1)

            # A NaN sum compares false, so unknown rates never stop early.
            if early_stop and rou_0 + rou_1 <= 0.9:
                break
            if len(noisy) <= 0:
                break

            print(sum(1 for flag in noisy if flag))
            keep = ~noisy
            X = X[keep]
            y = y[keep]
        except Exception as exp:
            # Best-effort: report the failure and move on to the next seed.
            print("error:", exp)
        finally:
            count += 1

    return X, y, rou_0 + rou_1, rou_0, rou_1
def fit(self, X_, y_, time_remain):
    """Train up to ``self.iter + 1`` LightGBM regressors within a time budget.

    One round is run for every seed from 2019 to ``2019 + self.iter``. A round
    draws ``int(len(X_) * 0.75)`` via ``sample``, passes the result through
    ``clean_labels``, tunes hyperparameters, and trains a booster on a 70/30
    split. The booster is appended to ``self.models`` and its best iteration
    to ``self.best_iter``; ``self.model`` holds the latest booster.

    A failing round is reported with ``print`` and skipped. The duration of
    the first round (successful or not) is the reference cost: the loop stops
    when the remaining budget is at most 3x that cost if the round took less
    than 20% of ``time_remain``, otherwise at most 1.5x.

    Args:
        X_: Feature table; must support ``len``.
        y_: Targets matching ``X_``.
        time_remain: Time budget, in the units of ``time.time()``.

    Returns:
        ``self``, to allow call chaining.
    """
    first_seed = 2019
    start_fit = time.time()
    print(f"fix label use:{time.time()-start_fit}")
    budget = time_remain - (time.time() - start_fit)
    print(len(X_))

    single_round = None  # duration of the first round, set exactly once
    for seed in range(first_seed, first_seed + self.iter + 1):
        print(seed, budget)
        round_start = time.time()
        self.hyper_seed = seed
        try:
            params = {
                "objective": "regression",
                "metric": "rmse",
                "verbosity": -1,
                "seed": self.hyper_seed,
                "num_threads": 4,
            }
            X, y = sample(X_, y_, int(len(X_) * 0.75),
                          random_state=self.hyper_seed)
            X, y_tag, y = clean_labels(X, y)
            hyperparams = self._hyperopt(X, y, params,
                                         random_state=self.hyper_seed)
            X_train, X_val, y_train, y_val = train_test_split(
                X, y, test_size=0.3, random_state=self.hyper_seed)
            train_data = lgb.Dataset(X_train, label=y_train)
            valid_data = lgb.Dataset(X_val, label=y_val)
            self.model = lgb.train({**params, **hyperparams},
                                   train_data,
                                   500,
                                   valid_data,
                                   early_stopping_rounds=30,
                                   verbose_eval=100)
            print_feature_importance(self.model)
            self.best_iter.append(self.model.best_iteration)
            self.models.append(self.model)
        except Exception as exc:
            # Best-effort: a failed round must not abort the remaining ones,
            # but KeyboardInterrupt/SystemExit are no longer swallowed.
            print("error:", exc)

        # Shared bookkeeping for successful and failed rounds. The old
        # handler tested ``SEED == 1``, which never matched, so a failing
        # first round raised NameError on ``single_round``.
        elapsed = time.time() - round_start
        if single_round is None:
            single_round = elapsed
        budget -= elapsed

        if time_remain > 0:
            cheap_round = single_round / time_remain < 0.2
        else:
            cheap_round = False
        safety_factor = 3 if cheap_round else 1.5
        if budget <= single_round * safety_factor:
            break

    print(self.best_iter)
    return self
model.fit(X, y) print model # save prediction X_test = preprocess.precondition('shuffle_data_test.txt', 0) prediction = model.predict_proba(X_test)[:, 1] # select the column of probabilities of 1 (click response) np.savetxt("prediction_complete_best.csv", np.dstack((np.arange(1, prediction.size + 1), prediction))[0], "%d,%f", header="Id,Prediction") end = time.time() # end timing print("\ntotal training time: %d seconds" % (end - start)) if __name__ == '__main__': # preprocess and split data X, y = preprocess.precondition('data_train.txt', 1) X_train, X_test, y_train, y_test = preprocess.sample(X, y) # user may choose to load previously preprocessed data # print 'loading preprocessed data...' # X_train = np.genfromtxt('X_train.csv', delimiter=',') # y_train = np.genfromtxt('y_train.csv', delimiter=',') # X_test = np.genfromtxt('X_test.csv', delimiter=',') # y_test = np.genfromtxt('y_test.csv', delimiter=',') """ test_internal(X_train, X_test, y_train, y_test) is an internal testing method that generates an AUC score that measures the prediction accuracy of the model. The prediction model is trained using 70% of the labeled data from the complete training data. This prediction model is then tested on the other 30% of the labeled data form the complete training data.