Example #1
    def fit(self, X, y):
        params = {
            "objective": "binary",
            "metric": "auc",
            "verbosity": -1,
            "seed": 1,
            "num_threads": 4,
        }
        # Tune hyperparameters once on a 40% subsample.
        num = int(len(X) * 0.4)
        X_sample, y_sample = sample(X, y, num)
        hyperparams = self._hyperopt(X_sample, y_sample, params)

        for _ in range(350):
            remain_time = self.time_budget - (time.time() - self.start_time)
            log(f"Remain time: {remain_time}")

            # Stop bagging once less than 20% of the time budget remains.
            if remain_time / self.time_budget <= 0.2:
                break

            # Each round trains on a fresh subsample and a fresh split.
            X_sample, y_sample = sample(X, y, num)
            X_train, X_val, y_train, y_val = train_test_split(
                X_sample, y_sample, test_size=0.3,
                random_state=random.randrange(2000))
            train_data = lgb.Dataset(X_train, label=y_train)
            valid_data = lgb.Dataset(X_val, label=y_val)

            model = lgb.train({**params, **hyperparams},
                              train_data,
                              num_boost_round=500,
                              valid_sets=[train_data, valid_data],
                              early_stopping_rounds=10,
                              verbose_eval=100)
            self.model.append(model)

        return self
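
A note on the helper: these fit() snippets rely on a count-based sample() that is not shown here (Example #8 imports it from preprocess). A minimal sketch under that assumption, using pandas-style inputs:

# Hypothetical sketch of the count-based sample() used above; the real
# helper in preprocess.py may differ.
def sample(X, y, nrows, random_state=1):
    if len(X) > nrows:
        X = X.sample(nrows, random_state=random_state)  # subsample rows
        y = y.loc[X.index]                              # keep labels aligned
    return X, y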
Example #2
    def fit(self, X, y):
        params = {
            "objective": "binary",
            "metric": "auc",
            "verbosity": -1,
            "seed": 1,
            "num_threads": 4
        }

        X_sample, y_sample = sample(X, y, 30000)
        hyperparams = self._hyperopt(X_sample, y_sample, params)

        X_train, X_val, y_train, y_val = train_test_split(X,
                                                          y,
                                                          test_size=0.1,
                                                          random_state=1)
        train_data = lgb.Dataset(X_train, label=y_train)
        valid_data = lgb.Dataset(X_val, label=y_val)

        self.model = lgb.train({**params, **hyperparams},
                               train_data,
                               num_boost_round=500,
                               valid_sets=valid_data,
                               early_stopping_rounds=30,
                               verbose_eval=100)

        return self
Example #3
def class_sample(X, y, pos_num, neg_num, seed=2019):
    """Draw roughly pos_num positives and neg_num negatives, then shuffle."""
    npos = float((y == 1).sum())
    nneg = len(y) - npos

    # Turn the requested counts into per-class sampling fractions.
    pos_frac = pos_num / npos
    neg_frac = neg_num / nneg

    X_pos = sample(X[y == 1], pos_frac, seed)
    X_neg = sample(X[y != 1], neg_frac, seed)

    X = pd.concat([X_pos, X_neg])

    # A final frac=1 pass shuffles the concatenated frame and returns
    # the labels realigned with it.
    X, y = sample(X, 1, seed, y)

    return X, y
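
Here sample() takes a fraction rather than a row count and optionally returns realigned labels, so it differs from the count-based variant above. A hypothetical sketch consistent with these call sites:

# Hypothetical fraction-based sample(); assumes pandas inputs whose
# indices are preserved so the labels can be realigned.
def sample(X, frac, seed, y=None):
    X_out = X.sample(frac=frac, random_state=seed)
    if y is None:
        return X_out
    return X_out, y.loc[X_out.index]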
Example #4
    def fit(self, X, y, time_remain):
        self.raw_cols = list(set(self.raw_cols).intersection([c for c in X]))
        params = {
            "objective": "binary",
            "metric": "auc",
            "verbosity": -1,
            "seed": 1,
            "num_threads": 4
        }
        budget = time_remain
        SEED = 1
        train_start = time.time()
        self.auc = []
        while SEED <= self.iter:
            round_start = time.time()
            print(SEED, budget)

            # Rebalance via negative sampling, then cap the sample at 30k rows.
            x_sample, y_sample = self._negative_sample(X, y, SEED)
            X_sample, y_sample = sample(x_sample, y_sample, 30000)
            hyperparams = self._hyperopt(X_sample, y_sample, params)

            X_train, X_val, y_train, y_val = train_test_split(X_sample, y_sample, test_size=0.2)

            train_data = lgb.Dataset(X_train, label=y_train)
            valid_data = lgb.Dataset(X_val, label=y_val)

            # Train with a log-decaying learning-rate schedule, from 5x the
            # tuned rate down to 0.9x of it over 100 rounds.
            model = lgb.train({**params, **{key: hyperparams[key] for key in hyperparams if key != "learning_rate"}},
                              train_data, 100, valid_data,
                              learning_rates=get_log_lr(100, hyperparams["learning_rate"] * 5,
                                                        hyperparams["learning_rate"] * 0.9),
                              early_stopping_rounds=90, verbose_eval=100)
            print_feature_importance(model)
            self.models.append(model)
            self.auc.append(model.best_score["valid_0"]["auc"])

            # Average wall-clock time per round so far (SEED started at 1).
            single_round = (time.time() - train_start) / SEED
            print(single_round)

            budget -= (time.time() - round_start)

            # Keep a safety margin of three average rounds before stopping.
            if budget <= single_round * 3:
                break

            SEED += 1
        print([m.best_iteration for m in self.models])
        print(self.auc)

        zipped = zip(self.models, self.auc)
        if CONSTANT.IF_SORT_VALID_AUC:
            self.model_sorted = sorted(zipped, key=lambda x: x[1], reverse=True)
        else:
            self.model_sorted = [(model, 0) for model in self.models]

        return self
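
get_log_lr() is not defined in these snippets; a plausible reading, given how it is called above, is a log-spaced schedule from a high to a low rate (an assumption, not the project's actual helper):

# Hypothetical log-spaced learning-rate schedule: n per-round rates
# decaying geometrically from `high` to `low`.
import numpy as np

def get_log_lr(n, high, low):
    return list(np.logspace(np.log10(high), np.log10(low), n))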
Example #5
def main():
    files = get_filenames()
    x, y = [], []

    # generate the learning curve data
    data = list(parse(open(files.dataset)))
    for train_prop in np.arange(0.1, 0.99, 0.05):
        training_set, testing_set = sample(data, train_prop)
        tree = build_tree(training_set).prune(MIN_GAIN)
        check = [record[RESULT_IDX] == plurality(tree.classify(record))
                 for record in testing_set]
        counter = Counter(check)
        precision = counter[True] / float(counter[True] + counter[False])
        print('Training set sampling probability = %.2f:' % train_prop)
        print('training data size = %d,' % len(training_set), end=' ')
        print('test data size = %d,' % len(testing_set), end=' ')
        print('precision = %.4f' % precision)
        x.append(len(training_set))
        y.append(precision)

    # statistics
    ymean, ystd, ymin, ymax = np.mean(y), np.std(y), np.min(y), np.max(y)
    print('Mean of precision = %.4f' % ymean)
    print('Standard deviation of precision = %.4f' % ystd)
    print('Min = %.4f, max = %.4f' % (ymin, ymax))
    xy = sorted(zip(x, y), key=lambda a: a[0])
    x, y = zip(*xy)

    # setup decorations
    plt.rc('font', family='serif')
    plt.yticks(np.arange(0.0, 1.0, 0.1))
    plt.ylim(0.0, 1.0)
    plt.grid(True)
    plt.title('Learning Curve')
    plt.xlabel('Training set size')
    plt.ylabel('Precision on test set')

    # plot smoothed learning curve
    xnew = np.linspace(np.min(x), np.max(x), 100)
    ynew = interp1d(x, y)(xnew)
    plt.plot(x, y, '.', xnew, ynew, '--')

    # annotation
    box = dict(boxstyle='square', fc="w", ec="k")
    txt = r'$\mu = %.4f$, $\sigma = %.4f$' % (ymean, ystd)
    txt += r', $min = %.4f$, $max = %.4f$' % (ymin, ymax)
    plt.text(170, 0.05, txt, bbox=box)

    plt.savefig(files.curve)
    print('Save learning curve to', files.curve)
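
The sample() used here splits records into training and testing sets by a sampling probability (the printout above calls train_prop exactly that). A hypothetical sketch:

# Hypothetical sample(): each record lands in the training set with
# probability train_prop, otherwise in the testing set.
import random

def sample(data, train_prop):
    training, testing = [], []
    for record in data:
        (training if random.random() < train_prop else testing).append(record)
    return training, testing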
Example #6
    def fit(self, X, y):
        params = {
            "objective": "binary",
            "metric": "auc",
            "verbosity": -1,
            "seed": 1,
            "num_threads": 4
        }

        # Heuristic: scale the number of bagging iterations to the time
        # budget and the (sub)sampled data volume.
        sample_size = 40000
        dims = sample_size * X.shape[1]
        print(dims, self.train_time_budget, X.shape)
        self.multiplier = max(
            int(self.train_time_budget * 1000 / (dims**0.65 * np.log(dims))),
            1)
        print(self.multiplier)

        for _ in range(self.iter * self.multiplier):
            x_sample, y_sample = self._negative_sample(X, y)
            X_sample, y_sample = sample(x_sample, y_sample, sample_size)

            hyperparams = self._hyperopt(X_sample, y_sample, params)

            X_train, X_val, y_train, y_val = train_test_split(X_sample,
                                                              y_sample,
                                                              test_size=0.15,
                                                              random_state=1)

            train_data = lgb.Dataset(X_train, label=y_train)
            valid_data = lgb.Dataset(X_val, label=y_val)
            model = lgb.train({**params, **hyperparams},
                              train_data,
                              num_boost_round=400,
                              valid_sets=valid_data,
                              early_stopping_rounds=20,
                              verbose_eval=0)
            self.models.append(model)

        return self
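
For intuition, plugging assumed numbers into the heuristic (these shapes and the 600 s budget are illustrative, not from the source):

# dims**0.65 * log(dims) grows sublinearly, so bigger tables still
# get at least one pass (the max(..., 1) floor).
import numpy as np

dims = 40000 * 50                    # sample_size * n_columns = 2,000,000
mult = max(int(600 * 1000 / (dims**0.65 * np.log(dims))), 1)
print(mult)                          # -> 3 bagging passes per base iteration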
Example #7
    def fit(self, X, y):
        params = {
            "objective": "binary",
            "metric": "auc",
            "verbosity": -1,
            "seed": 1,
            "num_threads": 4
        }

        # Heuristic multiplier from total data volume, capped to [1, 50];
        # larger multipliers buy more boosting rounds and patience below.
        dims = X.shape[0] * X.shape[1]
        self.multiplier = min(
            max(int(1000 * 100000 / (dims * np.log(dims))), 1), 50)
        print(self.multiplier)

        # Tune on (up to) the full dataset.
        X_sample, y_sample = sample(X, y, len(X))
        hyperparams = self._hyperopt(X_sample, y_sample, params)

        X_train, X_val, y_train, y_val = train_test_split(X,
                                                          y,
                                                          test_size=0.15,
                                                          random_state=1)
        train_data = lgb.Dataset(X_train, label=y_train)
        valid_data = lgb.Dataset(X_val, label=y_val)

        self.model = lgb.train({**params, **hyperparams},
                               train_data,
                               num_boost_round=500 * max(int(self.multiplier / 4), 1),
                               valid_sets=valid_data,
                               early_stopping_rounds=20 * max(int(self.multiplier / 4), 1),
                               verbose_eval=0)

        return self
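
Worked through with assumed shapes (not from the source), the cap and floor mean small tables train much longer while large ones fall back to the base 500 rounds:

import numpy as np

dims = 10000 * 20                                        # small table
m = min(max(int(1000 * 100000 / (dims * np.log(dims))), 1), 50)
print(m, 500 * max(int(m / 4), 1))                       # -> 40 5000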
Example #8
def clean_labels(X: pd.DataFrame,
                 y,
                 count_start,
                 pulearning=None,
                 strategy="cut",
                 round=0,
                 early_stop=False):
    count = count_start
    from preprocess import sample

    # Only engineered "c_*" / "n_*" columns feed the noise estimator.
    cols = [
        c for c in X if len(c.split("_")) == 2 and (
            c.startswith("c_") or c.startswith("n_"))
    ]
    print(cols)

    rou_0 = rou_1 = 0.0  # estimated noise rates, in case every pass fails

    while count <= count_start + round:
        try:
            params = {
                "objective": "binary",
                "metric": "auc",
                "verbosity": -1,
                "seed": count,
                "num_threads": 4,
                "num_boost_round": 50
            }

            X_sample, y_sample = sample(X[cols], y, 30000, random_state=count)
            hyperparams = _hyperopt(X_sample,
                                    y_sample,
                                    params,
                                    random_state=count)

            # Confident-learning wrapper around a LightGBM classifier.
            model = LearningWithNoisyLabels(
                lgb.LGBMClassifier(**hyperparams, **params),
                seed=count,
                cv_n_folds=5,
                prune_method="both",
                converge_latent_estimates=True,
                pulearning=pulearning)
            print(X.shape, len(y))
            noisy, noise_matrix, inverse_noise_matrix, confident_joint, psx = model.fit(
                X[cols].values, 1 * (y.values == 1), thresholds=None)

            if count == count_start:
                rou_0 = noise_matrix[1, 0]
                rou_1 = noise_matrix[0, 1]
                print(rou_0, rou_1)
                if early_stop and rou_0 + rou_1 <= 0.9:
                    break

            if not noisy.any():  # nothing flagged as mislabeled
                break
            print(int(noisy.sum()))

            # Only the "cut" strategy is implemented: drop flagged rows.
            X = X[~noisy]
            y = y[~noisy]

        except Exception as exp:
            print("error:", exp)
        finally:
            count += 1

    return X, y, rou_0 + rou_1, rou_0, rou_1
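
A hypothetical invocation (the argument values are assumptions, not from the source):

# One pruning pass, treating the task as PU learning; returns the
# cleaned data plus the estimated noise rates.
X_clean, y_clean, rou_sum, rou_0, rou_1 = clean_labels(
    X, y, count_start=2019, pulearning=1, strategy="cut", round=0)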
Example #9
    def fit(self, X_, y_, time_remain):
        start_fit = time.time()
        SEED = 2019

        budget = time_remain - (time.time() - start_fit)
        print(len(X_))

        while SEED <= self.iter + 2019:
            try:
                print(SEED, budget)
                round_start = time.time()

                self.hyper_seed = SEED
                params = {
                    "objective": "regression",
                    "metric": "rmse",
                    "verbosity": -1,
                    "seed": self.hyper_seed,
                    "num_threads": 4,
                }

                # Re-sample 75% of the rows each round with a new seed.
                X, y = sample(X_,
                              y_,
                              int(len(X_) * 0.75),
                              random_state=self.hyper_seed)
                X, y_tag, y = clean_labels(X, y)
                hyperparams = self._hyperopt(X,
                                             y,
                                             params,
                                             random_state=self.hyper_seed)

                X_train, X_val, y_train, y_val = train_test_split(
                    X, y, test_size=0.3, random_state=self.hyper_seed)
                train_data = lgb.Dataset(X_train, label=y_train)
                valid_data = lgb.Dataset(X_val, label=y_val)

                self.model = lgb.train({**params, **hyperparams},
                                       train_data,
                                       num_boost_round=500,
                                       valid_sets=valid_data,
                                       early_stopping_rounds=30,
                                       verbose_eval=100)

                print_feature_importance(self.model)
                params["num_boost_round"] = self.model.best_iteration

                self.best_iter.append(self.model.best_iteration)
                self.models.append(self.model)
            except Exception as exp:
                print("error:", exp)
            finally:
                # Time the first round, then stop while enough budget remains:
                # a 3-round margin for cheap rounds, 1.5 for expensive ones.
                if SEED == 2019:
                    single_round = time.time() - round_start

                budget -= (time.time() - round_start)
                if single_round / time_remain < 0.2:
                    if budget <= single_round * 3:
                        break
                else:
                    if budget <= single_round * 1.5:
                        break

                SEED += 1

        print(self.best_iter)
        return self
Example #10
    model.fit(X, y)
    print(model)

    # save prediction: keep the probability column for the positive
    # (click) class and write it with 1-based ids.
    X_test = preprocess.precondition('shuffle_data_test.txt', 0)
    prediction = model.predict_proba(X_test)[:, 1]
    np.savetxt("prediction_complete_best.csv",
               np.dstack((np.arange(1, prediction.size + 1), prediction))[0],
               "%d,%f", header="Id,Prediction")

    end = time.time()  # end timing
    print("\ntotal training time: %d seconds" % (end - start))
    
    
if __name__ == '__main__':
    # preprocess and split data
    X, y = preprocess.precondition('data_train.txt', 1)
    X_train, X_test, y_train, y_test = preprocess.sample(X, y)

    # user may choose to load previously preprocessed data instead:
    # X_train = np.genfromtxt('X_train.csv', delimiter=',')
    # y_train = np.genfromtxt('y_train.csv', delimiter=',')
    # X_test = np.genfromtxt('X_test.csv', delimiter=',')
    # y_test = np.genfromtxt('y_test.csv', delimiter=',')
    
    """
    test_internal(X_train, X_test, y_train, y_test) is an internal testing method 
    that generates an AUC score that measures the prediction accuracy of the model.
    The prediction model is trained using 70% of the labeled data 
    from the complete training data. This prediction model is then tested 
    on the other 30% of the labeled data form the complete training data.
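
A minimal sketch of what this docstring describes; test_internal itself is not included in the example, so the body below (the module-level model, sklearn's roc_auc_score) is an assumption:

# Hypothetical test_internal per the docstring above; the real
# implementation is not shown. Uses the module-level `model` and the
# 70/30 split produced by preprocess.sample().
from sklearn.metrics import roc_auc_score

def test_internal(X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)                 # train on the 70% split
    scores = model.predict_proba(X_test)[:, 1]  # P(click) on the 30% split
    print('internal AUC = %.4f' % roc_auc_score(y_test, scores))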