Example #1
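The snippets below are shown without their import headers, and they call project-specific helpers (common_model_parser, scale_dataset, log_result, add_feature_importance, add_train_score, calculate_statistics, nn_model, calibration) that are not reproduced here. A plausible import block for these examples, assuming the libraries they visibly reference, would be:

import json
import sys

import joblib
import numpy as np
import pandas as pd
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import log_loss
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import load_model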
def main():
    options = common_model_parser().parse_args()
    config = json.load(open(options.config_file))
    data = pd.read_csv(options.data_file)
    feature_columns = data.columns[data.columns != 'y']

    if options.train:
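        # Training mode: hold out 20% of the rows, rebalance both splits to the
        # 0.191 target positive ratio via the project's scale_dataset helper,
        # train an XGBoost booster, and log validation metrics plus feature importances.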
        train, valid = train_test_split(data, test_size=0.2, random_state=334)
        train = scale_dataset(data=train, target_positive_ratio=0.191)
        valid = scale_dataset(data=valid, target_positive_ratio=0.191)
        X_train, y_train = train[feature_columns], train['y']
        X_valid, y_valid = valid[feature_columns], valid['y']

        d_train = xgb.DMatrix(X_train, label=y_train)
        d_valid = xgb.DMatrix(X_valid, label=y_valid)
        watchlist = [(d_train, 'train'), (d_valid, 'valid')]
        params = config['model']['params']
        bst = xgb.train(params=params['booster'],
                        dtrain=d_train,
                        evals=watchlist,
                        **params['train'])
        joblib.dump(bst, options.model_file)

        p_valid = bst.predict(d_valid)
        log_result(y_valid, p_valid, config, options.log_file)
        add_feature_importance(bst, options.log_file)
    else:
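        # Prediction mode: load the saved booster and write the predictions as a
        # submission file indexed by test_id.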
        bst = joblib.load(options.model_file)
        data['is_duplicate'] = bst.predict(xgb.DMatrix(data[feature_columns]))
        data[['is_duplicate']].to_csv(options.submission_file,
                                      index_label='test_id')
Example #2
def main():
    options = common_model_parser().parse_args()
    config = json.load(open(options.config_file))
    data = pd.read_csv(options.data_file)
    feature_columns = data.columns[data.columns != 'y']
    categorical_feature_columns = [feature for feature in feature_columns if feature.endswith('.cat')]
    print("Categorical features: {}".format(categorical_feature_columns), file=sys.stderr)

    if options.train:
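        # Training mode: 5-fold stratified CV. Each fold trains a LightGBM model
        # with negatives re-weighted so the weighted positive rate matches the
        # configured target; out-of-fold predictions are kept alongside the fold models.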
        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=114514)
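        # Re-weight negatives so that pos / (pos + w * n_neg) equals the configured
        # target_positive_ratio, i.e. w = (pos / ratio - pos) / n_neg.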
        negative_weight = (data.y.sum() / config['model']["target_positive_ratio"] - data.y.sum()) / (data.y == 0).sum()

        models = []
        stats = {"results": [], "config": config}
        data['prediction'] = np.zeros(data.shape[0])  # float column for out-of-fold predictions
        for train, valid in skf.split(data[feature_columns], data['y']):
            train_data = data.iloc[train]
            valid_data = data.iloc[valid]
            X_train, y_train = train_data[feature_columns], train_data['y']
            X_valid, y_valid = valid_data[feature_columns], valid_data['y']

            w_train = np.ones(X_train.shape[0])
            w_train[y_train == 0] *= negative_weight
            w_valid = np.ones(X_valid.shape[0])
            w_valid[y_valid == 0] *= negative_weight

            d_train = lgb.Dataset(data=X_train, label=y_train, weight=w_train,
                                  categorical_feature=categorical_feature_columns)
            d_valid = lgb.Dataset(data=X_valid, label=y_valid, weight=w_valid,
                                  categorical_feature=categorical_feature_columns)
            params = config['model']['params']
            gbm = lgb.train(params['booster'], d_train, valid_sets=d_valid, **params['train'])
            models.append(gbm)

            p_train = gbm.predict(X_train, num_iteration=gbm.best_iteration)
            p_valid = gbm.predict(X_valid, num_iteration=gbm.best_iteration)
            data.iloc[valid, data.columns.get_loc('prediction')] = p_valid
           
            stat = calculate_statistics(pred=p_valid, true=y_valid, weight=w_valid)
            stat['results']['train_log_loss'] = log_loss(y_train, p_train, sample_weight=w_train)
            stats["results"].append(stat["results"])
        stats['sum_log_loss'] = sum(stat['log_loss'] for stat in stats['results'])
        joblib.dump(models, options.model_file)
        data[['prediction']].to_csv(options.model_file + '.train.pred', index=False)
        json.dump(stats, open(options.log_file, 'w'), sort_keys=True, indent=4)
    else:
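        # Prediction mode: score in 300k-row batches and average the predictions
        # of the fold models.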
        models = joblib.load(options.model_file)
        data['is_duplicate'] = np.zeros(data.shape[0])
        preds = np.zeros((data.shape[0], len(models)))
        BATCH_SIZE = 300000
        for begin in range(0, data.shape[0], BATCH_SIZE):
            end = min(begin + BATCH_SIZE, data.shape[0])
            for i, gbm in enumerate(models):
                preds[begin:end, i] = gbm.predict(data[begin:end][feature_columns],
                                                  num_iteration=gbm.best_iteration)
        data['is_duplicate'] = preds.mean(axis=1)
        data[['is_duplicate']].to_csv(options.submission_file, index_label='test_id')
Example #3
def main():
    options = common_model_parser().parse_args()
    config = json.load(open(options.config_file))
    data = pd.read_csv(options.data_file)
    feature_columns = data.columns[data.columns != 'y']

    if options.train:
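        # Training mode: stratified 80/20 split; negatives are re-weighted towards
        # the configured target positive ratio, then an XGBoost booster is trained
        # with per-sample weights.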
        train, valid = train_test_split(data,
                                        test_size=0.2,
                                        random_state=334,
                                        stratify=data.y)
        X_train, y_train = train[feature_columns], train['y']
        X_valid, y_valid = valid[feature_columns], valid['y']

        negative_weight = (
            data.y.sum() / config['model']["target_positive_ratio"] -
            data.y.sum()) / (data.y == 0).sum()
        w_train = np.ones(X_train.shape[0])
        w_train[y_train == 0] *= negative_weight
        w_valid = np.ones(X_valid.shape[0])
        w_valid[y_valid == 0] *= negative_weight

        d_train = xgb.DMatrix(X_train, label=y_train, weight=w_train)
        d_valid = xgb.DMatrix(X_valid, label=y_valid, weight=w_valid)

        watchlist = [(d_train, 'train'), (d_valid, 'valid')]
        params = config['model']['params']
        bst = xgb.train(params=params['booster'],
                        dtrain=d_train,
                        evals=watchlist,
                        **params['train'])
        joblib.dump(bst, options.model_file)

        p_train = bst.predict(d_train)
        p_valid = bst.predict(d_valid)
        log_result(y_valid, p_valid, config, options.log_file, weight=w_valid)
        add_feature_importance(bst, options.log_file)
        add_train_score(y_train=y_train,
                        p_train=p_train,
                        log_file=options.log_file,
                        weight=w_train)
    else:
        bst = joblib.load(options.model_file)
        data['is_duplicate'] = bst.predict(xgb.DMatrix(data[feature_columns]))
        data[['is_duplicate']].to_csv(options.submission_file,
                                      index_label='test_id')
Example #4
def main():
    options = common_model_parser().parse_args()
    config = json.load(open(options.config_file))
    data = pd.read_csv(options.data_file)
    feature_columns = data.columns[data.columns != 'y']
    categorical_feature_columns = [
        feature for feature in feature_columns if feature.endswith('.cat')
    ]
    print("Categorical features: {}".format(categorical_feature_columns),
          file=sys.stderr)

    if options.train:
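        # Training mode: stratified 80/20 split, re-weighted negatives, and a
        # LightGBM model that treats '*.cat' columns as categorical features.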
        train, valid = train_test_split(data,
                                        test_size=0.2,
                                        random_state=334,
                                        stratify=data.y)
        X_train, y_train = train[feature_columns], train['y']
        X_valid, y_valid = valid[feature_columns], valid['y']

        negative_weight = (
            data.y.sum() / config['model']["target_positive_ratio"] -
            data.y.sum()) / (data.y == 0).sum()
        w_train = np.ones(X_train.shape[0])
        w_train[y_train == 0] *= negative_weight
        w_valid = np.ones(X_valid.shape[0])
        w_valid[y_valid == 0] *= negative_weight

        d_train = lgb.Dataset(data=X_train,
                              label=y_train,
                              weight=w_train,
                              categorical_feature=categorical_feature_columns)
        d_valid = lgb.Dataset(data=X_valid,
                              label=y_valid,
                              weight=w_valid,
                              categorical_feature=categorical_feature_columns)
        params = config['model']['params']
        gbm = lgb.train(params['booster'],
                        d_train,
                        valid_sets=d_valid,
                        **params['train'])
        joblib.dump(gbm, options.model_file)

        p_train = gbm.predict(X_train, num_iteration=gbm.best_iteration)
        p_valid = gbm.predict(X_valid, num_iteration=gbm.best_iteration)

        log_result(y_valid, p_valid, config, options.log_file, weight=w_valid)
        add_feature_importance(gbm, options.log_file)
        add_train_score(y_train=y_train,
                        p_train=p_train,
                        log_file=options.log_file,
                        weight=w_train)

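        # Also write predictions on the full training data (presumably for later
        # stacking or blending).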
        temp_df = pd.DataFrame()
        temp_df['is_duplicate'] = gbm.predict(data[feature_columns],
                                              num_iteration=gbm.best_iteration)
        temp_df[['is_duplicate']].to_csv(options.submission_file + ".train.pred.csv",
                                         index_label='id')

    else:
        gbm = joblib.load(options.model_file)
        data['is_duplicate'] = gbm.predict(data[feature_columns])
        data[['is_duplicate']].to_csv(options.submission_file,
                                      index_label='test_id')
Example #5
def main():
    options = common_model_parser().parse_args()
    config = json.load(open(options.config_file))
    data = pd.read_csv(options.data_file)
    feature_columns = data.columns[data.columns != 'y']
    class_weight = {0: 1.309028344, 1: 0.472001959}

    if options.train:
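        # Training mode: 5-fold stratified CV over a Keras feed-forward network
        # (built by the project's nn_model helper), with standard-scaled inputs,
        # hard-coded class weights, early stopping and per-fold checkpoints.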
        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=114514)
        negative_weight = (
            data.y.sum() / config['model']["target_positive_ratio"] -
            data.y.sum()) / (data.y == 0).sum()
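        # Note: negative_weight is computed but not used below; the per-sample
        # weights are set from the hard-coded class weights instead.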

        scalers = []
        stats = {"results": [], "config": config}
        data['prediction'] = np.zeros(data.shape[0])
        for i, (train, valid) in enumerate(
                skf.split(data[feature_columns], data['y'])):
            train_data = data.iloc[train]
            valid_data = data.iloc[valid]
            X_train, y_train = train_data[feature_columns], train_data['y']
            X_valid, y_valid = valid_data[feature_columns], valid_data['y']

            w_train = np.ones(X_train.shape[0])
            w_train *= 0.472001959
            w_train[y_train == 0] = 1.309028344
            w_valid = np.ones(X_valid.shape[0])
            w_valid *= 0.472001959
            w_valid[y_valid == 0] = 1.309028344

            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train.values)
            X_valid = scaler.transform(X_valid.values)
            model = nn_model(X_train)
            bst_model_path = options.model_file + '.h5'
            hist = model.fit(X_train,
                             y_train,
                             validation_data=(X_valid, y_valid, w_valid),
                             epochs=200,
                             batch_size=2048,
                             shuffle=True,
                             class_weight=class_weight,
                             callbacks=[
                                 EarlyStopping(monitor='val_loss',
                                               patience=10),
                                 ModelCheckpoint(bst_model_path,
                                                 save_best_only=True,
                                                 save_weights_only=True)
                             ])

            p_valid = calibration(
                model.predict(X_valid, batch_size=8192, verbose=1).ravel())
            p_train = calibration(
                model.predict(X_train, batch_size=8192, verbose=1).ravel())
            print(p_valid, p_valid.max())
            joblib.dump(p_valid, 'tmp.p_valid.pkl')
            model.save(options.model_file + '.{}.h5'.format(i))
            scalers.append(scaler)
            data.iloc[valid, data.columns.get_loc('prediction')] = p_valid
            stat = calculate_statistics(pred=p_valid,
                                        true=y_valid,
                                        weight=w_valid)
            stat['results']['train_log_loss'] = log_loss(y_train,
                                                         p_train,
                                                         sample_weight=w_train)
            print(stat)
            stats["results"].append(stat["results"])
        joblib.dump(scalers, options.model_file + '.scaler')
        data[['prediction']].to_csv(options.model_file + '.train.pred',
                                    index=False)
        print(stats)
        json.dump(stats, open(options.log_file, 'w'), sort_keys=True, indent=4)
    else:
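        # Prediction mode: reuse each fold's scaler and saved network, then average
        # the calibrated per-fold predictions.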
        scalers = joblib.load(options.model_file + '.scaler')
        data['is_duplicate'] = np.zeros(data.shape[0])
        preds = np.zeros((data.shape[0], len(scalers)))
        for i, scaler in enumerate(scalers):
            model = load_model(options.model_file + '.{}.h5'.format(i))
            X = scaler.transform(data[feature_columns])
            preds[:, i] = calibration(
                model.predict(X, batch_size=8192, verbose=1).ravel())
        data['is_duplicate'] = preds.mean(axis=1)
        data[['is_duplicate']].to_csv(options.submission_file,
                                      index_label='test_id')