예제 #1
0
def cross_validation(train,
                     params,
                     ID_COLUMN_NAME,
                     LABEL_COLUMN_NAME,
                     N_FOLD=5):
    '''
    Score a LightGBM multiclass model with stratified K-fold validation.

    Every column of ``train`` other than the id and label columns is used
    as a feature. One model is trained per fold and scored with macro F1
    on that fold's held-out rows.

    :param train: frame holding the features plus the id and label columns
    :param params: LightGBM parameter dict forwarded to ``lgb.train``
    :param ID_COLUMN_NAME: name of the id column (excluded from features)
    :param LABEL_COLUMN_NAME: name of the label column
    :param N_FOLD: number of stratified folds
    :return: macro F1 averaged over all ``N_FOLD`` folds (a score, not a loss)
    '''
    NUM_BOOST_ROUND = 1000
    EARLY_STOPPING_ROUNDS = 50

    # Cross validation model
    folds = StratifiedKFold(n_splits=N_FOLD, shuffle=True, random_state=1001)
    feats = [
        f for f in train.columns
        if f not in (LABEL_COLUMN_NAME, ID_COLUMN_NAME)
    ]
    fold_scores = []
    for i_fold, (train_idx, valid_idx) in enumerate(
            folds.split(train[feats], train[LABEL_COLUMN_NAME])):
        dtrain = lgb.Dataset(data=train[feats].iloc[train_idx],
                             label=train[LABEL_COLUMN_NAME].iloc[train_idx],
                             free_raw_data=False,
                             silent=True)
        dvalid = lgb.Dataset(data=train[feats].iloc[valid_idx],
                             label=train[LABEL_COLUMN_NAME].iloc[valid_idx],
                             free_raw_data=False,
                             silent=True)
        with timer('cross validation-fold {} train model'.format(i_fold)):
            log.info('params is {}'.format(params))
            clf = lgb.train(num_boost_round=NUM_BOOST_ROUND,
                            params=params,
                            verbose_eval=10,
                            train_set=dtrain,
                            valid_sets=[dvalid],
                            early_stopping_rounds=EARLY_STOPPING_ROUNDS)
        with timer('cross validation-fold {} predict'.format(i_fold)):
            v_data = clf.predict(dvalid.data)
            # Index of the highest score in each row; the first index wins
            # on ties. The row length is used instead of a fixed class count.
            y_pre = [
                max(range(len(scores)), key=scores.__getitem__)
                for scores in v_data
            ]
        fold_scores.append(f1_score(dvalid.label, y_pre, average='macro'))
    # Returning inside the loop would score only the first fold, so the
    # per-fold scores are collected and averaged here instead.
    return sum(fold_scores) / len(fold_scores)
예제 #2
0
def write2file(col_id, pre_label, name=None):
    """Write one predicted label per id to ``result{name}.csv``.

    :param col_id: values stored in the ``ID`` column of the output
    :param pre_label: model output handed to ``one_hot2label_index``
        (presumably per-class scores per row; verify against that helper)
    :param name: suffix inserted into the timer message and the file name
    """
    timer_message = 'write result {}'.format(name)
    csv_path = 'result{}.csv'.format(name)
    with timer(timer_message):
        label_indices = one_hot2label_index(pre_label)
        result = pd.DataFrame()
        result[ID] = col_id
        result['predict'] = index2label(label_indices)
        result.to_csv(csv_path, index=False)
예제 #3
0
def data_prepare(df_train, df_test):
    """Normalize the continuous columns, encode the labels and save both frames.

    Steps, in order: ``normalize_process`` on the listed columns of both
    frames, ``label2index`` on the training labels, a pickle of the label
    encode/decode tables, then a CSV dump of each frame under
    ``origin_data/``.

    :param df_train: training frame (contains the ``LABEL`` column)
    :param df_test: test frame
    """
    continuous_columns = [
        '1_total_fee',
        '2_total_fee',
        '3_total_fee',
        '4_total_fee',
        'contract_time',
        'former_complaint_fee',
        'former_complaint_num',
        'last_month_traffic',
        'local_caller_time',
        'local_trafffic_month',
        'month_traffic',
        'online_time',
        'pay_num',
        'pay_times',
        'service1_caller_time',
        'service2_caller_time',
        'pay_num_per_time',
        'll',
    ]
    normalize_process(df_train, df_test, continuous_columns)

    # Label -> index; the mapping tables are read back from the module
    # after this call and pickled.
    base_data_process.label2index(df_train, LABEL)
    label_tables = (base_data_process.encode_map,
                    base_data_process.decode_list)
    base_util.pickle_dump(label_tables, 'origin_data/label2index.pkl')

    outputs = (
        ('save train data', df_train, 'origin_data/train_modified.csv'),
        ('save test data', df_test, 'origin_data/test_modified.csv'),
    )
    for timer_message, frame, csv_path in outputs:
        with timer(timer_message):
            frame.to_csv(csv_path, index=False)
예제 #4
0
def optimization():
    """Run the hyperopt TPE search over the LightGBM space and pickle the trials.

    The number of evaluations comes from ``config_dict['max_evals']``. The
    best setting found is logged and the full ``Trials`` object is written
    to ``model_trials.pkl``.
    """
    regularisation_grid = [0, 0.001, 0.01, 0.1, 0.2]
    search_space = {
        'learning_rate': 0.1,
        'boosting_type': hp.choice('boosting_type', ['gbdt']),
        'num_leaves': hp.choice(
            'num_leaves', [15, 20, 30, 50, 65, 80, 100, 150, 400]),
        'bin_construct_sample_cnt': hp.choice(
            'bin_construct_sample_cnt',
            [10000, 20000, 60000, 100000, 200000]),
        'min_data_in_leaf': hp.quniform('min_data_in_leaf', 20, 500, 10),
        'reg_alpha': hp.choice('reg_alpha', list(regularisation_grid)),
        'reg_lambda': hp.choice('reg_lambda', list(regularisation_grid)),
        'feature_fraction': hp.uniform('feature_fraction', 0.8, 1.0),
        'bagging_fraction': hp.uniform('bagging_fraction', 0.8, 1.0),
        'bagging_freq': hp.choice('bagging_freq', [0, 2, 6, 10, 16]),
        'is_unbalance': hp.choice('is_unbalance', [True, False]),
        'num_threads': 40,
        'objective': 'multiclass',
        'num_class': 15,
        'verbose': -1,
    }

    search_history = Trials()

    with timer('optimization'):
        # Run optimization
        best_found = fmin(fn=objective,
                          space=search_space,
                          algo=tpe.suggest,
                          trials=search_history,
                          max_evals=config_dict['max_evals'])

    print('-' * 100)
    log.warn(best_found)

    with open('model_trials.pkl', mode='wb') as trials_file:
        pickle.dump(search_history, trials_file)
예제 #5
0
파일: model.py 프로젝트: lyliu/kaggle
def write2file(col_id, pre_label, name=None):
    """Decode per-class scores to labels and write them to ``result{name}.csv``.

    :param col_id: values stored in the ``user_id`` column of the output
    :param pre_label: iterable of per-class score rows; the position of the
        highest score in each row is looked up in the module-level
        ``decode_list``
    :param name: suffix inserted into the timer message and the file name
    """
    with timer('write result {}'.format(name)):
        # Arg-max over each row's own length, not a fixed class count of
        # 15; the first index wins on ties, as with a strict `>` scan.
        y_pre = [
            decode_list[max(range(len(scores)), key=scores.__getitem__)]
            for scores in pre_label
        ]
        df = pd.DataFrame()
        df['user_id'] = col_id
        df['predict'] = y_pre
        df.to_csv('result{}.csv'.format(name), index=False)
예제 #6
0
def objective(hyperparameters):
    """Hyperopt objective: cross-validate one setting and report its loss.

    Increments the global ``ITERATION`` counter, casts the integer-valued
    parameters in place, runs ``cross_validation`` on
    ``config_dict['train']`` and appends one row to ``hyperparameters.csv``.

    :param hyperparameters: parameter dict sampled from the search space
    :return: dict with ``loss`` (``1 - f1**2``), the parameters used, the
        iteration number, the training time and ``STATUS_OK``
    """
    # Keep track of evals
    global ITERATION

    ITERATION += 1

    # Make sure parameters that need to be integers are integers
    for parameter_name in [
            'num_leaves', 'bin_construct_sample_cnt', 'bagging_freq',
            'min_data_in_leaf'
    ]:
        hyperparameters[parameter_name] = int(hyperparameters[parameter_name])

    with timer('run lgb') as ti:
        # Perform n_folds cross validation
        f1 = cross_validation(config_dict['train'], hyperparameters, 'user_id',
                              'current_service')
        loss = 1 - f1**2

        run_time = ti.get_delay_t0()

    # Append to the csv file. The context manager closes the handle even if
    # writing fails; newline='' is what the csv module expects so that it
    # controls the line endings itself.
    with open('hyperparameters.csv', 'a', newline='') as of_connection:
        writer = csv.writer(of_connection)
        writer.writerow(
            [loss, hyperparameters, ITERATION, run_time, 1 - loss])

    log.info('iteration-{} f1:{} loss:{} train_time:{}'.format(
        ITERATION, f1, loss, run_time))
    # Dictionary with information for evaluation
    return {
        'loss': loss,
        'hyperparameters': hyperparameters,
        'iteration': ITERATION,
        'train_time': run_time,
        'status': STATUS_OK
    }
def main(limit):
    """
    This method is used to generate processed data for train and test.

    :param limit: maximum number of samples kept per mode; a value <= 0
                  means "no limit". The module-level ``LIMITED`` is set
                  from it.
    :return: No return, only write files
             (1) {mode}_{LIMITED}_wfd.pkl
                write index, words, tags, len_w
             (2) {mode}_{LIMITED}_wbd.pkl
                write bert embedding in line

    """
    # change LIMITED
    global LIMITED
    LIMITED = limit

    modes = ['train', 'valid', 'test']
    mode_map = {'train': [0, 1, 2, 3, 4], 'valid': [5], 'test': [6]}
    mod_num = 7

    import os
    for mode in modes:

        if mode == 'valid':
            # NOTE(review): LIMITED is global and is not restored, so the
            # reduced limit also applies to the following 'test' mode and
            # to its output file names - confirm this is intended.
            LIMITED = LIMITED // 5

        word_flag_data = []
        word_bert_emb_data = []
        origin_bert_emb = collections.OrderedDict()
        tags = collections.OrderedDict()
        tokens = collections.OrderedDict()
        for f in os.listdir(INPUT_DIR):
            if 'json' not in f and bu.get_str_index(f,
                                                    mod_num) in mode_map[mode]:
                data_file = INPUT_DIR + f
                # Keep loading files until the limit is reached. A
                # non-positive LIMITED means "unlimited", matching the
                # break condition in the loop below; requiring
                # LIMITED > 0 here would load nothing in that case.
                if LIMITED <= 0 or len(origin_bert_emb) < LIMITED:
                    with bu.timer(f'load {data_file} bert emb'):
                        load_bert_embedding(data_file, origin_bert_emb, tags,
                                            tokens)

        index = 0

        # `sample_tags` is deliberately not called `tags`, so the dict
        # handed to generator_fn is not shadowed by the loop variable.
        for i, words_emb_bert, words, sample_tags, len_w in generator_fn(
                origin_bert_emb, tags, tokens):
            # used to generate small dataset if set LIMITED's value

            if LIMITED > 0 and index >= LIMITED:
                break

            if words_emb_bert is None:
                continue
            if i % 1000 == 0:
                print(f'{mode} index:{i} finished!')
            assert len(words_emb_bert) == len(
                words) == len_w, f'length not match in {i},' \
                                 f'{len(words_emb_bert)}-{len(words)}-{len_w}'
            word_bert_emb_data.append(words_emb_bert)
            word_flag_data.append((index, words, sample_tags, len_w))
            index += 1

        with open(DATA_DIR + f'/processed/{mode}_{LIMITED}_wfd.pkl',
                  'wb') as wfd, open(
                      DATA_DIR + f'/processed/{mode}_{LIMITED}_wbd.pkl',
                      'wb') as wbd:
            with bu.timer(f'write {mode} to file'):
                # each line is (index, words, tags, len_w)
                pickle.dump(word_flag_data, wfd)
                # each line is word's bert context embedding
                pickle.dump(word_bert_emb_data, wbd)
예제 #8
0
def model(train, test, num_folds=5, stratified=True, num_boost_round=1000):
    """Train LightGBM with K-fold validation and write test predictions.

    For every fold a model is trained on the fold's training rows and
    scored with macro F1 on the held-out rows. Its test-set class scores
    are added to a running sum, which is written out through
    ``write2file`` after each fold. Gain feature importances of all folds
    are passed to ``display_importances`` at the end.

    :param train: training frame containing the ``ID`` and ``LABEL`` columns
    :param test: test frame containing the ``ID`` column and the features
    :param num_folds: number of folds
    :param stratified: use ``StratifiedKFold`` when true, else ``KFold``
    :param num_boost_round: maximum boosting rounds per fold
    """
    # Number of distinct labels; used for both the prediction buffer and
    # LightGBM's num_class so the two cannot disagree.
    LABEL_SIZE = int(train[LABEL].value_counts().count())

    print("Starting LightGBM. Train shape: {}, test shape: {}".format(
        train.shape, test.shape))

    gc.collect()
    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits=num_folds,
                                shuffle=True,
                                random_state=1001)
    else:
        folds = KFold(n_splits=num_folds, shuffle=True, random_state=1001)
    # Create arrays and dataframes to store results
    sub_preds = np.zeros(shape=(test.shape[0], LABEL_SIZE))
    feature_importance_df = pd.DataFrame()
    feats = [f for f in train.columns if f not in [LABEL, ID]]

    # The parameters are the same for every fold, so build them once.
    params = {
        'bagging_fraction': 0.94795171020152,
        'bagging_freq': 6,
        'bin_construct_sample_cnt': 200000,
        'boosting_type': 'gbdt',
        'feature_fraction': 0.9953235660931046,
        'is_unbalance': False,
        'learning_rate': 0.001,
        'min_data_in_leaf': 30,
        'num_class': LABEL_SIZE,
        'num_leaves': 80,
        'num_threads': 40,
        'objective': 'multiclass',
        'reg_alpha': 0.001,
        'reg_lambda': 0.1,
        'verbose': -1
    }

    for i_fold, (train_idx, valid_idx) in enumerate(
            folds.split(train[feats], train[LABEL])):
        dtrain = lgb.Dataset(data=train[feats].iloc[train_idx],
                             label=train[LABEL].iloc[train_idx],
                             free_raw_data=False,
                             silent=True)
        dvalid = lgb.Dataset(data=train[feats].iloc[valid_idx],
                             label=train[LABEL].iloc[valid_idx],
                             free_raw_data=False,
                             silent=True)

        with timer('fold {} train model'.format(i_fold)):
            clf = lgb.train(num_boost_round=num_boost_round,
                            params=params,
                            train_set=dtrain,
                            valid_sets=[dvalid],
                            early_stopping_rounds=50)
        with timer('fold {} predict'.format(i_fold)):
            v_data = clf.predict(dvalid.data)
            y_pre = one_hot2label_index(v_data)
            # Running sum of class scores over the folds seen so far; the
            # file written for fold k therefore reflects folds 0..k.
            sub_preds += clf.predict(test[feats])
            write2file(test[ID], sub_preds, i_fold)
        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = clf.feature_importance(
            importance_type='gain')
        fold_importance_df["fold"] = i_fold + 1
        feature_importance_df = pd.concat(
            [feature_importance_df, fold_importance_df], axis=0)
        f1 = f1_score(dvalid.label, y_pre, average='macro')
        log.warn('Fold {} f1 : {} score {}'.format(i_fold + 1, f1, f1**2))
        del clf, dtrain, dvalid
        gc.collect()
    display_importances(feature_importance_df)
예제 #9
0
        df = pd.DataFrame()
        df[ID] = col_id
        df['predict'] = index2label(y_pre)
        df.to_csv('result{}.csv'.format(name), index=False)


# Display/plot feature importance
def display_importances(feature_importance_df_):
    """Plot the 40 features with the highest mean importance and save the figure.

    :param feature_importance_df_: frame with at least the ``feature`` and
        ``importance`` columns, one row per feature per fold
    """
    mean_importance = feature_importance_df_[["feature", "importance"]] \
        .groupby("feature").mean()
    ranked = mean_importance.sort_values(by="importance", ascending=False)
    top_features = ranked[:40].index

    # Keep every per-fold row of the selected features so the bar plot can
    # aggregate them itself.
    selected_rows = feature_importance_df_.feature.isin(top_features)
    best_features = feature_importance_df_.loc[selected_rows]
    plot_data = best_features.sort_values(by="importance", ascending=False)

    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance", y="feature", data=plot_data)
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.savefig('lgbm_importances01.png')


if __name__ == '__main__':
    # Script entry point: build the train/test frames, encode the labels,
    # then run the 5-fold training with up to 10000 boosting rounds per fold.
    with timer('data process'):
        df_train, df_test = eda()
        # Called with the training frame and the label column name;
        # presumably encodes the labels in place - verify against its
        # definition.
        label2index(df_train, LABEL)
    with timer('model process'):
        model(df_train, df_test, num_folds=5, num_boost_round=10000)
예제 #10
0
def main():
    """Command-line entry point that trains the ``Bert_CRF`` tagger.

    Parses the CLI options, builds the tag vocabulary from the tags file,
    optionally restores a previous state dict, then trains with Adam for
    up to ``--epochs`` epochs. After every epoch the model is evaluated on
    ``--valid_input`` (and, unless ``--undo_train_valid`` is given, on a
    random sample of the training set). The checkpoint is saved whenever
    the ``--monitor`` metric improves, and a snapshot is saved every 5
    epochs. Once more than 8 epochs pass without improvement, the learning
    rate is divided by 3 while ``--lr_times`` budget remains; after that,
    training stops early.
    """
    params = {
        'output_dir': str(Path(RESULT_DIR, 'res_torch')),
        'checkpoint': str(Path(RESULT_DIR, 'res_torch/model')),
        'glove_dim': 300,
        'vocab_tags': str(Path(DATA_DIR, 'processed/vocab.tags.txt')),
        'glove': str(Path(DATA_DIR, 'embedding/glove.npz')),
        'words': str(Path(DATA_DIR, 'processed/vocab.words.txt')),
        'tags': str(Path(DATA_DIR, 'processed/vocab.tags.txt')),
    }

    def _str2bool(value):
        # argparse's type=bool calls bool() on the raw string, so every
        # non-empty value (including "False") becomes True. Parse the
        # text explicitly instead.
        text = str(value).strip().lower()
        if text in ('1', 'true', 't', 'yes', 'y'):
            return True
        if text in ('', '0', 'false', 'f', 'no', 'n'):
            return False
        raise argparse.ArgumentTypeError(
            f'expected a boolean value, got {value!r}')

    parser = argparse.ArgumentParser()
    parser.add_argument('--undo_train_valid', help="undo train data as valid",
                        action='store_true', default=False)
    parser.add_argument('--input', help="input dir or file",
                        type=str, required=True)
    parser.add_argument('--valid_input', help="valid data input dir or file",
                        type=str, required=True)
    parser.add_argument('--output', help="output file dir for writing result",
                        type=str, default=params['output_dir'])
    parser.add_argument('--limit', help="if use data limit",
                        type=int, default=0)
    parser.add_argument('--gpu_index', help="gpu index must>-1,if use gpu",
                        type=int, default=0)
    parser.add_argument('--dropout',
                        help="dropout rate in embed and liner layer",
                        type=float, default=0.2)
    parser.add_argument('--batch_size', help="batch size od data",
                        type=int, default=32)
    parser.add_argument('--hidden_size', help="set the hidden size",
                        type=int, default=128)
    parser.add_argument('--epochs', help="epochs of train",
                        type=int, default=100)

    parser.add_argument('--monitor',
                        help="monitor f1,acc,precision or recall, "
                             "value like ORG:f1 or PER:acc or LOC:recall",
                        type=str, default='ORG:f1')
    parser.add_argument('--use_glove', help="denote whether use use_glove",
                        type=_str2bool, default=False)
    parser.add_argument('--model_name', help="file name of model file",
                        type=str, default='ner_model_crf')
    parser.add_argument('--mode_type',
                        help="choose transformer(t) or biLstm(b) or only crf(c)",
                        choices=['b', 't', 'c', 'bt', 'cnn'],
                        type=str, default='b')
    parser.add_argument('--bert_dim', help="bert dim",
                        type=int, default=768)
    parser.add_argument('--te_dropout', help="te dropout",
                        type=float, default=0.1)
    parser.add_argument('--lr', help="learning rate",
                        type=float, default=3e-4)
    parser.add_argument('--lr_times', help="learning rate decay times",
                        type=int, default=0)
    parser.add_argument('--wd', help="weight decay",
                        type=float, default=1e-3)
    parser.add_argument('--head_num', help="set the head num",
                        type=int, default=8)
    parser.add_argument('--vip', help="the ip or domain of visdom server",
                        type=str, default='')
    parser.add_argument('--env', help="the name of env of visdom",
                        type=str, default='ner')

    parser.add_argument('--pre_model_path', help="the pre model path",
                        type=str, default='')
    parser.add_argument('--use_cross_entropy', help="use cross entropy loss",
                        action='store_true', default=False)
    args = parser.parse_args()

    # Validate the monitor spec once, before any training work, instead of
    # splitting it again at the end of every epoch.
    tag_type, monitor_type = args.monitor.split(':')

    params['dropout'] = args.dropout
    params['use_glove'] = args.use_glove
    params['bert_dim'] = args.bert_dim
    params['mode_type'] = args.mode_type
    params['hidden_size'] = args.hidden_size
    # just for transformer
    params['te_dropout'] = args.te_dropout
    params['head_num'] = args.head_num
    params['use_cross_entropy'] = args.use_cross_entropy

    model_time_str = args.model_name + '_' + bu.get_time_str()

    log = bu.get_logger(model_time_str)

    if args.vip:
        vis = visdom.Visdom(args.vip, env=args.env)
    else:
        vis = None

    word_to_ix = {'<pad>': 0}
    if params['use_glove']:
        with open(params['words']) as wvf:
            for word in wvf:
                word = word.strip()
                if word not in word_to_ix:
                    word_to_ix[word] = len(word_to_ix)

    # 'O' is pinned to index 0; the other tags get indices in file order.
    tag_to_ix = {'O': 0}
    with open(params['tags']) as wvf:
        for tag in wvf:
            tag = tag.strip()
            if tag not in tag_to_ix:
                tag_to_ix[tag] = len(tag_to_ix)
    idx_to_tag = {tag_to_ix[key]: key for key in tag_to_ix}

    if args.gpu_index > -1:
        device = torch.device(f'cuda:{args.gpu_index}')
    else:
        device = torch.device('cpu')

    model = Bert_CRF(tag_to_ix, params, device)
    model.to(device)

    if args.pre_model_path:
        with Path(args.pre_model_path).open('rb') as mp:
            # Remap to CPU only when no GPU was requested.
            if args.gpu_index < 0:
                ml = 'cpu'
            else:
                ml = None
            best_state_dict = torch.load(mp, map_location=ml)
            # Non-strict load: missing/unexpected keys are tolerated.
            model.load_state_dict(best_state_dict, False)

    optimizer = optim.Adam(model.parameters(), lr=args.lr,
                           weight_decay=args.wd)

    # begin to train model
    step_index = 0

    # model, bert_dim, tag_to_ix, word_to_ix, rw, batch
    collate_fn = functools.partial(data_provider.collect_fn, model,
                                   params['bert_dim'], tag_to_ix, None,
                                   False)
    with bu.timer('load train data'):
        dataset = data_provider.BBNDatasetCombine(args.input,
                                                  args.limit)
    data_loader = tud.DataLoader(dataset, args.batch_size,
                                 shuffle=True, collate_fn=collate_fn,
                                 drop_last=True)

    if not args.undo_train_valid:
        # Evaluate on 5000 training samples drawn with replacement.
        sampler = tud.RandomSampler(data_source=dataset,
                                    replacement=True,
                                    num_samples=5000)
    else:
        sampler = None

    log.info('begin to train')
    Path(params['checkpoint']).mkdir(parents=True, exist_ok=True)
    monitor_best = 0
    wait = 0
    loss_train_epoch = []
    loss_valid_epoch = []
    loss_train_t = []
    loss_train_valid = []
    criterion_key = ['f1', 'precision', 'recall']
    criterion_map = {}

    lr_times = args.lr_times
    lr = args.lr
    for epoch in range(args.epochs):
        loss_train = []

        # index_batch, words_batch, words_ids_batch, len_w_batch, tags_batch
        # sentence_batch
        for i, w, wi, l, t, _ in data_loader:
            # Step 1. Remember that Pytorch accumulates gradients.
            model.zero_grad()
            # Step 2. Run our forward pass.
            # words, words_ids, len_w, tags
            loss = model.neg_log_likelihood(w, wi, l, t)
            # Step 3. Compute the loss, gradients, and update the parameters by
            # calling optimizer.step()
            ls = loss.mean()
            ls.backward()
            optimizer.step()
            step_index += 1
            step_loss = ls.item()
            log.info(
                f'global step:{step_index} epoch:{epoch} loss:{step_loss}')
            loss_train.append(step_loss)
            loss_train_t.append(step_loss)
            plot(vis, loss_train_t, args.model_name, ['train_loss'])

        if sampler:
            # collate_fn, model, args, tag_to_ix = None, idx_to_tag = None,
            # fpr = True, get_loss = False, input_dir = None, dataset_in = None,
            # sampler = None
            criterion, loss_valid_ = evaluate(collate_fn, model, args,
                                              tag_to_ix, idx_to_tag,
                                              True, True,
                                              dataset_in=dataset,
                                              sampler=sampler)
            for k in criterion:
                # ['f1', 'precision', 'recall']
                for ck in criterion_key:
                    key = f'train_{k}_{ck}'
                    if key not in criterion_map:
                        criterion_map[key] = []
                    criterion_map[key].append(criterion[k][ck])
            loss_train_valid.append(np.mean(loss_valid_))

        criterion, loss_valid = evaluate(collate_fn, model, args,
                                         tag_to_ix, idx_to_tag, True, True,
                                         input_dir=args.valid_input)
        loss_train_epoch.append(np.mean(loss_train))
        loss_valid_epoch.append(np.mean(loss_valid))

        for k in criterion:
            # ['f1', 'precision', 'recall']
            for ck in criterion_key:
                key = f'valid_{k}_{ck}'
                if key not in criterion_map:
                    criterion_map[key] = []
                criterion_map[key].append(criterion[k][ck])
        plot_data = []
        keys = list(criterion_map.keys())
        for k in criterion_map:
            plot_data.append(criterion_map[k])
        if sampler:
            legend = ['train_loss', 'valid_loss',
                      'train_loss_t'] + keys
            x_in = zip(loss_train_epoch, loss_valid_epoch,
                       loss_train_valid, *plot_data)
        else:
            legend = ['train_loss', 'valid_loss'] + keys
            x_in = zip(loss_train_epoch, loss_valid_epoch, *plot_data)
        plot(vis, x_in, args.model_name, legend)

        log.info(f'valid:{criterion}')
        if (criterion[tag_type][monitor_type] > monitor_best
                or monitor_best == 0):
            monitor_best = criterion[tag_type][monitor_type]
            wait = 0
            best_state_dict = model.state_dict()
            # A score of exactly 0 resets the counter but is not saved.
            if monitor_best:
                save_mode(best_state_dict, params, tag_to_ix, args.model_name)
        else:
            wait += 1
        if (epoch + 1) % 5 == 0:
            temp_name = f't_{args.model_name}_{epoch+1}'
            save_mode(model.state_dict(), params, tag_to_ix, temp_name)
        if wait > 8:
            if lr_times:
                # Spend one decay: cut the learning rate to a third and
                # rebuild the optimizer. wait restarts at 3, so the next
                # decision comes after 6 more stale epochs.
                lr_times -= 1
                wait = 3
                lr /= 3
                optimizer = optim.Adam(model.parameters(), lr=lr,
                                       weight_decay=args.wd)
            else:
                log.warn(f'meat early stopping! best score is {monitor_best}')
                break
    # Logged once, after the loop ends (normally or by early stopping),
    # rather than at the end of every epoch.
    log.info('finish train')
예제 #11
0
def main():
    """Command-line entry point that runs a saved ``Bert_CRF`` model on a dataset.

    Loads ``{model_name}_torch.pkl`` and ``{model_name}_config.pkl`` from
    the checkpoint directory, rebuilds the model from the stored
    parameters and tag map, and calls ``evaluate`` with a batch size of 1
    and ``{output}/{model_name}.txt`` as its output argument.
    """
    params = {
        'output_dir': str(Path(RESULT_DIR, 'res_torch')),
        'checkpoint': str(Path(RESULT_DIR, 'res_torch/model')),
        'glove_dim': 300,
        'vocab_tags': str(Path(DATA_DIR, 'processed/vocab.tags.txt')),
        'glove': str(Path(DATA_DIR, 'embedding/glove.npz')),
        'words': str(Path(DATA_DIR, 'processed/vocab.words.txt')),
        'tags': str(Path(DATA_DIR, 'processed/vocab.tags.txt')),
    }

    # Declarative option table: (flag, keyword arguments for add_argument).
    cli_options = (
        ('--input',
         dict(help="input dir or file", type=str, required=True)),
        ('--output',
         dict(help="output file dir for writing result", type=str,
              default=params['output_dir'])),
        ('--limit',
         dict(help="if use data limit", type=int, default=0)),
        ('--gpu_index',
         dict(help="gpu index must>-1,if use gpu", type=int, default=0)),
        ('--model_name',
         dict(help="file name of model file", type=str,
              default='ner_model_crf')),
    )
    parser = argparse.ArgumentParser()
    for flag, options in cli_options:
        parser.add_argument(flag, **options)
    args = parser.parse_args()

    log = bu.get_logger(args.model_name + '_' + bu.get_time_str())
    log.info('begin predict')

    checkpoint_dir = params['checkpoint']
    fn_model = checkpoint_dir + f'/{args.model_name}_torch.pkl'
    fn_config = checkpoint_dir + f'/{args.model_name}_config.pkl'

    use_gpu = args.gpu_index > -1
    # Remap tensors to CPU only when no GPU index was requested.
    map_location = None if use_gpu else 'cpu'
    with Path(fn_model).open('rb') as model_file:
        best_state_dict = torch.load(model_file, map_location=map_location)
    # From here on `params` is the dict stored next to the checkpoint, not
    # the defaults built above.
    with Path(fn_config).open('rb') as config_file:
        params, tag_to_ix = pickle.load(config_file)
    print(tag_to_ix)
    idx_to_tag = {index: tag for tag, index in tag_to_ix.items()}

    device = torch.device(f'cuda:{args.gpu_index}' if use_gpu else 'cpu')
    model = Bert_CRF(tag_to_ix, params, device)
    model.to(device)
    model.load_state_dict(best_state_dict, strict=False)

    with bu.timer('load data'):
        dataset = data_provider.BBNDatasetCombine(args.input, args.limit)
    # change batch_size to 1
    args.batch_size = 1

    # model, bert_dim, tag_to_ix, word_to_ix, rw, batch
    collate_fn = functools.partial(data_provider.collect_fn, model,
                                   params['bert_dim'], tag_to_ix, None, True)
    log.warn(f"{'-'*25}test_valid{'-'*25}")
    result_path = f"{args.output}/{args.model_name}.txt"
    evaluate(collate_fn, model, args, tag_to_ix, idx_to_tag, True, False,
             result_path, dataset_in=dataset)
예제 #12
0
파일: model.py 프로젝트: lyliu/kaggle
def model(train, test, num_folds=5, stratified=True, num_boost_round=1000):
    """Train LightGBM with K-fold validation and write test predictions.

    For every fold a model is trained on the fold's training rows and
    scored with macro F1 on the held-out rows. Its test-set class scores
    are added to a running sum, which is written out through
    ``write2file`` after each fold. Gain feature importances of all folds
    are passed to ``display_importances`` at the end.

    :param train: training frame with ``user_id`` and ``current_service``
    :param test: test frame with ``user_id`` and the feature columns
    :param num_folds: number of folds
    :param stratified: use ``StratifiedKFold`` when true, else ``KFold``
    :param num_boost_round: maximum boosting rounds per fold
    """
    # Divide in training/validation and test data
    ID_COLUMN_NAME = 'user_id'
    LABEL_COLUMN_NAME = 'current_service'
    LABEL_SIZE = train[LABEL_COLUMN_NAME].value_counts().count()

    print("Starting LightGBM. Train shape: {}, test shape: {}".format(train.shape, test.shape))

    gc.collect()
    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=1001)
    else:
        folds = KFold(n_splits=num_folds, shuffle=True, random_state=1001)
    # Create arrays and dataframes to store results
    sub_preds = np.zeros(shape=(test.shape[0], LABEL_SIZE))
    feature_importance_df = pd.DataFrame()
    feats = [f for f in train.columns if
             f not in [LABEL_COLUMN_NAME, ID_COLUMN_NAME]]

    # The parameters are the same for every fold, so build them once.
    params = {
        'objective': 'multiclass',
        'boosting_type': 'gbdt',
        'learning_rate': 0.1,
        'num_leaves': 80,
        'feature_fraction': 0.85,
        'bagging_fraction': 0.9,
        'bagging_freq': 10,
        'num_threads': 35,
        'verbose': -1,
        'max_bin': 550,
        'num_class': LABEL_SIZE
    }

    for i_fold, (train_idx, valid_idx) in enumerate(folds.split(train[feats], train[LABEL_COLUMN_NAME])):
        dtrain = lgb.Dataset(data=train[feats].iloc[train_idx],
                             label=train[LABEL_COLUMN_NAME].iloc[train_idx],
                             free_raw_data=False, silent=True)
        dvalid = lgb.Dataset(data=train[feats].iloc[valid_idx],
                             label=train[LABEL_COLUMN_NAME].iloc[valid_idx],
                             free_raw_data=False, silent=True)

        with timer('fold {} train model'.format(i_fold)):
            clf = lgb.train(
                num_boost_round=num_boost_round,
                params=params,
                train_set=dtrain,
                valid_sets=[dvalid],
                early_stopping_rounds=50
            )
        with timer('fold {} predict'.format(i_fold)):
            v_data = clf.predict(dvalid.data)
            # Highest-scoring class per row. argmax covers however many
            # classes the model outputs (num_class is LABEL_SIZE, not a
            # fixed 15) and returns the first index on ties.
            y_pre = np.argmax(v_data, axis=1)

            # Running sum of class scores over the folds seen so far; the
            # file written for fold k therefore reflects folds 0..k.
            sub_preds += clf.predict(test[feats])
            write2file(test[ID_COLUMN_NAME], sub_preds, i_fold)
        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = clf.feature_importance(importance_type='gain')
        fold_importance_df["fold"] = i_fold + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        f1 = f1_score(dvalid.label, y_pre, average='macro')
        log.warn('Fold {} f1 : {} score {}'.format(i_fold + 1, f1, f1 ** 2))
        del clf, dtrain, dvalid
        gc.collect()
    display_importances(feature_importance_df)