def __init__(self, input_dir, limit=0, from_one_file=True):
    self.data = []
    inputs = []
    log = bu.get_logger()

    # Accept either a single pickle file or a directory of pickle files.
    if os.path.isfile(input_dir):
        inputs.append(input_dir)
    else:
        for input_file in os.listdir(input_dir):
            file_path = os.path.join(input_dir, input_file)
            if os.path.isfile(file_path):
                inputs.append(file_path)

    # With from_one_file the whole limit may come from a single file;
    # otherwise the limit is split evenly across the input files.
    if from_one_file:
        one_file_limit = limit
    else:
        one_file_limit = limit // len(inputs)

    for input_file in inputs:
        if one_file_limit > 0 and len(self.data) >= limit:
            break
        with open(input_file, 'rb') as wfd:
            log.info(input_file)
            if one_file_limit > 0:
                self.data.extend(pickle.load(wfd)[:one_file_limit])
            else:
                self.data.extend(pickle.load(wfd))
            # Never keep more than `limit` samples in total.
            if limit > 0 and len(self.data) > limit:
                self.data = self.data[:limit]
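A minimal usage sketch for this loader, assuming it is the `__init__` of the `data_provider.BBNDatasetCombine` class referenced in Example No. 6; the directory path and limit are made up for illustration.

# Hedged usage sketch; class/module names are taken from Example No. 6,
# the directory path is illustrative only.
import data_provider

# Load at most 10,000 samples, splitting the cap evenly across files.
dataset = data_provider.BBNDatasetCombine('data/pickled_train', limit=10000,
                                          from_one_file=False)
print(len(dataset.data))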
Example No. 2
def __init__(self, args, label2index_map, input_size, paths, config):
    self.batch_size = args.batch_size
    self.epoch_num = args.epoch
    self.hidden_dim1 = args.hidden_dim1
    self.hidden_dim2 = args.hidden_dim2
    self.hidden_dim3 = args.hidden_dim3
    self.dropout_keep_prob = args.dropout
    self.beta = args.beta
    self.lr = args.lr
    self.clip_grad = args.clip
    self.optimizer = args.optimizer
    self.test_data_path = args.test_data
    self.tag2label = label2index_map
    self.num_tags = len(label2index_map)
    self.input_size = input_size
    self.config = config
    self.model_path = paths['model_path']
    self.summary_path = paths['summary_path']
    self.logger = base_util.get_logger(paths['log_path'])
    self.result_path = paths['result_path']
Example No. 3
# Build the output directory layout (checkpoints, summaries, results, log).
timestamp = str(int(time.time())) if args.mode == 'train' else args.demo_model
output_path = os.path.join(args.train_data + "_save", timestamp)
os.makedirs(output_path, exist_ok=True)
summary_path = os.path.join(output_path, "summaries")
paths['summary_path'] = summary_path
os.makedirs(summary_path, exist_ok=True)
model_path = os.path.join(output_path, "checkpoints/")
os.makedirs(model_path, exist_ok=True)
ckpt_prefix = os.path.join(model_path, "model")
paths['model_path'] = ckpt_prefix
result_path = os.path.join(output_path, "results")
paths['result_path'] = result_path
os.makedirs(result_path, exist_ok=True)
log_path = os.path.join(result_path, "log.txt")
paths['log_path'] = log_path
get_logger(log_path).info(str(args))

# Pin the job to the second GPU.
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

label2index_map, _ = load_label2index()
print(label2index_map)

# Train the model.
train_path = os.path.join(args.train_data, 'train_modified.csv')
test_path = os.path.join(args.test_data, 'test_modified.csv')
if args.mode == 'train':
    ids, train_data = read_corpus(train_path)
    print("train data: {}".format(len(train_data)))
    # Hold out everything after the first 650,000 rows for validation.
    train = train_data[:650000]
    val = train_data[650000:]
    # All columns except the label column are model inputs.
    input_size = len(train.columns) - 1
    print('input_size', input_size)
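    # NOTE: hedged continuation sketch -- the original snippet ends above.
    # `TaggerModel` and its `train()` method are placeholder names standing
    # in for the class whose __init__ is shown in Example No. 2, and a
    # TF1-style session config is assumed (tf.compat.v1.ConfigProto on TF2,
    # with `import tensorflow as tf` at the top of the script).
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    model = TaggerModel(args, label2index_map, input_size, paths, config)
    model.train(train, val)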
Example No. 4
from util.base_util import timer
from util.base_util import get_logger
import lightgbm as lgb
import numpy as np
import pickle
from sklearn.model_selection import KFold, StratifiedKFold
import pandas as pd
from sklearn.metrics import f1_score
from hyperopt import hp
from hyperopt import tpe
from hyperopt import Trials
from hyperopt import fmin

ITERATION = 0

log = get_logger()


def cross_validation(train,
                     params,
                     ID_COLUMN_NAME,
                     LABEL_COLUMN_NAME,
                     N_FOLD=5):
    '''Run stratified K-fold cross-validation with LightGBM and return the
    validation loss.
    '''
    NUM_BOOST_ROUND = 1000
    EARLY_STOPPING_ROUNDS = 50

    # Stratified folds keep the label distribution comparable across folds.
    folds = StratifiedKFold(n_splits=N_FOLD, shuffle=True, random_state=1001)
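    # NOTE: hedged continuation sketch -- the original function is truncated
    # here.  Column handling, the multiclass argmax, macro-F1 averaging and
    # the returned loss are assumptions, not the author's code; the
    # early_stopping_rounds / verbose_eval keywords assume an older LightGBM
    # release (newer versions use callbacks instead).
    feats = [c for c in train.columns
             if c not in (ID_COLUMN_NAME, LABEL_COLUMN_NAME)]
    scores = []
    for n_fold, (trn_idx, val_idx) in enumerate(
            folds.split(train[feats], train[LABEL_COLUMN_NAME])):
        trn_set = lgb.Dataset(train[feats].iloc[trn_idx],
                              label=train[LABEL_COLUMN_NAME].iloc[trn_idx])
        val_set = lgb.Dataset(train[feats].iloc[val_idx],
                              label=train[LABEL_COLUMN_NAME].iloc[val_idx])
        booster = lgb.train(params, trn_set,
                            num_boost_round=NUM_BOOST_ROUND,
                            valid_sets=[val_set],
                            early_stopping_rounds=EARLY_STOPPING_ROUNDS,
                            verbose_eval=False)
        preds = np.argmax(booster.predict(train[feats].iloc[val_idx]), axis=1)
        scores.append(f1_score(train[LABEL_COLUMN_NAME].iloc[val_idx],
                               preds, average='macro'))
        log.info('fold %d f1: %f', n_fold, scores[-1])
    # The caller (e.g. a hyperopt objective) minimizes, so return 1 - mean F1.
    return 1 - np.mean(scores)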
Example No. 5
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score

from sklearn.model_selection import KFold, StratifiedKFold
from util.base_util import timer
import os
from competitions311 import data_process
import tensorflow as tf
from util import base_util

log = base_util.get_logger()

ID_COLUMN_NAME = 'user_id'
LABEL_COLUMN_NAME = 'current_service'


def nn_model(df_train, df_test):
    pass


class FeatureNN():
    def __init__(self,
                 x_train,
                 y_train,
                 x_val,
                 y_val,
                 epoch=10,
                 batch_size=1500):

        self.epoch = epoch
Example No. 6
def main():
    params = {
        'output_dir': str(Path(RESULT_DIR, 'res_torch')),
        'checkpoint': str(Path(RESULT_DIR, 'res_torch/model')),
        'glove_dim': 300,
        'vocab_tags': str(Path(DATA_DIR, 'processed/vocab.tags.txt')),
        'glove': str(Path(DATA_DIR, 'embedding/glove.npz')),
        'words': str(Path(DATA_DIR, 'processed/vocab.words.txt')),
        'tags': str(Path(DATA_DIR, 'processed/vocab.tags.txt')),
    }

    parser = argparse.ArgumentParser()
    parser.add_argument('--undo_train_valid',
                        help="skip evaluation on a sampled subset of the train data",
                        action='store_true', default=False)
    parser.add_argument('--input', help="input dir or file",
                        type=str, required=True)
    parser.add_argument('--valid_input', help="valid data input dir or file",
                        type=str, required=True)
    parser.add_argument('--output', help="output file dir for writing result",
                        type=str, default=params['output_dir'])
    parser.add_argument('--limit',
                        help="limit the number of samples to load (0 = no limit)",
                        type=int, default=0)
    parser.add_argument('--gpu_index',
                        help="GPU index (> -1 to use the GPU, -1 for CPU)",
                        type=int, default=0)
    parser.add_argument('--dropout',
                        help="dropout rate in the embedding and linear layers",
                        type=float, default=0.2)
    parser.add_argument('--batch_size', help="batch size of the data",
                        type=int, default=32)
    parser.add_argument('--hidden_size', help="set the hidden size",
                        type=int, default=128)
    parser.add_argument('--epochs', help="number of training epochs",
                        type=int, default=100)

    parser.add_argument('--monitor',
                        help="metric to monitor, "
                             "a value like ORG:f1, PER:acc or LOC:recall",
                        type=str, default='ORG:f1')
    parser.add_argument('--use_glove',
                        help="whether to use GloVe embeddings",
                        action='store_true', default=False)
    parser.add_argument('--model_name', help="file name of model file",
                        type=str, default='ner_model_crf')
    parser.add_argument('--mode_type',
                        help="choose transformer(t) or biLstm(b) or only crf(c)",
                        choices=['b', 't', 'c', 'bt', 'cnn'],
                        type=str, default='b')
    parser.add_argument('--bert_dim', help="BERT embedding dimension",
                        type=int, default=768)
    parser.add_argument('--te_dropout', help="transformer-encoder dropout",
                        type=float, default=0.1)
    parser.add_argument('--lr', help="learning rate",
                        type=float, default=3e-4)
    parser.add_argument('--lr_times', help="learning rate decay times",
                        type=int, default=0)
    parser.add_argument('--wd', help="weight decay",
                        type=float, default=1e-3)
    parser.add_argument('--head_num', help="number of attention heads",
                        type=int, default=8)
    parser.add_argument('--vip', help="IP or domain of the visdom server",
                        type=str, default='')
    parser.add_argument('--env', help="name of the visdom environment",
                        type=str, default='ner')

    parser.add_argument('--pre_model_path',
                        help="path of a pre-trained model to load before training",
                        type=str, default='')
    parser.add_argument('--use_cross_entropy', help="use cross entropy loss",
                        action='store_true', default=False)
    args = parser.parse_args()

    params['dropout'] = args.dropout
    params['use_glove'] = args.use_glove
    params['bert_dim'] = args.bert_dim
    params['mode_type'] = args.mode_type
    params['hidden_size'] = args.hidden_size
    # just for transformer
    params['te_dropout'] = args.te_dropout
    params['head_num'] = args.head_num
    params['use_cross_entropy'] = args.use_cross_entropy

    model_time_str = args.model_name + '_' + bu.get_time_str()

    log = bu.get_logger(model_time_str)

    if args.vip:
        vis = visdom.Visdom(args.vip, env=args.env)
    else:
        vis = None

    word_to_ix = {'<pad>': 0}
    if params['use_glove']:
        with open(params['words']) as wvf:
            for word in wvf:
                word = word.strip()
                if word not in word_to_ix:
                    word_to_ix[word] = len(word_to_ix)

    tag_to_ix = {'O': 0}
    with open(params['tags']) as wvf:
        for tag in wvf:
            tag = tag.strip()
            if tag not in tag_to_ix:
                tag_to_ix[tag] = len(tag_to_ix)
    idx_to_tag = {tag_to_ix[key]: key for key in tag_to_ix}

    if args.gpu_index > -1:
        device = torch.device(f'cuda:{args.gpu_index}')
    else:
        device = torch.device('cpu')

    model = Bert_CRF(tag_to_ix, params, device)
    model.to(device)

    if args.pre_model_path:
        with Path(args.pre_model_path).open('rb') as mp:
            if args.gpu_index < 0:
                ml = 'cpu'
            else:
                ml = None
            best_state_dict = torch.load(mp, map_location=ml)
            model.load_state_dict(best_state_dict, strict=False)

    optimizer = optim.Adam(model.parameters(), lr=args.lr,
                           weight_decay=args.wd)

    # begin to train model
    step_index = 0

    # model, bert_dim, tag_to_ix, word_to_ix, rw, batch
    collate_fn = functools.partial(data_provider.collect_fn, model,
                                   params['bert_dim'], tag_to_ix, None,
                                   False)
    with bu.timer('load train data'):
        dataset = data_provider.BBNDatasetCombine(args.input,
                                                  args.limit)
    data_loader = tud.DataLoader(dataset, args.batch_size,
                                 shuffle=True, collate_fn=collate_fn,
                                 drop_last=True)

    if not args.undo_train_valid:
        sampler = tud.RandomSampler(data_source=dataset,
                                    replacement=True,
                                    num_samples=5000)
    else:
        sampler = None

    log.info('begin to train')
    Path(params['checkpoint']).mkdir(parents=True, exist_ok=True)
    monitor_best = 0
    wait = 0
    loss_train_epoch = []
    loss_valid_epoch = []
    loss_train_t = []
    loss_train_valid = []
    criterion_key = ['f1', 'precision', 'recall']
    criterion_map = {}

    lr_times = args.lr_times
    lr = args.lr
    for epoch in range(args.epochs):
        loss_train = []

        # index_batch, words_batch, words_ids_batch, len_w_batch, tags_batch
        # sentence_batch
        for i, w, wi, l, t, _ in data_loader:
            # PyTorch accumulates gradients, so clear them before each step.
            model.zero_grad()
            # Forward pass: compute the CRF negative log-likelihood.
            # words, words_ids, len_w, tags
            loss = model.neg_log_likelihood(w, wi, l, t)
            # Backward pass, then update the parameters.
            ls = loss.mean()
            ls.backward()
            optimizer.step()
            step_index += 1
            step_loss = ls.item()
            log.info(
                f'global step:{step_index} epoch:{epoch} loss:{step_loss}')
            loss_train.append(step_loss)
            loss_train_t.append(step_loss)
            plot(vis, loss_train_t, args.model_name, ['train_loss'])

        if sampler:
            # collate_fn, model, args, tag_to_ix = None, idx_to_tag = None,
            # fpr = True, get_loss = False, input_dir = None, dataset_in = None,
            # sampler = None
            criterion, loss_valid_ = evaluate(collate_fn, model, args,
                                              tag_to_ix, idx_to_tag,
                                              True, True,
                                              dataset_in=dataset,
                                              sampler=sampler)
            for k in criterion:
                # ['f1', 'precision', 'recall']
                for ck in criterion_key:
                    key = f'train_{k}_{ck}'
                    if key not in criterion_map:
                        criterion_map[key] = []
                    criterion_map[key].append(criterion[k][ck])
            loss_train_valid.append(np.mean(loss_valid_))

        criterion, loss_valid = evaluate(collate_fn, model, args,
                                         tag_to_ix, idx_to_tag, True, True,
                                         input_dir=args.valid_input)
        loss_train_epoch.append(np.mean(loss_train))
        loss_valid_epoch.append(np.mean(loss_valid))

        for k in criterion:
            # ['f1', 'precision', 'recall']
            for ck in criterion_key:
                key = f'valid_{k}_{ck}'
                if key not in criterion_map:
                    criterion_map[key] = []
                criterion_map[key].append(criterion[k][ck])
        plot_data = []
        keys = list(criterion_map.keys())
        for k in criterion_map:
            plot_data.append(criterion_map[k])
        if sampler:
            legend = ['train_loss', 'valid_loss',
                      'train_loss_t'] + keys
            x_in = zip(loss_train_epoch, loss_valid_epoch,
                       loss_train_valid, *plot_data)
        else:
            legend = ['train_loss', 'valid_loss'] + keys
            x_in = zip(loss_train_epoch, loss_valid_epoch, *plot_data)
        plot(vis, x_in, args.model_name, legend)

        log.info(f'valid:{criterion}')
        tag_type, monitor_type = args.monitor.split(':')
        if (criterion[tag_type][monitor_type] > monitor_best
                or monitor_best == 0):
            monitor_best = criterion[tag_type][monitor_type]
            wait = 0
            best_state_dict = model.state_dict()
            if monitor_best:
                save_mode(best_state_dict, params, tag_to_ix, args.model_name)
        else:
            wait += 1
        if (epoch + 1) % 5 == 0:
            temp_name = f't_{args.model_name}_{epoch+1}'
            save_mode(model.state_dict(), params, tag_to_ix, temp_name)
        if wait > 8:
            if lr_times:
                lr_times -= 1
                wait = 3
                lr /= 3
                optimizer = optim.Adam(model.parameters(), lr=lr,
                                       weight_decay=args.wd)
            else:
                log.warning(f'met early stopping! best score is {monitor_best}')
                break
    log.info('finish train')
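The `save_mode` helper called above is not shown in this listing. Below is a hedged reconstruction that is merely consistent with what the prediction script in Example No. 7 reads back (`<model_name>_torch.pkl` via `torch.load` and `<model_name>_config.pkl` via `pickle.load`); its exact signature and file layout are assumptions, not the author's code.

# Hedged sketch of save_mode; not part of the original listing.
import pickle
from pathlib import Path

import torch


def save_mode(state_dict, params, tag_to_ix, model_name):
    ckpt_dir = Path(params['checkpoint'])
    ckpt_dir.mkdir(parents=True, exist_ok=True)
    # Weights, loaded later with torch.load(...).
    torch.save(state_dict, str(ckpt_dir / f'{model_name}_torch.pkl'))
    # Hyper-parameters and tag vocabulary, loaded later with pickle.load(...).
    with (ckpt_dir / f'{model_name}_config.pkl').open('wb') as fh:
        pickle.dump((params, tag_to_ix), fh)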
Example No. 7
def main():
    params = {
        'output_dir': str(Path(RESULT_DIR, 'res_torch')),
        'checkpoint': str(Path(RESULT_DIR, 'res_torch/model')),
        'glove_dim': 300,
        'vocab_tags': str(Path(DATA_DIR, 'processed/vocab.tags.txt')),
        'glove': str(Path(DATA_DIR, 'embedding/glove.npz')),
        'words': str(Path(DATA_DIR, 'processed/vocab.words.txt')),
        'tags': str(Path(DATA_DIR, 'processed/vocab.tags.txt')),
    }

    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        help="input dir or file",
                        type=str,
                        required=True)
    parser.add_argument('--output',
                        help="output file dir for writing result",
                        type=str,
                        default=params['output_dir'])
    parser.add_argument('--limit',
                        help="limit the number of samples to load (0 = no limit)",
                        type=int,
                        default=0)
    parser.add_argument('--gpu_index',
                        help="GPU index (> -1 to use the GPU, -1 for CPU)",
                        type=int,
                        default=0)
    parser.add_argument('--model_name',
                        help="file name of model file",
                        type=str,
                        default='ner_model_crf')
    args = parser.parse_args()

    model_time_str = args.model_name + '_' + bu.get_time_str()

    log = bu.get_logger(model_time_str)

    log.info('begin predict')
    fn_model = params['checkpoint'] + f'/{args.model_name}_torch.pkl'
    fn_config = params['checkpoint'] + f'/{args.model_name}_config.pkl'
    with Path(fn_model).open('rb') as mp:
        if args.gpu_index < 0:
            ml = 'cpu'
        else:
            ml = None
        best_state_dict = torch.load(mp, map_location=ml)
    with Path(fn_config).open('rb') as mp:
        params, tag_to_ix = pickle.load(mp)
    print(tag_to_ix)
    idx_to_tag = {tag_to_ix[key]: key for key in tag_to_ix}
    if args.gpu_index > -1:
        device = torch.device(f'cuda:{args.gpu_index}')
    else:
        device = torch.device('cpu')
    model = Bert_CRF(tag_to_ix, params, device)
    model.to(device)
    model.load_state_dict(best_state_dict, strict=False)

    with bu.timer('load data'):
        dataset = data_provider.BBNDatasetCombine(args.input, args.limit)
    # change batch_size to 1
    args.batch_size = 1

    # model, bert_dim, tag_to_ix, word_to_ix, rw, batch
    collate_fn = functools.partial(data_provider.collect_fn, model,
                                   params['bert_dim'], tag_to_ix, None, True)
    log.warning(f"{'-'*25}test_valid{'-'*25}")
    evaluate(collate_fn,
             model,
             args,
             tag_to_ix,
             idx_to_tag,
             True,
             False,
             f"{args.output}/{args.model_name}.txt",
             dataset_in=dataset)
Example No. 8
import pickle
import re

import numpy as np
import torch

import pathlib

from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
from torch.utils.data.distributed import DistributedSampler

from pytorch_pretrained_bert.tokenization import BertTokenizer
from pytorch_pretrained_bert.modeling import BertModel

import util.base_util as bu

logger = bu.get_logger(__name__)
CUDA_ID_PATTERN = re.compile(r'(\d+,)*\d+')

MODEL_PATH = '../config/bert/bert-base-chinese.tar.gz'
VOCAB_PATH = '../config/bert/bert-base-chinese-vocab.txt'
DATA_SET = set()

LIMITED = 0


class InputExample(object):
    def __init__(self, unique_id, text, entity_map):
        self.unique_id = unique_id
        self.text = text
        self.entity_map = entity_map
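A minimal construction sketch for `InputExample`; the text and entity map below are invented purely for illustration and do not reflect the real corpus format.

# Illustrative values only; the actual data format is not shown in this listing.
example = InputExample(unique_id=0,
                       text='马云在杭州创办了阿里巴巴',
                       entity_map={'马云': 'PER', '杭州': 'LOC', '阿里巴巴': 'ORG'})
print(example.unique_id, example.text, example.entity_map)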