Example #1
def test_itf():
    tokenizer = Tokenizer.from_pretrained(Config.model_name)
    # Load cached training data from the pickle if available; otherwise build it from text.
    if Config.use_pickle:
        with open(Config.pickle_path, 'rb') as f:
            train_data = pickle.load(f)
    else:
        train_data = make_train_data_from_txt(Config, tokenizer)
    # Build a per-token counter and an inverse token frequency (ITF) tensor.
    counter, itf = make_itf(train_data, Config.vocab_size, tokenizer)
    # Optional post-processing, left disabled:
    # itf = (itf - itf.min()) / (itf.max() - itf.min())  # min-max normalize
    # for i in range(itf.size(0)):
    #     print(i, itf[i])
    # itf[itf == 0] += 1e-6  # avoid exact zeros
    # Print every token and its corpus frequency, most common first.
    for k, v in counter.most_common(len(counter)):
        print(tokenizer.decode([k]), v)
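The commented-out lines hint that itf is an inverse token frequency tensor, presumably used to reweight rare tokens. As a rough illustration only (not the project's actual implementation), make_itf could look like the sketch below, assuming train_data is a list of token-id sequences; the name make_itf_sketch and the +1 smoothing are assumptions.

from collections import Counter
import torch

def make_itf_sketch(train_data, vocab_size):
    # Count how often each token id occurs in the corpus.
    counter = Counter(tok for seq in train_data for tok in seq)
    freq = torch.zeros(vocab_size)
    for tok_id, n in counter.items():
        freq[tok_id] = n
    # Inverse token frequency: rare tokens receive larger weights.
    # The +1 is illustrative smoothing to avoid division by zero.
    itf = 1.0 / (freq + 1.0)
    return counter, itf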
Example #2
def preprocess_bert(config):
    """Tokenize each data split and cache model-ready features to disk."""
    opt = config['opt']

    from transformers import (BertTokenizer, DistilBertTokenizer,
                              AlbertTokenizer, RobertaTokenizer,
                              BartTokenizer, ElectraTokenizer)
    TOKENIZER_CLASSES = {
        "bert": BertTokenizer,
        "distilbert": DistilBertTokenizer,
        "albert": AlbertTokenizer,
        "roberta": RobertaTokenizer,
        "bart": BartTokenizer,
        "electra": ElectraTokenizer,
    }
    # Select the tokenizer class matching the configured embedding backbone.
    Tokenizer = TOKENIZER_CLASSES[config['emb_class']]

    tokenizer = Tokenizer.from_pretrained(opt.bert_model_name_or_path,
                                          do_lower_case=opt.bert_do_lower_case)
    # Build POS-tag, character, and label dictionaries from the training file.
    path = os.path.join(opt.data_dir, _TRAIN_FILE)
    poss, chars, labels = build_dict(path, config)

    # build features
    path = os.path.join(opt.data_dir, _TRAIN_FILE)
    train_features = build_features(path, tokenizer, poss, labels, config, mode='train')

    path = os.path.join(opt.data_dir, _VALID_FILE)
    valid_features = build_features(path, tokenizer, poss, labels, config, mode='valid')

    path = os.path.join(opt.data_dir, _TEST_FILE)
    test_features = build_features(path, tokenizer, poss, labels, config, mode='test')

    # write features
    path = os.path.join(opt.data_dir, _TRAIN_FILE + _FSUFFIX)
    write_features(train_features, path)

    path = os.path.join(opt.data_dir, _VALID_FILE + _FSUFFIX)
    write_features(valid_features, path)

    path = os.path.join(opt.data_dir, _TEST_FILE + _FSUFFIX)
    write_features(test_features, path)

    # write poss, labels
    path = os.path.join(opt.data_dir, _POS_FILE)
    write_dict(poss, path)
    path = os.path.join(opt.data_dir, _LABEL_FILE)
    write_dict(labels, path)
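The dictionary dispatch on config['emb_class'] is the pattern worth noting here: it lets one preprocessing routine serve several Hugging Face backbones. A minimal standalone sketch of the same idea follows; the checkpoint name is illustrative, not taken from the project.

from transformers import BertTokenizer, RobertaTokenizer

TOKENIZER_CLASSES = {
    "bert": BertTokenizer,
    "roberta": RobertaTokenizer,
}

config = {"emb_class": "roberta"}
Tokenizer = TOKENIZER_CLASSES[config["emb_class"]]
tokenizer = Tokenizer.from_pretrained("roberta-base")  # illustrative checkpoint
print(tokenizer.tokenize("Named entity recognition"))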
Example #3
File: main.py Project: NukeA/Dialog
logging.basicConfig(level=logging.INFO)

if __name__ == '__main__':
    logging.info('*** Initializing ***')

    # Create the data directory on first run.
    if not os.path.isdir(Config.data_dir):
        os.mkdir(Config.data_dir)

    seed_everything(Config.seed)
    device = torch.device(Config.device)

    start_epoch = 0

    logging.info('Define Models')
    model = build_model(Config).to(device)
    tokenizer = Tokenizer.from_pretrained(Config.model_name)

    logging.info('Define Loss and Optimizer')
    criterion = LabelSmoothing(tokenizer.vocab_size,
                               pad_id=tokenizer.pad_token_id,
                               smoothing=Config.smoothing)
    # lr=0 is intentional: the warmup scheduler returned by get_optimizer
    # sets the effective learning rate at every step.
    _opt = optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9)
    optimizer = get_optimizer(_opt, factor=Config.factor, warmup=Config.warmup)

    logging.info('Preparing training data')
    if Config.use_pickle:
        with open(Config.pickle_path, 'rb') as f:
            train_data = pickle.load(f)
    else:
        train_data = make_train_data_from_txt(Config, tokenizer)
    dataset = DialogDataset(train_data, tokenizer)
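Adam with lr=0, betas=(0.9, 0.98), and eps=1e-9, combined with factor and warmup arguments, matches the inverse-square-root warmup schedule from "Attention Is All You Need", so get_optimizer presumably wraps the optimizer in such a scheduler. Below is a hedged sketch of that wrapper; the class name and internals are assumptions, not the project's code.

import torch
from torch import optim

class NoamOpt:
    """Scales the wrapped optimizer's learning rate with an
    inverse-square-root warmup schedule."""

    def __init__(self, optimizer, model_size, factor, warmup):
        self.optimizer = optimizer
        self.model_size = model_size
        self.factor = factor
        self.warmup = warmup
        self._step = 0

    def rate(self, step):
        # lr = factor * d_model^-0.5 * min(step^-0.5, step * warmup^-1.5)
        return self.factor * self.model_size ** -0.5 * min(
            step ** -0.5, step * self.warmup ** -1.5)

    def step(self):
        self._step += 1
        lr = self.rate(self._step)
        for group in self.optimizer.param_groups:
            group['lr'] = lr
        self.optimizer.step()

# Illustrative usage with a dummy parameter:
p = torch.nn.Parameter(torch.zeros(1))
scheduled = NoamOpt(optim.Adam([p], lr=0, betas=(0.9, 0.98), eps=1e-9),
                    model_size=512, factor=1.0, warmup=4000)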