def preprocess() -> argparse.Namespace:
    """
    preprocess of training
    :return: config args
    """
    print('preprocessing starts...\n')
    # ====== parse arguments ====== #
    args = parse_args()
    # ====== set random seed ====== #
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    # ====== save path ====== #
    now_time = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
    args.save_path = os.path.join('./logs/', 'my_log-' + now_time)
    if not os.path.exists(args.save_path) and not args.debug:
        os.makedirs(args.save_path)
    # ====== fitlog init ====== #
    fitlog.commit(__file__)
    fitlog.debug(args.debug)
    fitlog.add_hyper(args)
    # ====== tb VisualLogger init ====== #
    args.visual_logger = VisualLogger(
        args.save_path) if not args.debug else None
    # ====== cuda enable ====== #
    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpuid)
    args.device = torch.device(
        'cuda') if args.cuda and torch.cuda.is_available() else torch.device(
            'cpu')
    # ====== others ====== #
    os.environ['TOKENIZERS_PARALLELISM'] = 'false'
    torch.set_num_threads(6)
    print(args, end='\n\n')
    return args
示例#2
0
import torch as tc
import torch.nn as nn
import torch.nn.functional as F
import pdb
import fitlog
from .transformer_sublayers import Encoder, Decoder
from fastNLP.embeddings.bert_embedding import BertEmbedding
from fastNLP.embeddings.static_embedding import StaticEmbedding

fitlog.commit(__file__)


class Model(nn.Module):
    def __init__(self,
                 vocab,
                 logger,
                 num_layers=6,
                 d_hid=1024,
                 h=8,
                 d_model=512,
                 dropout=0.0):
        super().__init__()

        #self.b_embedding = BertEmbedding(vocab, model_dir_or_name='cn-wwm', requires_grad = False, layers='4,-2,-1')
        #self.b_emb_outer = nn.Linear(self.b_embedding.embed_size , d_model)
        self.s_embedding = StaticEmbedding(vocab,
                                           "cn-sgns-literature-word",
                                           requires_grad=True)
        self.s_emb_outer = nn.Linear(self.s_embedding.embed_size, d_model)
        #self.r_embedding = nn.Embedding(len(vocab) , d_model , padding_idx = vocab.to_index("<pad>"))
        #self.r_emb_outer = nn.Linear(d_model , d_model)
示例#3
0
def train():
    args = parse_args()
    if args.debug:
        fitlog.debug()
        args.save_model = False
    # ================= define =================
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    word_mask_index = tokenizer.mask_token_id
    word_vocab_size = len(tokenizer)

    if get_local_rank() == 0:
        fitlog.set_log_dir(args.log_dir)
        fitlog.commit(__file__, fit_msg=args.name)
        fitlog.add_hyper_in_file(__file__)
        fitlog.add_hyper(args)

    # ================= load data =================
    dist.init_process_group('nccl')
    init_logger_dist()

    n_proc = dist.get_world_size()
    bsz = args.batch_size // args.grad_accumulation // n_proc
    args.local_rank = get_local_rank()
    args.save_dir = os.path.join(args.save_dir,
                                 args.name) if args.save_model else None
    if args.save_dir is not None and os.path.exists(args.save_dir):
        raise RuntimeError('save_dir has already existed.')
    logger.info('save directory: {}'.format(
        'None' if args.save_dir is None else args.save_dir))
    devices = list(range(torch.cuda.device_count()))
    NUM_WORKERS = 4

    ent_vocab, rel_vocab = load_ent_rel_vocabs()
    logger.info('# entities: {}'.format(len(ent_vocab)))
    logger.info('# relations: {}'.format(len(rel_vocab)))
    ent_freq = get_ent_freq()
    assert len(ent_vocab) == len(ent_freq), '{} {}'.format(
        len(ent_vocab), len(ent_freq))

    #####
    root = args.data_dir
    dirs = os.listdir(root)
    drop_files = []
    for dir in dirs:
        path = os.path.join(root, dir)
        max_idx = 0
        for file_name in os.listdir(path):
            if 'large' in file_name:
                continue
            max_idx = int(file_name) if int(file_name) > max_idx else max_idx
        drop_files.append(os.path.join(path, str(max_idx)))
    #####

    file_list = []
    for path, _, filenames in os.walk(args.data_dir):
        for filename in filenames:
            file = os.path.join(path, filename)
            if 'large' in file or file in drop_files:
                continue
            file_list.append(file)
    logger.info('used {} files in {}.'.format(len(file_list), args.data_dir))
    if args.data_prop > 1:
        used_files = file_list[:int(args.data_prop)]
    else:
        used_files = file_list[:round(args.data_prop * len(file_list))]

    data = GraphOTFDataSet(used_files, n_proc, args.local_rank,
                           word_mask_index, word_vocab_size, args.n_negs,
                           ent_vocab, rel_vocab, ent_freq)
    dev_data = GraphDataSet(used_files[0], word_mask_index, word_vocab_size,
                            args.n_negs, ent_vocab, rel_vocab, ent_freq)

    sampler = OTFDistributedSampler(used_files, n_proc, get_local_rank())
    train_data_iter = TorchLoaderIter(dataset=data,
                                      batch_size=bsz,
                                      sampler=sampler,
                                      num_workers=NUM_WORKERS,
                                      collate_fn=data.collate_fn)
    dev_data_iter = TorchLoaderIter(dataset=dev_data,
                                    batch_size=bsz,
                                    sampler=RandomSampler(),
                                    num_workers=NUM_WORKERS,
                                    collate_fn=dev_data.collate_fn)
    if args.test_data is not None:
        test_data = FewRelDevDataSet(path=args.test_data,
                                     label_vocab=rel_vocab,
                                     ent_vocab=ent_vocab)
        test_data_iter = TorchLoaderIter(dataset=test_data,
                                         batch_size=32,
                                         sampler=RandomSampler(),
                                         num_workers=NUM_WORKERS,
                                         collate_fn=test_data.collate_fn)

    if args.local_rank == 0:
        print('full wiki files: {}'.format(len(file_list)))
        print('used wiki files: {}'.format(len(used_files)))
        print('# of trained samples: {}'.format(len(data) * n_proc))
        print('# of trained entities: {}'.format(len(ent_vocab)))
        print('# of trained relations: {}'.format(len(rel_vocab)))

    # ================= prepare model =================
    logger.info('model init')
    if args.rel_emb is not None:  # load pretrained relation embeddings
        rel_emb = np.load(args.rel_emb)
        # add_embs = np.random.randn(3, rel_emb.shape[1])  # add <pad>, <mask>, <unk>
        # rel_emb = np.r_[add_embs, rel_emb]
        rel_emb = torch.from_numpy(rel_emb).float()
        assert rel_emb.shape[0] == len(rel_vocab), '{} {}'.format(
            rel_emb.shape[0], len(rel_vocab))
        # assert rel_emb.shape[1] == args.rel_dim
        logger.info('loaded pretrained relation embeddings. dim: {}'.format(
            rel_emb.shape[1]))
    else:
        rel_emb = None
    if args.model_name is not None:
        logger.info('further pre-train.')
        config = RobertaConfig.from_pretrained('roberta-base',
                                               type_vocab_size=3)
        model = CoLAKE(config=config,
                       num_ent=len(ent_vocab),
                       num_rel=len(rel_vocab),
                       ent_dim=args.ent_dim,
                       rel_dim=args.rel_dim,
                       ent_lr=args.ent_lr,
                       ip_config=args.ip_config,
                       rel_emb=None,
                       emb_name=args.emb_name)
        states_dict = torch.load(args.model_name)
        model.load_state_dict(states_dict, strict=True)
    else:
        model = CoLAKE.from_pretrained(
            'roberta-base',
            num_ent=len(ent_vocab),
            num_rel=len(rel_vocab),
            ent_lr=args.ent_lr,
            ip_config=args.ip_config,
            rel_emb=rel_emb,
            emb_name=args.emb_name,
            cache_dir=PYTORCH_PRETRAINED_BERT_CACHE /
            'dist_{}'.format(args.local_rank))
        model.extend_type_embedding(token_type=3)
    # if args.local_rank == 0:
    #     for name, param in model.named_parameters():
    #         if param.requires_grad is True:
    #             print('{}: {}'.format(name, param.shape))

    # ================= train model =================
    # lr=1e-4 for peak value, lr=5e-5 for initial value
    logger.info('trainer init')
    no_decay = [
        'bias', 'LayerNorm.bias', 'LayerNorm.weight', 'layer_norm.bias',
        'layer_norm.weight'
    ]
    param_optimizer = list(model.named_parameters())
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    word_acc = WordMLMAccuracy(pred='word_pred',
                               target='masked_lm_labels',
                               seq_len='word_seq_len')
    ent_acc = EntityMLMAccuracy(pred='entity_pred',
                                target='ent_masked_lm_labels',
                                seq_len='ent_seq_len')
    rel_acc = RelationMLMAccuracy(pred='relation_pred',
                                  target='rel_masked_lm_labels',
                                  seq_len='rel_seq_len')
    metrics = [word_acc, ent_acc, rel_acc]

    if args.test_data is not None:
        test_metric = [rel_acc]
        tester = Tester(data=test_data_iter,
                        model=model,
                        metrics=test_metric,
                        device=list(range(torch.cuda.device_count())))
        # tester.test()
    else:
        tester = None

    optimizer = optim.AdamW(optimizer_grouped_parameters,
                            lr=args.lr,
                            betas=(0.9, args.beta),
                            eps=1e-6)
    # warmup_callback = WarmupCallback(warmup=args.warm_up, schedule='linear')
    fitlog_callback = MyFitlogCallback(tester=tester,
                                       log_loss_every=100,
                                       verbose=1)
    gradient_clip_callback = GradientClipCallback(clip_value=1,
                                                  clip_type='norm')
    emb_callback = EmbUpdateCallback(model.ent_embeddings)
    all_callbacks = [gradient_clip_callback, emb_callback]
    if args.save_dir is None:
        master_callbacks = [fitlog_callback]
    else:
        save_callback = SaveModelCallback(args.save_dir,
                                          model.ent_embeddings,
                                          only_params=True)
        master_callbacks = [fitlog_callback, save_callback]

    if args.do_test:
        states_dict = torch.load(os.path.join(args.save_dir,
                                              args.model_name)).state_dict()
        model.load_state_dict(states_dict)
        data_iter = TorchLoaderIter(dataset=data,
                                    batch_size=args.batch_size,
                                    sampler=RandomSampler(),
                                    num_workers=NUM_WORKERS,
                                    collate_fn=data.collate_fn)
        tester = Tester(data=data_iter,
                        model=model,
                        metrics=metrics,
                        device=devices)
        tester.test()
    else:
        trainer = DistTrainer(train_data=train_data_iter,
                              dev_data=dev_data_iter,
                              model=model,
                              optimizer=optimizer,
                              loss=LossInForward(),
                              batch_size_per_gpu=bsz,
                              update_every=args.grad_accumulation,
                              n_epochs=args.epoch,
                              metrics=metrics,
                              callbacks_master=master_callbacks,
                              callbacks_all=all_callbacks,
                              validate_every=5000,
                              use_tqdm=True,
                              fp16='O1' if args.fp16 else '')
        trainer.train(load_best_model=False)
示例#4
0
import fitlog

fitlog.commit(__file__)  # auto commit your codes
fitlog.add_hyper_in_file(__file__)  # record your hyperparameters
"""
Your training code here, you may use these functions to log your result:
    fitlog.add_hyper()
    fitlog.add_loss()
    fitlog.add_metric()
    fitlog.add_best_metric()
    ......
"""

fitlog.finish()  # finish the logging
示例#5
0
parser.add_argument('--embed_dropout', default=0.5)
parser.add_argument('--gaz_dropout', default=-1)
parser.add_argument('--output_dropout', default=0.5)
parser.add_argument('--epoch', default=100)
parser.add_argument('--seed', default=100)

args = parser.parse_args()

set_seed(args.seed)

fit_msg_list = [args.model, 'bi' if args.bi else 'uni', str(args.batch)]
if args.model == 'lattice':
    fit_msg_list.append(str(args.skip_before_head))
fit_msg = ' '.join(fit_msg_list)
fitlog.commit(__file__, fit_msg=fit_msg)

device = torch.device(args.device)
for k, v in args.__dict__.items():
    print(k, v)

refresh_data = False

from pathes import *
# ontonote4ner_cn_path = 0
# yangjie_rich_pretrain_unigram_path = 0
# yangjie_rich_pretrain_bigram_path = 0
# resume_ner_path = 0
# weibo_ner_path = 0

if args.dataset == 'ontonote':
示例#6
0
文件: run_re.py 项目: yyht/CoLAKE
def main():
    args = parse_args()

    if args.debug:
        fitlog.debug()

    fitlog.set_log_dir(args.log_dir)
    fitlog.commit(__file__)
    fitlog.add_hyper_in_file(__file__)
    fitlog.add_hyper(args)
    if args.gpu != 'all':
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    train_set, dev_set, test_set, temp_ent_vocab = load_fewrel_graph_data(
        data_dir=args.data_dir)

    print('data directory: {}'.format(args.data_dir))
    print('# of train samples: {}'.format(len(train_set)))
    print('# of dev samples: {}'.format(len(dev_set)))
    print('# of test samples: {}'.format(len(test_set)))

    ent_vocab, rel_vocab = load_ent_rel_vocabs(path='../')

    # load entity embeddings
    ent_index = []
    for k, v in temp_ent_vocab.items():
        ent_index.append(ent_vocab[k])
    ent_index = torch.tensor(ent_index)
    ent_emb = np.load(os.path.join(args.model_path, 'entities.npy'))
    ent_embedding = nn.Embedding.from_pretrained(torch.from_numpy(ent_emb))
    ent_emb = ent_embedding(ent_index.view(1, -1)).squeeze().detach()

    # load CoLAKE parameters
    config = RobertaConfig.from_pretrained('roberta-base', type_vocab_size=3)
    model = CoLAKEForRE(config,
                        num_types=len(train_set.label_vocab),
                        ent_emb=ent_emb)
    states_dict = torch.load(os.path.join(args.model_path, 'model.bin'))
    model.load_state_dict(states_dict, strict=False)
    print('parameters below are randomly initializecd:')
    for name, param in model.named_parameters():
        if name not in states_dict:
            print(name)

    # tie relation classification head
    rel_index = []
    for k, v in train_set.label_vocab.items():
        rel_index.append(rel_vocab[k])
    rel_index = torch.LongTensor(rel_index)
    rel_embeddings = nn.Embedding.from_pretrained(
        states_dict['rel_embeddings.weight'])
    rel_index = rel_index.cuda()
    rel_cls_weight = rel_embeddings(rel_index.view(1, -1)).squeeze()
    model.tie_rel_weights(rel_cls_weight)

    model.rel_head.dense.weight.data = states_dict['rel_lm_head.dense.weight']
    model.rel_head.dense.bias.data = states_dict['rel_lm_head.dense.bias']
    model.rel_head.layer_norm.weight.data = states_dict[
        'rel_lm_head.layer_norm.weight']
    model.rel_head.layer_norm.bias.data = states_dict[
        'rel_lm_head.layer_norm.bias']

    model.resize_token_embeddings(
        len(RobertaTokenizer.from_pretrained('roberta-base')) + 4)
    print('parameters of CoLAKE has been loaded.')

    # fine-tune
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight', 'embedding']
    param_optimizer = list(model.named_parameters())
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        args.weight_decay
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    optimizer = optim.AdamW(optimizer_grouped_parameters,
                            lr=args.lr,
                            betas=(0.9, args.beta),
                            eps=1e-6)

    metrics = [MacroMetric(pred='pred', target='target')]

    test_data_iter = TorchLoaderIter(dataset=test_set,
                                     batch_size=args.batch_size,
                                     sampler=RandomSampler(),
                                     num_workers=4,
                                     collate_fn=test_set.collate_fn)
    devices = list(range(torch.cuda.device_count()))
    tester = Tester(data=test_data_iter,
                    model=model,
                    metrics=metrics,
                    device=devices)
    # tester.test()

    fitlog_callback = FitlogCallback(tester=tester,
                                     log_loss_every=100,
                                     verbose=1)
    gradient_clip_callback = GradientClipCallback(clip_value=1,
                                                  clip_type='norm')
    warmup_callback = WarmupCallback(warmup=args.warm_up, schedule='linear')

    bsz = args.batch_size // args.grad_accumulation

    train_data_iter = TorchLoaderIter(dataset=train_set,
                                      batch_size=bsz,
                                      sampler=RandomSampler(),
                                      num_workers=4,
                                      collate_fn=train_set.collate_fn)
    dev_data_iter = TorchLoaderIter(dataset=dev_set,
                                    batch_size=bsz,
                                    sampler=RandomSampler(),
                                    num_workers=4,
                                    collate_fn=dev_set.collate_fn)

    trainer = Trainer(
        train_data=train_data_iter,
        dev_data=dev_data_iter,
        model=model,
        optimizer=optimizer,
        loss=LossInForward(),
        batch_size=bsz,
        update_every=args.grad_accumulation,
        n_epochs=args.epoch,
        metrics=metrics,
        callbacks=[fitlog_callback, gradient_clip_callback, warmup_callback],
        device=devices,
        use_tqdm=True)

    trainer.train(load_best_model=False)
示例#7
0
from fastNLP.core.losses import CrossEntropyLoss
from fastNLP.core.metrics import AccuracyMetric

from fastNLP import Const
from fastNLP import AccuracyMetric
from fastNLP import CrossEntropyLoss
from fastNLP import BucketSampler
from fastNLP import Batch
import torch
import time
import fitlog
from fastNLP.core.callback import FitlogCallback
from fastNLP import Tester
from fastNLP import Callback

fitlog.commit('__file__')             # auto commit your codes
fitlog.add_hyper_in_file ('__file__') # record your hyperparameters
loss = CrossEntropyLoss(pred=Const.OUTPUT, target=Const.TARGET)
metrics=AccuracyMetric(pred=Const.OUTPUT, target=Const.TARGET)
target_len = 20
def readdata():
    global target_len
    min_count = 10
    #categories = ['comp.os.ms-windows.misc', 'rec.motorcycles', 'sci.space', 'talk.politics.misc', ]
    dataset_train = fetch_20newsgroups(subset='train', data_home='../../..')
    dataset_test = fetch_20newsgroups(subset='test', data_home='../../..')

    data = dataset_train.data
    target = dataset_train.target
    target_len = len(dataset_train.target_names)
    train_data =  DataSet()