Example No. 1
def main(args):
    # fix_seed()  # deliberately not fixing the seed: results are better this way, otherwise shuffling would have no effect
    if not os.path.exists(args.model_dir):
        os.mkdir(args.model_dir)

    weights_name = get_weights_name(args.transformer_model,
                                    args.lowercase_tokens)
    # read datasets
    reader = get_data_reader(weights_name,
                             args.max_len,
                             skip_correct=bool(args.skip_correct),
                             skip_complex=args.skip_complex,
                             test_mode=False,
                             tag_strategy=args.tag_strategy,
                             lowercase_tokens=args.lowercase_tokens,
                             max_pieces_per_token=args.pieces_per_token,
                             tn_prob=args.tn_prob,
                             tp_prob=args.tp_prob,
                             special_tokens_fix=args.special_tokens_fix)
    train_data = reader.read(args.train_set)
    dev_data = reader.read(args.dev_set)
    #   list(train_data)
    default_tokens = [DEFAULT_OOV_TOKEN, DEFAULT_PADDING_TOKEN]
    namespaces = ['labels', 'd_tags']
    tokens_to_add = {x: default_tokens for x in namespaces}
    # build vocab  (this is where the vocabulary gets built)
    # The code was modified so that the training data always contributes to the
    # vocabulary: if an existing vocab is passed in, it is loaded and merged
    # with the entries found in the training set; otherwise a new vocabulary is
    # built from scratch.  Vocabulary.from_instances / extend_from_instances
    # come from allennlp, so growing the vocabulary means changing the data fed
    # in here, not those calls themselves.
    if args.vocab_path:
        from allennlp.common.params import Params
        vocab = Vocabulary.from_files(args.vocab_path)
        params = Params({"non_padded_namespaces": set(namespaces)})
        # Extend (merge) the loaded vocabulary with the training instances.
        vocab.extend_from_instances(params, train_data)
    else:
        # Build the vocabulary from the training data; this is what ends up in
        # the output vocabulary directory below.
        vocab = Vocabulary.from_instances(train_data,
                                          max_vocab_size={
                                              'tokens': 30000,
                                              'labels': args.target_vocab_size,
                                              'd_tags': 2
                                          },
                                          tokens_to_add=tokens_to_add)

    vocab.save_to_files(os.path.join(args.model_dir, 'vocabulary'))
    from pathlib import Path
    vocabdir = Path(__file__).resolve().parent.parent / os.path.join(
        args.model_dir, 'vocabulary', 'labels.txt')
    print("Data is loaded")
    model = get_model(weights_name,
                      vocab,
                      tune_bert=args.tune_bert,
                      predictor_dropout=args.predictor_dropout,
                      label_smoothing=args.label_smoothing,
                      special_tokens_fix=args.special_tokens_fix)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    if torch.cuda.is_available():
        if torch.cuda.device_count() > 1:
            cuda_device = list(range(torch.cuda.device_count()))
        else:
            cuda_device = 0
    else:
        cuda_device = -1

    if args.pretrain:  # note: we do not use this branch for loading
        model.load_state_dict(
            torch.load(
                os.path.join(args.pretrain_folder, args.pretrain + '.th')))

    model = model.to(device)

    print("Model is set", '模型加载完毕')

    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           factor=0.1,
                                                           patience=10)
    instances_per_epoch = None if not args.updates_per_epoch else \
        int(args.updates_per_epoch * args.batch_size * args.accumulation_size)
    iterator = BucketIterator(
        batch_size=args.batch_size,
        sorting_keys=[("tokens", "num_tokens")],
        biggest_batch_first=True,
        max_instances_in_memory=args.batch_size * 20000,
        instances_per_epoch=instances_per_epoch,
    )
    iterator.index_with(vocab)
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        scheduler=scheduler,
        iterator=iterator,
        train_dataset=train_data,
        validation_dataset=dev_data,
        serialization_dir=args.model_dir,
        patience=args.patience,
        num_epochs=args.n_epoch,
        cuda_device=cuda_device,
        shuffle=True,  # changed to True here so the data is actually shuffled
        accumulated_batch_count=args.accumulation_size,
        cold_step_count=args.cold_steps_count,
        cold_lr=args.cold_lr,
        cuda_verbose_step=int(args.cuda_verbose_steps)
        if args.cuda_verbose_steps else None)
    print("Start training")
    trainer.train(args.oldmodel)

    # Here's how to save the model.  # Save the best model one more time, so in
    # the end only model.th needs to be kept in this directory; the intermediate
    # checkpoints with suffixes can be ignored.
    out_model = os.path.join(args.model_dir, 'model.th')
    with open(out_model, 'wb') as f:
        torch.save(model.state_dict(), f)
    print("Model is dumped", "训练全部结束,model存在了", args.model_dir + ' / model.th')

Example No. 2
def main(args):
    fix_seed()
    if not os.path.exists(args.model_dir):
        os.mkdir(args.model_dir)

    weights_name = get_weights_name(args.transformer_model,
                                    args.lowercase_tokens)
    # read datasets
    reader = get_data_reader(weights_name,
                             args.max_len,
                             skip_correct=bool(args.skip_correct),
                             skip_complex=args.skip_complex,
                             test_mode=False,
                             tag_strategy=args.tag_strategy,
                             lowercase_tokens=args.lowercase_tokens,
                             max_pieces_per_token=args.pieces_per_token,
                             tn_prob=args.tn_prob,
                             tp_prob=args.tp_prob,
                             special_tokens_fix=args.special_tokens_fix)
    train_data = reader.read(args.train_set)
    dev_data = reader.read(args.dev_set)

    default_tokens = [DEFAULT_OOV_TOKEN, DEFAULT_PADDING_TOKEN]
    namespaces = ['labels', 'd_tags']
    tokens_to_add = {x: default_tokens for x in namespaces}
    # build vocab
    if args.vocab_path:
        vocab = Vocabulary.from_files(args.vocab_path)
    else:
        vocab = Vocabulary.from_instances(train_data,
                                          max_vocab_size={
                                              'tokens': 30000,
                                              'labels': args.target_vocab_size,
                                              'd_tags': 2
                                          },
                                          tokens_to_add=tokens_to_add)
    vocab.save_to_files(os.path.join(args.model_dir, 'vocabulary'))

    print("Data is loaded")
    model = get_model(weights_name,
                      vocab,
                      tune_bert=args.tune_bert,
                      predictor_dropout=args.predictor_dropout,
                      label_smoothing=args.label_smoothing,
                      special_tokens_fix=args.special_tokens_fix)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    if torch.cuda.is_available():
        if torch.cuda.device_count() > 1:
            cuda_device = list(range(torch.cuda.device_count()))
        else:
            cuda_device = 0
    else:
        cuda_device = -1

    if args.pretrain:
        model.load_state_dict(
            torch.load(os.path.join(args.pretrain_folder,
                                    args.pretrain + '.th'),
                       map_location=torch.device('cpu')))

    model = model.to(device)

    print("Model is set")

    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           factor=0.1,
                                                           patience=10)
    instances_per_epoch = None if not args.updates_per_epoch else \
        int(args.updates_per_epoch * args.batch_size * args.accumulation_size)
    iterator = BucketIterator(
        batch_size=args.batch_size,
        sorting_keys=[("tokens", "num_tokens")],
        biggest_batch_first=True,
        max_instances_in_memory=args.batch_size * 20000,
        instances_per_epoch=instances_per_epoch,
    )
    iterator.index_with(vocab)
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      scheduler=scheduler,
                      iterator=iterator,
                      train_dataset=train_data,
                      validation_dataset=dev_data,
                      serialization_dir=args.model_dir,
                      patience=args.patience,
                      num_epochs=args.n_epoch,
                      cuda_device=cuda_device,
                      shuffle=False,
                      accumulated_batch_count=args.accumulation_size,
                      cold_step_count=args.cold_steps_count,
                      cold_lr=args.cold_lr,
                      cuda_verbose_step=int(args.cuda_verbose_steps)
                      if args.cuda_verbose_steps else None)
    print("Start training")
    trainer.train()

    # Here's how to save the model.
    out_model = os.path.join(args.model_dir, 'model.th')
    with open(out_model, 'wb') as f:
        torch.save(model.state_dict(), f)
    print("Model is dumped")

Example No. 3
def main(args):
    fix_seed()
    if not os.path.exists(args.model_dir):
        os.mkdir(args.model_dir)

    weights_name = get_weights_name(args.transformer_model,
                                    args.lowercase_tokens)
    # read datasets
    reader = get_data_reader(weights_name,
                             args.max_len,
                             skip_correct=bool(args.skip_correct),
                             skip_complex=args.skip_complex,
                             test_mode=False,
                             tag_strategy=args.tag_strategy,
                             lowercase_tokens=args.lowercase_tokens,
                             max_pieces_per_token=args.pieces_per_token,
                             tn_prob=args.tn_prob,
                             tp_prob=args.tp_prob,
                             special_tokens_fix=args.special_tokens_fix)
    train_data = reader.read(args.train_set)
    dev_data = reader.read(args.dev_set)

    default_tokens = [DEFAULT_OOV_TOKEN, DEFAULT_PADDING_TOKEN]
    namespaces = ['labels', 'd_tags']
    tokens_to_add = {x: default_tokens for x in namespaces}
    # build vocab
    if args.vocab_path:
        vocab = Vocabulary.from_files(args.vocab_path)
    else:
        vocab = Vocabulary.from_instances(train_data,
                                          max_vocab_size={
                                              'tokens': 30000,
                                              'labels': args.target_vocab_size,
                                              'd_tags': 2
                                          },
                                          tokens_to_add=tokens_to_add)
    vocab.save_to_files(os.path.join(args.model_dir, 'vocabulary'))

    print("Data is loaded")
    model = get_model(weights_name,
                      vocab,
                      tune_bert=args.tune_bert,
                      predictor_dropout=args.predictor_dropout,
                      label_smoothing=args.label_smoothing,
                      special_tokens_fix=args.special_tokens_fix)

    # model = GecBERTModel(vocab_path=args.vocab_path,
    #                      model_paths=args.model_path,
    #                      max_len=args.max_len, min_len=args.min_len,
    #                      iterations=args.iteration_count,
    #                      min_error_probability=args.min_error_probability,
    #                      min_probability=args.min_error_probability,
    #                      lowercase_tokens=args.lowercase_tokens,
    #                      model_name=args.transformer_model,
    #                      special_tokens_fix=args.special_tokens_fix,
    #                      log=False,
    #                      confidence=args.additional_confidence,
    #                      is_ensemble=args.is_ensemble,
    #                      weigths=args.weights)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    if torch.cuda.is_available():
        if torch.cuda.device_count() > 1:
            cuda_device = list(range(torch.cuda.device_count()))
        else:
            cuda_device = 0
    else:
        cuda_device = -1

    if args.pretrain:
        model.load_state_dict(
            torch.load(
                os.path.join(args.pretrain_folder, args.pretrain + '.th')))

    print('cuda_device:', cuda_device)
    #exit(0)
    model = model.to(device)

    print("Model is set")

    # print('model:', model)
    def print_size_of_model(model):
        torch.save(model.state_dict(), "temp.p")
        print('Size (MB):', os.path.getsize("temp.p") / 1e6)
        os.remove('temp.p')

    if args.keep != 12:

        prev_model = GecBERTModel(
            vocab_path=args.vocab_path,
            model_paths=args.model_path,
            max_len=args.max_len,
            min_len=args.min_len,
            iterations=args.iteration_count,
            min_error_probability=args.min_error_probability,
            min_probability=args.min_error_probability,
            lowercase_tokens=args.lowercase_tokens,
            model_name=args.transformer_model,
            special_tokens_fix=args.special_tokens_fix,
            log=False,
            confidence=args.additional_confidence,
            is_ensemble=args.is_ensemble,
            weigths=args.weights,
            num_layers_to_keep=args.keep)

        # print('prev_model:', prev_model.models)

        # print(model)
        print_size_of_model(model)
        print_size_of_model(prev_model.models[0])

        model.text_field_embedder.token_embedder_bert.bert_model.encoder.layer = \
            prev_model.models[0].text_field_embedder.token_embedder_bert.bert_model.encoder.layer

    print_size_of_model(model)

    # exit(0)

    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           factor=0.1,
                                                           patience=10)
    instances_per_epoch = None if not args.updates_per_epoch else \
        int(args.updates_per_epoch * args.batch_size * args.accumulation_size)
    iterator = BucketIterator(
        batch_size=args.batch_size,
        sorting_keys=[("tokens", "num_tokens")],
        biggest_batch_first=True,
        max_instances_in_memory=args.batch_size * 20000,
        instances_per_epoch=instances_per_epoch,
    )
    iterator.index_with(vocab)
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      scheduler=scheduler,
                      iterator=iterator,
                      train_dataset=train_data,
                      validation_dataset=dev_data,
                      serialization_dir=args.model_dir,
                      patience=args.patience,
                      num_epochs=args.n_epoch,
                      cuda_device=cuda_device,
                      shuffle=False,
                      accumulated_batch_count=args.accumulation_size,
                      cold_step_count=args.cold_steps_count,
                      cold_lr=args.cold_lr,
                      cuda_verbose_step=int(args.cuda_verbose_steps)
                      if args.cuda_verbose_steps else None)

    GPUtil.showUtilization()
    print("Start training")
    trainer.train()

    # Here's how to save the model.
    out_model = os.path.join(args.model_dir, 'model.th')
    with open(out_model, 'wb') as f:
        torch.save(model.state_dict(), f)
    print("Model is dumped")