Exemplo n.º 1
0
def validate(args, device_id, pt, step, tokenizer):
    """Validate an abstractive summarizer checkpoint on the dev split.

    Loads the checkpoint (``pt`` if given, else ``args.test_from``),
    restores the model flags stored in it onto ``args``, builds the dev
    dataloader and validation loss, and returns the cross-entropy.
    """
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    # Prefer the explicitly supplied checkpoint path over args.test_from.
    test_from = pt if pt != '' else args.test_from
    logger.info('Loading checkpoint from %s' % test_from)
    checkpoint = torch.load(test_from, map_location=lambda storage, loc: storage)

    # Restore the model-defining flags that were saved with the checkpoint.
    for key, value in vars(checkpoint['opt']).items():
        if key in model_flags:
            setattr(args, key, value)
    print(args)

    model = AbsSummarizer(args, device, checkpoint)
    model.eval()

    valid_iter = data_loader.Dataloader(
        args,
        load_dataset(args, 'dev', shuffle=False),
        args.batch_size,
        device,
        shuffle=False,
        is_test=False,
    )

    symbols = {
        'BOS': tokenizer.convert_tokens_to_ids('<s>'),
        'EOS': tokenizer.convert_tokens_to_ids('</s>'),
        'PAD': tokenizer.convert_tokens_to_ids('[PAD]'),
    }

    valid_loss = abs_loss(model.generator, symbols, model.vocab_size,
                          train=False, device=device)

    trainer = build_trainer(args, device_id, model, None, valid_loss)
    stats = trainer.validate(valid_iter, step)
    return stats.xent()
def test_text_abs(args):
    """Summarize raw text input (args.text_src / args.text_tgt) using the
    checkpoint at ``args.test_from`` and run the predictor over it.
    """
    logger.info('Loading checkpoint from %s' % args.test_from)
    device = "cpu" if args.visible_gpus == '-1' else "cuda"

    checkpoint = torch.load(args.test_from,
                            map_location=lambda storage, loc: storage)
    # Restore the model-defining flags that were saved with the checkpoint.
    for key, value in vars(checkpoint['opt']).items():
        if key in model_flags:
            setattr(args, key, value)
    print(args)
    logger.info('Loading args inside test_text_abs %s' % args)

    model = AbsSummarizer(args, device, checkpoint)
    model.eval()

    test_iter = data_loader.load_text(args, args.text_src, args.text_tgt,
                                      device)

    logger.info('test_iter is %s' % test_iter)

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                              do_lower_case=True,
                                              cache_dir=args.temp_dir)
    # The [unusedN] vocabulary slots double as BOS/EOS/EOQ control tokens.
    symbols = {
        'BOS': tokenizer.vocab['[unused0]'],
        'EOS': tokenizer.vocab['[unused1]'],
        'PAD': tokenizer.vocab['[PAD]'],
        'EOQ': tokenizer.vocab['[unused2]'],
    }

    logger.info('symbols is %s' % symbols)
    predictor = build_predictor(args, tokenizer, symbols, model, logger)
    predictor.translate(test_iter, -1)
Exemplo n.º 3
0
def test_text_abs(args, device_id, pt, step, tokenizer):
    """Decode the test split with the given checkpoint and tokenizer."""
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    # Prefer the explicitly supplied checkpoint path over args.test_from.
    test_from = pt if pt != '' else args.test_from
    logger.info('Loading checkpoint from %s' % test_from)

    checkpoint = torch.load(test_from, map_location=lambda storage, loc: storage)
    # Restore the model-defining flags that were saved with the checkpoint.
    for key, value in vars(checkpoint['opt']).items():
        if key in model_flags:
            setattr(args, key, value)
    print(args)

    model = AbsSummarizer(args, device, checkpoint)
    model.eval()

    test_iter = data_loader.Dataloader(
        args,
        load_dataset(args, 'test', shuffle=False),
        args.test_batch_size,
        device,
        shuffle=False,
        is_test=True,
    )
    symbols = {
        'BOS': tokenizer.convert_tokens_to_ids('<s>'),
        'EOS': tokenizer.convert_tokens_to_ids('</s>'),
        'PAD': tokenizer.convert_tokens_to_ids('[PAD]'),
    }
    predictor = build_predictor(args, tokenizer, symbols, model, logger)
    predictor.translate(test_iter, step)
Exemplo n.º 4
0
def test_abs(args, device_id, pt, step):
    """Run test-set decoding; symbols/tokenizer are chosen by args.encoder."""
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    test_from = pt if pt != '' else args.test_from
    logger.info('Loading checkpoint from %s' % test_from)

    checkpoint = torch.load(test_from,
                            map_location=lambda storage, loc: storage)
    # Restore the model-defining flags that were saved with the checkpoint.
    for key, value in vars(checkpoint['opt']).items():
        if key in model_flags:
            setattr(args, key, value)
    print(args)
    # Encoder-specific special symbols and matching tokenizer.
    symbols, tokenizer = get_symbol_and_tokenizer(args.encoder, args.temp_dir)
    model = AbsSummarizer(args, device, checkpoint, symbols=symbols)
    model.eval()

    test_iter = data_loader.Dataloader(args,
                                       load_dataset(args, 'test', shuffle=False),
                                       args.test_batch_size, device,
                                       shuffle=False, is_test=True,
                                       tokenizer=tokenizer)

    predictor = build_predictor(args, tokenizer, symbols, model, logger)
    predictor.translate(test_iter, step)
Exemplo n.º 5
0
def test_abs(args, device_id, pt, step):
    """Decode the test split with an abstractive summarizer checkpoint.

    Loads the checkpoint from ``pt`` (or ``args.test_from`` when ``pt`` is
    empty), restores the model flags saved with the checkpoint onto ``args``,
    then runs the predictor over the test dataloader.  The tokenizer is a
    BART tokenizer when ``args.bart`` is set, otherwise multilingual BERT.
    """
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    if (pt != ''):
        test_from = pt
    else:
        test_from = args.test_from
    logger.info('Loading checkpoint from %s' % test_from)

    checkpoint = torch.load(test_from, map_location=lambda storage, loc: storage)
    # Restore the model-defining flags that were saved with the checkpoint.
    opt = vars(checkpoint['opt'])
    for k in opt.keys():
        if (k in model_flags):
            setattr(args, k, opt[k])
    print(args)

    model = AbsSummarizer(args, device, checkpoint)
    model.eval()

    test_iter = data_loader.Dataloader(args, load_dataset(args, 'test', shuffle=False),
                                       args.test_batch_size, device,
                                       shuffle=False, is_test=True)
    # Register the [unusedN] placeholders as additional special tokens so the
    # (Chinese) tokenizer keeps them whole instead of splitting 'unused'.
    add_token_list = ['[unused1]', '[unused2]', '[unused3]', '[unused4]', '[unused5]']
    if args.bart:
        tokenizer = AutoTokenizer.from_pretrained('bart-base', do_lower_case=True, cache_dir=args.temp_dir, local_files_only=False)
        # BART has no [unusedN] slots; the madeupword placeholders stand in.
        symbols = {'BOS': tokenizer.encoder['madeupword0000'], 'EOS': tokenizer.encoder['madeupword0001'],
                   'PAD': 0, 'EOQ': tokenizer.encoder['madeupword0002']}
    else:
        tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased', do_lower_case=True,
                                                  cache_dir=args.temp_dir, local_files_only=False, additional_special_tokens=add_token_list)
        symbols = {'BOS': tokenizer.vocab['[unused1]'], 'EOS': tokenizer.vocab['[unused2]'],
                   'PAD': tokenizer.vocab['[PAD]'], 'EOQ': tokenizer.vocab['[unused3]']}
    predictor = build_predictor(args, tokenizer, symbols, model, logger)
    predictor.translate(test_iter, step)
Exemplo n.º 6
0
def validate(args, device_id, pt, step):
    """Compute validation cross-entropy for a checkpoint on the valid split."""
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    test_from = pt if pt != '' else args.test_from
    logger.info('Loading checkpoint from %s' % test_from)
    checkpoint = torch.load(test_from, map_location=lambda storage, loc: storage)
    # Restore the model-defining flags that were saved with the checkpoint.
    for key, value in vars(checkpoint['opt']).items():
        if key in model_flags:
            setattr(args, key, value)
    print(args)

    model = AbsSummarizer(args, device, checkpoint)
    model.eval()

    valid_iter = data_loader.Dataloader(
        args,
        load_dataset(args, 'valid', shuffle=False),
        args.batch_size,
        device,
        shuffle=False,
        is_test=False,
    )
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                              do_lower_case=True,
                                              cache_dir=args.temp_dir)
    # The [unusedN] vocabulary slots double as BOS/EOS/EOQ control tokens.
    symbols = {
        'BOS': tokenizer.vocab['[unused0]'],
        'EOS': tokenizer.vocab['[unused1]'],
        'PAD': tokenizer.vocab['[PAD]'],
        'EOQ': tokenizer.vocab['[unused2]'],
    }

    valid_loss = abs_loss(model.generator, symbols, model.vocab_size,
                          train=False, device=device)

    trainer = build_trainer(args, device_id, model, None, valid_loss)
    stats = trainer.validate(valid_iter, step)
    return stats.xent()
Exemplo n.º 7
0
def validate(args, device_id, pt, step):
    """Validation cross-entropy for a Korean (ETRI koBERT) summarizer.

    NOTE(review): unlike the sibling ``validate`` variants, this one prefers
    ``args.test_from`` over the per-step ``pt`` argument — ``pt`` is only
    used when ``args.test_from`` is empty (see the commented-out original
    condition below).
    """
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    #if (pt != ''):
    if not (args.test_from):
        test_from = pt

    else:
        test_from = args.test_from
    logger.info('Loading checkpoint from %s' % test_from)

    checkpoint = torch.load(test_from,
                            map_location=lambda storage, loc: storage)
    # Restore the model-defining flags that were saved with the checkpoint.
    opt = vars(checkpoint['opt'])
    for k in opt.keys():
        if (k in model_flags):
            setattr(args, k, opt[k])
    print(args)

    model = AbsSummarizer(args, device, checkpoint)
    model.eval()

    valid_iter = data_loader.Dataloader(args,
                                        load_dataset(args,
                                                     'valid',
                                                     shuffle=False),
                                        args.batch_size,
                                        device,
                                        shuffle=False,
                                        is_test=False)

    # Tokenizer built from a local ETRI koBERT vocab file (relative path —
    # assumes a sibling ETRI_koBERT checkout; TODO confirm repo layout).
    tokenizer = BertTokenizer.from_pretrained(
        '../ETRI_koBERT/003_bert_eojeol_pytorch/vocab.txt',
        do_lower_case=False,
        cache_dir=args.temp_dir)

    if not args.share_emb:
        tokenizer = add_tokens(tokenizer)

    # <S>/<T> act as the BOS/EOS control symbols in this vocabulary.
    symbols = {
        'BOS': tokenizer.vocab['<S>'],
        'EOS': tokenizer.vocab['<T>'],
        'PAD': tokenizer.vocab['[PAD]']
    }
    # symbols = {'BOS': tokenizer.vocab['[BOS]'], 'EOS': tokenizer.vocab['[EOS]'],
    #            'PAD': tokenizer.vocab['[PAD]'], 'EOQ': tokenizer.vocab['[EOQ]']}
    # symbols = {'BOS': tokenizer.vocab['[unused0]'], 'EOS': tokenizer.vocab['[unused1]'],
    #            'PAD': tokenizer.vocab['[PAD]'], 'EOQ': tokenizer.vocab['[unused2]']}

    # print(tokenizer.vocab_size)
    # print([(key, value) for key, value in tokenizer.vocab.items()][-10:])
    # exit()
    valid_loss = abs_loss(model.generator,
                          symbols,
                          model.vocab_size,
                          train=False,
                          device=device)

    trainer = build_trainer(args, device_id, model, None, valid_loss)
    stats = trainer.validate(valid_iter, step)
    return stats.xent()
Exemplo n.º 8
0
def test_text_abs(args, device_id, pt, step):
    """Decode the test split using a bert-base-uncased tokenizer setup."""
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    test_from = pt if pt != '' else args.test_from
    logger.info('Loading checkpoint from %s' % test_from)

    checkpoint = torch.load(test_from, map_location=lambda storage, loc: storage)
    # Restore the model-defining flags that were saved with the checkpoint.
    for key, value in vars(checkpoint['opt']).items():
        if key in model_flags:
            setattr(args, key, value)
    print(args)

    model = AbsSummarizer(args, device, checkpoint)
    model.eval()

    test_iter = data_loader.Dataloader(
        args,
        load_dataset(args, 'test', shuffle=False),
        args.test_batch_size,
        device,
        shuffle=False,
        is_test=True,
    )
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                              do_lower_case=True,
                                              cache_dir=args.temp_dir)
    # The [unusedN] vocabulary slots double as BOS/EOS/EOQ control tokens.
    symbols = {
        'BOS': tokenizer.vocab['[unused0]'],
        'EOS': tokenizer.vocab['[unused1]'],
        'PAD': tokenizer.vocab['[PAD]'],
        'EOQ': tokenizer.vocab['[unused2]'],
    }
    predictor = build_predictor(args, tokenizer, symbols, model, logger)
    predictor.translate(test_iter, step)
Exemplo n.º 9
0
def validate(args, device_id, pt, step):
    """Validation cross-entropy for a Vietnamese PhoBERT summarizer.

    Uses fastBPE plus a fairseq-style ``Dictionary`` loaded from hardcoded
    Google-Colab paths (``/content/...``), so this variant only works in
    that environment unless those paths are changed.
    """
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    if (pt != ''):
        test_from = pt
    else:
        test_from = args.test_from
    logger.info('Loading checkpoint from %s' % test_from)
    checkpoint = torch.load(test_from,
                            map_location=lambda storage, loc: storage)
    # Restore the model-defining flags that were saved with the checkpoint.
    opt = vars(checkpoint['opt'])
    for k in opt.keys():
        if (k in model_flags):
            setattr(args, k, opt[k])
    print(args)

    model = AbsSummarizer(args, device, checkpoint)
    model.eval()

    valid_iter = data_loader.Dataloader(args,
                                        load_dataset(args,
                                                     'valid',
                                                     shuffle=False),
                                        args.batch_size,
                                        device,
                                        shuffle=False,
                                        is_test=False)

    # Build the fastBPE tokenizer; parse_known_args keeps unrelated CLI flags
    # from breaking this nested parser.
    parser = argparse.ArgumentParser()
    parser.add_argument('--bpe-codes',
                        default="/content/PhoBERT_base_transformers/bpe.codes",
                        required=False,
                        type=str,
                        help='path to fastBPE BPE')
    args1, unknown = parser.parse_known_args()
    bpe = fastBPE(args1)

    # Load the dictionary
    vocab = Dictionary()
    vocab.add_from_file("/content/PhoBERT_base_transformers/dict.txt")

    tokenizer = bpe
    # [unusedN] entries serve as BOS/EOS/EOQ control symbols — presumably the
    # dict.txt file defines them; verify against the dictionary contents.
    symbols = {
        'BOS': vocab.indices['[unused0]'],
        'EOS': vocab.indices['[unused1]'],
        'PAD': vocab.indices['[PAD]'],
        'EOQ': vocab.indices['[unused2]']
    }

    valid_loss = abs_loss(model.generator,
                          symbols,
                          model.vocab_size,
                          train=False,
                          device=device)

    trainer = build_trainer(args, device_id, model, None, valid_loss)
    stats = trainer.validate(valid_iter, step)
    return stats.xent()
def validate(args, device_id, pt, step):
    """Validation cross-entropy for a Chinese RoBERTa-wwm-ext summarizer."""
    device = "cpu" if args.visible_gpus == "-1" else "cuda"
    test_from = pt if pt != "" else args.test_from
    logger.info("Loading checkpoint from %s" % test_from)
    checkpoint = torch.load(test_from,
                            map_location=lambda storage, loc: storage)
    # Restore the model-defining flags that were saved with the checkpoint.
    for key, value in vars(checkpoint["opt"]).items():
        if key in model_flags:
            setattr(args, key, value)
    print(args)

    model = AbsSummarizer(args, device, checkpoint)
    model.eval()

    valid_iter = data_loader.Dataloader(args,
                                        load_dataset(args, "valid", shuffle=False),
                                        args.batch_size, device,
                                        shuffle=False, is_test=False)

    tokenizer = BertTokenizer.from_pretrained("chinese_roberta_wwm_ext_pytorch",
                                              do_lower_case=True,
                                              cache_dir=args.temp_dir)
    # [unused1]/[unused2]/[unused3] vocabulary slots act as BOS/EOS/EOQ.
    symbols = {"BOS": tokenizer.vocab["[unused1]"],
               "EOS": tokenizer.vocab["[unused2]"],
               "PAD": tokenizer.vocab["[PAD]"],
               "EOQ": tokenizer.vocab["[unused3]"]}

    valid_loss = abs_loss(model.generator, symbols, model.vocab_size,
                          train=False, device=device)

    trainer = build_trainer(args, device_id, model, None, valid_loss)
    stats = trainer.validate(valid_iter, step)
    return stats.xent()
Exemplo n.º 11
0
def test_abs(args, device_id, pt, step):
    """ Implements the testing process (meta / non-meta)
    Arguments:
        device_id (int) : the GPU id to be used
        pt (str) : checkpoint path ('' to fall back to args.test_from)
        step (int) : checkpoint step
    Process:
        - load checkpoint
        - prepare dataloader class
        - prepare model class (MTLAbsSummarizer when args.meta_mode is set)
        - prepare predictor
        - predictor.translate()
    """
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    logger.info('Device ID %d', device_id)
    logger.info('Device %s', device)

    # Load checkpoint (prefer the explicit pt path over args.test_from)
    if (pt != ''):
        test_from = pt
    else:
        test_from = args.test_from
    logger.info('Loading checkpoint from %s' % test_from)
    checkpoint = torch.load(test_from,
                            map_location=lambda storage, loc: storage)
    # Restore the model-defining flags that were saved with the checkpoint.
    opt = vars(checkpoint['opt'])
    for k in opt.keys():
        if (k in model_flags):
            setattr(args, k, opt[k])

    # Prepare dataloader
    test_iter = data_loader.Dataloader(args,
                                       load_dataset(args,
                                                    'test',
                                                    shuffle=False),
                                       args.test_batch_size,
                                       device,
                                       shuffle=False,
                                       is_test=True)
    # Prepare model (multi-task variant when running in meta mode)
    if (args.meta_mode):
        model = MTLAbsSummarizer(args, device, checkpoint)
    else:
        model = AbsSummarizer(args, device, checkpoint)
    model.eval()

    # Prepare predictor
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                              do_lower_case=True,
                                              cache_dir=args.temp_dir)
    # The [unusedN] vocabulary slots double as BOS/EOS/EOQ control tokens.
    symbols = {
        'BOS': tokenizer.vocab['[unused0]'],
        'EOS': tokenizer.vocab['[unused1]'],
        'PAD': tokenizer.vocab['[PAD]'],
        'EOQ': tokenizer.vocab['[unused2]']
    }

    predictor = build_predictor(args, tokenizer, symbols, model, logger)
    predictor.translate(test_iter, step)  # long time
Exemplo n.º 12
0
def train_abs_single(args, device_id):
    """Single-GPU (or CPU) training loop for the abstractive summarizer.

    Optionally warm-starts from ``args.train_from`` and/or initializes the
    encoder from an extractive checkpoint (``args.load_from_extractive``).
    """
    init_logger(args.log_file)
    logger.info(str(args))
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    logger.info('Device ID %d' % device_id)
    logger.info('Device %s' % device)
    torch.manual_seed(args.seed)
    random.seed(args.seed)
    torch.backends.cudnn.deterministic = True

    if device_id >= 0:
        torch.cuda.set_device(device_id)
        torch.cuda.manual_seed(args.seed)

    checkpoint = None
    if args.train_from != '':
        logger.info('Loading checkpoint from %s' % args.train_from)
        checkpoint = torch.load(args.train_from,
                                map_location=lambda storage, loc: storage)
        # Restore the model-defining flags saved with the checkpoint.
        for key, value in vars(checkpoint['opt']).items():
            if key in model_flags:
                setattr(args, key, value)

    bert_from_extractive = None
    if (args.load_from_extractive != ''):
        logger.info('Loading bert from extractive model %s' % args.load_from_extractive)
        bert_from_extractive = torch.load(
            args.load_from_extractive,
            map_location=lambda storage, loc: storage)['model']

    # Seed again so training starts from a known RNG state after the
    # optional checkpoint loading above (mirrors original behavior).
    torch.manual_seed(args.seed)
    random.seed(args.seed)
    torch.backends.cudnn.deterministic = True

    def train_iter_fct():
        # Fresh training dataloader each time the trainer exhausts one.
        return data_loader.Dataloader(args, load_dataset(args, 'train', shuffle=True),
                                      args.batch_size, device,
                                      shuffle=True, is_test=False)

    model = AbsSummarizer(args, device, checkpoint, bert_from_extractive)
    if (args.sep_optim):
        # Separate schedules for the pretrained encoder and fresh decoder.
        optim_bert = model_builder.build_optim_bert(args, model, checkpoint)
        optim_dec = model_builder.build_optim_dec(args, model, checkpoint)
        optim = [optim_bert, optim_dec]
    else:
        optim = [model_builder.build_optim(args, model, checkpoint)]

    logger.info(model)

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                              do_lower_case=True,
                                              cache_dir=args.temp_dir)
    # The [unusedN] vocabulary slots double as BOS/EOS/EOQ control tokens.
    symbols = {'BOS': tokenizer.vocab['[unused0]'],
               'EOS': tokenizer.vocab['[unused1]'],
               'PAD': tokenizer.vocab['[PAD]'],
               'EOQ': tokenizer.vocab['[unused2]']}

    train_loss = abs_loss(model.generator, symbols, model.vocab_size, device,
                          train=True, label_smoothing=args.label_smoothing)

    trainer = build_trainer(args, device_id, model, optim, train_loss)

    trainer.train(train_iter_fct, args.train_steps)
Exemplo n.º 13
0
def test_abs(args, device_id, pt, step):
    """Test-set decoding with the Korean (ETRI koBERT) tokenizer setup.

    NOTE(review): unlike most siblings, this variant prefers
    ``args.test_from`` over ``pt``; ``pt`` is used only when
    ``args.test_from`` is empty.
    """
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    test_from = args.test_from if args.test_from else pt
    logger.info('Loading checkpoint from %s' % test_from)

    checkpoint = torch.load(test_from,
                            map_location=lambda storage, loc: storage)
    # Restore the model-defining flags that were saved with the checkpoint.
    for key, value in vars(checkpoint['opt']).items():
        if key in model_flags:
            setattr(args, key, value)
    print(args)

    model = AbsSummarizer(args, device, checkpoint)
    model.eval()

    test_iter = data_loader.Dataloader(args,
                                       load_dataset(args, 'test', shuffle=False),
                                       args.test_batch_size, device,
                                       shuffle=False, is_test=True)

    # Tokenizer built from a local ETRI koBERT vocab file.
    tokenizer = BertTokenizer.from_pretrained(
        '../ETRI_koBERT/003_bert_eojeol_pytorch/vocab.txt',
        do_lower_case=False,
        cache_dir=args.temp_dir)
    if not args.share_emb:
        tokenizer = add_tokens(tokenizer)

    # <S>/<T> act as the BOS/EOS control symbols in this vocabulary.
    symbols = {'BOS': tokenizer.vocab['<S>'],
               'EOS': tokenizer.vocab['<T>'],
               'PAD': tokenizer.vocab['[PAD]']}

    predictor = build_predictor(args, tokenizer, symbols, model, logger)
    predictor.translate(test_iter, step)
def test_text_abs(args, device_id, pt, step):
    """Decode the test split with a bert-base-chinese tokenizer setup."""
    device = "cpu" if args.visible_gpus == "-1" else "cuda"
    test_from = pt if pt != "" else args.test_from
    logger.info("Loading checkpoint from %s" % test_from)

    checkpoint = torch.load(test_from,
                            map_location=lambda storage, loc: storage)
    # Restore the model-defining flags that were saved with the checkpoint.
    for key, value in vars(checkpoint["opt"]).items():
        if key in model_flags:
            setattr(args, key, value)
    print(args)

    model = AbsSummarizer(args, device, checkpoint)
    model.eval()

    test_iter = data_loader.Dataloader(args,
                                       load_dataset(args, "test", shuffle=False),
                                       args.test_batch_size, device,
                                       shuffle=False, is_test=True)
    tokenizer = BertTokenizer.from_pretrained("bert-base-chinese",
                                              do_lower_case=True,
                                              cache_dir=args.temp_dir)
    # [unused0]/[unused1]/[unused2] vocabulary slots act as BOS/EOS/EOQ.
    symbols = {"BOS": tokenizer.vocab["[unused0]"],
               "EOS": tokenizer.vocab["[unused1]"],
               "PAD": tokenizer.vocab["[PAD]"],
               "EOQ": tokenizer.vocab["[unused2]"]}
    predictor = build_predictor(args, tokenizer, symbols, model, logger)
    predictor.translate(test_iter, step)
Exemplo n.º 15
0
def validate(args, device_id, pt, step):
    """Validation cross-entropy; symbols/tokenizer chosen by args.encoder."""
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    test_from = pt if pt != '' else args.test_from
    logger.info('Loading checkpoint from %s' % test_from)
    checkpoint = torch.load(test_from,
                            map_location=lambda storage, loc: storage)
    # Restore the model-defining flags that were saved with the checkpoint.
    for key, value in vars(checkpoint['opt']).items():
        if key in model_flags:
            setattr(args, key, value)
    print(args)

    symbols, tokenizer = get_symbol_and_tokenizer(args.encoder, args.temp_dir)
    model = AbsSummarizer(args, device, checkpoint, symbols=symbols)
    model.eval()

    valid_iter = data_loader.Dataloader(args,
                                        load_dataset(args, 'valid', shuffle=False),
                                        args.batch_size, device,
                                        shuffle=False, is_test=False,
                                        tokenizer=tokenizer)
    valid_loss = abs_loss(model.generator, symbols, model.vocab_size,
                          train=False, device=device)

    trainer = build_trainer(args, device_id, model, None, valid_loss)
    stats = trainer.validate(valid_iter, step)
    return stats.xent()
Exemplo n.º 16
0
def test_abs(args, device_id, pt, step):
    """Test-set decoding; the tokenizer comes from the project's BertData
    preprocessing helper so decoding matches the preprocessing vocabulary.
    """
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    if (pt != ''):
        test_from = pt
    else:
        test_from = args.test_from
    logger.info('Loading checkpoint from %s' % test_from)

    checkpoint = torch.load(test_from, map_location=lambda storage, loc: storage)
    # Restore the model-defining flags that were saved with the checkpoint.
    opt = vars(checkpoint['opt'])
    for k in opt.keys():
        if (k in model_flags):
            setattr(args, k, opt[k])
    print(args)

    model = AbsSummarizer(args, device, checkpoint)
    model.eval()

    test_iter = data_loader.Dataloader(args, load_dataset(args, 'test', shuffle=False),
                                       args.test_batch_size, device,
                                       shuffle=False, is_test=True)

    # Reuse the tokenizer configured by BertData instead of constructing one
    # here (earlier approaches kept below for reference).
    tokenizer = BertData(args).tokenizer

    #tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True, cache_dir=args.temp_dir)
    # tokenizer = None
    # if args.pretrained_model_type in ['bert-base-uncased', 'bert-base-multilingual-uncased']:
    #     tokenizer = BertTokenizer.from_pretrained(args.pretrained_model_type, do_lower_case=True, cache_dir=args.temp_dir)
    #
    # if not tokenizer:
    #     raise NotImplementedError("tokenizer")

    # tokenizer = add_to_vocab(tokenizer, ['[unused0]', '[unused1]', '[PAD]', '[unused2]'])
    symbols = {'BOS': tokenizer.convert_tokens_to_ids('[unused0]'), 'EOS': tokenizer.convert_tokens_to_ids('[unused1]'),
               'PAD': tokenizer.convert_tokens_to_ids('[PAD]'), 'EOQ': tokenizer.convert_tokens_to_ids('[unused2]')}
    predictor = build_predictor(args, tokenizer, symbols, model, logger)
    predictor.translate(test_iter, step)
Exemplo n.º 17
0
# Model-architecture flags expected by AbsSummarizer (BERT-base decoder).
args.large = False
args.temp_dir = 'temp'
args.finetune_bert = False
args.encoder = 'bert'
args.max_pos = 256
args.dec_layers = 6
args.share_emb = False
args.dec_hidden_size = 768
args.dec_heads = 8
args.dec_ff_size = 2048
args.dec_dropout = 0.2
args.use_bert_emb = False

# Preprocessing helper — positional args are presumably (model path,
# lower-casing flag, max source length, max target length); verify
# against the BertData constructor.
bert_data = BertData(args.model_path, True, 510, 128)

BertSumAbs = AbsSummarizer(args, DEVICE, checkpoint)
BertSumAbs.eval()

# Stream the JSON-lines dataset in chunks to bound memory usage.
data = pd.read_json(DATASET_PATH,
                    encoding='utf-8',
                    lines=True,
                    chunksize=CHUNK_SIZE)

for el in tqdm.tqdm(data, total=450000 // CHUNK_SIZE):
    with open('vectors.npy', 'ab') as fvecs, open('text.jsonl',
                                                  'a',
                                                  encoding='utf-8') as ft:
        for j in range(CHINK_SIZE):
            text = el.iloc[j]["text"].lower().replace('\xa0', ' ').replace(
                '\n', ' ').strip()
            title = el.iloc[j]["title"].lower()
Exemplo n.º 18
0
def train_abs_single(args, device_id):
    """Single-GPU/CPU training loop for a Chinese RoBERTa-wwm-ext abstractive
    summarizer; optionally warm-starts from ``args.train_from`` and/or an
    extractive checkpoint (``args.load_from_extractive``).
    """
    init_logger(args.log_file)
    logger.info(str(args))
    device = "cpu" if args.visible_gpus == "-1" else "cuda"
    logger.info("Device ID %d" % device_id)
    logger.info("Device %s" % device)
    torch.manual_seed(args.seed)
    random.seed(args.seed)
    torch.backends.cudnn.deterministic = True

    if device_id >= 0:
        torch.cuda.set_device(device_id)
        torch.cuda.manual_seed(args.seed)

    if args.train_from != "":
        logger.info("Loading checkpoint from %s" % args.train_from)
        checkpoint = torch.load(
            args.train_from, map_location=lambda storage, loc: storage
        )
        # Restore the model-defining flags that were saved with the checkpoint.
        opt = vars(checkpoint["opt"])
        for k in opt.keys():
            if k in model_flags:
                setattr(args, k, opt[k])
    else:
        checkpoint = None

    if args.load_from_extractive != "":
        logger.info("Loading bert from extractive model %s" % args.load_from_extractive)
        bert_from_extractive = torch.load(
            args.load_from_extractive, map_location=lambda storage, loc: storage
        )
        bert_from_extractive = bert_from_extractive["model"]
    else:
        bert_from_extractive = None
    # Seed again so training starts from a known RNG state after the
    # optional checkpoint loading above.
    torch.manual_seed(args.seed)
    random.seed(args.seed)
    torch.backends.cudnn.deterministic = True

    def train_iter_fct():
        # Fresh training dataloader each time the trainer exhausts one.
        return data_loader.Dataloader(
            args,
            load_dataset(args, "train", shuffle=True),
            args.batch_size,
            device,
            shuffle=True,
            is_test=False,
        )

    model = AbsSummarizer(args, device, checkpoint, bert_from_extractive)
    if args.sep_optim:
        # Separate optimizers/schedules for the pretrained encoder and the
        # freshly initialized decoder.
        optim_bert = model_builder.build_optim_bert(args, model, checkpoint)
        optim_dec = model_builder.build_optim_dec(args, model, checkpoint)
        optim = [optim_bert, optim_dec]
    else:
        optim = [model_builder.build_optim(args, model, checkpoint)]

    logger.info(model)

    tokenizer = BertTokenizer.from_pretrained(
        "chinese_roberta_wwm_ext_pytorch/", do_lower_case=True, cache_dir=args.temp_dir
    )
    # [unused1]/[unused2]/[unused3] vocabulary slots act as BOS/EOS/EOQ.
    symbols = {
        "BOS": tokenizer.vocab["[unused1]"],
        "EOS": tokenizer.vocab["[unused2]"],
        "PAD": tokenizer.vocab["[PAD]"],
        "EOQ": tokenizer.vocab["[unused3]"],
    }

    train_loss = abs_loss(
        model.generator,
        symbols,
        model.vocab_size,
        device,
        train=True,
        label_smoothing=args.label_smoothing,
    )

    trainer = build_trainer(args, device_id, model, optim, train_loss)

    trainer.train(train_iter_fct, args.train_steps)
def convert_bertabs_checkpoints(path_to_checkpoints, dump_path):
    """ Copy/paste and tweak the pre-trained weights provided by the creators
    of BertAbs for the internal architecture.

    NOTE(review): ``dump_path`` is never used — the converted state dict is
    saved under a hardcoded filename at the end of this function.
    """

    # Instantiate the authors' model with the pre-trained weights
    config = BertAbsConfig(
        temp_dir=".",
        finetune_bert=False,
        large=False,
        share_emb=True,
        use_bert_emb=False,
        encoder="bert",
        max_pos=512,
        enc_layers=6,
        enc_hidden_size=512,
        enc_heads=8,
        enc_ff_size=512,
        enc_dropout=0.2,
        dec_layers=6,
        dec_hidden_size=768,
        dec_heads=8,
        dec_ff_size=2048,
        dec_dropout=0.2,
    )
    checkpoints = torch.load(path_to_checkpoints, lambda storage, loc: storage)
    original = AbsSummarizer(config, torch.device("cpu"), checkpoints)
    original.eval()

    new_model = BertAbsSummarizer(config, torch.device("cpu"))
    new_model.eval()

    # -------------------
    # Convert the weights
    # -------------------

    logging.info("convert the model")
    new_model.bert.load_state_dict(original.bert.state_dict())
    new_model.decoder.load_state_dict(original.decoder.state_dict())
    new_model.generator.load_state_dict(original.generator.state_dict())

    # -----------------------------------
    # Make sure the outputs are identical
    # -----------------------------------

    logging.info("Make sure that the models' outputs are identical")
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    # prepare the model inputs: pad both sequences out to the 512-token
    # maximum length
    encoder_input_ids = tokenizer.encode("This is sample éàalj'-.")
    encoder_input_ids.extend([tokenizer.pad_token_id] *
                             (512 - len(encoder_input_ids)))
    encoder_input_ids = torch.tensor(encoder_input_ids).unsqueeze(0)
    decoder_input_ids = tokenizer.encode("This is sample 3 éàalj'-.")
    decoder_input_ids.extend([tokenizer.pad_token_id] *
                             (512 - len(decoder_input_ids)))
    decoder_input_ids = torch.tensor(decoder_input_ids).unsqueeze(0)

    # failsafe to make sure the weights reset does not affect the
    # loaded weights.
    assert torch.max(
        torch.abs(original.generator[0].weight -
                  new_model.generator[0].weight)) == 0

    # forward pass
    src = encoder_input_ids
    tgt = decoder_input_ids
    segs = token_type_ids = None
    clss = None
    mask_src = encoder_attention_mask = None
    mask_tgt = decoder_attention_mask = None
    mask_cls = None

    # The original model does not apply the generator layer immediately but rather in
    # the beam search (where it combines softmax + linear layer). Since we already
    # apply the softmax in our generation process we only apply the linear layer here.
    # We make sure that the outputs of the full stack are identical
    output_original_model = original(src, tgt, segs, clss, mask_src, mask_tgt,
                                     mask_cls)[0]
    output_original_generator = original.generator(output_original_model)

    output_converted_model = new_model(encoder_input_ids, decoder_input_ids,
                                       token_type_ids, encoder_attention_mask,
                                       decoder_attention_mask)[0]
    output_converted_generator = new_model.generator(output_converted_model)

    maximum_absolute_difference = torch.max(
        torch.abs(output_converted_model - output_original_model)).item()
    print("Maximum absolute difference beween weights: {:.2f}".format(
        maximum_absolute_difference))
    maximum_absolute_difference = torch.max(
        torch.abs(output_converted_generator -
                  output_original_generator)).item()
    print("Maximum absolute difference beween weights: {:.2f}".format(
        maximum_absolute_difference))

    are_identical = torch.allclose(output_converted_model,
                                   output_original_model,
                                   atol=1e-3)
    if are_identical:
        logging.info("all weights are equal up to 1e-3")
    else:
        raise ValueError(
            "the weights are different. The new model is likely different from the original one."
        )

    # The model has been saved with torch.save(model) and this is bound to the exact
    # directory structure. We save the state_dict instead.
    logging.info("saving the model's state dictionary")
    torch.save(
        new_model.state_dict(),
        "bertabs-finetuned-cnndm-extractive-abstractive-summarization-pytorch_model.bin"
    )
                    const=True,
                    default=True)

# Parse CLI flags and derive the GPU world layout from --visible_gpus.
args = parser.parse_args()
args.gpu_ranks = [int(i) for i in range(len(args.visible_gpus.split(',')))]
args.world_size = len(args.gpu_ranks)
os.environ["CUDA_VISIBLE_DEVICES"] = args.visible_gpus

device = "cpu" if args.visible_gpus == '-1' else "cuda"
device_id = 0 if device == "cuda" else -1

# Load the checkpoint onto CPU storage regardless of where it was saved.
checkpoint = torch.load(args.test_from,
                        map_location=lambda storage, loc: storage)
opt = vars(checkpoint['opt'])

model = AbsSummarizer(args, device, checkpoint)
model.eval()

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                          do_lower_case=True,
                                          cache_dir=args.temp_dir)


def getrespond(request):
    context = {}
    if (request.POST):
        print(request.POST["Language"])
        #print(request.POST)
        context["input"] = request.POST["input_block"]
        if (request.POST["Language"] != "English"):
            context[
Exemplo n.º 21
0
def train_abs(args, device_id):
    """Run abstractive-summarization training on a single device.

    Args:
        args: parsed command-line namespace. Fields listed in ``model_flags``
            may be overridden below by the options saved in a resumed
            checkpoint, so the rebuilt model matches the saved weights.
        device_id (int): CUDA device index, or -1 to run on CPU.
    """
    init_logger(args.log_file)
    logger.info(str(args))
    # '-1' in --visible_gpus means CPU-only.
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    logger.info('Device ID %d' % device_id)
    logger.info('Device %s' % device)
    # Fix random seeds so runs are reproducible.
    torch.manual_seed(args.seed)
    random.seed(args.seed)
    torch.backends.cudnn.deterministic = True

    if device_id >= 0:
        torch.cuda.set_device(device_id)
        torch.cuda.manual_seed(args.seed)

    # Optionally resume training: model-structure flags from the checkpoint
    # take precedence over the current CLI values.
    if args.train_from != '':
        logger.info('Loading checkpoint from %s' % args.train_from)
        checkpoint = torch.load(args.train_from,
                                map_location=lambda storage, loc: storage)
        opt = vars(checkpoint['opt'])
        for k in opt.keys():
            if k in model_flags:
                setattr(args, k, opt[k])
    else:
        checkpoint = None

    # Optionally initialize the encoder from a trained extractive model
    # (only its 'model' state dict is used).
    if args.load_from_extractive != '':
        logger.info('Loading bert from extractive model %s' %
                    args.load_from_extractive)
        bert_from_extractive = torch.load(
            args.load_from_extractive,
            map_location=lambda storage, loc: storage)
        bert_from_extractive = bert_from_extractive['model']
    else:
        bert_from_extractive = None
    # NOTE(review): seeds are re-set a second time here — redundant but
    # harmless, kept to preserve behavior.
    torch.manual_seed(args.seed)
    random.seed(args.seed)
    torch.backends.cudnn.deterministic = True

    # Special-token ids and tokenizer depend on the chosen encoder.
    symbols, tokenizer = get_symbol_and_tokenizer(args.encoder, args.temp_dir)

    model = AbsSummarizer(args,
                          device,
                          checkpoint,
                          bert_from_extractive,
                          symbols=symbols)
    # Either separate optimizers for encoder and decoder, or one shared one.
    if args.sep_optim:
        optim_enc = model_builder.build_optim_enc(args, model, checkpoint)
        optim_dec = model_builder.build_optim_dec(args, model, checkpoint)
        optim = [optim_enc, optim_dec]
    else:
        optim = [model_builder.build_optim(args, model, checkpoint)]

    logger.info(model)

    def train_iter_fct():
        # Factory so the trainer can restart iteration each epoch.
        return data_loader.Dataloader(args,
                                      load_dataset(args, 'train',
                                                   shuffle=True),
                                      args.batch_size,
                                      device,
                                      shuffle=True,
                                      is_test=False,
                                      tokenizer=tokenizer)

    # NLL loss over the generator's output vocabulary, with label smoothing.
    train_loss = abs_loss(model.generator,
                          symbols,
                          model.vocab_size,
                          device,
                          train=True,
                          label_smoothing=args.label_smoothing)

    trainer = build_trainer(args, device_id, model, optim, train_loss)

    trainer.train(train_iter_fct, args.train_steps)
Exemplo n.º 22
0
def train_abs_single(args, device_id):
    """Train an abstractive summarizer on one device using PhoBERT BPE.

    Args:
        args: parsed command-line namespace. Fields listed in ``model_flags``
            may be overridden by the options saved in a resumed checkpoint.
        device_id (int): CUDA device index, or -1 to run on CPU.

    NOTE(review): tokenizer/vocab paths below are hardcoded to a Colab
    layout (/content/PhoBERT_base_transformers/...) — presumably this
    variant targets Vietnamese PhoBERT data; confirm before reuse.
    """
    init_logger(args.log_file)
    logger.info(str(args))
    # '-1' in --visible_gpus means CPU-only.
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    logger.info('Device ID %d' % device_id)
    logger.info('Device %s' % device)
    # Fix random seeds so runs are reproducible.
    torch.manual_seed(args.seed)
    random.seed(args.seed)
    torch.backends.cudnn.deterministic = True

    if device_id >= 0:
        torch.cuda.set_device(device_id)
        torch.cuda.manual_seed(args.seed)

    # Optionally resume training: model-structure flags from the checkpoint
    # take precedence over the current CLI values.
    if args.train_from != '':
        logger.info('Loading checkpoint from %s' % args.train_from)
        checkpoint = torch.load(args.train_from,
                                map_location=lambda storage, loc: storage)
        opt = vars(checkpoint['opt'])
        for k in opt.keys():
            if (k in model_flags):
                setattr(args, k, opt[k])
    else:
        checkpoint = None

    # Optionally initialize the encoder from a trained extractive model
    # (only its 'model' state dict is used).
    if (args.load_from_extractive != ''):
        logger.info('Loading bert from extractive model %s' %
                    args.load_from_extractive)
        bert_from_extractive = torch.load(
            args.load_from_extractive,
            map_location=lambda storage, loc: storage)
        bert_from_extractive = bert_from_extractive['model']
    else:
        bert_from_extractive = None
    # NOTE(review): seeds are re-set a second time here — redundant but
    # harmless, kept to preserve behavior.
    torch.manual_seed(args.seed)
    random.seed(args.seed)
    torch.backends.cudnn.deterministic = True

    def train_iter_fct():
        # Factory so the trainer can restart training iteration each epoch.
        return data_loader.Dataloader(args,
                                      load_dataset(args, 'train',
                                                   shuffle=True),
                                      args.batch_size,
                                      device,
                                      shuffle=True,
                                      is_test=False)

    def valid_iter_fct():
        # Factory for the validation split, same shuffling as training.
        return data_loader.Dataloader(args,
                                      load_dataset(args, 'valid',
                                                   shuffle=True),
                                      args.batch_size,
                                      device,
                                      shuffle=True,
                                      is_test=False)

    model = AbsSummarizer(args, device, checkpoint, bert_from_extractive)
    # Either separate optimizers for BERT encoder and decoder, or one shared.
    if (args.sep_optim):
        optim_bert = model_builder.build_optim_bert(args, model, checkpoint)
        optim_dec = model_builder.build_optim_dec(args, model, checkpoint)
        optim = [optim_bert, optim_dec]
    else:
        optim = [model_builder.build_optim(args, model, checkpoint)]

    logger.info(model)
    print("model.vocab_size" + str(model.vocab_size))

    # Build a fastBPE tokenizer from PhoBERT's BPE codes; only --bpe-codes is
    # parsed here (parse_known_args ignores the caller's other CLI flags).
    parser = argparse.ArgumentParser()
    parser.add_argument('--bpe-codes',
                        default="/content/PhoBERT_base_transformers/bpe.codes",
                        required=False,
                        type=str,
                        help='path to fastBPE BPE')
    args1, unknown = parser.parse_known_args()
    bpe = fastBPE(args1)

    # Load the dictionary
    vocab = Dictionary()
    vocab.add_from_file("/content/PhoBERT_base_transformers/dict.txt")

    tokenizer = bpe
    # Special-token ids taken from the fairseq-style dictionary: reserved
    # [unusedN] slots act as BOS / EOS / EOQ markers.
    symbols = {
        'BOS': vocab.indices['[unused0]'],
        'EOS': vocab.indices['[unused1]'],
        'PAD': vocab.indices['[PAD]'],
        'EOQ': vocab.indices['[unused2]']
    }

    # NLL loss over the generator's output vocabulary, with label smoothing.
    train_loss = abs_loss(model.generator,
                          symbols,
                          model.vocab_size,
                          device,
                          train=True,
                          label_smoothing=args.label_smoothing)

    trainer = build_trainer(args, device_id, model, optim, train_loss)

    trainer.train(train_iter_fct=train_iter_fct,
                  train_steps=args.train_steps,
                  valid_iter_fct=valid_iter_fct)
Exemplo n.º 23
0
    return get_clf_report(embeds, markup, url2record, best_dist)


if __name__ == "__main__":
    import argparse

    from models.model_builder import AbsSummarizer

    # CLI: argv[1] = checkpoint path, argv[2] = embedding mode for doc2vec.
    checkpoint = torch.load(sys.argv[1],
                            map_location=lambda storage, loc: storage)
    embed_mode = sys.argv[2]

    # Minimal argument bag for AbsSummarizer. (The original used
    # `args = lambda a: b` — attribute assignment on a throwaway function
    # object, referencing an undefined `b`; argparse.Namespace is the
    # idiomatic, behavior-identical replacement.)
    args = argparse.Namespace()

    args.model_path = '/data/alolbuhtijarov/rubert_cased_L-12_H-768_A-12_pt'
    args.large = False
    args.temp_dir = 'temp'
    args.finetune_bert = False
    args.encoder = 'bert'
    args.max_pos = 256
    args.dec_layers = 6
    args.share_emb = False
    args.dec_hidden_size = 768
    args.dec_heads = 8
    args.dec_ff_size = 2048
    args.dec_dropout = 0.2
    args.use_bert_emb = False

    # Preprocessor: source truncated at 510 tokens, target at 128.
    bert_data = BertData(args.model_path, True, 510, 128)

    # Build the summarizer on CPU in inference mode and cluster documents by
    # their model-derived embeddings.
    model = AbsSummarizer(args, 'cpu', checkpoint)
    model.eval()

    eval_clustering(lambda text: doc2vec(text, model, mode=embed_mode))
Exemplo n.º 24
0
def validate(args, device_id, pt, step):
    '''Implements validation process (meta / non-meta)
    Arguments:
        device_id (int) : the GPU id to be used
        pt : checkpoint path; '' falls back to args.test_from
        step (int) : checkpoint step
    Process:
        - load checkpoint
        - prepare dataloader class
        - prepare model class
        - prepare loss func, which returns loss class
        - prepare trainer
        - trainer.validate()
    Meta vs Normal
        - MetaDataloader      vs Dataloader
        - load_meta_dataset   vs load_dataset
        - MTLAbsSummarizer    vs AbsSummarizer
        - build_MTLtrainer    vs build_trainer
    Returns:
        float: validation cross-entropy (stats.xent()).
    '''
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    logger.info('Device ID %d' % device_id)
    logger.info('Device %s' % device)

    # Fix random seeds to control the experiment
    torch.manual_seed(args.seed)
    random.seed(args.seed)
    torch.backends.cudnn.deterministic = True
    if device_id >= 0:
        torch.cuda.set_device(device_id)
        torch.cuda.manual_seed(args.seed)

    # Load checkpoint and args: an explicit `pt` wins over args.test_from
    if (pt != ''):
        test_from = pt
    else:
        test_from = args.test_from
    logger.info('Loading checkpoint from %s' % test_from)
    checkpoint = torch.load(test_from,
                            map_location=lambda storage, loc: storage)
    opt = vars(checkpoint['opt'])  # the args the checkpoint was trained with
    # Model-structure flags from the checkpoint override CLI values so the
    # rebuilt model matches the saved weights.
    for k in opt.keys():
        if (k in model_flags):
            setattr(args, k, opt[k])

    # Prepare dataloader
    if (args.meta_mode):

        def valid_iter_fct():
            return data_loader.MetaDataloader(args,
                                              load_meta_dataset(args,
                                                                'valid',
                                                                shuffle=True),
                                              args.batch_size,
                                              device,
                                              shuffle=True,
                                              is_test=False)

    else:
        valid_iter = data_loader.Dataloader(args,
                                            load_dataset(args,
                                                         'valid',
                                                         shuffle=False),
                                            args.batch_size,
                                            device,
                                            shuffle=False,
                                            is_test=False)

    # Prepare model
    if (args.meta_mode):
        model = MTLAbsSummarizer(args, device, checkpoint)
    else:
        model = AbsSummarizer(args, device, checkpoint)
    #model.eval()

    # Prepare optimizer for inner loop
    # The optimizer for each task is separated
    if (args.meta_mode):
        optims_inner = []
        for i in range(args.num_task):
            if (args.sep_optim):
                optim_bert_inner = model_builder.build_optim_bert_inner(
                    args, model, checkpoint, 'maml')
                optim_dec_inner = model_builder.build_optim_dec_inner(
                    args, model, checkpoint, 'maml')
                optims_inner.append([optim_bert_inner, optim_dec_inner])
            else:
                # BUGFIX: was `self.optims_inner.append(...)` — `self` does
                # not exist in this module-level function and raised a
                # NameError in meta_mode without sep_optim. Matches the
                # correct pattern used in train_abs_single.
                optims_inner.append([
                    model_builder.build_optim_inner(args, model, checkpoint,
                                                    'maml')
                ])

    # Prepare optimizer (not actually used, but get the step information)
    if (args.sep_optim):
        optim_bert = model_builder.build_optim_bert(args, model, checkpoint)
        optim_dec = model_builder.build_optim_dec(args, model, checkpoint)
        optim = [optim_bert, optim_dec]
    else:
        optim = [model_builder.build_optim(args, model, checkpoint)]

    # Prepare loss: reserved [unusedN] BERT vocab slots act as BOS/EOS/EOQ
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                              do_lower_case=True,
                                              cache_dir=args.temp_dir)
    symbols = {
        'BOS': tokenizer.vocab['[unused0]'],
        'EOS': tokenizer.vocab['[unused1]'],
        'PAD': tokenizer.vocab['[PAD]'],
        'EOQ': tokenizer.vocab['[unused2]']
    }

    # Prepare loss computation (train=False: evaluation-mode loss)
    valid_loss = abs_loss(model.generator,
                          symbols,
                          model.vocab_size,
                          device,
                          train=False)

    # Prepare trainer and perform validation
    if (args.meta_mode):
        trainer = build_MTLtrainer(args, device_id, model, optim, optims_inner,
                                   valid_loss)
        stats = trainer.validate(valid_iter_fct, step)
    else:
        trainer = build_trainer(args, device_id, model, None, valid_loss)
        stats = trainer.validate(valid_iter, step)

    return stats.xent()
Exemplo n.º 25
0
def train_abs_single(args, device_id):
    """Implements training process (meta / non-meta)

    Args:
        args: parsed command-line namespace; fields in ``model_flags`` may be
            overridden by a resumed checkpoint's saved options.
        device_id (int) : the GPU id to be used, or -1 for CPU.
    """

    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    logger.info('Device ID %d', device_id)
    logger.info('Device %s', device)

    # Fix random seed to control experiement
    torch.manual_seed(args.seed)
    random.seed(args.seed)
    torch.backends.cudnn.deterministic = True
    if device_id >= 0:  # if use GPU
        torch.cuda.set_device(device_id)
        torch.cuda.manual_seed(args.seed)

    # Load checkpoint and args; model-structure flags saved in the checkpoint
    # override the current CLI values so the model matches the saved weights.
    if args.train_from != '':
        logger.info('Loading checkpoint from %s', args.train_from)
        checkpoint = torch.load(args.train_from,
                                map_location=lambda storage, loc: storage)
        opt = vars(checkpoint['opt'])  # which is self.args
        for k in opt.keys():
            if k in model_flags:
                setattr(args, k, opt[k])
    else:
        checkpoint = None

    # Load extractive model as initial parameter (proposed by Presumm);
    # only its 'model' state dict is used.
    if args.load_from_extractive != '':
        logger.info('Loading bert from extractive model %s',
                    args.load_from_extractive)
        bert_from_extractive = torch.load(
            args.load_from_extractive,
            map_location=lambda storage, loc: storage)
        bert_from_extractive = bert_from_extractive['model']
    else:
        bert_from_extractive = None

    # Prepare dataloader (factories so the trainer can restart iteration)
    if args.meta_mode:

        def meta_train_iter_fct():
            return data_loader.MetaDataloader(args,
                                              load_meta_dataset(args,
                                                                'train',
                                                                shuffle=True),
                                              args.batch_size,
                                              device,
                                              shuffle=True,
                                              is_test=False)
    else:

        def train_iter_fct():
            return data_loader.Dataloader(args,
                                          load_dataset(args,
                                                       'train',
                                                       shuffle=True),
                                          args.batch_size,
                                          device,
                                          shuffle=True,
                                          is_test=False)

    # Prepare model (meta-learning variant vs plain abstractive summarizer)
    if args.meta_mode:
        model = MTLAbsSummarizer(args, device, checkpoint,
                                 bert_from_extractive)
    else:
        model = AbsSummarizer(args, device, checkpoint, bert_from_extractive)

    # Prepare optimizer for inner loop
    # The optimizer for each task is seperated
    if args.meta_mode:
        optims_inner = []
        for _ in range(args.num_task):
            if args.sep_optim:
                optim_bert_inner = model_builder.build_optim_bert_inner(
                    args, model, checkpoint, 'maml')
                optim_dec_inner = model_builder.build_optim_dec_inner(
                    args, model, checkpoint, 'maml')
                optims_inner.append([optim_bert_inner, optim_dec_inner])
            else:
                optims_inner.append([
                    model_builder.build_optim_inner(args, model, checkpoint,
                                                    'maml')
                ])

    # Prepare optimizer for outer loop (separate BERT/decoder or shared)
    if args.sep_optim:
        optim_bert = model_builder.build_optim_bert(args, model, checkpoint)
        optim_dec = model_builder.build_optim_dec(args, model, checkpoint)
        optims = [optim_bert, optim_dec]
    else:
        optims = [model_builder.build_optim(args, model, checkpoint)]

    # Prepare tokenizer; reserved [unusedN] vocab slots act as special tokens
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                              do_lower_case=True,
                                              cache_dir=args.temp_dir)
    symbols = {
        'BOS': tokenizer.vocab['[unused0]'],  # id = 1
        'EOS': tokenizer.vocab['[unused1]'],  # id = 2
        'EOQ': tokenizer.vocab['[unused2]'],  # id = 3
        'PAD': tokenizer.vocab['[PAD]']  # id = 0
    }

    # Self Check : special word ids
    # NOTE(review): computed but never used below — debugging leftover.
    special_words = [w for w in tokenizer.vocab.keys() if "[" in w]
    special_word_ids = [
        tokenizer.convert_tokens_to_ids(w) for w in special_words
    ]

    # Prepare loss computation (NLL over generator vocab, label-smoothed)
    train_loss = abs_loss(model.generator,
                          symbols,
                          model.vocab_size,
                          device,
                          train=True,
                          label_smoothing=args.label_smoothing)

    # Prepare trainer and perform training
    if args.meta_mode:
        trainer = build_MTLtrainer(args, device_id, model, optims,
                                   optims_inner, train_loss)
        trainer.train(meta_train_iter_fct)
    else:
        trainer = build_trainer(args, device_id, model, optims, train_loss)
        trainer.train(train_iter_fct, args.train_steps)