Example #1
def test_text_abs(args, device_id, pt, step):
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    if pt != '':
        test_from = pt
    else:
        test_from = args.test_from
    logger.info('Loading checkpoint from %s' % test_from)

    checkpoint = torch.load(test_from,
                            map_location=lambda storage, loc: storage)
    opt = vars(checkpoint['opt'])
    for k in opt.keys():
        if k in model_flags:
            setattr(args, k, opt[k])
    print(args)

    model = AbsSummarizer(args, device, checkpoint)
    model.eval()

    test_iter = data_loader.Dataloader(args,
                                       load_dataset(args,
                                                    'test',
                                                    shuffle=False),
                                       args.test_batch_size,
                                       device,
                                       shuffle=False,
                                       is_test=True)
    phobert = AutoModel.from_pretrained("vinai/phobert-base")  # loaded here but not used below
    tokenizer = AutoTokenizer.from_pretrained('vinai/phobert-base')
    symbols = {
        'BOS': tokenizer.vocab['[unused0]'],
        'EOS': tokenizer.vocab['[unused1]'],
        'PAD': tokenizer.vocab['[PAD]'],
        'EOQ': tokenizer.vocab['[unused2]']
    }
    predictor = build_predictor(args, tokenizer, symbols, model, logger)
    predictor.translate(test_iter, step)
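
All of these examples assume the same PreSumm-style surroundings: the torch and transformers imports, a module-level logger, and a model_flags whitelist that lets options stored in the checkpoint override command-line arguments. A minimal sketch of that assumed preamble (the module paths follow the PreSumm layout and may differ in your checkout):

import torch
from transformers import AutoModel, AutoTokenizer, BertTokenizer

from models import data_loader
from models.data_loader import load_dataset
from models.model_builder import AbsSummarizer
from models.predictor import build_predictor
from others.logging import logger

# Assumed whitelist of checkpoint options allowed to override args,
# mirroring the PreSumm defaults.
model_flags = ['hidden_size', 'ff_size', 'heads', 'emb_size', 'enc_layers',
               'enc_hidden_size', 'enc_ff_size', 'dec_layers',
               'dec_hidden_size', 'dec_ff_size', 'encoder', 'ff_actv',
               'use_interval']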
Example #2
def validate(args, device_id, pt, step):
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    if pt != '':
        test_from = pt
    else:
        test_from = args.test_from
    logger.info('Loading checkpoint from %s' % test_from)
    checkpoint = torch.load(test_from,
                            map_location=lambda storage, loc: storage)
    opt = vars(checkpoint['opt'])
    for k in opt.keys():
        if k in model_flags:
            setattr(args, k, opt[k])
    print(args)

    symbols, tokenizer = get_symbol_and_tokenizer(args.encoder, args.temp_dir)
    model = AbsSummarizer(args, device, checkpoint, symbols=symbols)
    model.eval()

    valid_iter = data_loader.Dataloader(args,
                                        load_dataset(args,
                                                     'valid',
                                                     shuffle=False),
                                        args.batch_size,
                                        device,
                                        shuffle=False,
                                        is_test=False,
                                        tokenizer=tokenizer)
    valid_loss = abs_loss(model.generator,
                          symbols,
                          model.vocab_size,
                          train=False,
                          device=device)

    trainer = build_trainer(args, device_id, model, None, valid_loss)
    stats = trainer.validate(valid_iter, step)
    return stats.xent()
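
Because validate() returns the validation cross-entropy, a natural driver scores every saved checkpoint and keeps the best one. A hypothetical sketch; args.model_path and the model_step_*.pt naming are assumptions:

import glob
import os


def pick_best_checkpoint(args, device_id):
    # Score each checkpoint with validate() and return the path with the
    # lowest validation cross-entropy.
    scored = []
    for cp in sorted(glob.glob(os.path.join(args.model_path, 'model_step_*.pt'))):
        step = int(cp.split('.')[-2].split('_')[-1])
        scored.append((validate(args, device_id, cp, step), cp))
    return min(scored)[1]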
Example #3
def test_text_abs(args, device_id, pt, step):
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    if pt != '':
        test_from = pt
    else:
        test_from = args.test_from
    logger.info('Loading checkpoint from %s' % test_from)

    checkpoint = torch.load(test_from, map_location=lambda storage, loc: storage)
    opt = vars(checkpoint['opt'])
    for k in opt.keys():
        if k in model_flags:
            setattr(args, k, opt[k])
    print(args)

    model = AbsSummarizer(args, device, checkpoint)
    model.eval()

    test_iter = data_loader.Dataloader(args, load_dataset(args, 'test', shuffle=False),
                                       args.test_batch_size, device,
                                       shuffle=False, is_test=True)
    # extra special tokens for Chinese tokenization
    add_token_list = ['[unused1]', '[unused2]', '[unused3]', '[unused4]', '[unused5]']
    if args.bart:
        tokenizer = AutoTokenizer.from_pretrained('bart-base', do_lower_case=True, cache_dir=args.temp_dir, local_files_only=False)
        # tokenizer = AutoTokenizer.from_pretrained('/home/ybai/downloads/bart', do_lower_case=True,
        #                                           cache_dir=args.temp_dir, local_files_only=False)
        symbols = {'BOS': tokenizer.encoder['madeupword0000'], 'EOS': tokenizer.encoder['madeupword0001'],
                   'PAD': tokenizer.encoder['<pad>'], 'EOQ': tokenizer.encoder['madeupword0002']}
    else:
        tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased', do_lower_case=True,
                                                  cache_dir=args.temp_dir, local_files_only=False, additional_special_tokens=add_token_list)
        symbols = {'BOS': tokenizer.vocab['[unused1]'], 'EOS': tokenizer.vocab['[unused2]'],
                   'PAD': tokenizer.vocab['[PAD]'], 'EOQ': tokenizer.vocab['[unused3]']}
    predictor = build_predictor(args, tokenizer, symbols, model, logger)
    predictor.translate(test_iter, step)
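
The two branches above read ids out of different attributes (tokenizer.encoder for the BART vocabulary, tokenizer.vocab for BERT). Both tokenizer classes also expose convert_tokens_to_ids, so the same mapping can be built uniformly; a sketch:

def build_symbols(tokenizer, bos, eos, pad, eoq):
    # convert_tokens_to_ids works for BERT- and BART-style tokenizers alike.
    ids = tokenizer.convert_tokens_to_ids([bos, eos, pad, eoq])
    return dict(zip(['BOS', 'EOS', 'PAD', 'EOQ'], ids))


# e.g. symbols = build_symbols(tokenizer, '[unused1]', '[unused2]', '[PAD]', '[unused3]')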
Example #4
def test_text_abs(args, device_id, pt, step):
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    if pt != '':
        test_from = pt
    else:
        test_from = args.test_from
    logger.info('Loading checkpoint from %s' % test_from)

    checkpoint = torch.load(test_from,
                            map_location=lambda storage, loc: storage)
    opt = vars(checkpoint['opt'])
    for k in opt.keys():
        if k in model_flags:
            setattr(args, k, opt[k])
    print(args)

    model = AbsSummarizer(args, device, checkpoint)
    model.eval()

    test_iter = data_loader.Dataloader(args,
                                       load_dataset(args,
                                                    'test',
                                                    shuffle=False),
                                       args.test_batch_size,
                                       device,
                                       shuffle=False,
                                       is_test=True)
    vocab = get_kobert_vocab(cachedir=args.temp_dir)
    symbols = {
        'BOS': vocab.token_to_idx['[BOS]'],
        'EOS': vocab.token_to_idx['[EOS]'],
        'PAD': vocab.token_to_idx['[PAD]'],
        'EOQ': vocab.token_to_idx['[EOS]']
    }
    predictor = build_predictor(args, vocab, symbols, model, logger)
    predictor.translate(test_iter, step)
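
Note that EOQ deliberately reuses the [EOS] id here. At decode time the generated ids must be mapped back through the vocabulary; a sketch, assuming the KoBERT vocab is gluonnlp-style and exposes idx_to_token:

def ids_to_text(ids, vocab, symbols):
    # Stop at the first EOS and skip BOS/PAD markers.
    tokens = []
    for i in ids:
        if i == symbols['EOS']:
            break
        if i not in (symbols['BOS'], symbols['PAD']):
            tokens.append(vocab.idx_to_token[i])
    return ' '.join(tokens)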
Example #5
def test_text_abs(args, device_id, pt, step):
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    if pt != '':
        test_from = pt
    else:
        test_from = args.test_from
    logger.info('Loading checkpoint from %s' % test_from)

    checkpoint = torch.load(test_from, map_location=lambda storage, loc: storage)
    opt = vars(checkpoint['opt'])
    for k in opt.keys():
        if k in model_flags:
            setattr(args, k, opt[k])
    print(args)

    model = AbsSummarizer(args, device, checkpoint)
    model.eval()

    test_iter = data_loader.Dataloader(args, load_dataset(args, 'test', shuffle=False),
                                       args.test_batch_size, device,
                                       shuffle=False, is_test=True)
    tokenizer = BertData(args).tokenizer

    #tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True, cache_dir=args.temp_dir)
    # tokenizer = None
    # if args.pretrained_model_type in ['bert-base-uncased', 'bert-base-multilingual-uncased']:
    #     tokenizer = BertTokenizer.from_pretrained(args.pretrained_model_type, do_lower_case=True, cache_dir=args.temp_dir)
    #
    # if not tokenizer:
    #     raise NotImplementedError("tokenizer")

    # tokenizer = add_to_vocab(tokenizer, ['[unused0]', '[unused1]', '[PAD]', '[unused2]'])
    symbols = {'BOS': tokenizer.convert_tokens_to_ids('[unused0]'), 'EOS': tokenizer.convert_tokens_to_ids('[unused1]'),
               'PAD': tokenizer.convert_tokens_to_ids('[PAD]'), 'EOQ': tokenizer.convert_tokens_to_ids('[unused2]')}
    predictor = build_predictor(args, tokenizer, symbols, model, logger)
    predictor.translate(test_iter, step)
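
Unlike a direct vocab lookup, convert_tokens_to_ids silently returns the [UNK] id for a missing token, so a mismatched vocabulary only surfaces as garbage at generation time. A small guard (sketch) catches that early:

unk_id = tokenizer.convert_tokens_to_ids('[UNK]')
missing = sorted(k for k, v in symbols.items() if v == unk_id)
if missing:
    raise ValueError('special tokens missing from vocab: %s' % missing)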
Example #6
def convert_bertabs_checkpoints(path_to_checkpoints, dump_path):
    """Copy/paste and tweak the pre-trained weights provided by the creators
    of BertAbs to fit the internal architecture.
    """

    # Instantiate the authors' model with the pre-trained weights
    config = BertAbsConfig(
        temp_dir=".",
        finetune_bert=False,
        large=False,
        share_emb=True,
        use_bert_emb=False,
        encoder="bert",
        max_pos=512,
        enc_layers=6,
        enc_hidden_size=512,
        enc_heads=8,
        enc_ff_size=512,
        enc_dropout=0.2,
        dec_layers=6,
        dec_hidden_size=768,
        dec_heads=8,
        dec_ff_size=2048,
        dec_dropout=0.2,
    )
    checkpoints = torch.load(path_to_checkpoints, lambda storage, loc: storage)
    original = AbsSummarizer(config, torch.device("cpu"), checkpoints)
    original.eval()

    new_model = BertAbsSummarizer(config, torch.device("cpu"))
    new_model.eval()

    # -------------------
    # Convert the weights
    # -------------------

    logging.info("convert the model")
    new_model.bert.load_state_dict(original.bert.state_dict())
    new_model.decoder.load_state_dict(original.decoder.state_dict())
    new_model.generator.load_state_dict(original.generator.state_dict())

    # ----------------------------------
    # Make sure the outputs are identical
    # ----------------------------------

    logging.info("Make sure that the models' outputs are identical")
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    # prepare the model inputs
    encoder_input_ids = tokenizer.encode("This is sample éàalj'-.")
    encoder_input_ids.extend([tokenizer.pad_token_id] *
                             (512 - len(encoder_input_ids)))
    encoder_input_ids = torch.tensor(encoder_input_ids).unsqueeze(0)
    decoder_input_ids = tokenizer.encode("This is sample 3 éàalj'-.")
    decoder_input_ids.extend([tokenizer.pad_token_id] *
                             (512 - len(decoder_input_ids)))
    decoder_input_ids = torch.tensor(decoder_input_ids).unsqueeze(0)

    # failsafe to make sure the weights reset does not affect the
    # loaded weights.
    assert torch.max(
        torch.abs(original.generator[0].weight -
                  new_model.generator[0].weight)) == 0

    # forward pass
    src = encoder_input_ids
    tgt = decoder_input_ids
    segs = token_type_ids = None
    clss = None
    mask_src = encoder_attention_mask = None
    mask_tgt = decoder_attention_mask = None
    mask_cls = None

    # The original model does not apply the generator layer immediately, but
    # rather inside the beam search (where it combines softmax + linear layer).
    # Since we already apply the softmax in our generation process, we only
    # apply the linear layer here. We make sure that the outputs of the full
    # stack are identical.
    output_original_model = original(src, tgt, segs, clss, mask_src, mask_tgt,
                                     mask_cls)[0]
    output_original_generator = original.generator(output_original_model)

    output_converted_model = new_model(encoder_input_ids, decoder_input_ids,
                                       token_type_ids, encoder_attention_mask,
                                       decoder_attention_mask)[0]
    output_converted_generator = new_model.generator(output_converted_model)

    maximum_absolute_difference = torch.max(
        torch.abs(output_converted_model - output_original_model)).item()
    print("Maximum absolute difference beween weights: {:.2f}".format(
        maximum_absolute_difference))
    maximum_absolute_difference = torch.max(
        torch.abs(output_converted_generator -
                  output_original_generator)).item()
    print("Maximum absolute difference beween weights: {:.2f}".format(
        maximum_absolute_difference))

    are_identical = torch.allclose(output_converted_model,
                                   output_original_model,
                                   atol=1e-3)
    if are_identical:
        logging.info("all weights are equal up to 1e-3")
    else:
        raise ValueError(
            "the outputs differ: the converted model does not match the original one."
        )

    # The original checkpoint was saved with torch.save(model), which ties the
    # pickle to the exact directory structure. We save the state_dict instead.
    logging.info("saving the model's state dictionary")
    torch.save(new_model.state_dict(), dump_path)
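
A hedged usage sketch for the artifact this script writes: load the saved state dict back into a freshly constructed BertAbsSummarizer before generation.

model = BertAbsSummarizer(config, torch.device("cpu"))
model.load_state_dict(torch.load(dump_path, map_location="cpu"))
model.eval()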
Example #7
                    default=True)

args = parser.parse_args()
args.gpu_ranks = [int(i) for i in range(len(args.visible_gpus.split(',')))]
args.world_size = len(args.gpu_ranks)
os.environ["CUDA_VISIBLE_DEVICES"] = args.visible_gpus

device = "cpu" if args.visible_gpus == '-1' else "cuda"
device_id = 0 if device == "cuda" else -1

checkpoint = torch.load(args.test_from,
                        map_location=lambda storage, loc: storage)
opt = vars(checkpoint['opt'])

model = AbsSummarizer(args, device, checkpoint)
model.eval()

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                          do_lower_case=True,
                                          cache_dir=args.temp_dir)


def getrespond(request):
    context = {}
    if request.POST:
        print(request.POST["Language"])
        #print(request.POST)
        context["input"] = request.POST["input_block"]
        if (request.POST["Language"] != "English"):
            context[
                "err"] = "ERROR: The language is not supported currently. Please try later."
Example #8
def validate(args, device_id, pt, step):
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    if pt != '':
        test_from = pt
    else:
        test_from = args.test_from
    logger.info('Loading checkpoint from %s' % test_from)
    checkpoint = torch.load(test_from,
                            map_location=lambda storage, loc: storage)
    opt = vars(checkpoint['opt'])
    for k in opt.keys():
        if k in model_flags:
            setattr(args, k, opt[k])
    print(args)

    valid_iter = data_loader.Dataloader(args,
                                        load_dataset(args,
                                                     'valid',
                                                     shuffle=False),
                                        args.batch_size,
                                        device,
                                        shuffle=False,
                                        is_test=False)

    if args.bert_model == 'bert-base-multilingual-cased':
        tokenizer = BertTokenizer.from_pretrained(
            'bert-base-multilingual-cased',
            do_lower_case=False,
            cache_dir=args.temp_dir)
    else:
        tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                                  do_lower_case=True,
                                                  cache_dir=args.temp_dir)
        print(len(tokenizer.vocab))
        if len(tokenizer.vocab) == 31748:
            f = open(args.bert_model + "/vocab.txt", "a")
            f.write(
                "\n[unused1]\n[unused2]\n[unused3]\n[unused4]\n[unused5]\n[unused6]\n[unused7]"
            )
            f.close()
            tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                                      do_lower_case=True)
        print(len(tokenizer.vocab))

    symbols = {
        'BOS': tokenizer.vocab['[unused1]'],
        'EOS': tokenizer.vocab['[unused2]'],
        'PAD': tokenizer.vocab['[PAD]'],
        'EOQ': tokenizer.vocab['[unused3]']
    }

    model = AbsSummarizer(args, device, checkpoint)
    model.eval()

    valid_loss = abs_loss(model.generator,
                          symbols,
                          model.vocab_size,
                          train=False,
                          device=device)

    trainer = build_trainer(args, device_id, model, None, valid_loss)
    stats = trainer.validate(valid_iter, step)
    return stats.xent()
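
The vocab.txt append above mutates the model directory on disk just to register seven extra symbols. A hedged alternative keeps the change in memory via additional_special_tokens, mirroring Example #3 (if any of the tokens are genuinely new, the model's token embeddings must be resized to match):

extra = ['[unused%d]' % i for i in range(1, 8)]
tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                          do_lower_case=True,
                                          cache_dir=args.temp_dir,
                                          additional_special_tokens=extra)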
Example #9
def test_abs(args, device_id, pt, step):
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    if pt != '':
        test_from = pt
    else:
        test_from = args.test_from
    logger.info('Loading checkpoint from %s' % test_from)

    checkpoint = torch.load(test_from,
                            map_location=lambda storage, loc: storage)
    opt = vars(checkpoint['opt'])
    for k in opt.keys():
        if k in model_flags:
            setattr(args, k, opt[k])
    print(args)

    # model = AbsSummarizer(args, device, checkpoint)
    model = AbsSummarizer(args, device, checkpoint=None)
    model.eval()

    def test_iter_fct():
        return data_loader.Dataloader(args,
                                      load_dataset(args, 'test',
                                                   shuffle=False),
                                      args.test_batch_size,
                                      device,
                                      shuffle=False,
                                      is_test=True)

    def train_iter_fct():
        return data_loader.Dataloader(args,
                                      load_dataset(args,
                                                   'train',
                                                   shuffle=False),
                                      args.test_batch_size,
                                      device,
                                      shuffle=False,
                                      is_test=True)

    def val_iter_fct():
        return data_loader.Dataloader(args,
                                      load_dataset(args, 'val', shuffle=False),
                                      args.test_batch_size,
                                      device,
                                      shuffle=False,
                                      is_test=True)

    # tokenizer = BertTokenizer.from_pretrained('/disk1/sajad/pretrained-bert/scibert_scivocab_uncased', do_lower_case=True)
    # tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True, cache_dir=args.temp_dir)
    tokenizer = BertTokenizer.from_pretrained(
        'allenai/scibert_scivocab_uncased',
        do_lower_case=True,
        cache_dir=args.temp_dir)
    symbols = {
        'BOS': tokenizer.vocab['[unused0]'],
        'EOS': tokenizer.vocab['[unused1]'],
        'PAD': tokenizer.vocab['[PAD]'],
        'EOQ': tokenizer.vocab['[unused2]']
    }
    predictor = build_predictor(args, tokenizer, symbols, model, logger)
    predictor.translate(test_iter_fct, step)
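
Unlike the other examples, the model here is built with checkpoint=None, so only the opt flags stored in the checkpoint are applied and its weights are never loaded. If that is unintentional, the commented-out call restores the fine-tuned weights:

# Pass the loaded checkpoint so AbsSummarizer restores its weights.
model = AbsSummarizer(args, device, checkpoint)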
Example #10
args.temp_dir = 'temp'
args.finetune_bert = False
args.encoder = 'bert'
args.max_pos = 256
args.dec_layers = 6
args.share_emb = False
args.dec_hidden_size = 768
args.dec_heads = 8
args.dec_ff_size = 2048
args.dec_dropout = 0.2
args.use_bert_emb = False

bert_data = BertData(args.model_path, True, 510, 128)

BertSumAbs = AbsSummarizer(args, DEVICE, checkpoint)
BertSumAbs.eval()

data = pd.read_json(DATASET_PATH,
                    encoding='utf-8',
                    lines=True,
                    chunksize=CHUNK_SIZE)

for el in tqdm.tqdm(data, total=450000 // CHUNK_SIZE):
    with open('vectors.npy', 'ab') as fvecs, open('text.jsonl',
                                                  'a',
                                                  encoding='utf-8') as ft:
        for j in range(CHUNK_SIZE):
            text = el.iloc[j]["text"].lower().replace('\xa0', ' ').replace(
                '\n', ' ').strip()
            title = el.iloc[j]["title"].lower()