def test_text_abs(args, device_id, pt, step):
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    if pt != '':
        test_from = pt
    else:
        test_from = args.test_from
    logger.info('Loading checkpoint from %s' % test_from)
    checkpoint = torch.load(test_from, map_location=lambda storage, loc: storage)
    opt = vars(checkpoint['opt'])
    for k in opt.keys():
        if k in model_flags:
            setattr(args, k, opt[k])
    print(args)

    model = AbsSummarizer(args, device, checkpoint)
    model.eval()

    test_iter = data_loader.Dataloader(args, load_dataset(args, 'test', shuffle=False),
                                       args.test_batch_size, device,
                                       shuffle=False, is_test=True)
    phobert = AutoModel.from_pretrained("vinai/phobert-base")
    tokenizer = AutoTokenizer.from_pretrained('vinai/phobert-base')
    symbols = {'BOS': tokenizer.vocab['[unused0]'],
               'EOS': tokenizer.vocab['[unused1]'],
               'PAD': tokenizer.vocab['[PAD]'],
               'EOQ': tokenizer.vocab['[unused2]']}
    predictor = build_predictor(args, tokenizer, symbols, model, logger)
    predictor.translate(test_iter, step)
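# A minimal driver sketch for the test path above. The flag names
# (-test_from, -visible_gpus, -test_batch_size) follow the PreSumm-style CLI
# and are assumptions, not necessarily this repo's exact argument set.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("-test_from", default='', type=str)
    parser.add_argument("-visible_gpus", default='-1', type=str)
    parser.add_argument("-test_batch_size", default=3000, type=int)
    cli_args = parser.parse_args()
    # step only labels the output files produced by the predictor
    test_text_abs(cli_args, device_id=0, pt=cli_args.test_from, step=-1)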
def validate(args, device_id, pt, step):
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    if pt != '':
        test_from = pt
    else:
        test_from = args.test_from
    logger.info('Loading checkpoint from %s' % test_from)
    checkpoint = torch.load(test_from, map_location=lambda storage, loc: storage)
    opt = vars(checkpoint['opt'])
    for k in opt.keys():
        if k in model_flags:
            setattr(args, k, opt[k])
    print(args)

    symbols, tokenizer = get_symbol_and_tokenizer(args.encoder, args.temp_dir)
    model = AbsSummarizer(args, device, checkpoint, symbols=symbols)
    model.eval()

    valid_iter = data_loader.Dataloader(args, load_dataset(args, 'valid', shuffle=False),
                                        args.batch_size, device,
                                        shuffle=False, is_test=False, tokenizer=tokenizer)
    valid_loss = abs_loss(model.generator, symbols, model.vocab_size,
                          train=False, device=device)
    trainer = build_trainer(args, device_id, model, None, valid_loss)
    stats = trainer.validate(valid_iter, step)
    return stats.xent()
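# get_symbol_and_tokenizer is called above but not defined here. A plausible
# sketch follows; the [unused*] token names are assumptions borrowed from the
# symbol dictionaries used elsewhere in this file.
def get_symbol_and_tokenizer(encoder, temp_dir):
    if encoder == 'bert':
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                                  do_lower_case=True, cache_dir=temp_dir)
        symbols = {'BOS': tokenizer.vocab['[unused0]'],
                   'EOS': tokenizer.vocab['[unused1]'],
                   'PAD': tokenizer.vocab['[PAD]'],
                   'EOQ': tokenizer.vocab['[unused2]']}
    else:
        raise NotImplementedError(encoder)
    return symbols, tokenizer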
def test_text_abs(args, device_id, pt, step):
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    if pt != '':
        test_from = pt
    else:
        test_from = args.test_from
    logger.info('Loading checkpoint from %s' % test_from)
    checkpoint = torch.load(test_from, map_location=lambda storage, loc: storage)
    opt = vars(checkpoint['opt'])
    for k in opt.keys():
        if k in model_flags:
            setattr(args, k, opt[k])
    print(args)

    model = AbsSummarizer(args, device, checkpoint)
    model.eval()

    test_iter = data_loader.Dataloader(args, load_dataset(args, 'test', shuffle=False),
                                       args.test_batch_size, device,
                                       shuffle=False, is_test=True)
    # for Chinese tokenization
    add_token_list = ['[unused1]', '[unused2]', '[unused3]', '[unused4]', '[unused5]']
    if args.bart:
        # NOTE: 'bart-base' presumably resolves to a local copy; the hub id is
        # 'facebook/bart-base'.
        tokenizer = AutoTokenizer.from_pretrained('bart-base', do_lower_case=True,
                                                  cache_dir=args.temp_dir, local_files_only=False)
        # tokenizer = AutoTokenizer.from_pretrained('/home/ybai/downloads/bart', do_lower_case=True,
        #                                           cache_dir=args.temp_dir, local_files_only=False)
        symbols = {'BOS': tokenizer.encoder['madeupword0000'],
                   'EOS': tokenizer.encoder['madeupword0001'],
                   'PAD': tokenizer.encoder['<pad>'],
                   'EOQ': tokenizer.encoder['madeupword0002']}
    else:
        tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased',
                                                  do_lower_case=True, cache_dir=args.temp_dir,
                                                  local_files_only=False,
                                                  additional_special_tokens=add_token_list)
        symbols = {'BOS': tokenizer.vocab['[unused1]'],
                   'EOS': tokenizer.vocab['[unused2]'],
                   'PAD': tokenizer.vocab['[PAD]'],
                   'EOQ': tokenizer.vocab['[unused3]']}
    predictor = build_predictor(args, tokenizer, symbols, model, logger)
    predictor.translate(test_iter, step)
def test_text_abs(args, device_id, pt, step):
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    if pt != '':
        test_from = pt
    else:
        test_from = args.test_from
    logger.info('Loading checkpoint from %s' % test_from)
    checkpoint = torch.load(test_from, map_location=lambda storage, loc: storage)
    opt = vars(checkpoint['opt'])
    for k in opt.keys():
        if k in model_flags:
            setattr(args, k, opt[k])
    print(args)

    model = AbsSummarizer(args, device, checkpoint)
    model.eval()

    test_iter = data_loader.Dataloader(args, load_dataset(args, 'test', shuffle=False),
                                       args.test_batch_size, device,
                                       shuffle=False, is_test=True)
    vocab = get_kobert_vocab(cachedir=args.temp_dir)
    # note: 'EOQ' is deliberately mapped to the same id as '[EOS]' here
    symbols = {'BOS': vocab.token_to_idx['[BOS]'],
               'EOS': vocab.token_to_idx['[EOS]'],
               'PAD': vocab.token_to_idx['[PAD]'],
               'EOQ': vocab.token_to_idx['[EOS]']}
    predictor = build_predictor(args, vocab, symbols, model, logger)
    predictor.translate(test_iter, step)
def test_text_abs(args, device_id, pt, step):
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    if pt != '':
        test_from = pt
    else:
        test_from = args.test_from
    logger.info('Loading checkpoint from %s' % test_from)
    checkpoint = torch.load(test_from, map_location=lambda storage, loc: storage)
    opt = vars(checkpoint['opt'])
    for k in opt.keys():
        if k in model_flags:
            setattr(args, k, opt[k])
    print(args)

    model = AbsSummarizer(args, device, checkpoint)
    model.eval()

    test_iter = data_loader.Dataloader(args, load_dataset(args, 'test', shuffle=False),
                                       args.test_batch_size, device,
                                       shuffle=False, is_test=True)
    tokenizer = BertData(args).tokenizer
    # tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True, cache_dir=args.temp_dir)
    # tokenizer = None
    # if args.pretrained_model_type in ['bert-base-uncased', 'bert-base-multilingual-uncased']:
    #     tokenizer = BertTokenizer.from_pretrained(args.pretrained_model_type, do_lower_case=True,
    #                                               cache_dir=args.temp_dir)
    #
    # if not tokenizer:
    #     raise NotImplementedError("tokenizer")
    # tokenizer = add_to_vocab(tokenizer, ['[unused0]', '[unused1]', '[PAD]', '[unused2]'])
    symbols = {'BOS': tokenizer.convert_tokens_to_ids('[unused0]'),
               'EOS': tokenizer.convert_tokens_to_ids('[unused1]'),
               'PAD': tokenizer.convert_tokens_to_ids('[PAD]'),
               'EOQ': tokenizer.convert_tokens_to_ids('[unused2]')}
    predictor = build_predictor(args, tokenizer, symbols, model, logger)
    predictor.translate(test_iter, step)
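# The commented-out block above references an add_to_vocab helper that is not
# defined in this file. A minimal sketch, assuming the standard Hugging Face
# tokenizers API (add_tokens skips tokens already in the vocab):
def add_to_vocab(tokenizer, tokens):
    # Register any missing special tokens so convert_tokens_to_ids resolves
    # them to real ids instead of [UNK]. If new ids are actually added, the
    # model's embedding matrix would also need resizing to match.
    tokenizer.add_tokens(tokens, special_tokens=True)
    return tokenizer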
def convert_bertabs_checkpoints(path_to_checkpoints, dump_path):
    """Copy/paste and tweak the pre-trained weights provided by the creators
    of BertAbs for the internal architecture."""
    # Instantiate the authors' model with the pre-trained weights
    config = BertAbsConfig(
        temp_dir=".",
        finetune_bert=False,
        large=False,
        share_emb=True,
        use_bert_emb=False,
        encoder="bert",
        max_pos=512,
        enc_layers=6,
        enc_hidden_size=512,
        enc_heads=8,
        enc_ff_size=512,
        enc_dropout=0.2,
        dec_layers=6,
        dec_hidden_size=768,
        dec_heads=8,
        dec_ff_size=2048,
        dec_dropout=0.2,
    )
    checkpoints = torch.load(path_to_checkpoints, lambda storage, loc: storage)
    original = AbsSummarizer(config, torch.device("cpu"), checkpoints)
    original.eval()

    new_model = BertAbsSummarizer(config, torch.device("cpu"))
    new_model.eval()

    # -------------------
    # Convert the weights
    # -------------------
    logging.info("convert the model")
    new_model.bert.load_state_dict(original.bert.state_dict())
    new_model.decoder.load_state_dict(original.decoder.state_dict())
    new_model.generator.load_state_dict(original.generator.state_dict())

    # -----------------------------------
    # Make sure the outputs are identical
    # -----------------------------------
    logging.info("Make sure that the models' outputs are identical")
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    # prepare the model inputs
    encoder_input_ids = tokenizer.encode("This is sample éàalj'-.")
    encoder_input_ids.extend([tokenizer.pad_token_id] * (512 - len(encoder_input_ids)))
    encoder_input_ids = torch.tensor(encoder_input_ids).unsqueeze(0)
    decoder_input_ids = tokenizer.encode("This is sample 3 éàalj'-.")
    decoder_input_ids.extend([tokenizer.pad_token_id] * (512 - len(decoder_input_ids)))
    decoder_input_ids = torch.tensor(decoder_input_ids).unsqueeze(0)

    # failsafe to make sure the weights reset does not affect the loaded weights.
    assert torch.max(torch.abs(original.generator[0].weight - new_model.generator[0].weight)) == 0

    # forward pass
    src = encoder_input_ids
    tgt = decoder_input_ids
    segs = token_type_ids = None
    clss = None
    mask_src = encoder_attention_mask = None
    mask_tgt = decoder_attention_mask = None
    mask_cls = None

    # The original model does not apply the generator layer immediately but rather in
    # the beam search (where it combines softmax + linear layer). Since we already
    # apply the softmax in our generation process we only apply the linear layer here.
    # We make sure that the outputs of the full stack are identical.
    output_original_model = original(src, tgt, segs, clss, mask_src, mask_tgt, mask_cls)[0]
    output_original_generator = original.generator(output_original_model)

    output_converted_model = new_model(encoder_input_ids, decoder_input_ids, token_type_ids,
                                       encoder_attention_mask, decoder_attention_mask)[0]
    output_converted_generator = new_model.generator(output_converted_model)

    maximum_absolute_difference = torch.max(
        torch.abs(output_converted_model - output_original_model)).item()
    print("Maximum absolute difference between model outputs: {:.2f}".format(
        maximum_absolute_difference))
    maximum_absolute_difference = torch.max(
        torch.abs(output_converted_generator - output_original_generator)).item()
    print("Maximum absolute difference between generator outputs: {:.2f}".format(
        maximum_absolute_difference))

    are_identical = torch.allclose(output_converted_model, output_original_model, atol=1e-3)
    if are_identical:
        logging.info("all weights are equal up to 1e-3")
    else:
        raise ValueError(
            "the weights are different. The new model is likely different from the original one."
        )

    # The model has been saved with torch.save(model) and this is bound to the exact
    # directory structure. We save the state_dict instead.
    logging.info("saving the model's state dictionary")
    torch.save(
        new_model.state_dict(),
        "bertabs-finetuned-cnndm-extractive-abstractive-summarization-pytorch_model.bin"
    )
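# A usage sketch for the conversion function above; the flag names follow the
# convention of Hugging Face conversion scripts but are assumptions here.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--bertabs_checkpoint_path", type=str, required=True,
                        help="Path to the official checkpoint (a full torch.save of the model).")
    parser.add_argument("--pytorch_dump_folder_path", type=str, required=True,
                        help="Where to write the converted state_dict.")
    cli = parser.parse_args()
    convert_bertabs_checkpoints(cli.bertabs_checkpoint_path, cli.pytorch_dump_folder_path)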
                    default=True)
args = parser.parse_args()
args.gpu_ranks = [int(i) for i in range(len(args.visible_gpus.split(',')))]
args.world_size = len(args.gpu_ranks)
os.environ["CUDA_VISIBLE_DEVICES"] = args.visible_gpus

device = "cpu" if args.visible_gpus == '-1' else "cuda"
device_id = 0 if device == "cuda" else -1

checkpoint = torch.load(args.test_from, map_location=lambda storage, loc: storage)
opt = vars(checkpoint['opt'])
model = AbsSummarizer(args, device, checkpoint)
model.eval()
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True,
                                          cache_dir=args.temp_dir)


def getrespond(request):
    context = {}
    if request.POST:
        print(request.POST["Language"])
        # print(request.POST)
        context["input"] = request.POST["input_block"]
        if request.POST["Language"] != "English":
            context["err"] = "ERROR: This language is not currently supported. Please try again later."
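# A hedged sketch of how a view like getrespond might finish: run the loaded
# model over the posted text and render a template. The summarize() helper and
# the template name are hypothetical, not part of the snippet above.
from django.shortcuts import render

def getrespond_sketch(request):
    context = {}
    if request.POST and request.POST.get("Language") == "English":
        context["input"] = request.POST["input_block"]
        context["output"] = summarize(context["input"])  # hypothetical helper
    return render(request, "index.html", context)  # hypothetical template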
def validate(args, device_id, pt, step):
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    if pt != '':
        test_from = pt
    else:
        test_from = args.test_from
    logger.info('Loading checkpoint from %s' % test_from)
    checkpoint = torch.load(test_from, map_location=lambda storage, loc: storage)
    opt = vars(checkpoint['opt'])
    for k in opt.keys():
        if k in model_flags:
            setattr(args, k, opt[k])
    print(args)

    valid_iter = data_loader.Dataloader(args, load_dataset(args, 'valid', shuffle=False),
                                        args.batch_size, device,
                                        shuffle=False, is_test=False)

    if args.bert_model == 'bert-base-multilingual-cased':
        tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased',
                                                  do_lower_case=False, cache_dir=args.temp_dir)
    else:
        tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=True,
                                                  cache_dir=args.temp_dir)
    print(len(tokenizer.vocab))
    if len(tokenizer.vocab) == 31748:
        # The local vocab appears to lack the [unused*] tokens used as
        # BOS/EOS/EOQ below; append them once, then reload the tokenizer so
        # they get ids.
        f = open(args.bert_model + "/vocab.txt", "a")
        f.write("\n[unused1]\n[unused2]\n[unused3]\n[unused4]\n[unused5]\n[unused6]\n[unused7]")
        f.close()
        tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=True)
        print(len(tokenizer.vocab))
    symbols = {'BOS': tokenizer.vocab['[unused1]'],
               'EOS': tokenizer.vocab['[unused2]'],
               'PAD': tokenizer.vocab['[PAD]'],
               'EOQ': tokenizer.vocab['[unused3]']}

    model = AbsSummarizer(args, device, checkpoint)
    model.eval()

    valid_loss = abs_loss(model.generator, symbols, model.vocab_size,
                          train=False, device=device)
    trainer = build_trainer(args, device_id, model, None, valid_loss)
    stats = trainer.validate(valid_iter, step)
    return stats.xent()
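# An alternative to appending lines to vocab.txt on disk: register the tokens
# through the tokenizer API. This is a sketch of the standard Hugging Face
# mechanism, not what the function above does. Tokens added this way get ids
# past the end of the original vocab, so look them up with
# convert_tokens_to_ids (not tokenizer.vocab) and resize the model embeddings
# if new ids are introduced.
def extend_tokenizer(tokenizer, n=7):
    extra = ['[unused%d]' % i for i in range(1, n + 1)]
    tokenizer.add_special_tokens({'additional_special_tokens': extra})
    return tokenizer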
def test_abs(args, device_id, pt, step):
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    if pt != '':
        test_from = pt
    else:
        test_from = args.test_from
    logger.info('Loading checkpoint from %s' % test_from)
    checkpoint = torch.load(test_from, map_location=lambda storage, loc: storage)
    opt = vars(checkpoint['opt'])
    for k in opt.keys():
        if k in model_flags:
            setattr(args, k, opt[k])
    print(args)

    # model = AbsSummarizer(args, device, checkpoint)
    model = AbsSummarizer(args, device, checkpoint=None)
    model.eval()

    def test_iter_fct():
        return data_loader.Dataloader(args, load_dataset(args, 'test', shuffle=False),
                                      args.test_batch_size, device,
                                      shuffle=False, is_test=True)

    def train_iter_fct():
        return data_loader.Dataloader(args, load_dataset(args, 'train', shuffle=False),
                                      args.test_batch_size, device,
                                      shuffle=False, is_test=True)

    def val_iter_fct():
        return data_loader.Dataloader(args, load_dataset(args, 'val', shuffle=False),
                                      args.test_batch_size, device,
                                      shuffle=False, is_test=True)

    # tokenizer = BertTokenizer.from_pretrained('/disk1/sajad/pretrained-bert/scibert_scivocab_uncased',
    #                                           do_lower_case=True)
    # tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True,
    #                                           cache_dir=args.temp_dir)
    tokenizer = BertTokenizer.from_pretrained('allenai/scibert_scivocab_uncased',
                                              do_lower_case=True, cache_dir=args.temp_dir)
    symbols = {'BOS': tokenizer.vocab['[unused0]'],
               'EOS': tokenizer.vocab['[unused1]'],
               'PAD': tokenizer.vocab['[PAD]'],
               'EOQ': tokenizer.vocab['[unused2]']}
    predictor = build_predictor(args, tokenizer, symbols, model, logger)
    predictor.translate(test_iter_fct, step)
args.temp_dir = 'temp'
args.finetune_bert = False
args.encoder = 'bert'
args.max_pos = 256
args.dec_layers = 6
args.share_emb = False
args.dec_hidden_size = 768
args.dec_heads = 8
args.dec_ff_size = 2048
args.dec_dropout = 0.2
args.use_bert_emb = False

bert_data = BertData(args.model_path, True, 510, 128)
BertSumAbs = AbsSummarizer(args, DEVICE, checkpoint)
BertSumAbs.eval()

data = pd.read_json(DATASET_PATH, encoding='utf-8', lines=True, chunksize=CHUNK_SIZE)
for el in tqdm.tqdm(data, total=450000 // CHUNK_SIZE):
    with open('vectors.npy', 'ab') as fvecs, open('text.jsonl', 'a', encoding='utf-8') as ft:
        for j in range(CHUNK_SIZE):
            text = el.iloc[j]["text"].lower().replace('\xa0', ' ').replace('\n', ' ').strip()
            title = el.iloc[j]["title"].lower()
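# The loop above appends raw vectors to 'vectors.npy' in binary-append mode,
# which yields a flat byte stream rather than a well-formed .npy file. A
# sketch of reading such a file back, assuming float32 vectors of dimension
# DIM (DIM is an assumption; it must match whatever the loop writes):
import numpy as np

DIM = 768  # assumed embedding size (matches dec_hidden_size above)
vecs = np.fromfile('vectors.npy', dtype=np.float32).reshape(-1, DIM)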