Example #1
def test_text_abs(args, device_id, pt, step):
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    test_from = pt if pt != '' else args.test_from
    logger.info('Loading checkpoint from %s' % test_from)

    checkpoint = torch.load(test_from, map_location=lambda storage, loc: storage)
    opt = vars(checkpoint['opt'])
    for k in opt.keys():
        if k in model_flags:
            setattr(args, k, opt[k])
    print(args)

    model = AbsSummarizer(args, device, checkpoint)
    model.eval()

    test_iter = data_loader.Dataloader(args, load_dataset(args, 'test', shuffle=False),
                                       args.test_batch_size, device,
                                       shuffle=False, is_test=True)
    #tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True, cache_dir=args.temp_dir)
    tokenizer = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased', cache_dir=args.temp_dir)
    symbols = {'BOS': tokenizer.vocab['[unused0]'], 'EOS': tokenizer.vocab['[unused1]'],
               'PAD': tokenizer.vocab['[PAD]'], 'EOQ': tokenizer.vocab['[unused2]']}
    predictor = build_predictor(args, tokenizer, symbols, model, logger)
    predictor.translate(test_iter, step)
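
As a standalone illustration of the tokenizer these examples build on, here is a minimal sketch (assuming only the transformers package; the sentence is arbitrary) of how MobileBertTokenizer splits and encodes text:

from transformers import MobileBertTokenizer

tokenizer = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased')

# WordPiece tokenization: out-of-vocabulary words are split into subword pieces
tokens = tokenizer.tokenize("Summarization with MobileBERT")

# encode() adds the [CLS]/[SEP] special tokens and maps pieces to vocabulary ids
ids = tokenizer.encode("Summarization with MobileBERT")
print(tokens)
print(ids)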
Example #2
def validate(args, device_id, pt, step):
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    test_from = pt if pt != '' else args.test_from
    logger.info('Loading checkpoint from %s' % test_from)
    checkpoint = torch.load(test_from, map_location=lambda storage, loc: storage)
    opt = vars(checkpoint['opt'])
    for k in opt.keys():
        if k in model_flags:
            setattr(args, k, opt[k])
    print(args)

    model = AbsSummarizer(args, device, checkpoint)
    model.eval()

    valid_iter = data_loader.Dataloader(args, load_dataset(args, 'valid', shuffle=False),
                                        args.batch_size, device,
                                        shuffle=False, is_test=False)

    tokenizer = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased', cache_dir=args.temp_dir)
    #tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True, cache_dir=args.temp_dir)
    symbols = {'BOS': tokenizer.vocab['[unused0]'], 'EOS': tokenizer.vocab['[unused1]'],
               'PAD': tokenizer.vocab['[PAD]'], 'EOQ': tokenizer.vocab['[unused2]']}

    valid_loss = abs_loss(model.generator, symbols, model.vocab_size, train=False, device=device)

    trainer = build_trainer(args, device_id, model, None, valid_loss)
    stats = trainer.validate(valid_iter, step)
    return stats.xent()
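
The symbols dict above maps the decoder's control tokens onto vocabulary slots that BERT-style models reserve but never train on. A minimal check of those ids (a sketch, assuming google/mobilebert-uncased keeps the standard [unusedN] entries):

from transformers import MobileBertTokenizer

tokenizer = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased')

# BOS/EOS/EOQ reuse untrained [unusedN] slots, so the abstractive decoder can
# emit them without colliding with real wordpieces
for name, token in [('BOS', '[unused0]'), ('EOS', '[unused1]'),
                    ('PAD', '[PAD]'), ('EOQ', '[unused2]')]:
    print(name, token, tokenizer.vocab[token])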
Example #3
def train_abs_single(args, device_id):
    init_logger(args.log_file)
    logger.info(str(args))
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    logger.info('Device ID %d' % device_id)
    logger.info('Device %s' % device)
    torch.manual_seed(args.seed)
    random.seed(args.seed)
    torch.backends.cudnn.deterministic = True

    if device_id >= 0:
        torch.cuda.set_device(device_id)
        torch.cuda.manual_seed(args.seed)

    if args.train_from != '':
        logger.info('Loading checkpoint from %s' % args.train_from)
        checkpoint = torch.load(args.train_from,
                                map_location=lambda storage, loc: storage)
        opt = vars(checkpoint['opt'])
        for k in opt.keys():
            if k in model_flags:
                setattr(args, k, opt[k])
    else:
        checkpoint = None

    if args.load_from_extractive != '':
        logger.info('Loading bert from extractive model %s' % args.load_from_extractive)
        bert_from_extractive = torch.load(args.load_from_extractive, map_location=lambda storage, loc: storage)
        bert_from_extractive = bert_from_extractive['model']
    else:
        bert_from_extractive = None

    def train_iter_fct():
        return data_loader.Dataloader(args, load_dataset(args, 'train', shuffle=True), args.batch_size, device,
                                      shuffle=True, is_test=False)

    model = AbsSummarizer(args, device, checkpoint, bert_from_extractive)
    if args.sep_optim:
        optim_bert = model_builder.build_optim_bert(args, model, checkpoint)
        optim_dec = model_builder.build_optim_dec(args, model, checkpoint)
        optim = [optim_bert, optim_dec]
    else:
        optim = [model_builder.build_optim(args, model, checkpoint)]

    logger.info(model)

    #tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True, cache_dir=args.temp_dir)
    tokenizer = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased', cache_dir=args.temp_dir)
    symbols = {'BOS': tokenizer.vocab['[unused0]'], 'EOS': tokenizer.vocab['[unused1]'],
               'PAD': tokenizer.vocab['[PAD]'], 'EOQ': tokenizer.vocab['[unused2]']}

    train_loss = abs_loss(model.generator, symbols, model.vocab_size, device,
                          train=True, label_smoothing=args.label_smoothing)

    trainer = build_trainer(args, device_id, model, optim, train_loss)

    trainer.train(train_iter_fct, args.train_steps)
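
The sep_optim branch above drives the pretrained encoder and the freshly initialized decoder at different speeds. A minimal sketch of that two-optimizer pattern in plain PyTorch (hypothetical modules and illustrative learning rates, not the ones build_optim_bert / build_optim_dec actually use):

import torch

# stand-ins for a pretrained encoder and a randomly initialized decoder
encoder = torch.nn.Linear(768, 768)
decoder = torch.nn.Linear(768, 30522)

# the fine-tuned encoder moves slowly, the untrained decoder learns quickly
optim_bert = torch.optim.Adam(encoder.parameters(), lr=2e-3)
optim_dec = torch.optim.Adam(decoder.parameters(), lr=2e-1)

loss = decoder(encoder(torch.randn(4, 768))).mean()
loss.backward()
for opt in (optim_bert, optim_dec):
    opt.step()
    opt.zero_grad()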
Example #4
    def load_model(self,
                   model_dir: str,
                   model_config: str = "model_config.json"):
        config_path = os.path.join(model_dir, model_config)
        # read the JSON config with a context manager so the handle is closed
        with open(config_path) as f:
            model_config = json.load(f)
        model = BertNer.from_pretrained(model_dir)
        tokenizer = MobileBertTokenizer.from_pretrained(
            model_dir, do_lower_case=model_config["do_lower"])
        return model, tokenizer, model_config
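
load_model expects model_dir to hold both the pretrained weights and a small JSON config; here is a sketch of writing a matching model_config.json ("do_lower" is the only key the method reads, and the directory name is a hypothetical placeholder):

import json
import os

model_dir = "out_ner"  # hypothetical output directory
os.makedirs(model_dir, exist_ok=True)

# minimal config matching what load_model() reads back
with open(os.path.join(model_dir, "model_config.json"), "w") as f:
    json.dump({"do_lower": True}, f)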
Example #5
    def __init__(self, train=True):
        tokenizer = MobileBertTokenizer.from_pretrained(
            'google/mobilebert-uncased')

        def customTokenizer(sentence: str) -> list:
            # keep only the first 510 tokens, then pad out to a fixed length of 512
            max_len = 510
            tokens = tokenizer.tokenize(sentence)
            tokens = ['<cls>'] + \
                     tokens[:max_len] + \
                     ['<sep>'] + \
                     ['<pad>'] * max(0, max_len - len(tokens))
            return tokens

        def getVocab(tokenizer):
            tokenizer.save_pretrained("./")

            token_old = ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]']
            token_new = ['<pad>', '<unk>', '<cls>', '<sep>', '<mask>']

            fin = open("vocab.txt", "rt")
            data = fin.read()
            for old, new in zip(token_old, token_new):
                data = data.replace(old, new)
            fin.close()

            fin = open("vocab_adapted.txt", "wt")
            fin.write(data)
            fin.close()

            f = open('vocab_adapted.txt', 'r')
            v = vocab_from_file_object(f)
            return v

        trainData, testData = torchtext.experimental.datasets.IMDB(
            vocab=getVocab(tokenizer),
            tokenizer=customTokenizer,
            data_select=('train', 'test'))
        self.dataset = trainData if train else testData

        # record the targets for later use when partitioning the data
        self.targets = [sample[0] for sample in self.dataset.data]
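
A quick sanity check on the tokenizer contract above, namely that every sentence comes out exactly 512 tokens long (a sketch re-stating the nested helper at module level so it can be tested):

from transformers import MobileBertTokenizer

tokenizer = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased')
max_len = 510

def custom_tokenize(sentence):
    # same arithmetic as customTokenizer: 1 + min(len, 510) + 1 + padding = 512
    tokens = tokenizer.tokenize(sentence)
    return (['<cls>'] + tokens[:max_len] + ['<sep>']
            + ['<pad>'] * max(0, max_len - len(tokens)))

assert len(custom_tokenize("a short review")) == 512
assert len(custom_tokenize("word " * 2000)) == 512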
Example #6
                        default=None,
                        help='output model path and name')
    parser.add_argument('--benchmark',
                        action='store_true',
                        default=False,
                        help='Get benchmark performance of quantized model.')
    parser.add_argument('--benchmark_nums',
                        type=int,
                        default=1000,
                        help="Benchmark numbers of samples")
    parser.add_argument('--accuracy_only',
                        action='store_true',
                        default=False,
                        help="Mode of benchmark")
    args = parser.parse_args()
    tokenizer = MobileBertTokenizer.from_pretrained(args.input_dir,
                                                    do_lower_case=True)
    eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (
        args.task_name, )
    eval_dataset = load_and_cache_examples(args,
                                           args.task_name,
                                           tokenizer,
                                           evaluate=True)

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, \
        batch_size=args.eval_batch_size)

    def eval_func(model):
        return evaluate_onnxrt(args, model, tokenizer, eval_dataloader)
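
The evaluation loader above follows the standard deterministic-evaluation pattern in PyTorch; a self-contained sketch with a toy dataset:

import torch
from torch.utils.data import DataLoader, SequentialSampler, TensorDataset

eval_dataset = TensorDataset(torch.arange(10, dtype=torch.float).unsqueeze(1))

# SequentialSampler visits samples in a fixed order, so metrics are
# reproducible; DistributedSampler, as the note above says, samples randomly
eval_sampler = SequentialSampler(eval_dataset)
eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=4)

for (batch,) in eval_dataloader:
    print(batch.squeeze(1))  # tensors [0..3], [4..7], [8..9]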
Example #7

def get_labels():
    labels = []
    emotions_path = 'data/emotions.txt'
    # one emotion label per line in the source file
    with open(emotions_path) as f:
        for line in f:
            labels.append(line.replace('\n', ''))
    return labels


def get_features():
    return ['text']


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# example_sentence = 'He isn\'t as big, but he\'s still quite popular. I\'ve heard the same thing about his content. Never watched him much.'

# Load the pretrained MobileBertTokenizer and MobileBertModel
mobile_bert_tokenizer = MobileBertTokenizer.from_pretrained(
    config.PRETRAINED_MODEL_NAME)

mobile_bert_model = MobileBertModel.from_pretrained(
    config.PRETRAINED_MODEL_NAME)
mobile_bert_model.to(device)

# Preprocessing Data
labels_headings = get_labels()
features_headings = get_features()

input_ids, token_type_ids, attention_mask, labels = preprocessing_data(
    features=features_headings, labels=labels_headings)

input_ids = input_ids.type(torch.LongTensor)
attention_mask = attention_mask.type(torch.LongTensor)
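
To complete the picture, a minimal forward pass through the loaded model (a sketch; it assumes config.PRETRAINED_MODEL_NAME is 'google/mobilebert-uncased'):

import torch
from transformers import MobileBertModel, MobileBertTokenizer

tokenizer = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased')
model = MobileBertModel.from_pretrained('google/mobilebert-uncased')
model.eval()

# the tokenizer already returns input_ids, token_type_ids and attention_mask
# as LongTensors, matching the manual casts above
inputs = tokenizer("I feel great today!", return_tensors='pt')
with torch.no_grad():
    outputs = model(**inputs)
print(outputs.last_hidden_state.shape)  # (1, seq_len, 512) for MobileBERT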