Example #1
def _ensure_bert():
    if "tokenizer" not in BERT_SINGLETONS:
        tokenizer = BertTokenizer.from_pretrained(FLAGS.bert_version)
        BERT_SINGLETONS["tokenizer"] = tokenizer

    if "representer" not in BERT_SINGLETONS:
        representer = BertModel.from_pretrained(FLAGS.bert_version, output_hidden_states=True).to(_device())
        BERT_SINGLETONS["representer"] = representer

    return BERT_SINGLETONS["tokenizer"], BERT_SINGLETONS["representer"]
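
A hypothetical usage sketch for the singleton getter above; FLAGS, BERT_SINGLETONS, and _device() are module-level globals in the original, and the callable-tokenizer API assumes a reasonably recent transformers version:

import torch

# repeated calls reuse the cached tokenizer/model pair
tokenizer, representer = _ensure_bert()
inputs = tokenizer("a test sentence", return_tensors="pt").to(_device())
with torch.no_grad():
    outputs = representer(**inputs)
# output_hidden_states=True exposes all layer activations
hidden_states = outputs.hidden_states  # embeddings + one tensor per layer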
Example #2
    def __init__(self):
        super(BertForWordSegmentation_4, self).__init__()
        self.tokenizer = BertTokenizer.from_pretrained(
            'bert-base-multilingual-cased', do_lower_case=False)
        self.model = BertModel.from_pretrained(
            'bert-base-multilingual-cased',
            # note: do_lower_case is a tokenizer option, not a model option
            output_hidden_states=True).to('cuda')

        self.classifier = DropoutClassifier(768 * 4, 2).to('cuda')
Example #3
 def load_tokenizer(self):
     if self.model_configuration.is_xlnet:
         self.tokenizer = XLNetTokenizer.from_pretrained(self.model_configuration.bert_model,
                                                         do_lower_case=self.model_configuration.do_lower)
     elif not self.model_configuration.is_scibert:
         self.tokenizer = BertTokenizer.from_pretrained(self.model_configuration.bert_model,
                                                        do_lower_case=self.model_configuration.do_lower)
     else:
         self.tokenizer = BertTokenizer(self.model_configuration.vocab_file,
                                        do_lower_case=self.model_configuration.do_lower)
Example #4
 def initialize(self):
     super().initialize()
     bert_model = bert_models.get_model(self.bert_base, self.logger)
     self.tokenizer = BertTokenizer.from_pretrained(bert_model)
     # TODO: HACK! Until the transformers library adopts tokenizers, save and re-load vocab
     with tempfile.TemporaryDirectory() as d:
         self.tokenizer.save_vocabulary(d)
         # this tokenizer is ~4x faster than the BertTokenizer, per my measurements
         self.tokenizer = tk.BertWordPieceTokenizer(
             os.path.join(d, 'vocab.txt'))
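
A sanity-check sketch for the vocab round-trip (hypothetical, assuming the tokenizers package is imported as tk as in the snippet): for plain text the fast WordPiece tokenizer should yield the same pieces as the slow BertTokenizer, minus the [CLS]/[SEP] markers it adds by default.

import os
import tempfile
import tokenizers as tk
from transformers import BertTokenizer

slow = BertTokenizer.from_pretrained('bert-base-uncased')
with tempfile.TemporaryDirectory() as d:
    slow.save_vocabulary(d)
    fast = tk.BertWordPieceTokenizer(os.path.join(d, 'vocab.txt'))

text = "tokenization should round-trip"
assert fast.encode(text).tokens[1:-1] == slow.tokenize(text)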
Example #5
 def __init__(self, vocab, encoding_length=20, added_special_tokens=[]):
     # <NAV>, <ORA>,<TAR>
     from pytorch_transformers import BertTokenizer
     self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
     #added_tok = {'additional_special_tokens': added_special_tokens}
     #self.tokenizer.add_special_tokens(added_tok)
     self.encoding_length = encoding_length
     self.split_regex = re.compile(
         r'(\W+)')  # split on runs of non-word characters, keeping them as separate tokens
     self.vocab = vocab
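
A quick illustration of what the capturing split does (hypothetical usage): because the delimiter is inside a group, re.split keeps punctuation and whitespace as their own list entries instead of discarding them.

import re

split_regex = re.compile(r'(\W+)')
print(split_regex.split("Turn left, then stop!"))
# ['Turn', ' ', 'left', ', ', 'then', ' ', 'stop', '!', '']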
Example #6
    def __init__(self,
                 device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
                 batch_size=16):
        self.device = device if isinstance(device, torch.device) else torch.device(device)
        self.model_type = "distress"
        self.batch_size = batch_size

        model_path = os.path.join(os.path.dirname(__file__), f"models/{self.model_type}.pth")
        self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        self.model = load_model(self.model_type, model_path, self.device)
Example #7
 def __init__(self, inv_dict):
     super(Model, self).__init__()
     self.config = config.SNLIConfig()
     model = BertModel.from_pretrained(self.config.BERT_MODEL)
     self.model = ModelTrainer(model, 3)
     self.model.load_state_dict(torch.load(self.config.model_name))
     self.model = self.model.eval().cuda()
     self.inv_dict = inv_dict
     self.tokenizer = BertTokenizer.from_pretrained(self.config.BERT_MODEL)
     self.m = nn.Softmax(1)
Example #8
 def __init__(self, device='cuda'):
     super().__init__()
     self._tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
     self._model = BertForMultipleChoice.from_pretrained('bert-base-uncased')
     self._blank = '[unused0]'
     self._question = '[SEP]'
     self._context = '[SEP]'
     self._choice = '[SEP]'
     self._choice_split = '[SEP]'
     self._device = device
Example #9
def train_abs_single(args, device_id):
    setattr(args, "device_id", device_id)

    init_logger(args.log_file)
    logger.info(str(args))
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    logger.info('Device ID %d' % device_id)
    logger.info('Device %s' % device)
    torch.manual_seed(args.seed)
    random.seed(args.seed)
    torch.backends.cudnn.deterministic = True

    if device_id >= 0:
        torch.cuda.set_device(device_id)
        torch.cuda.manual_seed(args.seed)

    if args.train_from != '':
        logger.info('Loading checkpoint from %s' % args.train_from)
        checkpoint = torch.load(args.train_from,
                                map_location=lambda storage, loc: storage)
        opt = vars(checkpoint['opt'])
        for k in opt.keys():
            if (k in model_flags):
                setattr(args, k, opt[k])
    else:
        checkpoint = None

    def train_iter_fct():
        return data_loader.Dataloader(args, load_dataset(args, 'train', shuffle=True), args.batch_size, device,
                                      shuffle=True, is_test=False)

    model = AbsSummarizer(args, device, checkpoint)
    if (args.sep_optim):
        optim_bert = model_builder.build_optim_bert(args, model, checkpoint)
        optim_dec = model_builder.build_optim_dec(args, model, checkpoint)
        optim = [optim_bert, optim_dec]
    else:
        optim = [model_builder.build_optim(args, model, checkpoint)]

    logger.info(model)

    tokenizer = BertTokenizer.from_pretrained(path.join(args.bert_model_path, model.bert.model_name), do_lower_case=True, cache_dir=args.temp_dir)
    symbols = {'BOS': tokenizer.vocab['[unused0]'], 'EOS': tokenizer.vocab['[unused1]'],
               'PAD': tokenizer.vocab['[PAD]'], 'EOQ': tokenizer.vocab['[unused2]']}

    train_loss = abs_loss(model.generator, symbols['PAD'], model.vocab_size, device, train=True,
                          label_smoothing=args.label_smoothing)

    trainer = build_trainer(args, device_id, model, optim, train_loss)

    trainer.train(train_iter_fct, args.train_steps)
Example #10
def gen_dataloader(_train_path,
                   _test_path,
                   batch_size,
                   preprocess_inputs=False,
                   tokenizer_type='bert-base-uncased',
                   input_len=128,
                   **kwargs):
    """
    Helper function that takes either just the train data path or both
    train and test data an outputs the appropriate dataloader instance

    kwargs are:
    for preprocessing:
    sample_size=None,
    weak_supervision=True
    max_len = 128
    filter_bad_rows = True
    tokenizer = DFAULT_TOKENIIZER
    
    For dataloaders:
    val_sample_dataloader=True
    pin_memory = False
    num_workers = 0
    """

    if 'bert' in tokenizer_type.lower():
        tokenizer = BertTokenizer.from_pretrained(tokenizer_type)
    elif 'xlnet' in tokenizer_type.lower():
        tokenizer = XLNetTokenizer.from_pretrained(tokenizer_type)
    else:
        raise NotImplementedError(
            'model {} is not implemented'.format(tokenizer_type))

    train_dataset = read_data_to_dataframe(_train_path)
    if preprocess_inputs:
        df_train = preprocess_model_inputs(train_dataset,
                                           tokenizer=tokenizer,
                                           output_len=input_len,
                                           **kwargs)
    else:
        df_train = train_dataset

    if _test_path:
        test_dataset = read_data_to_dataframe(_test_path)
        if preprocess_inputs:
            df_test = preprocess_model_inputs(test_dataset,
                                              tokenizer=tokenizer,
                                              **kwargs)
        else:
            df_test = test_dataset
        dl = TrainValDataloader(df_train, df_test, batch_size, kwargs)
        return dl

    dl = TrainValSplitDataloader(df_train, batch_size, kwargs)
    return dl
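
A hypothetical call, with illustrative paths in whatever format read_data_to_dataframe accepts; with a test path the function returns a TrainValDataloader, without one a TrainValSplitDataloader:

dl = gen_dataloader('data/train.csv', 'data/test.csv',
                    batch_size=32,
                    preprocess_inputs=True,
                    tokenizer_type='bert-base-uncased',
                    input_len=128)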
Example #11
def bert_word_data_variable(batch, config):
    tokenizer = BertTokenizer.from_pretrained('Data/ms/.')
    batch_size = len(batch) * 2
    src_premise_matrix = np.zeros((batch_size, config.max_sen_len + 2))
    src_hypothesis_matrix = np.zeros((batch_size, config.max_sen_len + 2))
    p_mask = np.zeros((batch_size, config.max_sen_len + 2))
    h_mask = np.zeros((batch_size, config.max_sen_len + 2))
    tag_matrix = np.zeros(batch_size)
    for idx, instance in enumerate(batch):
        premise = tokenizer.encode(instance[0])
        hypothesis_b = tokenizer.encode(instance[1])
        hypothesis_c = tokenizer.encode(instance[2])
        while len(premise) > config.max_sen_len:
            premise = premise[len(premise) - config.max_sen_len:]
        while len(hypothesis_b) > config.max_sen_len:
            hypothesis_b = hypothesis_b[len(hypothesis_b) - config.max_sen_len:]
        while len(hypothesis_c) > config.max_sen_len:
            hypothesis_c = hypothesis_c[len(hypothesis_c) - config.max_sen_len:]
        premise.insert(0, 101)
        premise.append(102)
        p_len = len(premise)

        hypothesis_b.insert(0, 101)
        hypothesis_b.append(102)
        hb_len = len(hypothesis_b)

        hypothesis_c.insert(0, 101)
        hypothesis_c.append(102)
        hc_len = len(hypothesis_c)

        for jdx in range(p_len):
            src_premise_matrix[idx * 2][jdx] = premise[jdx]
            src_premise_matrix[idx * 2 + 1][jdx] = premise[jdx]
            p_mask[idx * 2][jdx] = 1
            p_mask[idx * 2 + 1][jdx] = 1
        for kdx in range(hb_len):
            src_hypothesis_matrix[idx * 2][kdx] = hypothesis_b[kdx]
            h_mask[idx * 2][kdx] = 1
        for gdx in range(hc_len):
            src_hypothesis_matrix[idx * 2 + 1][gdx] = hypothesis_c[gdx]
            h_mask[idx * 2 + 1][gdx] = 1
        tag_matrix[idx * 2] = 1
        tag_matrix[idx * 2 + 1] = 0
    src_premise_matrix = torch.from_numpy(src_premise_matrix).long()
    src_hypothesis_matrix = torch.from_numpy(src_hypothesis_matrix).long()
    p_mask = torch.from_numpy(p_mask).float()
    h_mask = torch.from_numpy(h_mask).float()
    tag_matrix = torch.from_numpy(tag_matrix).long()
    if config.use_cuda:
        src_premise_matrix = src_premise_matrix.cuda()
        src_hypothesis_matrix = src_hypothesis_matrix.cuda()
        p_mask = p_mask.cuda()
        h_mask = h_mask.cuda()
        tag_matrix = tag_matrix.cuda()
    return [src_premise_matrix, src_hypothesis_matrix, p_mask, h_mask, tag_matrix]
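
The hard-coded 101 and 102 are the [CLS] and [SEP] ids of the stock BERT vocabularies. A slightly more robust variant (a sketch, not the original code) reads them from the tokenizer so the function survives a vocabulary swap:

# sketch: derive the special-token ids instead of hard-coding 101/102
cls_id = tokenizer.cls_token_id  # 101 in the stock BERT vocabs
sep_id = tokenizer.sep_token_id  # 102
premise.insert(0, cls_id)
premise.append(sep_id)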
Example #12
def getBertSentenceFromRaw(raw_sent):
    """
    convert the original tokenization to the BERT tokenization
    returns the BERT tokenization
    """
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    bert_sent_list = []
    for raw_word in raw_sent:
        bert_tokens = tokenizer.tokenize(raw_word)
        bert_sent_list += bert_tokens
    return bert_sent_list
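
One design note: getBertSentenceFromRaw reloads the tokenizer on every call, which is slow; caching it at module level (as Example #1 does) avoids the repeated disk reads. Hypothetical usage, where the exact sub-tokens depend on the vocabulary:

print(getBertSentenceFromRaw(["a", "remarkable", "entomologist"]))
# rare words come back as '##'-prefixed WordPiece fragments,
# e.g. something like ['a', 'remarkable', 'en', '##tom', '##ologist']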
Example #13
File: process.py Project: wibruce/deepke
def build_lm_data(raw_data: List) -> List:
    tokenizer = BertTokenizer.from_pretrained(config.lm_name)
    sents = []
    for data in raw_data:
        sent = data[0]
        sub = data[1]
        obj = data[4]
        sent = '[CLS]' + sent + '[SEP]' + sub + '[SEP]' + obj + '[SEP]'
        input_ids = torch.tensor([tokenizer.encode(sent)])
        sents.append(input_ids)
    return sents
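
One caveat worth noting: on transformers versions where encode() adds special tokens by default, the literal '[CLS]'/'[SEP]' markers already embedded in sent get wrapped in a second [CLS]/[SEP] pair. A defensive sketch (an assumption about the library version, not a change the original makes):

# sketch: keep only the hand-inserted special tokens
input_ids = torch.tensor([tokenizer.encode(sent, add_special_tokens=False)])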
Example #14
    def __init__(self, test_text, ref_text, batch_size, device, method='mean'):
        super().__init__()

        self.name = 'BertDist'
        self.ref_text = ref_text
        self.test_text = test_text
        self.batch_size = batch_size
        self.device = device
        self.method = method

        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
Example #15
    def __init__(self, batches, batch_size, device, bert_model_path):
        self.bert_tokenizer = BertTokenizer.from_pretrained(bert_model_path)

        self.batch_size = batch_size
        self.batches = batches
        self.n_batches = len(batches) // batch_size
        self.residue = False  # whether the last batch is partial
        if len(batches) % batch_size != 0:
            self.residue = True
        self.index = 0
        self.device = device
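
The constructor above only sets up the bookkeeping; for context, here is a minimal sketch of the iterator methods such a loader usually pairs with it (an assumed continuation, with a hypothetical _to_tensor helper that turns a slice of raw batches into tensors on self.device):

    def __next__(self):
        if self.residue and self.index == self.n_batches:
            # the final, partial batch
            batch = self.batches[self.index * self.batch_size:]
            self.index += 1
            return self._to_tensor(batch)
        elif self.index >= self.n_batches:
            self.index = 0
            raise StopIteration
        else:
            batch = self.batches[self.index * self.batch_size:
                                 (self.index + 1) * self.batch_size]
            self.index += 1
            return self._to_tensor(batch)

    def __iter__(self):
        return self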
Example #16
def validate(args, device_id, pt, step):
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    if (pt != ''):
        test_from = pt
    else:
        test_from = args.test_from
    logger.info('Loading checkpoint from %s' % test_from)
    checkpoint = torch.load(test_from,
                            map_location=lambda storage, loc: storage)
    opt = vars(checkpoint['opt'])
    for k in opt.keys():
        if (k in model_flags):
            setattr(args, k, opt[k])
    print(args)

    model = Z_AbsSummarizer(args, device, checkpoint)
    model.eval()

    valid_iter = data_loader.Dataloader(args,
                                        load_dataset(args,
                                                     'valid',
                                                     shuffle=False),
                                        args.batch_size,
                                        device,
                                        shuffle=False,
                                        is_test=False)

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                              do_lower_case=True,
                                              cache_dir=args.temp_dir)
    symbols = {
        'BOS': tokenizer.vocab['[unused0]'],
        'EOS': tokenizer.vocab['[unused1]'],
        'PAD': tokenizer.vocab['[PAD]'],
        'EOQ': tokenizer.vocab['[unused2]']
    }

    if COPY:
        valid_loss = abs_loss(model.generator,
                              symbols,
                              model.vocab_size,
                              train=False,
                              device=device,
                              copy_generator=model.copy_generator)
    else:
        valid_loss = abs_loss(model.generator,
                              symbols,
                              model.vocab_size,
                              train=False,
                              device=device)

    trainer = build_trainer(args, device_id, model, None, valid_loss)
    stats = trainer.validate(valid_iter, step)
    return stats.xent()
Example #17
File: all.py Project: zeta1999/CoSDA-ML
 def init(args):
     BERTTool.multi_bert = BertModel.from_pretrained(
         args.multi_bert.location)
     BERTTool.multi_tokener = BertTokenizer.from_pretrained(
         args.multi_bert.location)
     BERTTool.multi_pad = BERTTool.multi_tokener.convert_tokens_to_ids(
         ["[PAD]"])[0]
     BERTTool.multi_sep = BERTTool.multi_tokener.convert_tokens_to_ids(
         ["[SEP]"])[0]
     BERTTool.multi_cls = BERTTool.multi_tokener.convert_tokens_to_ids(
         ["[CLS]"])[0]
Example #18
    def __init__(self,
                 args=None,
                 device='cuda',
                 bert_model_path='bert-base-uncased',
                 batch_size=10,
                 learning_rate=5e-5,
                 weight_decay=0,
                 additional_features=None):
        if args is not None:
            self.args = vars(args)

        assert device in ['cuda', 'cpu']

        if not args:
            self.args = {}
            self.args['bert_model_path'] = bert_model_path
            self.args['device'] = device
            self.args['learning_rate'] = learning_rate
            self.args['weight_decay'] = weight_decay
            self.args['batch_size'] = batch_size

        self.log = logging.getLogger()

        self.bert_tokenizer = BertTokenizer.from_pretrained(
            self.args['bert_model_path'])
        if os.path.exists(self.args['bert_model_path']):
            if os.path.exists(
                    os.path.join(self.args['bert_model_path'], CONFIG_NAME)):
                config = BertConfig.from_json_file(
                    os.path.join(self.args['bert_model_path'], CONFIG_NAME))
            elif os.path.exists(
                    os.path.join(self.args['bert_model_path'],
                                 'bert_config.json')):
                config = BertConfig.from_json_file(
                    os.path.join(self.args['bert_model_path'],
                                 'bert_config.json'))
            else:
                raise ValueError(
                    "Cannot find a configuration for the BERT model you are attempting to load."
                )

        self.loss_function = torch.nn.MSELoss()

        config.pretrained_config_archive_map[
            'additional_features'] = additional_features

        self.regressor_net = BertSimilarityRegressor.from_pretrained(
            self.args['bert_model_path'], config=config)
        self.optimizer = torch.optim.Adam(
            self.regressor_net.parameters(),
            weight_decay=self.args['weight_decay'],
            lr=self.args['learning_rate'])
        self.log.info('Initialized BertSentencePairSimilarity model from %s' %
                      self.args['bert_model_path'])
Example #19
    def __init__(self, ext_vocab=None, \
        key_name=None, \
        bert_vocab_name='bert-base-uncased'):

        # initialize by default value. (can be overwritten by subclass)
        self.ext_vocab = ext_vocab or ["[PAD]", "[UNK]", "[CLS]", "[SEP]"]

        self.tokenizer = BertTokenizer.from_pretrained(bert_vocab_name)
        self._build_bert_vocab()

        super().__init__(self.ext_vocab, key_name)
Example #20
def main():
    parser = argparse.ArgumentParser(
        description=
        "Preprocess the data to avoid re-doing it several times by (tokenization + token_to_ids)."
    )
    parser.add_argument('--file_path',
                        type=str,
                        default='data/dump.txt',
                        help='The path to the data.')
    parser.add_argument('--bert_tokenizer',
                        type=str,
                        default='bert-base-uncased',
                        help="The tokenizer to use.")
    parser.add_argument('--dump_file',
                        type=str,
                        default='data/dump',
                        help='The dump file prefix.')
    args = parser.parse_args()

    logger.info(f'Loading Tokenizer ({args.bert_tokenizer})')
    bert_tokenizer = BertTokenizer.from_pretrained(args.bert_tokenizer)

    logger.info(f'Loading text from {args.file_path}')
    with open(args.file_path, 'r', encoding='utf8') as fp:
        data = fp.readlines()

    logger.info(f'Start encoding')
    logger.info(f'{len(data)} examples to process.')

    rslt = []
    iter = 0
    interval = 10000
    start = time.time()
    for text in data:
        text = f'[CLS] {text.strip()} [SEP]'
        token_ids = bert_tokenizer.encode(text)
        rslt.append(token_ids)

        iter += 1
        if iter % interval == 0:
            end = time.time()
            logger.info(
                f'{iter} examples processed. - {(end-start)/interval:.2f}s/expl'
            )
            start = time.time()
    logger.info('Finished binarization')
    logger.info(f'{len(data)} examples processed.')

    dp_file = f'{args.dump_file}.{args.bert_tokenizer}.pickle'
    rslt_ = [np.uint16(d) for d in rslt]
    random.shuffle(rslt_)
    logger.info(f'Dump to {dp_file}')
    with open(dp_file, 'wb') as handle:
        pickle.dump(rslt_, handle, protocol=pickle.HIGHEST_PROTOCOL)
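
np.uint16 is safe here because BERT vocabularies stay well below 65,536 ids (bert-base-uncased has 30,522). Reading the dump back is symmetric; a hypothetical read-back with the default argument values:

import pickle

with open('data/dump.bert-base-uncased.pickle', 'rb') as handle:
    sequences = pickle.load(handle)  # a list of uint16 id arrays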
Example #21
def tokenize(input_string, output_file):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    tokens = tokenizer.tokenize(input_string)
    line = " ".join(tokens)
    line = '{}\n'.format(line)

    with open(output_file + ".src", 'w', encoding='utf-8') as src:
        src.write(line)

    with open(output_file + ".tgt", 'w', encoding='utf-8') as tgt:
        tgt.write("line")
Example #22
    def __init__(self,
                 model_path='bert-base-uncased',
                 tokenizer_path=None,
                 device=None):
        super().__init__(device)
        self.model_path = model_path

        self.tokenizer = BertTokenizer.from_pretrained(model_path)
        self.model = BertForMaskedLM.from_pretrained(model_path)
        self.model.to(device)
        self.model.eval()
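
A minimal fill-mask sketch built on such a tokenizer/model pair (assuming a transformers version where the tokenizer is callable and the model output exposes the logits at index 0):

import torch
from transformers import BertTokenizer, BertForMaskedLM

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased').eval()

inputs = tokenizer("Paris is the [MASK] of France.", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs)[0]
mask_pos = (inputs["input_ids"][0] == tokenizer.mask_token_id).nonzero().item()
predicted_id = int(logits[0, mask_pos].argmax())
print(tokenizer.convert_ids_to_tokens([predicted_id]))  # expected: ['capital']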
Example #23
def _add_lm_data(data: List[Dict]) -> List[Dict]:
    'Serialize the input sentences using the language model vocabulary.'
    tokenizer = BertTokenizer.from_pretrained(config.lm.lm_file)

    for d in data:
        sent = d['sentence']
        sent += '[SEP]' + d['head'] + '[SEP]' + d['tail']

        d['lm_idx'] = tokenizer.encode(sent, add_special_tokens=True)
        d['seq_len'] = len(d['lm_idx'])

    return data
Example #24
def main(text):
    tokenizer = BertTokenizer.from_pretrained('./', do_lower_case=True)
    model = BertForSequenceClassification.from_pretrained('./')
    model.to(device)
    texts = []
    preds = []
    texts.append("[CLS] " + text[:509] + " [SEP]")
    tokenized_texts = [tokenizer.tokenize(sent) for sent in texts]
    input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
    input_ids = pad_sequences(
        input_ids,
        maxlen=100,
        dtype="long",
        truncating="post",
        padding="post"
    )
    attention_masks = [[float(i>0) for i in seq] for seq in input_ids]

    prediction_inputs = torch.tensor(input_ids)
    prediction_masks = torch.tensor(attention_masks)
  
    prediction_data = TensorDataset(
        prediction_inputs,
        prediction_masks
    )

    prediction_dataloader = DataLoader(
      prediction_data, 
      sampler=SequentialSampler(prediction_data),
      batch_size=1
    )
    model.eval()
    preds = []

    for batch in prediction_dataloader:
        # move the batch to the GPU for computation
        batch = tuple(t.to(device) for t in batch)
    
        # unpack the inputs from the dataloader
        b_input_ids, b_input_mask = batch
    
        # under torch.no_grad() the model neither computes nor stores gradients,
        # which speeds up prediction on the test data
        with torch.no_grad():
            logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

        # move the logits to the CPU for further processing
        logits = logits[0].detach().cpu().numpy()

        # store the predicted classes
        batch_preds = np.argmax(logits, axis=1) 
        preds.extend(batch_preds)
    return preds
Example #25
File: run.py Project: AMDonati/RL-NLP
def get_hf_path():
    if not os.path.isdir("cache/gpt-2"):
        os.makedirs("cache/gpt-2")
        model = AutoModelWithLMHead.from_pretrained("gpt2")
        model.save_pretrained("cache/gpt-2")
        tokenizer = AutoTokenizer.from_pretrained("gpt2")
        tokenizer.save_pretrained("cache/gpt-2")
    if not os.path.isdir("cache/bert"):
        os.makedirs("cache/bert")
        reward_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                                         do_lower_case=True)
        reward_tokenizer.save_pretrained("cache/bert")
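
A hypothetical follow-up: once get_hf_path() has populated the cache directories, later runs can load straight from disk instead of hitting the hub.

model = AutoModelWithLMHead.from_pretrained("cache/gpt-2")
tokenizer = AutoTokenizer.from_pretrained("cache/gpt-2")
reward_tokenizer = BertTokenizer.from_pretrained("cache/bert")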
Example #26
 def __init__(self, model_state_dict) -> None:
     no_cuda = True
     self.device = torch.device(
         "cuda" if torch.cuda.is_available() and not no_cuda else "cpu")
     self.tokenizer = BertTokenizer.from_pretrained('bert-base-chinese',
                                                    do_lower_case=False)
     config = BertConfig.from_pretrained('bert-base-chinese')
     self.model = BertForQuestionAnswering(config)
     self.model.load_state_dict(
         torch.load(model_state_dict, map_location='cpu'))
     self.model.to(self.device)
     self.model.eval()  # TODO
Example #27
File: CNN.py Project: Inaguma1110/span_NER
    def __init__(self, config, vocab):
        super(BERT_PRETRAINED_MODEL_JAPANESE, self).__init__()

        self.config = config
        self.vocab = vocab
        self.BERT_config = BertConfig.from_json_file(
            '../published_model/bert_spm/bert_config.json')
        self.tokenizer = BertTokenizer.from_pretrained(
            './spm_model/wiki-ja.vocab.txt')
        self.pretrained_BERT_model = BertModel.from_pretrained(
            '../published_model/bert_spm/pytorch_model.bin',
            config=self.BERT_config)
Example #28
 def __init__(self, text=None, tokenizerName="bert-base-chinese"):
     self.device = torch.device(
         "cuda:0" if torch.cuda.is_available() else "cpu")
     self.predict_map = {
         0: "High Positive",
         1: "Clam Positive",
         2: "High Negative",
         3: "clam Negative"
     }
     self.modelPath = "bert_sentiment_wordmax_128_loss_0.033_lr_2e-05.pkl"
     self.tokenizer = BertTokenizer.from_pretrained(tokenizerName)
     self.text = text
Example #29
def main(args):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    ontology = json.load(open(os.path.join(args.data_root,
                                           args.ontology_data)))
    slot_meta, _ = make_slot_meta(ontology)

    tokenizer = BertTokenizer.from_pretrained(args.bert_config)
    special_tokens = ['[SLOT]', '[NULL]']
    special_tokens_dict = {'additional_special_tokens': special_tokens}
    tokenizer.add_special_tokens(special_tokens_dict)

    data = prepare_dataset(data_path=os.path.join(args.data_root,
                                                  args.test_data),
                           data_list=None,
                           tokenizer=tokenizer,
                           slot_meta=slot_meta,
                           n_history=args.n_history,
                           max_seq_length=args.max_seq_length,
                           op_code=args.op_code)

    model_config = BertConfig.from_json_file(args.bert_config_path)
    model_config.dropout = 0.1
    op2id = OP_SET[args.op_code]
    model = TransformerDST(model_config, len(op2id), len(domain2id),
                           op2id['update'])
    ckpt = torch.load(args.model_ckpt_path, map_location='cpu')
    model.load_state_dict(ckpt)

    model.eval()
    model.to(device)

    if args.eval_all:
        model_evaluation(model, data, tokenizer, slot_meta, 0, args.op_code,
                         False, False, False)
        model_evaluation(model, data, tokenizer, slot_meta, 0, args.op_code,
                         False, False, True)
        model_evaluation(model, data, tokenizer, slot_meta, 0, args.op_code,
                         False, True, False)
        model_evaluation(model, data, tokenizer, slot_meta, 0, args.op_code,
                         False, True, True)
        model_evaluation(model, data, tokenizer, slot_meta, 0, args.op_code,
                         True, False, False)
        model_evaluation(model, data, tokenizer, slot_meta, 0, args.op_code,
                         True, True, False)
        model_evaluation(model, data, tokenizer, slot_meta, 0, args.op_code,
                         True, False, True)
        model_evaluation(model, data, tokenizer, slot_meta, 0, args.op_code,
                         True, True, True)
    else:
        model_evaluation(model, data, tokenizer, slot_meta, 0, args.op_code,
                         args.gt_op, args.gt_p_state, args.gt_gen)
Example #30
def add_pytorch_transformers_vocab(vocab, tokenizer_name):
    """Add vocabulary from tokenizers in pytorch_transformers for use with pre-tokenized data.

    These tokenizers have a convert_tokens_to_ids method, but this doesn't do
    anything special, so we can just use the standard indexers.
    """
    do_lower_case = "uncased" in tokenizer_name

    if tokenizer_name.startswith("bert-"):
        tokenizer = BertTokenizer.from_pretrained(tokenizer_name,
                                                  do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("roberta-"):
        tokenizer = RobertaTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlnet-"):
        tokenizer = XLNetTokenizer.from_pretrained(tokenizer_name,
                                                   do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("openai-gpt"):
        tokenizer = OpenAIGPTTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("gpt2"):
        tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("transfo-xl-"):
        tokenizer = TransfoXLTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlm-"):
        tokenizer = XLMTokenizer.from_pretrained(tokenizer_name)

    if (tokenizer_name.startswith("openai-gpt")
            or tokenizer_name.startswith("gpt2")
            or tokenizer_name.startswith("transo-xl-")):
        tokenizer.add_special_tokens({
            "bos_token": "<start>",
            "sep_token": "<delim>",
            "cls_token": "<extract>"
        })
    # TODO: this is another place can be simplified by "model-before-preprocess" reorganization
    # we can pass tokenizer created in model here, see issue <TBD>

    vocab_size = len(tokenizer)
    # do not use tokenizer.vocab_size, it does not include newly added token
    if tokenizer_name.startswith("roberta-"):
        if tokenizer.convert_ids_to_tokens(vocab_size - 1) is None:
            vocab_size -= 1
        else:
            log.info("Time to delete vocab_size-1 in preprocess.py !!!")
    # due to a quirk in huggingface's file, the last token of RobertaTokenizer is None, remove
    # this when they fix the problem

    ordered_vocab = tokenizer.convert_ids_to_tokens(range(vocab_size))
    log.info("Added pytorch_transformers vocab (%s): %d tokens",
             tokenizer_name, len(ordered_vocab))
    for word in ordered_vocab:
        vocab.add_token_to_namespace(
            word, input_module_tokenizer_name(tokenizer_name))
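
A hypothetical call, assuming an AllenNLP-style vocabulary object with the add_token_to_namespace method the body relies on:

from allennlp.data import Vocabulary

vocab = Vocabulary()
add_pytorch_transformers_vocab(vocab, "bert-base-uncased")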