Example #1
 def __init__(self,
              vocab_size,
              hidden_size,
              dropout,
              n_layers=1,
              vocab_file='./data/vocab.txt'):
     super(UntrainedEncoderBERT, self).__init__()
     self.vocab_size = vocab_size
     self.hidden_size = hidden_size
     self.dropout = dropout
     self.dropout_layer = nn.Dropout(dropout)
     self.embedding = nn.Embedding(vocab_size,
                                   hidden_size,
                                   padding_idx=PAD_token)
     self.embedding.weight.data.normal_(0, 0.1)
     self.config = transformers.BertConfig(vocab_size=self.vocab_size,
                                           hidden_size=self.hidden_size,
                                           num_hidden_layers=n_layers,
                                           hidden_dropout_prob=dropout,
                                            attention_probs_dropout_prob=dropout,
                                           num_attention_heads=16,
                                           output_hidden_states=True,
                                           max_position_embeddings=1024)
     self.tokenizer = transformers.BertTokenizer(vocab_file,
                                                 pad_token='PAD',
                                                 unk_token='UNK',
                                                 sep_token='EOS')
     self.BERT = transformers.BertModel(self.config)
     self.training = True
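A minimal usage sketch for this encoder, assuming the class above is importable as UntrainedEncoderBERT and that PAD_token is defined by the surrounding project; note that hidden_size must be divisible by the 16 attention heads configured above.

import torch

# hypothetical instantiation; the vocab_size/hidden_size values are illustrative
encoder = UntrainedEncoderBERT(vocab_size=30000, hidden_size=256, dropout=0.1)
ids = encoder.tokenizer.encode("hello world", return_tensors="pt")
outputs = encoder.BERT(ids)       # forward pass through the untrained BertModel
last_hidden = outputs[0]          # shape: (batch, seq_len, hidden_size)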
Example #2
 def bert_tokenizer_returner(self):
     if self.config.bert_name == 'japanese-bert':
         vocab_file = './vocab_file/vocab.txt'
         return transformers.BertTokenizer(vocab_file=vocab_file,
                                           do_basic_tokenize=True)
     else:
         print('currently not supported:', self.config.bert_name)
         raise NotImplementedError
Example #3
def get_tokenizer(file_config=None, use_vocab=False):
    """ 加载tokenizer
        use_vocab: 是否用现有的字典文件"""
    if use_vocab:
        tokenizer = transformers.BertTokenizer(vocab_file=args.tokenizer_vocab)
    else:
        tokenizer = transformers.BertTokenizer.from_pretrained(file_config.bert_tokenizer_dir)
    logging.info('vocab size: {}'.format(len(tokenizer)))
    return tokenizer
Example #4
def two():
    tokenizer = transformers.BertTokenizer("../model/chinese_L-12_H-768_A-12/vocab.txt")
    tokenizer.add_special_tokens({"additional_special_tokens":["[SPACE]","“","”"]})
    vocab_f = open("../model/chinese_L-12_H-768_A-12/vocab.txt","r",encoding="utf8")
    list_vocab = vocab_f.readlines()
    list_vocab = [data.strip() for data in list_vocab]
    dict_vocab = {k:v for k,v in enumerate(list_vocab)}

    f_sentence = open(two_train_sentence_path,"w",encoding="utf8")
    f_label = open(two_train_label_path,"w",encoding="utf8")


    with open(train_sentence_path,"r",encoding="utf8") as f1,open(train_label_path,"r",encoding="utf8") as f2:
        test_list_label = []
        for sentence,label in zip(f1.readlines(),f2.readlines()):
            sentence = sentence.strip()
            label = label.strip()
            # target output
            # tokened_text = tokenizer.encode(sentence)[1:-1]  # all token ids, [CLS] stripped
            tokened_text = tokenizer.encode(sentence)  # all token ids
            tokened_text_str = [str(i) for i in tokened_text]
            print(" ".join(tokened_text_str),file=f_sentence)
            tokened_text = tokened_text[1:-1]
            # print(tokened_text)
            tokened_list_text = [dict_vocab[data].lstrip("##") for data in tokened_text]

            # print(A)
            # print(tokened_list_text)
            list_label = label.split(" ")
            # target output
            list_label_final = []
            for i in range(len(tokened_list_text)):
                num = len(list(tokened_list_text[i]))
                if tokened_list_text[i] in ["[SPACE]","[UNK]"]:
                    # print(1)
                    num = 1

                # handle merging: one token may cover several character-level labels
                maybe = list_label[0:num]
                if num != 1:
                    maybe_a = [int(i)%3 for i in maybe]
                    if maybe_a[0]==1 and maybe_a[-1]==0:
                        a = str(int(maybe[0])//3 + 13)
                    elif maybe_a[0]==1 and maybe_a[-1]==2:
                        a = maybe[0]
                    elif maybe_a[0]==2 and maybe_a[-1]==0:
                        a = maybe[-1]
                    else:
                        a = maybe[0]
                else:
                    a = maybe[0]

                list_label_final.append(a)
                list_label = list_label[num:]
            print(" ".join(["0"]+list_label_final),file=f_label)
Example #5
 def berttokenizer_returner(self):
     if self.args.bert_name == 'bert-base-uncased':
         vocab_file = './src/vocab_file/bert-base-uncased-vocab.txt'
         do_lower_case = True
     else:
         print('currently not supported:', self.args.bert_name)
         raise NotImplementedError
     return transformers.BertTokenizer(vocab_file=vocab_file,
                                       do_lower_case=do_lower_case,
                                       do_basic_tokenize=True,
                                       never_split=NEVER_SPLIT_TOKEN)
Example #6
def doc_context_similarity(request):
    """Predict probability of two documents appearing in the same context."""
    print('Starting document context similarity prediction...')
    global model, tokenizer
    if not tokenizer:
        print('Loading tokenizer...')
        if os.getenv('ENV', '') == 'local':
            # TODO: Think about whether to keep the cased or uncased?
            tokenizer = transformers.BertTokenizer('./ext/model/vocab.txt',
                                                   do_lower_case=False)
        else:
            tokenizer = (transformers.BertTokenizer.from_pretrained(
                'bert-base-finnish-cased-v1'))
        print('Tokenizer loaded!')
    if not model:
        print('Loading model...')
        model_path = ('./ext/model' if os.getenv('ENV', '') == 'local' else
                      'bert-base-finnish-cased-v1')
        model = (transformers.BertForNextSentencePrediction.from_pretrained(
            model_path))
        model.eval()
        print('Model loaded!')

    print('Predicting...')
    data = request.get_json()['data']

    # Parse data
    doc1 = data['doc1']
    doc2 = data['doc2']

    # Inference
    tokens1 = ['[CLS]'] + tokenizer.tokenize(doc1) + ['[SEP]']
    tokens2 = tokenizer.tokenize(doc2) + ['[SEP]']
    tokens = tokens1 + tokens2

    indexed_tokens = tokenizer.convert_tokens_to_ids(tokens)
    segments_ids = [0] * len(tokens1) + [1] * len(tokens2)

    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])

    pred = model(tokens_tensor, token_type_ids=segments_tensors)[0]
    probability = float(torch.nn.Softmax(dim=1)(pred).data.numpy()[0][0])
    print('Prediction done!')
    return {
        'status': 'success',
        'message': 'Prediction obtained successfully!',
        'data': {
            'probability': probability
        }
    }, 200
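Hedging on the surrounding framework (the request/handler shape reads like Flask), a client call to this endpoint might look like the sketch below; the URL and route name are assumptions, not part of the original code.

import requests

# hypothetical route; adjust to wherever doc_context_similarity is mounted
resp = requests.post(
    "http://localhost:5000/doc_context_similarity",
    json={"data": {"doc1": "Ensimmäinen dokumentti.",
                   "doc2": "Toinen dokumentti."}})
print(resp.json()["data"]["probability"])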
Example #7
 def berttokenizer_returner(self):
     if self.args.bert_name == 'bert-base-uncased':
         vocab_file = './vocab_file/bert-base-uncased-vocab.txt'
         do_lower_case = True
     elif self.args.bert_name == 'biobert':
         vocab_file = './vocab_file/biobert_v1.1_pubmed_vocab.txt'
         do_lower_case = False
     else:
         print('currently not supported:', self.args.bert_name)
         raise NotImplementedError
     return transformers.BertTokenizer(
         vocab_file=vocab_file,
         do_lower_case=do_lower_case,
         do_basic_tokenize=True,
         never_split=['<target>', '</target>'])
Example #8
 def update_vocab(self):
     self.tokenizer.save_vocabulary(
         '/home/qianhoude/Neural-OpenIE/Transformers-version/')
     with open(
             '/home/qianhoude/Neural-OpenIE/Transformers-version/vocab.txt',
             'a') as f:
         for i in [
                 '<arg1>', '</arg1>', '<rel>', '</rel>', '<arg2>', '</arg2>'
         ]:
             f.write(i + '\n')
     self.tokenizer = transformers.BertTokenizer(
         vocab_file=
         '/home/qianhoude/Neural-OpenIE/Transformers-version/vocab.txt')
     os.system(
         'rm /home/qianhoude/Neural-OpenIE/Transformers-version/vocab.txt')
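Rebuilding the tokenizer from the extended vocab file grows the vocabulary, so the paired model's embedding matrix usually has to grow with it. A one-line sketch, assuming the class also holds the model as self.model:

# hypothetical follow-up; `self.model` is an assumption about this class
self.model.resize_token_embeddings(len(self.tokenizer))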
Example #9
def load_bert(path):
    '''
    Load the Chinese Bert model in the specified folder
    '''
    config_path = os.path.join(path,
                               'chinese_wwm_ext_pytorch/bert_config.json')
    model_path = os.path.join(path,
                              'chinese_wwm_ext_pytorch/pytorch_model.bin')
    vocab_path = os.path.join(path, 'chinese_wwm_ext_pytorch/vocab.txt')

    config = transformers.BertConfig.from_pretrained(config_path)
    config.output_hidden_states = True

    model = transformers.BertModel.from_pretrained(model_path, config=config)
    model.eval()

    tokenizer = transformers.BertTokenizer(vocab_path)

    return model, tokenizer
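A short usage sketch, assuming a chinese_wwm_ext_pytorch directory under the given path; with output_hidden_states=True the forward pass also returns the per-layer activations.

import torch

model, tokenizer = load_bert('./pretrained')          # path is illustrative
inputs = tokenizer("今天天气很好", return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)
hidden_states = outputs[-1]   # tuple: embedding output plus one tensor per layer
print(len(hidden_states), hidden_states[-1].shape)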
Example #10
File: modules.py  Project: d223302/jiant
    def __init__(self, args):
        super(BertEmbedderModule, self).__init__(args)
        config = transformers.BertConfig.from_json_file(
            '/work/dcml0714/bert_data/en_pretrain/bert_config.json')
        config.output_hidden_states = True
        self.model = transformers.BertForPreTraining.from_pretrained(
            pretrained_model_name_or_path=None,
            config=config,
            state_dict=torch.load(args.bert_model_path))
        self.max_pos = self.model.config.max_position_embeddings

        self.tokenizer = transformers.BertTokenizer(
            vocab_file='/work/dcml0714/bert_data/bi_pretrain/vocab.txt')
        self._sep_id = self.tokenizer.convert_tokens_to_ids("[SEP]")
        self._cls_id = self.tokenizer.convert_tokens_to_ids("[CLS]")
        self._pad_id = self.tokenizer.convert_tokens_to_ids("[PAD]")
        self._unk_id = self.tokenizer.convert_tokens_to_ids("[UNK]")

        self.parameter_setup(args)
Example #11
    def device_setup(self):
        """
        Configure the device and load the BERT model
        :return:
        """

        # use the GPU when available, via model.to(device)
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")

        model_save_path = self.config.get("result", "model_save_path")
        config_save_path = self.config.get("result", "config_save_path")
        vocab_save_path = self.config.get("result", "vocab_save_path")

        self.model_config = BertConfig.from_json_file(config_save_path)
        self.model = BertForSequenceClassification(self.model_config)
        self.state_dict = torch.load(model_save_path)
        self.model.load_state_dict(self.state_dict)
        self.tokenizer = transformers.BertTokenizer(vocab_save_path)
        self.model.to(self.device)
        self.model.eval()
Example #12
def get_tokenizer() -> transformers.BertTokenizer:
    """Returns the BERT tokenizer."""
    do_lower_case = ('uncased' in common_flags.BERT_CONFIG.value
                     or 'cased' not in common_flags.BERT_CONFIG.value)
    with tempfile.TemporaryDirectory() as tdir:
        vocab_fn = os.path.join(tdir, 'vocab.txt')
        tf.io.gfile.copy(common_flags.BERT_VOCAB.value, vocab_fn)

        # special symbols
        grammar_symbols = state_tree.NQStateTree.tree_node_symbols
        operator_symbols = [
            operator.value for operator in state_tree.Operator
        ] + ['[stop]']
        # add field symbols
        operator_symbols += [field.value for field in state_tree.Field]
        special_symbols = grammar_symbols + operator_symbols
        assert len(special_symbols) < 99, 'Too many special symbols.'

        tokenizer = transformers.BertTokenizer(vocab_fn,
                                               do_lower_case=do_lower_case)
        tokenizer.add_tokens(special_symbols)
    return tokenizer
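The vocab file only needs to exist while the tokenizer is constructed; once loaded, the vocabulary lives in memory, so the temporary directory can safely vanish when the with-block exits. A quick check of the added symbols, as a sketch ('[stop]' is one of the operator symbols defined above):

tokenizer = get_tokenizer()
# added tokens map to single ids beyond the base vocabulary
print(tokenizer.convert_tokens_to_ids('[stop]'))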
Example #13
    def __init__(self, device, cfg):
        super().__init__()

        if cfg.tokens_pretrained:
            self.tokenizer = transformers.BertTokenizer.from_pretrained(
                'bert-base-uncased')
        else:
            # BertTokenizer takes a single vocab file; cfg.merge_path belongs
            # to BPE tokenizers and would be misread as do_lower_case here
            self.tokenizer = transformers.BertTokenizer(cfg.vocab_path)

        if cfg.embeddings_pretrained:
            self.model = transformers.BertModel.from_pretrained(
                'bert-base-uncased')
        else:
            # randomly initialized weights: BertModel expects a config object,
            # not a model-name string
            self.model = transformers.BertModel(
                transformers.BertConfig.from_pretrained('bert-base-uncased'))
        self.model = self.model.to(device)

        self.pad_token = 'pad_token'
        self.device = device

        self.max_len = cfg.max_seq_len
        self.trainable = cfg.embeddings_trainable
Example #14
# @Time: 2020/7/8 10:37
# @Author: R.Jian
# @Note: preprocessing for the validation set

import json
import transformers

f = open("ccks_8_data_v2/validate_data.json",encoding="utf8")
dict_vali = json.load(f)
f.close()

# tokenizer
tokenizer = transformers.BertTokenizer("../model/chinese_L-12_H-768_A-12/vocab.txt")
tokenizer.add_special_tokens({"additional_special_tokens": ["[SPACE]", "“", "”"]})
vocab_f = open("../model/chinese_L-12_H-768_A-12/vocab.txt", "r", encoding="utf8")
list_vocab = vocab_f.readlines()
list_vocab = [data.strip() for data in list_vocab]
dict_vocab = {k: v for k, v in enumerate(list_vocab)}

f_sentence = open("val_token.txt", "w", encoding="utf8")
f_word = open("val_word.txt","w",encoding="utf8")

for i in range(1,101):
    sen = dict_vali["validate_V2_"+str(i)+".json"]
    sen = sen.strip("\r\n").replace("\r\n"," ")
    sen = sen.replace(" "," [SPACE] ")
    list_token = tokenizer.encode(sen)
    list_token_str = [str(i) for i in list_token]
    print(" ".join(list_token_str), file=f_sentence)
    list_token = list_token[1:-1]
    print(list_token)
Example #15
"""
Settings related to input data, file exports, and model configuration are defined in this script.

"""

# data files
TRAIN_FILE = "./data/train.tsv"
TEST_FILE = "./data/test.tsv"  # use for evaluation

# model files
TRAINED_MODEL_FILE = './results/trained_model.pt'
NER_RESULTS_FILE = './results/ner_result.csv'

# torch setting
import torch
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# BioBERT base
import transformers
BERT_CONFIG_FILE = './biobert_v1.1_pubmed'  # directory containing config.json
TOKENIZER = transformers.BertTokenizer(
    vocab_file='biobert_v1.1_pubmed/vocab.txt', do_lower_case=False)
WEIGHTS_BIN = torch.load('./biobert_v1.1_pubmed/pytorch_model.bin',
                         map_location=DEVICE)

# params config
MAX_LEN = 75
MAX_GRAD_NORM = 1.0
TRAIN_BATCH_SIZE = 32
TEST_BATCH_SIZE = 8
EPOCHS = 1
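The constants above define the config directory and raw weights but never assemble the model; a hedged sketch of how they would typically be combined, following the state_dict pattern of Example #10 (the choice of BertModel is an assumption about the downstream task):

BERT_CONFIG = transformers.BertConfig.from_pretrained(BERT_CONFIG_FILE)
# load the raw BioBERT weights into a freshly built model
MODEL = transformers.BertModel.from_pretrained(
    pretrained_model_name_or_path=None,
    config=BERT_CONFIG,
    state_dict=WEIGHTS_BIN)
MODEL.to(DEVICE)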
Example #16
    # code borrowed from official pytorch discussion forum
    with torch.no_grad():
        correct = rankings.eq(0)
        res = []
        for k in topk:
            correct_k = correct[:, :k].float().sum()
            res.append(correct_k.mul_(1.0 / rankings.size(0)).item())
    return res


if __name__ == '__main__':

    args = parse_args()

    # create tokenizer
    tokenizer = trs.BertTokenizer(vocab_file='bert_vocab.txt',
                                  do_lower_case=True)

    assert args.split in ['valid', 'test'], f"{args.split} not allowed"

    # create dataloader
    if args.dataset.startswith('udc'):
        dataset = UDC(root=args.dataset_root,
                      split=args.split,
                      tokenizer=tokenizer)
        dataloader = data.DataLoader(dataset,
                                     batch_size=args.batch_size,
                                     shuffle=False,
                                     num_workers=2)
    else:
        raise Exception(F"unknown dataset: {args.dataset}")
Example #17
import argparse
import transformers

parser = argparse.ArgumentParser()
parser.add_argument('--vocab', type=str)
parser.add_argument('--model', type=str)
parser.add_argument('--data', type=str)
args = parser.parse_args()

tokenizer = transformers.BertTokenizer(vocab_file=args.vocab,
                                       do_lower_case=False,
                                       do_basic_tokenize=True)
model = transformers.BertForMaskedLM.from_pretrained(args.model)

dataset = transformers.LineByLineTextDataset(tokenizer=tokenizer,
                                             file_path=args.data,
                                             block_size=128)
data_collator = transformers.DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15)
train_args = transformers.TrainingArguments(
    per_device_eval_batch_size=16, output_dir=f"/tmp/echau18/{args.model}")
trainer = transformers.Trainer(model=model,
                               eval_dataset=dataset,
                               data_collator=data_collator,
                               prediction_loss_only=True,
                               args=train_args)

eval_output = trainer.evaluate()
print(eval_output)
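Trainer.evaluate returns the mean masked-LM loss under the key eval_loss; a common follow-up (not in the original script) is to report perplexity:

import math

# perplexity of the masked-LM objective on the evaluation file
print("perplexity:", math.exp(eval_output["eval_loss"]))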
Example #18
import transformers
import os

MAX_LEN = 512
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
EPOCHS = 10
BERT_PATH = "../input/bert_base_uncased/"
MODEL_PATH = "../models/model.bin"
TRAINING_FILE = "../input/imdb.csv"
TOKENIZER = transformers.BertTokenizer(os.path.join(BERT_PATH, "vocab.txt"),
                                       do_lower_case=True)
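A minimal sketch of how a dataset class would typically consume these constants (the example sentence is illustrative, and the padding/truncation kwargs assume transformers >= 3):

enc = TOKENIZER.encode_plus("This movie was great!",
                            max_length=MAX_LEN,
                            padding="max_length",
                            truncation=True)
assert len(enc["input_ids"]) == MAX_LEN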