Example #1
    def test_tokenizer_from_pretrained(self):
        logging.basicConfig(level=logging.INFO)
        for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
            tokenizer = AutoTokenizer.from_pretrained(model_name)
            self.assertIsNotNone(tokenizer)
            self.assertIsInstance(tokenizer, BertTokenizer)
            self.assertGreater(len(tokenizer), 0)

        for model_name in list(GPT2_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
            tokenizer = AutoTokenizer.from_pretrained(model_name)
            self.assertIsNotNone(tokenizer)
            self.assertIsInstance(tokenizer, GPT2Tokenizer)
            self.assertGreater(len(tokenizer), 0)
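A minimal standalone sketch of what this test verifies: AutoTokenizer inspects the checkpoint name and returns the matching concrete tokenizer class. The modern transformers package name and use_fast=False are assumptions on top of the example above (recent versions otherwise return the separate Fast classes).

from transformers import AutoTokenizer, BertTokenizer, GPT2Tokenizer

bert_tok = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=False)
gpt2_tok = AutoTokenizer.from_pretrained("gpt2", use_fast=False)

assert isinstance(bert_tok, BertTokenizer)  # BERT checkpoint -> BertTokenizer
assert isinstance(gpt2_tok, GPT2Tokenizer)  # GPT-2 checkpoint -> GPT2Tokenizer
assert len(bert_tok) > 0                    # vocabulary is non-empty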
Example #2
def test_wordpiece_to_token_correct(base_model):
    t = WordPieceListTransformer(name="wordpiece-to-token", base_model=base_model)
    tokenizer = AutoTokenizer.from_pretrained(base_model)

    # Texts of varying length, including a single-token string and an empty string
    sentences = [
        "Some strange text sssasd sdafds dfv vc a more strange",
        "Short sentence",
        "OneToken",
        "",
    ]
    encoded_ids = [tokenizer.encode(sentence) for sentence in sentences]
    _, context = t.transform(encoded_ids)

    mapping = context["wordpiece_to_token_list"]  # one tuple of wordpiece positions per original token

    assert [
        (1,),
        (2,),
        (3,),
        (4, 5, 6, 7),
        (8, 9, 10),
        (11, 12),
        (13, 14),
        (15,),
        (16,),
        (17,),
    ] == mapping[0]

    assert [(1,), (2,)] == mapping[1]
    assert [(1, 2)] == mapping[2]
    assert [] == mapping[3]
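The expected tuples group wordpiece positions back into the original whitespace tokens, skipping the special [CLS]/[SEP] positions. A hedged sketch of how such a mapping can be built (this is not the project's WordPieceListTransformer, just the usual continuation-prefix trick for BERT vocabularies):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
ids = tokenizer.encode("Some strange text sssasd")
pieces = tokenizer.convert_ids_to_tokens(ids)

groups, current = [], []
for idx, piece in enumerate(pieces[1:-1], start=1):  # skip [CLS] / [SEP]
    if piece.startswith("##") and current:
        current.append(idx)      # continuation of the previous token
    else:
        if current:
            groups.append(tuple(current))
        current = [idx]          # a new original token starts here
if current:
    groups.append(tuple(current))

print(groups)  # e.g. [(1,), (2,), (3,), (4, 5, 6, 7)] for the text above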
Example #3
def tokenizer(*args, **kwargs):
    r"""
    # Using torch.hub !
    import torch

    tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', 'bert-base-uncased')    # Download vocabulary from S3 and cache.
    tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', './test/bert_saved_model/')  # E.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`
    """
    return AutoTokenizer.from_pretrained(*args, **kwargs)
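A short usage sketch for this hub entry point, reusing the public checkpoint from the docstring; add_special_tokens is passed explicitly because older pytorch-transformers releases did not add [CLS]/[SEP] by default.

import torch

tok = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', 'bert-base-uncased')
ids = tok.encode("Hello, world!", add_special_tokens=True)
print(ids)              # token ids, including the special [CLS]/[SEP] positions
print(tok.decode(ids))  # "[CLS] hello, world! [SEP]"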
Example #4
def __init__(
    self,
    base_model=None,
    max_seq_len=512,
    do_lower_case=True,
    do_basic_tokenize=False,
    num_of_special_tokens=2,
    **kwargs,
):
    super().__init__(**kwargs)
    self.base_model = base_model
    self.max_length = max_seq_len
    self.num_of_special_tokens = num_of_special_tokens
    self._tokenizer = AutoTokenizer.from_pretrained(
        base_model,
        do_lower_case=do_lower_case,
        do_basic_tokenize=do_basic_tokenize)
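The keyword arguments here are forwarded by AutoTokenizer.from_pretrained to the concrete tokenizer class. A standalone sketch of the same pattern; the checkpoint name is an illustrative assumption, and use_fast=False keeps the slow BertTokenizer, which is the class that accepts do_basic_tokenize:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "bert-base-uncased",       # illustrative checkpoint
    do_lower_case=True,
    do_basic_tokenize=False,   # forwarded to BertTokenizer
    use_fast=False,
)
ids = tokenizer.encode("Hello world", truncation=True, max_length=512)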
Example #5
def __init__(self, base_model=None, add_to_context=True, **kwargs):
    super().__init__(**kwargs)
    self.base_model = base_model
    self._tokenizer = AutoTokenizer.from_pretrained(base_model)
    self.add_to_context = add_to_context
    self.func = self.__class__._get_func(base_model)
Example #6
network = NumericallyAugmentedBertNet(bert_model,
                                      hidden_size=bert_model.config.hidden_size,
                                      dropout_prob=0.0,
                                      use_gcn=args.use_gcn,
                                      gcn_steps=args.gcn_steps)

if args.cuda: network.cuda()
print("Load from pre path {}.".format(args.pre_path))
network.load_state_dict(torch.load(args.pre_path))

print("Load data from {}.".format(args.inf_path))
if args.eng != 0:
    tokenizer = RobertaTokenizer.from_pretrained(args.roberta_model)
else:
    # import pdb; pdb.set_trace()
    tokenizer = AutoTokenizer.from_pretrained(args.roberta_model)
if args.tag_mspan:
    reader = TDropReader(tokenizer, passage_length_limit=463,
                         question_length_limit=46, is_eng=args.eng)
    inf_iter = TDropBatchGen(args, tokenizer, reader._read(args.inf_path))
else:
    reader = DropReader(tokenizer, passage_length_limit=463,
                        question_length_limit=46)
    inf_iter = DropBatchGen(args, tokenizer, reader._read(args.inf_path))

print("Start inference...")
result = {}
network.eval()
with torch.no_grad():
    for batch in tqdm(inf_iter):
Example #7
import sys

from pytorch_transformers import AutoTokenizer

# dataset = sys.argv[1]
# model_name_or_path = sys.argv[2]
# max_len = int(sys.argv[3])

dataset = r'/media/alvinai/Documents/baidu_nlp/data/GermEval/train.txt.tmp'
model_name_or_path = 'bert-base-multilingual-cased'
max_len = 128

subword_len_counter = 0

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
# max_len -= tokenizer.num_special_tokens_to_add()
max_len -= 2  # reserve room for the two special tokens ([CLS] and [SEP])
with open(dataset, "rt") as f_p:
    for line in f_p:
        line = line.rstrip()
        if not line:
            print(line)
            subword_len_counter = 0
            continue
        token = line.split()[0]
        current_subwords_len = len(tokenizer.tokenize(token))
        # Token contains strange control characters like \x96 or \x95
        # Just filter out the complete line
        if current_subwords_len == 0:
            continue
        if (subword_len_counter + current_subwords_len) > max_len:
            # Budget exhausted: emit a blank separator line to start a new
            # sentence block and restart the running subword count.
            print("")
            print(line)
            subword_len_counter = current_subwords_len
            continue

        subword_len_counter += current_subwords_len
        print(line)
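A short worked example of the accounting above, assuming the modern transformers package: each word costs len(tokenizer.tokenize(word)) subwords, and the running sum is compared against the reduced max_len.

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
budget = 128 - 2  # two slots reserved for [CLS] and [SEP]
total = 0
for word in ["Schartau", "sagte", "dem", "Tagesspiegel"]:  # illustrative words
    n = len(tokenizer.tokenize(word))
    total += n
    print(word, n, "running total:", total, "of", budget)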
Example #8
def __init__(self, vocab_path=None, do_lower_case=None):
    # self.tokenizer = BertTokenizer(vocab_path, do_lower_case)
    self.tokenizer = AutoTokenizer.from_pretrained(
        "bert-base-multilingual-cased")
Example #9
def __init__(self,
             model_name_or_path: str) -> None:
    super().__init__(lazy=False)
    self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)