def build_feature(tokenizer: "transformers.BertTokenizer", examples: list, max_length: int = None):
    '''Convert each example's token lists into model-ready input features.

    @param tokenizer (transformers.BertTokenizer): tokenizer used to convert tokens to ids
    @param examples (list): input examples; each example is a dict with token
        lists under 'context' and 'question'. The dicts are mutated in place.
    @param max_length (int): max number of tokens kept from each sequence
        before encoding; defaults to 1000 when None
    @return examples (list): the same list, each example extended with
        'input_feature', 'token_type_ids' and 'attention_mask'
    '''
    # BUG FIX: the old fallback was the float literal 1e3, and slicing with a
    # float (seq[:1e3]) raises TypeError for any sequence of >= 1000 tokens.
    length = max_length if max_length is not None else 1000
    for example in examples:
        # seq[:length] already clamps at len(seq), so the old
        # min(length, len(seq)) bound is unnecessary.
        context = tokenizer.convert_tokens_to_ids(example['context'][:length])
        question = tokenizer.convert_tokens_to_ids(example['question'][:length])
        # prepare_for_model adds the special tokens and builds the
        # segment ids / attention mask for the (context, question) pair.
        out = tokenizer.prepare_for_model(context, question,
                                          return_token_type_ids=True,
                                          return_attention_mask=True)
        example['input_feature'] = out['input_ids']
        example['token_type_ids'] = out['token_type_ids']
        example['attention_mask'] = out['attention_mask']
    return examples
# Scratch / REPL-style experiments with a DataLoader and a BertTokenizer.
# NOTE(review): this reads like pasted notebook history. `x`, `y`,
# `DataLoader`, `SubsetRandomSampler`, `BertTokenizer` and `my_collate`
# must already be defined/imported for any of it to run, and several
# statements below are order-dependent or no-ops — flagged inline.

# Batch `x` with a custom collate; `x.sampler` is passed as the index
# source for SubsetRandomSampler — presumably `x` is a Dataset-like
# object exposing a `.sampler` of indices; verify against its definition.
a = DataLoader(x, batch_size=10, sampler=SubsetRandomSampler(x.sampler), shuffle=False, collate_fn=my_collate)
for i, s in enumerate(a): print(i) print(s)
# Build a tokenizer from a local vocab with custom BOS/EOS tokens.
# NOTE(review): the HF kwarg is `model_max_length`, not `model_max_len` —
# as written this is likely ignored; confirm against the installed version.
tokenizer = BertTokenizer("data/atis/token.vocab", bos_token="<BOS>", eos_token="<EOS>", model_max_len=50)
# NOTE(review): `y` is only assigned a few statements below — as ordered
# here, these calls would raise NameError on a fresh run.
tokenizer.prepare_for_model(tokenizer.encode(y), return_tensors="pt")
tokenizer.SPECIAL_TOKENS_ATTRIBUTES  # attribute access only; result is discarded
tokenizer.encode(y)       # result discarded
tokenizer.encode_plus(y)  # result discarded
y = "<BOS> embedding what is the flight number <EOS>"
# NOTE(review): this binds the *method* itself, not an encoding —
# missing a call: probably intended `tokenizer.encode_plus(y)`.
ids = tokenizer.encode_plus
tokenizer.decode(tokenizer.encode(y))  # round-trip check; result discarded
# Persist the tokenizer config and the bare vocab separately.
tokenizer.save_pretrained("data/atis/save")
tokenizer.save_vocabulary("data/atis/save/saved")
# Rebind `tokenizer` to the pretrained bert-base-uncased vocab,
# keeping the same custom BOS/EOS special tokens.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", bos_token="<BOS>", eos_token="<EOS>")
tokenizer.tokenize("i like tea")  # result discarded
# NOTE(review): built but never passed to add_special_tokens(...) here.
special_tokens = {"bos_token": "<BOS>", "eos_token": "<EOS>"}