def run_dataloader():
    """Smoke-test the dev dataloader: decode each sample and print its gold spans."""
    parser = get_parser()
    # model-specific args, then the standard Lightning trainer flags
    parser = BertLabeling.add_model_specific_args(parser)
    parser = Trainer.add_argparse_args(parser)
    args = parser.parse_args()
    args.workers = 0
    args.default_root_dir = "/scratch/shravya.k/train_logs/debug"

    model = BertLabeling(args)

    from tokenizers import BertWordPieceTokenizer
    tokenizer = BertWordPieceTokenizer(
        os.path.join(args.bert_config_dir, "vocab.txt"))

    loader = model.get_dataloader("dev", limit=1000)
    for batch in loader:
        token_ids = batch[0][0].tolist()
        match_labels = batch[-1][0]
        # match_labels[i, j] > 0 marks a gold (start=i, end=j) span
        starts, ends = torch.where(match_labels > 0)
        starts = starts.tolist()
        ends = ends.tolist()
        if not starts:
            continue
        print("=" * 20)
        print(tokenizer.decode(token_ids, skip_special_tokens=False))
        for span_start, span_end in zip(starts, ends):
            print(tokenizer.decode(token_ids[span_start:span_end + 1]))
def run_zh():
    """Walk the small Chinese glyce masked-LM dataset, printing source and target text."""
    from tokenizers import BertWordPieceTokenizer

    data_path = "/data/nfsdata2/sunzijun/glyce/glyce/data/small_bin"
    bert_path = "/data/nfsdata2/sunzijun/glyce/glyce/bert_chinese_base_large_vocab"
    config_path = "/data/nfsdata2/sunzijun/glyce/glyce/config"
    vocab_file = os.path.join(bert_path, "vocab.txt")

    tokenizer = BertWordPieceTokenizer(vocab_file)
    dataset = DynamicGlyceMaskedLMDataset(config_path=config_path,
                                          directory=data_path,
                                          vocab_file=vocab_file,
                                          prefix="small",
                                          fields=None,
                                          max_length=512)
    print(len(dataset))

    from tqdm import tqdm
    for sample in tqdm(dataset):
        print([v.shape for v in sample])
        src_ids = sample[0].tolist()
        print(tokenizer.decode(src_ids, skip_special_tokens=False))
        # label == -100 means "not masked": fall back to the source token there
        tgt = [src if label == -100 else label
               for src, label in zip(src_ids, sample[2].tolist())]
        print(tokenizer.decode(tgt, skip_special_tokens=False))
def main(args):
    """Train a WordPiece vocab or sanity-check tokenizer round-trips.

    args is a dict-like with boolean 'train' and 'test' keys selecting the mode.
    """
    print(args)
    if args['train']:
        tokenizer = BertWordPieceTokenizer(
            clean_text=True,
            handle_chinese_chars=True,
            strip_accents=True,  # Must be False if cased model
            lowercase=True,
            wordpieces_prefix="##",
        )
        tokenizer.train(
            files=['/data2/BERT/data/naver_news/news_3_preprocessed/naver_news.txt'],
            limit_alphabet=6000,
            vocab_size=32000,
        )
        print(tokenizer.save_model("../BertWordPieceTokenizer_32000"))
    elif args['test']:
        test_str = '나는 워드피스 토크나이저를 써요. 성능이 좋은지 테스트 해보려 합니다.'

        # Fast (Rust) tokenizer: encode() returns an Encoding with .ids
        print("=========== tokenizer ===========")
        tokenizer = BertWordPieceTokenizer("../BertWordPieceTokenizer_32000/vocab.txt")
        print(tokenizer)
        encoding = tokenizer.encode(test_str)
        print('encoding: ', encoding.ids)
        print(tokenizer.decode(encoding.ids))

        # transformers BertTokenizer from an explicit vocab file: encode()
        # returns a plain list of ids
        print("=========== BertTokenizer ===========")
        tokenizer = BertTokenizer("../BertWordPieceTokenizer_32000/vocab.txt")
        print(tokenizer)
        ids = tokenizer.encode(test_str)
        print('encoding: ', ids)
        print(tokenizer.decode(ids))

        # Same tokenizer loaded via from_pretrained on the saved directory
        print("=========== BertTokenizer2 ===========")
        tokenizer = BertTokenizer.from_pretrained("../BertWordPieceTokenizer_32000")
        print(tokenizer)
        ids = tokenizer.encode(test_str)
        print('encoding: ', ids)
        print(tokenizer.decode(ids))
def run_dataset():
    """Decode MRC-NER samples from the English ACE2004 train split and print gold spans."""
    import os
    from datasets.collate_functions import collate_to_max_length
    from torch.utils.data import DataLoader

    # zh datasets
    # bert_path = "/mnt/mrc/chinese_L-12_H-768_A-12"
    # json_path = "/mnt/mrc/zh_msra/mrc-ner.test"
    # # json_path = "/mnt/mrc/zh_onto4/mrc-ner.train"
    # is_chinese = True

    # en datasets
    bert_path = "/mnt/mrc/bert-base-uncased"
    json_path = "/mnt/mrc/ace2004/mrc-ner.train"
    # json_path = "/mnt/mrc/genia/mrc-ner.train"
    is_chinese = False

    tokenizer = BertWordPieceTokenizer(
        vocab_file=os.path.join(bert_path, "vocab.txt"))
    dataset = MRCNERDataset(json_path=json_path, tokenizer=tokenizer,
                            is_chinese=is_chinese)
    dataloader = DataLoader(dataset, batch_size=32,
                            collate_fn=collate_to_max_length)

    for batch in dataloader:
        # zip(*batch) walks the batch sample by sample
        for (tokens, token_type_ids, start_labels, end_labels,
             start_label_mask, end_label_mask, match_labels,
             sample_idx, label_idx) in zip(*batch):
            token_ids = tokens.tolist()
            # match_labels[i, j] > 0 marks a gold (start=i, end=j) span
            starts, ends = torch.where(match_labels > 0)
            starts = starts.tolist()
            ends = ends.tolist()
            if not starts:
                continue
            print("=" * 20)
            print(f"len: {len(token_ids)}",
                  tokenizer.decode(token_ids, skip_special_tokens=False))
            for span_start, span_end in zip(starts, ends):
                print(str(sample_idx.item()),
                      str(label_idx.item()) + "\t" +
                      tokenizer.decode(token_ids[span_start:span_end + 1]))
def generate_custom_vocab(self): try: tokenizer = None # root dir path check and generate if not os.path.isdir(self.vocab_root_dir): os.makedirs(self.vocab_root_dir, exist_ok=True) # generate models directory self.vocab_dir = '/BERT_TRAINING_VOCAB_' + self.getCurrent_time()[2] + '/' os.makedirs(self.vocab_root_dir + self.vocab_dir, exist_ok=True) user_defined_symbols = ['[BOS]', '[EOS]', '[UNK]', '[UNK1]', '[UNK2]', '[UNK3]', '[UNK4]', '[UNK5]', '[UNK6]', '[UNK7]', '[UNK8]', '[UNK9]'] unused_token_num = 200 unused_list = ['[unused{}]'.format(n) for n in range(unused_token_num)] user_defined_symbols = user_defined_symbols + unused_list if self.tokenizer_type == 'word': # if lowercase is False must set strip_accents option as 'False' tokenizer = BertWordPieceTokenizer(strip_accents=False, lowercase=True, clean_text=True, handle_chinese_chars=True, wordpieces_prefix="##" ) # when selected 'base' going to use bert-base-uncased tokenizer... close function # training vocab start corpus_file = [self.corpus_path] vocab_size = 32000 limit_alphabet = 6000 min_frequency = 3 tokenizer.train(files=corpus_file, vocab_size=vocab_size, special_tokens=user_defined_symbols, min_frequency=min_frequency, # 단어의 최소 발생 빈도, 3 limit_alphabet=limit_alphabet, # ByteLevelBPETokenizer 학습시엔 주석처리 필요 show_progress=True) self.setPrint('Customer Tokenizer Training is completed') sentence = '전화 통화가 정상적으로 안됨.' output = tokenizer.encode(sentence) self.setPrint('Tokenizer 테스트 문장: {}'.format(sentence)) self.setPrint('Tokenizer 분석 결과\n=>idx: {}\n=>tokens: {}\n=>offset: {}\n=>decode: {}\n'. format(output.ids, output.tokens, output.offsets, tokenizer.decode(output.ids))) # save tokenizer tokenizer.save_model(self.vocab_root_dir + self.vocab_dir) except: self.setPrint('Error: {}. {}, line: {}'.format(sys.exc_info()[0], sys.exc_info()[1], sys.exc_info()[2].tb_lineno))
def run():
    """Iterate the static glyce masked-LM dev set and print decoded src/tgt text."""
    data_path = "/data/nfsdata2/sunzijun/glyce/glyce/data/bin"
    bert_path = "/data/nfsdata2/sunzijun/glyce/glyce/bert_chinese_base_large_vocab"
    vocab_file = os.path.join(bert_path, "vocab.txt")

    tokenizer = BertWordPieceTokenizer(vocab_file)
    dataset = StaticGlyceMaskLMDataset(data_path,
                                       vocab_file=vocab_file,
                                       prefix="dev",
                                       max_length=512)
    print(len(dataset))

    from tqdm import tqdm
    for sample in tqdm(dataset):
        print([v.shape for v in sample])
        src_ids = sample[0].tolist()
        print(tokenizer.decode(src_ids, skip_special_tokens=False))
        # label == -100 means "not masked": keep the source token there
        tgt = [src if label == -100 else label
               for src, label in zip(src_ids, sample[2].tolist())]
        print(tokenizer.decode(tgt, skip_special_tokens=False))
class CheckerDecoder:
    """Two-stage spell-checking pipeline: a detector flags suspicious tokens,
    a corrector (optionally) proposes replacements re-ranked by pinyin match.
    """

    def __init__(self, model_dir):
        # Sub-models live in fixed subdirectories of model_dir.
        self.detector = DetectorModel(os.path.join(model_dir, 'detector'))
        self.corrector = CorrectorModel(os.path.join(model_dir, 'corrector'))
        self.tokenizer = BertWordPieceTokenizer(
            os.path.join(model_dir, 'vocab.txt'))
        # Encode '[MASK]' and strip the surrounding [CLS]/[SEP] ids to
        # recover the single mask-token id.
        mask_id = self.tokenizer.encode('[MASK]').ids[1:-1]
        assert len(mask_id) == 1
        self.mask_id = mask_id[0]

    def predict(self, text, suggest=False, k=5, max_k=200):
        """Locate likely-wrong characters in `text`, optionally with fixes.

        :param text: input string; must tokenize to at most MAX_LEN tokens
        :param suggest: when True, also rank candidate corrections
        :param k: number of candidates kept per flagged position
        :param max_k: number of corrector candidates considered before ranking
        :return: when suggest is False, a list of (char_index, token) pairs;
            otherwise a dict {(char_index, token): [(candidate, score), ...]}
        :raises ValueError: if the tokenized text exceeds MAX_LEN
        """
        tokenized = self.tokenizer.encode(text)
        if len(tokenized.tokens) > MAX_LEN:
            raise ValueError('The text is too long (>512) to process')
        token_ids = tokenized.ids
        segment_ids = tokenized.type_ids
        # mapping: token index -> character offsets in the original text
        mapping = rematch(tokenized.offsets)
        token_ids, segment_ids = np.array([token_ids]), np.array([segment_ids])
        # Detector emits a per-token error probability; > 0.5 is "wrong".
        probas = self.detector.predict(token_ids, segment_ids)[0][0]
        incorrect_ids = np.where(probas > 0.5)[0]
        # Mask the flagged positions so the corrector can fill them in.
        token_ids[0, incorrect_ids] = self.mask_id
        if not suggest:
            # (i - 1) converts a token index to a char index (skips [CLS]).
            ret = []
            for i in incorrect_ids:
                ret.append((i - 1, tokenized.tokens[i]))
            return ret
        probas = self.corrector.predict(token_ids, segment_ids)[0][0]
        sorted_probas, sort_indexs = topK(probas, max_k)
        ret = {}
        for i in incorrect_ids:
            # Skip [CLS] / [SEP] positions.
            if i == 0 or i == len(tokenized.tokens) - 1:
                continue
            current_token = text[mapping[i][0]:mapping[i][-1] + 1]
            current_pinyin = ' '.join(xmnlp.pinyin(current_token))
            cands = []
            for proba, token in zip(
                    sorted_probas[i],
                    self.tokenizer.decode(sort_indexs[i]).split()):
                pinyin = ' '.join(xmnlp.pinyin(token))
                # Boost candidates that sound like the original character.
                score = 0
                if current_pinyin == pinyin:
                    score = 1
                cands.append((token, proba + score))
            cands.sort(key=lambda x: x[1], reverse=True)
            ret[(i - 1, current_token)] = cands[:k]
        return dict(ret)
def run_dataset():
    """Iterate the demo MRC-NER set and print every decoded input sequence."""
    import os
    from collate_functions import collate_to_max_length, collate_to_max_length_demo
    from torch.utils.data import DataLoader

    bert_path = "../chinese_roberta_wwm_large_ext_pytorch"
    json_path = "zh_msra/mrc-ner.demo"
    is_chinese = False

    vocab_file = os.path.join(bert_path, "vocab.txt")
    tokenizer = BertWordPieceTokenizer(vocab_file=vocab_file)
    dataset = MRCNERDataset_demo(json_path=json_path, tokenizer=tokenizer,
                                 is_chinese=is_chinese)
    dataloader = DataLoader(dataset, batch_size=32,
                            collate_fn=collate_to_max_length_demo)

    for batch in dataloader:
        # zip(*batch) walks the batch sample by sample
        for tokens, token_type_ids in zip(*batch):
            token_ids = tokens.tolist()
            print("=" * 20)
            print(f"len: {len(token_ids)}",
                  tokenizer.decode(token_ids, skip_special_tokens=False))
class PreTrainedTokenizer(GenericTokenizer):
    """Moses-pre-tokenized BERT WordPiece tokenizer with downloadable vocabs.

    Pre-tokenizes text with PyMosesTokenizer, protects intra-word punctuation
    (".", "-", "'", ",") behind sentinel strings (self.mid_tokens) so it can
    survive WordPiece tokenization, and restores it in detokenize().
    """

    # Pre-trained model name -> vocab download URL.
    vocab_files = {
        "bert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
        "bert-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt",
        "bert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt",
        "bert-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt",
        "bert-base-multilingual-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt",
        "bert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt",
        "bert-base-chinese": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt",
        "bert-base-german-cased": "https://int-deepset-models-bert.s3.eu-central-1.amazonaws.com/pytorch/bert-base-german-cased-vocab.txt",
        "bert-large-uncased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-vocab.txt",
        "bert-large-cased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-vocab.txt",
        "bert-large-uncased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-vocab.txt",
        "bert-large-cased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-vocab.txt",
        "bert-base-cased-finetuned-mrpc": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-vocab.txt",
        "bert-base-german-dbmdz-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-vocab.txt",
        "bert-base-german-dbmdz-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-vocab.txt",
        "bert-base-finnish-cased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/vocab.txt",
        "bert-base-finnish-uncased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/vocab.txt",
        "bert-base-dutch-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/wietsedv/bert-base-dutch-cased/vocab.txt",
        "moses-pre-tokenized-wmt-uncased-fr": "https://drive.google.com/uc?export=download&id=1kYxOhJh4UshVE_SGYMANjLn_oEB6RMYC",
        "moses-pre-tokenized-wmt-uncased-en": "https://drive.google.com/uc?export=download&id=1hIURG9eiIXQYCm8cS4vJM3RLVl6UcW32",
        "moses-pre-tokenized-paracrawl-uncased-accented-de": "https://drive.google.com/uc?export=download&id=15EKdo2IXyyfZvrpOEwtx4KgeeL6Ot-Gi"
    }

    def __init__(self, lang, root='../.data', clean_text=False,
                 handle_chinese_chars=True, strip_accents=False, lowercase=True):
        """
        Example instantiation: PreTrainedTokenizer("bert-base-uncased", root="../.data")

        :param lang: language code ("en", "de", "zh", "fi", "fr") mapped to a
            default pre-trained vocab via get_default_model_name
        :param root: cache directory for downloaded vocab files
        """
        pre_trained_model_name = self.get_default_model_name(lang, lowercase)
        self._model_name_ = pre_trained_model_name
        if not os.path.exists(root):
            os.mkdir(root)
        assert pre_trained_model_name in self.vocab_files, \
            "The requested pre_trained tokenizer model {} does not exist!".format(pre_trained_model_name)
        url = self.vocab_files[pre_trained_model_name]
        f_name = root + "/" + pre_trained_model_name + ".txt"
        # Download the vocab once and cache it on disk.
        if not os.path.exists(f_name):
            with open(f_name, "wb") as file_:
                response = get(url)
                file_.write(response.content)
        self.moses_tkn = PyMosesTokenizer(lang, lowercase)
        self.tokenizer = BertWordPieceTokenizer(
            f_name, clean_text=clean_text, lowercase=lowercase,
            handle_chinese_chars=handle_chinese_chars,
            strip_accents=strip_accents)
        # Sentinels that protect intra-word punctuation through tokenization;
        # "&fs;" stands in for an empty/removed fragment.
        self.mid_tokens = {
            ".": "&md;",
            "-": "&hp;",
            "\'": "&ma;",
            ",": "&mc;",
            " ": "&fs;"
        }
        self.reverse_mid_tokens = {v: k for k, v in self.mid_tokens.items()}
        self.lang = lang

    def get_tokenized_sub_tokens(self, token, mid_sign):
        """Split `token` on `mid_sign`, interleaving the sentinel for the sign.

        Returns the token unchanged (as a one-element list) when `mid_sign`
        is not one of the protected punctuation marks.
        """
        result = []
        if mid_sign in self.mid_tokens:
            sub_tokens = token.split(mid_sign)
            assert len(sub_tokens) > 1
            # Token consisted only of the sign itself: emit the space sentinel.
            if not len("".join(sub_tokens)):
                result.append(self.mid_tokens[" "])
            for sub_token in sub_tokens[:-1]:
                result.append(sub_token)
                result.append(self.mid_tokens[mid_sign])
            if len(sub_tokens[-1]) or (mid_sign == '\'' and self.lang == "fr"):
                result.append(sub_tokens[-1])
            else:
                # case like "p.m." where the last token is empty
                result.append(self.mid_tokens[" "])
        else:
            result.append(token)
        return result

    def tokenize_token(self, tokens, mid_sign):
        """Apply get_tokenized_sub_tokens to every multi-char token in `tokens`."""
        res = []
        for token in tokens:
            if len(token) > 1 and mid_sign in token:
                for sub_token in self.get_tokenized_sub_tokens(
                        token, mid_sign):
                    res.append(sub_token)
            else:
                res.append(token)
        return res

    def tokenize(self, text):
        """
        You can recover the output of this function using " ".join(encoded_list).replace(" ##", "")
        :param text: one line of text in type of str
        :return a list of tokenized "str"s
        """
        if not len(text.strip()):
            return [""]
        tokens = []
        for token in self.moses_tkn.tokenize(text):
            # French apostrophe handling (e.g. elided articles, "qu'...").
            if token.startswith("'") and token != "'":
                token = token.replace("'", "\'")
            if self.lang == "fr" and len(token) > 1 and token[1:] == "'":
                token = token.replace("'", "\'")
            elif self.lang == "fr" and "qu'" in token:
                token = token.replace("'", "\'")
            # Successively split the token on every protected punctuation mark.
            sub_ts = [token]
            for mid_sign in self.mid_tokens:
                sub_ts = self.tokenize_token(sub_ts, mid_sign)
            for sub_token in sub_ts:
                tokens.append(sub_token)
        # encoding = self.tokenizer.encode(n_text, add_special_tokens=False)
        encoding = self.tokenizer.encode(tokens, is_pretokenized=True,
                                         add_special_tokens=False)
        # encoding contains "ids", "tokens", and "offsets"
        return encoding.tokens

    def detokenize(self, tokenized_list):
        """Reassemble WordPiece tokens into text, restoring protected punctuation."""
        # TODO make it work on more test examples
        temp_result = []
        # Merging sub-tokens ("##xx" continues the previous token)
        for token in tokenized_list:
            if len(temp_result) and token.startswith("##"):
                temp_result[-1] = temp_result[-1] + token[2:]
            else:
                temp_result.append(token)
        result = []
        index = 0
        t_len = len(temp_result)
        # merging & tokens for moses decoder: WordPiece may have split a
        # sentinel like "&md;" into 3 or 4 pieces — glue them back together.
        while index < t_len:
            if temp_result[index] == "&" and index < t_len - 2 and temp_result[
                    index + 2] == ";":
                result.append("".join(temp_result[index:index + 3]))
                index += 3
            elif temp_result[
                    index] == "&" and index < t_len - 3 and temp_result[
                        index + 3] == ";":
                result.append("".join(temp_result[index:index + 4]))
                index += 4
            else:
                result.append(temp_result[index])
                index += 1
        del temp_result[:]
        index = 0
        t_len = len(result)
        # merging ‐ tokens for moses decoder: replace each sentinel with its
        # original punctuation, fused onto the preceding fragment.
        while index < t_len:
            if result[index] in self.reverse_mid_tokens:
                if not len(temp_result):
                    temp_result.append("")
                if index + 1 < t_len and result[
                        index + 1] in self.reverse_mid_tokens:
                    # final dot in "p.m."
                    temp_result[-1] += self.reverse_mid_tokens[
                        result[index]] + self.reverse_mid_tokens[result[index + 1]]
                    index += 2
                elif index + 1 < t_len:
                    # middle dot in "p.m."
                    temp_result[-1] += self.reverse_mid_tokens[
                        result[index]] + result[index + 1]
                    index += 2
                else:
                    # any thing else"
                    temp_result[-1] += self.reverse_mid_tokens[result[index]]
                    index += 1
            else:
                temp_result.append(result[index])
                index += 1
        return self.moses_tkn.detokenize(temp_result)

    def decode(self, encoded_ids_list):
        """
        :param encoded_ids_list: list of int ids
        :return a decoded str
        """
        decoded = self.tokenizer.decode(encoded_ids_list)
        return decoded

    @staticmethod
    def get_default_model_name(lang, lowercase):
        """Map a language code + casing flag to a pre-trained vocab name.

        :raises ValueError: for unsupported language/casing combinations
        """
        if lang == "en" and lowercase:
            return "bert-base-uncased"
        elif lang == "en" and not lowercase:
            return "bert-base-cased"
        elif lang == "zh":
            return "bert-base-chinese"
        elif lang == "de" and lowercase:
            return "bert-base-german-dbmdz-uncased"
        elif lang == "de" and not lowercase:
            return "bert-base-german-dbmdz-cased"
        elif lang == "fi" and lowercase:
            return "bert-base-finnish-uncased-v1"
        elif lang == "fi" and not lowercase:
            return "bert-base-finnish-cased-v1"
        elif lang == "fr" and lowercase:
            return "moses-pre-tokenized-wmt-uncased-fr"
        else:
            raise ValueError(
                "No pre-trained tokenizer found for language {} in {} mode".
                format(lang, "lowercased" if lowercase else "cased"))

    @property
    def model_name(self):
        # Name of the pre-trained vocab selected at construction time.
        return self._model_name_
class TFLiteNLU:
    """Abstraction for using TFLite NLU models

    Args:
        model_dir (str): path to the model directory containing nlu.tflite,
            metadata.json, and vocab.txt
    """

    def __init__(self, model_dir: str) -> None:
        self._model = TFLiteModel(
            model_path=os.path.join(model_dir, "nlu.tflite"))
        self._metadata = utils.load_json(
            os.path.join(model_dir, "metadata.json"))
        self._tokenizer = BertWordPieceTokenizer(
            os.path.join(model_dir, "vocab.txt"))
        # Sequence length comes from the model's input tensor shape.
        self._max_length = self._model.input_details[0]["shape"][-1]
        # Integer class index -> human-readable intent / tag name.
        self._intent_decoder = {
            i: intent["name"]
            for i, intent in enumerate(self._metadata["intents"])
        }
        self._tag_decoder = {
            i: tag
            for i, tag in enumerate(self._metadata["tags"])
        }
        # Per-intent metadata keyed by intent name (pop removes the key
        # from the nested dict itself).
        self._intent_meta = {
            intent.pop("name"): intent
            for intent in self._metadata["intents"]
        }
        self._slot_meta = {}
        for intent in self._intent_meta:
            for slot in self._intent_meta[intent]["slots"]:
                self._slot_meta[slot.pop("name")] = slot
        self._warm_up()

    def __call__(self, utterance: str) -> Result:
        """Classifies a string utterance into an intent and identifies any
        associated slots contained in the utterance. The slots get parsed
        based on type and then returned along with the intent and its
        associated confidence value.

        Args:
            utterance (str): string that needs to be understood

        Returns (Result):
            A class with properties for the identified intent, along with
            raw, parsed slots and model confidence in prediction
        """
        inputs, input_ids = self._encode(utterance)
        outputs = self._model(inputs)
        intent, tags, confidence = self._decode(outputs)
        # slice off special tokens: [CLS], [SEP]
        tags = tags[:len(input_ids) - 2]
        _LOG.debug(f"{tags}")
        input_ids = input_ids[1:-1]
        _LOG.debug(f"{input_ids}")
        # retrieve slots from the tagged positions and decode slots back
        # into original values; tag[2:] strips the IOB-style prefix,
        # "o" marks untagged positions
        slots = [(token_id, tag[2:])
                 for token_id, tag in zip(input_ids, tags) if tag != "o"]
        _LOG.debug(f"{slots}")
        # Group token ids by slot name, then decode each group to text.
        slot_map: dict = {}
        for (token, tag) in slots:
            if tag in slot_map:
                slot_map[tag].append(token)
            else:
                slot_map[tag] = [token]
        for key, value in slot_map.items():
            slot_map[key] = self._tokenizer.decode(value)
        # attempt to resolve tagged tokens into slots and
        # collect the successful ones
        parsed_slots = {}
        for key in slot_map:
            parsed = self._parse_slots(self._slot_meta[key], slot_map[key])
            parsed_slots[key] = {
                "name": key,
                "parsed_value": parsed,
                "raw_value": slot_map[key],
            }
        _LOG.debug(f"parsed slots: {parsed_slots}")
        return Result(
            utterance=utterance,
            intent=intent,
            confidence=confidence,
            slots=parsed_slots,
        )

    def _warm_up(self) -> None:
        # make an array the same size as the inputs to warm the
        # model since first inference is always slower than subsequent
        warm = np.zeros((self._model.input_details[0]["shape"]),
                        dtype=np.int32)
        _ = self._model(warm)

    def _encode(self, utterance: str) -> Tuple[np.ndarray, List[int]]:
        """Tokenize and pad/truncate `utterance` to the model's input shape."""
        inputs = self._tokenizer.encode(utterance)
        # get the non-padded/truncated token ids to match the
        # original utterance to the respective labels and
        # use the length to slice the results
        input_ids = inputs.ids
        # it's (max_length + 1) because the [CLS]
        # token gets appended inside the model
        # notice the slice [1:] when we convert to an array
        inputs.truncate(max_length=self._max_length + 1)
        inputs.pad(length=self._max_length + 1)
        inputs = np.array(inputs.ids[1:], np.int32)
        # add the batch dimension for the TFLite model
        inputs = np.expand_dims(inputs, 0)
        return inputs, input_ids

    def _decode(self, outputs: list) -> Tuple[str, List[str], float]:
        # to get the index of the highest probability we
        # apply argmax to the posteriors which allows the
        # labels to be decoded with an integer to string mapping
        # we derive the confidence from the highest probability
        intent_posterior, tag_posterior = outputs
        intents, confidence = self._decode_intent(intent_posterior)
        tags = self._decode_tags(tag_posterior)
        _LOG.debug(f"decoded tags: {tags}")
        _LOG.debug(f"decoded intent: {intents}")
        _LOG.debug(f"confidence: {confidence}")
        return intents, tags, confidence

    def _decode_tags(self, posterior: np.ndarray) -> List[Any]:
        # Per-token argmax over the tag posterior, then map ids to names.
        posterior = np.squeeze(posterior, 0)
        tags = np.argmax(posterior, -1)
        return [self._tag_decoder.get(tag) for tag in tags]

    def _decode_intent(self, posterior: np.ndarray) -> Any:
        # Single argmax over the intent posterior; returns (name, confidence).
        posterior = np.squeeze(posterior, 0)
        intent = np.argmax(posterior, -1)
        return self._intent_decoder.get(intent), posterior[intent]

    def _parse_slots(self, slot_meta: Dict[str, Any],
                     slots: Dict[str, Any]) -> Any:
        # Slot parsers are resolved dynamically by the slot's type name.
        slot_type = slot_meta["type"]
        parser = import_module(f"spokestack.nlu.parsers.{slot_type}")
        facets = json.loads(slot_meta["facets"])
        return parser.parse(facets, slots)  # type: ignore
# data_00150000_00150539.gif,Place all the blocks individually on the surface.,Disjoint the given stack of blocks. # data_00110000_00110725.gif,"Separate the given stack to form yellow, red blocks stack.",Remove 2nd and 4th blocks from the given stack. # data_00120000_00120478.gif,Remove 1st and 2nd block from the given stack and form stack with blue on top of yellow block.,Do not touch green and red block and form another stack with blue and yellow block # Now, let's use it: #input = "I can feel the magic, can you?" #input = "Disjoint the given stacks to form a new stack with blue, red blocks." #input = "Make a new stack with blue, red blocks." input = "Remove 1st and 2nd block from the given stack and form stack with blue on top of yellow block.,Do not touch green and red block and form another stack with blue and yellow block" print(input) encoded = tokenizer.encode(input) #, return_tensors="pt") print(encoded) print(encoded.ids) print(encoded.tokens) print(tokenizer.decode(encoded.ids)) # Unit testing ;) def compare(filename, debug=False): # Iterate through all commands. diffs = 0 total = 0 with open(filename, "r") as f: csvreader = csv.reader(f, delimiter=';') for row in csvreader: for command in row: total += 1 # "Custom" processing for comparison - remove commas and three dots. command = command.replace(",", "") command = command.replace("...", "")
# Inference loop over a span-extraction model; `model`, `dataloader`,
# `tokenizer` and `query` are defined earlier in the file.
with torch.no_grad():
    for batch in dataloader:
        tokens, token_type_ids = batch
        # Non-zero token ids are real tokens; zeros are padding.
        attention_mask = (tokens != 0).long()
        start_logits, end_logits, span_logits = model(tokens, attention_mask,
                                                      token_type_ids)
        ls_start = start_logits.squeeze().cpu().numpy().tolist()
        ls_end = end_logits.squeeze().cpu().numpy().tolist()
        for s, e, t in zip(ls_start, ls_end, tokens):
            # Positions with a positive logit are predicted span starts/ends.
            ss = [i for i, v in enumerate(s) if v > 0]
            ee = [i for i, v in enumerate(e) if v > 0]
            t = t.tolist()
            t_d = tokenizer.decode(t, skip_special_tokens=True)
            # NOTE(review): slicing at len(query)*2 presumably skips the query
            # prefix in the decoded text — confirm against the data format.
            print('\n', t_d[len(query) * 2:])
            # print(ss, ee)
            # Only print spans when starts and ends pair up one-to-one.
            if len(ss) == len(ee) and len(ss) > 0:
                for i, j in zip(ss, ee):
                    print('【Company】: ', tokenizer.decode(t[i:j + 1]))
            else:
                print('【Company】: None')
class FastBERTTokenizer:
    r"""WordPiece tokenizer with a fairseq-style symbol dictionary.

    Wraps a ``BertWordPieceTokenizer`` built from ``vocab_file`` and mirrors
    the vocabulary into ``self.vocab`` / ``self.ids_to_tokens`` so ids can be
    mapped to tokens without going through the backend tokenizer. A
    ``<mask>`` symbol is appended after the file's own entries.

    Args:
        vocab_file (:obj:`str`):
            Path to a BERT ``vocab.txt`` file (one token per line).
        fb_model_kwargs (:obj:`dict`, `optional`):
            Extra keyword arguments; stored on the instance but currently
            unused (kept for API compatibility).
    """

    def __init__(self, vocab_file, fb_model_kwargs: Optional[Dict[str, Any]] = None):
        self.vocab_file = vocab_file
        self.fb_model_kwargs = {} if fb_model_kwargs is None else fb_model_kwargs
        assert os.path.exists(vocab_file), "no existing vocab file."
        self.bert_tokenizer = BertWordPieceTokenizer(vocab_file,
                                                     clean_text=False,
                                                     strip_accents=False,
                                                     lowercase=False)
        self.vocab = {}
        self.ids_to_tokens = []
        # BUGFIX: use a context manager so the vocab file handle is closed
        # (the original passed a bare open() and leaked the handle).
        with open(vocab_file, 'r') as f:
            self.add_from_file(f)
        self.add_symbol('<mask>')
        self.vocab_size = len(self.vocab)

    def add_from_file(self, f):
        """
        Loads a pre-existing dictionary from a text file and adds its symbols
        to this instance.
        """
        for line in f.readlines():
            # One token per line; existing entries are kept (no overwrite).
            self.add_symbol(line.rstrip(), overwrite=False)

    def add_symbol(self, word, overwrite=False):
        """Adds a word to the dictionary; returns its index."""
        if word in self.vocab and not overwrite:
            return self.vocab[word]
        idx = len(self.ids_to_tokens)
        self.vocab[word] = idx
        self.ids_to_tokens.append(word)
        return idx

    def tokenize(self, text):
        """Tokenize `text` into WordPiece tokens (no [CLS]/[SEP] added)."""
        return self.bert_tokenizer.encode(text,
                                          add_special_tokens=False).tokens

    def convert_ids_to_tokens(self, index):
        """Map an id to its token; out-of-range ids map to the unk token."""
        # BUGFIX: the original returned the bound method `self.unk` (a
        # method object) for out-of-range ids instead of calling it.
        return self.ids_to_tokens[index] if index < self.vocab_size else self.unk()

    def _convert_token_to_id(self, token):
        # Raises KeyError for unknown tokens (see id() for the lenient path).
        return self.vocab[token]

    def decode(self, x: str) -> str:
        """Decode a whitespace-separated string of token ids back to text."""
        return self.bert_tokenizer.decode([int(tok) for tok in x.split()])

    def pad(self):
        return "[PAD]"

    def bos(self):
        return "[CLS]"

    def eos(self):
        return "[SEP]"

    def unk(self):
        return "[UNK]"

    def mask(self):
        return "<mask>"

    def sym(self, id):
        """Return the token for `id` (IndexError when out of range)."""
        return self.ids_to_tokens[id]

    def id(self, sym):
        """Return the id for `sym`; unknown symbols map to 1."""
        return self.vocab[sym] if sym in self.vocab else 1

    def save_pretrained(self, path: str, filename_prefix: str = None):
        """Write the vocabulary (one token per line) under `path`.

        Returns a 1-tuple with the written file's full path.
        """
        filename = VOCAB_FILES_NAMES[list(VOCAB_FILES_NAMES.keys())[0]]
        if filename_prefix is not None:
            filename = filename_prefix + "-" + filename
        full_path = os.path.join(path, filename)
        # BUGFIX: file was opened in binary mode ("wb") while writing str,
        # which raises TypeError at the first write; open in text mode.
        with open(full_path, "w", encoding="utf-8") as fs:
            for item in self.ids_to_tokens:
                fs.write(str(item) + '\n')
        return (full_path, )

    def _run_strip_accents(self, text):
        """Strips accents from a piece of text."""
        text = unicodedata.normalize("NFD", text)
        output = []
        for char in text:
            # "Mn" = nonspacing combining mark (the accent itself).
            if unicodedata.category(char) == "Mn":
                continue
            output.append(char)
        return "".join(output)

    def _run_split_on_punc(self, text):
        """Splits punctuation on a piece of text."""
        chars = list(text)
        i = 0
        start_new_word = True
        output = []
        while i < len(chars):
            char = chars[i]
            if _is_punctuation(char):
                # Each punctuation char becomes its own output chunk.
                output.append([char])
                start_new_word = True
            else:
                if start_new_word:
                    output.append([])
                start_new_word = False
                output[-1].append(char)
            i += 1
        return ["".join(x) for x in output]
# NOTE(review): this excerpt begins mid-function — the enclosing def
# (presumably `load_metrics_obj`) opens above this chunk; the indentation
# below is reconstructed accordingly and should be confirmed.
    with open(metrics_path, 'rb') as metrics_handle:
        metrics_obj = pickle.load(metrics_handle)
    # Cached metrics come back as four parallel collections.
    return (metrics_obj['token_pairs'],
            metrics_obj['decoded_pairs'],
            metrics_obj['jaccard_similarities'],
            metrics_obj['levenshtein_distances'])


token_pairs, decoded_pairs, jaccard_similarities, levenshtein_distances = load_metrics_obj()

# Each stage is recomputed only when missing from the cache, then re-saved
# so later runs can resume where this one left off.
if not token_pairs:
    token_pairs = [([tokenizer.id_to_token(x) for x in ocr_tokens[i]],
                    [tokenizer.id_to_token(x) for x in gs_tokens[i]])
                   for i in range(len(ocr_tokens))]
    save_metrics_obj(token_pairs, decoded_pairs, jaccard_similarities,
                     levenshtein_distances)

if not decoded_pairs:
    decoded_pairs = [(tokenizer.decode(ocr_tokens[i]),
                      tokenizer.decode(gs_tokens[i]))
                     for i in range(len(ocr_tokens))]
    save_metrics_obj(token_pairs, decoded_pairs, jaccard_similarities,
                     levenshtein_distances)

all_pairs = len(token_pairs)

if not jaccard_similarities:
    jaccard_similarities = []
    for i, token_pair in enumerate(token_pairs):
        jaccard_similarities.append(
            calculate_jaccard_similarity(token_pair[0], token_pair[1]))
    # NOTE(review): save placement (inside vs after the loop) is ambiguous
    # in the collapsed original; placed after the loop to match the two
    # branches above — confirm against the full file.
    save_metrics_obj(token_pairs, decoded_pairs, jaccard_similarities,
                     levenshtein_distances)

if not levenshtein_distances:
    levenshtein_distances = []

# Resume support: only compute the pairs that are still missing.
# NOTE(review): the loop body continues past the end of this excerpt.
if len(levenshtein_distances) < all_pairs:
    for i, decoded_pair in enumerate(decoded_pairs):
# Demo: WordPiece-tokenize a noisy social-media sentence with a BERT vocab.
from tokenizers import BertWordPieceTokenizer

# My arbitrary sentence
sentence = "[CLS] will be back later. www.facebook.com ,will be back later, loooove u @mahboi #blessed"

# Instantiate a Bert tokenizer from the bert-large-uncased vocabulary,
# register a custom [LINK] token, and pad every encoding to 100 tokens.
tokenizer = BertWordPieceTokenizer("bert-large-uncased-vocab.txt",
                                   lowercase=True,
                                   clean_text=True)
tokenizer.add_tokens(['[LINK]'])
tokenizer.enable_padding(max_length=100)

WordPieceEncoder = tokenizer.encode(sentence)

# Print the ids, tokens and offsets of the encoding, the [PAD] id,
# and the round-tripped text.
print(WordPieceEncoder.ids)
print(WordPieceEncoder.tokens)
print(WordPieceEncoder.offsets)
print(tokenizer.get_vocab()['[PAD]'])
print(tokenizer.decode(WordPieceEncoder.ids))
# NOTE(review): this excerpt begins mid-expression — the model constructor
# that `dec_seq_len=512)` closes opens above this chunk.
    dec_seq_len=512)

checkpoint = torch.load(
    'checkpoints/amadeus-performer-2020-11-25-00.20.57-300.pt')
model.eval(True)
# model.load_state_dict(torch.load('models/amadeus-performer-2020-11-06-12.47.52.pt'))
model.load_state_dict(checkpoint['model_state_dict'])
model.cuda()

run = True
sentences = []
# Simple chat REPL: the last 3 encoded sentences form the rolling context.
while run:
    try:
        sentence = input('> ')
        if sentence in ['quit', 'exit']:
            run = False
            continue
        sentences.append(tokenizer.encode(sentence))
        if len(sentences) > 3:
            sentences = sentences[-3:]
        # Merge the context encodings into one id sequence for the model.
        input_seq = torch.tensor(Encoding.merge(sentences[:]).ids).cuda()
        start_tokens = torch.tensor([tokenizer.token_to_id('[CLS]')]).cuda()
        out = model.generate(input_seq=input_seq,
                             start_tokens=start_tokens,
                             eos_token=tokenizer.token_to_id('[SEP]'))
        response = tokenizer.decode(out.tolist())
        # The model's reply also becomes part of the context.
        sentences.append(tokenizer.encode(response))
        print(response)
    except KeyboardInterrupt:
        run = False