def test_convert_tokens_to_ids(self): vocab_tokens = [ "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", "##ing" ] vocab = {token: i for i, token in enumerate(vocab_tokens)} self.assertAllEqual( tokenization.convert_tokens_to_ids( vocab, ["un", "##want", "##ed", "runn", "##ing"]), [7, 4, 5, 8, 9])
def test_convert_tokens_to_ids(self):
    vocab_tokens = [
        "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un",
        "runn", "##ing"
    ]

    vocab = {}
    for (i, token) in enumerate(vocab_tokens):
        vocab[token] = i

    self.assertListEqual(
        tokenization.convert_tokens_to_ids(
            vocab, ["un", "##want", "##ed", "runn", "##ing"]),
        [7, 4, 5, 8, 9])
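The expected ids in these tests are just vocabulary indices: in the original BERT `tokenization` module, `convert_tokens_to_ids` amounts to a per-token dictionary lookup. A minimal sketch of that behavior (an illustration, not the library source verbatim):

def convert_tokens_to_ids_sketch(vocab, tokens):
    # Every token must already be in the vocabulary; there is no [UNK]
    # fallback here, which is why callers below map OOV tokens themselves.
    return [vocab[token] for token in tokens]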
def get_padded_tokens(
        tokens,
        tags,
        flags,
        bounds,
        extra_features,
        vocabs,
        max_seq_length,
):
    # Reserve extra room for the keyword prompt appended below.
    max_seq_length += 64
    # extend() over a string appends it character by character, matching the
    # character-level Chinese tokens. The string is a list of finance-related
    # keywords: "which financial companies, platforms, centers, coins, banks,
    # funds, forex, groups, chains, shares, malls, shops, capital, homes,
    # fintech, exchanges, wealth management, loans".
    tokens.extend('有哪些金融公司、平台、中心、币、银行、基金、外汇、集团、链、股份、商城、店、资本、家园、金服、交易所、理财、贷款')
    tokens.append('[SEP]')
    tokens = [
        token.lower() if token not in ['[CLS]', '[SEP]'] else token
        for token in tokens
    ]
    tokens = [token if token in vocabs else '[UNK]' for token in tokens]
    input_ids = tokenization.convert_tokens_to_ids(vocabs, tokens)
    input_mask = [1] * len(input_ids)
    # tag_ids = [BIO_TAG2ID[tag] for tag in tags]
    begin_tag_ids = [1 if tag == 'B' else 0 for tag in tags]
    end_tag_ids = [1 if tag == 'E' else 0 for tag in tags]
    pairs = get_span_from_tags(tags)
    # flag_ids = [POS_FLAGS_TO_IDS[flag] for flag in flags]
    # bound_ids = [WORD_BOUNDS_TO_IDS[bound] for bound in bounds]
    assert len(input_ids) <= max_seq_length, "len:{}".format(len(input_ids))
    # to_pad = [0] * (max_seq_length - len(input_ids))
    # fea_to_pad = [[0] * len(extra_features[0])] * (max_seq_length - len(input_ids))
    # The tag sequences only cover the original tokens; pad them with -1 over
    # the appended keyword prompt, then pad everything to max_seq_length.
    while len(begin_tag_ids) < len(tokens):
        begin_tag_ids.append(-1)
        end_tag_ids.append(-1)
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        # flag_ids.append(0)
        begin_tag_ids.append(-1)
        end_tag_ids.append(-1)
        # bound_ids.append(0)
        # extra_features.append([0]*len(extra_features[0]))
    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(begin_tag_ids) == max_seq_length
    assert len(end_tag_ids) == max_seq_length
    return input_ids, input_mask, (begin_tag_ids, end_tag_ids, pairs
                                   )  # , flag_ids, bound_ids, extra_features
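`get_span_from_tags` is defined elsewhere in that project. Judging from the begin/end tag encoding above, a plausible reconstruction (an assumption, not the original helper) pairs each 'B' tag with the next 'E' tag:

def get_span_from_tags(tags):
    # Hypothetical helper: collect (begin, end) index pairs from B/E tags.
    pairs, start = [], None
    for i, tag in enumerate(tags):
        if tag == 'B':
            start = i
        elif tag == 'E' and start is not None:
            pairs.append((start, i))
            start = None
    return pairs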
def get_padded_tokens(tokens, tags, flags, bounds, extra_features, vocabs,
                      max_seq_length, pad='after'):
    tokens = [
        token.lower() if token not in ['[CLS]', '[SEP]'] else token
        for token in tokens
    ]
    tokens = [token if token in vocabs else '[UNK]' for token in tokens]
    input_ids = tokenization.convert_tokens_to_ids(vocabs, tokens)
    input_mask = [1] * len(input_ids)
    tag_ids = [BIO_TAG2ID[tag] for tag in tags]
    flag_ids = [POS_FLAGS_TO_IDS[flag] for flag in flags]
    bound_ids = [WORD_BOUNDS_TO_IDS[bound] for bound in bounds]
    assert len(input_ids) <= max_seq_length, "len:{}".format(len(input_ids))
    to_pad = [0] * (max_seq_length - len(input_ids))
    fea_to_pad = [[0] * len(extra_features[0])
                  ] * (max_seq_length - len(input_ids))
    if pad == 'before':
        input_ids = to_pad + input_ids
        input_mask = to_pad + input_mask
        tag_ids = to_pad + tag_ids
        flag_ids = to_pad + flag_ids
        bound_ids = to_pad + bound_ids
        extra_features = fea_to_pad + extra_features
    elif pad == 'after':
        input_ids = input_ids + to_pad
        input_mask = input_mask + to_pad
        tag_ids = tag_ids + to_pad
        flag_ids = flag_ids + to_pad
        bound_ids = bound_ids + to_pad
        extra_features = extra_features + fea_to_pad
    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(tag_ids) == max_seq_length
    return input_ids, input_mask, tag_ids, flag_ids, bound_ids, extra_features
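A minimal usage sketch for this padding helper, assuming toy label/feature mappings and a toy vocabulary (none of these constants come from the snippet itself):

# Hypothetical mappings; the real BIO_TAG2ID / POS_FLAGS_TO_IDS /
# WORD_BOUNDS_TO_IDS live elsewhere in the project.
BIO_TAG2ID = {'O': 0, 'B': 1, 'I': 2}
POS_FLAGS_TO_IDS = {'n': 0, 'v': 1}
WORD_BOUNDS_TO_IDS = {'B': 0, 'M': 1, 'E': 2, 'S': 3}

vocabs = {'[CLS]': 0, '[SEP]': 1, '[UNK]': 2, 'hello': 3, 'world': 4}
tokens = ['[CLS]', 'hello', 'world', '[SEP]']
tags = ['O', 'B', 'I', 'O']
flags = ['n', 'n', 'v', 'n']
bounds = ['S', 'B', 'E', 'S']
extra_features = [[0.0], [1.0], [1.0], [0.0]]

input_ids, input_mask, tag_ids, flag_ids, bound_ids, feats = get_padded_tokens(
    tokens, tags, flags, bounds, extra_features, vocabs, max_seq_length=8)
# input_ids  -> [0, 3, 4, 1, 0, 0, 0, 0]
# input_mask -> [1, 1, 1, 1, 0, 0, 0, 0]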
def __init__(self):
    # Assumes `import numpy as np` and the BERT `tokenization` module are
    # available at module level.
    vocab_file = 'vocab.txt'
    vocab = tokenization.load_vocab(vocab_file=vocab_file)
    tokenizer = tokenization.WordpieceTokenizer(vocab=vocab)

    path = 'train_processed.txt'
    train_file = open(path, 'r', encoding='utf-8')
    lines = train_file.read().split('\n')

    # The longest sentence (in characters) determines the sequence length;
    # one extra slot is reserved for the leading [CLS] token.
    max_length = 0
    for i in range(len(lines)):
        TK = lines[i].split(' \t')
        if max_length < len(TK[0]):
            max_length = len(TK[0])
    max_length += 1

    self.input_ids = np.zeros(shape=[len(lines), max_length], dtype=np.int32)
    self.input_mask = np.zeros(shape=[len(lines), max_length], dtype=np.int32)
    self.label = np.zeros(shape=[len(lines)], dtype=np.int32)

    for i in range(len(lines) - 1):
        TK = lines[i].split(' \t')
        if len(TK) != 2:
            TK = lines[i].split('\t')
        sentence = TK[0]
        token = tokenizer.tokenize(sentence)
        tk_ids = tokenization.convert_tokens_to_ids(vocab=vocab, tokens=token)
        for j in range(len(tk_ids)):
            self.input_ids[i, j + 1] = tk_ids[j]
            self.input_mask[i, j + 1] = 1
        self.input_ids[i, 0] = tokenization.convert_tokens_to_ids(
            vocab=vocab, tokens=['[CLS]'])[0]
        self.input_mask[i, 0] = 1
        self.label[i] = int(TK[1])

    # Same preprocessing for the test split.
    path = 'test_processed.txt'
    test_file = open(path, 'r', encoding='utf-8')
    lines = test_file.read().split('\n')

    max_length = 0
    for i in range(len(lines)):
        TK = lines[i].split(' \t')
        if max_length < len(TK[0]):
            max_length = len(TK[0])
    print(max_length)
    max_length += 1

    self.test_input_ids = np.zeros(shape=[len(lines), max_length],
                                   dtype=np.int32)
    self.test_input_ids_masking = np.zeros(shape=[len(lines), max_length],
                                           dtype=np.int32)
    self.test_label = np.zeros(shape=[len(lines)], dtype=np.int32)

    for i in range(len(lines) - 1):
        TK = lines[i].split(' \t')
        if len(TK) != 2:
            TK = lines[i].split('\t')
        sentence = TK[0]
        token = tokenizer.tokenize(sentence)
        tk_ids = tokenization.convert_tokens_to_ids(vocab=vocab, tokens=token)
        for j in range(len(tk_ids)):
            self.test_input_ids[i, j + 1] = tk_ids[j]
            self.test_input_ids_masking[i, j + 1] = 1
        self.test_input_ids[i, 0] = tokenization.convert_tokens_to_ids(
            vocab=vocab, tokens=['[CLS]'])[0]
        self.test_input_ids_masking[i, 0] = 1
        self.test_label[i] = int(TK[1])

    self.Batch_Size = 8
    self.random_idx = np.array(range(self.label.shape[0]), dtype=np.int32)
    np.random.shuffle(self.random_idx)
    self.Batch_Idx = 0
    self.Test_Batch_Idx = 0
def convert_tokens_to_ids(self, tokens):
    return tokenization.convert_tokens_to_ids(self.vocab, tokens)
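This one-line wrapper only works inside a class that carries `self.vocab`; a hypothetical minimal host class (not part of the original snippet) might look like this:

class SimpleTokenizer(object):
    """Hypothetical host class for the wrapper method above."""

    def __init__(self, vocab_file):
        self.vocab = tokenization.load_vocab(vocab_file)

    def convert_tokens_to_ids(self, tokens):
        return tokenization.convert_tokens_to_ids(self.vocab, tokens)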
import tensorflow as tf

import tokenization

vocab_pass = '******'

p_tokens = [
    '[Subject]', 'John', 'and', 'Michael', '[equalTo]', 'genius', '[when]',
    'morning'
]
predict_ids = tokenization.convert_tokens_to_ids(
    tokenization.load_vocab(vocab_pass), p_tokens)

r_tokens = [
    '[Subject]', 'John', 'and', 'Michael', '[equalTo]', 'smart', '[when]',
    'afternoon', 'and', 'evening'
]
real_ids = tokenization.convert_tokens_to_ids(
    tokenization.load_vocab(vocab_pass), r_tokens)

predict_tensor = tf.constant([[1, 2, 3], [98, 1, 6], [1, 2, 4], [22, 1, 6],
                              [3, 2, 3], [7, 1, 6], [0, 2, 3], [11, 1, 9]],
                             dtype=float)
real_tensor = tf.constant(
    [[1, 2, 3], [12, 8, 1], [1, 2, 4], [12, 8, 1], [3, 2, 3], [12, 8, 1],
     [0, 2, 3], [12, 8, 1], [1, 2, 4], [12, 8, 1]],
    dtype=float)


def is_special_id(id):
    vocab = tokenization.load_vocab(vocab_pass)
    inv_vocab = {v: k for k, v in vocab.items()}
    special_tokens = []
    for token in vocab.keys():
        # The original snippet breaks off here; the rest of the body is a
        # guess that treats bracketed vocabulary entries such as '[Subject]'
        # or '[when]' as the special tokens being collected.
        if token.startswith('[') and token.endswith(']'):
            special_tokens.append(token)
    return inv_vocab.get(id) in special_tokens