def _get_vocabs(self):
    """Build a word-level and a character-level Vocab over all datasets."""
    word_list = []
    char_list = []
    for ds in self._datasets:
        for item in ds:
            words = self._get_word_tokens(item[1])
            word_list.extend(words)
            for word in words:
                char_list.extend(iter(word))

    word_counter = data.count_tokens(word_list)
    char_counter = data.count_tokens(char_list)
    word_vocab = Vocab(word_counter)
    char_vocab = Vocab(char_counter)

    # embedding_zh = gluonnlp.embedding.create('fasttext', source='cc.zh.300')
    # embedding_eng = gluonnlp.embedding.create('fasttext', source='cc.en.300')
    # embedding_ko = gluonnlp.embedding.create('fasttext', source='cc.ko.300')
    # word_vocab.set_embedding(embedding_eng, embedding_zh, embedding_ko)
    #
    # count = 0
    # for token, times in word_counter.items():
    #     if (word_vocab.embedding[token].sum() != 0).asscalar():
    #         count += 1
    #     else:
    #         print(token)
    #
    # print("{}/{} words have embeddings".format(count, len(word_counter)))

    return word_vocab, char_vocab
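
# A hedged usage sketch (not part of the original code): given the
# (word_vocab, char_vocab) pair returned above, a tokenized sentence can be
# encoded at both levels. The token list below is illustrative only.
words = ['deep', 'learning']
word_ids = word_vocab[words]                      # one index per word
char_ids = [char_vocab[list(w)] for w in words]   # one index list per word
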
def test_berttokenizer():
    # test WordpieceTokenizer
    vocab_tokens = ["want", "##want", "##ed", "wa", "un", "runn", "##ing"]
    vocab = Vocab(
        count_tokens(vocab_tokens),
        reserved_tokens=["[CLS]", "[SEP]"],
        unknown_token="[UNK]",
        padding_token=None,
        bos_token=None,
        eos_token=None)
    tokenizer = t.BERTTokenizer(vocab=vocab)
    assert tokenizer(u"unwanted running") == [
        "un", "##want", "##ed", "runn", "##ing"]
    assert tokenizer(u"unwantedX running") == ["[UNK]", "runn", "##ing"]
    assert tokenizer.is_first_subword('un')
    assert not tokenizer.is_first_subword('##want')

    # test BERTTokenizer
    vocab_tokens = ["[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un",
                    "runn", "##ing", ","]
    vocab = Vocab(
        count_tokens(vocab_tokens),
        reserved_tokens=["[CLS]", "[SEP]"],
        unknown_token="[UNK]",
        padding_token=None,
        bos_token=None,
        eos_token=None)
    tokenizer = t.BERTTokenizer(vocab=vocab)
    tokens = tokenizer(u"UNwant\u00E9d,running")
    assert tokens == ["un", "##want", "##ed", ",", "runn", "##ing"]
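
# A minimal, self-contained sketch (assuming gluonnlp is installed) of the same
# WordPiece tokenization outside the test harness, followed by the usual lookup
# of subword ids through the vocabulary. Variable names here are illustrative.
import gluonnlp
from gluonnlp.data import count_tokens, BERTTokenizer

wp_vocab = gluonnlp.Vocab(
    count_tokens(["want", "##want", "##ed", "wa", "un", "runn", "##ing"]),
    reserved_tokens=["[CLS]", "[SEP]"], unknown_token="[UNK]",
    padding_token=None, bos_token=None, eos_token=None)
wp_tokenizer = BERTTokenizer(vocab=wp_vocab)
subwords = wp_tokenizer(u"unwanted running")  # ['un', '##want', '##ed', 'runn', '##ing']
subword_ids = wp_vocab[subwords]              # integer ids usable as model input
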
def test_bert_sentences_transform():
    text_a = u'is this jacksonville ?'
    text_b = u'no it is not'
    vocab_tokens = ['is', 'this', 'jack', '##son', '##ville', '?', 'no', 'it',
                    'is', 'not']

    bert_vocab = BERTVocab(count_tokens(vocab_tokens))
    tokenizer = t.BERTTokenizer(vocab=bert_vocab)

    # test BERTSentenceTransform
    bert_st = t.BERTSentenceTransform(tokenizer, 15, pad=True, pair=True)
    token_ids, length, type_ids = bert_st((text_a, text_b))

    text_a_tokens = ['is', 'this', 'jack', '##son', '##ville', '?']
    text_b_tokens = ['no', 'it', 'is', 'not']
    text_a_ids = bert_vocab[text_a_tokens]
    text_b_ids = bert_vocab[text_b_tokens]

    cls_ids = bert_vocab[[bert_vocab.cls_token]]
    sep_ids = bert_vocab[[bert_vocab.sep_token]]
    pad_ids = bert_vocab[[bert_vocab.padding_token]]

    concated_ids = cls_ids + text_a_ids + sep_ids + text_b_ids + sep_ids + pad_ids
    valid_token_ids = np.array([pad_ids[0]] * 15, dtype=np.int32)
    for i, x in enumerate(concated_ids):
        valid_token_ids[i] = x
    valid_type_ids = np.zeros((15,), dtype=np.int32)
    start = len(text_a_tokens) + 2
    end = len(text_a_tokens) + 2 + len(text_b_tokens) + 1
    valid_type_ids[start:end] = 1

    assert all(token_ids == valid_token_ids)
    assert length == len(vocab_tokens) + 3
    assert all(type_ids == valid_type_ids)
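
# For reference, the sentence-pair layout that the assertions above encode
# (seq_length=15):
#   tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not [SEP] [PAD] [PAD]
#   type_ids:   0   0    0    0    0      0    0   0    1  1  1   1    1     0     0
#   length:   13 valid (non-padding) positions
# That is, [CLS], sentence A and the first [SEP] carry segment id 0, sentence B
# and the second [SEP] carry segment id 1, and padding positions stay 0.
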
def test_gluon_nlp(self):
    # get corpus statistics
    counter = count_tokens(['alpha', 'beta', 'gamma', 'beta'])
    # create Vocab
    vocab = Vocab(counter)
    # find index based on token
    self.assertEqual(4, vocab['beta'])
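
# Why index 4: by default Vocab reserves its four special tokens ('<unk>',
# '<pad>', '<bos>', '<eos>') at indices 0-3 and then indexes corpus tokens by
# decreasing frequency, so 'beta' (which occurs twice) gets index 4. A quick
# check:
#
#   vocab.idx_to_token[:5]  # the four special tokens followed by 'beta'
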
def test_bert_dataset_transform():
    text_a = u'is this jacksonville ?'
    text_b = u'no it is not'
    label_cls = 0
    vocab_tokens = [
        'is', 'this', 'jack', '##son', '##ville', '?', 'no', 'it', 'is', 'not'
    ]

    bert_vocab = BERTVocab(count_tokens(vocab_tokens))
    tokenizer = BERTTokenizer(vocab=bert_vocab)

    # test BERTDatasetTransform for classification task
    bert_cls_dataset_t = BERTDatasetTransform(tokenizer, 15,
                                              labels=[label_cls],
                                              pad=True, pair=True,
                                              label_dtype='int32')
    token_ids, length, type_ids, label_ids = bert_cls_dataset_t(
        (text_a, text_b, label_cls))

    text_a_tokens = ['is', 'this', 'jack', '##son', '##ville', '?']
    text_b_tokens = ['no', 'it', 'is', 'not']
    text_a_ids = bert_vocab[text_a_tokens]
    text_b_ids = bert_vocab[text_b_tokens]

    cls_ids = bert_vocab[[bert_vocab.cls_token]]
    sep_ids = bert_vocab[[bert_vocab.sep_token]]
    pad_ids = bert_vocab[[bert_vocab.padding_token]]

    concated_ids = cls_ids + text_a_ids + sep_ids + text_b_ids + sep_ids + pad_ids
    valid_token_ids = np.array([pad_ids[0]] * 15, dtype=np.int32)
    for i, x in enumerate(concated_ids):
        valid_token_ids[i] = x
    valid_type_ids = np.zeros((15, ), dtype=np.int32)
    start = len(text_a_tokens) + 2
    end = len(text_a_tokens) + 2 + len(text_b_tokens) + 1
    valid_type_ids[start:end] = 1

    assert all(token_ids == valid_token_ids)
    assert length == len(vocab_tokens) + 3
    assert all(type_ids == valid_type_ids)
    assert all(label_ids == np.array([label_cls], dtype=np.int32))

    # test BERTDatasetTransform for regression task
    label_reg = 0.2
    bert_reg_dataset_t = BERTDatasetTransform(tokenizer, 15, pad=True,
                                              pair=True,
                                              label_dtype='float32')
    token_ids, length, type_ids, label_reg_val = bert_reg_dataset_t(
        (text_a, text_b, label_reg))
    assert all(token_ids == valid_token_ids)
    assert length == len(vocab_tokens) + 3
    assert all(type_ids == valid_type_ids)
    assert all(label_reg_val == np.array([label_reg], dtype=np.float32))
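
# Note on the label outputs asserted above: the classification transform
# returns the label as a one-element int32 array, while the regression
# transform passes the raw float label through as a one-element float32 array.
# The token ids, valid length and segment ids are identical in both cases.
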
def _create_squad_vocab(tokenization_fn, dataset):
    # Tokenize the two text fields of every record and build one Vocab over
    # all resulting tokens.
    all_tokens = []
    for data_item in dataset:
        all_tokens.extend(tokenization_fn(data_item[1]))
        all_tokens.extend(tokenization_fn(data_item[2]))
    counter = data.count_tokens(all_tokens)
    vocab = Vocab(counter)
    return vocab
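
# A hedged usage sketch with toy data and a whitespace tokenizer; the helper
# only assumes that fields 1 and 2 of each record hold the two text strings.
toy_dataset = [
    (0, 'what is gluonnlp ?', 'gluonnlp is a toolkit for natural language processing'),
]
toy_vocab = _create_squad_vocab(str.split, toy_dataset)
print(len(toy_vocab))  # vocabulary size, including the default special tokens
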
def __call__(self, example):
    """Maps an example to its distinct tokens and their counts.

    Parameters
    ----------
    example : dict
        Example to process, with context_tokens and ques_tokens keys

    Returns
    -------
    mapped_values : List[Tuple]
        Result of the mapping process. Each tuple is in (token, count) format
    """
    para_counter = data.count_tokens(
        example['context_tokens'] if not self._iterate_over_example
        else [c for tkn in example['context_tokens'] for c in tkn])
    ques_counter = data.count_tokens(
        example['ques_tokens'] if not self._iterate_over_example
        else [c for tkn in example['ques_tokens'] for c in tkn])
    counter = para_counter + ques_counter
    return list(counter.items())
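
# Hedged illustration of the same counting logic on a toy example, for the
# word-level case (i.e. _iterate_over_example is False); the dict below is
# illustrative only.
from gluonnlp import data

example = {'context_tokens': ['the', 'sky', 'is', 'blue'],
           'ques_tokens': ['what', 'colour', 'is', 'the', 'sky']}
counter = (data.count_tokens(example['context_tokens'])
           + data.count_tokens(example['ques_tokens']))
print(list(counter.items()))  # e.g. [('the', 2), ('sky', 2), ('is', 2), ...]
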
def get_vocab(datasets):
    all_words = [
        word for dataset in datasets for item in dataset for word in item[0]
    ]
    vocab = Vocab(data.count_tokens(all_words))
    glove = embedding.create(
        'glove', source='glove.6B.' + str(args.embedding_dim) + 'd')
    vocab.set_embedding(glove)
    return vocab
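
# A common follow-up (sketch, assuming MXNet Gluon is available): initialise an
# embedding layer from the GloVe vectors attached to the vocabulary above;
# `vocab` and `args.embedding_dim` refer to the function and script above.
from mxnet import gluon

embedding_layer = gluon.nn.Embedding(len(vocab), args.embedding_dim)
embedding_layer.initialize()
embedding_layer.weight.set_data(vocab.embedding.idx_to_vec)
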
def read_data(word_path, label_path, nature_path, max_seq_len, PAD, NOT,
              PAD_NATURE, UNK):
    '''Read, for each sentence, its words, their part-of-speech (POS) tags and
    the entity labels attached to those words. Every sentence is truncated or
    padded to max_seq_len words: words are padded with PAD, POS tags with
    PAD_NATURE and labels with NOT. Vocabularies for the words, the POS tags
    and the labels are built, each reserving the unknown symbol UNK.

    Args:
        word_path: path to the file with the words of each sentence
        label_path: path to the file with the entity labels of each sentence
        nature_path: path to the file with the POS tags of each sentence
        max_seq_len: maximum sentence length, in words
        PAD: padding symbol for words
        NOT: padding symbol for labels
        PAD_NATURE: padding symbol for POS tags
        UNK: unknown-token symbol

    Returns:
        word_vocab: vocabulary of the words
        label_vocab: vocabulary of the entity labels
        nature_vocab: vocabulary of the POS tags
        input_seqs: word sequences of all sentences, [[word1, word2, ...], ...]
        output_seqs: label sequences of all sentences, [[label1, label2, ...], ...]
        nature_seqs: POS-tag sequences of all sentences, [[nature1, nature2, ...], ...]
    '''
    input_tokens, output_tokens, nature_tokens = [], [], []
    input_seqs, output_seqs, nature_seqs = [], [], []
    with open(word_path, 'r', encoding='utf-8') as fx, \
            open(label_path, 'r', encoding='utf-8') as fy, \
            open(nature_path, 'r', encoding='utf-8') as fn:
        word_lines = fx.readlines()
        label_lines = fy.readlines()
        word_natures = fn.readlines()
        assert len(word_lines) == len(word_natures)
        assert len(word_natures) == len(label_lines)
        for word_line, label_line, word_nature in zip(word_lines, label_lines,
                                                      word_natures):
            input_seq = word_line.strip()
            output_seq = label_line.strip()
            nature_seq = word_nature.strip()
            cur_input_tokens = input_seq.split(' ')
            cur_output_tokens = output_seq.split(' ')
            cur_nature_tokens = nature_seq.split(' ')
            assert len(cur_input_tokens) == len(cur_output_tokens)
            assert len(cur_output_tokens) == len(cur_nature_tokens)
            # skip sentences with malformed entity annotations
            if '' in cur_output_tokens:
                continue
            # if-else: truncate long sequences, pad short ones
            if len(cur_input_tokens) < max_seq_len \
                    or len(cur_output_tokens) < max_seq_len \
                    or len(cur_nature_tokens) < max_seq_len:
                # append padding symbols until every sequence has length max_seq_len
                while len(cur_input_tokens) < max_seq_len:
                    cur_input_tokens.append(PAD)
                    cur_output_tokens.append(NOT)
                    cur_nature_tokens.append(PAD_NATURE)
            else:
                cur_input_tokens = cur_input_tokens[0:max_seq_len]
                cur_output_tokens = cur_output_tokens[0:max_seq_len]
                cur_nature_tokens = cur_nature_tokens[0:max_seq_len]
            input_tokens.extend(cur_input_tokens)
            output_tokens.extend(cur_output_tokens)
            nature_tokens.extend(cur_nature_tokens)
            # record the per-sentence sequences
            input_seqs.append(cur_input_tokens)
            output_seqs.append(cur_output_tokens)
            nature_seqs.append(cur_nature_tokens)
    # build the vocabularies
    word_vocab = Vocab(count_tokens(input_tokens), unknown_token=UNK,
                       padding_token=PAD)
    label_vocab = Vocab(count_tokens(output_tokens), unknown_token=UNK,
                        padding_token=NOT)
    nature_vocab = Vocab(count_tokens(nature_tokens), unknown_token=UNK,
                         padding_token=PAD_NATURE)
    return word_vocab, label_vocab, nature_vocab, input_seqs, output_seqs, nature_seqs
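
# Hedged usage sketch: the vocabularies returned above convert the padded
# token sequences into index sequences a model can consume. File paths and the
# PAD / NOT / PAD_NATURE / UNK symbols below are placeholder values.
word_vocab, label_vocab, nature_vocab, input_seqs, output_seqs, nature_seqs = \
    read_data(word_path, label_path, nature_path, max_seq_len=50,
              PAD='<pad>', NOT='O', PAD_NATURE='<pad_nat>', UNK='<unk>')
word_ids = word_vocab.to_indices(input_seqs[0])    # list of int, length max_seq_len
label_ids = label_vocab.to_indices(output_seqs[0])
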