def process_data(loadfile, savefile, vocab):
    """Convert a SQuAD-style JSON dataset into word/char index features.

    Reads ``loadfile`` (expects a top-level ``"data"`` list of entries, each
    with ``"paragraphs"`` containing ``title``/``context``/``qas``), tokenizes
    every question/context/title with :class:`JiebaTokenizer`, and writes the
    resulting feature dicts to ``savefile`` as JSON.

    Args:
        loadfile (str): Path to the input JSON dataset.
        savefile (str): Path where the processed examples are dumped.
        vocab: Vocabulary used both for jieba tokenization and for the
            word/char-to-index lookups.

    Returns:
        list[dict]: One feature dict per question containing the qas id,
        word- and char-level index lists, the ``is_impossible`` flag, and
        ``start_positions``/``end_positions`` character offsets.
    """
    tokens = JiebaTokenizer(vocab)
    with open(loadfile, mode="r", encoding="utf8") as rfp:
        input_data = json.load(rfp)["data"]
    new_examples = []
    logger.info("Processing dataset %s." % loadfile)
    for entry in input_data:
        for paragraph in tqdm(entry["paragraphs"], desc="process"):
            title = paragraph["title"].strip()
            context = paragraph["context"].strip()
            for qa in paragraph["qas"]:
                qas_id = qa['id']
                question = qa["question"].strip()
                tmp_dict = {}
                tmp_dict['qas_id'] = qas_id
                tmp_dict['question_w'] = word_to_idx(tokens.cut(question), vocab)
                tmp_dict['context_w'] = word_to_idx(tokens.cut(context), vocab)
                tmp_dict['title_w'] = word_to_idx(tokens.cut(title), vocab)
                tmp_dict['question_c'] = chars_to_idx(question, vocab)
                tmp_dict['context_c'] = chars_to_idx(context, vocab)
                tmp_dict['title_c'] = chars_to_idx(title, vocab)
                tmp_dict['is_impossible'] = 1 if qa["is_impossible"] else 0
                length = len(tmp_dict['context_c'])
                # NOTE: if a question carries several answers, each iteration
                # overwrites the previous span, so only the LAST answer's
                # positions survive — presumably intentional for training data;
                # verify against the caller if multi-answer eval is needed.
                for item in qa['answers']:
                    answer_start = int(item["answer_start"])
                    answer = item["text"].strip()
                    if answer_start == -1:
                        # Unanswerable: assign a random (but valid) position.
                        # randint has an INCLUSIVE upper bound, so cap at
                        # length - 1 to keep the label inside the context
                        # (max(..., 0) guards an empty context).
                        label = random.randint(0, max(length - 1, 0))
                        tmp_dict['start_positions'] = label
                        tmp_dict["end_positions"] = label
                    else:
                        # Start/end character index of the answer in the text.
                        start_char = answer_start
                        end_char = start_char + len(answer)
                        tmp_dict["start_positions"] = start_char
                        tmp_dict["end_positions"] = end_char
                new_examples.append(tmp_dict)
    with open(savefile, mode="w", encoding="utf-8") as wfp:
        json.dump(new_examples, wfp)
    logger.info("Saved the processed dataset %s." % savefile)
    return new_examples
def sentence_to_idx(sentence, embedding):
    """Tokenize ``sentence`` and map each word to its character indices.

    Args:
        sentence (str): Raw text to tokenize with jieba.
        embedding: Object exposing ``vocab.token_to_idx`` and
            ``vocab.unk_token`` for index lookups.

    Returns:
        list[dict]: One single-entry dict per token, keyed by the token's
        word index with a list of its per-character indices as the value.
    """
    tokenizer = JiebaTokenizer(embedding)
    # Hoist the vocab lookups out of the loop.
    token_to_idx = embedding.vocab.token_to_idx
    unk_token = embedding.vocab.unk_token
    result = []
    for token in tokenizer.cut(sentence):
        word_idx = get_idx_from_word(token, token_to_idx, unk_token)
        char_idxs = [get_idx_from_word(ch, token_to_idx, unk_token) for ch in token]
        result.append({word_idx: char_idxs})
    return result
class TestJiebaTokenizer(CpuCommonTest):
    """Unit tests for JiebaTokenizer's cut/encode/UNK behavior."""

    def setUp(self):
        # Build a vocab from the shared test fixture and wrap it in a tokenizer.
        test_data_file = create_test_data(__file__)
        self.vocab = Vocab.load_vocabulary(test_data_file, unk_token='[UNK]')
        self.tokenizer = JiebaTokenizer(self.vocab)

    def test_jieba(self):
        text = "一万一"
        tokens = self.tokenizer.cut(text)
        ids = self.tokenizer.encode(text)
        # encode() must agree with a vocab lookup of every cut token.
        for pos, token in enumerate(tokens):
            self.check_output_equal(self.vocab(token), ids[pos])
        # cut() must match the underlying jieba tokenizer's lcut output.
        raw_tokenizer = self.tokenizer.get_tokenizer()
        raw_tokens = raw_tokenizer.lcut(text, False, True)
        self.check_output_equal(tokens, raw_tokens)

    def test_unk(self):
        # A token absent from the vocab must encode to the UNK index.
        text = "中国"
        ids = self.tokenizer.encode(text)
        self.check_output_equal(self.vocab[self.vocab.unk_token] in ids, True)