Example #1
import json
import logging
import random

from paddlenlp.data import JiebaTokenizer
from tqdm import tqdm

# word_to_idx and chars_to_idx are project-specific helpers assumed to be
# defined elsewhere in this codebase (word- and character-level id lookup).
logger = logging.getLogger(__name__)


def process_data(loadfile, savefile, vocab):
    # Jieba tokenizer restricted to the words in the given vocabulary.
    tokens = JiebaTokenizer(vocab)
    with open(loadfile, mode="r", encoding="utf8") as rfp:
        input_data = json.load(rfp)["data"]
    new_examples = []
    logger.info("Processing dataset %s." % loadfile)
    for entry in input_data:
        for paragraph in tqdm(entry["paragraphs"], desc="process"):
            title = paragraph["title"].strip()
            context = paragraph["context"].strip()
            for qa in paragraph["qas"]:
                qas_id = qa['id']
                question = qa["question"].strip()
                tmp_dict = {}
                tmp_dict['qas_id'] = qas_id

                tmp_dict['question_w'] = word_to_idx(tokens.cut(question),
                                                     vocab)
                tmp_dict['context_w'] = word_to_idx(tokens.cut(context), vocab)
                tmp_dict['title_w'] = word_to_idx(tokens.cut(title), vocab)
                tmp_dict['question_c'] = chars_to_idx(question, vocab)
                tmp_dict['context_c'] = chars_to_idx(context, vocab)
                tmp_dict['title_c'] = chars_to_idx(title, vocab)
                tmp_dict['is_impossible'] = 1 if qa["is_impossible"] else 0
                length = len(tmp_dict['context_c'])
                for item in qa['answers']:
                    answer_start = int(item["answer_start"])
                    answer = item["text"].strip()
                    if answer_start == -1:
                        # Unanswerable question: use a random in-range
                        # character position as a placeholder label.
                        label = random.randint(0, max(length - 1, 0))
                        tmp_dict['start_positions'] = label
                        tmp_dict["end_positions"] = label
                    else:
                        # Start/end character index of the answer in the text.
                        start_char = answer_start
                        end_char = start_char + len(answer)
                        tmp_dict["start_positions"] = start_char
                        tmp_dict["end_positions"] = end_char
                    # Copy the dict so examples for different answers do not
                    # share (and silently overwrite) the same object.
                    new_examples.append(dict(tmp_dict))
    with open(savefile, mode="w", encoding="utf-8") as wfp:
        json.dump(new_examples, wfp)
    logger.info("Saved the processed dataset %s." % savefile)
    return new_examples
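
A minimal usage sketch for this function, assuming a PaddleNLP Vocab; the vocabulary file and dataset paths below are hypothetical and only illustrate the call.

from paddlenlp.data import Vocab

# Hypothetical paths, for illustration only.
vocab = Vocab.load_vocabulary("vocab.txt", unk_token="[UNK]", pad_token="[PAD]")
examples = process_data("train.json", "train.processed.json", vocab)
print("processed %d examples" % len(examples))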
Example #2
from paddlenlp.data import JiebaTokenizer

# get_idx_from_word is a project-level helper assumed to be defined elsewhere;
# it looks a token up in token_to_idx and falls back to the unk token.


def sentence_to_idx(sentence, embedding):
    # Map each segmented word to {word_id: [char_ids]} using the embedding's
    # vocabulary (embedding is assumed to expose a PaddleNLP Vocab as .vocab).
    chars_list = []
    # JiebaTokenizer expects the vocabulary itself, not the embedding object.
    tokens = JiebaTokenizer(embedding.vocab)
    word_list = tokens.cut(sentence)
    for word in word_list:
        tp_w = get_idx_from_word(word, embedding.vocab.token_to_idx,
                                 embedding.vocab.unk_token)
        tp_list = [
            get_idx_from_word(ch, embedding.vocab.token_to_idx,
                              embedding.vocab.unk_token) for ch in list(word)
        ]
        chars_list.append({tp_w: tp_list})
    return chars_list
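
A short usage sketch, assuming `embedding` is a PaddleNLP TokenEmbedding (which exposes its Vocab as `embedding.vocab`); the pretrained embedding name is just one of the available options.

from paddlenlp.embeddings import TokenEmbedding

# Downloads the pretrained embedding on first use.
embedding = TokenEmbedding("w2v.baidu_encyclopedia.target.word-word.dim300")
print(sentence_to_idx("自然语言处理很有趣", embedding))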
Example #3
from paddlenlp.data import JiebaTokenizer, Vocab

# CpuCommonTest and create_test_data are assumed to be helpers from the
# surrounding test suite (a unittest base class and a test-fixture builder).


class TestJiebaTokenizer(CpuCommonTest):
    def setUp(self):
        # Build a vocabulary from the generated test data and wrap it in a
        # JiebaTokenizer.
        test_data_file = create_test_data(__file__)
        self.vocab = Vocab.load_vocabulary(test_data_file, unk_token='[UNK]')
        self.tokenizer = JiebaTokenizer(self.vocab)

    def test_jieba(self):
        text = "一万一"
        token_arr = self.tokenizer.cut(text)
        idx_arr = self.tokenizer.encode(text)
        # encode should return the vocab id of every token produced by cut.
        for i, token in enumerate(token_arr):
            self.check_output_equal(self.vocab(token), idx_arr[i])

        # The underlying jieba tokenizer should produce the same segmentation
        # (lcut with cut_all=False, HMM=True).
        jieba_tokenizer = self.tokenizer.get_tokenizer()
        jieba_token_arr = jieba_tokenizer.lcut(text, False, True)
        self.check_output_equal(token_arr, jieba_token_arr)

    def test_unk(self):
        # "中国" is expected to fall outside the test vocabulary, so at least
        # one of its tokens should be mapped to the [UNK] id.
        text = "中国"
        idx_arr = self.tokenizer.encode(text)
        self.check_output_equal(self.vocab[self.vocab.unk_token] in idx_arr,
                                True)
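
The same behaviour can be checked outside the test harness; a minimal sketch, assuming a tiny vocabulary built with Vocab.build_vocab (the token list is made up for illustration).

from paddlenlp.data import JiebaTokenizer, Vocab

# Hypothetical toy vocabulary; real use would build from a full word list.
vocab = Vocab.build_vocab([["一万", "一", "万"]], unk_token="[UNK]")
tokenizer = JiebaTokenizer(vocab)

tokens = tokenizer.cut("一万一")   # word segmentation, e.g. ["一万", "一"]
ids = tokenizer.encode("一万一")   # the corresponding vocabulary ids
assert ids == [vocab[token] for token in tokens]

# A word that is not in the vocabulary is mapped to the [UNK] id.
assert vocab[vocab.unk_token] in tokenizer.encode("中国")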