def get_tokenizer(self, *args, **kwargs):
    """
    Get tokenizer of embedding module
    """
    if self.embedding_name.endswith('.en'):  # English
        raise NotImplementedError  # TODO: (chenxiaojie) add tokenizer of English embedding
    else:  # Chinese
        return JiebaTokenizer(self.vocab)
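
# Usage sketch (an assumption, not part of the original file): with a PaddleNLP
# TokenEmbedding loaded for a Chinese embedding, get_tokenizer() returns a
# JiebaTokenizer built on the embedding's vocab. The embedding name below is one
# of PaddleNLP's published Chinese embeddings, used here only as an example.
from paddlenlp.embeddings import TokenEmbedding

embedding = TokenEmbedding(
    embedding_name="w2v.baidu_encyclopedia.target.word-word.dim300")
tokenizer = embedding.get_tokenizer()
words = tokenizer.cut("今天天气很好")    # jieba word segmentation
ids = tokenizer.encode("今天天气很好")   # segmentation, then vocab id lookup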

def sentence_to_idx(sentence, embedding):
    chars_list = []
    # JiebaTokenizer expects a Vocab, so build it from the embedding's vocab
    # (the embedding module itself has no token_to_idx mapping).
    tokens = JiebaTokenizer(embedding.vocab)
    word_list = tokens.cut(sentence)
    for word in word_list:
        tp_w = get_idx_from_word(word, embedding.vocab.token_to_idx,
                                 embedding.vocab.unk_token)
        tp_list = [
            get_idx_from_word(ch, embedding.vocab.token_to_idx,
                              embedding.vocab.unk_token) for ch in list(word)
        ]
        # Map the word's index to the indices of its characters.
        chars_list.append({tp_w: tp_list})
    return chars_list
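
# Hypothetical call of sentence_to_idx (not from the original file); the
# embedding is assumed to be a PaddleNLP TokenEmbedding, whose .vocab carries
# the token_to_idx and unk_token attributes the helper relies on.
from paddlenlp.embeddings import TokenEmbedding

embedding = TokenEmbedding(
    embedding_name="w2v.baidu_encyclopedia.target.word-word.dim300")
mapping = sentence_to_idx("今天天气很好", embedding)
# Each element of `mapping` is a one-entry dict: {word_index: [char_index, ...]}.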

def process_data(loadfile, savefile, vocab):
    tokens = JiebaTokenizer(vocab)
    with open(loadfile, mode="r", encoding="utf8") as rfp:
        input_data = json.load(rfp)["data"]
    new_examples = []
    logger.info("Processing dataset %s." % loadfile)
    for entry in input_data:
        for paragraph in tqdm(entry["paragraphs"], desc="process"):
            title = paragraph["title"].strip()
            context = paragraph["context"].strip()
            for qa in paragraph["qas"]:
                qas_id = qa['id']
                question = qa["question"].strip()
                tmp_dict = {}
                tmp_dict['qas_id'] = qas_id
                # Word-level indices of the jieba-segmented texts.
                tmp_dict['question_w'] = word_to_idx(tokens.cut(question), vocab)
                tmp_dict['context_w'] = word_to_idx(tokens.cut(context), vocab)
                tmp_dict['title_w'] = word_to_idx(tokens.cut(title), vocab)
                # Character-level indices of the raw texts.
                tmp_dict['question_c'] = chars_to_idx(question, vocab)
                tmp_dict['context_c'] = chars_to_idx(context, vocab)
                tmp_dict['title_c'] = chars_to_idx(title, vocab)
                tmp_dict['is_impossible'] = 1 if qa["is_impossible"] else 0
                length = len(tmp_dict['context_c'])
                for item in qa['answers']:
                    answer_start = int(item["answer_start"])
                    answer = item["text"].strip()
                    if answer_start == -1:
                        # No annotated answer: fall back to a random position.
                        label = random.randint(0, length)
                        tmp_dict['start_positions'] = label
                        tmp_dict["end_positions"] = label
                    else:
                        # Start/end character index of the answer in the text.
                        start_char = answer_start
                        end_char = start_char + len(answer)
                        tmp_dict["start_positions"] = start_char
                        tmp_dict["end_positions"] = end_char
                new_examples.append(tmp_dict)
    with open(savefile, mode="w", encoding="utf-8") as wfp:
        json.dump(new_examples, wfp)
    logger.info("Saved the processed dataset %s." % savefile)
    return new_examples
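
# Hypothetical invocation of process_data (the file names are placeholders, not
# from the original script); the vocabulary is loaded from a one-token-per-line
# file, as elsewhere in these snippets.
from paddlenlp.data import Vocab

vocab = Vocab.load_vocabulary("vocab.txt", unk_token="[UNK]", pad_token="[PAD]")
examples = process_data("train.json", "train.processed.json", vocab)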

        # Map each example's highest-probability class id to its label.
        idx = np.argmax(probs, axis=1)
        idx = idx.tolist()
        labels = [label_map[i] for i in idx]
        results.extend(labels)
        return results


if __name__ == "__main__":
    # Build the predictor used for inference.
    predictor = Predictor(args.model_file, args.params_file, args.device,
                          args.max_seq_length)

    # First pre-process the prediction data, then run prediction.
    data = [
        # "Very good: great service, city-center location, convenient
        # transport, though the price is high too!"
        '非常不错,服务很好,位于市中心区,交通方便,不过价格也高!',
        # "Started watching with great excitement, only to find a Mickey
        # Mouse cartoon playing after the film ended."
        '怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片',
        # "For an old four-star hotel the rooms are still very clean; the
        # airport pickup is great, you can check in on the way and save time."
        '作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间。',
    ]
    vocab = Vocab.from_json(args.vocab_path)
    tokenizer = JiebaTokenizer(vocab)
    label_map = {0: 'negative', 1: 'positive'}

    results = predictor.predict(data,
                                tokenizer,
                                label_map,
                                batch_size=args.batch_size,
                                network=args.network)
    for idx, text in enumerate(data):
        print('Data: {} \t Label: {}'.format(text, results[idx]))

def set_tokenizer(vocab):
    global tokenizer
    if vocab is not None:
        tokenizer = JiebaTokenizer(vocab=vocab)
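
# Sketch of how the module-level handle filled by set_tokenizer() might be
# used (assumed usage; "vocab.json" is a placeholder path).
from paddlenlp.data import Vocab

tokenizer = None  # module-level variable that set_tokenizer() assigns

set_tokenizer(Vocab.from_json("vocab.json"))
print(tokenizer.cut("今天天气很好"))  # "The weather is nice today."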

def setUp(self):
    test_data_file = create_test_data(__file__)
    self.vocab = Vocab.load_vocabulary(test_data_file, unk_token='[UNK]')
    self.tokenizer = JiebaTokenizer(self.vocab)
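
# A minimal test that could follow the setUp() above (illustrative only; the
# sample sentence and assertions are not taken from the original test file).
def test_cut_and_encode(self):
    words = self.tokenizer.cut("今天天气很好")
    ids = self.tokenizer.encode("今天天气很好")
    self.assertIsInstance(words, list)
    # encode() maps each segmented token to a vocab id, so lengths match.
    self.assertEqual(len(words), len(ids))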