def get_tokenizer(self, *args, **kwargs):
    """
    Get tokenizer of embedding module
    """
    if self.embedding_name.endswith('.en'):  # English
        raise NotImplementedError  # TODO: (chenxiaojie) add tokenizer of English embedding
    else:  # Chinese
        return JiebaTokenizer(self.vocab)
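
# Usage sketch (an assumption, not part of the original file): with a PaddleNLP
# TokenEmbedding loaded for a Chinese embedding, get_tokenizer() returns a
# JiebaTokenizer built on the embedding's vocab. The embedding name below is one
# of PaddleNLP's published Chinese embeddings, used here only as an example.
from paddlenlp.embeddings import TokenEmbedding

embedding = TokenEmbedding(
    embedding_name="w2v.baidu_encyclopedia.target.word-word.dim300")
tokenizer = embedding.get_tokenizer()
words = tokenizer.cut("今天天气很好")    # jieba word segmentation
ids = tokenizer.encode("今天天气很好")   # segmentation, then vocab id lookup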

def sentence_to_idx(sentence, embedding):
    chars_list = []
    # JiebaTokenizer expects a Vocab, so build it from the embedding's vocab
    # (the embedding module itself has no token_to_idx mapping).
    tokens = JiebaTokenizer(embedding.vocab)
    word_list = tokens.cut(sentence)
    for word in word_list:
        tp_w = get_idx_from_word(word, embedding.vocab.token_to_idx,
                                 embedding.vocab.unk_token)
        tp_list = [
            get_idx_from_word(ch, embedding.vocab.token_to_idx,
                              embedding.vocab.unk_token) for ch in list(word)
        ]
        # Map the word's index to the indices of its characters.
        chars_list.append({tp_w: tp_list})
    return chars_list
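
# Hypothetical call of sentence_to_idx (not from the original file); the
# embedding is assumed to be a PaddleNLP TokenEmbedding, whose .vocab carries
# the token_to_idx and unk_token attributes the helper relies on.
from paddlenlp.embeddings import TokenEmbedding

embedding = TokenEmbedding(
    embedding_name="w2v.baidu_encyclopedia.target.word-word.dim300")
mapping = sentence_to_idx("今天天气很好", embedding)
# Each element of `mapping` is a one-entry dict: {word_index: [char_index, ...]}.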

def process_data(loadfile, savefile, vocab):
    tokens = JiebaTokenizer(vocab)
    with open(loadfile, mode="r", encoding="utf8") as rfp:
        input_data = json.load(rfp)["data"]
    new_examples = []
    logger.info("Processing dataset %s." % loadfile)
    for entry in input_data:
        for paragraph in tqdm(entry["paragraphs"], desc="process"):
            title = paragraph["title"].strip()
            context = paragraph["context"].strip()
            for qa in paragraph["qas"]:
                qas_id = qa['id']
                question = qa["question"].strip()
                tmp_dict = {}
                tmp_dict['qas_id'] = qas_id
                # Word-level indices of the jieba-segmented texts.
                tmp_dict['question_w'] = word_to_idx(tokens.cut(question), vocab)
                tmp_dict['context_w'] = word_to_idx(tokens.cut(context), vocab)
                tmp_dict['title_w'] = word_to_idx(tokens.cut(title), vocab)
                # Character-level indices of the raw texts.
                tmp_dict['question_c'] = chars_to_idx(question, vocab)
                tmp_dict['context_c'] = chars_to_idx(context, vocab)
                tmp_dict['title_c'] = chars_to_idx(title, vocab)
                tmp_dict['is_impossible'] = 1 if qa["is_impossible"] else 0
                length = len(tmp_dict['context_c'])
                for item in qa['answers']:
                    answer_start = int(item["answer_start"])
                    answer = item["text"].strip()
                    if answer_start == -1:
                        # No annotated answer: fall back to a random position.
                        label = random.randint(0, length)
                        tmp_dict['start_positions'] = label
                        tmp_dict["end_positions"] = label
                    else:
                        # Start/end character index of the answer in the text.
                        start_char = answer_start
                        end_char = start_char + len(answer)
                        tmp_dict["start_positions"] = start_char
                        tmp_dict["end_positions"] = end_char
                new_examples.append(tmp_dict)
    with open(savefile, mode="w", encoding="utf-8") as wfp:
        json.dump(new_examples, wfp)
    logger.info("Saved the processed dataset %s." % savefile)
    return new_examples
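
# Hypothetical invocation of process_data (the file names are placeholders, not
# from the original script); the vocabulary is loaded from a one-token-per-line
# file, as elsewhere in these snippets.
from paddlenlp.data import Vocab

vocab = Vocab.load_vocabulary("vocab.txt", unk_token="[UNK]", pad_token="[PAD]")
examples = process_data("train.json", "train.processed.json", vocab)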

        # Map each example's highest-probability class id to its label.
        idx = np.argmax(probs, axis=1)
        idx = idx.tolist()
        labels = [label_map[i] for i in idx]
        results.extend(labels)
        return results


if __name__ == "__main__":
    # Build the predictor used for inference.
    predictor = Predictor(args.model_file, args.params_file, args.device,
                          args.max_seq_length)

    # First pre-process the prediction data, then run prediction.
    data = [
        # "Very good: great service, city-center location, convenient
        # transport, though the price is high too!"
        '非常不错,服务很好,位于市中心区,交通方便,不过价格也高!',
        # "Started watching with great excitement, only to find a Mickey
        # Mouse cartoon playing after the film ended."
        '怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片',
        # "For an old four-star hotel the rooms are still very clean; the
        # airport pickup is great, you can check in on the way and save time."
        '作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间。',
    ]
    vocab = Vocab.from_json(args.vocab_path)
    tokenizer = JiebaTokenizer(vocab)
    label_map = {0: 'negative', 1: 'positive'}

    results = predictor.predict(data,
                                tokenizer,
                                label_map,
                                batch_size=args.batch_size,
                                network=args.network)
    for idx, text in enumerate(data):
        print('Data: {} \t Label: {}'.format(text, results[idx]))

def set_tokenizer(vocab):
    global tokenizer
    if vocab is not None:
        tokenizer = JiebaTokenizer(vocab=vocab)
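
# Sketch of how the module-level handle filled by set_tokenizer() might be
# used (assumed usage; "vocab.json" is a placeholder path).
from paddlenlp.data import Vocab

tokenizer = None  # module-level variable that set_tokenizer() assigns

set_tokenizer(Vocab.from_json("vocab.json"))
print(tokenizer.cut("今天天气很好"))  # "The weather is nice today."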

def setUp(self):
    test_data_file = create_test_data(__file__)
    self.vocab = Vocab.load_vocabulary(test_data_file, unk_token='[UNK]')
    self.tokenizer = JiebaTokenizer(self.vocab)
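
# A minimal test that could follow the setUp() above (illustrative only; the
# sample sentence and assertions are not taken from the original test file).
def test_cut_and_encode(self):
    words = self.tokenizer.cut("今天天气很好")
    ids = self.tokenizer.encode("今天天气很好")
    self.assertIsInstance(words, list)
    # encode() maps each segmented token to a vocab id, so lengths match.
    self.assertEqual(len(words), len(ids))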