Example #1
def chinese_word_cut_tf(input_str):
    """Cut Chinese text into words with the jieba_cut op, passing the cppjieba dictionaries as in-memory lines."""
    main_root = os.environ["MAIN_ROOT"]
    dict_path = os.path.join(main_root, "tools/cppjieba/dict/jieba.dict.utf8")
    hmm_path = os.path.join(main_root, "tools/cppjieba/dict/hmm_model.utf8")
    user_dict_path = os.path.join(main_root,
                                  "tools/cppjieba/dict/user.dict.utf8")
    idf_path = os.path.join(main_root, "tools/cppjieba/dict/idf.utf8")
    stop_word_path = os.path.join(main_root,
                                  "tools/cppjieba/dict/stop_words.utf8")
    # Load all five cppjieba dictionaries into memory as lists of lines.
    dict_lines = read_lines_from_text_file(dict_path)
    model_lines = read_lines_from_text_file(hmm_path)
    user_dict_lines = read_lines_from_text_file(user_dict_path)
    idf_lines = read_lines_from_text_file(idf_path)
    stop_word_lines = read_lines_from_text_file(stop_word_path)

    # use_file=False: the op builds its segmenter from the in-memory lines.
    output_str = py_x_ops.jieba_cut(input_str,
                                    use_file=False,
                                    hmm=True,
                                    dict_lines=dict_lines,
                                    model_lines=model_lines,
                                    user_dict_lines=user_dict_lines,
                                    idf_lines=idf_lines,
                                    stop_word_lines=stop_word_lines)
    return output_str
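A minimal sketch of driving Example #1, assuming a TF1-style graph/session API and that py_x_ops and read_lines_from_text_file are importable from the DELTA ops package; the MAIN_ROOT value and sample sentence are illustrative only:

import os
import tensorflow as tf

os.environ["MAIN_ROOT"] = "/path/to/delta"  # assumption: a DELTA checkout with the cppjieba dicts

sentences = tf.constant(["我爱自然语言处理"])  # illustrative input batch
words = chinese_word_cut_tf(sentences)        # string tensor holding the cut result
with tf.Session() as sess:                    # TF1-style execution
    print(sess.run(words))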
Example #2
    def build_op_no_file(self, sentence):
        ''' Build the jieba-cut graph, feeding the dictionaries as in-memory lines. '''
        main_root = os.environ["MAIN_ROOT"]

        dict_path = os.path.join(main_root,
                                 "tools/cppjieba/dict/jieba.dict.utf8")
        hmm_path = os.path.join(main_root,
                                "tools/cppjieba/dict/hmm_model.utf8")
        user_dict_path = os.path.join(main_root,
                                      "tools/cppjieba/dict/user.dict.utf8")
        idf_path = os.path.join(main_root, "tools/cppjieba/dict/idf.utf8")
        stop_word_path = os.path.join(main_root,
                                      "tools/cppjieba/dict/stop_words.utf8")

        dict_lines = read_lines_from_text_file(dict_path)
        model_lines = read_lines_from_text_file(hmm_path)
        user_dict_lines = read_lines_from_text_file(user_dict_path)
        idf_lines = read_lines_from_text_file(idf_path)
        stop_word_lines = read_lines_from_text_file(stop_word_path)

        words = py_x_ops.jieba_cut(sentence,
                                   use_file=False,
                                   hmm=True,
                                   dict_lines=dict_lines,
                                   model_lines=model_lines,
                                   user_dict_lines=user_dict_lines,
                                   idf_lines=idf_lines,
                                   stop_word_lines=stop_word_lines)
        return words
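Example #2 is the same in-memory pattern as Example #1, just wrapped as a graph-building method; nothing in it depends on instance state, so the returned words tensor can be fetched in a session or composed with downstream text ops.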
Example #3
def tokenize_sentence(texts, max_seq_len, vocab_path):
    """Tokenize sentence"""
    vocabs = read_lines_from_text_file(vocab_path)
    token_ids, _ = py_x_ops.sentence_to_ids(texts,
                                            maxlen=max_seq_len,
                                            use_vocab_file=False,
                                            vocab=vocabs,
                                            load_token_ids_from_vocab=True,
                                            pad_id=utils.PAD_IDX,
                                            check_tokens=False)
    return token_ids
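A sketch of exercising tokenize_sentence; the file name, contents, and exact line format are assumptions. In particular, load_token_ids_from_vocab=True suggests each vocab line carries both the token and its id, but check the op's documentation for the real format:

import tensorflow as tf

# Assumed vocab format: "token id" per line (a guess based on
# load_token_ids_from_vocab=True in the call above).
with open("vocab.txt", "w", encoding="utf-8") as f:
    f.write("<pad> 0\n<unk> 1\nhello 2\nworld 3\n")

texts = tf.constant(["hello world"])
token_ids = tokenize_sentence(texts, max_seq_len=5, vocab_path="vocab.txt")
with tf.Session() as sess:
    print(sess.run(token_ids))  # ids padded to max_seq_len with utils.PAD_IDX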
Example #4
def tokenize_label(label, maxlen, label_vocab_file_path, pad_id):
    """Tokenize labels"""
    vocabs = read_lines_from_text_file(label_vocab_file_path)
    label_id, _ = py_x_ops.sentence_to_ids(label,
                                           maxlen=maxlen,
                                           use_vocab_file=False,
                                           vocab=vocabs,
                                           load_token_ids_from_vocab=True,
                                           pad_id=pad_id,
                                           check_tokens=False)
    return label_id
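tokenize_label is the same sentence_to_ids op applied to label strings; the only difference from Example #3 is that pad_id comes in as a parameter, so label padding need not reuse utils.PAD_IDX.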
Example #5
File: py_x_ops.py  Project: zhjou/delta
def jieba_cut(input_sentence, use_file=True, hmm=True):
  """Segment Chinese text; read dicts from files (use_file=True) or pass them as in-memory lines."""
  dict_path = os.path.join(PACKAGE_ROOT_DIR,
                           "./resources/cppjieba_dict/jieba.dict.utf8")
  hmm_path = os.path.join(PACKAGE_ROOT_DIR,
                          "./resources/cppjieba_dict/hmm_model.utf8")
  user_dict_path = os.path.join(PACKAGE_ROOT_DIR,
                                "./resources/cppjieba_dict/user.dict.utf8")
  idf_path = os.path.join(PACKAGE_ROOT_DIR,
                          "./resources/cppjieba_dict/idf.utf8")
  stop_word_path = os.path.join(PACKAGE_ROOT_DIR,
                                "./resources/cppjieba_dict/stop_words.utf8")

  # use_file=True: the kernel reads the dictionary files from disk itself.
  if use_file:
    output_sentence = gen_x_ops.jieba_cut(
        input_sentence,
        use_file=use_file,
        hmm=hmm,
        dict_path=dict_path,
        hmm_path=hmm_path,
        user_dict_path=user_dict_path,
        idf_path=idf_path,
        stop_word_path=stop_word_path)
  else:
    # use_file=False: load the dictionaries in Python and hand the lines to the kernel.
    dict_lines = read_lines_from_text_file(dict_path)
    model_lines = read_lines_from_text_file(hmm_path)
    user_dict_lines = read_lines_from_text_file(user_dict_path)
    idf_lines = read_lines_from_text_file(idf_path)
    stop_word_lines = read_lines_from_text_file(stop_word_path)

    output_sentence = gen_x_ops.jieba_cut(
        input_sentence,
        use_file=use_file,
        hmm=hmm,
        dict_lines=dict_lines,
        model_lines=model_lines,
        user_dict_lines=user_dict_lines,
        idf_lines=idf_lines,
        stop_word_lines=stop_word_lines)

  return output_sentence
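A sketch of calling the Example #5 wrapper in both modes; the import path is an assumption based on the delta project layout, and both calls should segment identically since they load the same dictionaries:

import tensorflow as tf
from delta.layers.ops import py_x_ops  # assumed import path

sentence = tf.constant(["我爱自然语言处理"])
by_file = py_x_ops.jieba_cut(sentence, use_file=True)     # kernel reads dict files itself
in_memory = py_x_ops.jieba_cut(sentence, use_file=False)  # dict lines fed from Python
with tf.Session() as sess:
    print(sess.run([by_file, in_memory]))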