Exemplo n.º 1
0
 def test_char_cut_tf_str(self):
     """char_cut_tf on a scalar string tensor: a space is inserted between CJK chars."""
     sentence = tf.placeholder(dtype=tf.string, shape=())
     segmented = char_cut_tf(sentence)
     with self.cached_session(use_gpu=False, force_gpu=False) as sess:
         result = sess.run(segmented, {sentence: "我爱北京天安门"})
         logging.info(result.decode("utf-8"))
         self.assertEqual("我 爱 北 京 天 安 门", result.decode("utf-8"))
Exemplo n.º 2
0
 def test_char_cut_tf_list(self):
     """char_cut_tf on a 1-D batch of strings: each sentence is split char-by-char."""
     sentences = tf.placeholder(dtype=tf.string, shape=(None, ))
     segmented = char_cut_tf(sentences)
     with self.cached_session(use_gpu=False, force_gpu=False) as sess:
         results = sess.run(segmented, {sentences: ["我爱北京天安门", "天安门前太阳升啊"]})
         decoded = [item.decode("utf-8") for item in results]
         logging.info(decoded)
         self.assertAllEqual(["我 爱 北 京 天 安 门", "天 安 门 前 太 阳 升 啊"], decoded)
Exemplo n.º 3
0
  def pre_process_pipeline(self, input_sentences):
    """Data pipeline function for pre-processing.

    Tokenizes `input_sentences` according to the task config:
    English text is optionally cleaned; non-English text is either kept
    as-is (already space-separated), word-segmented, or char-segmented.
    """
    conf = self.task_config
    if conf["language"] == "english":
      # Optional English-specific normalization; otherwise pass through.
      if conf.get("clean_english", False):
        return clean_english_str_tf(input_sentences)
      return input_sentences
    # Non-English: already tokenized input needs no further splitting.
    if conf.get("split_by_space", False):
      return input_sentences
    if conf.get("use_word", False):
      return chinese_word_cut_tf(input_sentences)
    return char_cut_tf(input_sentences)
Exemplo n.º 4
0
    def pre_process_pipeline(self, input_sentences):
        """Data pipeline function for pre-processing.

        Tokenizes `input_sentences` according to the task config:
        English text is optionally cleaned; non-English text is either
        kept as-is (already space-separated), word-segmented via jieba,
        or segmented character-by-character.
        """
        conf = self.task_config
        if conf["language"] == "english":
            # Optional English-specific normalization; otherwise pass through.
            if conf.get("clean_english", False):
                return clean_english_str_tf(input_sentences)
            return input_sentences
        # Non-English: already tokenized input needs no further splitting.
        if conf.get("split_by_space", False):
            return input_sentences
        if not conf.get("use_word", False):
            return char_cut_tf(input_sentences)
        # Word-level segmentation with cppjieba; all dictionary resources
        # live under $MAIN_ROOT/tools/cppjieba/dict.
        root = os.environ["MAIN_ROOT"]
        resource_files = {
            "dict_path": "jieba.dict.utf8",
            "hmm_path": "hmm_model.utf8",
            "user_dict_path": "user.dict.utf8",
            "idf_path": "idf.utf8",
            "stop_word_path": "stop_words.utf8",
        }
        paths = {
            arg: os.path.join(root, "tools/cppjieba/dict/" + fname)
            for arg, fname in resource_files.items()
        }
        return py_x_ops.jieba_cut(input_sentences, hmm=True, **paths)