def test_char_cut_tf_str(self):
  t_sen_in = tf.placeholder(dtype=tf.string, shape=())
  t_sen_out = char_cut_tf(t_sen_in)
  with self.cached_session(use_gpu=False, force_gpu=False) as sess:
    sen_out = sess.run(t_sen_out, {t_sen_in: "我爱北京天安门"})
    logging.info(sen_out.decode("utf-8"))
    self.assertEqual("我 爱 北 京 天 安 门", sen_out.decode("utf-8"))
def test_char_cut_tf_list(self):
  t_sen_in = tf.placeholder(dtype=tf.string, shape=(None,))
  t_sen_out = char_cut_tf(t_sen_in)
  with self.cached_session(use_gpu=False, force_gpu=False) as sess:
    sen_out = sess.run(t_sen_out,
                       {t_sen_in: ["我爱北京天安门", "天安门前太阳升啊"]})
    logging.info([one.decode("utf-8") for one in sen_out])
    self.assertAllEqual(["我 爱 北 京 天 安 门", "天 安 门 前 太 阳 升 啊"],
                        [one.decode("utf-8") for one in sen_out])
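# Hedged reference sketch: char_cut_tf is exercised by the tests above but
# not defined in this section. Judging from the expected outputs
# ("我爱北京天安门" -> "我 爱 北 京 天 安 门"), a minimal TF version could
# split the input into unicode characters and re-join them with spaces.
# This is an illustrative assumption, not the repo's actual implementation;
# the sketch name char_cut_tf_sketch is hypothetical. Scalar inputs yield a
# plain vector of chars; batched inputs rely on the RaggedTensor dispatch
# for tf.strings.reduce_join available in newer TF 1.x / 2.x releases.
def char_cut_tf_sketch(t_sentences):
  chars = tf.strings.unicode_split(t_sentences, "UTF-8")
  return tf.strings.reduce_join(chars, separator=" ", axis=-1)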
def pre_process_pipeline(self, input_sentences):
  """Data pipeline function for pre-processing."""
  language = self.task_config["language"]
  clean_english = self.task_config.get("clean_english", False)
  split_by_space = self.task_config.get("split_by_space", False)
  use_word = self.task_config.get("use_word", False)
  if language == "english":
    if clean_english:
      batch = clean_english_str_tf(input_sentences)
    else:
      batch = input_sentences
  elif split_by_space:
    batch = input_sentences
  elif use_word:
    batch = chinese_word_cut_tf(input_sentences)
  else:
    batch = char_cut_tf(input_sentences)
  return batch
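# Hedged stand-in: chinese_word_cut_tf is called above but not defined in
# this section. For illustration only, a word-level cut can be emulated by
# wrapping the Python `jieba` package in tf.py_func; the repo's real word
# cutter is the custom py_x_ops.jieba_cut op shown in the variant below.
# The helper names here are hypothetical, and the sketch assumes a 1-D
# batch of utf-8 strings.
import numpy as np
import jieba

def _jieba_cut_numpy(batch):
  # batch: 1-D numpy array of utf-8 encoded bytes; returns space-joined cuts.
  return np.array(
      [" ".join(jieba.cut(s.decode("utf-8"))).encode("utf-8") for s in batch],
      dtype=object)

def chinese_word_cut_tf_sketch(t_sentences):
  t_out = tf.py_func(_jieba_cut_numpy, [t_sentences], tf.string)
  t_out.set_shape(t_sentences.get_shape())
  return t_out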
# Variant of pre_process_pipeline that calls the custom jieba op directly,
# with explicit cppjieba resource paths resolved from the MAIN_ROOT env var.
def pre_process_pipeline(self, input_sentences):
  """Data pipeline function for pre-processing."""
  language = self.task_config["language"]
  clean_english = self.task_config.get("clean_english", False)
  split_by_space = self.task_config.get("split_by_space", False)
  use_word = self.task_config.get("use_word", False)
  if language == "english":
    if clean_english:
      batch = clean_english_str_tf(input_sentences)
    else:
      batch = input_sentences
  elif split_by_space:
    batch = input_sentences
  elif use_word:
    main_root = os.environ["MAIN_ROOT"]
    dict_path = os.path.join(main_root, "tools/cppjieba/dict/jieba.dict.utf8")
    hmm_path = os.path.join(main_root, "tools/cppjieba/dict/hmm_model.utf8")
    user_dict_path = os.path.join(main_root,
                                  "tools/cppjieba/dict/user.dict.utf8")
    idf_path = os.path.join(main_root, "tools/cppjieba/dict/idf.utf8")
    stop_word_path = os.path.join(main_root,
                                  "tools/cppjieba/dict/stop_words.utf8")
    batch = py_x_ops.jieba_cut(
        input_sentences,
        hmm=True,
        dict_path=dict_path,
        hmm_path=hmm_path,
        user_dict_path=user_dict_path,
        idf_path=idf_path,
        stop_word_path=stop_word_path)
  else:
    batch = char_cut_tf(input_sentences)
  return batch
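# Hedged usage sketch for the pipeline above, in the same TF1 style as the
# tests. FakeTask is hypothetical scaffolding (not a class from this repo);
# it only supplies the task_config keys the pipeline reads, and it assumes
# pre_process_pipeline is in scope as a plain function. With use_word False,
# the Chinese branch falls through to char_cut_tf, so the MAIN_ROOT env var
# and cppjieba dictionaries are not needed to run it.
class FakeTask:
  task_config = {"language": "chinese",
                 "split_by_space": False,
                 "use_word": False}
  pre_process_pipeline = pre_process_pipeline

t_in = tf.placeholder(dtype=tf.string, shape=(None,))
t_out = FakeTask().pre_process_pipeline(t_in)
with tf.Session() as sess:
  # Expected: [b'...'] holding the space-separated character cut.
  print(sess.run(t_out, {t_in: ["我爱北京天安门"]}))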