def test_clean_english_str_tf(self):
    """clean_english_str_tf lowercases, splits punctuation/clitics, trims spaces."""
    sentence_ph = tf.placeholder(dtype=tf.string)
    cleaned = clean_english_str_tf(sentence_ph)
    with self.cached_session(use_gpu=False, force_gpu=False) as sess:
        # Scalar string input.
        result = sess.run(cleaned, {sentence_ph: "I'd like to have an APPLE! "})
        logging.info(result)
        self.assertEqual("i 'd like to have an apple !", result.decode("utf-8"))
        # Batched (list of strings) input — same cleaning, element-wise.
        result = sess.run(cleaned, {sentence_ph: ["I'd like to have an APPLE! "]})
        logging.info(result)
        self.assertEqual("i 'd like to have an apple !",
                         result[0].decode("utf-8"))
def pre_process_pipeline(self, input_sentences):
    """Data pipeline function for pre-processing.

    Dispatches on the task configuration: English text is optionally
    cleaned, non-English text is segmented by word or by character
    unless it is already space-delimited.
    """
    config = self.task_config
    language = config["language"]
    clean_english = config.get("clean_english", False)
    split_by_space = config.get("split_by_space", False)
    use_word = config.get("use_word", False)

    if language == "english":
        # Optional normalization pass; otherwise pass sentences through.
        return clean_english_str_tf(input_sentences) if clean_english else input_sentences

    if split_by_space:
        # Input is already tokenized by spaces — nothing to do.
        return input_sentences
    if use_word:
        return chinese_word_cut_tf(input_sentences)
    return char_cut_tf(input_sentences)
def pre_process_pipeline(self, input_sentences):
    """Data pipeline function for pre-processing.

    Args:
      input_sentences: tensor (or batch) of raw sentence strings.

    Returns:
      The pre-processed sentences: cleaned (English), passed through
      (already space-delimited), or segmented by word (cppjieba) or
      by character, depending on the task configuration.

    Raises:
      KeyError: if word segmentation is requested and the MAIN_ROOT
        environment variable is not set.
    """
    language = self.task_config["language"]
    clean_english = self.task_config.get("clean_english", False)
    split_by_space = self.task_config.get("split_by_space", False)
    use_word = self.task_config.get("use_word", False)

    if language == "english":
        return clean_english_str_tf(input_sentences) if clean_english else input_sentences

    if split_by_space:
        # Input is already tokenized by spaces — nothing to do.
        return input_sentences
    if not use_word:
        return char_cut_tf(input_sentences)

    # Word-level segmentation via cppjieba.  All dictionary files live
    # under one directory, so build the prefix once instead of five times.
    dict_dir = os.path.join(os.environ["MAIN_ROOT"], "tools", "cppjieba", "dict")
    return py_x_ops.jieba_cut(
        input_sentences,
        hmm=True,
        dict_path=os.path.join(dict_dir, "jieba.dict.utf8"),
        hmm_path=os.path.join(dict_dir, "hmm_model.utf8"),
        user_dict_path=os.path.join(dict_dir, "user.dict.utf8"),
        idf_path=os.path.join(dict_dir, "idf.utf8"),
        stop_word_path=os.path.join(dict_dir, "stop_words.utf8"))