def ignore_test_trainer_persist(self):
    """
    Train a pipeline, persist it and check the metadata that was saved.

    The directory returned by ``trainer.persist`` is expected to contain a
    ``metadata.json`` file holding a ``trained_at`` entry.
    :return:
    """
    config = AnnotatorConfig("tests/data/test_config/test_config.json")
    trainer = Trainer(config)
    assert len(trainer.pipeline) > 0
    # the first pipeline component (char_tokenizer) should have been created
    assert trainer.pipeline[0] is not None

    # build the temporary training set, load it, then drop the file again
    train_data = load_local_data(create_tmp_test_jsonfile("tmp.json"))
    rm_tmp_file("tmp.json")

    trainer.train(train_data)
    model_dir = trainer.persist(
        config['path'], config['project'], config['fixed_model_name'])

    # read back the metadata written next to the persisted model
    with io.open(os.path.join(model_dir, 'metadata.json')) as meta_file:
        metadata = json.load(meta_file)
    assert 'trained_at' in metadata

    # remove the files and directories created by persist
    shutil.rmtree(config['path'], ignore_errors=False)
def ignore_test_pipeline_flow(self):
    """
    Run the trainer's train function for the pipeline, then persist and
    reload the model.

    The freshly trained interpreter and the one loaded from disk must give
    the same ``classifylabel`` name for the same sentence.
    :return:
    """
    config = AnnotatorConfig("tests/data/test_config/test_config.json")
    trainer = Trainer(config)
    assert len(trainer.pipeline) > 0

    # build the temporary training set, load it, then drop the file again
    train_data = load_local_data(create_tmp_test_jsonfile("tmp.json"))
    rm_tmp_file("tmp.json")

    interpreter = trainer.train(train_data)
    assert interpreter is not None
    sentence = "点连接拿红包啦"
    trained_result = interpreter.parse(sentence)

    # persist the model and load it back
    model_dir = trainer.persist(
        config['path'], config['project'], config['fixed_model_name'])
    reloaded = Interpreter.load(model_dir, config)
    reloaded_result = reloaded.parse(sentence)

    trained_name = trained_result.get("classifylabel").get("name")
    reloaded_name = reloaded_result.get("classifylabel").get("name")
    assert trained_name == reloaded_name

    # remove the temporary models
    shutil.rmtree(config['path'], ignore_errors=True)
def ignore_test_load_and_persist_without_train(self):
    """
    Persist and reload a model whose pipeline was never trained.

    The reloaded interpreter must expose a pipeline and must return a
    result for the sample inputs.
    :return:
    """
    config = AnnotatorConfig("tests/data/test_config/test_config.json")
    trainer = Trainer(config)
    assert len(trainer.pipeline) > 0

    # The temporary training set is still created, loaded and removed, but
    # the loaded data is never passed to the trainer in this test.
    load_local_data(create_tmp_test_jsonfile("tmp.json"))
    rm_tmp_file("tmp.json")

    # persist the untrained model and load it back
    model_dir = trainer.persist(
        config['path'], config['project'], config['fixed_model_name'])
    reloaded = Interpreter.load(model_dir, config)
    assert reloaded.pipeline
    assert reloaded.parse("hello") is not None
    assert reloaded.parse("Hello today is Monday, again!") is not None

    # remove the temporary models
    shutil.rmtree(config['path'], ignore_errors=False)
def ignore_test_load_and_persist_without_train(self):
    """
    Persist and reload a model whose pipeline was never trained.

    The reloaded interpreter must expose a pipeline and must return a
    result for the sample inputs.
    :return:
    """
    config = AnnotatorConfig("tests/data/test_config/test_config.json")
    trainer = Trainer(config)
    assert len(trainer.pipeline) > 0

    # The temporary training set is still created, loaded and removed, but
    # the loaded data is never passed to the trainer in this test.
    load_local_data(create_tmp_test_jsonfile("tmp.json"))
    rm_tmp_file("tmp.json")

    # persist the untrained model and load it back
    model_dir = trainer.persist(
        config['path'], config['project'], config['fixed_model_name'])
    reloaded = Interpreter.load(model_dir, config)
    assert reloaded.pipeline
    for sample in ("hello", "Hello today is Monday, again!"):
        assert reloaded.parse(sample) is not None

    # remove the temporary models
    shutil.rmtree(config['path'], ignore_errors=False)
def ignore_test_pipeline_flow(self):
    """
    Run the trainer's train function for the pipeline, then persist and
    reload the model.

    The freshly trained interpreter and the one loaded from disk must give
    the same ``classifylabel`` name for the same sentence.
    :return:
    """
    config = AnnotatorConfig("tests/data/test_config/test_config.json")
    trainer = Trainer(config)
    assert len(trainer.pipeline) > 0

    # build the temporary training set, load it, then drop the file again
    train_data = load_local_data(create_tmp_test_jsonfile("tmp.json"))
    rm_tmp_file("tmp.json")

    interpreter = trainer.train(train_data)
    assert interpreter is not None
    before_persist = interpreter.parse("点连接拿红包啦")

    # persist the model and load it back
    model_dir = trainer.persist(
        config['path'], config['project'], config['fixed_model_name'])
    reloaded = Interpreter.load(model_dir, config)
    after_reload = reloaded.parse("点连接拿红包啦")

    label_before = before_persist.get("classifylabel").get("name")
    label_after = after_reload.get("classifylabel").get("name")
    assert label_before == label_after

    # remove the temporary models
    shutil.rmtree(config['path'], ignore_errors=True)
def teardown_class(cls):
    """
    Tear down any state that was previously set up with a call to
    setup_class.

    Removes the temporary ``test_data.json`` file and the directory tree at
    the ``path`` entry of the test configuration; errors raised while
    deleting the tree are ignored.
    """
    config = AnnotatorConfig("tests/data/test_config/test_config.json")
    # drop the temporary files and directories created by the test cases
    rm_tmp_file("test_data.json")
    shutil.rmtree(config['path'], ignore_errors=True)
def teardown_class(cls):
    """
    Tear down any state that was previously set up with a call to
    setup_class.

    Removes the temporary ``test_data.json`` file and the directory tree at
    the ``path`` entry of the test configuration; errors raised while
    deleting the tree are ignored.
    """
    config = AnnotatorConfig("tests/data/test_config.json")
    # drop the temporary files and directories created by the test cases
    rm_tmp_file("test_data.json")
    shutil.rmtree(config['path'], ignore_errors=True)
def test_char2vec_standalone(self):
    """
    test char2vec_standalone training

    Runs the ``char2vec_standalone`` module through the shell on a temporary
    text file and checks that the requested output vector file exists.
    The temporary files are removed whether or not the check passes.
    """
    create_tmp_test_textfile("spam_email_text_1000")
    try:
        os.system(
            "python -m chi_annotator.algo_factory.preprocess.char2vec_standalone "
            "-train tests/data/spam_email_text_1000 "
            "-output tests/data/test_vec.txt "
            "-size 200 -sample 1e-4 -binary 0 -iter 3"
        )
        # os.path.isfile returns a bool, so comparing it with ``is not None``
        # could never fail; assert the boolean itself.
        assert os.path.isfile("tests/data/test_vec.txt")
    finally:
        # cleanup used to run unconditionally (the old assertion never
        # failed); keep it unconditional now that the assertion is real
        rm_tmp_file("spam_email_text_1000")
        rm_tmp_file("test_vec.txt")
def ignore_test_load_local_data(self):
    """
    Load a temporary local JSON data file and check the parsed examples.

    Expects 1000 training examples, with the first example's data holding
    a ``label`` entry and no ``text`` entry.
    :return:
    """
    json_path = create_tmp_test_jsonfile("tmp.json")
    train_data = load_local_data(json_path)
    # the file is no longer needed once the data has been loaded
    rm_tmp_file("tmp.json")

    assert train_data is not None
    assert len(train_data.training_examples) == 1000
    first_example = train_data.training_examples[0]
    assert "text" not in first_example.data
    assert "label" in first_example.data
def ingor_test_char2vec_standalone(self):
    """
    test char2vec_standalone training

    Runs the ``char2vec_standalone`` module through the shell with input and
    output paths resolved relative to this file's directory, then checks
    that the output vector file exists. The temporary files are removed
    whether or not the check passes.
    """
    abs_path = os.path.dirname(os.path.abspath(__file__))
    # Join the components separately: a component that starts with "/"
    # (such as "/../data/...") makes os.path.join discard abs_path.
    data_dir = os.path.join(abs_path, "..", "data")
    train_file = os.path.join(data_dir, "spam_email_text_1000")
    vec_file = os.path.join(data_dir, "test_vec.txt")
    tmp_text = os.path.join(abs_path, "spam_email_text_1000")

    # NOTE(review): the helper receives a path under abs_path while training
    # reads from data_dir -- confirm where create_tmp_test_textfile writes.
    create_tmp_test_textfile(tmp_text)
    try:
        os.system(
            "python -m chi_annotator.algo_factory.preprocess.char2vec_standalone -train "
            + train_file
            + " -output "
            + vec_file
            + " -size 200 -sample 1e-4 -binary 0 -iter 3")
        # os.path.isfile returns a bool, so comparing it with ``is not None``
        # could never fail; assert the boolean itself.
        assert os.path.isfile(vec_file)
    finally:
        rm_tmp_file(tmp_text)
        rm_tmp_file(vec_file)
def ignore_test_train_model_empty_pipeline(self):
    """
    Building and training a trainer with an empty pipeline must raise
    ``ValueError``.
    :return:
    """
    config = AnnotatorConfig("tests/data/test_config/test_config.json")
    # strip every component from the configured pipeline
    config['pipeline'] = []

    train_data = load_local_data(create_tmp_test_jsonfile("tmp.json"))
    rm_tmp_file("tmp.json")

    with pytest.raises(ValueError):
        Trainer(config).train(train_data)
def ingor_test_char2vec_standalone(self):
    """
    test char2vec_standalone training

    Runs the ``char2vec_standalone`` module through the shell with input and
    output paths resolved relative to this file's directory, then checks
    that the output vector file exists. The temporary files are removed
    whether or not the check passes.
    """
    abs_path = os.path.dirname(os.path.abspath(__file__))
    # Join the components separately: a component that starts with "/"
    # (such as "/../data/...") makes os.path.join discard abs_path.
    data_dir = os.path.join(abs_path, "..", "data")
    train_file = os.path.join(data_dir, "spam_email_text_1000")
    vec_file = os.path.join(data_dir, "test_vec.txt")
    tmp_text = os.path.join(abs_path, "spam_email_text_1000")

    # NOTE(review): the helper receives a path under abs_path while training
    # reads from data_dir -- confirm where create_tmp_test_textfile writes.
    create_tmp_test_textfile(tmp_text)
    try:
        os.system(
            "python -m chi_annotator.algo_factory.preprocess.char2vec_standalone -train "
            + train_file
            + " -output "
            + vec_file
            + " -size 200 -sample 1e-4 -binary 0 -iter 3")
        # os.path.isfile returns a bool, so comparing it with ``is not None``
        # could never fail; assert the boolean itself.
        assert os.path.isfile(vec_file)
    finally:
        rm_tmp_file(tmp_text)
        rm_tmp_file(vec_file)
def ignore_test_handles_pipeline_with_non_existing_component(self):
    """
    A pipeline that names an unknown component must fail with an error
    whose message contains "Failed to find component".
    :return:
    """
    config = AnnotatorConfig("tests/data/test_config/test_config.json")
    # add a component name that is expected not to resolve
    config['pipeline'].append("unknown_component")

    train_data = load_local_data(create_tmp_test_jsonfile("tmp.json"))
    rm_tmp_file("tmp.json")

    with pytest.raises(Exception) as execinfo:
        Trainer(config).train(train_data)
    # inspected after the block: execinfo.value is only set once the
    # exception has been caught by pytest.raises
    assert "Failed to find component" in str(execinfo.value)
def test_pipeline_flow(self):
    """
    Run the trainer's train function for a single-component pipeline and
    check that an interpreter is returned.
    :return:
    """
    config = AnnotatorConfig("tests/data/test_config.json")
    trainer = Trainer(config)
    assert len(trainer.pipeline) == 1
    # the only component (char_tokenizer) should have been created
    assert trainer.pipeline[0] is not None

    # build the temporary training set, load it, then drop the file again
    train_data = load_local_data(create_tmp_test_file("tmp.json"))
    rm_tmp_file("tmp.json")

    assert trainer.train(train_data) is not None
def ignor_test_embedding(self):
    """
    Train, persist and incrementally re-train the embedding component.

    After each training round an ``embedding_extractor`` component is
    created and the sentence embedding it attaches to a sample message is
    checked to have a non-zero sum. Temporary files are removed at the end.
    """
    from chi_annotator.algo_factory.components import ComponentBuilder
    from chi_annotator.algo_factory.common import Message
    from chi_annotator.task_center.config import AnnotatorConfig
    from chi_annotator.algo_factory.common import TrainingData
    from gensim.models.word2vec import LineSentence

    text_dir = create_tmp_test_textfile("spam_email_text_1000")
    # Put the data into TrainingData: one Message per line with all
    # whitespace removed. The pattern is a raw string because "\s" is an
    # invalid escape sequence in a plain literal.
    with open(text_dir, 'r') as f:
        messages = [Message(re.sub(r'\s', '', line)) for line in f]
    res = TrainingData(messages)

    cfg = AnnotatorConfig(
        filename="tests/data/test_config/test_config_embedding.json")
    cb = ComponentBuilder()

    # char_tokenizer: not needed for the embedding training itself for now
    char_tokenize = cb.create_component("char_tokenizer", cfg)
    char_tokenize.train(res, cfg)

    def check_sentence_embedding():
        # Create a fresh extractor and verify that the sentence embedding
        # produced for the sample message has a non-zero sum.
        sent_embedding = cb.create_component("embedding_extractor", cfg)
        msg = Message("你好,我是一个demo!!!!")
        char_tokenize.process(msg)
        sent_embedding.sentence_process(msg)
        assert msg.get("sentence_embedding").sum() != 0

    # create the embedding component and train the model; the input is
    # LineSentence(data_path)
    embedding = cb.create_component("embedding", cfg)
    embedding.train(LineSentence(text_dir), cfg)
    embedding.persist(cfg.wv_model_path)

    # get the sentence vector from the freshly trained embedding model
    check_sentence_embedding()

    # load the base model, feed it the corpus again and continue training
    # incrementally on top of the base model
    embedding = embedding.load(model_metadata=cfg)
    embedding.train(LineSentence(text_dir), cfg)
    embedding.persist(cfg.wv_model_path)

    # run the extractor check against the model produced by the
    # incremental training
    check_sentence_embedding()

    rm_tmp_file("word2vec.model")
    rm_tmp_file("word2vec.model.vector")
    rm_tmp_file("spam_email_text_1000")
def ignor_test_embedding(self):
    """
    Train, persist and incrementally re-train the embedding component.

    After each training round an ``embedding_extractor`` component is
    created and the sentence embedding it attaches to a sample message is
    checked to have a non-zero sum. Temporary files are removed at the end.
    """
    from chi_annotator.algo_factory.components import ComponentBuilder
    from chi_annotator.algo_factory.common import Message
    from chi_annotator.task_center.config import AnnotatorConfig
    from chi_annotator.algo_factory.common import TrainingData
    from gensim.models.word2vec import LineSentence

    text_dir = create_tmp_test_textfile("spam_email_text_1000")
    # Put the data into TrainingData: one Message per line with all
    # whitespace removed. The pattern is a raw string because "\s" is an
    # invalid escape sequence in a plain literal.
    with open(text_dir, 'r') as f:
        messages = [Message(re.sub(r'\s', '', line)) for line in f]
    res = TrainingData(messages)

    cfg = AnnotatorConfig(filename="tests/data/test_config/test_config_embedding.json")
    cb = ComponentBuilder()

    # char_tokenizer: not needed for the embedding training itself for now
    char_tokenize = cb.create_component("char_tokenizer", cfg)
    char_tokenize.train(res, cfg)

    def check_sentence_embedding():
        # Create a fresh extractor and verify that the sentence embedding
        # produced for the sample message has a non-zero sum.
        sent_embedding = cb.create_component("embedding_extractor", cfg)
        msg = Message("你好,我是一个demo!!!!")
        char_tokenize.process(msg)
        sent_embedding.sentence_process(msg)
        assert msg.get("sentence_embedding").sum() != 0

    # create the embedding component and train the model; the input is
    # LineSentence(data_path)
    embedding = cb.create_component("embedding", cfg)
    embedding.train(LineSentence(text_dir), cfg)
    embedding.persist(cfg.wv_model_path)

    # get the sentence vector from the freshly trained embedding model
    check_sentence_embedding()

    # load the base model, feed it the corpus again and continue training
    # incrementally on top of the base model
    embedding = embedding.load(model_metadata=cfg)
    embedding.train(LineSentence(text_dir), cfg)
    embedding.persist(cfg.wv_model_path)

    # run the extractor check against the model produced by the
    # incremental training
    check_sentence_embedding()

    rm_tmp_file("word2vec.model")
    rm_tmp_file("word2vec.model.vector")
    rm_tmp_file("spam_email_text_1000")