def test_duet(train_data_processed, task, train_generator, valid_data_processed, test_data_processed, preprocessor):
    """Build a DUET model, train one epoch, save, reload, and evaluate it."""
    duet = mz.models.DUET()
    # Hyper-parameters kept small so the smoke test runs quickly.
    hyperparams = {
        'input_shapes': preprocessor.context['input_shapes'],
        'task': task,
        'embedding_input_dim': preprocessor.context['vocab_size'] + 1,
        'embedding_output_dim': 10,
        'lm_filters': 32,
        'lm_hidden_sizes': [16],
        'dm_filters': 32,
        'dm_kernel_size': 3,
        'dm_hidden_sizes': [16],
        'dropout_rate': 0.5,
        'activation_func': 'relu',
    }
    for name, value in hyperparams.items():
        duet.params[name] = value
    duet.guess_and_fill_missing_params()
    duet.build()
    duet.compile()
    # Track all metrics on the validation split while fitting.
    x_valid, y_valid = valid_data_processed.unpack()
    valid_eval = mz.callbacks.EvaluateAllMetrics(duet, x_valid, y_valid)
    duet.fit_generator(train_generator, epochs=1, callbacks=[valid_eval])
    duet.save('.tmpdir')
    try:
        duet = mz.load_model('.tmpdir')
        x, y = test_data_processed.unpack()
        results = duet.evaluate(x, y)
        assert len(results) > 0
    finally:
        shutil.rmtree('.tmpdir')
def test_drmmtks(train_data_processed, task, train_generator, valid_data_processed, test_data_processed, preprocessor):
    """Build a DRMMTKS model, train one epoch, save, reload, and evaluate it."""
    drmmtks_model = mz.models.DRMMTKS()
    # Hyper-parameters kept small so the smoke test runs quickly.
    hyperparams = {
        'input_shapes': preprocessor.context['input_shapes'],
        'task': task,
        'top_k': 10,
        'embedding_input_dim': preprocessor.context['vocab_size'] + 1,
        'embedding_output_dim': 10,
        'mlp_num_layers': 1,
        'mlp_num_units': 5,
        'mlp_num_fan_out': 1,
        'mlp_activation_func': 'relu',
    }
    for name, value in hyperparams.items():
        drmmtks_model.params[name] = value
    drmmtks_model.guess_and_fill_missing_params()
    drmmtks_model.build()
    drmmtks_model.compile()
    # Track all metrics on the validation split while fitting.
    x_valid, y_valid = valid_data_processed.unpack()
    valid_eval = mz.callbacks.EvaluateAllMetrics(drmmtks_model, x_valid, y_valid)
    drmmtks_model.fit_generator(train_generator, epochs=1, callbacks=[valid_eval])
    drmmtks_model.save('.tmpdir')
    try:
        drmmtks_model = mz.load_model('.tmpdir')
        x, y = test_data_processed.unpack()
        results = drmmtks_model.evaluate(x, y)
        assert len(results) > 0
    finally:
        shutil.rmtree('.tmpdir')
def test_mvlstm(train_data_processed, task, train_generator, valid_data_processed, test_data_processed, preprocessor):
    """Build an MVLSTM model, train one epoch, save, reload, and evaluate it."""
    mvlstm = mz.models.MVLSTM()
    # Hyper-parameters kept small so the smoke test runs quickly.
    hyperparams = {
        'input_shapes': preprocessor.context['input_shapes'],
        'task': task,
        'embedding_input_dim': preprocessor.context['vocab_size'] + 1,
        'embedding_output_dim': 10,
        'lstm_units': 10,
        'top_k': 10,
        'mlp_num_layers': 1,
        'mlp_num_units': 5,
        'mlp_num_fan_out': 1,
        'mlp_activation_func': 'relu',
        'dropout_rate': 0.5,
    }
    for name, value in hyperparams.items():
        mvlstm.params[name] = value
    mvlstm.guess_and_fill_missing_params()
    mvlstm.build()
    mvlstm.compile()
    # Track all metrics on the validation split while fitting.
    x_valid, y_valid = valid_data_processed.unpack()
    valid_eval = mz.callbacks.EvaluateAllMetrics(mvlstm, x_valid, y_valid)
    mvlstm.fit_generator(train_generator, epochs=1, callbacks=[valid_eval])
    mvlstm.save('.tmpdir')
    try:
        mvlstm = mz.load_model('.tmpdir')
        x, y = test_data_processed.unpack()
        results = mvlstm.evaluate(x, y)
        assert len(results) > 0
    finally:
        shutil.rmtree('.tmpdir')
def __init__(self, modelDir, dtProcDir, debugPrint):
    """Load a pickled preprocessor and a saved MatchZoo model.

    :param modelDir: directory holding the saved MatchZoo model.
    :param dtProcDir: path to the pickled data preprocessor.
    :param debugPrint: flag stored for later debug output.
    """
    super().__init__(exclusive=True)
    with open(dtProcDir, 'rb') as inp:
        self.prep = pickle.load(inp)
    self.model = mz.load_model(modelDir)
    # Print the underlying Keras model layout for inspection.
    self.model.backend.summary()
    self.debugPrint = debugPrint
def predict(self, test_pack_processed):
    """Score a preprocessed data pack with the model saved at ``self.modeldir``.

    :param test_pack_processed: already-transformed MatchZoo data pack.
    :return: model prediction scores for the whole pack.
    """
    loaded = mz.load_model(self.modeldir)
    generator = mz.DPoolDataGenerator(test_pack_processed,
                                      fixed_length_left=20,
                                      fixed_length_right=20,
                                      batch_size=20)
    pred_x, pred_y = generator[:]
    # Predict the entire pack in a single batch (batch_size == sample count).
    return loaded.predict(pred_x, batch_size=len(pred_y))
def get_model_and_data(topic_number, d_pack_test, model_type, embedding):
    # Load a saved per-topic model and build matching test inputs.
    #
    # NOTE(review): both branches load from os.path.join(MODEL_DUMP, MODEL_TYPE, ...)
    # using the module-level MODEL_TYPE constant rather than the `model_type`
    # argument — confirm this is intentional and the two always agree.
    # NOTE(review): if model_type is neither 'dense' nor 'drmm', test_x is never
    # bound and the return raises NameError — confirm callers only pass these two.
    if model_type == 'dense':
        # load model
        model = mz.load_model(os.path.join(MODEL_DUMP, MODEL_TYPE, str(topic_number)))
        # prepare preprocessor: refit on the topic's raw training data so the
        # vocabulary matches what the saved model was trained with
        train_raw = train_data(topic_number)
        preprocessor = mz.preprocessors.BasicPreprocessor()
        preprocessor.fit(train_raw)
        # transform document data
        test_processed = preprocessor.transform(d_pack_test)
        test_x, test_y = test_processed.unpack()
    if model_type == 'drmm':
        # load model
        model = mz.load_model(os.path.join(MODEL_DUMP, MODEL_TYPE, str(topic_number)))
        task = mz.tasks.Ranking()
        train_raw = train_data(topic_number)
        preprocessor = mz.preprocessors.BasicPreprocessor(fixed_length_left=10,
                                                          fixed_length_right=100,
                                                          remove_stop_words=False)
        preprocessor.fit(train_raw)
        test_processed = preprocessor.transform(d_pack_test)
        # Rebuild the embedding matrix from the fitted vocabulary.
        embedding_matrix = embedding.build_matrix(preprocessor.context['vocab_unit'].state['term_index'])
        # normalize the word embedding for fast histogram generating.
        l2_norm = np.sqrt((embedding_matrix * embedding_matrix).sum(axis=1))
        embedding_matrix = embedding_matrix / l2_norm[:, np.newaxis]
        model.load_embedding_matrix(embedding_matrix)
        # DRMM consumes matching histograms, produced by this generator callback.
        hist_callback = mz.data_generator.callbacks.Histogram(embedding_matrix, bin_size=30, hist_mode='LCH')
        test_generator = mz.DataGenerator(data_pack=test_processed, mode='point', callbacks=[hist_callback])
        test_x, test_y = test_generator[:]
    # NOTE(review): test_y is computed but discarded — only inputs are returned.
    return model, test_x
def test_save_load_model(model):
    """A saved model must reload; saving twice to the same directory must raise."""
    tmpdir = '.matchzoo_test_save_load_tmpdir'

    def _wipe():
        # Remove the scratch directory if present (e.g. left by a failed run).
        if Path(tmpdir).exists():
            shutil.rmtree(tmpdir)

    _wipe()
    try:
        model.save(tmpdir)
        assert mz.load_model(tmpdir)
        with pytest.raises(FileExistsError):
            model.save(tmpdir)
    finally:
        _wipe()
def predict(train_id='test_file'):
    """Score one hard-coded question/document pair with a saved DSSM model.

    :param train_id: identifier used to locate the saved preprocessor and model.
    """
    query = 'how did apollo creed die'
    doc = "Urban legend states that Apollo Creed's name is a wordplay on the Apostles' Creed , a statement of belief used in Christian churches."
    frame = pd.DataFrame(data={'text_left': [query], 'text_right': [doc], 'label': [0]})
    base = ROOT_PATH + 'matchzoo_temp_files/'
    preprocessor = mz.load_preprocessor(base + 'preprocessors/' + train_id + '.dssm_preprocessor')
    predict_pack = mz.pack(frame)
    predict_pack_processed = preprocessor.transform(predict_pack)
    model = mz.load_model(base + 'models/' + train_id + '.dssm_model')
    # Take the first score of the first (only) pair as a plain float.
    predict_score = float(model.predict(predict_pack_processed[:10].unpack()[0])[0][0])
    ret_dict = {'score': predict_score}
    print(ret_dict)
def DSSM(model):
    """Web endpoint: score a text pair with a saved model of the given type.

    Expects JSON request data with keys 'text1', 'text2' and 'train_id', and
    responds with JSON ``{'score': <float>}``.

    :param model: model-type name used to build the preprocessor/model file
        suffixes (e.g. 'dssm'); rebound below to the loaded model object.
    """
    request_data = json.loads(request.data.decode('utf-8'))
    q = request_data['text1']
    d = request_data['text2']
    train_id = request_data['train_id']
    df = pd.DataFrame(data={'text_left': [q], 'text_right': [d], 'label': [0]})
    preprocessor_suffix = '.' + model + '_preprocessor'
    preprocessor = mz.load_preprocessor(ROOT_PATH + 'matchzoo_temp_files/preprocessors/' + train_id + preprocessor_suffix)
    predict_pack = mz.pack(df)
    predict_pack_processed = preprocessor.transform(predict_pack)
    # Drop any stale TF graph/session before loading a fresh Keras model.
    keras.backend.clear_session()
    model_suffix = '.' + model + '_model'
    model = mz.load_model(ROOT_PATH + 'matchzoo_temp_files/models/' + train_id + model_suffix)
    # First score of the first (only) pair, as a plain float for JSON.
    predict_score = float(
        model.predict(predict_pack_processed[:10].unpack()[0])[0][0])
    ret_dict = {'score': predict_score}
    return jsonify(ret_dict)
def test_dssm(train_data_processed, task, train_generator, test_generator, dssm_preprocessor):
    """Build a DSSM model, train it, save, reload, and check predictions."""
    model = mz.models.DSSM()
    model.params['input_shapes'] = dssm_preprocessor.context['input_shapes']
    model.params['task'] = task
    model.guess_and_fill_missing_params()
    model.build()
    model.compile()
    model.fit_generator(train_generator)
    model.save('.tmpdir')
    X, y = test_generator[0]
    try:
        reloaded = mz.load_model('.tmpdir')
        predictions = reloaded.predict(X)
        assert len(predictions) > 0
        # Predictions come back as float32 scores.
        assert type(predictions[0][0]) == np.float32
    finally:
        shutil.rmtree('.tmpdir')
def test_cdssm(task, train_generator, test_generator, cdssm_preprocessor):
    """Build a CDSSM model, train it, save, reload, and check predictions."""
    model = mz.models.CDSSM()
    # CDSSM must advertise its matching preprocessor class.
    assert isinstance(model.get_default_preprocessor(),
                      mz.preprocessors.CDSSMPreprocessor)
    model.params['input_shapes'] = cdssm_preprocessor.context['input_shapes']
    model.params['task'] = task
    model.guess_and_fill_missing_params()
    model.build()
    model.compile()
    model.fit_generator(train_generator)
    model.save('.tmpdir')
    X, y = test_generator[0]
    try:
        reloaded = mz.load_model('.tmpdir')
        predictions = reloaded.predict(X)
        assert len(predictions) > 0
        # Predictions come back as float32 scores.
        assert type(predictions[0][0]) == np.float32
    finally:
        shutil.rmtree('.tmpdir')
# Script: load a saved MatchZoo model and score a CSV corpus with it.
import matchzoo as mz
import pandas as pd

print(mz.__version__)
data_pack = mz.pack(pd.read_csv('match-zoo-corpus-top-1w.csv'))
print(data_pack[-10:])
# Keras expects float labels.
data_pack.relation['label'] = data_pack.relation['label'].astype('float32')
frame = data_pack.frame  # NOTE(review): `frame` is never used afterwards
task = mz.tasks.Ranking()
# The same pack serves as both "train" (for fitting the preprocessor) and test.
train_raw = data_pack  # mz.datasets.toy.load_data(stage='train', task=task)
test_raw = data_pack  # mz.datasets.toy.load_data(stage='test', task=task)
model = mz.load_model('step-2-mz-model')
preprocessor = mz.preprocessors.BasicPreprocessor()
# NOTE(review): fitting a *fresh* preprocessor here assumes it reproduces the
# vocabulary the saved model was trained with — confirm against the training
# script, otherwise token ids will not line up.
preprocessor.fit(train_raw, verbose=0)  ## init preprocessor inner state.
# train_processed = preprocessor.transform(train_raw, verbose=5)
test_processed = preprocessor.transform(test_raw, verbose=0)
# x, y = train_processed.unpack()
test_x, test_y = test_processed.unpack()
results = model.predict(test_x)
print(type(results))
print(len(results))
print(results)
# Print a separator and the index for the first 20 results
# (NOTE(review): `item` itself is never printed — possibly unfinished).
for idx, item in enumerate(results[:20]):
    print('*' * 100)
    print(idx)
mz.preprocessors.units.punc_removal.PuncRemoval(), ]
# NOTE(review): the line above is the tail of a list literal whose opening
# bracket lies outside this view — left verbatim.

model_class = mz.models.MVLSTM
# auto.prepare wires together a model instance, a fitted-compatible
# preprocessor, a data generator builder and an embedding matrix.
model, preprocessor, data_generator_builder, embedding_matrix = mz.auto.prepare(
    task=ranking_task,
    model_class=model_class,
    preprocessor=preprocessor_class,
    data_pack=train_raw)
train_processed = preprocessor.fit_transform(train_raw, verbose=1)
# NOTE(review): test_data is the same pack as train_data, so tuning is scored
# on the training set — confirm this is intended.
tuner = mz.auto.Tuner(params=model.params,
                      train_data=train_processed,
                      test_data=train_processed,
                      num_runs=10)
results = tuner.tune()
print(results['best'])
# Apply the best sampled hyper-parameters back onto the model.
params = results['best']['sample']
print(params)
model.params['input_shapes'] = preprocessor.context['input_shapes']
model.params['mlp_num_fan_out'] = params['mlp_num_fan_out']
model.params['mlp_num_layers'] = params['mlp_num_layers']
model.params['mlp_num_units'] = params['mlp_num_units']
model.params['top_k'] = params['top_k']
# NOTE(review): compile() is called before build() here; MatchZoo examples
# normally build first — confirm this ordering works for MVLSTM.
model.compile()
model.build()
model.save('my-model')
loaded_model = mz.load_model('./my-model')
print("after==================================")
print(loaded_model.params)  # show the model's tunable parameters
# Script fragment: evaluate a saved MatchZoo model (NDCG@20) on a
# whitespace-separated test file. colName, modelFile, dataTranFile and
# dataFileTest are defined outside this view.
print(
    f'Collection: {colName} model file: {modelFile} data transform file: {dataTranFile}'
)
print(f'Test file: {dataFileTest}')
# Note dtype! don't let Pandas guess column data types!
dataTestPacked = pack(readWhiteSpacedMatchZooData(dataFileTest))
with open(dataTranFile, 'rb') as f:
    prep = pickle.load(f)
import pdb, sys
#try:
# NOTE(review): `if True:` is a placeholder for the commented-out try/except
# debugging harness below — consider restoring or removing it.
if True:
    dataTestProc = prep.transform(dataTestPacked)
    model = mz.load_model(modelFile)
    model.backend.summary()
    xTest, yTest = dataTestProc.unpack()
    # Override the task metrics so evaluate() reports NDCG@20 only.
    model.params['task'].metrics = [
        mz.metrics.NormalizedDiscountedCumulativeGain(k=20)
    ]
    print(model.evaluate(xTest, yTest, batch_size=128))
#except:
    # tb is traceback
    #type, value, tb = sys.exc_info()
    #pdb.post_mortem(tb)
def __init__(self, model_path, batch_size=8):
    """Load a saved MatchZoo model and its dill-serialized preprocessor.

    :param model_path: path prefix containing 'model' (directory) and
        'preprocessor.dill' — note it is concatenated, not joined, so it
        must end with a separator.
    :param batch_size: batch size retained for later prediction calls.
    """
    preprocess_path = model_path + "preprocessor.dill"
    self.model = mz.load_model(model_path + "model")
    # Use a context manager so the file handle is closed deterministically;
    # the original `dill.load(open(...))` leaked the handle.
    with open(preprocess_path, "rb") as fin:
        self.preprocessor = dill.load(fin)
    self.batch_size = batch_size
'text_left': X_left, 'text_right': X_right, 'id_left': X_left_id, 'id_right': X_right_id, 'label': y})
    # NOTE(review): the line above is the tail of a DataFrame literal inside
    # load_data(...) whose beginning lies outside this view — left verbatim.
    # Wrap the assembled frame into a MatchZoo DataPack.
    return mz.pack(df)


root_dir = r"D:\data\biendata\ccks2019_el\ccks_train_data\{}"

print('data loading ...')
# train_pack_raw = load_data('train', 100000)
# dev_pack_raw = load_data('validate', 200)
test_pack_raw = load_data('test', 200)

model_path = r"D:/data/biendata/ccks2019_el/entityrank/m0/model/"
preprocess_path = model_path + "preprocessor.dill"
# NOTE(review): bare load_model — presumably mz.load_model imported directly;
# confirm. It receives the directory prefix while the preprocessor path
# appends 'preprocessor.dill'.
model = load_model(model_path)
# NOTE(review): file handle from open(...) is never closed — consider a
# with-block as done elsewhere in this project.
preprocessor = dill.load(open(preprocess_path, "rb"))
# train_pack_processed = preprocessor.fit_transform(train_pack_raw)
# dev_pack_processed = preprocessor.transform(dev_pack_raw)
test_pack_processed = preprocessor.transform(test_pack_raw)
test_x, test_y = test_pack_processed.unpack()
# NOTE(review): the second positional argument (128) is presumably
# batch_size — confirm against the model's predict signature.
pre = model.predict(test_x, 128)
pass