def test_save_load(base_preprocessor):
    """Round-trip a preprocessor through save/load and reject a double save."""
    target_dir = '.tmpdir'
    base_preprocessor.save(target_dir)
    # A saved preprocessor must come back as a truthy, loadable object.
    assert mz.load_preprocessor(target_dir)
    # Saving again into the same directory must raise.
    with pytest.raises(FileExistsError):
        base_preprocessor.save(target_dir)
    shutil.rmtree(target_dir)
def predict(train_id='test_file'):
    """Score a fixed question/document pair with a saved DSSM model.

    Loads the preprocessor and model that were persisted under
    *train_id*, builds a one-row data pack from a hard-coded QA pair,
    and predicts a matching score.

    :param train_id: identifier used when the preprocessor/model were saved.
    :return: dict with a single key ``'score'`` holding the float prediction.
    """
    q = 'how did apollo creed die'
    d = "Urban legend states that Apollo Creed's name is a wordplay on the Apostles' Creed , a statement of belief used in Christian churches."
    df = pd.DataFrame(data={'text_left': [q], 'text_right': [d], 'label': [0]})
    preprocessor = mz.load_preprocessor(
        ROOT_PATH + 'matchzoo_temp_files/preprocessors/' + train_id + '.dssm_preprocessor')
    predict_pack = mz.pack(df)
    predict_pack_processed = preprocessor.transform(predict_pack)
    model = mz.load_model(
        ROOT_PATH + 'matchzoo_temp_files/models/' + train_id + '.dssm_model')
    # Single-row pack: take the first score of the first prediction.
    predict_score = float(
        model.predict(predict_pack_processed[:10].unpack()[0])[0][0])
    ret_dict = {'score': predict_score}
    print(ret_dict)
    # FIX: the score was computed but never returned — callers could only
    # observe it on stdout. Returning it is backward compatible.
    return ret_dict
def DSSM(model):
    """Flask endpoint: score a text pair with a saved MatchZoo model.

    Expects a JSON request body with keys ``text1``, ``text2`` and
    ``train_id``.

    :param model: model-type name (e.g. ``'dssm'``) used to locate the
        saved preprocessor and model files.
    :return: JSON response ``{'score': <float>}``.
    """
    request_data = json.loads(request.data.decode('utf-8'))
    q = request_data['text1']
    d = request_data['text2']
    train_id = request_data['train_id']
    df = pd.DataFrame(data={'text_left': [q], 'text_right': [d], 'label': [0]})
    preprocessor_suffix = '.' + model + '_preprocessor'
    preprocessor = mz.load_preprocessor(
        ROOT_PATH + 'matchzoo_temp_files/preprocessors/' + train_id + preprocessor_suffix)
    predict_pack = mz.pack(df)
    predict_pack_processed = preprocessor.transform(predict_pack)
    # Reset the Keras/TF graph so repeated requests do not accumulate state.
    keras.backend.clear_session()
    model_suffix = '.' + model + '_model'
    # FIX: bind the loaded model to a new name instead of shadowing the
    # `model` (string) parameter; the dead commented-out DRMM branch that
    # relied on the shadowed name has been removed.
    loaded_model = mz.load_model(
        ROOT_PATH + 'matchzoo_temp_files/models/' + train_id + model_suffix)
    predict_score = float(
        loaded_model.predict(predict_pack_processed[:10].unpack()[0])[0][0])
    ret_dict = {'score': predict_score}
    return jsonify(ret_dict)
def _load_basic_preprocessor(self, extra_terms):
    """Fetch a cached preprocessor from disk or build a fresh default one.

    :param extra_terms: extra vocabulary terms attached to a newly built
        preprocessor.
    :return: ``(preprocessor, save_path)`` — ``save_path`` is ``None``
        when a cached preprocessor was loaded (nothing new to persist).
    """
    is_bert = self.model_class == mz.models.Bert
    kind = "bert" if is_bert else "basic"
    cache_path = os.path.join(self.preprocessor_path,
                              ".".join([kind, self.dataset.dataset]))
    if not is_bert and os.path.exists(cache_path):
        # Cached on disk: load it and neutralize fit() so it cannot be
        # accidentally re-fitted by downstream code.
        preprocessor = mz.load_preprocessor(cache_path)
        print("Load Preprocessor from %s" % cache_path)
        preprocessor.fit = lambda *args, **argv: None
        return preprocessor, None
    print("Init Preprocessor")
    preprocessor = self.model_class.get_default_preprocessor(
        truncated_length_left=20,
        truncated_length_right=492 if is_bert else 1024,
        truncated_mode="post")
    # Multiprocessing is disabled in debug mode for easier stepping.
    preprocessor.multiprocessing = 0 if self.dataset.debug_mode else 1
    preprocessor.extra_terms = extra_terms
    return preprocessor, cache_path
# Evaluate a saved ESIM classifier on the CFQ test split.
# NOTE(review): `split` and `data_root` are defined outside this visible chunk.
model_path = f"./model/traversal_path_esim-{split}"
task = mz.tasks.Classification(num_classes=2)
task.metrics = ['acc']
print("`classification_task` initialized with metrics", task.metrics)
# Newest checkpoint file in the model directory, by modification time.
best_model = sorted(os.listdir(model_path), key=lambda fn: os.path.getmtime(model_path + '/' + fn))[-1]
test_raw = mz.datasets.cfq.load_data(stage='test', task=task, data_root=data_root, suffix="mask_predict_classification.csv")
print('data loaded as `train_pack_raw` `dev_pack_raw` `test_pack_raw`')
# print(model_path, )
# The preprocessor was persisted alongside the model checkpoints.
preprocessor = mz.load_preprocessor(model_path)
# preprocessor.fit(train_raw)
# train_processed = preprocessor.transform(train_raw)
test_processed = preprocessor.transform(test_raw)
# print(test_processed.frame())
# Pointwise dataset; no shuffling for deterministic test-time iteration.
testset = mz.dataloader.Dataset(data_pack=test_processed, mode='point', batch_size=1024, shuffle=False)
padding_callback = mz.models.ESIM.get_default_padding_callback()
# NOTE(review): this call is cut off at the edge of the visible chunk;
# its remaining arguments continue beyond this view.
testloader = mz.dataloader.DataLoader(dataset=testset, stage='test',
def test_save_load(base_preprocessor):
    """Saving then loading a preprocessor yields a usable (truthy) object."""
    target_dir = '.tmpdir'
    base_preprocessor.save(target_dir)
    restored = mz.load_preprocessor(target_dir)
    assert restored
    shutil.rmtree(target_dir)
def prepare_test(self, test_pack):
    """Transform a raw test pack with the preprocessor persisted on disk.

    :param test_pack: raw data pack to transform.
    :return: the transformed data pack.
    """
    loaded = mz.load_preprocessor(self.preprocessordir)
    return loaded.transform(test_pack)
# NOTE(review): the opening `if ...:` of this argv check sits before the
# visible chunk; the first two statements below are its body.
    print("Give fold number")
    sys.exit(0)
elif len(sys.argv) < 3:
    print("Give path folder")
    sys.exit(0)
# argv[2]: directory containing the saved per-fold preprocessors.
path = sys.argv[2]
print("loading embedding ...")
glove_embedding = mz.datasets.embeddings.load_glove_embedding(dimension=300)
print("embedding loaded as `glove_embedding`")
# argv[1]: fold number selecting which preprocessor to load.
fold = sys.argv[1]
print("Loading fold: ", fold)
preprocessor = mz.load_preprocessor(path + "robust_preprocessor_fold_" + fold)
print("preprocessor context: ", preprocessor.context)
# Pairwise ranking with one negative sample per positive example.
ranking_task = mz.tasks.Ranking(loss=mz.losses.RankHingeLoss(num_neg=1))
ranking_task.metrics = [
    mz.metrics.NormalizedDiscountedCumulativeGain(k=20),
    mz.metrics.MeanAveragePrecision(),
    mz.metrics.Precision(k=20)
]
print("ranking task ok")
bin_size = 30
model = mz.models.DRMM()
# Push the preprocessor's fitted context (vocab sizes, etc.) into the model.
model.params.update(preprocessor.context)
# NOTE(review): `task`, `fit_preprocessor`, and `all_data_raw` are defined
# outside this visible chunk.
print("task is", task)
print("`task` initialized with metrics", task.metrics)
if fit_preprocessor:
    # Fit a fresh ESIM preprocessor on the full raw corpus and persist it.
    preprocessor = mz.models.ESIM.get_default_preprocessor(
        truncated_mode='pre',
        truncated_length_left=64,
        truncated_length_right=256,
        filter_mode='df',
        filter_low_freq=2)
    preprocessor = preprocessor.fit(all_data_raw)
    preprocessor.save("preprocessor.prep")
else:
    # Reuse a previously fitted preprocessor from disk.
    preprocessor = mz.load_preprocessor("preprocessor.prep")
candidate_dic = pd.read_feather('data/candidate_dic.ftr')
train_recall = pd.read_feather('data/train_recall.ftr')
train_description = pd.read_feather('data/train_description.ftr')
# Attach left-side descriptions and right-side candidate text to the
# recall pairs, then drop exact duplicate rows.
train_recall = pd.merge(train_recall, train_description, how='left', on='id_left')
train_recall = pd.merge(train_recall, candidate_dic, how='left', on='id_right')
train_recall = train_recall.drop_duplicates().reset_index(drop=True)
# Free the large description frame before loading the next file.
del train_description
gc.collect()
test_recall = pd.read_feather('data/test_recall.ftr')