def test_npa_component_definition(mind_resource_path):
    """Check that an NPA model built from the MIND demo utils has its core parts.

    Downloads ``MINDdemo_utils.zip`` into ``<mind_resource_path>/utils`` when
    ``npa.yaml`` is not already there, builds hparams from that yaml and
    asserts that the model, scorer, loss and train optimizer are all set.
    """
    utils_dir = os.path.join(mind_resource_path, "utils")
    config_path = os.path.join(utils_dir, "npa.yaml")

    if not os.path.exists(config_path):
        download_deeprec_resources(
            "https://recodatasets.z20.web.core.windows.net/newsrec/",
            utils_dir,
            "MINDdemo_utils.zip",
        )

    hparams = prepare_hparams(
        config_path,
        wordEmb_file=os.path.join(utils_dir, "embedding.npy"),
        wordDict_file=os.path.join(utils_dir, "word_dict.pkl"),
        userDict_file=os.path.join(utils_dir, "uid2index.pkl"),
        epochs=1,
    )
    # The iterator class itself (not an instance) is handed to the model.
    model = NPAModel(hparams, MINDIterator)

    for component in ("model", "scorer", "loss", "train_optimizer"):
        assert getattr(model, component) is not None
def test_naml_component_definition(tmp):
    """Check that a NAML model built from the MIND demo utils has its core parts.

    Downloads ``MINDdemo_utils.zip`` into ``<tmp>/utils`` when ``naml.yaml``
    is missing, builds hparams (including the vert/subvert dictionaries) and
    asserts that the model, scorer, loss and train optimizer are all set.
    """
    utils_dir = os.path.join(tmp, "utils")
    config_path = os.path.join(utils_dir, "naml.yaml")

    if not os.path.exists(config_path):
        download_deeprec_resources(
            "https://recodatasets.blob.core.windows.net/newsrec/",
            utils_dir,
            "MINDdemo_utils.zip",
        )

    hparams = prepare_hparams(
        config_path,
        wordEmb_file=os.path.join(utils_dir, "embedding_all.npy"),
        wordDict_file=os.path.join(utils_dir, "word_dict_all.pkl"),
        userDict_file=os.path.join(utils_dir, "uid2index.pkl"),
        vertDict_file=os.path.join(utils_dir, "vert_dict.pkl"),
        subvertDict_file=os.path.join(utils_dir, "subvert_dict.pkl"),
        epochs=1,
    )
    # The iterator class itself (not an instance) is handed to the model.
    model = NAMLModel(hparams, MINDAllIterator)

    for component in ("model", "scorer", "loss", "train_optimizer"):
        assert getattr(model, component) is not None
def test_prepare_hparams(must_exist_attributes, deeprec_resource_path):
    """Check that hparams built from ``nrms.yaml`` expose the requested attribute.

    ``must_exist_attributes`` is passed straight to ``hasattr`` as the
    attribute name. The MIND demo utils archive is downloaded into
    ``<deeprec_resource_path>/mind/utils`` when ``nrms.yaml`` is missing.
    """
    utils_dir = os.path.join(deeprec_resource_path, "mind", "utils")
    config_path = os.path.join(utils_dir, "nrms.yaml")

    if not os.path.exists(config_path):
        download_deeprec_resources(
            "https://recodatasets.z20.web.core.windows.net/newsrec/",
            utils_dir,
            "MINDdemo_utils.zip",
        )

    hparams = prepare_hparams(
        config_path,
        wordEmb_file=os.path.join(utils_dir, "embedding.npy"),
        wordDict_file=os.path.join(utils_dir, "word_dict.pkl"),
        userDict_file=os.path.join(utils_dir, "uid2index.pkl"),
        epochs=1,
    )

    assert hasattr(hparams, must_exist_attributes)
def get_params(yaml_name, device_id=0, model_class=None):
    """Build hparams for the given yaml config name.

    Args:
        yaml_name: File name joined under ``<get_root_path()>/utils``. When
            falsy, the path returned by ``get_yaml_path()`` is used instead.
        device_id: Forwarded to ``prepare_hparams`` as ``device_id``.
        model_class: Forwarded to ``prepare_hparams`` as ``model_class``.

    Returns:
        Whatever ``prepare_hparams`` returns.

    Note:
        ``epochs`` is not a parameter; it is looked up as a free name from
        the enclosing scope, which is not visible in this block.
    """
    config_path = (
        os.path.join(get_root_path(), "utils", yaml_name)
        if yaml_name
        else get_yaml_path()
    )
    return prepare_hparams(
        config_path,
        wordEmb_file=get_emb_path(),
        wordDict_file=get_dict_file(),
        device_id=device_id,
        epochs=epochs,
        show_step=10,
        userDict_file=get_user_dic_path(),
        model_class=model_class,
    )
def test_naml_iterator(mind_resource_path):
    """Check that ``MINDAllIterator`` yields dict batches with 11 entries.

    Missing MIND demo archives (train, dev, utils) are downloaded first.
    One iterator is built with ``hparams.npratio`` and one with ``-1``; only
    the first batch of each is inspected.
    """
    train_dir = os.path.join(mind_resource_path, "train")
    valid_dir = os.path.join(mind_resource_path, "valid")
    utils_dir = os.path.join(mind_resource_path, "utils")

    train_news = os.path.join(train_dir, "news.tsv")
    train_behaviors = os.path.join(train_dir, "behaviors.tsv")
    valid_news = os.path.join(valid_dir, "news.tsv")
    valid_behaviors = os.path.join(valid_dir, "behaviors.tsv")
    config_path = os.path.join(utils_dir, "naml.yaml")

    # (file whose absence triggers the download, target dir, archive name)
    base_url = "https://recodatasets.z20.web.core.windows.net/newsrec/"
    resources = (
        (train_news, train_dir, "MINDdemo_train.zip"),
        (valid_news, valid_dir, "MINDdemo_dev.zip"),
        (config_path, utils_dir, "MINDdemo_utils.zip"),
    )
    for marker, target_dir, archive in resources:
        if not os.path.exists(marker):
            download_deeprec_resources(base_url, target_dir, archive)

    hparams = prepare_hparams(
        config_path,
        wordEmb_file=os.path.join(utils_dir, "embedding_all.npy"),
        wordDict_file=os.path.join(utils_dir, "word_dict_all.pkl"),
        userDict_file=os.path.join(utils_dir, "uid2index.pkl"),
        vertDict_file=os.path.join(utils_dir, "vert_dict.pkl"),
        subvertDict_file=os.path.join(utils_dir, "subvert_dict.pkl"),
        epochs=1,
        batch_size=1024,
    )

    train_iterator = MINDAllIterator(hparams, hparams.npratio)
    test_iterator = MINDAllIterator(hparams, -1)

    checks = (
        (train_iterator, train_news, train_behaviors),
        (test_iterator, valid_news, valid_behaviors),
    )
    for data_iter, news_path, behaviors_path in checks:
        assert data_iter is not None
        for batch in data_iter.load_data_from_file(news_path, behaviors_path):
            assert isinstance(batch, dict)
            assert len(batch) == 11
            break  # inspecting the first batch is enough
def test_model_naml(mind_resource_path):
    """Run one evaluation and one fit of ``NAMLModel`` on the MIND demo data.

    Missing MIND demo archives (train, dev, utils) are downloaded first.
    Asserts that ``run_eval`` returns something and that ``fit`` returns a
    ``BaseModel`` instance.
    """
    train_dir = os.path.join(mind_resource_path, "train")
    valid_dir = os.path.join(mind_resource_path, "valid")
    utils_dir = os.path.join(mind_resource_path, "utils")

    train_news = os.path.join(train_dir, "news.tsv")
    train_behaviors = os.path.join(train_dir, "behaviors.tsv")
    valid_news = os.path.join(valid_dir, "news.tsv")
    valid_behaviors = os.path.join(valid_dir, "behaviors.tsv")
    config_path = os.path.join(utils_dir, "naml.yaml")

    # (file whose absence triggers the download, target dir, archive name)
    base_url = "https://recodatasets.z20.web.core.windows.net/newsrec/"
    resources = (
        (train_news, train_dir, "MINDdemo_train.zip"),
        (valid_news, valid_dir, "MINDdemo_dev.zip"),
        (config_path, utils_dir, "MINDdemo_utils.zip"),
    )
    for marker, target_dir, archive in resources:
        if not os.path.exists(marker):
            download_deeprec_resources(base_url, target_dir, archive)

    hparams = prepare_hparams(
        config_path,
        wordEmb_file=os.path.join(utils_dir, "embedding_all.npy"),
        wordDict_file=os.path.join(utils_dir, "word_dict_all.pkl"),
        userDict_file=os.path.join(utils_dir, "uid2index.pkl"),
        vertDict_file=os.path.join(utils_dir, "vert_dict.pkl"),
        subvertDict_file=os.path.join(utils_dir, "subvert_dict.pkl"),
        epochs=1,
    )
    model = NAMLModel(hparams, MINDAllIterator)

    eval_result = model.run_eval(valid_news, valid_behaviors)
    assert eval_result is not None

    fitted = model.fit(train_news, train_behaviors, valid_news, valid_behaviors)
    assert isinstance(fitted, BaseModel)
def test_news_iterator(tmp):
    """Check that ``MINDIterator`` yields dict batches with 5 entries.

    Missing MIND demo archives (train, dev, utils) are downloaded under
    ``tmp`` first. One iterator is built with ``hparams.npratio`` and one
    with ``-1``; only the first batch of each is inspected.
    """
    train_dir = os.path.join(tmp, "train")
    valid_dir = os.path.join(tmp, "valid")
    utils_dir = os.path.join(tmp, "utils")

    train_news = os.path.join(train_dir, "news.tsv")
    train_behaviors = os.path.join(train_dir, "behaviors.tsv")
    valid_news = os.path.join(valid_dir, "news.tsv")
    valid_behaviors = os.path.join(valid_dir, "behaviors.tsv")
    config_path = os.path.join(utils_dir, "nrms.yaml")

    # (file whose absence triggers the download, target dir, archive name)
    base_url = "https://recodatasets.blob.core.windows.net/newsrec/"
    resources = (
        (train_news, train_dir, "MINDdemo_train.zip"),
        (valid_news, valid_dir, "MINDdemo_dev.zip"),
        (config_path, utils_dir, "MINDdemo_utils.zip"),
    )
    for marker, target_dir, archive in resources:
        if not os.path.exists(marker):
            download_deeprec_resources(base_url, target_dir, archive)

    hparams = prepare_hparams(
        config_path,
        wordEmb_file=os.path.join(utils_dir, "embedding.npy"),
        wordDict_file=os.path.join(utils_dir, "word_dict.pkl"),
        userDict_file=os.path.join(utils_dir, "uid2index.pkl"),
        epochs=1,
    )

    train_iterator = MINDIterator(hparams, hparams.npratio)
    test_iterator = MINDIterator(hparams, -1)

    checks = (
        (train_iterator, train_news, train_behaviors),
        (test_iterator, valid_news, valid_behaviors),
    )
    for data_iter, news_path, behaviors_path in checks:
        assert data_iter is not None
        for batch in data_iter.load_data_from_file(news_path, behaviors_path):
            assert isinstance(batch, dict)
            assert len(batch) == 5
            break  # inspecting the first batch is enough
def test_model_nrms(tmp):
    """Run one evaluation and one fit of ``NRMSModel`` on the MIND demo data.

    Missing MIND demo archives (train, dev, utils) are downloaded under
    ``tmp`` first. Asserts that hparams were built, that ``run_eval``
    returns something and that ``fit`` returns a ``BaseModel`` instance.
    """
    train_dir = os.path.join(tmp, "train")
    valid_dir = os.path.join(tmp, "valid")
    utils_dir = os.path.join(tmp, "utils")

    train_news = os.path.join(train_dir, "news.tsv")
    train_behaviors = os.path.join(train_dir, "behaviors.tsv")
    valid_news = os.path.join(valid_dir, "news.tsv")
    valid_behaviors = os.path.join(valid_dir, "behaviors.tsv")
    config_path = os.path.join(utils_dir, "nrms.yaml")

    # (file whose absence triggers the download, target dir, archive name)
    base_url = "https://recodatasets.blob.core.windows.net/newsrec/"
    resources = (
        (train_news, train_dir, "MINDdemo_train.zip"),
        (valid_news, valid_dir, "MINDdemo_dev.zip"),
        (config_path, utils_dir, "MINDdemo_utils.zip"),
    )
    for marker, target_dir, archive in resources:
        if not os.path.exists(marker):
            download_deeprec_resources(base_url, target_dir, archive)

    hparams = prepare_hparams(
        config_path,
        wordEmb_file=os.path.join(utils_dir, "embedding.npy"),
        wordDict_file=os.path.join(utils_dir, "word_dict.pkl"),
        userDict_file=os.path.join(utils_dir, "uid2index.pkl"),
        epochs=1,
    )
    assert hparams is not None

    model = NRMSModel(hparams, MINDIterator)

    eval_result = model.run_eval(valid_news, valid_behaviors)
    assert eval_result is not None

    fitted = model.fit(train_news, train_behaviors, valid_news, valid_behaviors)
    assert isinstance(fitted, BaseModel)
def test_lstur_component_definition(tmp):
    """Check that an LSTUR model built from ``lstur.yaml`` has its core parts.

    Downloads ``lstur.zip`` into ``tmp`` when ``lstur.yaml`` is missing,
    builds hparams and asserts that the model, scorer, loss and train
    optimizer are all set.
    """
    config_path = os.path.join(tmp, "lstur.yaml")
    embedding_path = os.path.join(tmp, "embedding.npy")

    if not os.path.exists(config_path):
        download_deeprec_resources(
            "https://recodatasets.blob.core.windows.net/newsrec/",
            tmp,
            "lstur.zip",
        )

    hparams = prepare_hparams(config_path, wordEmb_file=embedding_path, epochs=1)
    # The iterator class itself (not an instance) is handed to the model.
    model = LSTURModel(hparams, NewsIterator)

    for component in ("model", "scorer", "loss", "train_optimizer"):
        assert getattr(model, component) is not None
def test_model_npa(tmp):
    """Run one evaluation and one fit of ``NPAModel`` on files under ``tmp``.

    Downloads ``npa.zip`` into ``tmp`` when ``npa.yaml`` is missing. Asserts
    that hparams were built, that ``run_eval`` returns something and that
    ``fit`` returns a ``BaseModel`` instance.
    """
    config_path = os.path.join(tmp, "npa.yaml")
    train_path = os.path.join(tmp, "train.txt")
    valid_path = os.path.join(tmp, "test.txt")
    embedding_path = os.path.join(tmp, "embedding.npy")

    if not os.path.exists(config_path):
        download_deeprec_resources(
            "https://recodatasets.blob.core.windows.net/newsrec/",
            tmp,
            "npa.zip",
        )

    hparams = prepare_hparams(config_path, wordEmb_file=embedding_path, epochs=1)
    assert hparams is not None

    model = NPAModel(hparams, NewsIterator)

    eval_result = model.run_eval(valid_path)
    assert eval_result is not None

    fitted = model.fit(train_path, valid_path)
    assert isinstance(fitted, BaseModel)
# Input paths for this NAML run. Only the behaviors files are per-split here;
# the per-split news and test paths below are intentionally left disabled.
train_behaviors_file = os.path.join(data_path, 'train', r'final_behaviors.tsv')
# valid_news_file = os.path.join(data_path, 'valid', r'news.tsv')
valid_behaviors_file = os.path.join(data_path, 'valid', r'final_behaviors.tsv')
# test_news_file = os.path.join(data_path, 'test', r'news.tsv')
# test_behaviors_file = os.path.join(data_path, 'test', r'behaviors.tsv')

# Shared resources under <data_path>/utils; the yaml name follows opt.model_name.
wordEmb_file = os.path.join(data_path, "utils", "embedding.npy")
userDict_file = os.path.join(data_path, "utils", "uid2index.pkl")
wordDict_file = os.path.join(data_path, "utils", "word_dict_all.pkl")
subvertDict_file = os.path.join(data_path, "utils", "subvert_dict.pkl")
vertDict_file = os.path.join(data_path, "utils", "vert_dict.pkl")
yaml_file = os.path.join(data_path, "utils", '{}.yaml'.format(opt.model_name))

hparams = prepare_hparams(
    yaml_file,
    wordEmb_file=wordEmb_file,
    wordDict_file=wordDict_file,
    userDict_file=userDict_file,
    vertDict_file=vertDict_file,
    subvertDict_file=subvertDict_file,
    batch_size=batch_size,
    epochs=epochs,
)
print(hparams)

# The iterator class itself (not an instance) is handed to the model.
# (The original bound this name twice in one chained assignment.)
iterator = MINDAllIterator
model = NAMLModel(hparams, iterator, seed=seed)

# print(model.run_slow_eval(news_file, valid_behaviors_file))
# The same news_file is passed for both the training and validation news.
model.fit(news_file, train_behaviors_file, news_file, valid_behaviors_file)

# model_path = os.path.join(model_path, "model")
# os.makedirs(model_path, exist_ok=True)
# Per-split input files.
train_behaviors_file = os.path.join(data_path, 'train', r'behaviors.tsv')
valid_news_file = os.path.join(data_path, 'valid', r'news.tsv')
valid_behaviors_file = os.path.join(data_path, 'valid', r'behaviors.tsv')
test_news_file = os.path.join(data_path, 'test', r'news.tsv')
test_behaviors_file = os.path.join(data_path, 'test', r'behaviors.tsv')

# Shared resources under <data_path>/utils.
wordEmb_file = os.path.join(data_path, "utils", "embedding_all.npy")
userDict_file = os.path.join(data_path, "utils", "uid2index.pkl")
wordDict_file = os.path.join(data_path, "utils", "word_dict_all.pkl")
yaml_file = os.path.join(data_path, "utils", r'nrms.yaml')
entityDict_file = os.path.join(data_path, "utils", "entity_dict_all.pkl")
entity_embedding_file = os.path.join(data_path, "utils", "entity_embeddings_5w_100_all.npy")
context_embedding_file = os.path.join(data_path, "utils", "context_embeddings_5w_100_all.npy")

hparams = prepare_hparams(
    yaml_file,
    wordEmb_file=wordEmb_file,
    wordDict_file=wordDict_file,
    userDict_file=userDict_file,
    epochs=epochs,
    entityEmb_file=entity_embedding_file,
    contextEmb_file=context_embedding_file,
    entityDict_file=entityDict_file,
    show_step=10,
)
print(hparams)

# The iterator class itself (not an instance) is handed to the model.
iterator = MINDIterator
model = NRMSModel(hparams, iterator, seed=seed)

# Load the weights stored as <data_path>/model/my_nrms_ckpt.
model_path = os.path.join(data_path, "model")
model.model.load_weights(os.path.join(model_path, "my_nrms_ckpt"))

# Count lines without holding the whole file in memory; the context manager
# closes the handle even if reading raises.
with open(test_behaviors_file) as f:
    print('total test samples:', sum(1 for _ in f))

group_impr_indexes, group_labels, group_preds = model.run_fast_eval(
    test_news_file, test_behaviors_file, test=1
)

# NOTE(review): kept in place because code following this fragment presumably
# relies on `np`; ideally this import lives at the top of the file.
import numpy as np
# Resolve the download location and archive names for the chosen MIND_type.
mind_url, mind_train_dataset, mind_dev_dataset, mind_utils = get_mind_data_set(
    MIND_type
)

# Fetch each archive only when its marker file is missing.
if not os.path.exists(train_news_file):
    download_deeprec_resources(
        mind_url, os.path.join(data_path, 'train'), mind_train_dataset
    )

if not os.path.exists(valid_news_file):
    download_deeprec_resources(
        mind_url, os.path.join(data_path, 'valid'), mind_dev_dataset
    )

if not os.path.exists(yaml_file):
    # The utils archive comes from a fixed URL rather than mind_url.
    download_deeprec_resources(
        r'https://recodatasets.blob.core.windows.net/newsrec/',
        os.path.join(data_path, 'utils'),
        mind_utils,
    )

hparams = prepare_hparams(
    yaml_file,
    wordEmb_file=wordEmb_file,
    wordDict_file=wordDict_file,
    userDict_file=userDict_file,
    epochs=epochs,
    show_step=10,
)
print("[NRMS] Config,", hparams)

# The iterator class itself (not an instance) is handed to the model.
iterator = MINDIterator
model = NRMSModel(hparams, iterator, seed=seed)

# Evaluate once before training, then fit and save under model_dir.
print(
    "[NRMS] First run:",
    model.run_eval(valid_news_file, fast_valid_behaviors_file),
)
model.fit(
    train_news_file,
    train_behaviors_file,
    valid_news_file,
    fast_valid_behaviors_file,
    model_save_path=model_dir,
)
# Fetch each archive only when its marker file is missing.
if not os.path.exists(train_news_file):
    download_deeprec_resources(
        mind_url, os.path.join(data_path, 'train'), mind_train_dataset
    )

if not os.path.exists(valid_news_file):
    download_deeprec_resources(
        mind_url, os.path.join(data_path, 'valid'), mind_dev_dataset
    )

if not os.path.exists(test_news_file):
    download_deeprec_resources(
        mind_url, os.path.join(data_path, 'test'), mind_test_dataset
    )

if not os.path.exists(yaml_file):
    # The utils archive comes from a fixed URL rather than mind_url.
    download_deeprec_resources(
        r'https://recodatasets.blob.core.windows.net/newsrec/',
        os.path.join(data_path, 'utils'),
        mind_utils,
    )

hparams = prepare_hparams(
    yaml_file,
    wordEmb_file=wordEmb_file,
    wordDict_file=wordDict_file,
    userDict_file=userDict_file,
    batch_size=batch_size,
    epochs=epochs,
    show_step=10,
)
print(hparams)

# The iterator class itself (not an instance) is handed to the model.
iterator = MINDIterator
model = NRMSModel(hparams, iterator, seed=seed)

# print(model.run_eval(valid_news_file, valid_behaviors_file))
model.fit(
    train_news_file,
    train_behaviors_file,
    valid_news_file,
    valid_behaviors_file,
)

# res_syn = model.run_eval(valid_news_file, valid_behaviors_file)
# print(res_syn)
# pm.record("res_syn", res_syn)
# Fetch each archive only when its marker file is missing, logging which
# check triggered the download.
if not os.path.exists(train_news_file):
    print("not os.path.exists(train_news_file)")
    download_deeprec_resources(
        mind_url, os.path.join(data_path, 'train'), mind_train_dataset
    )

if not os.path.exists(valid_news_file):
    print("not os.path.exists(valid_news_file)")
    download_deeprec_resources(
        mind_url, os.path.join(data_path, 'valid'), mind_dev_dataset
    )

if not os.path.exists(yaml_file):
    print("not os.path.exists(yaml_file)")
    # The utils archive comes from a fixed URL rather than mind_url.
    download_deeprec_resources(
        r'https://recodatasets.blob.core.windows.net/newsrec/',
        os.path.join(data_path, 'utils'),
        mind_utils,
    )

## Create hyper-parameters
hparams = prepare_hparams(
    yaml_file,
    wordEmb_file=wordEmb_file,
    wordDict_file=wordDict_file,
    userDict_file=userDict_file,
    epochs=epochs,
)
print('hparams:', hparams)

# We generate a word_dict file to transform words in news titles to word
# indexes, and an embedding matrix is initialized from pretrained GloVe
# embeddings.
iterator = MINDIterator

## Train the LSTUR model
model = LSTURModel(hparams, iterator, seed=seed)

# news: 18723, behaviors: 7538 (with an extra first column)
print(model.run_eval(valid_news_file, valid_behaviors_file))

model.fit(
    train_news_file,
    train_behaviors_file,
    valid_news_file,
    valid_behaviors_file,
)