def load_data(data_name):
    timer = utils.timer(name='main')
    data_path = './data/' + data_name
    user_pref_file = data_path + '/U_BPR.npy'
    item_pref_file = data_path + '/V_BPR.npy'
    item_content_file = data_path + '/item_features.txt'
    train_file = data_path + '/train.csv'
    test_file = data_path + '/test.csv'
    vali_file = data_path + '/vali.csv'
    dat = {}

    # load preference data
    timer.tic()
    dat['u_pref'] = np.load(user_pref_file)
    dat['v_pref'] = np.load(item_pref_file)
    timer.toc('loaded U:%s,V:%s' % (str(dat['u_pref'].shape), str(dat['v_pref'].shape))).tic()

    # pre-process preference data
    _, dat['u_pref'] = utils.standardize(dat['u_pref'])
    _, dat['v_pref'] = utils.standardize_2(dat['v_pref'])
    timer.toc('standardized U,V').tic()

    # load item (article) content data
    # load_svmlight_file(file) reads an svmlight-format file, stored as
    #   <label> <feature-id>:<feature-value> <feature-id>:<feature-value> ...
    # If zero_based is False, all indices are decremented by 1.
    # Returns (X, y) where X is a scipy.sparse matrix and y is a numpy.ndarray.
    item_content, _ = datasets.load_svmlight_file(item_content_file, zero_based=True, dtype=np.float32)
    # TF-IDF weighting of the text features
    item_content = tfidf(item_content)
    # SVD dimensionality reduction
    u, s, _ = randomized_svd(item_content, n_components=300, n_iter=5)
    item_content = u * s
    # standardize the features
    _, item_content = utils.standardize(item_content)
    dat['item_content'] = item_content
    timer.toc('loaded item feature sparse matrix: %s' % (str(item_content.shape))).tic()

    # load split
    train = pd.read_csv(train_file, dtype=np.int32)
    dat['user_list'] = train['uid'].values
    dat['item_list'] = train['iid'].values
    timer.toc('read train triplets %s' % str(train.shape))

    dat['test_eval'] = data.load_eval_data(test_file)
    dat['vali_eval'] = data.load_eval_data(vali_file)
    return dat
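# NOTE: the `tfidf` helper called above is not defined in this file. A
# minimal sketch of one plausible implementation, assuming it wraps
# sklearn's TfidfTransformer to reweight the raw svmlight term counts;
# the project's actual helper may differ:
from sklearn.feature_extraction.text import TfidfTransformer

def tfidf(x):
    # x: scipy.sparse term-count matrix from load_svmlight_file;
    # returns the TF-IDF weighted matrix, same shape, still sparse
    return TfidfTransformer().fit_transform(x)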
def load_data(data_name):
    timer = utils.timer(name='main').tic()
    data_path = './data/' + data_name
    u_file = data_path + '/U_BPR.npy'
    v_file = data_path + '/V_BPR.npy'
    user_content_file = data_path + '/user_content.npz'
    train_file = data_path + '/train.csv'
    test_file = data_path + '/test.csv'
    vali_file = data_path + '/vali.csv'
    dat = {}

    # load preference data
    timer.tic()
    u_pref = np.load(u_file)
    v_pref = np.load(v_file)
    dat['u_pref'] = u_pref
    dat['v_pref'] = v_pref
    timer.toc('loaded U:%s,V:%s' % (str(u_pref.shape), str(v_pref.shape))).tic()

    # pre-process
    _, dat['u_pref'] = utils.standardize_2(u_pref)
    _, dat['v_pref'] = utils.standardize(v_pref)
    timer.toc('standardized U,V').tic()

    # load content data
    timer.tic()
    user_content = scipy.sparse.load_npz(user_content_file)
    dat['user_content'] = user_content.tolil(copy=False)
    timer.toc('loaded user feature sparse matrix: %s' % (str(user_content.shape))).tic()

    # load split
    timer.tic()
    train = pd.read_csv(train_file, dtype=np.int32)
    dat['user_list'] = train['uid'].values
    dat['item_list'] = train['iid'].values
    dat['warm_item'] = np.unique(train['iid'].values)
    timer.toc('read train triplets %s' % str(train.shape)).tic()

    dat['vali_eval'] = data.load_eval_data(vali_file, cold_user=True, test_item_ids=dat['warm_item'])
    dat['test_eval'] = data.load_eval_data(test_file, cold_user=True, test_item_ids=dat['warm_item'])
    return dat
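# NOTE: `utils.standardize` / `utils.standardize_2` are not shown in this
# file. A minimal sketch of the interface their call sites imply (return a
# fitted scaler plus the transformed matrix); the `_2` variant presumably
# differs only in scaling details. This is an assumption, not the
# project's confirmed implementation:
from sklearn.preprocessing import StandardScaler

def standardize(x):
    # Zero-mean, unit-variance scaling per column; returning the fitted
    # scaler lets the same transform be reapplied to new rows later.
    scaler = StandardScaler().fit(x)
    return scaler, scaler.transform(x)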
def eval():
    # Load graph
    g = Graph(is_training=False)
    print("Graph loaded")

    # Load data
    x, y = load_eval_data()
    char2idx, idx2char = load_vocab()

    with g.graph.as_default():
        sv = tf.train.Supervisor()
        with sv.managed_session() as sess:
            # Restore parameters
            sv.saver.restore(sess, tf.train.latest_checkpoint(hp.logdir))
            print("Restored!")

            # Get model name
            mname = open(hp.logdir + '/checkpoint', 'r').read().split('"')[1]

            # Speech to Text
            if not os.path.exists('samples'):
                os.mkdir('samples')
            with codecs.open('samples/{}.txt'.format(mname), 'w', 'utf-8') as fout:
                # Greedy autoregressive decoding: re-run the model on the
                # partial output and commit one column per step.
                preds = np.zeros((hp.batch_size, hp.max_len), np.int32)
                for j in range(hp.max_len):
                    _preds = sess.run(g.preds, {g.x: x, g.y: preds})
                    preds[:, j] = _preds[:, j]

                # Write to file
                for i, (expected, got) in enumerate(zip(y, preds)):  # ground truth vs. prediction
                    fout.write("Expected: {}\n".format(expected.split("S")[0]))
                    # `got` is already an int32 array, so index idx2char directly
                    # (np.fromstring is deprecated and expects bytes, not an array)
                    fout.write("Got : {}\n\n".format(
                        "".join(idx2char[idx] for idx in got).split("S")[0]))
                    fout.flush()
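# The decoding loop above is plain greedy autoregressive inference: the
# model sees everything predicted so far, and only column j is trusted at
# step j. A self-contained illustration of the same pattern with a dummy
# predict function (the names here are illustrative, not from the project):
import numpy as np

def greedy_decode(predict_fn, batch_size, max_len):
    # predict_fn(partial) -> (batch_size, max_len) int32 predictions
    preds = np.zeros((batch_size, max_len), np.int32)
    for j in range(max_len):
        step = predict_fn(preds)   # re-run the model on the partial output
        preds[:, j] = step[:, j]   # commit only the newly decoded column
    return preds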
def load_data(data_path):
    timer = utils.timer(name='main').tic()
    split_folder = os.path.join(data_path, 'cold')
    u_file = os.path.join(data_path, 'trained/cold/WRMF_cold_rank200_reg1_alpha10_iter10.U.txt')
    v_file = os.path.join(data_path, 'trained/cold/WRMF_cold_rank200_reg1_alpha10_iter10.V.txt')
    item_content_file = os.path.join(data_path, 'item_features_0based.txt')
    train_file = os.path.join(split_folder, 'train.csv')
    test_cold_file = os.path.join(split_folder, 'test.csv')
    test_cold_iid_file = os.path.join(split_folder, 'test_item_ids.csv')
    dat = {}

    # load preference data
    timer.tic()
    u_pref = np.loadtxt(u_file).reshape(n_users, 200)
    v_pref = np.loadtxt(v_file).reshape(n_items, 200)
    dat['u_pref'] = u_pref
    dat['v_pref'] = v_pref
    timer.toc('loaded U:%s,V:%s' % (str(u_pref.shape), str(v_pref.shape))).tic()

    # pre-process
    _, dat['u_pref_scaled'] = utils.prep_standardize(u_pref)
    _, dat['v_pref_scaled'] = utils.prep_standardize(v_pref)
    timer.toc('standardized U,V').tic()

    # load content data
    timer.tic()
    item_content, _ = datasets.load_svmlight_file(item_content_file, zero_based=True, dtype=np.float32)
    item_content = tfidf(item_content)
    from sklearn.utils.extmath import randomized_svd
    u, s, _ = randomized_svd(item_content, n_components=300, n_iter=5)
    item_content = u * s
    _, item_content = utils.prep_standardize(item_content)
    if sp.issparse(item_content):
        dat['item_content'] = item_content.tolil(copy=False)
    else:
        dat['item_content'] = item_content
    timer.toc('loaded item feature sparse matrix: %s' % (str(item_content.shape))).tic()

    # load split
    timer.tic()
    train = pd.read_csv(train_file, delimiter=",", header=None, dtype=np.int32).values.ravel().view(
        dtype=[('uid', np.int32), ('iid', np.int32), ('inter', np.int32)])
    dat['user_indices'] = np.unique(train['uid'])
    timer.toc('read train triplets %s' % train.shape).tic()

    dat['eval_cold'] = data.load_eval_data(test_cold_file,
                                           test_cold_iid_file,
                                           name='eval_cold',
                                           cold=True,
                                           train_data=train,
                                           citeu=True)
    return dat
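# The read_csv(...).values.ravel().view(dtype=[...]) chain above turns an
# (N, 3) int32 array into a structured array with named fields, so columns
# can be addressed as train['uid'] etc. without copying. A small
# self-contained demonstration of the same trick:
import numpy as np

triplets = np.array([[0, 10, 1],
                     [2, 11, 1]], dtype=np.int32)
view = triplets.ravel().view(
    dtype=[('uid', np.int32), ('iid', np.int32), ('inter', np.int32)])
assert (view['uid'] == [0, 2]).all()  # named column access over the same buffer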
def load_data(data_path):
    timer = utils.timer(name='main').tic()
    split_folder = os.path.join(data_path, 'warm')
    u_file = os.path.join(data_path, 'trained/warm/U.csv.bin')
    v_file = os.path.join(data_path, 'trained/warm/V.csv.bin')
    user_content_file = os.path.join(data_path, 'user_features_0based.txt')
    item_content_file = os.path.join(data_path, 'item_features_0based.txt')
    train_file = os.path.join(split_folder, 'train.csv')
    test_warm_file = os.path.join(split_folder, 'test_warm.csv')
    test_warm_iid_file = os.path.join(split_folder, 'test_warm_item_ids.csv')
    test_cold_user_file = os.path.join(split_folder, 'test_cold_user.csv')
    test_cold_user_iid_file = os.path.join(split_folder, 'test_cold_user_item_ids.csv')
    test_cold_item_file = os.path.join(split_folder, 'test_cold_item.csv')
    test_cold_item_iid_file = os.path.join(split_folder, 'test_cold_item_item_ids.csv')
    dat = {}

    # load preference data
    timer.tic()
    u_pref = np.fromfile(u_file, dtype=np.float32).reshape(n_users, 200)
    v_pref = np.fromfile(v_file, dtype=np.float32).reshape(n_items, 200)
    dat['u_pref'] = u_pref
    dat['v_pref'] = v_pref
    timer.toc('loaded U:%s,V:%s' % (str(u_pref.shape), str(v_pref.shape))).tic()

    # pre-process
    _, dat['u_pref_scaled'] = utils.prep_standardize(u_pref)
    _, dat['v_pref_scaled'] = utils.prep_standardize(v_pref)
    timer.toc('standardized U,V').tic()

    # load content data
    timer.tic()
    user_content, _ = datasets.load_svmlight_file(user_content_file, zero_based=True, dtype=np.float32)
    dat['user_content'] = user_content.tolil(copy=False)
    timer.toc('loaded user feature sparse matrix: %s' % (str(user_content.shape))).tic()
    item_content, _ = datasets.load_svmlight_file(item_content_file, zero_based=True, dtype=np.float32)
    dat['item_content'] = item_content.tolil(copy=False)
    timer.toc('loaded item feature sparse matrix: %s' % (str(item_content.shape))).tic()

    # load split
    timer.tic()
    # header=None (not the removed header=-1) tells pandas the csv has no header row
    train = pd.read_csv(train_file, delimiter=",", header=None, dtype=np.int32).values.ravel().view(
        dtype=[('uid', np.int32), ('iid', np.int32), ('inter', np.int32), ('date', np.int32)])
    dat['user_indices'] = np.unique(train['uid'])
    timer.toc('read train triplets %s' % train.shape).tic()

    dat['eval_warm'] = data.load_eval_data(test_warm_file,
                                           test_warm_iid_file,
                                           name='eval_warm',
                                           cold=False,
                                           train_data=train)
    dat['eval_cold_user'] = data.load_eval_data(test_cold_user_file,
                                                test_cold_user_iid_file,
                                                name='eval_cold_user',
                                                cold=True,
                                                train_data=train)
    dat['eval_cold_item'] = data.load_eval_data(test_cold_item_file,
                                                test_cold_item_iid_file,
                                                name='eval_cold_item',
                                                cold=True,
                                                train_data=train)
    return dat
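# U.csv.bin / V.csv.bin above are raw float32 dumps with no header, so
# np.fromfile needs the dtype and an explicit reshape. A round-trip sketch
# showing how such a file would be produced and read back:
import numpy as np

mat = np.random.rand(4, 200).astype(np.float32)
mat.tofile('/tmp/U.bin')                  # raw bytes, no shape metadata
back = np.fromfile('/tmp/U.bin', dtype=np.float32).reshape(4, 200)
assert np.array_equal(mat, back)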
                (index + 1), total_correct_predictions / total_predictions))
            generator_tqdm.set_description(description, refresh=False)

    average_loss = total_eval_loss / len(eval_batches)
    eval_accuracy = total_correct_predictions / total_predictions
    print('Final evaluation accuracy: %.4f loss: %.4f' % (eval_accuracy, average_loss))


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="""Script to evaluate a trained model on data.""")
    parser.add_argument('model', help='Path to trained model directory')
    parser.add_argument('--test', help='Path to evaluation data.', default=r'./data/test.csv')
    parser.add_argument('--labels', help='Path to label dictionary.', default=r'./data/answers.json')
    args = parser.parse_args()

    print('\nLoading test data...')
    data, label_to_id = load_eval_data(args.test, args.labels)
    model, model_config, vocab, reverse_vocab = load_model(args.model)
    test_X, test_Y, vocab, reverse_vocab = process_data(data, label_to_id, vocab=vocab,
                                                        vocab_size=model_config['vocab_size'])
    print('Test data loaded.')

    batch_size = 32
    batches = generate_batches(test_X, test_Y, batch_size)
    print('Batches finished generating.')
    train_result = eval(model, batches)
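# NOTE: `generate_batches` is defined elsewhere in the project. A minimal
# sketch of the interface its call site implies (a list of (inputs, labels)
# slices); the real implementation may shuffle or pad, so this is an
# assumption rather than the project's code:
def generate_batches(X, Y, batch_size):
    # Slice the parallel arrays into fixed-size chunks; the final batch
    # may be smaller than batch_size.
    return [(X[i:i + batch_size], Y[i:i + batch_size])
            for i in range(0, len(X), batch_size)]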
def load_data(data_name):
    timer = utils.timer(name='main').tic()
    data_path = args.datadir + data_name
    train_file = data_path + '/warm_emb.csv'
    warm_test_file = data_path + '/warm_test.csv'
    test_file = data_path + '/cold_user_test.csv'
    val_file = data_path + '/cold_user_val.csv'
    pref_file = data_path + f'/{args.warm_model}.npy'
    content_file = data_path + '/user_content.npz'
    warm_dict_file = data_path + '/warm_dict.pkl'
    cold_dict_file = data_path + '/cold_user_dict.pkl'
    dat = {}

    # load split
    timer.tic()
    train = pd.read_csv(train_file, dtype=np.int32)
    dat['user_list'] = train['user'].values
    dat['item_list'] = train['item'].values
    dat['warm_user'] = np.unique(train['user'].values)
    dat['warm_item'] = np.unique(train['item'].values)
    dat['test_eval'] = data.load_eval_data(test_file)
    dat['val_eval'] = data.load_eval_data(val_file)
    dat['warm_test'] = data.load_eval_data(warm_test_file)
    timer.toc('read train triplets %s' % str(train.shape)).tic()

    # load preference data and standardize
    pref = np.load(pref_file)
    n_warm_user = len(np.unique(dat['user_list']))
    max_user_id = np.max([np.max(dat['user_list']),
                          np.max(dat['test_eval'].test_user_ids),
                          np.max(dat['val_eval'].test_user_ids)])
    max_item_id = np.max(dat['item_list'])

    # the first n_warm_user rows of pref are user embeddings, the rest are item embeddings
    mapped_user = pref[:n_warm_user]
    mapped_item = pref[n_warm_user:]

    # scatter user embeddings back to their original ids
    user_map = pd.read_csv(data_path + '/warm_user_mapped.csv', dtype=np.int32)
    new2old = user_map['org_id'].values
    dat['u_pref'] = np.zeros((max_user_id + 1, pref.shape[1]))  # note the + 1
    dat['u_pref'][new2old] = mapped_user[:]

    # scatter item embeddings back to their original ids
    item_map = pd.read_csv(data_path + '/warm_item_mapped.csv', dtype=np.int32)
    new2old = item_map['org_id'].values
    dat['v_pref'] = np.zeros((max_item_id + 1, pref.shape[1]))  # note the + 1
    dat['v_pref'][new2old] = mapped_item[:]

    # standardize
    _, dat['u_pref'] = utils.standardize(dat['u_pref'])
    _, dat['v_pref'] = utils.standardize_2(dat['v_pref'])
    timer.toc('Load U:%s, V:%s and standardize.' % (str(dat['u_pref'].shape), str(dat['v_pref'].shape))).tic()

    # load user content data
    dat['user_content'] = sp.load_npz(content_file).tolil()
    timer.toc('loaded user feature sparse matrix: %s' % (str(dat['user_content'].shape))).tic()

    # load metric
    cold_dict = pickle.load(open(cold_dict_file, 'rb'))
    warm_dict = pickle.load(open(warm_dict_file, 'rb'))
    metric = {
        'val': cold_dict['val@100'],
        'warm_test': warm_dict['test@100'],
        'cold_test': cold_dict['test@100'],
    }
    dat['metric'] = metric
    return dat
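# The u_pref / v_pref construction above scatters compactly re-indexed
# embedding rows back to their original ids; rows for ids that never appear
# in training stay all-zero. A small self-contained demonstration of the
# same indexing pattern:
import numpy as np

mapped = np.array([[1., 1.], [2., 2.]])   # embeddings in compact order
new2old = np.array([3, 0])                # compact row k belongs to old id new2old[k]
full = np.zeros((5, 2))                   # max original id is 4, hence 4 + 1 rows
full[new2old] = mapped                    # rows 3 and 0 are filled, the rest stay zero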