Example #1
def load_data(data_name):
    timer = utils.timer(name='main')
    data_path = './data/' + data_name
    user_pref_file = data_path + '/U_BPR.npy'
    item_pref_file = data_path + '/V_BPR.npy'
    item_content_file = data_path + '/item_features.txt'
    train_file = data_path + '/train.csv'
    test_file = data_path + '/test.csv'
    vali_file = data_path + '/vali.csv'
    dat = {}

    # load preference data
    timer.tic()
    dat['u_pref'] = np.load(user_pref_file)
    dat['v_pref'] = np.load(item_pref_file)
    timer.toc('loaded U:%s,V:%s' %
              (str(dat['u_pref'].shape), str(dat['v_pref'].shape))).tic()

    # pre-process preference data
    _, dat['u_pref'] = utils.standardize(dat['u_pref'])
    _, dat['v_pref'] = utils.standardize_2(dat['v_pref'])
    timer.toc('standardized U,V').tic()

    # load item(article) content data
    # load_svmlight_file(file): reads a data file in svmlight format, stored as
    # <label> <feature-id>:<feature-value> <feature-id>:<feature-value> ...
    # If the zero_based option is False, all indices are decremented by 1.
    # Returns (X, y), where X is a scipy.sparse matrix and y is a numpy.ndarray.
    item_content, _ = datasets.load_svmlight_file(item_content_file,
                                                  zero_based=True,
                                                  dtype=np.float32)
    # TF-IDF text feature weighting
    item_content = tfidf(item_content)
    # dimensionality reduction via randomized SVD
    u, s, _ = randomized_svd(item_content, n_components=300, n_iter=5)
    item_content = u * s
    # feature standardization
    _, item_content = utils.standardize(item_content)
    dat['item_content'] = item_content
    timer.toc('loaded item feature sparse matrix: %s' %
              (str(item_content.shape))).tic()

    # load split
    train = pd.read_csv(train_file, dtype=np.int32)
    dat['user_list'] = train['uid'].values
    dat['item_list'] = train['iid'].values
    timer.toc('read train triplets %s' % str(train.shape))

    dat['test_eval'] = data.load_eval_data(test_file)
    dat['vali_eval'] = data.load_eval_data(vali_file)
    return dat
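
For reference, the content pipeline used above (svmlight load, TF-IDF weighting, randomized SVD, standardization) can be reproduced with plain scikit-learn. The sketch below is a minimal stand-in and assumes the project's tfidf and utils.standardize helpers behave roughly like TfidfTransformer and column-wise scale:

import numpy as np
from sklearn.datasets import load_svmlight_file
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import scale
from sklearn.utils.extmath import randomized_svd


def build_item_content(path, n_components=300):
    # svmlight format: <label> <feature-id>:<feature-value> ...
    X, _ = load_svmlight_file(path, zero_based=True, dtype=np.float32)
    X = TfidfTransformer().fit_transform(X)  # TF-IDF weighting
    # project onto the top singular directions -> dense (n_items, n_components)
    u, s, _ = randomized_svd(X, n_components=n_components, n_iter=5)
    dense = u * s
    return scale(dense)  # zero mean, unit variance per column
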
Example #2
def load_data(data_name):
    timer = utils.timer(name='main').tic()
    data_path = './data/' + data_name
    u_file = data_path + '/U_BPR.npy'
    v_file = data_path + '/V_BPR.npy'
    user_content_file = data_path + '/user_content.npz'
    train_file = data_path + '/train.csv'
    test_file = data_path + '/test.csv'
    vali_file = data_path + '/vali.csv'

    dat = {}
    # load preference data
    timer.tic()
    u_pref = np.load(u_file)
    v_pref = np.load(v_file)
    dat['u_pref'] = u_pref
    dat['v_pref'] = v_pref
    timer.toc('loaded U:%s,V:%s' %
              (str(u_pref.shape), str(v_pref.shape))).tic()

    # pre-process
    _, dat['u_pref'] = utils.standardize_2(u_pref)
    _, dat['v_pref'] = utils.standardize(v_pref)
    timer.toc('standardized U,V').tic()

    # load content data
    timer.tic()
    user_content = scipy.sparse.load_npz(user_content_file)
    dat['user_content'] = user_content.tolil(copy=False)
    timer.toc('loaded user feature sparse matrix: %s' %
              (str(user_content.shape))).tic()

    # load split
    timer.tic()
    train = pd.read_csv(train_file, dtype=np.int32)
    dat['user_list'] = train['uid'].values
    dat['item_list'] = train['iid'].values
    dat['warm_item'] = np.unique(train['iid'].values)
    timer.toc('read train triplets %s' % str(train.shape)).tic()

    dat['vali_eval'] = data.load_eval_data(vali_file,
                                           cold_user=True,
                                           test_item_ids=dat['warm_item'])
    dat['test_eval'] = data.load_eval_data(test_file,
                                           cold_user=True,
                                           test_item_ids=dat['warm_item'])
    return dat
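
The loader above expects user_content.npz to be a SciPy sparse matrix saved with scipy.sparse.save_npz. A minimal sketch of producing such a file, using a hypothetical toy feature matrix:

import numpy as np
import scipy.sparse

# hypothetical toy feature matrix: 4 users x 6 binary features
rows = [0, 1, 1, 2, 3]
cols = [0, 2, 5, 1, 4]
vals = np.ones(len(rows), dtype=np.float32)
user_content = scipy.sparse.csr_matrix((vals, (rows, cols)), shape=(4, 6))
scipy.sparse.save_npz('user_content.npz', user_content)

# round trip: load_npz restores the CSR matrix, as consumed by load_data above
loaded = scipy.sparse.load_npz('user_content.npz')
assert (loaded != user_content).nnz == 0
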
Example #3
def eval():
    # Load graph
    g = Graph(is_training=False)
    print("Graph loaded")

    # Load data
    x, y = load_eval_data()
    char2idx, idx2char = load_vocab()

    with g.graph.as_default():
        sv = tf.train.Supervisor()
        with sv.managed_session() as sess:
            # Restore parameters
            sv.saver.restore(sess, tf.train.latest_checkpoint(hp.logdir))
            print("Restored!")
            # Get model name
            mname = open(hp.logdir + '/checkpoint', 'r').read().split('"')[1]

            # Speech to Text
            if not os.path.exists('samples'): os.mkdir('samples')
            with codecs.open('samples/{}.txt'.format(mname), 'w',
                             'utf-8') as fout:
                preds = np.zeros((hp.batch_size, hp.max_len), np.int32)
                for j in range(hp.max_len):
                    _preds = sess.run(g.preds, {g.x: x, g.y: preds})
                    preds[:, j] = _preds[:, j]

                # Write to file
                # ground truth vs. prediction
                for i, (expected, got) in enumerate(zip(y, preds)):
                    fout.write("Expected: {}\n".format(expected.split("S")[0]))
                    # `got` is already an int32 array; map each id back to a character
                    got_text = "".join(idx2char[idx] for idx in got)
                    fout.write("Got     : {}\n\n".format(got_text.split("S")[0]))
                    fout.flush()
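
The inner loop above is plain greedy, step-by-step decoding: the model is re-run on the partial output and only the prediction at position j is kept each time. A minimal framework-free sketch of the same pattern, with a stand-in predict function instead of the TensorFlow graph:

import numpy as np


def greedy_decode(predict, x, batch_size, max_len):
    # predict(x, partial_preds) -> (batch_size, max_len) int32 array
    preds = np.zeros((batch_size, max_len), np.int32)
    for j in range(max_len):
        step = predict(x, preds)   # re-run the model on the partial output
        preds[:, j] = step[:, j]   # keep only column j; later columns come in later steps
    return preds
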
Example #4
def load_data(data_path):
    timer = utils.timer(name='main').tic()
    split_folder = os.path.join(data_path, 'cold')

    u_file = os.path.join(
        data_path, 'trained/cold/WRMF_cold_rank200_reg1_alpha10_iter10.U.txt')
    v_file = os.path.join(
        data_path, 'trained/cold/WRMF_cold_rank200_reg1_alpha10_iter10.V.txt')
    item_content_file = os.path.join(data_path, 'item_features_0based.txt')
    train_file = os.path.join(split_folder, 'train.csv')
    test_cold_file = os.path.join(split_folder, 'test.csv')
    test_cold_iid_file = os.path.join(split_folder, 'test_item_ids.csv')

    dat = {}
    # load preference data
    timer.tic()

    u_pref = np.loadtxt(u_file).reshape(n_users, 200)
    v_pref = np.loadtxt(v_file).reshape(n_items, 200)

    dat['u_pref'] = u_pref
    dat['v_pref'] = v_pref

    timer.toc('loaded U:%s,V:%s' %
              (str(u_pref.shape), str(v_pref.shape))).tic()

    # pre-process
    _, dat['u_pref_scaled'] = utils.prep_standardize(u_pref)
    _, dat['v_pref_scaled'] = utils.prep_standardize(v_pref)

    timer.toc('standardized U,V').tic()

    # load content data
    timer.tic()
    item_content, _ = datasets.load_svmlight_file(item_content_file,
                                                  zero_based=True,
                                                  dtype=np.float32)

    item_content = tfidf(item_content)

    from sklearn.utils.extmath import randomized_svd
    u, s, _ = randomized_svd(item_content, n_components=300, n_iter=5)
    item_content = u * s
    _, item_content = utils.prep_standardize(item_content)

    if sp.issparse(item_content):
        dat['item_content'] = item_content.tolil(copy=False)
    else:
        dat['item_content'] = item_content
    timer.toc('loaded item feature sparse matrix: %s' %
              (str(item_content.shape))).tic()

    # load split
    timer.tic()
    train = pd.read_csv(
        train_file, delimiter=",", header=None,
        dtype=np.int32).values.ravel().view(
            dtype=[('uid', np.int32), ('iid', np.int32), ('inter', np.int32)])
    dat['user_indices'] = np.unique(train['uid'])
    timer.toc('read train triplets %s' % train.shape).tic()

    dat['eval_cold'] = data.load_eval_data(test_cold_file,
                                           test_cold_iid_file,
                                           name='eval_cold',
                                           cold=True,
                                           train_data=train,
                                           citeu=True)
    return dat
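
The ravel().view(...) call above reinterprets the (N, 3) int32 array returned by pandas as N records with named fields (uid, iid, inter), without copying. A small self-contained illustration of the same trick on toy data:

import numpy as np
import pandas as pd

df = pd.DataFrame([[0, 10, 1], [0, 11, 1], [2, 10, 1]],
                  columns=['uid', 'iid', 'inter'], dtype=np.int32)
train = df.values.ravel().view(
    dtype=[('uid', np.int32), ('iid', np.int32), ('inter', np.int32)])
print(train['uid'])             # -> [0 0 2]
print(np.unique(train['uid']))  # -> [0 2]
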
Example #5
def load_data(data_path):
    timer = utils.timer(name='main').tic()
    split_folder = os.path.join(data_path, 'warm')

    u_file = os.path.join(data_path, 'trained/warm/U.csv.bin')
    v_file = os.path.join(data_path, 'trained/warm/V.csv.bin')
    user_content_file = os.path.join(data_path, 'user_features_0based.txt')
    item_content_file = os.path.join(data_path, 'item_features_0based.txt')
    train_file = os.path.join(split_folder, 'train.csv')
    test_warm_file = os.path.join(split_folder, 'test_warm.csv')
    test_warm_iid_file = os.path.join(split_folder, 'test_warm_item_ids.csv')
    test_cold_user_file = os.path.join(split_folder, 'test_cold_user.csv')
    test_cold_user_iid_file = os.path.join(split_folder,
                                           'test_cold_user_item_ids.csv')
    test_cold_item_file = os.path.join(split_folder, 'test_cold_item.csv')
    test_cold_item_iid_file = os.path.join(split_folder,
                                           'test_cold_item_item_ids.csv')

    dat = {}
    # load preference data
    timer.tic()
    u_pref = np.fromfile(u_file, dtype=np.float32).reshape(n_users, 200)
    v_pref = np.fromfile(v_file, dtype=np.float32).reshape(n_items, 200)
    dat['u_pref'] = u_pref
    dat['v_pref'] = v_pref

    timer.toc('loaded U:%s,V:%s' %
              (str(u_pref.shape), str(v_pref.shape))).tic()

    # pre-process
    _, dat['u_pref_scaled'] = utils.prep_standardize(u_pref)
    _, dat['v_pref_scaled'] = utils.prep_standardize(v_pref)
    timer.toc('standardized U,V').tic()

    # load content data
    timer.tic()
    user_content, _ = datasets.load_svmlight_file(user_content_file,
                                                  zero_based=True,
                                                  dtype=np.float32)
    dat['user_content'] = user_content.tolil(copy=False)
    timer.toc('loaded user feature sparse matrix: %s' %
              (str(user_content.shape))).tic()
    item_content, _ = datasets.load_svmlight_file(item_content_file,
                                                  zero_based=True,
                                                  dtype=np.float32)
    dat['item_content'] = item_content.tolil(copy=False)
    timer.toc('loaded item feature sparse matrix: %s' %
              (str(item_content.shape))).tic()

    # load split
    timer.tic()
    # header=None: the split files have no header row
    # (header=-1 is rejected by recent pandas versions)
    train = pd.read_csv(
        train_file, delimiter=",", header=None,
        dtype=np.int32).values.ravel().view(
            dtype=[('uid', np.int32), ('iid', np.int32),
                   ('inter', np.int32), ('date', np.int32)])
    dat['user_indices'] = np.unique(train['uid'])
    timer.toc('read train triplets %s' % train.shape).tic()

    dat['eval_warm'] = data.load_eval_data(test_warm_file,
                                           test_warm_iid_file,
                                           name='eval_warm',
                                           cold=False,
                                           train_data=train)
    dat['eval_cold_user'] = data.load_eval_data(test_cold_user_file,
                                                test_cold_user_iid_file,
                                                name='eval_cold_user',
                                                cold=True,
                                                train_data=train)
    dat['eval_cold_item'] = data.load_eval_data(test_cold_item_file,
                                                test_cold_item_iid_file,
                                                name='eval_cold_item',
                                                cold=True,
                                                train_data=train)
    return dat
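
U.csv.bin and V.csv.bin above are raw float32 dumps, so np.fromfile needs the row count and rank supplied externally (n_users and n_items are module-level globals in the original code). A minimal sketch of writing and reading such a file with toy sizes:

import numpy as np

n_users, rank = 5, 200                  # toy values; the real ones come from the dataset
U = np.random.rand(n_users, rank).astype(np.float32)
U.tofile('U.csv.bin')                   # flat float32 buffer, no header or shape info

u_pref = np.fromfile('U.csv.bin', dtype=np.float32).reshape(n_users, rank)
assert np.array_equal(U, u_pref)
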
Example #6
             (index + 1), total_correct_predictions / total_predictions))
        generator_tqdm.set_description(description, refresh=False)
    average_loss = total_eval_loss / len(eval_batches)
    eval_accuracy = total_correct_predictions / total_predictions
    print('Final evaluation accuracy: %.4f loss: %.4f' %
          (eval_accuracy, average_loss))


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="""Script to evaluate a trained model on data.""")
    parser.add_argument('model', help='Path to trained model directory')
    parser.add_argument('--test',
                        help='Path to evaluation data.',
                        default=r'./data/test.csv')
    parser.add_argument('--labels',
                        help='Path to label dictionary.',
                        default=r'./data/answers.json')

    args = parser.parse_args()
    data, label_to_id = load_eval_data(args.test, args.labels)
    print('\nLoading test data...')
    model, model_config, vocab, reverse_vocab = load_model(args.model)
    test_X, test_Y, vocab, reverse_vocab = process_data(
        data, label_to_id, vocab=vocab, vocab_size=model_config['vocab_size'])
    print('Test data loaded.')
    batch_size = 32
    batches = generate_batches(test_X, test_Y, batch_size)
    print('Batches finished generating.')
    train_result = eval(model, batches)
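
load_eval_data, load_model, process_data and generate_batches here are project-specific helpers. As an illustration only, a generate_batches of this kind is often just fixed-size slicing over the processed arrays; a hypothetical minimal version:

import numpy as np


def generate_batches(test_X, test_Y, batch_size):
    # hypothetical stand-in: yield (X, Y) chunks of at most batch_size rows
    for start in range(0, len(test_X), batch_size):
        yield test_X[start:start + batch_size], test_Y[start:start + batch_size]


# usage: 50 examples with batch_size=32 -> batches of 32 and 18 rows
X, Y = np.arange(100).reshape(50, 2), np.arange(50)
batches = list(generate_batches(X, Y, batch_size=32))
assert [len(b[0]) for b in batches] == [32, 18]
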
Example #7
def load_data(data_name):
    timer = utils.timer(name='main').tic()
    data_path = args.datadir + data_name
    train_file = data_path + '/warm_emb.csv'
    warm_test_file = data_path + '/warm_test.csv'
    test_file = data_path + '/cold_user_test.csv'
    val_file = data_path + '/cold_user_val.csv'
    pref_file = data_path + f'/{args.warm_model}.npy'
    content_file = data_path + '/user_content.npz'
    warm_dict_file = data_path + '/warm_dict.pkl'
    cold_dict_file = data_path + '/cold_user_dict.pkl'
    dat = {}

    # load split
    timer.tic()
    train = pd.read_csv(train_file, dtype=np.int32)
    dat['user_list'] = train['user'].values
    dat['item_list'] = train['item'].values
    dat['warm_user'] = np.unique(train['user'].values)
    dat['warm_item'] = np.unique(train['item'].values)
    dat['test_eval'] = data.load_eval_data(test_file)
    dat['val_eval'] = data.load_eval_data(val_file)
    dat['warm_test'] = data.load_eval_data(warm_test_file)
    timer.toc('read train triplets %s' % str(train.shape)).tic()

    # load preference data and standardize
    pref = np.load(pref_file)
    n_warm_user = len(np.unique(dat['user_list']))
    max_user_id = np.max([np.max(dat['user_list']),
                          np.max(dat['test_eval'].test_user_ids),
                          np.max(dat['val_eval'].test_user_ids)])
    max_item_id = np.max(dat['item_list'])

    # mapped object
    mapped_user = pref[:n_warm_user]
    mapped_item = pref[n_warm_user:]
    # reversely mapped user
    user_map = pd.read_csv(data_path + '/warm_user_mapped.csv', dtype=int)
    new2old = user_map['org_id'].values
    dat['u_pref'] = np.zeros((max_user_id + 1, pref.shape[1]))  # note the + 1
    dat['u_pref'][new2old] = mapped_user[:]
    # reversely mapped item
    item_map = pd.read_csv(data_path + '/warm_item_mapped.csv', dtype=int)
    new2old = item_map['org_id'].values
    dat['v_pref'] = np.zeros((max_item_id + 1, pref.shape[1]))  # note the + 1
    dat['v_pref'][new2old] = mapped_item[:]
    # standardize
    _, dat['u_pref'] = utils.standardize(dat['u_pref'])
    _, dat['v_pref'] = utils.standardize_2(dat['v_pref'])
    timer.toc('Load U:%s, V:%s and standardize.' % (str(dat['u_pref'].shape), str(dat['v_pref'].shape))).tic()

    # load user_content data
    dat['user_content'] = sp.load_npz(content_file).tolil()
    timer.toc('loaded user feature sparse matrix: %s' % (str(dat['user_content'].shape))).tic()

    # load metric
    cold_dict = pickle.load(open(cold_dict_file, 'rb'))
    warm_dict = pickle.load(open(warm_dict_file, 'rb'))
    metric = {
        'val': cold_dict['val@100'],
        'warm_test': warm_dict['test@100'],
        'cold_test': cold_dict['test@100'],
    }
    dat['metric'] = metric

    return dat
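
The reverse-mapping step above scatters embedding rows, stored in remapped (contiguous) order, back into arrays indexed by the original user/item ids via the org_id column. A toy illustration of that scatter (numbers are made up; the real mapping comes from warm_user_mapped.csv):

import numpy as np

mapped_user = np.array([[0.1, 0.2],    # row 0 of the warm model
                        [0.3, 0.4],    # row 1
                        [0.5, 0.6]])   # row 2
new2old = np.array([7, 2, 5])          # org_id: row i belongs to original user new2old[i]
max_user_id = 7

u_pref = np.zeros((max_user_id + 1, mapped_user.shape[1]))
u_pref[new2old] = mapped_user          # scatter rows to their original positions
print(u_pref[7])                       # -> [0.1 0.2]
print(u_pref[0])                       # -> [0. 0.] (id 0 never appears in the warm set)
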