Пример #1
0
def load_data(data_path):
    """Assemble the cold-start experiment inputs found under ``data_path``.

    Reads the pre-trained WRMF user/item factor files, the item content
    features (passed through ``tfidf``, reduced to 300 components with a
    randomized SVD and then standardized), the training triplets and the
    cold-start evaluation data.

    ``n_users`` and ``n_items`` are not defined inside this function; they
    must be resolvable from an enclosing scope.

    Args:
        data_path: Root folder holding ``cold/``, ``trained/cold/`` and
            ``item_features_0based.txt``.

    Returns:
        dict with the keys ``u_pref``, ``v_pref``, ``u_pref_scaled``,
        ``v_pref_scaled``, ``item_content``, ``user_indices`` and
        ``eval_cold``.
    """
    timer = utils.timer(name='main').tic()

    # Resolve every input path up front.
    cold_dir = os.path.join(data_path, 'cold')
    train_file = os.path.join(cold_dir, 'train.csv')
    test_cold_file = os.path.join(cold_dir, 'test.csv')
    test_cold_iid_file = os.path.join(cold_dir, 'test_item_ids.csv')
    item_content_file = os.path.join(data_path, 'item_features_0based.txt')
    u_file = os.path.join(
        data_path, 'trained/cold/WRMF_cold_rank200_reg1_alpha10_iter10.U.txt')
    v_file = os.path.join(
        data_path, 'trained/cold/WRMF_cold_rank200_reg1_alpha10_iter10.V.txt')

    dat = {}

    # --- preference (latent factor) matrices, 200 columns each ---
    timer.tic()
    user_factors = np.loadtxt(u_file).reshape(n_users, 200)
    item_factors = np.loadtxt(v_file).reshape(n_items, 200)
    dat['u_pref'] = user_factors
    dat['v_pref'] = item_factors
    factor_shapes = (str(user_factors.shape), str(item_factors.shape))
    timer.toc('loaded U:%s,V:%s' % factor_shapes).tic()

    # --- standardized copies of the factors ---
    _, scaled_users = utils.prep_standardize(user_factors)
    dat['u_pref_scaled'] = scaled_users
    _, scaled_items = utils.prep_standardize(item_factors)
    dat['v_pref_scaled'] = scaled_items
    timer.toc('standardized U,V').tic()

    # --- item content features ---
    timer.tic()
    item_content, _ = datasets.load_svmlight_file(
        item_content_file, zero_based=True, dtype=np.float32)
    item_content = tfidf(item_content)

    from sklearn.utils.extmath import randomized_svd
    left_vecs, singular_vals, _ = randomized_svd(
        item_content, n_components=300, n_iter=5)
    item_content = left_vecs * singular_vals
    _, item_content = utils.prep_standardize(item_content)

    # Sparse results are stored as LIL; anything else is stored unchanged.
    dat['item_content'] = (item_content.tolil(copy=False)
                           if sp.issparse(item_content) else item_content)
    timer.toc('loaded item feature sparse matrix: %s' %
              (str(item_content.shape))).tic()

    # --- training triplets, viewed as (uid, iid, inter) int32 records ---
    timer.tic()
    triplet_dtype = [('uid', np.int32), ('iid', np.int32),
                     ('inter', np.int32)]
    train_frame = pd.read_csv(
        train_file, delimiter=",", header=None, dtype=np.int32)
    train = train_frame.values.ravel().view(dtype=triplet_dtype)
    dat['user_indices'] = np.unique(train['uid'])
    timer.toc('read train triplets %s' % train.shape).tic()

    # --- cold-start evaluation set ---
    dat['eval_cold'] = data.load_eval_data(test_cold_file,
                                           test_cold_iid_file,
                                           name='eval_cold',
                                           cold=True,
                                           train_data=train,
                                           citeu=True)
    return dat
Пример #2
0
    def __init__(self, path, batch_size, split, max_L):
        """Load the cold-start split under ``path`` and build dense matrices.

        Args:
            path: Dataset root. Must contain ``item_features_0based.txt`` and
                a ``cold`` folder with ``train.csv``, ``test.csv`` and
                ``test_item_ids.csv``.
            batch_size: Stored on the instance as ``self.batch_size``.
            split: When truthy, up to ``split`` randomly chosen test rows per
                item are appended to the training triplets.
            max_L: Maximum number of user indices kept per training item in
                ``self.R_id`` / ``self.R_mask``.

        Raises:
            KeyError: If a retained test row refers to a user id that does
                not occur in the (possibly augmented) training triplets.
        """
        self.batch_size = batch_size
        self.max_L = max_L
        self.split = split

        split_folder = os.path.join(path, 'cold')

        item_content_file = os.path.join(path, 'item_features_0based.txt')
        train_file = os.path.join(split_folder, 'train.csv')
        test_cold_file = os.path.join(split_folder, 'test.csv')
        test_cold_iid_file = os.path.join(split_folder, 'test_item_ids.csv')

        triplet_dtype = [('uid', np.int32), ('iid', np.int32),
                         ('inter', np.int32)]

        # load split
        train = pd.read_csv(train_file,
                            delimiter=",",
                            header=None,
                            dtype=np.int32).values

        # Move part of the test set into the training set: for every run of
        # consecutive test rows sharing an item id, sample at most `split`.
        if self.split:
            test_rows = pd.read_csv(test_cold_file,
                                    sep=',',
                                    header=None,
                                    dtype=np.int32).values
            sampled_rows = []
            current_run = []
            for row in test_rows:
                if current_run and row[1] != current_run[-1][1]:
                    sampled_rows.extend(
                        rd.sample(current_run, min(len(current_run), split)))
                    current_run = []
                current_run.append(row)
            # Flush the final run; no following row exists to trigger it.
            if current_run:
                sampled_rows.extend(
                    rd.sample(current_run, min(len(current_run), split)))
            if sampled_rows:
                # Stack once instead of re-allocating `train` per item.
                train = np.vstack(
                    (train, np.asarray(sampled_rows, dtype=np.int32)))

        train = train.ravel().view(dtype=triplet_dtype)

        train_user_ids = np.unique(train['uid'])
        train_item_ids = np.unique(train['iid'])
        self.train_item_ids = train_item_ids

        # Raw id -> position in the sorted unique id arrays.
        train_user_ids_map = {
            user_id: i
            for i, user_id in enumerate(train_user_ids)
        }
        train_item_ids_map = {
            item_id: i
            for i, item_id in enumerate(train_item_ids)
        }

        _train_i_for_inf = [train_user_ids_map[uid] for uid in train['uid']]
        _train_j_for_inf = [train_item_ids_map[iid] for iid in train['iid']]
        self.R_train = sp.coo_matrix(
            (np.ones(len(_train_i_for_inf)),
             (_train_i_for_inf, _train_j_for_inf)),
            shape=[len(train_user_ids),
                   len(train_item_ids)]).tolil(copy=False)

        # For the transformer: per training item, a row of up to `max_L`
        # sampled user indices (R_id) and a 0/1 mask marking the filled
        # slots (R_mask). The mask is needed because 0 is also a valid index.
        R_id = np.zeros((len(train_item_ids), self.max_L), dtype="int32")
        R_mask = np.zeros((len(train_item_ids), self.max_L), dtype="int32")
        users_by_item = {}
        for user_idx, item_idx in zip(_train_i_for_inf, _train_j_for_inf):
            users_by_item.setdefault(item_idx, []).append(user_idx)
        for item_idx, item_users in users_by_item.items():
            chosen = rd.sample(item_users, min(len(item_users), self.max_L))
            R_id[item_idx, :len(chosen)] = chosen
            R_mask[item_idx, :len(chosen)] = 1

        self.train_items = self.record_list(_train_j_for_inf, _train_i_for_inf)
        self.train_u = _train_i_for_inf
        self.train_i = _train_j_for_inf

        # Only the id list needs the open file handle.
        with open(test_cold_iid_file) as f:
            test_item_ids = [int(line) for line in f]
        self.test_item_ids = test_item_ids

        test_data = pd.read_csv(test_cold_file,
                                delimiter=",",
                                header=None,
                                dtype=np.int32).values.ravel()
        test_data = test_data.view(dtype=triplet_dtype)

        test_item_ids_map = {iid: i for i, iid in enumerate(test_item_ids)}

        # Keep only test rows whose item id is listed in the id file.
        _test_ij_for_inf = [(t[0], t[1]) for t in test_data
                            if t[1] in test_item_ids_map]

        # Test users are indexed with the training user map so that rows of
        # R_test line up with rows of R_train.
        test_user_ids_map = train_user_ids_map

        _test_i_for_inf = [
            test_user_ids_map[_t[0]] for _t in _test_ij_for_inf
        ]
        _test_j_for_inf = [
            test_item_ids_map[_t[1]] for _t in _test_ij_for_inf
        ]
        self.R_test = sp.coo_matrix(
            (np.ones(len(_test_i_for_inf)),
             (_test_i_for_inf, _test_j_for_inf)),
            shape=[len(train_user_ids),
                   len(test_item_ids)]).tolil(copy=False)

        self.test_items = self.record_list(_test_j_for_inf,
                                           _test_i_for_inf)
        self.test_u = _test_i_for_inf
        self.test_users = self.record_list(_test_i_for_inf,
                                           _test_j_for_inf)

        # Item content: tfidf -> 300-component randomized SVD -> standardize.
        item_content, _ = datasets.load_svmlight_file(item_content_file,
                                                      zero_based=True,
                                                      dtype=np.float32)

        item_content = tfidf(item_content)

        from sklearn.utils.extmath import randomized_svd
        u, s, _ = randomized_svd(item_content, n_components=300, n_iter=5)
        item_content = u * s
        _, item_content = utils.prep_standardize(item_content)

        if sp.issparse(item_content):
            item_feature = item_content.tolil(copy=False)
        else:
            item_feature = item_content
        self.item_content = item_feature

        # Item side information is taken from the content features, indexed
        # by raw item id.
        self.S_tr = item_feature[train_item_ids, :]
        self.S_te = item_feature[test_item_ids, :]
        if sp.issparse(self.S_tr):
            self.S_tr = self.S_tr.todense()
            self.S_te = self.S_te.todense()

        # X_* are item-by-user, U_* are user-by-item.
        self.X_tr = self.R_train.todense().T
        self.R_id = R_id
        self.R_mask = R_mask
        self.U_tr = self.R_train.todense()

        self.X_te = self.R_test.todense().T
        self.U_te = self.R_test.todense()

        # NOTE(review): given the shapes above, `n_users` is the number of
        # train items plus test items and `n_items` is the number of train
        # users. Kept as-is because callers may rely on it -- confirm intent.
        self.n_users = self.X_tr.shape[0] + self.X_te.shape[0]
        self.n_items = self.U_tr.shape[0]
Пример #3
0
def load_data(data_path):
    """Load the warm-split experiment inputs found under ``data_path``.

    Reads the binary float32 user/item factor files, the user and item
    content features, the training records and three evaluation sets
    (warm, cold-user, cold-item).

    ``n_users`` and ``n_items`` are not defined inside this function; they
    must be resolvable from an enclosing scope.

    Args:
        data_path: Root folder holding ``warm/``, ``trained/warm/``,
            ``user_features_0based.txt`` and ``item_features_0based.txt``.

    Returns:
        dict with the keys ``u_pref``, ``v_pref``, ``u_pref_scaled``,
        ``v_pref_scaled``, ``user_content``, ``item_content``,
        ``user_indices``, ``eval_warm``, ``eval_cold_user`` and
        ``eval_cold_item``.
    """
    timer = utils.timer(name='main').tic()
    split_folder = os.path.join(data_path, 'warm')

    u_file = os.path.join(data_path, 'trained/warm/U.csv.bin')
    v_file = os.path.join(data_path, 'trained/warm/V.csv.bin')
    user_content_file = os.path.join(data_path, 'user_features_0based.txt')
    item_content_file = os.path.join(data_path, 'item_features_0based.txt')
    train_file = os.path.join(split_folder, 'train.csv')
    test_warm_file = os.path.join(split_folder, 'test_warm.csv')
    test_warm_iid_file = os.path.join(split_folder, 'test_warm_item_ids.csv')
    test_cold_user_file = os.path.join(split_folder, 'test_cold_user.csv')
    test_cold_user_iid_file = os.path.join(split_folder,
                                           'test_cold_user_item_ids.csv')
    test_cold_item_file = os.path.join(split_folder, 'test_cold_item.csv')
    test_cold_item_iid_file = os.path.join(split_folder,
                                           'test_cold_item_item_ids.csv')

    dat = {}
    # load preference data: raw float32 buffers reshaped to 200 columns
    timer.tic()
    u_pref = np.fromfile(u_file, dtype=np.float32).reshape(n_users, 200)
    v_pref = np.fromfile(v_file, dtype=np.float32).reshape(n_items, 200)
    dat['u_pref'] = u_pref
    dat['v_pref'] = v_pref

    timer.toc('loaded U:%s,V:%s' %
              (str(u_pref.shape), str(v_pref.shape))).tic()

    # pre-process: keep only the second value returned by prep_standardize
    _, dat['u_pref_scaled'] = utils.prep_standardize(u_pref)
    _, dat['v_pref_scaled'] = utils.prep_standardize(v_pref)
    timer.toc('standardized U,V').tic()

    # load content data (svmlight files with zero-based feature indices)
    timer.tic()
    user_content, _ = datasets.load_svmlight_file(user_content_file,
                                                  zero_based=True,
                                                  dtype=np.float32)
    dat['user_content'] = user_content.tolil(copy=False)
    timer.toc('loaded user feature sparse matrix: %s' %
              (str(user_content.shape))).tic()
    item_content, _ = datasets.load_svmlight_file(item_content_file,
                                                  zero_based=True,
                                                  dtype=np.float32)
    dat['item_content'] = item_content.tolil(copy=False)
    timer.toc('loaded item feature sparse matrix: %s' %
              (str(item_content.shape))).tic()

    # load split: each CSV row is viewed as one (uid, iid, inter, date)
    # int32 record.
    timer.tic()
    record_dtype = [('uid', np.int32), ('iid', np.int32),
                    ('inter', np.int32), ('date', np.int32)]
    # header=None marks the file as header-less; a negative header value is
    # rejected by current pandas.
    train = pd.read_csv(
        train_file, delimiter=",", header=None,
        dtype=np.int32).values.ravel().view(dtype=record_dtype)
    dat['user_indices'] = np.unique(train['uid'])
    timer.toc('read train triplets %s' % train.shape).tic()

    dat['eval_warm'] = data.load_eval_data(test_warm_file,
                                           test_warm_iid_file,
                                           name='eval_warm',
                                           cold=False,
                                           train_data=train)
    dat['eval_cold_user'] = data.load_eval_data(test_cold_user_file,
                                                test_cold_user_iid_file,
                                                name='eval_cold_user',
                                                cold=True,
                                                train_data=train)
    dat['eval_cold_item'] = data.load_eval_data(test_cold_item_file,
                                                test_cold_item_iid_file,
                                                name='eval_cold_item',
                                                cold=True,
                                                train_data=train)
    return dat
Пример #4
0
	def __init__(self, path, batch_size):
		"""Load the cold-start split under ``path`` and build dense matrices.

		Args:
			path: Dataset root. Must contain ``item_features_0based.txt`` and
				a ``cold`` folder with ``train.csv``, ``test.csv`` and
				``test_item_ids.csv``.
			batch_size: Stored on the instance as ``self.batch_size``.
		"""
		self.batch_size = batch_size

		cold_dir = os.path.join(path, 'cold')
		item_content_file = os.path.join(path, 'item_features_0based.txt')
		train_file = os.path.join(cold_dir, 'train.csv')
		test_cold_file = os.path.join(cold_dir, 'test.csv')
		test_cold_iid_file = os.path.join(cold_dir, 'test_item_ids.csv')

		# Each CSV row is reinterpreted as one (uid, iid, inter) int32 record.
		record_dtype = [('uid', np.int32), ('iid', np.int32), ('inter', np.int32)]

		# ---- training interactions ----
		train_frame = pd.read_csv(train_file, delimiter=",", header=None, dtype=np.int32)
		train = train_frame.values.ravel().view(dtype=record_dtype)

		train_user_ids = np.unique(train['uid'])
		train_item_ids = np.unique(train['iid'])
		self.train_item_ids = train_item_ids

		# Raw id -> position in the sorted unique id arrays.
		user_pos = {}
		for pos, uid in enumerate(train_user_ids):
			user_pos[uid] = pos
		item_pos = {}
		for pos, iid in enumerate(train_item_ids):
			item_pos[iid] = pos

		train_rows = [user_pos[uid] for uid in train['uid']]
		train_cols = [item_pos[iid] for iid in train['iid']]
		train_shape = [len(train_user_ids), len(train_item_ids)]
		train_coo = sp.coo_matrix(
			(np.ones(len(train_rows)), (train_rows, train_cols)),
			shape=train_shape)
		self.R_train = train_coo.tolil(copy=False)

		self.train_items = self.record_list(train_cols, train_rows)
		self.train_u = train_rows
		self.train_i = train_cols

		# ---- cold-start test interactions ----
		with open(test_cold_iid_file) as id_file:
			test_item_ids = [int(raw_line) for raw_line in id_file]
		self.test_item_ids = test_item_ids

		test_flat = pd.read_csv(test_cold_file, delimiter=",", header=None, dtype=np.int32).values.ravel()
		test_data = test_flat.view(dtype=record_dtype)

		test_item_pos = {iid: pos for pos, iid in enumerate(test_item_ids)}

		# Keep only test rows whose item id is listed in the id file.
		kept_pairs = []
		for uid, iid in zip(test_data['uid'], test_data['iid']):
			if iid in test_item_pos:
				kept_pairs.append((uid, iid))

		# Computed as in the original; not referenced again in this method.
		test_user_ids = np.unique(test_data['uid'])

		# Test users are indexed with the training user map so that rows of
		# R_test line up with rows of R_train.
		test_rows = [user_pos[uid] for uid, _iid in kept_pairs]
		test_cols = [test_item_pos[iid] for _uid, iid in kept_pairs]
		test_shape = [len(train_user_ids), len(test_item_ids)]
		test_coo = sp.coo_matrix(
			(np.ones(len(test_rows)), (test_rows, test_cols)),
			shape=test_shape)
		self.R_test = test_coo.tolil(copy=False)

		self.test_items = self.record_list(test_cols, test_rows)
		self.test_u = test_rows
		self.test_users = self.record_list(test_rows, test_cols)

		# ---- item content: tfidf -> 300-component randomized SVD -> standardize ----
		item_content, _ = datasets.load_svmlight_file(item_content_file, zero_based=True, dtype=np.float32)
		item_content = tfidf(item_content)

		from sklearn.utils.extmath import randomized_svd
		left_vecs, singular_vals, _ = randomized_svd(item_content, n_components=300, n_iter=5)
		item_content = left_vecs * singular_vals
		_, item_content = utils.prep_standardize(item_content)

		# Sparse results are stored as LIL; anything else is stored unchanged.
		item_feature = item_content.tolil(copy=False) if sp.issparse(item_content) else item_content
		self.item_content = item_feature

		# Side information rows, selected by raw item id.
		self.S_tr = item_feature[train_item_ids, :]
		self.S_te = item_feature[test_item_ids, :]
		if sp.issparse(self.S_tr):
			self.S_tr = self.S_tr.todense()
			self.S_te = self.S_te.todense()

		# X_* are item-by-user, U_* are user-by-item.
		self.X_tr = self.R_train.todense().T
		self.U_tr = self.R_train.todense()

		self.X_te = self.R_test.todense().T
		self.U_te = self.R_test.todense()

		# Sizes are derived from the leading dimensions of the dense matrices.
		self.n_users = self.X_tr.shape[0] + self.X_te.shape[0]
		self.n_items = self.U_tr.shape[0]