def als_model(self, dataset):
    """Return a fresh WALS model sized to ``dataset``.

    The factor matrices get ``dataset.n_students`` rows and
    ``dataset.n_courses`` columns with ``self.num_factors`` latent
    dimensions. Unobserved entries are given zero weight.

    :param dataset: object exposing ``n_students`` and ``n_courses``.
    :return: an untrained ``WALSModel`` instance.
    """
    n_rows = dataset.n_students
    n_cols = dataset.n_courses
    n_factors = self.num_factors
    model = WALSModel(
        n_rows,
        n_cols,
        n_factors,
        regularization=self.regularization,
        unobserved_weight=0,
    )
    return model
def run_wals(data, dim, reg, unobs, weights=False, wt_type=LINEAR_RATINGS,
             feature_wt_exp=None, obs_wt=LINEAR_OBS_W):
    """Create the WALSModel and input, row and col factor tensors.

    Inputs:
      data:           scipy coo_matrix of item ratings
      dim:            number of latent factors
      reg:            regularization constant
      unobs:          unobserved item weight
      weights:        True: set obs weights, False: obs weights = unobs weights
      wt_type:        feature weight type: linear (0) or log (1)
      feature_wt_exp: feature weight exponent constant
      obs_wt:         feature weight linear factor constant

    Outputs:
      input_tensor: tensor holding the input ratings matrix
      row_factor:   tensor for row_factor
      col_factor:   tensor for col_factor
      model:        WALSModel instance
    """
    num_rows = data.shape[0]
    num_cols = data.shape[1]

    row_wts = None
    col_wts = None
    if weights:
        assert feature_wt_exp is not None
        row_wts = np.ones(num_rows)
        col_wts = make_wts(data, wt_type, obs_wt, feature_wt_exp, 0)

    with tf.Graph().as_default():
        # zip() is a one-shot iterator on Python 3 and cannot be converted
        # to a tensor, so materialise the (row, col) pairs first.
        input_tensor = tf.SparseTensor(
            indices=list(zip(data.row, data.col)),
            values=(data.data).astype(np.float32),
            dense_shape=data.shape)

        model = WALSModel(num_rows, num_cols, dim,
                          unobserved_weight=unobs,
                          regularization=reg,
                          row_weights=row_wts,
                          col_weights=col_wts)

        # retrieve the row and column factors (first shard of each)
        row_factor = model.row_factors[0]
        col_factor = model.col_factors[0]

    return input_tensor, row_factor, col_factor, model
def _build_model(self):
    """Build the WALS computation graph.

    Populates ``self.row_wts`` / ``self.col_wts`` (when weighting is
    enabled), ``self.input_tensor``, ``self.model``, ``self.row_factor``
    and ``self.col_factor``.

    :return: None
    """
    n_rows, n_cols = self.data.shape[0], self.data.shape[1]

    # Weight initialisation strategy:
    #   'user' - unit row weights, column weights derived from the data
    #   'item' - unit column weights, row weights derived from the data
    # Any other weight_type leaves the weights untouched.
    if self.weights and self.weight_type == 'user':
        self.row_wts = np.ones(n_rows)
        self.col_wts = self._make_wts(
            self.data, self.wt_type, self.obs_wt, self.feature_wt_exp, 0)
    elif self.weights and self.weight_type == 'item':
        self.col_wts = np.ones(n_cols)
        self.row_wts = self._make_wts(
            self.data, self.wt_type, self.obs_wt, self.feature_wt_exp, 1)

    graph = tf.Graph()
    with graph.as_default():
        coords = list(zip(self.data.row, self.data.col))
        ratings = (self.data.data).astype(np.float32)
        self.input_tensor = tf.SparseTensor(
            indices=coords, values=ratings, dense_shape=self.data.shape)

        self.model = WALSModel(
            n_rows,
            n_cols,
            self.dim,
            unobserved_weight=self.unobs,
            regularization=self.reg,
            row_weights=self.row_wts,
            col_weights=self.col_wts,
        )

        # Only the first shard of each factor matrix is kept.
        self.row_factor = self.model.row_factors[0]
        self.col_factor = self.model.col_factors[0]
def get_model(data, ncomponents=10, unobserved_weight=0, regularization=0.05):
    """Build a WALS model and its sparse input tensor for ``data``.

    :param data: sparse matrix exposing ``shape``, ``row``, ``col`` and
        ``data`` (e.g. a scipy ``coo_matrix``).
    :param ncomponents: number of latent factors.
    :param unobserved_weight: weight applied to unobserved entries.
    :param regularization: regularization constant.
    :return: ``(input_tensor, row_factor, col_factor, model)``; the factors
        are the first shard of the model's row and column factors.
    """
    n_rows, n_cols = data.shape
    uniform_row_wts = np.ones(n_rows)
    uniform_col_wts = np.ones(n_cols)

    with tf.Graph().as_default():
        coords = np.column_stack((data.row, data.col))
        ratings = (data.data).astype(np.float32)
        sparse_input = tf.SparseTensor(coords, ratings, data.shape)

        wals = WALSModel(
            n_rows,
            n_cols,
            ncomponents,
            unobserved_weight,
            regularization,
            row_weights=uniform_row_wts,
            col_weights=uniform_col_wts,
        )

    row_factor = wals.row_factors[0]
    col_factor = wals.col_factors[0]
    return sparse_input, row_factor, col_factor, wals
def train_model(train_sparse, test_sparse, num_users, num_movies, args,
                verbose=False):
    """Train a WALS model and return the learned factor matrices.

    :param train_sparse: training ratings; must expose ``row``, ``col``,
        ``data`` and ``shape`` (e.g. a scipy ``coo_matrix``).
    :param test_sparse: held-out ratings in the same format.
    :param num_users: number of rows of the factorised matrix.
    :param num_movies: number of columns of the factorised matrix.
    :param args: namespace providing ``num_factors``, ``regularization``,
        ``unobserved_weight``, ``epochs``, ``col_weight_bool``,
        ``col_weight_factor``, ``row_weight_bool`` and ``row_weight_factor``.
    :param verbose: when True, log train/test RMSE after every epoch.
    :return: ``(row_factor, col_factor)`` as evaluated arrays.
    """
    tf.logging.info('Train Start: {:%Y-%m-%d %H:%M:%S}'.format(
        datetime.datetime.now()))

    # Entering the session makes it the default, so ``op.run()`` and
    # ``tensor.eval()`` below need no explicit session argument.
    with tf.Graph().as_default(), tf.Session():
        row_weights = np.ones(num_users)
        col_weights = np.ones(num_movies)

        if args.col_weight_bool:
            col_weights = make_weights(train_sparse,
                                       args.col_weight_factor, axis=0)
        if args.row_weight_bool:
            row_weights = make_weights(train_sparse,
                                       args.row_weight_factor, axis=1)

        # create model
        model = WALSModel(num_users, num_movies, args.num_factors,
                          regularization=args.regularization,
                          unobserved_weight=args.unobserved_weight,
                          row_weights=row_weights,
                          col_weights=col_weights)

        # create sparse tensors; zip() is a one-shot iterator on Python 3
        # and cannot be converted to a tensor, so materialise it first.
        input_tensor = tf.SparseTensor(
            indices=list(zip(train_sparse.row, train_sparse.col)),
            values=(train_sparse.data).astype(np.float32),
            dense_shape=train_sparse.shape)
        test_tensor = tf.SparseTensor(
            indices=list(zip(test_sparse.row, test_sparse.col)),
            values=(test_sparse.data).astype(np.float32),
            dense_shape=test_sparse.shape)

        # train model
        rmse_op = rmse(model, input_tensor) if verbose else None
        rmse_test_op = rmse(model, test_tensor)

        row_update_op = model.update_row_factors(sp_input=input_tensor)[1]
        col_update_op = model.update_col_factors(sp_input=input_tensor)[1]

        model.initialize_op.run()
        model.worker_init.run()
        for _ in range(args.epochs):
            # Update Users
            model.row_update_prep_gramian_op.run()
            model.initialize_row_update_op.run()
            row_update_op.run()

            # Update Items
            model.col_update_prep_gramian_op.run()
            model.initialize_col_update_op.run()
            col_update_op.run()

            if verbose:
                train_metric = rmse_op.eval()
                test_metric = rmse_test_op.eval()
                tf.logging.info('RMSE Train: {:,.3f}'.format(train_metric))
                tf.logging.info('RMSE Test: {:,.3f}'.format(test_metric))
                # TODO Collect these in variable for graphing later

        row_factor = model.row_factors[0].eval()
        col_factor = model.col_factors[0].eval()

        tf.logging.info('Train Finish: {:%Y-%m-%d %H:%M:%S}'.format(
            datetime.datetime.now()))

        return row_factor, col_factor
n_rows = len(users_from_idx) n_cols = len(items_from_idx) shape = (n_rows, n_cols) P = tf.SparseTensor(indices, values, shape) print(P) print('Total values: {:,}'.format(n_rows * n_cols)) from tensorflow.contrib.factorization import WALSModel k = 10 n = 10 reg = 1e-1 model = WALSModel(n_rows, n_cols, k, regularization=reg, unobserved_weight=0) row_factors = tf.nn.embedding_lookup(params=model.row_factors, ids=tf.range(model._input_rows), partition_strategy="div") col_factors = tf.nn.embedding_lookup(params=model.col_factors, ids=tf.range(model._input_cols), partition_strategy="div") row_indices, col_indices = tf.split(P.indices, axis=1, num_or_size_splits=2) gathered_row_factors = tf.gather(row_factors, row_indices) gathered_col_factors = tf.gather(col_factors, col_indices) approx_vals = tf.squeeze( tf.matmul(gathered_row_factors, gathered_col_factors, adjoint_b=True)) P_approx = tf.SparseTensor(indices=P.indices, values=approx_vals,
# Read the interaction lists: element ``idx`` of the JSON array holds the
# column ids observed for row ``idx``.
with open(data_path, 'r') as f:
    data = json.load(f)

# Expand the per-row lists into (row, col) coordinates with a value of
# 1.0 for every observed entry.
indices = []
values = []
for idx, elem in enumerate(data):
    indices.extend((idx, col_id) for col_id in elem)
    values.extend([1.0] * len(elem))

with tf.Graph().as_default() as graph1:
    sp_mat = tf.SparseTensor(indices, values, [num_rows, num_cols])
    # Positional arguments 4 and 5 are WALSModel's unobserved_weight and
    # regularization.
    model = WALSModel(
        num_rows,
        num_cols,
        dimension,
        0.5,
        2.0,
        row_weights=None,
        col_weights=None,
    )
    row_factors = model.row_factors[0]
    col_factors = model.col_factors[0]

    sess = tf.Session(graph=graph1)
    writer = tf.summary.FileWriter('walsmodels', graph1)

    row_update_op = model.update_row_factors(sp_mat)[1]
    col_update_op = model.update_col_factors(sp_mat)[1]

    sess.run(model.initialize_op)
def als_model(self, dataset):
    """Return a fresh WALS model sized to the ids present in ``dataset``.

    The row count is the number of distinct ``"visitorid"`` values and the
    column count is the number of distinct ``"itemid"`` values. Unobserved
    entries are given zero weight.

    :param dataset: table indexable by ``"visitorid"`` and ``"itemid"``
        whose columns support ``.unique()``.
    :return: an untrained ``WALSModel`` instance.
    """
    visitor_ids = dataset["visitorid"].unique()
    item_ids = dataset["itemid"].unique()
    n_rows = len(visitor_ids)
    n_cols = len(item_ids)
    n_factors = self.num_factors
    model = WALSModel(
        n_rows,
        n_cols,
        n_factors,
        regularization=self.regularization,
        unobserved_weight=0,
    )
    return model
def reco(sess, inp, code, label, epsilon, train_dataset, dev_dataset, lr, weights_path):
    """Fit WALS factorisations of increasing rank on a placeholder rating matrix.

    Starting from ``factor_dim = 1`` the latent dimension is increased by one
    per outer iteration. For each dimension a new ``WALSModel`` is trained by
    alternating row/column updates until the largest change in either factor
    matrix drops to ``factor_loss_thresh`` or below. The outer loop stops once
    the combined loss ``losscrit`` is no longer greater than ``tuning_thresh``.

    NOTE(review): ``inp``, ``code``, ``epsilon``, ``dev_dataset``, ``lr`` and
    ``weights_path`` are not referenced in the visible body, and no ``return``
    statement is visible - confirm whether the function continues elsewhere.

    :param sess: TensorFlow session used for every ``run`` / ``eval`` call.
    :param label: 2-D indexable labels; only the last column is read.
    :param train_dataset: dataset whose one-shot iterator is handed to
        ``_train_utils.dataset_iter_len`` to obtain ``n_items``.
    """
    # Initialize hyperparameters
    # TODO: Proper tuning_threshold strategy, or is there a better stopping condition?
    # TODO: Grid search for reg_l2 tuning? Currently only tune factor_dim
    factor_dim = 0
    reg_l2 = 0.1
    factor_loss_thresh = 1e-6
    tuning_thresh = 1e-6

    # Ratings matrix dimensions
    n_items = _train_utils.dataset_iter_len(
        sess, train_dataset.make_one_shot_iterator().get_next())
    # Split sizes are hard-coded.
    n_users_train = 877
    n_users_dev = 110
    n_users_test = 110
    '''Placeholder labels label = np.random.randn(n_users_train + n_users_dev + n_users_test, 1) label = tf.convert_to_tensor(label, dtype=tf.float32) '''
    # NOTE(review): the slices start at row 1, the dev slice spans
    # n_users_dev + 1 rows and the test slice drops the final row - confirm
    # these offsets are intended.
    label_train = label[1:n_users_train + 1, -1]
    label_dev = label[n_users_train + 1:n_users_train + 1 + n_users_dev + 1, -1]
    label_test = label[n_users_train + 1 + n_users_dev + 1:-1, -1]

    # Rating matrix
    # TODO: Random placeholder data for now. Rating matrix must include all train/dev/test
    # data. Each row represents a user, and each column represents a feature. The label
    # is to be included in the last feature column, with dev/test set labels removed.
    rating_matrix = np.random.randn(n_users_train + n_users_dev + n_users_test, n_items)
    input_tensor = tf.convert_to_tensor(rating_matrix, dtype=tf.float32)
    input_tensor = tf.contrib.layers.dense_to_sparse(input_tensor)

    # Tune model using increasing latent factor matrix dimension
    losscrit = np.inf
    while losscrit > tuning_thresh:
        factor_dim += 1

        # Weighted alternating least squares model (causes deprecation warning)
        # A new model (and its ops) is added to the session's graph on every pass.
        model = WALSModel(n_users_train + n_users_dev + n_users_test, n_items, factor_dim, regularization=reg_l2, row_weights=None, col_weights=None)

        # Retrieve row and column factors
        users_factor = model.row_factors[0]
        items_factor = model.col_factors[0]

        # Initialize training
        row_update_op = model.update_row_factors(sp_input=input_tensor)[1]
        col_update_op = model.update_col_factors(sp_input=input_tensor)[1]
        sess.run(model.initialize_op)
        sess.run(model.worker_init)

        # Update latent factor matrices via Alternating Least Squares until matrix decomposition converges
        u_factor_old = users_factor.eval(session=sess)
        i_factor_old = items_factor.eval(session=sess)
        factor_loss = np.inf
        while factor_loss > factor_loss_thresh:
            # One full ALS sweep: row (user) update, then column (item) update.
            sess.run(model.row_update_prep_gramian_op)
            sess.run(model.initialize_row_update_op)
            sess.run(row_update_op)
            sess.run(model.col_update_prep_gramian_op)
            sess.run(model.initialize_col_update_op)
            sess.run(col_update_op)
            u_factor_new = users_factor.eval(session=sess)
            i_factor_new = items_factor.eval(session=sess)
            # Convergence measure: the larger of the two factor-change norms.
            factor_loss = max(np.linalg.norm(u_factor_new - u_factor_old), np.linalg.norm(i_factor_new - i_factor_old))
            u_factor_old = u_factor_new
            i_factor_old = i_factor_new

        # Predictions
        pred_fun = tf.matmul(users_factor, items_factor, transpose_b=True)
        pred = sess.run(pred_fun)
        # Same row offsets as the label slices above; only the last column is used.
        pred_train = pred[1:n_users_train + 1, -1]
        pred_dev = pred[n_users_train + 1:n_users_train + 1 + n_users_dev + 1, -1]
        pred_test = pred[n_users_train + 1 + n_users_dev + 1:-1, -1]

        # Performance
        # NOTE(review): ``pred - label`` uses the full prediction matrix and the
        # full ``label`` - presumably relies on broadcasting; confirm shapes.
        loss_fun = tf.math.reduce_sum(tf.math.square(
            tf.abs(pred - label))) + tf.nn.l2_loss(
                users_factor) + tf.nn.l2_loss(items_factor)
        losscrit = sess.run(loss_fun)
        # Mean absolute error per split.
        train_loss = sess.run(tf.reduce_mean(tf.abs(pred_train - label_train)))
        dev_loss = sess.run(tf.reduce_mean(tf.abs(pred_dev - label_dev)))
        test_loss = sess.run(tf.reduce_mean(tf.abs(pred_test - label_test)))
class WRMFRecommender(object):
    """Weighted matrix factorisation recommender.

    Two training paths are provided: ``eval_train_tf`` (TensorFlow
    ``WALSModel``) and ``eval_train`` (plain NumPy alternating least
    squares). Both leave the learned factors in ``output_row`` /
    ``output_col``.
    """

    def __init__(self, config):
        """Initialise the recommender from a configuration mapping.

        :param config: mapping with the keys
            data: training data (sparse rating matrix)
            val: held-out data used by ``eval_test``
            user_map: user index mapping
            item_map: item index mapping
            weight_type: weight initialisation strategy: ['user'|'item']
            weights: whether to use weights
            wt_type: linear or exponential weight transform
            obs_wt: parameter of the linear weight transform
            feature_wt_exp: parameter of the exponential weight transform
            dim: latent dimension
            unobs: weight given to unobserved entries
            reg: regularization parameter
            num_iterations: number of training iterations
            save_path: directory the model is saved to
            topn: number of recommendations per user
        """
        self.data = config['data']
        self.test = config['val']
        self.user_map = config['user_map']
        self.item_map = config['item_map']
        self.weight_type = config['weight_type']
        self.weights = config['weights']
        self.wt_type = config['wt_type']
        self.obs_wt = config['obs_wt']
        self.feature_wt_exp = config['feature_wt_exp']
        self.dim = config['dim']
        self.unobs = config['unobs']
        self.reg = config['reg']
        self.num_iterations = config['num_iterations']
        self.save_path = config['save_path']
        self.topn = config['topn']
        self.output_row = None
        self.output_col = None
        self.row_wts = None
        self.col_wts = None

    def _build_model(self):
        """Build the WALS computation graph.

        :return: None
        """
        num_rows = self.data.shape[0]
        num_cols = self.data.shape[1]
        # Weight matrix initialisation:
        # 1. User orientation: unit row weights, data-driven column weights
        # 2. Item orientation: unit column weights, data-driven row weights
        if self.weights:
            if self.weight_type == 'user':
                self.row_wts = np.ones(num_rows)
                self.col_wts = self._make_wts(self.data, self.wt_type,
                                              self.obs_wt,
                                              self.feature_wt_exp, 0)
            elif self.weight_type == 'item':
                self.col_wts = np.ones(num_cols)
                self.row_wts = self._make_wts(self.data, self.wt_type,
                                              self.obs_wt,
                                              self.feature_wt_exp, 1)

        with tf.Graph().as_default():
            self.input_tensor = tf.SparseTensor(
                indices=list(zip(self.data.row, self.data.col)),
                values=(self.data.data).astype(np.float32),
                dense_shape=self.data.shape)
            self.model = WALSModel(num_rows,
                                   num_cols,
                                   self.dim,
                                   unobserved_weight=self.unobs,
                                   regularization=self.reg,
                                   row_weights=self.row_wts,
                                   col_weights=self.col_wts)
            self.row_factor = self.model.row_factors[0]
            self.col_factor = self.model.col_factors[0]

    def eval_train_tf(self):
        """Train the model with TensorFlow's WALS implementation.

        After every iteration the factors are copied into ``output_row`` /
        ``output_col``; the ranking metrics are refreshed on every second
        iteration. The session is always closed, even if training raises.

        :return: None
        """
        tf.logging.info('Train Start: {:%Y-%m-%d %H:%M:%S}'.format(
            datetime.datetime.now()))
        self._build_model()
        graph = self.input_tensor.graph
        self.sess = tf.Session(graph=graph)
        try:
            with graph.as_default():
                self.saver = tf.train.Saver([self.row_factor,
                                             self.col_factor])
                row_update_op = self.model.update_row_factors(
                    sp_input=self.input_tensor)[1]
                col_update_op = self.model.update_col_factors(
                    sp_input=self.input_tensor)[1]
                self.sess.run(self.model.initialize_op)
                self.sess.run(self.model.worker_init)
                # Restore only after initialisation: initialize_op resets
                # the factor variables, so a checkpoint restored before it
                # would be overwritten.
                self.load_tf_model()
                for i in range(self.num_iterations):
                    self.sess.run(self.model.row_update_prep_gramian_op)
                    self.sess.run(self.model.initialize_row_update_op)
                    self.sess.run(row_update_op)
                    self.sess.run(self.model.col_update_prep_gramian_op)
                    self.sess.run(self.model.initialize_col_update_op)
                    self.sess.run(col_update_op)
                    self.output_row = self.row_factor.eval(session=self.sess)
                    self.output_col = self.col_factor.eval(session=self.sess)
                    if i % 2 == 0:
                        self.eval_ranking(self.topn)
                    # Checkpointing is disabled; call self.save_tf_model(i)
                    # here to enable it.
            tf.logging.info('Train Finish: {:%Y-%m-%d %H:%M:%S}'.format(
                datetime.datetime.now()))
        finally:
            self.sess.close()

    def eval_test(self, user_idx):
        """Return the items a user interacted with in the held-out set.

        :param user_idx: user row index
        :return: array of item indices from the held-out matrix
        """
        return self.test.getrow(user_idx).indices

    def eval_recommend(self, user_idx, k):
        """Build the top-``k`` recommendation list for one user.

        Items the user already has in the training data are excluded.

        :param user_idx: user row index
        :param k: size of the recommendation list
        :return: list of item indices, best first
        """
        user_rated = self.data.getrow(user_idx).indices
        assert (self.output_col.shape[0] - len(user_rated)) >= k
        user_f = self.output_row[user_idx]
        pred_ratings = self.output_col.dot(user_f)

        # Take k + |rated| candidates so that k remain after filtering.
        k_r = k + len(user_rated)
        candidate_items = np.argsort(pred_ratings)[-k_r:]
        rated = set(user_rated)
        recommended_items = [i for i in candidate_items if i not in rated]
        recommended_items = recommended_items[-k:]
        recommended_items.reverse()
        return recommended_items

    def eval_ranking(self, N):
        """Evaluate the model with ranking metrics.

        The result of ``Metrics.ranking_measure`` is stored in
        ``self.measure``.

        :param N: number of items recommended to each user
        :return: None
        """
        rec_list = {}
        test_list = {}
        for ux in range(len(self.user_map)):
            user_key = self.user_map[ux]
            rec_list[user_key] = self.eval_recommend(ux, N)
            test_list[user_key] = self.eval_test(ux)
        self.measure = Metrics.ranking_measure(test_list, rec_list, N)

    def save_tf_model(self, step):
        """Save a TensorFlow checkpoint.

        :param step: global step number
        :return: None
        """
        self.saver.save(self.sess,
                        os.path.join(self.save_path, 'tf'),
                        global_step=step)

    def load_tf_model(self):
        """Restore the latest TensorFlow checkpoint, if one exists.

        :return: None
        """
        ckpt = tf.train.get_checkpoint_state(self.save_path)
        if ckpt and ckpt.model_checkpoint_path:
            self.saver.restore(self.sess, ckpt.model_checkpoint_path)
        else:
            print("No checkpoint file.")

    def save_model(self):
        """Save the mappings and latent matrices with NumPy.

        :return: None
        """
        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)
        np.save(os.path.join(self.save_path, 'user'), self.user_map)
        np.save(os.path.join(self.save_path, 'item'), self.item_map)
        np.save(os.path.join(self.save_path, 'row'), self.output_row)
        np.save(os.path.join(self.save_path, 'col'), self.output_col)

    def load_model(self):
        """Load the mappings and latent matrices saved by ``save_model``.

        :return: None
        """
        self.user_map = np.load(os.path.join(self.save_path, 'user.npy'))
        self.item_map = np.load(os.path.join(self.save_path, 'item.npy'))
        self.output_row = np.load(os.path.join(self.save_path, 'row.npy'))
        self.output_col = np.load(os.path.join(self.save_path, 'col.npy'))

    def _update_user_factors(self, ratings_csr):
        """Re-solve every user vector with the item factors held fixed.

        :param ratings_csr: training ratings in CSR format
        :return: summed squared error on the observed entries, measured
            with each user's vector before it is updated
        """
        item_factors = self.output_col
        gram = item_factors.T.dot(item_factors)  # YtY, shared by all users
        ridge = self.reg * np.eye(self.dim)
        indptr = ratings_csr.indptr
        sq_err = 0.0
        for uid in range(len(self.user_map)):
            start, stop = indptr[uid], indptr[uid + 1]
            items = ratings_csr.indices[start:stop]
            r_u = np.asarray(ratings_csr.data[start:stop], dtype=float)
            obs = item_factors[items]

            # The preference of every observed entry is 1.
            residual = 1.0 - obs.dot(self.output_row[uid])
            sq_err += float((residual ** 2).sum())

            # Xu = (YtY + Yt*Cu*Y + lambda*I)^-1 * Yt*(1 + r_u); only the
            # observed items contribute to the Cu and right-hand terms.
            lhs = gram + (obs.T * r_u).dot(obs) + ridge
            rhs = obs.T.dot(1.0 + r_u)
            self.output_row[uid] = np.linalg.solve(lhs, rhs)
        return sq_err

    def _update_item_factors(self, ratings_csc):
        """Re-solve every item vector with the user factors held fixed.

        :param ratings_csc: training ratings in CSC format, so that
            ``indices`` holds the user (row) ids of each item column
        :return: None
        """
        user_factors = self.output_row
        gram = user_factors.T.dot(user_factors)  # XtX, shared by all items
        ridge = self.reg * np.eye(self.dim)
        indptr = ratings_csc.indptr
        for iid in range(len(self.item_map)):
            start, stop = indptr[iid], indptr[iid + 1]
            users = ratings_csc.indices[start:stop]
            r_i = np.asarray(ratings_csc.data[start:stop], dtype=float)
            obs = user_factors[users]

            # Yi = (XtX + Xt*Ci*X + lambda*I)^-1 * Xt*(1 + r_i)
            lhs = gram + (obs.T * r_i).dot(obs) + ridge
            rhs = obs.T.dot(1.0 + r_i)
            self.output_col[iid] = np.linalg.solve(lhs, rhs)

    def eval_train(self):
        """Train with plain NumPy alternating least squares.

        Factors are loaded from ``save_path`` when both ``row.npy`` and
        ``col.npy`` exist, otherwise initialised uniformly at random. After
        each iteration the loss is printed and the ranking metrics are
        refreshed; the model is saved after every second iteration.

        :return: None
        """
        print('Start training...')
        num_rows = self.data.shape[0]
        num_cols = self.data.shape[1]
        if os.path.exists(os.path.join(
                self.save_path, 'row.npy')) and os.path.exists(
                    os.path.join(self.save_path, 'col.npy')):
            self.load_model()
        else:
            self.output_row = np.random.rand(num_rows, self.dim)  # X in the paper
            self.output_col = np.random.rand(num_cols, self.dim)  # Y in the paper

        # Convert once. CSR gives the items of each user; CSC gives the
        # users of each item. Reading ``getcol(i).indices`` on a COO/CSR
        # matrix would yield column indices (all zero), not user ids.
        ratings_by_user = self.data.tocsr()
        ratings_by_item = self.data.tocsc()

        for iteration in range(self.num_iterations):
            print('iteration:', iteration)
            self.loss = self._update_user_factors(ratings_by_user)
            self._update_item_factors(ratings_by_item)

            self.loss += self.reg * (
                (self.output_row * self.output_row).sum() +
                (self.output_col * self.output_col).sum())
            print('Loss:', self.loss)
            self.eval_ranking(self.topn)
            if (iteration + 1) % 2 == 0:
                self.save_model()

    @staticmethod
    def _make_wts(data, wt_type, obs_wt, feature_wt_exp, axis):
        """Compute per-row or per-column weights from observation counts.

        The base quantity is ``1 / count`` of strictly positive entries
        along ``axis``; rows/columns without any positive entry get 0.

        :param data: training data set (sparse matrix)
        :param wt_type: 1 for the exponential transform, otherwise linear
        :param obs_wt: factor of the linear transform
        :param feature_wt_exp: exponent of the exponential transform
        :param axis: axis the counts are summed over
        :return: flat array of weights along the remaining dimension
        """
        # Empty rows/columns divide by zero on purpose; they are zeroed
        # right below, so silence the warning.
        with np.errstate(divide='ignore'):
            frac = np.array(1.0 / (data > 0.0).sum(axis))
        frac[~np.isfinite(frac)] = 0.0
        if wt_type == 1:
            wts = np.array(np.power(frac, feature_wt_exp)).flatten()
        else:
            wts = np.array(obs_wt * frac).flatten()
        assert np.isfinite(wts).sum() == wts.shape[0]
        return wts
def wals(id, from_date, to_date, predict_moment, dimension=30, weight=0.5,
         coef=2.0, n_iter=30):
    """Score the deals offered at ``predict_moment`` for one user with WALS.

    The interaction data and id dictionaries for ``from_date``..``to_date``
    are read from the working directory, a WALS model is trained for
    ``n_iter`` sweeps, and every candidate deal is scored by the dot
    product of the user's and the deal's latent vectors.

    :param id: user id to score for
    :param from_date: start-date string used in the input file names
    :param to_date: end-date string used in the input file names
    :param predict_moment: prefix of the ``WepickDeal`` primary keys whose
        deals are scored
    :param dimension: number of latent factors
    :param weight: passed to WALSModel as ``unobserved_weight``
    :param coef: passed to WALSModel as ``regularization``
    :param n_iter: number of alternating row/column sweeps
    :return: ``-1`` when ``id`` is not in the user dictionary, otherwise a
        list of ``{'id', 'slot', 'score'}`` dicts, one per candidate deal
    """
    data_path = 'wp_' + from_date + '_' + to_date + '_sparse.json'
    deal_dict = np.load('dict_' + from_date + '_' + to_date +
                        '_for_sparse.npy')
    user_dict = np.load('user_' + from_date + '_' + to_date + '.npy')
    if id not in user_dict:
        return -1
    user_index = np.where(user_dict == id)[0][0]

    num_rows = len(user_dict)
    num_cols = len(deal_dict)

    connect('wprec', host='mongodb://10.102.61.251:27017')
    deals = WepickDeal.objects(pk__gte=predict_moment + ' 20',
                               pk__lte=predict_moment + ' 99')

    # Keep only the deals the model has a column for.
    deal_slots = []
    deal_ids = []
    for elem in deals:
        dealid = elem['deal'].id
        if dealid in deal_dict:
            # The slot number is the last two characters of the key.
            deal_slots.append(int(elem.id[-2:]))
            deal_ids.append(dealid)
    deal_finder = dict(zip(deal_dict, range(num_cols)))

    with open(data_path, 'r') as f:
        data = json.load(f)

    # Element ``idx`` of the JSON array lists the columns observed for row
    # ``idx``; every observed entry gets the value 1.0.
    indices = []
    values = []
    for idx, elem in enumerate(data):
        indices += zip([idx] * len(elem), elem)
        values += [1.0] * len(elem)

    with tf.Graph().as_default() as graph1:
        sp_mat = tf.SparseTensor(indices, values, [num_rows, num_cols])
        model = WALSModel(num_rows, num_cols, dimension, weight, coef,
                          row_weights=None, col_weights=None)
        row_factors = model.row_factors[0]
        col_factors = model.col_factors[0]
        row_update_op = model.update_row_factors(sp_mat)[1]
        col_update_op = model.update_col_factors(sp_mat)[1]

        # The context manager closes the session even if a step raises.
        with tf.Session(graph=graph1) as sess:
            sess.run(model.initialize_op)
            for _ in range(n_iter):
                sess.run(model.row_update_prep_gramian_op)
                sess.run(model.initialize_row_update_op)
                sess.run(row_update_op)
                sess.run(model.col_update_prep_gramian_op)
                sess.run(model.initialize_col_update_op)
                sess.run(col_update_op)

            output_row = row_factors.eval(sess)
            output_col = col_factors.eval(sess)

    user_vector = output_row[user_index]
    results = []
    for deal_id, slot in zip(deal_ids, deal_slots):
        deal_index = deal_finder[deal_id]
        results.append({
            'id': deal_id,
            'slot': slot,
            'score': sum(user_vector[:] * output_col[deal_index])
        })
    return results
def wals_cate(from_date, to_date, dimension=10, weight=0.5, coef=2.0,
              n_iter=30):
    """Train a user x category WALS model and dump both factor matrices.

    The factor matrices are written as JSON to ``../temp_user<N>.json`` and
    ``../temp_item<N>.json``, where ``<N>`` is a random number below 100.

    :param from_date: start-date string used in the input file names
    :param to_date: end-date string used in the input file names
    :param dimension: number of latent factors
    :param weight: passed to WALSModel as ``unobserved_weight``
    :param coef: passed to WALSModel as ``regularization``
    :param n_iter: number of alternating row/column sweeps
    :return: ``(dimension, user_temp_name, item_temp_name)``; the names
        are the output file stems without directory or extension
    """
    data_path = 'wp_' + from_date + '_' + to_date + '_cate.json'
    cate_dict = np.load('cate_dict.npy')
    user_dict = np.load('user_' + from_date + '_' + to_date +
                        '_for_cate.npy')
    num_rows = len(user_dict)
    num_cols = len(cate_dict)

    with open(data_path, 'r') as f:
        data = json.load(f)

    # Element ``idx`` of the JSON array lists the columns observed for row
    # ``idx``; every observed entry gets the value 1.0.
    indices = []
    values = []
    for idx, elem in enumerate(data):
        indices += zip([idx] * len(elem), elem)
        values += [1.0] * len(elem)

    with tf.Graph().as_default() as graph1:
        sp_mat = tf.SparseTensor(indices, values, [num_rows, num_cols])
        model = WALSModel(num_rows, num_cols, dimension, weight, coef,
                          row_weights=None, col_weights=None)
        row_factors = model.row_factors[0]
        col_factors = model.col_factors[0]
        row_update_op = model.update_row_factors(sp_mat)[1]
        col_update_op = model.update_col_factors(sp_mat)[1]

        # The context manager closes the session even if a step raises.
        with tf.Session(graph=graph1) as sess:
            sess.run(model.initialize_op)
            for _ in range(n_iter):
                sess.run(model.row_update_prep_gramian_op)
                sess.run(model.initialize_row_update_op)
                sess.run(row_update_op)
                sess.run(model.col_update_prep_gramian_op)
                sess.run(model.initialize_col_update_op)
                sess.run(col_update_op)

            output_row = row_factors.eval(sess).tolist()
            output_col = col_factors.eval(sess).tolist()

    # temporary mechanism for generated matrices
    # NOTE(review): with only 100 possible suffixes, concurrent or repeated
    # runs can overwrite each other's files.
    random.seed()
    temp_num = str(random.randrange(100))
    user_temp_name = 'temp_user' + temp_num
    item_temp_name = 'temp_item' + temp_num
    with open('../' + user_temp_name + '.json', 'w') as f:
        json.dump(output_row, f)
    with open('../' + item_temp_name + '.json', 'w') as f:
        json.dump(output_col, f)
    print('files saved')

    return dimension, user_temp_name, item_temp_name