class ArtistResponse(object):
    def __init__(self):
        self.model = TFFMRegressor(
            optimizer=tf.train.AdamOptimizer(learning_rate=0.01),
            n_epochs=1000,
            input_type='sparse')

    def generate_feature_matrix(self, info, artist_names, favored_artists, column_map):
        """
        Generate the feature matrix

        :param info: User information
        :param artist_names: Artists we want to predict for
        :param favored_artists: Artists the user already favors
        :param column_map: Feature vector column mapping
        :return: sparse feature matrix
        """
        # We create a matrix of feature vectors for each potential artist
        X = np.zeros((len(artist_names), len(column_map)))
        # Feature matrix will have the same values for the user information fields
        X[:, 0] = info["age"]
        X[:, column_map[f"country_{info['country']['name']}"]] = 1
        if info["gender"] is not None:
            X[:, column_map[f"gender_{info['gender']}"]] = 1
        # Set the proper one-hot vector for artist
        for i, name in enumerate(favored_artists):
            X[i, column_map[f"artistName_{name}"]] = 1
        return sparse.csr_matrix(X)

    def get_top_predicted_artists(self, info, column_map, top_artists, n=10):
        """
        Get the top predicted artists for a user

        :param info: User information
        :param column_map: Feature vector column mapping
        :param top_artists: Artists we want to predict for
        :param n: How many artists we want to return
        :return: list of the top predicted artists in descending order
        """
        X = self.generate_feature_matrix(info, top_artists, info["artists_names"], column_map)
        self.model.core.set_num_features(X.shape[1])
        self.model.load_state("tffm_model/")
        predictions = self.model.predict(X)
        predicted_artists = list(
            map(lambda artist: top_artists[artist], np.argsort(predictions)[::-1]))
        return predicted_artists[:n]
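# A minimal usage sketch for ArtistResponse. The data here is hypothetical;
# in the real application `info` and `column_map` come from upstream code not
# shown, and a trained model state must already exist under "tffm_model/".
info = {
    "age": 27,
    "country": {"name": "Iceland"},
    "gender": "f",
    "artists_names": ["Bjork"],        # artists the user already favors
}
column_map = {                          # feature name -> column index; col 0 is age
    "age": 0,
    "country_Iceland": 1,
    "gender_f": 2,
    "artistName_Bjork": 3,
    "artistName_Sigur Ros": 4,
}
top_artists = ["Bjork", "Sigur Ros"]    # candidate artists to rank

responder = ArtistResponse()
print(responder.get_top_predicted_artists(info, column_map, top_artists, n=2))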
def tffm(self):
    show_progress = not self.onlyResults
    X_train, y_train, X_test, y_test = (
        self.X_train.todense(),
        np.transpose(self.y_train).flatten(),
        self.X_test.todense(),
        np.transpose(self.y_test).flatten())
    if self.onlyResults:
        environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    model = TFFMRegressor(
        order=2,
        rank=4,
        optimizer=tf.train.FtrlOptimizer(learning_rate=0.1),
        n_epochs=100,
        batch_size=-1,
        init_std=0.001,
        input_type='dense')
    model.fit(X_train, y_train, show_progress=show_progress)
    predictions = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    if not self.onlyResults:
        print('RMSE: {:.6f}'.format(rmse))
    model.destroy()
    if self.onlyResults:
        print("Completed tffm evaluation.")
    return rmse
def tffm(self):
    # show_progress = True if not self.onlyResults else False
    X_train, y_train, X_test, y_test = (
        self.X_train.todense(),
        np.transpose(self.y_train).flatten(),
        self.X_test.todense(),
        np.transpose(self.y_test).flatten())
    # if self.onlyResults:
    environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    model = TFFMRegressor(
        order=2,
        rank=4,
        optimizer=tf.train.AdamOptimizer(learning_rate=0.1),
        n_epochs=100,
        batch_size=-1,
        init_std=0.001,
        input_type='dense'
    )
    model.fit(X_train, y_train, show_progress=True)
    predictions = model.predict(X_test)
    prec = precision_score(y_test, predictions.round(), average='weighted')
    rec = recall_score(y_test, predictions.round(), average='weighted')
    fmeasure = 2 * ((prec * rec) / (prec + rec))
    auc = roc_auc_score(y_test, predictions, average='weighted')
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    model.destroy()
    print("Completed tffm evaluation.")
    return (auc, rmse)
class FM(ModelBase):
    def get_param_dist(self, X):
        num_rows = X.shape[0]
        num_features = X[self.inputs].shape[1]
        param_dist = {
            'rank': sp_randint(1, num_features),
            'batch_size': sp_randint(1, num_rows),
            'lr': sp_uniform(loc=0.001, scale=0.01),
        }
        return param_dist

    def fit(self, X, y, order=2, rank=10, lr=0.001, n_epochs=1, batch_size=100,
            std=0.001, lda=1e-6, log_dir='/tmp/jprior/logs', verbosity=0):
        self._clf = TFFMRegressor(**self._sub_params)
        # Former explicit arguments, now supplied via self._sub_params:
        #     seed=0,
        #     order=order,
        #     rank=rank,
        #     optimizer=tf.train.FtrlOptimizer(learning_rate=lr),
        #     n_epochs=n_epochs,
        #     batch_size=batch_size,
        #     # smaller init_std -> lower contribution of higher order terms
        #     init_std=std,
        #     reg=lda,
        #     # input_type='sparse',
        #     log_dir=log_dir,
        #     verbose=verbosity
        # tffm doesn't deal with DataFrames correctly (although it tries...),
        # so pass raw numpy arrays.
        self._clf.fit(X[self.inputs].values, y.values, show_progress=True)
        return
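# A sketch of how self._sub_params might be assembled to mirror the
# commented-out argument list above. This is an assumption for illustration:
# ModelBase presumably collects these hyperparameters elsewhere, and the
# names below are hypothetical, not taken from the original source.
import tensorflow as tf

sub_params = {
    'seed': 0,
    'order': 2,
    'rank': 10,
    'optimizer': tf.train.FtrlOptimizer(learning_rate=0.001),
    'n_epochs': 1,
    'batch_size': 100,
    'init_std': 0.001,  # smaller init_std -> lower contribution of higher order terms
    'reg': 1e-6,
    'log_dir': '/tmp/jprior/logs',
    'verbose': 0,
}
# e.g. somewhere in ModelBase setup:  self._sub_params = sub_params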
def train(export_path, data, version, args):
    x_train, y_train, x_test, y_test = data
    y_train = y_train * 2 - 1
    print("train data shape is {}".format(x_train.shape))
    if args.log:
        log_dir = './log'
    else:
        log_dir = None
    model = TFFMRegressor(
        order=2,
        rank=args.rank,
        optimizer=tf.train.AdamOptimizer(learning_rate=0.0001),
        n_epochs=1,
        batch_size=128,
        log_dir=log_dir,
        init_std=0.01,
        reg=0.5,
        input_type='sparse')
    base_path = 'ckpt/{}'.format(version)
    path_create(base_path)
    model_path = os.path.join(base_path, 'state.tf')
    print('model path is {}'.format(model_path))
    model.core.set_num_features(x_train.shape[1])
    model.fit(x_train, y_train, show_progress=True)
    print('model trained successfully')
    model.save_state(model_path)
    print('checkpoint saved successfully')
    if args.save:
        save = Save(model, export_path)
        save.save()
    return model
def get_model(cls):
    """Get the model object for this instance, loading it if it's not already loaded."""
    if cls.model is None:
        cls.model = TFFMRegressor(
            order=3,
            rank=7,
            optimizer=tf.train.AdamOptimizer(learning_rate=0.1),
            n_epochs=50,
            batch_size=-1,
            init_std=0.001,
            input_type='sparse'
        )
        cls.model.core.set_num_features(cls.get_num_features())
        cls.model.load_state(os.path.join(model_path, 'tffm_state.tf'))
    return cls.model
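# Usage sketch (the host class name is hypothetical; assumes `model_path`
# contains a checkpoint written earlier by model.save_state()):
#
#     model = ScoringService.get_model()  # builds the graph and loads state once
#     preds = model.predict(X_sparse)     # X_sparse: scipy.sparse CSR matrix
#
# core.set_num_features() must match the feature count used at training time,
# otherwise restoring the saved state will fail.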
def runTFFM(train_X, train_y, test_X, test_y, test_X2, params):
    model = TFFMRegressor(**params)
    print_step('Fit TFFM')
    for i in range(rounds):
        model.fit(train_X, train_y.values, n_epochs=iters)
        pred_test_y = model.predict(test_X)
        print_step('Iteration {}/{} -- RMSE: {}'.format(
            i + 1, rounds, rmse(pred_test_y, test_y)))
    print_step('TFFM Predict 2/2')
    pred_test_y2 = model.predict(test_X2)
    return pred_test_y, pred_test_y2
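# A plausible `params` dict for runTFFM, mirroring TFFMRegressor constructor
# arguments used elsewhere in this collection (values are illustrative only;
# `rounds` and `iters` are assumed to be module-level globals in the source).
# The loop above appears to rely on successive fit() calls continuing to train
# the same model, reporting test RMSE after each block of `iters` epochs.
params = {
    'order': 2,
    'rank': 10,
    'optimizer': tf.train.AdamOptimizer(learning_rate=0.01),
    'batch_size': 1024,
    'init_std': 0.01,
    'reg': 0.01,
    'input_type': 'sparse',
}
rounds, iters = 5, 10  # 5 evaluation rounds of 10 epochs each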
class WfmModel:
    def __init__(self, wfm_data, m_name, order, k, bs, lr, init, reg):
        self.data = wfm_data
        os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
        #os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        #os.environ["CUDA_VISIBLE_DEVICES"] = "1"
        #session_config = tf.ConfigProto(log_device_placement=False, device_count={'GPU': 1})
        self.lr = lr
        self.num_cand = 1000
        self.m_name = m_name
        self.model = TFFMRegressor(
            order=order,
            rank=k,
            optimizer=tf.train.GradientDescentOptimizer(learning_rate=lr),
            session_config=tf.ConfigProto(log_device_placement=False,
                                          device_count={'GPU': 1}),
            n_epochs=1,
            batch_size=bs,
            init_std=init,
            reg=reg,
            input_type='sparse',
            seed=42,
        )
        if m_name == 'bpr':
            self.model.core.mf = True
        if m_name in ('bpr', 'fmp', 'wfmp'):
            self.model.core.loss_function = ut.loss_bpr
        if m_name in ('wfm', 'wfmp'):
            self.model.core.G = list(self.data.gr_train.values())
            self.model.core.gamma_init = np.array(self.data.dr.weights).astype(np.float32)
            if self.data.w_init in ('all-one', 'all-diff'):
                self.model.core.M = np.repeat(True, len(self.data.dr.weights))
            else:
                self.model.core.M = np.append([False, False],
                                              np.repeat(True, len(self.data.dr.weights) - 2))
        fit_methods = {'bpr': 'fit_bpr', 'fm': 'fit', 'fmp': 'fit_bpr',
                       'wfm': 'fit', 'wfmp': 'fit_bpr', 'wmf': 'fit'}
        self.fit_method = fit_methods[m_name]
        self.c = None
        if m_name == 'wmf':
            self.c = self.data.dr.c
            self.model.core.has_conf = True
        print('preparing test matrix...')
        if self.data.dataset == 'frappe':
            self.X_test = self.get_test_matrix_opr()
        else:
            self.X_test, self.rel_c = self.get_test_matrix_ub()

    def get_test_matrix_opr(self):
        items = self.data.items
        relevant = self.data.relevant
        c_cols = self.data.cols[2:]
        nc = self.num_cand
        n = relevant.apply(lambda x: len(x) * (nc + 1)).sum()
        ix = self.data.ix
        i_ix = np.zeros(n, dtype=np.int32)
        u_ix = np.zeros(n, dtype=np.int32)
        c_ix = {}
        for c in c_cols:
            c_ix[c] = np.zeros(n, dtype=np.int32)
        l = 0
        for kk in relevant.keys():
            cands = np.random.choice(np.setdiff1d(items, relevant[kk]), nc)
            for i in relevant[kk]:
                u_ix[l:l + nc + 1] = np.repeat(
                    ix[str(kk[0] if len(c_cols) > 0 else kk) + 'UserId'], nc + 1)
                i_ix[l:l + nc] = [ix[str(ii) + 'ItemId'] for ii in cands]
                i_ix[l + nc] = ix[str(i) + 'ItemId']
                for ii, c in enumerate(c_cols):
                    c_ix[c][l:l + nc + 1] = np.repeat(ix[str(kk[ii + 1]) + c], nc + 1)
                l += nc + 1
        g = len(c_cols) + 2
        data_m = np.ones(n * g, dtype=bool)
        row_ix = np.repeat(np.arange(0, n, dtype=np.int32), g)
        col_ix = np.zeros(n * g, dtype=np.int32)
        col_ix[0::g] = u_ix
        col_ix[1::g] = i_ix
        for ii, c in enumerate(c_cols):
            col_ix[ii + 2::g] = c_ix[c]
        p = self.data.p
        X = csr.csr_matrix((data_m, (row_ix, col_ix)), shape=(n, p))
        return X

    def get_test_matrix_ub(self):
        items = self.data.items
        users = self.data.users
        relevant = self.data.relevant
        c_cols = self.data.cols[2:]
        nc = self.num_cand
        n = relevant.apply(lambda x: len(x) + nc).sum()
        ix = self.data.ix
        item_attr = {}
        if self.data.item_attr is not None:
            item_attr = dict(self.data.item_attr)
            c_ix_ = [ix[str(item_attr[i]) + c_cols[0]] for i in items]
        c_ix = np.zeros(n, dtype=np.int32)
        i_ix_ = [ix[str(i) + 'ItemId'] for i in items]
        i_ix = np.zeros(n, dtype=np.int32)
        u_ix = np.zeros(n, dtype=np.int32)
        rel_c = []
        l = 0
        for u in users:
            r = np.size(relevant[u])
            u_ix[l:l + nc + r] = np.repeat(ix[str(u) + 'UserId'], nc + r)
            i_ix[l:l + nc] = i_ix_
            i_ix[l + nc:l + nc + r] = [ix[str(i) + 'ItemId'] for i in relevant[u]]
            if self.data.item_attr is not None:
                c_ix[l:l + nc] = c_ix_
                c_ix[l + nc:l + nc + r] = [ix[str(item_attr[i]) + c_cols[0]]
                                           for i in relevant[u]]
            l += nc + r
            rel_c.append(nc + r)
        g = len(c_cols) + 2
        data_m = np.ones(n * g, dtype=bool)
        row_ix = np.repeat(np.arange(0, n, dtype=np.int32), g)
        col_ix = np.zeros(n * g, dtype=np.int32)
        col_ix[0::g] = u_ix
        col_ix[1::g] = i_ix
        if self.data.item_attr is not None:
            col_ix[2::g] = c_ix
        p = self.data.p
        X = csr.csr_matrix((data_m, (row_ix, col_ix)), shape=(n, p))
        return X, rel_c

    def calc_metrics_opr(self, pred, k):
        relevant = self.data.relevant
        nc = self.num_cand
        hit_counts = []
        rrs = []
        l = 0
        for kk in relevant.keys():
            for i in relevant[kk]:
                top_ix = np.argpartition(pred[l:l + nc + 1], -k)[-k:]
                hit_count = len(np.where(top_ix >= nc)[0])
                hit_counts.append(hit_count)
                top_val = pred[l + top_ix]
                top_ix = map(lambda x: x[0],
                             sorted(zip(top_ix, top_val), key=lambda x: x[1], reverse=True))
                rr = 0
                for j, item_ix in enumerate(top_ix):
                    if item_ix >= nc:  # if item is relevant
                        rr = 1 / (j + 1)
                        break
                rrs.append(rr)
                l += nc + 1
        recall = np.sum(hit_counts) / np.size(hit_counts)
        mrr = np.mean(rrs)
        return recall, mrr, recall / k

    def calc_metrics_ub(self, pred, k, rel_c):
        nc = self.num_cand
        hit_counts = []
        recalls = []
        rrs = []
        l = 0
        for c in rel_c:
            top_ix = np.argpartition(pred[l:l + c], -k)[-k:]
            hit_count = len(np.where(top_ix >= nc)[0])
            hit_counts.append(hit_count)
            recalls.append(hit_count / (c - nc) if c > nc else 0)
            top_val = pred[l + top_ix]
            top_ix = map(lambda x: x[0],
                         sorted(zip(top_ix, top_val), key=lambda x: x[1], reverse=True))
            rr = 0
            for j, item_ix in enumerate(top_ix):
                if item_ix >= nc:  # if item is relevant
                    rr = 1 / (j + 1)
                    break
            rrs.append(rr)
            l += c
        prc = np.sum(hit_counts) / (k * np.size(hit_counts))
        recall = np.mean(recalls)
        mrr = np.mean(rrs)
        return recall, mrr, prc

    def eval_model(self):
        if self.data.dataset == 'frappe':
            pred = self.model.predict(self.X_test, pred_batch_size=100000)
            r5, mrr5, prc5 = self.calc_metrics_opr(pred, 5)
            r10, mrr10, prc10 = self.calc_metrics_opr(pred, 10)
            r20, mrr20, prc20 = self.calc_metrics_opr(pred, 20)
        else:
            pred = self.model.predict(self.X_test, pred_batch_size=1000000)
            r5, mrr5, prc5 = self.calc_metrics_ub(pred, 5, self.rel_c)
            r10, mrr10, prc10 = self.calc_metrics_ub(pred, 10, self.rel_c)
            r20, mrr20, prc20 = self.calc_metrics_ub(pred, 20, self.rel_c)
        return r5, r10, r20, mrr5, mrr10, mrr20, prc5, prc10, prc20

    def train_model(self, epochs, eval_freq, eval_file=None):
        writer = None
        if eval_file is not None:
            writer = open(eval_file, 'w')
            writer.write('Method,WeightInit,Context,Epoch,Order,K,BatchSize,LearnRate,'
                         'InitStd,Reg,Recall@5,Recall@10,Recall@20,MRR@5,MRR@10,MRR@20,'
                         'Precision@5,Precision@10,Precision@20,EpochTime,EvalTime,'
                         'Weights,NewEval,Optimizer,MsdContext,NormalizeAlpha\n')

        def eval_epoch(ep_, epoch_time_):
            start_time = time.time()
            r5, r10, r20, mrr5, mrr10, mrr20, prc5, prc10, prc20 = self.eval_model()
            eval_time = time.time() - start_time
            if self.model.core.G is not None:
                ws = reduce(lambda x, y: str(x) + ' ' + str(y),
                            self.model.session.run(self.model.core.alpha))
            else:
                ws = 'NA'
            writer.write(
                '{0},{1},{2},{3},{4},{5},{6},{7},{8},{9},{10},{11},{12},'
                '{13},{14},{15},{16},{17},{18},{19},{20},{21},{22},{23},{24},{25}\n'.format(
                    self.m_name, self.data.w_init, self.data.dr.context, ep_,
                    self.model.core.order, self.model.core.rank, self.model.batch_size,
                    self.lr, self.model.core.init_std, self.model.core.reg,
                    r5, r10, r20, mrr5, mrr10, mrr20, prc5, prc10, prc20,
                    epoch_time_, eval_time, ws, 'True2', 'GD', 'Genre',
                    self.model.core.norm_alpha))
            writer.flush()

        total_time = 0
        for ep in tqdm(range(epochs), unit='epoch'):
            start_time = time.time()
            if self.fit_method == 'fit':
                self.model.fit(self.data.X_train, self.data.y_train, c_=self.c)
            else:
                self.model.fit_bpr(self.data.X_train, self.data.X_train_neg)
            epoch_time = time.time() - start_time
            total_time += epoch_time
            if (ep + 1) % eval_freq == 0:
                eval_epoch(ep + 1, epoch_time)
        if writer is not None:
            writer.close()
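# A minimal usage sketch for WfmModel (arguments are hypothetical; `wfm_data`
# is the prepared dataset object the class expects). Note this code relies on
# a modified tffm core exposing mf, loss_function, G, gamma_init, M, has_conf
# and fit_bpr -- these hooks are not part of stock tffm.
wm = WfmModel(wfm_data, m_name='wfm', order=2, k=32, bs=4096,
              lr=0.05, init=0.01, reg=0.001)
wm.train_model(epochs=50, eval_freq=5, eval_file='wfm_eval.csv')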
def main():
    seed = 123             # Random seed
    data_dir = "../validation_data_train/"
    n_epochs = 200         # Number of epochs
    learning_rate = 0.001  # Learning rate of the optimizer
    batch_size = 1024      # Batch size
    init_std = 0.01        # Initial standard deviation
    input_type = 'sparse'  # Input type: 'sparse' or 'dense'
    reg = 10**4            # Regularization parameter
    rank = 100             # Rank of the factorization
    order = 5              # comboFM order

    print('GPU available:')
    print(tf.test.is_gpu_available())

    ### Training data for the validation experiment
    # Features in position 1: Drug A - Drug B
    features_tensor_1 = ("drug1_concentration__one-hot_encoding.csv",
                         "drug2_concentration__one-hot_encoding.csv",
                         "drug1__one-hot_encoding.csv",
                         "drug2__one-hot_encoding.csv",
                         "cell_lines__one-hot_encoding.csv")
    features_auxiliary_1 = ("drug1_drug2_concentration__values.csv",
                            "drug1__estate_fingerprints.csv",
                            "drug2__estate_fingerprints.csv",
                            "cell_lines__gene_expression.csv")
    X_tensor_1 = concatenate_features(data_dir, features_tensor_1)
    X_auxiliary_1 = concatenate_features(data_dir, features_auxiliary_1)
    X_1 = np.concatenate((X_tensor_1, X_auxiliary_1), axis=1)

    # Features in position 2: Drug B - Drug A
    features_tensor_2 = ("drug2_concentration__one-hot_encoding.csv",
                         "drug1_concentration__one-hot_encoding.csv",
                         "drug2__one-hot_encoding.csv",
                         "drug1__one-hot_encoding.csv",
                         "cell_lines__one-hot_encoding.csv")
    features_auxiliary_2 = ("drug2_drug1_concentration__values.csv",
                            "drug2__estate_fingerprints.csv",
                            "drug1__estate_fingerprints.csv",
                            "cell_lines__gene_expression.csv")
    X_tensor_2 = concatenate_features(data_dir, features_tensor_2)
    X_auxiliary_2 = concatenate_features(data_dir, features_auxiliary_2)
    X_2 = np.concatenate((X_tensor_2, X_auxiliary_2), axis=1)

    # Concatenate the features from both positions vertically
    X_tr = np.concatenate((X_1, X_2), axis=0)
    print('Dataset shape: {}'.format(X_tr.shape))
    print('Non-zeros rate: {:.05f}'.format(np.mean(X_tr != 0)))
    print('Number of one-hot encoding features: {}'.format(X_tensor_1.shape[1]))
    print('Number of auxiliary features: {}'.format(X_auxiliary_1.shape[1]))
    i_aux = X_tensor_1.shape[1]
    del X_tensor_1, X_auxiliary_1, X_tensor_2, X_auxiliary_2, X_1, X_2

    # Read responses
    y_tr = np.loadtxt("../validation_data_train/responses.csv", delimiter=",", skiprows=1)
    y_tr = np.concatenate((y_tr, y_tr), axis=0)

    ### Validation data
    # Validation set features
    data_dir = "../validation_data/"
    X_tensor_1 = concatenate_features(data_dir, features_tensor_1)
    X_auxiliary_1 = concatenate_features(data_dir, features_auxiliary_1)
    X_val = np.concatenate((X_tensor_1, X_auxiliary_1), axis=1)
    print('Validation dataset shape: {}'.format(X_val.shape))
    print('Non-zeros rate: {:.05f}'.format(np.mean(X_val != 0)))
    print('Number of one-hot encoding features: {}'.format(X_tensor_1.shape[1]))
    print('Number of auxiliary features: {}'.format(X_auxiliary_1.shape[1]))
    i_aux = X_tensor_1.shape[1]
    del X_tensor_1, X_auxiliary_1

    X_tr, X_val = standardize(X_tr, X_val, i_aux)
    if input_type == 'sparse':
        X_tr = sp.csr_matrix(X_tr)
        X_val = sp.csr_matrix(X_val)

    model = TFFMRegressor(
        order=order,
        rank=rank,
        n_epochs=n_epochs,
        optimizer=tf.train.AdamOptimizer(learning_rate=learning_rate),
        batch_size=batch_size,
        init_std=init_std,
        reg=reg,
        input_type=input_type,
        seed=seed)

    # Train the model
    model.fit(X_tr, y_tr, show_progress=True)
    # Predict
    y_pred_val = model.predict(X_val)
    np.savetxt("results/validation_set_predictions.txt", y_pred_val)
        # )
        # model.fit(train_queue[0], train_queue[1], show_progress=True)
        # inferences = model.predict(test_queue[0])
        # mse = mean_squared_error(test_queue[1], inferences)
        # rmse = np.sqrt(mse)
        # logging.info('rmse: %.4f[%.4f]' % (rmse, time()-start))
        from tffm import TFFMRegressor
        import tensorflow as tf
        os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu)
        model = TFFMRegressor(
            order=dim,
            rank=args.embedding_dim,
            optimizer=tf.train.AdagradOptimizer(learning_rate=args.lr),
            n_epochs=args.train_epochs,
            # batch_size=1076946,
            batch_size=4096,
            init_std=0.001,
            reg=args.weight_decay,
            input_type='sparse',
            log_dir=os.path.join(args.save, save_name),
        )
        model.fit(train_queue[0], train_queue[1], show_progress=True)
        inferences = model.predict(test_queue[0])
        mse = mean_squared_error(test_queue[1], inferences)
        rmse = np.sqrt(mse)
        logging.info('rmse: %.4f[%.4f]' % (rmse, time() - start))
    elif args.mode == 'autoneural':
        start = time()
        if dim == 2:
            model = AutoNeural(num_users, num_items, args.embedding_dim,
def main():
    torch.set_default_tensor_type(torch.FloatTensor)
    torch.set_num_threads(3)
    if not torch.cuda.is_available():
        logging.info('no gpu device available')
        sys.exit(1)

    np.random.seed(args.seed)
    torch.cuda.set_device(args.gpu)
    cudnn.benchmark = True
    torch.manual_seed(args.seed)
    cudnn.enabled = True
    torch.cuda.manual_seed(args.seed)
    logging.info('gpu device = %d' % args.gpu)
    logging.info("args = %s", args)

    data_start = time.time()
    if args.dataset == 'ml-100k':
        num_users = 943
        num_items = 1682
        dim = 2
    elif args.dataset == 'ml-1m':
        num_users = 6040
        num_items = 3952
        dim = 2
    elif args.dataset == 'ml-10m':
        num_users = 71567
        num_items = 65133
        dim = 2
    elif args.dataset == 'youtube-small':
        num_ps = 600
        num_qs = 14340
        num_rs = 5
        dim = 3
    train_queue, valid_queue, test_queue = utils.get_data_queue(args)
    logging.info('prepare data finish! [%f]' % (time.time() - data_start))

    if args.mode == 'libfm':
        start = time.time()
        from tffm import TFFMRegressor
        import tensorflow as tf
        os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu)
        model = TFFMRegressor(
            order=dim,
            rank=args.embedding_dim,
            optimizer=tf.train.AdagradOptimizer(learning_rate=args.lr),
            n_epochs=args.train_epochs,
            batch_size=args.batch_size,
            init_std=0.001,
            reg=args.weight_decay,
            input_type='sparse',
            log_dir=os.path.join(save_name, 'libfm-log'))
        model.fit(train_queue[0], train_queue[1], show_progress=True)
        inferences = model.predict(test_queue[0])
        mse = mean_squared_error(test_queue[1], inferences)
        rmse = np.sqrt(mse)
        logging.info('rmse: %.4f[%.4f]' % (rmse, time.time() - start))
    else:
        start = time.time()
        if args.mode == 'ncf':
            if dim == 2:
                model = NCF(num_users, num_items, args.embedding_dim,
                            args.weight_decay).cuda()
            elif dim == 3:
                model = NCF_Triple(num_ps, num_qs, num_rs, args.embedding_dim,
                                   args.weight_decay).cuda()
        elif args.mode == 'deepwide':
            if dim == 2:
                model = DeepWide(num_users, num_items, args.embedding_dim,
                                 args.weight_decay).cuda()
            elif dim == 3:
                model = DeepWide_Triple(num_ps, num_qs, num_rs, args.embedding_dim,
                                        args.weight_decay).cuda()
        elif args.mode == 'altgrad':
            model = AltGrad(num_users, num_items, args.embedding_dim,
                            args.weight_decay).cuda()
        elif args.mode == 'convncf':
            model = ConvNCF(num_users, num_items, args.embedding_dim,
                            args.weight_decay).cuda()
        elif args.mode == 'outer':
            model = Outer(num_users, num_items, args.embedding_dim,
                          args.weight_decay).cuda()
        elif args.mode == 'conv':
            model = Conv(num_users, num_items, args.embedding_dim,
                         args.weight_decay).cuda()
        elif args.mode == 'plus':
            model = Plus(num_users, num_items, args.embedding_dim,
                         args.weight_decay).cuda()
        elif args.mode == 'max':
            model = Max(num_users, num_items, args.embedding_dim,
                        args.weight_decay).cuda()
        elif args.mode == 'min':
            model = Min(num_users, num_items, args.embedding_dim,
                        args.weight_decay).cuda()
        elif args.mode == 'cp':
            model = CP(num_ps, num_qs, num_rs, args.embedding_dim,
                       args.weight_decay).cuda()
        elif args.mode == 'tucker':
            model = TuckER(num_ps, num_qs, num_rs, args.embedding_dim,
                           args.weight_decay).cuda()
        elif args.mode == 'sif':
            if dim == 2:
                arch = utils.load_arch(num_users, num_items, args)
                print(next(arch['mlp']['p'].parameters()))
                model = Network(num_users, num_items, args.embedding_dim,
                                arch, args.weight_decay).cuda()
            elif dim == 3:
                arch = utils.load_arch_triple(num_ps, num_qs, num_rs, args)
                model = Network_Triple(num_ps, num_qs, num_rs, args.embedding_dim,
                                       arch, args.weight_decay).cuda()
        logging.info('build model finish! [%f]' % (time.time() - start))

        optimizer = torch.optim.Adagrad(model.parameters(), args.lr)
        if dim == 2:
            train(model, train_queue, test_queue, optimizer, args)
            rmse = evaluate(model, test_queue)
        elif dim == 3:
            train_triple(model, train_queue, test_queue, optimizer, args)
            rmse = evaluate_triple(model, test_queue)
        logging.info('rmse: %.4f' % rmse)
    use_info=True,
    path=path,
)
learner.fit(train)
'''

# TEST tffm.
# https://github.com/geffy/tffm
from tffm import TFFMRegressor
import tensorflow as tf

learner = simple_fm.SimpleFMLearner(
    external_fm=TFFMRegressor(
        order=2,
        rank=12,
        optimizer=tf.train.AdamOptimizer(learning_rate=0.001),
        n_epochs=300,
        batch_size=128,
        init_std=0.001,
        reg=0.001,
        input_type='sparse'),
    use_info=True,
    path=path,
)
learner.fit(train)

# calculate inverse count.
order = []
for uid, iid, rating in test:
    y_ = learner.predict(uid, iid)
    # print >> sys.stderr, '%d\t%d\t%.4f\t%d' % (uid, iid, y_, rating)
    order.append((y_, rating))
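# A minimal sketch of the "inverse count" hinted at above: count pairs whose
# predicted order disagrees with the rating order (an assumption about what
# was intended; the O(n^2) scan is fine for small test sets).
inversions = 0
for a in range(len(order)):
    for b in range(a + 1, len(order)):
        (pred_a, true_a), (pred_b, true_b) = order[a], order[b]
        if (pred_a - pred_b) * (true_a - true_b) < 0:
            inversions += 1
print('pairwise inversions: %d' % inversions)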
y = np.reshape(y, (y.shape[0],))
X = sparse.csr_matrix(
    data_train_FM.drop(columns=['FREQUENCY', 'CUST_ID', 'ARTICLE_ID']).to_numpy())
del data_train_FM

rank = 20
l_r = 0.05
reg = 0.001
epoch = 200
model_tf = TFFMRegressor(
    order=2,
    rank=rank,
    optimizer=tf.train.AdamOptimizer(learning_rate=l_r),
    reg=reg,
    n_epochs=epoch,
    init_std=0.001,
    input_type='sparse'
)

data_reco_baselines_score = pd.read_csv(d + '/data_reco_baselines_score.csv')
data_reco_baselines_score = data_reco_baselines_score[[
    'NB_PURCH_TEST', 'NB_ARTICLE_PURCH_TEST', 'ARM_PRECISION',
    'SVD_PURE_PRECISION', 'NMF_PRECISION', 'K100_PRECISION',
    'VAES_PRECISION', 'SPEC_PRECISION', 'CUST_ID', 'ARTICLE_ID']]
protocol = pd.read_csv(d + '/test_protocol.csv')
protocol = protocol.drop_duplicates()
data_reco_baselines = pd.read_csv(d + '/data_reco_baselines.csv')
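# The excerpt above builds model_tf but stops before training; a plausible
# continuation with the X and y already prepared (hedged -- the original
# fit/predict calls are outside this excerpt):
model_tf.fit(X, y, show_progress=True)
scores = model_tf.predict(X)  # or a held-out matrix built the same way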
def main(argv):
    seed = 123             # Random seed
    data_dir = "../data/"
    n_epochs_inner = 100   # Number of epochs in the inner loop
    n_epochs_outer = 200   # Number of epochs in the outer loop
    learning_rate = 0.001  # Learning rate of the optimizer
    batch_size = 1024      # Batch size
    init_std = 0.01        # Initial standard deviation
    input_type = 'sparse'  # Input type: 'sparse' or 'dense'
    order = 5              # Order of the factorization machine (comboFM)
    nfolds_outer = 10      # Number of folds in the outer loop
    nfolds_inner = 5       # Number of folds in the inner loop
    regparams = [10**2, 10**3, 10**4, 10**5]  # Regularization parameter: to be optimized
    ranks = [25, 50, 75, 100]                 # Rank of the factorization: to be optimized

    # Experiment: 1) new_dose-response_matrix_entries,
    #             2) new_dose-response_matrices,
    #             3) new_drug_combinations
    experiment = argv[2]
    id_in = int(argv[1])
    print("\nJob ID: %d" % id_in)
    print('GPU available:')
    print(tf.test.is_gpu_available())

    # Features in position 1: Drug A - Drug B
    features_tensor_1 = ("drug1_concentration__one-hot_encoding.csv",
                         "drug2_concentration__one-hot_encoding.csv",
                         "drug1__one-hot_encoding.csv",
                         "drug2__one-hot_encoding.csv",
                         "cell_lines__one-hot_encoding.csv")
    features_auxiliary_1 = ("drug1_drug2_concentration__values.csv",
                            "drug1__estate_fingerprints.csv",
                            "drug2__estate_fingerprints.csv",
                            "cell_lines__gene_expression.csv")
    X_tensor_1 = concatenate_features(data_dir, features_tensor_1)
    X_auxiliary_1 = concatenate_features(data_dir, features_auxiliary_1)
    X_1 = np.concatenate((X_tensor_1, X_auxiliary_1), axis=1)

    # Features in position 2: Drug B - Drug A
    features_tensor_2 = ("drug2_concentration__one-hot_encoding.csv",
                         "drug1_concentration__one-hot_encoding.csv",
                         "drug2__one-hot_encoding.csv",
                         "drug1__one-hot_encoding.csv",
                         "cell_lines__one-hot_encoding.csv")
    features_auxiliary_2 = ("drug2_drug1_concentration__values.csv",
                            "drug2__estate_fingerprints.csv",
                            "drug1__estate_fingerprints.csv",
                            "cell_lines__gene_expression.csv")
    X_tensor_2 = concatenate_features(data_dir, features_tensor_2)
    X_auxiliary_2 = concatenate_features(data_dir, features_auxiliary_2)
    X_2 = np.concatenate((X_tensor_2, X_auxiliary_2), axis=1)

    # Concatenate the features from both positions vertically
    X = np.concatenate((X_1, X_2), axis=0)
    print('Dataset shape: {}'.format(X.shape))
    print('Non-zeros rate: {:.05f}'.format(np.mean(X != 0)))
    print('Number of one-hot encoding features: {}'.format(X_tensor_1.shape[1]))
    print('Number of auxiliary features: {}'.format(X_auxiliary_1.shape[1]))
    i_aux = X_tensor_1.shape[1]
    del X_tensor_1, X_auxiliary_1, X_tensor_2, X_auxiliary_2, X_1, X_2

    # Read responses
    y = np.loadtxt("../data/responses.csv", delimiter=",", skiprows=1)
    y = np.concatenate((y, y), axis=0)

    inner_folds = list(range(1, nfolds_inner + 1))
    outer_folds = list(range(1, nfolds_outer + 1))
    outer_fold = outer_folds[id_in]

    te_idx = np.loadtxt('../cross-validation_folds/%s/test_idx_outer_fold-%d.txt'
                        % (experiment, outer_fold)).astype(int)
    tr_idx = np.loadtxt('../cross-validation_folds/%s/train_idx_outer_fold-%d.txt'
                        % (experiment, outer_fold)).astype(int)
    X_tr, X_te, y_tr, y_te = X[tr_idx, :], X[te_idx, :], y[tr_idx], y[te_idx]
    print('Training set shape: {}'.format(X_tr.shape))
    print('Test set shape: {}'.format(X_te.shape))

    CV_RMSE_reg = np.zeros([len(regparams), nfolds_inner])
    CV_RPearson_reg = np.zeros([len(regparams), nfolds_inner])
    CV_RSpearman_reg = np.zeros([len(regparams), nfolds_inner])

    rank = 50  # Fix rank first to 50 while optimizing regularization
    for reg_i in range(len(regparams)):
        reg = regparams[reg_i]
        for inner_fold in inner_folds:
            print("INNER FOLD: %d" % inner_fold)
            print("Rank: %d" % rank)
            print("Regularization: %d" % reg)
            te_idx_CV = np.loadtxt(
                '../cross-validation_folds/%s/test_idx_outer_fold-%d_inner_fold-%d.txt'
                % (experiment, outer_fold, inner_fold)).astype(int)
            tr_idx_CV = np.loadtxt(
                '../cross-validation_folds/%s/train_idx_outer_fold-%d_inner_fold-%d.txt'
                % (experiment, outer_fold, inner_fold)).astype(int)
            X_tr_CV, X_te_CV, y_tr_CV, y_te_CV = (X[tr_idx_CV, :], X[te_idx_CV, :],
                                                  y[tr_idx_CV], y[te_idx_CV])
            # i_aux: length of one-hot encoding, not to be standardized
            X_tr_CV, X_te_CV = standardize(X_tr_CV, X_te_CV, i_aux)
            if input_type == 'sparse':
                X_tr_CV = sp.csr_matrix(X_tr_CV)
                X_te_CV = sp.csr_matrix(X_te_CV)

            model = TFFMRegressor(
                order=order,
                rank=rank,
                n_epochs=n_epochs_inner,
                optimizer=tf.train.AdamOptimizer(learning_rate=learning_rate),
                batch_size=batch_size,
                init_std=init_std,
                reg=reg,
                input_type=input_type,
                seed=seed)

            # Train the model
            model.fit(X_tr_CV, y_tr_CV, show_progress=True)
            # Predict
            y_pred_te_CV = model.predict(X_te_CV)

            # Evaluate performance
            RMSE = np.sqrt(mean_squared_error(y_te_CV, y_pred_te_CV))
            CV_RMSE_reg[reg_i, inner_fold - 1] = RMSE
            RPearson = np.corrcoef(y_te_CV, y_pred_te_CV)[0, 1]
            CV_RPearson_reg[reg_i, inner_fold - 1] = RPearson
            RSpearman, _ = spearmanr(y_te_CV, y_pred_te_CV)
            CV_RSpearman_reg[reg_i, inner_fold - 1] = RSpearman
            model.destroy()
            print("RMSE: %f\nR_pearson: %f\nR_spearman: %f" % (RMSE, RPearson, RSpearman))

    CV_avg_reg = np.mean(CV_RPearson_reg, axis=1)
    reg_i = np.where(CV_avg_reg == np.max(CV_avg_reg))[0]
    reg = regparams[int(reg_i)]
    np.savetxt('results/%s/outer_fold-%d_reg_CV_avg_RPearson.txt'
               % (experiment, outer_fold), CV_avg_reg)

    CV_RMSE_rank = np.zeros([len(ranks), nfolds_inner])
    CV_RPearson_rank = np.zeros([len(ranks), nfolds_inner])
    CV_RSpearman_rank = np.zeros([len(ranks), nfolds_inner])

    for rank_i in range(len(ranks)):
        rank = ranks[rank_i]
        for inner_fold in inner_folds:
            print("INNER FOLD: %d" % inner_fold)
            print("Rank: %d" % rank)
            print("Regularization: %d" % reg)
            te_idx_CV = np.loadtxt(
                '../cross-validation_folds/%s/test_idx_outer_fold-%d_inner_fold-%d.txt'
                % (experiment, outer_fold, inner_fold)).astype(int)
            tr_idx_CV = np.loadtxt(
                '../cross-validation_folds/%s/train_idx_outer_fold-%d_inner_fold-%d.txt'
                % (experiment, outer_fold, inner_fold)).astype(int)
            X_tr_CV, X_te_CV, y_tr_CV, y_te_CV = (X[tr_idx_CV, :], X[te_idx_CV, :],
                                                  y[tr_idx_CV], y[te_idx_CV])
            X_tr_CV, X_te_CV = standardize(X_tr_CV, X_te_CV, i_aux)
            if input_type == 'sparse':
                X_tr_CV = sp.csr_matrix(X_tr_CV)
                X_te_CV = sp.csr_matrix(X_te_CV)

            model = TFFMRegressor(
                order=order,
                rank=rank,
                n_epochs=n_epochs_inner,
                optimizer=tf.train.AdamOptimizer(learning_rate=learning_rate),
                batch_size=batch_size,
                init_std=init_std,
                reg=reg,
                input_type=input_type,
                seed=seed)

            # Train the model
            model.fit(X_tr_CV, y_tr_CV, show_progress=True)
            # Predict
            y_pred_te_CV = model.predict(X_te_CV)

            # Evaluate performance
            RMSE = np.sqrt(mean_squared_error(y_te_CV, y_pred_te_CV))
            CV_RMSE_rank[rank_i, inner_fold - 1] = RMSE
            RPearson = np.corrcoef(y_te_CV, y_pred_te_CV)[0, 1]
            CV_RPearson_rank[rank_i, inner_fold - 1] = RPearson
            RSpearman, _ = spearmanr(y_te_CV, y_pred_te_CV)
            CV_RSpearman_rank[rank_i, inner_fold - 1] = RSpearman
            model.destroy()
            print("RMSE: %f\nR_pearson: %f\nR_spearman: %f" % (RMSE, RPearson, RSpearman))

    CV_avg_rank = np.mean(CV_RPearson_rank, axis=1)
    rank_i = np.where(CV_avg_rank == np.max(CV_avg_rank))[0]
    rank = ranks[int(rank_i)]
    np.savetxt('results/%s/outer_fold-%d_rank_CV_avg_RPearson.txt'
               % (experiment, outer_fold), CV_avg_rank)

    X_tr, X_te = standardize(X_tr, X_te, i_aux)
    if input_type == 'sparse':
        X_tr = sp.csr_matrix(X_tr)
        X_te = sp.csr_matrix(X_te)

    model = TFFMRegressor(
        order=order,
        rank=rank,
        n_epochs=n_epochs_outer,
        optimizer=tf.train.AdamOptimizer(learning_rate=learning_rate),
        batch_size=batch_size,
        init_std=init_std,
        reg=reg,
        input_type=input_type,
        seed=seed)

    # Train the model
    model.fit(X_tr, y_tr, show_progress=True)
    # Predict
    y_pred_te = model.predict(X_te)

    # Evaluate performance on the outer test set
    RMSE = np.sqrt(mean_squared_error(y_te, y_pred_te))
    RPearson = np.corrcoef(y_te, y_pred_te)[0, 1]
    RSpearman, _ = spearmanr(y_te, y_pred_te)
    print("RMSE: %f\nR_pearson: %f\nR_spearman: %f" % (RMSE, RPearson, RSpearman))

    np.savetxt("results/%s/outer-fold-%d_y_test_order-%d_rank-%d_reg-%d_%s.txt"
               % (experiment, outer_fold, order, rank, reg, experiment), y_te)
    np.savetxt("results/%s/outer-fold-%d_y_pred_order-%d_rank-%d_reg-%d_%s.txt"
               % (experiment, outer_fold, order, rank, reg, experiment), y_pred_te)

    # Save model weights
    weights = model.weights
    for i in range(order):
        np.savetxt('results/%s/outer-fold-%d_P_order%d_rank-%d_reg-%.1e.txt'
                   % (experiment, outer_fold, i + 1, rank, reg), weights[i])
submission = pd.DataFrame({
    "ID": test.index,
    "item_cnt_month": Y_test
})
submission.to_csv('../data/xgb_submission_2020-06-14_03.csv', index=False)
plot_features(model, (10, 14))

# tffm sandbox
from tffm import TFFMClassifier, TFFMRegressor

model = TFFMRegressor(
    order=2,
    rank=10,
    optimizer=tf.train.AdamOptimizer(learning_rate=0.01),
    n_epochs=100,
    batch_size=1024,
    init_std=0.001,
    reg=0.01,
    input_type='dense'
)

X_train.fillna(0.0, inplace=True)
for c in X_train.columns:
    if X_train[c].isna().any():
        print(c)
    if np.isinf(X_train[c]).any():
        print(c)

model.fit(X_train.values.astype('float32'), Y_train.values.astype('float32'),
          show_progress=True)

from sklearn.metrics import mean_squared_error
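# A hedged continuation of the sandbox: score the fitted model with the RMSE
# metric just imported (X_valid / Y_valid are hypothetical hold-out frames
# prepared the same way as X_train / Y_train):
preds = model.predict(X_valid.values.astype('float32'))
rmse = np.sqrt(mean_squared_error(Y_valid.values.astype('float32'), preds))
print('tffm RMSE: {:.4f}'.format(rmse))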
merged1 = pd.merge(transformed_buys, historical_buy_data, left_index=True, right_index=True)
merged2 = pd.merge(merged1, historical_click_data, left_index=True, right_index=True)

# Create the MF model; you can play around with the parameters
model = TFFMRegressor(
    order=2,
    rank=7,
    optimizer=tf.train.AdamOptimizer(learning_rate=0.1),
    n_epochs=100,
    batch_size=-1,
    init_std=0.001,
    input_type='dense')

merged2.drop(['Item ID', '_Session ID', 'click history:Item ID', 'buy history:Item ID'],
             axis=1, inplace=True)
X = np.array(merged2)
X = np.nan_to_num(X)
y = merged2['Quantity'].to_numpy()

# Split data into train, test
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2)
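# The excerpt stops before training; a plausible next step, matching how the
# same model configuration is used in the notebook snippet below (sketch):
model.fit(X_tr, y_tr, show_progress=True)
predictions = model.predict(X_te)
print('MSE: {}'.format(mean_squared_error(y_te, predictions)))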
# ** create np.array from X_train_withoutUsers **

# In[29]:

X_train_withoutUsersArray = np.array(X_train_withoutUsers)
X_test_withoutUsersArray = np.array(X_test_withoutUsers)

# ** Run model **

# In[30]:

model = TFFMRegressor(
    order=2,
    rank=7,
    optimizer=tf.train.AdamOptimizer(learning_rate=0.1),
    n_epochs=30,
    batch_size=-1,
    init_std=0.001,
    input_type='dense')

# In[31]:

model.fit(X_train_withoutUsersArray, y_tr, show_progress=True)
predictions = model.predict(X_test_withoutUsersArray)
print('MSE: {}'.format(mean_squared_error(y_te, predictions)))

# ## Make predictions:
#
# (this is the messy part - very difficult to predict new movies using such a
# sparse array)

# Checking out how many unique users there are:
#valid_examples_pos = reader.valid_examples_pos
words = reader.words
meta_vector = reader.meta_vector
X = reader.X
X_ids = reader.X_ids
X_weights = reader.X_weights
Y = reader.Y

# Note: this TFFMRegressor is a modified build that accepts extra arguments
# (num_unique_meta, meta_vector, num_features, reweight_reg, count_max,
# valid_examples, words, write_embedding_every) and a four-argument fit();
# these hooks are not part of stock tffm.
model = TFFMRegressor(
    num_unique_meta=len_unique_meta,
    meta_vector=meta_vector,
    num_features=vocab_size,
    order=2,
    rank=dimensions,
    # optimizer=tf.train.AdamOptimizer(learning_rate=lr),  # lr = 0.001
    optimizer=tf.train.AdagradOptimizer(learning_rate=lr),  # lr = 0.05
    n_epochs=iterations,
    batch_size=batch_size,
    init_std=0.01,
    reg=0.02,
    reweight_reg=False,
    count_max=count_max,
    input_type='sparse',
    log_dir=log_path,
    valid_examples=valid_examples_words,
    words=words,
    write_embedding_every=10,
    session_config=tf.ConfigProto(log_device_placement=False),
    verbose=2
)
model.fit(X, X_ids, X_weights, Y, show_progress=True)