def build_fm_interaction():
    """Train a libFM classifier from pre-built data files and log its test AUC.

    Reads the test labels from ``test_y_file``, runs pywFM on the files named
    by ``train_x_file`` / ``test_x_file`` (passed through the ``train_path`` /
    ``test_path`` keywords), prints the shape of the pairwise-interaction
    matrix and the test AUC, and appends a short record (AUC and wall-clock
    time) to ``<project_path>result/exp_result``.

    Takes no arguments and returns nothing; all paths come from module-level
    names.
    """
    begin = datetime.datetime.now()

    # Close the label file deterministically instead of leaking the handle.
    with open(test_y_file) as label_file:
        test_y = np.loadtxt(label_file, dtype=int)

    fm = pywFM.FM(task='classification', num_iter=100, learning_method='mcmc',
                  temp_path=project_path + "model\\m_fm\\tmp\\")
    model = fm.run(None, None, None, None,
                   train_path=train_x_file,
                   test_path=test_x_file,
                   model_path=project_path + "model\\m_fm\\model_file\\fm_model",
                   out_path=project_path + "model\\m_fm\\model_file\\fm.out")
    end = datetime.datetime.now()

    # NOTE(review): upstream pywFM may leave pairwise_interactions unset for
    # MCMC runs -- confirm the installed pywFM/libFM build saves the model.
    print(model.pairwise_interactions.shape)

    prob_test = model.predictions
    auc_test = metrics.roc_auc_score(test_y, prob_test)
    print(auc_test)

    # The context manager guarantees the log is flushed and closed even if a
    # write fails part-way through.
    with open(project_path + "result/exp_result", "a") as log_file:
        log_file.write("fm: sparse_id + gbdt + 100 iters:" + '\n')
        log_file.write("auc_test: " + str(auc_test) + '\n')
        log_file.write("time: " + str(end - begin) + '\n' + '\n')

    print(model.pairwise_interactions.shape)
def test():
    """Smoke-test pywFM classification (SGD) on a 7-row toy matrix.

    The first five rows are used for training and the last two for testing.
    Prints the training rows and the AUC obtained on the two test rows.
    Model and output files are written under ``project_path``.
    """
    features = np.matrix([
        # Users  | Movies      | Movie Ratings      | Time | Last Movies Rated
        # A B C  | TI NH SW ST | TI  NH  SW  ST     |      | TI NH SW ST
        [1, 0, 0, 1, 0, 0, 0, 0.3, 0.3, 0.3, 0, 13, 0, 0, 0, 0],
        [1, 0, 0, 0, 1, 0, 0, 0.3, 0.3, 0.3, 0, 14, 1, 0, 0, 0],
        [1, 0, 0, 0, 0, 1, 0, 0.3, 0.3, 0.3, 0, 16, 0, 1, 0, 0],
        [0, 1, 0, 0, 0, 1, 0, 0, 0, 0.5, 0.5, 5, 0, 0, 0, 0],
        [0, 1, 0, 0, 0, 0, 1, 0, 0, 0.5, 0.5, 8, 0, 0, 1, 0],
        [0, 0, 1, 1, 0, 0, 0, 0.5, 0, 0.5, 0, 9, 0, 0, 0, 0],
        [0, 0, 1, 0, 0, 1, 0, 0.5, 0, 0.5, 0, 12, 1, 0, 0, 0]
    ])
    target = [0, 1, 1, 0, 1, 0, 1]

    fm = pywFM.FM(task='c', num_iter=20, learning_method='sgd',
                  temp_path=project_path + "model\\m_fm\\tmp\\")
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print(features[:5])

    # split features and target for train/test
    # first 5 are train, last 2 are test
    model = fm.run(features[:5], target[:5], features[5:], target[5:],
                   model_path=project_path + "model\\m_fm\\model_file\\fm_model",
                   out_path=project_path + "model\\m_fm\\model_file\\fm.out")

    prob_test = model.predictions
    auc_test = metrics.roc_auc_score(target[5:], prob_test)
    print(auc_test)
def test():
    """Smoke-test pywFM regression on a 7-row toy matrix.

    The first five rows are used for training and the last two for testing.
    Prints the predictions, the learned weights and a ROC-AUC value computed
    from the two held-out targets.
    """
    features = np.matrix([
        # Users  | Movies      | Movie Ratings      | Time | Last Movies Rated
        # A B C  | TI NH SW ST | TI  NH  SW  ST     |      | TI NH SW ST
        [1, 0, 0, 1, 0, 0, 0, 0.3, 0.3, 0.3, 0, 13, 0, 0, 0, 0],
        [1, 0, 0, 0, 1, 0, 0, 0.3, 0.3, 0.3, 0, 14, 1, 0, 0, 0],
        [1, 0, 0, 0, 0, 1, 0, 0.3, 0.3, 0.3, 0, 16, 0, 1, 0, 0],
        [0, 1, 0, 0, 0, 1, 0, 0, 0, 0.5, 0.5, 5, 0, 0, 0, 0],
        [0, 1, 0, 0, 0, 0, 1, 0, 0, 0.5, 0.5, 8, 0, 0, 1, 0],
        [0, 0, 1, 1, 0, 0, 0, 0.5, 0, 0.5, 0, 9, 0, 0, 0, 0],
        [0, 0, 1, 0, 0, 1, 0, 0.5, 0, 0.5, 0, 12, 1, 0, 0, 0]
    ])
    target = [5, 3, 1, 4, 5, 1, 5]

    fm = pywFM.FM(task='regression', num_iter=5)

    # split features and target for train/test
    # first 5 are train, last 2 are test
    model = fm.run(features[:5], target[:5], features[5:], target[5:])
    print(model.predictions)
    # you can also get the model weights
    print(model.weights)

    prob_test = model.predictions
    # NOTE(review): this is a regression run; the AUC only works here because
    # the two held-out targets (1 and 5) happen to form two classes. An error
    # metric such as RMSE is probably what was intended -- confirm.
    auc_test = metrics.roc_auc_score(target[5:], prob_test)
    # Was a Python-2-only print statement; the call form runs on both.
    print(auc_test)
def fit(self, X, y):
    """Fit a factorization machine with pywFM and store its parameters.

    Sets ``self.mu`` (global bias), ``self.W`` (linear weights), ``self.V``
    (pairwise factors, one row per agent) and ``self.V2`` (element-wise
    square of ``self.V``). When libFM yields no model, the parameters are
    filled with random values instead.

    X and y are the raw training inputs; X is converted with
    ``self.prepare_fm`` before being handed to libFM.
    """
    # Should not be done in production :) Otherwise you should also install libFM:
    # https://github.com/srendle/libfm
    import pywFM

    X_fm = self.prepare_fm(X)
    self.chrono.save('prepare data in sparse FM format')

    os.environ['LIBFM_PATH'] = 'XXX'  # If applicable
    fm = pywFM.FM(task='regression', num_iter=self.nb_iterations,
                  k2=self.rank, rlog=False)  # MCMC method
    # rlog contains the RMSE at each epoch, we do not need it here
    model = fm.run(X_fm, y, X_fm, y)
    self.chrono.save('train FM')

    nb_agents = self.nb_users + self.nb_works + self.nb_tags
    if model.global_bias is None:
        # Train failed (for example, libfm does not exist)
        self.mu = 0
        self.W = np.random.random(nb_agents)
        self.V = np.random.random((nb_agents, self.rank))
    else:
        # Only measure the weights once we know training produced a model;
        # doing it before the check could crash the fallback path above.
        current = len(model.weights)
        self.mu = model.global_bias
        # Just in case X_fm had too many zero columns on the right
        self.W = np.pad(np.array(model.weights),
                        (0, nb_agents - current), mode='constant')
        self.V = np.pad(model.pairwise_interactions,
                        [(0, nb_agents - current), (0, 0)], mode='constant')
    self.V2 = np.power(self.V, 2)
def cross_validationMCMC(data, k_indices, k, num_iter, std_init):
    """
    Runs one fold of the cross validation on the input data with pywFM's
    default learning method. Fold k of k_indices is held out for testing,
    all other folds are used for training.

    @param data : the DataFrame containing all our training data (on which we do the CV)
    @param k_indices : array of k-lists containing each of the splits of the data
    @param k : the index of the fold that is held out as the test set
    @param num_iter : the number of iterations of the algorithm
    @param std_init : the standard deviation for the initialisation of the data
    @return loss_te : the value of rlog.rmse at position num_iter - 1 for this run.
    """
    # get k'th subgroup in test, others in train
    te_indices = k_indices[k]
    tr_indices = k_indices[np.arange(k_indices.shape[0]) != k].reshape(-1)

    train = data.loc[tr_indices]
    # Sorting into a new frame (rather than inplace=True on a .loc selection)
    # avoids pandas' SettingWithCopyWarning and never touches `data`.
    test = data.loc[te_indices].sort_values(['Movie', 'User'],
                                            ascending=[True, True])

    # format the DataFrames into the Sparse matrices we need to run with pywFM
    features_tr, target_tr = df_to_sparse(train)
    features_te, target_te = df_to_sparse(test)

    # running the model
    fm = pywFM.FM(task='regression', num_iter=num_iter, init_stdev=std_init)
    model = fm.run(features_tr, target_tr, features_te, target_te)

    # getting the RMSE at the last run step.
    loss_te = model.rlog.rmse[num_iter - 1]
    return loss_te
def ALSBias_pywFM(train, test, num_iter=100, std_init = 0.43, rank = 7, r0_reg = 0.5, r1_reg = 15, r2_reg = 25):
    """
    Fits a pywFM regression model with the ALS learning method on train and
    predicts every row of test.

    @param train : the DataFrame containing all our training data.
    @param test : the DataFrame containing all our testing data.
    @param num_iter : the number of iterations of the algorithm
    @param std_init : forwarded to pywFM as init_stdev
    @param rank : forwarded to pywFM as k2
    @param r0_reg : forwarded to pywFM as r0_regularization
    @param r1_reg : forwarded to pywFM as r1_regularization
    @param r2_reg : forwarded to pywFM as r2_regularization
    @return : numpy array with the predictions for the test set
    """
    # Model definition: collect the libFM options first, then build the model.
    als_options = {
        'task': 'regression',
        'learning_method': 'als',
        'num_iter': num_iter,
        'init_stdev': std_init,
        'k2': rank,
        'r0_regularization': r0_reg,
        'r1_regularization': r1_reg,
        'r2_regularization': r2_reg,
    }
    machine = pywFM.FM(**als_options)

    # Both frames go through the same sparse encoder.
    x_fit, y_fit = df_to_sparse(train)
    x_eval, y_eval = df_to_sparse(test)

    fitted = machine.run(x_fit, y_fit, x_eval, y_eval)
    return np.array(fitted.predictions)
def MCMC_pywFM(train, test, num_iter=100, std_init = 0.5):
    """
    Fits a pywFM regression model on train and predicts every row of test.
    No learning method is passed, so pywFM's default is used (MCMC, going by
    this function's name -- confirm against the installed pywFM).

    @param train : the DataFrame containing all our training data.
    @param test : the DataFrame containing all our testing data.
    @param num_iter : the number of iterations of the algorithm
    @param std_init : forwarded to pywFM as init_stdev
    @return : numpy array with the predictions for the test set
    """
    machine = pywFM.FM(task='regression', num_iter=num_iter, init_stdev=std_init)

    # Encode both frames with the shared sparse encoder.
    x_fit, y_fit = df_to_sparse(train)
    x_eval, y_eval = df_to_sparse(test)

    fitted = machine.run(x_fit, y_fit, x_eval, y_eval)
    return np.array(fitted.predictions)
def cross_validationMCMC(data, target, k_indices, k, num_iter, std_init):
    """
    Runs one cross-validation fold of a pywFM classifier and scores it with
    eval_gini.

    @param data : the DataFrame containing the features (on which we do the CV)
    @param target : the labels, indexed with the same indices as data
    @param k_indices : array of k-lists containing each of the splits of the data
    @param k : the index of the fold that is held out for evaluation
    @param num_iter : the number of iterations of the algorithm
    @param std_init : forwarded to pywFM as init_stdev
    @return : eval_gini(predictions, held-out labels)
    """
    fold_ids = np.arange(k_indices.shape[0])
    held_out = k_indices[k]
    # every fold except k, flattened into a single index vector
    kept = k_indices[fold_ids != k].reshape(-1)

    x_fit, x_eval = data.loc[kept], data.loc[held_out]
    y_fit, y_eval = target[kept], target[held_out]

    machine = pywFM.FM(task='classification', num_iter=num_iter, init_stdev=std_init)
    fitted = machine.run(x_fit, y_fit, x_eval, y_eval)

    return eval_gini(fitted.predictions, y_eval)
def make_mf_libfm(X, y, X_test, n_round=3):
    """Build a libFM meta-feature for the training set and for X_test.

    For each of ``n_round`` rounds the training data is split into 4
    stratified folds; a libFM classifier is fitted on three folds and its
    predictions for the held-out fold are accumulated. A final model fitted
    on all of X produces the predictions for X_test.

    Returns a tuple ``(mf_tr, mf_te)``: the out-of-fold predictions averaged
    over the rounds, and the predictions for X_test.
    """
    def new_fm():
        # One place for the libFM settings shared by the fold models and the
        # final full-data model.
        return pywFM.FM(task='classification', learning_method='mcmc',
                        num_iter=1000, init_stdev=0.7,
                        k0=1, k1=1, k2=8, verbose=10)

    mf_tr = np.zeros(X.shape[0])
    for i in range(n_round):
        # NOTE(review): this is the old sklearn.cross_validation signature
        # (labels passed to the constructor, object iterated directly);
        # it does not exist in current scikit-learn releases.
        skf = StratifiedKFold(y, n_folds=4, shuffle=True,
                              random_state=42 + i * 1000)
        for ind_tr, ind_te in skf:
            X_tr = X[ind_tr]
            X_te = X[ind_te]
            y_tr = y[ind_tr]
            y_te = y[ind_te]

            model = new_fm().run(X_tr, y_tr, X_te, y_te)
            y_pred = model.predictions
            mf_tr[ind_te] += y_pred

            score = roc_auc_score(y_te, y_pred)
            # Call form of print works on both Python 2 and Python 3.
            print('pred[{}] score:{}'.format(i, score))

    # libFM needs test labels; zeros are placeholders that are never scored.
    model = new_fm().run(X, y, X_test, np.zeros(X_test.shape[0]))
    mf_te = model.predictions
    return (mf_tr / n_round, mf_te)
def dofit_pywFM(num_iter=100, lr=0.1, k2=8, learning_method='sgda'):
    """Fit a pywFM classifier on the first 1000 rows of the project data.

    The data sets are injected into module globals from ``get_nn_data()``
    (the code below reads ``X_fit``, ``y_fit``, ``X_test``, ``X_eval`` and
    ``y_eval``). The eval split is passed to libFM as the validation set.

    Args:
        num_iter: number of libFM iterations.
        lr: forwarded to pywFM as ``learn_rate``.
        k2: forwarded to pywFM as ``k2``.
        learning_method: libFM learning method.

    Returns:
        dict with keys ``predictions``, ``global_bias``, ``weights``,
        ``pairwise_interactions`` and ``rlog`` taken from the pywFM result.
    """
    # Kept as in the original: publishes the data sets as module globals.
    globals().update(get_nn_data())

    fm = pywFM.FM('classification', num_iter=num_iter, init_stdev=0.1,
                  k0=True, k1=True, k2=k2,
                  learning_method=learning_method, learn_rate=lr,
                  r0_regularization=0, r1_regularization=0,
                  r2_regularization=0,
                  rlog=False, verbose=True, silent=False, temp_path=None)

    # The test labels are unknown placeholders. Use zeros rather than
    # np.empty so that no uninitialised memory (possibly NaN/inf) is handed
    # to libFM.
    y_fake_test = np.zeros(X_test.shape[0])

    predictions, global_bias, weights, pairwise_interactions, rlog = \
        fm.run(X_fit[:1000], y_fit[:1000],
               X_test[:1000], y_fake_test[:1000],
               X_eval[:1000], y_eval[:1000])

    return {'predictions': predictions,
            'global_bias': global_bias,
            'weights': weights,
            'pairwise_interactions': pairwise_interactions,
            'rlog': rlog}
def test_pywfm_on_bow():
    """Train pywFM on a bag-of-words encoding of question pairs and print the log loss.

    Uses the first fold from ``create_folds`` after oversampling. Each pair
    is encoded as the sum of the binary bag-of-words vectors of its two
    questions, remapped so that a word present in both questions is 1 and a
    word present in only one of them is -1.
    """
    df = load_train()
    folds = create_folds(df)
    train, test = folds[0]
    train, test = oversample(train, test, 42)

    questions = list(train[question1]) + list(train[question2])
    print('Creating Vectorizer...')
    # The corpus must go to fit(), not to the constructor: the constructor's
    # first parameter is `input`, not the documents.
    c = CountVectorizer(binary=True, stop_words='english')
    print('Fitting Vectorizer...')
    c.fit(questions)

    def encode_pairs(frame):
        # Sum of two binary vectors: 0 = absent, 1 = in one question,
        # 2 = in both questions.
        arr = c.transform(frame[question1]) + c.transform(frame[question2])
        # Remap 1 -> -1 BEFORE 2 -> 1. The previous order (2 -> 1, then
        # 1 -> -1) turned every non-zero entry into -1 and lost the
        # distinction between shared and unshared words.
        arr[arr == 1] = -1
        arr[arr == 2] = 1
        return arr

    train_arr = encode_pairs(train)
    test_arr = encode_pairs(test)
    train_target = train[TARGET]
    test_target = test[TARGET]

    fm = pywFM.FM(task='classification', num_iter=100, verbose=10,
                  r1_regularization=0.1, learn_rate=0.1)
    res = fm.run(train_arr, train_target, test_arr, test_target)

    prob = res.predictions
    prob_0 = [1 - x for x in prob]
    # One row per sample: [P(class 0), P(class 1)]. Reshaping the (2, n)
    # array to (n, 2) would interleave values from different samples.
    proba = np.column_stack([prob_0, prob])

    loss = log_loss(test[TARGET], proba)
    print(loss)
    print(loss)
def work(enu,us):
    """Score one customer's test rows with a libFM model and update their CSV.

    Selects the rows of ``data_test_FM`` for customer ``us``, joins user and
    item features, fits pywFM (ALS regression) on the module-level ``X`` /
    ``y`` and writes the predictions into the ``FM_PRECISION`` column of
    ``<d>/FM/local_test_<us>.csv``. Finally prints ``enu``.
    """
    user_rows = data_test_FM[data_test_FM.CUST_ID==us]
    user_rows = pd.merge(user_rows, user_FM, on='CUST_ID')
    user_rows = pd.merge(user_rows, item_FM, on='ARTICLE_ID')

    if len(user_rows) > 0:
        # Feature matrix: everything except the two identifier columns.
        feature_frame = user_rows.drop(columns=['CUST_ID','ARTICLE_ID'])
        X_test = sparse.csr_matrix(feature_frame.to_numpy())

        os.environ['LIBFM_PATH']='/home/slide/bouaroun/libfm/bin/'
        fm = pywFM.FM(task='regression', num_iter=150, learning_method='als',
                      learn_rate=0.05, r2_regularization=0.001)

        # libFM requires test labels; random ratings act as placeholders.
        placeholder = np.array([random.randint(1,5) for _ in range(len(user_rows))])
        model = fm.run(X, y, X_test, placeholder)

        # Re-read the per-customer result file, add the FM column, write back.
        result_path = d+'/FM/local_test_{}.csv'.format(us)
        saved = pd.read_csv(result_path)
        saved['FM_PRECISION'] = model.predictions
        saved = saved[['ARTICLE_ID','FM_PRECISION','FM_PRECISION_tf']]
        saved.to_csv(result_path, index=False)

    # progress marker
    print(enu)
def cross_validationALSBias(data, k_indices, k, num_iter, std_init, rank, r0_reg, r1_reg, r2_reg):
    """
    Runs one fold of the cross validation on the input data, using pywFM with
    the ALS learning method. Fold k of k_indices is held out for testing, all
    other folds are used for training.

    @param data : the DataFrame containing all our training data (on which we do the CV)
    @param k_indices : array of k-lists containing each of the splits of the data
    @param k : the index of the fold that is held out as the test set
    @param num_iter : the number of iterations of the algorithm
    @param std_init : the standard deviation for the initialisation of W and Z
    @param rank : the number of columns of W and Z
    @param r0_reg : the regularization parameter for the global bias term w0
    @param r1_reg : the regularization parameter of the user/item bias term w
    @param r2_reg : the regularization parameter for the ALS regularization (size of the entries of W and Z)
    @return loss_te : the value of rlog.rmse at position num_iter - 1 for this run.
    """
    # get k'th subgroup in test, others in train
    te_indices = k_indices[k]
    tr_indices = k_indices[np.arange(k_indices.shape[0]) != k].reshape(-1)

    train = data.loc[tr_indices]
    # Sorting into a new frame (rather than inplace=True on a .loc selection)
    # avoids pandas' SettingWithCopyWarning and never touches `data`.
    test = data.loc[te_indices].sort_values(['Movie', 'User'],
                                            ascending=[True, True])

    # format the DataFrames into the Sparse matrices we need to run with pywFM
    features_tr, target_tr = df_to_sparse(train)
    features_te, target_te = df_to_sparse(test)

    # running the model
    fm = pywFM.FM(task='regression', learning_method='als', num_iter=num_iter,
                  init_stdev=std_init, k2=rank,
                  r0_regularization=r0_reg,
                  r1_regularization=r1_reg,
                  r2_regularization=r2_reg)
    model = fm.run(features_tr, target_tr, features_te, target_te)

    # getting the RMSE at the last run step.
    loss_te = model.rlog.rmse[num_iter - 1]
    return loss_te
def demo_libfm():
    """Fit a libFM classifier on the module-level split and print its outputs.

    Reads ``X_tr``, ``y_tr``, ``X_te`` and ``y_te`` from module scope and
    prints the predictions (with their type), the model weights and the
    pairwise interactions.

    Environment setup used by the original author, e.g.:
        export PYTHONPATH=~/ai_group/zhihu2019
        export LIBFM_PATH=/root/ai_group/libfm/bin/
    """
    fm_options = dict(task='classification', learning_method='mcmc',
                      num_iter=100, init_stdev=0.7,
                      k0=1, k1=1, k2=16, verbose=10)
    machine = pywFM.FM(**fm_options)
    fitted = machine.run(X_tr, y_tr, X_te, y_te)

    predictions = fitted.predictions
    print(predictions, type(predictions))
    # the fitted parameters are exposed on the result as well
    print(fitted.weights)
    print(fitted.pairwise_interactions)
def build_fm_interaction():
    """Train a libFM classifier from pre-built data files and record its metrics.

    Reads the test labels from ``test_Y_file``, runs pywFM on
    ``train_X_file`` / ``test_X_file``, saves the predicted probabilities to
    ``result/10_9_fm_pred`` and appends accuracy, log loss, AUC and the
    wall-clock time to ``result/oct_result``. Also prints the record and the
    shape of the pairwise-interaction matrix.

    Takes no arguments and returns nothing; all paths come from module-level
    names.
    """
    begin = datetime.datetime.now()

    # Close the label file deterministically instead of leaking the handle.
    with open(test_Y_file) as label_file:
        test_y = np.loadtxt(label_file, dtype=int)

    fm = pywFM.FM(task='classification', num_iter=100, learning_method='mcmc',
                  temp_path=project_path + "model\\m_fm\\tmp\\")
    model = fm.run(None, None, None, None,
                   train_path=train_X_file,
                   test_path=test_X_file,
                   model_path=project_path + "model\\m_fm\\model_file\\fm_model",
                   out_path=project_path + "model\\m_fm\\model_file\\fm.out")
    end = datetime.datetime.now()
    print(model.pairwise_interactions.shape)

    y_pred = model.predictions
    auc_test = metrics.roc_auc_score(test_y, y_pred)
    # accuracy_score needs hard labels; passing probabilities raises
    # "can't handle a mix of binary and continuous targets".
    # Assumes the labels in test_Y_file are 0/1 -- TODO confirm.
    y_label = (np.asarray(y_pred) >= 0.5).astype(int)
    accuracy = metrics.accuracy_score(test_y, y_label)
    logloss = metrics.log_loss(test_y, y_pred)

    with open(constants.project_path + "result/10_9_fm_pred", "w") as pred_file:
        np.savetxt(pred_file, y_pred, fmt='%.5f')

    rcd = str(end) + '\n'
    rcd += "fm: new basic" + '\n'
    rcd += "accuracy: " + str(accuracy) + '\n'
    rcd += "logloss: " + str(logloss) + '\n'
    rcd += "auc_test: " + str(auc_test) + '\n'
    rcd += "time: " + str(end - begin) + '\n' + '\n'
    print(rcd)

    with open(project_path + "result/oct_result", "a") as log_file:
        log_file.write(rcd)

    print(model.pairwise_interactions.shape)
import csv

import numpy as np
import pandas as pd
import pywFM
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import mean_squared_error

from data_loader import loadData
from data_loader import loadTest

# Script: fit a libFM regressor on one-hot encoded train records and write
# the predictions for the test records to a submission CSV.
# numpy and pywFM are imported above because the code below uses `np` and
# `pywFM`, which this snippet did not import.

(train, y_train), (valid, y_valid) = loadData("../item_recom/train_info.tsv", 1.1)
dictv = DictVectorizer()
test = loadTest("../item_recom/test_info.tsv")

print("convert to one-hot represerntation")
# Fit the vocabulary on train + test so both share the same feature columns.
_ = dictv.fit_transform(train+test)
X_train = dictv.transform(train)
# X_valid = dictv.transform(valid)
X_test = dictv.transform(test)
# libFM needs test labels; a constant 2.5 is used as a placeholder.
y_test = np.ones(len(test))*2.5

fm = pywFM.FM(task='regression', num_iter=1200, k2=48, learning_method='mcmc')
model = fm.run(X_train, y_train, X_test, y_test)
# print("FM RMSE: %.6f" % math.sqrt(mean_squared_error(y_valid, model.predictions)))

with open('../submissions/sixteenth.csv', 'w') as csvfile:
    fieldnames = ['uid#iid', 'pred']
    writer = csv.DictWriter(csvfile, fieldnames)
    writer.writeheader()
    # range() instead of the Python-2-only xrange().
    for ind in range(len(test)):
        writer.writerow({'uid#iid': "%s#%s"%(test[ind]["1_user_id"], test[ind]["2_item_id"]),
                         'pred': "%f"%model.predictions[ind]})
method=params["scale_features_method"]) # das3h features with lr lr = LogisticRegression(max_iter=1000, solver="liblinear") lr.fit(X_train_, y_train_das3h, sample_weight=sample_weight) # metrics test y_test_pred_probas_das3h_lr = lr.predict_proba(X_test_)[:, 1] logs[f"fold{i}"]["das3h_lr"] = compute_metrics( y_test_das3h, y_test_pred_probas_das3h_lr) # metrics train y_train_pred_probas_das3h_lr = lr.predict_proba(X_train_)[:, 1] logs[f"fold{i}"]["das3h_lr_train"] = compute_metrics( y_train_das3h, y_train_pred_probas_das3h_lr) # das3h fm = pywFM.FM(**params_fm) model = fm.run(X_train_, y_train_das3h, X_test_, y_test_das3h) y_test_pred_probas_das3h = np.array(model.predictions) logs[f"fold{i}"]["das3h"] = compute_metrics(y_test_das3h, y_test_pred_probas_das3h) # item-avg item_avg_train = item_avg_predictor(task_sessions_train) # metrics test y_test_pred_item_avg_probas = [ item_avg_train(item) for item in task_sessions_test["task"] ] logs[f"fold{i}"]["item_avg"] = compute_metrics( task_sessions_test["solved"], y_test_pred_item_avg_probas) # metrics train y_train_pred_item_avg_probas = [
import pywFM
import numpy as np
import pandas as pd

# Toy pywFM regression example: seven rows, sixteen feature columns.
features = np.matrix([
    # Users  | Movies      | Movie Ratings      | Time | Last Movies Rated
    # A B C  | TI NH SW ST | TI  NH  SW  ST     |      | TI NH SW ST
    [1, 0, 0, 1, 0, 0, 0, 0.3, 0.3, 0.3, 0, 13, 0, 0, 0, 0],
    [1, 0, 0, 0, 1, 0, 0, 0.3, 0.3, 0.3, 0, 14, 1, 0, 0, 0],
    [1, 0, 0, 0, 0, 1, 0, 0.3, 0.3, 0.3, 0, 16, 0, 1, 0, 0],
    [0, 1, 0, 0, 0, 1, 0, 0, 0, 0.5, 0.5, 5, 0, 0, 0, 0],
    [0, 1, 0, 0, 0, 0, 1, 0, 0, 0.5, 0.5, 8, 0, 0, 1, 0],
    [0, 0, 1, 1, 0, 0, 0, 0.5, 0, 0.5, 0, 9, 0, 0, 0, 0],
    [0, 0, 1, 0, 0, 1, 0, 0.5, 0, 0.5, 0, 12, 1, 0, 0, 0],
])
target = [5, 3, 1, 4, 5, 1, 5]

fm = pywFM.FM(task='regression', num_iter=5)

# Rows 0-4 train the model, rows 5-6 are the test set.
model = fm.run(
    features[:5],
    target[:5],
    features[5:],
    target[5:],
)

print(model.predictions)
# The learned linear weights are available on the result too.
print(model.weights)
def DAS3H(a, active, tw, isKfold, model_params):
    """Train and evaluate a DAS3H-family model and save its metrics.

    Depending on model_params['dim'] the model is a LogisticRegression
    (dim == 0) or a pywFM factorization machine (otherwise). Metrics from
    sklearn and tf.keras are computed on the test split and written with
    saveDict; for the FM the per-iteration rlog is written to CSV as well.

    Parameters (as used by the code below):
        a: object providing dataprocessor.loadLCData(), loadDAS3HData(),
           loadSplitInfo(), LCDataDir and LC_params.
        active: iterable of feature-group names; together with `dim` and
           `tw` it selects the result-folder prefix.
        tw: time-window flavour; 'tw_kc' and 'tw_items' are tested explicitly.
        isKfold: when truthy, one model is trained per split in
           range(model_params['kFold']); otherwise only split '0' is used.
        model_params: dict read for the keys 'dim', 'iter' and 'kFold'.

    Returns:
        The `results` dict (LC params, model params and the metric values).

    NOTE(review): the source of this function had lost its indentation; the
    nesting below (in particular the 't1'/'t2' suffix and the rlog export)
    was reconstructed from the logic -- confirm against the original file.
    """
    dim = model_params['dim']
    # FM parameters
    FM_params = {
        'task': 'classification',
        'num_iter': model_params['iter'],
        'rlog': True,
        'learning_method': 'mcmc',
        'k2': dim
    }
    print(active)
    print(tw)
    # Map well-known feature combinations to a model name; anything else
    # gets a prefix built from the first letter of each active feature.
    prefix = ''
    if set(active) == {'users', 'items'} and dim == 0:
        prefix = 'IRT'
    elif set(active) == {'users', 'items'} and dim > 0:
        prefix = 'MIRTb'
    elif set(active) == {'skills', 'attempts'}:
        prefix = 'AFM'
    elif set(active) == {'skills', 'wins', 'fails'}:
        prefix = 'PFA'
    elif set(active) == {'items', 'skills', 'wins', 'fails'}:
        prefix = 'KTM'
    elif set(active) == {'users', 'items', 'skills', 'wins', 'attempts'} and (
            tw == 'tw_kc'):
        prefix = 'DAS3H'
    elif set(active) == {'users', 'items', 'wins', 'attempts'} and (
            tw == 'tw_items'):
        prefix = 'DASH'
    else:
        for f in active:
            prefix += f[0]
        if tw == 'tw_kc':
            prefix += 't1'
        else:
            prefix += 't2'
    print(prefix)
    # NOTE(review): none of the four values unpacked here is used below.
    [df, QMatrix, StaticInformation, DictList] = a.dataprocessor.loadLCData()
    # NOTE(review): `features_suffix` is not defined in this function; it is
    # presumably a module-level name -- verify.
    X, dict_data = a.loadDAS3HData(active, features_suffix, 0.8, tw=tw)
    # Column 3 of X holds the labels (it is used as y_train / y_test below).
    y = X[:,3].toarray().flatten()
    # str(isKfold)[0] is the first character of the flag's string form
    # ('T' or 'F' when isKfold is a bool).
    saveDir = os.path.join(a.LCDataDir, 'das3h', 'results_K'+str(isKfold)[0], prefix)
    prepareFolder(saveDir)
    # metrics1 are fed the raw predicted probabilities.
    metrics1 = {'MAE':metrics.mean_absolute_error,
                'MSE':metrics.mean_squared_error,
                'AUC':metrics.roc_auc_score,
                }
    # metrics2 are fed predictions thresholded at 0.5.
    metrics2 = {'Accuracy':metrics.accuracy_score,
                'Precision':metrics.precision_score,
                'AP':metrics.average_precision_score,
                'Recall':metrics.recall_score,
                'F1-score':metrics.f1_score,
                }
    # metrics_tf1 are fed thresholded predictions, metrics_tf2 the raw ones.
    metrics_tf1 = {'tf_Accuracy':tf.keras.metrics.Accuracy(),
                   }
    metrics_tf2 = {'tf_Precision':tf.keras.metrics.Precision(thresholds = 0.5),
                   'tf_Recall':tf.keras.metrics.Recall(thresholds = 0.5),
                   'tf_MSE':tf.keras.metrics.MeanSquaredError(),
                   'tf_MAE':tf.keras.metrics.MeanAbsoluteError(),
                   'tf_RMSE':tf.keras.metrics.RootMeanSquaredError(),
                   'tf_AUC':tf.keras.metrics.AUC(),
                   'tf_AUC_1000': tf.keras.metrics.AUC(num_thresholds=1000)
                   }
    results={'LC_params':a.LC_params,'model_params':model_params,'results':{}}
    if isKfold:
        # One output folder per split.
        for run_id in range(model_params['kFold']):
            prepareFolder(os.path.join(saveDir, str(run_id)))
        dict_data = a.loadSplitInfo(model_params['kFold'])
        for run_id in range(model_params['kFold']):
            users_train = dict_data[str(run_id)]['train']
            users_test = dict_data[str(run_id)]['test']
            # Rows are assigned to a split by the user id stored in column 0.
            X_train = X[np.where(np.isin(X[:,0].toarray().flatten(),users_train))]
            y_train = X_train[:,3].toarray().flatten()
            X_test = X[np.where(np.isin(X[:,0].toarray().flatten(),users_test))]
            y_test = X_test[:,3].toarray().flatten()
            if model_params['dim'] == 0:
                print('fitting...')
                model = LogisticRegression(solver="newton-cg", max_iter=400)
                model.fit(X_train[:,5:], y_train) # the 5 first columns are the non-sparse dataset
                y_pred_test = model.predict_proba(X_test[:,5:])[:, 1]
            else:
                fm = pywFM.FM(**FM_params)
                model = fm.run(X_train[:,5:], y_train, X_test[:,5:], y_test)
                y_pred_test = np.array(model.predictions)
                model.rlog.to_csv(os.path.join(saveDir, str(run_id), 'rlog.csv'))
            results['results'][run_id] = {}
            temp = results['results'][run_id]
            for metric in metrics1:
                temp[metric] = metrics1[metric](y_test, y_pred_test)
            for metric in metrics2:
                temp[metric] = metrics2[metric](y_test, (y_pred_test>0.5).astype(int))
            for metric in metrics_tf1:
                m = metrics_tf1[metric]
                # The metric objects are shared across runs, so clear them first.
                m.reset_states()
                m.update_state(y_test, tf.greater_equal(y_pred_test,0.5))
                temp[metric] = m.result().numpy()
            for metric in metrics_tf2:
                m = metrics_tf2[metric]
                m.reset_states()
                m.update_state(y_test, y_pred_test)
                temp[metric] = m.result().numpy()
    else:
        X_train = X[np.where(np.isin(X[:,0].toarray().flatten(),dict_data['0']['train']))]
        y_train = X_train[:,3].toarray().flatten()
        X_test = X[np.where(np.isin(X[:,0].toarray().flatten(),dict_data['0']['test']))]
        y_test = X_test[:,3].toarray().flatten()
        # NOTE(review): this branch drops the first 4 columns while the
        # k-fold branch drops 5, although both comments say "5 first
        # columns" -- one of the two slices is probably wrong; confirm.
        if model_params['dim'] == 0:
            print('fitting...')
            model = LogisticRegression(solver="newton-cg", max_iter=model_params['iter'])
            model.fit(X_train[:,4:], y_train) # the 5 first columns are the non-sparse dataset
            y_pred_test = model.predict_proba(X_test[:,4:])[:, 1]
        else:
            fm = pywFM.FM(**FM_params)
            model = fm.run(X_train[:,4:], y_train, X_test[:,4:], y_test)
            y_pred_test = np.array(model.predictions)
            model.rlog.to_csv(os.path.join(saveDir, 'rlog'+getLegend(model_params)+'.csv'))
        # Without k-fold the metrics are stored directly under 'results'.
        temp = results['results']
        for metric in metrics1:
            temp[metric] = metrics1[metric](y_test, y_pred_test)
        for metric in metrics2:
            temp[metric] = metrics2[metric](y_test, (y_pred_test>0.5).astype(int))
        for metric in metrics_tf1:
            m = metrics_tf1[metric]
            m.reset_states()
            m.update_state(y_test, tf.greater_equal(y_pred_test,0.5))
            temp[metric] = m.result().numpy()
        for metric in metrics_tf2:
            m = metrics_tf2[metric]
            m.reset_states()
            m.update_state(y_test, y_pred_test)
            temp[metric] = m.result().numpy()
    saveDict(results, saveDir, 'results'+getLegend(model_params)+'.json')
    return results
# Sanity check for the FM library that also illustrates the expected data
# layout (the project's real data has far too many features to display).
features = np.matrix([
    # Users  | Movies      | Movie Ratings      | Time | Last Movies Rated
    # A B C  | TI NH SW ST | TI  NH  SW  ST     |      | TI NH SW ST
    [1, 0, 0, 1, 0, 0, 0, 0.3, 0.3, 0.3, 0, 13, 0, 0, 0, 0],
    [1, 0, 0, 0, 1, 0, 0, 0.3, 0.3, 0.3, 0, 14, 1, 0, 0, 0],
    [1, 0, 0, 0, 0, 1, 0, 0.3, 0.3, 0.3, 0, 16, 0, 1, 0, 0],
    [0, 1, 0, 0, 0, 1, 0, 0, 0, 0.5, 0.5, 5, 0, 0, 0, 0],
    [0, 1, 0, 0, 0, 0, 1, 0, 0, 0.5, 0.5, 8, 0, 0, 1, 0],
    [0, 0, 1, 1, 0, 0, 0, 0.5, 0, 0.5, 0, 9, 0, 0, 0, 0],
    [0, 0, 1, 0, 0, 1, 0, 0.5, 0, 0.5, 0, 12, 1, 0, 0, 0],
])
target = [5, 3, 1, 4, 5, 1, 5]

fm = pywFM.FM(task='regression', num_iter=5)

# Rows 0-4 train the model, rows 5-6 are the test set.
model = fm.run(
    features[:5],
    target[:5],
    features[5:],
    target[5:],
)

print(model.predictions)
# The learned linear weights are available on the result too.
print(model.weights)

# Data preprocessing
# A small data test
#n=20000
#sample = pd.read_csv('train.csv',iterator=True)
#sample = sample.get_chunk(n)
# All data
import pywFM
import numpy as np
import pandas as pd
import os

# Location of the libFM binaries used by pywFM.
os.environ['LIBFM_PATH'] = '/Users/jilljenn/code/libfm/bin/'

# Toy pywFM classification example: seven rows, sixteen feature columns.
features = np.matrix([
    # Users  | Movies      | Movie Ratings      | Time | Last Movies Rated
    # A B C  | TI NH SW ST | TI  NH  SW  ST     |      | TI NH SW ST
    [1, 0, 0, 1, 0, 0, 0, 0.3, 0.3, 0.3, 0, 13, 0, 0, 0, 0],
    [1, 0, 0, 0, 1, 0, 0, 0.3, 0.3, 0.3, 0, 14, 1, 0, 0, 0],
    [1, 0, 0, 0, 0, 1, 0, 0.3, 0.3, 0.3, 0, 16, 0, 1, 0, 0],
    [0, 1, 0, 0, 0, 1, 0, 0, 0, 0.5, 0.5, 5, 0, 0, 0, 0],
    [0, 1, 0, 0, 0, 0, 1, 0, 0, 0.5, 0.5, 8, 0, 0, 1, 0],
    [0, 0, 1, 1, 0, 0, 0, 0.5, 0, 0.5, 0, 9, 0, 0, 0, 0],
    [0, 0, 1, 0, 0, 1, 0, 0.5, 0, 0.5, 0, 12, 1, 0, 0, 0],
])
target = [0, 1, 1, 0, 0, 0, 1]

fm = pywFM.FM(task='classification', num_iter=50, rlog=False)

# Rows 0-4 train the model, rows 5-6 are the test set.
model = fm.run(
    features[:5],
    target[:5],
    features[5:],
    target[5:],
)

print(model.predictions)
# The learned linear weights are available on the result too.
print(model.weights)
x4=train_ens["exf"], x5=train_ens["knn"], y=np.ravel(train_y_ens)) with pm.Model() as model: pm.glm.glm('y ~ 0 + x1 + x2 + x3 + x4 + x5', data_1, family=pm.glm.families.Binomial()) start = pm.find_MAP() step = pm.Metropolis() trace_m = pm.sample(2000, step, start=start, progressbar=True) predicted_test[:, i] = np.median( trace_m.x1) * test_fin_data["rf"] + np.median( trace_m.x2) * test_fin_data["gbm"] + np.median( trace_m.x3 ) * test_fin_data["sgd"] + test_fin_data["exf"] * np.median( trace_m.x4) + test_fin_data["knn"] * np.median(trace_m.x5) predicted_test_fin = predicted_test.mean(axis=1) roc_auc_score(test_y, predicted_test2) ################ Factorization Machines #################### import pywFM as fm fm_logit = fm.FM(task="classification") fm_logit.run(train_data, np.ravel(train_y), test_data, np.ravel(test_y)) train_fin_data.drop(train_fin_data[[0]], axis=1, inplace=True) train_y.drop(train_y[[0]], axis=1, inplace=True)
# Encode the data, fit a libFM classifier with a held-out validation split
# and write the test predictions to a timestamped submission file.
train_x, train_y, test_x = ont_hot(data)
print('##########################')
print(train_x.shape)
print(train_y.shape)
print(test_x.shape)

# train = data[data['FLAG'] >= 0]
# # test = data[data['FLAG'] < 0]
# train_y = train['FLAG'].values
# fm = FM(num_factors=10, num_iter=300, verbose=True, task='classification', initial_learning_rate=0.01, learning_rate_schedule="optimal")
# fm.fit(train_x, train_y)
# y_pred = fm.predict(test_x)

train_x, valid_x, train_y, valid_y = train_test_split(train_x, train_y, test_size=0.2, random_state=2018)

fm = pywFM.FM(task='classification', num_iter=100, k2=10, verbose=True)
# libFM needs test labels; ones are placeholders for the unknown targets.
test_y = np.ones(test_x.shape[0])
# run() returns the fitted result; predictions live on that result, not on
# the FM object, so the return value must be kept.
model = fm.run(train_x, train_y, test_x, test_y, valid_x, valid_y)
y_pred = model.predictions
# print(roc_auc_score(valid_y, ))

sub = pd.DataFrame()
sub['USRID'] = test_x['USRID']
sub['target'] = y_pred
sub.to_csv('./submit/%s.csv'%str(datetime.now().strftime("%Y-%m-%d_%H-%M-%S")),index=None,sep='\t')
# print('train set:', roc_auc_score(train_y, fm.predict(train_x)))
# help(pylibfm.FM)
# Encode both frames to sparse .npz files, then load them back.
df_to_sparse(df_train, 'X_train.npz')
print('Train done')
df_to_sparse(df_test, 'X_test.npz')
print('Test done')

X_train = load_npz('X_train.npz')
X_test = load_npz('X_test.npz')
print(X_train.shape)
print(X_test.shape)

# Fit the factorization machine; test outcomes are passed as libFM's test labels.
fm = pywFM.FM(
    task='regression',
    num_iter=500,
    k2=20,
    rlog=False,
    learning_method='mcmc',
    r1_regularization=0.1,
    r2_regularization=0.1,
)
model = fm.run(X_train, df_train['outcome'], X_test, df_test['outcome'])

# RMSE on the test set, followed by one example row for eyeballing.
print(mean_squared_error(df_test['outcome'], model.predictions)**0.5)
print(X_test[0], df_test['outcome'][0], model.predictions[0])

# Persist the learned parameters: global bias, linear weights, pairwise factors.
bundle = {
    'mu': model.global_bias,
    'W': model.weights,
    'V': model.pairwise_interactions,
}
with open('fm.pickle', 'wb') as f:
    pickle.dump(bundle, f, pickle.HIGHEST_PROTOCOL)
X_fm = hstack([X[agent] for agent in active_agents]).tocsr() save_npz(SPARSE_NPZ, X_fm) return X_fm X_train = df_to_sparse(df_train, 'X_train.npz') y_train = df_train['outcome'] print('Encoding train done') X_test = df_to_sparse(df_test, 'X_test.npz') y_test = df_test['outcome'] print('Encoding test done') params = { 'task': 'classification', 'num_iter': options.iter, 'rlog': True, 'learning_method': 'mcmc' } if options.d > 0: params['k2'] = options.d fm = pywFM.FM(**params) model = fm.run(X_train, y_train, X_test, y_test) ACC = accuracy_score(y_test, np.round(model.predictions)) AUC = roc_auc_score(y_test, model.predictions) NLL = log_loss(y_test, model.predictions) print('accuracy', ACC) print('AUC', AUC) print('NLL', NLL)
# NOTE(review): this re-fits the PCA on the test set; if `pca` was already
# fitted on the training data, `pca.transform(Test)` is probably what was
# intended -- confirm against the preceding code.
test_pca = pca.fit_transform(Test)
print('Explained variance: %.4f' % pca.explained_variance_ratio_.sum())

# Wrap the PCA outputs in DataFrames aligned on the original row indices so
# they can be concatenated column-wise with the raw features.
train_pca = pd.DataFrame(train_pca)
train_pca.index = train.index
test_pca = pd.DataFrame(test_pca)
test_pca.index = test.index
train = pd.concat([train, train_pca], axis=1)
test = pd.concat([test, test_pca], axis=1)

# pywfm
clf = pywFM.FM(
    task='classification',
    num_iter=1000,
    init_stdev=0.1,
    k2=5,
    learning_method='mcmc',
    verbose=False,
    silent=False,
)

# Flatten the labels to one dimension.
y = np.asarray(y)
y.shape = (len(y), )

sub = pd.DataFrame()
sub['id'] = testid
# libFM needs test labels; zeros are placeholders for the unknown targets.
y1 = np.zeros((len(testid), ))
model = clf.run(x_train=train, y_train=y, x_test=test, y_test=y1)
sub['target'] = model.predictions