from random import randint

import numpy as np
import pyltr
from sklearn.model_selection import GroupShuffleSplit

# RankSVM and load_data are helpers local to this project (not shown here).


def train(data_dir):
    # data_dir = '/home/paula/Descargas/tagclouds-api/cstagclouds/extractkeywords/training/*'
    # load_data returns six parallel lists: features, relevance labels, the
    # candidate keywords, query ids, RAKE baseline scores, and a group id.
    x, y, words, qids, rake, groups = (np.array(l) for l in load_data(data_dir))
    model = RankSVM()
    # Hold out 20% of the groups so that train and test never share a group.
    gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=randint(0, 30))
    for train_index, test_index in gss.split(x, y, groups=groups):
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        q_train, q_test = qids[train_index], qids[test_index]
        w_train, w_test = words[train_index], words[test_index]
        rake_train, rake_test = rake[train_index], rake[test_index]
        model.fit(x_train, y_train, q_train)
        pred_test = model.predict(x_test)
        # Show the test keywords grouped by query id and ordered by score.
        a = list(zip(q_test, pred_test, w_test))
        a2 = sorted(a, key=lambda x: (x[0], x[1]))
        for word in a2:
            print(word[0], word[1], word[2])
        # Compare NDCG of the model against a random ranking and the RAKE baseline.
        metric = pyltr.metrics.NDCG(len(x_test))
        print('Random ranking:', metric.calc_mean_random(q_test, y_test))
        print('Our model:', metric.calc_mean(q_test, y_test, pred_test))
        print('Rake:', metric.calc_mean(q_test, y_test, rake_test))
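# A sketch of the contract train() assumes for load_data(): six equal-length
# sequences, one entry per (query, keyword) pair. All names and values below
# are illustrative placeholders, not taken from the real loader.
def load_data_stub(data_dir):
    x = [[0.1, 0.5], [0.3, 0.2], [0.7, 0.9]]    # feature vectors
    y = [2, 0, 1]                               # relevance labels
    words = ['tag cloud', 'api', 'keyword']     # candidate keywords
    qids = ['a.txt', 'a.txt', 'b.txt']          # query (source document) ids
    rake = [3.5, 1.0, 2.2]                      # RAKE baseline scores
    groups = [0, 0, 1]                          # group ids for the splitters
    return x, y, words, qids, rake, groups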
import pickle

from sklearn.model_selection import LeaveOneGroupOut


def cross_val(data_dir, model_dir, save_model):
    # data_dir = '/home/paula/Descargas/tagclouds-api/cstagclouds/extractkeywords/training/*'
    # model_dir = '/home/paula/Descargas/tagclouds-api/cstagclouds/learningtorank/models/unfiltered/'
    x, y, words, qids, rake, groups = (np.array(l) for l in load_data(data_dir))
    logo = LeaveOneGroupOut()
    for train_index, test_index in logo.split(x, y, groups):
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        q_train, q_test = qids[train_index], qids[test_index]
        rake_train, rake_test = rake[train_index], rake[test_index]
        model_path = '%sRankSVM/rankSVM_model_%s.sav' % (model_dir,
                                                         q_test[0].replace('.txt', ''))
        if save_model:
            # Train a fresh model on the remaining groups and persist it.
            model = RankSVM(max_iter=5000)
            model.fit(x_train, y_train, q_train)
            with open(model_path, 'wb') as f:
                pickle.dump(model, f)
        else:
            # Reuse the model previously saved for this held-out group.
            with open(model_path, 'rb') as f:
                model = pickle.load(f)
        pred_test = model.predict(x_test)
        # One NDCG score per held-out group.
        metric = pyltr.metrics.NDCG(len(x_test))
        print(metric.calc_mean(q_test, y_test, pred_test))
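# Hypothetical driver for the two cross_val() modes; both paths below are
# examples. The first pass trains and pickles one model per held-out group,
# the second re-scores the same folds from the saved models.
data_dir = '/path/to/extractkeywords/training/*'
model_dir = '/path/to/learningtorank/models/unfiltered/'
cross_val(data_dir, model_dir, save_model=True)
cross_val(data_dir, model_dir, save_model=False)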
import sys

import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Assumed context: RankSVM is the pysofia-based ranker named in the docstring,
# and prepare_data is a project-local helper that builds the training matrices.


def fit_SVM():
    """Fit an SVMRank model (from pysofia)

    returns: d_user_pred, list_user, list_coupon
        list_coupon = list of test coupons
        list_user = list of user IDs
        d_user_pred : key = user, value = predicted ranking of coupons in list_coupon
    """
    # Get data for classification
    X_train, y_train, d_info = prepare_data()
    list_user = d_info["list_user"]
    list_coupon = d_info["list_coupon"]
    no_cpt = d_info["no_cpt"]
    mapper = d_info["mapper"]
    list_col_xgb = d_info["list_col_xgb"]
    list_col_mapper = d_info["list_col_mapper"]

    # Create RankSVM
    RSVM = RankSVM(max_iter=10, alpha=1)
    RSVM.fit(X_train, y_train)

    # Store predictions in a dict
    d_user_pred = {}

    # Load test data by chunks to avoid memory issues
    for index, test in enumerate(
            pd.read_csv("../Data/Data_translated/test_supervised_learning.csv",
                        chunksize=1000 * no_cpt)):
        sys.stdout.write("\rProcessing row " + str(index * 1000 * no_cpt) +
                         " to row " + str((index + 1) * 1000 * no_cpt))
        sys.stdout.flush()
        test = test.fillna(-1)
        temp_list_user = test["USER_ID_hash"].drop_duplicates().values
        test = mapper.transform(test)
        test = pd.DataFrame(test, index=None, columns=list_col_mapper)
        test = test[list_col_xgb]
        X_test = test.values
        y_test = RSVM.rank(X_test)
        # Each user occupies a contiguous block of no_cpt rows.
        for i in range(min(1000, len(temp_list_user))):
            user = temp_list_user[i]
            d_user_pred[user] = y_test[i * no_cpt:(i + 1) * no_cpt]
    print()

    # Compute score for users, apply MinMaxScaler for blending later on
    for i, user in enumerate(list_user):
        list_pred = d_user_pred[user]
        MMS = MinMaxScaler()
        # MinMaxScaler expects a 2D array, so reshape before scaling.
        pred = MMS.fit_transform(np.ravel(list_pred).astype(float).reshape(-1, 1))
        d_user_pred[user] = np.ravel(pred)

    # Pickle the predictions for future use (binary mode for pickle)
    d_pred = {"list_coupon": list_coupon.tolist(), "d_user_pred": d_user_pred}
    with open("../Data/Data_translated/d_pred_SVM.pickle", "wb") as f:
        pickle.dump(d_pred, f, protocol=pickle.HIGHEST_PROTOCOL)

    return d_user_pred, list_user, list_coupon
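# A sketch of consuming the pickled predictions: the scores stored in
# d_user_pred are aligned with list_coupon, so argsort recovers each user's
# coupon ranking. Assumes fit_SVM() above has already written the pickle.
with open("../Data/Data_translated/d_pred_SVM.pickle", "rb") as f:
    d_pred = pickle.load(f)

coupons = np.array(d_pred["list_coupon"])
for user, scores in d_pred["d_user_pred"].items():
    ranked = coupons[np.argsort(scores)[::-1]]  # best-scored coupon first
    print(user, ranked[:10])                    # e.g. a top-10 shortlist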
def save(data_dir, model_dir):
    # data_dir = '/home/paula/Descargas/tagclouds-api/cstagclouds/extractkeywords/training/*'
    # model_dir = '/home/paula/Descargas/tagclouds-api/cstagclouds/learningtorank/models/unfiltered/'
    # Fit one RankSVM on the full data set and persist it for later use.
    x, y, words, qids, rake, groups = (np.array(l) for l in load_data(data_dir))
    model = RankSVM()
    model.fit(x, y, qids)
    with open("%srankSVM_model.sav" % model_dir, 'wb') as f:
        pickle.dump(model, f)
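# Counterpart sketch to save(): reload the persisted model and rank unseen
# candidates. model_dir and x_new are placeholders for a saved-model directory
# and a feature matrix built by the same pipeline as load_data().
with open("%srankSVM_model.sav" % model_dir, 'rb') as f:
    model = pickle.load(f)
scores = model.predict(x_new)  # higher score = keyword ranked higher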
def fit_SVM(week_ID):
    """Fit an SVMRank model (from pysofia)

    args: week_ID (validation week)
    returns: d_user_pred, list_user, list_coupon
        list_coupon = list of test coupons
        list_user = list of user IDs
        d_user_pred : key = user, value = predicted ranking of coupons in list_coupon
    """
    print("Fitting SVMrank")
    # Get data for classification
    X_train, y_train, d_info = prepare_data(week_ID)
    list_user = d_info["list_user"]
    list_coupon = d_info["list_coupon"]
    no_cpt = d_info["no_cpt"]
    mapper = d_info["mapper"]
    list_col_xgb = d_info["list_col_xgb"]
    list_col_mapper = d_info["list_col_mapper"]

    # Launch RankSVM
    RSVM = RankSVM(max_iter=10, alpha=0.1)
    RSVM.fit(X_train, y_train)

    # Store predictions in a dict
    d_user_pred = {}

    # Load test by chunks to avoid memory issues
    for index, test in enumerate(
            pd.read_csv("../Data/Validation/%s/test_supervised_learning_%s.csv"
                        % (week_ID, week_ID), chunksize=1000 * no_cpt)):
        sys.stdout.write("\rProcessing row " + str(index * 1000 * no_cpt) +
                         " to row " + str((index + 1) * 1000 * no_cpt))
        sys.stdout.flush()
        test = test.fillna(-1)
        temp_list_user = test["USER_ID_hash"].drop_duplicates().values
        test = mapper.transform(test)
        test = pd.DataFrame(test, index=None, columns=list_col_mapper)
        test = test[list_col_xgb]
        X_test = test.values
        y_test = RSVM.rank(X_test)
        # Each user occupies a contiguous block of no_cpt rows.
        for i in range(min(1000, len(temp_list_user))):
            user = temp_list_user[i]
            d_user_pred[user] = y_test[i * no_cpt:(i + 1) * no_cpt]
    print()

    # Sanity check: every expected user received predictions
    assert list_user == sorted(d_user_pred.keys())

    # Pickle the predictions for future use (binary mode for pickle)
    d_pred = {"list_coupon": list_coupon.tolist(), "d_user_pred": d_user_pred}
    with open("../Data/Validation/%s/d_pred_SVM_%s.pickle" % (week_ID, week_ID), "wb") as f:
        pickle.dump(d_pred, f, protocol=pickle.HIGHEST_PROTOCOL)

    return d_user_pred, list_user, list_coupon
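# Hypothetical driver for the validation variant: fit and pickle predictions
# for each validation week. The week identifiers are illustrative; they only
# need to match the files laid out under ../Data/Validation/.
for week_ID in ("week51", "week52"):
    d_user_pred, list_user, list_coupon = fit_SVM(week_ID)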