Example #1
def train(data_dir):
    """Train a RankSVM on one group-aware train/test split and report NDCG.

    Assumes numpy as np, pyltr, random.randint, and
    sklearn.model_selection.GroupShuffleSplit are imported, and that
    load_data and RankSVM come from the surrounding project.
    """
    # data_dir = '/home/paula/Descargas/tagclouds-api/cstagclouds/extractkeywords/training/*'

    x, y, words, qids, rake, groups = (np.array(l)
                                       for l in load_data(data_dir))
    model = RankSVM()
    gss = GroupShuffleSplit(n_splits=1,
                            test_size=0.2,
                            random_state=randint(0, 30))  # random seed, so the split differs between runs
    for train_index, test_index in gss.split(x, y, groups=groups):
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        q_train, q_test = qids[train_index], qids[test_index]
        w_train, w_test = words[train_index], words[test_index]
        rake_train, rake_test = rake[train_index], rake[test_index]
        model.fit(x_train, y_train, q_train)

        pred_test = model.predict(x_test)
        # Sort the predictions by query id, then by predicted score.
        ranking = sorted(zip(q_test, pred_test, w_test),
                         key=lambda t: (t[0], t[1]))
        for qid, score, word in ranking:
            print(qid, score, word)
        metric = pyltr.metrics.NDCG(len(x_test))
        print('Random ranking:', metric.calc_mean_random(q_test, y_test))
        print('Our model:', metric.calc_mean(q_test, y_test, pred_test))
        print('Rake:', metric.calc_mean(q_test, y_test, rake_test))
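
The split above keeps whole groups together. A minimal sketch with toy data (hypothetical shapes, not the project's real features) showing that GroupShuffleSplit never spreads one group across train and test:

import numpy as np
from sklearn.model_selection import GroupShuffleSplit

x = np.arange(12).reshape(6, 2)        # 6 samples, 2 features
y = np.array([2, 1, 0, 2, 1, 0])       # relevance labels
groups = np.array([0, 0, 0, 1, 1, 1])  # two documents, three keywords each

gss = GroupShuffleSplit(n_splits=1, test_size=0.5, random_state=0)
for train_index, test_index in gss.split(x, y, groups=groups):
    # Each group id appears on exactly one side of the split.
    print(sorted(set(groups[train_index])), sorted(set(groups[test_index])))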
Example #2
def cross_val(data_dir, model_dir, save_model):
    """Leave-one-group-out cross-validation: train (or load) one RankSVM per
    held-out group and print its NDCG score."""
    # data_dir = '/home/paula/Descargas/tagclouds-api/cstagclouds/extractkeywords/training/*'
    # model_dir = '/home/paula/Descargas/tagclouds-api/cstagclouds/learningtorank/models/unfiltered/'
    x, y, words, qids, rake, groups = (np.array(l)
                                       for l in load_data(data_dir))
    logo = LeaveOneGroupOut()

    for train_index, test_index in logo.split(x, y, groups):
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        q_train, q_test = qids[train_index], qids[test_index]
        rake_train, rake_test = rake[train_index], rake[test_index]
        model_path = '%sRankSVM/rankSVM_model_%s.sav' % (
            model_dir, q_test[0].replace('.txt', ''))
        if save_model:
            model = RankSVM(max_iter=5000)
            model.fit(x_train, y_train, q_train)
            with open(model_path, 'wb') as f:
                pickle.dump(model, f)
        else:
            with open(model_path, 'rb') as f:
                model = pickle.load(f)
        pred_test = model.predict(x_test)
        metric = pyltr.metrics.NDCG(len(x_test))
        print(metric.calc_mean(q_test, y_test, pred_test))
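
A hedged sketch of the pyltr calls used above, on toy data (values invented for illustration). qids mark which rows belong to the same query and must be grouped contiguously; calc_mean averages NDCG@k over the queries:

import numpy as np
import pyltr

qids = np.array(['q1', 'q1', 'q1', 'q2', 'q2', 'q2'])
y_true = np.array([2.0, 1.0, 0.0, 2.0, 1.0, 0.0])  # relevance labels
y_pred = np.array([0.9, 0.5, 0.1, 0.2, 0.8, 0.4])  # model scores

metric = pyltr.metrics.NDCG(k=3)
print(metric.calc_mean(qids, y_true, y_pred))   # NDCG of the model ranking
print(metric.calc_mean_random(qids, y_true))    # expected NDCG of a random ranking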
Example #3
def fit_SVM():
    """Fit an SVMRank model (from pysofia)

    returns: d_user_pred, list_user, list_coupon
    list_coupon: list of test coupons
    list_user: list of user IDs
    d_user_pred: key = user, value = predicted ranking of coupons in list_coupon
    """
    # Get data for classification
    X_train, y_train, d_info = prepare_data()

    list_user = d_info["list_user"]
    list_coupon = d_info["list_coupon"]
    no_cpt = d_info["no_cpt"]
    mapper = d_info["mapper"]
    list_col_xgb = d_info["list_col_xgb"]
    list_col_mapper = d_info["list_col_mapper"]

    # Create RankSVM
    RSVM = RankSVM(max_iter=10, alpha=1)
    RSVM.fit(X_train, y_train)

    # Store predictions in a dict
    d_user_pred = {}

    # Load test data in chunks to avoid memory issues
    for index, test in enumerate(
            pd.read_csv("../Data/Data_translated/test_supervised_learning.csv",
                        chunksize=1000 * no_cpt)):
        sys.stdout.write("\rProcessing row " + str(index * 1000 * no_cpt) +
                         " to row " + str((index + 1) * 1000 * no_cpt))
        sys.stdout.flush()
        test = test.fillna(-1)
        temp_list_user = test["USER_ID_hash"].drop_duplicates().values

        test = mapper.transform(test)
        test = pd.DataFrame(test, index=None, columns=list_col_mapper)
        test = test[list_col_xgb]
        X_test = test.values
        y_test = RSVM.rank(X_test)
        for i in range(min(1000, len(temp_list_user))):
            user = temp_list_user[i]
            d_user_pred[user] = y_test[i * no_cpt:(i + 1) * no_cpt]
    print()  # newline after the \r progress output

    # Rescale each user's scores to [0, 1] with MinMaxScaler for blending later on
    for user in list_user:
        list_pred = d_user_pred[user]
        # MinMaxScaler expects a 2-D array: reshape, scale, then flatten back.
        pred = MinMaxScaler().fit_transform(
            np.ravel(list_pred).astype(float).reshape(-1, 1)).ravel()
        d_user_pred[user] = pred

    # Pickle the predictions for future use (binary mode for pickle)
    d_pred = {"list_coupon": list_coupon.tolist(), "d_user_pred": d_user_pred}
    with open("../Data/Data_translated/d_pred_SVM.pickle", "wb") as f:
        pickle.dump(d_pred, f, protocol=pickle.HIGHEST_PROTOCOL)

    return d_user_pred, list_user, list_coupon
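
A minimal sketch (assuming the pickle written above exists at the same relative path) of reading the predictions back for blending:

import pickle

with open("../Data/Data_translated/d_pred_SVM.pickle", "rb") as f:
    d_pred = pickle.load(f)

list_coupon = d_pred["list_coupon"]    # column order of each score vector
d_user_pred = d_pred["d_user_pred"]    # user -> scaled scores, one per coupon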
Example #4
def save(data_dir, model_dir):
    """Fit a RankSVM on the full data set and pickle it to model_dir."""
    # data_dir = '/home/paula/Descargas/tagclouds-api/cstagclouds/extractkeywords/training/*'
    # model_dir = '/home/paula/Descargas/tagclouds-api/cstagclouds/learningtorank/models/unfiltered/'

    x, y, words, qids, rake, groups = (np.array(l)
                                       for l in load_data(data_dir))
    model = RankSVM()
    model.fit(x, y, qids)
    with open("%srankSVM_model.sav" % model_dir, 'wb') as f:
        pickle.dump(model, f)
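
A hedged sketch of loading the model saved above and scoring new keyword features; model_dir and the feature matrix x_new are placeholders, not values from the project:

import pickle
import numpy as np

model_dir = 'models/unfiltered/'   # hypothetical directory
with open("%srankSVM_model.sav" % model_dir, 'rb') as f:
    model = pickle.load(f)

x_new = np.random.rand(5, 10)      # placeholder: 5 keywords, 10 features
scores = model.predict(x_new)      # higher score = ranked higher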
Example #5
def fit_SVM(week_ID):
    """Fit an SVMRank model (from pysofia)

    args: week_ID (validation week)

    returns: d_user_pred, list_user, list_coupon
    list_coupon: list of test coupons
    list_user: list of user IDs
    d_user_pred: key = user, value = predicted ranking of coupons in list_coupon
    """

    print "Fitting SVMrank"

    #Get data for classification
    X_train, y_train, d_info = prepare_data(week_ID)

    list_user = d_info["list_user"]
    list_coupon = d_info["list_coupon"]
    no_cpt = d_info["no_cpt"]
    mapper = d_info["mapper"]
    list_col_xgb = d_info["list_col_xgb"]
    list_col_mapper = d_info["list_col_mapper"]

    # Launch RankSVM
    RSVM = RankSVM(max_iter=10, alpha=0.1)
    RSVM.fit(X_train, y_train)

    # Store predictions in a dict
    d_user_pred = {}

    # Load test data in chunks to avoid memory issues
    test_path = "../Data/Validation/%s/test_supervised_learning_%s.csv" % (
        week_ID, week_ID)
    for index, test in enumerate(pd.read_csv(test_path, chunksize=1000 * no_cpt)):
        sys.stdout.write("\rProcessing row " + str(index * 1000 * no_cpt) +
                         " to row " + str((index + 1) * 1000 * no_cpt))
        sys.stdout.flush()
        test = test.fillna(-1)
        temp_list_user = test["USER_ID_hash"].drop_duplicates().values

        test = mapper.transform(test)
        test = pd.DataFrame(test, index=None, columns=list_col_mapper)
        test = test[list_col_xgb]
        X_test = test.values
        y_test = RSVM.rank(X_test)
        for i in range(min(1000, len(temp_list_user))):
            user = temp_list_user[i]
            d_user_pred[user] = y_test[i * no_cpt:(i + 1) * no_cpt]
    print()  # newline after the \r progress output

    # Sanity check: every expected user received predictions
    # (wrap in list() so a numpy array is not compared elementwise)
    assert list(list_user) == sorted(d_user_pred.keys())

    # Pickle the predictions for future use (binary mode for pickle)
    d_pred = {"list_coupon": list_coupon.tolist(), "d_user_pred": d_user_pred}
    with open("../Data/Validation/%s/d_pred_SVM_%s.pickle" % (week_ID, week_ID), "wb") as f:
        pickle.dump(d_pred, f, protocol=pickle.HIGHEST_PROTOCOL)

    return d_user_pred, list_user, list_coupon
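
A toy sketch of the chunking invariant the loop above relies on: the test CSV is laid out as contiguous blocks of no_cpt rows per user, so a chunksize of 1000 * no_cpt always contains whole users and the y_test slices line up with them (numbers invented for illustration):

import numpy as np

no_cpt = 3                      # coupons per user (toy value)
y_test = np.arange(3 * no_cpt)  # scores for one chunk holding 3 users
for i in range(3):
    # Rows i*no_cpt .. (i+1)*no_cpt - 1 all belong to user i.
    print(i, y_test[i * no_cpt:(i + 1) * no_cpt])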