import numpy as np
import pandas as pd

from utils.kfolds import KFolds, read_csv

# loading the dataframe
path = './data/dataset_augmented.csv'
delimiter = ';'
data_frame, all_headers = read_csv(path, delimiter, verbose=True)

# selecting headers of interest
headers = [
    #'id',
    #'hash_email',
    #'hash_email_conversion',
    #'hash_userid',
    #'rank',
    'occurrences',
    'lifetime',  # too biased?
    'nb_days',
    'nb_idtags',
    'nb_idtags_site',
    'nb_idtags_media',
    #'click_rate',
    'nb_purchases',
    'last_time',
    'nb_ips'
]

# computing the covariance matrix: standardise the selected features first
values = data_frame[headers].values
values = (values - np.mean(values, axis=0)) / np.std(values, axis=0)
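# The listing above stops right after standardisation. A minimal sketch of the
# covariance computation announced in the comment, assuming the standardised
# `values` array and the `headers` list defined above (with standardised
# columns this is effectively the correlation matrix):
cov_matrix = np.cov(values, rowvar=False)  # features as columns
print(pd.DataFrame(cov_matrix, index=headers, columns=headers).round(2))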
import numpy as np

from sklearn.linear_model import LinearRegression  # assuming scikit-learn's implementation

from utils.kfolds import KFolds, read_csv
# scaling, compute_prediction and compute_error are project-local helpers;
# their module path is not shown in this listing.


def main(path, delimiter, verbose):
    # loading the dataframe
    data_frame, all_headers = read_csv(path, delimiter, verbose)

    # selecting headers of interest
    headers = ['id',
               #'hash_email',
               #'hash_email_conversion',
               #'hash_userid',
               'rank',
               'occurrences',
               'lifetime',
               'nb_days',
               'nb_idtags',
               'nb_idtags_site',
               'nb_idtags_media',
               #'click_rate',
               'nb_purchases',
               'last_time',
               'nb_ips']
    headers_to_drop = list(set(all_headers) - set(headers))
    headers_to_scale = headers[:]
    headers_to_scale.remove('id')
    headers_to_scale.remove('rank')

    # K-fold cross-validation
    nb_folds = 10
    fold = KFolds(data_frame, nb_folds)
    missranked_scores_train = []
    missranked_scores_test = []
    for k in range(nb_folds):
        train, test = fold.get_fold(k)
        train = train.sort_values(by='id')
        test = test.sort_values(by='id')

        # dropping columns that are not useful
        for drop in headers_to_drop:
            train = train.drop(columns=drop)
            test = test.drop(columns=drop)

        # train set
        train, mean, std = scaling(train, headers_to_scale)
        train = train.reset_index(drop=True)
        X = train[headers_to_scale].values
        Y = train['rank'] == 1

        # training the linear regression model
        linreg = LinearRegression(fit_intercept=True)
        linreg.fit(X, Y)

        # computing the score on the train set
        Y_score_train = linreg.predict(X)
        Y_predicted_train = compute_prediction(train, Y_score_train, verbose)
        missranked_train, wellranked_train, total_train = compute_error(Y, Y_predicted_train)
        missranked_scores_train.append(missranked_train / total_train)

        # test set (scaled with the statistics fitted on the train set)
        test = scaling(test, headers_to_scale, mean, std)
        test = test.reset_index(drop=True)
        X_test = test[headers_to_scale].values
        Y_test = test['rank'].values == 1

        # computing the score on the test set
        Y_score_test = linreg.predict(X_test)
        Y_predicted_test = compute_prediction(test, Y_score_test, verbose)
        missranked_test, wellranked_test, total_test = compute_error(Y_test, Y_predicted_test)
        missranked_scores_test.append(missranked_test / total_test)

        # printing intermediate results
        if verbose:
            print('\n**** fold ', k, '****')
            print('train set:')
            print(' missranked =', round(missranked_train / total_train, 3))
            print(' wellranked =', round(wellranked_train / total_train, 3))
            print('test set:')
            print(' missranked =', round(missranked_test / total_test, 3))
            print(' wellranked =', round(wellranked_test / total_test, 3))

    # printing the final result
    if verbose:
        print('\n******** MEAN over all folds ********')
        print('Train missranked = ', np.mean(missranked_scores_train))
        print(' Test missranked = ', np.mean(missranked_scores_test))
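# The scaling helper used above is not shown in this listing. From its two call
# sites (fit on the train fold, re-use of the fitted mean/std on the test fold)
# it presumably behaves like the hypothetical sketch below; this is an
# assumption, not the project's actual implementation.
def scaling_sketch(df, columns, mean=None, std=None):
    """Standardise `columns` of `df`; fit the statistics when none are given."""
    df = df.copy()
    if mean is None or std is None:
        mean = df[columns].mean()
        std = df[columns].std()
        df[columns] = (df[columns] - mean) / std
        return df, mean, std
    df[columns] = (df[columns] - mean) / std
    return df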
import numpy as np

from utils.kfolds import read_csv
# RankSVM and compute_error are project-local helpers; their module path is
# not shown in this listing.


def main(train_path, test_path):
    delimiter = ';'
    verbose = True
    data_frame, all_headers = read_csv(train_path, delimiter, verbose)

    delimiter_test = ','
    verbose = True
    data_frame_test, all_headers_test = read_csv(test_path, delimiter_test, verbose)

    headers = [
        'id',
        'rank',
        'occurrences',
        'lifetime',
        'nb_days',
        'nb_idtags',
        'nb_idtags_site',
        'nb_idtags_media',
        'nb_purchases',
        'last_time',
        'nb_ips'
    ]
    headers_test = [
        'id',
        'similarity',
        'occurrences',
        'lifetime',
        'nb_days',
        'nb_idtags',
        'nb_idtags_site',
        'nb_idtags_media',
        'nb_purchases',
        'last_time',
        'nb_ips'
    ]
    headers_to_drop = list(set(all_headers) - set(headers))
    headers_to_drop_test = list(set(all_headers_test) - set(headers_test))
    headers_to_scale = headers[:]
    headers_to_scale.remove('id')
    headers_to_scale.remove('rank')
    headers_to_scale_test = headers_test[:]
    headers_to_scale_test.remove('id')
    headers_to_scale_test.remove('similarity')

    train = data_frame.copy()
    train = train.sort_values(by='id')
    test = data_frame_test.copy()
    test = test.sort_values(by='id')
    for drop in headers_to_drop:
        train = train.drop(columns=drop)
    for drop in headers_to_drop_test:
        test = test.drop(columns=drop)

    train = train.reset_index(drop=True)
    X_train = train[headers_to_scale].values
    Y_train = train[['rank', 'id']].values

    # building the random baseline and the 'arg_max' target
    # (1 for the most similar row of each id, 2 otherwise)
    similarity = np.array(test.loc[:, 'similarity'])
    baseline = np.zeros(similarity.shape)
    test = test.reset_index(drop=True)
    test['arg_max'] = 0
    old_id = test.loc[0, 'id']
    rows = []
    for row in range(test.shape[0]):
        current_id = test.loc[row, 'id']
        if current_id == old_id:
            rows.append(row)
        else:
            sample = np.random.randint(min(rows), max(rows) + 1)
            baseline[sample] = 1
            old_id = current_id
            m = test.loc[rows, 'similarity'].max()
            test.loc[rows, 'arg_max'] = 2 - (test.loc[rows, 'similarity'] == m)
            rows = []
            rows.append(row)
        if row == test.shape[0] - 1:
            sample = np.random.randint(min(rows), max(rows) + 1)
            baseline[sample] = 1
            old_id = current_id
            m = test.loc[rows, 'similarity'].max()
            test.loc[rows, 'arg_max'] = 2 - (test.loc[rows, 'similarity'] == m)
            rows = []
            rows.append(row)

    X_test = test[headers_to_scale].values
    Y_test = test[['arg_max', 'id']].values

    # training the model
    rank_svm = RankSVM()
    rank_svm = rank_svm.fit(X_train, Y_train)

    # =========================================================================
    # # The following part is commented out in order to decrease the execution time
    # # computing score on train set
    # missranked_score_train = 1 - rank_svm.scoreId(X_train, Y_train)
    #
    # # printing intermediate results
    # print('train set:')
    # print(' missranked =', round(missranked_score_train, 3))
    # print(' wellranked =', round(1 - missranked_score_train, 3))
    # =========================================================================

    print('Training finished')

    # computing score on the test set
    Y_predicted_test = rank_svm.predictId(X_test, Y_test)
    missranked_test, wellranked_test, total_test = compute_error(
        Y_test[:, 0], Y_predicted_test)

    # testing the baseline
    missranked_test_baseline, wellranked_test_baseline, total_test_baseline = compute_error(
        Y_test[:, 0], baseline)

    # printing intermediate results
    print('test set:')
    print(' missranked =', round(missranked_test / total_test, 3))
    print(' wellranked =', round(wellranked_test / total_test, 3))
    print('baseline prediction:')
    print(' missranked =', round(missranked_test_baseline / total_test_baseline, 3))
    print(' wellranked =', round(wellranked_test_baseline / total_test_baseline, 3))

    # The two metrics from the PDF
    #df_2 = data_frame_test.copy()
    old_id = test.loc[0, 'id']
    rows = []
    score_1 = 0
    score_2 = 0
    score_1_baseline = 0
    score_2_baseline = 0
    N = 0
    for row in range(len(Y_predicted_test)):
        current_id = test.loc[row, 'id']
        if current_id == old_id:
            rows.append(row)
            if Y_predicted_test[row] == 1:
                prediction = row
                N += 1
            if baseline[row] == 1:
                prediction_baseline = row
        else:
            m = test.loc[rows, 'similarity'].max()
            score_1 += m - test.loc[prediction, 'similarity']
            score_2 += m * test.loc[prediction, 'similarity']
            score_1_baseline += m - test.loc[prediction_baseline, 'similarity']
            score_2_baseline += m * test.loc[prediction_baseline, 'similarity']
            #print(str(N) + ' : order ' + str(old_id) + ' nb_rows = ' + str(len(rows)) + ' ; sim_max = ' + str(m) + ' ; sim_predicted = ' + str(test.loc[prediction, 'similarity']) + ' ; sim_baseline = ' + str(test.loc[prediction_baseline, 'similarity']))
            old_id = current_id
            rows = []
            rows.append(row)
            if Y_predicted_test[row] == 1:
                prediction = row
                N += 1
            if baseline[row] == 1:
                prediction_baseline = row
        if row == len(Y_predicted_test) - 1:
            m = test.loc[rows, 'similarity'].max()
            score_1 += m - test.loc[prediction, 'similarity']
            score_2 += m * test.loc[prediction, 'similarity']
            score_1_baseline += m - test.loc[prediction_baseline, 'similarity']
            score_2_baseline += m * test.loc[prediction_baseline, 'similarity']
            #print(str(N) + ' : order ' + str(old_id) + ' nb_rows = ' + str(len(rows)) + ' ; sim_max = ' + str(m) + ' ; sim_predicted = ' + str(test.loc[prediction, 'similarity']) + ' ; sim_baseline = ' + str(test.loc[prediction_baseline, 'similarity']))

    score_1 /= N
    score_2 /= N
    score_1_baseline /= N
    score_2_baseline /= N
    print('score_1 = ' + str(score_1))
    print('score_2 = ' + str(score_2))
    print('score_1_baseline = ' + str(score_1_baseline))
    print('score_2_baseline = ' + str(score_2_baseline))
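# For reference, the two metrics computed row by row above can also be obtained
# with a pandas groupby. A minimal sketch, assuming the `test` dataframe and
# `Y_predicted_test` defined above; it is equivalent up to edge cases (groups
# with no predicted row, or with several), which the loop above handles by
# keeping the last prediction seen in each group.
def pdf_metrics_sketch(test, predicted):
    df = test.copy()
    df['predicted'] = predicted
    # maximum similarity within each id group, broadcast back to every row
    sim_max = df.groupby('id')['similarity'].transform('max')
    picked = df[df['predicted'] == 1]
    score_1 = (sim_max[picked.index] - picked['similarity']).mean()
    score_2 = (sim_max[picked.index] * picked['similarity']).mean()
    return score_1, score_2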
import numpy as np

from utils.kfolds import KFolds, read_csv
# RankSVM is a project-local helper; its module path is not shown in this listing.


def main(path, delimiter, score, threshold, verbose):
    # loading the dataframe
    data_frame, all_headers = read_csv(path, delimiter, verbose)

    # selecting headers of interest
    headers = ['id',
               #'hash_email',
               #'hash_email_conversion',
               #'hash_userid',
               'rank',
               'occurrences',
               'lifetime',
               'nb_days',
               'nb_idtags',
               'nb_idtags_site',
               'nb_idtags_media',
               #'click_rate',
               'nb_purchases',
               'last_time',
               'nb_ips']
    headers_to_drop = list(set(all_headers) - set(headers))
    headers_to_scale = headers[:]
    headers_to_scale.remove('id')
    headers_to_scale.remove('rank')

    # K-fold cross-validation
    nb_folds = 10
    fold = KFolds(data_frame, nb_folds)
    missranked_scores_train = []
    missranked_scores_test = []
    for k in range(nb_folds):
        # recovering the train and the test set
        train, test = fold.get_fold(k)
        train = train.sort_values(by='id')
        test = test.sort_values(by='id')

        # dropping columns that are not useful
        for drop in headers_to_drop:
            train = train.drop(columns=drop)
            test = test.drop(columns=drop)

        # splitting the features and the [rank, id] columns
        X_train = train[headers_to_scale].values
        Y_train = train[['rank', 'id']].values
        X_test = test[headers_to_scale].values
        Y_test = test[['rank', 'id']].values

        # creating the model
        rank_svm = RankSVM()

        # fitting the model on the train set
        rank_svm = rank_svm.fit(X_train, Y_train)

        if score == 'inversion':
            # computing the missranked score on the train set
            missranked_score_train = 1 - rank_svm.scoreInversion(X_train, Y_train)
            missranked_scores_train.append(missranked_score_train)

            # computing the missranked score on the test set
            missranked_score_test = 1 - rank_svm.scoreInversion(X_test, Y_test)
            missranked_scores_test.append(missranked_score_test)
        elif score == 'thresholdId':
            # computing the missranked score on the train set
            missranked_score_train = 1 - rank_svm.scoreThresholdId(X_train, Y_train, threshold)
            missranked_scores_train.append(missranked_score_train)

            # computing the missranked score on the test set
            missranked_score_test = 1 - rank_svm.scoreThresholdId(X_test, Y_test, threshold)
            missranked_scores_test.append(missranked_score_test)
        elif score == 'id':
            # computing the missranked score on the train set
            missranked_score_train = 1 - rank_svm.scoreId(X_train, Y_train)
            missranked_scores_train.append(missranked_score_train)

            # computing the missranked score on the test set
            missranked_score_test = 1 - rank_svm.scoreId(X_test, Y_test)
            missranked_scores_test.append(missranked_score_test)
        else:
            print('Not a valid score method')
            return

        # printing intermediate results
        if verbose:
            print('\n**** fold ', k, '****')
            print('train set:')
            print(' missranked =', round(missranked_score_train, 3))
            print(' wellranked =', round(1 - missranked_score_train, 3))
            print('test set:')
            print(' missranked =', round(missranked_score_test, 3))
            print(' wellranked =', round(1 - missranked_score_test, 3))

    # printing the final results
    if verbose:
        print('\n******** MEAN over all folds ********')
        print('Train missranked = ', np.mean(missranked_scores_train))
        print(' Test missranked = ', np.mean(missranked_scores_test))
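# The RankSVM class used above is project-local and not shown here. As an
# illustration only, rank-SVMs are usually built on the standard pairwise
# transform: within each query (here, each 'id'), pairs of samples are turned
# into difference vectors and a linear classifier is trained on them. A
# minimal, hypothetical sketch of that transform (not this project's
# implementation); the fitted model could then be, e.g., sklearn's LinearSVC:
#   X_pairs, y_pairs = pairwise_transform_sketch(X_train, Y_train)
#   LinearSVC().fit(X_pairs, y_pairs)
import itertools

import numpy as np


def pairwise_transform_sketch(X, Y):
    """Build (x_i - x_j, sign(rank_i - rank_j)) pairs within each id group."""
    diffs, labels = [], []
    ranks = Y[:, 0]
    ids = Y[:, 1]
    for group in np.unique(ids):
        idx = np.where(ids == group)[0]
        for i, j in itertools.combinations(idx, 2):
            if ranks[i] == ranks[j]:
                continue  # no preference between the two samples, skip the pair
            diffs.append(X[i] - X[j])
            labels.append(np.sign(ranks[i] - ranks[j]))
    return np.array(diffs), np.array(labels)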
import numpy as np

from sklearn.ensemble import GradientBoostingClassifier

from utils.kfolds import read_csv
# compute_prediction and compute_error are project-local helpers; their module
# path is not shown in this listing.


def main(train_path, test_path):
    delimiter = ';'
    verbose = True
    data_frame, all_headers = read_csv(train_path, delimiter, verbose)

    delimiter_test = ','
    verbose = True
    data_frame_test, all_headers_test = read_csv(test_path, delimiter_test, verbose)

    headers = [
        'id',
        'rank',
        'occurrences',
        'lifetime',
        'nb_days',
        'nb_idtags',
        'nb_idtags_site',
        'nb_idtags_media',
        'nb_purchases',
        'last_time',
        'nb_ips'
    ]
    headers_test = [
        'id',
        'similarity',
        'occurrences',
        'lifetime',
        'nb_days',
        'nb_idtags',
        'nb_idtags_site',
        'nb_idtags_media',
        'nb_purchases',
        'last_time',
        'nb_ips'
    ]
    headers_to_drop = list(set(all_headers) - set(headers))
    headers_to_drop_test = list(set(all_headers_test) - set(headers_test))
    headers_to_scale = headers[:]
    headers_to_scale.remove('id')
    headers_to_scale.remove('rank')
    headers_to_scale_test = headers_test[:]
    headers_to_scale_test.remove('id')
    headers_to_scale_test.remove('similarity')

    train = data_frame.copy()
    train = train.sort_values(by='id')
    test = data_frame_test.copy()
    test = test.sort_values(by='id')
    for drop in headers_to_drop:
        train = train.drop(columns=drop)
    for drop in headers_to_drop_test:
        test = test.drop(columns=drop)

    train = train.reset_index(drop=True)
    X = train[headers_to_scale].values
    Y = train['rank'] == 1

    # building the random baseline and the 'arg_max' target
    # (True for the most similar row of each id, False otherwise)
    similarity = np.array(test.loc[:, 'similarity'])
    baseline = np.zeros(similarity.shape)
    test = test.reset_index(drop=True)
    test['arg_max'] = 0
    old_id = test.loc[0, 'id']
    rows = []
    for row in range(test.shape[0]):
        current_id = test.loc[row, 'id']
        if current_id == old_id:
            rows.append(row)
        else:
            sample = np.random.randint(min(rows), max(rows) + 1)
            baseline[sample] = 1
            old_id = current_id
            m = test.loc[rows, 'similarity'].max()
            test.loc[rows, 'arg_max'] = (test.loc[rows, 'similarity'] == m)
            rows = []
            rows.append(row)
        if row == test.shape[0] - 1:
            sample = np.random.randint(min(rows), max(rows) + 1)
            baseline[sample] = 1
            old_id = current_id
            m = test.loc[rows, 'similarity'].max()
            test.loc[rows, 'arg_max'] = (test.loc[rows, 'similarity'] == m)
            rows = []
            rows.append(row)

    X_test = test[headers_to_scale].values
    Y_test = test['arg_max']

    # training the model
    regularization = 1e10  # unused in this listing
    xgb_reg = GradientBoostingClassifier(loss='exponential',
                                         n_estimators=50,
                                         criterion='friedman_mse',
                                         max_depth=5,
                                         verbose=verbose)
    xgb_reg.fit(X, Y)

    # computing score on the train set
    Y_score_train = xgb_reg.predict_proba(X)[:, 1]
    Y_predicted_train = compute_prediction(train, Y_score_train)  #, verbose)
    missranked_train, wellranked_train, total_train = compute_error(
        Y, Y_predicted_train)

    # printing intermediate results
    print('train set:')
    print(' missranked =', round(missranked_train / total_train, 3))
    print(' wellranked =', round(wellranked_train / total_train, 3))

    # computing score on the test set
    Y_score_test = xgb_reg.predict_proba(X_test)[:, 1]
    Y_predicted_test = compute_prediction(test, Y_score_test)  #, verbose)
    missranked_test, wellranked_test, total_test = compute_error(
        Y_test, Y_predicted_test)
    missranked_test_baseline, wellranked_test_baseline, total_test_baseline = compute_error(
        Y_test, baseline)

    # printing intermediate results
    print('test set:')
    print(' missranked =', round(missranked_test / total_test, 3))
    print(' wellranked =', round(wellranked_test / total_test, 3))
    print('baseline prediction:')
    print(' missranked =', round(missranked_test_baseline / total_test_baseline, 3))
    print(' wellranked =', round(wellranked_test_baseline / total_test_baseline, 3))

    # The two metrics from the PDF
    #df_2 = data_frame_test.copy()
    old_id = test.loc[0, 'id']
    rows = []
    score_1 = 0
    score_2 = 0
    score_1_baseline = 0
    score_2_baseline = 0
    N = 0
    for row in range(len(Y_predicted_test)):
        current_id = test.loc[row, 'id']
        if current_id == old_id:
            rows.append(row)
            if Y_predicted_test[row] == 1:
                prediction = row
                N += 1
            if baseline[row] == 1:
                prediction_baseline = row
        else:
            m = test.loc[rows, 'similarity'].max()
            score_1 += m - test.loc[prediction, 'similarity']
            score_2 += m * test.loc[prediction, 'similarity']
            score_1_baseline += m - test.loc[prediction_baseline, 'similarity']
            score_2_baseline += m * test.loc[prediction_baseline, 'similarity']
            #print(str(N) + ' : order ' + str(old_id) + ' nb_rows = ' + str(len(rows)) + ' ; sim_max = ' + str(m) + ' ; sim_predicted = ' + str(test.loc[prediction, 'similarity']) + ' ; sim_baseline = ' + str(test.loc[prediction_baseline, 'similarity']))
            old_id = current_id
            rows = []
            rows.append(row)
            if Y_predicted_test[row] == 1:
                prediction = row
                N += 1
            if baseline[row] == 1:
                prediction_baseline = row
        if row == len(Y_predicted_test) - 1:
            m = test.loc[rows, 'similarity'].max()
            score_1 += m - test.loc[prediction, 'similarity']
            score_2 += m * test.loc[prediction, 'similarity']
            score_1_baseline += m - test.loc[prediction_baseline, 'similarity']
            score_2_baseline += m * test.loc[prediction_baseline, 'similarity']
            #print(str(N) + ' : order ' + str(old_id) + ' nb_rows = ' + str(len(rows)) + ' ; sim_max = ' + str(m) + ' ; sim_predicted = ' + str(test.loc[prediction, 'similarity']) + ' ; sim_baseline = ' + str(test.loc[prediction_baseline, 'similarity']))

    score_1 /= N
    score_2 /= N
    score_1_baseline /= N
    score_2_baseline /= N
    print('score_1 = ' + str(score_1))
    print('score_2 = ' + str(score_2))
    print('score_1_baseline = ' + str(score_1_baseline))
    print('score_2_baseline = ' + str(score_2_baseline))
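# A hypothetical entry point for this script. The train path below is the
# dataset used elsewhere in the project; the test path is a placeholder, since
# the actual test file name is not given in this listing (note the train file
# uses ';' and the test file ',' as delimiters, matching the read_csv calls
# above).
if __name__ == '__main__':
    main('./data/dataset_augmented.csv', './data/test_set.csv')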