def main():
    """Staged preprocessing pipeline: each stage is skipped if its output
    file already exists (stages cache to feather/HDF5 files on disk).

    NOTE(review): the third stage checks for 'joined_final.hd5' but has no
    visible else-branch here; presumably extend_mapping_and_meta persists
    the final file itself — confirm in its implementation.
    """
    # Stage 1: raw join — build it once, then reuse the feather cache.
    if not Path(PATH_RAW + 'joined_raw.fthr').is_file():
        data = join_and_feather()
    else:
        data = load_feather(PATH_RAW + 'joined_raw.fthr')
    # Stage 2: cleaning and ID mapping, cached as an intermediate feather.
    if not Path(PATH_PROCESSED + 'joined_tmp.fthr').is_file():
        data = clean_and_map(data)
        gc.collect()  # free intermediates from the cleaning step
    else:
        data = load_feather(PATH_PROCESSED + 'joined_tmp.fthr')
    # Stage 3: extend mapping with metadata, only if the final HDF5 is missing.
    if not Path(PATH_PROCESSED + 'joined_final.hd5').is_file():
        data = extend_mapping_and_meta(data)
        gc.collect()
def time_features(base_path, log, examples, preprocessed_path=PREPROCESSED_FOLDER, redo=False):
    """Attach time-based features to *examples*, backed by a feather cache.

    If <base_path>/features/time_features.fthr exists (and redo is False),
    the cached features are loaded, restricted to sessions present in
    *examples*, and copied onto the example frame. Otherwise the features
    are computed, memory-reduced, and the cache file is written.
    """
    feature_name = 'time_features'
    cache_file = Path(base_path + 'features/' + feature_name + '.fthr')

    if cache_file.is_file() and not redo:
        # Cache hit: load and keep only rows for the sessions we need.
        cached = load_feather(cache_file)
        wanted_sessions = examples.session_id.unique()
        cached = cached[cached.session_id.isin(wanted_sessions)]
        return copy_features(examples, cached)

    # Cache miss: compute, shrink, and persist for the next run.
    examples, new_cols = create_features(log, examples, preprocessed_path=preprocessed_path)
    examples = reduce_mem_usage(examples, cols=new_cols)
    write_feather(examples[['session_id', 'impressions'] + list(new_cols)], cache_file)
    print_col_list(new_cols)
    return examples
def latent_sim_features(base_path, log, examples, latent_path=None, keys=KEYS, sizes=SIZES, redo=False):
    """Attach latent-similarity features to *examples*, with a feather cache.

    *latent_path* defaults to *base_path* when not given. Cached features at
    <base_path>/features/latent_sim_features.fthr are reused unless *redo*
    is set; otherwise they are created, memory-reduced, and written out.
    """
    if latent_path is None:
        latent_path = base_path

    feature_name = 'latent_sim_features'
    cache_file = Path(base_path + 'features/' + feature_name + '.fthr')

    if cache_file.is_file() and not redo:
        # Cache hit: restrict the stored features to the current sessions.
        cached = load_feather(cache_file)
        wanted_sessions = examples.session_id.unique()
        cached = cached[cached.session_id.isin(wanted_sessions)]
        return copy_features(examples, cached)

    # Cache miss: compute from the latent vectors, then persist.
    examples, new_cols = create_features(log, examples, latent_path=latent_path, keys=keys, sizes=sizes)
    examples = reduce_mem_usage(examples, cols=new_cols)
    write_feather(examples[['session_id', 'impressions'] + list(new_cols)], cache_file)
    print_col_list(new_cols)
    return examples
def pop_features(base_path, log, examples, hidden=False, min_pop=None, train_only=False, redo=False):
    """Attach item-popularity features to *examples*, with a feather cache.

    The cache file name encodes the configuration (hidden / min_pop /
    train_only) so different variants are cached independently.
    """
    feature_name = 'pop_features'
    if hidden:
        feature_name += '_hidden'
    if min_pop is not None:
        feature_name += '_mp' + str(min_pop)
    if train_only:
        feature_name += '_trainonly'
    cache_file = Path(base_path + 'features/' + feature_name + '.fthr')

    if cache_file.is_file() and not redo:
        # Cache hit: only keep rows for sessions in the current example set.
        cached = load_feather(cache_file)
        wanted_sessions = examples.session_id.unique()
        cached = cached[cached.session_id.isin(wanted_sessions)]
        return copy_features(examples, cached)

    # Cache miss: compute popularity features and persist them.
    examples, new_cols = create_features(log, examples, hidden=hidden, min_pop=min_pop, train_only=train_only)
    examples = reduce_mem_usage(examples)
    write_feather(examples[['session_id', 'impressions'] + list(new_cols)], cache_file)
    print_col_list(new_cols)
    return examples
def meta_features(base_path, meta_path, log, examples, latent='d2v', latent_size=16, redo=False):
    """Attach item-metadata features to *examples*, backed by a feather cache.

    The cache name encodes the latent configuration: '_all' when *latent*
    is None (all raw metadata), otherwise '_<latent_size>'.

    Fix: compare against None with ``is`` instead of ``==`` (PEP 8 —
    ``==`` can be hijacked by a custom ``__eq__``; identity is intended here).
    """
    name = 'meta_features'
    if latent is None:
        name += '_all'
    else:
        name += '_' + str(latent_size)
    path = Path(base_path + 'features/' + name + '.fthr')

    if path.is_file() and not redo:
        # Cache hit: load and restrict to the sessions in *examples*.
        features = load_feather(path)
        features = features[features.session_id.isin(examples.session_id.unique())]
        examples = copy_features(examples, features)
    else:
        # Cache miss: compute from metadata, shrink memory, persist.
        examples, cols = create_features(meta_path, log, examples, latent_prefix=latent, latent_size=latent_size)
        examples = reduce_mem_usage(examples)
        write_feather(examples[['session_id', 'impressions'] + list(cols)], path)
        print_col_list(cols)
    return examples
def crawl_features(base_path, crawl_path, log, examples, redo=False):
    """Attach crawled-data features to *examples*, with a feather cache.

    Reuses <base_path>/features/crawl_features.fthr when present (unless
    *redo*); otherwise computes the features from *crawl_path* and writes
    the cache.
    """
    feature_name = 'crawl_features'
    cache_file = Path(base_path + 'features/' + feature_name + '.fthr')

    if cache_file.is_file() and not redo:
        # Cache hit: restrict to sessions present in the example set.
        cached = load_feather(cache_file)
        wanted_sessions = examples.session_id.unique()
        cached = cached[cached.session_id.isin(wanted_sessions)]
        return copy_features(examples, cached)

    # Cache miss: compute from the crawl data and persist.
    examples, new_cols = create_features(crawl_path, log, examples)
    examples = reduce_mem_usage(examples)
    write_feather(examples[['session_id', 'impressions'] + list(new_cols)], cache_file)
    print_col_list(new_cols)
    return examples
def session_features(base_path, log, examples, price_path=None, crawl_path=CRAWL_FOLDER, poi_path=POI_FOLDER, redo=False):
    """Attach session-level features to *examples*, with a feather cache.

    *price_path* defaults to *base_path* when not given. Cached features at
    <base_path>/features/session_features.fthr are reused unless *redo* is
    set; otherwise they are created, memory-reduced, and written out.
    """
    if price_path is None:
        price_path = base_path

    feature_name = 'session_features'
    cache_file = Path(base_path + 'features/' + feature_name + '.fthr')

    if cache_file.is_file() and not redo:
        # Cache hit: keep only rows for sessions in the current examples.
        cached = load_feather(cache_file)
        wanted_sessions = examples.session_id.unique()
        cached = cached[cached.session_id.isin(wanted_sessions)]
        return copy_features(examples, cached)

    # Cache miss: compute from log + price/crawl/poi sources, persist.
    examples, new_cols = create_features(log, examples, price_path=price_path, crawl_path=crawl_path, poi_path=poi_path)
    examples = reduce_mem_usage(examples, cols=new_cols)
    write_feather(examples[['session_id', 'impressions'] + list(new_cols)], cache_file)
    print_col_list(new_cols)
    return examples
def list_context_features(base_path, log, examples, shifts=SHIFTS, redo=False):
    """Attach impression-list context features to *examples*, cached per
    *shifts* configuration (the shift spec is encoded in the cache name).
    """
    feature_name = 'list_context_features_' + str(shifts)
    cache_file = Path(base_path + 'features/' + feature_name + '.fthr')

    if cache_file.is_file() and not redo:
        # Cache hit: restrict to sessions present in the example set.
        cached = load_feather(cache_file)
        wanted_sessions = examples.session_id.unique()
        cached = cached[cached.session_id.isin(wanted_sessions)]
        return copy_features(examples, cached)

    # Cache miss: compute shifted-context features and persist them.
    examples, new_cols = create_features(log, examples, shifts=shifts)
    examples = reduce_mem_usage(examples)
    write_feather(examples[['session_id', 'impressions'] + list(new_cols)], cache_file)
    print_col_list(new_cols)
    return examples
def add_last_poi( poi_path, log ):
    """Add a 'last_poi' column to *log*: for each row, the reference of the
    most recent POI action within the same session and city (-1 if none yet).

    The full computation runs over the complete log (data_log.hd5) and is
    cached to <poi_path>/last_poi.fthr; on later calls only the cached
    column is loaded and filtered to the sessions present in *log*.

    NOTE(review): assigning via .values assumes the filtered cache rows are
    in the same order and count as *log*'s rows — confirm upstream ordering.
    """
    def _add_last_poi(row, save=None):
        # Row-wise accumulator; `save` is a dict carrying state across rows
        # (presumably supplied and threaded through by the project's
        # `apply` helper — rows must arrive in session order for this to
        # be correct; TODO confirm).
        session = row[0]
        action = row[1]
        ref = row[2]
        city = row[3]
        # New session (or very first row): reset the tracked POI.
        # Precedence note: parses as (A and B) or C, which is the intent.
        if 'session' in save and save['session'] != session or not 'session' in save: #new session
            save['session'] = session
            save['last_poi'] = -1
        # New city within the stream: also resets the tracked POI.
        if 'city' in save and save['city'] != city or not 'city' in save: #new city
            save['city'] = city
            save['last_poi'] = -1
        # A POI action with a valid (non-NaN) reference updates the state.
        if action == POI and not np.isnan( ref ):
            save['last_poi'] = ref
        return save['last_poi']

    file = poi_path + 'last_poi.fthr'
    if not Path( file ).is_file():
        # Cache miss: compute over the full log and persist only the columns
        # needed to re-join later.
        log_full = load_hdfs( poi_path + 'data_log.hd5' )
        log_full['last_poi'] = apply(log_full, ['session_id','action_type','reference','city'], _add_last_poi, verbose=100000)
        write_feather( log_full[['session_id','last_poi']], file )
    last_poi = load_feather( file )
    print( len(last_poi) )
    # Keep only rows for sessions present in the given (possibly subset) log.
    last_poi = last_poi[last_poi.session_id.isin( log.session_id.unique() )]
    print( len(last_poi) )
    print( len(log) )
    # Positional assignment — relies on row order matching (see note above).
    log['last_poi'] = last_poi['last_poi'].values
    del last_poi
    return log
def main():
    """Train LightGBM models with cross-validation, average the per-fold
    predictions on the held-out test split, and write two ranked solution
    CSVs (min-max-normalized and raw probabilities), then evaluate.

    Relies on module-level config: BASE_PATH, SET, CONF, DSKEY, FEATURES,
    SPLITS, SHUFFLE, LTR, MAX_EPOCHS, STOPPING, ALGKEY.
    """
    # Build (or load) the full feature set; split into train/test parts.
    train = create_set(base_path=BASE_PATH + SET, conf=CONF, key=DSKEY, redo=False)
    test = train.query('train == 0')
    test_file_key = DSKEY
    ensure_dir(BASE_PATH + SET + 'tmp/')
    test_file = BASE_PATH + SET + 'tmp/' + test_file_key + '_test.fthr'
    if not Path(test_file).is_file():
        # Persist the test split so each fold can reload it cheaply.
        test = test.reset_index(drop=True)
        test.to_feather(test_file)
    test_len = len(test)
    del test
    gc.collect()  # aggressive memory management — frames here are large

    train.query('train == 1', inplace=True)
    # session_id is kept alongside the features for LTR group construction.
    X = train[FEATURES + ['session_id']]
    y = train['label']
    del train
    gc.collect()

    # One row of fold predictions per CV split.
    score = np.zeros((SPLITS, test_len))
    i = 0
    for train_idx, val_idx in train_test_cv(X, y, splits=SPLITS, shuffle=SHUFFLE):
        X_train = X.loc[train_idx]
        X_valid = X.loc[val_idx]
        y_train = y.loc[train_idx]
        y_valid = y.loc[val_idx]
        if LTR:
            # Learning-to-rank: LightGBM needs per-query group sizes
            # (number of impressions per session).
            q_train = X_train.groupby(['session_id']).size().values.astype(np.float32)
            q_valid = X_valid.groupby(['session_id']).size().values.astype(np.float32)
            xtrain = X_train[FEATURES].values.astype(np.float32)
            ytrain = y_train.values.astype(np.float32)
            del X_train, y_train
            gc.collect()
            d_train = lgbm.Dataset(
                xtrain, label=ytrain, group=q_train,
                feature_name=FEATURES)  # , categorical_feature=CAT_FEATURES )
            del q_train
            gc.collect()
            xval = X_valid[FEATURES].values.astype(np.float32)
            yval = y_valid.values.astype(np.float32)
            del X_valid, y_valid
            gc.collect()
            d_valid = lgbm.Dataset(
                xval, label=yval, group=q_valid,
                feature_name=FEATURES)  # , categorical_feature=CAT_FEATURES )
            del q_valid
            gc.collect()
        else:
            # Plain binary classification over individual impressions.
            d_train = lgbm.Dataset(
                X_train[FEATURES], label=y_train,
                feature_name=FEATURES)  # + ['session_id'])#, categorical_feature=CAT_FEATURES )
            d_valid = lgbm.Dataset(
                X_valid[FEATURES], label=y_valid,
                feature_name=FEATURES)  # + ['session_id'])#, categorical_feature=CAT_FEATURES )
        watchlist = [d_train, d_valid]

        params = {}
        params['boosting'] = 'dart'
        params['learning_rate'] = 0.1
        if LTR:
            params['application'] = 'lambdarank'
            params['metric'] = 'ndcg'
            params['eval_at'] = '30'
        else:
            params['application'] = 'binary'
            params['metric'] = 'binary_logloss'
        # params['max_depth'] = -1
        # params['num_leaves'] = 64
        # params['max_bin'] = 512
        params['feature_fraction'] = 0.5
        params['bagging_fraction'] = 0.5
        # params['min_data_in_leaf'] = 20
        # params['verbosity'] = 0

        evals_result = {}
        model = lgbm.train(params, train_set=d_train, num_boost_round=MAX_EPOCHS,
                           valid_sets=watchlist, early_stopping_rounds=STOPPING,
                           evals_result=evals_result, verbose_eval=10)
        ensure_dir(BASE_PATH + SET + 'lgbm/')
        model.save_model(
            BASE_PATH + SET + 'lgbm/' + ALGKEY + '.' + str(i) + '.txt',
            num_iteration=model.best_iteration,
        )
        del params, watchlist, d_train, d_valid, evals_result
        gc.collect()

        # Predict on the persisted test split with this fold's model.
        test = load_feather(test_file)
        X_test = test[FEATURES].values.astype(np.float32)
        y_test = model.predict(X_test, num_iteration=model.best_iteration)
        score[i] = y_test
        i += 1
        del y_test, model, X_test, test
        gc.collect()

    # Average fold predictions, both raw and min-max-normalized per fold.
    test = load_feather(test_file)
    test['prob_norm'] = 0
    test['prob_direct'] = 0
    for i in range(SPLITS):
        test['prob_direct_' + str(i)] = score[i]
        # NOTE(review): 'prob_norm' + str(i) lacks the underscore that
        # 'prob_direct_' uses — written and read consistently, so it works,
        # but the column naming is inconsistent.
        test['prob_norm' + str(i)] = (test['prob_direct_' + str(i)] -
                                      test['prob_direct_' + str(i)].min()) / (
            test['prob_direct_' + str(i)].max() - test['prob_direct_' + str(i)].min())
        test['prob_direct'] += test['prob_direct_' + str(i)]
        test['prob_norm'] += test['prob_norm' + str(i)]
    test['prob_norm'] = test['prob_norm'] / SPLITS
    test['prob_direct'] = test['prob_direct'] / SPLITS

    # truth = pd.read_csv( self.folder + 'truth.csv' )
    # truth['label2'] = 1
    # test = test.merge( truth[['session_id','reference','label2']], left_on=['session_id','impressions'], right_on=['session_id','reference'], how='left' )
    # test['label'] = test['label2'].fillna(0)
    # del test['label2']

    # Solution ranked by normalized probability.
    test = test.sort_values(['session_id', 'prob_norm'], ascending=False)
    # test.to_csv( BASE_PATH + SET + 'test_debugcv.csv' )
    solution = pd.DataFrame()
    solution['recommendations'] = test.groupby('session_id').impressions.apply(
        list)
    solution['confidences'] = test.groupby('session_id').prob_norm.apply(list)
    # NOTE(review): reset_index return value is discarded (not in-place),
    # so this line has no effect; the merge below relies on the
    # session_id index level instead.
    solution.reset_index(drop=True)
    solution = solution.merge(
        test[['session_id', 'user_id', 'timestamp',
              'step']].drop_duplicates(keep='last'),
        on='session_id', how='inner')
    solution.to_csv(BASE_PATH + '/' + SET + '/solution_' + ALGKEY + '_norm.csv')

    # Solution ranked by raw (direct) probability.
    test = test.sort_values(['session_id', 'prob_direct'], ascending=False)
    solution = pd.DataFrame()
    solution['recommendations'] = test.groupby('session_id').impressions.apply(
        list)
    solution['confidences'] = test.groupby('session_id').prob_direct.apply(
        list)
    # NOTE(review): same no-op reset_index as above.
    solution.reset_index(drop=True)
    solution = solution.merge(
        test[['session_id', 'user_id', 'timestamp',
              'step']].drop_duplicates(keep='last'),
        on='session_id', how='inner')
    solution.to_csv(BASE_PATH + '/' + SET + '/solution_' + ALGKEY + '_direct.csv')

    result = evaluate(solution, base=BASE_PATH, dataset=SET)
    print(result.T)
def create_set(base_path=SET, key='dataset', conf=None, redo=False):
    """Build (or load from cache) the full example set with all features.

    Loads <base_path>/sets/<key>.fthr when present (unless *redo*);
    otherwise loads the raw log/examples and runs the full feature
    pipeline, writing the result back to the cache.

    Fix: the default for *conf* was a mutable ``{}`` (shared across calls);
    use the ``None`` sentinel instead. Backward compatible — any caller
    relying on the old empty-dict default would have hit a KeyError on the
    conf lookups below anyway.
    """
    conf = {} if conf is None else conf
    name = key
    path = Path(base_path + 'sets/' + name + '.fthr')
    if path.is_file() and not redo:
        print('loaded')
        examples = load_feather(path)
        gc.collect()
    else:
        print('create')
        log = load_hdfs(base_path + 'data_log.hd5')
        examples = load_hdfs(base_path + 'data_examples.hd5')
        # Drop leftover raw columns that the feature pipeline does not use.
        if 'current_filters' in set(examples.columns):
            print('current_filters')
            del examples['current_filters']
        if 'session_id_pre' in set(examples.columns):
            print('session_id_pre')
            del examples['session_id_pre']
        # Feature pipeline — each step caches itself and returns the
        # extended example frame.
        examples = pop_features(conf['path_pop'], log, examples,
                                hidden=conf['pop_hidden'], min_pop=conf['min_pop'],
                                train_only=conf['train_only'], redo=redo)
        examples = price_features(conf['path_price'], log, examples,
                                  min_occurences=conf['min_occurences'],
                                  hidden=conf['price_hidden'],
                                  train_only=conf['train_only'],
                                  fillna_mean=conf['fillna_mean'], redo=redo)
        examples = session_features(conf['path_session'], log, examples,
                                    crawl_path=conf['path_crawl'], redo=redo)
        examples = crawl_features(base_path, conf['path_crawl'], log, examples, redo=redo)
        examples = geo_features(base_path, conf['path_crawl'], log, examples, redo=redo)
        examples = meta_features(base_path, conf['path_meta'], log, examples,
                                 latent=conf['meta_latent'], redo=redo)
        examples = user_features(conf['path_session'], log, examples,
                                 crawl_path=conf['path_crawl'],
                                 poi_path=conf['path_poi'], redo=redo)
        examples = position_features(base_path, log, examples, redo=redo)
        examples = properties_features(base_path, conf['path_meta'], log, examples, redo=redo)
        # examples = latent_features(base_path, log, examples, latent_path=conf['path_latent'], redo=redo)
        examples = latent_sim_features(base_path, log, examples,
                                       latent_path=conf['path_latent'], redo=redo)
        examples = combine_features(base_path, log, examples, redo=redo)
        examples = rank_features(base_path, log, examples, redo=redo)
        examples = time_features(base_path, log, examples, redo=redo)
        examples = list_context_features(base_path, log, examples, redo=redo)
        examples = stars_features(base_path, conf['path_meta'], log, examples, redo=redo)
        # examples = prediction_features(base_path, log, examples, redo=redo)
        write_feather(examples, path)
        del log
        gc.collect()
    return examples