Example #1
File: transform.py Project: rn5l/rsc19
def main():

    # Stage 1: join the raw inputs into a single feather checkpoint.
    if not Path(PATH_RAW + 'joined_raw.fthr').is_file():
        data = join_and_feather()
    else:
        data = load_feather(PATH_RAW + 'joined_raw.fthr')

    # Stage 2: clean and map the joined data, unless its checkpoint exists.
    if not Path(PATH_PROCESSED + 'joined_tmp.fthr').is_file():
        data = clean_and_map(data)
        gc.collect()
    else:
        data = load_feather(PATH_PROCESSED + 'joined_tmp.fthr')

    # Stage 3: extend mappings and metadata into the final HDF5 output.
    if not Path(PATH_PROCESSED + 'joined_final.hd5').is_file():
        data = extend_mapping_and_meta(data)
        gc.collect()
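
All three stages above follow the same pattern: a stage is recomputed only when its output file is missing, otherwise the cached file is loaded. A minimal sketch of that checkpoint idea, assuming only the standard library; the compute and load callables are placeholders, not the project's real functions:

from pathlib import Path

def checkpoint(path, compute, load):
    # Recompute only when the cached file is missing; otherwise load it.
    if Path(path).is_file():
        return load(path)
    return compute()

# e.g. data = checkpoint(PATH_RAW + 'joined_raw.fthr',
#                        join_and_feather, load_feather)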
Example #2
File: time.py Project: rn5l/rsc19
def time_features(base_path,
                  log,
                  examples,
                  preprocessed_path=PREPROCESSED_FOLDER,
                  redo=False):

    name = 'time_features'

    path = Path(base_path + 'features/' + name + '.fthr')
    if path.is_file() and not redo:
        features = load_feather(path)
        features = features[features.session_id.isin(
            examples.session_id.unique())]
        examples = copy_features(examples, features)
    else:
        examples, cols = create_features(log,
                                         examples,
                                         preprocessed_path=preprocessed_path)
        examples = reduce_mem_usage(examples, cols=cols)
        write_feather(examples[['session_id', 'impressions'] + list(cols)],
                      path)
        #examples[['session_id','impressions','label','step'] + list(cols)].to_csv( base_path + 'features/' + name + '.csv' )
        print_col_list(cols)

    return examples
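
In the cached branch, time_features narrows the stored frame to the sessions present in examples and hands it to copy_features. A hedged stand-in for that helper, assuming it simply left-joins the cached feature columns back onto the examples frame by (session_id, impressions), the same keys write_feather persists above:

def copy_features(examples, features):
    # Join every cached column except the keys back onto the examples frame.
    cols = [c for c in features.columns
            if c not in ('session_id', 'impressions')]
    return examples.merge(features[['session_id', 'impressions'] + cols],
                          on=['session_id', 'impressions'], how='left')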
Example #3
File: latent_sim.py Project: rn5l/rsc19
def latent_sim_features(base_path,
                        log,
                        examples,
                        latent_path=None,
                        keys=KEYS,
                        sizes=SIZES,
                        redo=False):

    name = 'latent_sim_features'
    if latent_path is None:
        latent_path = base_path

    path = Path(base_path + 'features/' + name + '.fthr')
    if path.is_file() and not redo:
        features = load_feather(path)
        features = features[features.session_id.isin(
            examples.session_id.unique())]
        examples = copy_features(examples, features)
    else:
        examples, cols = create_features(log,
                                         examples,
                                         latent_path=latent_path,
                                         keys=keys,
                                         sizes=sizes)
        examples = reduce_mem_usage(examples, cols=cols)
        write_feather(examples[['session_id', 'impressions'] + list(cols)],
                      path)
        #examples[['session_id','impressions','label','step'] + list(cols)].to_csv( base_path + 'features/' + name + '.csv' )
        print_col_list(cols)

    return examples
Example #4
def pop_features(base_path,
                 log,
                 examples,
                 hidden=False,
                 min_pop=None,
                 train_only=False,
                 redo=False):

    name = 'pop_features'
    if hidden:
        name += '_hidden'
    if min_pop is not None:
        name += '_mp' + str(min_pop)
    if train_only:
        name += '_trainonly'

    path = Path(base_path + 'features/' + name + '.fthr')
    if path.is_file() and not redo:
        features = load_feather(path)
        features = features[features.session_id.isin(
            examples.session_id.unique())]
        examples = copy_features(examples, features)
    else:
        examples, cols = create_features(log,
                                         examples,
                                         hidden=hidden,
                                         min_pop=min_pop,
                                         train_only=train_only)
        examples = reduce_mem_usage(examples)
        write_feather(examples[['session_id', 'impressions'] + list(cols)],
                      path)
        #examples[['session_id','impressions','prices','label'] + list(cols)].to_csv( base_path + 'features/' + name + '.csv' )
        print_col_list(cols)

    return examples
Example #5
def meta_features(base_path,
                  meta_path,
                  log,
                  examples,
                  latent='d2v',
                  latent_size=16,
                  redo=False):

    name = 'meta_features'
    if latent is None:
        name += '_all'
    else:
        name += '_' + str(latent_size)

    path = Path(base_path + 'features/' + name + '.fthr')
    if path.is_file() and not redo:
        features = load_feather(path)
        features = features[features.session_id.isin(
            examples.session_id.unique())]
        examples = copy_features(examples, features)
    else:
        examples, cols = create_features(meta_path,
                                         log,
                                         examples,
                                         latent_prefix=latent,
                                         latent_size=latent_size)
        examples = reduce_mem_usage(examples)
        write_feather(examples[['session_id', 'impressions'] + list(cols)],
                      path)
        #examples[['session_id','impressions','prices','label'] + list(cols)].to_csv( base_path + 'features/' + name + '.csv' )
        print_col_list(cols)

    return examples
Example #6
def crawl_features(base_path, crawl_path, log, examples, redo=False):

    name = 'crawl_features'

    path = Path(base_path + 'features/' + name + '.fthr')
    if path.is_file() and not redo:
        features = load_feather(path)
        features = features[features.session_id.isin(
            examples.session_id.unique())]
        examples = copy_features(examples, features)
    else:
        examples, cols = create_features(crawl_path, log, examples)
        examples = reduce_mem_usage(examples)
        write_feather(examples[['session_id', 'impressions'] + list(cols)],
                      path)
        #examples[['session_id','impressions','prices','label'] + list(cols)].to_csv( base_path + 'features/' + name + '.csv' )
        print_col_list(cols)

    return examples
Example #7
def session_features(base_path,
                     log,
                     examples,
                     price_path=None,
                     crawl_path=CRAWL_FOLDER,
                     poi_path=POI_FOLDER,
                     redo=False):

    name = 'session_features'
    if price_path is None:
        price_path = base_path

    path = Path(base_path + 'features/' + name + '.fthr')
    if path.is_file() and not redo:
        features = load_feather(path)
        features = features[features.session_id.isin(
            examples.session_id.unique())]
        examples = copy_features(examples, features)
    else:
        examples, cols = create_features(log,
                                         examples,
                                         price_path=price_path,
                                         crawl_path=crawl_path,
                                         poi_path=poi_path)
        examples = reduce_mem_usage(examples, cols=cols)
        write_feather(examples[['session_id', 'impressions'] + list(cols)],
                      path)
        #examples[['session_id','impressions','label','step'] + list(cols)].to_csv( base_path + 'features/' + name + '.csv' )
        print_col_list(cols)

    return examples
Example #8
def list_context_features(base_path, log, examples, shifts=SHIFTS, redo=False):

    name = 'list_context_features_' + str(shifts)

    path = Path(base_path + 'features/' + name + '.fthr')
    if path.is_file() and not redo:
        features = load_feather(path)
        features = features[features.session_id.isin(
            examples.session_id.unique())]
        examples = copy_features(examples, features)
    else:
        examples, cols = create_features(log, examples, shifts=shifts)
        examples = reduce_mem_usage(examples)
        write_feather(examples[['session_id', 'impressions'] + list(cols)],
                      path)
        #examples[['session_id','impressions','prices','label','position'] + list(cols)].to_csv( base_path + 'features/' + name + '.csv' )
        print_col_list(cols)

    return examples
Example #9
def add_last_poi(poi_path, log):

    def _add_last_poi(row, save=None):

        session = row[0]
        action = row[1]
        ref = row[2]
        city = row[3]

        if 'session' not in save or save['session'] != session:
            # new session: reset the carried-over POI
            save['session'] = session
            save['last_poi'] = -1

        if 'city' not in save or save['city'] != city:
            # new city: reset the carried-over POI
            save['city'] = city
            save['last_poi'] = -1

        if action == POI and not np.isnan(ref):
            save['last_poi'] = ref

        return save['last_poi']

    file = poi_path + 'last_poi.fthr'

    if not Path(file).is_file():
        log_full = load_hdfs(poi_path + 'data_log.hd5')
        log_full['last_poi'] = apply(
            log_full, ['session_id', 'action_type', 'reference', 'city'],
            _add_last_poi,
            verbose=100000)
        write_feather(log_full[['session_id', 'last_poi']], file)

    last_poi = load_feather(file)
    print(len(last_poi))
    last_poi = last_poi[last_poi.session_id.isin(log.session_id.unique())]

    print(len(last_poi))
    print(len(log))
    log['last_poi'] = last_poi['last_poi'].values
    del last_poi

    return log
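
The inner _add_last_poi is a stateful row function: the shared save dict carries the last seen POI forward until the session or the city changes. A sketch of how the project's apply helper could drive it, assuming it walks the rows in order and threads one save dict through every call (the helper name and signature here are illustrative, not the project's real API):

def apply_stateful(df, cols, row_fn):
    # Walk the rows in order, sharing one mutable `save` dict across calls.
    save = {}
    return [row_fn(row, save=save)
            for row in df[cols].itertuples(index=False, name=None)]

# e.g. log_full['last_poi'] = apply_stateful(
#          log_full, ['session_id', 'action_type', 'reference', 'city'],
#          _add_last_poi)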
Example #10
File: lgbm_cv.py Project: rn5l/rsc19
def main():

    train = create_set(base_path=BASE_PATH + SET,
                       conf=CONF,
                       key=DSKEY,
                       redo=False)
    test = train.query('train == 0')

    test_file_key = DSKEY
    ensure_dir(BASE_PATH + SET + 'tmp/')
    test_file = BASE_PATH + SET + 'tmp/' + test_file_key + '_test.fthr'

    if not Path(test_file).is_file():
        test = test.reset_index(drop=True)
        test.to_feather(test_file)

    test_len = len(test)
    del test
    gc.collect()

    train.query('train == 1', inplace=True)

    X = train[FEATURES + ['session_id']]
    y = train['label']

    del train
    gc.collect()

    score = np.zeros((SPLITS, test_len))
    i = 0

    for train_idx, val_idx in train_test_cv(X,
                                            y,
                                            splits=SPLITS,
                                            shuffle=SHUFFLE):

        X_train = X.loc[train_idx]
        X_valid = X.loc[val_idx]
        y_train = y.loc[train_idx]
        y_valid = y.loc[val_idx]

        if LTR:
            q_train = X_train.groupby(
                ['session_id']).size().values.astype(np.float32)
            q_valid = X_valid.groupby(
                ['session_id']).size().values.astype(np.float32)
            xtrain = X_train[FEATURES].values.astype(np.float32)
            ytrain = y_train.values.astype(np.float32)
            del X_train, y_train
            gc.collect()
            d_train = lgbm.Dataset(
                xtrain, label=ytrain, group=q_train,
                feature_name=FEATURES)  #, categorical_feature=CAT_FEATURES )
            del q_train
            gc.collect()
            xval = X_valid[FEATURES].values.astype(np.float32)
            yval = y_valid.values.astype(np.float32)
            del X_valid, y_valid
            gc.collect()
            d_valid = lgbm.Dataset(
                xval, label=yval, group=q_valid,
                feature_name=FEATURES)  #, categorical_feature=CAT_FEATURES )
            del q_valid
            gc.collect()
        else:
            d_train = lgbm.Dataset(
                X_train[FEATURES], label=y_train, feature_name=FEATURES
            )  #+ ['session_id'])#, categorical_feature=CAT_FEATURES )
            d_valid = lgbm.Dataset(
                X_valid[FEATURES], label=y_valid, feature_name=FEATURES
            )  #+ ['session_id'])#, categorical_feature=CAT_FEATURES )

        watchlist = [d_train, d_valid]

        params = {}
        params['boosting'] = 'dart'
        params['learning_rate'] = 0.1
        if LTR:
            params['application'] = 'lambdarank'
            params['metric'] = 'ndcg'
            params['eval_at'] = '30'
        else:
            params['application'] = 'binary'
            params['metric'] = 'binary_logloss'
        #params['max_depth'] = -1
        #params['num_leaves'] = 64
        #params['max_bin'] = 512
        params['feature_fraction'] = 0.5
        params['bagging_fraction'] = 0.5
        #params['min_data_in_leaf'] = 20
        #params['verbosity'] = 0

        evals_result = {}
        model = lgbm.train(params,
                           train_set=d_train,
                           num_boost_round=MAX_EPOCHS,
                           valid_sets=watchlist,
                           early_stopping_rounds=STOPPING,
                           evals_result=evals_result,
                           verbose_eval=10)

        ensure_dir(BASE_PATH + SET + 'lgbm/')
        model.save_model(
            BASE_PATH + SET + 'lgbm/' + ALGKEY + '.' + str(i) + '.txt',
            num_iteration=model.best_iteration,
        )

        del params, watchlist, d_train, d_valid, evals_result
        gc.collect()

        test = load_feather(test_file)

        X_test = test[FEATURES].values.astype(np.float32)

        y_test = model.predict(X_test, num_iteration=model.best_iteration)
        score[i] = y_test
        i += 1

        del y_test, model, X_test, test
        gc.collect()

    test = load_feather(test_file)

    test['prob_norm'] = 0
    test['prob_direct'] = 0
    for i in range(SPLITS):
        test['prob_direct_' + str(i)] = score[i]
        # min-max normalize each split's scores before averaging
        test['prob_norm_' + str(i)] = (
            (test['prob_direct_' + str(i)] -
             test['prob_direct_' + str(i)].min()) /
            (test['prob_direct_' + str(i)].max() -
             test['prob_direct_' + str(i)].min()))
        test['prob_direct'] += test['prob_direct_' + str(i)]
        test['prob_norm'] += test['prob_norm_' + str(i)]

    test['prob_norm'] = test['prob_norm'] / SPLITS
    test['prob_direct'] = test['prob_direct'] / SPLITS

    #truth = pd.read_csv( self.folder + 'truth.csv' )
    #truth['label2'] = 1
    #test = test.merge( truth[['session_id','reference','label2']], left_on=['session_id','impressions'], right_on=['session_id','reference'], how='left' )
    #test['label'] =  test['label2'].fillna(0)
    #del test['label2']

    test = test.sort_values(['session_id', 'prob_norm'], ascending=False)
    #test.to_csv( BASE_PATH + SET + 'test_debugcv.csv' )

    solution = pd.DataFrame()
    solution['recommendations'] = test.groupby('session_id').impressions.apply(
        list)
    solution['confidences'] = test.groupby('session_id').prob_norm.apply(list)
    solution = solution.reset_index()
    solution = solution.merge(
        test[['session_id', 'user_id', 'timestamp',
              'step']].drop_duplicates(keep='last'),
        on='session_id',
        how='inner')
    solution.to_csv(BASE_PATH + '/' + SET + '/solution_' + ALGKEY +
                    '_norm.csv')

    test = test.sort_values(['session_id', 'prob_direct'], ascending=False)
    solution = pd.DataFrame()
    solution['recommendations'] = test.groupby('session_id').impressions.apply(
        list)
    solution['confidences'] = test.groupby('session_id').prob_direct.apply(
        list)
    solution = solution.reset_index()
    solution = solution.merge(
        test[['session_id', 'user_id', 'timestamp',
              'step']].drop_duplicates(keep='last'),
        on='session_id',
        how='inner')
    solution.to_csv(BASE_PATH + '/' + SET + '/solution_' + ALGKEY +
                    '_direct.csv')

    result = evaluate(solution, base=BASE_PATH, dataset=SET)
    print(result.T)
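
In the LTR branch above, LightGBM's lambdarank objective needs per-query group sizes that line up with the row order of the training matrix. A hedged sketch of that dataset construction, assuming the frame is already ordered by session_id; sort=False keeps the groupby output in order of appearance so the sizes match the rows:

import lightgbm as lgbm
import numpy as np

def make_ranking_dataset(frame, labels, feature_cols, group_col='session_id'):
    # Group sizes must follow row order, hence sort=False.
    groups = frame.groupby(group_col, sort=False).size().values
    return lgbm.Dataset(frame[feature_cols].values.astype(np.float32),
                        label=labels.values.astype(np.float32),
                        group=groups,
                        feature_name=list(feature_cols))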
Example #11
File: create_set.py Project: rn5l/rsc19
def create_set(base_path=SET, key='dataset', conf={}, redo=False):

    name = key

    path = Path(base_path + 'sets/' + name + '.fthr')
    if path.is_file() and not redo:
        print('loaded')
        examples = load_feather(path)
        gc.collect()
    else:
        print('create')
        log = load_hdfs(base_path + 'data_log.hd5')
        examples = load_hdfs(base_path + 'data_examples.hd5')
        if 'current_filters' in set(examples.columns):
            print('current_filters')
            del examples['current_filters']
        if 'session_id_pre' in set(examples.columns):
            print('session_id_pre')
            del examples['session_id_pre']

        examples = pop_features(conf['path_pop'],
                                log,
                                examples,
                                hidden=conf['pop_hidden'],
                                min_pop=conf['min_pop'],
                                train_only=conf['train_only'],
                                redo=redo)
        examples = price_features(conf['path_price'],
                                  log,
                                  examples,
                                  min_occurences=conf['min_occurences'],
                                  hidden=conf['price_hidden'],
                                  train_only=conf['train_only'],
                                  fillna_mean=conf['fillna_mean'],
                                  redo=redo)
        examples = session_features(conf['path_session'],
                                    log,
                                    examples,
                                    crawl_path=conf['path_crawl'],
                                    redo=redo)
        examples = crawl_features(base_path,
                                  conf['path_crawl'],
                                  log,
                                  examples,
                                  redo=redo)
        examples = geo_features(base_path,
                                conf['path_crawl'],
                                log,
                                examples,
                                redo=redo)
        examples = meta_features(base_path,
                                 conf['path_meta'],
                                 log,
                                 examples,
                                 latent=conf['meta_latent'],
                                 redo=redo)
        examples = user_features(conf['path_session'],
                                 log,
                                 examples,
                                 crawl_path=conf['path_crawl'],
                                 poi_path=conf['path_poi'],
                                 redo=redo)
        examples = position_features(base_path, log, examples, redo=redo)
        examples = properties_features(base_path,
                                       conf['path_meta'],
                                       log,
                                       examples,
                                       redo=redo)
        #examples = latent_features(base_path, log, examples, latent_path=conf['path_latent'], redo=redo)
        examples = latent_sim_features(base_path,
                                       log,
                                       examples,
                                       latent_path=conf['path_latent'],
                                       redo=redo)
        examples = combine_features(base_path, log, examples, redo=redo)
        examples = rank_features(base_path, log, examples, redo=redo)
        examples = time_features(base_path, log, examples, redo=redo)
        examples = list_context_features(base_path, log, examples, redo=redo)
        examples = stars_features(base_path,
                                  conf['path_meta'],
                                  log,
                                  examples,
                                  redo=redo)
        #examples = prediction_features(base_path, log, examples, redo=redo)

        #examples.to_csv( base_path + 'sets/' + name + '.csv' )
        write_feather(examples, path)

        del log
        gc.collect()

    #print_col_list( examples.columns )
    #examples = reduce_mem_usage(examples)
    return examples
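
A hedged usage sketch for create_set: the conf keys below are the ones the function body reads, but every path and value is a placeholder rather than the project's real layout.

CONF = {
    'path_pop': 'data/set/',
    'pop_hidden': False,
    'min_pop': None,
    'train_only': False,
    'path_price': 'data/set/',
    'min_occurences': None,
    'price_hidden': False,
    'fillna_mean': False,
    'path_session': 'data/set/',
    'path_crawl': 'data/crawl/',
    'path_meta': 'data/meta/',
    'meta_latent': 'd2v',
    'path_poi': 'data/poi/',
    'path_latent': 'data/latent/',
}

examples = create_set(base_path='data/set/', key='dataset_demo',
                      conf=CONF, redo=False)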