Example #1
def latent_sim_features(base_path,
                        log,
                        examples,
                        latent_path=None,
                        keys=KEYS,
                        sizes=SIZES,
                        redo=False):
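    # Latent-similarity features: load them from the feather cache when it
    # exists; otherwise build them from the log, shrink their dtypes, and
    # persist them keyed by session_id and impressions.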

    name = 'latent_sim_features'
    if latent_path is None:
        latent_path = base_path

    path = Path(base_path + 'features/' + name + '.fthr')
    if path.is_file() and not redo:
        features = load_feather(path)
        features = features[features.session_id.isin(
            examples.session_id.unique())]
        examples = copy_features(examples, features)
    else:
        examples, cols = create_features(log,
                                         examples,
                                         latent_path=latent_path,
                                         keys=keys,
                                         sizes=sizes)
        examples = reduce_mem_usage(examples, cols=cols)
        write_feather(examples[['session_id', 'impressions'] + list(cols)],
                      path)
        #examples[['session_id','impressions','label','step'] + list(cols)].to_csv( base_path + 'features/' + name + '.csv' )
        print_col_list(cols)

    return examples
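
All of these feature builders share the same load-or-create caching scheme. Below is a minimal, self-contained sketch of that pattern (cached_frame and build_fn are hypothetical names, and plain pandas feather I/O stands in for the project's load_feather/write_feather helpers):

from pathlib import Path

import pandas as pd


def cached_frame(path, build_fn, redo=False):
    # Return the DataFrame cached at `path`; rebuild it via `build_fn`
    # when the file is missing or redo=True.
    path = Path(path)
    if path.is_file() and not redo:
        return pd.read_feather(path)
    frame = build_fn()
    path.parent.mkdir(parents=True, exist_ok=True)
    # to_feather requires a default RangeIndex, hence the reset.
    frame.reset_index(drop=True).to_feather(path)
    return frame

Each feature builder here is essentially this pattern plus a feature-specific create_features call and a dtype pass through reduce_mem_usage.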
Example #2
File: time.py Project: rn5l/rsc19
def time_features(base_path,
                  log,
                  examples,
                  preprocessed_path=PREPROCESSED_FOLDER,
                  redo=False):
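    # Same cache-or-build pattern for the time-based features, with the
    # preprocessed data folder passed through to create_features.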

    name = 'time_features'

    path = Path(base_path + 'features/' + name + '.fthr')
    if path.is_file() and not redo:
        features = load_feather(path)
        features = features[features.session_id.isin(
            examples.session_id.unique())]
        examples = copy_features(examples, features)
    else:
        examples, cols = create_features(log,
                                         examples,
                                         preprocessed_path=preprocessed_path)
        examples = reduce_mem_usage(examples, cols=cols)
        write_feather(examples[['session_id', 'impressions'] + list(cols)],
                      path)
        #examples[['session_id','impressions','label','step'] + list(cols)].to_csv( base_path + 'features/' + name + '.csv' )
        print_col_list(cols)

    return examples
Example #3
def pop_features(base_path,
                 log,
                 examples,
                 hidden=False,
                 min_pop=None,
                 train_only=False,
                 redo=False):
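    # Popularity features; the cache file name encodes the hidden, min_pop,
    # and train_only variants so each configuration is cached separately.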

    name = 'pop_features'
    if hidden:
        name += '_hidden'
    if min_pop is not None:
        name += '_mp' + str(min_pop)
    if train_only:
        name += '_trainonly'

    path = Path(base_path + 'features/' + name + '.fthr')
    if path.is_file() and not redo:
        features = load_feather(path)
        features = features[features.session_id.isin(
            examples.session_id.unique())]
        examples = copy_features(examples, features)
    else:
        examples, cols = create_features(log,
                                         examples,
                                         hidden=hidden,
                                         min_pop=min_pop,
                                         train_only=train_only)
        examples = reduce_mem_usage(examples)
        write_feather(examples[['session_id', 'impressions'] + list(cols)],
                      path)
        #examples[['session_id','impressions','prices','label'] + list(cols)].to_csv( base_path + 'features/' + name + '.csv' )
        print_col_list(cols)

    return examples
Example #4
def meta_features(base_path,
                  meta_path,
                  log,
                  examples,
                  latent='d2v',
                  latent_size=16,
                  redo=False):
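    # Item-metadata features; the cache file name distinguishes the
    # all-attributes variant (latent=None) from a latent encoding of the
    # given size (e.g. 'd2v' with 16 dimensions).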

    name = 'meta_features'
    if latent is None:
        name += '_all'
    else:
        name += '_' + str(latent_size)

    path = Path(base_path + 'features/' + name + '.fthr')
    if path.is_file() and not redo:
        features = load_feather(path)
        features = features[features.session_id.isin(
            examples.session_id.unique())]
        examples = copy_features(examples, features)
    else:
        examples, cols = create_features(meta_path,
                                         log,
                                         examples,
                                         latent_prefix=latent,
                                         latent_size=latent_size)
        examples = reduce_mem_usage(examples)
        write_feather(examples[['session_id', 'impressions'] + list(cols)],
                      path)
        #examples[['session_id','impressions','prices','label'] + list(cols)].to_csv( base_path + 'features/' + name + '.csv' )
        print_col_list(cols)

    return examples
Example #5
def crawl_features(base_path, crawl_path, log, examples, redo=False):

    # Features from crawled item data; same cache-or-build pattern as the
    # other feature builders.
    name = 'crawl_features'

    path = Path(base_path + 'features/' + name + '.fthr')
    if path.is_file() and not redo:
        features = load_feather(path)
        features = features[features.session_id.isin(
            examples.session_id.unique())]
        examples = copy_features(examples, features)
    else:
        examples, cols = create_features(crawl_path, log, examples)
        examples = reduce_mem_usage(examples)
        write_feather(examples[['session_id', 'impressions'] + list(cols)],
                      path)
        #examples[['session_id','impressions','prices','label'] + list(cols)].to_csv( base_path + 'features/' + name + '.csv' )
        print_col_list(cols)

    return examples
Example #6
def session_features(base_path,
                     log,
                     examples,
                     price_path=None,
                     crawl_path=CRAWL_FOLDER,
                     poi_path=POI_FOLDER,
                     redo=False):

    # Session-level features; the price path defaults to base_path, and the
    # crawl and POI folders feed additional context into create_features.
    name = 'session_features'
    if price_path is None:
        price_path = base_path

    path = Path(base_path + 'features/' + name + '.fthr')
    if path.is_file() and not redo:
        features = load_feather(path)
        features = features[features.session_id.isin(
            examples.session_id.unique())]
        examples = copy_features(examples, features)
    else:
        examples, cols = create_features(log,
                                         examples,
                                         price_path=price_path,
                                         crawl_path=crawl_path,
                                         poi_path=poi_path)
        examples = reduce_mem_usage(examples, cols=cols)
        write_feather(examples[['session_id', 'impressions'] + list(cols)],
                      path)
        #examples[['session_id','impressions','label','step'] + list(cols)].to_csv( base_path + 'features/' + name + '.csv' )
        print_col_list(cols)

    return examples
Example #7
def list_context_features(base_path, log, examples, shifts=SHIFTS, redo=False):
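    # List-context features for the given shift offsets; the shifts value is
    # baked into the cache file name so different settings don't collide.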

    name = 'list_context_features_' + str(shifts)

    path = Path(base_path + 'features/' + name + '.fthr')
    if path.is_file() and not redo:
        features = load_feather(path)
        features = features[features.session_id.isin(
            examples.session_id.unique())]
        examples = copy_features(examples, features)
    else:
        examples, cols = create_features(log, examples, shifts=shifts)
        examples = reduce_mem_usage(examples)
        write_feather(examples[['session_id', 'impressions'] + list(cols)],
                      path)
        #examples[['session_id','impressions','prices','label','position'] + list(cols)].to_csv( base_path + 'features/' + name + '.csv' )
        print_col_list(cols)

    return examples
Example #8
def split_competition(data):
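    # Competition split: optionally restrict training to the last
    # DAYS_TRAIN_COMPETITION days (or a random session sample of equal size),
    # hide the target clickouts, mark post-clickout events as excluded, and
    # write the log and labeled examples to disk.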

    if DAYS_TRAIN_COMPETITION is not None:
        maxtr = datetime.fromtimestamp(data[data.train == 1].timestamp.max())
        mintr = datetime.fromtimestamp(data[data.train == 1].timestamp.min())
        start = maxtr - timedelta(days=DAYS_TRAIN_COMPETITION)

        data['mintimestamp'] = data.groupby('user_id').timestamp.transform(min)

        if RANDOM_TRAIN:
            keep = ((data['mintimestamp'] >= start.timestamp()) &
                    (data.train == 1))
            num_sess = data[keep].session_id.nunique()

            sess = list(data[data.train == 1].session_id.unique())
            shuffle(sess)
            keep = sess[:num_sess]

            keep = (data.train == 0) | data.session_id.isin(keep)
            data = data[keep].copy()
        else:
            keep = ((data['mintimestamp'] >= start.timestamp()) &
                    (data.train == 1))
            keep = keep | (data.train == 0)
            data = data[keep].copy()

        mintr = data[data.train == 1].timestamp.min()
        minva = data[data.train == 0].timestamp.min()
        maxva = data[data.train == 0].timestamp.max()

        print(datetime.fromtimestamp(mintr))
        print(datetime.fromtimestamp(minva))
        print(datetime.fromtimestamp(maxva))

        del data['mintimestamp']
    data['hidden'] = 0

    hide_test = data.reference.isnull() & (data.action_type == CLICK)
    data.loc[hide_test, 'reference'] = np.nan
    data.loc[hide_test, 'hidden'] = 1

    hide_train = data[(data.train == 1)
                      & (data.action_type == CLICK)].copy()  # filter clickout
    hide_train = hide_train.drop_duplicates('user_id', keep='last')
    data.loc[hide_train.index, 'reference'] = np.nan
    data.loc[hide_train.index, 'hidden'] = 1

    tmp = pd.DataFrame()
    tmp['maxstamp'] = data[data.hidden == 1].groupby(
        'session_id').timestamp.max()
    data = data.merge(tmp, right_index=True, left_on='session_id', how='left')
    data['maxstamp'] = data['maxstamp'].fillna(data.timestamp.max())

    data['exclude'] = 0
    data.loc[data.timestamp > data.maxstamp, 'exclude'] = 1
    del data['maxstamp'], tmp

    data = reduce_mem_usage(data)
    write_hdfs(data, TARGET + 'data_log.hd5')
    #data.to_csv( TARGET + 'data_log.csv', index=False )
    data[data.train == 0].to_csv(TARGET + 'data_log_test.csv')

    examples = expand_and_label(data)

    write_hdfs(examples, TARGET + 'data_examples.hd5')
Example #9
def split_sample(data):
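    # Local validation split built from the training portion only: the last
    # DAYS_TEST days become the test fold, target clickouts are hidden, and
    # the ground truth is written out separately for offline evaluation.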

    data = data[data.train == 1].copy()
    data = reduce_mem_usage(data)

    maxtr = datetime.fromtimestamp(data.timestamp.max())
    mintr = datetime.fromtimestamp(data.timestamp.min())
    minva = maxtr - timedelta(days=DAYS_TEST)

    if DAYS_TRAIN is not None:
        mintr = maxtr - timedelta(days=DAYS_TEST + DAYS_TRAIN)

    print(mintr)
    print(maxtr)
    print(minva)

    data['mintimestamp'] = data.groupby('user_id').timestamp.transform(min)

    data['train'] = (data['mintimestamp'] >= mintr.timestamp()).astype(int)
    if RANDOM_TRAIN:
        data.loc[data['mintimestamp'] >= minva.timestamp(), 'train'] = 0
        num_sess = data[data.train == 1].session_id.nunique()

        data['train'] = 1
        data.loc[data['mintimestamp'] >= minva.timestamp(), 'train'] = 0

        sess = list(data[data.train == 1].session_id.unique())
        shuffle(sess)
        keep = sess[:num_sess]

        keep = (data.train == 0) | data.session_id.isin(keep)
        data = data[keep]
    else:
        data = data[data.train == 1].copy()
        data.loc[data['mintimestamp'] >= minva.timestamp(), 'train'] = 0

    print(data[['session_id', 'timestamp', 'mintimestamp', 'train']])

    mintr = data[data.train == 1].timestamp.min()
    minva = data[data.train == 0].timestamp.min()
    maxva = data[data.train == 0].timestamp.max()

    print(datetime.fromtimestamp(mintr))
    print(datetime.fromtimestamp(minva))
    print(datetime.fromtimestamp(maxva))

    print(len(data[data.train == 1]))
    print(len(data[data.train == 0]))

    data = data.reset_index(drop=True)
    del data['mintimestamp']

    #print( len( set(test.session_id.unique()) & set(train.session_id.unique()) ) )

    data['hidden'] = 0
    data['exclude'] = 0

    examples_log = data[data.action_type == CLICK].copy()  # filter clickout
    examples_log = examples_log.drop_duplicates('user_id', keep='last')
    truth = examples_log[examples_log.train == 0]

    #hide all
    data.loc[examples_log.index.values, 'reference'] = np.nan
    data.loc[examples_log.index.values, 'hidden'] = 1

    print('hidden test sum ',
          data[(data.hidden == 1) & (data.train == 0)].hidden.sum())

    tmp = pd.DataFrame()
    tmp['maxstamp'] = data[data.hidden == 1].groupby(
        'session_id').timestamp.max()
    data = data.merge(tmp, right_index=True, left_on='session_id', how='left')
    data['maxstamp'] = data['maxstamp'].fillna(data.timestamp.max())

    data.loc[data.timestamp > data.maxstamp, 'exclude'] = 1
    del data['maxstamp'], tmp

    print('hidden test sum ',
          data[(data.hidden == 1) & (data.train == 0)].hidden.sum())

    examples = expand_and_label(data)

    #hide test completely
    data.loc[examples_log[examples_log.train == 0].index.values,
             'item_id'] = np.nan
    data.loc[examples_log[examples_log.train == 0].index.values,
             'price_session'] = np.nan

    data = reduce_mem_usage(data)
    write_hdfs(data, TARGET + 'data_log.hd5')
    #data.to_csv( TARGET + 'data_log.csv' )
    data[data.train == 0].to_csv(TARGET + 'data_log_test.csv')

    write_hdfs(examples, TARGET + 'data_examples.hd5')
    #examples.to_csv( TARGET + 'data_examples.csv', index=False )

    truth.to_csv(TARGET + 'truth.csv', index=False)

    with open(TARGET + 'size.txt', 'w') as out:
        out.write('train_size: {}, test_size: {}'.format(
            DAYS_TRAIN, DAYS_TEST))