def latent_sim_features(base_path, log, examples, latent_path=None, keys=KEYS, sizes=SIZES, redo=False):
    """Attach the 'latent_sim_features' columns to ``examples`` via a feather cache.

    The cache lives at ``base_path + 'features/latent_sim_features.fthr'``.
    If that file exists and ``redo`` is false, the cached frame is loaded,
    restricted to the session ids found in ``examples`` and handed to
    ``copy_features``. Otherwise the features are rebuilt with
    ``create_features``, passed through ``reduce_mem_usage`` and written to
    the cache together with the ``session_id`` and ``impressions`` columns.

    Args:
        base_path: String prefix of the data folder. ``'features/'`` is
            appended by plain concatenation, so it must already end with a
            path separator.
        log: Forwarded unchanged to ``create_features``.
        examples: Frame exposing a ``session_id`` column (presumably a
            pandas DataFrame -- confirm against callers).
        latent_path: Forwarded to ``create_features``; defaults to
            ``base_path`` when None.
        keys: Forwarded to ``create_features``.
        sizes: Forwarded to ``create_features``.
        redo: When true, ignore an existing cache file and rebuild.

    Returns:
        The result of ``copy_features`` on a cache hit, otherwise the
        rebuilt frame returned by ``reduce_mem_usage``.
    """
    feature_set = 'latent_sim_features'
    cache_file = Path(base_path + 'features/' + feature_set + '.fthr')
    latent_path = base_path if latent_path is None else latent_path

    if cache_file.is_file() and not redo:
        cached = load_feather(cache_file)
        wanted_sessions = examples.session_id.unique()
        cached = cached[cached.session_id.isin(wanted_sessions)]
        return copy_features(examples, cached)

    examples, cols = create_features(log, examples, latent_path=latent_path,
                                     keys=keys, sizes=sizes)
    examples = reduce_mem_usage(examples, cols=cols)

    # Only the join keys plus the new feature columns are persisted.
    cache_columns = ['session_id', 'impressions'] + list(cols)
    write_feather(examples[cache_columns], cache_file)
    # Debugging aid (disabled): dump the features as CSV next to the cache.
    # examples[['session_id','impressions','label','step'] + list(cols)].to_csv(base_path + 'features/' + feature_set + '.csv')
    print_col_list(cols)
    return examples
def time_features(base_path, log, examples, preprocessed_path=PREPROCESSED_FOLDER, redo=False):
    """Attach the 'time_features' columns to ``examples`` via a feather cache.

    The cache lives at ``base_path + 'features/time_features.fthr'``. If it
    exists and ``redo`` is false, the cached frame is loaded, restricted to
    the session ids found in ``examples`` and handed to ``copy_features``.
    Otherwise the features are rebuilt with ``create_features``, passed
    through ``reduce_mem_usage`` and written back to the cache.

    Args:
        base_path: String prefix of the data folder; must end with a path
            separator because ``'features/'`` is concatenated directly.
        log: Forwarded unchanged to ``create_features``.
        examples: Frame exposing a ``session_id`` column.
        preprocessed_path: Forwarded to ``create_features``.
        redo: When true, ignore an existing cache file and rebuild.

    Returns:
        The result of ``copy_features`` on a cache hit, otherwise the
        rebuilt frame returned by ``reduce_mem_usage``.
    """
    feature_set = 'time_features'
    cache_file = Path(base_path + 'features/' + feature_set + '.fthr')

    if cache_file.is_file() and not redo:
        cached = load_feather(cache_file)
        wanted_sessions = examples.session_id.unique()
        cached = cached[cached.session_id.isin(wanted_sessions)]
        return copy_features(examples, cached)

    examples, cols = create_features(log, examples,
                                     preprocessed_path=preprocessed_path)
    examples = reduce_mem_usage(examples, cols=cols)

    # Only the join keys plus the new feature columns are persisted.
    cache_columns = ['session_id', 'impressions'] + list(cols)
    write_feather(examples[cache_columns], cache_file)
    # Debugging aid (disabled): dump the features as CSV next to the cache.
    # examples[['session_id','impressions','label','step'] + list(cols)].to_csv(base_path + 'features/' + feature_set + '.csv')
    print_col_list(cols)
    return examples
def meta_features(base_path, meta_path, log, examples, latent='d2v', latent_size=16, redo=False):
    """Attach the meta features to ``examples`` via a feather cache.

    The cache file name is ``'meta_features_all'`` when ``latent`` is None
    and ``'meta_features_<latent_size>'`` otherwise, stored as
    ``base_path + 'features/' + name + '.fthr'``. If that file exists and
    ``redo`` is false, the cached frame is loaded, restricted to the
    session ids found in ``examples`` and handed to ``copy_features``.
    Otherwise the features are rebuilt with ``create_features``, passed
    through ``reduce_mem_usage`` and written back to the cache.

    Args:
        base_path: String prefix of the data folder; must end with a path
            separator because ``'features/'`` is concatenated directly.
        meta_path: Forwarded unchanged to ``create_features``.
        log: Forwarded unchanged to ``create_features``.
        examples: Frame exposing a ``session_id`` column.
        latent: Forwarded to ``create_features`` as ``latent_prefix``;
            None selects the ``'_all'`` cache name.
        latent_size: Forwarded to ``create_features``; part of the cache
            name when ``latent`` is not None.
        redo: When true, ignore an existing cache file and rebuild.

    Returns:
        The result of ``copy_features`` on a cache hit, otherwise the
        rebuilt frame returned by ``reduce_mem_usage``.
    """
    name = 'meta_features'
    # Identity test: None is a singleton, and ``==`` could be hijacked by a
    # custom __eq__ on the argument.
    if latent is None:
        name += '_all'
    else:
        # NOTE(review): the name encodes latent_size but not the latent
        # prefix itself, so two different non-None prefixes with the same
        # size share one cache file -- confirm this is intended.
        name += '_' + str(latent_size)

    path = Path(base_path + 'features/' + name + '.fthr')

    if path.is_file() and not redo:
        features = load_feather(path)
        # Keep only rows belonging to sessions that occur in ``examples``.
        features = features[features.session_id.isin(examples.session_id.unique())]
        examples = copy_features(examples, features)
    else:
        examples, cols = create_features(meta_path, log, examples,
                                         latent_prefix=latent,
                                         latent_size=latent_size)
        # NOTE(review): no ``cols=`` restriction is passed here, so the
        # whole frame is handed to reduce_mem_usage -- confirm intended.
        examples = reduce_mem_usage(examples)
        # Only the join keys plus the new feature columns are persisted.
        write_feather(examples[['session_id', 'impressions'] + list(cols)], path)
        # Debugging aid (disabled): dump the features as CSV next to the cache.
        # examples[['session_id','impressions','prices','label'] + list(cols)].to_csv(base_path + 'features/' + name + '.csv')
        print_col_list(cols)

    return examples
def resolve_na(train):
    """Report which columns of ``train`` contain missing values, then exit.

    Despite its name, this helper does not repair anything: it collects
    every column for which ``isnull()`` flags at least one entry, prints
    that list through ``print_col_list`` under the name ``'NA_COLS'`` and
    terminates the interpreter.

    Args:
        train: Object exposing ``.columns`` and column access by name whose
            columns provide ``isnull()`` (presumably a pandas DataFrame --
            confirm against callers).

    Raises:
        SystemExit: Always, after the column list has been printed.
    """
    # ``.any()`` is evaluated by the library instead of summing the boolean
    # mask element by element in Python.
    na_cols = [col for col in train.columns if train[col].isnull().any()]
    print_col_list(na_cols, name='NA_COLS')
    # Raise directly rather than calling the ``exit`` helper, which is only
    # injected by the ``site`` module for interactive use.
    raise SystemExit
def crawl_features(base_path, crawl_path, log, examples, redo=False):
    """Attach the 'crawl_features' columns to ``examples`` via a feather cache.

    The cache lives at ``base_path + 'features/crawl_features.fthr'``. If it
    exists and ``redo`` is false, the cached frame is loaded, restricted to
    the session ids found in ``examples`` and handed to ``copy_features``.
    Otherwise the features are rebuilt with ``create_features``, passed
    through ``reduce_mem_usage`` and written back to the cache.

    Args:
        base_path: String prefix of the data folder; must end with a path
            separator because ``'features/'`` is concatenated directly.
        crawl_path: Forwarded unchanged to ``create_features``.
        log: Forwarded unchanged to ``create_features``.
        examples: Frame exposing a ``session_id`` column.
        redo: When true, ignore an existing cache file and rebuild.

    Returns:
        The result of ``copy_features`` on a cache hit, otherwise the
        rebuilt frame returned by ``reduce_mem_usage``.
    """
    feature_set = 'crawl_features'
    cache_file = Path(base_path + 'features/' + feature_set + '.fthr')

    if cache_file.is_file() and not redo:
        cached = load_feather(cache_file)
        wanted_sessions = examples.session_id.unique()
        cached = cached[cached.session_id.isin(wanted_sessions)]
        return copy_features(examples, cached)

    examples, cols = create_features(crawl_path, log, examples)
    examples = reduce_mem_usage(examples)

    # Only the join keys plus the new feature columns are persisted.
    cache_columns = ['session_id', 'impressions'] + list(cols)
    write_feather(examples[cache_columns], cache_file)
    # Debugging aid (disabled): dump the features as CSV next to the cache.
    # examples[['session_id','impressions','prices','label'] + list(cols)].to_csv(base_path + 'features/' + feature_set + '.csv')
    print_col_list(cols)
    return examples
def session_features(base_path, log, examples, price_path=None, crawl_path=CRAWL_FOLDER, poi_path=POI_FOLDER, redo=False):
    """Attach the 'session_features' columns to ``examples`` via a feather cache.

    The cache lives at ``base_path + 'features/session_features.fthr'``. If
    it exists and ``redo`` is false, the cached frame is loaded, restricted
    to the session ids found in ``examples`` and handed to
    ``copy_features``. Otherwise the features are rebuilt with
    ``create_features``, passed through ``reduce_mem_usage`` and written
    back to the cache.

    Args:
        base_path: String prefix of the data folder; must end with a path
            separator because ``'features/'`` is concatenated directly.
        log: Forwarded unchanged to ``create_features``.
        examples: Frame exposing a ``session_id`` column.
        price_path: Forwarded to ``create_features``; defaults to
            ``base_path`` when None.
        crawl_path: Forwarded to ``create_features``.
        poi_path: Forwarded to ``create_features``.
        redo: When true, ignore an existing cache file and rebuild.

    Returns:
        The result of ``copy_features`` on a cache hit, otherwise the
        rebuilt frame returned by ``reduce_mem_usage``.
    """
    feature_set = 'session_features'
    cache_file = Path(base_path + 'features/' + feature_set + '.fthr')
    price_path = base_path if price_path is None else price_path

    if cache_file.is_file() and not redo:
        cached = load_feather(cache_file)
        wanted_sessions = examples.session_id.unique()
        cached = cached[cached.session_id.isin(wanted_sessions)]
        return copy_features(examples, cached)

    examples, cols = create_features(log, examples, price_path=price_path,
                                     crawl_path=crawl_path, poi_path=poi_path)
    examples = reduce_mem_usage(examples, cols=cols)

    # Only the join keys plus the new feature columns are persisted.
    cache_columns = ['session_id', 'impressions'] + list(cols)
    write_feather(examples[cache_columns], cache_file)
    # Debugging aid (disabled): dump the features as CSV next to the cache.
    # examples[['session_id','impressions','label','step'] + list(cols)].to_csv(base_path + 'features/' + feature_set + '.csv')
    print_col_list(cols)
    return examples
def list_context_features(base_path, log, examples, shifts=SHIFTS, redo=False):
    """Attach the list-context feature columns to ``examples`` via a feather cache.

    The cache file name embeds ``str(shifts)``:
    ``base_path + 'features/list_context_features_<shifts>.fthr'``. If that
    file exists and ``redo`` is false, the cached frame is loaded,
    restricted to the session ids found in ``examples`` and handed to
    ``copy_features``. Otherwise the features are rebuilt with
    ``create_features``, passed through ``reduce_mem_usage`` and written
    back to the cache.

    Args:
        base_path: String prefix of the data folder; must end with a path
            separator because ``'features/'`` is concatenated directly.
        log: Forwarded unchanged to ``create_features``.
        examples: Frame exposing a ``session_id`` column.
        shifts: Forwarded to ``create_features``; its string form is part
            of the cache file name.
        redo: When true, ignore an existing cache file and rebuild.

    Returns:
        The result of ``copy_features`` on a cache hit, otherwise the
        rebuilt frame returned by ``reduce_mem_usage``.
    """
    feature_set = 'list_context_features_' + str(shifts)
    cache_file = Path(base_path + 'features/' + feature_set + '.fthr')

    if cache_file.is_file() and not redo:
        cached = load_feather(cache_file)
        wanted_sessions = examples.session_id.unique()
        cached = cached[cached.session_id.isin(wanted_sessions)]
        return copy_features(examples, cached)

    examples, cols = create_features(log, examples, shifts=shifts)
    examples = reduce_mem_usage(examples)

    # Only the join keys plus the new feature columns are persisted.
    cache_columns = ['session_id', 'impressions'] + list(cols)
    write_feather(examples[cache_columns], cache_file)
    # Debugging aid (disabled): dump the features as CSV next to the cache.
    # examples[['session_id','impressions','prices','label','position'] + list(cols)].to_csv(base_path + 'features/' + feature_set + '.csv')
    print_col_list(cols)
    return examples
def price_features(base_path, log, examples, min_occurences=None, hidden=False, train_only=False, fillna_mean=False, redo=False):
    """Attach the price feature columns to ``examples`` via a feather cache.

    The cache file name starts with ``'price_features'`` and gains one
    suffix per active option, in this order: ``'_trainonly'``,
    ``'_hidden'``, ``'_min<min_occurences>'``, ``'_fillmean'``. The file is
    stored as ``base_path + 'features/' + name + '.fthr'``. If it exists
    and ``redo`` is false, the cached frame is loaded, restricted to the
    session ids found in ``examples`` and handed to ``copy_features``.
    Otherwise the features are rebuilt with ``create_features``, passed
    through ``reduce_mem_usage`` and written back to the cache.

    Args:
        base_path: String prefix of the data folder; must end with a path
            separator because ``'features/'`` is concatenated directly.
            Also forwarded to ``create_features``.
        log: Forwarded unchanged to ``create_features``.
        examples: Frame exposing a ``session_id`` column.
        min_occurences: Forwarded to ``create_features``; adds the
            ``'_min...'`` suffix when not None.
        hidden: Forwarded to ``create_features``; adds ``'_hidden'``.
        train_only: Forwarded to ``create_features``; adds ``'_trainonly'``.
        fillna_mean: Forwarded to ``create_features``; adds ``'_fillmean'``.
        redo: When true, ignore an existing cache file and rebuild.

    Returns:
        The result of ``copy_features`` on a cache hit, otherwise the
        rebuilt frame returned by ``reduce_mem_usage``.
    """
    # (condition, suffix) pairs in the order the suffixes are appended.
    suffix_table = (
        (train_only, '_trainonly'),
        (hidden, '_hidden'),
        (min_occurences is not None, '_min' + str(min_occurences)),
        (fillna_mean, '_fillmean'),
    )
    feature_set = 'price_features' + ''.join(
        suffix for enabled, suffix in suffix_table if enabled
    )
    cache_file = Path(base_path + 'features/' + feature_set + '.fthr')

    if cache_file.is_file() and not redo:
        cached = load_feather(cache_file)
        wanted_sessions = examples.session_id.unique()
        cached = cached[cached.session_id.isin(wanted_sessions)]
        return copy_features(examples, cached)

    examples, cols = create_features(base_path, log, examples,
                                     min_occurences=min_occurences,
                                     hidden=hidden,
                                     train_only=train_only,
                                     fillna_mean=fillna_mean)
    examples = reduce_mem_usage(examples, cols=cols)

    # Only the join keys plus the new feature columns are persisted.
    cache_columns = ['session_id', 'impressions'] + list(cols)
    write_feather(examples[cache_columns], cache_file)
    # Debugging aid (disabled): dump the features as CSV next to the cache.
    # examples[['session_id','impressions','prices','city','platform','label'] + list(cols)].to_csv(base_path + 'features/' + feature_set + '.csv')
    print_col_list(cols)
    return examples
def main():
    """Build the example set for ``SET`` and print its column list.

    Calls ``create_set`` with ``conf=None`` and ``redo=True`` (presumably
    forcing regeneration instead of reusing cached results -- confirm in
    ``create_set``) and passes the result to ``print_col_list``.
    """
    dataset = create_set(SET, conf=None, redo=True)
    print_col_list(dataset)