def dump(acc, unc, name):
    acc = reduce_mem_usage(acc)
    unc = reduce_mem_usage(unc)

    save_dir = pathlib.Path('../data/submissions')
    if not save_dir.exists():
        save_dir.mkdir(parents=True)

    now = pd.Timestamp.now()
    acc_name = f'{name}-acc-{now:%Y%m%d}-{now:%H%M%S}.joblib'
    unc_name = f'{name}-unc-{now:%Y%m%d}-{now:%H%M%S}.joblib'
    joblib.dump(acc, save_dir / acc_name, compress=True)
    joblib.dump(unc, save_dir / unc_name, compress=True)
def transform(self, X):
    # On the first call, downcast the frame and remember the resulting dtypes;
    # on later calls, simply cast incoming frames to the remembered dtypes.
    if not isinstance(self.dtypes, dict):
        X = reduce_mem_usage(X)
        self.dtypes = X.dtypes.to_dict()
    else:
        X = X.astype(self.dtypes)
    return X
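# For context, a minimal self-contained sketch of how a dtype-caching
# transform like the one above could sit inside a small transformer-style
# class. The class name, the None initialisation, and the _downcast helper
# (a stand-in for reduce_mem_usage) are assumptions for illustration only.
import pandas as pd


def _downcast(df):
    """Stand-in for reduce_mem_usage: downcast numeric columns."""
    out = df.copy()
    for col in out.columns:
        if pd.api.types.is_integer_dtype(out[col]):
            out[col] = pd.to_numeric(out[col], downcast='integer')
        elif pd.api.types.is_float_dtype(out[col]):
            out[col] = pd.to_numeric(out[col], downcast='float')
    return out


class DtypeCachingReducer:
    """Downcast once, then reuse the learned dtypes on later frames."""

    def __init__(self):
        self.dtypes = None  # becomes a dict after the first transform

    def transform(self, X):
        if not isinstance(self.dtypes, dict):
            X = _downcast(X)
            self.dtypes = X.dtypes.to_dict()
        else:
            X = X.astype(self.dtypes)
        return X


# Usage: learn dtypes on the training frame, then apply them to new data.
reducer = DtypeCachingReducer()
train = reducer.transform(pd.DataFrame({'a': [1, 2, 3], 'b': [0.5, 1.5, 2.5]}))
val = reducer.transform(pd.DataFrame({'a': [4, 5], 'b': [3.5, 4.5]}))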
def save_dataset(df):
    # Split data into train, val, and test
    temp, test = train_test_split(df, stratify=df['Severity'],
                                  test_size=0.1, random_state=42)
    train, val = train_test_split(temp, stratify=temp['Severity'],
                                  test_size=0.1, random_state=42)
    del temp

    # Impute missing values
    train = impute_df(train)
    val = impute_df(val)
    test = impute_df(test)

    # Scale variables to a known range
    scaler = MinMaxScaler(feature_range=(-1, 1))
    scaler = train_scaler(train, scaler)
    train = transform_data(train, scaler)
    val = transform_data(val, scaler)
    test = transform_data(test, scaler)

    df = reduce_mem_usage(df, verbose=False)
    train = reduce_mem_usage(train, verbose=False)
    val = reduce_mem_usage(val, verbose=False)
    test = reduce_mem_usage(test, verbose=False)

    df.to_pickle(f'{NEW_DATA_DIR}/full_df.pkl')
    train.to_pickle(f'{NEW_DATA_DIR}/train.pkl')
    val.to_pickle(f'{NEW_DATA_DIR}/val.pkl')
    test.to_pickle(f'{NEW_DATA_DIR}/test.pkl')
    # train.to_csv(f'{NEW_DATA_DIR}/train.csv')
    # val.to_csv(f'{NEW_DATA_DIR}/val.csv')
    # test.to_csv(f'{NEW_DATA_DIR}/test.csv')
    del train, test, val
def predict_by_saved_model(config):
    args = AttributeDict(config)

    if args.each:
        v_sales_dict = joblib.load(
            '../data/05_preprocess/each_item/v_sales_dict.joblib')
        data_count = joblib.load(
            '../data/05_preprocess/each_item/data_count.joblib')
        dims = joblib.load('../data/05_preprocess/each_item/dims.joblib')
        weight = joblib.load('../data/06_weight/weight_each.joblib')
        te = joblib.load('../data/07_te/each_te.joblib')
    else:
        v_sales_dict = joblib.load(
            '../data/05_preprocess/agg_item/v_sales_dict.joblib')
        data_count = joblib.load(
            '../data/05_preprocess/agg_item/data_count.joblib')
        dims = joblib.load('../data/05_preprocess/agg_item/dims.joblib')
        weight = joblib.load('../data/06_weight/weight_agg.joblib')
        te = joblib.load('../data/07_te/agg_te.joblib')

    v_sales = next(iter(v_sales_dict.values()))
    drop_columns = [
        'sort_key', 'id', 'cat_id', 'd', 'release_date', 'date', 'weekday',
        'year', 'week_of_month', 'holidy'
    ]
    if not args.use_prices:
        drop_columns += [
            'release_ago', 'sell_price', 'diff_price', 'price_max',
            'price_min', 'price_std', 'price_mean', 'price_trend',
            'price_norm', 'diff_price_norm', 'price_nunique', 'dept_max',
            'dept_min', 'dept_std', 'dept_mean', 'price_in_dept',
            'mean_in_dept', 'cat_max', 'cat_min', 'cat_std', 'cat_mean',
            'price_in_cat', 'mean_in_cat', 'price_in_month', 'price_in_year',
        ]
    cat_columns = [
        'aggregation_level', 'item_id', 'dept_id', 'store_id', 'state_id',
        'month', 'event_name_1', 'event_type_1', 'event_name_2',
        'event_type_2', 'day_of_week'
    ]
    features = [
        col for col in v_sales.columns if col not in drop_columns + [TARGET]
    ]
    is_cats = [col in cat_columns for col in features]
    cat_dims = []
    emb_dims = []
    for col in features:
        if col in cat_columns:
            cat_dims.append(dims['cat_dims'][col])
            emb_dims.append(dims['emb_dims'][col])
    dims = pd.DataFrame({'cat_dims': cat_dims, 'emb_dims': emb_dims})

    train_index = 1 if args.useval else 2
    trainset = M5Dataset(v_sales_dict, data_count, features, weight, te,
                         remove_last4w=train_index, min_data_4w=0,
                         over_sample=args.over_sample)
    train_loader = torch.utils.data.DataLoader(
        trainset, batch_size=args.batch_size, shuffle=True,
        num_workers=args.num_workers,
        worker_init_fn=get_worker_init_fn(args.seed))

    model = M5MLPLSTMModel(is_cats, dims, n_hidden=args.n_hidden,
                           dropout=args.dropout, use_te=args.use_te)
    criterion = M5Distribution(dist=args.dist, df=args.df)
    module = M5LightningModule(model, criterion, train_loader, None, None,
                               args)
    if torch.cuda.is_available():
        module = module.cuda()

    filename = '../models/each.ckpt' if args.each else '../models/agg.ckpt'
    module.load_state_dict(torch.load(filename)['state_dict'])
    device = next(iter(module.parameters())).device
    if args.each:
        cuda_rng_state = torch.load('../models/cuda_rng_state_each.dmp')
    else:
        cuda_rng_state = torch.load('../models/cuda_rng_state_agg.dmp')
    torch.cuda.set_rng_state(cuda_rng_state, device=device)

    val_acc, val_unc = predict(args, module, criterion, trainset.data_dict,
                               weight, te, evaluation=False)
    eva_acc, eva_unc = predict(args, module, criterion, trainset.data_dict,
                               weight, te, evaluation=True)
    pred_acc = reduce_mem_usage(pd.concat([val_acc, eva_acc]))
    pred_unc = reduce_mem_usage(pd.concat([val_unc, eva_unc]))
    return pred_acc, pred_unc
if __name__ == "__main__":
    create_dir(NEW_DATA_DIR)

    first_file = True

    # Multiple files
    data = None
    data_filename = ''
    num_sample = None
    filenames = glob('data/accidents_*.csv')
    print('Number of files found', len(filenames))

    # Multiple files, so they need to be handled separately
    for i, f in enumerate(filenames):
        print(f, i)
        if first_file:
            data = load_dataset(f, num_sample)
            first_file = False
        else:
            new_data = load_dataset(f, num_sample)
            data = do_concat(data, new_data)
            data = reduce_mem_usage(data, verbose=False)

    if data is not None:
        save_dataset(data)
def generate_grid_price(self, grid_base_path, grid_price_path):
    prices_df = self.DW.prices_df
    calendar_df = self.DW.calendar_df

    logging.info('generate_grid_price')
    logging.info('load grid_base')
    grid_df = pd.read_pickle(grid_base_path)

    prices_df['price_max'] = prices_df.groupby(
        ['store_id', 'item_id'])['sell_price'].transform('max')
    prices_df['price_min'] = prices_df.groupby(
        ['store_id', 'item_id'])['sell_price'].transform('min')
    prices_df['price_std'] = prices_df.groupby(
        ['store_id', 'item_id'])['sell_price'].transform('std')
    prices_df['price_mean'] = prices_df.groupby(
        ['store_id', 'item_id'])['sell_price'].transform('mean')
    prices_df['price_norm'] = prices_df['sell_price'] / prices_df['price_max']
    prices_df['price_nunique'] = prices_df.groupby(
        ['store_id', 'item_id'])['sell_price'].transform('nunique')
    prices_df['item_nunique'] = prices_df.groupby(
        ['store_id', 'sell_price'])['item_id'].transform('nunique')

    calendar_prices = calendar_df[['wm_yr_wk', 'month', 'year']]
    calendar_prices = calendar_prices.drop_duplicates(subset=['wm_yr_wk'])
    prices_df = prices_df.merge(
        calendar_prices[['wm_yr_wk', 'month', 'year']],
        on=['wm_yr_wk'], how='left')
    del calendar_prices

    prices_df['price_momentum'] = prices_df['sell_price'] / prices_df.groupby(
        ['store_id', 'item_id'])['sell_price'].transform(lambda x: x.shift(1))
    prices_df['price_momentum_m'] = prices_df['sell_price'] / prices_df.groupby(
        ['store_id', 'item_id', 'month'])['sell_price'].transform('mean')
    prices_df['price_momentum_y'] = prices_df['sell_price'] / prices_df.groupby(
        ['store_id', 'item_id', 'year'])['sell_price'].transform('mean')

    prices_df['sell_price_cent'] = [
        math.modf(p)[0] for p in prices_df['sell_price']
    ]
    prices_df['price_max_cent'] = [
        math.modf(p)[0] for p in prices_df['price_max']
    ]
    prices_df['price_min_cent'] = [
        math.modf(p)[0] for p in prices_df['price_min']
    ]

    del prices_df['month'], prices_df['year']

    # logging.info('prices_df.columns: {}'.format(prices_df.columns))
    # logging.info('prices_df.shape: {}'.format(prices_df.shape))
    # logging.info('prices_df - {}'.format(prices_df.head(-5)))
    #         store_id        item_id  wm_yr_wk  sell_price  price_max  ...  price_momentum_m  price_momentum_y  sell_price_cent  price_max_cent  price_min_cent
    # 0           CA_1  HOBBIES_1_002     11121        3.97       3.97  ...               1.0               1.0             0.97            0.97            0.97
    # 1           CA_1  HOBBIES_1_002     11122        3.97       3.97  ...               1.0               1.0             0.97            0.97            0.97
    # 656812      WI_3    FOODS_3_827     11615        1.00       1.00  ...               1.0               1.0             0.00            0.00            0.00
    # 656813      WI_3    FOODS_3_827     11616        1.00       1.00  ...               1.0               1.0             0.00            0.00            0.00

    logging.info('merge prices')
    original_columns = list(grid_df)
    grid_df = grid_df.merge(prices_df,
                            on=['store_id', 'item_id', 'wm_yr_wk'],
                            how='left')
    keep_columns = [
        col for col in list(grid_df) if col not in original_columns
    ]
    grid_df = grid_df[self.main_index_list + keep_columns]
    grid_df = reduce_mem_usage(grid_df)

    # logging.info('grid_df.columns: {}'.format(grid_df.columns))
    # logging.info('grid_df.shape: {}'.format(grid_df.shape))
    # logging.info('grid_df - {}'.format(grid_df.head(-5)))
    #                                    id       d  sell_price  price_max  ...  price_momentum_y  sell_price_cent  price_max_cent  price_min_cent
    # 0       HOBBIES_1_025_CA_1_evaluation     d_1         NaN        NaN  ...               NaN              NaN             NaN             NaN
    # 1       HOBBIES_1_052_CA_1_evaluation     d_1         NaN        NaN  ...               NaN              NaN             NaN             NaN
    # 592185    FOODS_3_194_WI_3_evaluation  d_1948         NaN        NaN  ...               NaN              NaN             NaN             NaN
    # 592186    FOODS_3_282_WI_3_evaluation  d_1948         NaN        NaN  ...               NaN              NaN             NaN             NaN

    logging.info('save grid_price')
    grid_df.to_pickle(grid_price_path)

    del prices_df
    return
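# A minimal, self-contained sketch of the merge pattern used in
# generate_grid_price above: merge a feature table onto a base grid and keep
# only the columns that the merge added (plus the index columns). The toy
# frames and column names below are illustrative, not the project's schema.
import pandas as pd

grid = pd.DataFrame({'id': ['a', 'a', 'b'], 'wm_yr_wk': [11101, 11102, 11101]})
prices = pd.DataFrame({'wm_yr_wk': [11101, 11102], 'sell_price': [3.97, 3.49]})

original_columns = list(grid)
grid = grid.merge(prices, on=['wm_yr_wk'], how='left')
keep_columns = [col for col in list(grid) if col not in original_columns]
grid = grid[['id', 'wm_yr_wk'] + keep_columns]  # analogous to main_index_list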
def preprocess(v_sales, transform_sales=True):
    logging.info('create dims')
    cat_columns = [
        'aggregation_level', 'item_id', 'dept_id', 'cat_id', 'store_id',
        'state_id', 'event_name_1', 'event_type_1', 'event_name_2',
        'event_type_2',
    ]
    label_encoders = {}
    remove_columns = []
    for column in tqdm(cat_columns):
        if column in v_sales.columns:
            encoder = LabelEncoder()
            v_sales[column] = encoder.fit_transform(
                v_sales[column].fillna('NA'))
            label_encoders[column] = encoder
        else:
            remove_columns.append(column)
    for column in remove_columns:
        cat_columns.remove(column)

    v_sales['month'] = v_sales['month'] - 1
    cat_columns = ['day_of_week', 'month'] + cat_columns
    cat_dims = v_sales[cat_columns].nunique()
    dims = pd.DataFrame(cat_dims, columns=['cat_dims'])
    dims['emb_dims'] = cat_dims.apply(lambda x: min(50, (x + 1) // 2))

    minmax_columns = [
        'release_ago', 'wm_yr_wk', 'wday', 'day', 'week', 'year_delta',
        'week_of_month', 'price_nunique',
    ]
    power_columns = [
        'sell_price', 'diff_price', 'price_max', 'price_min', 'price_std',
        'price_mean',
        # 'price_trend',
        'price_norm', 'diff_price_norm', 'dept_max', 'dept_min', 'dept_std',
        'dept_mean', 'cat_max', 'cat_min', 'cat_std', 'cat_mean',
    ]

    logging.info('start MinMaxScaler')
    minmax_scalers = {}
    for column in tqdm(minmax_columns):
        scaler = MinMaxScaler()
        v_sales[column] = scaler.fit_transform(v_sales[[column]])
        minmax_scalers[column] = scaler

    logging.info('start PowerTransformer')
    power_transformers = {}
    for column in tqdm(power_columns):
        logging.info(column)
        scaler = PowerTransformer()
        v_sales[column] = v_sales[column].fillna(0).astype('float64')
        v_sales[column] = scaler.fit_transform(v_sales[[column]])
        v_sales = reduce_mem_usage(v_sales)
        power_transformers[column] = scaler

    logging.info('create data_count')
    data_count = pd.DataFrame(v_sales['id'].value_counts()).reset_index()
    data_count.columns = ['id', 'count']

    logging.info('create v_sales_dict and transform sales')
    id_list = v_sales['id'].unique()
    v_sales = v_sales.set_index(['id', 'sort_key']).sort_index()
    v_sales_dict = {}
    sales_transformers = {}
    for data_id in tqdm(id_list):
        data = v_sales.loc[data_id].reset_index()
        data[TARGET] = data[TARGET].astype('float64')  # Int32 -> float64
        if transform_sales:
            scaler = PowerTransformer()
            scaler.fit(data[[TARGET]].iloc[:-28])
            data[TARGET] = scaler.transform(data[[TARGET]])
            sales_transformers[data_id] = scaler
        v_sales_dict[data_id] = data

    return (v_sales_dict, data_count, dims, label_encoders, minmax_scalers,
            power_transformers, sales_transformers)
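# A hedged sketch of how the per-id sales_transformers returned by
# preprocess() might be used downstream: fit on history excluding the last
# 28 days (as above), then map model outputs back to the original sales
# scale with inverse_transform. The 'sales' column name and the synthetic
# data below are assumptions for illustration only.
import numpy as np
import pandas as pd
from sklearn.preprocessing import PowerTransformer

data = pd.DataFrame({'sales': np.random.poisson(5, 200).astype('float64')})

scaler = PowerTransformer()
scaler.fit(data[['sales']].iloc[:-28])           # exclude the 28-day tail
transformed = scaler.transform(data[['sales']])  # model input space

# Suppose `preds` are predictions in the transformed space for the last 28 days.
preds = transformed[-28:]
preds_in_units = scaler.inverse_transform(preds)  # back to unit sales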
def dump(df, name):
    df = reduce_mem_usage(df)
    save_dir = pathlib.Path('../data/04_agg')
    if not save_dir.exists():
        save_dir.mkdir(parents=True)
    joblib.dump(df, save_dir / f'{name}.joblib', compress=True)
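# Hedged usage sketch for dump() above: the saved frame can be reloaded with
# joblib.load from the same directory; the name 'sales_agg' is illustrative.
import joblib

df = joblib.load('../data/04_agg/sales_agg.joblib')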
import time
import warnings

import lightgbm as lgb
import pandas as pd
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

from util import DefaultConfig
from util import reduce_mem_usage

warnings.filterwarnings('ignore')

start = time.perf_counter()  # time.clock() was removed in Python 3.8

# Training set
traindata = reduce_mem_usage(
    pd.read_hdf(path_or_buf=DefaultConfig.test_traindata_cache_path,
                mode='r', key='train'))
# Labels
label = pd.read_hdf(path_or_buf=DefaultConfig.test_label_cache_path,
                    mode='r', key='label')
# Test set
testdata = reduce_mem_usage(
    pd.read_hdf(path_or_buf=DefaultConfig.test_testdata_cache_path,
                mode='r', key='test'))

# Drop irrelevant features
train = traindata.drop(DefaultConfig.delete_columns, axis=1)
test = testdata.drop(DefaultConfig.delete_columns, axis=1)
import os

import pandas as pd
import yaml

import load_data
from util import get_logger, reduce_mem_usage

logger = get_logger()

file_path = os.path.dirname(__file__)
CONFIG_FILE = '../config/config.yaml'
with open(CONFIG_FILE) as file:
    yml = yaml.safe_load(file)  # yaml.load without a Loader is deprecated
INPUT_DIR_NAME = yml['SETTING']['INPUT_DIR_NAME']


def create_pickle(train, test, specs, train_labels):
    logger.info('save pickle file')
    train.to_pickle(file_path + INPUT_DIR_NAME + 'train.pkl')
    test.to_pickle(file_path + INPUT_DIR_NAME + 'test.pkl')
    specs.to_pickle(file_path + INPUT_DIR_NAME + 'specs.pkl')
    train_labels.to_pickle(file_path + INPUT_DIR_NAME + 'train_labels.pkl')


if __name__ == '__main__':
    train = reduce_mem_usage(load_data.read_train())
    test = reduce_mem_usage(load_data.read_test())
    specs = reduce_mem_usage(load_data.read_specs())
    train_labels = reduce_mem_usage(load_data.read_train_labels())
    create_pickle(train, test, specs, train_labels)