Example #1
def dump(acc, unc, name):
    acc = reduce_mem_usage(acc)
    unc = reduce_mem_usage(unc)
    save_dir = pathlib.Path('../data/submissions')
    if not save_dir.exists():
        save_dir.mkdir(parents=True)
    now = pd.Timestamp.now()
    acc_name = f'{name}-acc-{now:%Y%m%d}-{now:%H%M%S}.joblib'
    unc_name = f'{name}-unc-{now:%Y%m%d}-{now:%H%M%S}.joblib'
    joblib.dump(acc, save_dir / acc_name, compress=True)
    joblib.dump(unc, save_dir / unc_name, compress=True)
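All of these snippets call a reduce_mem_usage helper that the listing itself never shows. For orientation, here is a minimal sketch of the common Kaggle-style implementation, which downcasts numeric columns to the smallest dtype that still holds their values; each project's actual version may differ (for example in float16 handling or logging), so treat this as an assumption rather than the exact code used above.

import numpy as np
import pandas as pd


def reduce_mem_usage(df, verbose=True):
    # Downcast numeric columns to shrink the DataFrame's memory footprint.
    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtype
        if not pd.api.types.is_numeric_dtype(col_type) or pd.api.types.is_bool_dtype(col_type):
            continue
        c_min, c_max = df[col].min(), df[col].max()
        if pd.api.types.is_integer_dtype(col_type):
            for int_type in (np.int8, np.int16, np.int32, np.int64):
                if np.iinfo(int_type).min <= c_min and c_max <= np.iinfo(int_type).max:
                    df[col] = df[col].astype(int_type)
                    break
        else:
            # Floats whose values fit into float32 are downcast; all-NaN columns are left unchanged.
            if np.finfo(np.float32).min <= c_min and c_max <= np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        print(f'Memory usage reduced from {start_mem:.2f} MB to {end_mem:.2f} MB')
    return df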
Example #2
    def transform(self, X):
        if not isinstance(self.dtypes, dict):
            # First call: downcast the frame and cache the resulting dtypes.
            X = reduce_mem_usage(X)
            self.dtypes = X.dtypes.to_dict()
        else:
            # Later calls: reapply the cached dtypes without re-running reduce_mem_usage.
            X = X.astype(self.dtypes)

        return X
def save_dataset(df):
    # Split data into train, val, and test
    temp, test = train_test_split(df,
                                  stratify=df['Severity'],
                                  test_size=0.1,
                                  random_state=42)
    train, val = train_test_split(temp,
                                  stratify=temp['Severity'],
                                  test_size=0.1,
                                  random_state=42)
    del temp

    # Impute missing values
    train = impute_df(train)
    val = impute_df(val)
    test = impute_df(test)

    # Scaling variables to known range
    scaler = MinMaxScaler(feature_range=(-1, 1))
    scaler = train_scaler(train, scaler)

    train = transform_data(train, scaler)
    val = transform_data(val, scaler)
    test = transform_data(test, scaler)

    df = reduce_mem_usage(df, verbose=False)
    train = reduce_mem_usage(train, verbose=False)
    val = reduce_mem_usage(val, verbose=False)
    test = reduce_mem_usage(test, verbose=False)

    df.to_pickle(f'{NEW_DATA_DIR}/full_df.pkl')

    train.to_pickle(f'{NEW_DATA_DIR}/train.pkl')
    val.to_pickle(f'{NEW_DATA_DIR}/val.pkl')
    test.to_pickle(f'{NEW_DATA_DIR}/test.pkl')

    # train.to_csv(f'{NEW_DATA_DIR}/train.csv')
    # val.to_csv(f'{NEW_DATA_DIR}/val.csv')
    # test.to_csv(f'{NEW_DATA_DIR}/test.csv')

    del train, test, val
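The impute_df, train_scaler and transform_data helpers used by save_dataset are not part of this snippet. A minimal sketch of the two scaling helpers, assuming the scaler is fit on the numeric feature columns of the training split only and the 'Severity' target is left untouched (the column handling here is an assumption, not the project's code):

import numpy as np


def train_scaler(train, scaler):
    # Fit the scaler on the numeric feature columns of the training split only.
    cols = train.select_dtypes(include=np.number).columns.drop('Severity', errors='ignore')
    scaler.fit(train[cols])
    return scaler


def transform_data(df, scaler):
    # Apply the already-fitted scaler to the same numeric columns.
    cols = df.select_dtypes(include=np.number).columns.drop('Severity', errors='ignore')
    df = df.copy()
    df[cols] = scaler.transform(df[cols])
    return df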
Example #4
def predict_by_saved_model(config):
    args = AttributeDict(config)
    if args.each:
        v_sales_dict = joblib.load(
            '../data/05_preprocess/each_item/v_sales_dict.joblib')
        data_count = joblib.load(
            '../data/05_preprocess/each_item/data_count.joblib')
        dims = joblib.load('../data/05_preprocess/each_item/dims.joblib')
        weight = joblib.load('../data/06_weight/weight_each.joblib')
        te = joblib.load('../data/07_te/each_te.joblib')
    else:
        v_sales_dict = joblib.load(
            '../data/05_preprocess/agg_item/v_sales_dict.joblib')
        data_count = joblib.load(
            '../data/05_preprocess/agg_item/data_count.joblib')
        dims = joblib.load('../data/05_preprocess/agg_item/dims.joblib')
        weight = joblib.load('../data/06_weight/weight_agg.joblib')
        te = joblib.load('../data/07_te/agg_te.joblib')
    v_sales = next(iter(v_sales_dict.values()))
    drop_columns = [
        'sort_key', 'id', 'cat_id', 'd', 'release_date', 'date', 'weekday',
        'year', 'week_of_month', 'holidy'
    ]
    if not args.use_prices:
        drop_columns += [
            'release_ago',
            'sell_price',
            'diff_price',
            'price_max',
            'price_min',
            'price_std',
            'price_mean',
            'price_trend',
            'price_norm',
            'diff_price_norm',
            'price_nunique',
            'dept_max',
            'dept_min',
            'dept_std',
            'dept_mean',
            'price_in_dept',
            'mean_in_dept',
            'cat_max',
            'cat_min',
            'cat_std',
            'cat_mean',
            'price_in_cat',
            'mean_in_cat',
            'price_in_month',
            'price_in_year',
        ]
    cat_columns = [
        'aggregation_level', 'item_id', 'dept_id', 'store_id', 'state_id',
        'month', 'event_name_1', 'event_type_1', 'event_name_2',
        'event_type_2', 'day_of_week'
    ]
    features = [
        col for col in v_sales.columns if col not in drop_columns + [TARGET]
    ]
    is_cats = [col in cat_columns for col in features]
    cat_dims = []
    emb_dims = []
    for col in features:
        if col in cat_columns:
            cat_dims.append(dims['cat_dims'][col])
            emb_dims.append(dims['emb_dims'][col])
    dims = pd.DataFrame({'cat_dims': cat_dims, 'emb_dims': emb_dims})
    train_index = 1 if args.useval else 2
    trainset = M5Dataset(v_sales_dict,
                         data_count,
                         features,
                         weight,
                         te,
                         remove_last4w=train_index,
                         min_data_4w=0,
                         over_sample=args.over_sample)
    train_loader = torch.utils.data.DataLoader(
        trainset,
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=args.num_workers,
        worker_init_fn=get_worker_init_fn(args.seed))
    model = M5MLPLSTMModel(is_cats,
                           dims,
                           n_hidden=args.n_hidden,
                           dropout=args.dropout,
                           use_te=args.use_te)
    criterion = M5Distribution(dist=args.dist, df=args.df)
    module = M5LightningModule(model, criterion, train_loader, None, None,
                               args)
    if torch.cuda.is_available():
        module = module.cuda()
    filename = '../models/each.ckpt' if args.each else '../models/agg.ckpt'
    module.load_state_dict(torch.load(filename)['state_dict'])
    device = next(iter(module.parameters())).device
    if args.each:
        cuda_rng_state = torch.load('../models/cuda_rng_state_each.dmp')
    else:
        cuda_rng_state = torch.load('../models/cuda_rng_state_agg.dmp')
    torch.cuda.set_rng_state(cuda_rng_state, device=device)
    val_acc, val_unc = predict(args,
                               module,
                               criterion,
                               trainset.data_dict,
                               weight,
                               te,
                               evaluation=False)
    eva_acc, eva_unc = predict(args,
                               module,
                               criterion,
                               trainset.data_dict,
                               weight,
                               te,
                               evaluation=True)
    pred_acc = reduce_mem_usage(pd.concat([val_acc, eva_acc]))
    pred_unc = reduce_mem_usage(pd.concat([val_unc, eva_unc]))
    return pred_acc, pred_unc


if __name__ == "__main__":

    create_dir(NEW_DATA_DIR)

    first_file = True  # Multiple files
    data = None
    data_filename = ''
    num_sample = None

    filenames = glob('data/accidents_*.csv')
    print('Number of files found', len(filenames))

    # multiple files, so they need to be handled separately
    for i, f in enumerate(filenames):
        print(f, i)

        if first_file:
            data = load_dataset(f, num_sample)
            first_file = False
        else:
            new_data = load_dataset(f, num_sample)
            data = do_concat(data, new_data)
            data = reduce_mem_usage(data, verbose=False)

    if data is not None:
        save_dataset(data)
Example #6
    def generate_grid_price(self, grid_base_path, grid_price_path):
        prices_df = self.DW.prices_df
        calendar_df = self.DW.calendar_df

        logging.info('generate_grid_price')
        logging.info('load grid_base')
        grid_df = pd.read_pickle(grid_base_path)

        prices_df['price_max'] = prices_df.groupby(
            ['store_id', 'item_id'])['sell_price'].transform('max')
        prices_df['price_min'] = prices_df.groupby(
            ['store_id', 'item_id'])['sell_price'].transform('min')
        prices_df['price_std'] = prices_df.groupby(
            ['store_id', 'item_id'])['sell_price'].transform('std')
        prices_df['price_mean'] = prices_df.groupby(
            ['store_id', 'item_id'])['sell_price'].transform('mean')
        prices_df[
            'price_norm'] = prices_df['sell_price'] / prices_df['price_max']
        prices_df['price_nunique'] = prices_df.groupby(
            ['store_id', 'item_id'])['sell_price'].transform('nunique')
        prices_df['item_nunique'] = prices_df.groupby(
            ['store_id', 'sell_price'])['item_id'].transform('nunique')

        calendar_prices = calendar_df[['wm_yr_wk', 'month', 'year']]
        calendar_prices = calendar_prices.drop_duplicates(subset=['wm_yr_wk'])
        prices_df = prices_df.merge(
            calendar_prices[['wm_yr_wk', 'month', 'year']],
            on=['wm_yr_wk'],
            how='left')
        del calendar_prices

        prices_df[
            'price_momentum'] = prices_df['sell_price'] / prices_df.groupby([
                'store_id', 'item_id'
            ])['sell_price'].transform(lambda x: x.shift(1))
        prices_df[
            'price_momentum_m'] = prices_df['sell_price'] / prices_df.groupby([
                'store_id', 'item_id', 'month'
            ])['sell_price'].transform('mean')
        prices_df[
            'price_momentum_y'] = prices_df['sell_price'] / prices_df.groupby([
                'store_id', 'item_id', 'year'
            ])['sell_price'].transform('mean')

        prices_df['sell_price_cent'] = [
            math.modf(p)[0] for p in prices_df['sell_price']
        ]
        prices_df['price_max_cent'] = [
            math.modf(p)[0] for p in prices_df['price_max']
        ]
        prices_df['price_min_cent'] = [
            math.modf(p)[0] for p in prices_df['price_min']
        ]

        del prices_df['month'], prices_df['year']

        # logging.info('prices_df.columns: {}'.format(prices_df.columns))
        # logging.info('prices_df.shape: {}'.format(prices_df.shape))
        # logging.info('prices_df - {}'.format(prices_df.head(-5)))
        #        store_id        item_id  wm_yr_wk  sell_price  price_max  ...  price_momentum_m  price_momentum_y  sell_price_cent  price_max_cent  price_min_cent
        # 0          CA_1  HOBBIES_1_002     11121        3.97       3.97  ...               1.0               1.0             0.97            0.97            0.97
        # 1          CA_1  HOBBIES_1_002     11122        3.97       3.97  ...               1.0               1.0             0.97            0.97            0.97
        # 656812     WI_3    FOODS_3_827     11615        1.00       1.00  ...               1.0               1.0             0.00            0.00            0.00
        # 656813     WI_3    FOODS_3_827     11616        1.00       1.00  ...               1.0               1.0             0.00            0.00            0.00

        logging.info('merge prices')
        original_columns = list(grid_df)
        grid_df = grid_df.merge(prices_df,
                                on=['store_id', 'item_id', 'wm_yr_wk'],
                                how='left')
        keep_columns = [
            col for col in list(grid_df) if col not in original_columns
        ]
        grid_df = grid_df[self.main_index_list + keep_columns]
        grid_df = reduce_mem_usage(grid_df)

        # logging.info('grid_df.columns: {}'.format(grid_df.columns))
        # logging.info('grid_df.shape: {}'.format(grid_df.shape))
        # logging.info('grid_df - {}'.format(grid_df.head(-5)))
        #                                    id       d  sell_price  price_max  ...  price_momentum_y  sell_price_cent  price_max_cent  price_min_cent
        # 0       HOBBIES_1_025_CA_1_evaluation     d_1         NaN        NaN  ...               NaN              NaN             NaN             NaN
        # 1       HOBBIES_1_052_CA_1_evaluation     d_1         NaN        NaN  ...               NaN              NaN             NaN             NaN
        # 592185    FOODS_3_194_WI_3_evaluation  d_1948         NaN        NaN  ...               NaN              NaN             NaN             NaN
        # 592186    FOODS_3_282_WI_3_evaluation  d_1948         NaN        NaN  ...               NaN              NaN             NaN             NaN

        logging.info('save grid_price')
        grid_df.to_pickle(grid_price_path)
        del prices_df
        return
Example #7
def preprocess(v_sales, transform_sales=True):
    logging.info('create dims')
    cat_columns = [
        'aggregation_level',
        'item_id',
        'dept_id',
        'cat_id',
        'store_id',
        'state_id',
        'event_name_1',
        'event_type_1',
        'event_name_2',
        'event_type_2',
    ]
    label_encoders = {}
    remove_columns = []
    for column in tqdm(cat_columns):
        if column in v_sales.columns:
            encoder = LabelEncoder()
            v_sales[column] = encoder.fit_transform(v_sales[column].fillna('NA'))
            label_encoders[column] = encoder
        else:
            remove_columns.append(column)
    for column in remove_columns:
        cat_columns.remove(column)
    v_sales['month'] = v_sales['month'] - 1
    cat_columns = ['day_of_week', 'month'] + cat_columns
    cat_dims = v_sales[cat_columns].nunique()
    dims = pd.DataFrame(cat_dims, columns=['cat_dims'])
    dims['emb_dims'] = cat_dims.apply(lambda x: min(50, (x + 1) // 2))

    minmax_columns = [
        'release_ago',
        'wm_yr_wk',
        'wday',
        'day',
        'week',
        'year_delta',
        'week_of_month',
        'price_nunique',
    ]
    power_columns = [
        'sell_price',
        'diff_price',
        'price_max',
        'price_min',
        'price_std',
        'price_mean',
        # 'price_trend',
        'price_norm',
        'diff_price_norm',
        'dept_max',
        'dept_min',
        'dept_std',
        'dept_mean',
        'cat_max',
        'cat_min',
        'cat_std',
        'cat_mean',
    ]
    logging.info('start MinMaxScaler')
    minmax_scalers = {}
    for column in tqdm(minmax_columns):
        scaler = MinMaxScaler()
        v_sales[column] = scaler.fit_transform(v_sales[[column]])
        minmax_scalers[column] = scaler

    logging.info('start PowerTransformer')
    power_transformers = {}
    for column in tqdm(power_columns):
        logging.info(column)
        scaler = PowerTransformer()
        v_sales[column] = v_sales[column].fillna(0).astype('float64')
        v_sales[column] = scaler.fit_transform(v_sales[[column]])
        v_sales = reduce_mem_usage(v_sales)
        power_transformers[column] = scaler
    
    logging.info('create data_count')
    data_count = pd.DataFrame(v_sales['id'].value_counts()).reset_index()
    data_count.columns = ['id', 'count']

    logging.info('create v_sales_dict and transform sales')
    id_list = v_sales['id'].unique()
    v_sales = v_sales.set_index(['id', 'sort_key']).sort_index()
    v_sales_dict = {}
    sales_transformers = {}
    for data_id in tqdm(id_list):
        data = v_sales.loc[data_id].reset_index()
        data[TARGET] = data[TARGET].astype('float64')  # Int32 -> float64
        if transform_sales:
            scaler = PowerTransformer()
            scaler.fit(data[[TARGET]].iloc[:-28])
            data[TARGET] = scaler.transform(data[[TARGET]])
            sales_transformers[data_id] = scaler
        v_sales_dict[data_id] = data

    return v_sales_dict, data_count, dims, label_encoders, minmax_scalers, power_transformers, sales_transformers
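Because preprocess power-transforms the target per series, predictions made on this data have to be mapped back to the original sales scale before scoring. A small hypothetical helper illustrating that step (inverse_transform_sales is not part of the source; the name and shape handling are assumptions):

import numpy as np


def inverse_transform_sales(pred, data_id, sales_transformers):
    # Undo the per-id PowerTransformer that preprocess() applied to TARGET.
    scaler = sales_transformers[data_id]
    pred = np.asarray(pred, dtype='float64').reshape(-1, 1)
    return scaler.inverse_transform(pred).ravel()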
Example #8
def dump(df, name):
    df = reduce_mem_usage(df)
    save_dir = pathlib.Path('../data/04_agg')
    if not save_dir.exists():
        save_dir.mkdir(parents=True)
    joblib.dump(df, save_dir / f'{name}.joblib', compress=True)
Example #9
import lightgbm as lgb
import pandas as pd
import xgboost as xgb
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
import warnings
from util import DefaultConfig
from util import reduce_mem_usage

warnings.filterwarnings('ignore')

import time

start = time.perf_counter()  # time.clock() was removed in Python 3.8
# training set
traindata = reduce_mem_usage(
    pd.read_hdf(path_or_buf=DefaultConfig.test_traindata_cache_path,
                mode='r',
                key='train'))
# labels
label = pd.read_hdf(path_or_buf=DefaultConfig.test_label_cache_path,
                    mode='r',
                    key='label')
# test set
testdata = reduce_mem_usage(
    pd.read_hdf(path_or_buf=DefaultConfig.test_testdata_cache_path,
                mode='r',
                key='test'))

# drop irrelevant features
train = traindata.drop(DefaultConfig.delete_columns, axis=1)
test = testdata.drop(DefaultConfig.delete_columns, axis=1)
import os
import pandas as pd
import yaml
import load_data
from util import get_logger, reduce_mem_usage

logger = get_logger()
file_path = os.path.dirname(__file__)

CONFIG_FILE = '../config/config.yaml'
with open(CONFIG_FILE) as file:
    yml = yaml.safe_load(file)
INPUT_DIR_NAME = yml['SETTING']['INPUT_DIR_NAME']


def create_pickle(train, test, specs, train_labels):
    logger.info('save pickle file')
    train.to_pickle(file_path + INPUT_DIR_NAME + 'train.pkl')
    test.to_pickle(file_path + INPUT_DIR_NAME + 'test.pkl')
    specs.to_pickle(file_path + INPUT_DIR_NAME + 'specs.pkl')
    train_labels.to_pickle(file_path + INPUT_DIR_NAME + 'train_labels.pkl')


if __name__ == '__main__':
    train = reduce_mem_usage(load_data.read_train())
    test = reduce_mem_usage(load_data.read_test())
    specs = reduce_mem_usage(load_data.read_specs())
    train_labels = reduce_mem_usage(load_data.read_train_labels())
    create_pickle(train, test, specs, train_labels)