def fit_and_forecast(model_idx, model, embedding, x_train, x_test):
    """Predict each store's train+test rows and write the revenue column to CSV.

    For every distinct ``store_id_bk`` found in ``x_test`` the store's
    training rows and test rows are stacked (training rows first), the
    ``store_id_bk`` column is dropped, and the result is passed to
    ``model.predict``.  Column 0 of the prediction is mapped back through the
    pickled revenue scaler and saved to
    ``Config.output_path + "<model_idx>_<store id>.csv"``.

    Args:
        model_idx: Value used as the prefix of the output file name.
        model: Object exposing ``predict``; its output must be a 2-D array.
        embedding: When truthy the feature array is passed through
            ``split_features`` before prediction.
        x_train: Training features with a ``store_id_bk`` column.
        x_test: Test features with a ``store_id_bk`` column.
    """
    # Function-scope import: ``DataFrame.append`` was removed in pandas 2.0,
    # so the stacking below relies on ``pd.concat`` instead.
    import pandas as pd

    revenue_scale = utils.from_pickle(Config.save_dir + "revenue_scale")
    store_ids = x_test.store_id_bk.unique()

    # exist_ok avoids the check-then-create race of the previous version.
    os.makedirs(Config.output_path, exist_ok=True)

    for sid in store_ids:
        s_train = x_train[x_train.store_id_bk == sid]
        s_test = x_test[x_test.store_id_bk == sid]
        s_x = pd.concat([s_train, s_test]).reset_index().drop(
            ["index", "store_id_bk"], axis=1)

        features = np.array(s_x)
        if embedding:
            features = split_features(features)
        s_fit_forecast = model.predict(features)

        # Only the revenue column (index 0) is un-scaled and written out; any
        # other output columns stay in scaled units and are not saved.
        s_fit_forecast[:, 0] = revenue_scale.inverse_transform(
            s_fit_forecast[:, 0].reshape(-1, 1))[:, 0]

        # Store fit and forecast to file
        np.savetxt(Config.output_path + "{0}_{1}.csv".format(model_idx, sid),
                   s_fit_forecast[:, 0])
def get_dataset(seed=0, samples=50, test_split=0.5, save_dir=None, us=[0], rad=False, **kwargs):
    """Load the cached cart-pole gym dataset, or rebuild it with ``sample_gym``.

    Args:
        seed: Seed forwarded to ``sample_gym``.
        samples: Number of trials requested per control value; also used to
            compute the split index.
        test_split: Despite the name, the first ``int(samples * test_split)``
            trials go to ``'x'`` and the remainder to ``'test_x'``.
        save_dir: Directory holding ``cartpole-gym-dataset.pkl``; required.
        us: Control values; one ``sample_gym`` call is made per entry.  The
            default list is only iterated, never mutated.
        rad: Accepted for signature compatibility; not read here.
        **kwargs: Forwarded unchanged to ``sample_gym``.

    Returns:
        The unpickled object when loading succeeds; otherwise a dict with
        keys ``'x'``, ``'test_x'`` and ``'t'``.
    """
    assert save_dir is not None
    path = '{}/cartpole-gym-dataset.pkl'.format(save_dir)
    try:
        data = from_pickle(path)
        print("Successfully loaded data from {}".format(path))
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that Ctrl-C and
        # SystemExit abort instead of silently triggering a rebuild.
        print("Had a problem loading data from {}. Rebuilding dataset...".format(path))
        trajs_force = []
        for u in us:
            trajs, tspan, _ = sample_gym(seed=seed, trials=samples, u=u, **kwargs)
            trajs_force.append(trajs)
        stacked = np.stack(trajs_force, axis=0)

        # make a train/test split along axis 2
        split_ix = int(samples * test_split)
        data = {
            'x': stacked[:, :, :split_ix, :],
            'test_x': stacked[:, :, split_ix:, :],
            't': tspan,
        }
        # NOTE: the rebuilt dataset is intentionally not written back to
        # ``path`` (the original had the ``to_pickle`` call disabled).
    return data
def run_lr(num_models):
    """Fit one linear regression per store and return mean MSE, MAE and MAPE.

    Stores for which ``Config.output_path + "0_<sid>.csv"`` does not exist are
    skipped.  The averages are taken over the stores that were actually
    evaluated; the previous version divided by the total number of store ids,
    which understated every metric whenever a store was skipped.

    Args:
        num_models: Forwarded to ``form_whole_dataset``.

    Returns:
        Tuple ``(mse, mae, mape)``.  All three are ``0.0`` when no store could
        be evaluated.
    """
    store_ids = utils.from_pickle(Config.save_dir + "store_id.pkl")

    mse, mae, mape = 0.0, 0.0, 0.0
    n_evaluated = 0
    for sid in store_ids:
        if not os.path.exists(
                Config.output_path + "{0}_{1}.csv".format(0, sid)):
            continue

        # Prepare train/test
        x, y = form_whole_dataset(num_models, sid)
        x_train, x_test, y_train, y_test = split_train_test(x, y)

        # Build linear regression model and do prediction
        lrm = LinearRegression()
        lrm.fit(x_train, y_train)
        pred = lrm.predict(x_test)

        # Accumulate per-store metrics
        mse += metrics.mean_squared_error(y_test, pred)
        mae += metrics.mean_absolute_error(y_test, pred)
        mape += np.mean(abs(y_test - pred) / y_test)
        n_evaluated += 1

    if n_evaluated == 0:
        # Nothing to average; also avoids dividing by zero on an empty list.
        return mse, mae, mape
    return mse / n_evaluated, mae / n_evaluated, mape / n_evaluated
def get_dataset(seed=0, samples=50, test_split=0.5, save_dir=None, us=[0], name='pendulum-gym-image-dataset.pkl', **kwargs):
    """Load the cached pendulum image dataset, or rebuild and cache it.

    Args:
        seed: Seed forwarded to ``sample_gym``.
        samples: Number of trials requested per control value; also used to
            compute the split index.
        test_split: Despite the name, the first ``int(samples * test_split)``
            trials (axis 2) go to the training keys and the rest to the
            ``test_*`` keys.
        save_dir: Directory containing (or receiving) the pickle; required.
        us: Control values; one ``sample_gym`` call is made per entry.
        name: File name of the pickle inside ``save_dir``.
        **kwargs: Forwarded unchanged to ``sample_gym``.

    Returns:
        The unpickled object when loading succeeds; otherwise a dict with
        keys ``'x'``, ``'test_x'``, ``'obs'``, ``'test_obs'``, ``'t'`` and
        ``'us'``, which is also written to the pickle path.
    """
    data = {}
    assert save_dir is not None
    path = os.path.join(save_dir, name)
    try:
        data = from_pickle(path)
        print("Successfully loaded data from {}".format(path))
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that Ctrl-C and
        # SystemExit abort instead of silently triggering a rebuild.
        print("Had a problem loading data from {}. Rebuilding dataset...".format(path))
        trajs_frames_force = []
        trajs_force = []
        for u in us:
            trajs_frames, trajs, tspan, _ = sample_gym(seed=seed, trials=samples, u=u, **kwargs)
            trajs_frames_force.append(trajs_frames)
            trajs_force.append(trajs)

        # make a train/test split along the trial axis (axis 2)
        split_ix = int(samples * test_split)
        frames = np.stack(trajs_frames_force, axis=0)
        data['x'], data['test_x'] = frames[:, :, :split_ix, :, :], frames[:, :, split_ix:, :, :]
        observations = np.stack(trajs_force, axis=0)
        data['obs'], data['test_obs'] = observations[:, :, :split_ix, :], observations[:, :, split_ix:, :]

        data['t'] = tspan
        data['us'] = us
        to_pickle(data, path)
    return data
def get_dataset(seed=0, samples=50, test_split=0.5, save_dir=None, us=[0], name='acrobot-gym-image-dataset-rgb-0.pkl', **kwargs):
    """Load the cached acrobot RGB image dataset, or rebuild and cache it.

    Args:
        seed: Seed forwarded to ``sample_gym``.
        samples: Number of trials requested per control value; also used to
            compute the split index.
        test_split: Despite the name, the first ``int(samples * test_split)``
            trials (axis 2) go to the training keys and the rest to the
            ``test_*`` keys.
        save_dir: Directory containing (or receiving) the pickle; required.
        us: Control values; one ``sample_gym`` call is made per entry.
        name: File name of the pickle inside ``save_dir``.
        **kwargs: Forwarded unchanged to ``sample_gym``.

    Returns:
        The unpickled object when loading succeeds; otherwise a dict with
        keys ``'x'``, ``'test_x'``, ``'obs'``, ``'test_obs'``, ``'t'`` and
        ``'us'``, which is also written to the pickle path.
    """
    data = {}
    assert save_dir is not None
    path = save_dir + '/' + name
    try:
        data = from_pickle(path)
        print("Successfully loaded data from {}".format(path))
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that Ctrl-C and
        # SystemExit abort instead of silently triggering a rebuild.
        print("Had a problem loading data from {}. Rebuilding dataset...".format(path))
        trajs_frames_force = []
        trajs_force = []
        for u in us:
            trajs_frames, trajs, tspan, _ = sample_gym(seed=seed, trials=samples, u=u, **kwargs)
            # Move the last axis to third-from-last and rescale by 1/255.
            trajs_frames = np.moveaxis(trajs_frames, -1, -3) / 255.0
            trajs_frames_force.append(trajs_frames)
            trajs_force.append(trajs)

        # make a train/test split along the trial axis (axis 2)
        split_ix = int(samples * test_split)
        frames = np.stack(trajs_frames_force, axis=0)
        data['x'], data['test_x'] = frames[:, :, :split_ix], frames[:, :, split_ix:]
        observations = np.stack(trajs_force, axis=0)
        data['obs'], data['test_obs'] = observations[:, :, :split_ix, :], observations[:, :, split_ix:, :]

        data['t'] = tspan
        data['us'] = us
        to_pickle(data, path)
    return data
def run_lr(num_models):
    """Fit one linear regression per store and return mean MSE, MAE and MAPE.

    Stores for which ``Config.output_path + "0_<sid>.csv"`` does not exist are
    skipped.  The averages are taken over the stores that were actually
    evaluated; the previous version divided by the total number of store ids,
    which understated every metric whenever a store was skipped.

    Args:
        num_models: Forwarded to ``form_whole_dataset``.

    Returns:
        Tuple ``(mse, mae, mape)``.  All three are ``0.0`` when no store could
        be evaluated.
    """
    store_ids = utils.from_pickle(Config.save_dir + "store_id.pkl")

    mse, mae, mape = 0.0, 0.0, 0.0
    n_evaluated = 0
    for sid in store_ids:
        forecast_file = Config.output_path + "{0}_{1}.csv".format(0, sid)
        if not os.path.exists(forecast_file):
            continue

        # Prepare train/test
        x, y = form_whole_dataset(num_models, sid)
        x_train, x_test, y_train, y_test = split_train_test(x, y)

        # Build linear regression model and do prediction
        lrm = LinearRegression()
        lrm.fit(x_train, y_train)
        pred = lrm.predict(x_test)

        # Accumulate per-store metrics
        mse += metrics.mean_squared_error(y_test, pred)
        mae += metrics.mean_absolute_error(y_test, pred)
        mape += np.mean(abs(y_test - pred) / y_test)
        n_evaluated += 1

    if n_evaluated == 0:
        # Nothing to average; also avoids dividing by zero on an empty list.
        return mse, mae, mape
    return mse / n_evaluated, mae / n_evaluated, mape / n_evaluated
def main(input_fname, output_fname):
    """Order the guides of a pickled mapping by fingerprint and pickle the list.

    Args:
        input_fname: Pickle whose object exposes ``keys()`` yielding guides.
        output_fname: Destination passed as first argument to ``to_pickle``.
    """
    guides = from_pickle(input_fname).keys()
    # ``sorted`` with a key is stable and evaluates ``fingerprint`` once per
    # guide, so ties keep the mapping's iteration order.
    ordered = sorted(guides, key=fingerprint)
    to_pickle(output_fname, ordered)
def final_data(embedding=True, num_var2keeps=None, cat_var2keeps=None):
    """Return the model-ready train/test data restricted to the chosen columns.

    The train/test frames are read from ``train_set.pkl`` / ``test_set.pkl``
    under ``Config.save_dir`` when both exist, otherwise they are produced by
    ``_train_test_split``.  Both frames are then reduced to the requested
    numeric and categorical columns plus ``store_id_bk``, ``total_revenue``
    and ``total_volume``.

    Args:
        embedding: When truthy the frames go through ``_data2embedding``,
            otherwise through ``_data2onehot``.
        num_var2keeps: Numeric column names to keep; ``None`` means none.
        cat_var2keeps: Categorical column names to keep; ``None`` means none.

    Returns:
        Whatever ``_data2embedding`` or ``_data2onehot`` returns.
    """
    # The declared defaults are ``None``; without this normalisation the
    # column-list concatenation below raised ``TypeError`` for them.
    if num_var2keeps is None:
        num_var2keeps = []
    if cat_var2keeps is None:
        cat_var2keeps = []

    # Build train/test dataframes
    train_filename = Config.save_dir + "train_set.pkl"
    test_filename = Config.save_dir + "test_set.pkl"
    if os.path.exists(train_filename) and os.path.exists(test_filename):
        train = utils.from_pickle(train_filename)
        test = utils.from_pickle(test_filename)
    else:
        train, test = _train_test_split()

    columns = num_var2keeps + cat_var2keeps + [
        "store_id_bk", "total_revenue", "total_volume"]
    train = train[columns]
    test = test[columns]

    if embedding:
        return _data2embedding(train, test)
    return _data2onehot(train, test, cat_var2keeps)
def get_dataset(experiment_name, save_dir, **kwargs):
    '''Returns a PDE dataset, rebuilding (and plotting) it when no cache loads.

    Args:
        experiment_name: Used in the pickle and figure file names and
            forwarded to ``make_dataset``.  Names starting with ``'ch'`` get a
            fixed colour range of [-1, 1] in the plots.
        save_dir: Directory of the pickle; figures go to ``<save_dir>/data/``.
        **kwargs: Forwarded unchanged to ``make_dataset``.

    Returns:
        The unpickled object when loading succeeds, otherwise the freshly
        built dataset (which is pickled before it is returned).
    '''
    path = '{}/{}-dataset.pkl'.format(save_dir, experiment_name)

    try:
        data = from_pickle(path)
        print("Successfully loaded data from {}".format(path))
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that Ctrl-C and
        # SystemExit abort instead of silently triggering a rebuild.
        print(
            "Had a problem loading data from {}. Rebuilding dataset...".format(
                path))
        data = make_dataset(experiment_name, **kwargs)
        to_pickle(data, path)

        # One diagnostic figure per trajectory (train and test together).
        os.makedirs('{}/data/'.format(save_dir), exist_ok=True)
        import matplotlib as mpl
        mpl.use('Agg')  # file-only backend; no display needed
        import matplotlib.pyplot as plt

        u_all = np.concatenate([data['u'], data['test_u']], axis=0)
        energy_all = np.concatenate([data['energy'], data['test_energy']],
                                    axis=0)
        mass_all = u_all.sum(-1).squeeze(-1)

        # The time/space mesh is identical for every trajectory.
        t = data['t_eval']
        M = u_all.shape[-1]
        y = np.arange(M) / M
        T, Y = np.meshgrid(t, y)

        for idx in range(len(u_all)):
            u = u_all[idx]
            fig, (ax1, ax2, ax3) = plt.subplots(3,
                                                1,
                                                sharex=True,
                                                figsize=(6., 6.),
                                                facecolor='white')
            if experiment_name.startswith('ch'):
                ax1.pcolormesh(T,
                               Y,
                               u.squeeze(1).T,
                               cmap='seismic',
                               vmin=-1,
                               vmax=1)
            else:
                ax1.pcolormesh(T, Y, u.squeeze(1).T, cmap='seismic')
            ax1.set_aspect('auto')
            ax1.set_yticks((0 - .5 / M, 1 - .5 / M))
            ax1.set_yticklabels((0, 1))
            ax2.plot(t, energy_all[idx])
            ax3.plot(t, mass_all[idx])
            ax3.set_xticks((t[0], t[-1]))
            ax3.set_xticklabels((t[0], t[-1]))
            fig.savefig('{}/data/data_{}_{:02d}.png'.format(
                save_dir, experiment_name, idx))
            # Close this specific figure so figures do not accumulate.
            plt.close(fig)

    return data
def __init__(self, data_path):
    """Build consecutive frame-pair samples from a pickled image dataset.

    Reads ``data['x'][0]`` from the pickle at ``data_path``, which must be a
    4-D array.  Each image is flattened in Fortran order, every frame is
    concatenated with its successor, and the pairs are stored as ``self.x``
    with the following pair as ``self.next_x``.

    Args:
        data_path: Path handed to ``from_pickle``.
    """
    frames = from_pickle(data_path)['x'][0]
    # presumably (time steps, trajectories, height, width) - TODO confirm;
    # as in the original, the last dimension is used for both image sides.
    n_steps, n_traj, _, side = frames.shape
    n_pixels = side * side

    flat = np.reshape(frames, (n_steps, n_traj, n_pixels),
                      order='F').astype('float32')
    # Pair frame k with frame k+1 along the feature axis.
    pairs = np.concatenate((flat[:-1], flat[1:]), axis=2)

    sample_shape = ((n_steps - 2) * n_traj, 2 * n_pixels)
    self.x = np.reshape(pairs[:-1], sample_shape, order='F')
    self.next_x = np.reshape(pairs[1:], sample_shape, order='F')
def _train_test_split():
    """Split the store/weather frame per store, scale it and persist the results.

    The last ``Config.test_size`` rows of every store form the test set and
    the preceding rows the training set.  Numeric inputs and the two output
    columns are scaled with ``MaxAbsScaler`` objects fitted on the training
    set only.  The frames, the two output scalers and the store ids are
    pickled under ``Config.save_dir``.

    Returns:
        Tuple ``(train, test)`` of scaled dataframes.
    """
    # Build the store_weather dataframe
    store_weather_filename = Config.save_dir + "store_weather.pkl"
    if os.path.exists(store_weather_filename):
        store_weather = utils.from_pickle(store_weather_filename)
    else:
        store_weather = _preprocess_data()

    # Split train test for each store.  Pieces are collected and concatenated
    # once: ``DataFrame.append`` was removed in pandas 2.0 and re-copied the
    # accumulated frame on every iteration.
    train_parts, test_parts = [], []
    store_ids = store_weather.store_id_bk.unique()
    for sid in store_ids:
        c_store = store_weather[store_weather.store_id_bk == sid]
        train_parts.append(c_store[:-Config.test_size])
        test_parts.append(c_store[-Config.test_size:])
    train = (pd.concat(train_parts, ignore_index=True)
             if train_parts else pd.DataFrame({}))
    test = (pd.concat(test_parts, ignore_index=True)
            if test_parts else pd.DataFrame({}))

    # Scale numeric columns (scaler fitted on the training rows only)
    num_cols = ["p_total_revenue", "p_total_volume", "mean_temp",
                "total_precipitation", "total_snow"]
    scaler = MaxAbsScaler().fit(train.loc[:, num_cols])
    train.loc[:, num_cols] = scaler.transform(train.loc[:, num_cols])
    test.loc[:, num_cols] = scaler.transform(test.loc[:, num_cols])

    # Scale 2 output columns
    revenue_scale = MaxAbsScaler().fit(train.loc[:, ["total_revenue"]])
    volume_scale = MaxAbsScaler().fit(train.loc[:, ["total_volume"]])
    for frame in (train, test):
        frame.loc[:, ["total_revenue"]] = revenue_scale.transform(
            frame.loc[:, ["total_revenue"]])
        frame.loc[:, ["total_volume"]] = volume_scale.transform(
            frame.loc[:, ["total_volume"]])

    # Save the train/test dataframes to pickle objects
    utils.to_pickle(Config.save_dir + "train_set.pkl", train)
    utils.to_pickle(Config.save_dir + "test_set.pkl", test)

    # Save the 2 scaler for later use
    utils.to_pickle(Config.save_dir + "revenue_scale", revenue_scale)
    utils.to_pickle(Config.save_dir + "volume_scale", volume_scale)

    # Save store_ids
    utils.to_pickle(Config.save_dir + "store_id.pkl", store_ids)

    return train, test
def get_dataset(experiment_name, save_dir, **kwargs):
    '''Returns an orbital dataset. Also constructs the dataset if no saved version is available.

    Args:
        experiment_name: Used to build the pickle file name.
        save_dir: Directory containing (or receiving) the pickle.
        **kwargs: Forwarded unchanged to ``make_orbits_dataset``.

    Returns:
        The unpickled object when loading succeeds, otherwise the freshly
        built dataset (which is pickled before it is returned).
    '''
    path = '{}/{}-orbits-dataset.pkl'.format(save_dir, experiment_name)

    try:
        data = from_pickle(path)
        print("Successfully loaded data from {}".format(path))
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that Ctrl-C and
        # SystemExit abort instead of silently triggering a rebuild.
        print("Had a problem loading data from {}. Rebuilding dataset...".format(path))
        data = make_orbits_dataset(**kwargs)
        to_pickle(data, path)

    return data
def get_model(args, baseline, structure, damping, num_points, gym=False):
    """Build a ``SymODEN_R`` variant and load its saved weights and stats.

    Args:
        args: Namespace providing ``input_dim``, ``nonlinearity``,
            ``save_dir``, ``solver`` and ``rad``.
        baseline: Select the baseline (unstructured ODE) model.
        structure: Select the structured model with separate M/V/g networks.
        damping: Accepted for signature compatibility; not read here.
        num_points: Number used in the checkpoint and stats file names.
        gym: Accepted for signature compatibility; not read here.

    Returns:
        Tuple ``(model, stats)``: the model with its state dict loaded and
        the unpickled statistics object.

    Raises:
        RuntimeError: If no supported combination of ``baseline`` and
            ``structure`` matches.
    """
    half_dim = int(args.input_dim / 2)

    if structure == False and baseline == True:
        nn_model = MLP(args.input_dim, 600, args.input_dim,
                       args.nonlinearity).to(device)
        model = SymODEN_R(args.input_dim,
                          H_net=nn_model,
                          device=device,
                          baseline=True)
    elif structure == False and baseline == False:
        H_net = MLP(args.input_dim, 400, 1, args.nonlinearity).to(device)
        g_net = MLP(half_dim, 200, half_dim).to(device)
        model = SymODEN_R(args.input_dim,
                          H_net=H_net,
                          g_net=g_net,
                          device=device,
                          baseline=False)
    elif structure == True and baseline == False:
        # M_net is moved to the device together with the whole model below.
        M_net = MLP(half_dim, 300, half_dim)
        V_net = MLP(half_dim, 50, 1).to(device)
        g_net = MLP(half_dim, 200, half_dim).to(device)
        model = SymODEN_R(args.input_dim,
                          M_net=M_net,
                          V_net=V_net,
                          g_net=g_net,
                          device=device,
                          baseline=False,
                          structure=True).to(device)
    else:
        raise RuntimeError(
            'argument *baseline* and *structure* cannot both be true')

    model_name = 'baseline_ode' if baseline else 'hnn_ode'
    struct = '-struct' if structure else ''
    rad = '-rad' if args.rad else ''

    # The checkpoint path previously lacked the '/' that the stats path has,
    # so it only resolved when ``save_dir`` already ended with a separator.
    path = '{}/pend-{}{}-{}-p{}{}.tar'.format(args.save_dir, model_name,
                                              struct, args.solver, num_points,
                                              rad)
    model.load_state_dict(torch.load(path, map_location=device))

    path = '{}/pend-{}{}-{}-p{}-stats{}.pkl'.format(args.save_dir, model_name,
                                                    struct, args.solver,
                                                    num_points, rad)
    stats = from_pickle(path)
    return model, stats
def get_dataset(self, experiment_name, save_dir):
    '''Returns the trajectory dataset. Also constructs the dataset if no saved version is available.

    The pickle file name encodes ``self.integrator``, ``self.ensembles``,
    ``self.tspan[1]``, ``self.time_points`` and ``self.energyPoints``, so each
    configuration is cached separately.

    Args:
        experiment_name: Prefix of the pickle file name.
        save_dir: Directory containing (or receiving) the pickle.

    Returns:
        The unpickled object when loading succeeds, otherwise the result of
        ``self.make_orbits_dataset()`` (which is pickled before returning).
    '''
    path = '{}/{}-orbits-dataset_{}_EnsemblesPerEnergy_{}_OrbitLen_{}_Resolution_{}_energyPoints{}.pkl'.format(
        save_dir, experiment_name, self.integrator, self.ensembles,
        self.tspan[1], self.time_points, self.energyPoints)

    try:
        data = from_pickle(path)
        print("Successfully loaded data from {}".format(path))
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that Ctrl-C and
        # SystemExit abort instead of silently triggering a rebuild.
        print("Had a problem loading data from {}. Rebuilding dataset...".
              format(path))
        data = self.make_orbits_dataset()
        to_pickle(data, path)

    return data
def get_dataset(experiment_name, save_dir, **kwargs):
    '''Returns a dataset built on top of OpenAI Gym observations. Also constructs
    the dataset if no saved version is available.

    Args:
        experiment_name: ``"pendulum"`` or ``"acrobot"``.
        save_dir: Directory containing (or receiving) the pickle.
        **kwargs: Forwarded unchanged to ``make_gym_dataset``.

    Returns:
        The unpickled object when loading succeeds, otherwise the freshly
        built dataset (which is pickled before it is returned).

    Raises:
        AssertionError: If ``experiment_name`` is not supported.
    '''
    supported = ('pendulum', 'acrobot')
    if experiment_name not in supported:
        # Raised explicitly so the check also runs under ``python -O``; the
        # exception type matches the former ``assert``.
        raise AssertionError(
            'unsupported experiment_name {!r}; expected one of {}'.format(
                experiment_name, supported))
    # NOTE: the gym environment id that used to be derived here was never
    # passed on; ``make_gym_dataset`` only receives ``**kwargs``.

    path = '{}/{}-pixels-dataset.pkl'.format(save_dir, experiment_name)

    try:
        data = from_pickle(path)
        print("Successfully loaded data from {}".format(path))
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that Ctrl-C and
        # SystemExit abort instead of silently triggering a rebuild.
        print("Had a problem loading data from {}. Rebuilding dataset...".format(path))
        data = make_gym_dataset(**kwargs)
        to_pickle(data, path)

    return data
def get_model(args, baseline, structure, naive, num_points):
    """Build a ``SymODEN_R1_T1`` variant and load its saved weights and stats.

    Args:
        args: Namespace providing ``num_angle``, ``nonlinearity``,
            ``save_dir``, ``name``, ``solver`` and ``num_points``.
        baseline: Select the baseline model (unstructured only).
        structure: Select the structured model with M/V/g networks.
        naive: Select the naive model (unstructured only).
        num_points: Not read here; the file names use ``args.num_points``.

    Returns:
        Tuple ``(model, stats)``: the model with its state dict loaded and
        the unpickled statistics object.

    Raises:
        RuntimeError: For unsupported flag combinations.
    """
    # Shared sub-networks are created first, exactly as before.
    M_net = PSD(3, 400, 2).to(device)
    g_net = MatrixNet(3, 300, 4, shape=(2, 2)).to(device)

    if structure == False:
        if naive and baseline:
            raise RuntimeError('argument *baseline* and *naive* cannot both be true')
        # Each variant differs only in the MLP sizes and which shared
        # sub-networks are handed to the model.
        if naive:
            mlp_dims, shared_nets = (6, 1000, 5), {}
        elif baseline:
            mlp_dims, shared_nets = (6, 700, 4), {'M_net': M_net}
        else:
            mlp_dims, shared_nets = (5, 500, 1), {'M_net': M_net, 'g_net': g_net}
        in_dim, hidden_dim, out_dim = mlp_dims
        H_net = MLP(in_dim, hidden_dim, out_dim, args.nonlinearity).to(device)
        model = SymODEN_R1_T1(args.num_angle,
                              H_net=H_net,
                              device=device,
                              baseline=baseline,
                              naive=naive,
                              u_dim=2,
                              **shared_nets)
    elif structure == True and baseline == False and naive == False:
        V_net = MLP(3, 300, 1).to(device)
        model = SymODEN_R1_T1(args.num_angle,
                              M_net=M_net,
                              V_net=V_net,
                              g_net=g_net,
                              device=device,
                              baseline=baseline,
                              structure=True,
                              u_dim=2).to(device)
    else:
        raise RuntimeError('argument *structure* is set to true, no *baseline* or *naive*!')

    label = '-naive_ode' if naive else ('-baseline_ode' if baseline else '-hnn_ode')
    struct = '-struct' if structure else ''
    name_parts = (args.save_dir, args.name, label, struct, args.solver,
                  args.num_points)

    weights_path = '{}/{}{}{}-{}-p{}.tar'.format(*name_parts)
    model.load_state_dict(torch.load(weights_path, map_location=device))

    stats_path = '{}/{}{}{}-{}-p{}-stats.pkl'.format(*name_parts)
    stats = from_pickle(stats_path)
    return model, stats
def get_dataset(experiment_name, save_dir, u, **kwargs):
    '''Returns a dataset built on top of OpenAI Gym observations. Also constructs
    the dataset if no saved version is available.

    When rebuilding, one dataset is generated per entry of ``u`` and the
    arrays of every key except ``'meta'`` are stacked row-wise.

    Args:
        experiment_name: ``"pendulum"``, ``"acrobot"`` or ``"cartpole"``.
        save_dir: Directory containing (or receiving) the pickle.
        u: Iterable of control values; each is passed as ``u=`` to
            ``make_gym_dataset``.
        **kwargs: Forwarded unchanged to ``make_gym_dataset``.

    Returns:
        The unpickled object when loading succeeds, otherwise the combined
        dict (which is pickled before it is returned).

    Raises:
        AssertionError: If ``experiment_name`` is not supported.
    '''
    supported = ('pendulum', 'acrobot', 'cartpole')
    if experiment_name not in supported:
        # Raised explicitly so the check also runs under ``python -O``; the
        # exception type matches the former ``assert``.
        raise AssertionError(
            'unsupported experiment_name {!r}; expected one of {}'.format(
                experiment_name, supported))

    path = '{}/{}-pixels-dataset.pkl'.format(save_dir, experiment_name)

    try:
        data = from_pickle(path)
        print("Successfully loaded data from {}".format(path))
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that Ctrl-C and
        # SystemExit abort instead of silently triggering a rebuild.
        print(
            "Had a problem loading data from {}. Rebuilding dataset...".format(
                path))
        data = {}
        for u_ in u:
            # Use the current control value; the previous code always passed
            # ``u[0]``, so every iteration sampled the same control.
            data_ = make_gym_dataset(u=u_, **kwargs)
            for k, new in data_.items():
                if k in ['meta']:
                    continue
                # First occurrence of a key starts from an empty (0, width)
                # array so ``vstack`` works uniformly.
                old = data.get(k, np.array([]).reshape(0, new.shape[1]))
                data[k] = np.vstack((old, new))
        to_pickle(data, path)

    return data
def deserialize(self, string, **kw):
    """
    Function for deserializing string => object.

    This can be overwritten for custom uses.

    The default is to do nothing ('serializer'=None). If the connection is
    initialized with 'serializer' set to 'json.gz', 'json', 'gz', 'zip' or
    'pickle', the matching transformation is applied.

    Args:
        string: The serialized payload.
        **kw: May contain 'serializer' to override ``self._serializer`` for
            this call.

    Returns:
        The deserialized object, or ``string`` unchanged when the serializer
        is None.

    Raises:
        NotImplementedError: If the serializer name is not supported.
    """
    serializer = kw.get('serializer', self._serializer)

    if serializer == "json.gz":
        return utils.from_json(utils.from_gz(string))
    elif serializer == "json":
        return utils.from_json(string)
    elif serializer == "gz":
        return utils.from_gz(string)
    elif serializer == "zip":
        return utils.from_zip(string)
    elif serializer == "pickle":
        # Was ``utils.from_pickle(obj)``: ``obj`` is undefined in this scope.
        # NOTE(review): unpickling executes arbitrary code; only use this
        # serializer for trusted payloads.
        return utils.from_pickle(string)
    elif serializer is not None:
        raise NotImplementedError(
            'Only json, gz, json.gz, zip, and pickle '
            'are supported as serializers.')
    return string
def main(cfg):
    """Run the modelling pipeline: feature selection, stacking and tracking.

    Steps, in order:
      1. load the feature-engineered train/test pickles,
      2. add summed-region auxiliary targets and log-transform all targets,
      3. adversarial validation: repeatedly drop the features ranked first in
         the train-vs-test classifier's importance table until its score is
         no greater than ``cfg.fit.adversarial_validation``,
      4. fit LightGBM on all remaining features and keep the top 64 %,
      5. fit models for the auxiliary targets and CatBoost/XGBoost for the
         main target,
      6. stack all predictions with a linear model,
      7. write the submission/OOF/weights files and log everything to MLflow.

    Args:
        cfg: Configuration object; reads ``path``, ``experiment``, ``mode``,
            ``seed``, ``ensemble``, ``fit.*``, ``lightgbm``, ``catboost`` and
            ``xgboost``.

    Raises:
        ValueError: If ``cfg.fit.cv`` is neither ``'KFold'`` nor
            ``'StratifiedKFold2'``.
        RuntimeError: If adversarial validation removes every feature.
    """
    # --------------------
    # setup
    # --------------------
    logger.debug('setup...')
    EXPERIMENT_NAME = '{}_{}'.format(
        datetime.datetime.now().strftime('%Y%m%d'), cfg.experiment)
    logger.add(pathlib.Path(cfg.path + f'output/log/{EXPERIMENT_NAME}.log'),
               enqueue=True,
               backtrace=True)
    scores = {}

    # --------------------
    # load data
    # --------------------
    logger.debug('load preprocessed & fe data...')
    train = from_pickle(
        pathlib.Path(cfg.path +
                     f'output/feature_engineered/train{EXPERIMENT_NAME}.pkl'))
    test = from_pickle(
        pathlib.Path(cfg.path +
                     f'output/feature_engineered/test{EXPERIMENT_NAME}.pkl'))

    # new targets: every sum of three regional sales columns
    ids = 'data_id'
    target = 'Global_Sales'
    regional = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']
    non_targets = list(regional)
    for c in itertools.combinations(regional, 3):
        new_tg = c[0] + '_' + c[1] + '_' + c[2]
        train[new_tg] = train[c[0]] + train[c[1]] + train[c[2]]
        non_targets.append(new_tg)

    # log transform
    for t in non_targets + [target]:
        train[t] = np.log1p(train[t])
        assert t in train.columns.values.tolist()
        logger.debug(t + ' exists')

    # features to use
    drops = [
        'Platform_Genre_nunique_by_Publisher',
        'Year_of_Release_nunique_by_Publisher',
        'Platform_nunique_by_Publisher', 'Platform_nunique_by_Developer',
        'Genre_nunique_by_Developer', 'category2vec_0_max',
        'category2vec_0_min'
    ]
    drops += [
        f for f in test.columns.values.tolist()
        if '_x_' in f and f.endswith('Publisher')
    ]
    if cfg.fit.cv == 'KFold':
        group = 'Publisher'
    elif cfg.fit.cv == 'StratifiedKFold2':
        train['bins'] = create_folds(train, target)
        group = 'bins'
    else:
        # Previously ``group`` stayed unbound and failed later with NameError.
        raise ValueError(f'unsupported cfg.fit.cv: {cfg.fit.cv!r}')
    cat_feats = ['Platform', 'Genre', 'Rating', 'Platform_Genre']
    drops += [ids, group, 'Publisher', target] + non_targets
    features = [f for f in test.columns.values.tolist() if f not in drops]
    cat_feats = [f for f in cat_feats if f in features]

    # --------------------
    # 1st feature selection (adversarial validation)
    # --------------------
    m = 'lgb'
    train['is_train'] = 1
    test['is_train'] = 0
    df = pd.concat([train, test], ignore_index=True)
    auc = 1
    counts = 0
    while auc > cfg.fit.adversarial_validation:
        if counts > 0:
            # Drop at least one feature per round; 2 % of a short feature
            # list truncates to zero, which previously looped forever.
            n_drop = max(1, int(0.02 * len(features)))
            ranked = fi_df['features'].values
            # Kept separate from ``drops``: that list is needed again when
            # the stacking features are selected further below.
            adv_drops = ranked[:n_drop].tolist()
            print('drops:', adv_drops)
            features = ranked[n_drop:].tolist()
            cat_feats = [f for f in cat_feats if f in features]
            if not features:
                raise RuntimeError(
                    'adversarial validation removed every feature')

        # fit
        oof_, y_pred_, fi_df, model = fit_single_model(df,
                                                       df,
                                                       'is_train',
                                                       features,
                                                       cat_feats,
                                                       group,
                                                       m,
                                                       task='binary',
                                                       cv='StratifiedKFold',
                                                       n_splits=2,
                                                       nsa=1,
                                                       cfg=cfg)
        auc = model.score
        counts += 1
        logger.debug('adversarial validation score (auc) = {}'.format(auc))

    if cfg.fit.adversarial_validation < 1:
        savepath_adv = pathlib.Path(
            cfg.path + f'output/feature_importance/adv_{EXPERIMENT_NAME}.csv')
        fi_df.to_csv(savepath_adv, index=False)
    else:
        logger.debug('skipping adversarial validation...')

    # --------------------
    # 2nd feature selection (simply fit and select top 64% features)
    # --------------------
    # initialize for stacking
    m = 'lgb'
    n = 'full_feats_' + target
    cv = cfg.fit.cv
    task = 'regression'
    oof_df = pd.DataFrame()
    oof_df[target] = train[target].values
    oof_df[group] = train[group].values
    oof_df[ids] = train[ids].values
    ypred_df = pd.DataFrame()
    ypred_df[ids] = test[ids].values
    # test ids are expected to continue directly after the train ids
    assert oof_df[ids].values[-1] + 1 == ypred_df[ids].values[0]

    # fit
    oof_, y_pred_, fi_df, _ = fit_single_model(train,
                                               test,
                                               target,
                                               features,
                                               cat_feats,
                                               group,
                                               model_name=m,
                                               task=task,
                                               cv=cv,
                                               n_splits=cfg.fit.nfold,
                                               nsa=cfg.fit.nsa,
                                               cfg=cfg)
    savepath, score = after_modeling(train[target].values, oof_, fi_df, m, cv,
                                     cfg, EXPERIMENT_NAME)
    scores[f'lgb_full{len(features)}_features'] = score

    # feature selection
    features = fi_df['features'].values[:int(0.64 * len(features))].tolist()
    cat_feats = [f for f in cat_feats if f in features]
    logger.debug(f'{len(features)} selected features')

    # assign
    oof_df[n] = oof_
    ypred_df[n] = y_pred_

    # ------------------------
    # fit for non-targets (for stacking)
    # ------------------------
    non_targets = [f for f in non_targets if f not in regional]
    for t in non_targets:
        # fit
        if cfg.mode == 'debug' and t != target:
            continue
        n = f'pred_{t}'
        logger.debug('# -----------------------')
        logger.debug(f'predicting {t}...')
        logger.debug('# -----------------------')
        oof_, y_pred_, fi_df, _ = fit_single_model(train,
                                                   test,
                                                   t,
                                                   features,
                                                   cat_feats,
                                                   group,
                                                   model_name=m,
                                                   task=task,
                                                   cv=cv,
                                                   n_splits=cfg.fit.nfold,
                                                   nsa=cfg.fit.nsa,
                                                   cfg=cfg)

        # assign
        oof_df[n] = oof_
        ypred_df[n] = y_pred_

    # ------------------------
    # fit for target
    # ------------------------
    models = ['catb', 'xgb']
    for m in models:
        # fitting
        logger.debug('# -----------------------')
        logger.debug(f'predicting {target} by {m}...')
        logger.debug('# -----------------------')
        oof_, y_pred_, fi_df, _ = fit_single_model(train,
                                                   test,
                                                   target,
                                                   features,
                                                   cat_feats,
                                                   group,
                                                   model_name=m,
                                                   task=task,
                                                   cv=cv,
                                                   n_splits=cfg.fit.nfold,
                                                   nsa=cfg.fit.nsa,
                                                   cfg=cfg)

        # assign
        oof_df[f'{m}_{target}'] = oof_
        ypred_df[f'{m}_{target}'] = y_pred_

    # store validation scores
    for n in ypred_df.columns.values.tolist():
        if target in n:
            score = validation_score(train[target].values, oof_df[n].values)
            logger.debug('Overall score for {} = {}'.format(n, score))
            scores[n] = score

    # -------------------
    # stacking
    # -------------------
    logger.debug('stacking ensemble...')
    m = 'linear'
    logger.debug(f'fitting {m}...')
    stacking_feats = [
        f for f in ypred_df.columns.values.tolist() if f not in drops
    ]
    oof, y_pred, fi_df, _ = fit_single_model(oof_df,
                                             ypred_df,
                                             target,
                                             stacking_feats, [],
                                             group,
                                             model_name=m,
                                             task=task,
                                             cv=cv,
                                             n_splits=cfg.fit.nfold,
                                             nsa=cfg.fit.nsa,
                                             cfg=cfg)

    # -------------------
    # evaluate results
    # -------------------
    ss = pd.read_csv(
        pathlib.Path(cfg.path + 'input/atmacup8_sample-submission.csv'))
    ss[target] = np.expm1(y_pred)  # undo the log1p applied to the target
    score = validation_score(train[target].values, oof)
    logger.debug('Overall score for ensemble = {}'.format(score))
    scores['final'] = score

    # --------------------
    # save files
    # --------------------
    logger.debug('saving files...')

    # oof, submissions
    fi_df.to_csv(pathlib.Path(
        cfg.path + f'output/feature_importance/weights_{EXPERIMENT_NAME}.csv'),
                 index=False)
    np.save(pathlib.Path(cfg.path + f'output/oof/oof_final{EXPERIMENT_NAME}'),
            oof)
    ss.to_csv(pathlib.Path(cfg.path +
                           f'output/submission/{EXPERIMENT_NAME}.csv'),
              index=False)

    # --------------------
    # mlflow
    # --------------------
    mlflow.set_tracking_uri("http://mlflow:5000")
    mlflow.set_experiment(EXPERIMENT_NAME)
    with tempfile.TemporaryDirectory(), mlflow.start_run():
        # The logger formats messages with ``str.format``; without a ``{}``
        # placeholder the URI arguments were silently discarded.
        logger.debug('tracking uri: {}', mlflow.get_tracking_uri())
        logger.debug('artifact uri: {}', mlflow.get_artifact_uri())

        # hyperparameters
        mlflow.log_params(cfg.lightgbm)
        if cfg.mode != 'debug':
            mlflow.log_params(cfg.catboost)
            mlflow.log_params(cfg.xgboost)

        # settings
        mlflow.log_param('mode', cfg.mode)
        mlflow.log_param('seed', cfg.seed)
        mlflow.log_param('ensemble', cfg.ensemble)

        # scores
        for k, v in scores.items():
            mlflow.log_metric(k, v)

        # outputs
        artifacts = {
            'ensemble_weights':
            cfg.path +
            f'output/feature_importance/weights_{EXPERIMENT_NAME}.csv',
            'feature_importance': savepath,
            'oof': cfg.path + f'output/oof/oof_final{EXPERIMENT_NAME}.npy',
            'submission':
            cfg.path + f'output/submission/{EXPERIMENT_NAME}.csv',
        }
        for file_path in artifacts.values():
            mlflow.log_artifact(pathlib.Path(file_path))

    logger.debug('all done')
def train(args):
    """Train (or reload) a ``DGNetPDE1d`` model, then evaluate and plot rollouts.

    Gradient tracking is disabled globally at the start; the training forward
    pass and the optimizer update therefore run inside ``torch.enable_grad()``.
    If a checkpoint matching ``args.result_path`` exists, or ``args.load`` is
    set, the weights are loaded and ``args.total_steps`` is set to 0 so the
    training loop is skipped.

    Args:
        args: Namespace; reads ``seed``, ``name``, ``save_dir``, ``input_dim``,
            ``hidden_dim``, ``nonlinearity``, ``model``, ``solver``,
            ``result_path``, ``load``, ``total_steps``, ``learn_rate``,
            ``batch_size``, ``verbose`` and ``print_every``.  ``total_steps``
            is overwritten when a checkpoint is loaded.

    Returns:
        Tuple ``(model, stats)`` where ``stats`` holds the loss histories,
        final losses, rollout arrays and energy/state MSE values.
    """
    device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    n_available_GPUs = torch.cuda.device_count()
    dtype = torch.get_default_dtype()
    # Disable autograd globally; it is re-enabled locally for training.
    torch.set_grad_enabled(False)

    # set random seed
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    # arrange data
    data = get_dataset(args.name,
                       args.save_dir,
                       verbose=True,
                       device='cpu',
                       test_split=0.1)
    train_u = torch.tensor(data['u'],
                           requires_grad=True,
                           device=device,
                           dtype=dtype)
    test_u = torch.tensor(data['test_u'],
                          requires_grad=True,
                          device=device,
                          dtype=dtype)
    train_dudt = torch.tensor(data['dudt'], device=device, dtype=dtype)
    test_dudt = torch.tensor(data['test_dudt'], device=device, dtype=dtype)
    t_eval = data['t_eval']
    dt = data['dt']
    M = test_u.shape[-1]

    # Remember the un-flattened shapes; they are restored before the rollouts.
    train_shape_origin = train_u.shape
    test_shape_origin = test_u.shape
    # Consecutive state pairs (u1 -> u2) along axis 1, flattened to
    # (-1, 1, last_dim); their finite difference is the training target.
    u1 = train_u[:, :-1].contiguous().view(-1, 1, train_u.shape[-1])
    u2 = train_u[:, 1:].contiguous().view(-1, 1, train_u.shape[-1])
    dudt = ((u2 - u1) / dt).detach()
    train_u = train_u.view(-1, 1, train_u.shape[-1])
    test_u = test_u.view(-1, 1, test_u.shape[-1])
    train_dudt = train_dudt.view(-1, 1, train_dudt.shape[-1])
    test_dudt = test_dudt.view(-1, 1, test_dudt.shape[-1])

    # init model and optimizer
    alpha = 2 if args.name.startswith('ch') else 1
    model = dgnet.DGNetPDE1d(args.input_dim,
                             args.hidden_dim,
                             nonlinearity=args.nonlinearity,
                             model=args.model,
                             solver=args.solver,
                             name=args.name,
                             dx=data['dx'],
                             alpha=alpha)
    print(model)
    model = model.to(device)
    stats = {'train_loss': [], 'test_loss': []}

    # Resume from an existing checkpoint for this result path, if any.
    import glob
    files = glob.glob('{}.tar'.format(args.result_path))
    if len(files) > 0:
        f = files[0]
        path_tar = f
        model.load_state_dict(torch.load(path_tar, map_location=device))
        path_pkl = f.replace('.tar', '.pkl')
        stats = from_pickle(path_pkl)
        args.total_steps = 0  # skip training
        print('Model successfully loaded from {}'.format(path_tar))
    if args.load:
        # Load the checkpoint whose name has '_long' stripped from it.
        path_tar = '{}.tar'.format(args.result_path).replace('_long', '')
        model.load_state_dict(torch.load(path_tar, map_location=device))
        args.total_steps = 0  # skip training
        print('Model successfully loaded from {}'.format(path_tar))

    optim = torch.optim.Adam(model.parameters(),
                             args.learn_rate,
                             weight_decay=0)

    # vanilla train loop
    for step in range(args.total_steps):
        # train step on a random mini-batch of state pairs
        idx = torch.randperm(u1.shape[0])[:args.batch_size]
        with torch.enable_grad():
            if n_available_GPUs > 1:
                dudt_hat = torch.nn.parallel.data_parallel(
                    model,
                    u1[idx],
                    module_kwargs={
                        'dt': dt,
                        'x2': u2[idx],
                        'func': 'discrete_time_derivative'
                    })
            else:
                dudt_hat = model.discrete_time_derivative(u1[idx],
                                                          dt=dt,
                                                          x2=u2[idx])
            loss = L2_loss(dudt[idx], dudt_hat)
            optim.zero_grad()
            loss.backward()
            optim.step()

        # run test data
        test_idx = torch.randperm(test_u.shape[0])[:args.batch_size]
        test_dudt_hat = model.time_derivative(test_u[test_idx])
        test_loss = L2_loss(test_dudt[test_idx], test_dudt_hat)

        # logging
        stats['train_loss'].append(loss.item())
        stats['test_loss'].append(test_loss.item())
        if args.verbose and step % args.print_every == 0:
            print("step {}, train_loss {:.4e}, test_loss {:.4e}".format(
                step, loss.item(), test_loss.item()))

    # Final losses over the full train/test sets, evaluated in batches.
    if len(train_u) > 0:
        train_dudt_hat = torch.cat([
            model.time_derivative(train_u[idx:idx + args.batch_size])
            for idx in range(0, len(train_u), args.batch_size)
        ],
                                   dim=0)
        train_dist = (train_dudt - train_dudt_hat)**2
        test_dudt_hat = torch.cat([
            model.time_derivative(test_u[idx:idx + args.batch_size])
            for idx in range(0, len(test_u), args.batch_size)
        ],
                                  dim=0)
        test_dist = (test_dudt - test_dudt_hat)**2
        print('Final train loss {:.4e}\nFinal test loss {:.4e}'.format(
            train_dist.mean().item(),
            test_dist.mean().item()))
        stats['final_train_loss'] = train_dist.mean().item()
        stats['final_test_loss'] = test_dist.mean().item()
    else:
        stats['final_train_loss'] = 0.0
        stats['final_test_loss'] = 0.0

    # sequence generator
    os.makedirs('{}/results/'.format(args.save_dir), exist_ok=True)
    print('Generating test sequences')
    # Restore the original (un-flattened) shapes for per-sequence rollouts.
    train_u = train_u.view(*train_shape_origin)
    test_u = test_u.view(*test_shape_origin)

    import matplotlib as mpl
    mpl.use('Agg')
    import matplotlib.pyplot as plt
    test_u_truth = []
    test_u_model = []
    for idx in range(len(test_u)):
        print('Generating a sequence {}/{}'.format(idx, len(test_u)),
              end='\r')
        u_truth = test_u[idx].squeeze(1).detach().cpu().numpy()
        # Roll the model forward from the first state of this test sequence.
        u_model = model.get_orbit(
            x0=test_u[idx, :1],
            t_eval=t_eval).squeeze(2).squeeze(1).detach().cpu().numpy()
        test_u_truth.append(u_truth)
        test_u_model.append(u_model)

        # Energy according to the data-generating model ...
        energy_truth = data['model'].get_energy(u_truth)
        energy_model = data['model'].get_energy(u_model)
        # ... and, unless the model is 'node', the trained model's own
        # output scaled by dx (plotted as the dashed energy curves below).
        if args.model != 'node':
            energy_model_truth = model(
                torch.from_numpy(u_truth).unsqueeze(-2).to(device)).squeeze(
                    2).squeeze(1).detach().cpu().numpy() * data['dx']
            energy_model_model = model(
                torch.from_numpy(u_model).unsqueeze(-2).to(device)).squeeze(
                    2).squeeze(1).detach().cpu().numpy() * data['dx']
        mass_truth = u_truth.sum(-1)
        mass_model = u_model.sum(-1)

        # Symmetric colour range: fixed for 'ch*' experiments, data-driven
        # otherwise.
        if args.name.startswith('ch'):
            vmax = 1
            vmin = -1
        else:
            vmax = max(np.abs(u_truth).max(), np.abs(u_model).max())
            vmin = -vmax
        fig, (ax1, ax2, ax3, ax4) = plt.subplots(4,
                                                 1,
                                                 sharex=True,
                                                 figsize=(9., 15.),
                                                 facecolor='white')
        ax1.imshow(u_truth.T,
                   interpolation='nearest',
                   vmin=vmin,
                   vmax=vmax,
                   cmap='seismic')
        ax1.set_aspect('auto')
        ax1.set_yticks((-0.5, M - 0.5))
        ax1.set_yticklabels((0, 1))
        ax2.imshow(u_model.T,
                   interpolation='nearest',
                   vmin=vmin,
                   vmax=vmax,
                   cmap='seismic')
        ax2.set_aspect('auto')
        ax2.set_yticks((-0.5, M - 0.5))
        ax2.set_yticklabels((0, 1))
        # Invisible line that only contributes the 'energy' legend entry.
        ax3.plot([], [], color='white', label='energy')
        if args.model != 'node':
            ax3.plot(energy_model_truth - energy_model_truth[0],
                     dashes=[2, 2],
                     color='C0')
            ax3.plot(energy_model_model - energy_model_model[0],
                     dashes=[2, 2],
                     color='C1')
        ax3.plot(energy_truth - energy_truth[0],
                 color='C0',
                 label='ground truth')
        ax3.plot(energy_model - energy_model[0], color='C1', label=args.model)
        ax3.legend()
        ax4.plot([], [], color='white', label='mass')
        ax4.plot(mass_truth, color='C0')
        ax4.plot(mass_model, color='C1')
        # Tick positions are in step units (t / dt), labels in time units.
        ax4.set_xticks(t_eval[::len(t_eval) // 5] / dt)
        ax4.set_xticklabels(t_eval[::len(t_eval) // 5])
        ax4.set_xlabel('time')
        fig.savefig('{}_plot{:02d}.png'.format(args.result_path, idx))
        plt.close()

    # Aggregate metrics over all test sequences, excluding index 0 of axis 1
    # (the initial state).
    test_u_truth = np.stack(test_u_truth, axis=0)[:, 1:]
    test_u_model = np.stack(test_u_model, axis=0)[:, 1:]
    energy_truth = data['model'].get_energy(test_u_truth)
    energy_model = data['model'].get_energy(test_u_model)
    print('energy MSE model', ((energy_truth - energy_model)**2).mean())
    stats['energy_mse_mean'] = ((energy_truth - energy_model)**2).mean()
    print('state MSE model', ((test_u_truth - test_u_model)**2).mean())
    stats['state_mse_mean'] = ((test_u_truth - test_u_model)**2).mean()
    stats['test_u_truth'] = test_u_truth
    stats['test_u_model'] = test_u_model
    stats['energy_truth'] = energy_truth
    stats['energy_model'] = energy_model
    if args.model != 'node':
        energy_model_truth = model(
            torch.from_numpy(test_u_truth).reshape(
                -1, 1, test_u_truth.shape[-1]).to(device)).detach().cpu(
                ).numpy().reshape(*test_u_truth.shape[:-1])
        energy_model_model = model(
            torch.from_numpy(test_u_model).reshape(
                -1, 1, test_u_model.shape[-1]).to(device)).detach().cpu(
                ).numpy().reshape(*test_u_model.shape[:-1])
        stats['energy_model_truth'] = energy_model_truth
        stats['energy_model_model'] = energy_model_model

    return model, stats