Example #1
def fit_and_forecast(model_idx, model, embedding, x_train, x_test):
    revenue_scale = utils.from_pickle(Config.save_dir + "revenue_scale")
    volume_scale = utils.from_pickle(Config.save_dir + "volume_scale")
    store_ids = x_test.store_id_bk.unique()

    for sid in store_ids:
        s_train = x_train[x_train.store_id_bk == sid]
        s_test = x_test[x_test.store_id_bk == sid]
        # pandas.DataFrame.append is deprecated; concatenate train and test instead
        s_x = pd.concat([s_train, s_test]).reset_index(drop=True).drop(
            ["store_id_bk"], axis=1)

        if embedding:
            s_fit_forecast = model.predict(split_features(np.array(s_x)))
        else:
            s_fit_forecast = model.predict(np.array(s_x))

        s_fit_forecast[:, 0] = revenue_scale.inverse_transform(
            s_fit_forecast[:, 0].reshape(-1, 1))[:, 0]
        #        s_fit_forecast[:, 1] = volume_scale.inverse_transform(
        #            s_fit_forecast[:, 1])

        if not os.path.exists(Config.output_path):
            os.makedirs(Config.output_path)

        # Store fit and forecast to file
        np.savetxt(Config.output_path + "{0}_{1}.csv".format(model_idx, sid),
                   s_fit_forecast[:, 0])
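
Every snippet in this collection calls from_pickle/to_pickle helpers that live in each project's own utils module and are not shown here; even the argument order differs between projects (Examples #7 and #11 pass the path first). A minimal sketch of what these helpers are assumed to do, matching the to_pickle(obj, path) form used by most of the examples:

import pickle

def to_pickle(thing, path):
    # Sketch only: serialize 'thing' to 'path'; the real helpers may differ per project.
    with open(path, 'wb') as handle:
        pickle.dump(thing, handle, protocol=pickle.HIGHEST_PROTOCOL)

def from_pickle(path):
    # Load and return the pickled object stored at 'path'.
    with open(path, 'rb') as handle:
        return pickle.load(handle)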
Example #2
def get_dataset(seed=0, samples=50, test_split=0.5, save_dir=None, us=[0], rad=False, **kwargs):
    data = {}

    assert save_dir is not None
    path = '{}/cartpole-gym-dataset.pkl'.format(save_dir)
    try:
        data = from_pickle(path)
        print("Successfully loaded data from {}".format(path))
    except Exception:
        print("Had a problem loading data from {}. Rebuilding dataset...".format(path))
        trajs_force = []
        for u in us:
            trajs, tspan, _ = sample_gym(seed=seed, trials=samples, u=u, **kwargs)
            trajs_force.append(trajs)
        data['x'] = np.stack(trajs_force, axis=0) # (3, 45, 50, 3)
        # make a train/test split
        split_ix = int(samples * test_split)
        split_data = {}
        split_data['x'], split_data['test_x'] = data['x'][:,:,:split_ix,:], data['x'][:,:,split_ix:,:]

        data = split_data
        data['t'] = tspan

        # to_pickle(data, path)
    return data
Example #3
def run_lr(num_models):
    # Load store_ids
    store_ids = utils.from_pickle(Config.save_dir + "store_id.pkl")

    # Compute mse, mae and mape
    mse, mae, mape = 0.0, 0.0, 0.0
    for sid in store_ids:
        if not os.path.exists(Config.output_path +
                              "{0}_{1}.csv".format(0, sid)):
            continue

        # Prepare train/test
        x, y = form_whole_dataset(num_models, sid)
        x_train, x_test, y_train, y_test = split_train_test(x, y)

        # Build linear regression model and do prediction
        lrm = LinearRegression()
        lrm.fit(x_train, y_train)
        pred = lrm.predict(x_test)

        # Compute mse, mae and mape
        mse += metrics.mean_squared_error(y_test, pred)
        mae += metrics.mean_absolute_error(y_test, pred)
        mape += np.mean(abs(y_test - pred) / y_test)

    mse /= len(store_ids)
    mae /= len(store_ids)
    mape /= len(store_ids)

    return mse, mae, mape
Example #4
def get_dataset(seed=0, samples=50, test_split=0.5, save_dir=None, 
                us=[0], name='pendulum-gym-image-dataset.pkl', **kwargs):
    data = {}

    assert save_dir is not None
    # path = '{}/pendulum-small-angle-image-dataset.pkl'.format(save_dir)
    path = os.path.join(save_dir, name)
    try:
        data = from_pickle(path)
        print("Successfully loaded data from {}".format(path))
    except Exception:
        print("Had a problem loading data from {}. Rebuilding dataset...".format(path))
        trajs_frames_force = []
        trajs_force = []
        for u in us:
            trajs_frames, trajs, tspan, _ = sample_gym(seed=seed, trials=samples, u=u, **kwargs)
            trajs_frames_force.append(trajs_frames)
            trajs_force.append(trajs)
        # make a train/test split
        split_ix = int(samples * test_split)
        tmp = np.stack(trajs_frames_force, axis=0) # (n_u, n_ts, n_trial, 50, 50)
        data['x'], data['test_x'] = tmp[:,:,:split_ix,:,:], tmp[:,:,split_ix:,:,:]
        tmp = np.stack(trajs_force, axis=0) # (n_u, n_ts, n_trial, 3)
        data['obs'], data['test_obs'] = tmp[:,:,:split_ix,:], tmp[:,:,split_ix:,:]

        data['t'] = tspan
        data['us'] = us

        to_pickle(data, path)
    return data
Example #5
def get_dataset(seed=0, samples=50, test_split=0.5, save_dir=None, 
                us=[0], name='acrobot-gym-image-dataset-rgb-0.pkl', **kwargs):
    data = {}

    assert save_dir is not None
    path = save_dir + '/' + name
    try:
        data = from_pickle(path)
        print("Successfully loaded data from {}".format(path))
    except Exception:
        print("Had a problem loading data from {}. Rebuilding dataset...".format(path))
        trajs_frames_force = []
        trajs_force = []
        for u in us:
            trajs_frames, trajs, tspan, _ = sample_gym(seed=seed, trials=samples, u=u, **kwargs)
            trajs_frames = (np.moveaxis(trajs_frames, -1, -3) / 255.0)
            trajs_frames_force.append(trajs_frames)
            trajs_force.append(trajs)
        # make a train/test split
        split_ix = int(samples * test_split)
        tmp = np.stack(trajs_frames_force, axis=0) # (n_u, n_ts, n_trial, 3, 64, 64)
        data['x'], data['test_x'] = tmp[:,:,:split_ix], tmp[:,:,split_ix:]
        tmp = np.stack(trajs_force, axis=0) # (n_u, n_ts, n_trial, 3)
        data['obs'], data['test_obs'] = tmp[:,:,:split_ix,:], tmp[:,:,split_ix:,:]

        data['t'] = tspan
        data['us'] = us

        to_pickle(data, path)
    return data
Example #6
def run_lr(num_models):
    # Load store_ids
    store_ids = utils.from_pickle(Config.save_dir + "store_id.pkl")

    # Compute mse, mae and mape
    mse, mae, mape = 0.0, 0.0, 0.0
    for sid in store_ids:
        if not os.path.exists(Config.output_path +
                              "{0}_{1}.csv".format(0, sid)):
            continue

        # Prepare train/test
        x, y = form_whole_dataset(num_models, sid)
        x_train, x_test, y_train, y_test = split_train_test(x, y)

        # Build linear regression model and do prediction
        lrm = LinearRegression()
        lrm.fit(x_train, y_train)
        pred = lrm.predict(x_test)

        # Compute mse, mae and mape
        mse += metrics.mean_squared_error(y_test, pred)
        mae += metrics.mean_absolute_error(y_test, pred)
        mape += np.mean(abs(y_test - pred) / y_test)

    mse /= len(store_ids)
    mae /= len(store_ids)
    mape /= len(store_ids)

    return mse, mae, mape
Example #7
def main(input_fname, output_fname):
    guide_to_seqnames = from_pickle(input_fname)

    fingerprinted_guides = [
        (fingerprint(guide), guide) for guide in guide_to_seqnames.keys()
    ]
    fingerprinted_guides.sort(key=lambda a: a[0])
    sorted_guides = [guide for _, guide in fingerprinted_guides]

    to_pickle(output_fname, sorted_guides)
Example #8
def final_data(embedding=True, num_var2keeps=None, cat_var2keeps=None):
    # Default the keep-lists to empty so the column selections below do not fail on None
    num_var2keeps = num_var2keeps or []
    cat_var2keeps = cat_var2keeps or []

    # Build train/test dataframes
    train_filename = Config.save_dir + "train_set.pkl"
    test_filename = Config.save_dir + "test_set.pkl"
    if os.path.exists(train_filename) and os.path.exists(test_filename):
        train = utils.from_pickle(train_filename)
        test = utils.from_pickle(test_filename)
    else:
        train, test = _train_test_split()

    train = train[num_var2keeps + cat_var2keeps +
                  ["store_id_bk", "total_revenue", "total_volume"]]
    test = test[num_var2keeps + cat_var2keeps +
                ["store_id_bk", "total_revenue", "total_volume"]]

    if embedding:
        return _data2embedding(train, test)
    else:
        return _data2onehot(train, test, cat_var2keeps)
Example #9
def get_dataset(experiment_name, save_dir, **kwargs):
    '''Returns a PDE dataset.'''
    path = '{}/{}-dataset.pkl'.format(save_dir, experiment_name)

    try:
        data = from_pickle(path)
        print("Successfully loaded data from {}".format(path))
    except Exception:
        print(
            "Had a problem loading data from {}. Rebuilding dataset...".format(
                path))
        data = make_dataset(experiment_name, **kwargs)
        to_pickle(data, path)
        os.makedirs('{}/data/'.format(save_dir), exist_ok=True)
        import matplotlib as mpl
        mpl.use('Agg')
        import matplotlib.pyplot as plt
        u_all = np.concatenate([data['u'], data['test_u']], axis=0)
        energy_all = np.concatenate([data['energy'], data['test_energy']],
                                    axis=0)
        mass_all = u_all.sum(-1).squeeze(-1)
        for idx in range(len(u_all)):
            u = u_all[idx]
            energy = energy_all[idx]
            mass = mass_all[idx]
            fig, (ax1, ax2, ax3) = plt.subplots(3,
                                                1,
                                                sharex=True,
                                                figsize=(6., 6.),
                                                facecolor='white')
            t = data['t_eval']
            M = u.shape[-1]
            y = np.arange(M) / M
            T, Y = np.meshgrid(t, y)
            if experiment_name.startswith('ch'):
                ax1.pcolormesh(T,
                               Y,
                               u.squeeze(1).T,
                               cmap='seismic',
                               vmin=-1,
                               vmax=1)
            else:
                ax1.pcolormesh(T, Y, u.squeeze(1).T, cmap='seismic')
            ax1.set_aspect('auto')
            ax1.set_yticks((0 - .5 / M, 1 - .5 / M))
            ax1.set_yticklabels((0, 1))
            ax2.plot(t, energy)
            ax3.plot(t, mass)
            ax3.set_xticks((t[0], t[-1]))
            ax3.set_xticklabels((t[0], t[-1]))
            fig.savefig('{}/data/data_{}_{:02d}.png'.format(
                save_dir, experiment_name, idx))
            plt.close()
    return data
Example #10
    def __init__(self, data_path):
        data = from_pickle(data_path)
        # data['x'][0] has shape (time_steps, batch_size, d, d), i.e. square d x d frames
        ts, bs, d, _ = data['x'][0].shape
        flatten_images = np.reshape(data['x'][0], (ts, bs, d * d),
                                    order='F').astype('float32')
        concat_images = np.concatenate(
            (flatten_images[:-1], flatten_images[1:]), axis=2)

        self.x = np.reshape(concat_images[:-1], ((ts - 2) * bs, 2 * d * d),
                            order='F')
        self.next_x = np.reshape(concat_images[1:], ((ts - 2) * bs, 2 * d * d),
                                 order='F')
Example #11
def _train_test_split():
    # Build the store_weather dataframe
    store_weather_filename = Config.save_dir + "store_weather.pkl"
    if os.path.exists(store_weather_filename):
        store_weather = utils.from_pickle(store_weather_filename)
    else:
        store_weather = _preprocess_data()

    # Split train test for each store
    train = pd.DataFrame({})
    test = pd.DataFrame({})
    store_ids = store_weather.store_id_bk.unique()
    for sid in store_ids:
        c_store = store_weather[store_weather.store_id_bk == sid]
        s_train = c_store[:-Config.test_size]
        s_test = c_store[-Config.test_size:]
        # pandas.DataFrame.append is deprecated; concatenate instead
        train = pd.concat([train, s_train]).reset_index(drop=True)
        test = pd.concat([test, s_test]).reset_index(drop=True)

    # Scale numeric columns
    num_cols = ["p_total_revenue", "p_total_volume", "mean_temp",
                "total_precipitation", "total_snow"]
    scaler = MaxAbsScaler().fit(train.loc[:, num_cols])
    train.loc[:, num_cols] = scaler.transform(train.loc[:, num_cols])
    test.loc[:, num_cols] = scaler.transform(test.loc[:, num_cols])

    # Scale 2 output columns
    revenue_scale = MaxAbsScaler().fit(train.loc[:, ["total_revenue"]])
    volume_scale = MaxAbsScaler().fit(train.loc[:, ["total_volume"]])
    train.loc[:, ["total_revenue"]] = revenue_scale.transform(
        train.loc[:, ["total_revenue"]])
    test.loc[:, ["total_revenue"]] = revenue_scale.transform(
        test.loc[:, ["total_revenue"]])
    train.loc[:, ["total_volume"]] = volume_scale.transform(
        train.loc[:, ["total_volume"]])
    test.loc[:, ["total_volume"]] = volume_scale.transform(
        test.loc[:, ["total_volume"]])

    # Save the train/test dataframes to pickle objects
    utils.to_pickle(Config.save_dir + "train_set.pkl", train)
    utils.to_pickle(Config.save_dir + "test_set.pkl", test)

    # Save the 2 scaler for later use
    utils.to_pickle(Config.save_dir + "revenue_scale", revenue_scale)
    utils.to_pickle(Config.save_dir + "volume_scale", volume_scale)

    # Save store_ids
    utils.to_pickle(Config.save_dir + "store_id.pkl", store_ids)

    return train, test
Example #12
def get_dataset(experiment_name, save_dir, **kwargs):
    '''Returns an orbital dataset. Also constructs
    the dataset if no saved version is available.'''

    path = '{}/{}-orbits-dataset.pkl'.format(save_dir, experiment_name)

    try:
        data = from_pickle(path)
        print("Successfully loaded data from {}".format(path))
    except Exception:
        print("Had a problem loading data from {}. Rebuilding dataset...".format(path))
        data = make_orbits_dataset(**kwargs)
        to_pickle(data, path)

    return data
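
Examples #2, #4, #5, #9, #12, #14, #15, and #17 all wrap the pickle helpers in the same cache-or-rebuild pattern: try to load a saved dataset and regenerate (and usually re-save) it when loading fails. A generic sketch of that pattern, reusing the from_pickle/to_pickle helpers sketched after Example #1; the build_fn callable and the exception tuple are illustrative assumptions, not code from any of these repositories:

import pickle

def load_or_build(path, build_fn, save=True):
    # Return the dataset cached at 'path'; rebuild it with build_fn() and
    # optionally re-cache it when the pickle is missing or unreadable.
    try:
        data = from_pickle(path)
        print("Successfully loaded data from {}".format(path))
    except (FileNotFoundError, EOFError, pickle.UnpicklingError):
        print("Had a problem loading data from {}. Rebuilding dataset...".format(path))
        data = build_fn()
        if save:
            to_pickle(data, path)
    return data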
Example #13
def get_model(args, baseline, structure, damping, num_points, gym=False):
    if not structure and baseline:
        nn_model = MLP(args.input_dim, 600, args.input_dim,
                       args.nonlinearity).to(device)
        model = SymODEN_R(args.input_dim,
                          H_net=nn_model,
                          device=device,
                          baseline=True)
    elif not structure and not baseline:
        H_net = MLP(args.input_dim, 400, 1, args.nonlinearity).to(device)
        g_net = MLP(int(args.input_dim / 2), 200,
                    int(args.input_dim / 2)).to(device)
        model = SymODEN_R(args.input_dim,
                          H_net=H_net,
                          g_net=g_net,
                          device=device,
                          baseline=False)
    elif structure and not baseline:
        # M_net = MLP(1, args.hidden_dim, 1).to(device)
        M_net = MLP(int(args.input_dim / 2), 300, int(args.input_dim / 2))
        V_net = MLP(int(args.input_dim / 2), 50, 1).to(device)
        g_net = MLP(int(args.input_dim / 2), 200,
                    int(args.input_dim / 2)).to(device)
        model = SymODEN_R(args.input_dim,
                          M_net=M_net,
                          V_net=V_net,
                          g_net=g_net,
                          device=device,
                          baseline=False,
                          structure=True).to(device)
    else:
        raise RuntimeError(
            'argument *baseline* and *structure* cannot both be true')
    model_name = 'baseline_ode' if baseline else 'hnn_ode'
    struct = '-struct' if structure else ''
    rad = '-rad' if args.rad else ''
    path = '{}pend-{}{}-{}-p{}{}.tar'.format(args.save_dir, model_name, struct,
                                             args.solver, num_points, rad)
    model.load_state_dict(torch.load(path, map_location=device))
    path = '{}/pend-{}{}-{}-p{}-stats{}.pkl'.format(args.save_dir, model_name,
                                                    struct, args.solver,
                                                    num_points, rad)
    stats = from_pickle(path)
    return model, stats
Example #14
    def get_dataset(self, experiment_name, save_dir):
        '''Returns the trajectory dataset. Also constructs
           the dataset if no saved version is available.'''

        path = '{}/{}-orbits-dataset_{}_EnsemblesPerEnergy_{}_OrbitLen_{}_Resolution_{}_energyPoints{}.pkl'.format(
            save_dir, experiment_name, self.integrator, self.ensembles,
            self.tspan[1], self.time_points, self.energyPoints)
        #path = "../Henon-Heiles-orbits-dataset_RK45_EnsemblesPerEnergy_20_OrbitLen_5000_Resolution_50000_energyPoints20.pkl"
        #path = "../Henon-Heiles-orbits-dataset_RK45_EnsemblesPerEnergy_20_OrbitLen_1000_Resolution_10000_energyPoints20.pkl"
        try:
            data = from_pickle(path)
            print("Successfully loaded data from {}".format(path))
        except Exception:
            print("Had a problem loading data from {}. Rebuilding dataset...".
                  format(path))
            data = self.make_orbits_dataset()
            to_pickle(data, path)

        return data
Example #15
def get_dataset(experiment_name, save_dir, **kwargs):
  '''Returns a dataset built on top of OpenAI Gym observations. Also constructs
  the dataset if no saved version is available.'''
  
  if experiment_name == "pendulum":
    env_name = "Pendulum-v0"
  elif experiment_name == "acrobot":
    env_name = "Acrobot-v1"
  else:
    assert experiment_name in ['pendulum']
    
  path = '{}/{}-pixels-dataset.pkl'.format(save_dir, experiment_name)

  try:
      data = from_pickle(path)
      print("Successfully loaded data from {}".format(path))
  except Exception:
      print("Had a problem loading data from {}. Rebuilding dataset...".format(path))
      data = make_gym_dataset(**kwargs)
      to_pickle(data, path)

  return data
Example #16
def get_model(args, baseline, structure, naive, num_points):
    M_net = PSD(3, 400, 2).to(device)
    g_net = MatrixNet(3, 300, 4, shape=(2,2)).to(device)
    if not structure:
        if naive and baseline:
            raise RuntimeError('argument *baseline* and *naive* cannot both be true')
        elif naive:
            input_dim = 6
            output_dim = 5
            nn_model = MLP(input_dim, 1000, output_dim, args.nonlinearity).to(device)
            model = SymODEN_R1_T1(args.num_angle, H_net=nn_model, device=device, baseline=baseline, naive=naive, u_dim=2)
        elif baseline:
            input_dim = 6
            output_dim = 4
            nn_model = MLP(input_dim, 700, output_dim, args.nonlinearity).to(device)
            model = SymODEN_R1_T1(args.num_angle, H_net=nn_model, M_net=M_net, device=device, baseline=baseline, naive=naive, u_dim=2)
        else:
            input_dim = 5
            output_dim = 1
            nn_model = MLP(input_dim, 500, output_dim, args.nonlinearity).to(device)
            model = SymODEN_R1_T1(args.num_angle, H_net=nn_model, M_net=M_net, g_net=g_net, device=device, baseline=baseline, naive=naive, u_dim=2)
    elif structure and not baseline and not naive:
        V_net = MLP(3, 300, 1).to(device)
        model = SymODEN_R1_T1(args.num_angle, M_net=M_net, V_net=V_net, g_net=g_net, device=device, baseline=baseline, structure=True, u_dim=2).to(device)
    else:
        raise RuntimeError('when *structure* is true, *baseline* and *naive* must both be false')

    if naive:
        label = '-naive_ode'
    elif baseline:
        label = '-baseline_ode'
    else:
        label = '-hnn_ode'
    struct = '-struct' if structure else ''
    path = '{}/{}{}{}-{}-p{}.tar'.format(args.save_dir, args.name, label, struct, args.solver, args.num_points)
    model.load_state_dict(torch.load(path, map_location=device))
    path = '{}/{}{}{}-{}-p{}-stats.pkl'.format(args.save_dir, args.name, label, struct, args.solver, args.num_points)
    stats = from_pickle(path)
    return model, stats
Example #17
def get_dataset(experiment_name, save_dir, u, **kwargs):
    '''Returns a dataset built on top of OpenAI Gym observations. Also constructs
    the dataset if no saved version is available.'''

    if experiment_name == "pendulum":
        env_name = "Pendulum-v0"
    elif experiment_name == "acrobot":
        env_name = "Acrobot-v1"
    elif experiment_name == "cartpole":
        env_name = 'My_FA_CartPole-v0'
    else:
        assert experiment_name in ['pendulum', 'acrobot', 'cartpole']

    path = '{}/{}-pixels-dataset.pkl'.format(save_dir, experiment_name)

    try:
        data = from_pickle(path)
        print("Successfully loaded data from {}".format(path))
    except Exception:
        print(
            "Had a problem loading data from {}. Rebuilding dataset...".format(
                path))

        data = {}
        for u_ in u:
            data_ = make_gym_dataset(u=u_, **kwargs)
            for k, v in data_.items():
                if k in ['meta']:
                    continue

                new = data_[k]
                old = data.get(k, np.array([]).reshape(0, new.shape[1]))
                data[k] = np.vstack((old, data_[k]))

        to_pickle(data, path)

    return data
Example #18
    def deserialize(self, string, **kw):
        """
        Function for serializing object => string.
        This can be overwritten for custom 
        uses.

        The default is to do nothing ('serializer'=None)
        If the connection is intialized with 'serializer' set to 
        'json.gz', 'json', 'gz', or 'zip', we'll do the 
        transformations.
        """

        serializer = kw.get('serializer',  self._serializer)

        if serializer == "json.gz":
            return utils.from_json(utils.from_gz(string))
        
        elif serializer == "json":
            return utils.from_json(string)

        elif serializer == "gz":
            return utils.from_gz(string)

        elif serializer == "zip":
            return utils.from_zip(string)

        elif serializer == "pickle":
            return utils.from_pickle(string)

        elif serializer is not None:

            raise NotImplementedError(
                'Only json, gz, json.gz, zip, and pickle '
                'are supported as serializers.')
        
        return string
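
A short usage sketch of the method above. The Connection class name, its constructor argument, and the behaviour of utils.from_json are assumptions for illustration; only the deserialize() logic comes from the snippet itself:

# Hypothetical usage; Connection and its 'serializer' constructor argument are assumed.
conn = Connection(serializer='json')
obj = conn.deserialize('{"a": 1}')                      # assumed utils.from_json -> {'a': 1}
raw = conn.deserialize('plain text', serializer=None)   # no serializer: string returned unchanged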
Example #19
def main(cfg):
    # --------------------
    # setup
    # --------------------
    logger.debug('setup...')
    EXPERIMENT_NAME = '{}_{}'.format(
        datetime.datetime.now().strftime('%Y%m%d'), cfg.experiment)
    logger.add(pathlib.Path(cfg.path + f'output/log/{EXPERIMENT_NAME}.log'),
               enqueue=True,
               backtrace=True)
    scores = {}

    # --------------------
    # load data
    # --------------------
    logger.debug('load preprocessed & fe data...')
    train = from_pickle(
        pathlib.Path(cfg.path +
                     f'output/feature_engineered/train{EXPERIMENT_NAME}.pkl'))
    test = from_pickle(
        pathlib.Path(cfg.path +
                     f'output/feature_engineered/test{EXPERIMENT_NAME}.pkl'))

    # new targets
    ids = 'data_id'
    target = 'Global_Sales'
    non_targets = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']
    for c in list(
            itertools.combinations(
                ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales'], 3)):
        new_tg = c[0] + '_' + c[1] + '_' + c[2]
        train[new_tg] = train[c[0]] + train[c[1]] + train[c[2]]
        non_targets.append(new_tg)

    # log transform
    for t in non_targets + [target]:
        train[t] = np.log1p(train[t])
        assert t in train.columns.values.tolist()
        logger.debug(t + ' exists')

    # features to use
    drops = [
        'Platform_Genre_nunique_by_Publisher',
        'Year_of_Release_nunique_by_Publisher',
        'Platform_nunique_by_Publisher', 'Platform_nunique_by_Developer',
        'Genre_nunique_by_Developer', 'category2vec_0_max',
        'category2vec_0_min'
    ]
    drops += [
        f for f in test.columns.values.tolist()
        if ('_x_' in f) & f.endswith('Publisher')
    ]
    if cfg.fit.cv == 'KFold':
        group = 'Publisher'
    elif cfg.fit.cv == 'StratifiedKFold2':
        train['bins'] = create_folds(train, target)
        group = 'bins'
    cat_feats = ['Platform', 'Genre', 'Rating', 'Platform_Genre']
    drops += [ids, group, 'Publisher', target] + non_targets
    features = [f for f in test.columns.values.tolist() if f not in drops]
    cat_feats = [f for f in cat_feats if f in features]

    # --------------------
    # 1st feature selection (adversarial validation)
    # --------------------
    m = 'lgb'
    train['is_train'] = 1
    test['is_train'] = 0
    df = pd.concat([train, test], ignore_index=True)
    auc = 1
    counts = 0
    while auc > cfg.fit.adversarial_validation:
        if counts > 0:
            drops = fi_df['features'].values[:int(0.02 *
                                                  len(features))].tolist()
            print('drops:', drops)
            features = fi_df['features'].values[int(0.02 *
                                                    len(features)):].tolist()
            cat_feats = [f for f in cat_feats if f in features]

        # fit
        oof_, y_pred_, fi_df, model = fit_single_model(df,
                                                       df,
                                                       'is_train',
                                                       features,
                                                       cat_feats,
                                                       group,
                                                       m,
                                                       task='binary',
                                                       cv='StratifiedKFold',
                                                       n_splits=2,
                                                       nsa=1,
                                                       cfg=cfg)
        auc = model.score
        counts += 1
        logger.debug('adversarial validation score (auc) = {}'.format(auc))

    if cfg.fit.adversarial_validation < 1:
        savepath_adv = pathlib.Path(
            cfg.path + f'output/feature_importance/adv_{EXPERIMENT_NAME}.csv')
        fi_df.to_csv(savepath_adv, index=False)
    else:
        logger.debug('skipping adversarial validation...')

    # --------------------
    # 2nd feature selection (simply fit and select top 64% features)
    # --------------------
    # initialize for stacking
    m = 'lgb'
    n = 'full_feats_' + target
    cv = cfg.fit.cv
    task = 'regression'
    oof_df = pd.DataFrame()
    oof_df[target] = train[target].values
    oof_df[group] = train[group].values
    oof_df[ids] = train[ids].values
    ypred_df = pd.DataFrame()
    ypred_df[ids] = test[ids].values
    assert oof_df[ids].values[-1] + 1 == ypred_df[ids].values[0]

    # fit
    oof_, y_pred_, fi_df, _ = fit_single_model(train,
                                               test,
                                               target,
                                               features,
                                               cat_feats,
                                               group,
                                               model_name=m,
                                               task=task,
                                               cv=cv,
                                               n_splits=cfg.fit.nfold,
                                               nsa=cfg.fit.nsa,
                                               cfg=cfg)
    savepath, score = after_modeling(train[target].values, oof_, fi_df, m, cv,
                                     cfg, EXPERIMENT_NAME)
    scores[f'lgb_full{len(features)}_features'] = score

    # feature selection
    features = fi_df['features'].values[:int(0.64 * len(features))].tolist()
    cat_feats = [f for f in cat_feats if f in features]
    logger.debug(f'{len(features)} selected features')

    # assign
    oof_df[n] = oof_
    ypred_df[n] = y_pred_

    # ------------------------
    # fit for non-targets (for stacking)
    # ------------------------
    non_targets = [
        f for f in non_targets
        if f not in ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']
    ]
    for i, t in enumerate(non_targets):
        # fit
        if (cfg.mode == 'debug') & (t != target):
            continue
        n = f'pred_{t}'
        logger.debug('# -----------------------')
        logger.debug(f'predicting {t}...')
        logger.debug('# -----------------------')
        oof_, y_pred_, fi_df, _ = fit_single_model(train,
                                                   test,
                                                   t,
                                                   features,
                                                   cat_feats,
                                                   group,
                                                   model_name=m,
                                                   task=task,
                                                   cv=cv,
                                                   n_splits=cfg.fit.nfold,
                                                   nsa=cfg.fit.nsa,
                                                   cfg=cfg)

        # assign
        oof_df[n] = oof_
        ypred_df[n] = y_pred_

    # ------------------------
    # fit for target
    # ------------------------
    models = ['catb', 'xgb']
    for m in models:
        # fitting
        logger.debug('# -----------------------')
        logger.debug(f'predicting {target} by {m}...')
        logger.debug('# -----------------------')
        oof_, y_pred_, fi_df, _ = fit_single_model(train,
                                                   test,
                                                   target,
                                                   features,
                                                   cat_feats,
                                                   group,
                                                   model_name=m,
                                                   task=task,
                                                   cv=cv,
                                                   n_splits=cfg.fit.nfold,
                                                   nsa=cfg.fit.nsa,
                                                   cfg=cfg)

        # assign
        oof_df[f'{m}_{target}'] = oof_
        ypred_df[f'{m}_{target}'] = y_pred_

    # store validation scores
    for n in ypred_df.columns.values.tolist():
        if target in n:
            score = validation_score(train[target].values, oof_df[n].values)
            logger.debug('Overall score for {} = {}'.format(n, score))
            scores[n] = score

    # -------------------
    # stacking
    # -------------------
    logger.debug('stacking ensemble...')
    m = 'linear'
    oof = np.zeros(train.shape[0])
    y_pred = np.zeros(test.shape[0])
    logger.debug(f'fitting {m}...')
    stacking_feats = [
        f for f in ypred_df.columns.values.tolist() if f not in drops
    ]
    oof, y_pred, fi_df, _ = fit_single_model(oof_df,
                                             ypred_df,
                                             target,
                                             stacking_feats, [],
                                             group,
                                             model_name=m,
                                             task=task,
                                             cv=cv,
                                             n_splits=cfg.fit.nfold,
                                             nsa=cfg.fit.nsa,
                                             cfg=cfg)

    # -------------------
    # evaluate results
    # -------------------
    ss = pd.read_csv(
        pathlib.Path(cfg.path + 'input/atmacup8_sample-submission.csv'))
    ss[target] = np.expm1(y_pred)
    score = validation_score(train[target].values, oof)
    logger.debug('Overall score for ensemble = {}'.format(score))
    scores['final'] = score

    # --------------------
    # save files
    # --------------------
    logger.debug('saving files...')

    # oof, submissions
    fi_df.to_csv(pathlib.Path(
        cfg.path + f'output/feature_importance/weights_{EXPERIMENT_NAME}.csv'),
                 index=False)
    np.save(pathlib.Path(cfg.path + f'output/oof/oof_final{EXPERIMENT_NAME}'),
            oof)
    ss.to_csv(pathlib.Path(cfg.path +
                           f'output/submission/{EXPERIMENT_NAME}.csv'),
              index=False)

    # --------------------
    # mlflow
    # --------------------
    mlflow.set_tracking_uri("http://mlflow:5000")
    mlflow.set_experiment(EXPERIMENT_NAME)
    with tempfile.TemporaryDirectory() as tmp_dir, mlflow.start_run() as run:
        logger.debug('tracking uri: {}'.format(mlflow.get_tracking_uri()))
        logger.debug('artifact uri: {}'.format(mlflow.get_artifact_uri()))

        # hyperparameters
        mlflow.log_params(cfg.lightgbm)
        if cfg.mode != 'debug':
            mlflow.log_params(cfg.catboost)
            mlflow.log_params(cfg.xgboost)

        # settings
        mlflow.log_param('mode', cfg.mode)
        mlflow.log_param('seed', cfg.seed)
        mlflow.log_param('ensemble', cfg.ensemble)

        # scores
        for k, v in scores.items():
            mlflow.log_metric(k, v)

        # outputs
        artifacts = {
            'ensemble_weights': cfg.path +
            f'output/feature_importance/weights_{EXPERIMENT_NAME}.csv',
            'feature_importance': savepath,
            'oof': cfg.path + f'output/oof/oof_final{EXPERIMENT_NAME}.npy',
            'submission':
            cfg.path + f'output/submission/{EXPERIMENT_NAME}.csv',
        }
        for name, file_path in artifacts.items():
            mlflow.log_artifact(pathlib.Path(file_path))

    logger.debug('all done')
Example #20
def train(args):
    device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    n_available_GPUs = torch.cuda.device_count()
    dtype = torch.get_default_dtype()
    torch.set_grad_enabled(False)

    # set random seed
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    # arrange data
    data = get_dataset(args.name,
                       args.save_dir,
                       verbose=True,
                       device='cpu',
                       test_split=0.1)
    train_u = torch.tensor(data['u'],
                           requires_grad=True,
                           device=device,
                           dtype=dtype)
    test_u = torch.tensor(data['test_u'],
                          requires_grad=True,
                          device=device,
                          dtype=dtype)

    train_dudt = torch.tensor(data['dudt'], device=device, dtype=dtype)
    test_dudt = torch.tensor(data['test_dudt'], device=device, dtype=dtype)

    t_eval = data['t_eval']
    dt = data['dt']
    M = test_u.shape[-1]

    train_shape_origin = train_u.shape
    test_shape_origin = test_u.shape
    u1 = train_u[:, :-1].contiguous().view(-1, 1, train_u.shape[-1])
    u2 = train_u[:, 1:].contiguous().view(-1, 1, train_u.shape[-1])
    dudt = ((u2 - u1) / dt).detach()

    train_u = train_u.view(-1, 1, train_u.shape[-1])
    test_u = test_u.view(-1, 1, test_u.shape[-1])
    train_dudt = train_dudt.view(-1, 1, train_dudt.shape[-1])
    test_dudt = test_dudt.view(-1, 1, test_dudt.shape[-1])

    # init model and optimizer
    alpha = 2 if args.name.startswith('ch') else 1
    model = dgnet.DGNetPDE1d(args.input_dim,
                             args.hidden_dim,
                             nonlinearity=args.nonlinearity,
                             model=args.model,
                             solver=args.solver,
                             name=args.name,
                             dx=data['dx'],
                             alpha=alpha)
    print(model)
    model = model.to(device)
    stats = {'train_loss': [], 'test_loss': []}

    import glob
    files = glob.glob('{}.tar'.format(args.result_path))
    if len(files) > 0:
        f = files[0]
        path_tar = f
        model.load_state_dict(torch.load(path_tar, map_location=device))
        path_pkl = f.replace('.tar', '.pkl')
        stats = from_pickle(path_pkl)
        args.total_steps = 0
        print('Model successfully loaded from {}'.format(path_tar))

    if args.load:
        path_tar = '{}.tar'.format(args.result_path).replace('_long', '')
        model.load_state_dict(torch.load(path_tar, map_location=device))
        args.total_steps = 0
        print('Model successfully loaded from {}'.format(path_tar))

    optim = torch.optim.Adam(model.parameters(),
                             args.learn_rate,
                             weight_decay=0)

    # vanilla train loop
    for step in range(args.total_steps):

        # train step
        idx = torch.randperm(u1.shape[0])[:args.batch_size]
        with torch.enable_grad():
            if n_available_GPUs > 1:
                dudt_hat = torch.nn.parallel.data_parallel(
                    model,
                    u1[idx],
                    module_kwargs={
                        'dt': dt,
                        'x2': u2[idx],
                        'func': 'discrete_time_derivative'
                    })
            else:
                dudt_hat = model.discrete_time_derivative(u1[idx],
                                                          dt=dt,
                                                          x2=u2[idx])
            loss = L2_loss(dudt[idx], dudt_hat)
        optim.zero_grad()
        loss.backward()
        optim.step()

        # run test data
        test_idx = torch.randperm(test_u.shape[0])[:args.batch_size]
        test_dudt_hat = model.time_derivative(test_u[test_idx])
        test_loss = L2_loss(test_dudt[test_idx], test_dudt_hat)
        # logging
        stats['train_loss'].append(loss.item())
        stats['test_loss'].append(test_loss.item())
        if args.verbose and step % args.print_every == 0:
            print("step {}, train_loss {:.4e}, test_loss {:.4e}".format(
                step, loss.item(), test_loss.item()))
    if len(train_u) > 0:
        train_dudt_hat = torch.cat([
            model.time_derivative(train_u[idx:idx + args.batch_size])
            for idx in range(0, len(train_u), args.batch_size)
        ],
                                   dim=0)
        train_dist = (train_dudt - train_dudt_hat)**2
        test_dudt_hat = torch.cat([
            model.time_derivative(test_u[idx:idx + args.batch_size])
            for idx in range(0, len(test_u), args.batch_size)
        ],
                                  dim=0)
        test_dist = (test_dudt - test_dudt_hat)**2

        print('Final train loss {:.4e}\nFinal test loss {:.4e}'.format(
            train_dist.mean().item(),
            test_dist.mean().item()))
        stats['final_train_loss'] = train_dist.mean().item()
        stats['final_test_loss'] = test_dist.mean().item()
    else:
        stats['final_train_loss'] = 0.0
        stats['final_test_loss'] = 0.0

    # sequence generator
    os.makedirs('{}/results/'.format(args.save_dir), exist_ok=True)
    print('Generating test sequences')
    train_u = train_u.view(*train_shape_origin)
    test_u = test_u.view(*test_shape_origin)
    import matplotlib as mpl
    mpl.use('Agg')
    import matplotlib.pyplot as plt
    test_u_truth = []
    test_u_model = []
    for idx in range(len(test_u)):
        print('Generating a sequence {}/{}'.format(idx, len(test_u)), end='\r')
        u_truth = test_u[idx].squeeze(1).detach().cpu().numpy()
        u_model = model.get_orbit(
            x0=test_u[idx, :1],
            t_eval=t_eval).squeeze(2).squeeze(1).detach().cpu().numpy()

        test_u_truth.append(u_truth)
        test_u_model.append(u_model)
        energy_truth = data['model'].get_energy(u_truth)
        energy_model = data['model'].get_energy(u_model)

        if args.model != 'node':
            energy_model_truth = model(
                torch.from_numpy(u_truth).unsqueeze(-2).to(device)).squeeze(
                    2).squeeze(1).detach().cpu().numpy() * data['dx']
            energy_model_model = model(
                torch.from_numpy(u_model).unsqueeze(-2).to(device)).squeeze(
                    2).squeeze(1).detach().cpu().numpy() * data['dx']

        mass_truth = u_truth.sum(-1)
        mass_model = u_model.sum(-1)

        if args.name.startswith('ch'):
            vmax = 1
            vmin = -1
        else:
            vmax = max(np.abs(u_truth).max(), np.abs(u_model).max())
            vmin = -vmax

        fig, (ax1, ax2, ax3, ax4) = plt.subplots(4,
                                                 1,
                                                 sharex=True,
                                                 figsize=(9., 15.),
                                                 facecolor='white')
        ax1.imshow(u_truth.T,
                   interpolation='nearest',
                   vmin=vmin,
                   vmax=vmax,
                   cmap='seismic')
        ax1.set_aspect('auto')
        ax1.set_yticks((-0.5, M - 0.5))
        ax1.set_yticklabels((0, 1))

        ax2.imshow(u_model.T,
                   interpolation='nearest',
                   vmin=vmin,
                   vmax=vmax,
                   cmap='seismic')
        ax2.set_aspect('auto')
        ax2.set_yticks((-0.5, M - 0.5))
        ax2.set_yticklabels((0, 1))

        ax3.plot([], [], color='white', label='energy')
        if args.model != 'node':
            ax3.plot(energy_model_truth - energy_model_truth[0],
                     dashes=[2, 2],
                     color='C0')
            ax3.plot(energy_model_model - energy_model_model[0],
                     dashes=[2, 2],
                     color='C1')
        ax3.plot(energy_truth - energy_truth[0],
                 color='C0',
                 label='ground truth')
        ax3.plot(energy_model - energy_model[0], color='C1', label=args.model)
        ax3.legend()

        ax4.plot([], [], color='white', label='mass')
        ax4.plot(mass_truth, color='C0')
        ax4.plot(mass_model, color='C1')
        ax4.set_xticks(t_eval[::len(t_eval) // 5] / dt)
        ax4.set_xticklabels(t_eval[::len(t_eval) // 5])
        ax4.set_xlabel('time')

        fig.savefig('{}_plot{:02d}.png'.format(args.result_path, idx))
        plt.close()

    test_u_truth = np.stack(test_u_truth, axis=0)[:, 1:]
    test_u_model = np.stack(test_u_model, axis=0)[:, 1:]
    energy_truth = data['model'].get_energy(test_u_truth)
    energy_model = data['model'].get_energy(test_u_model)

    print('energy MSE model', ((energy_truth - energy_model)**2).mean())
    stats['energy_mse_mean'] = ((energy_truth - energy_model)**2).mean()

    print('state MSE model', ((test_u_truth - test_u_model)**2).mean())
    stats['state_mse_mean'] = ((test_u_truth - test_u_model)**2).mean()

    stats['test_u_truth'] = test_u_truth
    stats['test_u_model'] = test_u_model
    stats['energy_truth'] = energy_truth
    stats['energy_model'] = energy_model

    if args.model != 'node':
        energy_model_truth = model(
            torch.from_numpy(test_u_truth).reshape(
                -1, 1, test_u_truth.shape[-1]).to(device)).detach().cpu(
                ).numpy().reshape(*test_u_truth.shape[:-1])
        energy_model_model = model(
            torch.from_numpy(test_u_model).reshape(
                -1, 1, test_u_model.shape[-1]).to(device)).detach().cpu(
                ).numpy().reshape(*test_u_model.shape[:-1])
        stats['energy_model_truth'] = energy_model_truth
        stats['energy_model_model'] = energy_model_model

    return model, stats