Example #1
    def update(self):
        if self.normrew:
            rffs = np.array(
                [self.rff.update(rew) for rew in self.rollout.buf_rews.T])
            rffs_mean, rffs_std, rffs_count = mpi_moments(rffs.ravel())
            self.rff_rms.update_from_moments(rffs_mean, rffs_std**2,
                                             rffs_count)
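            # RewardForwardFilter (a sketch assumed from the OpenAI
            # large-scale-curiosity code, not shown in this snippet) keeps a
            # discounted running sum of rewards, so rff_rms tracks the scale of
            # returns rather than of single-step rewards:
            # class RewardForwardFilter:
            #     def __init__(self, gamma):
            #         self.rewems = None
            #         self.gamma = gamma
            #     def update(self, rews):
            #         if self.rewems is None:
            #             self.rewems = rews
            #         else:
            #             self.rewems = self.rewems * self.gamma + rews
            #         return self.rewems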
            rews = self.rollout.buf_rews / np.sqrt(self.rff_rms.var)
        else:
            rews = np.copy(self.rollout.buf_rews)
        self.calculate_advantages(rews=rews,
                                  use_news=self.use_news,
                                  gamma=self.gamma,
                                  lam=self.lam)
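        # calculate_advantages is assumed to implement GAE (generalized advantage
        # estimation); the backward recursion is written out explicitly in the
        # two-value-head example further below:
        #   delta_t = r_t + gamma * V(s_{t+1}) * (1 - new_{t+1}) - V(s_t)
        #   A_t     = delta_t + gamma * lam * (1 - new_{t+1}) * A_{t+1}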

        info = dict(
            advmean=self.buf_advs.mean(),
            advstd=self.buf_advs.std(),
            retmean=self.buf_rets.mean(),
            retstd=self.buf_rets.std(),
            vpredmean=self.rollout.buf_vpreds.mean(),
            vpredstd=self.rollout.buf_vpreds.std(),
            ev=explained_variance(self.rollout.buf_vpreds.ravel(),
                                  self.buf_rets.ravel()),
            rew_mean=np.mean(self.rollout.buf_rews),
            recent_best_ext_ret=self.rollout.current_max
            if self.rollout.current_max is not None else 0,
        )
        if self.rollout.best_ext_ret is not None:
            info['best_ext_ret'] = self.rollout.best_ext_ret

        # store images for debugging
        # from PIL import Image
        # if not os.path.exists('logs/images/'):
        #         os.makedirs('logs/images/')
        # for i in range(self.rollout.buf_obs_last.shape[0]):
        #     obs = self.rollout.buf_obs_last[i][0]
        #     Image.fromarray((obs*255.).astype(np.uint8)).save('logs/images/%04d.png'%i)

        # normalize advantages
        if self.normadv:
            m, s = get_mean_and_std(self.buf_advs)
            self.buf_advs = (self.buf_advs - m) / (s + 1e-7)
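            # get_mean_and_std is assumed to aggregate moments across MPI workers
            # (e.g. via mpi_moments), so every rank normalizes with identical
            # statistics; the 1e-7 guards against division by zero when the
            # advantages are near-constant. A minimal sketch under that assumption:
            # def get_mean_and_std(arr):
            #     mean, std, _ = mpi_moments(arr.ravel())
            #     return mean, std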
        envsperbatch = (self.nenvs * self.nsegs_per_env) // self.nminibatches
        envsperbatch = max(1, envsperbatch)
        envinds = np.arange(self.nenvs * self.nsegs_per_env)

        def resh(x):
            if self.nsegs_per_env == 1:
                return x
            sh = x.shape
            return x.reshape((sh[0] * self.nsegs_per_env,
                              self.nsteps_per_seg) + sh[2:])
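        # Illustrative shapes (values assumed): with nenvs=8, nsegs_per_env=2 and
        # nsteps_per_seg=64, a buffer of shape (8, 128, ...) becomes (16, 64, ...),
        # so each segment is treated as its own environment in the minibatch loop.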

        ph_buf = [
            (self.stochpol.ph_ac, resh(self.rollout.buf_acs)),
            (self.ph_rews, resh(self.rollout.buf_rews)),
            (self.ph_oldvpred, resh(self.rollout.buf_vpreds)),
            (self.ph_oldnlp, resh(self.rollout.buf_nlps)),
            (self.stochpol.ph_ob, resh(self.rollout.buf_obs)),
            (self.ph_ret, resh(self.buf_rets)),
            (self.ph_adv, resh(self.buf_advs)),
        ]
        ph_buf.extend([(self.dynamics_list[0].last_ob,
                        self.rollout.buf_obs_last.reshape([
                            self.nenvs * self.nsegs_per_env, 1,
                            *self.ob_space.shape
                        ]))])
        mblossvals = []

        for _ in range(self.nepochs):
            np.random.shuffle(envinds)
            for start in range(0, self.nenvs * self.nsegs_per_env,
                               envsperbatch):
                end = start + envsperbatch
                mbenvinds = envinds[start:end]
                fd = {ph: buf[mbenvinds] for (ph, buf) in ph_buf}
                fd.update({
                    self.ph_lr: self.lr,
                    self.ph_cliprange: self.cliprange
                })
                mblossvals.append(getsess().run(self._losses + (self._train, ),
                                                fd)[:-1])

        mblossvals = [mblossvals[0]]
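        # Only the first minibatch's losses are reported below; np.mean over the
        # resulting one-element list is a no-op kept for interface symmetry.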
        info.update(
            zip(['opt_' + ln for ln in self.loss_names],
                np.mean([mblossvals[0]], axis=0)))
        info["rank"] = MPI.COMM_WORLD.Get_rank()
        self.n_updates += 1
        info["n_updates"] = self.n_updates
        info.update({
            dn: (np.mean(dvs) if len(dvs) > 0 else 0)
            for (dn, dvs) in self.rollout.statlists.items()
        })
        info.update(self.rollout.stats)
        if "states_visited" in info:
            info.pop("states_visited")
        tnow = time.time()
        info["ups"] = 1. / (tnow - self.t_last_update)
        info["total_secs"] = tnow - self.t_start
        info['tps'] = (MPI.COMM_WORLD.Get_size() * self.rollout.nsteps *
                       self.nenvs / (tnow - self.t_last_update))
        self.t_last_update = tnow

        return info
Example #2
features_test = root_numpy.tree2array(tree,
                                      branches=branches,
                                      selection=selection_test)
features_data = root_numpy.tree2array(tree,
                                      branches=branches,
                                      selection=selection_data)

features = {
    "train": features_train,
    "test": features_test,
    "data": features_data,
}

preprocess_dict = {}
if args.z_score:
    for feat in training_features:
        mean, std = utils.get_mean_and_std(features_train[feat])
        preprocess_dict[feat] = {"mean": float(mean), "std_dev": float(std)}

    with open(z_score_json, "w") as f_out:
        json.dump(preprocess_dict, f_out, indent=4, sort_keys=True)
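    # Applying the stored statistics later would look like this (illustrative,
    # names assumed):
    # feat_z = (features_train[feat] - preprocess_dict[feat]["mean"]) \
    #     / preprocess_dict[feat]["std_dev"]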

f_out = h5py.File(output_file, "w")
f_out.create_dataset("feature_names", data=training_features)

# "split" avoids shadowing the built-in set()
for split in features.keys():
    global_features, label = prep_utils.create_features_and_label(
        features[split], training_features, signal, bkg, preprocess_dict,
        args.z_score)

    f_out.create_dataset("global_%s" % split, data=global_features)
    f_out.create_dataset("label_%s" % split, data=label)
Example #3
    def update(self):
        # Rewards normalization
        # if self.normrew:
        #     rffs = np.array([self.rff.update(rew) for rew in self.rollout.buf_rews.T])
        #     rffs_mean, rffs_std, rffs_count = mpi_moments(rffs.ravel())
        #     self.rff_rms.update_from_moments(rffs_mean, rffs_std ** 2, rffs_count)
        #     rews = self.rollout.buf_rews / np.sqrt(self.rff_rms.var)
        
        # Intrinsic Rewards Normalization
        if self.normrew:
            rffs_int = np.array([self.rff.update(rew) for rew in self.rollout.buf_int_rews.T])
            self.rff_rms.update(rffs_int.ravel())        
            int_rews = self.rollout.buf_int_rews / np.sqrt(self.rff_rms.var)
        else:
            int_rews = np.copy(self.rollout.buf_int_rews)
        
        mean_int_rew = np.mean(int_rews)
        max_int_rew = np.max(int_rews)
        
        # Do not normalize extrinsic rewards 
        ext_rews = self.rollout.buf_ext_rews

        nsteps = self.rollout.nsteps

        # If separate value functions are used
        if self.hps['num_vf'] == 2:
            # Calculate intrinsic returns and advantages.
            lastgaelam = 0
            for t in range(nsteps - 1, -1, -1):  # t = nsteps-1, ..., 0
                if self.use_news:
                    nextnew = self.rollout.buf_news[:, t + 1] if t + 1 < nsteps else self.rollout.buf_new_last
                else:
                    nextnew = 0 # No dones for intrinsic rewards with self.use_news=False
                nextvals = self.rollout.buf_vpreds_int[:, t + 1] if t + 1 < nsteps else self.rollout.buf_vpred_int_last
                nextnotnew = 1 - nextnew
                delta = int_rews[:, t] + self.gamma * nextvals * nextnotnew - self.rollout.buf_vpreds_int[:, t]
                self.buf_advs_int[:, t] = lastgaelam = delta + self.gamma * self.lam * nextnotnew * lastgaelam
            self.buf_rets_int[:] = self.buf_advs_int + self.rollout.buf_vpreds_int

            # Calculate extrinsic returns and advantages.
            lastgaelam = 0

            for t in range(nsteps - 1, -1, -1):  # t = nsteps-1, ..., 0
                nextnew = self.rollout.buf_news[:, t + 1] if t + 1 < nsteps else self.rollout.buf_new_last
                nextvals = self.rollout.buf_vpreds_ext[:, t + 1] if t + 1 < nsteps else self.rollout.buf_vpred_ext_last
                nextnotnew = 1 - nextnew
                delta = ext_rews[:, t] + self.gamma_ext * nextvals * nextnotnew - self.rollout.buf_vpreds_ext[:, t]
                self.buf_advs_ext[:, t] = lastgaelam = delta + self.gamma_ext * self.lam * nextnotnew * lastgaelam
            self.buf_rets_ext[:] = self.buf_advs_ext + self.rollout.buf_vpreds_ext
            
            # Combine the extrinsic and intrinsic advantages.
            self.buf_advs = self.int_coeff*self.buf_advs_int + self.ext_coeff*self.buf_advs_ext
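            # As in Random Network Distillation (Burda et al., 2018), the intrinsic
            # and extrinsic streams keep separate value heads and discount factors
            # and are only mixed here, at the advantage level, via int_coeff/ext_coeff.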
        else:
            # Calculate mixed intrinsic and extrinsic returns and advantages.
            rews = self.rollout.buf_rews = self.rollout.reward_fun(int_rew=int_rews, ext_rew=ext_rews)
            lastgaelam = 0
            for t in range(nsteps - 1, -1, -1):  # t = nsteps-1, ..., 0
                nextnew = self.rollout.buf_news[:, t + 1] if t + 1 < nsteps else self.rollout.buf_new_last
                nextvals = self.rollout.buf_vpreds[:, t + 1] if t + 1 < nsteps else self.rollout.buf_vpred_last
                nextnotnew = 1 - nextnew
                delta = rews[:, t] + self.gamma * nextvals * nextnotnew - self.rollout.buf_vpreds[:, t]
                self.buf_advs[:, t] = lastgaelam = delta + self.gamma * self.lam * nextnotnew * lastgaelam
            self.buf_rets[:] = self.buf_advs + self.rollout.buf_vpreds
        
        info = dict(
            # advmean=self.buf_advs.mean(),
            # advstd=self.buf_advs.std(),
            recent_best_ext_ret=self.rollout.current_max,
            recent_best_eplen=self.rollout.current_minlen,
            recent_worst_eplen=self.rollout.current_maxlen,
        )

        if self.hps['num_vf'] == 2:
            info['retmean_int'] = self.buf_rets_int.mean()
            info['retmean_ext'] = self.buf_rets_ext.mean()
            info['retstd_int'] = self.buf_rets_int.std()
            info['retstd_ext'] = self.buf_rets_ext.std()
            info['vpredmean_int'] = self.rollout.buf_vpreds_int.mean()
            info['vpredmean_ext'] = self.rollout.buf_vpreds_ext.mean()
            info['vpredstd_int'] = self.rollout.buf_vpreds_int.std()
            info['vpredstd_ext'] = self.rollout.buf_vpreds_ext.std()
            info['ev_int'] = explained_variance(self.rollout.buf_vpreds_int.ravel(), self.buf_rets_int.ravel())
            info['ev_ext'] = explained_variance(self.rollout.buf_vpreds_ext.ravel(), self.buf_rets_ext.ravel())
            info['rew_int_mean'] = mean_int_rew
            info['recent_best_int_rew'] = max_int_rew
        else:
            # info['retmean'] = self.buf_rets.mean()
            # info['retstd'] = self.buf_rets.std()
            # info['vpredmean'] = self.rollout.buf_vpreds.mean()
            # info['vpredstd'] = self.rollout.buf_vpreds.std()
            info['rew_mean'] = np.mean(self.rollout.buf_rews)
            info['eplen_std'] = np.std(self.rollout.statlists['eplen'])
            info['eprew_std'] = np.std(self.rollout.statlists['eprew'])
            # info['ev'] = explained_variance(self.rollout.buf_vpreds.ravel(), self.buf_rets.ravel())

        if self.rollout.best_ext_ret is not None:
            info['best_ext_ret'] = self.rollout.best_ext_ret
            info['best_eplen'] = self.rollout.best_eplen

        # normalize advantages
        if self.normadv:
            m, s = get_mean_and_std(self.buf_advs)
            self.buf_advs = (self.buf_advs - m) / (s + 1e-7)
        envsperbatch = (self.nenvs * self.nsegs_per_env) // self.nminibatches
        envsperbatch = max(1, envsperbatch)
        envinds = np.arange(self.nenvs * self.nsegs_per_env)

        def resh(x):
            if self.nsegs_per_env == 1:
                return x
            sh = x.shape
            return x.reshape((sh[0] * self.nsegs_per_env, self.nsteps_per_seg) + sh[2:])
        
        # Create the feed_dict for optimization.
        ph_buf = [
            (self.stochpol.ph_ac, resh(self.rollout.buf_acs)),
            (self.ph_oldnlp, resh(self.rollout.buf_nlps)),
            (self.stochpol.ph_ob, resh(self.rollout.buf_obs)),
            (self.ph_adv, resh(self.buf_advs)),
        ]

        if self.hps['num_vf'] == 2:
            ph_buf.extend([
                (self.ph_ret_int, resh(self.buf_rets_int)),
                (self.ph_ret_ext, resh(self.buf_rets_ext)),
            ])
        else:
            ph_buf.extend([
                (self.ph_rews, resh(self.rollout.buf_rews)),
                (self.ph_oldvpred, resh(self.rollout.buf_vpreds)),
                (self.ph_ret, resh(self.buf_rets)),
            ])

        ph_buf.extend([
            (self.dynamics.last_ob,
             self.rollout.buf_obs_last.reshape([self.nenvs * self.nsegs_per_env, 1, *self.ob_space.shape]))
        ])

        # Optimize on the current data for several epochs.
        mblossvals = []

        for _ in range(self.nepochs):
            np.random.shuffle(envinds)
            for start in range(0, self.nenvs * self.nsegs_per_env, envsperbatch):
                end = start + envsperbatch
                mbenvinds = envinds[start:end]
                fd = {ph: buf[mbenvinds] for (ph, buf) in ph_buf}                
                fd.update({self.ph_lr: self.lr, self.ph_cliprange: self.cliprange})
                mblossvals.append(getsess().run(self._losses + (self._train,), fd)[:-1])

        mblossvals = [mblossvals[0]]
        # info.update(zip(['opt_' + ln for ln in self.loss_names], np.mean([mblossvals[0]], axis=0)))
        # info["rank"] = MPI.COMM_WORLD.Get_rank()
        self.n_updates += 1
        info["n_updates"] = self.n_updates
        info.update({dn: (np.mean(dvs) if len(dvs) > 0 else 0) for (dn, dvs) in self.rollout.statlists.items()})
        info.update(self.rollout.stats)
        if "states_visited" in info:
            info.pop("states_visited")
        tnow = time.time()
        # info["ups"] = 1. / (tnow - self.t_last_update)
        info["total_secs"] = tnow - self.t_start
        # info['tps'] = MPI.COMM_WORLD.Get_size() * self.rollout.nsteps * self.nenvs / (tnow - self.t_last_update)
        self.t_last_update = tnow

        return info
Example #4
    def update(self):
        if self.normrew:
            rffs = np.array(
                [self.rff.update(rew) for rew in self.rollout.buf_rews.T])
            rffs_mean, rffs_std, rffs_count = mpi_moments(rffs.ravel())
            self.rff_rms.update_from_moments(rffs_mean, rffs_std**2,
                                             rffs_count)
            rews = self.rollout.buf_rews / np.sqrt(self.rff_rms.var)
            if self.dynamics.dropout:
                rffs2 = np.array([
                    self.rff2.update(rew)
                    for rew in self.rollout.buf_rews_mean.T
                ])
                rffs2_mean, rffs2_std, rffs2_count = mpi_moments(rffs2.ravel())
                self.rff_rms2.update_from_moments(rffs2_mean, rffs2_std**2,
                                                  rffs2_count)
                rews_m = self.rollout.buf_rews_mean / np.sqrt(
                    self.rff_rms2.var)
                rews = rews_m + rews

        else:
            rews = np.copy(self.rollout.buf_rews)
        self.calculate_advantages(rews=rews,
                                  use_news=self.use_news,
                                  gamma=self.gamma,
                                  lam=self.lam)

        info = dict(advmean=self.buf_advs.mean(),
                    advstd=self.buf_advs.std(),
                    retmean=self.buf_rets.mean(),
                    retstd=self.buf_rets.std(),
                    vpredmean=self.rollout.buf_vpreds.mean(),
                    vpredstd=self.rollout.buf_vpreds.std(),
                    ev=explained_variance(self.rollout.buf_vpreds.ravel(),
                                          self.buf_rets.ravel()),
                    rew_mean=np.mean(self.rollout.buf_rews),
                    recent_best_ext_ret=self.rollout.current_max)
        if self.rollout.best_ext_ret is not None:
            info['best_ext_ret'] = self.rollout.best_ext_ret

        # if self.flipout:
        #     info['dyn_mean'] = np.mean(self.rollout.buf_dyn_rew)
        # normalize advantages
        if self.normadv:
            m, s = get_mean_and_std(self.buf_advs)
            self.buf_advs = (self.buf_advs - m) / (s + 1e-7)
        envsperbatch = (self.nenvs * self.nsegs_per_env) // self.nminibatches
        envsperbatch = max(1, envsperbatch)
        envinds = np.arange(self.nenvs * self.nsegs_per_env)

        def resh(x):
            if self.nsegs_per_env == 1:
                return x
            sh = x.shape
            return x.reshape((sh[0] * self.nsegs_per_env,
                              self.nsteps_per_seg) + sh[2:])

        ph_buf = [
            (self.policy.placeholder_action, resh(self.rollout.buf_acs)),
            (self.placeholder_rews, resh(self.rollout.buf_rews)),
            (self.placeholder_oldvpred, resh(self.rollout.buf_vpreds)),
            (self.placeholder_oldnlp, resh(self.rollout.buf_nlps)),
            (self.policy.placeholder_observation, resh(self.rollout.buf_obs)),
            (self.placeholder_ret, resh(self.buf_rets)),
            (self.placeholder_advantage, resh(self.buf_advs)),
        ]
        ph_buf.extend([(self.dynamics.last_ob,
                        self.rollout.buf_obs_last.reshape([
                            self.nenvs * self.nsegs_per_env, 1,
                            *self.ob_space.shape
                        ]))])
        # if self.flipout:
        #     ph_buf.extend([(self.placeholder_dyn_mean, resh(self.buf_n_dyn_rew))])

        if self.bootstrapped:
            ph_buf.extend([
                (self.dynamics.mask_placeholder,
                 self.rollout.buf_mask.reshape(-1, self.dynamics.n_heads, 1))
            ])
        mblossvals = []

        for _ in range(self.nepochs):
            np.random.shuffle(envinds)
            for start in range(0, self.nenvs * self.nsegs_per_env,
                               envsperbatch):
                end = start + envsperbatch
                mbenvinds = envinds[start:end]
                fd = {ph: buf[mbenvinds] for (ph, buf) in ph_buf}
                fd.update({
                    self.placeholder_lr: self.lr,
                    self.placeholder_cliprange: self.cliprange
                })
                if self.dynamics.dropout:
                    fd.update({self.dynamics.is_training: True})
                mblossvals.append(tf.get_default_session().run(
                    self._losses + (self._train, ), fd)[:-1])

        mblossvals = [mblossvals[0]]
        info.update(
            zip(['opt_' + ln for ln in self.loss_names],
                np.mean([mblossvals[0]], axis=0)))
        info["rank"] = MPI.COMM_WORLD.Get_rank()
        self.n_updates += 1
        info["n_updates"] = self.n_updates
        info.update({
            dn: (np.mean(dvs) if len(dvs) > 0 else 0)
            for (dn, dvs) in self.rollout.statlists.items()
        })
        info.update(self.rollout.stats)
        if "states_visited" in info:
            info.pop("states_visited")
        tnow = time.time()
        info["ups"] = 1. / (tnow - self.t_last_update)
        info["total_secs"] = tnow - self.t_start
        info['tps'] = (MPI.COMM_WORLD.Get_size() * self.rollout.nsteps *
                       self.nenvs / (tnow - self.t_last_update))
        self.t_last_update = tnow

        return info
Example #5
    def update(self):
        if self.normrew:         # normalize rewards, using statistics gathered via MPI from the other workers
            rffs = np.array([self.rff.update(rew) for rew in self.rollout.buf_rews.T])
            rffs_mean, rffs_std, rffs_count = mpi_moments(rffs.ravel())
            self.rff_rms.update_from_moments(rffs_mean, rffs_std ** 2, rffs_count)
            rews = self.rollout.buf_rews / np.sqrt(self.rff_rms.var)
        else:
            rews = np.copy(self.rollout.buf_rews)

        # Call this class's helper to compute the advantage function from the reward sequence rews
        self.calculate_advantages(rews=rews, use_news=self.use_news, gamma=self.gamma, lam=self.lam)

        # Record some statistics for logging
        info = dict(
            advmean=self.buf_advs.mean(),
            advstd=self.buf_advs.std(),
            retmean=self.buf_rets.mean(),
            retstd=self.buf_rets.std(),
            vpredmean=self.rollout.buf_vpreds.mean(),
            vpredstd=self.rollout.buf_vpreds.std(),
            ev=explained_variance(self.rollout.buf_vpreds.ravel(), self.buf_rets.ravel()),
            rew_mean=np.mean(self.rollout.buf_rews),
            rew_mean_norm=np.mean(rews),
            recent_best_ext_ret=self.rollout.current_max
        )
        if self.rollout.best_ext_ret is not None:
            info['best_ext_ret'] = self.rollout.best_ext_ret

        # normalize advantages: scale the computed advantages by their mean and std.
        if self.normadv:
            m, s = get_mean_and_std(self.buf_advs)
            self.buf_advs = (self.buf_advs - m) / (s + 1e-7)
        envsperbatch = (self.nenvs * self.nsegs_per_env) // self.nminibatches
        envsperbatch = max(1, envsperbatch)
        envinds = np.arange(self.nenvs * self.nsegs_per_env)

        def resh(x):
            if self.nsegs_per_env == 1:
                return x
            sh = x.shape
            return x.reshape((sh[0] * self.nsegs_per_env, self.nsteps_per_seg) + sh[2:])

        # Map the placeholders defined in this class to the numpy sample buffers collected by the rollout, to be used as the feed_dict
        ph_buf = [
            (self.stochpol.ph_ac, resh(self.rollout.buf_acs)),
            (self.ph_rews, resh(self.rollout.buf_rews)),
            (self.ph_oldvpred, resh(self.rollout.buf_vpreds)),
            (self.ph_oldnlp, resh(self.rollout.buf_nlps)),
            (self.stochpol.ph_ob, resh(self.rollout.buf_obs)),   # the numpy buffers recorded while the rollout interacted with the environment
            (self.ph_ret, resh(self.buf_rets)),                  # returns computed from the rollout records
            (self.ph_adv, resh(self.buf_advs)),                  # advantages computed from the rollout records
        ]
        ph_buf.extend([
            (self.dynamics.last_ob,
             self.rollout.buf_obs_last.reshape([self.nenvs * self.nsegs_per_env, 1, *self.ob_space.shape]))
        ])
        mblossvals = []          # record losses during training

        # Train the agent's losses
        for _ in range(self.nepochs):
            np.random.shuffle(envinds)
            for start in range(0, self.nenvs * self.nsegs_per_env, envsperbatch):
                end = start + envsperbatch
                mbenvinds = envinds[start:end]
                fd = {ph: buf[mbenvinds] for (ph, buf) in ph_buf}     # build the feed_dict
                fd.update({self.ph_lr: self.lr, self.ph_cliprange: self.cliprange})
                mblossvals.append(getsess().run(self._losses + (self._train,), fd)[:-1])    # compute the losses and apply the update

        # add bai. Additionally train the DVAE on its own.
        for tmp in range(self.nepochs_dvae):
            print("Extra DVAE training pass ", tmp)
            np.random.shuffle(envinds)
            for start in range(0, self.nenvs * self.nsegs_per_env, envsperbatch):     # loops 8 times
                end = start + envsperbatch
                mbenvinds = envinds[start:end]
                fd = {ph: buf[mbenvinds] for (ph, buf) in ph_buf}                       # build the feed_dict
                fd.update({self.ph_lr: self.lr, self.ph_cliprange: self.cliprange})
                d_loss, _ = getsess().run([self.dynamics_loss, self._train_dvae], fd)   # compute the DVAE loss and apply the update
                print(d_loss, end=", ")
            print("\n")

        mblossvals = [mblossvals[0]]
        info.update(zip(['opt_' + ln for ln in self.loss_names], np.mean([mblossvals[0]], axis=0)))
        info["rank"] = MPI.COMM_WORLD.Get_rank()
        self.n_updates += 1
        info["n_updates"] = self.n_updates
        info.update({dn: (np.mean(dvs) if len(dvs) > 0 else 0) for (dn, dvs) in self.rollout.statlists.items()})
        info.update(self.rollout.stats)
        if "states_visited" in info:
            info.pop("states_visited")
        tnow = time.time()
        info["ups"] = 1. / (tnow - self.t_last_update)
        info["total_secs"] = tnow - self.t_start
        info['tps'] = MPI.COMM_WORLD.Get_size() * self.rollout.nsteps * self.nenvs / (tnow - self.t_last_update)
        self.t_last_update = tnow

        return info
Example #6
    def update(self):
        if self.normrew:
            rffs = np.array(
                [self.rff.update(rew) for rew in self.rollout.buf_rews.T])
            rffs_mean, rffs_std, rffs_count = mpi_moments(rffs.ravel())
            self.rff_rms.update_from_moments(rffs_mean, rffs_std**2,
                                             rffs_count)
            rews = self.rollout.buf_rews / np.sqrt(self.rff_rms.var)
        else:
            rews = np.copy(self.rollout.buf_rews)
        self.calculate_advantages(rews=rews,
                                  use_news=self.use_news,
                                  gamma=self.gamma,
                                  lam=self.lam)

        info = dict(advmean=self.buf_advs.mean(),
                    advstd=self.buf_advs.std(),
                    retmean=self.buf_rets.mean(),
                    retstd=self.buf_rets.std(),
                    vpredmean=self.rollout.buf_vpreds.mean(),
                    vpredstd=self.rollout.buf_vpreds.std(),
                    ev=explained_variance(self.rollout.buf_vpreds.ravel(),
                                          self.buf_rets.ravel()),
                    rew_mean=np.mean(self.rollout.buf_rews),
                    recent_best_ext_ret=self.rollout.current_max)
        if self.rollout.best_ext_ret is not None:
            info['best_ext_ret'] = self.rollout.best_ext_ret

        # normalize advantages
        if self.normadv:
            m, s = get_mean_and_std(self.buf_advs)
            self.buf_advs = (self.buf_advs - m) / (s + 1e-7)
        envsperbatch = (self.nenvs * self.nsegs_per_env) // self.nminibatches
        envsperbatch = max(1, envsperbatch)
        envinds = np.arange(self.nenvs * self.nsegs_per_env)

        def resh(x):
            if self.nsegs_per_env == 1:
                return x
            sh = x.shape
            return x.reshape((sh[0] * self.nsegs_per_env,
                              self.nsteps_per_seg) + sh[2:])

        ph_buf = [
            (self.trainpol.ph_ac, resh(self.rollout.buf_acs)),
            (self.ph_rews, resh(self.rollout.buf_rews)),
            (self.ph_oldvpred, resh(self.rollout.buf_vpreds)),
            (self.ph_oldnlp, resh(self.rollout.buf_nlps)),
            (self.trainpol.ph_ob, resh(self.rollout.buf_obs)),
            (self.ph_ret, resh(self.buf_rets)),
            (self.ph_adv, resh(self.buf_advs)),
        ]
        ph_buf.extend([(self.train_dynamics.last_ob,
                        self.rollout.buf_obs_last.reshape([
                            self.nenvs * self.nsegs_per_env, 1,
                            *self.ob_space.shape
                        ]))])
        ph_buf.extend([
            (self.trainpol.states_ph,
             resh(self.rollout.buf_states_first)),  # rnn inputs
            (self.trainpol.masks_ph, resh(self.rollout.buf_news))
        ])
        if 'err' in self.policy_mode:
            ph_buf.extend([(self.trainpol.pred_error,
                            resh(self.rollout.buf_errs))])  # New
        if 'ac' in self.policy_mode:
            ph_buf.extend([(self.trainpol.ph_ac, resh(self.rollout.buf_acs)),
                           (self.trainpol.ph_ac_first,
                            resh(self.rollout.buf_acs_first))])
        if 'pred' in self.policy_mode:
            ph_buf.extend([(self.trainpol.obs_pred,
                            resh(self.rollout.buf_obpreds))])

        # with open(os.getcwd() + "/record_instruction.txt", 'r') as rec_inst:
        #     rec_n = []
        #     rec_all_n = []
        #     while True:
        #         line = rec_inst.readline()
        #         if not line: break
        #         args = line.split()
        #         rec_n.append(int(args[0]))
        #         if len(args) > 1:
        #             rec_all_n.append(int(args[0]))
        #     if self.n_updates in rec_n and MPI.COMM_WORLD.Get_rank() == 0:
        #         print("Enter!")
        #         with open(self.logdir + '/full_log' + str(self.n_updates) + '.pk', 'wb') as full_log:
        #             import pickle
        #             debug_data = {"buf_obs" : self.rollout.buf_obs,
        #                           "buf_obs_last" : self.rollout.buf_obs_last,
        #                           "buf_acs" : self.rollout.buf_acs,
        #                           "buf_acs_first" : self.rollout.buf_acs_first,
        #                           "buf_news" : self.rollout.buf_news,
        #                           "buf_news_last" : self.rollout.buf_new_last,
        #                           "buf_rews" : self.rollout.buf_rews,
        #                           "buf_ext_rews" : self.rollout.buf_ext_rews}
        #             if self.n_updates in rec_all_n:
        #                 debug_data.update({"buf_err": self.rollout.buf_errs,
        #                                     "buf_err_last": self.rollout.buf_errs_last,
        #                                     "buf_obpreds": self.rollout.buf_obpreds,
        #                                     "buf_obpreds_last": self.rollout.buf_obpreds_last,
        #                                     "buf_vpreds": self.rollout.buf_vpreds,
        #                                     "buf_vpred_last": self.rollout.buf_vpred_last,
        #                                     "buf_states": self.rollout.buf_states,
        #                                     "buf_states_first": self.rollout.buf_states_first,
        #                                     "buf_nlps": self.rollout.buf_nlps,})
        #             pickle.dump(debug_data, full_log)

        mblossvals = []

        for _ in range(self.nepochs):
            np.random.shuffle(envinds)
            for start in range(0, self.nenvs * self.nsegs_per_env,
                               envsperbatch):
                end = start + envsperbatch
                mbenvinds = envinds[start:end]
                fd = {ph: buf[mbenvinds] for (ph, buf) in ph_buf}
                fd.update({
                    self.ph_lr: self.lr,
                    self.ph_cliprange: self.cliprange
                })
                mblossvals.append(getsess().run(self._losses + (self._train, ),
                                                fd)[:-1])

        mblossvals = [mblossvals[0]]
        info.update(
            zip(['opt_' + ln for ln in self.loss_names],
                np.mean([mblossvals[0]], axis=0)))
        info["rank"] = MPI.COMM_WORLD.Get_rank()
        self.n_updates += 1
        info["n_updates"] = self.n_updates
        info.update({
            dn: (np.mean(dvs) if len(dvs) > 0 else 0)
            for (dn, dvs) in self.rollout.statlists.items()
        })
        info.update(self.rollout.stats)
        if "states_visited" in info:
            info.pop("states_visited")
        tnow = time.time()
        info["ups"] = 1. / (tnow - self.t_last_update)
        info["total_secs"] = tnow - self.t_start
        info['tps'] = (MPI.COMM_WORLD.Get_size() * self.rollout.nsteps *
                       self.nenvs / (tnow - self.t_last_update))
        self.t_last_update = tnow

        # New
        if 'err' in self.policy_mode:
            info["error"] = np.sqrt(np.power(self.rollout.buf_errs, 2).mean())

        if (self.n_updates % self.tboard_period == 0
                and MPI.COMM_WORLD.Get_rank() == 0):
            if self.full_tensorboard_log:
                summary = getsess().run(self.merged_summary_op, fd)  # New
                self.summary_writer.add_summary(
                    summary, self.rollout.stats["tcount"])  # New
            for k, v in info.items():
                summary = tf.Summary(value=[
                    tf.Summary.Value(tag=k, simple_value=v),
                ])
                self.summary_writer.add_summary(summary,
                                                self.rollout.stats["tcount"])

        return info
Example #7
def main():
    global best_score
    start_epoch = args.start_epoch

    #Data  Loader
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')

    data_train = datasets.ImageFolder(
        traindir, transforms.Compose([transforms.ToTensor()]))
    mean_tr, std_tr = get_mean_and_std(data_train)
    data_test = datasets.ImageFolder(
        valdir, transforms.Compose([transforms.ToTensor()]))
    mean_te, std_te = get_mean_and_std(data_test)
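    # get_mean_and_std is assumed here to make one pass over the dataset and
    # accumulate per-channel statistics, roughly like the common pytorch-cifar
    # utility:
    # def get_mean_and_std(dataset):
    #     loader = torch.utils.data.DataLoader(dataset, batch_size=1, num_workers=2)
    #     mean, std = torch.zeros(3), torch.zeros(3)
    #     for inputs, _ in loader:
    #         for i in range(3):
    #             mean[i] += inputs[:, i, :, :].mean()
    #             std[i] += inputs[:, i, :, :].std()
    #     return mean / len(dataset), std / len(dataset)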

    #Note that for imgaug, we should convert the PIL images to NumPy arrays before applying the transforms.

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(mean=mean_tr, std=std_tr)
        ]))
    test_dataset = datasets.ImageFolder(
        valdir,
        transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(mean=mean_te, std=std_te)
        ]))

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        sampler=ImbalancedDatasetSampler(train_dataset),
        batch_size=args.train_batch,
        shuffle=False,
        num_workers=args.workers,
        pin_memory=True)
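    # Note: DataLoader forbids combining a custom sampler with shuffle=True,
    # hence shuffle=False whenever ImbalancedDatasetSampler is supplied.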
    val_loader = torch.utils.data.DataLoader(
        test_dataset,  # sampler=ImbalancedDatasetSampler(test_dataset)
        batch_size=args.test_batch,
        shuffle=False,
        num_workers=args.workers,
        pin_memory=True)

    #    test_loader = torch.utils.data.DataLoader(test_dataset #, sampler=ImbalancedDatasetSampler(test_dataset)
    #        ,batch_size=320, shuffle=False,	num_workers=args.workers, pin_memory=True)

    #    for inputs, targets in train_loader:

    #Create Model
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = resnet34()
    model = model.to(device)
    summary(model, (3, 224, 224))
    #    for child in model.named_children():
    #        print(child)
    #    model.fc.weight
    #    (list(model.layer4.children()))[0].conv1.weights

    #Get the number of model parameters
    print('Number of model parameters: {}'.format(
        sum([p.data.nelement() for p in model.parameters()])))

    model = torch.nn.DataParallel(model).cuda()
    for name, param in model.named_parameters():
        if param.requires_grad:
            print(name)
    # cudnn.benchmark = True

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = optim.SGD(model.parameters(),
                          lr=args.lr,
                          momentum=args.momentum,
                          nesterov=args.nesterov,
                          weight_decay=args.weight_decay)

    title = 'AF'
    if args.resume:
        # Load checkpoint.
        print('==> Resuming from checkpoint..')
        assert os.path.isfile(
            args.resume), 'Error: no checkpoint directory found!'
        args.checkpoint = os.path.dirname(args.resume)
        checkpoint = torch.load(args.resume)
        best_score = checkpoint['best_score']
        print(best_score)
        start_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        logger = Logger(os.path.join(args.checkpoint, 'log.txt'),
                        title=title,
                        resume=True)
    else:
        logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title)
        logger.set_names([
            'Learning Rate', 'Train Loss', 'Valid Loss', 'Train Acc 1.',
            'Valid Acc 1.'
        ])

    if args.evaluate:
        print('\nEvaluation only')
        test_loss, test_acc = test(val_loader, model, criterion, start_epoch,
                                   use_cuda)
        print(' Test Loss:  %.8f, Test Acc:  %.2f' % (test_loss, test_acc))
        return

    # Train and val
    for epoch in range(start_epoch, args.epochs):
        adjust_learning_rate(optimizer, epoch)

        #    	print('\nEpoch: [%d | %d] LR: %f' % (epoch + 1, args.epochs, state['lr']))
        #Adjust Orhto decay rate
        odecay = adjust_ortho_decay_rate(epoch + 1)
        sendecay = adjust_sen_decay(epoch + 1)

        train_loss, train_acc = train(train_loader, model, criterion,
                                      optimizer, epoch, use_cuda, odecay,
                                      sendecay)
        test_loss, test_acc = test(val_loader, model, criterion, epoch,
                                   use_cuda)

        # append logger file
        logger.append(
            [state['lr'], train_loss, test_loss, train_acc, test_acc])

        # save model
        is_best = test_acc > best_score
        best_score = max(test_acc, best_score)
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'acc': test_acc,
                'best_score': best_score,
                'optimizer': optimizer.state_dict(),
            },
            is_best,
            checkpoint=args.checkpoint)

    logger.close()
    logger.plot()
    savefig(os.path.join(args.checkpoint, 'log.eps'))
    print('Best Fscore:')
    print(best_score)
Example #8
    def update(self):
        if self.normrew:
            rffs = np.array(
                [self.rff.update(rew) for rew in self.rollout.buf_rews.T])
            rffs_mean, rffs_std, rffs_count = mpi_moments(rffs.ravel())
            self.rff_rms.update_from_moments(rffs_mean, rffs_std**2,
                                             rffs_count)
            rews = self.rollout.buf_rews / np.sqrt(self.rff_rms.var)
        else:
            rews = np.copy(self.rollout.buf_rews)
        self.calculate_advantages(rews=rews,
                                  use_news=self.use_news,
                                  gamma=self.gamma,
                                  lam=self.lam)

        info = dict(advmean=self.buf_advs.mean(),
                    advstd=self.buf_advs.std(),
                    retmean=self.buf_rets.mean(),
                    retstd=self.buf_rets.std(),
                    vpredmean=self.rollout.buf_vpreds.mean(),
                    vpredstd=self.rollout.buf_vpreds.std(),
                    ev=explained_variance(self.rollout.buf_vpreds.ravel(),
                                          self.buf_rets.ravel()),
                    rew_mean=np.mean(self.rollout.buf_rews),
                    recent_best_ext_ret=self.rollout.current_max)
        if self.rollout.best_ext_ret is not None:
            info['best_ext_ret'] = self.rollout.best_ext_ret

        # normalize advantages
        if self.normadv:
            m, s = get_mean_and_std(self.buf_advs)
            self.buf_advs = (self.buf_advs - m) / (s + 1e-7)
        envsperbatch = (self.nenvs * self.nsegs_per_env) // self.nminibatches
        envsperbatch = max(1, envsperbatch)
        envinds = np.arange(self.nenvs * self.nsegs_per_env)

        def mask(x, grad_mask):
            if self.early_stop:
                #print("x shape: {}".format(np.shape(x)))
                #grad_mask = self.rollout.grad_mask
                #print("mask shape: {}".format(np.shape(pseudo_dones)))
                #no_grad_mask = 1 - grad_mask
                sh = np.shape(x)
                if sh[1] < np.shape(grad_mask)[1]:
                    return x
                broadcast_shape = (sh[0], sh[1]) + sh[2:]
                #print("mask shape: {}".format(broadcast_shape))
                for i in range(len(broadcast_shape) - 2):
                    #    no_grad_mask = tf.expand_dims(no_grad_mask, -1)
                    grad_mask = np.expand_dims(grad_mask, -1)
                #no_grad_mask =tf.cast(no_grad_mask, x.dtype)
                #grad_mask = tf.cast(grad_mask, x.dtype)
                #result = tf.placeholder(x.dtype, shape=broadcast_shape)
                #result = tf.stop_gradient(tf.multiply(no_grad_mask, x)) + tf.multiply(grad_mask, x)
                #print("Result size: {}".format(result.shape))
                result = np.multiply(grad_mask, x)
                return result
            else:
                return x
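        # mask() zeroes out timesteps past an early-stop boundary so they
        # contribute nothing to the update; the expand_dims loop above broadcasts
        # grad_mask up to x's rank before the elementwise multiply.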

        def resh(x):
            if self.nsegs_per_env == 1:
                return x
            sh = x.shape
            return x.reshape((sh[0] * self.nsegs_per_env,
                              self.nsteps_per_seg) + sh[2:])

        new_count = np.count_nonzero(self.rollout.buf_news)
        print(self.rollout.buf_news)
        if self.early_stop:
            print(self.rollout.grad_mask)
        print(new_count)
        ph_buf = [
            (self.stochpol.ph_ac, resh(self.rollout.buf_acs)),
            (self.ph_rews, resh(self.rollout.buf_rews)),
            (self.ph_oldvpred, resh(self.rollout.buf_vpreds)),
            (self.ph_oldnlp, resh(self.rollout.buf_nlps)),
            (self.stochpol.ph_ob, resh(self.rollout.buf_obs)),
            (self.ph_ret, resh(self.buf_rets)),
            (self.ph_adv, resh(self.buf_advs)),
        ]
        if self.depth_pred:
            ph_buf.extend([
                (self.stochpol.ph_depths, resh(self.rollout.buf_depths)),
            ])
        if self.aux_input:
            ph_buf.extend([
                (self.stochpol.ph_vel, resh(self.rollout.buf_vels)),
                (self.stochpol.ph_prev_rew,
                 resh(self.rollout.buf_prev_ext_rews)),
                (self.stochpol.ph_prev_ac, resh(self.rollout.buf_prev_acs)),
            ])
        if self.dynamics.auxiliary_task.features_shared_with_policy:
            ph_buf.extend([
                (self.dynamics.auxiliary_task.ph_features,
                 resh(self.rollout.buf_feats)),
                (self.dynamics.auxiliary_task.ph_last_features,
                 resh(np.expand_dims(self.rollout.buf_feat_last, axis=1))),
            ])
        #print("Buff obs shape: {}".format(self.rollout.buf_obs.shape))
        #print("Buff rew shape: {}".format(self.rollout.buf_rews.shape))
        #print("Buff nlps shape: {}".format(self.rollout.buf_nlps.shape))
        #print("Buff vpreds shape: {}".format(self.rollout.buf_vpreds.shape))
        ph_buf.extend([(self.dynamics.last_ob,
                        self.rollout.buf_obs_last.reshape([
                            self.nenvs * self.nsegs_per_env, 1,
                            *self.ob_space.shape
                        ]))])
        mblossvals = []
        #if self.lstm:
        #print("Train lstm 1 state: {}, {}".format(self.rollout.train_lstm1_c, self.rollout.train_lstm1_h))
        #if self.lstm2_size:
        #print("Train lstm2 state: {}, {}".format(self.rollout.train_lstm2_c, self.rollout.train_lstm2_h))
        for _ in range(self.nepochs):
            np.random.shuffle(envinds)
            for start in range(0, self.nenvs * self.nsegs_per_env,
                               envsperbatch):
                end = start + envsperbatch
                mbenvinds = envinds[start:end]
                #mbenvinds = tf.convert_to_tensor(mbenvinds)
                #fd = {ph: buf[mbenvinds] if type(buf) is np.ndarray else buf.eval()[mbenvinds] for (ph, buf) in ph_buf}
                if self.early_stop:
                    grad_mask = self.rollout.grad_mask[mbenvinds]
                    fd = {ph: buf[mbenvinds] for (ph, buf) in ph_buf}
                    fd.update({self.ph_gradmask: grad_mask})
                else:
                    fd = {ph: buf[mbenvinds] for (ph, buf) in ph_buf}
                fd.update({
                    self.ph_lr: self.lr,
                    self.ph_cliprange: self.cliprange
                })
                if self.lstm:
                    fd.update({
                        self.stochpol.c_in_1:
                        self.rollout.train_lstm1_c[mbenvinds, :],
                        self.stochpol.h_in_1:
                        self.rollout.train_lstm1_h[mbenvinds, :]
                    })
                if self.lstm and self.lstm2_size:
                    fd.update({
                        self.stochpol.c_in_2:
                        self.rollout.train_lstm2_c[mbenvinds, :],
                        self.stochpol.h_in_2:
                        self.rollout.train_lstm2_h[mbenvinds, :]
                    })
                if self.log_grads:
                    outs = getsess().run(
                        self._losses + (self._train, self._summary), fd)
                    losses = outs[:-2]
                    summary = outs[-1]
                    mblossvals.append(losses)
                    wandb.tensorflow.log(tf.summary.merge_all())
                    self.grad_writer.add_summary(
                        summary,
                        getsess().run(self.global_step))
                else:
                    mblossvals.append(getsess().run(
                        self._losses + (self._train, ), fd)[:-1])
        mblossvals = [mblossvals[0]]
        info.update(
            zip(['opt_' + ln for ln in self.loss_names],
                np.mean([mblossvals[0]], axis=0)))
        info["rank"] = MPI.COMM_WORLD.Get_rank()
        self.n_updates += 1
        info["n_updates"] = self.n_updates
        info.update({
            dn: (np.mean(dvs) if len(dvs) > 0 else 0)
            for (dn, dvs) in self.rollout.statlists.items()
        })
        info.update(self.rollout.stats)
        if "states_visited" in info:
            info.pop("states_visited")
        tnow = time.time()
        info["ups"] = 1. / (tnow - self.t_last_update)
        info["total_secs"] = tnow - self.t_start
        info['tps'] = (MPI.COMM_WORLD.Get_size() * self.rollout.nsteps *
                       self.nenvs / (tnow - self.t_last_update))
        self.t_last_update = tnow

        return info
Example #9
    transforms = utils.get_trans(size=cfg.img_size)

    train_dst = MyDataset(x_train, y_train, transform=transforms['train'])
    valid_dst = MyDataset(x_val, y_val, transform=transforms['val'])

    train_loader = torch.utils.data.DataLoader(train_dst,
                                               batch_size=cfg.bs,
                                               shuffle=True,
                                               pin_memory=True)
    valid_loader = torch.utils.data.DataLoader(valid_dst,
                                               batch_size=cfg.bs,
                                               shuffle=False,
                                               pin_memory=True)

    # get the mean and std
    print(utils.get_mean_and_std(train_dst))
    # use a multi-model ensemble
    models_list = get_model(cfg.model_names)
    for i, cur_cnn in enumerate(models_list):
        cnn = cur_cnn
        # name used because the model will be saved
        name = cfg.model_names[i] + '.pkl'
        cnn.to(device)
        # train on the data
        loss_fn = nn.CrossEntropyLoss()
        # loss_fn = utils.LabelSmoothingCrossEntropy()
        optimizer = optim.Adam(cnn.parameters(), lr=cfg.lr, weight_decay=1e-4)
        # optimizer = optim.SGD(cnn.parameters(), lr=cfg.lr, momentum=0.9, nesterov=True)
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                         mode='max',
                                                         patience=3,
Example #10
    # Maps the z-scored value through (1 - e^(-a)) / (1 + e^(-a)) = tanh(a / 2),
    # squashing it smoothly into (-1, 1).
    alpha = (array - mean) * (1. / std)
    return (1 - numpy.exp(-alpha)) / (1 + numpy.exp(-alpha))


# Iterate over a copy: removing items from a list while iterating over it skips elements.
for name in list(feature_names):
    if name == "leptons_" or name == "jets_" or "objects_" in name:
        feature_names.remove(name)

print("Here are the ordered global features:", feature_names)

if args.z_score:
    preprocess_dict = {}
    for feature in feature_names:
        if ("objects_" not in feature and "leptons_" != feature
                and "jets_" != feature):
            mean, stddev = utils.get_mean_and_std(features[feature])
            preprocess_dict[feature] = {
                "mean": float(mean),
                "std_dev": float(stddev)
            }

global_features = utils.create_array(features, feature_names, preprocess_dict,
                                     args.z_score)
global_features_validation = utils.create_array(features_validation,
                                                feature_names, preprocess_dict,
                                                args.z_score)
global_features_data = utils.create_array(features_data, feature_names,
                                          preprocess_dict, args.z_score)
global_features_final_fit = utils.create_array(features_final_fit,
                                               feature_names, preprocess_dict,
                                               args.z_score)
Example #11
    def update(self):
        if self.normrew:
            rffs = np.array(
                [self.rff.update(rew) for rew in self.rollout.buf_rews.T])
            rffs_mean, rffs_std, rffs_count = mpi_moments(rffs.ravel())
            self.rff_rms.update_from_moments(rffs_mean, rffs_std**2,
                                             rffs_count)
            rews = self.rollout.buf_rews / np.sqrt(self.rff_rms.var)
        else:
            rews = np.copy(self.rollout.buf_rews)
        self.calculate_advantages(rews=rews,
                                  use_news=self.use_news,
                                  gamma=self.gamma,
                                  lam=self.lam)

        info = dict(advmean=self.buf_advs.mean(),
                    advstd=self.buf_advs.std(),
                    retmean=self.buf_rets.mean(),
                    retstd=self.buf_rets.std(),
                    vpredmean=self.rollout.buf_vpreds.mean(),
                    vpredstd=self.rollout.buf_vpreds.std(),
                    ev=explained_variance(self.rollout.buf_vpreds.ravel(),
                                          self.buf_rets.ravel()),
                    rew_mean=np.mean(self.rollout.buf_rews),
                    recent_best_ext_ret=self.rollout.current_max)
        if self.rollout.best_ext_ret is not None:
            info['best_ext_ret'] = self.rollout.best_ext_ret

        to_report = {
            'total': 0.0,
            'pg': 0.0,
            'vf': 0.0,
            'ent': 0.0,
            'approxkl': 0.0,
            'clipfrac': 0.0,
            'aux': 0.0,
            'dyn_loss': 0.0,
            'feat_var': 0.0
        }

        # normalize advantages
        if self.normadv:
            m, s = get_mean_and_std(self.buf_advs)
            self.buf_advs = (self.buf_advs - m) / (s + 1e-7)
        envsperbatch = (self.nenvs * self.nsegs_per_env) // self.nminibatches
        envsperbatch = max(1, envsperbatch)
        envinds = np.arange(self.nenvs * self.nsegs_per_env)

        mblossvals = []

        for _ in range(self.nepochs):
            np.random.shuffle(envinds)
            for start in range(0, self.nenvs * self.nsegs_per_env,
                               envsperbatch):
                end = start + envsperbatch
                mbenvinds = envinds[start:end]

                acs = self.rollout.buf_acs[mbenvinds]
                rews = self.rollout.buf_rews[mbenvinds]
                vpreds = self.rollout.buf_vpreds[mbenvinds]
                nlps = self.rollout.buf_nlps[mbenvinds]
                obs = self.rollout.buf_obs[mbenvinds]
                rets = self.buf_rets[mbenvinds]
                advs = self.buf_advs[mbenvinds]
                last_obs = self.rollout.buf_obs_last[mbenvinds]

                lr = self.lr
                cliprange = self.cliprange

                self.stochpol.update_features(obs, acs)
                self.dynamics.auxiliary_task.update_features(obs, last_obs)
                self.dynamics.update_features(obs, last_obs)

                feat_loss = torch.mean(self.dynamics.auxiliary_task.get_loss())
                dyn_loss = torch.mean(self.dynamics.get_loss())

                acs = torch.tensor(flatten_dims(acs, len(self.ac_space.shape)))
                neglogpac = self.stochpol.pd.neglogp(acs)
                entropy = torch.mean(self.stochpol.pd.entropy())
                vpred = self.stochpol.vpred
                vf_loss = 0.5 * torch.mean(
                    (vpred.squeeze() - torch.tensor(rets))**2)

                nlps = torch.tensor(flatten_dims(nlps, 0))
                ratio = torch.exp(nlps - neglogpac.squeeze())

                advs = flatten_dims(advs, 0)
                negadv = torch.tensor(-advs)
                pg_losses1 = negadv * ratio
                pg_losses2 = negadv * torch.clamp(
                    ratio, min=1.0 - cliprange, max=1.0 + cliprange)
                pg_loss_surr = torch.max(pg_losses1, pg_losses2)
                pg_loss = torch.mean(pg_loss_surr)
                ent_loss = (-self.ent_coef) * entropy

                approxkl = 0.5 * torch.mean((neglogpac - nlps)**2)
                clipfrac = torch.mean(
                    (torch.abs(pg_losses2 - pg_loss_surr) > 1e-6).float())
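                # approxkl is the usual PPO diagnostic 0.5 * E[(log pi_old - log pi)^2];
                # clipfrac is the fraction of samples whose ratio hit the clip boundary.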
                feat_var = torch.std(self.dynamics.auxiliary_task.features)

                total_loss = pg_loss + ent_loss + vf_loss + feat_loss + dyn_loss

                total_loss.backward()
                self.optimizer.step()
                self.optimizer.zero_grad()
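                # step() is followed by zero_grad() so this minibatch's gradients
                # do not accumulate into the next backward() pass.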

                to_report['total'] += total_loss.data.numpy() / (
                    self.nminibatches * self.nepochs)
                to_report['pg'] += pg_loss.data.numpy() / (self.nminibatches *
                                                           self.nepochs)
                to_report['vf'] += vf_loss.data.numpy() / (self.nminibatches *
                                                           self.nepochs)
                to_report['ent'] += ent_loss.data.numpy() / (
                    self.nminibatches * self.nepochs)
                to_report['approxkl'] += approxkl.data.numpy() / (
                    self.nminibatches * self.nepochs)
                to_report['clipfrac'] += clipfrac.data.numpy() / (
                    self.nminibatches * self.nepochs)
                to_report['feat_var'] += feat_var.data.numpy() / (
                    self.nminibatches * self.nepochs)
                to_report['aux'] += feat_loss.data.numpy() / (
                    self.nminibatches * self.nepochs)
                to_report['dyn_loss'] += dyn_loss.data.numpy() / (
                    self.nminibatches * self.nepochs)

        info.update(to_report)
        self.n_updates += 1
        info["n_updates"] = self.n_updates
        info.update({
            dn: (np.mean(dvs) if len(dvs) > 0 else 0)
            for (dn, dvs) in self.rollout.statlists.items()
        })
        info.update(self.rollout.stats)
        if "states_visited" in info:
            info.pop("states_visited")
        tnow = time.time()
        info["ups"] = 1. / (tnow - self.t_last_update)
        info["total_secs"] = tnow - self.t_start
        info['tps'] = self.rollout.nsteps * self.nenvs / (
            tnow - self.t_last_update)  # MPI.COMM_WORLD.Get_size() *
        self.t_last_update = tnow

        return info
Example #12
'''AlexNet for CIFAR10. FC layers are removed. Paddings are adjusted.
Without BN, the start learning rate should be 0.01
(c) YANG, Wei
'''

import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.optim as optim
import torch.utils.data as data
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import models.cifar as models


from utils import get_mean_and_std

transform_stats = transforms.Compose([
    transforms.ToTensor()
])
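# ToTensor() scales pixels to [0, 1], so the statistics printed below are in that
# range and can be passed directly to transforms.Normalize.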

dataset = datasets.ImageFolder(root='../Data/coco/images/cbas34_train', transform=transform_stats)

cbas_mean, cbas_std = get_mean_and_std(dataset)

print('CBAS-34 mean: {}'.format(cbas_mean))
print('CBAS-34 std: {}'.format(cbas_std))