def master_update(self):
    # Receive a gradient from any worker, apply an Adam step, and send back
    # the updated flat parameter vector to that worker.
    t1 = time.time()
    ##
    update_info = self.comm.recv(source=MPI.ANY_SOURCE,
                                 tag=TAG_UPDATE_START,
                                 status=self.status)
    worker_source = self.status.Get_source()
    ##
    t2 = time.time()
    t = t2 - t1
    dh_logger.info(
        jm(type='receive_gradient',
           rank=self.comm.Get_rank(),
           duration=t,
           start_time=t1,
           end_time=t2,
           rank_worker_source=worker_source))

    t1 = time.time()
    ##
    workerg = update_info['workerg']
    stepsize = update_info['stepsize']
    if self.scale_grad_by_procs:
        workerg /= self.comm.Get_size() - 1  # one rank is the parameter server
    self.t += 1
    a = stepsize * np.sqrt(1 - self.beta2**self.t) / (1 - self.beta1**self.t)
    self.m = self.beta1 * self.m + (1 - self.beta1) * workerg
    self.v = self.beta2 * self.v + (1 - self.beta2) * (workerg * workerg)
    step = (-a) * self.m / (np.sqrt(self.v) + self.epsilon)
    update_vars = self.getflat() + step
    self.setfromflat(update_vars)
    ##
    t2 = time.time()
    t = t2 - t1
    dh_logger.info(
        jm(type='update_parameters',
           rank=self.comm.Get_rank(),
           duration=t,
           start_time=t1,
           end_time=t2,
           rank_worker_source=worker_source))

    t1 = time.time()
    ##
    self.comm.send(update_vars, dest=worker_source, tag=TAG_UPDATE_DONE)
    ##
    t2 = time.time()
    t = t2 - t1
    dh_logger.info(
        jm(type='send_parameters',
           rank=self.comm.Get_rank(),
           duration=t,
           start_time=t1,
           end_time=t2,
           rank_worker_dest=worker_source))
    return worker_source
def worker_update(self, localg, stepsize):
    # Send the local gradient to the master (rank 0) and block until the
    # updated parameters come back.
    update_info = dict(workerg=localg, stepsize=stepsize)

    t1 = time.time()
    ##
    self.comm.send(update_info, dest=0, tag=TAG_UPDATE_START)
    update_vars = self.comm.recv(source=0,
                                 tag=TAG_UPDATE_DONE,
                                 status=self.status)
    ##
    t2 = time.time()
    t = t2 - t1
    dh_logger.info(
        jm(type='receive_parameters',
           rank=self.comm.Get_rank(),
           duration=t,
           start_time=t1,
           end_time=t2,
           master_rank=0))

    t1 = time.time()
    ##
    self.setfromflat(update_vars)
    ##
    t2 = time.time()
    t = t2 - t1
    dh_logger.info(
        jm(type='setfromflat',
           rank=self.comm.Get_rank(),
           duration=t,
           start_time=t1,
           end_time=t2,
           master_rank=0))
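For reference, `master_update` applies the standard bias-corrected Adam rule to each gradient $g$ received from a worker; this is only a restatement of the code above, with $\alpha$ the `stepsize` sent by the worker (and $g$ first divided by the number of worker ranks when `scale_grad_by_procs` is set):

$$
m_t = \beta_1 m_{t-1} + (1-\beta_1)\,g,\qquad
v_t = \beta_2 v_{t-1} + (1-\beta_2)\,g^2,\qquad
\theta_t = \theta_{t-1} - \alpha\,\frac{\sqrt{1-\beta_2^{\,t}}}{1-\beta_1^{\,t}}\,
\frac{m_t}{\sqrt{v_t}+\epsilon}.
$$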
def train(self, num_epochs=None):
    num_epochs = self.num_epochs if num_epochs is None else num_epochs

    if num_epochs > 0:
        min_mse = math.inf
        for i in range(num_epochs):
            self.model.fit(self.dataset_train,
                           epochs=1,
                           steps_per_epoch=self.train_steps_per_epoch,
                           callbacks=self.callbacks)

            y_orig, y_pred = self.predict()

            try:
                unnormalize_mse = mean_squared_error(y_orig, y_pred)
            except ValueError:
                logger.error(traceback.format_exc())
                unnormalize_mse = np.finfo('float32').max

            # self.train_history[f'{self.metrics_name[0]}_valid'] = unnormalize_mse
            min_mse = min(min_mse, unnormalize_mse)
            logger.info(jm(epoch=i, validation_mse=float(unnormalize_mse)))
        logger.info(jm(type='result', mse=float(min_mse)))
        return min_mse
    elif num_epochs == 0:
        y_orig, y_pred = self.predict()

        try:
            unnormalize_mse = mean_squared_error(y_orig, y_pred)
        except ValueError:
            logger.error(traceback.format_exc())
            unnormalize_mse = np.finfo('float32').max

        logger.info(jm(epoch=0, validation_mse=float(unnormalize_mse)))
        logger.info(jm(type='result', mse=float(unnormalize_mse)))
        return unnormalize_mse
    else:
        raise RuntimeError(f'Number of epochs should be >= 0: {num_epochs}')
def train(self, num_epochs=None):
    num_epochs = self.num_epochs if num_epochs is None else num_epochs

    if num_epochs > 0:
        max_acc = 0
        for i in range(num_epochs):
            self.model.fit(self.dataset_train,
                           epochs=1,
                           steps_per_epoch=self.train_steps_per_epoch,
                           callbacks=self.callbacks)

            valid_info = self.model.evaluate(
                self.dataset_valid, steps=self.valid_steps_per_epoch)
            valid_loss, valid_acc = valid_info[0], valid_info[1] * 100

            max_acc = max(max_acc, valid_acc)
            logger.info(
                jm(epoch=i,
                   validation_loss=valid_loss,
                   validation_acc=float(valid_acc)))
        logger.info(jm(type='result', acc=float(max_acc)))
        return max_acc
    elif num_epochs == 0:
        valid_info = self.model.evaluate(self.dataset_valid,
                                         steps=self.valid_steps_per_epoch)
        valid_loss, valid_acc = valid_info[0], valid_info[1] * 100

        logger.info(
            jm(epoch=0,
               validation_loss=valid_loss,
               validation_acc=float(valid_acc)))
        logger.info(jm(type='result', acc=float(valid_acc)))
        return valid_acc
    else:
        raise RuntimeError(f'Number of epochs should be >= 0: {num_epochs}')
def train(num_episodes, seed, space, evaluator, num_episodes_per_batch):
    rank = MPI.COMM_WORLD.Get_rank()
    if rank == 0:
        # rank zero simulates the parameter server
        pass
    else:
        workerseed = (seed + 10000 * MPI.COMM_WORLD.Get_rank()
                      if seed is not None else None)
        set_global_seeds(workerseed)

        # MAKE ENV_NAS
        cs_kwargs = space['create_structure'].get('kwargs')
        if cs_kwargs is None:
            structure = space['create_structure']['func']()
        else:
            structure = space['create_structure']['func'](**cs_kwargs)

        num_nodes = structure.num_nodes
        timesteps_per_actorbatch = num_nodes * num_episodes_per_batch
        num_timesteps = timesteps_per_actorbatch * num_episodes
        max_timesteps = num_timesteps

        env = NasEnv(space, evaluator, structure)

        seg_gen = traj_segment_generator(env, timesteps_per_actorbatch)

        timesteps_so_far = 0
        iters_so_far = 0

        cond = sum([max_timesteps > 0])
        assert cond == 1, f"Only one time constraint permitted: cond={cond}, max_timesteps={max_timesteps}"

        while True:
            if max_timesteps and timesteps_so_far >= max_timesteps:
                break
            logger.log("********** Iteration %i ************" % iters_so_far)

            seg = seg_gen.__next__()
            dh_logger.info(
                jm(type='seg', rank=MPI.COMM_WORLD.Get_rank(), **seg))
            # each generated segment contains timesteps_per_actorbatch timesteps
            timesteps_so_far += timesteps_per_actorbatch
            iters_so_far += 1

        env.close()
def learn(
        env,
        policy_fn,
        *,
        timesteps_per_actorbatch,  # timesteps per actor per update
        clip_param,
        entcoeff,  # clipping parameter epsilon, entropy coeff
        optim_epochs,
        optim_stepsize,
        optim_batchsize,  # optimization hypers
        gamma,
        lam,  # advantage estimation
        max_timesteps=0,
        max_episodes=0,
        max_iters=0,
        max_seconds=0,  # time constraint
        callback=None,  # you can do anything in the callback, since it takes locals(), globals()
        adam_epsilon=1e-5,
        schedule='constant',  # annealing for stepsize parameters (epsilon and adam)
        reward_rule=reward_for_final_timestep):
    rank = MPI.COMM_WORLD.Get_rank()

    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_fn("pi", ob_space, ac_space)  # Construct network for new policy
    oldpi = policy_fn("oldpi", ob_space, ac_space)  # Network for old policy
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg
    pol_surr = -tf.reduce_mean(
        tf.minimum(surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                             losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdamAsync(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv, newv) in zipsame(oldpi.get_variables(),
                                        pi.get_variables())
        ])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

    U.initialize()

    t1 = time.time()
    ##
    adam.sync()
    ##
    t2 = time.time()
    t = t2 - t1
    dh_logger.info(
        jm(type='adam.sync', rank=rank, duration=t, start_time=t1,
           end_time=t2))

    if rank == 0:  # rank ZERO is the parameter server
        while True:
            t1 = time.time()
            ## BEGIN - TIMING ##
            rank_worker_source = adam.master_update()
            ## END - TIMING ##
            t2 = time.time()
            t = t2 - t1
            dh_logger.info(
                jm(type='adam.master_update',
                   rank=rank,
                   duration=t,
                   rank_worker_source=rank_worker_source,
                   start_time=t1,
                   end_time=t2))
    else:
        # Prepare for rollouts
        # ----------------------------------------
        seg_gen = traj_segment_generator(pi,
                                         env,
                                         timesteps_per_actorbatch,
                                         stochastic=True,
                                         reward_affect_func=reward_rule)

        episodes_so_far = 0
        timesteps_so_far = 0
        iters_so_far = 0
        tstart = time.time()
        lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
        rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards

        cond = sum([
            max_iters > 0, max_timesteps > 0, max_episodes > 0, max_seconds > 0
        ])
        assert cond == 1, f"Only one time constraint permitted: cond={cond}, max_iters={max_iters}, max_timesteps={max_timesteps}, max_episodes={max_episodes}, max_seconds={max_seconds}"

        while True:
            if callback:
                callback(locals(), globals())
            if max_timesteps and timesteps_so_far >= max_timesteps:
                break
            elif max_episodes and episodes_so_far >= max_episodes:
                break
            elif max_iters and iters_so_far >= max_iters:
                break
            elif max_seconds and time.time() - tstart >= max_seconds:
                break

            if schedule == 'constant':
                cur_lrmult = 1.0
            elif schedule == 'linear':
                cur_lrmult = max(
                    1.0 - float(timesteps_so_far) / max_timesteps, 0)
            else:
                raise NotImplementedError

            # logger.log("********** Iteration %i ************" % iters_so_far)

            t1 = time.time()
            ## BEGIN - TIMING ##
            seg = seg_gen.__next__()
            ## END - TIMING ##
            t2 = time.time()
            t = t2 - t1
            dh_logger.info(
                jm(type='batch_computation',
                   rank=rank,
                   duration=t,
                   start_time=t1,
                   end_time=t2))
            dh_logger.info(jm(type='seg', rank=rank, **seg))

            add_vtarg_and_adv(seg, gamma, lam)

            # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
            ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"]
            vpredbefore = seg["vpred"]  # predicted value function before update
            atarg = (atarg - atarg.mean()) / atarg.std()  # standardized advantage function estimate
            d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret),
                        shuffle=not pi.recurrent)
            # optim_batchsize = optim_batchsize or ob.shape[0]
            optim_batchsize = ob.shape[0]

            if hasattr(pi, "ob_rms"):
                pi.ob_rms.update(ob)  # update running mean/std for policy

            assign_old_eq_new()  # set old parameter values to new parameter values

            dh_logger.info(f"Rank={rank}: Optimizing...")
            # Here we do a bunch of optimization epochs over the data
            for _ in range(optim_epochs):
                losses = []  # list of tuples, each of which gives the loss for a minibatch
                for batch in d.iterate_once(optim_batchsize):
                    *newlosses, g = lossandgrad(batch["ob"], batch["ac"],
                                                batch["atarg"], batch["vtarg"],
                                                cur_lrmult)

                    t1 = time.time()
                    ## BEGIN - TIMING ##
                    adam.worker_update(g, optim_stepsize * cur_lrmult)
                    ## END - TIMING ##
                    t2 = time.time()
                    t = t2 - t1
                    dh_logger.info(
                        jm(type='adam.worker_update',
                           rank=rank,
                           duration=t,
                           start_time=t1,
                           end_time=t2))

                    losses.append(newlosses)

            dh_logger.info(f"Rank={rank}: Evaluating losses...")
            losses = []
            for batch in d.iterate_once(optim_batchsize):
                newlosses = compute_losses(batch["ob"], batch["ac"],
                                           batch["atarg"], batch["vtarg"],
                                           cur_lrmult)
                losses.append(newlosses)
            meanlosses, _, _ = mpi_moments(losses, axis=0, use_mpi=False)

            lens = seg["ep_lens"]
            rews = seg["ep_rets"]
            episodes_so_far += len(lens)
            timesteps_so_far += sum(lens)
            iters_so_far += 1

    return pi
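The policy loss `pol_surr` built in both `learn` variants is the usual PPO clipped surrogate. Writing $r_t(\theta) = \pi_\theta(a_t \mid s_t) / \pi_{\theta_\text{old}}(a_t \mid s_t)$ for `ratio` and $\hat{A}_t$ for the standardized advantage `atarg`, the objective being maximized is

$$
L^{CLIP}(\theta) = \hat{\mathbb{E}}_t\!\left[\min\!\left(r_t(\theta)\,\hat{A}_t,\;
\operatorname{clip}\!\left(r_t(\theta),\,1-\epsilon,\,1+\epsilon\right)\hat{A}_t\right)\right],
$$

with $\epsilon$ equal to `clip_param` (annealed through `lrmult`); the code minimizes its negative plus the value-function loss `vf_loss` and the entropy penalty `pol_entpen`.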
def learn(
        env,
        policy_fn,
        *,
        timesteps_per_actorbatch,  # timesteps per actor per update
        clip_param,
        entcoeff,  # clipping parameter epsilon, entropy coeff
        optim_epochs,
        optim_stepsize,
        optim_batchsize,  # optimization hypers
        gamma,
        lam,  # advantage estimation
        max_timesteps=0,  # time constraint
        callback=None,  # you can do anything in the callback, since it takes locals(), globals()
        adam_epsilon=1e-5,
        schedule='constant',  # annealing for stepsize parameters (epsilon and adam)
        reward_rule=reward_for_final_timestep):
    rank = MPI.COMM_WORLD.Get_rank()

    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_fn("pi", ob_space, ac_space)  # Construct network for new policy
    oldpi = policy_fn("oldpi", ob_space, ac_space)  # Network for old policy
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    input_c_vf = U.get_placeholder_cached(name="c_vf")
    input_h_vf = U.get_placeholder_cached(name="h_vf")
    input_c_pol = U.get_placeholder_cached(name="c_pol")
    input_h_pol = U.get_placeholder_cached(name="h_pol")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg
    pol_surr = -tf.reduce_mean(
        tf.minimum(surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function(
        [ob, ac, atarg, ret, lrmult, input_c_vf, input_h_vf, input_c_pol, input_h_pol],
        losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv, newv) in zipsame(oldpi.get_variables(),
                                        pi.get_variables())
        ])
    compute_losses = U.function(
        [ob, ac, atarg, ret, lrmult, input_c_vf, input_h_vf, input_c_pol, input_h_pol],
        losses)

    U.initialize()
    adam.sync()

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator_ph(pi,
                                        env,
                                        timesteps_per_actorbatch,
                                        stochastic=True,
                                        reward_affect_func=reward_rule)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards

    assert max_timesteps > 0, f"The number of timesteps should be > 0 but is {max_timesteps}"

    while True:
        if callback:
            callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************" % iters_so_far)

        seg = seg_gen.__next__()
        dh_logger.info(jm(type='seg', rank=rank, **seg))
        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"]
        c_vf = np.squeeze(np.array([c for c, _ in seg["hs_vf"]]))
        h_vf = np.squeeze(np.array([h for _, h in seg["hs_vf"]]))
        c_pol = np.squeeze(np.array([c for c, _ in seg["hs_pol"]]))
        h_pol = np.squeeze(np.array([h for _, h in seg["hs_pol"]]))
        vpredbefore = seg["vpred"]  # predicted value function before update
        atarg = (atarg - atarg.mean()) / atarg.std()  # standardized advantage function estimate
        d = Dataset(dict(ob=ob,
                         ac=ac,
                         atarg=atarg,
                         vtarg=tdlamret,
                         c_vf=c_vf,
                         h_vf=h_vf,
                         c_pol=c_pol,
                         h_pol=h_pol),
                    shuffle=not pi.recurrent)
        # optim_batchsize = optim_batchsize or ob.shape[0]
        optim_batchsize = ob.shape[0]

        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        assign_old_eq_new()  # set old parameter values to new parameter values

        logger.log("Optimizing...")
        logger.log(fmt_row(13, loss_names))
        # Here we do a bunch of optimization epochs over the data
        for _ in range(optim_epochs):
            losses = []  # list of tuples, each of which gives the loss for a minibatch
            gradients = []
            for batch in d.iterate_once(optim_batchsize):
                for i in range(len(batch["ob"])):
                    *newlosses, g = lossandgrad(
                        batch["ob"][i:i + 1], batch["ac"][i:i + 1],
                        batch["atarg"][i:i + 1], batch["vtarg"][i:i + 1],
                        cur_lrmult, batch["c_vf"][i:i + 1],
                        batch["h_vf"][i:i + 1], batch["c_pol"][i:i + 1],
                        batch["h_pol"][i:i + 1])
                    losses.append(newlosses)
                    gradients.append(g)
            g = np.array(gradients).sum(0)
            adam.update(g, optim_stepsize * cur_lrmult)
            logger.log(fmt_row(13, np.mean(losses, axis=0)))

        logger.log("Evaluating losses...")
        losses = []
        for batch in d.iterate_once(optim_batchsize):
            for i in range(len(batch["ob"])):
                newlosses = compute_losses(
                    batch["ob"][i:i + 1], batch["ac"][i:i + 1],
                    batch["atarg"][i:i + 1], batch["vtarg"][i:i + 1],
                    cur_lrmult, batch["c_vf"][i:i + 1],
                    batch["h_vf"][i:i + 1], batch["c_pol"][i:i + 1],
                    batch["h_pol"][i:i + 1])
                losses.append(newlosses)
        meanlosses, _, _ = mpi_moments(losses, axis=0)
        logger.log(fmt_row(13, meanlosses))
        for (lossval, name) in zipsame(meanlosses, loss_names):
            logger.record_tabular("loss_" + name, lossval)
        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))

        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.dump_tabular()

    return pi
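Both `learn` variants call `add_vtarg_and_adv(seg, gamma, lam)` before building the dataset. That helper is not shown here; assuming it follows the standard baselines implementation, it fills `seg["adv"]` with the GAE($\gamma,\lambda$) advantage estimate and `seg["tdlamret"]` with the corresponding value target:

$$
\delta_t = r_t + \gamma V(s_{t+1}) - V(s_t), \qquad
\hat{A}_t = \sum_{l \ge 0} (\gamma\lambda)^l \,\delta_{t+l}, \qquad
\text{tdlamret}_t = \hat{A}_t + V(s_t).
$$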