def create_disc(sim, save_path):
    import os
    from rts.discriminator import Discriminator

    if save_path is None:
        raise NameError("A save_path should always be given to a discriminator")

    # The discriminator scores transitions laid out as [obs, (next_obs - obs) * 10, act],
    # hence an input dimension of obs_dim * 2 + act_dim.
    disc = Discriminator(sim.obs_dim * 2 + sim.act_dim)
    disc.save_path = save_path

    if nodes.mpi_role == 'main':
        os.makedirs(disc.save_path)
        disc.save(disc.save_path)
        # publish the freshly initialised weights through the warehouse
        data_out = {nodes.pnid + ":actor_weight": warehouse.Entry(action="set", value=disc.get_weights())}
        data = warehouse.send(data_out)

    # every process (main and workers) pulls the same initial weights
    data_out = {nodes.pnid + ":actor_weight": warehouse.Entry(action="get", value=None)}
    data = warehouse.send(data_out)
    disc.set_weights(data[nodes.pnid + ":actor_weight"].value)
    return disc
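# Usage sketch (hypothetical): `sim` is any object exposing obs_dim and act_dim that match the
# transition layout [obs, (next_obs - obs) * 10, act] built by the rollout generators below;
# the path is illustrative only.
#
#   disc = create_disc(sim, save_path="results/exp_0/discriminator")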
def node_wrapper(*args, **kwargs):
    global proc_num
    global pnid
    proc_num += 1
    pnid = str(proc_num) + ":"
    # register this node's index with the warehouse so workers know which node is currently active
    warehouse.send({"proc_num": warehouse.Entry(action="set_max", value=proc_num)})
    return func(*args, **kwargs)
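# `node_wrapper` closes over a free variable `func`, so it is presumably the inner function of a
# decorator defined elsewhere in the repo. A minimal sketch of what that enclosing decorator could
# look like (the name `training_node` is an assumption, not taken from this file):
#
#   def training_node(func):
#       def node_wrapper(*args, **kwargs):
#           ...  # body above: bump proc_num, rebuild pnid, notify the warehouse
#           return func(*args, **kwargs)
#       return node_wrapper
#
#   @training_node
#   def my_node(...):
#       ...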
def load_actor(actor, path):
    if mpi_role == 'main':
        actor.load(path)
        # publish the loaded weights through the warehouse
        data_out = {pnid + ":actor_weight": warehouse.Entry(action="set", value=actor.get_weights())}
        data = warehouse.send(data_out)

    # every process pulls the weights that main just pushed
    data_out = {pnid + ":actor_weight": warehouse.Entry(action="get", value=None)}
    data = warehouse.send(data_out)
    actor.set_weights(data[pnid + ":actor_weight"].value)
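# load_actor, simple_actor and create_disc all rely on the same broadcast idiom: the main process
# "set"s a value in the warehouse, then every process "get"s it back. A minimal generic sketch of
# that idiom (the helper name `broadcast_weights` is hypothetical):
#
#   def broadcast_weights(model, key):
#       if mpi_role == 'main':
#           warehouse.send({key: warehouse.Entry(action="set", value=model.get_weights())})
#       data = warehouse.send({key: warehouse.Entry(action="get", value=None)})
#       model.set_weights(data[key].value)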
def generate_trans_batch(env, actor, rollout_nb, rollout_len, log_std, save_path):
    mpi_role = nodes.mpi_role
    proc_num = nodes.proc_num
    pnid = nodes.pnid

    if mpi_role == 'main':
        os.makedirs(save_path)
        # wait for rollout_nb rollouts from the workers, then dump them to disk
        msg = {pnid + "trans": warehouse.Entry(action="get_l", value=rollout_nb)}
        data = warehouse.send(msg)
        all_trans = np.stack(data[pnid + "trans"].value)
        np.save(os.path.join(save_path, "all_trans.npy"), all_trans)

    elif mpi_role == 'worker':
        msg = {"proc_num": warehouse.Entry(action="get", value=None)}
        data = warehouse.send(msg)
        while proc_num >= data["proc_num"].value and not warehouse.is_work_done:
            all_trans = []
            obs = env.reset()
            obs = np.asarray(obs).reshape((1, 1, -1))
            for i in range(rollout_len):
                trans = [obs.flatten()]
                act = actor.model(obs).numpy()
                # exploration noise with fixed log standard deviation
                act = act + np.random.normal(size=act.flatten().shape[0]).reshape(act.shape) * np.exp(log_std)
                obs, rew, done = env.step(act)
                obs = np.asarray(obs).reshape((1, 1, -1))
                trans.append((obs.flatten() - trans[0]) * 10)
                trans.append(act.flatten())
                all_trans.append(np.concatenate(trans))
            all_trans = np.asarray(all_trans).reshape((rollout_len, -1))

            # push the rollout and check whether this node is still the active one
            msg = {
                pnid + "trans": warehouse.Entry(action="add", value=all_trans),
                "proc_num": warehouse.Entry(action="get", value=None)}
            data = warehouse.send(msg)
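# Usage sketch (hypothetical values): every rank calls the function; the main process waits for
# `rollout_nb` rollouts and writes all_trans.npy, while the workers keep generating rollouts until
# the active node changes or the work is done.
#
#   generate_trans_batch(env, actor, rollout_nb=1000, rollout_len=100,
#                        log_std=-1.5, save_path="results/exp_0/real_trans")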
def simple_actor(obs_dim, act_dim, obs_mean=None, obs_std=None, blindfold=None, inp_dim=None, save_path=None):
    import os
    from models.actor import SimpleActor

    if save_path is None:
        raise NameError("A save_path should always be given to an actor")

    actor = SimpleActor(obs_dim, act_dim, obs_mean=obs_mean, obs_std=obs_std, blindfold=blindfold, inp_dim=inp_dim)
    actor.save_path = save_path

    if mpi_role == 'main':
        os.makedirs(actor.save_path)
        actor.save(actor.save_path)
        # publish the freshly initialised weights through the warehouse
        data_out = {pnid + ":actor_weight": warehouse.Entry(action="set", value=actor.get_weights())}
        data = warehouse.send(data_out)

    # every process pulls the same initial weights so that all copies of the actor match
    data_out = {pnid + ":actor_weight": warehouse.Entry(action="get", value=None)}
    data = warehouse.send(data_out)
    actor.set_weights(data[pnid + ":actor_weight"].value)
    return actor
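# Usage sketch (hypothetical dimensions and paths): build the actor on every rank, then optionally
# warm-start it from a previous experiment with load_actor above.
#
#   actor = simple_actor(obs_dim=48, act_dim=12, save_path="results/exp_0/actor")
#   load_actor(actor, "results/exp_0/actor")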
def train_discrim(disc, real_trans_path, all_env, actor, epoch_nb, train_step_per_epoch,
                  rollout_per_epoch, rollout_len, log_std, model_save_interval, tensorboard_path):
    mpi_role = nodes.mpi_role
    proc_num = nodes.proc_num
    pnid = nodes.pnid

    if mpi_role == 'main':
        os.makedirs(tensorboard_path)
        from rts.discriminator import Trainer
        trainer = Trainer(disc, tensorboard_path)
        trainer.model_save_interval = model_save_interval

        # one-hot labels: real (pre-recorded) transitions vs synthetic (simulated) transitions
        real_lab = np.asarray([1, 0]).reshape((1, 2))
        synth_lab = np.asarray([0, 1]).reshape((1, 2))

        all_real_trans = np.load(os.path.join(real_trans_path, "all_trans.npy"))
        all_real_trans, all_real_labs = format_trans(all_real_trans, real_lab)

        start_time = time.time()

        for n in range(epoch_nb):
            # get the latest rollouts
            msg = {
                pnid + "trans": warehouse.Entry(action="get_l", value=rollout_per_epoch),
                "dumped": warehouse.Entry(action="get", value=None)}
            data = warehouse.send(msg)
            dumped_rollout_nb = data["dumped"].value
            all_synth_trans_raw = data[pnid + "trans"].value
            all_synth_trans, all_synth_labs = format_trans(np.concatenate(all_synth_trans_raw, axis=0), synth_lab)

            # put the training data together
            all_trans = np.concatenate([all_real_trans, all_synth_trans], axis=0)
            all_labs = np.concatenate([all_real_labs, all_synth_labs], axis=0)

            # random offset for regularisation
            all_trans += np.random.normal(size=all_trans.shape) * 0.003

            # update the network weights
            accuracy = trainer.train_network(n, all_trans, all_labs, train_step_per_epoch)

            # debug
            n_rollouts = len(all_synth_trans_raw)
            print("Epoch {} :".format(n), flush=True)
            print("Loaded {} synthetic rollouts for training while dumping {} for a total of {} transitions.".format(
                n_rollouts, dumped_rollout_nb, all_synth_trans.shape[0]), flush=True)
            dt = time.time() - start_time
            start_time = time.time()
            if dt > 0:
                print("fps : {}".format(all_synth_trans.shape[0] / dt), flush=True)
            print("accuracy : {}/{}".format(accuracy, all_trans.shape[0]), flush=True)

            # publish the updated discriminator weights for the other processes
            msg = {pnid + ":disc_weight": warehouse.Entry(action="set", value=disc.get_weights())}
            data = warehouse.send(msg)

    elif mpi_role == "worker":
        msg = {"proc_num": warehouse.Entry(action="get", value=None)}
        data = warehouse.send(msg)
        while proc_num >= data["proc_num"].value and not warehouse.is_work_done:
            all_trans = []
            env = all_env[np.random.randint(len(all_env))]
            obs = env.reset()
            obs = np.asarray(obs).reshape((1, 1, -1))
            done = [False]
            i = 0
            while i < rollout_len and not done[0]:
                i += 1
                trans = [obs.flatten()]
                act = actor.model(obs).numpy()
                act = act + np.random.normal(size=act.flatten().shape[0]).reshape(act.shape) * np.exp(log_std)
                obs, rew, done = env.step(act)
                obs = np.asarray(obs).reshape((1, 1, -1))
                trans.append((obs.flatten() - trans[0]) * 10)
                trans.append(act.flatten())
                all_trans.append(np.concatenate(trans))
            all_trans = np.asarray(all_trans).reshape((i, -1))

            # push the rollout and check whether this node is still the active one
            msg = {
                pnid + "trans": warehouse.Entry(action="add", value=all_trans),
                "proc_num": warehouse.Entry(action="get", value=None)}
            data = warehouse.send(msg)

        # once the node is done, fetch the trained discriminator weights published by main
        msg = {pnid + ":disc_weight": warehouse.Entry(action="get", value=None)}
        data = warehouse.send(msg)
        disc.set_weights(data[pnid + ":disc_weight"].value)
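# Usage sketch (hypothetical arguments): the discriminator is trained to separate the pre-recorded
# transitions stored by generate_trans_batch (label [1, 0]) from transitions freshly simulated by
# the workers in `all_env` (label [0, 1]); format_trans is assumed to be defined elsewhere in the repo.
#
#   train_discrim(disc, real_trans_path="results/exp_0/real_trans", all_env=[env], actor=actor,
#                 epoch_nb=100, train_step_per_epoch=10, rollout_per_epoch=50, rollout_len=100,
#                 log_std=-1.5, model_save_interval=10,
#                 tensorboard_path="results/exp_0/tensorboard/discrim")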
def run(self, save_path, proc_num, input_dict, output_dict):
    env = input_dict['Environment'][0]
    actor = input_dict['Actor'][0]
    teacher = input_dict['Teacher'][0]

    # --- set the weights of the actor action_layers to those of the teacher's ---
    for actor_layer, teacher_layer in zip(actor.action_layers, teacher.action_layers):
        actor_layer.set_weights(teacher_layer.get_weights())
        actor_layer.trainable = False

    USE_ADR = hasattr(env, 'adr')

    if self.mpi_role == 'main':
        tensorboard_path = os.path.join(save_path['tensorboard'], self.data['tensorboard_name_prop'])
        os.makedirs(tensorboard_path)

        trainer = Distillation(env, actor, teacher, tensorboard_path)
        trainer.model_save_interval = int(self.data['model_save_interval_prop'])
        train_step_nb = int(self.data['train_step_nb_prop'])

        start_time = time.time()
        desired_rollout_nb = int(self.data['rollout_nb_prop'])

        for n in range(int(self.data['epoch_nb_prop'])):
            # send the network weights
            # and get the latest rollouts
            req = ["s", "a", "r", "mask", "dumped", "adr"]
            msg = {
                "node": proc_num,
                "weights": trainer.get_weights(),
                "rollout_nb": desired_rollout_nb,
                "request": req}
            data = warehouse.send(msg)

            all_s = data["s"]
            all_a = data["a"]
            all_r = data["r"]
            all_masks = data["mask"]
            dumped_rollout_nb = data["dumped"]

            if USE_ADR:
                env.adr.update(data["adr"])
                env.adr.log()

            # update the network weights
            trainer.train_network(n, all_s, all_a, all_r, all_masks, train_step_nb)

            # debug
            n_rollouts = all_s.shape[0]
            rollout_len = all_s.shape[1]
            print("Epoch {} :".format(n), flush=True)
            print("Loaded {} rollouts for training while dumping {}.".format(n_rollouts, dumped_rollout_nb), flush=True)
            dt = time.time() - start_time
            start_time = time.time()
            if dt > 0:
                print("fps : {}".format(n_rollouts * rollout_len / dt), flush=True)
            print("mean_rew : {}".format(np.sum(all_r * all_masks) / np.sum(all_masks)), flush=True)

        if USE_ADR:
            env.adr.save()

    elif self.mpi_role == 'worker':
        trainer = Distillation(env, actor, teacher)
        rollout_len = int(self.data['rollout_len_prop'])

        #data = warehouse.send({"request":["node"]}) ; self.data['name'] == data['node']"
        msg = {"request": ["weights", "node"]}
        data = warehouse.send(msg)
        # wait until this node becomes the active one
        while proc_num > data["node"]:
            time.sleep(0.3)
            data = warehouse.send(msg)

        while proc_num == data["node"]:
            test_adr = USE_ADR and np.random.random() < float(self.data['adr_prob_prop'])
            env.test_adr = test_adr
            trainer.set_weights(data["weights"])

            if test_adr:
                # simulate rollout
                all_s, all_a, all_r, all_mask = trainer.get_rollout(env.adr_rollout_len)
                msg = {
                    "node": proc_num,
                    "adr": env.adr.get_msg(),
                    "request": ["weights", "adr", "node"]}
            else:
                # simulate rollout
                all_s, all_a, all_r, all_mask = trainer.get_rollout(rollout_len)
                # send rollout back to warehouse
                # and get network weights and update actor
                msg = {
                    "node": proc_num,
                    "s": all_s,
                    "a": all_a,
                    "r": all_r,
                    "mask": all_mask,
                    "request": ["weights", "adr", "node"]}

            data = warehouse.send(msg)
            if USE_ADR:
                env.adr.update(data["adr"])

    # restore the action layers so downstream nodes can train them again
    for actor_layer, teacher_layer in zip(actor.action_layers, teacher.action_layers):
        actor_layer.trainable = True

    output_dict['Trained actor'] = trainer.actor
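# Note: this Distillation node still uses the older list-based warehouse protocol
# ({"request": [...], "weights": ..., "node": ...}), while the functions above use the Entry-based
# one. A minimal sketch of the same weight hand-off in each style (keys hypothetical):
#
#   data = warehouse.send({"node": proc_num, "weights": w, "request": ["weights", "node"]})
#   data = warehouse.send({pnid + "weights": warehouse.Entry(action="set", value=w),
#                          "proc_num": warehouse.Entry(action="get", value=None)})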
def train_ppo(actor, env, epoch_nb, rollout_per_epoch, rollout_len, train_step_per_epoch,
              init_log_std, model_save_interval, adr_test_prob, tensorboard_path):
    mpi_role = nodes.mpi_role
    proc_num = nodes.proc_num
    pnid = nodes.pnid

    import os
    import time
    from ppo import PPO
    from models.critic import Critic

    USE_ADR = hasattr(env, 'adr') and adr_test_prob > 1e-7

    if mpi_role == 'main':
        os.makedirs(tensorboard_path)
        critic = Critic(env)
        trainer = PPO(env, actor, critic, tensorboard_path, init_log_std=init_log_std)
        trainer.model_save_interval = model_save_interval

        start_time = time.time()

        for n in range(epoch_nb):
            # send the network weights
            # and get the latest rollouts
            msg = {
                pnid + "weights": warehouse.Entry(action="set", value=trainer.get_weights()),
                pnid + "adr": warehouse.Entry(action="get", value=None),
                pnid + "s": warehouse.Entry(action="get_l", value=rollout_per_epoch),
                pnid + "a": warehouse.Entry(action="get_l", value=rollout_per_epoch),
                pnid + "r": warehouse.Entry(action="get_l", value=rollout_per_epoch),
                pnid + "neglog": warehouse.Entry(action="get_l", value=rollout_per_epoch),
                pnid + "mask": warehouse.Entry(action="get_l", value=rollout_per_epoch),
                "dumped": warehouse.Entry(action="get", value=None)}
            data = warehouse.send(msg)

            all_s = np.concatenate(data[pnid + "s"].value, axis=0)
            all_a = np.concatenate(data[pnid + "a"].value, axis=0)
            all_r = np.concatenate(data[pnid + "r"].value, axis=0)
            all_neglog = np.concatenate(data[pnid + "neglog"].value, axis=0)
            all_masks = np.concatenate(data[pnid + "mask"].value, axis=0)
            dumped_rollout_nb = data["dumped"].value

            if USE_ADR:
                env.adr.update(data[pnid + "adr"].value)
                env.adr.log()

            # update the network weights
            all_last_values, all_gae, all_new_value = trainer.calc_gae(all_s, all_r, all_masks)
            trainer.train_networks(n, all_s, all_a, all_r, all_neglog, all_masks,
                                   train_step_per_epoch, all_last_values, all_gae, all_new_value)

            # debug
            n_rollouts = all_s.shape[0]
            cur_rollout_len = all_s.shape[1]
            print("Epoch {} :".format(n), flush=True)
print("Loaded {} rollouts for training while dumping {}.".format(n_rollouts, dumped_rollout_nb), flush=True) dt = time.time() - start_time start_time = time.time() if dt > 0: print("fps : {}".format(n_rollouts*cur_rollout_len/dt), flush=True) print("mean_rew : {}".format(np.sum(all_r * all_masks)/np.sum(all_masks)), flush=True) if USE_ADR: env.adr.save() elif mpi_role == 'worker': trainer = PPO(env, actor, Critic(env), init_log_std=init_log_std) msg = { pnid+"weights" : warehouse.Entry(action="get", value=None), pnid+"adr" : warehouse.Entry(action="set", value={}), "proc_num" : warehouse.Entry(action="get", value=None)} data = warehouse.send(msg) while proc_num >= data["proc_num"].value and not warehouse.is_work_done: test_adr = USE_ADR and np.random.random() < adr_test_prob env.test_adr = test_adr trainer.set_weights (data[pnid+"weights"].value) if test_adr: # simulate rollout all_s, all_a, all_r, all_neglog, all_mask = trainer.get_rollout(env.adr_rollout_len) msg = { pnid+"adr" : warehouse.Entry(action="update", value=env.adr.get_msg()), pnid+"weights" : warehouse.Entry(action="get", value=None), "proc_num" : warehouse.Entry(action="get", value=None)} else: # simulate rollout all_s, all_a, all_r, all_neglog, all_mask = trainer.get_rollout(rollout_len) # send rollout back to warehouse # and get network weights to update actor msg = { pnid+"s" : warehouse.Entry(action="add", value=all_s), pnid+"a" : warehouse.Entry(action="add", value=all_a), pnid+"r" : warehouse.Entry(action="add", value=all_r), pnid+"neglog" : warehouse.Entry(action="add", value=all_neglog), pnid+"mask" : warehouse.Entry(action="add", value=all_mask), pnid+"weights" : warehouse.Entry(action="get", value=None), pnid+"adr" : warehouse.Entry(action="get", value=None), "proc_num" : warehouse.Entry(action="get", value=None)} data = warehouse.send(msg) if USE_ADR: env.adr.update(data[pnid+"adr"].value)
""" mpiexec -n 4 python start_training.py exp_0 mpiexec -n 32 python start_training.py exp_0 tensorboard --logdir=results/exp_0/tensorboard --host localhost --port 6006 ps -aux """ if __name__ == "__main__": comm = MPI.COMM_WORLD my_rank = comm.Get_rank() my_name = MPI.Get_processor_name() mpi_role = 'main' if my_rank == 0 else ('wh' if my_rank == 1 else 'worker') nodes.mpi_role = mpi_role nodes.my_rank = my_rank warehouse.start_warehouse(comm, my_rank, 1) if not mpi_role == 'wh': main_programm () warehouse.send({}, work_done=True) print("Thread {} has ended neatly.".format(my_rank))