def __init__(self, env_fns, spaces=None):
    """
    envs: list of gym environments to run in subprocesses
    """
    self.waiting = False
    self.closed = False
    nenvs = len(env_fns)
    self.remotes, self.work_remotes = zip(*[Pipe() for _ in range(nenvs)])
    # set_start_method('forkserver')
    set_start_method('spawn')
    # set_start_method('fork')
    self.ps = [
        Process(target=worker, args=(work_remote, remote, CloudpickleWrapper(env_fn)))
        for (work_remote, remote, env_fn) in zip(self.work_remotes, self.remotes, env_fns)
    ]
    for p in self.ps:
        p.daemon = True  # if the main process crashes, we should not cause things to hang
        p.start()
    for remote in self.work_remotes:
        remote.close()
    self.remotes[0].send(('get_spaces', None))
    observation_space, action_space = self.remotes[0].recv()
    VecEnv.__init__(self, len(env_fns), observation_space, action_space)
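# The `worker` target passed to each Process above is not part of this excerpt.
# A minimal sketch consistent with the protocol used here ('get_spaces' above,
# plus the usual 'step'/'reset'/'close' commands) follows; the exact command
# set is an assumption based on baselines-style subprocess vec-envs.
def worker(remote, parent_remote, env_fn_wrapper):
    # Close the parent's end of the pipe in the child, build the env,
    # then serve commands until told to close.
    parent_remote.close()
    env = env_fn_wrapper.x()
    while True:
        cmd, data = remote.recv()
        if cmd == 'step':
            ob, reward, done, info = env.step(data)
            if done:
                ob = env.reset()  # auto-reset so the vec-env never stalls
            remote.send((ob, reward, done, info))
        elif cmd == 'reset':
            remote.send(env.reset())
        elif cmd == 'get_spaces':
            remote.send((env.observation_space, env.action_space))
        elif cmd == 'close':
            remote.close()
            break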
def start_interaction(self, env_fns, dynamics, nlump=2):
    self.loss_names, self._losses = zip(*list(self.to_report.items()))
    params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
    if MPI.COMM_WORLD.Get_size() > 1:
        trainer = MpiAdamOptimizer(learning_rate=self.ph_lr, comm=MPI.COMM_WORLD)
    else:
        trainer = tf.train.AdamOptimizer(learning_rate=self.ph_lr)
    gradsandvars = trainer.compute_gradients(self.total_loss, params)
    self._train = trainer.apply_gradients(gradsandvars)

    if MPI.COMM_WORLD.Get_rank() == 0:
        getsess().run(
            tf.variables_initializer(tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)))
    bcast_tf_vars_from_root(
        getsess(), tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES))

    self.all_visited_rooms = []
    self.all_scores = []
    self.nenvs = nenvs = len(env_fns)
    self.nlump = nlump
    self.lump_stride = nenvs // self.nlump
    self.envs = [
        VecEnv(env_fns[l * self.lump_stride:(l + 1) * self.lump_stride],
               spaces=[self.ob_space, self.ac_space]) for l in range(self.nlump)
    ]
    self.rollout = Rollout(hps=self.hps, ob_space=self.ob_space, ac_space=self.ac_space,
                           nenvs=nenvs, nsteps_per_seg=self.nsteps_per_seg,
                           nsegs_per_env=self.nsegs_per_env, nlumps=self.nlump,
                           envs=self.envs, policy=self.stochpol,
                           int_rew_coeff=self.int_coeff, ext_rew_coeff=self.ext_coeff,
                           record_rollouts=self.use_recorder, dynamics=dynamics)

    self.buf_advs = np.zeros((nenvs, self.rollout.nsteps), np.float32)
    self.buf_advs_int = np.zeros((nenvs, self.rollout.nsteps), np.float32)
    self.buf_advs_ext = np.zeros((nenvs, self.rollout.nsteps), np.float32)
    self.buf_rets = np.zeros((nenvs, self.rollout.nsteps), np.float32)
    self.buf_rets_int = np.zeros((nenvs, self.rollout.nsteps), np.float32)
    self.buf_rets_ext = np.zeros((nenvs, self.rollout.nsteps), np.float32)

    if self.normrew:
        self.rff = RewardForwardFilter(self.gamma)
        self.rff_rms = RunningMeanStd()

    self.step_count = 0
    self.t_last_update = time.time()
    self.t_start = time.time()
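# RewardForwardFilter and RunningMeanStd (used when normrew is set) are not
# defined in this excerpt. A minimal sketch of the forward filter, matching
# the usual pairing where rewards are divided by the running std of this
# discounted sum (the exact normalization step is an assumption):
import numpy as np

class RewardForwardFilter(object):
    """Per-env discounted reward sum, updated forward in time."""
    def __init__(self, gamma):
        self.rewems = None  # running estimate, one entry per env
        self.gamma = gamma

    def update(self, rews):
        if self.rewems is None:
            self.rewems = rews
        else:
            self.rewems = self.rewems * self.gamma + rews
        return self.rewems

# Typical use: rffs = rff.update(rews_per_env); rff_rms.update(rffs);
# normalized_rews = rews / np.sqrt(rff_rms.var).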
def start_interaction(self, env_fns, dynamics, nlump=2):
    self.loss_names, self._losses = zip(*list(self.to_report.items()))
    params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
    self.caculate_number_parameters(params)
    flow_params = [v for v in params if 'flow' in v.name]
    other_params = [v for v in params if 'flow' not in v.name]
    print('length of flow params: ', len(flow_params))
    print('length of agent params: ', len(other_params))

    trainer_flow = tf.train.AdamOptimizer(learning_rate=self.flow_lr)
    trainer_agent = tf.train.AdamOptimizer(learning_rate=self.ph_lr)
    grads = tf.gradients(self.total_loss, flow_params + other_params)
    grads_flow = grads[:len(flow_params)]
    grads_agent = grads[len(flow_params):]
    train_flow = trainer_flow.apply_gradients(zip(grads_flow, flow_params))
    train_agent = trainer_agent.apply_gradients(zip(grads_agent, other_params))
    self._train = tf.group(train_flow, train_agent)

    if MPI.COMM_WORLD.Get_rank() == 0:
        getsess().run(
            tf.variables_initializer(tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)))
    bcast_tf_vars_from_root(
        getsess(), tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES))

    self.all_visited_rooms = []
    self.all_scores = []
    self.nenvs = nenvs = len(env_fns)
    self.nlump = nlump
    self.lump_stride = nenvs // self.nlump
    self.envs = [
        VecEnv(env_fns[l * self.lump_stride:(l + 1) * self.lump_stride],
               spaces=[self.ob_space, self.ac_space]) for l in range(self.nlump)
    ]
    self.rollout = Rollout(ob_space=self.ob_space, ac_space=self.ac_space, nenvs=nenvs,
                           nsteps_per_seg=self.nsteps_per_seg,
                           nsegs_per_env=self.nsegs_per_env, nlumps=self.nlump,
                           envs=self.envs, policy=self.stochpol,
                           int_rew_coeff=self.int_coeff, ext_rew_coeff=self.ext_coeff,
                           record_rollouts=self.use_recorder, dynamics=dynamics)

    self.buf_advs = np.zeros((nenvs, self.rollout.nsteps), np.float32)
    self.buf_rets = np.zeros((nenvs, self.rollout.nsteps), np.float32)

    if self.normrew:
        self.rff = RewardForwardFilter(self.gamma)
        self.rff_rms = RunningMeanStd()

    self.step_count = 0
    self.t_last_update = time.time()
    self.t_start = time.time()
def start_interaction(self, env_fns, dynamics, nlump=2):
    # Define the variables and compute graph used once interaction with the
    # environment starts, and initialize the Rollout class.
    self.loss_names, self._losses = zip(*list(self.to_report.items()))

    # Define losses, gradients and backprop; training calls
    # sess.run(self._train) to perform one iteration.
    params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
    params_dvae = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="dvae_reward")
    print("total params:", np.sum([np.prod(v.get_shape().as_list()) for v in params]))  # 6629459
    print("dvae params:", np.sum([np.prod(v.get_shape().as_list()) for v in params_dvae]))  # 2726144

    if MPI.COMM_WORLD.Get_size() > 1:
        trainer = MpiAdamOptimizer(learning_rate=self.ph_lr, comm=MPI.COMM_WORLD)
    else:
        trainer = tf.train.AdamOptimizer(learning_rate=self.ph_lr)
    gradsandvars = trainer.compute_gradients(self.total_loss, params)
    self._train = trainer.apply_gradients(gradsandvars)

    # add bai. Compute the DVAE gradients separately.
    gradsandvars_dvae = trainer.compute_gradients(self.dynamics_loss, params_dvae)
    self._train_dvae = trainer.apply_gradients(gradsandvars_dvae)

    if MPI.COMM_WORLD.Get_rank() == 0:
        getsess().run(
            tf.variables_initializer(tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)))
    bcast_tf_vars_from_root(
        getsess(), tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES))

    self.all_visited_rooms = []
    self.all_scores = []
    self.nenvs = nenvs = len(env_fns)  # default 128
    self.nlump = nlump  # default 1
    self.lump_stride = nenvs // self.nlump  # 128 / 1 = 128
    self.envs = [
        VecEnv(env_fns[l * self.lump_stride:(l + 1) * self.lump_stride],
               spaces=[self.ob_space, self.ac_space]) for l in range(self.nlump)
    ]
    # Rollout is defined in rollouts.py.
    self.rollout = Rollout(ob_space=self.ob_space, ac_space=self.ac_space, nenvs=nenvs,
                           nsteps_per_seg=self.nsteps_per_seg,
                           nsegs_per_env=self.nsegs_per_env, nlumps=self.nlump,
                           envs=self.envs, policy=self.stochpol,
                           int_rew_coeff=self.int_coeff, ext_rew_coeff=self.ext_coeff,
                           record_rollouts=self.use_recorder, dynamics=dynamics)

    # Buffers shaped (number of envs/threads, rollout horizon T).
    self.buf_advs = np.zeros((nenvs, self.rollout.nsteps), np.float32)
    self.buf_rets = np.zeros((nenvs, self.rollout.nsteps), np.float32)

    if self.normrew:
        self.rff = RewardForwardFilter(self.gamma)
        self.rff_rms = RunningMeanStd()

    self.step_count = 0
    self.t_last_update = time.time()
    self.t_start = time.time()
def main():
    # assert jericho.__version__ == '2.1.0', "This code is designed to be run with Jericho version 2.1.0."
    args = parse_args()
    print(args)
    configure_logger(args.output_dir)
    start_redis()
    agent = DRRN_Agent(args)
    env = JerichoEnv(args.rom_path, args.seed, args.env_step_limit)
    envs = VecEnv(args.num_envs, env)
    env.create()  # Create the environment for evaluation
    train(agent, env, envs, args.max_steps, args.update_freq, args.eval_freq,
          args.checkpoint_freq, args.log_freq)
def start_interaction(self, env_fns, dynamics, nlump=2):
    # Shared references to the parameters, not a deep copy.
    param_list = (self.stochpol.param_list + self.dynamics.param_list +
                  self.dynamics.auxiliary_task.param_list)
    self.optimizer = torch.optim.Adam(param_list, lr=self.lr)
    self.optimizer.zero_grad()

    self.all_visited_rooms = []
    self.all_scores = []
    self.nenvs = nenvs = len(env_fns)
    self.nlump = nlump
    self.lump_stride = nenvs // self.nlump
    self.envs = [
        VecEnv(env_fns[l * self.lump_stride:(l + 1) * self.lump_stride],
               spaces=[self.ob_space, self.ac_space]) for l in range(self.nlump)
    ]
    self.rollout = Rollout(ob_space=self.ob_space, ac_space=self.ac_space, nenvs=nenvs,
                           nsteps_per_seg=self.nsteps_per_seg,
                           nsegs_per_env=self.nsegs_per_env, nlumps=self.nlump,
                           envs=self.envs, policy=self.stochpol,
                           int_rew_coeff=self.int_coeff, ext_rew_coeff=self.ext_coeff,
                           record_rollouts=self.use_recorder, dynamics=dynamics)

    self.buf_advs = np.zeros((nenvs, self.rollout.nsteps), np.float32)
    self.buf_rets = np.zeros((nenvs, self.rollout.nsteps), np.float32)

    if self.normrew:
        self.rff = RewardForwardFilter(self.gamma)
        self.rff_rms = RunningMeanStd()

    self.step_count = 0
    self.t_last_update = time.time()
    self.t_start = time.time()
                    default=11,
                    help='number of events to record (default: 11)')
args = parser.parse_args()

try:
    os.makedirs(args.log_dir)
except OSError:
    pass

cig = "cig" in args.config_path

global envs
es = [
    make_env(i, args.config_path, visual=args.visual, cig=cig)
    for i in range(args.num_processes)
]
envs = VecEnv([es[i] for i in range(args.num_processes)])

scenario = args.config_path.split("/")[1].split(".")[0]
exp_name = scenario + ("_event" if args.roe else "")
print("Scenario: " + scenario)

actor_critic = torch.load(
    "/Users/git/rarity-of-events/models/3/783a542d-71cf-11e9-8daf-005056a54761.pt")
# actor_critic = torch.load("/Users/git/rarity-of-events/models/2/f399ade2-6d52-11e9-8dad-005056a54761.pt")
print("Model loaded")
actor_critic.eval()

obs_shape = envs.observation_space_shape
from envs import make_visual_env, make_env
from vec_env import VecEnv
from time import sleep
from random import choice
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--vis', type=int, default=0)
args = parser.parse_args()

num_envs = 1
if args.vis:
    envs = VecEnv([
        make_visual_env('./scenarios/deathmatch_maze.cfg')
        for i in range(num_envs)
    ])
else:
    envs = VecEnv([
        make_env(0, './scenarios/deathmatch_maze.cfg')
        for i in range(num_envs)
    ])

# Define some actions. Each list entry corresponds to declared buttons:
# MOVE_LEFT, MOVE_RIGHT, ATTACK
# 5 more combinations are naturally possible but only 3 are included for
# transparency when watching.
# actions = [[True, False, False], [False, True, False], [False, False, True]]
actions = range(envs.action_space_shape)

episode_num = 0
while True:
    print('Episode #', episode_num)
    for j in range(1000):
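        # The excerpt breaks off inside this loop. A minimal body, assuming
        # the VecEnv.step used elsewhere in this repo returns
        # (obs, reward, done, info) and accepts one discrete action index per
        # environment:
        cpu_actions = [choice(actions) for _ in range(num_envs)]
        obs, reward, done, info = envs.step(cpu_actions)
        sleep(0.028)  # roughly real-time playback when watching
        if any(done):
            break
    episode_num += 1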
def main():
    print("###############################################################")
    print("#################### VISDOOM LEARNER START ####################")
    print("###############################################################")

    os.environ['OMP_NUM_THREADS'] = '1'

    if args.vis:
        from visdom import Visdom
        viz = Visdom()
        win = None

    global envs
    envs = VecEnv(
        [make_env(i, args.config_path) for i in range(args.num_processes)],
        logging=True,
        log_dir=args.log_dir)

    obs_shape = envs.observation_space_shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    if args.algo == 'a2c' or args.algo == 'acktr':
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space_shape)
    elif args.algo == 'a2t':
        source_models = []
        files = glob.glob(os.path.join(args.source_models_path, '*.pt'))
        for file in files:
            print(file, 'loading model...')
            source_models.append(torch.load(file))
        actor_critic = A2TPolicy(obs_shape[0], envs.action_space_shape, source_models)
    elif args.algo == 'resnet':
        # args.num_stack = 3
        actor_critic = ResnetPolicy(obs_shape[0], envs.action_space_shape)

    action_shape = 1

    if args.cuda:
        actor_critic.cuda()

    if args.algo == 'a2c' or args.algo == 'resnet':
        optimizer = optim.RMSprop(actor_critic.parameters(), args.lr,
                                  eps=args.eps, alpha=args.alpha)
    elif args.algo == 'a2t':
        a2t_params = [p for p in actor_critic.parameters() if p.requires_grad]
        optimizer = optim.RMSprop(a2t_params, args.lr, eps=args.eps, alpha=args.alpha)
    elif args.algo == 'acktr':
        optimizer = KFACOptimizer(actor_critic)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space_shape)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        shape_dim0 = envs.observation_space_shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            value, action = actor_critic.act(
                Variable(rollouts.observations[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Observe reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            # print('Actions:', cpu_actions, 'Rewards:', reward)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)
            rollouts.insert(step, current_obs, action.data, value.data, reward, masks)

        next_value = actor_critic(
            Variable(rollouts.observations[-1], volatile=True))[0].data

        if hasattr(actor_critic, 'obs_filter'):
            actor_critic.obs_filter.update(
                rollouts.observations[:-1].view(-1, *obs_shape))

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

        if args.algo in ['a2c', 'acktr', 'a2t', 'resnet']:
            values, action_log_probs, dist_entropy = actor_critic.evaluate_actions(
                Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
                Variable(rollouts.actions.view(-1, action_shape)))

            values = values.view(args.num_steps, args.num_processes, 1)
            action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1)

            advantages = Variable(rollouts.returns[:-1]) - values
            value_loss = advantages.pow(2).mean()
            action_loss = -(Variable(advantages.data) * action_log_probs).mean()

            if args.algo == 'acktr' and optimizer.steps % optimizer.Ts == 0:
                # Sampled fisher, see Martens 2014
                actor_critic.zero_grad()
                pg_fisher_loss = -action_log_probs.mean()

                value_noise = Variable(torch.randn(values.size()))
                if args.cuda:
                    value_noise = value_noise.cuda()
                sample_values = values + value_noise
                vf_fisher_loss = -(values - Variable(sample_values.data)).pow(2).mean()

                fisher_loss = pg_fisher_loss + vf_fisher_loss
                optimizer.acc_stats = True
                fisher_loss.backward(retain_graph=True)
                optimizer.acc_stats = False

            optimizer.zero_grad()
            (value_loss * args.value_loss_coef + action_loss -
             dist_entropy * args.entropy_coef).backward()

            if args.algo == 'a2c' or args.algo == 'resnet':
                nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm)
            elif args.algo == 'a2t':
                nn.utils.clip_grad_norm(a2t_params, args.max_grad_norm)

            optimizer.step()

        rollouts.observations[0].copy_(rollouts.observations[-1])

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
            torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            envs.log()
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print("Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, "
                  "min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, "
                  "policy loss {:.5f}".format(
                      j, total_num_steps, int(total_num_steps / (end - start)),
                      final_rewards.mean(), final_rewards.median(),
                      final_rewards.min(), final_rewards.max(),
                      -dist_entropy.data[0], value_loss.data[0], action_loss.data[0]))

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, 'VizDoom', args.algo)
            except IOError:
                pass

    envs.close()
    time.sleep(5)
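# rollouts.compute_returns above is not defined in this excerpt. A minimal
# sketch of the non-GAE path, consistent with how next_value, gamma and the
# 0/1 done masks are used here (names, shapes and the mask convention are
# assumptions):
import torch

def compute_returns(rewards, masks, next_value, gamma):
    """Plain discounted returns, computed backwards from the bootstrap value.

    masks[t] is 0 where the episode ended at step t, which zeroes the
    bootstrap across episode boundaries.
    """
    num_steps = rewards.size(0)
    returns = torch.zeros(num_steps + 1, *rewards.size()[1:])
    returns[-1] = next_value
    for step in reversed(range(num_steps)):
        returns[step] = returns[step + 1] * gamma * masks[step] + rewards[step]
    return returns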
def main():
    print("###############################################################")
    print("#################### VIZDOOM LEARNER START ####################")
    print("###############################################################")

    save_path = os.path.join(args.save_dir, "a2c")
    num_updates = int(args.num_frames) // args.num_steps // args.num_processes

    reward_name = ""
    if args.roe:
        reward_name = "_event"

    scenario_name = args.config_path.split("/")[1].split(".")[0]
    print("############### " + scenario_name + " ###############")

    log_file_name = "vizdoom_" + scenario_name + reward_name + "_" + str(args.agent_id) + ".log"
    log_event_file_name = "vizdoom_" + scenario_name + reward_name + "_" + str(args.agent_id) + ".eventlog"
    log_event_reward_file_name = "vizdoom_" + scenario_name + reward_name + "_" + str(args.agent_id) + ".eventrewardlog"

    start_updates = 0
    start_step = 0
    best_final_rewards = -1000000.0

    os.environ['OMP_NUM_THREADS'] = '1'

    global envs
    es = [
        make_env(i, args.config_path, visual=args.visual, bots=args.bots)
        for i in range(args.num_processes)
    ]
    envs = VecEnv([es[i] for i in range(args.num_processes)])

    obs_shape = envs.observation_space_shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    if args.resume:
        actor_critic = torch.load(os.path.join(save_path, log_file_name + ".pt"))
        filename = glob.glob(os.path.join(args.log_dir, log_file_name))[0]
        if args.roe:
            pass  # TODO: Load event buffer
        with open(filename) as file:
            lines = file.readlines()
            start_updates = (int)(lines[-1].strip().split(",")[0])
            start_steps = (int)(lines[-1].strip().split(",")[1])
            num_updates += start_updates
    else:
        if not args.debug:
            try:
                os.makedirs(args.log_dir)
            except OSError:
                files = glob.glob(os.path.join(args.log_dir, log_file_name))
                for f in files:
                    os.remove(f)
                with open(log_file_name, "w") as myfile:
                    myfile.write("")
                files = glob.glob(os.path.join(args.log_dir, log_event_file_name))
                for f in files:
                    os.remove(f)
                with open(log_event_file_name, "w") as myfile:
                    myfile.write("")
                files = glob.glob(os.path.join(args.log_dir, log_event_reward_file_name))
                for f in files:
                    os.remove(f)
                with open(log_event_reward_file_name, "w") as myfile:
                    myfile.write("")
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space_shape)

    action_shape = 1

    if args.cuda:
        actor_critic.cuda()

    optimizer = optim.RMSprop(actor_critic.parameters(), args.lr,
                              eps=args.eps, alpha=args.alpha)
    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space_shape)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        shape_dim0 = envs.observation_space_shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)

    last_game_vars = []
    for i in range(args.num_processes):
        last_game_vars.append(np.zeros(args.num_events))

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])
    episode_intrinsic_rewards = torch.zeros([args.num_processes, 1])
    final_intrinsic_rewards = torch.zeros([args.num_processes, 1])
    episode_events = torch.zeros([args.num_processes, args.num_events])
    final_events = torch.zeros([args.num_processes, args.num_events])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    # Create event buffer
    if args.qd:
        event_buffer = EventBufferSQLProxy(args.num_events, args.capacity,
                                           args.exp_id, args.agent_id)
    elif not args.resume:
        event_buffer = EventBuffer(args.num_events, args.capacity)
    else:
        event_buffer = pickle.load(open(log_file_name + "_event_buffer_temp.p", "rb"))

    event_episode_rewards = []

    start = time.time()
    for j in np.arange(start_updates, num_updates):
        for step in range(args.num_steps):
            value, action = actor_critic.act(
                Variable(rollouts.observations[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            obs, reward, done, info, events = envs.step(cpu_actions)
            intrinsic_reward = []

            # Fix broken rewards - upscale
            for i in range(len(reward)):
                if scenario_name in ["deathmatch", "my_way_home"]:
                    reward[i] *= 100
                if scenario_name == "deadly_corridor":
                    reward[i] = 1 if events[i][2] >= 1 else 0

            for e in events:
                if args.roe:
                    intrinsic_reward.append(event_buffer.intrinsic_reward(e))
                else:
                    r = reward[len(intrinsic_reward)]
                    intrinsic_reward.append(r)

            reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()
            intrinsic_reward = torch.from_numpy(
                np.expand_dims(np.stack(intrinsic_reward), 1)).float()
            # events = torch.from_numpy(np.expand_dims(np.stack(events), args.num_events)).float()
            events = torch.from_numpy(events).float()

            episode_rewards += reward
            episode_intrinsic_rewards += intrinsic_reward
            episode_events += events

            # Event stats
            event_rewards = []
            for ei in range(0, args.num_events):
                ev = np.zeros(args.num_events)
                ev[ei] = 1
                er = event_buffer.intrinsic_reward(ev)
                event_rewards.append(er)
            event_episode_rewards.append(event_rewards)

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])

            final_rewards *= masks
            final_intrinsic_rewards *= masks
            final_events *= masks

            final_rewards += (1 - masks) * episode_rewards
            final_intrinsic_rewards += (1 - masks) * episode_intrinsic_rewards
            final_events += (1 - masks) * episode_events

            for i in range(args.num_processes):
                if done[i]:
                    event_buffer.record_events(np.copy(final_events[i].numpy()),
                                               frame=j * args.num_steps)

            episode_rewards *= masks
            episode_intrinsic_rewards *= masks
            episode_events *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)
            rollouts.insert(step, current_obs, action.data, value.data,
                            intrinsic_reward, masks)

        final_episode_reward = np.mean(event_episode_rewards, axis=0)
        event_episode_rewards = []

        next_value = actor_critic(
            Variable(rollouts.observations[-1], volatile=True))[0].data

        if hasattr(actor_critic, 'obs_filter'):
            actor_critic.obs_filter.update(
                rollouts.observations[:-1].view(-1, *obs_shape))

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

        values, action_log_probs, dist_entropy = actor_critic.evaluate_actions(
            Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
            Variable(rollouts.actions.view(-1, action_shape)))

        values = values.view(args.num_steps, args.num_processes, 1)
        action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1)

        advantages = Variable(rollouts.returns[:-1]) - values
        value_loss = advantages.pow(2).mean()
        action_loss = -(Variable(advantages.data) * action_log_probs).mean()

        optimizer.zero_grad()
        (value_loss * args.value_loss_coef + action_loss -
         dist_entropy * args.entropy_coef).backward()
        nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm)
        optimizer.step()

        rollouts.observations[0].copy_(rollouts.observations[-1])

        if final_rewards.mean() > best_final_rewards and not args.debug:
            try:
                os.makedirs(save_path)
            except OSError:
                pass
            best_final_rewards = final_rewards.mean()
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
            torch.save(save_model,
                       os.path.join(save_path, log_file_name.split(".log")[0] + ".pt"))

        if j % args.save_interval == 0 and args.save_dir != "" and not args.debug:
            try:
                os.makedirs(save_path)
            except OSError:
                pass
            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
            torch.save(save_model, os.path.join(save_path, log_file_name + "_temp.pt"))
            if isinstance(event_buffer, EventBuffer):
                pickle.dump(event_buffer,
                            open(log_file_name + "_event_buffer_temp.p", "wb"))

        if j % args.log_interval == 0:
            envs.log()
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            log = "Updates {}, num timesteps {}, FPS {}, mean/max reward {:.5f}/{:.5f}, " \
                  "mean/max intrinsic reward {:.5f}/{:.5f}" \
                .format(j, total_num_steps, int(total_num_steps / (end - start)),
                        final_rewards.mean(), final_rewards.max(),
                        final_intrinsic_rewards.mean(), final_intrinsic_rewards.max())
            log_to_file = "{}, {}, {:.5f}, {:.5f}, {:.5f}, {:.5f}\n" \
                .format(j, total_num_steps, final_rewards.mean(), final_rewards.std(),
                        final_intrinsic_rewards.mean(), final_intrinsic_rewards.std())
            log_to_event_file = ','.join(
                map(str, event_buffer.get_event_mean().tolist())) + "\n"
            log_to_event_reward_file = ','.join(
                map(str, event_buffer.get_event_rewards().tolist())) + "\n"
            print(log)
            print(log_to_event_file)

            # Save to files
            with open(log_file_name, "a") as myfile:
                myfile.write(log_to_file)
            with open(log_event_file_name, "a") as myfile:
                myfile.write(str(total_num_steps) + "," + log_to_event_file)
            with open(log_event_reward_file_name, "a") as myfile:
                myfile.write(str(total_num_steps) + "," + log_to_event_reward_file)

    envs.close()
    time.sleep(5)
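# EventBuffer itself is defined elsewhere in the repo. A minimal sketch under
# the Rarity-of-Events assumption (rare events earn larger intrinsic rewards;
# the exact normalization below is an assumption, not the repo's code):
import numpy as np

class EventBufferSketch(object):
    """Tracks end-of-episode event counts; rewards each event inversely to its mean."""

    def __init__(self, num_events, capacity):
        self.num_events = num_events
        self.capacity = capacity
        self.events = []  # most recent `capacity` end-of-episode event vectors

    def record_events(self, events, frame=0):
        self.events.append(events)
        if len(self.events) > self.capacity:
            self.events.pop(0)

    def get_event_mean(self):
        if not self.events:
            return np.zeros(self.num_events)
        return np.mean(self.events, axis=0)

    def get_event_rewards(self):
        # Inverse mean occurrence, clamped so very rare events don't explode.
        return 1.0 / np.clip(self.get_event_mean(), 1.0, None)

    def intrinsic_reward(self, events):
        return float(np.sum(np.asarray(events) * self.get_event_rewards()))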
def main():
    print("###############################################################")
    print("#################### VIZDOOM LEARNER START ####################")
    print("###############################################################")

    save_path = os.path.join(args.save_dir, str(args.exp_id))
    log_path = os.path.join(args.log_dir, str(args.exp_id))
    num_updates = int(args.num_frames) // args.num_steps // args.num_processes

    reward_name = ""
    if args.roe:
        reward_name = "_event"

    scenario_name = args.config_path.split("/")[1].split(".")[0]
    print("############### " + scenario_name + " ###############")

    log_file_name = "vizdoom_" + scenario_name + reward_name + "_" + \
        str(args.exp_id) + "_" + str(args.agent_id) + ".log"
    # log_event_file_name = "vizdoom_" + scenario_name + reward_name + "_" + str(args.exp_id) + "_" + str(args.agent_id) + ".eventlog"
    # log_event_reward_file_name = "vizdoom_" + scenario_name + reward_name + "_" + str(args.exp_id) + "_" + str(args.agent_id) + ".eventrewardlog"

    start_updates = 0
    start_step = 0
    best_final_rewards = -1000000.0

    os.environ['OMP_NUM_THREADS'] = '1'

    cig = "cig" in args.config_path

    global envs
    es = [
        make_env(i, args.config_path, visual=args.visual, cig=cig)
        for i in range(args.num_processes)
    ]
    envs = VecEnv([es[i] for i in range(args.num_processes)])

    obs_shape = envs.observation_space_shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    if args.resume:
        actor_critic = torch.load(os.path.join(save_path, f"{args.agent_id}.pt"))
        filename = glob.glob(os.path.join(log_path, log_file_name))[0]
        with open(filename) as file:
            lines = file.readlines()
            start_updates = (int)(lines[-1].strip().split(",")[0])
            start_steps = (int)(lines[-1].strip().split(",")[1])
            num_updates += start_updates
    else:
        try:
            os.makedirs(save_path)
        except OSError:
            pass
        try:
            os.makedirs(log_path)
        except OSError:
            files = glob.glob(os.path.join(args.log_dir, log_file_name))
            for f in files:
                os.remove(f)
            # with open(log_file_name, "w") as myfile:
            #     myfile.write("")
            # files = glob.glob(os.path.join(args.log_dir, log_event_file_name))
            # for f in files:
            #     os.remove(f)
            # with open(log_event_file_name, "w") as myfile:
            #     myfile.write("")
            # files = glob.glob(os.path.join(args.log_dir, log_event_reward_file_name))
            # for f in files:
            #     os.remove(f)
            # with open(log_event_reward_file_name, "w") as myfile:
            #     myfile.write("")
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space_shape)

    action_shape = 1

    if args.cuda:
        actor_critic.cuda()

    optimizer = optim.RMSprop(actor_critic.parameters(), args.lr,
                              eps=args.eps, alpha=args.alpha)
    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space_shape)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        shape_dim0 = envs.observation_space_shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)

    last_game_vars = []
    for i in range(args.num_processes):
        last_game_vars.append(np.zeros(args.num_events))

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])
    episode_intrinsic_rewards = torch.zeros([args.num_processes, 1])
    final_intrinsic_rewards = torch.zeros([args.num_processes, 1])
    episode_events = torch.zeros([args.num_processes, args.num_events])
    final_events = torch.zeros([args.num_processes, args.num_events])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    def mean_distance_to_nearest_neighbor(elite_events):
        d = []
        nearest = None
        for a in range(len(elite_events)):
            for b in range(len(elite_events)):
                if a != b:
                    elite_a = elite_events[a]
                    elite_b = elite_events[b]
                    dist = np.linalg.norm(elite_a - elite_b)
                    if nearest is None or dist < nearest:
                        nearest = dist
            if nearest is not None:
                d.append(nearest)
            nearest = None
        return np.mean(d)

    def distance_to_nearest_neighbor(elite_events, events):
        nearest = None
        for elite_a in elite_events:
            dist = np.linalg.norm(elite_a - events)
            if nearest is None or dist < nearest:
                nearest = dist
        return nearest

    def add_to_archive(frame, episode_length):
        # print("Final rewards: ", final_rewards.numpy())
        fitness = final_rewards.numpy().mean()
        # print("raw: ", final_events.numpy())
        behavior = final_events.numpy().mean(axis=0)
        # print("Fitness:", fitness)
        # print("Behavior:", behavior)
        neighbors = event_buffer.get_neighbors(behavior, args.niche_divs, episode_length)
        add = len(neighbors) == 0
        for neighbor in neighbors:
            if fitness > neighbor.fitness:
                add = True
            else:
                add = False
                break
        if add:
            if len(neighbors) > 0:
                event_buffer.remove_elites(neighbors)
                # print(f"- Removing elites {[neighbor.elite_id for neighbor in neighbors]}")
                for neighbor in neighbors:
                    try:
                        # print(f"- Deleting model {neighbor.elite_id}")
                        os.remove(os.path.join(save_path, f"{neighbor.elite_id}.pt"))
                        # print("Successfully deleted model with id : ", neighbor.elite_id)
                    except:
                        print("Error while deleting model with id : ", neighbor.elite_id)
            name = str(uuid.uuid1())
            # print("Adding elite")
            event_buffer.add_elite(name, behavior, fitness, frame, episode_length)
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
            torch.save(save_model, os.path.join(save_path, f"{name}.pt"))

    # Create event buffer
    event_buffer = EventBufferSQLProxy(args.num_events, args.capacity,
                                       args.exp_id, args.agent_id,
                                       qd=args.qd, per_step=args.per_step)

    event_episode_rewards = []
    episode_finished = np.zeros(args.num_processes)

    start = time.time()
    for j in np.arange(start_updates, num_updates):
        for step in range(args.num_steps):
            value, action = actor_critic.act(
                Variable(rollouts.observations[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            obs, reward, done, info, events = envs.step(cpu_actions)
            intrinsic_reward = []

            # Fix broken rewards - upscale
            for i in range(len(reward)):
                if scenario_name in ["deathmatch", "my_way_home"]:
                    reward[i] *= 100
                if scenario_name == "deadly_corridor":
                    reward[i] = 1 if events[i][2] >= 1 else 0

            for e in events:
                if args.roe:
                    ir = event_buffer.intrinsic_reward(e)
                    if args.per_step:
                        ir = ir / 4200
                    intrinsic_reward.append(ir)
                else:
                    r = reward[len(intrinsic_reward)]
                    intrinsic_reward.append(r)

            reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()
            intrinsic_reward = torch.from_numpy(
                np.expand_dims(np.stack(intrinsic_reward), 1)).float()
            # events = torch.from_numpy(np.expand_dims(np.stack(events), args.num_events)).float()
            events = torch.from_numpy(events).float()

            episode_rewards += reward
            episode_intrinsic_rewards += intrinsic_reward
            episode_events += events

            # Event stats
            '''
            event_rewards = []
            for ei in range(0, args.num_events):
                ev = np.zeros(args.num_events)
                ev[ei] = 1
                er = event_buffer.intrinsic_reward(ev)
                if args.per_step:
                    er = er / 4200
                er = event_buffer.intrinsic_reward(ev)
                event_rewards.append(er)
            event_episode_rewards.append(event_rewards)
            '''

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])

            final_rewards *= masks
            final_intrinsic_rewards *= masks
            final_events *= masks

            final_rewards += (1 - masks) * episode_rewards
            final_intrinsic_rewards += (1 - masks) * episode_intrinsic_rewards
            final_events += (1 - masks) * episode_events

            for i in range(args.num_processes):
                if done[i]:
                    # event_buffer.record_events(np.copy(final_events[i].numpy()), frame=j*args.num_steps*args.num_processes)
                    episode_length = (step + j * args.num_steps) - episode_finished[i]
                    episode_finished[i] = episode_length + episode_finished[i]
                    add_to_archive(
                        step * args.num_processes + j * args.num_steps * args.num_processes,
                        episode_length)

            episode_rewards *= masks
            episode_intrinsic_rewards *= masks
            episode_events *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)
            rollouts.insert(step, current_obs, action.data, value.data,
                            intrinsic_reward, masks)

        # final_episode_reward = np.mean(event_episode_rewards, axis=0)
        # event_episode_rewards = []

        next_value = actor_critic(
            Variable(rollouts.observations[-1], volatile=True))[0].data

        if hasattr(actor_critic, 'obs_filter'):
            actor_critic.obs_filter.update(
                rollouts.observations[:-1].view(-1, *obs_shape))

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

        values, action_log_probs, dist_entropy = actor_critic.evaluate_actions(
            Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
            Variable(rollouts.actions.view(-1, action_shape)))

        values = values.view(args.num_steps, args.num_processes, 1)
        action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1)

        advantages = Variable(rollouts.returns[:-1]) - values
        value_loss = advantages.pow(2).mean()
        action_loss = -(Variable(advantages.data) * action_log_probs).mean()

        optimizer.zero_grad()
        (value_loss * args.value_loss_coef + action_loss -
         dist_entropy * args.entropy_coef).backward()
        nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm)
        optimizer.step()

        rollouts.observations[0].copy_(rollouts.observations[-1])

        if j % args.log_interval == 0:
            envs.log()
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            log = "Updates {}, num timesteps {}, FPS {}, mean/max reward {:.5f}/{:.5f}, " \
                  "mean/max intrinsic reward {:.5f}/{:.5f}" \
                .format(j, total_num_steps, int(total_num_steps / (end - start)),
                        final_rewards.mean(), final_rewards.max(),
                        final_intrinsic_rewards.mean(), final_intrinsic_rewards.max())
            log_to_file = "{}, {}, {:.5f}, {:.5f}, {:.5f}, {:.5f}\n" \
                .format(j, total_num_steps, final_rewards.mean(), final_rewards.std(),
                        final_intrinsic_rewards.mean(), final_intrinsic_rewards.std())
            with open(os.path.join(log_path, log_file_name), "a") as myfile:
                myfile.write(log_to_file)
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
            torch.save(save_model, os.path.join(save_path, f"{args.agent_id}.pt"))
            print(log)

    envs.close()
    time.sleep(5)
class KGA2CTrainer(object):
    '''
    KGA2C main class.
    '''
    def __init__(self, params):
        print("----- Initiating ----- ")
        print("----- step 1 configure logger")
        configure_logger(params['output_dir'])
        log('Parameters {}'.format(params))
        self.params = params

        print("----- step 2 load pre-collected things")
        self.binding = load_bindings(params['rom_file_path'])
        self.max_word_length = self.binding['max_word_length']
        self.sp = spm.SentencePieceProcessor()
        self.sp.Load(params['spm_file'])

        print("----- step 3 build KGA2CEnv")
        kg_env = KGA2CEnv(params['rom_file_path'], params['seed'], self.sp,
                          params['tsv_file'], step_limit=params['reset_steps'],
                          stuck_steps=params['stuck_steps'], gat=params['gat'])
        self.vec_env = VecEnv(params['batch_size'], kg_env, params['openie_path'])

        print("----- step 4 build FrotzEnv and template generator")
        env = FrotzEnv(params['rom_file_path'])
        self.vocab_act, self.vocab_act_rev = load_vocab(env)
        self.template_generator = TemplateActionGenerator(self.binding)

        print("----- step 5 build kga2c model")
        self.model = KGA2C(params, self.template_generator.templates,
                           self.max_word_length, self.vocab_act,
                           self.vocab_act_rev, len(self.sp),
                           gat=self.params['gat']).cuda()
        if params['preload_weights']:
            print("load pretrained")
            self.model = torch.load(self.params['preload_weights'])['model']
        else:
            print("train from scratch")

        print("----- step 6 set training parameters")
        self.batch_size = params['batch_size']
        self.optimizer = optim.Adam(self.model.parameters(), lr=params['lr'])
        self.loss_fn1 = nn.BCELoss()
        self.loss_fn2 = nn.BCEWithLogitsLoss()
        self.loss_fn3 = nn.MSELoss()
        print("----- Init finished! ----- ")

    def generate_targets(self, admissible, objs):
        '''
        Generates ground-truth targets for admissible actions.

        :param admissible: List-of-lists of admissible actions. Batch_size x Admissible
        :param objs: List-of-lists of interactive objects. Batch_size x Objs
        :returns: template targets and object target tensors
        '''
        tmpl_target = []
        obj_targets = []
        for adm in admissible:
            obj_t = set()
            cur_t = [0] * len(self.template_generator.templates)
            for a in adm:
                cur_t[a.template_id] = 1
                obj_t.update(a.obj_ids)
            tmpl_target.append(cur_t)
            obj_targets.append(list(obj_t))
        tmpl_target_tt = torch.FloatTensor(tmpl_target).cuda()

        # Note: Adjusted to use the objects in the admissible actions only
        object_mask_target = []
        for objl in obj_targets:  # in objs
            cur_objt = [0] * len(self.vocab_act)
            for o in objl:
                cur_objt[o] = 1
            object_mask_target.append([[cur_objt], [cur_objt]])
        obj_target_tt = torch.FloatTensor(object_mask_target).squeeze().cuda()
        return tmpl_target_tt, obj_target_tt

    def generate_graph_mask(self, graph_infos):
        assert len(graph_infos) == self.batch_size
        mask_all = []
        for graph_info in graph_infos:
            mask = [0] * len(self.vocab_act.keys())
            # Case 1 (default): KG as mask
            if self.params['masking'] == 'kg':
                graph_state = graph_info.graph_state  # Full KG as mask --> same as KG-A2C
                # graph_state = graph_info.graph_state_5_mask  # sub_KG_5 as mask, disabled
                ents = set()
                # Obtain entities ---> maybe I can perform graph pooling before this step
                for u, v in graph_state.edges:
                    ents.add(u)
                    ents.add(v)
                # Build mask: only use those related to entities
                for ent in ents:
                    for ent_word in ent.split():
                        if ent_word[:self.max_word_length] in self.vocab_act_rev:
                            idx = self.vocab_act_rev[ent_word[:self.max_word_length]]
                            mask[idx] = 1
            # Case 2: interactive objects ground truth as the mask.
            elif self.params['masking'] == 'interactive':
                for o in graph_info.objs:
                    o = o[:self.max_word_length]
                    if o in self.vocab_act_rev.keys() and o != '':
                        mask[self.vocab_act_rev[o]] = 1
            # Case 3: no mask.
            elif self.params['masking'] == 'none':
                mask = [1] * len(self.vocab_act.keys())
            else:
                assert False, 'Unrecognized masking {}'.format(self.params['masking'])

            mask_all.append(mask)
        return torch.BoolTensor(mask_all).cuda().detach()

    def discount_reward(self, transitions, last_values):
        returns, advantages = [], []
        R = last_values.data
        for t in reversed(range(len(transitions))):
            _, _, values, rewards, done_masks, _, _, _, _, _, _ = transitions[t]
            R = rewards + self.params['gamma'] * R * done_masks
            adv = R - values
            returns.append(R)
            advantages.append(adv)
        return returns[::-1], advantages[::-1]

    def train(self, max_steps):
        print("=== === === start training!!! === === ===")
        start = time.time()
        transitions = []
        obs, infos, graph_infos = self.vec_env.reset()
        for step in range(1, max_steps + 1):
            # Step 1: build model inputs
            tb.logkv('Step', step)
            obs_reps = np.array([g.ob_rep for g in graph_infos])
            scores = [info['score'] for info in infos]
            graph_mask_tt = self.generate_graph_mask(graph_infos)
            graph_state_reps = [g.graph_state_rep for g in graph_infos]
            graph_rep_1 = [g.graph_state_rep_1_connectivity for g in graph_infos]
            graph_rep_2 = [g.graph_state_rep_2_roomitem for g in graph_infos]
            graph_rep_3 = [g.graph_state_rep_3_youritem for g in graph_infos]
            graph_rep_4 = [g.graph_state_rep_4_otherroom for g in graph_infos]

            # Step 2: predict probs, actual items
            tmpl_pred_tt, obj_pred_tt, dec_obj_tt, dec_tmpl_tt, value, dec_steps = self.model(
                obs_reps, scores, graph_state_reps, graph_rep_1, graph_rep_2,
                graph_rep_3, graph_rep_4, graph_mask_tt)
            tb.logkv_mean('Value', value.mean().item())

            # Step 3: Log the predictions and ground truth values
            topk_tmpl_probs, topk_tmpl_idxs = F.softmax(tmpl_pred_tt[0]).topk(5)
            topk_tmpls = [
                self.template_generator.templates[t]
                for t in topk_tmpl_idxs.tolist()
            ]
            tmpl_pred_str = ', '.join([
                '{} {:.3f}'.format(tmpl, prob)
                for tmpl, prob in zip(topk_tmpls, topk_tmpl_probs.tolist())
            ])

            # Step 4: Generate the ground truth and object mask
            admissible = [g.admissible_actions for g in graph_infos]
            objs = [g.objs for g in graph_infos]
            tmpl_gt_tt, obj_mask_gt_tt = self.generate_targets(admissible, objs)

            # Step 5: Log template/object predictions/ground_truth
            gt_tmpls = [
                self.template_generator.templates[i]
                for i in tmpl_gt_tt[0].nonzero().squeeze().cpu().numpy().flatten().tolist()
            ]
            gt_objs = [
                self.vocab_act[i]
                for i in obj_mask_gt_tt[0, 0].nonzero().squeeze().cpu().numpy().flatten().tolist()
            ]
            topk_o1_probs, topk_o1_idxs = F.softmax(obj_pred_tt[0, 0]).topk(5)
            topk_o1 = [self.vocab_act[o] for o in topk_o1_idxs.tolist()]
            o1_pred_str = ', '.join([
                '{} {:.3f}'.format(o, prob)
                for o, prob in zip(topk_o1, topk_o1_probs.tolist())
            ])

            chosen_actions = self.decode_actions(dec_tmpl_tt, dec_obj_tt)

            # Step 6: Next step
            obs, rewards, dones, infos, graph_infos = self.vec_env.step(chosen_actions)

            # Step 7: logging
            tb.logkv_mean('TotalStepsPerEpisode',
                          sum([i['steps'] for i in infos]) / float(len(graph_infos)))
            tb.logkv_mean('Valid', infos[0]['valid'])
            if dones[0]:
                log('Step {} EpisodeScore {}'.format(step, infos[0]['score']))
            for done, info in zip(dones, infos):
                if done:
                    tb.logkv_mean('EpisodeScore', info['score'])

            # Step 8: append into transitions
            rew_tt = torch.FloatTensor(rewards).cuda().unsqueeze(1)
            done_mask_tt = (~torch.tensor(dones)).float().cuda().unsqueeze(1)
            self.model.reset_hidden(done_mask_tt)
            transitions.append(
                (tmpl_pred_tt, obj_pred_tt, value, rew_tt, done_mask_tt,
                 tmpl_gt_tt, dec_tmpl_tt, dec_obj_tt, obj_mask_gt_tt,
                 graph_mask_tt, dec_steps))

            # Step 9: update model per 8 steps
            if len(transitions) >= self.params['bptt']:
                tb.logkv('StepsPerSecond', float(step) / (time.time() - start))
                self.model.clone_hidden()
                obs_reps = np.array([g.ob_rep for g in graph_infos])
                scores = [info['score'] for info in infos]
                graph_mask_tt = self.generate_graph_mask(graph_infos)
                graph_state_reps = [g.graph_state_rep for g in graph_infos]
                graph_rep_1 = [g.graph_state_rep_1_connectivity for g in graph_infos]
                graph_rep_2 = [g.graph_state_rep_2_roomitem for g in graph_infos]
                graph_rep_3 = [g.graph_state_rep_3_youritem for g in graph_infos]
                graph_rep_4 = [g.graph_state_rep_4_otherroom for g in graph_infos]
                _, _, _, _, next_value, _ = self.model(
                    obs_reps, scores, graph_state_reps, graph_rep_1, graph_rep_2,
                    graph_rep_3, graph_rep_4, graph_mask_tt)
                returns, advantages = self.discount_reward(transitions, next_value)
                tb.logkv_mean('Advantage', advantages[-1].median().item())
                loss = self.update(transitions, returns, advantages)
                del transitions[:]
                self.model.restore_hidden()
                print("Total time: {:.2f} mins".format((time.time() - start) / 60.))

            if step % self.params['checkpoint_interval'] == 0:
                parameters = {'model': self.model}
                torch.save(parameters,
                           os.path.join(self.params['output_dir'], 'kga2c.pt'))

        self.vec_env.close_extras()

    def update(self, transitions, returns, advantages):
        assert len(transitions) == len(returns) == len(advantages)
        loss = 0
        for trans, ret, adv in zip(transitions, returns, advantages):
            tmpl_pred_tt, obj_pred_tt, value, _, _, tmpl_gt_tt, dec_tmpl_tt, \
                dec_obj_tt, obj_mask_gt_tt, graph_mask_tt, dec_steps = trans

            # Supervised Template Loss
            tmpl_probs = F.softmax(tmpl_pred_tt, dim=1)
            template_loss = self.params['template_coeff'] * self.loss_fn1(
                tmpl_probs, tmpl_gt_tt)

            # Supervised Object Loss
            object_mask_target = obj_mask_gt_tt.permute((1, 0, 2))
            obj_probs = F.softmax(obj_pred_tt, dim=2)
            object_mask_loss = self.params['object_coeff'] * self.loss_fn1(
                obj_probs, object_mask_target)

            # Build the object mask
            o1_mask, o2_mask = [0] * self.batch_size, [0] * self.batch_size
            for d, st in enumerate(dec_steps):
                if st > 1:
                    o1_mask[d] = 1
                    o2_mask[d] = 1
                elif st == 1:
                    o1_mask[d] = 1
            o1_mask = torch.FloatTensor(o1_mask).cuda()
            o2_mask = torch.FloatTensor(o2_mask).cuda()

            # Policy Gradient Loss
            policy_obj_loss = torch.FloatTensor([0]).cuda()
            cnt = 0
            for i in range(self.batch_size):
                if dec_steps[i] >= 1:
                    cnt += 1
                    batch_pred = obj_pred_tt[0, i, graph_mask_tt[i]]
                    action_log_probs_obj = F.log_softmax(batch_pred, dim=0)
                    dec_obj_idx = dec_obj_tt[0, i].item()
                    graph_mask_list = graph_mask_tt[i].nonzero().squeeze().cpu().numpy().flatten().tolist()
                    idx = graph_mask_list.index(dec_obj_idx)
                    log_prob_obj = action_log_probs_obj[idx]
                    policy_obj_loss += -log_prob_obj * adv[i].detach()
            if cnt > 0:
                policy_obj_loss /= cnt
            tb.logkv_mean('PolicyObjLoss', policy_obj_loss.item())

            log_probs_obj = F.log_softmax(obj_pred_tt, dim=2)
            log_probs_tmpl = F.log_softmax(tmpl_pred_tt, dim=1)
            action_log_probs_tmpl = log_probs_tmpl.gather(1, dec_tmpl_tt).squeeze()
            policy_tmpl_loss = (-action_log_probs_tmpl * adv.detach().squeeze()).mean()
            tb.logkv_mean('PolicyTemplateLoss', policy_tmpl_loss.item())
            policy_loss = policy_tmpl_loss + policy_obj_loss

            value_loss = self.params['value_coeff'] * self.loss_fn3(value, ret)

            tmpl_entropy = -(tmpl_probs * log_probs_tmpl).mean()
            tb.logkv_mean('TemplateEntropy', tmpl_entropy.item())
            object_entropy = -(obj_probs * log_probs_obj).mean()
            tb.logkv_mean('ObjectEntropy', object_entropy.item())
            # Minimizing entropy loss will lead to increased entropy
            entropy_loss = self.params['entropy_coeff'] * -(tmpl_entropy + object_entropy)

            loss += template_loss + object_mask_loss + value_loss + entropy_loss + policy_loss

        tb.logkv('Loss', loss.item())
        tb.logkv('TemplateLoss', template_loss.item())
        tb.logkv('ObjectLoss', object_mask_loss.item())
        tb.logkv('PolicyLoss', policy_loss.item())
        tb.logkv('ValueLoss', value_loss.item())
        tb.logkv('EntropyLoss', entropy_loss.item())
        tb.dumpkvs()
        loss.backward()

        # Compute the gradient norm
        grad_norm = 0
        for p in list(filter(lambda p: p.grad is not None, self.model.parameters())):
            grad_norm += p.grad.data.norm(2).item()
        tb.logkv('UnclippedGradNorm', grad_norm)

        nn.utils.clip_grad_norm_(self.model.parameters(), self.params['clip'])

        # Clipped Grad norm
        grad_norm = 0
        for p in list(filter(lambda p: p.grad is not None, self.model.parameters())):
            grad_norm += p.grad.data.norm(2).item()
        tb.logkv('ClippedGradNorm', grad_norm)

        self.optimizer.step()
        self.optimizer.zero_grad()
        return loss

    def decode_actions(self, decoded_templates, decoded_objects):
        '''
        Returns string representations of the given template actions.

        :param decoded_template: Tensor of template indices.
        :type decoded_template: Torch tensor of size (Batch_size x 1).
        :param decoded_objects: Tensor of o1, o2 object indices.
        :type decoded_objects: Torch tensor of size (2 x Batch_size x 1).
        '''
        decoded_actions = []
        for i in range(self.batch_size):
            decoded_template = decoded_templates[i].item()
            decoded_object1 = decoded_objects[0][i].item()
            decoded_object2 = decoded_objects[1][i].item()
            decoded_action = self.tmpl_to_str(decoded_template, decoded_object1,
                                              decoded_object2)
            decoded_actions.append(decoded_action)
        return decoded_actions

    def tmpl_to_str(self, template_idx, o1_id, o2_id):
        """Returns a string representation of a template action."""
        template_str = self.template_generator.templates[template_idx]
        holes = template_str.count('OBJ')
        assert holes <= 2
        if holes <= 0:
            return template_str
        elif holes == 1:
            return template_str.replace('OBJ', self.vocab_act[o1_id])
        else:
            return template_str.replace('OBJ', self.vocab_act[o1_id], 1)\
                               .replace('OBJ', self.vocab_act[o2_id], 1)
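# Standalone illustration of the hole-filling in tmpl_to_str above; the
# template and object strings below are hypothetical examples, not from the
# game's actual template list:
template_str = 'put OBJ in OBJ'
o1, o2 = 'sword', 'chest'
print(template_str.replace('OBJ', o1, 1).replace('OBJ', o2, 1))  # put sword in chest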
def main():
    es = [make_env(i, args.board_size) for i in range(args.num_processes)]
    envs = VecEnv([es[i] for i in range(args.num_processes)])

    spatial_obs_space = es[0].observation_space.spaces['board'].shape
    non_spatial_space = (1, 50)
    action_space = len(es[0].actions)

    # MODELS #
    if args.resume:
        ac_agent = torch.load("models/" + args.model_name)  # Load model
    else:
        ac_agent = PrunedHybrid(spatial_obs_space[0], action_space, args.board_size)

    optimizer = optim.RMSprop(ac_agent.parameters(), args.learning_rate)

    # Creating the memory to store the steps taken
    if args.board_size == 1:
        action_space = 242
    elif args.board_size == 3:
        action_space = 492
    elif args.board_size == 5:
        action_space = 908
    else:
        raise NotImplementedError("Not able to handle board size", args.board_size)

    memory = Memory(args.num_steps, args.num_processes, spatial_obs_space,
                    non_spatial_space, action_space)

    obs = envs.reset()
    spatial_obs, non_spatial_obs = update_obs(obs)
    memory.spatial_obs[0].copy_(torch.from_numpy(spatial_obs).float())
    memory.non_spatial_obs[0].copy_(torch.from_numpy(non_spatial_obs).float())

    if args.resume and args.log:
        log_file = "logs/" + args.log_filename
        with open(log_file) as log:
            lines = log.readlines()[-1]
            resume_updates = float(lines.split(", ")[0])
            resume_episodes = float(lines.split(", ")[1])
            resume_steps = float(lines.split(", ")[3])
    else:
        resume_updates = 0
        resume_episodes = 0
        resume_steps = 0

    renderer = Renderer()

    rewards = 0
    episodes = 0

    for update in range(args.num_updates):
        for step in range(args.num_steps):
            available_actions = envs.actions()
            active_players = envs.active_players()
            own_players = envs.own_players()

            values, actions_policy = ac_agent.act(
                Variable(memory.spatial_obs[step]),
                Variable(memory.non_spatial_obs[step]),
                available_actions)

            if args.board_size == 1:
                actions, x_positions, y_positions = utils.map_actions_1v1(actions_policy)
            elif args.board_size == 3:
                actions, x_positions, y_positions = utils.map_actions_3v3_new_approach(
                    actions_policy, active_players, own_players)
            elif args.board_size == 5:
                actions, x_positions, y_positions = utils.map_actions_5v5_pruned(
                    actions_policy, active_players, own_players)
            else:
                raise NotImplementedError("Not able to handle board size", args.board_size)

            action_objects = []
            for action, position_x, position_y in zip(actions, x_positions, y_positions):
                action_object = {
                    'action-type': action,
                    'x': position_x,
                    'y': position_y
                }
                action_objects.append(action_object)

            obs, reward, done, info, events = envs.step(action_objects)

            if args.render:
                for i in range(args.num_processes):
                    renderer.render(obs[i], i)

            reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()
            rewards += reward.sum().item()

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
            dones = masks.squeeze()
            episodes += args.num_processes - dones.sum().item()

            # Update the observations returned by the environment
            spatial_obs, non_spatial_obs = update_obs(obs)

            # Insert the step taken into memory
            memory.insert(step, torch.from_numpy(spatial_obs).float(),
                          torch.from_numpy(non_spatial_obs).float(),
                          torch.tensor(actions_policy), torch.tensor(values),
                          reward, masks, available_actions)

        next_value = ac_agent(Variable(memory.spatial_obs[-1]),
                              Variable(memory.non_spatial_obs[-1]))[0].data

        # Compute returns
        memory.compute_returns(next_value, args.gamma)

        spatial = Variable(memory.spatial_obs[:-1])          # shape [20, 4, 26, 7, 14]
        spatial = spatial.view(-1, *spatial_obs_space)       # shape [80, 26, 7, 14]
        non_spatial = Variable(memory.non_spatial_obs[:-1])  # shape [20, 4, 1, 49]
        non_spatial = non_spatial.view(-1, 50)               # shape [80, 49]

        actions = Variable(torch.LongTensor(memory.actions.view(-1, 1)))
        actions_mask = Variable(memory.available_actions[:-1])

        # Evaluate the actions taken
        action_log_probs, values, dist_entropy = ac_agent.evaluate_actions(
            Variable(spatial), Variable(non_spatial), actions, actions_mask)

        values = values.view(args.num_steps, args.num_processes, 1)
        action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1)

        advantages = Variable(memory.returns[:-1]) - values
        value_loss = advantages.pow(2).mean()

        # Compute loss
        action_loss = -(Variable(advantages.data) * action_log_probs).mean()

        optimizer.zero_grad()
        total_loss = (value_loss * args.value_loss_coef + action_loss -
                      dist_entropy * args.entropy_coef)
        total_loss.backward()
        nn.utils.clip_grad_norm_(ac_agent.parameters(), args.max_grad_norm)
        optimizer.step()

        memory.non_spatial_obs[0].copy_(memory.non_spatial_obs[-1])
        memory.spatial_obs[0].copy_(memory.spatial_obs[-1])

        # Logging
        if (update + 1) % args.log_interval == 0 and args.log:
            log_file_name = "logs/" + args.log_filename

            # Updates
            updates = update + 1
            resume_updates += updates
            # Episodes
            resume_episodes += episodes
            # Steps
            steps = args.num_processes * args.num_steps
            resume_steps += steps
            # Rewards
            reward = rewards
            mean_reward_pr_episode = reward / episodes

            log = "Updates {}, Episodes {}, Episodes this update {}, Total Timesteps {}, " \
                  "Reward {}, Mean Reward pr. Episode {:.2f}" \
                .format(resume_updates, resume_episodes, episodes, resume_steps,
                        reward, mean_reward_pr_episode)
            log_to_file = "{}, {}, {}, {}, {}, {}\n" \
                .format(resume_updates, resume_episodes, episodes, resume_steps,
                        reward, mean_reward_pr_episode)
            print(log)

            # Save to files
            with open(log_file_name, "a") as myfile:
                myfile.write(log_to_file)

            # Saving the agent
            torch.save(ac_agent, "models/" + args.model_name)

            rewards = 0
            episodes = 0
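# update_obs is not included in this excerpt. Given that each observation
# carries a 'board' array (spatial) plus a vector of other state, a minimal
# sketch might look like this; the 'state' key and the expand_dims layout
# matching non_spatial_space = (1, 50) are assumptions:
import numpy as np

def update_obs(observations):
    """Split a batch of dict observations into spatial and non-spatial arrays."""
    spatial = np.stack([ob['board'] for ob in observations])
    non_spatial = np.stack([np.expand_dims(ob['state'], 0) for ob in observations])
    return spatial, non_spatial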
                    help='vizdoom configuration file path (default: ./scenarios/basic.cfg)')
parser.add_argument(
    '--load-dir',
    default='./trained_models/',
    help='directory to save agent logs (default: ./trained_models/)')
parser.add_argument('--log-dir',
                    default='/tmp/doom/',
                    help='directory to save agent logs (default: /tmp/doom)')
args = parser.parse_args()

try:
    os.makedirs(args.log_dir)
except OSError:
    pass

envs = VecEnv([make_visual_env(args.config_path)])

actor_critic = torch.load(os.path.join(args.load_dir, args.env_name + ".pt"))
actor_critic.eval()

obs_shape = envs.observation_space_shape
obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])
current_obs = torch.zeros(1, *obs_shape)

def update_current_obs(obs):
    shape_dim0 = envs.observation_space_shape[0]
    obs = torch.from_numpy(obs).float()
    if args.num_stack > 1:
        current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
    current_obs[:, -shape_dim0:] = obs
def start_interaction(self, env_fns, dynamics, nlump=2):
    self.loss_names, self._losses = zip(*list(self.to_report.items()))
    self.global_step = tf.Variable(0, trainable=False)
    params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)

    if MPI.COMM_WORLD.Get_size() > 1:
        if self.agent_num is None:
            trainer = MpiAdamOptimizer(learning_rate=self.ph_lr, comm=MPI.COMM_WORLD)
    else:
        if self.agent_num is None:
            if self.optim == 'adam':
                trainer = tf.train.AdamOptimizer(learning_rate=self.ph_lr)
            elif self.optim == 'sgd':
                print("using sgd")
                print("________________________")
                if self.decay:
                    self.decay_lr = tf.train.exponential_decay(
                        self.ph_lr, self.global_step, 2500, .96, staircase=True)
                    trainer = tf.train.GradientDescentOptimizer(learning_rate=self.decay_lr)
                else:
                    trainer = tf.train.GradientDescentOptimizer(learning_rate=self.ph_lr)
            elif self.optim == 'momentum':
                print('using momentum')
                print('________________________')
                trainer = tf.train.MomentumOptimizer(learning_rate=self.ph_lr, momentum=0.9)

    if self.agent_num is None:
        gradsandvars = trainer.compute_gradients(self.total_loss, params)

        l2_norm = lambda t: tf.sqrt(tf.reduce_sum(tf.pow(t, 2)))
        if self.log_grads:
            for grad, var in gradsandvars:
                tf.summary.histogram(var.name + '/gradient', l2_norm(grad))
                tf.summary.histogram(var.name + '/value', l2_norm(var))
                grad_mean = tf.reduce_mean(tf.abs(grad))
                tf.summary.scalar(var.name + '/grad_mean', grad_mean)
            if self.decay:
                tf.summary.scalar('decay_lr', self.decay_lr)
            self._summary = tf.summary.merge_all()
            tf.add_to_collection("summary_op", self._summary)

        if self.grad_clip > 0:
            grads, gradvars = zip(*gradsandvars)
            grads, _ = tf.clip_by_global_norm(grads, self.grad_clip)
            gradsandvars = list(zip(grads, gradvars))

        self._train = trainer.apply_gradients(gradsandvars, global_step=self.global_step)
        self._updates = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        self._train = tf.group(self._train, self._updates)
        tf.add_to_collection("train_op", self._train)
    else:
        self._train = tf.get_collection("train_op")[0]
        if self.log_grads:
            self._summary = tf.get_collection("summary_op")[0]

    if MPI.COMM_WORLD.Get_rank() == 0:
        getsess().run(
            tf.variables_initializer(tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)))
    bcast_tf_vars_from_root(
        getsess(), tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES))

    self.all_visited_rooms = []
    self.all_scores = []
    self.nenvs = nenvs = len(env_fns)
    self.nlump = nlump
    self.lump_stride = nenvs // self.nlump
    self.envs = [
        VecEnv(env_fns[l * self.lump_stride:(l + 1) * self.lump_stride],
               spaces=[self.env_ob_space, self.ac_space]) for l in range(self.nlump)
    ]
    self.rollout = Rollout(ob_space=self.ob_space, ac_space=self.ac_space, nenvs=nenvs,
                           nsteps_per_seg=self.nsteps_per_seg,
                           nsegs_per_env=self.nsegs_per_env, nlumps=self.nlump,
                           envs=self.envs, policy=self.stochpol,
                           int_rew_coeff=self.int_coeff, ext_rew_coeff=self.ext_coeff,
                           record_rollouts=self.use_recorder, dynamics=dynamics,
                           exp_name=self.exp_name, env_name=self.env_name,
                           video_log_freq=self.video_log_freq,
                           model_save_freq=self.model_save_freq,
                           use_apples=self.use_apples, multi_envs=self.multi_envs,
                           lstm=self.lstm, lstm1_size=self.lstm1_size,
                           lstm2_size=self.lstm2_size, depth_pred=self.depth_pred,
                           early_stop=self.early_stop, aux_input=self.aux_input)

    self.buf_advs = np.zeros((nenvs, self.rollout.nsteps), np.float32)
    self.buf_rets = np.zeros((nenvs, self.rollout.nsteps), np.float32)

    if self.normrew:
        self.rff = RewardForwardFilter(self.gamma)
        self.rff_rms = RunningMeanStd()

    self.step_count = 0
    self.t_last_update = time.time()
    self.t_start = time.time()
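# With staircase=True, tf.train.exponential_decay drops the rate in discrete
# steps: lr * 0.96 ** (global_step // 2500). A quick standalone check of the
# schedule configured above (base_lr value is just an example):
def decayed_lr(base_lr, global_step, decay_steps=2500, decay_rate=0.96):
    # Mirrors the staircase schedule used by the sgd branch above.
    return base_lr * decay_rate ** (global_step // decay_steps)

assert abs(decayed_lr(1e-4, 5000) - 1e-4 * 0.96 ** 2) < 1e-12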
def start_interaction(self, env_fns, dynamics, nlump=2):
    self.loss_names, self._losses = zip(*list(self.to_report.items()))
    params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
    if MPI.COMM_WORLD.Get_size() > 1:
        trainer = MpiAdamOptimizer(learning_rate=self.ph_lr, comm=MPI.COMM_WORLD)
    else:
        trainer = tf.train.AdamOptimizer(learning_rate=self.ph_lr)

    # gvs = trainer.compute_gradients(self.total_loss, params)
    # self.gshape = gs
    # gs = [g for (g, v) in gvs]
    # self.normg = tf.linalg.global_norm(gs)
    # new_g = [tf.clip_by_norm(g, 10.0) for g in gs i]
    # self.nnormg = tf.linalg.global_norm(new_g)

    def ClipIfNotNone(grad):
        return tf.clip_by_value(grad, -25.0, 25.0) if grad is not None else grad

    gradsandvars = trainer.compute_gradients(self.total_loss, params)
    # gs = [g for (g, v) in gradsandvars]
    # new_g = [tf.clip_by_norm(g, 10.0) for g in gs if g is not None]
    gradsandvars = [(ClipIfNotNone(g), v) for g, v in gradsandvars]
    # new_g = [g for (g, v) in gradsandvars]
    # self.nnormg = tf.linalg.global_norm(new_g)
    # gradsandvars = [(ClipIfNotNone(grad), var) for grad, var in gradsandvars]

    self._train = trainer.apply_gradients(gradsandvars)

    if MPI.COMM_WORLD.Get_rank() == 0:
        getsess().run(
            tf.variables_initializer(tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)))
    bcast_tf_vars_from_root(
        getsess(), tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES))

    self.all_visited_rooms = []
    self.all_scores = []
    self.nenvs = nenvs = len(env_fns)
    print('-------NENVS-------', self.nenvs)
    self.nlump = nlump
    print('----------NLUMPS-------', self.nlump)
    self.lump_stride = nenvs // self.nlump
    print('-------LSTRIDE----', self.lump_stride)
    print('--------OBS SPACE ---------', self.ob_space)
    print('-------------AC SPACE-----', self.ac_space)
    # assert 1 == 2
    print('-----BEFORE VEC ENV------')
    self.envs = [
        VecEnv(env_fns[l * self.lump_stride:(l + 1) * self.lump_stride],
               spaces=[self.ob_space, self.ac_space]) for l in range(self.nlump)
    ]
    print('-----AFTER VEC ENV------')
    self.rollout = Rollout(ob_space=self.ob_space, ac_space=self.ac_space, nenvs=nenvs,
                           nsteps_per_seg=self.nsteps_per_seg,
                           nsegs_per_env=self.nsegs_per_env, nlumps=self.nlump,
                           envs=self.envs, policy=self.stochpol,
                           int_rew_coeff=self.int_coeff, ext_rew_coeff=self.ext_coeff,
                           record_rollouts=self.use_recorder, dynamics=dynamics,
                           exp_name=self.exp_name, env_name=self.env_name,
                           to_eval=self.to_eval)

    self.buf_advs = np.zeros((nenvs, self.rollout.nsteps), np.float32)
    self.buf_rets = np.zeros((nenvs, self.rollout.nsteps), np.float32)

    if self.normrew:
        self.rff = RewardForwardFilter(self.gamma)
        self.rff_rms = RunningMeanStd()

    self.step_count = 0
    self.t_last_update = time.time()
    self.t_start = time.time()
    self.saver = tf.train.Saver(max_to_keep=5)