        global_step.get())
writer.add_scalar("step_time", timer.end(), global_step.get())
writer.add_scalar("episodic_reward", reward, global_step.get())
writer.add_scalar("episodic_sum_reward", total_reward, global_step.get())
writer.add_scalar("episode_length", local_step.get(), global_step.get())

logger.info("Sum reward: {}, episode={}".format(total_reward, episode))

if global_step.get() > c.ddpg_warmup_steps:
    for i in range(local_step.get()):
        timer.begin()
        ddpg.update(update_policy=i % 2 == 0,
                    update_target=i % 2 == 0)
        ddpg.update_lr_scheduler()
        writer.add_scalar("train_step_time", timer.end(), global_step.get())

if render:
    create_gif_subproc(
        frames,
        save_env.get_trial_image_dir()
        + "/{}_{}".format(episode, global_step))

local_step.reset()
episode_finished = False
logger.info("End episode {} at {}".format(
    episode, dt.now().strftime("%m/%d-%H:%M:%S")))
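The loop above relies on a few small bookkeeping helpers: step counters exposing get()/reset() and a timer exposing begin()/end(). Their implementation is not part of this excerpt; the sketch below shows one plausible shape for them, where the class names and the count() increment method are illustrative assumptions rather than the actual utilities.

import time


class Counter:
    # integer step/episode counter; the loops above only use get() and reset()
    def __init__(self, start=0):
        self._value = start

    def count(self, delta=1):
        # hypothetical increment helper, assumed to be called once per step
        self._value += delta

    def get(self):
        return self._value

    def reset(self):
        self._value = 0


class Timer:
    # wall-clock timer: begin() starts a span, end() returns elapsed seconds
    def __init__(self):
        self._start = time.time()

    def begin(self):
        self._start = time.time()

    def end(self):
        return time.time() - self._start

With helpers like these, global_step.get() returns the current step index and timer.end() the seconds since the last timer.begin(), which is what the writer.add_scalar calls above record.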
        action_dim] = agents[ag].final_step()

actions = t.clamp(actions, min=-1, max=1)
state, reward, episode_finished, info = env.step(actions[0].to("cpu"))
frames.append(env.render(mode="rgb_array"))

state = t.tensor(state, dtype=t.float32, device=device)
reward = t.tensor(reward, dtype=t.float32, device=device).unsqueeze(dim=0)
total_reward += reward

for agent, r in zip(agents, reward[0]):
    agent.set_reward(r.view(1, 1))

old_samples = [agent.get_sample() for agent in agents]

for agent in agents:
    agent.update_history(local_step.get())
    agent.reset_negotiate()

step_end = time.time()
logger.info("Step {} completed in {:.3f} s".format(
    local_step, step_end - step_begin))

create_gif_subproc(frames, "{}/test".format(load_dir))

episode_end = time.time()
logger.info("Episode completed in {:.3f} s".format(episode_end - episode_begin))
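Every snippet in this section hands its rendered frames to create_gif_subproc so GIF encoding does not block the training or evaluation loop. The helper itself is not shown here; below is a minimal sketch, assuming the frames are RGB arrays from env.render(mode="rgb_array") and that imageio is available (both the imageio dependency and the appended ".gif" suffix are assumptions).

from multiprocessing import Process

import imageio


def _write_gif(frames, path):
    # frames: list of HxWx3 uint8 arrays collected from env.render()
    imageio.mimsave(path + ".gif", frames)


def create_gif_subproc(frames, path):
    # encode the GIF in a child process so the caller can continue immediately
    proc = Process(target=_write_gif, args=(frames, path))
    proc.start()
    return proc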
        range(first_episode, last_episode + 1)):
    tmp_observe, total_reward, local_step, frames = result

    logger.info("Sum reward: {}, episode={}".format(
        float(total_reward), episode_num))
    writer.add_scalar("episodic_sum_reward", float(total_reward), episode_num)
    writer.add_scalar("episode_length", local_step, episode_num)

    for obsrv in tmp_observe:
        ppo.store_transition(obsrv)

    if len(frames) != 0:
        # sub-processes cannot start a sub-process,
        # so we have to store results in the main process
        create_gif_subproc(
            frames,
            save_env.get_trial_image_dir() + "/{}".format(episode_num))

    # model serialization
    if episode_num % c.model_save_int == 0:
        ppo.save(save_env.get_trial_model_dir(), version=episode_num)

logger.info("End episode {}-{} at {}".format(
    first_episode, last_episode, dt.now().strftime("%m/%d-%H:%M:%S")))

# begin training
timer.begin()
ppo.update()
ppo.update_lr_scheduler()
writer.add_scalar("train_step_time", timer.end(), episode.get())
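This loop consumes results produced by parallel rollout workers: each worker returns its collected transitions, the summed reward, the episode length, and any rendered frames, while storing transitions, writing GIFs, and saving models stays in the main process (as the comment notes, the pool workers cannot start the GIF subprocess themselves). Below is a minimal sketch of that producer side, where run_episode, the episode range, and the pool size are illustrative assumptions.

from multiprocessing import Pool

first_episode, last_episode = 1, 4  # illustrative episode range


def run_episode(episode_num):
    # hypothetical worker: roll out one episode and return everything the
    # main process needs; ppo, writer and disk I/O stay in the main process
    tmp_observe = []    # transitions collected during the episode
    total_reward = 0.0  # summed reward over the episode
    local_step = 0      # episode length in environment steps
    frames = []         # rendered frames, filled only when rendering
    # ... interact with the environment here and fill the variables above
    return tmp_observe, total_reward, local_step, frames


if __name__ == "__main__":
    with Pool(processes=4) as pool:
        results = pool.map(run_episode,
                           range(first_episode, last_episode + 1))
    # per-episode bookkeeping (store_transition, GIF and model saving)
    # then runs in the main process, as in the loop above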
        format(local_step, step_end - step_begin, epoch, episode))

logger.info("Sum reward: {}, epoch={}, episode={}".format(
    t.mean(total_reward), epoch, episode))

if global_step.get() > ddpg_warmup_steps:
    for i in range(local_step.get()):
        ddpg_train_begin = time.time()
        # if using non-batched agents, set concatenate_samples=False
        ddpg.update(update_policy=i % 2 == 0)
        ddpg.update_lr_scheduler()
        ddpg_train_end = time.time()
        logger.info(
            "DDPG train step {} completed in {:.3f} s, epoch={}, episode={}"
            .format(i, ddpg_train_end - ddpg_train_begin, epoch, episode))

if render:
    create_gif_subproc(
        frames,
        "{}/log/images/{}_{}_{}".format(root_dir, epoch, episode,
                                        global_step.get()))

local_step.reset()
episode_finished = False

episode_end = time.time()
logger.info("Episode {} completed in {:.3f} s, epoch={}".format(
    episode, episode_end - episode_begin, epoch))
episode.reset()