test_env = common.make_env(test=True)
writer = SummaryWriter(comment="-01_a2c_" + args.name + suffix)
net = common.AtariA2C(envs[0].observation_space.shape, envs[0].action_space.n).to(device)
print(net)
optimizer = optim.RMSprop(net.parameters(), lr=LEARNING_RATE, eps=1e-5)

step_idx = 0
total_steps = 0
best_reward = None
ts_start = time.time()
best_test_reward = None

with ptan.common.utils.TBMeanTracker(writer, batch_size=100) as tb_tracker:
    # iterate_batches yields mini-batches of experience from the pool of environments;
    # the probabilities entry is not needed for plain A2C and is discarded here
    for mb_obs, mb_rewards, mb_actions, mb_values, _, done_rewards, done_steps in \
            common.iterate_batches(envs, net, device=device):
        if len(done_rewards) > 0:
            total_steps += sum(done_steps)
            speed = total_steps / (time.time() - ts_start)
            if best_reward is None:
                best_reward = done_rewards.max()
            elif best_reward < done_rewards.max():
                best_reward = done_rewards.max()
            tb_tracker.track("total_reward_max", best_reward, step_idx)
            tb_tracker.track("total_reward", done_rewards, step_idx)
            tb_tracker.track("total_steps", done_steps, step_idx)
            print("%d: done %d episodes, mean_reward=%.2f, best_reward=%.2f, speed=%.2f" % (
                step_idx, len(done_rewards), done_rewards.mean(), best_reward, speed))
        common.train_a2c(net, mb_obs, mb_rewards, mb_actions, mb_values,
                         optimizer, tb_tracker, step_idx, device=device)
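The loop above delegates the actual gradient update to common.train_a2c, which is not shown here. For orientation, below is a minimal sketch of what a standard A2C update step looks like: policy-gradient loss weighted by the advantage, value MSE, and an entropy bonus. The function name, the hyperparameter names (entropy_beta, clip_grad), and the assumption that mb_rewards already contains discounted n-step returns are illustrative only, not the repository's implementation.

# Illustrative sketch of a standard A2C step; NOT the repo's common.train_a2c.
import torch
import torch.nn.functional as F

def a2c_update_sketch(net, mb_obs, mb_rewards, mb_actions, optimizer,
                      entropy_beta=0.01, clip_grad=0.5, device="cpu"):
    # assumes mb_obs / mb_rewards / mb_actions are numpy arrays and
    # mb_rewards holds discounted n-step returns
    obs_v = torch.FloatTensor(mb_obs).to(device)
    rewards_v = torch.FloatTensor(mb_rewards).to(device)
    actions_v = torch.LongTensor(mb_actions).to(device)

    logits_v, values_v = net(obs_v)
    values_v = values_v.squeeze(-1)

    # critic: MSE between predicted state values and the n-step returns
    loss_value_v = F.mse_loss(values_v, rewards_v)

    # actor: policy gradient weighted by the advantage (returns - baseline)
    log_probs_v = F.log_softmax(logits_v, dim=1)
    adv_v = rewards_v - values_v.detach()
    log_prob_actions_v = adv_v * log_probs_v[range(len(mb_actions)), actions_v]
    loss_policy_v = -log_prob_actions_v.mean()

    # entropy bonus keeps the policy from collapsing too early
    probs_v = F.softmax(logits_v, dim=1)
    entropy_loss_v = entropy_beta * (probs_v * log_probs_v).sum(dim=1).mean()

    optimizer.zero_grad()
    loss_v = loss_policy_v + loss_value_v + entropy_loss_v
    loss_v.backward()
    torch.nn.utils.clip_grad_norm_(net.parameters(), clip_grad)
    optimizer.step()
    return loss_v.item()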
# sanity-check forward pass of the I2A network on a single observation
obs = envs[0].reset()
obs_v = ptan.agent.default_states_preprocessor([obs]).to(device)
res = net_i2a(obs_v)

optimizer = optim.RMSprop(net_i2a.parameters(), lr=LEARNING_RATE, eps=1e-5)
# separate optimizer for the small rollout policy network
policy_opt = optim.Adam(net_policy.parameters(), lr=POLICY_LR)

step_idx = 0
total_steps = 0
ts_start = time.time()
best_reward = None
best_test_reward = None

with ptan.common.utils.TBMeanTracker(writer, batch_size=10) as tb_tracker:
    # mb_probs holds the agent's action probabilities for every observation in the batch
    for mb_obs, mb_rewards, mb_actions, mb_values, mb_probs, done_rewards, done_steps in \
            common.iterate_batches(envs, net_i2a, device):
        if len(done_rewards) > 0:
            total_steps += sum(done_steps)
            speed = total_steps / (time.time() - ts_start)
            if best_reward is None:
                best_reward = done_rewards.max()
            elif best_reward < done_rewards.max():
                best_reward = done_rewards.max()
            tb_tracker.track("total_reward_max", best_reward, step_idx)
            tb_tracker.track("total_reward", done_rewards, step_idx)
            tb_tracker.track("total_steps", done_steps, step_idx)
            print("%d: done %d episodes, mean_reward=%.2f, best_reward=%.2f, speed=%.2f f/s" % (
                step_idx, len(done_rewards), done_rewards.mean(), best_reward, speed))
        obs_v = common.train_a2c(net_i2a, mb_obs, mb_rewards, mb_actions, mb_values,
                                 optimizer, tb_tracker, step_idx, device=device)
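The batch unpacking above includes mb_probs, and a second optimizer (policy_opt) is created for net_policy, the small rollout policy. This points to a distillation step in which the rollout policy is trained to imitate the I2A agent's action distribution. The sketch below is an assumption about how such a step could look; it assumes net_policy(obs_v) returns raw action logits and is not the repository's actual code.

# Illustrative rollout-policy distillation step; NOT the repo's implementation.
import torch
import torch.nn.functional as F

def distill_policy_sketch(net_policy, policy_opt, obs_v, mb_probs, device="cpu"):
    # target distribution produced by the I2A agent for the same observations
    probs_v = torch.FloatTensor(mb_probs).to(device)
    # assumed: net_policy returns action logits for the batch of observations
    logits_v = net_policy(obs_v)
    # cross-entropy between the I2A action probabilities and the rollout policy
    loss_v = -(probs_v * F.log_softmax(logits_v, dim=1)).sum(dim=1).mean()
    policy_opt.zero_grad()
    loss_v.backward()
    policy_opt.step()
    return loss_v.item()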
suffix = "" test_env = common.make_env(test=True) writer = SummaryWriter(comment="-01_a2c_" + args.name + suffix) net = common.AtariA2C(envs[0].observation_space.shape, envs[0].action_space.n) if args.cuda: net.cuda() print(net) optimizer = optim.RMSprop(net.parameters(), lr=LEARNING_RATE, eps=1e-5) step_idx = 0 best_reward = None best_test_reward = None with ptan.common.utils.TBMeanTracker(writer, batch_size=100) as tb_tracker: for mb_obs, mb_rewards, mb_actions, mb_values, done_rewards, done_steps in common.iterate_batches(envs, net, cuda=args.cuda): if len(done_rewards) > 0: if best_reward is None: best_reward = done_rewards.max() elif best_reward < done_rewards.max(): best_reward = done_rewards.max() tb_tracker.track("total_reward_max", best_reward, step_idx) tb_tracker.track("total_reward", done_rewards, step_idx) tb_tracker.track("total_steps", done_steps, step_idx) print("%d: done %d episodes, mean_reward=%.2f, best_reward=%.2f" % ( step_idx, len(done_rewards), done_rewards.mean(), best_reward)) common.train_a2c(net, mb_obs, mb_rewards, mb_actions, mb_values, optimizer, tb_tracker, step_idx, cuda=args.cuda) step_idx += 1 if args.steps is not None and args.steps < step_idx:
obs = envs[0].reset()
obs_v = ptan.agent.default_states_preprocessor([obs], cuda=args.cuda)
res = net_i2a(obs_v)

optimizer = optim.RMSprop(net_i2a.parameters(), lr=LEARNING_RATE, eps=1e-5)
policy_opt = optim.Adam(net_policy.parameters(), lr=POLICY_LR)

step_idx = 0
total_steps = 0
ts_start = time.time()
best_reward = None
best_test_reward = None

with ptan.common.utils.TBMeanTracker(writer, batch_size=10) as tb_tracker:
    for mb_obs, mb_rewards, mb_actions, mb_values, mb_probs, done_rewards, done_steps in \
            common.iterate_batches(envs, net_i2a, cuda=args.cuda):
        if len(done_rewards) > 0:
            total_steps += sum(done_steps)
            speed = total_steps / (time.time() - ts_start)
            if best_reward is None:
                best_reward = done_rewards.max()
            elif best_reward < done_rewards.max():
                best_reward = done_rewards.max()
            tb_tracker.track("total_reward_max", best_reward, step_idx)
            tb_tracker.track("total_reward", done_rewards, step_idx)
            tb_tracker.track("total_steps", done_steps, step_idx)
            print("%d: done %d episodes, mean_reward=%.2f, best_reward=%.2f, speed=%.2f f/s" % (
                step_idx, len(done_rewards), done_rewards.mean(), best_reward, speed))