def data_func(net, device, train_queue):
    envs = [make_env() for _ in range(NUM_ENVS)]
    # Build a process-local copy of the network and sync it with the
    # weights of the shared model (load_state_dict expects a state dict,
    # not the module itself).
    new_net = common.AtariA2C(envs[0].observation_space.shape,
                              envs[0].action_space.n).to(device)
    new_net.load_state_dict(net.state_dict())
    agent = ptan.agent.PolicyAgent(lambda x: new_net(x)[0],
                                   device=device, apply_softmax=True)
    exp_source = ptan.experience.ExperienceSourceFirstLast(
        envs, agent, gamma=GAMMA, steps_count=REWARD_STEPS)

    for exp in exp_source:
        new_rewards = exp_source.pop_total_rewards()
        if new_rewards:
            train_queue.put(TotalReward(reward=np.mean(new_rewards)))
        train_queue.put(exp)
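Both reward summaries and raw transitions travel through the same train_queue, so the consumer on the training side has to tell them apart by type. A minimal sketch of that consumer step, assuming the usual one-field TotalReward namedtuple and a reward tracker with a reward(value, step) method; drain_train_queue is a hypothetical helper name:

import collections

# Assumed definition: a one-field namedtuple used to tag
# mean-reward messages on the queue.
TotalReward = collections.namedtuple('TotalReward', field_names='reward')

def drain_train_queue(train_queue, tracker, batch, step_idx):
    # Pull one entry off the shared queue: reward reports are routed
    # to the tracker, everything else is a transition for the batch.
    train_entry = train_queue.get()
    if isinstance(train_entry, TotalReward):
        tracker.reward(train_entry.reward, step_idx)
    else:
        batch.append(train_entry)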
if __name__ == '__main__':
    mp.set_start_method('spawn')
    parser = argparse.ArgumentParser()
    parser.add_argument('--cuda', default=False, action='store_true',
                        help='Enable cuda')
    parser.add_argument('-n', '--name', required=True,
                        help='Name of the run')
    args = parser.parse_args()
    device = 'cuda' if args.cuda else 'cpu'

    env = make_env()
    net = common.AtariA2C(env.observation_space.shape,
                          env.action_space.n).to(device)
    # Put the weights into shared memory, so every child process
    # sees the same tensors without copying them.
    net.share_memory()
    optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE, eps=1e-3)

    train_queue = mp.Queue(maxsize=PROCESSES_COUNT)
    data_proc_list = []
    for proc_idx in range(PROCESSES_COUNT):
        proc_name = '-a3c-grad_' + NAME + '_' + args.name + '#%d' % proc_idx
        data_proc = mp.Process(target=grads_func,
                               args=(proc_name, net, device, train_queue))
        data_proc.start()
        data_proc_list.append(data_proc)
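grads_func itself is launched here but not shown. A sketch of what it can look like, under the assumption that GRAD_BATCH and ENTROPY_BETA are module constants and that common.unpack_batch returns tensors of states, actions, and value targets: each child plays with the shared net, backpropagates a standard A2C loss locally, and puts the raw gradient arrays on the queue for the parent to apply.

import torch.nn.functional as F

def grads_func(proc_name, net, device, train_queue):
    # Sketch of the child-process function launched above (assumed shape;
    # GRAD_BATCH and ENTROPY_BETA are module constants not shown here).
    envs = [make_env() for _ in range(NUM_ENVS)]
    agent = ptan.agent.PolicyAgent(lambda x: net(x)[0],
                                   device=device, apply_softmax=True)
    exp_source = ptan.experience.ExperienceSourceFirstLast(
        envs, agent, gamma=GAMMA, steps_count=REWARD_STEPS)

    batch = []
    for exp in exp_source:
        batch.append(exp)
        if len(batch) < GRAD_BATCH:
            continue
        states_v, actions_t, vals_ref_v = common.unpack_batch(
            batch, net, device=device,
            last_val_gamma=GAMMA ** REWARD_STEPS)
        batch.clear()

        # Standard A2C loss: value regression, policy gradient with
        # advantage, and an entropy bonus.
        net.zero_grad()
        logits_v, value_v = net(states_v)
        loss_value_v = F.mse_loss(value_v.squeeze(-1), vals_ref_v)
        log_prob_v = F.log_softmax(logits_v, dim=1)
        adv_v = vals_ref_v - value_v.detach().squeeze(-1)
        lp_a_v = log_prob_v[range(states_v.size(0)), actions_t]
        loss_policy_v = -(adv_v * lp_a_v).mean()
        prob_v = F.softmax(logits_v, dim=1)
        entropy_loss_v = ENTROPY_BETA * (prob_v * log_prob_v).sum(dim=1).mean()
        loss_v = loss_policy_v + entropy_loss_v + loss_value_v
        loss_v.backward()

        # Ship raw gradients to the parent, which applies them to the
        # shared network with its optimizer.
        grads = [p.grad.data.cpu().numpy() if p.grad is not None else None
                 for p in net.parameters()]
        train_queue.put(grads)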
os.makedirs(saves_path, exist_ok=True)

envs = [common.make_env() for _ in range(common.NUM_ENVS)]
test_env = common.make_env(test=True)
if args.seed:
    common.set_seed(args.seed, envs, cuda=args.cuda)
    suffix = "-seed=%d" % args.seed
else:
    suffix = ""
writer = SummaryWriter(comment="-03_i2a_" + args.name + suffix)

obs_shape = envs[0].observation_space.shape
act_n = envs[0].action_space.n

net_policy = common.AtariA2C(obs_shape, act_n).to(device)
net_em = i2a.EnvironmentModel(obs_shape, act_n)
net_em.load_state_dict(torch.load(
    args.em, map_location=lambda storage, loc: storage))
net_em = net_em.to(device)
net_i2a = i2a.I2A(obs_shape, act_n, net_em, net_policy,
                  ROLLOUTS_STEPS).to(device)
print(net_i2a)

# One forward pass on a sample observation as a quick sanity check
# that the three networks are wired together correctly.
obs = envs[0].reset()
obs_v = ptan.agent.default_states_preprocessor([obs]).to(device)
res = net_i2a(obs_v)

optimizer = optim.RMSprop(net_i2a.parameters(),
                          lr=LEARNING_RATE, eps=1e-5)
policy_opt = optim.Adam(net_policy.parameters(), lr=POLICY_LR)
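The two optimizers hint at the training scheme: optimizer updates the full I2A model, while policy_opt distills the I2A action distribution into the small net_policy used inside imagined rollouts. A hedged sketch of one distillation update, with distill_step as a hypothetical helper and assuming both networks return (logits, value):

import torch
import torch.nn.functional as F

def distill_step(net_i2a, net_policy, policy_opt, obs_v):
    # Hypothetical helper: one distillation update that trains the small
    # rollout policy to imitate the I2A agent's action distribution.
    policy_opt.zero_grad()
    with torch.no_grad():
        target_logits_v = net_i2a(obs_v)[0]     # teacher: I2A logits
    logits_v = net_policy(obs_v)[0]             # student: raw A2C logits
    target_probs_v = F.softmax(target_logits_v, dim=1)
    # Cross-entropy between teacher and student action distributions.
    loss_v = -(target_probs_v *
               F.log_softmax(logits_v, dim=1)).sum(dim=1).mean()
    loss_v.backward()
    policy_opt.step()
    return loss_v.item()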
if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--cuda", default=False, action="store_true", help="Enable cuda") parser.add_argument("-n", "--name", required=True, help="Name of the run") parser.add_argument("-m", "--model", required=True, help="File with model to load") args = parser.parse_args() device = torch.device("cuda" if args.cuda else "cpu") saves_path = os.path.join("saves", "02_env_" + args.name) os.makedirs(saves_path, exist_ok=True) envs = [common.make_env() for _ in range(NUM_ENVS)] writer = SummaryWriter(comment="-02_env_" + args.name) net = common.AtariA2C(envs[0].observation_space.shape, envs[0].action_space.n) net_em = i2a.EnvironmentModel(envs[0].observation_space.shape, envs[0].action_space.n).to(device) net.load_state_dict(torch.load(args.model, map_location=lambda storage, loc: storage)) net = net.to(device) print(net_em) optimizer = optim.Adam(net_em.parameters(), lr=LEARNING_RATE) step_idx = 0 best_loss = np.inf with ptan.common.utils.TBMeanTracker(writer, batch_size=100) as tb_tracker: for mb_obs, mb_obs_next, mb_actions, mb_rewards, done_rewards, done_steps in iterate_batches(envs, net, device): if len(done_rewards) > 0: m_reward = np.mean(done_rewards) m_steps = np.mean(done_steps) print("%d: done %d episodes, mean reward=%.2f, steps=%.2f" % ( step_idx, len(done_rewards), m_reward, m_steps))