"--act_model", help="The pretrained actor model") parser.add_argument("-cm", "--crt_model", help="the pretrained critic model") args = parser.parse_args() device = "cuda" if torch.cuda.is_available() else "cpu" save_path = os.path.join("saves", "a2c-" + args.name) os.makedirs(save_path, exist_ok=True) envs = [gym.make(args.env) for _ in range(ENVS_COUNT)] test_env = gym.make(ENV_ID) act_net = model.ModelActor(envs[0].observation_space.shape[0], envs[0].action_space.shape[0]).to(device) crt_net = model.ModelCritic(envs[0].observation_space.shape[0]).to(device) print(act_net) print(crt_net) if args.act_model: act_net.load_state_dict(torch.load(args.act_model)) if args.crt_model: crt_net.load_state_dict(torch.load(args.crt_model)) writer = SummaryWriter(comment='-a2c_' + args.name) agent = model.AgentA2C(act_net, device) exp_source = drl.experience.ExperienceSourceFirstLast( envs, agent, gamma=GAMMA, steps_count=REWARD_STEP) act_optimizer = optim.Adam(act_net.parameters(), lr=LEARNING_RATE_ACTOR)
help="If specified, save every N-th step as an image") parser.add_argument("--acktr", default=False, action='store_true', help="Enable Acktr-specific tweaks") args = parser.parse_args() get_link_state = rospy.ServiceProxy("/gazebo/get_link_state", GetLinkState) pitch = 0 rospy.Subscriber('/Bobby/imu', Imu, get_angular_vel) counter = 0 env = make_env(args) if args.record: env = wrappers.Monitor(env, args.record) net = model.ModelActor(env.observation_space.shape[0], env.action_space.shape[0], args.hid) if args.acktr: opt = kfac.KFACOptimizer(net) net.load_state_dict(torch.load(args.model)) obs = env.reset() total_reward = 0.0 total_steps = 0 while True: obs_v = torch.FloatTensor(obs) mu_v = net(obs_v) action = mu_v.squeeze(dim=0).data.numpy() action = np.clip(action, -1, 1) if np.isscalar(action): action = [action]
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--cuda", default=False, action='store_true',
                        help='Enable CUDA')
    parser.add_argument("-n", "--name", required=True, help="Name of the run")
    parser.add_argument("-e", "--env", default=ENV_ID,
                        help="Environment id, default=" + ENV_ID)
    parser.add_argument("--lrc", default=LEARNING_RATE_CRITIC, type=float,
                        help="Critic learning rate")
    parser.add_argument("--lra", default=LEARNING_RATE_ACTOR, type=float,
                        help="Actor learning rate")
    args = parser.parse_args()
    device = torch.device("cuda" if args.cuda else "cpu")

    save_path = os.path.join("saves", "ppo-" + args.name)
    os.makedirs(save_path, exist_ok=True)

    env = gym.make(args.env)
    test_env = gym.make(args.env)

    net_act = model.ModelActor(env.observation_space.shape[0],
                               env.action_space.shape[0]).to(device)
    net_crt = model.ModelCritic(env.observation_space.shape[0]).to(device)
    print(net_act)
    print(net_crt)

    writer = SummaryWriter(comment="-ppo_" + args.name)
    agent = model.AgentA2C(net_act, device=device)
    exp_source = ptan.experience.ExperienceSource(env, agent, steps_count=1)

    opt_act = optim.Adam(net_act.parameters(), lr=args.lra)
    opt_crt = optim.Adam(net_crt.parameters(), lr=args.lrc)

    trajectory = []
    best_reward = None
    with ptan.common.utils.RewardTracker(writer) as tracker:
        for step_idx, exp in enumerate(exp_source):
            rewards_steps = exp_source.pop_rewards_steps()
            if rewards_steps:
                rewards, steps = zip(*rewards_steps)
                writer.add_scalar("episode_steps", np.mean(steps), step_idx)
                tracker.reward(np.mean(rewards), step_idx)

            if step_idx % TEST_ITERS == 0:
                ts = time.time()
                rewards, steps = test_net(net_act, test_env, device=device)
                print("Test done in %.2f sec, reward %.3f, steps %d" % (
                    time.time() - ts, rewards, steps))
                writer.add_scalar("test_reward", rewards, step_idx)
                writer.add_scalar("test_steps", steps, step_idx)
                if best_reward is None or best_reward < rewards:
                    if best_reward is not None:
                        print("Best reward updated: %.3f -> %.3f" % (
                            best_reward, rewards))
                        name = "best_%+.3f_%d.dat" % (rewards, step_idx)
                        fname = os.path.join(save_path, name)
                        torch.save(net_act.state_dict(), fname)
                    best_reward = rewards

            trajectory.append(exp)
            if len(trajectory) < TRAJECTORY_SIZE:
                continue

            traj_states = [t[0].state for t in trajectory]
            traj_actions = [t[0].action for t in trajectory]
            traj_states_v = torch.FloatTensor(traj_states)
            traj_states_v = traj_states_v.to(device)
            traj_actions_v = torch.FloatTensor(traj_actions)
            traj_actions_v = traj_actions_v.to(device)
            traj_adv_v, traj_ref_v = calc_adv_ref(
                trajectory, net_crt, traj_states_v, device=device)
            mu_v = net_act(traj_states_v)
            old_logprob_v = calc_logprob(mu_v, net_act.logstd, traj_actions_v)

            # normalize advantages
            traj_adv_v = traj_adv_v - torch.mean(traj_adv_v)
            traj_adv_v /= torch.std(traj_adv_v)

            # drop the last entry from the trajectory, as our adv and ref
            # values are calculated without it
            trajectory = trajectory[:-1]
            old_logprob_v = old_logprob_v[:-1].detach()

            sum_loss_value = 0.0
            sum_loss_policy = 0.0
            count_steps = 0

            for epoch in range(PPO_EPOCHES):
                for batch_ofs in range(0, len(trajectory), PPO_BATCH_SIZE):
                    batch_l = batch_ofs + PPO_BATCH_SIZE
                    states_v = traj_states_v[batch_ofs:batch_l]
                    actions_v = traj_actions_v[batch_ofs:batch_l]
                    batch_adv_v = traj_adv_v[batch_ofs:batch_l]
                    batch_adv_v = batch_adv_v.unsqueeze(-1)
                    batch_ref_v = traj_ref_v[batch_ofs:batch_l]
                    batch_old_logprob_v = \
                        old_logprob_v[batch_ofs:batch_l]

                    # critic training
                    opt_crt.zero_grad()
                    value_v = net_crt(states_v)
                    loss_value_v = F.mse_loss(
                        value_v.squeeze(-1), batch_ref_v)
                    loss_value_v.backward()
                    opt_crt.step()

                    # actor training
                    opt_act.zero_grad()
                    mu_v = net_act(states_v)
                    logprob_pi_v = calc_logprob(
                        mu_v, net_act.logstd, actions_v)
                    ratio_v = torch.exp(
                        logprob_pi_v - batch_old_logprob_v)
                    surr_obj_v = batch_adv_v * ratio_v
                    c_ratio_v = torch.clamp(ratio_v,
                                            1.0 - PPO_EPS,
                                            1.0 + PPO_EPS)
                    clipped_surr_v = batch_adv_v * c_ratio_v
                    loss_policy_v = -torch.min(
                        surr_obj_v, clipped_surr_v).mean()
                    loss_policy_v.backward()
                    opt_act.step()

                    sum_loss_value += loss_value_v.item()
                    sum_loss_policy += loss_policy_v.item()
                    count_steps += 1

            trajectory.clear()
            writer.add_scalar("advantage", traj_adv_v.mean().item(), step_idx)
            writer.add_scalar("values", traj_ref_v.mean().item(), step_idx)
            writer.add_scalar("loss_policy", sum_loss_policy / count_steps,
                              step_idx)
            writer.add_scalar("loss_value", sum_loss_value / count_steps,
                              step_idx)
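The PPO loop above leans on two helpers that the fragment does not define. Below is a minimal sketch of what they might look like, assuming a Gaussian policy whose log-std lives in net_act.logstd and generalized advantage estimation over the collected trajectory; GAMMA and GAE_LAMBDA (e.g. 0.99 and 0.95) are assumed to be defined with the other hyperparameters.

import math

import torch


def calc_logprob(mu_v, logstd_v, actions_v):
    # Gaussian log-probability with a state-independent log-std parameter
    p1 = - ((mu_v - actions_v) ** 2) / (
        2 * torch.exp(logstd_v).clamp(min=1e-3))
    p2 = - torch.log(torch.sqrt(2 * math.pi * torch.exp(logstd_v)))
    return p1 + p2


def calc_adv_ref(trajectory, net_crt, states_v, device="cpu"):
    # GAE over one trajectory: walk the transitions backwards, accumulating
    # the discounted advantage (adv) and the critic's value reference (ref).
    # Trajectory items are one-element tuples, matching the t[0].state access
    # in the training loop.
    values_v = net_crt(states_v)
    values = values_v.squeeze().data.cpu().numpy()
    last_gae = 0.0
    result_adv = []
    result_ref = []
    for val, next_val, (exp,) in zip(reversed(values[:-1]),
                                     reversed(values[1:]),
                                     reversed(trajectory[:-1])):
        if exp.done:
            delta = exp.reward - val
            last_gae = delta
        else:
            delta = exp.reward + GAMMA * next_val - val
            last_gae = delta + GAMMA * GAE_LAMBDA * last_gae
        result_adv.append(last_gae)
        result_ref.append(last_gae + val)
    adv_v = torch.FloatTensor(list(reversed(result_adv)))
    ref_v = torch.FloatTensor(list(reversed(result_ref)))
    return adv_v.to(device), ref_v.to(device)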
                    help='Enable CUDA')
parser.add_argument("-n", "--name", required=True, help="Name of the run")
parser.add_argument("-e", "--env", default=ENV_ID,
                    help="Environment id, default=" + ENV_ID)
args = parser.parse_args()
device = torch.device("cuda" if args.cuda else "cpu")

save_path = os.path.join("saves", "trpo-" + args.name)
os.makedirs(save_path, exist_ok=True)

env = gym.make(args.env)
test_env = gym.make(args.env)

net_act = model.ModelActor(env.observation_space.shape[0],
                           env.action_space.shape[0]).to(device)
net_crt = model.ModelCritic(env.observation_space.shape[0]).to(device)
print(net_act)
print(net_crt)

writer = SummaryWriter(comment="-trpo_" + args.name)
agent = model.AgentA2C(net_act, device=device)
exp_source = ptan.experience.ExperienceSource(env, agent, steps_count=1)

opt_crt = optim.Adam(net_crt.parameters(), lr=LEARNING_RATE_CRITIC)

trajectory = []
best_reward = None
with ptan.common.utils.RewardTracker(writer) as tracker:
    for step_idx, exp in enumerate(exp_source):
        rewards_steps = exp_source.pop_rewards_steps()
                    help='Enable CUDA')
parser.add_argument("-n", "--name", required=True, help="Name of the run")
parser.add_argument("-e", "--env", default=ENV_ID,
                    help="Environment id, default=" + ENV_ID)
args = parser.parse_args()
device = torch.device("cuda" if args.cuda else "cpu")

save_path = os.path.join("saves", "acktr-" + args.name)
os.makedirs(save_path, exist_ok=True)

envs = [wrap_dqn(gym.make(args.env)) for _ in range(ENVS_COUNT)]
test_env = wrap_dqn(gym.make(args.env))

net_act = model.ModelActor(envs[0].observation_space.shape,
                           envs[0].action_space.n).to(device)
net_crt = model.ModelCritic(envs[0].observation_space.shape).to(device)
print(net_act)
print(net_crt)

writer = SummaryWriter(comment="-acktr_" + args.name)
agent = model.AgentA2C(net_act, device=device)
exp_source = ptan.experience.ExperienceSourceFirstLast(
    envs, agent, GAMMA, steps_count=REWARD_STEPS)

opt_act = kfac.KFACOptimizer(net_act, lr=LEARNING_RATE_ACTOR)
opt_crt = optim.Adam(net_crt.parameters(), lr=LEARNING_RATE_CRITIC)

batch = []
best_reward = None
with ptan.common.utils.RewardTracker(writer) as tracker:
LEARNING_RATE_ACTOR = 1e-3
LEARNING_RATE_CRITIC = 1e-3
ENTROPY_BETA = 1e-3
ENVS_COUNT = 16

if __name__ == "__main__":
    parser = make_parser()
    args, device, save_path, test_env, maxeps, maxsec = parse_args(
        parser, "acktr")

    envs = [make_env(args.env) for _ in range(ENVS_COUNT)]

    net_act = model.ModelActor(envs[0].observation_space.shape[0],
                               envs[0].action_space.shape[0],
                               args.hid).to(device)
    net_crt = model.ModelCritic(envs[0].observation_space.shape[0],
                                args.hid).to(device)
    print(net_act)
    print(net_crt)

    writer = SummaryWriter(comment="-acktr_" + args.name)
    agent = model.AgentA2C(net_act, device=device)
    exp_source = ptan.experience.ExperienceSourceFirstLast(
        envs, agent, GAMMA, steps_count=REWARD_STEPS)

    opt_act = kfac.KFACOptimizer(net_act, lr=LEARNING_RATE_ACTOR)
    opt_crt = optim.Adam(net_crt.parameters(), lr=LEARNING_RATE_CRITIC)

    batch = []
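Neither ACKTR fragment reaches the optimization step, so here is a hedged sketch of how the actor update usually differs from plain A2C when kfac.KFACOptimizer is in play. It assumes the widely used interface from the pytorch-a2c-ppo-acktr K-FAC implementation (steps, Ts and acc_stats attributes), the calc_logprob helper sketched earlier, and that states_v, actions_v and adv_v come from an already unpacked batch.

# periodically accumulate Fisher-matrix statistics, using the sampled
# policy log-probability as the loss for the second-order statistics
opt_act.zero_grad()
mu_v = net_act(states_v)
logprob_v = calc_logprob(mu_v, net_act.logstd, actions_v)
if opt_act.steps % opt_act.Ts == 0:
    pg_fisher_loss = -logprob_v.mean()
    opt_act.acc_stats = True
    pg_fisher_loss.backward(retain_graph=True)
    opt_act.acc_stats = False
    opt_act.zero_grad()

# the usual advantage-weighted policy-gradient loss; the K-FAC optimizer
# preconditions the gradient inside opt_act.step()
loss_policy_v = -(adv_v.unsqueeze(dim=-1) * logprob_v).mean()
loss_policy_v.backward()
opt_act.step()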
default=False, action="store_true", help="enable cuda") args = parser.parse_args() device = torch.device( "cuda" if args.cuda and torch.cuda.is_available() else "cpu") writer = SummaryWriter(comment="sumo-ants-ppo") save_path = "/home/chenkehan/RESEARCH/codes/try/DL_RL/ppo_sumo_ants/save_train_data" env = gym.make(ENV_ID) test_env = gym.make(ENV_ID) obs_shape = env.observation_space.spaces[1].shape[0] action_shape = env.action_space.spaces[1].shape[0] net_act = model.ModelActor(obs_shape, action_shape).to(device) net_crt = model.ModelCritic(obs_shape).to(device) print(net_act) print(net_crt) agent = model.AgentA2C(net_act, device=device) exp_source = experience.MAExperienceSource(env, agent, steps_count=1) opt_act = optim.Adam(net_act.parameters(), lr=LEARNING_RATE_ACTOR) opt_crt = optim.Adam(net_crt.parameters(), lr=LEARNING_RATE_CRITIC) trajectory = [] best_reward = None for step_idx, exp in enumerate(exp_source): reward_steps = exp_source.pop_rewards_steps()
parser = argparse.ArgumentParser()
parser.add_argument("-m", "--model", required=True,
                    help="Model file to load")
parser.add_argument("-e", "--env", default=ENV_ID,
                    help="Environment name to use, default=" + ENV_ID)
parser.add_argument(
    "-r", "--record",
    help="If specified, sets the recording dir, default=Disabled")
parser.add_argument("--eval", default=False, action='store_true',
                    help='Evaluates Agent')
args = parser.parse_args()

reward_eval_env = gym.make(args.env)
env = gym.make(args.env)
if args.record:
    env = gym.wrappers.Monitor(env, args.record, force=True)

net = model.ModelActor(env.observation_space.shape[0],
                       env.action_space.shape[0], HID_SIZE, ACTF)
net.load_state_dict(torch.load(args.model))
agent = model.Agent(net, FIXED_SIGMA_VALUE, BETA)

if args.eval:
    print("Evaluating Agent...")
    rewards = 0.0
    steps = 0
    for _ in range(100):
        obs = reward_eval_env.reset()
        while True:
            obs_v = torch.FloatTensor([obs])
            mu_v = agent.get_actions_deterministic(obs_v)
            action = mu_v.squeeze(dim=0).data.cpu().numpy()
            obs, reward, done, _ = reward_eval_env.step(action)
            rewards += reward
                    action='store_true', help='Enable CUDA')
parser.add_argument("-n", "--name", required=True, help="Name of the run")
parser.add_argument("-e", "--env", default=ENV_ID,
                    help="Environment id, default=" + ENV_ID)
args = parser.parse_args()

save_path = os.path.join("saves", "a2c-" + args.name)
os.makedirs(save_path, exist_ok=True)

envs = [gym.make(args.env) for _ in range(ENVS_COUNT)]
test_env = gym.make(args.env)

net_act = model.ModelActor(envs[0].observation_space.shape[0],
                           envs[0].action_space.shape[0])
net_crt = model.ModelCritic(envs[0].observation_space.shape[0])
if args.cuda:
    net_act.cuda()
    net_crt.cuda()
print(net_act)
print(net_crt)

writer = SummaryWriter(comment="-a2c_" + args.name)
agent = model.AgentA2C(net_act, cuda=args.cuda)
exp_source = ptan.experience.ExperienceSourceFirstLast(
    envs, agent, GAMMA, steps_count=REWARD_STEPS)

opt_act = optim.Adam(net_act.parameters(), lr=LEARNING_RATE_ACTOR)
opt_crt = optim.Adam(net_crt.parameters(), lr=LEARNING_RATE_CRITIC)
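The A2C fragments stop right after the optimizers are created. For orientation, here is a hedged sketch of one update step, wrapped in a hypothetical a2c_update helper; it assumes states_v, actions_v and vals_ref_v were produced by unpacking a batch of ExperienceFirstLast transitions (with n-step bootstrapped value references) and that calc_logprob is the Gaussian log-probability helper sketched above.

import math

import torch
import torch.nn.functional as F


def a2c_update(net_act, net_crt, opt_act, opt_crt,
               states_v, actions_v, vals_ref_v, entropy_beta=1e-3):
    # critic: regress V(s) towards the n-step bootstrapped reference
    opt_crt.zero_grad()
    value_v = net_crt(states_v)
    loss_value_v = F.mse_loss(value_v.squeeze(-1), vals_ref_v)
    loss_value_v.backward()
    opt_crt.step()

    # actor: policy gradient weighted by the advantage, plus an entropy
    # bonus for the Gaussian policy with learned log-std
    opt_act.zero_grad()
    mu_v = net_act(states_v)
    adv_v = vals_ref_v.unsqueeze(dim=-1) - value_v.detach()
    logprob_v = calc_logprob(mu_v, net_act.logstd, actions_v)
    loss_policy_v = -(adv_v * logprob_v).mean()
    entropy_loss_v = entropy_beta * (
        -(torch.log(2 * math.pi * torch.exp(net_act.logstd)) + 1) / 2).mean()
    loss_v = loss_policy_v + entropy_loss_v
    loss_v.backward()
    opt_act.step()
    return loss_value_v.item(), loss_policy_v.item()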