Example #1
                        "--act_model",
                        help="The pretrained actor model")
    parser.add_argument("-cm",
                        "--crt_model",
                        help="the pretrained critic model")
    args = parser.parse_args()

    device = "cuda" if torch.cuda.is_available() else "cpu"

    save_path = os.path.join("saves", "a2c-" + args.name)
    os.makedirs(save_path, exist_ok=True)

    envs = [gym.make(args.env) for _ in range(ENVS_COUNT)]
    test_env = gym.make(ENV_ID)

    act_net = model.ModelActor(envs[0].observation_space.shape[0],
                               envs[0].action_space.shape[0]).to(device)
    crt_net = model.ModelCritic(envs[0].observation_space.shape[0]).to(device)
    print(act_net)
    print(crt_net)
    if args.act_model:
        act_net.load_state_dict(torch.load(args.act_model))
    if args.crt_model:
        crt_net.load_state_dict(torch.load(args.crt_model))

    writer = SummaryWriter(comment='-a2c_' + args.name)
    agent = model.AgentA2C(act_net, device)

    exp_source = drl.experience.ExperienceSourceFirstLast(
        envs, agent, gamma=GAMMA, steps_count=REWARD_STEP)

    act_optimizer = optim.Adam(act_net.parameters(), lr=LEARNING_RATE_ACTOR)
Example #2
                        help="If specified, save every N-th step as an image")
    parser.add_argument("--acktr",
                        default=False,
                        action='store_true',
                        help="Enable Acktr-specific tweaks")
    args = parser.parse_args()
    get_link_state = rospy.ServiceProxy("/gazebo/get_link_state", GetLinkState)
    pitch = 0
    rospy.Subscriber('/Bobby/imu', Imu, get_angular_vel)

    counter = 0
    env = make_env(args)
    if args.record:
        env = wrappers.Monitor(env, args.record)

    net = model.ModelActor(env.observation_space.shape[0],
                           env.action_space.shape[0], args.hid)
    if args.acktr:
        opt = kfac.KFACOptimizer(net)
    net.load_state_dict(torch.load(args.model))

    obs = env.reset()
    total_reward = 0.0
    total_steps = 0

    while True:
        obs_v = torch.FloatTensor([obs])  # add a batch dimension; squeeze(dim=0) below removes it
        mu_v = net(obs_v)
        action = mu_v.squeeze(dim=0).data.numpy()
        action = np.clip(action, -1, 1)
        if np.isscalar(action):
            action = [action]
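
The snippet above is cut off before the environment step. Below is a minimal, self-contained sketch of the same kind of playback loop, assuming the standard Gym API and a deterministic policy that returns the Gaussian mean; play_episode and its env argument are illustrative names, not part of the original code.

import numpy as np
import torch

def play_episode(net, env):
    # Roll out one episode using the deterministic (mean) action of the actor network.
    obs = env.reset()
    total_reward, total_steps = 0.0, 0
    while True:
        obs_v = torch.FloatTensor([obs])           # add a batch dimension
        mu_v = net(obs_v)                          # actor outputs the action mean
        action = mu_v.squeeze(dim=0).data.numpy()
        action = np.clip(action, -1, 1)            # keep actions inside the valid range
        obs, reward, done, _ = env.step(action)
        total_reward += reward
        total_steps += 1
        if done:
            break
    return total_reward, total_steps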
Example #3
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--cuda",
                        default=False,
                        action='store_true',
                        help='Enable CUDA')
    parser.add_argument("-n", "--name", required=True, help="Name of the run")
    parser.add_argument("-e",
                        "--env",
                        default=ENV_ID,
                        help="Environment id, default=" + ENV_ID)
    parser.add_argument("--lrc",
                        default=LEARNING_RATE_CRITIC,
                        type=float,
                        help="Critic learning rate")
    parser.add_argument("--lra",
                        default=LEARNING_RATE_ACTOR,
                        type=float,
                        help="Actor learning rate")
    args = parser.parse_args()
    device = torch.device("cuda" if args.cuda else "cpu")

    save_path = os.path.join("saves", "ppo-" + args.name)
    os.makedirs(save_path, exist_ok=True)

    env = gym.make(args.env)
    test_env = gym.make(args.env)

    net_act = model.ModelActor(env.observation_space.shape[0],
                               env.action_space.shape[0]).to(device)
    net_crt = model.ModelCritic(env.observation_space.shape[0]).to(device)
    print(net_act)
    print(net_crt)

    writer = SummaryWriter(comment="-ppo_" + args.name)
    agent = model.AgentA2C(net_act, device=device)
    exp_source = ptan.experience.ExperienceSource(env, agent, steps_count=1)

    opt_act = optim.Adam(net_act.parameters(), lr=args.lra)
    opt_crt = optim.Adam(net_crt.parameters(), lr=args.lrc)

    trajectory = []
    best_reward = None
    with ptan.common.utils.RewardTracker(writer) as tracker:
        for step_idx, exp in enumerate(exp_source):
            rewards_steps = exp_source.pop_rewards_steps()
            if rewards_steps:
                rewards, steps = zip(*rewards_steps)
                writer.add_scalar("episode_steps", np.mean(steps), step_idx)
                tracker.reward(np.mean(rewards), step_idx)

            if step_idx % TEST_ITERS == 0:
                ts = time.time()
                rewards, steps = test_net(net_act, test_env, device=device)
                print("Test done in %.2f sec, reward %.3f, steps %d" %
                      (time.time() - ts, rewards, steps))
                writer.add_scalar("test_reward", rewards, step_idx)
                writer.add_scalar("test_steps", steps, step_idx)
                if best_reward is None or best_reward < rewards:
                    if best_reward is not None:
                        print("Best reward updated: %.3f -> %.3f" %
                              (best_reward, rewards))
                        name = "best_%+.3f_%d.dat" % (rewards, step_idx)
                        fname = os.path.join(save_path, name)
                        torch.save(net_act.state_dict(), fname)
                    best_reward = rewards

            trajectory.append(exp)
            if len(trajectory) < TRAJECTORY_SIZE:
                continue

            traj_states = [t[0].state for t in trajectory]
            traj_actions = [t[0].action for t in trajectory]
            traj_states_v = torch.FloatTensor(traj_states)
            traj_states_v = traj_states_v.to(device)
            traj_actions_v = torch.FloatTensor(traj_actions)
            traj_actions_v = traj_actions_v.to(device)
            traj_adv_v, traj_ref_v = calc_adv_ref(trajectory,
                                                  net_crt,
                                                  traj_states_v,
                                                  device=device)
            mu_v = net_act(traj_states_v)
            old_logprob_v = calc_logprob(mu_v, net_act.logstd, traj_actions_v)

            # normalize advantages
            traj_adv_v = traj_adv_v - torch.mean(traj_adv_v)
            traj_adv_v /= torch.std(traj_adv_v)

            # drop the last entry of the trajectory: the advantage and reference values were calculated without it
            trajectory = trajectory[:-1]
            old_logprob_v = old_logprob_v[:-1].detach()

            sum_loss_value = 0.0
            sum_loss_policy = 0.0
            count_steps = 0

            for epoch in range(PPO_EPOCHES):
                for batch_ofs in range(0, len(trajectory), PPO_BATCH_SIZE):
                    batch_l = batch_ofs + PPO_BATCH_SIZE
                    states_v = traj_states_v[batch_ofs:batch_l]
                    actions_v = traj_actions_v[batch_ofs:batch_l]
                    batch_adv_v = traj_adv_v[batch_ofs:batch_l]
                    batch_adv_v = batch_adv_v.unsqueeze(-1)
                    batch_ref_v = traj_ref_v[batch_ofs:batch_l]
                    batch_old_logprob_v = \
                        old_logprob_v[batch_ofs:batch_l]

                    # critic training
                    opt_crt.zero_grad()
                    value_v = net_crt(states_v)
                    loss_value_v = F.mse_loss(value_v.squeeze(-1), batch_ref_v)
                    loss_value_v.backward()
                    opt_crt.step()

                    # actor training
                    opt_act.zero_grad()
                    mu_v = net_act(states_v)
                    logprob_pi_v = calc_logprob(mu_v, net_act.logstd,
                                                actions_v)
                    ratio_v = torch.exp(logprob_pi_v - batch_old_logprob_v)
                    surr_obj_v = batch_adv_v * ratio_v
                    c_ratio_v = torch.clamp(ratio_v, 1.0 - PPO_EPS,
                                            1.0 + PPO_EPS)
                    clipped_surr_v = batch_adv_v * c_ratio_v
                    loss_policy_v = -torch.min(surr_obj_v,
                                               clipped_surr_v).mean()
                    loss_policy_v.backward()
                    opt_act.step()

                    sum_loss_value += loss_value_v.item()
                    sum_loss_policy += loss_policy_v.item()
                    count_steps += 1

            trajectory.clear()
            writer.add_scalar("advantage", traj_adv_v.mean().item(), step_idx)
            writer.add_scalar("values", traj_ref_v.mean().item(), step_idx)
            writer.add_scalar("loss_policy", sum_loss_policy / count_steps,
                              step_idx)
            writer.add_scalar("loss_value", sum_loss_value / count_steps,
                              step_idx)
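
The PPO loop above relies on two helpers not shown in the snippet: calc_logprob (Gaussian log-probability of the taken actions) and calc_adv_ref (advantages and critic targets). A minimal sketch of what they could look like, assuming generalized advantage estimation and the one-step experience tuples produced by ptan's ExperienceSource; the gamma and gae_lambda defaults are assumptions, not values taken from the original code.

import math
import torch

def calc_logprob(mu_v, logstd_v, actions_v):
    # log-density of a diagonal Gaussian N(mu, exp(logstd)^2) at the taken actions
    var_v = torch.exp(logstd_v) ** 2
    p1 = -((actions_v - mu_v) ** 2) / (2 * var_v.clamp(min=1e-3))
    p2 = -torch.log(torch.sqrt(2 * math.pi * var_v))
    return p1 + p2

def calc_adv_ref(trajectory, net_crt, states_v, gamma=0.99, gae_lambda=0.95, device="cpu"):
    # Generalized Advantage Estimation: walk the trajectory backwards, accumulating
    # lambda-discounted TD errors; ref values (advantage + V(s)) train the critic.
    values = net_crt(states_v).squeeze().data.cpu().numpy()
    last_gae = 0.0
    adv, ref = [], []
    for val, next_val, (exp,) in zip(reversed(values[:-1]), reversed(values[1:]),
                                     reversed(trajectory[:-1])):
        if exp.done:
            delta = exp.reward - val
            last_gae = delta
        else:
            delta = exp.reward + gamma * next_val - val
            last_gae = delta + gamma * gae_lambda * last_gae
        adv.append(last_gae)
        ref.append(last_gae + val)
    adv_v = torch.FloatTensor(list(reversed(adv))).to(device)
    ref_v = torch.FloatTensor(list(reversed(ref))).to(device)
    return adv_v, ref_v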
Example #4
                        help='Enable CUDA')
    parser.add_argument("-n", "--name", required=True, help="Name of the run")
    parser.add_argument("-e",
                        "--env",
                        default=ENV_ID,
                        help="Environment id, default=" + ENV_ID)
    args = parser.parse_args()
    device = torch.device("cuda" if args.cuda else "cpu")

    save_path = os.path.join("saves", "trpo-" + args.name)
    os.makedirs(save_path, exist_ok=True)

    env = gym.make(args.env)
    test_env = gym.make(args.env)

    net_act = model.ModelActor(env.observation_space.shape[0],
                               env.action_space.shape[0]).to(device)
    net_crt = model.ModelCritic(env.observation_space.shape[0]).to(device)
    print(net_act)
    print(net_crt)

    writer = SummaryWriter(comment="-trpo_" + args.name)
    agent = model.AgentA2C(net_act, device=device)
    exp_source = ptan.experience.ExperienceSource(env, agent, steps_count=1)

    opt_crt = optim.Adam(net_crt.parameters(), lr=LEARNING_RATE_CRITIC)

    trajectory = []
    best_reward = None
    with ptan.common.utils.RewardTracker(writer) as tracker:
        for step_idx, exp in enumerate(exp_source):
            rewards_steps = exp_source.pop_rewards_steps()
Example #5
                        help='Enable CUDA')
    parser.add_argument("-n", "--name", required=True, help="Name of the run")
    parser.add_argument("-e",
                        "--env",
                        default=ENV_ID,
                        help="Environment id, default=" + ENV_ID)
    args = parser.parse_args()
    device = torch.device("cuda" if args.cuda else "cpu")

    save_path = os.path.join("saves", "acktr-" + args.name)
    os.makedirs(save_path, exist_ok=True)

    envs = [wrap_dqn(gym.make(args.env)) for _ in range(ENVS_COUNT)]
    test_env = wrap_dqn(gym.make(args.env))

    net_act = model.ModelActor(envs[0].observation_space.shape,
                               envs[0].action_space.n).to(device)
    net_crt = model.ModelCritic(envs[0].observation_space.shape).to(device)
    print(net_act)
    print(net_crt)

    writer = SummaryWriter(comment="-acktr_" + args.name)
    agent = model.AgentA2C(net_act, device=device)
    exp_source = ptan.experience.ExperienceSourceFirstLast(
        envs, agent, GAMMA, steps_count=REWARD_STEPS)

    opt_act = kfac.KFACOptimizer(net_act, lr=LEARNING_RATE_ACTOR)
    opt_crt = optim.Adam(net_crt.parameters(), lr=LEARNING_RATE_CRITIC)

    batch = []
    best_reward = None
    with ptan.common.utils.RewardTracker(writer) as tracker:
Example #6

LEARNING_RATE_ACTOR = 1e-3
LEARNING_RATE_CRITIC = 1e-3
ENTROPY_BETA = 1e-3
ENVS_COUNT = 16

if __name__ == "__main__":

    parser = make_parser()

    args, device, save_path, test_env, maxeps, maxsec = parse_args(
        parser, "acktr")

    envs = [make_env(args.env) for _ in range(ENVS_COUNT)]

    net_act = model.ModelActor(envs[0].observation_space.shape[0],
                               envs[0].action_space.shape[0],
                               args.hid).to(device)
    net_crt = model.ModelCritic(envs[0].observation_space.shape[0],
                                args.hid).to(device)
    print(net_act)
    print(net_crt)

    writer = SummaryWriter(comment="-acktr_" + args.name)
    agent = model.AgentA2C(net_act, device=device)
    exp_source = ptan.experience.ExperienceSourceFirstLast(
        envs, agent, GAMMA, steps_count=REWARD_STEPS)

    opt_act = kfac.KFACOptimizer(net_act, lr=LEARNING_RATE_ACTOR)
    opt_crt = optim.Adam(net_crt.parameters(), lr=LEARNING_RATE_CRITIC)

    batch = []
Example #7
                        default=False,
                        action="store_true",
                        help="enable cuda")

    args = parser.parse_args()
    device = torch.device(
        "cuda" if args.cuda and torch.cuda.is_available() else "cpu")
    writer = SummaryWriter(comment="sumo-ants-ppo")
    save_path = "/home/chenkehan/RESEARCH/codes/try/DL_RL/ppo_sumo_ants/save_train_data"

    env = gym.make(ENV_ID)
    test_env = gym.make(ENV_ID)

    obs_shape = env.observation_space.spaces[1].shape[0]
    action_shape = env.action_space.spaces[1].shape[0]
    net_act = model.ModelActor(obs_shape, action_shape).to(device)
    net_crt = model.ModelCritic(obs_shape).to(device)
    print(net_act)
    print(net_crt)

    agent = model.AgentA2C(net_act, device=device)
    exp_source = experience.MAExperienceSource(env, agent, steps_count=1)

    opt_act = optim.Adam(net_act.parameters(), lr=LEARNING_RATE_ACTOR)
    opt_crt = optim.Adam(net_crt.parameters(), lr=LEARNING_RATE_CRITIC)

    trajectory = []
    best_reward = None

    for step_idx, exp in enumerate(exp_source):
        reward_steps = exp_source.pop_rewards_steps()
Example #8

    parser = argparse.ArgumentParser()
    parser.add_argument("-m", "--model", required=True, help="Model file to load")
    parser.add_argument("-e", "--env", default=ENV_ID,
                        help="Environment name to use, default=" + ENV_ID)
    parser.add_argument(
        "-r", "--record", help="If specified, sets the recording dir, default=Disabled")
    parser.add_argument("--eval", default=False, action='store_true', help='Evaluates Agent')
    args = parser.parse_args()

    reward_eval_env = gym.make(args.env)

    env = gym.make(args.env)
    if args.record:
        env = gym.wrappers.Monitor(env, args.record, force=True)

    net = model.ModelActor(env.observation_space.shape[0], env.action_space.shape[0],
                           HID_SIZE, ACTF)
    net.load_state_dict(torch.load(args.model))
    agent = model.Agent(net, FIXED_SIGMA_VALUE, BETA)

    if args.eval:
        print("Evaluating Agent...")
        rewards = 0.0
        steps = 0
        for _ in range(100):
            obs = reward_eval_env.reset()
            while True:
                obs_v = torch.FloatTensor([obs])
                mu_v = agent.get_actions_deterministic(obs_v)
                action = mu_v.squeeze(dim=0).data.cpu().numpy()
                obs, reward, done, _ = reward_eval_env.step(action)
                rewards += reward
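
The evaluation loop above is truncated before the episode ends. A compact sketch of the same idea as a standalone helper that averages the return over a number of episodes; evaluate and its act_fn argument are illustrative stand-ins for the agent.get_actions_deterministic call used above.

import torch

def evaluate(act_fn, env, episodes=100):
    # Average episodic return and length of a deterministic policy.
    total_reward, total_steps = 0.0, 0
    for _ in range(episodes):
        obs = env.reset()
        while True:
            obs_v = torch.FloatTensor([obs])
            action = act_fn(obs_v).squeeze(dim=0).data.cpu().numpy()
            obs, reward, done, _ = env.step(action)
            total_reward += reward
            total_steps += 1
            if done:
                break
    return total_reward / episodes, total_steps / episodes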
Example #9
                        action='store_true',
                        help='Enable CUDA')
    parser.add_argument("-n", "--name", required=True, help="Name of the run")
    parser.add_argument("-e",
                        "--env",
                        default=ENV_ID,
                        help="Environment id, default=" + ENV_ID)
    args = parser.parse_args()

    save_path = os.path.join("saves", "a2c-" + args.name)
    os.makedirs(save_path, exist_ok=True)

    envs = [gym.make(args.env) for _ in range(ENVS_COUNT)]
    test_env = gym.make(args.env)

    net_act = model.ModelActor(envs[0].observation_space.shape[0],
                               envs[0].action_space.shape[0])
    net_crt = model.ModelCritic(envs[0].observation_space.shape[0])
    if args.cuda:
        net_act.cuda()
        net_crt.cuda()
    print(net_act)
    print(net_crt)

    writer = SummaryWriter(comment="-a2c_" + args.name)
    agent = model.AgentA2C(net_act, cuda=args.cuda)
    exp_source = ptan.experience.ExperienceSourceFirstLast(
        envs, agent, GAMMA, steps_count=REWARD_STEPS)

    opt_act = optim.Adam(net_act.parameters(), lr=LEARNING_RATE_ACTOR)
    opt_crt = optim.Adam(net_crt.parameters(), lr=LEARNING_RATE_CRITIC)
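
All of the examples build their networks with model.ModelActor and model.ModelCritic without showing those classes. Below is a sketch of definitions consistent with how the continuous-control examples use them: the actor returns the mean of a Gaussian policy and carries a learned, state-independent logstd (referenced as net_act.logstd above), while the critic returns a scalar state value. The hidden size and activations are assumptions; some examples pass extra constructor arguments (hidden size, activation), and Example #5, which feeds an image observation shape and a discrete action count, would need a convolutional variant instead.

import torch
import torch.nn as nn

class ModelActor(nn.Module):
    # Mean of a diagonal Gaussian policy; log std is a learned parameter.
    def __init__(self, obs_size, act_size, hid_size=64):
        super(ModelActor, self).__init__()
        self.mu = nn.Sequential(
            nn.Linear(obs_size, hid_size), nn.Tanh(),
            nn.Linear(hid_size, hid_size), nn.Tanh(),
            nn.Linear(hid_size, act_size), nn.Tanh(),
        )
        self.logstd = nn.Parameter(torch.zeros(act_size))

    def forward(self, x):
        return self.mu(x)

class ModelCritic(nn.Module):
    # Scalar state-value estimate V(s) used for advantages and value targets.
    def __init__(self, obs_size, hid_size=64):
        super(ModelCritic, self).__init__()
        self.value = nn.Sequential(
            nn.Linear(obs_size, hid_size), nn.ReLU(),
            nn.Linear(hid_size, hid_size), nn.ReLU(),
            nn.Linear(hid_size, 1),
        )

    def forward(self, x):
        return self.value(x)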