Example #1
    envs = [wrap_dqn(gym.make(args.env)) for _ in range(ENVS_COUNT)]
    test_env = wrap_dqn(gym.make(args.env))

    net_act = model.ModelActor(envs[0].observation_space.shape,
                               envs[0].action_space.n).to(device)
    net_crt = model.ModelCritic(envs[0].observation_space.shape).to(device)
    print(net_act)
    print(net_crt)

    writer = SummaryWriter(comment="-acktr_" + args.name)
    agent = model.AgentA2C(net_act, device=device)
    exp_source = ptan.experience.ExperienceSourceFirstLast(
        envs, agent, GAMMA, steps_count=REWARD_STEPS)

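    # ACKTR-specific part: the actor is updated with the K-FAC optimizer,
    # while the critic uses plain Adam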
    opt_act = kfac.KFACOptimizer(net_act, lr=LEARNING_RATE_ACTOR)
    opt_crt = optim.Adam(net_crt.parameters(), lr=LEARNING_RATE_CRITIC)

    batch = []
    best_reward = None
    with ptan.common.utils.RewardTracker(writer) as tracker:
        with ptan.common.utils.TBMeanTracker(writer,
                                             batch_size=100) as tb_tracker:
            for step_idx, exp in enumerate(exp_source):
                rewards_steps = exp_source.pop_rewards_steps()
                if rewards_steps:
                    rewards, steps = zip(*rewards_steps)
                    tb_tracker.track("episode_steps", np.mean(steps), step_idx)
                    tracker.reward(np.mean(rewards), step_idx)

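                # periodically evaluate the current policy on test_env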
                if step_idx % TEST_ITERS == 0:
Example #2
    parser.add_argument("--acktr", action='store_true',
                        help="Enable Acktr-specific tweaks")
    args = parser.parse_args()
    get_link_state = rospy.ServiceProxy("/gazebo/get_link_state", GetLinkState)
    pitch = 0
    rospy.Subscriber('/Bobby/imu', Imu, get_angular_vel)

    counter = 0
    env = make_env(args)
    if args.record:
        env = wrappers.Monitor(env, args.record)

    net = model.ModelActor(env.observation_space.shape[0],
                           env.action_space.shape[0], args.hid)
    if args.acktr:
        opt = kfac.KFACOptimizer(net)
    net.load_state_dict(torch.load(args.model))

    obs = env.reset()
    total_reward = 0.0
    total_steps = 0

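    # roll out one episode, taking the actor's output (mu) as the action
    # and clipping it to [-1, 1]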
    while True:
        obs_v = torch.FloatTensor(obs)
        mu_v = net(obs_v)
        action = mu_v.squeeze(dim=0).data.numpy()
        action = np.clip(action, -1, 1)
        if np.isscalar(action):
            action = [action]
        obs, reward, done, _ = env.step(action)
        total_reward += reward
Example #3
    act_net = model.ModelActor(envs[0].observation_space.shape[0],
                               envs[0].action_space.shape[0]).to(device)
    crt_net = model.ModelCritic(envs[0].observation_space.shape[0]).to(device)
    print(act_net)
    print(crt_net)
    if args.act_model:
        act_net.load_state_dict(torch.load(args.act_model))
    if args.crt_model:
        crt_net.load_state_dict(torch.load(args.crt_model))

    writer = SummaryWriter(comment='-a2c_' + args.name)
    agent = model.AgentA2C(act_net, device)

    exp_source = drl.experience.ExperienceSourceFirstLast(
        envs, agent, gamma=GAMMA, steps_count=REWARD_STEP)

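    # KFACOptimizer needs the module itself (it registers hooks on the
    # network's layers), not a parameter iterator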
    act_optimizer = kfac.KFACOptimizer(act_net, lr=LEARNING_RATE_ACTOR)
    crt_optimizer = optim.Adam(crt_net.parameters(), lr=LEARNING_RATE_CRITIC)

    batch = []
    best_reward = None

    with drl.tracker.RewardTracker(writer) as tracker:
        with drl.tracker.TBMeanTracker(writer, 10) as tb_tracker:
            for step_idx, exp in enumerate(exp_source):

                rewards_steps = exp_source.pop_rewards_steps()
                if rewards_steps:
                    rewards, steps = zip(*rewards_steps)
                    tb_tracker.track("episode_steps", steps[0], step_idx)
                    tracker.reward(rewards[0], step_idx)