Example #1
    parser.add_argument("-n", "--name", required=True, help="Name of the run")
    args = parser.parse_args()
    device = torch.device("cuda" if args.cuda else "cpu")

    save_path = os.path.join("saves", "d4pg-" + args.name)
    os.makedirs(save_path, exist_ok=True)

    # Wrap the Unity environment in a Gym environment
    channel = EngineConfigurationChannel()
    unity_env = UnityEnvironment(ENV_ID, seed=1, side_channels=[channel])
    channel.set_configuration_parameters(time_scale=20.0)
    env = UnityToGymWrapper(unity_env)

    # Create the actor and critic networks following the D4PG architecture
    act_net = model.D4PGActor(env.observation_space.shape[0], env.action_space.shape[0]).to(device)
    crt_net = model.D4PGCritic(env.observation_space.shape[0], env.action_space.shape[0], N_ATOMS, Vmin, Vmax).to(device)
    print(act_net)
    print(crt_net)
    tgt_act_net = ptan.agent.TargetNet(act_net)
    tgt_crt_net = ptan.agent.TargetNet(crt_net)

    # Create the agent with the PTAN library and the experience replay buffer
    writer = SummaryWriter(comment="-d4pg_" + args.name)
    agent = model.AgentD4PG(act_net, device=device)
    exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=GAMMA, steps_count=REWARD_STEPS)
    buffer = ptan.experience.ExperienceReplayBuffer(exp_source, buffer_size=REPLAY_SIZE)
    act_opt = optim.Adam(act_net.parameters(), lr=LEARNING_RATE)
    crt_opt = optim.Adam(crt_net.parameters(), lr=LEARNING_RATE)

    frame_idx = 0
    best_reward = None
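
Example #1 assumes a model module that provides D4PGActor and D4PGCritic. Below is a minimal sketch of what those classes could look like, following the usual D4PG layout (a deterministic tanh actor and a critic that outputs logits over N_ATOMS value atoms); the layer sizes and the distr_to_q() helper are assumptions, not the original module.

import torch
import torch.nn as nn
import torch.nn.functional as F


class D4PGActor(nn.Module):
    """Deterministic policy: maps an observation to an action in [-1, 1]."""
    def __init__(self, obs_size, act_size):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(obs_size, 400), nn.ReLU(),
            nn.Linear(400, 300), nn.ReLU(),
            nn.Linear(300, act_size), nn.Tanh(),
        )

    def forward(self, x):
        return self.net(x)


class D4PGCritic(nn.Module):
    """Distributional critic: returns logits over n_atoms return atoms."""
    def __init__(self, obs_size, act_size, n_atoms, v_min, v_max):
        super().__init__()
        self.obs_net = nn.Sequential(nn.Linear(obs_size, 400), nn.ReLU())
        self.out_net = nn.Sequential(
            nn.Linear(400 + act_size, 300), nn.ReLU(),
            nn.Linear(300, n_atoms),
        )
        delta = (v_max - v_min) / (n_atoms - 1)
        # Fixed support of the value distribution, moved to the device together with the module.
        self.register_buffer("supports", torch.arange(v_min, v_max + delta, delta))

    def forward(self, x, a):
        obs = self.obs_net(x)
        return self.out_net(torch.cat([obs, a], dim=1))

    def distr_to_q(self, distr):
        # Expected value of the categorical distribution, used as the Q estimate.
        weights = F.softmax(distr, dim=1) * self.supports
        return weights.sum(dim=1).unsqueeze(dim=-1)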
Example #2
def main():

    env = KukaGymEnv(renders=True, isDiscrete=False, maxSteps=10000000)
    save_path = os.path.join("saves", "ddpg-")
    os.makedirs(save_path, exist_ok=True)

    device = torch.device("cuda")
    act_net = model.DDPGActor(env.observation_space.shape[0],
                              env.action_space.shape[0]).to(device)
    crt_net = model.D4PGCritic(env.observation_space.shape[0],
                               env.action_space.shape[0], N_ATOMS, Vmin,
                               Vmax).to(device)
    print(act_net)
    print(crt_net)
    tgt_act_net = common.TargetNet(act_net)
    tgt_crt_net = common.TargetNet(crt_net)
    writer = SummaryWriter(comment="-d4pg_")
    agent = model.AgentDDPG(act_net, device=device)
    exp_source = experience.ExperienceSourceFirstLast(env,
                                                      agent,
                                                      gamma=GAMMA,
                                                      steps_count=REWARD_STEPS)
    buffer = experience.ExperienceReplayBuffer(exp_source,
                                               buffer_size=REPLAY_SIZE)
    act_opt = optim.Adam(act_net.parameters(), lr=LEARNING_RATE)
    crt_opt = optim.Adam(crt_net.parameters(), lr=LEARNING_RATE)
    frame_idx = 0
    best_reward = None
    with common.RewardTracker(writer) as tracker:
        with common.TBMeanTracker(writer, batch_size=10) as tb_tracker:
            while True:
                frame_idx += 1
                #print("populate")
                buffer.populate(1)
                rewards_steps = exp_source.pop_rewards_steps()
                if rewards_steps:
                    rewards, steps = zip(*rewards_steps)
                    tb_tracker.track("episode_steps", steps[0], frame_idx)
                    tracker.reward(rewards[0], frame_idx)

                if len(buffer) < 100:
                    continue
                batch = buffer.sample(BATCH_SIZE)
                #print("infer")
                states_v, actions_v, rewards_v, dones_mask, last_states_v = common.unpack_batch_ddqn(
                    batch, device)
                #print("train critic")# train critic
                crt_opt.zero_grad()
                crt_distr_v = crt_net(states_v, actions_v)
                last_act_v = tgt_act_net.target_model(last_states_v)
                last_distr_v = F.softmax(tgt_crt_net.target_model(
                    last_states_v, last_act_v),
                                         dim=1)
                proj_distr_v = distr_projection(last_distr_v,
                                                rewards_v,
                                                dones_mask,
                                                gamma=GAMMA**REWARD_STEPS,
                                                device=device)
                prob_dist_v = -F.log_softmax(crt_distr_v, dim=1) * proj_distr_v
                critic_loss_v = prob_dist_v.sum(dim=1).mean()
                critic_loss_v.backward()
                crt_opt.step()
                tb_tracker.track("loss_critic", critic_loss_v, frame_idx)
                #print("train actor")
                # train actor
                act_opt.zero_grad()
                act_opt.zero_grad()
                cur_actions_v = act_net(states_v)
                crt_distr_v = crt_net(states_v, cur_actions_v)
                actor_loss_v = -crt_net.distr_to_q(crt_distr_v)
                actor_loss_v = actor_loss_v.mean()
                actor_loss_v.backward()
                act_opt.step()
                tb_tracker.track("loss_actor", actor_loss_v, frame_idx)
                tgt_act_net.alpha_sync(alpha=1 - 1e-3)
                tgt_crt_net.alpha_sync(alpha=1 - 1e-3)
                if frame_idx % TEST_ITERS == 0:
                    print("testing")
                    env.reset()
                    ts = time.time()
                    rewards, steps = test_net(act_net, env, device=device)
                    print("Test done in %.2f sec, reward %.3f, steps %d" %
                          (time.time() - ts, rewards, steps))
                    writer.add_scalar("test_reward", rewards, frame_idx)
                    writer.add_scalar("test_steps", steps, frame_idx)
                    if best_reward is None or best_reward < rewards:
                        if best_reward is not None:
                            print("Best reward updated: %.3f -> %.3f" %
                                  (best_reward, rewards))
                            name = "best_%+.3f_%d.dat" % (rewards, frame_idx)
                            fname = os.path.join(save_path, name)
                            torch.save(act_net.state_dict(), fname)
                        best_reward = rewards
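
Example #2 calls a distr_projection() helper that is not shown. The sketch below illustrates the categorical projection step from the C51/D4PG papers, assuming the call signature used above and the module-level constants N_ATOMS, Vmin and Vmax from this example; treat it as an illustration, not the original helper.

import numpy as np
import torch


def distr_projection(next_distr_v, rewards_v, dones_mask_t, gamma, device="cpu"):
    """Project the target value distribution onto the fixed atom support after the
    Bellman update z' = r + gamma * z, splitting probability mass between the two
    nearest atoms."""
    next_distr = next_distr_v.data.cpu().numpy()
    rewards = rewards_v.data.cpu().numpy()
    dones_mask = dones_mask_t.cpu().numpy().astype(bool)
    batch_size = len(rewards)
    delta_z = (Vmax - Vmin) / (N_ATOMS - 1)
    proj_distr = np.zeros((batch_size, N_ATOMS), dtype=np.float32)

    for atom in range(N_ATOMS):
        # Bellman-updated atom position, clipped to [Vmin, Vmax].
        tz_j = np.minimum(Vmax, np.maximum(Vmin, rewards + (Vmin + atom * delta_z) * gamma))
        b_j = (tz_j - Vmin) / delta_z
        l = np.floor(b_j).astype(np.int64)
        u = np.ceil(b_j).astype(np.int64)
        eq_mask = u == l
        proj_distr[eq_mask, l[eq_mask]] += next_distr[eq_mask, atom]
        ne_mask = u != l
        proj_distr[ne_mask, l[ne_mask]] += next_distr[ne_mask, atom] * (u - b_j)[ne_mask]
        proj_distr[ne_mask, u[ne_mask]] += next_distr[ne_mask, atom] * (b_j - l)[ne_mask]

    if dones_mask.any():
        # Terminal transitions collapse to a distribution concentrated at the reward.
        proj_distr[dones_mask] = 0.0
        tz_j = np.minimum(Vmax, np.maximum(Vmin, rewards[dones_mask]))
        b_j = (tz_j - Vmin) / delta_z
        l = np.floor(b_j).astype(np.int64)
        u = np.ceil(b_j).astype(np.int64)
        eq_mask = u == l
        eq_dones = dones_mask.copy()
        eq_dones[dones_mask] = eq_mask
        if eq_dones.any():
            proj_distr[eq_dones, l[eq_mask]] = 1.0
        ne_mask = u != l
        ne_dones = dones_mask.copy()
        ne_dones[dones_mask] = ne_mask
        if ne_dones.any():
            proj_distr[ne_dones, l[ne_mask]] = (u - b_j)[ne_mask]
            proj_distr[ne_dones, u[ne_mask]] = (b_j - l)[ne_mask]

    return torch.FloatTensor(proj_distr).to(device)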
Example #3
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--cuda",
                        default=False,
                        action='store_true',
                        help='Enable CUDA')
    parser.add_argument("-n", "--name", required=True, help="Name of the run")
    args = parser.parse_args()
    device = torch.device(
        "cuda" if args.cuda and torch.cuda.is_available() else "cpu")

    save_path = os.path.join("saves", "d4pg-" + args.name)
    os.makedirs(save_path, exist_ok=True)

    env = gym.make(ENV_ID)
    test_env = gym.make(ENV_ID)

    act_net = model.D4PGActor(env.observation_space.shape[0],
                              env.action_space.shape[0]).to(device)
    crt_net = model.D4PGCritic(env.observation_space.shape[0],
                               env.action_space.shape[0], N_ATOMS, VMIN,
                               VMAX).to(device)
    tgt_act_net = ptan.agent.TargetNet(act_net)
    tgt_crt_net = ptan.agent.TargetNet(crt_net)

    writer = SummaryWriter(comment="-d4pg_" + args.name)

    agent = model.AgentD4PG(act_net, device=device)
    exp_source = ExperienceSourceFirstLast(env,
                                           agent,
                                           gamma=GAMMA,
                                           steps_count=REWARD_STEPS)
    buffer = PrioritizedReplayBuffer(exp_source, REPLAY_SIZE,
                                     PRIO_REPLAY_ALPHA)
    act_opt = optim.Adam(act_net.parameters(), lr=LEARNING_RATE)
    crt_opt = optim.Adam(crt_net.parameters(), lr=LEARNING_RATE)

    frame_idx = 0
    best_reward = None
    with RewardTracker(writer) as tracker:
        with TBMeanTracker(writer, batch_size=10) as tb_tracker:
            while True:
                frame_idx += 1
                buffer.populate(1)
                beta = min(
                    1.0,
                    BETA_START + frame_idx * (1.0 - BETA_START) / BETA_FRAMES)

                rewards_steps = exp_source.pop_rewards_steps()
                if rewards_steps:
                    rewards, steps = zip(*rewards_steps)
                    tb_tracker.track('episode_steps', steps[0], frame_idx)
                    tracker.reward(rewards[0], frame_idx)

                if len(buffer) < REPLAY_INITIAL:
                    continue

                batch, batch_indices, batch_weights = buffer.sample(
                    BATCH_SIZE, beta)
                actor_loss, critic_loss, sample_prios = calc_loss(
                    batch, batch_weights, act_net, crt_net, tgt_act_net,
                    tgt_crt_net, device)

                train(actor_loss, critic_loss, act_net, crt_net, tgt_act_net,
                      tgt_crt_net, act_opt, crt_opt, device)

                tb_tracker.track('loss_actor', actor_loss, frame_idx)
                tb_tracker.track('loss_critic', critic_loss, frame_idx)

                buffer.update_priorities(batch_indices,
                                         sample_prios.data.cpu().numpy())

                if frame_idx % TEST_ITERS == 0:
                    ts = time.time()
                    rewards, steps = test(act_net, test_env, device=device)
                    print('Test done in %.2f sec, reward %.3f, steps %d' %
                          (time.time() - ts, rewards, steps))
                    writer.add_scalar('test_reward', rewards, frame_idx)
                    writer.add_scalar('test_steps', steps, frame_idx)
                    if best_reward is None or best_reward < rewards:
                        if best_reward is not None:
                            print('Best reward updated: %.3f -> %.3f' %
                                  (best_reward, rewards))
                            name = 'best_%+.3f_%d.dat' % (rewards, frame_idx)
                            fname = os.path.join(save_path, name)
                            torch.save(act_net.state_dict(), fname)
                        best_reward = rewards

    writer.close()
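
Example #3 delegates the update step to calc_loss() and train() helpers that are not shown. A hedged sketch of how they could be implemented follows; it mirrors the inline update from Example #2, folds the importance-sampling weights from the prioritized buffer into the critic loss, and returns per-sample priorities. The helpers it relies on (unpack_batch_ddqn, distr_projection) and all details are assumptions, not the original code.

import torch
import torch.nn.functional as F


def calc_loss(batch, batch_weights, act_net, crt_net, tgt_act_net, tgt_crt_net, device="cpu"):
    # Assumes the same unpack_batch_ddqn() / distr_projection() helpers as in Example #2.
    states_v, actions_v, rewards_v, dones_mask, last_states_v = \
        unpack_batch_ddqn(batch, device)
    batch_weights_v = torch.tensor(batch_weights, dtype=torch.float32).to(device)

    # Critic: cross-entropy between the predicted and the projected target
    # distributions, weighted per sample by the importance-sampling weights.
    crt_distr_v = crt_net(states_v, actions_v)
    last_act_v = tgt_act_net.target_model(last_states_v)
    last_distr_v = F.softmax(tgt_crt_net.target_model(last_states_v, last_act_v), dim=1)
    proj_distr_v = distr_projection(last_distr_v, rewards_v, dones_mask,
                                    gamma=GAMMA ** REWARD_STEPS, device=device)
    prob_dist_v = -F.log_softmax(crt_distr_v, dim=1) * proj_distr_v
    critic_loss_each_v = prob_dist_v.sum(dim=1)
    critic_loss_v = (batch_weights_v * critic_loss_each_v).mean()
    # New priorities for the sampled transitions (small epsilon keeps them non-zero).
    sample_prios_v = critic_loss_each_v + 1e-5

    # Actor: maximize the expected Q value of the actor's own actions.
    cur_actions_v = act_net(states_v)
    actor_distr_v = crt_net(states_v, cur_actions_v)
    actor_loss_v = -crt_net.distr_to_q(actor_distr_v).mean()
    return actor_loss_v, critic_loss_v, sample_prios_v


def train(actor_loss_v, critic_loss_v, act_net, crt_net, tgt_act_net, tgt_crt_net,
          act_opt, crt_opt, device="cpu"):
    # Backpropagate the two losses separately, then soft-update the target networks.
    crt_opt.zero_grad()
    critic_loss_v.backward()
    crt_opt.step()
    act_opt.zero_grad()
    actor_loss_v.backward()
    act_opt.step()
    tgt_act_net.alpha_sync(alpha=1 - 1e-3)
    tgt_crt_net.alpha_sync(alpha=1 - 1e-3)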