Example #1
    parser = argparse.ArgumentParser()
    parser.add_argument("--cuda", default=False,
                        action='store_true',
                        help='Enable CUDA')
    parser.add_argument("--name", required=True, help="Name of the run")
    parser.add_argument("-n", type=int, required=True, help="Unroll parameter")
    args = parser.parse_args()
    device = torch.device("cuda" if args.cuda else "cpu")

    save_path = os.path.join("saves", "ddpg-" + args.name)
    os.makedirs(save_path, exist_ok=True)

    env = gym.make(ENV_ID)
    test_env = gym.make(ENV_ID)

    act_net = model.DDPGActor(env.observation_space.shape[0],
                              env.action_space.shape[0]).to(device)
    crt_net = model.DDPGCritic(env.observation_space.shape[0],
                               env.action_space.shape[0]).to(device)
    print(act_net)
    print(crt_net)
    tgt_act_net = ptan.agent.TargetNet(act_net)
    tgt_crt_net = ptan.agent.TargetNet(crt_net)

    writer = SummaryWriter(comment="-ddpg_" + args.name)
    agent = model.AgentDDPG(act_net, device=device)
    exp_source = ptan.experience.ExperienceSourceFirstLast(env,
                                                           agent,
                                                           gamma=GAMMA,
                                                           steps_count=args.n)
    buffer = ptan.experience.ExperienceReplayBuffer(exp_source,
                                                    buffer_size=REPLAY_SIZE)
    act_opt = optim.Adam(act_net.parameters(), lr=LEARNING_RATE)
    crt_opt = optim.Adam(crt_net.parameters(), lr=LEARNING_RATE)
            steps += 1
            # If the episode is done, move on to the next test episode
            if is_done:
                break

    # Average reward and step count over the test episodes
    return rewards / count, steps / count
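
The listing for Example #1 stops at the optimizer setup and the tail of the evaluation loop; the training iteration itself is not shown. The sketch below illustrates one possible DDPG update built on the objects defined above (replay buffer, target networks, optimizers). It assumes the usual ptan conventions (ExperienceSourceFirstLast transitions whose last_state is None at episode end, TargetNet.alpha_sync for soft updates), and it assumes the critic's forward pass takes (states, actions) while the actor returns actions directly; it is a minimal illustration, not the original training loop.

import numpy as np
import torch
import torch.nn.functional as F

def ddpg_train_step(buffer, act_net, crt_net, tgt_act_net, tgt_crt_net,
                    act_opt, crt_opt, batch_size, gamma, device="cpu"):
    # Pull one fresh transition from the experience source into the buffer
    buffer.populate(1)
    if len(buffer) < batch_size:
        return

    # Unpack a sampled batch of ExperienceFirstLast transitions
    batch = buffer.sample(batch_size)
    states_v = torch.tensor(np.array([e.state for e in batch], dtype=np.float32)).to(device)
    actions_v = torch.tensor(np.array([e.action for e in batch], dtype=np.float32)).to(device)
    rewards_v = torch.tensor(np.array([e.reward for e in batch], dtype=np.float32)).to(device)
    dones_mask = torch.BoolTensor([e.last_state is None for e in batch]).to(device)
    last_states_v = torch.tensor(np.array(
        [e.last_state if e.last_state is not None else e.state for e in batch],
        dtype=np.float32)).to(device)

    # Critic update: regress Q(s, a) towards the bootstrapped target r + gamma * Q'(s', mu'(s'))
    crt_opt.zero_grad()
    q_v = crt_net(states_v, actions_v)
    last_act_v = tgt_act_net.target_model(last_states_v)
    q_last_v = tgt_crt_net.target_model(last_states_v, last_act_v)
    q_last_v[dones_mask] = 0.0
    q_ref_v = rewards_v.unsqueeze(dim=-1) + q_last_v * gamma
    critic_loss_v = F.mse_loss(q_v, q_ref_v.detach())
    critic_loss_v.backward()
    crt_opt.step()

    # Actor update: push mu(s) towards actions with higher Q(s, mu(s))
    act_opt.zero_grad()
    cur_actions_v = act_net(states_v)
    actor_loss_v = -crt_net(states_v, cur_actions_v).mean()
    actor_loss_v.backward()
    act_opt.step()

    # Soft-update the target networks towards the online networks
    tgt_act_net.alpha_sync(alpha=1 - 1e-3)
    tgt_crt_net.alpha_sync(alpha=1 - 1e-3)

With n-step unrolls (steps_count greater than one), gamma in the target would normally be raised to the unroll length.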

# Create replay buffer auxiliary structures
Experience = namedtuple('Experience', field_names=['state', 'action', 'reward', 'last_state', 'done'])
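
The second example replaces ptan's replay buffer with a plain Python list of Experience tuples. The helpers below are a hypothetical sketch of how such a buffer could be filled and sampled; the function names are illustrative, and the REPLAY_SIZE capacity constant from Example #1 is assumed to be available here as well.

import random as rnd

def store_transition(buffer, state, action, reward, last_state, done):
    # Append the transition and keep the buffer bounded by the assumed REPLAY_SIZE capacity
    buffer.append(Experience(state, action, reward, last_state, done))
    if len(buffer) > REPLAY_SIZE:
        buffer.pop(0)

def sample_batch(buffer, batch_size):
    # Uniformly sample a minibatch and regroup it field by field
    batch = rnd.sample(buffer, batch_size)
    return Experience(*zip(*batch))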

# Initialize simulator
sim = simulator.Agent(random(), random())

# Initialize networks and intelligent agents
act_net = model.DDPGActor(OBSERVATION_SPACE, ACTION_SPACE).to(device)
crt_net = model.DDPGCritic(OBSERVATION_SPACE, ACTION_SPACE).to(device)
tgt_act_net = ptan.agent.TargetNet(act_net)
tgt_crt_net = ptan.agent.TargetNet(crt_net)

agent = model.AgentDDPG(act_net, device=device)

act_opt = optim.Adam(act_net.parameters(), lr=LEARNING_RATE)
crt_opt = optim.Adam(crt_net.parameters(), lr=LEARNING_RATE)

# Sample a discrete action from a softmax distribution over the values,
# keeping at least MIN_PROB_EXPLORATION probability on every action
def softmax_function(values):
    values = np.asarray(values)
    probs = np.array([max(MIN_PROB_EXPLORATION, np.exp(value) / np.exp(values).sum())
                      for value in values])
    return np.random.choice(len(probs), p=probs / probs.sum())
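
For illustration, the sampler can be called with any vector of action preferences, for example per-action value estimates; the numbers below are made up and not taken from the original script.

# Hypothetical call: sample one of three discrete actions from made-up scores
action_idx = softmax_function(np.array([1.2, 0.3, -0.5]))
print("sampled action:", action_idx)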

buffer = []     # plain-list experience replay buffer
iteration = 0   # training iteration counter