Example #1
import numpy as np

import DQN

NUM_EPISODE = 10
NUM_STEP = 100

if __name__ == "__main__":

    mainQN = DQN.QNetwork(debug_log=True)
    memory = DQN.Memory(max_size=1000)
    #actor = DQN.Actor()

    for episode in range(NUM_EPISODE):

        print('episode {}'.format(episode))

        state = np.random.rand(16 * 16 * 8).reshape(1, 16, 16, 8)
        #action1 = [7, 7]
        #action2 = [8, 8]

        for step in range(NUM_STEP):

            #action, _ = actor.get_action(state, step, mainQN, 'r', action1, action2, 1, True, False, False)
            action = np.array([0, 0])

            if step == NUM_STEP - 1:
                next_state = np.zeros((1, 16, 16, 8))
                reward = 1.0
            else:
                next_state = np.random.rand(16 * 16 * 8).reshape(1, 16, 16, 8)
                reward = 0.0  # only the final step carries a reward in this example
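
The DQN module used above is project-specific, so its Memory class is not shown here. As a rough self-contained sketch, and an assumption rather than the module's actual API (the add/sample names are illustrative), a replay buffer matching the Memory(max_size=...) usage could look like this:

from collections import deque
import random

class ReplayMemory:
    """Minimal experience-replay buffer (illustrative sketch only)."""

    def __init__(self, max_size=1000):
        self.buffer = deque(maxlen=max_size)  # oldest transitions are dropped first

    def add(self, experience):
        # experience: a (state, action, reward, next_state) tuple
        self.buffer.append(experience)

    def sample(self, batch_size):
        # uniform random mini-batch for experience replay
        return random.sample(list(self.buffer), min(batch_size, len(self.buffer)))

    def __len__(self):
        return len(self.buffer)
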
Example #2
solved_reward = 230  # stop training if avg_reward > solved_reward
log_interval = 20  # print avg reward in the interval
max_episodes = 50000  # max training episodes
max_timesteps = 3000  # max timesteps in one episode
n_latent_var = 64  # number of variables in hidden layer
update_timestep = 200  # update policy every n timesteps

# Change these first
lr = 0.002
betas = (0.9, 0.999)
gamma = 0.99  # discount factor
K_epochs = 4  # update policy for K epochs
eps_clip = 0.2  # clip parameter for PPO
#############################################
# print(dir(DQN))
memory = DQN.Memory()
model = DQN.DQN(state_dim, action_dim, n_latent_var, lr, betas, gamma,
                K_epochs, eps_clip)

# memory = PPO.Memory()
# model = PPO.PPO(state_dim, action_dim, n_latent_var, lr, betas, gamma, K_epochs, eps_clip)
print("About to load model...")
if loadingBool:
    try:
        print(file)
        ## DQN
        model.policy_net.load_state_dict(torch.load(file))
        model.target_net.load_state_dict(torch.load(file))
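        # The same checkpoint file is loaded into both the policy network and
        # the target network, so the two start out with identical weights.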

        ## PPO
        # model.policy.load_state_dict(torch.load(file))