    def __init__(self):
        self.model = ActorCritic(num_actions=len(ACTIONS))
        self.model.to("cuda")

        self.worker = RolloutWorker(self, ENV_ID, NUM_WORKERS, T)

        self.train_history = dict()
        self.train_history['frames_trained'] = 0
        self.train_history['average_entropy'] = []
        self.train_history['average_values'] = []
Example #2
def train(num = 2000):
    agent = ActorCritic(env.observation_space.shape[0], env.action_space.n)
    # agent.load_model()

    steps = []
    for i_episode in range(num):
        old_observation = env.reset()
        old_action = agent.get_action(np.reshape(old_observation, [1, env.observation_space.shape[0]]))
        done = False
        step = 0

        while not done:
            step = step + 1
            # env.render()
            observation, reward, done, info = env.step(old_action)
            
            if done:
                reward = - 20

            td_error = agent.train_critic(reward, np.reshape(old_observation, [1, env.observation_space.shape[0]]), np.reshape(observation, [1, env.observation_space.shape[0]]))
            agent.train_actor(td_error, np.reshape(old_observation, [1, env.observation_space.shape[0]]), old_action)

            old_observation = observation
            old_action = agent.get_action(np.reshape(old_observation, [1, env.observation_space.shape[0]]))

            if done:
                steps.append(step)
                print("{}:{} steps".format(i_episode, step))
                agent.save_model()
                break
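The loop above assumes an ActorCritic whose train_critic returns the TD error that then scales the actor update. A minimal PyTorch sketch of that interface (layer sizes, learning rates and the name TDActorCritic are assumptions; get_action and model saving are omitted):

import torch
import torch.nn as nn
import torch.nn.functional as F

class TDActorCritic:
    # Hypothetical sketch of the train_critic / train_actor interface used above.
    def __init__(self, state_dim, n_actions, gamma=0.9, lr=1e-3):
        self.gamma = gamma
        self.actor = nn.Sequential(nn.Linear(state_dim, 32), nn.ReLU(),
                                   nn.Linear(32, n_actions))
        self.critic = nn.Sequential(nn.Linear(state_dim, 32), nn.ReLU(),
                                    nn.Linear(32, 1))
        self.actor_opt = torch.optim.Adam(self.actor.parameters(), lr=lr)
        self.critic_opt = torch.optim.Adam(self.critic.parameters(), lr=lr)

    def train_critic(self, reward, state, next_state):
        state = torch.as_tensor(state, dtype=torch.float32)
        next_state = torch.as_tensor(next_state, dtype=torch.float32)
        td_target = reward + self.gamma * self.critic(next_state).detach()
        td_error = td_target - self.critic(state)
        self.critic_opt.zero_grad()
        td_error.pow(2).mean().backward()
        self.critic_opt.step()
        return td_error.detach()

    def train_actor(self, td_error, state, action):
        state = torch.as_tensor(state, dtype=torch.float32)
        log_prob = F.log_softmax(self.actor(state), dim=-1)[0, action]
        # Policy-gradient step scaled by the critic's TD error.
        loss = -log_prob * td_error.squeeze()
        self.actor_opt.zero_grad()
        loss.backward()
        self.actor_opt.step()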
Example #3
    def __init__(self,
                 state_size,
                 action_size,
                 num_agents,
                 hidden_size=64,
                 lr=1e-4):
        self.model = ActorCritic(state_size,
                                 action_size,
                                 hidden_size=hidden_size)
        self.optimizer = Adam(self.model.parameters(), lr=lr)
        self.agents = [PPO_Agent() for _ in range(num_agents)]
Example #4
    def __init__(self, state_dim, action_dim, n_agents, lr, betas, gamma, K_epochs, eps_clip):
        self.lr = lr
        self.betas = betas
        self.gamma = gamma
        self.eps_clip = eps_clip
        self.K_epochs = K_epochs

        self.policy = ActorCritic(state_dim, action_dim, n_agents).to(device)
        self.optimizer = torch.optim.Adam(self.policy.parameters(), lr=lr, betas=betas)
        self.policy_old = ActorCritic(state_dim, action_dim, n_agents).to(device)
        self.policy_old.load_state_dict(self.policy.state_dict())

        self.MseLoss = nn.MSELoss()
Example #5
def main(args):

    training = int(args[1])
    test_interwal = int(args[2])
    load = int(args[3])

    env = gym.make('BipedalWalker-v2')
    memory = None

    if training == 1:
        memory = Memory(MAX_BUFFER)
        prepopulate_memory(memory, env)

    rewards = []
    start_time = time.time()
    max_reward = 0

    trainer = ActorCritic(env.observation_space.shape[0],
                          env.action_space.shape[0], memory, load)

    for episode in np.arange(MAX_EPISODES):
        if training == 1:
            env_run(env, episode, trainer, memory, True)
        if episode % test_interwal == 0:
            max_reward += env_run(env, episode, trainer, None, False)
            rewards.append(max_reward / ((episode / test_interwal) + 1))
    plt.plot(rewards)
    plt.show()
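prepopulate_memory is not shown in this example; a plausible stand-in (the Memory.add signature is an assumption) simply seeds the replay buffer with random-action transitions:

def prepopulate_memory(memory, env, n_steps=10000):
    # Assumed helper: fill the replay buffer with random-policy transitions.
    state = env.reset()
    for _ in range(n_steps):
        action = env.action_space.sample()
        next_state, reward, done, _ = env.step(action)
        memory.add(state, action, reward, next_state, done)  # assumed Memory API
        state = env.reset() if done else next_state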
Example #6
def train(num = 500):
    agent = ActorCritic(env.observation_space.shape[0], [-A_BOUND, A_BOUND])
    # agent.load_model()

    steps = []
    RENDER = False
    for i_episode in range(num):
        old_observation = env.reset()
        old_action = agent.get_action(np.reshape(old_observation, [1, env.observation_space.shape[0]]))
        done = False
        step = 0
        ep_r = 0
        while not done:
            step = step + 1
            if RENDER:
                env.render()
            observation, reward, done, info = env.step(old_action)
            
            reward /= 10

            td_error = agent.train_critic(reward, np.reshape(old_observation, [1, env.observation_space.shape[0]]), np.reshape(observation, [1, env.observation_space.shape[0]]))
            agent.train_actor(td_error, np.reshape(old_observation, [1, env.observation_space.shape[0]]), old_action)

            old_observation = observation
            old_action = agent.get_action(np.reshape(old_observation, [1, env.observation_space.shape[0]]))
            ep_r += reward
            if done:
                print("{} {}".format(i_episode, ep_r))
                if ep_r > -50:
                    RENDER = True
                break
Example #7
def main():
    env = gym.make('CartPole-v1')
    model = ActorCritic(LEARNING_RATE, GAMMA)
    score = 0.0
    print_interval = 20
    for n_epi in range(10000):
        s = env.reset()
        done = False

        while not done:
            for i in range(n_rollout):
                prob = model.pi(torch.from_numpy(s).float())
                m = Categorical(prob)
                a = m.sample().item()

                s_prime, r, done, info = env.step(a)
                model.put_data((s, a, r, s_prime, done))

                s = s_prime
                score += r
                if done:
                    break
            model.train_net()

        if n_epi % print_interval == 0 and n_epi != 0:
            print("# of episode :{}, avg score: {}".format(n_epi, score / print_interval))
            score = 0.0
    env.close()
Example #8
def test(num = 500):
    agent = ActorCritic(env.observation_space.shape[0], env.action_space.n)
    agent.load_model()

    steps = []
    for i_episode in range(num):
        old_observation = env.reset()
        old_action = agent.get_action(np.reshape(old_observation, [1, env.observation_space.shape[0]]))
        done = False
        step = 0

        while not done:
            step = step + 1
            # env.render()
            observation, reward, done, info = env.step(old_action)
            
            if done:
                reward = - 20

            old_observation = observation
            old_action = agent.get_action(np.reshape(old_observation, [1, env.observation_space.shape[0]]))

            if done:
                steps.append(step)
                print("{}:{} steps".format(i_episode, step))
                break
            
        # if the average number of steps over the last 200 episodes reaches the
        # threshold, we consider the method to have solved the environment
        if len(steps) > 200 and sum(steps[-200:])/200 >=195:
            print(sum(steps[-200:])/200)
            break
Example #9
def agent(net_params_queue, exp_queues, config, id):
    torch.set_num_threads(1)

    env = GymEnv(env_id=id, config=config)

    net = ActorCritic(False, config)
    send_rate_list = config['sending_rate']
    default_bwe_idx = config['default_bwe']

    # keep experiencing RTC episodes unless forced to stop
    while True:
        env.reset()
        action = default_bwe_idx
        bwe = send_rate_list[action]
        s_batch = []
        a_batch = []
        r_batch = []
        entropy_batch = []

        done = False
        actor_network_params = net_params_queue.get()
        for target_param, source_param in zip(net.ActorNetwork.parameters(),
                                              actor_network_params):
            target_param.data.copy_(source_param.data)
        # todo: agent interacts with the gym environment
        while not done:
            state, reward, done, _ = env.step(
                bwe)  # todo: the shape of state needs to be regulated

            r_batch.append(reward)

            action, entropy = net.predict(state)
            bwe = send_rate_list[action]
            s_batch.append(state)
            a_batch.append(action)
            entropy_batch.append(entropy)
        # ignore the first bwe and state since we don't have the ability to control it
        exp_queues.put(
            [s_batch[1:], a_batch[1:], r_batch[1:], done, entropy_batch[1:]])
Example #10
def Evaluation(Eseed, lseed):
    seed = Eseed + 10  # change the seed for each trial (offset so it differs from the training seed)
    ######### Parameters #########
    env_name = "Pendulum-v0"
    save_interval = 10
    lr = 3 * pow(10, -4)
    gamma = 0.99  # discount factor
    batch_size = 256
    max_timesteps = 200
    max_episodes = 500  # maximum number of episodes
    num_step = max_timesteps * max_episodes
    save_step = save_interval * max_timesteps
    directory = "./preTrained/actorcritic/{}".format(
        env_name)  # save trained models
    filename = "ActorCritic_{}_{}".format(env_name, lseed)
    #############################

    env = gym.make(env_name)
    env.seed(seed)
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])
    min_action = float(env.action_space.low[0])

    policy = ActorCritic(lr, state_dim, action_dim, max_action, min_action,
                         batch_size, gamma)

    temp = np.zeros(1)
    sumR_ave = 0.0

    for st in range(save_step, num_step + 1, save_step):  # +1 so the final checkpoint is also loaded
        policy.load_models(directory, filename, st)
        sumR = 0.0
        for e in range(10):  # average over 10 episodes
            state = env.reset()
            action = policy.select_action(state)

            for s in range(200):  #not done
                #env.render()
                action = policy.select_action(state)
                next_state, reward, done, _ = env.step(action)
                sumR += reward
                state = next_state

        sumR_ave = sumR / 10
        if st == save_step:
            temp = sumR_ave
        else:
            temp = np.vstack((temp, sumR_ave))
    env.close()
    return temp
Example #11
    def __init__(self, writer, state_dim=172, action_dim=5, n_latent_var=512, lr=3e-4, betas=(0.9, 0.999),
                 gamma=0.99, ppo_epochs=3, icm_epochs=1, eps_clip=0.2, ppo_batch_size=128,
                 icm_batch_size=16, intr_reward_strength=0.02, lamb=0.95, device='cpu'):
        self.lr = lr
        self.betas = betas
        self.gamma = gamma
        self.lambd = lamb
        self.eps_clip = eps_clip
        self.ppo_epochs = ppo_epochs
        self.icm_epochs = icm_epochs
        self.ppo_batch_size = ppo_batch_size
        self.icm_batch_size = icm_batch_size
        self.intr_reward_strength = intr_reward_strength
        self.device = device
        self.writer = writer
        self.timestep = 0
        self.icm = ICM(activation=Swish()).to(self.device)

        self.policy = ActorCritic(state_dim=state_dim,
                                  action_dim=action_dim,
                                  n_latent_var=n_latent_var,
                                  activation=Swish(),
                                  device=self.device,
                                  ).to(self.device)
        self.policy_old = ActorCritic(state_dim,
                                      action_dim,
                                      n_latent_var,
                                      activation=Swish(),
                                      device=self.device
                                      ).to(self.device)

        self.optimizer = torch.optim.Adam(self.policy.parameters(), lr=lr, betas=betas)
        self.optimizer_icm = torch.optim.Adam(self.icm.parameters(), lr=lr, betas=betas)
        self.policy_old.load_state_dict(self.policy.state_dict())

        self.MseLoss = nn.MSELoss(reduction='none')
Example #12
    def __init__(self):

        self.train_history = dict()

        if os.path.isfile("./checkpoints/model.pt"):
            self.model = torch.load("./checkpoints/model.pt")
            self.train_history['frames_trained'] = torch.load(
                "./checkpoints/frames_trained.pt")
            self.train_history['average_entropy'] = torch.load(
                "./checkpoints/average_entropy.pt")
            self.train_history['average_values'] = torch.load(
                "./checkpoints/average_values.pt")
            print("Model loaded from last checkpoint.")
        else:
            self.model = ActorCritic(num_actions=len(ACTIONS))
            self.train_history['frames_trained'] = torch.tensor(0)
            self.train_history['average_entropy'] = torch.tensor(
                [0], dtype=torch.float)
            self.train_history['average_values'] = torch.tensor(
                [0], dtype=torch.float)
            print("New model created.")
        self.model.to("cuda")

        self.worker = RolloutWorker(self, ENV_ID, NUM_WORKERS, T)
Example #13
def main(method='DQN', isTrain=True):
    global FPSCLOCK, DISPLAYSURF, GEMIMAGES, GAMESOUNDS, BASICFONT, BOARDRECTS, RL

    # import the reinforcement learning agent
    if method == "ActorCritic":
        from ActorCritic import ActorCritic
        RL = ActorCritic(getBlankBoard(), [BOARDSLOTS, 2])
    elif method == "DQN":
        from DQN import DQN
        observation = getFeatureVec(getBlankBoard())
        RL = DQN(observation, [BOARDSLOTS, 2])
    # Initial set up.
    pygame.init()
    FPSCLOCK = pygame.time.Clock()
    DISPLAYSURF = pygame.display.set_mode((WINDOWWIDTH, WINDOWHEIGHT))
    pygame.display.set_caption('Gemgem')
    BASICFONT = pygame.font.Font('freesansbold.ttf', 36)

    # Load the images
    GEMIMAGES = []
    for i in range(1, NUMGEMIMAGES + 1):
        gemImage = pygame.image.load('gem%s.png' % i)
        if gemImage.get_size() != (GEMIMAGESIZE, GEMIMAGESIZE):
            gemImage = pygame.transform.smoothscale(
                gemImage, (GEMIMAGESIZE, GEMIMAGESIZE))
        GEMIMAGES.append(gemImage)

    # Load the sounds.
    GAMESOUNDS = {}
    GAMESOUNDS['bad swap'] = pygame.mixer.Sound('badswap.wav')
    GAMESOUNDS['match'] = []
    for i in range(NUMMATCHSOUNDS):
        GAMESOUNDS['match'].append(pygame.mixer.Sound('match%s.wav' % i))

    # Create pygame.Rect objects for each board space to
    # do board-coordinate-to-pixel-coordinate conversions.
    BOARDRECTS = []
    for x in range(BOARDWIDTH):
        BOARDRECTS.append([])
        for y in range(BOARDHEIGHT):
            r = pygame.Rect(
                (XMARGIN + (x * GEMIMAGESIZE), YMARGIN + (y * GEMIMAGESIZE),
                 GEMIMAGESIZE, GEMIMAGESIZE))
            BOARDRECTS[x].append(r)

    runGame(300000000, isTrain)
Example #14
def trainAC(env, state_size, action_size, lr, n_agents, dim_act, dim_actprob,
            batch_size, setting):
    #n_agents, dim_obs, dim_act,dim_actprob, batch_size,device
    #ifload = setting["ifload"]#False
    n_iters = setting["iter"]
    AC = ActorCritic(n_agents, state_size, dim_act, dim_actprob, batch_size,
                     device, setting)
    step_n = 10
    for iter in range(n_iters):
        state = env.reset()
        state = np.stack(state)
        done = False
        for i in range(step_n):
            state = torch.FloatTensor(state).to(device)
            #action = dist.sample()
            dist, actions, log_probs, act_prob = AC.select_action(state)
            acts = [act.detach() for act in actions]
            obs_n, reward_n, _, _ = env.step(acts)
            #print(obs_n,"]]]]]]")
            if i == step_n - 1:
                done = True
            next_state = obs_n
            #entropy += dist.entropy().mean()
            AC.storeSample(state, log_probs, reward_n, 1 - done, acts)
            state = next_state

            if done:
                if iter % 20 == 0:
                    print('Iteration: {}, Score: {}'.format(
                        iter, np.sum(np.array(AC.rewards))))
                break

        next_state, thact = transTensor(next_state, acts, n_agents)
        for ag in range(AC.n_agents):
            next_value = AC.critics[ag](next_state, thact)
            AC.next_value.append(next_value)

        actloss = AC.update()
        if iter % 20 == 0:
            print("action_loss  ", actloss)
        ifsave = setting["ifsave"]  #True
        if iter % 250 == 0 and ifsave:
            torch.save(AC.actor,
                       "model_rule/actor_" + setting["actor_name"] + ".pkl")
            for j in range(AC.n_agents):
                torch.save(
                    AC.critics[j], "model_rule/critic_" +
                    setting["critic_name"] + str(j) + '.pkl')
    return AC
Example #15
def trainIters(n_iters):
    AC = ActorCritic(2, state_size, 1, 2, 32, device)
    step_n = 50
    for iter in range(n_iters):
        state = env.reset()
        state = np.stack(state)
        done = False
        for i in range(step_n):
            state = torch.FloatTensor(state).to(device)
            #action = dist.sample()
            dist, action, log_prob, act_prob = AC.select_action(state)
            acts = [action.detach() for ag in range(2)]
            obs_n, reward_n, _, _ = env.step(acts)

            if i == step_n - 1:
                done = True
            next_state = obs_n
            #entropy += dist.entropy().mean()
            AC.storeSample(state, log_prob, reward_n, 1 - done, acts)
            state = next_state

            if done:
                if iter % 20 == 0:
                    print('Iteration: {}, Score: {}'.format(
                        iter, np.sum(np.array(AC.rewards))))
                break

        next_state, thact = transTensor(next_state, acts)
        for ag in range(AC.n_agents):
            next_value = AC.critics[ag](next_state, thact)
            AC.next_value.append(next_value)

        actloss = AC.update()
        if iter % 20 == 0:
            print("action_loss  ", actloss)
        ifsave = False
        if iter % 250 == 0 and ifsave:
            torch.save(AC.actor, 'model/actor_v2.pkl')
            for j in range(AC.n_agents):
                torch.save(AC.critics[j], 'model/criticv2' + str(j) + '.pkl')
    return AC
Example #16
    'action_shape': action_dim,
    'action_scale': action_max,
    'tau': 1e-3
}

actor_dict = {
    'layer_sizes': [480, 360],
    'activation': 'selu',
    'pool_size': 2,
    'dropout_rate': 0.3,
    'use_bn': False,
    'use_do': True
}

# Create actor and critic objects based on environment information.
ActorObj = ActorCritic(actor_type=True, **ac_dict, lr=1e-3, **actor_dict)
CriticObj = ActorCritic(actor_type=False, **ac_dict, lr=1e-3)

# Make experience buffer and noise.
buffer_size = int(5e4)
BufferObj = IndividualBuffers(buffer_size, state_dim, action_dim)
NoiseObj = makeOUNoise(noise_type='none',
                       mu=np.zeros(action_dim),
                       sigma=np.full(action_dim, 0.2))

# Training arguments.
arg_dict = {
    'ActorObj': ActorObj,
    'CriticObj': CriticObj,
    'buffer': BufferObj,
    'noise': NoiseObj,
Example #17
def main():

    env = gym.make('Pendulum-v0')
    action_dim = env.action_space.shape[0]
    state_dim = env.observation_space.shape[0]
    agent = ActorCritic(state_dim, action_dim)
    state = env.reset()
    timestep_limit = min(250, env.spec.timestep_limit)
    print("timestep limit set to : ", timestep_limit)
    #timestep_limit = env.spec.timestep_limit   # For checking purposes; make it proper for run
    # Initial data build up
    done_flag = 0
    for i in range(REPLAY_MEMORY):

        if (done_flag == True):
            state = env.reset()

        action = env.action_space.sample()
        next_state, reward, done_flag, info = env.step(action)
        agent.append_memory(state, action, reward, next_state, done_flag)
        state = next_state

    print("Initial memory built!!")

    # Initial Training for a few steps
    for _ in range(5):
        agent.update_networks()
        agent.update_target_networks()

    print("Initial network performance = ", policy_evaluation(agent, env, 2))
    # =================================================================================

    print("******** Starting learning process *************")
    num_episodes = 5
    update_freq = 1  # update after how many steps (within each episode)
    print_freq = 1  # how often to print (episodes)

    performance = np.zeros(num_episodes)
    best_ep = 0
    best_agent = copy.deepcopy(agent)

    start_time = t.time()

    for ep in range(num_episodes):
        done_flag = 0
        state = env.reset()
        time = 0

        while (done_flag != True and time <= timestep_limit):
            actor_out = agent.learner.actor.predict(state.reshape(1, -1))[0]
            action = actor_out  # need to add exploration here
            next_state, reward, done_flag, _ = env.step(action)
            agent.append_memory(state, action, reward, next_state, done_flag)
            state = next_state

            if (time % update_freq == 0):
                agent.update_networks(epochs=5)
                #agent.update_target_networks()  --> Ideally I should update here, but it's way too slow.
                #print time, timestep_limit

            time += 1

        performance[ep] = policy_evaluation(agent, env, 5)

        # Update the target networks (I'll use a larger tau here)
        agent.update_target_networks(tau=0.01)

        if (ep % print_freq == 0):
            print("Now in episode: ", ep + 1, " of ", num_episodes)
            print("Agent performance = ", performance[ep])

        if (performance[ep] > performance[best_ep]):
            best_agent = copy.deepcopy(agent)
            best_ep = ep

    end_time = t.time()
    print("Total time", (end_time - start_time))
    plt.plot(performance[-100:])
    plt.show()
Example #18
    }

    actor_dict = {
        'layer_sizes': [480, 360],
        'activation': activation_name,
        'pool_size': 2,
        'dropout_rate': 0.5,
        'use_bn': False,
        'use_do': True,
        'DR': DR,
        'total_tasks': len(env_names),
        'cpack': False
    }

    # Create actor and critic objects based on environment information.
    ActorObj = ActorCritic(actor_type=True, **ac_dict, lr=1e-3, **actor_dict)

    # The critic object has a simpler version when PackNet is not applied to it,
    # which is only needed for multi-task PackNet.
    if not CPACK:
        CriticObj = ActorCritic(actor_type=False,
                                **ac_dict,
                                lr=1e-3,
                                cpack=CPACK)
    else:
        critic_dict = {
            'layer_sizes': [480, 360],
            'activation': activation_name,
            'pool_size': 2,
            'dropout_rate': 0.5,
            'use_bn': False,
Example #19
def single_agent():
    config = load_config()
    # num_agents = config['num_agents']
    torch.set_num_threads(1)

    env = GymEnv(config=config)
    env.reset()

    net = ActorCritic(True, config)
    net.ActorNetwork.init_params()
    net.CriticNetwork.init_params()

    bwe = config['sending_rate'][config['default_bwe']]

    i = 1
    s_batch = []
    r_batch = []
    a_batch = []

    # keep experiencing RTC episodes unless forced to stop
    ax = []
    ay = []
    plt.ion()
    while True:
        # todo: agent interacts with the gym environment
        state, reward, done, _ = env.step(bwe)

        r_batch.append(reward)

        action = net.predict(state)
        bwe = config['sending_rate'][action]
        a_batch.append(action)
        s_batch.append(state)

        # todo: need to be fixed
        if done:
            action = config['default_bwe']
            bwe = config['sending_rate'][action]
            # update network
            net.getNetworkGradient(s_batch, a_batch, r_batch, done)
            net.updateNetwork()
            print('Network update.')

            i += 1
            ax.append(i)
            # ay.append(entropy)
            ay.append(reward)
            plt.clf()
            plt.plot(ax, ay)
            plt.pause(0.1)
            # s_batch.append(np.zeros(config['state_dim'], config['state_length']))
            # a_batch.append(action)
            env.reset()
            print('Environment has been reset.')
            print('Epoch {}, Reward: {}'.format(i - 1, reward))
        if i % 100 == 0:
            # print('Current BWE: ' + str(bwe))
            torch.save(net.ActorNetwork.state_dict(),
                       config['model_dir'] + '/actor1_{}.pt'.format(str(i)))
            torch.save(net.CriticNetwork.state_dict(),
                       config['model_dir'] + '/critic13m_{}.pt'.format(str(i)))
            print('Model Restored.')
Example #20
class ICMPPO:
    def __init__(self, writer, state_dim=172, action_dim=5, n_latent_var=512, lr=3e-4, betas=(0.9, 0.999),
                 gamma=0.99, ppo_epochs=3, icm_epochs=1, eps_clip=0.2, ppo_batch_size=128,
                 icm_batch_size=16, intr_reward_strength=0.02, lamb=0.95, device='cpu'):
        self.lr = lr
        self.betas = betas
        self.gamma = gamma
        self.lambd = lamb
        self.eps_clip = eps_clip
        self.ppo_epochs = ppo_epochs
        self.icm_epochs = icm_epochs
        self.ppo_batch_size = ppo_batch_size
        self.icm_batch_size = icm_batch_size
        self.intr_reward_strength = intr_reward_strength
        self.device = device
        self.writer = writer
        self.timestep = 0
        self.icm = ICM(activation=Swish()).to(self.device)

        self.policy = ActorCritic(state_dim=state_dim,
                                  action_dim=action_dim,
                                  n_latent_var=n_latent_var,
                                  activation=Swish(),
                                  device=self.device,
                                  ).to(self.device)
        self.policy_old = ActorCritic(state_dim,
                                      action_dim,
                                      n_latent_var,
                                      activation=Swish(),
                                      device=self.device
                                      ).to(self.device)

        self.optimizer = torch.optim.Adam(self.policy.parameters(), lr=lr, betas=betas)
        self.optimizer_icm = torch.optim.Adam(self.icm.parameters(), lr=lr, betas=betas)
        self.policy_old.load_state_dict(self.policy.state_dict())

        self.MseLoss = nn.MSELoss(reduction='none')

    def update(self, memory, timestep):
        # Convert lists from memory to tensors
        self.timestep = timestep
        old_states = torch.stack(memory.states).to(self.device).detach()
        old_states = torch.transpose(old_states, 0, 1)
        old_actions = torch.stack(memory.actions).T.to(self.device).detach()
        old_logprobs = torch.stack(memory.logprobs).T.to(self.device).detach()

        # Finding s, n_s, a, done, reward:
        curr_states = old_states[:, :-1, :]
        next_states = old_states[:, 1:, :]
        actions = old_actions[:, :-1].long()
        rewards = torch.tensor(memory.rewards[:-1]).T.to(self.device).detach()
        mask = (~torch.tensor(memory.is_terminals).T.to(self.device).detach()[:, :-1]).type(torch.long)
        with torch.no_grad():
            intr_reward, _, _ = self.icm(actions, curr_states, next_states, mask)
        intr_rewards = torch.clamp(self.intr_reward_strength * intr_reward, 0, 1)

        self.writer.add_scalar('Mean_intr_reward_per_1000_steps',
                               intr_rewards.mean() * 1000,
                               self.timestep
                               )

        # Finding cumulative advantage
        with torch.no_grad():
            state_values = torch.squeeze(self.policy.value_layer(curr_states))
            next_state_values = torch.squeeze(self.policy.value_layer(next_states))
            td_target = (rewards + intr_rewards) / 2 + self.gamma * next_state_values * mask
            delta = td_target - state_values

            self.writer.add_scalar('maxValue',
                                   state_values.max(),
                                   timestep
                                   )
            self.writer.add_scalar('meanValue',
                                   state_values.mean(),
                                   self.timestep
                                   )

            advantage = torch.zeros(1, 16).to(self.device)
            advantage_lst = []
            for i in range(delta.size(1) - 1, -1, -1):
                delta_t, mask_t = delta[:, i], mask[:, i]
                advantage = delta_t + (self.gamma * self.lambd * advantage) * mask_t
                advantage_lst.insert(0, advantage)

            advantage_lst = torch.cat(advantage_lst, dim=0).T
            # Get local advantage to train value function
            local_advantages = state_values + advantage_lst
            # Normalizing the advantage
            advantages = (advantage_lst - advantage_lst.mean()) / (advantage_lst.std() + 1e-10)

        # Optimize policy for ppo epochs:
        epoch_surr_loss = 0
        for _ in range(self.ppo_epochs):
            indexes = np.random.permutation(actions.size(1))
            # Train PPO and icm
            for i in range(0, len(indexes), self.ppo_batch_size):
                batch_ind = indexes[i:i + self.ppo_batch_size]
                batch_curr_states = curr_states[:, batch_ind, :]
                batch_actions = actions[:, batch_ind]
                batch_mask = mask[:, batch_ind]
                batch_advantages = advantages[:, batch_ind]
                batch_local_advantages = local_advantages[:, batch_ind]
                batch_old_logprobs = old_logprobs[:, batch_ind]

                # Finding actions logprobs and states values
                batch_logprobs, batch_state_values, batch_dist_entropy = self.policy.evaluate(batch_curr_states,
                                                                                              batch_actions)

                # Finding the ratio (pi_theta / pi_theta__old):
                ratios = torch.exp(batch_logprobs - batch_old_logprobs.detach())

                # Apply linear decay, scaled by 16 because the agents batch is 16 long
                decay_epsilon = linear_decay_eps(self.timestep * 16)
                decay_beta = linear_decay_beta(self.timestep * 16)

                # Finding Surrogate Loss:
                surr1 = ratios * batch_advantages
                surr2 = torch.clamp(ratios, 1 - decay_epsilon, 1 + decay_epsilon) * batch_advantages
                loss = -torch.min(surr1, surr2) * batch_mask + \
                       0.5 * nn.MSELoss(reduction='none')(batch_state_values,
                                                           batch_local_advantages.detach()) * batch_mask - \
                       decay_beta * batch_dist_entropy * batch_mask
                loss = loss.mean()

                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()
                linear_decay_lr(self.optimizer, self.timestep * 16)

                epoch_surr_loss += loss.item()

        self._icm_update(self.icm_epochs, self.icm_batch_size, curr_states, next_states, actions, mask)
        self.writer.add_scalar('Lr',
                               self.optimizer.param_groups[0]['lr'],
                               self.timestep
        )
        self.writer.add_scalar('Surrogate_loss',
                               epoch_surr_loss / (self.ppo_epochs * (len(indexes) // self.ppo_batch_size + 1)),
                               self.timestep
        )

        # Copy new weights into old policy:
        self.policy_old.load_state_dict(self.policy.state_dict())

    def _icm_update(self, epochs, batch_size, curr_states, next_states, actions, mask):
        epoch_forw_loss = 0
        epoch_inv_loss = 0
        for _ in range(epochs):
            indexes = np.random.permutation(actions.size(1))
            for i in range(0, len(indexes), batch_size):
                batch_ind = indexes[i:i + batch_size]
                batch_curr_states = curr_states[:, batch_ind, :]
                batch_next_states = next_states[:, batch_ind, :]
                batch_actions = actions[:, batch_ind]
                batch_mask = mask[:, batch_ind]

                _, inv_loss, forw_loss = self.icm(batch_actions,
                                                  batch_curr_states,
                                                  batch_next_states,
                                                  batch_mask)
                epoch_forw_loss += forw_loss.item()
                epoch_inv_loss += inv_loss.item()
                unclip_intr_loss = 10 * (0.2 * forw_loss + 0.8 * inv_loss)

                # take gradient step
                self.optimizer_icm.zero_grad()
                unclip_intr_loss.backward()
                self.optimizer_icm.step()
                linear_decay_lr(self.optimizer_icm, self.timestep * 16)
        self.writer.add_scalar('Forward_loss',
                               epoch_forw_loss / (epochs * (len(indexes) // batch_size + 1)),
                               self.timestep
        )
        self.writer.add_scalar('Inv_loss',
                                epoch_inv_loss / (epochs * (len(indexes) // batch_size + 1)),
                                self.timestep
        )
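The advantage loop in update() above is the standard GAE recursion; the same computation written out over plain Python lists, for reference:

def gae_advantages(deltas, masks, gamma=0.99, lambd=0.95):
    # Backward recursion: A_t = delta_t + gamma * lambda * mask_t * A_{t+1}
    advantage = 0.0
    out = []
    for delta_t, mask_t in zip(reversed(deltas), reversed(masks)):
        advantage = delta_t + gamma * lambd * mask_t * advantage
        out.insert(0, advantage)
    return out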
Example #21
def main():
    
    env = gym.make('Pendulum-v0')
    action_dim = env.action_space.shape[0]
    state_dim = env.observation_space.shape[0]
    agent = ActorCritic(state_dim, action_dim)
    state = env.reset()
    timestep_limit = min(250, env.spec.timestep_limit)
    print("timestep limit set to : ", timestep_limit)
    #timestep_limit = env.spec.timestep_limit   # For checking purposes; make it proper for run
    # Initial data build up
    done_flag = 0
    for i in range(REPLAY_MEMORY):

        if (done_flag == True):
            state = env.reset()

        action = env.action_space.sample()
        next_state, reward, done_flag, info = env.step(action)
        agent.append_memory(state, action, reward, next_state, done_flag)
        state = next_state

    print("Initial memory built!!")

    # Initial Training for a few steps
    for _ in range(5):
        agent.update_networks()
        agent.update_target_networks()

    print("Initial network performance = ", policy_evaluation(agent, env, 2))
    # =================================================================================

    print("******** Starting learning process *************")
    num_episodes = 5
    update_freq = 1        # update after how many steps (within each episode)
    print_freq = 1         # how often to print (episodes)

    performance = np.zeros(num_episodes)
    best_ep = 0
    best_agent = copy.deepcopy(agent)

    start_time = t.time()

    for ep in range(num_episodes):
        done_flag = 0
        state = env.reset()
        time = 0
    
        while (done_flag!=True and time<=timestep_limit):
            actor_out = agent.learner.actor.predict(state.reshape(1,-1))[0]
            action = actor_out   # need to add exploration here
            next_state, reward, done_flag, _ = env.step(action)
            agent.append_memory(state, action, reward, next_state, done_flag)
            state = next_state

            if (time % update_freq == 0):
                agent.update_networks(epochs=5)
                #agent.update_target_networks()  --> Ideally I should update here, but it's way too slow.
                #print time, timestep_limit
    
            time += 1

        performance[ep] = policy_evaluation(agent, env, 5)

        # Update the target networks (I'll use a larger tau here)
        agent.update_target_networks(tau=0.01)


        if (ep % print_freq == 0):
            print("Now in episode: ", ep + 1, " of ", num_episodes)
            print("Agent performance = ", performance[ep])

        if (performance[ep] > performance[best_ep]):
            best_agent = copy.deepcopy(agent)
            best_ep = ep

    end_time = t.time()
    print("Total time", (end_time - start_time))
    plt.plot(performance[-100:])
    plt.show()
Example #22
def train(rank, args, shared_model, optimizer=None):
    torch.manual_seed(args.seed + rank)

    env = GeneralEnvironment('policy.mdl')

    model = ActorCritic()

    if optimizer is None:
        optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)

    model.train()

    state = env.reset()
    state = torch.Tensor(state)
    model.init_hidden(env.map_height, env.map_width)
    done = True

    episode_length = 0
    while True:
        # Sync with the shared model
        model.load_state_dict(shared_model.state_dict())

        values = []
        log_probs = []
        rewards = []
        entropies = []
        off_targets = []

        for step in range(args.num_steps):
            episode_length += 1
            value, logit = model(Variable(state.unsqueeze(0)))
            prob = F.softmax(logit)
            old_prob = prob

            # Zero out the probability of every tile not owned by the player
            army_map = state[0, ...]
            label_map = (army_map > 0)
            label_map = label_map.view(1, env.map_height, env.map_width)
            label_map = label_map.expand(8, env.map_height, env.map_width)
            label_map = label_map.contiguous()
            label_map = label_map.view(-1)
            # prob[~label_map] = 0
            prob = old_prob * Variable(label_map.float())
            # Penalize model for predicting off target tiles
            off_prob = old_prob * Variable((~label_map).float())
            off_targets.append(off_prob.sum(1))

            log_prob = F.log_softmax(logit)
            entropy = -(log_prob * prob).sum(1)
            entropies.append(entropy)

            action = prob.multinomial().data
            log_prob = log_prob.gather(1, Variable(action))

            state, reward, done, _ = env.step(action.numpy().flat[0])
            done = done or episode_length >= args.max_episode_length

            if done:
                episode_length = 0
                state = env.reset()
                model.init_hidden(env.map_height, env.map_width)

            state = torch.Tensor(state)
            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)

            if done:
                break

        R = torch.zeros(1, 1)
        if not done:
            value, _ = model(Variable(state.unsqueeze(0)))
            R = value.data

        values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        R = Variable(R)
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[i] + args.gamma * \
                values[i + 1].data - values[i].data
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                log_probs[i] * Variable(gae) - args.entropy_coef * entropies[i] + \
                args.off_tile_coef * off_targets[i]

        optimizer.zero_grad()
        loss = policy_loss + args.value_loss_coef * value_loss

        (loss).backward()
        torch.nn.utils.clip_grad_norm(model.parameters(), args.max_grad_norm)

        ensure_shared_grads(model, shared_model)
        optimizer.step()
        model.reset_hidden()
        gc.collect()
Example #23
def main():

    env = PendulumEnv()
    num_episodes = 5
    num_of_time_steps = 200

    #1
    policy1 = ActorCritic(env,
                          alpha_value=0.0001,
                          alpha_policy=0.01,
                          gamma=0.99,
                          sigma=1.5)
    reward_plot1 = train(env, policy1, num_episodes, num_of_time_steps)
    plt.figure()
    plt.plot(reward_plot1)
    plt.xlabel("Every 10th episode")
    plt.ylabel("Sum of rewards in the episode")
    plt.title("1 > Policy Step Size > Value Step Size")
    #plt.savefig('1-3_1.png')
    plt.show()

    #2
    policy2 = ActorCritic(env,
                          alpha_value=0.1,
                          alpha_policy=0.01,
                          gamma=0.99,
                          sigma=1.5)
    reward_plot2 = train(env, policy2, num_episodes, num_of_time_steps)
    plt.figure()
    plt.plot(reward_plot2)
    plt.xlabel("Every 10th episode")
    plt.ylabel("Sum of rewards in the episode")
    plt.title("1 > Policy Step Size < Value Step Size")
    #plt.savefig('1-3_2.png')
    plt.show()

    #3
    policy3 = ActorCritic(env,
                          alpha_value=0.001,
                          alpha_policy=0.001,
                          gamma=0.99,
                          sigma=1.5)
    reward_plot3 = train(env, policy3, num_episodes, num_of_time_steps)
    plt.figure()
    plt.plot(reward_plot3)
    plt.xlabel("Every 10th episode")
    plt.ylabel("Sum of rewards in the episode")
    plt.title("1 > Policy Step Size = Value Step Size")
    #plt.savefig('1-3_3.png')
    plt.show()

    #4
    policy4 = ActorCritic(env,
                          alpha_value=1,
                          alpha_policy=0.1,
                          gamma=0.99,
                          sigma=1.5)
    reward_plot4 = train(env, policy4, num_episodes, num_of_time_steps)
    plt.figure()
    plt.plot(reward_plot4)
    plt.xlabel("Every 10th episode")
    plt.ylabel("Sum of rewards in the episode")
    plt.title("Policy Step Size > Value Step Size > 1")
    plt.show()
    #plt.savefig('1-3_4.png')
    #5
    plt.figure()
    plt.plot(reward_plot1, label="1 > alpha_Policy > alpha_Value")
    plt.plot(reward_plot2, label="1 > alpha_Policy < alpha_Value")
    plt.plot(reward_plot3, label="1 > alpha_Policy = alpha_Value")
    plt.plot(reward_plot4, label="alpha_Policy > alpha_Value > 1")
    plt.title("Performance for different values of policy and value step size")
    plt.xlabel("Every 10th episode")
    plt.ylabel("Sum of rewards in the episode")
    plt.legend()
    #plt.savefig("all_plots.png")
    plt.show()
Example #24
parser.add_argument('--off-tile-coef',
                    type=float,
                    default=10,
                    help='weight to penalize bad movement')
parser.add_argument('--checkpoint-interval',
                    type=float,
                    default=None,
                    help='interval to save model')

if __name__ == '__main__':
    os.environ['OMP_NUM_THREADS'] = '1'

    args = parser.parse_args()

    env = GeneralEnvironment('2_epoch.mdl')
    shared_model = ActorCritic()

    shared_model.share_memory()

    if args.no_shared:
        optimizer = None
    else:
        optimizer = my_optim.SharedAdam(shared_model.parameters(), lr=args.lr)
        optimizer.share_memory()

    processes = []

    p = mp.Process(target=test, args=(args.num_processes, args, shared_model))
    p.start()
    processes.append(p)
Example #25
class PPO:
    def __init__(self, state_dim, action_dim, n_agents, lr, betas, gamma, K_epochs, eps_clip):
        self.lr = lr
        self.betas = betas
        self.gamma = gamma
        self.eps_clip = eps_clip
        self.K_epochs = K_epochs

        self.policy = ActorCritic(state_dim, action_dim, n_agents).to(device)
        self.optimizer = torch.optim.Adam(self.policy.parameters(), lr=lr, betas=betas)
        self.policy_old = ActorCritic(state_dim, action_dim, n_agents).to(device)
        self.policy_old.load_state_dict(self.policy.state_dict())

        self.MseLoss = nn.MSELoss()

    def update(self, memory):
        # Monte Carlo estimate of state rewards:
        rewards = []
        discounted_reward = [0, 0]
        for reward, is_terminal in zip(reversed(memory.rewards), reversed(memory.is_terminals)):
            if all(is_terminal):
                discounted_reward = [0, 0]
            elif is_terminal[0]:
                discounted_reward[0] = 0
            elif is_terminal[1]:
                discounted_reward[1] = 0
            discounted_reward[0] = reward[0] + self.gamma * discounted_reward[0]
            discounted_reward[1] = reward[1] + self.gamma * discounted_reward[1]
            # insert a copy: the same list object is mutated on later iterations
            rewards.insert(0, discounted_reward[:])

        # Normalizing the rewards:
        rewards = torch.tensor(rewards).to(device)
        rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-5)

        # convert list to tensor
        old_states = torch.stack(memory.states).to(device).detach()
        old_actions = torch.stack(memory.actions).to(device).detach()
        old_logprobs = torch.stack(memory.logprobs).to(device).detach()

        # Optimize policy for K epochs:
        for _ in range(self.K_epochs):
            # Evaluating old actions and values :
            logprobs, state_values, dist_entropy = self.policy.evaluate(old_states, old_actions)

            # Finding the ratio (pi_theta / pi_theta__old):
            ratios = torch.exp(logprobs - old_logprobs.detach())

            # Finding Surrogate Loss:
            advantages = rewards - state_values.detach()
            surr1 = ratios * advantages
            surr2 = torch.clamp(ratios, 1 - self.eps_clip, 1 + self.eps_clip) * advantages
            loss = -torch.min(surr1, surr2) + 0.5 * self.MseLoss(state_values, rewards) - 0.01 * dist_entropy

            # take gradient step
            self.optimizer.zero_grad()
            loss.mean().backward()
            self.optimizer.step()

        # Copy new weights into old policy:
        self.policy_old.load_state_dict(self.policy.state_dict())

        return loss.mean()
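update() reads five parallel lists from a rollout buffer; a minimal Memory of that shape (assumed here, matching only the attribute names the method uses) is enough to drive it:

class Memory:
    # Minimal rollout buffer exposing the attributes PPO.update() reads.
    def __init__(self):
        self.states, self.actions, self.logprobs = [], [], []
        self.rewards, self.is_terminals = [], []

    def clear(self):
        self.__init__()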
Example #26
class PPOAgent:
    def __init__(self):

        self.train_history = dict()

        if os.path.isfile("./checkpoints/model.pt"):
            self.model = torch.load("./checkpoints/model.pt")
            self.train_history['frames_trained'] = torch.load(
                "./checkpoints/frames_trained.pt")
            self.train_history['average_entropy'] = torch.load(
                "./checkpoints/average_entropy.pt")
            self.train_history['average_values'] = torch.load(
                "./checkpoints/average_values.pt")
            print("Model loaded from last checkpoint.")
        else:
            self.model = ActorCritic(num_actions=len(ACTIONS))
            self.train_history['frames_trained'] = torch.tensor(0)
            self.train_history['average_entropy'] = torch.tensor(
                [0], dtype=torch.float)
            self.train_history['average_values'] = torch.tensor(
                [0], dtype=torch.float)
            print("New model created.")
        self.model.to("cuda")

        self.worker = RolloutWorker(self, ENV_ID, NUM_WORKERS, T)

    def select_act(self, states, train_mode=True):
        states = torch.tensor(states).to("cuda")
        prob_dists, values = self.model(states)
        if train_mode:
            actions = prob_dists.sample()
        else:
            actions = torch.argmax(prob_dists.probs, dim=1)

        action_log_probs = prob_dists.log_prob(actions)

        values = values.data.cpu().numpy()
        actions = actions.data.cpu().numpy()
        action_log_probs = action_log_probs.data.cpu().numpy()
        return values, actions, action_log_probs

    def train_step(self):
        states, actions, old_action_log_probs, returns, advantages \
        = self.worker.rollout()

        states = torch.tensor(states).to("cuda")
        actions = torch.tensor(actions).to("cuda")
        old_action_log_probs = torch.tensor(old_action_log_probs).to("cuda")
        returns = torch.tensor(returns).to("cuda")
        advantages = torch.tensor(advantages).to("cuda")

        optimizer = torch.optim.Adam(self.model.parameters(),
                                     lr=LEARNING_RATE,
                                     eps=1e-5)

        loss_surr = self._surrogate_loss(states, actions, old_action_log_probs,
                                         advantages)
        loss_surr_before = loss_surr.data.cpu().numpy()

        loss_value = self._value_loss(states, returns)
        loss_value_before = loss_value.data.cpu().numpy()

        loss_ent = self._entropy_loss(states)
        loss_ent_before = loss_ent.data.cpu().numpy()

        for epoch in range(NUM_EPOCHS):
            dataset_size = states.shape[0]
            batch_size = dataset_size // NUM_MINIBATCHES
            random_indices = torch.randperm(dataset_size,
                                            device="cpu").to("cuda")

            for n in range(NUM_MINIBATCHES):
                batch_indices = random_indices[n * batch_size:n * batch_size +
                                               batch_size]
                states_batch = states[batch_indices]
                old_action_log_probs_batch = old_action_log_probs[
                    batch_indices]
                actions_batch = actions[batch_indices]
                advantages_batch = advantages[batch_indices]
                returns_batch = returns[batch_indices]

                advantages_batch = (advantages_batch - advantages_batch.mean()
                                    ) / (advantages_batch.std() + 1e-6)

                loss_surr_batch = self._surrogate_loss(
                    states_batch, actions_batch, old_action_log_probs_batch,
                    advantages_batch)

                loss_value_batch = self._value_loss(states_batch,
                                                    returns_batch)

                loss_ent_batch = self._entropy_loss(states_batch)

                loss_batch = loss_surr_batch + C1 * loss_value_batch + C2 * loss_ent_batch

                #                loss_batch_np = loss_batch.data.cpu().numpy()

                optimizer.zero_grad()
                loss_batch.backward()
                torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                               MAX_GRAD_NORM)
                optimizer.step()
                pass

        loss_surr = self._surrogate_loss(states, actions, old_action_log_probs,
                                         advantages)
        loss_surr_after = loss_surr.data.cpu().numpy()

        loss_value = self._value_loss(states, returns)
        loss_value_after = loss_value.data.cpu().numpy()

        loss_ent = self._entropy_loss(states)
        loss_ent_after = loss_ent.data.cpu().numpy()

        self.train_history['frames_trained'] += 4 * states.shape[0]
        self.train_history['average_entropy'] = torch.cat(
            (self.train_history['average_entropy'],
             torch.tensor([float(loss_ent_after)])))
        self.train_history['average_values'] = torch.cat(
            (self.train_history['average_values'],
             torch.tensor([returns.mean()])))

        print("Frames trained: ",
              self.train_history['frames_trained'].cpu().numpy())
        print("Loss before: {: .6f} {:.6f} {:.6f}".format(
            loss_surr_before, loss_value_before, loss_ent_before))
        print("Loss after : {: .6f} {:.6f} {:.6f}".format(
            loss_surr_after, loss_value_after, loss_ent_after))
        torch.save(self.model, "./checkpoints/model.pt")
        torch.save(self.train_history['frames_trained'],
                   "./checkpoints/frames_trained.pt")
        torch.save(self.train_history['average_entropy'],
                   "./checkpoints/average_entropy.pt")
        torch.save(self.train_history['average_values'],
                   "./checkpoints/average_values.pt")

    def test_step(self):
        self.worker.rollout(train_mode=False)

    def _surrogate_loss(self, states, actions, old_action_log_probs,
                        advantages):
        advantages -= advantages.mean()
        advantages /= advantages.std() + 1e-8
        #        advantages_np = advantages.data.cpu().numpy()

        pd, _ = self.model(states)

        action_log_probs = pd.log_prob(actions)
        #        action_log_probs_np = action_log_probs.data.cpu().numpy()

        r = torch.exp(action_log_probs - old_action_log_probs)
        #        r_np = r.data.cpu().numpy()

        r_clip = torch.clamp(r, 1 - EPSILON, 1 + EPSILON)
        #        r_clip_np = r_clip.data.cpu().numpy()

        surr1 = r * advantages
        #        surr1_np = surr1.data.cpu().numpy()

        surr2 = r_clip * advantages
        #        surr2_np = surr2.data.cpu().numpy()

        loss_policy_batch = -torch.min(surr1, surr2)
        #        loss_policy_batch_np = loss_policy_batch.data.cpu().numpy()

        loss_policy = torch.mean(loss_policy_batch, dim=0)
        #        loss_policy_np = loss_policy.data.cpu().numpy()

        return loss_policy

    def _entropy_loss(self, states):

        pd, _ = self.model(states)
        loss_entropy_batch = -pd.entropy()
        #        loss_entropy_batch_np = loss_entropy_batch.data.cpu().numpy()

        loss_entropy = torch.mean(loss_entropy_batch)
        #        loss_entropy_np = loss_entropy.data.cpu().numpy()

        return loss_entropy

    def _value_loss(self, states, returns):

        _, values = self.model(states)
        #        values_np = values.data.cpu().numpy()

        loss_value_batch = (returns - values)**2
        #        loss_value_batch_np = loss_value_batch.data.cpu().numpy()

        loss_value = 0.5 * torch.mean(loss_value_batch, dim=0)
        #        loss_value_np = loss_value.data.cpu().numpy()

        return loss_value
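A minimal driver for the agent above (the iteration count is arbitrary; train_step() handles its own rollouts and checkpointing):

if __name__ == "__main__":
    agent = PPOAgent()
    for _ in range(1000):  # arbitrary number of PPO iterations
        agent.train_step()
    agent.test_step()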
Example #27
def test(rank, args, shared_model):
    torch.manual_seed(args.seed + rank)

    env = GeneralEnvironment('2_epoch.mdl')

    model = ActorCritic()
    model.eval()

    state = env.reset()
    model.init_hidden(env.map_height, env.map_width)
    state = torch.Tensor(state)
    reward_sum = 0
    done = True

    start_time = time.time()
    checkpoint_interval = 1

    # a quick hack to prevent the agent from getting stuck
    actions = deque(maxlen=100)
    episode_length = 0
    while True:
        episode_length += 1
        # Sync with the shared model
        if done:
            model.load_state_dict(shared_model.state_dict())

        value, logit = model(Variable(
            state.unsqueeze(0), volatile=True))
        prob = F.softmax(logit)

        # Zero out the probability of every tile not owned by the player
        army_map = state[0, ...]
        label_map = (army_map > 0)
        label_map = label_map.view(1, env.map_height, env.map_width)
        label_map = label_map.expand(8, env.map_height, env.map_width)
        label_map = label_map.contiguous()
        label_map = label_map.view(-1)
        prob = prob * Variable(label_map.float())

        action = prob.max(1, keepdim=True)[1].data.numpy()

        state, reward, done, _ = env.step(action[0, 0])
        done = done or episode_length >= args.max_episode_length
        reward_sum += reward

        # a quick hack to prevent the agent from getting stuck
        actions.append(action[0, 0])
        if actions.count(actions[0]) == actions.maxlen:
            done = True

        if done:
            print("Time {}, episode reward {}, episode length {}".format(
                time.strftime("%Hh %Mm %Ss",
                              time.gmtime(time.time() - start_time)),
                reward_sum, episode_length))
            reward_sum = 0
            episode_length = 0
            actions.clear()
            state = env.reset()
            model.init_hidden(env.map_height, env.map_width)
            time.sleep(60)
            checkpoint_interval += 1

            if checkpoint_interval % args.checkpoint_interval == 0:
                torch.save(model.cpu().state_dict(), 'reinforce_trained.mdl')

        state = torch.Tensor(state)
Example #28
class MultiAgent():
    def __init__(self,
                 state_size,
                 action_size,
                 num_agents,
                 hidden_size=64,
                 lr=1e-4):
        self.model = ActorCritic(state_size,
                                 action_size,
                                 hidden_size=hidden_size)
        self.optimizer = Adam(self.model.parameters(), lr=lr)
        self.agents = [PPO_Agent() for _ in range(num_agents)]

    def save_checkpoint(self, filepath=None):
        if filepath is None: filepath = 'checkpoint.pth'
        torch.save(self.model.state_dict(), filepath)

    def load_checkpoint(self, filepath):
        self.model.load_state_dict(torch.load(filepath))

    def act(self, states):
        results = zip(*[
            agent.chooce_action(self.model, state)
            for agent, state in zip(self.agents, states)
        ])
        actions, log_probs, values = map(lambda x: np.array(x).squeeze(1),
                                         results)
        return actions, log_probs, values

    def step(self,
             states,
             actions,
             rewards,
             dones,
             log_probs,
             values,
             is_terminal=False):
        for i, agent in enumerate(self.agents):
            if is_terminal:
                agent.register_trajectories(states[i],
                                            None,
                                            None,
                                            None,
                                            None,
                                            values[i],
                                            is_terminal=is_terminal)
            else:
                agent.register_trajectories(states[i], actions[i], rewards[i],
                                            dones[i], log_probs[i], values[i])

    def process_trajectories(self, gamma=0.99, gae_tau=0.95):
        for agent in self.agents:
            agent.calculate_gae_returns(gamma=gamma, gae_tau=gae_tau)

    def maybe_learn(self, i_episode, update_every=4):
        if i_episode % update_every == 0:
            accumulated_trajectories = []
            for agent in self.agents:
                accumulated_trajectories += agent.processed_trajectories

            self.learn(accumulated_trajectories)

    def learn(self,
              accumulated_trajectories,
              batch_size=64,
              epsilon_clip=0.2,
              gradient_clip=10,
              beta=0.001,
              critic_discount=1.,
              num_epochs=5):
        # Unroll and convert accumulated trajectories to tensors
        states, actions, old_log_probs, returns, advantages = map(
            torch.FloatTensor, zip(*accumulated_trajectories))

        # Normalized advantages
        advantages = (advantages - advantages.mean()) / (advantages.std() +
                                                         1e-7)

        # Get random batches from accumulated trajectories
        batcher = DataLoader(Batcher(states, actions, old_log_probs, returns,
                                     advantages),
                             batch_size=batch_size,
                             shuffle=True)

        self.model.train()
        for _ in range(num_epochs):
            for states, actions, old_log_probs, returns, advantages in batcher:
                # Get updated values from policy
                values, dist = self.model(states)
                new_log_probs = dist.log_prob(actions)
                entropy = dist.entropy()

                # Calculate the probability ratio and clip it so the update cannot move the new policy far from the old one
                ratio = (new_log_probs - old_log_probs).exp()
                clip = torch.clamp(ratio, 1 - epsilon_clip, 1 + epsilon_clip)
                clipped_surrogate = torch.min(ratio * advantages,
                                              clip * advantages)

                # Get losses
                actor_loss = -torch.mean(
                    clipped_surrogate) - beta * entropy.mean()
                critic_loss = torch.mean(torch.square((returns - values)))
                losses = critic_loss * critic_discount + actor_loss

                # Do the optimizer step
                self.optimizer.zero_grad()
                losses.backward()
                nn.utils.clip_grad_norm_(self.model.parameters(),
                                         gradient_clip)
                self.optimizer.step()

        # Reset collected trajectories
        for agent in self.agents:
            agent.reset()
Example #29
def central_agent(net_params_queue, exp_queues, config):
    torch.set_num_threads(1)

    # log training info
    logging.basicConfig(filename=config['log_dir'] +
                        '/Central_agent_training.log',
                        filemode='w',
                        level=logging.INFO)

    assert len(net_params_queue) == config['num_agents']
    assert len(exp_queues) == config['num_agents']

    net = ActorCritic(True, config)

    # Since the original Pensieve does not use the critic in the workers, push only
    # actor_net_params into net_params_queue and save both networks' parameters separately.
    if config['load_model']:
        actor_net_params = torch.load(config['model_dir'] +
                                      '/actor_300k1_80.pt')
        critic_net_params = torch.load(config['model_dir'] +
                                       '/critic_300k1_80.pt')
        net.ActorNetwork.load_state_dict(actor_net_params)
        net.CriticNetwork.load_state_dict(critic_net_params)
    else:
        net.ActorNetwork.init_params()
        net.CriticNetwork.init_params()
    #
    actor_net_params = list(net.ActorNetwork.parameters())
    for i in range(config['num_agents']):
        # actor_net_params = net.ActorNetwork.parameters()
        net_params_queue[i].put(actor_net_params)

    epoch = 0
    total_reward = 0.0
    total_batch_len = 0.0
    episode_entropy = 0.0
    ax = []
    ay = []
    plt.ion()

    while True:
        start = time.time()
        actor_net_params = list(net.ActorNetwork.parameters())
        for i in range(config['num_agents']):
            net_params_queue[i].put(actor_net_params)

        for i in range(config['num_agents']):
            s_batch, a_batch, r_batch, done, e_batch = exp_queues[i].get()

            net.getNetworkGradient(s_batch, a_batch, r_batch, done)

            total_reward += np.sum(r_batch)
            total_batch_len += len(r_batch)
            episode_entropy += np.sum(e_batch)

        net.updateNetwork()
        epoch += 1
        avg_reward = total_reward / total_batch_len
        # avg_entropy = total_entropy / total_batch_len

        logging.info('Epoch ' + str(epoch) + '\nAverage reward: ' +
                     str(avg_reward) + '\nEpisode entropy: ' +
                     str(episode_entropy))
        ax.append(epoch)
        ay.append(episode_entropy)
        plt.clf()
        plt.plot(ax, ay)
        plt.pause(0.1)

        total_reward = 0.0
        total_batch_len = 0
        episode_entropy = 0.0

        if epoch % config['save_interval'] == 0:
            print('Train Epoch ' + str(epoch) + ', Model restored.')
            print('Epoch costs ' + str(time.time() - start) + ' seconds.')
            torch.save(
                net.ActorNetwork.state_dict(),
                config['model_dir'] + '/actor_300k_' + str(epoch) + '.pt')
            torch.save(
                net.CriticNetwork.state_dict(),
                config['model_dir'] + '/critic_300k_' + str(epoch) + '.pt')
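central_agent() pairs with the worker agent() from Example #9; a hedged sketch of how the two are typically wired together with multiprocessing queues (queue capacities and process handling are assumptions):

import multiprocessing as mp

def launch(config):
    # One parameter queue and one experience queue per worker (assumed layout).
    n = config['num_agents']
    net_params_queues = [mp.Queue(1) for _ in range(n)]
    exp_queues = [mp.Queue(1) for _ in range(n)]

    coordinator = mp.Process(target=central_agent,
                             args=(net_params_queues, exp_queues, config))
    coordinator.start()

    workers = [mp.Process(target=agent,
                          args=(net_params_queues[i], exp_queues[i], config, i))
               for i in range(n)]
    for w in workers:
        w.start()
    for w in workers:
        w.join()
    coordinator.join()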