Example #1
# Assumed imports for this snippet: TensorFlow 1.x with the Keras backend, NumPy,
# and the project-local ActorCritic, Environment, Database and parse_cmd_args.
import numpy as np
import tensorflow as tf
from keras import backend as K  # or tensorflow.keras, depending on the project


def main():
    parse_cmd_args()  # presumably fills the module-level dict `argus` used below

    sess = tf.Session()
    K.set_session(sess)
    db = Database()
    env = Environment(db, argus)

    actor_critic = ActorCritic(env, sess, learning_rate=argus['learning_rate'], train_min_size=argus['train_min_size'],
                               size_mem=argus['maxlen_mem'], size_predict_mem=argus['maxlen_predict_mem'])

    num_trials = argus['num_trial']  # number of training iterations for the loop below
    # trial_len  = 500   # ?
    # ntp
    env.preheat()

    # First iteration
    cur_state = env._get_obs()  # np.array      (inner_metric + sql)
    cur_state = cur_state.reshape((1, env.state.shape[0]))
    # action = env.action_space.sample()
    action = env.fetch_action()  # np.array
    action_2 = action.reshape((1, env.action_space.shape[0]))  # for memory
    new_state, reward, done, _ = env.step(action, 0, 1)  # apply the action -> to steady state -> return the reward
    new_state = new_state.reshape((1, env.state.shape[0]))
    reward_np = np.array([reward])

    print("0-shape-")
    print(new_state.shape)
    actor_critic.remember(cur_state, action_2, reward_np, new_state, done)
    actor_critic.train()  # no effect until the replay memory holds at least train_min_size samples

    cur_state = new_state
    for i in range(num_trials):
        # env.render()
        cur_state = cur_state.reshape((1, env.state.shape[0]))
        action, isPredicted = actor_critic.act(cur_state)
        print(action)
        action_2 = action.reshape((1, env.action_space.shape[0]))  # for memory
        # action.tolist()                                          # to execute
        new_state, reward, done, _ = env.step(action, isPredicted, i + 1)
        new_state = new_state.reshape((1, env.state.shape[0]))

        reward_np = np.array([reward])
        print("%d-shape-" % i)
        print(new_state.shape)

        actor_critic.remember(cur_state, action_2, reward_np, new_state, done)
        actor_critic.train()

        cur_state = new_state
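
The code reads its hyperparameters from a dict named argus that parse_cmd_args() is assumed to populate at module level; the block below is a hypothetical illustration of the keys this example touches (values are placeholders), plus the usual entry point.

# hypothetical defaults for the keys used above; names come from the snippet, values are placeholders
argus = {
    'learning_rate': 0.001,
    'train_min_size': 32,
    'maxlen_mem': 2000,
    'maxlen_predict_mem': 2000,
    'num_trial': 500,
}

if __name__ == '__main__':
    main()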
Example #2
    # Fragment of a PPO training script: `model`, `env`, `state`, `render`,
    # `frame_idx` and PPO_STEPS are assumed to be defined earlier in the function.
    early_stop = False

    #init = tf.global_variables_initializer()
    with tf.Session() as sess:
        writer = tf.summary.FileWriter('./log/train', sess.graph)
        sess.run(tf.global_variables_initializer())

        while not early_stop:

            log_probs, values, states, actions, rewards, masks = [], [], [], [], [], []

            for q in range(PPO_STEPS):  # each PPO step generates actions, states and rewards
                print("PPO_steps:{}".format(q))
                action, value, norm_dist = model.act(state)
                next_state, reward, done, _ = env.step(action)
                # each state, reward, done is a list of results from each parallel environment
                if render:
                    env.render()
                log_prob_ = norm_dist.log_prob(action)

                log_probs.append(log_prob_)
                values.append(value)
                states.append(state)
                actions.append(action)
                rewards.append(reward)
                masks.append(1 - done)
                #storing
                state = next_state
                frame_idx += 1
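
The fragment stops once the per-step buffers are filled. In PPO the usual next step turns the stored rewards, values and masks into discounted returns and advantages before the clipped-surrogate update. The sketch below is a minimal NumPy illustration of generalized advantage estimation (GAE), under the simplifying assumption that each buffer holds one scalar per step; the original code keeps per-environment TensorFlow tensors instead.

import numpy as np

def compute_gae(rewards, values, masks, last_value, gamma=0.99, lam=0.95):
    # Generalized advantage estimation over one finished rollout.
    values = list(values) + [last_value]      # bootstrap with the value of the state after the rollout
    gae, returns = 0.0, []
    for t in reversed(range(len(rewards))):
        delta = rewards[t] + gamma * values[t + 1] * masks[t] - values[t]
        gae = delta + gamma * lam * masks[t] * gae
        returns.insert(0, gae + values[t])    # return target = advantage + value baseline
    advantages = np.array(returns) - np.array(values[:-1])
    return np.array(returns), advantages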
Example #3
    # Fragment of a main() similar to Example #1; cur_state, action, env and
    # actor_critic are assumed to be set up earlier. Here env.step also returns a score.
    action_2 = action.reshape((1, env.action_space.shape[0]))  # for memory
    new_state, reward, done, score, _ = env.step(action, 0, 1)  # apply the action -> reach steady state -> return the reward
    new_state = new_state.reshape((1, env.state.shape[0]))
    reward_np = np.array([reward])

    print("0-shape")
    print(new_state.shape)
    actor_critic.remember(cur_state, action_2, reward_np, new_state, done)
    actor_critic.train()  # no effect until the replay memory holds at least train_min_size samples

    cur_state = new_state
    predicted_rewardList = []
    for epoch in range(num_trials):
        # env.render()
        cur_state = cur_state.reshape((1, env.state.shape[0]))
        action, isPredicted = actor_critic.act(cur_state)
        print(action)
        action_2 = action.reshape((1, env.action_space.shape[0]))  # for memory
        # action.tolist()                                          # to execute
        new_state, reward, done, score, _ = env.step(action, isPredicted, epoch + 1)
        new_state = new_state.reshape((1, env.state.shape[0]))
        if isPredicted == 1:
            predicted_rewardList.append([epoch, reward])

        reward_np = np.array([reward])
        print("%d-shape" % epoch)
        print(new_state.shape)

        actor_critic.remember(cur_state, action_2, reward_np, new_state, done)
        actor_critic.train()
Example #4
    def train(self):
        # Method of a trainer class; torch, numpy (np), ActorCritic, Brain, RolloutStorage,
        # save_model and DEBUG are assumed to be imported/defined at module level.
        self.NUM_AGENTS = 1
        # self.NUM_AGENTS = len(dict_model)
        # print("train", dict_model)
        # actor_critics = []
        # local_brains = []
        # rollouts = []
        if DEBUG: print(self.config)
        actor_critic = ActorCritic(self.n_in, self.n_out)
        global_brain = Brain(actor_critic, self.config)
        rollout = RolloutStorage(self.NUM_ADVANCED_STEP, self.NUM_PARALLEL,
                                 self.obs_shape, self.device)

        current_obs = torch.zeros(self.NUM_PARALLEL,
                                  self.obs_shape).to(self.device)
        episode_rewards = torch.zeros([self.NUM_PARALLEL, 1])
        final_rewards = torch.zeros([self.NUM_PARALLEL, 1])

        episode = np.zeros(self.NUM_PARALLEL)

        obs = self.envs.reset()
        obs = np.array(obs)
        obs = torch.from_numpy(obs).float()
        current_obs = obs

        rollout.observations[0].copy_(current_obs)

        while True:
            # for step in range(self.NUM_ADVANCED_STEP):
            for step in range(self.max_step):
                print("step", step)
                with torch.no_grad():
                    # action = actor_critic.act(rollouts.observations[step])  # decide the action here
                    action = torch.zeros(self.NUM_PARALLEL,
                                         self.NUM_AGENTS).long().to(
                                             self.device)  # one action per agent for each observation
                    if DEBUG:
                        print("action size", self.NUM_PARALLEL, self.NUM_AGENTS)
                    # for i, (k,v) in enumerate( dict_model.items() ):
                    #     if k == training_target:
                    #         tmp_action = v.act(current_obs)
                    #         target_action = copy.deepcopy(tmp_action)
                    #     else:
                    #         tmp_action = v.act_greedy(current_obs)
                    #     action[:,i] = tmp_action.squeeze()
                    action = actor_critic.act(obs)
                    if DEBUG: print("action", action)
                if DEBUG: print("action shape before step", action.shape)
                obs, reward, done, infos = self.envs.step(action)  # advance the environments by one step
                print("reward(train)", reward)
                episode_rewards += reward

                # if done then clean the history of observation
                masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                           for done_ in done])
                if DEBUG: print("done.shape", done.shape)
                if DEBUG: print("masks.shape", masks.shape)
                if DEBUG: print("obs.shape", obs.shape)
                with open(self.resdir + "/episode_reward.txt", "a") as f:
                    for i, info in enumerate(infos):
                        if 'episode' in info:
                            f.write("{:}\t{:}\t{:}\n".format(
                                episode[i], info['env_id'],
                                info['episode']['r']))
                            print(episode[i], info['env_id'],
                                  info['episode']['r'])
                            episode[i] += 1

                # When an environment finishes (mask 0), move its accumulated episode
                # reward into final_rewards and reset the running accumulators.
                final_rewards *= masks
                final_rewards += (1 - masks) * episode_rewards

                episode_rewards *= masks
                current_obs *= masks

                current_obs = obs  # update the stored observation

                rollout.insert(current_obs, action.data, reward, masks,
                               self.NUM_ADVANCED_STEP)
                with open(self.resdir + "/reward_log.txt",
                          "a") as f:  # this log only needs writing at episode end -> TODO
                    f.write("{:}\t{:}\t{:}\t{:}\t{:}\t{:}\t{:}\t{:}\n".format(
                        episode.mean(), step,
                        reward.max().numpy(),
                        reward.min().numpy(),
                        reward.mean().numpy(),
                        episode_rewards.max().numpy(),
                        episode_rewards.min().numpy(),
                        episode_rewards.mean().numpy()))
                    print(episode.mean(), step,
                          reward.mean().numpy(),
                          episode_rewards.mean().numpy())

            with torch.no_grad():
                next_value = actor_critic.get_value(
                    rollout.observations[-1]).detach()

            rollout.compute_returns(next_value, self.gamma)
            value_loss, action_loss, total_loss, entropy = global_brain.update(
                rollout)

            with open(self.resdir + "/loss_log.txt", "a") as f:
                f.write("{:}\t{:}\t{:}\t{:}\t{:}\n".format(
                    episode.mean(), value_loss, action_loss, entropy,
                    total_loss))
                print(
                    "value_loss {:.4f}\taction_loss {:.4f}\tentropy {:.4f}\ttotal_loss {:.4f}"
                    .format(value_loss, action_loss, entropy, total_loss))

            rollout.after_update()

            if int(episode.mean()) + 1 > self.NUM_EPISODES:
                # print("leaving the training loop")
                break
            obs = self.envs.reset()

        if self.args.save:
            save_model(actor_critic, self.resdir + "/model")
        # note: the best model used to be saved here
        # print("finished training agent %s" % training_target)
        # dict_model[training_target] = actor_critic
        return actor_critic
Example #5
# Assumed imports/globals for this snippet: PyTorch, the project's ActorCritic network,
# and a global `device` (e.g. torch.device("cuda" if torch.cuda.is_available() else "cpu")).
import torch
import torch.nn as nn


class PPO:
    def __init__(self, state_dim, action_dim, action_std, lr, betas, gamma,
                 K_epochs, eps_clip):
        self.lr = lr
        self.betas = betas
        self.gamma = gamma
        self.eps_clip = eps_clip
        self.K_epochs = K_epochs

        self.policy = ActorCritic(state_dim, action_dim, action_std).to(device)
        self.optimizer = torch.optim.Adam(self.policy.parameters(),
                                          lr=lr,
                                          betas=betas)

        self.policy_old = ActorCritic(state_dim, action_dim,
                                      action_std).to(device)
        self.policy_old.load_state_dict(self.policy.state_dict())

        try:
            self.policy.load_state_dict(
                torch.load('./PPO_continuous_drone.pth', map_location=device))
            self.policy_old.load_state_dict(
                torch.load('./PPO_continuous_old_drone.pth',
                           map_location=device))
            print('Saved models loaded')
        except Exception:  # no saved checkpoint available; start from freshly initialized weights
            print('New models generated')

        self.MseLoss = nn.MSELoss()

    def select_action(self, state, memory):
        state = torch.FloatTensor(state.reshape(1, -1)).to(device)
        return self.policy_old.act(state, memory).cpu().data.numpy().flatten()

    def update(self, memory):
        # Monte Carlo estimate of rewards:
        rewards = []
        discounted_reward = 0
        for reward, is_terminal in zip(reversed(memory.rewards),
                                       reversed(memory.is_terminals)):
            if is_terminal:
                discounted_reward = 0
            discounted_reward = reward + (self.gamma * discounted_reward)
            rewards.insert(0, discounted_reward)

        # Normalizing the rewards:
        rewards = torch.tensor(rewards, dtype=torch.float32).to(device)  # float32 to match state_values
        rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-5)

        # convert list to tensor
        old_states = torch.squeeze(torch.stack(memory.states).to(device),
                                   1).detach()
        old_actions = torch.squeeze(torch.stack(memory.actions).to(device),
                                    1).detach()
        old_logprobs = torch.squeeze(torch.stack(memory.logprobs),
                                     1).to(device).detach()

        # Optimize policy for K epochs:
        for _ in range(self.K_epochs):
            # Evaluating old actions and values :
            logprobs, state_values, dist_entropy = self.policy.evaluate(
                old_states, old_actions)

            # Finding the ratio (pi_theta / pi_theta__old):
            ratios = torch.exp(logprobs - old_logprobs.detach())

            # Finding Surrogate Loss:
            advantages = rewards - state_values.detach()
            surr1 = ratios * advantages
            surr2 = torch.clamp(ratios, 1 - self.eps_clip,
                                1 + self.eps_clip) * advantages
            loss = -torch.min(surr1, surr2) + 0.5 * self.MseLoss(
                state_values, rewards) - 0.01 * dist_entropy

            # take gradient step
            self.optimizer.zero_grad()
            loss.mean().backward()
            self.optimizer.step()

        # Copy new weights into old policy:
        self.policy_old.load_state_dict(self.policy.state_dict())
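
A sketch of how this class is typically driven, assuming the classic Gym API and a simple Memory buffer (not defined in the snippet) whose states, actions and logprobs are appended inside policy_old.act(), while rewards and is_terminals are appended by the caller. The environment id and the update interval below are placeholders.

import gym

class Memory:  # hypothetical buffer matching the attributes update() expects
    def __init__(self):
        self.states, self.actions, self.logprobs = [], [], []
        self.rewards, self.is_terminals = [], []

    def clear(self):
        self.__init__()

env = gym.make('Pendulum-v1')                    # placeholder continuous-control environment
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
memory = Memory()
ppo = PPO(state_dim, action_dim, action_std=0.5, lr=3e-4, betas=(0.9, 0.999),
          gamma=0.99, K_epochs=80, eps_clip=0.2)

update_timestep, max_timesteps = 4000, 100000    # placeholder update interval and step budget
state, timestep = env.reset(), 0
for _ in range(max_timesteps):
    timestep += 1
    action = ppo.select_action(state, memory)    # policy_old.act() stores state/action/logprob
    state, reward, done, _ = env.step(action)
    memory.rewards.append(reward)
    memory.is_terminals.append(done)
    if timestep % update_timestep == 0:
        ppo.update(memory)
        memory.clear()
        timestep = 0
    if done:
        state = env.reset()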