Example #1
class Agent():
    def __init__(self, env):
        self.num_actions = env.action_space.n

        target_dqn = DQNParamNoise(env)
        self.algo = DQNParamNoise(env, target_dqn=target_dqn)

        self.num_steps = 0
        self.episode_steps = 0
        self.env = env
        self.memory = Memory(maxlen=10000)
        self.rewards = deque(maxlen=100)
        self.epsilon = 0.005
        self.total_reward = 0

    def reset(self, learn=True):
        self.algo.episode_start()
        self.total_reward = 0

    def store_in_memory(self, state, action, reward, new_state, done):
        self.memory.append((state, action, reward, new_state, done))

    def learn_from_memory(self):
        self.algo.train(self.memory.get_batch(batch_size=32))

    def step(self, state, learn=True):
        action_probs = self.algo.eval(np.expand_dims(state, 0)).double()[0]
        # print(action_probs)
        action_probs = action_probs / action_probs.sum()

        if random.random() < self.epsilon:
            action = random.randint(0, self.num_actions - 1)
        else:
            # convert the argmax index tensor to a plain int, matching the random branch
            action = action_probs.max(0)[1].item()

        new_state, reward, done, info = self.env.step(action)
        self.store_in_memory(state, action, reward, new_state, done)
        if self.num_steps >= 100 and learn:
            self.learn_from_memory()

        if done:
            print(
                f"[{self.num_steps}] reward {self.env.mean_reward}, steps {self.episode_steps}, speed {self.env.speed} f/s, epsilon {self.epsilon}"
            )
            self.episode_steps = 0
            self.algo.save()

        self.num_steps += 1
        self.episode_steps += 1
        self.total_reward += reward

        self.epsilon -= 1e-04
        self.epsilon = max(0.005, self.epsilon)

        return new_state, reward, done, info
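A driving loop for this Agent is not part of the example; a minimal sketch could look like the following, assuming a Gym-style environment wrapper that also exposes the mean_reward and speed attributes read in step() (make_wrapped_env is a hypothetical helper, not from the original code).

def run(agent, env, num_steps=10_000):
    # Drive Agent.step(); the agent stores transitions and trains from its
    # replay memory internally, so the loop only has to handle resets.
    state = env.reset()
    agent.reset()
    for _ in range(num_steps):
        state, reward, done, info = agent.step(state)
        if done:
            state = env.reset()
            agent.reset()

env = make_wrapped_env()  # hypothetical: must expose mean_reward and speed
run(Agent(env), env)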
Example #2
def train(env,
          model,
          base_path,
          batch_size=64,
          epsilon=0.01,
          update_every=4,
          update_target_every=1000,
          learning_starts=200,
          memory_size=500000,
          num_iterations=6250000):
    if not os.path.exists(base_path):
        os.makedirs(base_path)
    model_path = os.path.join(base_path, "model")
    begin_i = model.load(model_path)

    memory_buffer = Memory(memory_size)
    results_buffer = ResultsBuffer(base_path)

    state = env.reset()
    for i in range(learning_starts):
        action = np.random.randint(
            env.action_n
        ) if np.random.uniform() < epsilon else model.get_action(state)
        next_state, reward, done, info = env.step(action)
        memory_buffer.append((state, action, reward, next_state, done))
        state = next_state

    state = env.reset()
    start = time.time()
    for i in range(begin_i + 1, num_iterations):
        action = np.random.randint(
            env.action_n
        ) if np.random.uniform() < epsilon else model.get_action(state)
        next_state, reward, done, info = env.step(action)
        results_buffer.update_info(info)
        memory_buffer.append((state, action, reward, next_state, done))
        state = next_state

        if i > 0 and i % update_every == 0:
            summaries = model.update(*memory_buffer.sample(batch_size))
            results_buffer.update_info(summaries)

        if i > 0 and i % (update_every * update_target_every) == 0:
            model.update_target()
            model.save(model_path, i)

            t = time.time() - start
            print("Save model, global step:{}, delta_time:{}.".format(i, t))
            start = time.time()
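A call site for this train() function might look like the sketch below; AtariEnv and DQN are placeholder names for whatever environment wrapper (exposing action_n plus Gym-style step/reset) and model class (exposing get_action, update, update_target, save and load) the project actually defines, and the keyword arguments simply override the defaults shown above.

env = AtariEnv("BreakoutNoFrameskip-v4")   # hypothetical wrapper exposing action_n
model = DQN(num_actions=env.action_n)      # hypothetical model class
train(env,
      model,
      base_path="./checkpoints/breakout",
      batch_size=32,
      update_every=4,
      update_target_every=2500,
      learning_starts=1000)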
Example #3
class RandomAgent(Learner):
    def __init__(self, observation_space, action_space, memory_len=1000):
        self.action_space = action_space
        self.memory = Memory(memory_len, observation_space.shape)

    def handle_transition(self, s, a, r, sp, done):
        s = self._convert_to_torch(s)
        sp = self._convert_to_torch(sp)

        self.memory.append((s, a, r, sp, done))

    def exploration_strategy(self, s):
        return self.action_space.sample()

    def deterministic_strategy(self, s):
        return self.action_space.sample()
Example #4
class QAgent():
    def __init__(self,
                 env=None,
                 config: str = None,
                 seed: int = None,
                 model: tf.keras.Model = None):
        assert env is not None, "A GYM environment must be provided"
        assert config is not None, "A config filename must be provided"
        assert model is not None, "A keras model must be provided"
        self.env = env
        self.config_file = config
        self.config = Config(self.config_file)
        self.model_name = f'train_{int(time.time())}'
        self.log_dir = ""
        self.model = model
        self.target_model = None
        self.tensorboard = None
        self.rng = np.random.default_rng(seed)
        self.memory = None
        self.last_step = 0
        self.current_episode = 0

    def compile(self, optimizer=None, loss=Huber()):
        # lr_schedule = ExponentialDecay(
        #                         initial_learning_rate=self.config.learning_rate,
        #                         decay_steps=self.config.lr_decay_steps,
        #                         decay_rate=self.config.lr_decay,
        #                         lr_min=self.config.lr_min)
        # optimizer = Adam(learning_rate=lr_schedule)
        if optimizer is None:
            optimizer = Adam(learning_rate=self.config.learning_rate)

        self.target_model = clone_model(self.model)
        self.target_model.compile(optimizer='sgd', loss='mse')

        self.model.compile(loss=loss,
                           optimizer=optimizer,
                           metrics=['accuracy'])

    def adjust_lr(self, lr=None):
        assert lr is not None
        K.set_value(self.model.optimizer.learning_rate, lr)

    def load_model(self, filepath):
        self.model.load_weights(filepath)
        self.target_model.set_weights(self.model.get_weights())

    def save(self, filepath=""):
        self.save_model(filepath)
        self.save_checkpoint(filepath)

    def save_model(self, filepath=""):
        filepath = os.path.join(filepath, "models")
        if not os.path.exists(filepath):
            os.makedirs(filepath)
        self.model.save_weights(os.path.join(filepath,
                                             self.model_name + ".h5"),
                                overwrite=True)

    def save_checkpoint(self, filepath=""):
        filepath = os.path.join(filepath, "models")
        if not os.path.exists(filepath):
            os.makedirs(filepath)
        # save memory, current step
        data = {
            "memory": self.memory.json(),
            "last_step": self.last_step,
            "current_episode": self.current_episode,
            "model_name": self.model_name
        }
        with open(os.path.join(filepath, self.model_name + "_checkpoint.json"),
                  "w") as jsonfile:
            json.dump(data, jsonfile)

    def load_checkpoint(self, filename):
        with open(filename, "r") as json_file:
            data = json.load(json_file)
        self.last_step = data['last_step']
        self.current_episode = data['current_episode']
        self.model_name = data['model_name']
        if self.memory is None:
            self.config = Config(self.config_file)
            self.memory = Memory(max_len=self.config.max_queue_length)
        self.memory.load(data['memory'])

    def _encode_state(self, state):
        return state

    def _train_model(self, step):
        if self.memory.length < self.config.batch_size:
            return

        mini_batch = self.memory.sample(self.config.batch_size)

        current_states = self._encode_state(mini_batch.states)
        next_states = self._encode_state(mini_batch.new_states)

        # current Q values for each action
        q_values = self.model.predict_on_batch(current_states)

        # identify the best action to take and get the corresponding target Q value
        target_q_values = self.target_model.predict_on_batch(next_states)
        q_batch = np.max(target_q_values, axis=1).flatten()

        indices = (np.arange(self.config.batch_size), mini_batch.actions)
        q_values[indices] = mini_batch.rewards + (
            1 - mini_batch.done) * self.config.discount_factor * q_batch

        # As the model will predict `q_values`, only the Q value for the proper action (given by indices)
        # differ and count for the loss computation.
        self.tensorboard.on_step_begin()
        metrics = self.model.train_on_batch(current_states.astype(np.float32),
                                            q_values.astype(np.float32),
                                            return_dict=True)
        self.tensorboard.on_step_end(step=step, logs=metrics)

    def _get_epsilon(self, episode):
        epsilon = self.config.min_epsilon + \
            (self.config.max_epsilon - self.config.min_epsilon) * \
            np.exp(-self.config.decay_epsilon * episode)
        return epsilon

    def _remember(self, state, action, reward, new_state, done):
        self.memory.append(state, action, reward, new_state, done)

    def _get_action_for_state(self, state):
        state_encoded = self._encode_state(state)
        predicted = self.model.predict_on_batch(np.array([state_encoded]))
        action = np.argmax(predicted[0])
        return action

    def _choose_action(self, state, epsilon):
        if self.rng.uniform() < epsilon:
            # Explore
            action = self.env.action_space.sample()
        else:
            # Exploit
            action = self._get_action_for_state(state)
        return action

    def fit(self):

        try:
            self.config = Config(self.config_file)
            if self.tensorboard is None:
                self.log_dir = os.path.join(self.config.log_dir,
                                            self.model_name)
                self.tensorboard = LogTensorBoard(log_dir=self.log_dir)
            self.tensorboard.set_model(self.model)

            if self.memory is None:
                self.memory = Memory(max_len=self.config.max_queue_length)

            state = self.env.reset()
            done = False
            epsilon = self._get_epsilon(self.current_episode)
            steps_in_episode = 0
            reward_queue = deque(maxlen=10)
            reward_in_episode = 0

            pbar = trange(self.last_step,
                          self.config.train_steps,
                          initial=self.last_step,
                          total=self.config.train_steps)
            for step in pbar:
                steps_in_episode += 1
                self.last_step = step

                # Greedy exploration strategy
                action = self._choose_action(state, epsilon)
                new_state, reward, done, info = self.env.step(action)
                self._remember(state, action, reward, new_state, done)
                reward_in_episode += reward

                if steps_in_episode == self.config.max_steps_per_episode:
                    done = True

                # Train with the Bellman equation
                if step > self.config.warmup_steps:
                    self._train_model(step)

                state = new_state

                if done:
                    steps_in_episode = 0
                    state = self.env.reset()
                    done = False
                    self.current_episode += 1
                    reward_queue.append(reward_in_episode)
                    reward_in_episode = 0
                    epsilon = self._get_epsilon(self.current_episode)
                    pbar.set_postfix({"reward": np.mean(reward_queue)})

                if step % self.config.target_model_update == 0:
                    self.target_model.set_weights(self.model.get_weights())

            self.last_step += 1

        except KeyboardInterrupt:
            print("Training has been interrupted")

    def play(self,
             verbose: bool = False,
             sleep: float = 0.2,
             max_steps: int = 100):
        # Play an episode
        try:
            actions_str = [
                "South", "North", "East", "West", "Pickup", "Dropoff"
            ]

            iteration = 0
            state = self.env.reset()  # reset environment to a new, random state
            self.env.render()
            if verbose:
                print(f"Iter: {iteration} - Action: *** - Reward ***")
            time.sleep(sleep)
            done = False

            while not done:
                action = self._get_action_for_state(state)
                iteration += 1
                state, reward, done, info = self.env.step(action)
                clear_output(wait=True)
                self.env.render()
                if verbose:
                    print(
                        f"Iter: {iteration} - Action: {action}({actions_str[action]}) - Reward {reward}"
                    )
                time.sleep(sleep)
                if iteration == max_steps:
                    print("cannot converge :(")
                    break
        except KeyboardInterrupt:
            pass

    def evaluate(self, max_steps: int = 100):
        try:
            total_steps, total_penalties = 0, 0
            episodes = 100

            for episode in trange(episodes):
                state = self.env.reset()  # reset environment to a new, random state
                nb_steps, penalties, reward = 0, 0, 0

                done = False

                while not done:
                    action = self._get_action_for_state(state)
                    state, reward, done, info = self.env.step(action)

                    if reward == -10:
                        penalties += 1

                    nb_steps += 1
                    if nb_steps == max_steps:
                        done = True

                total_penalties += penalties
                total_steps += nb_steps

            print(f"Results after {episodes} episodes:")
            print(f"Average timesteps per episode: {total_steps / episodes}")
            print(
                f"Average penalties per episode: {total_penalties / episodes}")
        except KeyboardInterrupt:
            pass
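The action names in play() match Gym's Taxi-v3, so a usage sketch for this QAgent could look like the following; build_q_model is a hypothetical helper returning a keras model that maps a state to one Q-value per action, and config.json is assumed to define the fields read from Config (learning_rate, max_queue_length, batch_size, train_steps, and so on).

import gym

env = gym.make("Taxi-v3")
model = build_q_model(env)  # hypothetical: keras model, state in -> Q-values out

agent = QAgent(env=env, config="config.json", seed=42, model=model)
agent.compile()     # clones the target model and compiles both networks
agent.fit()         # epsilon-greedy training with replay memory and TensorBoard logging
agent.evaluate()    # 100 greedy episodes; prints average steps and penalties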
Example #5
class DDPG(object):
    def __init__(self):
        agent_args = Singleton_arger()['agent']
        self.actor_lr = agent_args['actor_lr']
        self.critic_lr = agent_args['critic_lr']
        self.lr_decay = agent_args['lr_decay']
        self.l2_critic = agent_args['l2_critic']
        self.batch_size = agent_args['batch_size']
        self.discount = agent_args['discount']
        self.tau = agent_args['tau']
        self.with_cuda = agent_args['with_cuda']
        self.buffer_size = int(agent_args['buffer_size'])

    def setup(self, nb_pos, nb_laser, nb_actions):
        self.lr_coef = 1

        model_args = Singleton_arger()['model']
        actor = Actor(nb_pos,
                      nb_laser,
                      nb_actions,
                      hidden1=model_args['hidden1'],
                      hidden2=model_args['hidden2'],
                      layer_norm=model_args['layer_norm'])
        critic = Critic(nb_pos,
                        nb_laser,
                        nb_actions,
                        hidden1=model_args['hidden1'],
                        hidden2=model_args['hidden2'],
                        layer_norm=model_args['layer_norm'])
        self.nb_pos = nb_pos
        self.nb_laser = nb_laser
        self.actor = copy.deepcopy(actor)
        self.actor_target = copy.deepcopy(actor)
        self.critic = copy.deepcopy(critic)
        self.critic_target = copy.deepcopy(critic)

        self.memory = Memory(self.buffer_size, (nb_actions, ),
                             (nb_pos + nb_laser, ), self.with_cuda)

        if self.with_cuda:
            for net in (self.actor, self.actor_target, self.critic,
                        self.critic_target):
                if net is not None:
                    net.cuda()

        p_groups = [{
            'params': [
                param,
            ],
            'weight_decay':
            self.l2_critic if ('weight' in name) and ('LN' not in name) else 0
        } for name, param in self.critic.named_parameters()]
        self.critic_optim = Adam(params=p_groups,
                                 lr=self.critic_lr,
                                 weight_decay=self.l2_critic)
        self.actor_optim = Adam(self.actor.parameters(), lr=self.actor_lr)

    def reset_noise(self):
        pass

    def before_epoch(self):
        pass

    def before_cycle(self):
        pass

    def store_transition(self, s_t, a_t, r_t, s_t1, done_t):
        s_t = torch.tensor(s_t, dtype=torch.float32, requires_grad=False)
        if self.with_cuda:
            s_t = s_t.cuda()
        self.memory.append(s_t, a_t, r_t, s_t1, done_t)

    def update_critic(self, batch=None, pass_batch=False):
        # Sample batch
        if batch is None:
            batch = self.memory.sample(self.batch_size)
        assert batch is not None
        tensor_obs0 = batch['obs0'].split([self.nb_pos, self.nb_laser], dim=1)
        tensor_obs1 = batch['obs1'].split([self.nb_pos, self.nb_laser], dim=1)
        # Prepare for the target q batch
        with torch.no_grad():
            next_q_values = self.critic_target([
                tensor_obs1[0],
                tensor_obs1[1],
                self.actor_target(tensor_obs1),
            ])

            target_q_batch = batch['rewards'] + self.discount * (
                1 - batch['terminals1']) * next_q_values
        # Critic update
        self.critic.zero_grad()

        q_batch = self.critic(
            [tensor_obs0[0], tensor_obs0[1], batch['actions']])
        value_loss = nn.functional.mse_loss(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()
        if pass_batch:
            return value_loss.item(), batch
        else:
            return value_loss.item()

    def update_actor(self, batch=None, pass_batch=False):
        if batch is None:
            batch = self.memory.sample(self.batch_size)
        assert batch is not None
        tensor_obs0 = batch['obs0'].split([self.nb_pos, self.nb_laser], dim=1)
        # Actor update
        self.actor.zero_grad()

        policy_loss = -self.critic(
            [tensor_obs0[0], tensor_obs0[1],
             self.actor(tensor_obs0)])

        policy_loss = policy_loss.mean()
        policy_loss.backward()
        self.actor_optim.step()
        if pass_batch:
            return policy_loss.item(), batch
        else:
            return policy_loss.item()

    def update_critic_target(self, soft_update=True):
        for target_param, param in zip(self.critic_target.parameters(),
                                       self.critic.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - self.tau) + param.data * self.tau \
                                    if soft_update else param.data)

    def update_actor_target(self, soft_update=True):
        for target_param, param in zip(self.actor_target.parameters(),
                                       self.actor.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - self.tau) + param.data * self.tau \
                                    if soft_update else param.data)

    def apply_lr_decay(self):
        if self.lr_decay > 0:
            self.lr_coef = self.lr_decay * self.lr_coef / (self.lr_coef +
                                                           self.lr_decay)
            for (opt, base_lr) in ((self.actor_optim, self.actor_lr),
                                   (self.critic_optim, self.critic_lr)):
                for group in opt.param_groups:
                    group['lr'] = base_lr * self.lr_coef

    def calc_last_error(self):
        # Sample batch
        batch = self.memory.sample_last(self.batch_size)
        tensor_obs0 = batch['obs0'].split([self.nb_pos, self.nb_laser], dim=1)
        tensor_obs1 = batch['obs1'].split([self.nb_pos, self.nb_laser], dim=1)
        # Prepare for the target q batch
        with torch.no_grad():
            next_q_values = self.critic_target([
                tensor_obs1[0],
                tensor_obs1[1],
                self.actor_target(tensor_obs1),
            ])
            target_q_batch = batch['rewards'] + self.discount * (
                1 - batch['terminals1']) * next_q_values
            q_batch = self.critic_target(
                [tensor_obs0[0], tensor_obs0[1], batch['actions']])
            value_loss = nn.functional.mse_loss(q_batch, target_q_batch)
        return value_loss.item()

    def select_action(self, s_t, apply_noise):
        s_t = torch.tensor(np.vstack(s_t),
                           dtype=torch.float32,
                           requires_grad=False)
        if self.with_cuda:
            s_t = s_t.cuda()
        s_t = s_t.split([self.nb_pos, self.nb_laser], dim=1)

        with torch.no_grad():
            action = self.actor(s_t).cpu().numpy()
        action = np.clip(action, -1., 1.)
        return action

    def load_weights(self, output):
        self.actor = torch.load('{}/actor.pkl'.format(output))
        self.critic = torch.load('{}/critic.pkl'.format(output))

    def save_model(self, output):
        torch.save(self.actor, '{}/actor.pkl'.format(output))
        torch.save(self.critic, '{}/critic.pkl'.format(output))

    def get_actor_buffer(self):
        actor_buffer = io.BytesIO()
        torch.save(self.actor, actor_buffer)
        return actor_buffer
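A single training iteration built on top of these methods could be wired up as in the sketch below; reusing the sampled batch for both updates via pass_batch=True is the only coupling the class itself suggests, while the surrounding schedule (when to call apply_lr_decay, how often to soft-update the targets) is an assumption.

def ddpg_train_step(agent):
    # Sample once, update the critic, reuse the same batch for the actor,
    # then softly track both target networks.
    value_loss, batch = agent.update_critic(pass_batch=True)
    policy_loss, _ = agent.update_actor(batch=batch, pass_batch=True)
    agent.update_critic_target(soft_update=True)
    agent.update_actor_target(soft_update=True)
    return value_loss, policy_loss

# e.g. once enough transitions have been stored:
# v_loss, p_loss = ddpg_train_step(agent)
# agent.apply_lr_decay()  # typically on a coarser schedule than every step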
Example #6
class DDPG(object):
    def __init__(self, nb_actions, nb_states, layer_norm, obs_norm, actor_lr,
                 critic_lr, SGLD_coef, noise_decay, lr_decay, batch_size,
                 discount, tau, pool_size, parameters_noise, action_noise,
                 SGLD_mode, pool_mode, with_cuda):

        self.nb_actions = nb_actions
        self.nb_states = nb_states
        self.layer_norm = layer_norm
        self.parameters_noise = parameters_noise
        self.action_noise = action_noise
        self.batch_size = batch_size
        self.discount = discount
        self.tau = tau
        self.pool_size = pool_size
        self.critic_lr = critic_lr
        self.actor_lr = actor_lr
        self.SGLD_coef = SGLD_coef
        self.noise_coef = 1
        self.noise_decay = noise_decay
        self.lr_coef = 1
        self.lr_decay = lr_decay
        self.SGLD_mode = SGLD_mode
        self.pool_mode = pool_mode
        self.with_cuda = with_cuda

        self.actor = Actor(nb_states=self.nb_states,
                           nb_actions=self.nb_actions,
                           layer_norm=self.layer_norm)
        self.actor_target = Actor(nb_states=self.nb_states,
                                  nb_actions=self.nb_actions,
                                  layer_norm=self.layer_norm)
        self.critic = Critic(nb_states, nb_actions, layer_norm=self.layer_norm)
        self.critic_target = Critic(nb_states,
                                    nb_actions,
                                    layer_norm=self.layer_norm)
        if self.with_cuda:
            self.actor.cuda()
            self.actor_target.cuda()
            self.critic.cuda()
            self.critic_target.cuda()

        hard_update(self.actor_target, self.actor)
        hard_update(self.critic_target, self.critic)

        #self.actor_optim  = SGD(self.actor.parameters(), lr=actor_lr, momentum=0.9,weight_decay  = 0.01)
        self.actor_optim = Adam(self.actor.parameters(), lr=actor_lr)
        #self.critic_optim  = SGD(self.critic.parameters(), lr=critic_lr, momentum=0.9,weight_decay  = 0.01)
        self.critic_optim = Adam(self.critic.parameters(), lr=critic_lr)

        self.memory = Memory(int(1e6), (nb_actions, ), (nb_states, ),
                             with_cuda)
        self.obs_norm = obs_norm
        if self.obs_norm:
            self.run_obs_norm = Run_Normalizer((nb_states, ), self.with_cuda)
        self.is_training = True

        if self.pool_size > 0:
            self.agent_pool = Agent_pool(self.pool_size)

        self.s_t = None
        self.a_t = None

    def store_transition(self, s_t, a_t, r_t, s_t1, done_t):
        if self.is_training:
            self.memory.append(s_t, a_t, r_t, s_t1, done_t)
        if self.obs_norm:
            self.run_obs_norm.observe(s_t)
        self.s_t = s_t1

    def update(self):
        # Sample batch
        batch = self.memory.sample(self.batch_size)

        tensor_obs0 = batch['obs0']
        tensor_obs1 = batch['obs1']
        if self.obs_norm:
            tensor_obs0 = self.run_obs_norm.normalize(tensor_obs0)
            tensor_obs1 = self.run_obs_norm.normalize(tensor_obs1)

        # Prepare for the target q batch
        with torch.no_grad():
            next_q_values = self.critic_target([
                tensor_obs1,
                self.actor_target(tensor_obs1),
            ])

            target_q_batch = batch['rewards'] + \
                self.discount*(1-batch['terminals1'])*next_q_values
        # Critic update
        self.critic.zero_grad()

        q_batch = self.critic([tensor_obs0, batch['actions']])
        value_loss = nn.functional.mse_loss(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()
        if (self.SGLD_mode == 2) or (self.SGLD_mode == 3):
            SGLD_update(self.critic, self.critic_lr * self.lr_coef,
                        self.SGLD_coef)
        # Actor update
        self.actor.zero_grad()

        policy_loss = -self.critic([tensor_obs0, self.actor(tensor_obs0)])

        policy_loss = policy_loss.mean()
        policy_loss.backward()
        self.actor_optim.step()
        if (self.SGLD_mode == 1) or (self.SGLD_mode == 3):
            SGLD_update(self.actor, self.actor_lr * self.lr_coef,
                        self.SGLD_coef)

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)
        return value_loss.item(), policy_loss.item()

    def apply_lr_decay(self):
        # note: only the critic optimizer's learning rate is decayed here;
        # the actor keeps its initial learning rate
        if self.lr_decay > 0:
            self.lr_coef = self.lr_decay * self.lr_coef / (self.lr_coef +
                                                           self.lr_decay)
            self.critic_optim.param_groups[0][
                'lr'] = self.critic_lr * self.lr_coef

    def apply_noise_decay(self):
        if self.noise_decay > 0:
            self.noise_coef = self.noise_decay * self.noise_coef / (
                self.noise_coef + self.noise_decay)

    def select_action(self, random=False, s_t=None, if_noise=True):
        if random:
            action = np.random.uniform(-1., 1., self.nb_actions)
        else:
            if s_t is None: raise RuntimeError()
            s_t = torch.tensor(s_t, dtype=torch.float32, requires_grad=False)
            if self.with_cuda:
                s_t = s_t.cuda()
            if self.obs_norm:
                s_t = self.run_obs_norm.normalize(s_t)
            with torch.no_grad():
                action = self.actor(s_t).cpu().numpy().squeeze(0)

            if if_noise and (self.action_noise is not None):
                action += self.is_training * max(self.noise_coef,
                                                 0) * self.action_noise()
        action = np.clip(action, -1., 1.)
        self.a_t = action
        return action

    def load_weights(self, output):
        self.actor = torch.load('{}/actor.pkl'.format(output))
        self.critic = torch.load('{}/critic.pkl'.format(output))
        if self.obs_norm:
            self.run_obs_norm = torch.load('{}/obs_norm.pkl'.format(output))

    def save_model(self, output):
        torch.save(self.actor, '{}/actor.pkl'.format(output))
        torch.save(self.critic, '{}/critic.pkl'.format(output))
        if self.obs_norm:
            torch.save(self.run_obs_norm, '{}/obs_norm.pkl'.format(output))

    def get_actor_buffer(self):
        buffer = io.BytesIO()
        torch.save(self.actor, buffer)
        return buffer

    def get_norm_param(self):
        return self.run_obs_norm.mean.cpu(), self.run_obs_norm.var.cpu()

    #TODO recode agent pool
    def append_actor(self):
        self.agent_pool.actor_append(self.actor.state_dict(),
                                     self.actor_target.state_dict())

    def pick_actor(self):
        actor, actor_target = self.agent_pool.get_actor()
        self.actor.load_state_dict(actor)
        self.actor_target.load_state_dict(actor_target)

    def append_critic(self):
        self.agent_pool.critic_append(self.critic.state_dict(),
                                      self.critic_target.state_dict())

    def pick_critic(self):
        critic, critic_target = self.agent_pool.get_critic()
        self.critic.load_state_dict(critic)
        self.critic_target.load_state_dict(critic_target)

    def append_actor_critic(self):
        self.agent_pool.actor_append(self.actor.state_dict(),
                                     self.actor_target.state_dict())
        self.agent_pool.critic_append(self.critic.state_dict(),
                                      self.critic_target.state_dict())

    def pick_actor_critic(self):
        actor, actor_target, critic, critic_target = self.agent_pool.get_agent(
        )
        self.actor.load_state_dict(actor)
        self.actor_target.load_state_dict(actor_target)
        self.critic.load_state_dict(critic)
        self.critic_target.load_state_dict(critic_target)

    def append_agent(self):
        if self.pool_mode == 1:
            self.append_actor()
        elif self.pool_mode == 2:
            self.append_critic()
        elif self.pool_mode == 3:
            self.append_actor_critic()

    def pick_agent(self):
        if self.pool_mode == 1:
            self.pick_actor()
        elif self.pool_mode == 2:
            self.pick_critic()
        elif self.pool_mode == 3:
            self.pick_actor_critic()

    def reset(self, obs):
        self.s_t = obs
        if self.action_noise is not None:
            self.action_noise.reset()
Example #7
class Agent:
    """
    class implements agent
    """
    def __init__(self, state_size, action_size, args):
        self.args = args
        with open(
                os.path.dirname(
                    os.path.abspath(inspect.getfile(inspect.currentframe()))) +
                '/agent_args.json') as f:
            data = json.load(f)
        self.initial_epsilon = float(
            data[self.args.environment]["initial_epsilon"])
        self.final_epsilon = float(
            data[self.args.environment]["final_epsilon"])
        self.current_epsilon = self.initial_epsilon
        self.epsilon_decay = float(
            data[self.args.environment]["epsilon_decay"])
        self.gamma = float(data[self.args.environment]["gamma"])
        self.minibatch_size = int(
            data[self.args.environment]["minibatch_size"])
        self.learning_rate = float(
            data[self.args.environment]["learning_rate"])
        self.fraction_update = float(
            data[self.args.environment]["fraction_update"])
        self.loss = data[self.args.environment]["loss"]

        self.memory_type = self.args.memory
        self.memory_size = int(data[self.args.environment]["memory_size"])
        if self.memory_type == "basic":
            self.memory = deque(maxlen=self.memory_size)
        else:
            self.memory = Memory(self.memory_size)

        self.action_size = action_size
        self.state_size = state_size
        if self.args.mdl_blueprint and not self.args.dont_save:
            self.mdl_blueprint = True
        else:
            self.mdl_blueprint = False
        network = Network(state_size, action_size, self.learning_rate,
                          self.loss, [True, self.mdl_blueprint])

        self.net_units = None
        if data[self.args.environment]["net_units"] != "None":
            self.net_units = [
                int(i) for i in data[self.args.environment]["net_units"]
            ]
        self.model_type = self.args.network
        if self.model_type == "2layer_bsc_mdl":
            self.model_net = network.make_2layer_mdl(self.net_units)
            self.target_net = network.make_2layer_mdl(self.net_units)
        elif self.model_type == "2layer_duel_mdl":
            self.model_net = network.make_2layer_duel_mdl(self.net_units)
            self.target_net = network.make_2layer_duel_mdl(self.net_units)
        elif self.model_type == "bsc_img_mdl":
            self.model_net = network.make_bsc_img_mdl()
            self.target_net = network.make_bsc_img_mdl()
        elif self.model_type == "duel_img_model":
            self.model_net = network.make_duel_img_mdl()
            self.target_net = network.make_duel_img_mdl()
        elif self.model_type == "1layer_ram_mdl":
            self.model_net = network.make_1layer_mdl(self.net_units)
            self.target_net = network.make_1layer_mdl(self.net_units)

        self.update_target_net()

        self.algorithm = self.args.algorithm
        self.algorithms = {
            "DQN": self.train_dqn,
            "DQN+TN": self.train_target_dqn,
            "DDQN": self.train_ddqn,
        }

    def update_target_net(self):
        """
        method updates target network
        """
        self.target_net.set_weights(self.model_net.get_weights())
        print("[Target network was updated.]")

    def update_target_net_partially(self):
        """
        method updates target network by parts
        """
        weights_model = self.model_net.get_weights()
        weights_target = self.target_net.get_weights()

        for i in range(len(weights_target)):
            weights_target[i] = weights_model[
                i] * self.fraction_update + weights_target[i] * (
                    1 - self.fraction_update)

        self.target_net.set_weights(weights_target)
        print("[Target network was updated by parts.]")

    def get_error(self, state, action, reward, next_state, done):
        """
        method returns difference between Q-value from primary and target network
        """
        q_value = self.model_net.predict(np.array([state]))
        ns_model_pred = self.model_net.predict(np.array([next_state]))
        ns_target_pred = self.target_net.predict(np.array([next_state]))

        obs_error = q_value[0][action]

        if done == 1:
            q_value[0][action] = reward
        else:
            q_value[0][action] = reward + self.gamma * ns_target_pred[0][
                np.argmax(ns_model_pred)]

        obs_error = abs(obs_error - q_value[0][action])

        return obs_error

    def remember(self, state, action, reward, next_state, done, rand_agent):
        """
        method saves observation (experience) to experience replay memory
        """
        if self.memory_type == "basic":
            self.memory.append((state, action, reward, next_state, done))
        else:
            if rand_agent:
                obs_error = abs(reward)
            else:
                obs_error = self.get_error(state, action, reward, next_state,
                                           done)

            self.memory.add_observation(
                (state, action, reward, next_state, done), obs_error)

    def clear_memory(self):
        """
        method clears replay memory
        """
        self.memory.clear()

    def decrease_epsilon(self):
        """
        method decreases epsilon
        """
        if self.current_epsilon > self.final_epsilon:
            if (self.current_epsilon -
                    self.epsilon_decay) > self.final_epsilon:
                self.current_epsilon = self.current_epsilon - self.epsilon_decay
            else:
                self.current_epsilon = self.final_epsilon

    def get_action(self, task, state, non_normalized_state, epsilon):
        """
        method returns action to take
        """
        if not epsilon:
            q_value = self.model_net.predict(np.array([state]))
        else:
            if np.random.rand() <= self.current_epsilon:
                if task.name == "2048-v0":
                    possible_actions = possible_moves(non_normalized_state)
                    while True:
                        rand_action = np.random.randint(0,
                                                        self.action_size,
                                                        size=1)[0]
                        if possible_actions[rand_action] == 1:
                            return rand_action
                else:
                    return np.random.randint(0, self.action_size, size=1)[0]
            else:
                q_value = self.model_net.predict(np.array([state]))

        if task.name == "2048-v0":
            possible_actions = possible_moves(non_normalized_state)
            while True:
                chosen_action = np.argmax(q_value)
                if possible_actions[chosen_action] == 1:
                    return chosen_action
                else:
                    q_value[0][chosen_action] = -100

        return np.argmax(q_value)

    def get_minibatch(self):
        """
        method returns a minibatch from the different memory types
        """
        if self.memory_type == "basic":
            minibatch = random.sample(list(self.memory), self.minibatch_size)
            state = np.array([i[0] for i in minibatch])
            action = [i[1] for i in minibatch]
            reward = [i[2] for i in minibatch]
            next_state = np.array([i[3] for i in minibatch])
            done = [i[4] for i in minibatch]
        else:
            minibatch = self.memory.sample(self.minibatch_size)
            # keep the raw sample so its priorities can be updated after training
            self.last_minibatch = minibatch

            state = np.array([i[1][0] for i in minibatch])
            action = [i[1][1] for i in minibatch]
            reward = [i[1][2] for i in minibatch]
            next_state = np.array([i[1][3] for i in minibatch])
            done = [i[1][4] for i in minibatch]

        return state, action, reward, next_state, done

    def train(self):
        """
        method trains agent with selected algorithm
        """
        self.algorithms[self.algorithm]()

    def train_dqn(self):
        """
        method trains agent using DQN
        """
        if self.memory_type == "basic":
            if len(self.memory) >= self.minibatch_size:
                state, action, reward, next_state, done = self.get_minibatch()
            else:
                return
        else:
            if self.memory.length >= self.minibatch_size:
                state, action, reward, next_state, done = self.get_minibatch()
            else:
                return

        errors = np.zeros(self.minibatch_size)

        possible_actions_curr = []
        if self.args.environment == "2048-v0":
            for i, item in enumerate(state):
                possible_actions_curr.append(possible_moves(item))

            state = state / 16384.0 - 0.5
            next_state = next_state / 16384.0 - 0.5

        q_value = self.model_net.predict(np.array(state))
        ns_model_pred = self.model_net.predict(np.array(next_state))

        for i in range(0, self.minibatch_size):
            errors[i] = q_value[i][action[i]]

            if done[i] == 1:
                q_value[i][action[i]] = reward[i]
            else:
                q_value[i][action[i]] = reward[i] + self.gamma * np.max(
                    ns_model_pred[i])

            errors[i] = abs(errors[i] - q_value[i][action[i]])

        for i, item in enumerate(possible_actions_curr):
            for e, elem in enumerate(item):
                if elem == 0:
                    q_value[i][e] = -1

        self.model_net.fit(state, q_value, epochs=1, verbose=0)
        if self.memory_type == "dueling":
            self.memory.update_minibatch(minibatch, errors)

    def train_target_dqn(self):
        """
        method trains agent using DQN with target network
        """
        if self.memory_type == "basic":
            if len(self.memory) >= self.minibatch_size:
                state, action, reward, next_state, done = self.get_minibatch()
            else:
                return
        else:
            if self.memory.length >= self.minibatch_size:
                state, action, reward, next_state, done = self.get_minibatch()
            else:
                return

        errors = np.zeros(self.minibatch_size)

        possible_actions_curr = []
        if self.args.environment == "2048-v0":
            for i, item in enumerate(state):
                possible_actions_curr.append(possible_moves(item))

            state = state / 16384.0 - 0.5
            next_state = next_state / 16384.0 - 0.5

        q_value = self.model_net.predict(np.array(state))
        ns_target_pred = self.target_net.predict(np.array(next_state))

        for i in range(0, self.minibatch_size):
            errors[i] = q_value[i][action[i]]

            if done[i] == 1:
                q_value[i][action[i]] = reward[i]
            else:
                q_value[i][action[i]] = reward[i] + self.gamma * np.max(
                    ns_target_pred[i])

            errors[i] = abs(errors[i] - q_value[i][action[i]])

        for i, item in enumerate(possible_actions_curr):
            for e, elem in enumerate(item):
                if elem == 0:
                    q_value[i][e] = -1

        self.model_net.fit(state, q_value, epochs=1, verbose=0)
        if self.memory_type == "dueling":
            self.memory.update_minibatch(minibatch, errors)

    def train_ddqn(self):
        """
        method trains agent using DDQN
        """
        if self.memory_type == "basic":
            if len(self.memory) >= self.minibatch_size:
                state, action, reward, next_state, done = self.get_minibatch()
            else:
                return
        else:
            if self.memory.length >= self.minibatch_size:
                state, action, reward, next_state, done = self.get_minibatch()
            else:
                return

        errors = np.zeros(self.minibatch_size)

        possible_actions_curr = []
        if self.args.environment == "2048-v0":
            for i, item in enumerate(state):
                possible_actions_curr.append(possible_moves(item))

            state = state / 16384.0 - 0.5
            next_state = next_state / 16384.0 - 0.5

        q_value = self.model_net.predict(state)
        ns_model_pred = self.model_net.predict(next_state)
        ns_target_pred = self.target_net.predict(next_state)

        for i in range(0, self.minibatch_size):
            errors[i] = q_value[i][action[i]]

            if done[i] == 1:
                q_value[i][action[i]] = reward[i]
            else:
                q_value[i][action[i]] = reward[i] + self.gamma * \
                    ns_target_pred[i][np.argmax(ns_model_pred[i])]

            errors[i] = abs(errors[i] - q_value[i][action[i]])

        for i, item in enumerate(possible_actions_curr):
            for e, elem in enumerate(item):
                if elem == 0:
                    q_value[i][e] = -1

        self.model_net.fit(state, q_value, epochs=1, verbose=0)
        if self.memory_type == "dueling":
            self.memory.update_minibatch(minibatch, errors)

    def load_model_weights(self, name):
        """
        method loads weights to primary neural network
        """
        self.model_net.load_weights(name)
        print("[Model has been loaded from \"{}\".]".format(name))

    def save_model_weights(self, name):
        """
        method saves weights of primary neural network
        """
        self.model_net.save_weights("./model-{}".format(name))
        print("[Model was saved to \"./model-{}\".]".format(name))

    def load_target_weights(self, name):
        """
        method loads weights to target neural network
        """
        self.target_net.load_weights(name)
        print("[Target model has been loaded from \"{}\".]".format(name))

    def save_target_weights(self, name):
        """
        method saves weights of target neural network
        """
        self.target_net.save_weights("./target-{}".format(name))
        print("[Target model was saved to \"./target-{}\".]".format(name))
Example #8
class QLearning(Learner):
    def __init__(self,
                 n_actions,
                 opt=Adam,
                 opt_args={},
                 loss=MSELoss,
                 gamma=0.99,
                 do_target=True,
                 memory_len=10000,
                 name=None,
                 memory_shape=(4, 84, 84),
                 initial_eps=0.1,
                 final_eps=0.01,
                 decay_steps=int(1e6),
                 memory_dtype=torch.uint8):
        self.n_actions = n_actions
        self._memory = Memory(memory_len, memory_shape, dtype=memory_dtype)
        self.Q = Sequential(Conv2d(4, 32, kernel_size=8, stride=4),
                            LeakyReLU(), Conv2d(32,
                                                64,
                                                kernel_size=4,
                                                stride=2), LeakyReLU(),
                            Conv2d(64, 64, kernel_size=3,
                                   stride=1), LeakyReLU(), Flatten(),
                            Linear(3136, 512), LeakyReLU(),
                            Linear(512, self.n_actions))
        self._name = name

        self.gamma = gamma

        self.opt = opt(self.Q.parameters(), **opt_args)
        self._base_loss_fn = MSELoss()
        self._steps = 0

        self.eps = initial_eps
        #self.decay = (final_eps / initial_eps) ** (1/decay_steps)

        # Linear Decay
        self.decay = (initial_eps - final_eps) / decay_steps

    def learn(self, batch_size=100, n_samples=100):
        if len(self._memory) < n_samples:
            return 'n/a'

        self.Q.train()
        X, y = self._build_dataset(n_samples)
        y_pred = self.Q(X)
        loss = self._base_loss_fn(y, y_pred)

        self.opt.zero_grad()
        loss.backward()
        clip_grad_value_(self.Q.parameters(), 1)
        self.opt.step()

        self.Q.eval()
        return loss.item()

    def _build_dataset(self, n):
        with torch.no_grad():
            s_s, a_s, r_s, sp_s, done_mask = self._memory.sample(n)

            vhat_sp_s = torch.max(self.Q(sp_s.float()), dim=1).values
            vhat_sp_s[done_mask] = 0

            targets = self.Q(s_s.float())

            for idx, t in enumerate(targets):
                t[int(
                    a_s[idx].byte())] = r_s[idx] + self.gamma * vhat_sp_s[idx]

            X = s_s.float()
            y = targets
        return X, y

    def handle_transition(self, s, a, r, sp, done):
        s = self._convert_to_torch(s)
        sp = self._convert_to_torch(sp)

        self._memory.append(
            (s, torch.from_numpy(np.array([a]))[0], r, sp, done))

        if (self._steps % 4) == 0:
            self.learn(n_samples=1024)

        self._steps += 1

    def get_action_vals(self, s):
        s = self._convert_to_torch(s)

        return self.Q(s[None, :])

    def exploration_strategy(self, s):
        s = self._convert_to_torch(s)  # match deterministic_strategy: convert before feeding Q
        #self.eps *= self.decay
        self.eps -= self.decay
        if np.random.random() > self.eps:
            ps = np.zeros(self.n_actions)
            best_action = torch.argmax(self.Q(s[None, :]))
            try:
                ps[best_action] = 1.
            except:
                print(self._name)
                exit()
        else:
            ps = np.full(self.n_actions, 1 / self.n_actions)

        return ps

    def deterministic_strategy(self, s):
        s = self._convert_to_torch(s)

        eps = 0.05
        if np.random.random() > eps:
            ps = np.zeros(self.n_actions)
            best_action = torch.argmax(self.Q(s[None, :])).detach().numpy()
            ps[best_action] = 1.
        else:
            ps = np.full(self.n_actions, 1 / self.n_actions)

        return ps
class Agent(object):
    def __init__(self,
                 observation_space_dims,
                 action_space,
                 discount_factor=.96,
                 model_path='./model'):

        self.discount_factor = discount_factor
        self.model_path = model_path
        self.global_step = 0
        self.history = History(log_path=model_path)
        self.max_reward = 1000
        self.lock = th.Lock()
        self.lock_swap = th.Lock()

        self.action_shape = action_space.shape  #(19,)
        self.observation_shape = (observation_space_dims, )  #(321,)
        self.inputdims = observation_space_dims
        self.memory = None
        self.block_training = False

        print("observation shape:", self.observation_shape)
        print("action shape: ", self.action_shape)

        self.is_continuous = isinstance(action_space, gym.spaces.Box)
        if self.is_continuous:
            low = action_space.low
            high = action_space.high
            num_of_actions = action_space.shape[0]

            self.action_bias = high / 2. + low / 2.
            self.action_multiplier = high - self.action_bias

            def clamp_action(actions):
                return np.clip(actions,
                               a_max=action_space.high,
                               a_min=action_space.low)

            self.clamp_action = clamp_action
        else:
            # not supported
            raise RuntimeError(
                'This version of DDPG only supports continuous action space')

        self.outputdims = num_of_actions

        ids, ods = self.inputdims, self.outputdims
        #print('inputs:{}, outputs:{}'.format(ids, ods))

        # start TF
        #tf.reset_default_graph()
        self.tf_graph = tf.Graph()
        self.sess = tf.Session(graph=self.tf_graph)

        # setup model
        with self.tf_graph.as_default():
            self.nr_networks = hyper.nr_agents
            self.actor = []
            self.critic = []
            self.actor_target = []
            self.critic_target = []
            for i in range(self.nr_networks):
                self.actor.append(
                    self.create_actor_network(ids, ods, 'actor_o' + str(i)))
                self.critic.append(
                    self.create_critic_network(ids, ods, 'critic_o' + str(i)))
                self.actor_target.append(
                    self.create_actor_network(ids, ods, 'actor_t' + str(i)))
                self.critic_target.append(
                    self.create_critic_network(ids, ods, 'critic_t' + str(i)))

            # setup tf actions
            self.train, self.predict, self.sync_target, self.evaluate, self.swap_actors = self.train_step_gen(
            )

            # setup model saving
            self.saver = tf.train.Saver(max_to_keep=10000)

            # init tf
            self.sess.run(tf.global_variables_initializer())
            # sync model => model_target (on first run)
            for i in range(self.nr_networks):
                self.sync_target(i)

    def setup_memory(self):
        if self.memory is None:
            print("Creating memory buffer, hold on...")
            limit = hyper.memory_size
            self.memory = Memory(limit=limit,
                                 action_shape=self.action_shape,
                                 observation_shape=self.observation_shape)

    def create_actor_network(self, num_inputs, num_outputs, scope):
        def actor_model(state):

            with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):

                x = tf.layers.dense(state,
                                    512,
                                    kernel_initializer=w_init(3.0, num_inputs),
                                    name='a1',
                                    reuse=tf.AUTO_REUSE)
                x = tf.nn.leaky_relu(x, alpha=0.35)
                x = tf.layers.dense(x,
                                    256,
                                    kernel_initializer=w_init(3.0, 512),
                                    name='a2',
                                    reuse=tf.AUTO_REUSE)
                x = tf.nn.leaky_relu(x, alpha=0.35)
                x = tf.layers.dense(x,
                                    256,
                                    kernel_initializer=w_init(3.0, 256),
                                    name='a3',
                                    reuse=tf.AUTO_REUSE)
                x = tf.contrib.layers.layer_norm(x, center=True, scale=True)
                x = tf.nn.leaky_relu(x, alpha=0.35)

                x = tf.layers.dense(x,
                                    256,
                                    kernel_initializer=w_init(2.0, 256),
                                    name='a4',
                                    reuse=tf.AUTO_REUSE)
                x = tf.nn.relu(x)
                x = tf.layers.dense(x,
                                    256,
                                    kernel_initializer=w_init(2.0, 256),
                                    name='a5',
                                    reuse=tf.AUTO_REUSE)
                x = tf.nn.relu(x)
                x = tf.layers.dense(x,
                                    256,
                                    kernel_initializer=w_init(2.0, 256),
                                    name='a6',
                                    reuse=tf.AUTO_REUSE)
                x = tf.nn.relu(x)

                x = tf.layers.dense(x,
                                    num_outputs,
                                    kernel_initializer=w_init(0.5, 256),
                                    name='a9',
                                    reuse=tf.AUTO_REUSE)

                x = tf.nn.tanh(x) * self.action_multiplier + self.action_bias
                return x

        return actor_model

    def create_critic_network(self, num_inputs, num_outputs, scope):
        def critic_model(input):
            state = input[0]
            action = input[1]

            with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
                x = tf.layers.dense(state,
                                    256,
                                    kernel_initializer=w_init(3.0, num_inputs),
                                    name='c1s',
                                    reuse=tf.AUTO_REUSE)
                x = tf.nn.leaky_relu(x, alpha=0.35)
                x = tf.layers.dense(x,
                                    256,
                                    kernel_initializer=w_init(3.0, 256),
                                    name='c2s',
                                    reuse=tf.AUTO_REUSE)
                x = tf.nn.leaky_relu(x, alpha=0.35)

                y = tf.layers.dense(action,
                                    256,
                                    kernel_initializer=w_init(
                                        3.0, num_outputs),
                                    name='c1a',
                                    reuse=tf.AUTO_REUSE)
                y = tf.nn.leaky_relu(y, alpha=0.35)

                x = tf.concat([x, y], axis=1)

                x = tf.layers.dense(x,
                                    256,
                                    kernel_initializer=w_init(3.0, 256 + 256),
                                    name='c2',
                                    reuse=tf.AUTO_REUSE)
                x = tf.contrib.layers.layer_norm(x, center=True, scale=True)
                x = tf.nn.leaky_relu(x, alpha=0.35)

                x = tf.layers.dense(x,
                                    256,
                                    kernel_initializer=w_init(3.0, 256),
                                    name='c3',
                                    reuse=tf.AUTO_REUSE)
                x = tf.nn.leaky_relu(x, alpha=0.35)
                x = tf.layers.dense(x,
                                    256,
                                    kernel_initializer=w_init(3.0, 256),
                                    name='c4',
                                    reuse=tf.AUTO_REUSE)
                x = tf.nn.leaky_relu(x, alpha=0.35)

                x = tf.layers.dense(x,
                                    1,
                                    kernel_initializer=w_init(1.0, 256),
                                    name='c9',
                                    reuse=tf.AUTO_REUSE)

                return x

        return critic_model

    def train_step_gen(self):
        s1 = tf.placeholder(tf.float32, shape=[None, self.inputdims])
        a1 = tf.placeholder(tf.float32, shape=[None, self.outputdims])
        r1 = tf.placeholder(tf.float32, shape=[None, 1])
        isdone = tf.placeholder(tf.float32, shape=[None, 1])
        s2 = tf.placeholder(tf.float32, shape=[None, self.inputdims])

        tau = tf.Variable(1e-3, name='tau', trainable=False)

        self.train_ops = []
        self.predict_ops = []
        self.sync_target_ops = []
        self.evaluate_ops = []
        self.actor_vars = []

        for i in range(self.nr_networks):
            scope = 'ac_' + str(i)
            with tf.variable_scope(scope):
                # 1. update the critic
                a2 = self.actor_target[i](s2)
                q2 = self.critic_target[i]([s2, a2])
                q1_target = r1 + (1 - isdone) * (self.discount_factor +
                                                 i * hyper.discount_step) * q2
                q1_predict = self.critic[i]([s1, a1])
                critic_loss = tf.reduce_mean((q1_target - q1_predict)**2)

                # 2. update the actor
                a1_predict = self.actor[i](s1)
                q1_predict2 = self.critic[i]([s1, a1_predict])
                actor_loss = tf.reduce_mean(-q1_predict2)

                # 3. shift the weights (aka target network)
                aw = tf.trainable_variables(scope=scope + '/actor_o')
                cw = tf.trainable_variables(scope=scope + '/critic_o')
                atw = tf.trainable_variables(scope=scope + '/actor_t')
                ctw = tf.trainable_variables(scope=scope + '/critic_t')
                self.actor_vars.append([aw, atw])
                one_m_tau = 1 - tau
                shift1 = [
                    tf.assign(atw[k], aw[k] * tau + atw[k] * one_m_tau)
                    for k, _ in enumerate(aw)
                ]
                shift2 = [
                    tf.assign(ctw[k], cw[k] * tau + ctw[k] * one_m_tau)
                    for k, _ in enumerate(cw)
                ]

                # 4. inference
                a_infer = self.actor[i](s1)
                q_infer = self.critic[i]([s1, a_infer])

                # optimizer
                with tf.variable_scope('opt_a'):
                    opt_actor = tf.train.AdamOptimizer(
                        hyper.lr_actor)  #, name='Adam' default
                    astep = opt_actor.minimize(actor_loss, var_list=aw)
                with tf.variable_scope('opt_c'):
                    opt_critic = tf.train.AdamOptimizer(
                        hyper.lr_critic)  #, name='Adam'
                    cstep = opt_critic.minimize(critic_loss, var_list=cw)

                self.train_ops.append(
                    [critic_loss, actor_loss, cstep, astep, shift1, shift2])
                self.predict_ops.append([a_infer, q_infer])
                self.sync_target_ops.append([shift1, shift2])
                self.evaluate_ops.append([q1_predict])

        # setup ops for swapping actors
        self.copy_ops = []
        with tf.variable_scope('copy'):
            self.actor_backup = []
            self.actor_t_backup = []
            # Create variables of actor shape to hold a backup
            # I'm sure there's a better way to do this
            for _, av in enumerate(self.actor_vars[0][0]):
                self.actor_backup.append(tf.Variable(av))
            for _, av in enumerate(self.actor_vars[0][1]):
                self.actor_t_backup.append(tf.Variable(av))

            # copy the first one
            self.backup_cp = [
                tf.assign(self.actor_backup[k], self.actor_vars[0][0][k])
                for k, _ in enumerate(self.actor_vars[0][0])
            ]
            self.backup_cp_t = [
                tf.assign(self.actor_t_backup[k], self.actor_vars[0][1][k])
                for k, _ in enumerate(self.actor_vars[0][1])
            ]

            # copy actors to index-1
            for i in range(self.nr_networks - 1):
                cp = [
                    tf.assign(self.actor_vars[i][0][k],
                              self.actor_vars[i + 1][0][k])
                    for k, _ in enumerate(self.actor_vars[i][0])
                ]
                cp_t = [
                    tf.assign(self.actor_vars[i][1][k],
                              self.actor_vars[i + 1][1][k])
                    for k, _ in enumerate(self.actor_vars[i][1])
                ]
                self.copy_ops.append([cp, cp_t])

            # copy the backup to the last
            last_id = self.nr_networks - 1
            self.last_cp = [
                tf.assign(self.actor_vars[last_id][0][k], self.actor_backup[k])
                for k, _ in enumerate(self.actor_vars[last_id][0])
            ]
            self.last_cp_t = [
                tf.assign(self.actor_vars[last_id][1][k],
                          self.actor_t_backup[k])
                for k, _ in enumerate(self.actor_vars[last_id][1])
            ]

        def swap_actors():
            with self.lock_swap:
                with self.lock:
                    # could setup control_dependencies/groups, but this is not run very often
                    self.sess.run([self.backup_cp, self.backup_cp_t],
                                  feed_dict={})
                    for _, cp in enumerate(self.copy_ops):
                        self.sess.run(cp, feed_dict={})
                    self.sess.run([self.last_cp, self.last_cp_t], feed_dict={})

        def train(memory, i):
            [s1d, a1d, r1d, isdoned, s2d] = memory
            res = self.sess.run(self.train_ops[i],
                                feed_dict={
                                    s1: s1d,
                                    a1: a1d,
                                    r1: r1d,
                                    isdone: isdoned,
                                    s2: s2d,
                                    tau: hyper.tau
                                })
            return res

        def predict(state, i):
            res = self.sess.run(self.predict_ops[i], feed_dict={s1: state})
            return res

        def sync_target(i):
            self.sess.run(self.sync_target_ops[i], feed_dict={tau: 1.})

        def evaluate(state, action, i):
            [qv] = self.sess.run(self.evaluate_ops[i],
                                 feed_dict={
                                     s1: state,
                                     a1: action
                                 })
            return qv

        return train, predict, sync_target, evaluate, swap_actors

    def test_swap_actors(self):
        for i in range(self.nr_networks):
            print(self.sess.run(self.actor_vars[i][0][0][0][0], feed_dict={}))

    def get_max_action(self, observation):
        obs_b = np.reshape(observation, (1, len(observation)))

        # get actions
        all_actions = []
        for aci in range(self.nr_networks):
            [actions, _] = self.predict(obs_b, aci)
            # setup for batches, get first
            action = actions[0]
            all_actions.append(action)

        # create combinations
        for ai in range(self.nr_networks):
            for aj in range(self.nr_networks):
                if aj < ai:
                    a1 = all_actions[ai]
                    a2 = all_actions[aj]
                    avg_action = (a1 + a2) * 0.5
                    all_actions.append(avg_action)

        # and more combinations
        for ai in range(self.nr_networks):
            for aj in range(self.nr_networks):
                for ak in range(self.nr_networks):
                    if aj < ai and ak < aj:
                        a1 = all_actions[ai]
                        a2 = all_actions[aj]
                        a3 = all_actions[ak]
                        avg_action = (a1 + a2 + a3) * (1.0 / 3.0)
                        all_actions.append(avg_action)

        # for sanity
        for ai in range(len(all_actions)):
            all_actions[ai] = self.clamp_action(all_actions[ai])

        # make it a np array, so we can batch it
        all_actions = np.asarray(all_actions)
        # stack observation for batching
        all_obs = np.repeat(obs_b, len(all_actions), axis=0)
        # get qv from each critic and sum them
        for ci in range(self.nr_networks):
            if ci == 0:
                all_qv_b = self.evaluate(all_obs, all_actions, ci)
            else:
                all_qv_b += self.evaluate(all_obs, all_actions, ci)
        all_qv_b /= self.nr_networks

        # evaluate actions
        max_action = None
        max_qv = None
        for ai in range(len(all_qv_b)):
            qv_a = all_qv_b[ai]
            if max_qv is None or qv_a > max_qv:
                max_action = all_actions[ai]
                max_qv = qv_a

        return max_action

    def get_all_actions(self, observation):
        obs_b = np.reshape(observation, (1, len(observation)))
        # get actions
        all_actions = []
        for aci in range(self.nr_networks):
            [actions, _] = self.predict(obs_b, aci)
            # setup for batches, get first
            action = actions[0]
            all_actions.append(action)
        return all_actions

    def get_action_qs(self, observation, all_actions):
        obs_b = np.reshape(observation, (1, len(observation)))
        # make it a np array, so we can batch it
        all_actions = np.asarray(all_actions)
        # stack observation for batching
        all_obs = np.repeat(obs_b, len(all_actions), axis=0)
        # get qv from each critic and sum them
        for ci in range(self.nr_networks):
            if ci == 0:
                all_qv_b = self.evaluate(all_obs, all_actions, ci)
            else:
                all_qv_b += self.evaluate(all_obs, all_actions, ci)
        all_qv_b /= self.nr_networks
        return all_qv_b

    def get_action(self, observation, i):
        obs = np.reshape(observation, (1, len(observation)))
        [actions, q] = self.predict(obs, i)
        actions, q = actions[0], q[0]
        return actions

    def train_batch(self, i):
        if self.block_training:
            return
        # only if enough samples in memory
        if self.memory.size() > hyper.batch_size * 128:
            # sample a minibatch
            [s1, a1, r1, isdone,
             s2] = self.memory.sample_batch(hyper.batch_size)
            # print(s1.shape,a1.shape,r1.shape,isdone.shape,s2.shape)
            self.train([s1, a1, r1, isdone, s2], i)

    def append_memory(self, s1, a1, r1, isdone, s2):
        self.memory.append(s1, a1, r1, isdone, s2)

    def run_episode(self,
                    fenv,
                    max_steps=-1,
                    training=False,
                    render=False,
                    noise_level=0.,
                    ac_id=0):
        time_start = time.time()

        noise_source = None
        if noise_level > 0.0:
            noise_source = one_fsq_noise()
            # warm up noise source
            for _ in range(2000):
                noise_source.one((self.outputdims, ), noise_level)

        max_steps = max_steps if max_steps > 0 else 50000
        steps = 0
        total_reward = 0

        try:
            # this might be a remote env
            observation = np.array(fenv.reset())
        except Exception as e:
            print('Bad things during reset. Episode terminated.', e)
            traceback.print_exc()
            return

        while steps <= max_steps:
            steps += 1

            observation_before_action = observation  # s1

            exploration_noise = 0.0
            if noise_level > 0.0:
                exploration_noise = noise_source.one((self.outputdims, ),
                                                     noise_level)

            # get action
            action = None
            with self.lock_swap:
                if training:
                    action = self.get_action(observation_before_action, ac_id)
                else:
                    action = self.get_max_action(observation_before_action)

            # add noise to our actions, since our policy is deterministic
            if noise_level > 0.0:
                exploration_noise *= self.action_multiplier
                action += exploration_noise
            action = self.clamp_action(action)

            # step
            try:
                # can't send receive np arrays over pyro
                action_out = [float(action[i]) for i in range(len(action))]
                observation, reward, done, _info = fenv.step(action_out)
                observation = np.array(observation)
            except Exception as e:
                print('Bad things during step. Episode terminated.', e)
                traceback.print_exc()
                return

            # d1
            isdone = 1 if done else 0
            total_reward += reward

            # train
            if training:
                # The code works without this lock, but depending on training speed there is too much noise in the updates.
                # The model still trains either way, but is more stable with the lock held here.
                with self.lock:
                    self.append_memory(observation_before_action, action,
                                       reward, isdone,
                                       observation)  # s1,a1,r1,isdone,s2
                    for i in range(self.nr_networks):
                        self.train_batch(i)
            else:
                if render:
                    fenv.render()

            if done:
                break

        totaltime = time.time() - time_start

        if training:
            self.global_step += 1
            print(
                self.global_step,
                ': Episode done in {} steps in {:.2f} sec, {:.4f} sec/step, got reward: {:.2f}'
                .format(steps, totaltime, totaltime / steps, total_reward))
            self.history.append_train(total_reward, noise_level, steps)
        else:
            print(
                'Test done in {} steps in {:.2f} sec, {:.4f} sec/step, got reward: {:.2f}'
                .format(steps, totaltime, totaltime / steps, total_reward))
            self.history.append_test(total_reward, self.global_step, steps)
            if not render:
                # background test
                if total_reward > self.max_reward:
                    self.max_reward = total_reward
                    self.save_weights("max_model")
                    print("Saved new max model with score: ", total_reward)

        return total_reward

    def save_weights(self, model_name="model"):
        with self.lock_swap:
            with self.lock:
                self.saver.save(self.sess,
                                self.model_path + "/" + model_name,
                                global_step=self.global_step)
        print("Saved model at global episode:", self.global_step)

    def load_weights(self, model=""):
        print('Loading Model...')
        path = ""
        if model == "":
            checkpoint = tf.train.get_checkpoint_state(self.model_path)
            if checkpoint:
                path = checkpoint.model_checkpoint_path
        else:
            path = self.model_path + "/" + model
        try:
            self.saver.restore(self.sess, path)
            print("Loaded model from checkpoint:", path)
            return True
        except Exception as ex:
            print("No model checkpoint available!", ex)
            return False
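
For illustration only (not part of the original snippet): a minimal numpy sketch of the Polyak soft update that the tf.assign ops in train_step_gen build, i.e. target <- tau * online + (1 - tau) * target; with tau = 1.0 it reduces to the hard copy performed by sync_target. All names below are made up for this example.

import numpy as np

def soft_update(online_weights, target_weights, tau=1e-3):
    # blend each online weight into its target counterpart
    return [tau * w + (1.0 - tau) * t
            for w, t in zip(online_weights, target_weights)]

online = [np.ones((2, 2)), np.full(3, 0.5)]
target = [np.zeros((2, 2)), np.zeros(3)]
target = soft_update(online, target, tau=0.1)   # soft shift towards online
target = soft_update(online, target, tau=1.0)   # hard sync: target == online
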
class DDPG(object):
    def __init__(self,
                 memory_capacity,
                 batch_size,
                 prioritiy,
                 noise_target_action=False,
                 alpha=0.2,
                 use_n_step=False,
                 n_step_return=5,
                 is_training=True,
                 LAMBDA_BC=100,
                 policy_delay=1,
                 use_TD3=False,
                 experiment_name='none',
                 Q_value_range=(-250, 5)):
        self.batch_size = batch_size
        self.is_prioritiy = prioritiy
        self.n_step_return = n_step_return
        self.use_n_step = use_n_step
        self.LAMBDA_BC = LAMBDA_BC
        self.use_TD3 = use_TD3
        self.experiment_name = experiment_name
        self.Q_value_range = Q_value_range  # clip the Q-value range to avoid overestimation

        self.demo_percent = []  # fraction of demo transitions in each sample
        if prioritiy:
            from priority_memory import PrioritizedMemory
            self.memory = PrioritizedMemory(capacity=memory_capacity,
                                            alpha=alpha)
        else:
            from memory import Memory
            self.memory = Memory(limit=memory_capacity,
                                 action_shape=(4, ),
                                 observation_shape=(224, 224, 3),
                                 full_state_shape=(15, ))

        self.pointer = 0  # memory counter
        self.sess = tf.InteractiveSession()  # create a default session
        self.lambda_1_step = 0.5  # weight of the 1-step return loss
        self.lambda_n_step = 0.5  # weight of the n-step return loss
        self.beta = 0.6
        self.act_limit = np.array([0.05, 0.05, 0.05, np.radians(90)])

        # the actor is updated less often than the critic
        self.policy_delay_iterate = 0
        self.policy_delay = policy_delay

        # define placeholders
        self.observe_Input = tf.placeholder(tf.float32, [None, 15],
                                            name='observe_Input')
        self.observe_Input_ = tf.placeholder(tf.float32, [None, 15],
                                             name='observe_Input_')
        self.f_s = tf.placeholder(tf.float32, [None, 15],
                                  name='full_state_Input')
        self.f_s_ = tf.placeholder(tf.float32, [None, 15],
                                   name='full_state_Input_')
        self.R = tf.placeholder(tf.float32, [None, 1], 'r')
        self.terminals1 = tf.placeholder(tf.float32,
                                         shape=(None, 1),
                                         name='terminals1')
        self.ISWeights = tf.placeholder(tf.float32, [None, 1],
                                        name='IS_weights')
        self.n_step_steps = tf.placeholder(tf.float32,
                                           shape=(None, 1),
                                           name='n_step_reached')
        self.q_demo = tf.placeholder(tf.float32, [None, 1],
                                     name='Q_of_actions_from_memory')
        self.come_from_demo = tf.placeholder(tf.float32, [None, 1],
                                             name='Demo_index')
        self.action_memory = tf.placeholder(tf.float32, [None, 4],
                                            name='actions_from_memory')

        with tf.variable_scope('obs_rms'):
            self.obs_rms = RunningMeanStd(shape=(15, ))
        with tf.variable_scope('state_rms'):
            self.state_rms = RunningMeanStd(shape=(15, ))
        with tf.name_scope('obs_preprocess'):
            self.normalized_observe_Input = tf.clip_by_value(
                normalize(self.observe_Input, self.obs_rms), -10., 10.)
            self.normalized_observe_Input_ = tf.clip_by_value(
                normalize(self.observe_Input_, self.obs_rms), -10., 10.)
        with tf.name_scope('state_preprocess'):
            self.normalized_f_s0 = normalize(self.f_s, self.state_rms)
            self.normalized_f_s1 = normalize(self.f_s_, self.state_rms)

        with tf.variable_scope('Actor'):
            self.action = self.build_actor(self.normalized_observe_Input,
                                           scope='eval',
                                           trainable=True,
                                           is_training=is_training)
            self.action_ = self.build_actor(self.normalized_observe_Input_,
                                            scope='target',
                                            trainable=False,
                                            is_training=False)

            # Target policy smoothing, by adding clipped noise to target actions
            if noise_target_action:
                epsilon = tf.random_normal(tf.shape(self.action_),
                                           stddev=0.007)
                epsilon = tf.clip_by_value(epsilon, -0.01, 0.01)
                a2 = self.action_ + epsilon
                noised_action_ = tf.clip_by_value(a2, -self.act_limit,
                                                  self.act_limit)
            else:
                noised_action_ = self.action_

        with tf.variable_scope('Critic'):
            # clip all Q values to avoid overestimation.
            self.q_1 = tf.clip_by_value(
                self.build_critic(self.normalized_f_s0,
                                  self.action,
                                  scope='eval_1',
                                  trainable=True,
                                  is_training=is_training),
                self.Q_value_range[0], self.Q_value_range[1])

            q_1_ = self.build_critic(self.normalized_f_s1,
                                     noised_action_,
                                     scope='target_1',
                                     trainable=False,
                                     is_training=False)

            if self.use_TD3:
                q_2 = tf.clip_by_value(
                    self.build_critic(self.normalized_f_s0,
                                      self.action,
                                      scope='eval_2',
                                      trainable=True,
                                      is_training=is_training),
                    self.Q_value_range[0], self.Q_value_range[1])

                q_2_ = self.build_critic(self.normalized_f_s1,
                                         noised_action_,
                                         scope='target_2',
                                         trainable=False,
                                         is_training=False)

        # Collect network parameters. This makes them easier to manage.
        self.ae_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                           scope='Actor/eval')
        self.at_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                           scope='Actor/target')
        self.ce1_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                            scope='Critic/eval_1')
        self.ct1_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                            scope='Critic/target_1')

        if self.use_TD3:
            self.ce2_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                                scope='Critic/eval_2')
            self.ct2_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                                scope='Critic/target_2')

        with tf.variable_scope('Soft_Update'):
            self.soft_replace_a = [
                tf.assign(t, (1 - TAU) * t + TAU * e)
                for t, e in zip(self.at_params, self.ae_params)
            ]
            self.soft_replace_c = [
                tf.assign(t, (1 - TAU) * t + TAU * e)
                for t, e in zip(self.ct1_params, self.ce1_params)
            ]
            if self.use_TD3:
                self.soft_replace_c += [
                    tf.assign(t, (1 - TAU) * t + TAU * e)
                    for t, e in zip(self.ct2_params, self.ce2_params)
                ]

        # critic loss = one-step TD error + n-step TD error + L2 penalty on the online critic
        # TD3: there are four critics in total (two online, two target), so two critic losses are computed.
        with tf.variable_scope('Critic_Lose'):
            if self.use_TD3:
                min_q_ = tf.minimum(q_1_, q_2_)
            else:
                min_q_ = q_1_

            self.q_target = self.R + (1. - self.terminals1) * GAMMA * min_q_
            if self.use_n_step:
                self.n_step_target_q = self.R + (
                    1. - self.terminals1) * tf.pow(GAMMA,
                                                   self.n_step_steps) * min_q_
                cliped_n_step_target_q = tf.clip_by_value(
                    self.n_step_target_q, self.Q_value_range[0],
                    self.Q_value_range[1])

            cliped_q_target = tf.clip_by_value(self.q_target,
                                               self.Q_value_range[0],
                                               self.Q_value_range[1])

            self.td_error_1 = tf.abs(cliped_q_target - self.q_1)
            if self.use_TD3:
                self.td_error_2 = tf.abs(cliped_q_target - q_2)

            if self.use_n_step:
                self.nstep_td_error_1 = tf.abs(cliped_n_step_target_q -
                                               self.q_1)
                if self.use_TD3:
                    self.nstep_td_error_2 = tf.abs(cliped_n_step_target_q -
                                                   q_2)

            L2_regular_1 = tf.contrib.layers.apply_regularization(
                tf.contrib.layers.l2_regularizer(0.001),
                weights_list=self.ce1_params)
            if self.use_TD3:
                L2_regular_2 = tf.contrib.layers.apply_regularization(
                    tf.contrib.layers.l2_regularizer(0.001),
                    weights_list=self.ce2_params)

            one_step_losse_1 = tf.reduce_mean(
                tf.multiply(self.ISWeights, tf.square(
                    self.td_error_1))) * self.lambda_1_step
            if self.use_TD3:
                one_step_losse_2 = tf.reduce_mean(
                    tf.multiply(self.ISWeights, tf.square(
                        self.td_error_2))) * self.lambda_1_step

            if self.use_n_step:
                n_step_td_losses_1 = tf.reduce_mean(
                    tf.multiply(self.ISWeights, tf.square(
                        self.nstep_td_error_1))) * self.lambda_n_step
                c_loss_1 = one_step_losse_1 + n_step_td_losses_1 + L2_regular_1

                if self.use_TD3:
                    n_step_td_losses_2 = tf.reduce_mean(
                        tf.multiply(
                            self.ISWeights, tf.square(
                                self.nstep_td_error_2))) * self.lambda_n_step
                    c_loss_2 = one_step_losse_2 + n_step_td_losses_2 + L2_regular_2
            else:
                c_loss_1 = one_step_losse_1 + L2_regular_1

                if self.use_TD3:
                    c_loss_2 = one_step_losse_2 + L2_regular_2

        # actor loss: maximize q(s,a) while minimizing the behavior-cloning error.
        # (the cloning term only applies to demo transitions whose stored action gets a higher q_1(s,a) than the actor's own action)
        with tf.variable_scope('Actor_lose'):
            Is_worse_than_demo = self.q_1 < self.q_demo
            Is_worse_than_demo = tf.cast(Is_worse_than_demo, tf.float32)
            worse_than_demo = tf.cast(tf.reduce_sum(Is_worse_than_demo),
                                      tf.int8)

            # for the action error I use a sum of squares; some use a mean (reduce_mean) instead. Either works.
            # my actions are already small numbers anyway.
            action_diffs = Is_worse_than_demo * tf.reduce_sum(
                self.come_from_demo *
                tf.square(self.action - self.action_memory),
                1,
                keepdims=True)

            L_BC = self.LAMBDA_BC * tf.reduce_sum(action_diffs)
            a_loss = -tf.reduce_mean(self.q_1) + L_BC

        # Setting optimizer for Actor and Critic
        update_ops = tf.get_collection(
            tf.GraphKeys.UPDATE_OPS)  # batch-norm parameter updates
        with tf.variable_scope('Critic_Optimizer'):
            if self.use_TD3:
                self.ctrain = tf.group(tf.train.AdamOptimizer(LR_C).minimize(
                    c_loss_1, var_list=self.ce1_params),
                                       tf.train.AdamOptimizer(LR_C).minimize(
                                           c_loss_2, var_list=self.ce2_params),
                                       name='ctrain')
            else:
                self.ctrain = tf.train.AdamOptimizer(LR_C).minimize(
                    c_loss_1, var_list=self.ce1_params)

        with tf.variable_scope('Actor_Optimizer'):
            with tf.control_dependencies(update_ops):
                self.atrain = tf.train.AdamOptimizer(LR_A).minimize(
                    a_loss, var_list=self.ae_params)

        self.sess.run(tf.global_variables_initializer())

        # initialize target networks with the evaluation network parameters
        init_a_t = [
            tf.assign(t, e) for t, e in zip(self.at_params, self.ae_params)
        ]
        init_c_t = [
            tf.assign(t, e) for t, e in zip(self.ct1_params, self.ce1_params)
        ]
        if self.use_TD3:
            init_c_t += [
                tf.assign(t, e)
                for t, e in zip(self.ct2_params, self.ce2_params)
            ]
        self.sess.run(init_a_t)
        self.sess.run(init_c_t)

        # save the model
        var_list = [
            var for var in tf.global_variables() if "moving" in var.name
        ]
        var_list += tf.trainable_variables()
        self.saver = tf.train.Saver(var_list=var_list, max_to_keep=1)
        self.writer = tf.summary.FileWriter(
            "logs/" + self.experiment_name + "/", self.sess.graph)
        self.a_summary = tf.summary.merge([
            tf.summary.scalar('a_loss', a_loss, family='actor'),
            tf.summary.scalar('L_BC', L_BC, family='actor'),
            tf.summary.scalar('worse_than_demo',
                              worse_than_demo,
                              family='actor')
        ])

        if self.use_TD3:
            self.c_summary = tf.summary.merge([
                tf.summary.scalar('c_loss_1', c_loss_1, family='critic'),
                tf.summary.scalar('c_loss_2', c_loss_2, family='critic')
            ])
        else:
            self.c_summary = tf.summary.merge(
                [tf.summary.scalar('c_loss_1', c_loss_1, family='critic')])

        self.episode_result = tf.placeholder(tf.int8, name='episode_result')
        self.episode_summary = tf.summary.merge([
            tf.summary.scalar('Episode_Result( success or not )',
                              self.episode_result,
                              family='Result')
        ])

    def pi(self, obs):
        obs = obs.astype(dtype=np.float32)
        return self.sess.run(self.action,
                             {self.observe_Input: obs[np.newaxis, :]})[0]

    def Save(self):
        # save only the weights, not the computation graph.
        self.saver.save(self.sess,
                        save_path="model/" + self.experiment_name +
                        "/model.ckpt",
                        write_meta_graph=False)

    def load(self):
        self.saver.restore(self.sess,
                           save_path="model/" + self.experiment_name +
                           "/model.ckpt")

    def save_episoed_result(self, result, episoed):
        s = self.sess.run(self.episode_summary,
                          feed_dict={self.episode_result: result})
        self.writer.add_summary(s, episoed)

    def learn(self):
        if self.is_prioritiy:
            batch, n_step_batch, percentage = self.memory.sample_rollout(
                batch_size=self.batch_size,
                nsteps=self.n_step_return,
                beta=self.beta,
                gamma=GAMMA)
            self.demo_percent.append(float(percentage))
        else:
            batch = self.memory.sample(batch_size=self.batch_size)

        one_step_target_q = self.sess.run(
            self.q_target,
            feed_dict={
                self.observe_Input_: batch['f_s1'],  # low dim input
                self.R: batch['rewards'],
                self.terminals1: batch['terminals1'],
                self.f_s_: batch['f_s1']
            })

        if self.use_TD3:
            opt = [
                self.td_error_1, self.td_error_2, self.ctrain, self.c_summary,
                self.q_1
            ]
        else:
            opt = [self.td_error_1, self.ctrain, self.c_summary, self.q_1]

        if self.is_prioritiy and self.use_n_step:
            n_step_target_q = self.sess.run(self.n_step_target_q,
                                            feed_dict={
                                                self.terminals1:
                                                n_step_batch["terminals1"],
                                                self.n_step_steps:
                                                n_step_batch["step_reached"],
                                                self.R:
                                                n_step_batch['rewards'],
                                                self.observe_Input_:
                                                n_step_batch['f_s1'],
                                                self.f_s_:
                                                n_step_batch['f_s1']
                                            })

            res = self.sess.run(opt,
                                feed_dict={
                                    self.q_target: one_step_target_q,
                                    self.n_step_target_q: n_step_target_q,
                                    self.f_s: batch['f_s0'],
                                    self.action: batch['actions'],
                                    self.ISWeights: batch['weights']
                                })
        else:
            res = self.sess.run(opt,
                                feed_dict={
                                    self.q_target: one_step_target_q,
                                    self.f_s: batch['f_s0'],
                                    self.action: batch['actions'],
                                    self.ISWeights: batch['weights']
                                })

        if self.use_TD3:
            td_error_1, td_error_2, _, c_s, q_demo = res
            td_error = (td_error_1 + td_error_2) / 2.0
        else:
            td_error, _, c_s, q_demo = res

        # actor update
        if self.policy_delay_iterate % self.policy_delay == 0:
            _, a_s = self.sess.run(
                [self.atrain, self.a_summary], {
                    self.observe_Input: batch['f_s0'],
                    self.q_demo: q_demo,
                    self.f_s: batch['f_s0'],
                    self.come_from_demo: batch['demos'],
                    self.action_memory: batch['actions']
                })
            self.sess.run(self.soft_replace_a)
            self.writer.add_summary(a_s)

        if self.is_prioritiy:
            self.memory.update_priorities(batch['idxes'], td_errors=td_error)

        self.sess.run(self.soft_replace_c)
        self.writer.add_summary(c_s)
        self.policy_delay_iterate += 1

    def store_transition(self,
                         obs0,
                         action,
                         reward,
                         obs1,
                         full_state0,
                         full_state1,
                         terminal1,
                         demo=False):
        obs0 = obs0.astype(np.float32)
        obs1 = obs1.astype(np.float32)
        full_state0 = full_state0.astype(np.float32)
        full_state1 = full_state1.astype(np.float32)
        if demo:
            self.memory.append_demo(obs0=obs0,
                                    f_s0=full_state0,
                                    action=action,
                                    reward=reward,
                                    obs1=obs1,
                                    f_s1=full_state1,
                                    terminal1=terminal1)
        else:
            self.memory.append(obs0=obs0,
                               f_s0=full_state0,
                               action=action,
                               reward=reward,
                               obs1=obs1,
                               f_s1=full_state1,
                               terminal1=terminal1)

        # incrementally update the running mean/std of the observations
        # self.obs_rms.update(np.array([obs0]))
        # self.obs_rms.update(np.array([obs1]))
        self.state_rms.update(np.array([full_state0]))
        self.state_rms.update(np.array([full_state1]))

        self.pointer += 1

    def build_actor(self, observe_input, scope, trainable, is_training=True):
        bn_a = partial(bn, trainable=trainable, training=is_training)
        fc_a = partial(tf.layers.dense, activation=None, trainable=trainable)
        conv2_a = partial(conv2_, trainable=trainable)
        relu = partial(tf.nn.relu)
        with tf.variable_scope(scope):
            # conv -> BN -> relu
            # net = relu(bn_a(conv2_a( observe_input, 32 )))
            # net = relu(bn_a(conv2_a( net, 32 )))
            # net = relu(bn_a(conv2_a( net, 64 )))
            # net = relu(bn_a(conv2_a( net, 64 )))
            # net = relu(bn_a(conv2_a( net, 128 )))
            # net = relu(bn_a(conv2_a( net, 128 )))
            #
            # net = tf.layers.flatten(net)
            net = observe_input
            net = relu(bn_a(fc_a(net, 128)))
            net = relu(bn_a(fc_a(net, 128)))
            action_output = fc_a(
                net,
                4,
                activation=tf.nn.tanh,
                kernel_initializer=tf.initializers.random_uniform(
                    minval=-0.0003, maxval=0.0003))
            # output shape: (1, 4)
            action_output = action_output * self.act_limit
            # dx a[0] (-0.05,0.05)
            # dy a[1] (-0.05,0.05)
            # dz a[2] (-0.05,0.05)
            # da a[3] (-pi/2,pi/2)

            return action_output

    def build_critic(self, f_s, a, scope, trainable, is_training=True):
        bn_a = partial(bn, trainable=trainable, training=is_training)
        relu = partial(tf.nn.relu)
        fc_c = partial(tf.layers.dense, activation=None, trainable=trainable)
        with tf.variable_scope(scope):

            net = tf.concat([f_s, a], axis=1)
            net = relu(bn_a(fc_c(net, 128)))
            net = relu(bn_a(fc_c(net, 128)))

            q = fc_c(net,
                     1,
                     kernel_initializer=tf.initializers.random_uniform(
                         minval=-0.0003, maxval=0.0003))
            # Q(s,a): outputs a [None, 1] tensor
            return q
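
For illustration only (not code from the class above): a self-contained numpy sketch of the clipped double-Q one-step target built in the Critic_Lose scope, y = r + (1 - done) * GAMMA * min(Q1', Q2'), clipped before the TD error is taken. GAMMA = 0.99 is assumed here, and the (-250, 5) range mirrors the Q_value_range default above.

import numpy as np

def td3_one_step_target(r, done, q1_next, q2_next, gamma=0.99, q_range=(-250.0, 5.0)):
    min_q_next = np.minimum(q1_next, q2_next)      # clipped double-Q: take the smaller target critic
    y = r + (1.0 - done) * gamma * min_q_next      # one-step TD target
    return np.clip(y, q_range[0], q_range[1])      # keep the target inside the allowed Q range

r = np.array([[1.0], [0.0]])
done = np.array([[0.0], [1.0]])
q1_next = np.array([[-3.2], [4.0]])
q2_next = np.array([[-2.8], [6.0]])
print(td3_one_step_target(r, done, q1_next, q2_next))
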
Exemplo n.º 11
0
class DQN(object):
    def __init__(self):
        agent_args = Singleton_arger()['agent']
        self.critic_lr = agent_args['critic_lr']
        self.lr_decay = agent_args['lr_decay']
        self.l2_critic = agent_args['l2_critic']
        self.batch_size = agent_args['batch_size']
        self.discount = agent_args['discount']
        self.tau = agent_args['tau']
        self.with_cuda = agent_args['with_cuda']
        self.buffer_size = int(agent_args['buffer_size'])
        self.num_update_time = 10

    def setup(self, obs_shape, nb_action):
        self.lr_coef = 1
        self.epsilon = 1
        self.nb_action = nb_action
        model_args = Singleton_arger()['model']

        qnet = QNet(obs_shape, nb_action)

        self.qnet = copy.deepcopy(qnet)
        self.target_qnet = copy.deepcopy(qnet)

        self.memory = Memory(self.buffer_size, nb_action, self.with_cuda)

        if self.with_cuda:
            self.qnet.cuda()
            self.target_qnet.cuda()

        self.qnet_optim = Adam(self.qnet.parameters(), lr=self.critic_lr)

    def reset_noise(self):
        pass

    def before_epoch(self):
        pass

    def before_cycle(self):
        pass

    def before_iter(self):
        self.epsilon = max((self.epsilon - (1 - 0.01) / 250000), 0.01)

    def store_transition(self, s_t, a_t, r_t, s_t1, done_t):
        #s_t = torch.tensor(s_t,dtype = torch.float32,requires_grad = False)
        self.memory.append(s_t, a_t, r_t, s_t1, done_t)

    def update_target(self):
        for target_param, param in zip(self.target_qnet.parameters(),
                                       self.qnet.parameters()):
            target_param.data.copy_(param.data)

    def update(self):
        batch = self.memory.sample(self.batch_size)
        self.qnet_optim.zero_grad()

        q_eval = self.qnet(batch['obs0']).gather(1, batch['actions'])
        with torch.no_grad():
            _, a_next = self.qnet(batch['obs1']).max(1)
            q_next = self.target_qnet(batch['obs1']).gather(
                1, a_next.unsqueeze(1))
            q_target = batch['rewards'] + self.discount * (
                1 - batch['terminals1']) * q_next
        value_loss = nn.functional.mse_loss(q_eval, q_target)

        value_loss.backward()
        self.qnet_optim.step()
        return value_loss.item()

    def calc_last_error(self):
        # Sample batch
        batch = self.memory.sample_last(self.batch_size)
        #tensor_obs0 = batch['obs0']
        #tensor_obs1 = batch['obs1']
        # Prepare for the target q batch
        with torch.no_grad():
            q_eval = self.qnet(batch['obs0']).gather(1, batch['actions'])
            _, a_next = self.qnet(batch['obs1']).max(1)
            q_next = self.target_qnet(batch['obs1']).gather(
                1, a_next.unsqueeze(1))
            q_target = batch['rewards'] + self.discount * (
                1 - batch['terminals1']) * q_next
            value_loss = nn.functional.mse_loss(q_eval, q_target)
        return value_loss.item()

    def apply_lr_decay(self):
        if self.lr_decay > 0:
            self.lr_coef = self.lr_decay * self.lr_coef / (self.lr_coef +
                                                           self.lr_decay)
            for group in self.qnet_optim.param_groups:
                group['lr'] = self.critic_lr * self.lr_coef

    def select_action(self, s_t, apply_noise):
        if apply_noise and np.random.rand() < self.epsilon:
            return np.random.randint(0, self.nb_action)
        s_t = torch.tensor(np.expand_dims(np.array(s_t), axis=0),
                           dtype=torch.float32,
                           requires_grad=False)
        if self.with_cuda:
            s_t = s_t.cuda()
        with torch.no_grad():
            q_value = self.qnet(s_t)
        action = np.argmax(q_value.cpu().numpy().squeeze(0))
        return action

    def load_weights(self, output):
        self.qnet = torch.load('{}/qnet.pkl'.format(output))

    def save_model(self, output):
        torch.save(self.qnet, '{}/qnet.pkl'.format(output))

    def get_qnet_buffer(self):
        qnet_buffer = io.BytesIO()
        torch.save(self.qnet, qnet_buffer)
        return qnet_buffer
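
For illustration only (a standalone sketch, not code from this repo): the Double-DQN target that update() and calc_last_error compute above, where the online network selects the next action and the target network evaluates it.

import torch

def double_dqn_target(rewards, terminals, q_online_next, q_target_next, discount=0.99):
    # rewards, terminals: [batch, 1]; q_*_next: [batch, num_actions]
    a_next = q_online_next.argmax(dim=1, keepdim=True)   # action selection by the online net
    q_next = q_target_next.gather(1, a_next)             # action evaluation by the target net
    return rewards + discount * (1.0 - terminals) * q_next

q_online_next = torch.tensor([[1.0, 2.0], [0.5, 0.1]])
q_target_next = torch.tensor([[1.5, 1.0], [0.2, 0.3]])
rewards = torch.tensor([[1.0], [0.0]])
terminals = torch.tensor([[0.0], [1.0]])
print(double_dqn_target(rewards, terminals, q_online_next, q_target_next))
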
Exemplo n.º 12
0
class DDPG:
    def __init__(self, env, args):
        ob_space = env.observation_space
        goal_dim = env.goal_dim
        ob_dim = ob_space.shape[0]
        self.ob_dim = ob_dim
        self.ac_dim = ac_dim = 7
        self.goal_dim = goal_dim
        self.num_iters = args.num_iters
        self.random_prob = args.random_prob
        self.tau = args.tau
        self.reward_scale = args.reward_scale
        self.gamma = args.gamma

        self.log_interval = args.log_interval
        self.save_interval = args.save_interval
        self.rollout_steps = args.rollout_steps
        self.env = env
        self.batch_size = args.batch_size
        self.train_steps = args.train_steps
        self.closest_dist = np.inf
        self.warmup_iter = args.warmup_iter
        self.max_grad_norm = args.max_grad_norm
        self.use_her = args.her
        self.k_future = args.k_future
        self.model_dir = os.path.join(args.save_dir, 'model')
        self.pretrain_dir = args.pretrain_dir
        os.makedirs(self.model_dir, exist_ok=True)
        self.global_step = 0
        self.actor = Actor(ob_dim=ob_dim,
                           act_dim=ac_dim,
                           hid1_dim=args.hid1_dim,
                           hid2_dim=args.hid2_dim,
                           hid3_dim=args.hid3_dim,
                           init_method=args.init_method)
        self.critic = Critic(ob_dim=ob_dim,
                             act_dim=ac_dim,
                             hid1_dim=args.hid1_dim,
                             hid2_dim=args.hid2_dim,
                             hid3_dim=args.hid3_dim,
                             init_method=args.init_method)
        if args.resume or args.test or args.pretrain_dir is not None:
            self.load_model(args.resume_step, pretrain_dir=args.pretrain_dir)
        if not args.test:
            self.actor_target = Actor(ob_dim=ob_dim,
                                      act_dim=ac_dim,
                                      hid1_dim=args.hid1_dim,
                                      hid2_dim=args.hid2_dim,
                                      hid3_dim=args.hid3_dim,
                                      init_method=args.init_method)
            self.critic_target = Critic(ob_dim=ob_dim,
                                        act_dim=ac_dim,
                                        hid1_dim=args.hid1_dim,
                                        hid2_dim=args.hid2_dim,
                                        hid3_dim=args.hid3_dim,
                                        init_method=args.init_method)
            self.actor_optim = self.construct_optim(self.actor,
                                                    lr=args.actor_lr)
            cri_w_decay = args.critic_weight_decay
            self.critic_optim = self.construct_optim(self.critic,
                                                     lr=args.critic_lr,
                                                     weight_decay=cri_w_decay)
            self.hard_update(self.actor_target, self.actor)
            self.hard_update(self.critic_target, self.critic)

            self.actor_target.eval()
            self.critic_target.eval()
            if args.noise_type == 'ou_noise':
                mu = np.zeros(ac_dim)
                sigma = float(args.ou_noise_std) * np.ones(ac_dim)
                self.action_noise = OrnsteinUhlenbeckActionNoise(mu=mu,
                                                                 sigma=sigma)
            elif args.noise_type == 'uniform':
                low_limit = args.uniform_noise_low
                high_limit = args.uniform_noise_high
                dec_step = args.max_noise_dec_step
                self.action_noise = UniformNoise(low_limit=low_limit,
                                                 high_limit=high_limit,
                                                 dec_step=dec_step)

            elif args.noise_type == 'gaussian':
                mu = np.zeros(ac_dim)
                sigma = args.normal_noise_std * np.ones(ac_dim)
                self.action_noise = NormalActionNoise(mu=mu, sigma=sigma)

            self.memory = Memory(limit=int(args.memory_limit),
                                 action_shape=(int(ac_dim), ),
                                 observation_shape=(int(ob_dim), ))
            self.critic_loss = nn.MSELoss()
            self.ob_norm = args.ob_norm
            if self.ob_norm:
                self.obs_oms = OnlineMeanStd(shape=(1, ob_dim))
            else:
                self.obs_oms = None

        self.cuda()

    def test(self, render=False, record=True, slow_t=0):
        dist, succ_rate = self.rollout(render=render,
                                       record=record,
                                       slow_t=slow_t)
        print('Final step distance: ', dist)

    def train(self):
        self.net_mode(train=True)
        tfirststart = time.time()
        epoch_episode_rewards = deque(maxlen=1)
        epoch_episode_steps = deque(maxlen=1)
        total_rollout_steps = 0
        for epoch in range(self.global_step, self.num_iters):
            episode_reward = 0
            episode_step = 0
            self.action_noise.reset()
            obs = self.env.reset()
            obs = obs[0]
            epoch_actor_losses = []
            epoch_critic_losses = []
            if self.use_her:
                ep_experi = {
                    'obs': [],
                    'act': [],
                    'reward': [],
                    'new_obs': [],
                    'ach_goals': [],
                    'done': []
                }
            for t_rollout in range(self.rollout_steps):
                total_rollout_steps += 1
                ran = np.random.random(1)[0]
                if self.pretrain_dir is None and epoch < self.warmup_iter or \
                        ran < self.random_prob:
                    act = self.random_action().flatten()
                else:
                    act = self.policy(obs).flatten()
                new_obs, r, done, info = self.env.step(act)
                ach_goals = new_obs[1].copy()
                new_obs = new_obs[0].copy()
                episode_reward += r
                episode_step += 1
                self.memory.append(obs, act, r * self.reward_scale, new_obs,
                                   ach_goals, done)
                if self.use_her:
                    ep_experi['obs'].append(obs)
                    ep_experi['act'].append(act)
                    ep_experi['reward'].append(r * self.reward_scale)
                    ep_experi['new_obs'].append(new_obs)
                    ep_experi['ach_goals'].append(ach_goals)
                    ep_experi['done'].append(done)
                if self.ob_norm:
                    self.obs_oms.update(new_obs)
                obs = new_obs
            epoch_episode_rewards.append(episode_reward)
            epoch_episode_steps.append(episode_step)
            if self.use_her:
                for t in range(episode_step - self.k_future):
                    ob = ep_experi['obs'][t]
                    act = ep_experi['act'][t]
                    new_ob = ep_experi['new_obs'][t]
                    ach_goal = ep_experi['ach_goals'][t]
                    k_futures = np.random.choice(np.arange(
                        t + 1, episode_step),
                                                 self.k_future - 1,
                                                 replace=False)
                    k_futures = np.concatenate((np.array([t]), k_futures))
                    for future in k_futures:
                        new_goal = ep_experi['ach_goals'][future]
                        her_ob = np.concatenate(
                            (ob[:-self.goal_dim], new_goal), axis=0)
                        her_new_ob = np.concatenate(
                            (new_ob[:-self.goal_dim], new_goal), axis=0)
                        res = self.env.cal_reward(ach_goal.copy(), new_goal,
                                                  act)
                        her_reward, _, done = res
                        self.memory.append(her_ob, act,
                                           her_reward * self.reward_scale,
                                           her_new_ob, ach_goal.copy(), done)
            self.global_step += 1
            if epoch >= self.warmup_iter:
                for t_train in range(self.train_steps):
                    act_loss, cri_loss = self.train_net()
                    epoch_critic_losses.append(cri_loss)
                    epoch_actor_losses.append(act_loss)

            if epoch % self.log_interval == 0:
                tnow = time.time()
                stats = {}
                if self.ob_norm:
                    stats['ob_oms_mean'] = safemean(self.obs_oms.mean.numpy())
                    stats['ob_oms_std'] = safemean(self.obs_oms.std.numpy())
                stats['total_rollout_steps'] = total_rollout_steps
                stats['rollout/return'] = safemean(
                    [rew for rew in epoch_episode_rewards])
                stats['rollout/ep_steps'] = safemean(
                    [l for l in epoch_episode_steps])
                if epoch >= self.warmup_iter:
                    stats['actor_loss'] = np.mean(epoch_actor_losses)
                    stats['critic_loss'] = np.mean(epoch_critic_losses)
                stats['epoch'] = epoch
                stats['actor_lr'] = self.actor_optim.param_groups[0]['lr']
                stats['critic_lr'] = self.critic_optim.param_groups[0]['lr']
                stats['time_elapsed'] = tnow - tfirststart
                for name, value in stats.items():
                    logger.logkv(name, value)
                logger.dumpkvs()
            if (epoch == 0 or epoch >= self.warmup_iter) and \
                    self.save_interval and\
                    epoch % self.save_interval == 0 and \
                    logger.get_dir():
                mean_final_dist, succ_rate = self.rollout()
                logger.logkv('epoch', epoch)
                logger.logkv('test/total_rollout_steps', total_rollout_steps)
                logger.logkv('test/mean_final_dist', mean_final_dist)
                logger.logkv('test/succ_rate', succ_rate)

                tra_mean_dist, tra_succ_rate = self.rollout(train_test=True)
                logger.logkv('train/mean_final_dist', tra_mean_dist)
                logger.logkv('train/succ_rate', tra_succ_rate)

                # self.log_model_weights()
                logger.dumpkvs()
                if mean_final_dist < self.closest_dist:
                    self.closest_dist = mean_final_dist
                    is_best = True
                else:
                    is_best = False
                self.save_model(is_best=is_best, step=self.global_step)

    def train_net(self):
        batch_data = self.memory.sample(batch_size=self.batch_size)
        for key, value in batch_data.items():
            batch_data[key] = torch.from_numpy(value)
        obs0_t = batch_data['obs0']
        obs1_t = batch_data['obs1']
        obs0_t = self.normalize(obs0_t, self.obs_oms)
        obs1_t = self.normalize(obs1_t, self.obs_oms)
        obs0 = Variable(obs0_t).float().cuda()
        with torch.no_grad():
            vol_obs1 = Variable(obs1_t).float().cuda()

        rewards = Variable(batch_data['rewards']).float().cuda()
        actions = Variable(batch_data['actions']).float().cuda()
        terminals = Variable(batch_data['terminals1']).float().cuda()

        cri_q_val = self.critic(obs0, actions)
        with torch.no_grad():
            target_net_act = self.actor_target(vol_obs1)
            target_net_q_val = self.critic_target(vol_obs1, target_net_act)
            # target_net_q_val.volatile = False
            target_q_label = rewards
            target_q_label += self.gamma * target_net_q_val * (1 - terminals)
            target_q_label = target_q_label.detach()

        self.actor.zero_grad()
        self.critic.zero_grad()
        cri_loss = self.critic_loss(cri_q_val, target_q_label)
        cri_loss.backward()
        if self.max_grad_norm is not None:
            torch.nn.utils.clip_grad_norm_(self.critic.parameters(),
                                           self.max_grad_norm)
        self.critic_optim.step()

        self.critic.zero_grad()
        self.actor.zero_grad()
        net_act = self.actor(obs0)
        net_q_val = self.critic(obs0, net_act)
        act_loss = -net_q_val.mean()
        act_loss.backward()

        if self.max_grad_norm is not None:
            torch.nn.utils.clip_grad_norm_(self.actor.parameters(),
                                           self.max_grad_norm)
        self.actor_optim.step()

        self.soft_update(self.actor_target, self.actor, self.tau)
        self.soft_update(self.critic_target, self.critic, self.tau)
        return act_loss.cpu().data.numpy(), cri_loss.cpu().data.numpy()

    def normalize(self, x, stats):
        if stats is None:
            return x
        return (x - stats.mean) / stats.std

    def denormalize(self, x, stats):
        if stats is None:
            return x
        return x * stats.std + stats.mean

    def net_mode(self, train=True):
        if train:
            self.actor.train()
            self.critic.train()
        else:
            self.actor.eval()
            self.critic.eval()

    def load_model(self, step=None, pretrain_dir=None):
        model_dir = self.model_dir
        if pretrain_dir is not None:
            ckpt_file = os.path.join(self.pretrain_dir, 'model_best.pth')
        else:
            if step is None:
                ckpt_file = os.path.join(model_dir, 'model_best.pth')
            else:
                ckpt_file = os.path.join(model_dir,
                                         'ckpt_{:08d}.pth'.format(step))
        if not os.path.isfile(ckpt_file):
            raise ValueError("No checkpoint found at '{}'".format(ckpt_file))
        mutils.print_yellow('Loading checkpoint {}'.format(ckpt_file))
        checkpoint = torch.load(ckpt_file)
        if pretrain_dir is not None:
            actor_dict = self.actor.state_dict()
            critic_dict = self.critic.state_dict()
            actor_pretrained_dict = {
                k: v
                for k, v in checkpoint['actor_state_dict'].items()
                if k in actor_dict
            }
            critic_pretrained_dict = {
                k: v
                for k, v in checkpoint['critic_state_dict'].items()
                if k in critic_dict
            }
            actor_dict.update(actor_pretrained_dict)
            critic_dict.update(critic_pretrained_dict)
            self.actor.load_state_dict(actor_dict)
            self.critic.load_state_dict(critic_dict)
            self.global_step = 0
        else:
            self.actor.load_state_dict(checkpoint['actor_state_dict'])
            self.critic.load_state_dict(checkpoint['critic_state_dict'])
            self.global_step = checkpoint['global_step']
        if step is None:
            mutils.print_yellow('Checkpoint step: {}'
                                ''.format(checkpoint['ckpt_step']))

        self.warmup_iter += self.global_step
        mutils.print_yellow('Checkpoint loaded...')

    def save_model(self, is_best, step=None):
        if step is None:
            step = self.global_step
        ckpt_file = os.path.join(self.model_dir,
                                 'ckpt_{:08d}.pth'.format(step))
        data_to_save = {
            'ckpt_step': step,
            'global_step': self.global_step,
            'actor_state_dict': self.actor.state_dict(),
            'actor_optimizer': self.actor_optim.state_dict(),
            'critic_state_dict': self.critic.state_dict(),
            'critic_optimizer': self.critic_optim.state_dict()
        }

        mutils.print_yellow('Saving checkpoint: %s' % ckpt_file)
        torch.save(data_to_save, ckpt_file)
        if is_best:
            torch.save(data_to_save,
                       os.path.join(self.model_dir, 'model_best.pth'))

    def rollout(self, train_test=False, render=False, record=False, slow_t=0):
        test_conditions = self.env.train_test_conditions \
            if train_test else self.env.test_conditions
        done_num = 0
        final_dist = []
        episode_length = []
        for idx in range(test_conditions):
            if train_test:
                obs = self.env.train_test_reset(cond=idx)
            else:
                obs = self.env.test_reset(cond=idx)
            for t_rollout in range(self.rollout_steps):
                obs = obs[0].copy()
                act = self.policy(obs, stochastic=False).flatten()
                obs, r, done, info = self.env.step(act)
                if render:
                    self.env.render()
                    if slow_t > 0:
                        time.sleep(slow_t)
                if done:
                    done_num += 1
                    break
            if record:
                print('dist: ', info['dist'])
            final_dist.append(info['dist'])
            episode_length.append(t_rollout)
        final_dist = np.array(final_dist)
        mean_final_dist = np.mean(final_dist)
        succ_rate = done_num / float(test_conditions)
        if record:
            with open('./test_data.json', 'w') as f:
                json.dump(final_dist.tolist(), f)

            print('\nDist statistics:')
            print("Minimum: {0:9.4f} Maximum: {1:9.4f}"
                  "".format(np.min(final_dist), np.max(final_dist)))
            print("Mean: {0:9.4f}".format(mean_final_dist))
            print("Standard Deviation: {0:9.4f}".format(np.std(final_dist)))
            print("Median: {0:9.4f}".format(np.median(final_dist)))
            print("First quartile: {0:9.4f}"
                  "".format(np.percentile(final_dist, 25)))
            print("Third quartile: {0:9.4f}"
                  "".format(np.percentile(final_dist, 75)))
            print('Success rate:', succ_rate)
        if render:
            while True:
                self.env.render()
        return mean_final_dist, succ_rate

    def log_model_weights(self):
        for name, param in self.actor.named_parameters():
            logger.logkv('actor/' + name, param.clone().cpu().data.numpy())
        for name, param in self.actor_target.named_parameters():
            logger.logkv('actor_target/' + name,
                         param.clone().cpu().data.numpy())
        for name, param in self.critic.named_parameters():
            logger.logkv('critic/' + name, param.clone().cpu().data.numpy())
        for name, param in self.critic_target.named_parameters():
            logger.logkv('critic_target/' + name,
                         param.clone().cpu().data.numpy())

    def random_action(self):
        act = np.random.uniform(-1., 1., self.ac_dim)
        return act

    def policy(self, obs, stochastic=True):
        self.actor.eval()
        ob = Variable(torch.from_numpy(obs)).float().cuda().view(1, -1)
        act = self.actor(ob)
        act = act.cpu().data.numpy()
        if stochastic:
            act = self.action_noise(act)
        self.actor.train()
        return act

    def cuda(self):
        self.critic.cuda()
        self.actor.cuda()
        if hasattr(self, 'critic_target'):
            self.critic_target.cuda()
            self.actor_target.cuda()
            self.critic_loss.cuda()

    def construct_optim(self, net, lr, weight_decay=None):
        if weight_decay is None:
            weight_decay = 0
        params = mutils.add_weight_decay([net], weight_decay=weight_decay)
        optimizer = optim.Adam(params, lr=lr, weight_decay=weight_decay)
        return optimizer

    def soft_update(self, target, source, tau):
        for target_param, param in zip(target.parameters(),
                                       source.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - tau) +
                                    param.data * tau)

    def hard_update(self, target, source):
        for target_param, param in zip(target.parameters(),
                                       source.parameters()):
            target_param.data.copy_(param.data)
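The normalize/denormalize helpers above assume that obs_oms exposes running mean and std statistics, but no such tracker is shown in this example. The following is only a minimal sketch under that assumption; the class name and update rule are standard choices, not taken from the original code.

import numpy as np

class RunningMeanStd:
    # Hypothetical online mean/std tracker matching the stats.mean / stats.std usage above.
    def __init__(self, shape, eps=1e-4):
        self.mean = np.zeros(shape, dtype=np.float64)
        self.var = np.ones(shape, dtype=np.float64)
        self.count = eps

    @property
    def std(self):
        # Small constant keeps normalize() from dividing by zero.
        return np.sqrt(self.var) + 1e-8

    def update(self, x):
        # Parallel-variance (Chan et al.) update over a batch of observations.
        batch_mean, batch_var, batch_count = x.mean(axis=0), x.var(axis=0), x.shape[0]
        delta = batch_mean - self.mean
        total = self.count + batch_count
        new_mean = self.mean + delta * batch_count / total
        m2 = self.var * self.count + batch_var * batch_count \
            + delta ** 2 * self.count * batch_count / total
        self.mean, self.var, self.count = new_mean, m2 / total, total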
        episodeReward = reward
        shortMemory.push(state, action, mask, nextStateNumpy, reward)

        if done:
            break
        else:
            state = nextState.to(device)

        if len(memory) > batchSize:
            for _ in range(updatesPerStep):
                transition = memory.sample(batchSize)
                batch = Transition(*zip(*transition))
                valueLoss = agent.updateParameters(batch, device)
                valueLossEp += valueLoss

    memory.append(shortMemory)
    rewards.append(episodeReward)

    if episode % checkEvery == 0:
        testRewards = []
        for _ in range(numberOfTests):
            state = env.reset()
            startingPositionPuck = state["achieved_goal"]
            orginalDistance = np.linalg.norm(startingPositionPuck -
                                             desiredGoal)
            while True:
                state = stateToTensor(state, desiredGoal).to(device=device)
                #env.render()
                action = agent.selectAction(state, useTarget=True)
                action = action.cpu().numpy()
                nextState, reward, done, _ = env.step(
Exemplo n.º 14
0
class DQNAgent:
    def __init__(self, sess, env, window_size, input_shape, gamma, batch_size,
                 update_freq, is_duel, is_double, is_per, is_distributional,
                 num_step, is_noisy, learning_rate, train_step):

        self.sess = sess
        self.env = env
        self.per = is_per
        self.noisy = is_noisy
        self.dist = is_distributional
        self.duel = is_duel
        self.double = is_double
        self.eps_start = 1.0
        self.eps_end = 0.01
        self.eps_step = 500000
        self.beta_start = 0.4
        self.batch_size = batch_size
        self.gamma = gamma
        self.n_steps = num_step
        self.update_freq = update_freq
        self.learning_rate = learning_rate
        self.mem_size = 1000000
        self.num_actions = env.num_action.n
        self.train_step = train_step
        self.input_shape = input_shape
        self.window_size = window_size
        self.history = None  #np.zeros(shape=(1, self.input_shape[0], self.input_shape[1], self.window_size), dtype=np.uint8)
        self.state = None
        self.update_network = False
        if self.dist:
            self.num_atoms = 51
        else:
            self.num_atoms = 1

        #self, sess, window_size, input_shape, name='dqn',double=True, duel=False, dist=False, noisy=False, trainable=True
        self.predict_network = DQN(self.sess,
                                   window_size,
                                   input_shape,
                                   self.num_actions,
                                   self.num_atoms,
                                   name='pred_net',
                                   double=self.double,
                                   duel=self.duel,
                                   dist=self.dist,
                                   noisy=self.noisy,
                                   trainable=True)
        self.target_network = DQN(self.sess,
                                  window_size,
                                  input_shape,
                                  self.num_actions,
                                  self.num_atoms,
                                  name='target_net',
                                  double=self.double,
                                  duel=self.duel,
                                  dist=self.dist,
                                  noisy=self.noisy,
                                  trainable=True)
        self.target_network.create_copy_op(self.predict_network)

        if self.per == 1:
            self.memory = Memory(self.mem_size, self.n_steps, self.gamma)
        else:
            self.memory = deque()

        with tf.variable_scope('optimizer'):
            self.targets = tf.placeholder('float32', [None], name='target_q')
            self.actions = tf.placeholder('int64', [None], name='action')
            actions_onehot = tf.one_hot(self.actions,
                                        self.num_actions,
                                        1.0,
                                        0.0,
                                        name='action_onehot')
            pred_q = tf.reduce_sum(self.predict_network.outputs *
                                   actions_onehot,
                                   reduction_indices=1,
                                   name='q_acted')

            self.importance_weights = tf.placeholder('float32', [None],
                                                     name='importance_weights')

            if self.per:
                # use importance sampling
                self.delta = tf.square(self.targets - pred_q,
                                       name='squared_error')
            else:
                # use huber loss
                td_error = self.targets - pred_q
                self.delta = tf.where(tf.abs(td_error) < 1.0,
                                      0.5 * tf.square(td_error),
                                      tf.abs(td_error) - 0.5,
                                      name='clipped_error')

            self.loss = tf.reduce_mean(tf.multiply(self.importance_weights,
                                                   self.delta),
                                       name='loss')

            optimizer = tf.train.AdamOptimizer(self.learning_rate,
                                               epsilon=1.5e-4)
            self.optim = optimizer.minimize(self.loss,
                                            global_step=self.train_step)

    def reset(self):
        state = self.env.reset()
        self.history = np.stack((state, state, state, state), axis=2)
        self.history = np.reshape(
            [self.history],
            (self.input_shape[0], self.input_shape[1], self.window_size))

    def train(self, episode):

        self.cnt = self.sess.run(self.train_step)

        if self.per == 1:
            beta = min(
                1.0, self.beta_start +
                (1 - self.beta_start) * float(self.cnt) / float(self.eps_step))
            samples, weights = self.memory.sample(self.batch_size, beta)
        else:
            # random.sample needs a sequence, so convert the deque to a list first
            samples = random.sample(list(self.memory), self.batch_size)
            weights = np.ones(self.batch_size)

        batch_s = []  # state
        batch_r = []  # reward
        batch_a = []  # action
        batch_n = []  # next state
        batch_t = []  # terminal flag

        if self.per:
            for i in range(len(samples)):
                batch_s.append(samples[i][1][0])
                batch_r.append(samples[i][1][1])
                batch_a.append(samples[i][1][2])
                batch_n.append(samples[i][1][3])
                batch_t.append(samples[i][1][4])
        else:
            for i in samples:
                batch_s.append(i[0])
                batch_r.append(i[1])
                batch_a.append(i[2])
                batch_n.append(i[3])
                batch_t.append(i[4])

        batch_s = np.array(batch_s)
        batch_r = np.array(batch_r)
        batch_a = np.array(batch_a)
        batch_n = np.array(batch_n)
        batch_t = np.array(batch_t)

        batch_n = np.float32(batch_n / 255.0)
        batch_s = np.float32(batch_s / 255.0)

        if self.double:
            pred_next_max_action = self.predict_network.calc_actions(batch_n)
            target_next_qmax = self.target_network.calc_outputs_with_idx(
                batch_n, [[idx, pred_a]
                          for idx, pred_a in enumerate(pred_next_max_action)])
            target_q = (1. - batch_t) * self.gamma * target_next_qmax + batch_r
            # print(batch_r)
        else:
            target_next_qmax = self.target_network.calc_max_outputs(batch_n)
            target_q = (1. - batch_t) * self.gamma * target_next_qmax + batch_r

        _, q_t, loss, step = self.sess.run(
            [
                self.optim, self.predict_network.outputs, self.loss,
                self.train_step
            ], {
                self.targets: target_q,
                self.actions: batch_a,
                self.predict_network.inputs: batch_s,
                self.importance_weights: weights
            })

        if self.per:
            for i in range(len(batch_a)):
                error = abs(target_q[i] - q_t[i][int(batch_a[i])])
                self.memory.update(samples[i][0], error)

        if step % self.update_freq == 0:
            print(episode, " episode, ", step,
                  "th steps update target network")
            self.update_network = True
            self.target_network.run_copy()

    def get_action(self, history, Training=True):

        if Training:
            self.cnt = self.sess.run(self.train_step)
            eps = max(self.eps_end,
                      self.eps_start - float(self.cnt) / float(self.eps_step))
            # print('epsilon : ', eps)
            if np.random.rand() < eps and not self.noisy:
                # exploration
                move = np.random.randint(0, self.num_actions)

                max_q_pred = None
            else:

                ob = np.float32(history / 255.0)
                ob = np.reshape(ob, (1, self.input_shape[0],
                                     self.input_shape[1], self.window_size))
                move = self.predict_network.calc_actions(ob)[0]
                max_q_pred = max(self.predict_network.calc_outputs(ob)[0])

        else:
            ob = np.float32(history / 255.0)
            ob = np.reshape(ob, (1, self.input_shape[0], self.input_shape[1],
                                 self.window_size))
            move = self.predict_network.calc_actions(ob)[0]
            max_q_pred = max(self.predict_network.calc_outputs(ob)[0])

        return move, max_q_pred

    def step(self, num_steps, training=True):

        cumulative_reward = 0
        terminal = 0
        last_history = self.history
        last_action = 0
        for _ in range(num_steps):

            action, q_value = self.get_action(self.history, Training=training)
            next_state, reward, terminal = self.env.step(action,
                                                         Training=training)
            #print("reward:", reward)
            if training:
                reward = np.clip(reward, -1., 1.)

            self.state = next_state
            cumulative_reward += reward
            last_action = action

            s1 = np.reshape(next_state,
                            (self.input_shape[0], self.input_shape[1], 1))
            next_history = np.append(self.history[:, :, 1:], s1, axis=2)

            self.history = next_history
            if terminal:
                break

        return last_history, last_action, cumulative_reward, self.history, q_value, terminal

    def evaluate(self, num_episode):

        rewards_list = []

        for _ in range(num_episode):

            cumulative_reward = 0

            self.reset()

            while True:

                action, q_value = self.get_action(self.history, Training=False)
                next_state, reward, terminal = self.env.step(action,
                                                             Training=False)
                cumulative_reward += reward

                s1 = np.reshape(next_state,
                                (self.input_shape[0], self.input_shape[1], 1))
                next_history = np.append(self.history[:, :, 1:], s1, axis=2)
                self.history = next_history

                if terminal:
                    break
            rewards_list.append(cumulative_reward)

        return np.mean(rewards_list), np.std(rewards_list)

    #experience:(old_state, reward, action, new_state, Done)
    def append(self, experience):

        if self.per == 1:
            old_state = experience[0]
            reward = experience[1]
            action = experience[2]
            new_state = experience[3]
            done = experience[4]

            if self.double:
                ob = np.float32(new_state / 255.0)

                observation = np.reshape(
                    ob, (1, self.input_shape[0], self.input_shape[1],
                         self.window_size))
                pred_next_max_action = self.predict_network.calc_actions(
                    observation)

                target_next_qmax = self.target_network.calc_outputs_with_idx(
                    observation,
                    [[idx, pred_a]
                     for idx, pred_a in enumerate(pred_next_max_action)])
                target_q = (1. - done) * self.gamma * target_next_qmax + float(
                    reward)

            else:
                ob = np.float32(new_state / 255.0)

                observation = np.reshape(
                    ob, (1, self.input_shape[0], self.input_shape[1],
                         self.window_size))
                target_next_qmax = self.target_network.calc_max_outputs(
                    observation)
                target_q = (1. - done) * self.gamma * target_next_qmax + float(
                    reward)

            ob_last = np.float32(old_state / 255.0)
            last_observation = np.reshape(
                ob_last, (1, self.input_shape[0], self.input_shape[1],
                          self.window_size))
            pred_q = self.predict_network.calc_outputs_with_idx(
                last_observation, [[0, action]])

            error = abs(target_q - pred_q)

            self.memory.add(error[0], experience)
        else:
            self.memory.append(experience)
            if len(self.memory) > self.mem_size:
                self.memory.popleft()
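As a side note, the double-DQN target built in train() above (select the next action with the online network, evaluate it with the target network) can be sanity-checked in isolation. A minimal NumPy sketch with made-up Q-values, independent of the class above:

import numpy as np

gamma = 0.99
rewards = np.array([1.0, 0.0])
terminals = np.array([0.0, 1.0])
q_online_next = np.array([[0.2, 0.8], [0.5, 0.1]])  # online network Q(s', .)
q_target_next = np.array([[0.3, 0.6], [0.4, 0.2]])  # target network Q(s', .)

greedy = q_online_next.argmax(axis=1)            # action selection: online network
bootstrap = q_target_next[np.arange(2), greedy]  # action evaluation: target network
target_q = rewards + (1.0 - terminals) * gamma * bootstrap
print(target_q)  # [1.594 0.   ]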
Exemplo n.º 15
0
class Agent:
    def __init__(self, sess, eps_schedule, lr_schedule):
        self.dqn_online = Agent._make_dqn('online')
        self.dqn_target = Agent._make_dqn('target')

        self.sess = sess
        self.eps_schedule = eps_schedule
        self.lr_schedule = lr_schedule
        self.memory = Memory(MEMORY_SIZE)

        self.step = 0

    def get_action(self, s):
        if random.random() < self.eps_schedule.get():
            return random.randint(0, ACTIONS_DIM - 1)
        qs = self.dqn_online.predict(np.array([s]), self.sess)
        a = np.argmax(qs)
        return a

    def on_reward(self, s, a, r, s_, done):
        self.memory.append([s, a, r, s_, done], MAX_WEIGHT)

        self.step += 1
        if self.step % STEPS_TO_TRAIN == 0:
            self._train()
        if self.step % STEPS_TO_COPY == 0:
            self._copy()

    def _train(self):
        n = min(self.memory.size, BATCH_SIZE)
        samples = self.memory.sample_n(n)

        ss, ss_, ws = [], [], []
        for ([s, a, r, s_, done], i, w) in samples:
            ss.append(s)
            ss_.append(s_)
            ws.append([w])

        ss, ss_ = np.array(ss), np.array(ss_)

        qs = self._predict_online(ss)
        qs_ = self._predict_online(ss_)
        ts_ = self._predict_target(ss_)

        ds = []
        for i, ([s, a, r, s_, done], _, _) in enumerate(samples):
            reward = r

            # No discount factor here: the target just adds the target net's value at the online net's greedy next action.
            if not done:
                reward += ts_[i][np.argmax(qs_[i])]

            delta = abs(reward - qs[i][a]) + 0.001
            ds.append(delta)

            qs[i][a] = reward

        for i, (_, j, _) in enumerate(samples):
            self.memory.set_delta(j, ds[i])

        self.dqn_online.train(ss, qs, ws, self.lr_schedule.get(), self.sess)

    def _copy(self):
        return self.dqn_online.copy_to(self.dqn_target, self.sess)

    def _predict_online(self, ss):
        return self.dqn_online.predict(ss, self.sess)

    def _predict_target(self, ss):
        return self.dqn_target.predict(ss, self.sess)

    @staticmethod
    def _make_dqn(name):
        return DQN(name=name,
                   states_dim=STATES_DIM,
                   actions_dim=ACTIONS_DIM,
                   hidden_layers=HIDDEN_LAYERS,
                   hidden_units=HIDDEN_UNITS)
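The Memory used above is not shown; sample_n apparently returns (transition, index, weight) triples and set_delta feeds the TD error back as a priority. For reference, a standard proportional-prioritization weighting that would be consistent with that interface (the helper name and the alpha/beta values are assumptions, not taken from this example):

import numpy as np

def priority_weights(deltas, alpha=0.6, beta=0.4):
    # P(i) ~ delta_i ** alpha; importance-sampling weight w_i = (N * P(i)) ** (-beta),
    # normalized so the largest weight equals 1.
    deltas = np.asarray(deltas, dtype=np.float64)
    probs = deltas ** alpha
    probs /= probs.sum()
    weights = (len(deltas) * probs) ** (-beta)
    return probs, weights / weights.max()

probs, ws = priority_weights([0.001, 0.5, 2.0])
# The largest-delta transition is sampled most often but gets the smallest weight.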
Exemplo n.º 16
0
class DDPG(object):
    def setup_placeholders(self):
        # placeholders
        # Prefixes and suffixes:
        # ob - observation
        # ac - action
        # _no - this tensor should have shape (batch size /n/, observation dim)
        # _na - this tensor should have shape (batch size /n/, action dim)
        # _n  - this tensor should have shape (batch size /n/)
        # Adding a prefix to placeholder names is good practice; it makes it easy to tell variables and placeholders apart later.
        self.sy_ob_no = tf.placeholder(tf.float32, shape=[None, self.ob_dim], name="ob")
        self.sy_ob_next = tf.placeholder(tf.float32, shape=[None, self.ob_dim], name="ob_next")
        self.terminal_next = tf.placeholder(tf.float32, shape=[None, 1], name="terminal_next")
        self.sy_rewards = tf.placeholder(tf.float32, shape=[None, 1], name="sy_rewards")
        self.sy_critic_targets = tf.placeholder(tf.float32, shape=[None, 1], name="sy_critic_targets")
        self.param_noise_stddev = tf.placeholder(tf.float32, shape=(), name='param_noise_stddev')
        # The actions placeholder has the dimensionality of the full action-probability vector, not just a single selected action.
        # tensorforce implements it as below, so it presumably also takes the probabilities as input:
        # x_actions = tf.reshape(tf.cast(x_actions, dtype=tf.float32), (-1, 1))
        # The current problem is that CartPole reports an action shape of () (an empty tuple) when it should really be two actions; this looks like a bug.
        self.sy_actions = tf.placeholder(tf.float32, shape=[None, self.ac_dim], name='actions')

    def setup_network(self):
        # Specifying reuse shares the network parameters under the same scope; self.actor_tf is a tensor of per-action selection probabilities.
        self.actor_tf = build_actor(self.sy_ob_no, self.ac_dim, scope_name='actor')
        # With the default axis=0 this would return [0, 0], i.e. reduce along dimension 0; since there is only one entry it would always return 0, 0.
        # tf.argmax returns a tensor of shape [1]; tf.squeeze reduces it to a scalar int, otherwise env.step's check fails.
        self.actor_choose_action = tf.squeeze(tf.argmax(self.actor_tf, axis=1))
        # The target actor takes the next observation as input.
        self.target_actor_tf = build_actor(self.sy_ob_next, self.ac_dim, scope_name='target_actor')

        # The input is the actions placeholder; we can either feed the action-selection probabilities (the raw actor output)
        # or feed the action index after taking argmax.
        self.critic_tf = build_critic(self.sy_ob_no, self.sy_actions, scope_name='critic')
        # Here the actor's action probability distribution is fed into the critic's second layer, so the actor and critic share part of their parameters.
        # critic_tf and critic_with_actor_tf use the same network; only the action input differs.
        self.critic_with_actor_tf = build_critic(self.sy_ob_no, self.actor_tf, scope_name='critic', reuse=True)
        # When computing next_q, the target actor's output is used as the action input.
        next_q = build_critic(self.sy_ob_next, self.target_actor_tf, scope_name='target_critic')
        # If terminal_next were a tf.int32 placeholder, this line would raise an error.
        self.target_q = self.sy_rewards + (1 - self.terminal_next) * self.gamma * next_q

        # setup var updates
        actor_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='actor')
        target_actor_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_actor')
        critic_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='critic')
        target_critic_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_critic')
        actor_init_updates, actor_soft_updates = get_target_updates(actor_vars, target_actor_vars, self.tau)
        critic_init_updates, critic_soft_updates = get_target_updates(critic_vars, target_critic_vars, self.tau)
        self.target_init_updates = [actor_init_updates, critic_init_updates]
        self.target_soft_updates = [actor_soft_updates, critic_soft_updates]

        # setup loss
        self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf)
        # Building AdamOptimizer.minimize roughly doubles the variables under actor_vars (optimizer slots), so the target updates must be set up before the losses.
        self.actor_update_op = tf.train.AdamOptimizer(self.actor_lr).minimize(self.actor_loss)
        self.critic_loss = tf.reduce_mean(tf.square(self.critic_tf-self.sy_critic_targets))
        self.critic_update_op = tf.train.AdamOptimizer(self.critic_lr).minimize(self.critic_loss)


    def __init__(self,
                 env=None,
                 discrete=True,
                 ob_shape=(),
                 ac_dim=0,
                 gamma=1.0,
                 actor_lr=1e-4,
                 critic_lr=1e-3,
                 logdir=None,
                 normalize_returns=True,
                 # network arguments
                 n_layers=1,
                 size=32,
                 gae_lambda=-1.0,
                 tau=0.001 #parameter update rate
                ):
        self.gamma = gamma
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.normalize_returns = normalize_returns
        self.n_layers = n_layers
        self.size = size
        self.gae_lambda = gae_lambda
        self.tau = tau

        # Configure output directory for logging
        logz.configure_output_dir(logdir)
        # Log experimental parameters
        # args = inspect.getfullargspec(train_DDPG)[0]
        # locals_ = locals()
        # params = {k: locals_[k] if k in locals_ else None for k in args}
        # logz.save_params(params)

        # Make the gym environment
        self.env = env
        # Is this env continuous, or discrete?
        self.discrete = discrete
        self.ac_dim = ac_dim
        self.ob_dim = ob_shape[0]
        # observation_shape in cartpole is (2,), a tuple
        self.memory = Memory(limit=int(1e6), action_shape=ac_dim, observation_shape=ob_shape)
        self.setup_placeholders()
        self.setup_network()

    def sample_action(self,obs,compute_Q=True):
        feed_dict = {self.sy_ob_no:[obs]}
        # In the baselines code this directly outputs the action probabilities and multiplies them by env.high when calling env.step, which is the continuous-action approach;
        # taking the argmax here is the discrete-action approach instead.
        # build_critic(self.sy_ob_no, self.actor_tf, scope_name='critic', reuse=True): the critic is fed the actor's output.
        if compute_Q:
            action, action_prob, q = self.sess.run([self.actor_choose_action, self.actor_tf, self.critic_with_actor_tf], feed_dict=feed_dict)
        else:
            action, action_prob = self.sess.run([self.actor_choose_action, self.actor_tf], feed_dict=feed_dict)
            q = None

        # Strip the extra dimension and clip to [-1, 1].
        action_prob = action_prob.flatten()
        action_prob = np.clip(action_prob, -1., 1.)
        return action, action_prob, q

    def soft_sync_target_actor(self):
        self.sess.run(self.target_soft_updates)

    def store_transition(self, obs0, action, reward, obs1, terminal1):
        self.memory.append(obs0, action, reward, obs1, terminal1)

    # Equivalent to baselines' ddpg.train: performs one update.
    def update_loss(self):
        batch = self.memory.sample(batch_size=self.batch_size)

        target_Q = self.sess.run(self.target_q, feed_dict={
            self.sy_ob_next: batch['obs1'],
            self.sy_rewards: batch['rewards'],
            self.terminal_next: batch['terminals1'].astype('float32'),
        })
        ops = [self.actor_loss, self.critic_loss, self.actor_update_op, self.critic_update_op]
        actor_loss, critic_loss, _, _ = self.sess.run(ops, feed_dict={
            self.sy_ob_no: batch['obs0'],
            self.sy_actions: batch['actions'],
            self.sy_critic_targets: target_Q,
        })

        return critic_loss, actor_loss

    # Full training procedure.
    def train(self,
              seed=0,
              n_iter=100,
              animate=False,
              min_timesteps_per_batch=1000,
              batch_epochs=1,
              batch_size = 32,
              max_path_length=None,
              ):
        self.batch_size = batch_size
        start = time.time()
        # Set random seeds
        tf.set_random_seed(seed)
        np.random.seed(seed)
        # Maximum length for episodes
        max_path_length = max_path_length
        tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1)

        sess = tf.Session(config=tf_config)
        self.sess = sess
        sess.__enter__()  # equivalent to `with sess:`
        tf.global_variables_initializer().run()  # pylint: disable=E1101
        sess.run(self.target_init_updates)
        # todo: use finalize to make sure no new node in graph
        #sess.graph.finalize() #make it readonly, speed up
        # ========================================================================================#
        # Training Loop
        # ========================================================================================#
        #max_action = self.env.action_space.high
        total_timesteps = 0

        for itr in range(n_iter):
            #print('start train itr=%d max_step=%d batch=%d'%(itr, max_path_length, min_timesteps_per_batch))
            # Collect paths until we have enough timesteps
            # A path ends when the episode terminates or max_path_length is exceeded.
            # After each path finishes it is appended to paths, and the total number of steps collected is checked against the batch requirement; once exceeded, collection stops and training starts,
            # so every training batch is built from complete paths.

            # The PG algorithm always samples actions from the current distribution; no explicit exploration is involved.
            # TODO: split observation collection and training into two processes so they do not have to wait on each other.
            timesteps_this_batch = 0
            paths = []
            while True:
                ob = self.env.reset()
                #obs, acs, ac_probs, rewards, ob_nexts, dones = [], [], [], [], [], []
                obs, acs, rewards, ob_nexts, dones = [], [], [], [], []
                animate_this_episode = (len(paths) == 0 and (itr % 10 == 0) and animate)
                steps = 0
                while True:
                    if animate_this_episode:
                        self.env.render()
                        time.sleep(0.05)
                    obs.append(ob)
                    # eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action)
                    # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    # baselines clips actions to [-1, 1] and then scales them; worth checking whether that is necessary here.
                    if self.discrete:
                        ac, ac_prob, q = self.sample_action(ob, False)
                        acs.append(ac)
                        ob_next, rew, done, _ = self.env.step(ac)
                    else:
                        _, ac_prob, q = self.sample_action(ob, False)
                        #ac_prob = tf.Print(ac_prob, [ac_prob, ac_prob.shape], 'sample action')
                        acs.append(ac_prob)
                        ob_next, rew, done, _ = self.env.step(ac_prob)
                    #ac_probs.append(ac_prob)

                    ob_nexts.append(ob_next)
                    dones.append(done)
                    rewards.append(rew)
                    self.store_transition(ob, ac_prob, rew, ob_next, done)
                    steps += 1
                    if done or steps > max_path_length:
                        break
                path = {"observation": np.array(obs),
                        "reward": np.array(rewards),
                        "action": np.array(acs),
                        "ob_next": np.array(ob_nexts),
                        "done": np.array(dones)}
                paths.append(path)
                timesteps_this_batch += pathlength(path)
                if timesteps_this_batch > min_timesteps_per_batch:
                    break
            total_timesteps += timesteps_this_batch

            # Build arrays for observation, action for the policy gradient update by concatenating
            # across paths
            ob_no = np.concatenate([path["observation"] for path in paths])
            ac_na = np.concatenate([path["action"] for path in paths])

            # todo train process
            # todo memory sample in paths
            epoch_actor_losses = []
            epoch_critic_losses = []
            for epoch in range(batch_epochs):
                cl, al = self.update_loss()
                epoch_critic_losses.append(cl)
                epoch_actor_losses.append(al)
                self.soft_sync_target_actor()
                # Log diagnostics
                returns = [path["reward"].sum() for path in paths]
                ep_lengths = [pathlength(path) for path in paths]
                #print('log iter %d'%itr)
                #logz.log_tabular("LossDelta", loss_1 - loss_2)
                logz.log_tabular("Time", time.time() - start)
                logz.log_tabular("Iteration", itr)
                logz.log_tabular("AverageReturn", np.mean(returns))
                logz.log_tabular("StdReturn", np.std(returns))
                logz.log_tabular("MaxReturn", np.max(returns))
                logz.log_tabular("MinReturn", np.min(returns))
                logz.log_tabular("EpLenMean", np.mean(ep_lengths))
                logz.log_tabular("EpLenStd", np.std(ep_lengths))
                logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
                logz.log_tabular("TimestepsSoFar", total_timesteps)
                logz.dump_tabular()
                logz.pickle_tf_vars()
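get_target_updates, used in setup_network above, is not included in this snippet. Judging from how it is called (it returns an init op and a soft-update op per network pair), it is most likely a Polyak-averaging helper along these lines; the body below is a plausible reconstruction, not the original code:

import tensorflow as tf

def get_target_updates(vars_, target_vars, tau):
    # Assumes the two variable lists are in matching order.
    init_updates, soft_updates = [], []
    for var, target_var in zip(vars_, target_vars):
        # Hard copy used once at initialization.
        init_updates.append(tf.assign(target_var, var))
        # Polyak averaging: theta_target <- (1 - tau) * theta_target + tau * theta
        soft_updates.append(tf.assign(target_var, (1. - tau) * target_var + tau * var))
    return tf.group(*init_updates), tf.group(*soft_updates)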