Example #1
    def __init__(self, gamma, tau, num_inputs, env, device, results_path=None):

        self.gamma = gamma
        self.tau = tau
        self.min_action, self.max_action = env.action_range()
        self.device = device
        self.num_actions = env.action_space()
        self.noise_stddev = 0.3

        self.results_path = results_path
        self.checkpoint_path = os.path.join(self.results_path, 'checkpoint/')
        os.makedirs(self.checkpoint_path, exist_ok=True)

        # Define the actor
        self.actor = Actor(num_inputs, self.num_actions).to(device)
        self.actor_target = Actor(num_inputs, self.num_actions).to(device)

        # Define the critic
        self.critic = Critic(num_inputs, self.num_actions).to(device)
        self.critic_target = Critic(num_inputs, self.num_actions).to(device)

        # Define the optimizers for both networks
        self.actor_optimizer = Adam(self.actor.parameters(), lr=1e-4)                        # optimizer for the actor network
        self.critic_optimizer = Adam(self.critic.parameters(), lr=1e-4, weight_decay=0.002)  # optimizer for the critic network

        self.hard_swap()

        self.ou_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(self.num_actions),
                                            sigma=float(self.noise_stddev) * np.ones(self.num_actions))
        self.ou_noise.reset()
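
Every snippet on this page constructs an OrnsteinUhlenbeckActionNoise object, but none of them show its definition. The following is a minimal sketch of that class, modeled on the widely copied OpenAI Baselines implementation; the exact constructor varies between projects (for instance, Example #10 also passes the env and Example #11 uses mean/size keywords).

import numpy as np

# Minimal sketch of the OU noise class assumed by these examples (Baselines-style);
# the real class in each project may differ in constructor signature and defaults.
class OrnsteinUhlenbeckActionNoise:
    def __init__(self, mu, sigma, theta=0.15, dt=1e-2, x0=None):
        self.mu = mu          # long-run mean, usually np.zeros(action_dim)
        self.sigma = sigma    # noise scale per action dimension
        self.theta = theta    # mean-reversion rate
        self.dt = dt          # integration step
        self.x0 = x0          # optional initial state
        self.reset()

    def __call__(self):
        # One Euler-Maruyama step of dx = theta * (mu - x) dt + sigma * sqrt(dt) * dW
        x = (self.x_prev
             + self.theta * (self.mu - self.x_prev) * self.dt
             + self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape))
        self.x_prev = x
        return x

    def reset(self):
        # Restart the process, typically at each episode boundary
        self.x_prev = self.x0 if self.x0 is not None else np.zeros_like(self.mu)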
Example #2
 def setup(self, nb_states, nb_actions):
     super(action_noise_DDPG, self).setup(nb_states, nb_actions)
     exploration_args = Singleton_arger()['exploration']
     self.noise_decay = exploration_args['noise_decay']
     self.noise_coef = 1
     self.rollout_actor = copy.deepcopy(self.actor)
     self.action_noise = OrnsteinUhlenbeckActionNoise(
         mu=np.zeros(nb_actions),
         sigma=float(exploration_args['stddev']) * np.ones(nb_actions))
     if self.with_cuda:
         for net in (self.rollout_actor, ):
             if net is not None:
                 net.cuda()
Example #3
    def __init__(self, state_size, action_size, action_bound_high,
                 action_bound_low, imitation_data_path):

        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.state_size = state_size
        self.action_bound_high = torch.Tensor([action_bound_high]).to(device)
        self.action_bound_low = torch.Tensor([action_bound_low]).to(device)
        self.action_size = action_size

        self.buffer = EfficientReplayMemory(Parameters.BUFFER_SIZE,
                                            self.state_size, self.action_size)
        self.imitation_buffer = EfficientReplayMemory(
            Parameters.IMITATION_BUFFER_SIZE, self.state_size,
            self.action_size)
        self.imitation_buffer.load_memory(imitation_data_path)
        self.imitation_lambda = Parameters.IMITATION_LAMBDA

        # Actor
        self.policy_function = Policy(self.state_size, self.action_size,
                                      self.action_bound_high)
        self.policy_function_target = Policy(self.state_size, self.action_size,
                                             self.action_bound_high)
        self.policy_function_noisy = Policy(self.state_size, self.action_size,
                                            self.action_bound_high)
        self.policy_function_optim = Adam(self.policy_function.parameters(),
                                          lr=Parameters.ACTOR_LEARNING_RATE)
        self.imitation_optimizer = Adam(self.policy_function.parameters(),
                                        lr=self.imitation_lambda)

        # critic 1 (q-value)
        self.q_function = QFunction(self.state_size, self.action_size)
        self.q_function_target = QFunction(self.state_size, self.action_size)
        self.q_function_optim = Adam(self.q_function.parameters(),
                                     lr=Parameters.CRITIC_LEARNING_RATE)

        # Noise parameters
        self.action_noise = OrnsteinUhlenbeckActionNoise(self.action_size)
        self.desired_action_std = Parameters.DESIRED_ACTION_STD
        self.current_noise_std = Parameters.INITIAL_NOISE_STD
        self.coefficient = Parameters.ADAPT_COEFFICIENT

        # hyperparameters
        self.gamma = Parameters.GAMMA
        self.tau = Parameters.TAU

        self.hard_update_network(self.policy_function_target,
                                 self.policy_function)
        self.hard_update_network(self.q_function_target, self.q_function)
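
Example #1 calls self.hard_swap() and Example #3 calls self.hard_update_network(...), but neither helper is shown. Below is a plausible sketch of the hard/soft target-update helpers used in PyTorch DDPG code of this kind; the names and exact behavior in those projects are assumptions.

import torch

# Sketch of the target-update helpers referenced but not defined above; actual
# implementations in the source projects may differ.
def hard_update(target, source):
    # Copy the online network's parameters into the target network verbatim.
    with torch.no_grad():
        for t_param, s_param in zip(target.parameters(), source.parameters()):
            t_param.copy_(s_param)

def soft_update(target, source, tau):
    # Polyak averaging after each training step:
    #   theta_target <- tau * theta_online + (1 - tau) * theta_target
    with torch.no_grad():
        for t_param, s_param in zip(target.parameters(), source.parameters()):
            t_param.mul_(1.0 - tau).add_(tau * s_param)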
Example #4
class action_noise_DDPG(DDPG):
    def __init__(self):
        super(action_noise_DDPG, self).__init__()

    def setup(self, nb_pos, nb_laser, nb_actions):
        super(action_noise_DDPG, self).setup(nb_pos, nb_laser, nb_actions)
        self.nb_pos = nb_pos
        self.nb_laser = nb_laser
        exploration_args = Singleton_arger()['exploration']
        self.noise_decay = exploration_args['noise_decay']
        self.noise_coef = 1
        self.rollout_actor = copy.deepcopy(self.actor)
        self.action_noise = OrnsteinUhlenbeckActionNoise(
            mu=np.zeros(nb_actions),
            sigma=float(exploration_args['stddev']) * np.ones(nb_actions))
        if self.with_cuda:
            for net in (self.rollout_actor, ):
                if net is not None:
                    net.cuda()

    def reset_noise(self):
        self.action_noise.reset()

    def before_epoch(self):
        self.apply_noise_decay()

    def apply_noise_decay(self):
        if self.noise_decay > 0:
            self.noise_coef = self.noise_decay * self.noise_coef / (
                self.noise_coef + self.noise_decay)

    def select_action(self, s_t, apply_noise):
        s_t = torch.tensor(np.vstack(s_t),
                           dtype=torch.float32,
                           requires_grad=False).cuda()
        s_t = s_t.split([self.nb_pos, self.nb_laser], dim=1)
        #s_t = torch.tensor(s_t,dtype = torch.float32,requires_grad = False)
        #if self.with_cuda:
        #    s_t = s_t.cuda()
        with torch.no_grad():
            action = self.actor(s_t).cpu().numpy()
        if apply_noise:
            action += max(self.noise_coef, 0) * self.action_noise()
        action = np.clip(action, -1., 1.)
        return action
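
For context, here is a hypothetical rollout loop showing how the methods of Example #4 (reset_noise, before_epoch, select_action) are typically driven; agent and env are stand-ins for an action_noise_DDPG instance and a Gym-style environment, not objects defined in these examples.

def run_episode(agent, env, max_steps=1000):
    # Hypothetical driver for Example #4's API; the actual training loop is not shown there.
    obs = env.reset()
    agent.reset_noise()                                      # restart the OU process per episode
    episode_reward = 0.0
    for _ in range(max_steps):
        action = agent.select_action(obs, apply_noise=True)  # OU exploration noise added here
        obs, reward, done, _ = env.step(action)
        episode_reward += reward
        if done:
            break
    return episode_reward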
Example #5
    def __init__(self, session, state_shape, action_shape, action_bound,
                 learning_rate, tau, batch_size):
        """ Initialize actor and target networks and update methods. """
        self.session = session
        self.state_shape = state_shape
        self.action_shape = action_shape
        self.action_bound = action_bound
        self.learning_rate = learning_rate
        self.tau = tau
        self.batch_size = batch_size

        # Initialize additive Ornstein-Uhlenbeck noise
        self.OU_noise = OrnsteinUhlenbeckActionNoise(
            mu=np.zeros(self.action_shape))

        # Initialize actor network
        self.phase = tf.placeholder(tf.bool, name='phase_act')
        self.inputs, self.out, self.scaled_out = \
            self.create_actor_network(self.phase)
        self.network_params = tf.trainable_variables()

        # Initialize target actor network
        self.target_inputs, self.target_out, self.target_scaled_out = \
            self.create_actor_network(self.phase, prefix='tar_')
        self.target_network_params = \
            tf.trainable_variables()[len(self.network_params):]

        # Define target update op
        self.update_target_network_params = \
            [self.target_network_params[i].assign(
            tf.multiply(self.network_params[i], self.tau) +
            tf.multiply(self.target_network_params[i], 1.0 - self.tau))
            for i in range(len(self.target_network_params))]

        # Define ops for getting necessary gradients
        self.action_gradient = \
            tf.placeholder(tf.float32, [None, self.action_shape])
        self.unnormalized_actor_gradients = tf.gradients(
            self.scaled_out, self.network_params, -self.action_gradient)
        self.actor_gradients = list(
            map(lambda x: tf.div(x, self.batch_size),
                self.unnormalized_actor_gradients))

        # Define optimization op
        # update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        # with tf.control_dependencies(update_ops):
        self.optimize = tf.train.AdamOptimizer(self.learning_rate).\
            apply_gradients(zip(self.actor_gradients, self.network_params))

        self.num_trainable_vars = \
            len(self.network_params) + len(self.target_network_params)
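
The actor class in Example #5 defines its ops but not the methods that run them. Below is a hedged sketch of the usual glue code for this style of TF1 DDPG actor; critic.action_gradients and the surrounding names are assumptions about the companion critic class, which is not shown.

# Hypothetical training-step glue for the TF1 actor above; `critic.action_gradients`
# is an assumed method of the (unshown) critic that returns dQ/da for a batch.
def ddpg_actor_step(sess, actor, critic, state_batch):
    actions = sess.run(actor.scaled_out,
                       feed_dict={actor.inputs: state_batch, actor.phase: True})
    grads = critic.action_gradients(state_batch, actions)    # gradient of Q w.r.t. actions
    sess.run(actor.optimize,
             feed_dict={actor.inputs: state_batch,
                        actor.action_gradient: grads[0],
                        actor.phase: True})
    sess.run(actor.update_target_network_params)              # Polyak update of the target actor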
Example #6
 def __init__(self, FLAGS):
     """
      This class builds the model that implements the deep deterministic
      policy gradient (DDPG) algorithm.
     
     :param FLAGS: TensorFlow flags which contain the values for hyperparameters
     
     """
     
     self.FLAGS=FLAGS
     
     self.env = gym.make('Pendulum-v0')
     self.state_size = len(self.env.observation_space.sample())
     self.num_episodes=1000
     self.batch_size=64
     
     self.exp_replay=ExperienceReplay(50000,1500, FLAGS)
     
     self.action_noise=OrnsteinUhlenbeckActionNoise(self.env,mu= 0.0, sigma=0.2, theta=.15, dt=1e-2, x0=None)
     
     self.actor_target=Actor(scope='target',target_network=None,env=self.env, flags=FLAGS)
     self.actor=Actor(scope='actor',target_network=self.actor_target,env=self.env, flags=FLAGS)
     
     self.critic_target=Critic(scope='target',target_network=None,env=self.env, flags=FLAGS)
     self.critic=Critic(scope='critic',target_network=self.critic_target,env=self.env, flags=FLAGS)
     
     init = tf.global_variables_initializer()
     self.session = tf.InteractiveSession()
     self.session.run(init)
     
     self.critic.set_session(self.session)
     self.actor.set_session(self.session)
     self.actor_target.set_session(self.session)
     self.critic_target.set_session(self.session)
     
     self.critic.init_target_network()
     self.actor.init_target_network()
Example #7
    def __init__(self, experiment, batch_size):
        self._dummy_env = gym.make(experiment)
        self._sess = tf.Session()

        self._sum_writer = tf.summary.FileWriter('logs/', self._sess.graph)

        # Hardcoded for now
        self._dim_state = 25
        self._dim_goal = 3
        self._dim_action = self._dummy_env.action_space.shape[0]
        self._dim_env = 1
        self._batch_size = batch_size

        # agent noise
        self._action_noise = OrnsteinUhlenbeckActionNoise(
            mu=np.zeros(self._dim_action))

        self._actor = Actor(self._sess, self._dim_state, self._dim_goal,
                            self._dim_action, self._dummy_env, TAU,
                            LEARNING_RATE, self._batch_size)

        self._critic = Critic(self._sess, self._dim_state, self._dim_goal,
                              self._dim_action, self._dim_env, self._dummy_env,
                              TAU, LEARNING_RATE,
                              self._actor.get_num_trainable_vars(),
                              self._sum_writer)

        self._saver = tf.train.Saver(max_to_keep=None)

        self._sess.run(tf.global_variables_initializer())

        self._actor.initialize_target_network()
        self._critic.initialize_target_network()

        # training monitoring
        self._success_rate = tf.Variable(0., name="success_rate")
        self._python_success_rate = tf.placeholder("float32", [])

        self._update_success_rate = self._success_rate.assign(
            self._python_success_rate)
        self._merged = tf.summary.scalar("successrate",
                                         self._update_success_rate)
Example #8
    max_timesteps = train_env.spec.timestep_limit

    # set noise type
    current_noise_type = args.noise_type.strip()
    nb_actions = train_env.action_space.shape[0]
    if 'normal' in current_noise_type:
        _, stddev = current_noise_type.split('_')
        action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                         sigma=float(stddev) *
                                         np.ones(nb_actions))
        action_noise.reset()
    elif 'ou' in current_noise_type:
        _, stddev = current_noise_type.split('_')
        action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions),
                                                    sigma=float(stddev) *
                                                    np.ones(nb_actions))
        action_noise.reset()
    else:
        raise RuntimeError(
            'unknown noise type "{}"'.format(current_noise_type))

    episode_rewards = []
    if 'Sparse' in train_env.spec.id:
        sparse = True
        episode_successes = []
    else:
        sparse = False

    state_start, paths, neigh = None, None, None
Example #9
def learn(
        env,
        seed=None,
        total_timesteps=None,
        nb_epochs=None,  # with default settings, perform 1M steps total
        nb_epoch_cycles=20,
        nb_rollout_steps=100,
        reward_scale=1.0,
        render=False,
        render_eval=False,
        noise_type='adaptive-param_0.2',
        normalize_returns=False,
        normalize_observations=True,
        critic_l2_reg=1e-2,
        actor_lr=1e-4,
        critic_lr=1e-3,
        popart=False,
        gamma=0.99,
        clip_norm=None,
        nb_train_steps=50,  # per epoch cycle and MPI worker
        nb_eval_steps=100,
        nb_save_epochs=None,
        batch_size=64,  # per MPI worker
        tau=0.01,
        action_range=(-250.0, 250.0),
        observation_range=(-5.0, 5.0),
        eval_env=None,
        load_path=None,
        save_dir=None,
        param_noise_adaption_interval=50,
        **network_kwargs):

    set_global_seeds(seed)

    if total_timesteps is not None:
        assert nb_epochs is None
        nb_epochs = int(total_timesteps) // (nb_epoch_cycles *
                                             nb_rollout_steps)
    else:
        nb_epochs = 500

    if MPI is not None:
        rank = MPI.COMM_WORLD.Get_rank()
    else:
        rank = 0

    memory = Memory(limit=int(1e6))

    network_spec = [{
        'layer_type': 'dense',
        'units': int(256),
        'activation': 'relu',
        'nodes_in': ['main'],
        'nodes_out': ['main']
    }, {
        'layer_type': 'dense',
        'units': int(128),
        'activation': 'relu',
        'nodes_in': ['main'],
        'nodes_out': ['main']
    }, {
        'layer_type': 'dense',
        'units': int(1),
        'activation': 'tanh',
        'nodes_in': ['main'],
        'nodes_out': ['main']
    }]

    vnetwork_spec = [{
        'layer_type': 'concat',
        'nodes_in': ['action_movement', 'observation_self'],
        'nodes_out': ['main']
    }, {
        'layer_type': 'dense',
        'units': int(256),
        'activation': 'relu',
        'nodes_in': ['main'],
        'nodes_out': ['main']
    }, {
        'layer_type': 'dense',
        'units': int(128),
        'activation': 'relu',
        'nodes_in': ['main'],
        'nodes_out': ['main']
    }, {
        'layer_type': 'dense',
        'units': int(1),
        'activation': '',
        'nodes_in': ['main'],
        'nodes_out': ['main']
    }]

    network = DdpgPolicy(scope="ddpg",
                         ob_space=env.observation_space,
                         ac_space=env.action_space,
                         network_spec=network_spec,
                         v_network_spec=vnetwork_spec,
                         stochastic=False,
                         reuse=False,
                         build_act=True,
                         trainable_vars=None,
                         not_trainable_vars=None,
                         gaussian_fixed_var=False,
                         weight_decay=0.0,
                         ema_beta=0.99999,
                         normalize_observations=normalize_observations,
                         normalize_returns=normalize_returns,
                         observation_range=observation_range)

    target_network = DdpgPolicy(scope="target",
                                ob_space=env.observation_space,
                                ac_space=env.action_space,
                                network_spec=network_spec,
                                v_network_spec=vnetwork_spec,
                                stochastic=False,
                                reuse=False,
                                build_act=True,
                                trainable_vars=None,
                                not_trainable_vars=None,
                                gaussian_fixed_var=False,
                                weight_decay=0.0,
                                ema_beta=0.99999,
                                normalize_observations=normalize_observations,
                                normalize_returns=normalize_returns,
                                observation_range=observation_range)

    action_noise = None
    param_noise = None
    if noise_type is not None:
        for current_noise_type in noise_type.split(','):
            current_noise_type = current_noise_type.strip()
            if current_noise_type == 'none':
                pass
            elif 'adaptive-param' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                param_noise = AdaptiveParamNoiseSpec(
                    initial_stddev=float(stddev),
                    desired_action_stddev=float(stddev))
            elif 'normal' in current_noise_type:
                action_noise = dict()
                for k, v in env.action_space.spaces.items():
                    act_size = v.spaces[0].shape[-1]
                    _, stddev = current_noise_type.split('_')
                    action_noise[k] = NormalActionNoise(mu=np.zeros(act_size),
                                                        sigma=float(stddev) *
                                                        np.ones(act_size))
            elif 'ou' in current_noise_type:
                action_noise = dict()
                for k, v in env.action_space.spaces.items():
                    act_size = v.spaces[0].shape[-1]
                    _, stddev = current_noise_type.split('_')
                    action_noise[k] = OrnsteinUhlenbeckActionNoise(
                        mu=np.zeros(act_size),
                        sigma=float(stddev) * np.ones(act_size))
            else:
                raise RuntimeError(
                    'unknown noise type "{}"'.format(current_noise_type))

    max_action = action_range[1]
    logger.info(
        'scaling actions by {} before executing in env'.format(max_action))

    agent = DDPG(network,
                 target_network,
                 memory,
                 env.observation_space,
                 env.action_space,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    sess = U.get_session()

    saver = functools.partial(save_variables, sess=sess)
    loader = functools.partial(load_variables, sess=sess)
    if load_path is not None:
        loader(load_path)

    # Prepare everything.
    agent.initialize(sess)
    sess.graph.finalize()

    agent.reset()
    obs = env.reset()
    if eval_env is not None:
        eval_obs = eval_env.reset()
    nenvs = env.num_envs
    n_agents = obs['observation_self'].shape[0]

    episode_reward = np.zeros((nenvs, n_agents), dtype=np.float32)  #vector
    episode_step = np.zeros(nenvs, dtype=int)  # vector
    episodes = 0  #scalar
    t = 0  # scalar

    epoch = 0

    start_time = time.time()

    epoch_episode_rewards = []
    epoch_episode_steps = []
    epoch_actions = []
    epoch_qs = []
    epoch_episodes = 0
    for epoch in range(nb_epochs):
        for cycle in range(nb_epoch_cycles):
            # Perform rollouts.
            if nenvs > 1:
                # if simulating multiple envs in parallel, impossible to reset agent at the end of the episode in each
                # of the environments, so resetting here instead
                agent.reset()
            for t_rollout in range(nb_rollout_steps):
                # Predict next action.
                action, q, _, _ = agent.step(obs,
                                             apply_noise=True,
                                             compute_Q=True)

                # Execute next action.
                if rank == 0 and render:
                    env.render()

                # max_action is of dimension A, whereas action is dimension (nenvs, A) - the multiplication gets broadcasted to the batch
                for k, v in action.items():
                    action[k] *= max_action

                nenvs_actions = []
                for i in range(nenvs):
                    nenv_action = {
                        'action_movement':
                        action['action_movement'][i * n_agents:(i + 1) *
                                                  n_agents]
                    }
                    nenvs_actions.append(nenv_action)
                new_obs, r, done, info = env.step(
                    nenvs_actions
                )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                # note these outputs are batched from vecenv

                t += 1
                if rank == 0 and render:
                    env.render()
                episode_reward += r
                episode_step += 1

                # Book-keeping.
                epoch_actions.append(action)
                epoch_qs.append(q)
                agent.store_transition(
                    obs, action, r, new_obs, done
                )  #the batched data will be unrolled in memory.py's append.

                obs = new_obs

                for d in range(len(done)):
                    if done[d]:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward[d])
                        episode_rewards_history.append(episode_reward[d])
                        epoch_episode_steps.append(episode_step[d])
                        episode_reward[d] = 0.
                        episode_step[d] = 0
                        epoch_episodes += 1
                        episodes += 1
                        if nenvs == 1:
                            agent.reset()

            # Train.
            epoch_actor_losses = []
            epoch_critic_losses = []
            epoch_adaptive_distances = []
            for t_train in range(nb_train_steps):
                # Adapt param noise, if necessary.
                if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                    distance = agent.adapt_param_noise()
                    epoch_adaptive_distances.append(distance)

                cl, al = agent.train()
                epoch_critic_losses.append(cl)
                epoch_actor_losses.append(al)
                agent.update_target_net()

            # Evaluate.
            eval_episode_rewards = []
            eval_qs = []
            if eval_env is not None:
                nenvs_eval = eval_obs.shape[0]
                eval_episode_reward = np.zeros(nenvs_eval, dtype=np.float32)
                for t_rollout in range(nb_eval_steps):
                    eval_action, eval_q, _, _ = agent.step(eval_obs,
                                                           apply_noise=False,
                                                           compute_Q=True)
                    eval_obs, eval_r, eval_done, eval_info = eval_env.step(
                        max_action * eval_action
                    )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    if render_eval:
                        eval_env.render()
                    eval_episode_reward += eval_r

                    eval_qs.append(eval_q)
                    for d in range(len(eval_done)):
                        if eval_done[d]:
                            eval_episode_rewards.append(eval_episode_reward[d])
                            eval_episode_rewards_history.append(
                                eval_episode_reward[d])
                            eval_episode_reward[d] = 0.0

        if MPI is not None:
            mpi_size = MPI.COMM_WORLD.Get_size()
        else:
            mpi_size = 1

        # Log stats.
        # XXX shouldn't call np.mean on variable length lists
        duration = time.time() - start_time
        stats = agent.get_stats()
        combined_stats = stats.copy()
        combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
        combined_stats['rollout/return_std'] = np.std(epoch_episode_rewards)
        combined_stats['rollout/return_history'] = np.mean(
            episode_rewards_history)
        combined_stats['rollout/return_history_std'] = np.std(
            episode_rewards_history)
        combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
        combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
        combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
        combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
        combined_stats['train/param_noise_distance'] = np.mean(
            epoch_adaptive_distances)
        combined_stats['total/duration'] = duration
        combined_stats['total/steps_per_second'] = float(t) / float(duration)
        combined_stats['total/episodes'] = episodes
        combined_stats['rollout/episodes'] = epoch_episodes
        # Evaluation statistics.
        if eval_env is not None:
            combined_stats['eval/return'] = eval_episode_rewards
            combined_stats['eval/return_history'] = np.mean(
                eval_episode_rewards_history)
            combined_stats['eval/Q'] = eval_qs
            combined_stats['eval/episodes'] = len(eval_episode_rewards)

        def as_scalar(x):
            if isinstance(x, np.ndarray):
                assert x.size == 1
                return x[0]
            elif np.isscalar(x):
                return x
            else:
                raise ValueError('expected scalar, got %s' % x)

        combined_stats_sums = np.array(
            [np.array(x).flatten()[0] for x in combined_stats.values()])
        if MPI is not None:
            combined_stats_sums = MPI.COMM_WORLD.allreduce(combined_stats_sums)

        combined_stats = {
            k: v / mpi_size
            for (k, v) in zip(combined_stats.keys(), combined_stats_sums)
        }

        # Total statistics.
        combined_stats['total/epochs'] = epoch + 1
        combined_stats['total/steps'] = t

        for key in sorted(combined_stats.keys()):
            logger.record_tabular(key, combined_stats[key])

        if rank == 0:
            logger.dump_tabular()
        logger.info('')
        logdir = logger.get_dir()
        if rank == 0 and logdir:
            if hasattr(env, 'get_state'):
                with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f:
                    pickle.dump(env.get_state(), f)
            if eval_env and hasattr(eval_env, 'get_state'):
                with open(os.path.join(logdir, 'eval_env_state.pkl'),
                          'wb') as f:
                    pickle.dump(eval_env.get_state(), f)

        if nb_save_epochs is not None and (epoch + 1) % nb_save_epochs == 0:
            if save_dir is None:
                checkdir = osp.join(logger.get_dir(), 'checkpoints')
            else:
                checkdir = osp.join(save_dir, 'checkpoints')
            os.makedirs(checkdir, exist_ok=True)
            savepath = osp.join(checkdir, '%.5i' % epoch)
            print('Saving to', savepath)
            saver(savepath)

    return agent
Example #10
class Model:
    
    def __init__(self, FLAGS):
        """
        This class builds the model that implements the deep deterministic
        policy gradient (DDPG) algorithm.
        
        :param FLAGS: TensorFlow flags which contain the values for hyperparameters
        
        """
        
        self.FLAGS=FLAGS
        
        self.env = gym.make('Pendulum-v0')
        self.state_size = len(self.env.observation_space.sample())
        self.num_episodes=1000
        self.batch_size=64
        
        self.exp_replay=ExperienceReplay(50000,1500, FLAGS)
        
        self.action_noise=OrnsteinUhlenbeckActionNoise(self.env,mu= 0.0, sigma=0.2, theta=.15, dt=1e-2, x0=None)
        
        self.actor_target=Actor(scope='target',target_network=None,env=self.env, flags=FLAGS)
        self.actor=Actor(scope='actor',target_network=self.actor_target,env=self.env, flags=FLAGS)
        
        self.critic_target=Critic(scope='target',target_network=None,env=self.env, flags=FLAGS)
        self.critic=Critic(scope='critic',target_network=self.critic_target,env=self.env, flags=FLAGS)
        
        init = tf.global_variables_initializer()
        self.session = tf.InteractiveSession()
        self.session.run(init)
        
        self.critic.set_session(self.session)
        self.actor.set_session(self.session)
        self.actor_target.set_session(self.session)
        self.critic_target.set_session(self.session)
        
        self.critic.init_target_network()
        self.actor.init_target_network()
        
    
    def train_networks(self):
        '''Training of the actor and critic networks '''
        
        if len(self.exp_replay.experience['state']) < self.exp_replay.min_experience:
            return
    
        # pick random experience tupels from the expererience replay
        idx = np.random.choice(len(self.exp_replay.experience['state']), size=self.FLAGS.batch_size, replace=False)
        
        state=np.array([self.exp_replay.experience['state'][i] for i in idx]).reshape(self.FLAGS.batch_size,self.state_size)
        action=np.array([self.exp_replay.experience['action'][i] for i in idx]).reshape(self.FLAGS.batch_size,1)
        reward=[self.exp_replay.experience['reward'][i] for i in idx]
        next_state=np.array([self.exp_replay.experience['next_state'][i] for i in idx]).reshape(self.FLAGS.batch_size,self.state_size)
        dones=[self.exp_replay.experience['done'][i] for i in idx]

        #Train critic network
        next_actions=self.actor_target.get_action(next_state)
        q_next=self.critic.target_network.calculate_Q(next_state, next_actions)
        targets=np.array([r+self.FLAGS.gamma*q if not done else r for r, q, done in zip(reward,q_next,dones)])
        self.critic.train(state, targets, action)
        
        #Train actor network
        current_actions=self.actor.get_action(state)        
        q_gradient=self.critic.compute_gradients(state, current_actions)
        self.actor.train(state, q_gradient)
        
        self.actor.update_target_parameter()
        self.critic.update_target_parameter()
        

    def playEpisode(self,episode):
        '''Play an episode in the environment '''
        
        #get initial state from the environment
        state=self.env.reset()
        state=state.reshape(1,self.state_size)
        done=False
        total_reward=0
   
        while not done:

            #get action for an environment state
            action=self.actor.get_action(state)+self.action_noise.get_noise(episode)
            prev_state=state
            # get new-state, reward, done tuple
            state, reward, done, _ = self.env.step(action)
            state=state.reshape(1,self.state_size)
            
            #self.env.render(mode='rgb_array')
            total_reward=total_reward+reward

            # add <state, action, reward, next-state, done > tuple into the experience replay
            self.exp_replay.addExperience(prev_state, action, reward, state, done)
            
            # start the training
            self.train_networks()
            
        return total_reward
            

    def run_model(self):
        '''Main loop. Runs the environment and traing the networks '''
        
        totalrewards = np.empty(self.num_episodes+1)
        n_steps=10
        
        for n in range(0, self.num_episodes+1):
            
            total_reward=self.playEpisode(n)
            
            totalrewards[n]=total_reward 
            
            if n>0 and n%n_steps==0:
                print("episodes: %i, avg_reward (last: %i episodes): %.2f" %(n, n_steps, totalrewards[max(0, n-n_steps):(n+1)].mean()))
Example #11
# Game environment
env = gym.make('MountainCarContinuous-v0')

# Actor
actorNet = Actor.Actor(env.observation_space.shape, env.action_space.shape, lr=wandb.config.lr_A)
actorNet_target = Actor.Actor(env.observation_space.shape, env.action_space.shape, lr=wandb.config.lr_A)

# Critic
criticNet = Critic.Critic(env.observation_space.shape, env.action_space.shape, lr=wandb.config.lr_C)
criticNet_target = Critic.Critic(env.observation_space.shape, env.action_space.shape, lr=wandb.config.lr_C)

# replay buffer
rpm = ReplayBuffer.ReplayBuffer(1000000) # 1M history

noise = OrnsteinUhlenbeckActionNoise(mean=0.0, sigma=0.5, size=env.action_space.shape)

# (gradually) replace target network weights with online network weights
def replace_weights(tau=wandb.config.tau):
    theta_a,theta_c = actorNet.model.get_weights(),criticNet.model.get_weights()
    theta_a_targ,theta_c_targ = actorNet_target.model.get_weights(),criticNet_target.model.get_weights()

    # mixing factor tau : we gradually shift the weights...
    theta_a_targ = [theta_a[i]*tau + theta_a_targ[i]*(1-tau) for i in range(len(theta_a))]
    theta_c_targ = [theta_c[i]*tau + theta_c_targ[i]*(1-tau) for i in range(len(theta_c))]

    actorNet_target.model.set_weights(theta_a_targ)
    criticNet_target.model.set_weights(theta_c_targ)

def train(verbose=1, batch_size=wandb.config.batch_size, gamma=wandb.config.gamma):
    # if there are enough samples to learn from
Example #12
    def train(self):

        config = tf.ConfigProto(allow_soft_placement=True,
                                log_device_placement=False)
        config.gpu_options.allow_growth = True

        # Create global step and increment operation
        global_step_tensor = tf.Variable(0,
                                         trainable=False,
                                         name='global_step')
        increment_global_step = tf.assign_add(global_step_tensor, 1)

        # Create model saver
        saver = tf.train.Saver()

        sess = tf.Session(config=config)

        if not self.parameters['restore']:
            sess.run(tf.global_variables_initializer())
        else:
            saver.restore(sess, tf.train.latest_checkpoint('./saves'))

        self.actor_critic.set_moving_to_target(sess)
        run_id = np.random.randint(10000)

        trainwriter = tf.summary.FileWriter(logdir='./logs/' + str(run_id),
                                            graph=sess.graph)

        # Get action noise
        action_noise = OrnsteinUhlenbeckActionNoise(
            mu=np.zeros(self.nA),
            sigma=float(self.parameters['sigma']) * np.ones(self.nA))

        # Fill Replay Memory
        state = self.env.reset()
        fill_amount = 0
        while fill_amount < self.parameters['replay_init_size']:

            action = self.env.action_space.sample()
            next_state, reward, done, _ = self.env.step(action)

            if done:
                state = self.env.reset()
            else:
                fill_amount += 1
                self.memory.add(state, action, reward, done, next_state)
                state = next_state

        # Main Loop
        steps = 0

        for i in range(self.parameters['num_epochs']):

            avg_epoch_rewards = 0
            num_epochs = 1
            for e in range(self.parameters['num_episodes']):

                state = self.env.reset()

                ep_reward = 0

                # Perform rollout
                while True:
                    noise = action_noise()
                    action = self.actor_critic.pi(sess, state[None, ...])
                    action += noise
                    action = np.clip(action, self.env.action_space.low[0],
                                     self.env.action_space.high[0])

                    assert action.shape == self.env.action_space.shape
                    """
					# UNCOMMENT TO PRINT ACTIONS
					a0 = tf.Summary(value=[tf.Summary.Value(tag="action_0", simple_value=action[0,0])])
					trainwriter.add_summary(a0,steps)
					a1 = tf.Summary(value=[tf.Summary.Value(tag="action_1", simple_value=action[0,1])])
					trainwriter.add_summary(a1,steps)
					a2 = tf.Summary(value=[tf.Summary.Value(tag="action_2", simple_value=action[0,2])])
					trainwriter.add_summary(a2,steps)
					steps += 1
					"""

                    next_state, reward, done, _ = self.env.step(action)

                    self.memory.add(state, action, reward, done, next_state)

                    if self.parameters['render_train']:
                        self.env.render()

                    ep_reward += reward

                    if done:

                        reward_summary = tf.Summary(value=[
                            tf.Summary.Value(tag="ep_rewards",
                                             simple_value=ep_reward)
                        ])
                        trainwriter.add_summary(
                            reward_summary,
                            i * self.parameters['num_episodes'] + e)
                        action_noise.reset()
                        break

                    state = next_state

                avg_epoch_rewards = avg_epoch_rewards + (
                    ep_reward - avg_epoch_rewards) / num_epochs
                num_epochs += 1

                # Perform train
                for t in range(self.parameters['num_train_steps']):
                    s_state, s_action, s_reward, s_next_state, s_terminal = self.memory.sample(
                    )
                    # Train actor critic model
                    self.actor_critic.update(sess=sess,
                                             filewriter=trainwriter,
                                             state_batch=s_state,
                                             next_state_batch=s_next_state,
                                             action_batch=s_action,
                                             reward_batch=s_reward,
                                             done_batch=s_terminal)
                    sess.run(increment_global_step)

            # Print out epoch stats here

            table_data = [['Epoch', 'Average Reward'],
                          [
                              str(i) + "/" +
                              str(self.parameters['num_epochs']),
                              str(avg_epoch_rewards)
                          ]]

            table = AsciiTable(table_data, "Training Run: " + str(run_id))

            save_path = saver.save(sess, "./saves/model.ckpt")

            os.system('clear')
            print("Model saved in path: %s" % save_path + "\n" + table.table)
Example #13
    def train(self):

        config = tf.ConfigProto(allow_soft_placement=True,
                                log_device_placement=False)
        config.gpu_options.allow_growth = True

        # Create global step and increment operation
        global_step_tensor = tf.Variable(0,
                                         trainable=False,
                                         name='global_step')
        increment_global_step = tf.assign_add(global_step_tensor, 1)

        # Create model saver
        saver = tf.train.Saver(max_to_keep=None)

        sess = tf.Session(config=config)

        if not self.parameters['restore']:
            sess.run(tf.global_variables_initializer())
        else:
            saver.restore(sess, tf.train.latest_checkpoint('./saves'))

        self.actor_critic.set_moving_to_target(sess)
        run_id = np.random.randint(10000)

        trainwriter = tf.summary.FileWriter(logdir='./logs/' + str(run_id),
                                            graph=sess.graph)

        # Get action noise
        action_noise = OrnsteinUhlenbeckActionNoise(
            mu=np.zeros(self.nA),
            sigma=float(self.parameters['sigma']) * np.ones(self.nA))

        # Fill Replay Memory
        state = self.env.reset()
        fill_amount = 0
        while fill_amount < self.parameters['replay_init_size']:

            action = self.env.action_space.sample()
            next_state, reward, done, _ = self.env.step(action)

            if done:
                state = self.env.reset()
            else:
                fill_amount += 1
                self.memory.add(state, action, reward, done, next_state)
                state = next_state

        # Main Loop
        plots = {'critic_loss': [], 'actor_loss': [], 'episode_reward': []}

        plots_dir = './plots/'
        weights_dir = './weights/'
        graph_dir = './graph/'
        if not os.path.exists(plots_dir):
            os.makedirs(plots_dir)
        if not os.path.exists(weights_dir):
            os.makedirs(weights_dir)
        if not os.path.exists(graph_dir):
            os.makedirs(graph_dir)

        saver.export_meta_graph(graph_dir + self.parameters['env'] +
                                '/graph.meta')

        #cumulative step counter
        cumu_step = 0

        for i in range(self.parameters['num_epochs']):

            avg_epoch_rewards = 0
            n_epochs = 1

            for e in range(self.parameters['num_episodes']):

                state = self.env.reset()

                ep_reward = 0
                ep_n_action = 0

                # Perform rollout
                for _ in range(500):
                    noise = action_noise()
                    action = self.actor_critic.pi(sess, state[None, ...])
                    action += noise
                    action = np.clip(action, self.env.action_space.low[0],
                                     self.env.action_space.high[0])

                    assert action.shape == self.env.action_space.shape

                    next_state, reward, done, _ = self.env.step(action)
                    # print(action)
                    # print(next_state)
                    # print(reward)

                    self.memory.add(state, action, reward, done, next_state)

                    if self.parameters['render_train']: self.env.render()

                    ep_reward += reward
                    ep_n_action += 1
                    cumu_step += 1
                    state = next_state

                    # Perform train
                    avg_critic_loss = 0.0
                    avg_actor_loss = 0.0
                    for t in range(self.parameters['num_train_steps']):
                        s_state, s_action, s_reward, s_next_state, s_terminal = self.memory.sample(
                        )
                        # Train actor critic model
                        _, _, critic_loss, actor_loss = self.actor_critic.update(
                            sess=sess,
                            filewriter=trainwriter,
                            state_batch=s_state,
                            next_state_batch=s_next_state,
                            action_batch=s_action,
                            reward_batch=s_reward,
                            done_batch=s_terminal)
                        avg_critic_loss += critic_loss
                        avg_actor_loss += actor_loss

                        sess.run(increment_global_step)

                    avg_critic_loss /= self.parameters['num_train_steps']
                    avg_actor_loss /= self.parameters['num_train_steps']

                    if done:
                        reward_summary = tf.Summary(value=[
                            tf.Summary.Value(tag="ep_rewards",
                                             simple_value=ep_reward)
                        ])
                        trainwriter.add_summary(
                            reward_summary,
                            i * self.parameters['num_episodes'] + e)
                        action_noise.reset()
                        break

                avg_epoch_rewards = avg_epoch_rewards + (
                    ep_reward - avg_epoch_rewards) / n_epochs
                n_epochs += 1


                print('Epoch: {:d} | Reward: {:d} | Avg_Q_loss: {:.4f} | Avg_a_loss: {:.4f} | Episode: {:d} | Step: {:d} | Cumu Step: {:d}'\
                 .format(i+1, int(ep_reward), avg_critic_loss, avg_actor_loss, e+1, ep_n_action, cumu_step))

                if e % 19 == 0:
                    save_path = saver.save(
                        sess,
                        weights_dir + self.parameters['env'] + '/model.ckpt',
                        global_step=i * e + 1)

                plots['episode_reward'].append(ep_reward)
                plots['critic_loss'].append(avg_critic_loss)
                plots['actor_loss'].append(avg_actor_loss)

                pickle.dump(
                    plots,
                    open(plots_dir + self.parameters['env'] + '_plot.pickle',
                         'wb'))
Example #14
def main(args):

    with tf.Session() as sess:
        env = gym.make(args['env'])
        np.random.seed(int(args['random_seed']))
        tf.set_random_seed(int(args['random_seed']))
        env.seed(int(args['random_seed']))

        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]
        action_bound = env.action_space.high
        # Ensure action bound is symmetric
        # assert (env.action_space.high == -env.action_space.low)

        actor = ActorNetwork(sess, state_dim, action_dim, action_bound,
                             float(args['actor_lr']), float(args['tau']),
                             int(args['minibatch_size']))

        critic = CriticNetwork(sess, state_dim, action_dim,
                               float(args['critic_lr']), float(args['tau']),
                               float(args['gamma']),
                               actor.get_num_trainable_vars())

        actor_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(action_dim))

        if args['train']:
            if not os.path.exists(args['save_dir']):
                os.makedirs(args['save_dir'])
            with open(os.path.join(args['save_dir'], 'config.json'), 'w') as f:
                json.dump(args, f, indent=2)
            train(sess, env, args, actor, critic, actor_noise)
        else:
            # ddpg = []
            # indexes = [e for e in range(400) if e % 10 == 9]
            # indexes = [0] + indexes
            indexes = [399]
            num_test_tasks = 100
            buckets = 1
            successes = []
            directory = args['to_pickle']
            for index in indexes:
                # times = []
                task_success = []
                saver = tf.train.Saver()
                saver.restore(
                    sess, "../final_models/multitask/fixed/{0}/model-{1}.ckpt".
                    format(directory, index))
                for _ in range(buckets):
                    tasks = env.unwrapped.sample_tasks(num_test_tasks)
                    # tasks = [{'goal': np.array([0., 0.])} for e in range(num_test_tasks)]
                    success = 0
                    for task in tasks:
                        s = env.reset_task(task)
                        step = 0
                        d = False
                        while not d:
                            # env.render()
                            action = actor.predict_target(
                                np.reshape(s, (1, actor.s_dim)))[0]
                            step += 1
                            s, r, d, _ = env.step(action)
                        if r == 1:
                            success += 1
                        # times.append(step)
                    env.close()
                    task_success.append(success / num_test_tasks)
                successes.append(task_success)
                # ddpg.append(times)
            # out = [successes, ddpg]
            env.close()
            if not os.path.exists('./pkls'):
                os.makedirs('./pkls')
            with open('./pkls/{0}.pkl'.format(args['save_dir']), 'wb') as f:
                pickle.dump(successes, f)
Example #15
def main(args):
    params = '_delta_'+str(args['delta'])+\
              '_wrapper_'+str(args['wrapper'])+\
              '_hindsight_'+str(args['with_hindsight'])
    logdir = args['summary_dir']
    final_dir = logdir+'/'+params+'/'+datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S")

    logger_step = Logger(dir=final_dir+'/log_step',format_strs=['json', 'tensorboard'])
    logger_episode = Logger(dir=final_dir+'/log_episodes', format_strs=['stdout', 'json', 'tensorboard'])


    actor_lr = float(args['actor_lr'])
    tau = float(args['tau'])
    critic_lr = float(args['critic_lr'])
    gamma = float(args['gamma'])
    batch_size = int(args['minibatch_size'])
    eval_episodes = int(args['eval_episodes'])
    max_episode_steps = int(args['max_episode_steps'])
    max_steps = int(args['max_steps'])
    eval_freq = int(args['eval_freq'])

    train_env = gym.make(args['env'])
    test_env = gym.make(args['env'])


    if args['wrapper'] == 'NoGoal':
        env_wrapper = NoGoal()
    elif args['wrapper'] == 'RandomGoal':
        env_wrapper = RandomGoal()
    elif args['wrapper'] == 'HandCurri':
        env_wrapper = HandmadeCurriculum()
    else:
        raise ValueError('Unknown wrapper: {}'.format(args['wrapper']))

    state_dim = env_wrapper.state_shape[0]
    action_dim = env_wrapper.action_shape[0]
    action_bound = train_env.action_space.high
    # Ensure action bound is symmetric
    assert (train_env.action_space.high == -train_env.action_space.low)

    actor_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(action_dim))

    # Initialize replay memory
    if args['with_hindsight']:
        memory = HerMemory(env_wrapper, with_reward=True, limit=int(1e6), strategy='last')
    else:
        memory = Memory(env_wrapper, with_reward=True, limit=int(1e6))


    with tf.Session() as sess:

        if args['random_seed'] is not None:
            np.random.seed(int(args['random_seed']))
            tf.set_random_seed(int(args['random_seed']))
            train_env.seed(int(args['random_seed']))
            test_env.seed(int(args['random_seed']))

        actor = ActorNetwork(sess,
                             state_dim,
                             action_dim,
                             action_bound,
                             tau,
                             actor_lr)

        critic = CriticNetwork(sess,
                               state_dim,
                               action_dim,
                               gamma,
                               tau,
                               critic_lr)

        agent = DDPG_agent(sess,
                           actor,
                           actor_noise,
                           critic,
                           train_env,
                           test_env,
                           env_wrapper,
                           memory,
                           logger_step,
                           logger_episode,
                           batch_size,
                           eval_episodes,
                           max_episode_steps,
                           max_steps,
                           eval_freq)
        agent.run()
Example #16
File: ddpg.py  Project: KYJun/rl_tutorial
    def train(self):

        with tf.Session(graph=self.graph) as sess:

            self._load_model(sess, self.params.load_model)
            self.total_episodes = self.params.total_episodes

            # Obtain an initial observation of the environment
            state = self.env.reset()
            state_input = state.reshape([1, self.params.input_dim])

            for episode_number in xrange(self.params.total_episodes):

                done = False
                score = 0

                while not done:

                    if self.global_step > self.params.preTrainStep:

                        # Value network update
                        trainBatch = self.myBuffer.sample(
                            self.params.batch_size)

                        batch_state = np.array(trainBatch[0]).reshape(
                            [self.params.batch_size, self.params.input_dim])
                        batch_actions = np.array(trainBatch[1]).reshape(
                            [self.params.batch_size, self.params.num_actions])
                        batch_rewards = np.array(trainBatch[2])
                        batch_next_state = np.array(trainBatch[3]).reshape(
                            [self.params.batch_size, self.params.input_dim])
                        batch_done = np.array(trainBatch[4])

                        end_multiplier = -(batch_done - 1)

                        target_action = sess.run(self.target_actor.det_prob,
                                                 feed_dict={
                                                     self.target_actor.input_x:
                                                     batch_next_state
                                                 })
                        target_action = np.array([[1, 0] if i == 0 else [0, 1]
                                                  for i in target_action])
                        targetQ_all = sess.run(self.target_critic.Qout,
                                               feed_dict={
                                                   self.target_critic.input_x:
                                                   batch_next_state,
                                                   self.target_critic.actions:
                                                   target_action
                                               })
                        nextQ = np.sum(np.multiply(targetQ_all, target_action),
                                       axis=-1)
                        targetQ = batch_rewards + (self.params.gamma * nextQ *
                                                   end_multiplier)

                        pred_actions = sess.run(
                            self.main_actor.det_prob,
                            feed_dict={self.main_actor.input_x: batch_state})
                        pred_actions = np.array([[1, 0] if i == 0 else [0, 1]
                                                 for i in pred_actions])

                        # Update the network with our target values.
                        sess.run(self.main_critic.update_value_model,
                                 feed_dict={
                                     self.main_critic.input_x: batch_state,
                                     self.main_critic.target_Q: targetQ,
                                     self.main_critic.actions: batch_actions
                                 })
                        self.update_Target(self.critic_targetOps, sess)

                        gradients = sess.run(self.main_critic.action_grads,
                                             feed_dict={
                                                 self.main_critic.input_x:
                                                 batch_state,
                                                 self.main_critic.actions:
                                                 pred_actions
                                             })

                        gradients = np.array(gradients).reshape(
                            self.params.batch_size, self.params.num_actions)
                        sess.run(self.main_actor.optimize,
                                 feed_dict={
                                     self.main_actor.input_x: batch_state,
                                     self.main_actor.action_gradient: gradients
                                 })

                        self.update_Target(self.actor_targetOps, sess)

                    # Make sure the observation is in a shape the network can handle.
                    state_buffer, reward_buffer, action_buffer, next_state_buffer, done_buffer = [], [], [], [], []

                    # Note: a fresh OU noise process (with default sigma) is constructed on every step here.
                    actor_noise = OrnsteinUhlenbeckActionNoise(
                        mu=np.zeros(self.params.num_actions))

                    action = sess.run(self.main_actor.logits,
                                      feed_dict={
                                          self.main_actor.input_x: state_input
                                      }) + actor_noise()
                    action = np.argmax(action)

                    # step the environment and get new measurements
                    next_state, reward, done, _ = self.env.step(action)

                    next_state = next_state.reshape([1, self.params.input_dim])

                    state_buffer.append(state_input)
                    action_buffer.append([1, 0] if action == 0 else [0, 1])
                    reward_buffer.append(
                        reward if not done or score == 299 else -100)
                    #reward_buffer.append(reward)
                    next_state_buffer.append(next_state)
                    done_buffer.append(done)

                    # move to next state
                    state_input = next_state

                    # add up reward
                    self.reward_sum += reward
                    score += reward
                    self.global_step += 1
                    self.myBuffer.append(state_buffer, action_buffer,
                                         reward_buffer, next_state_buffer,
                                         done_buffer)

                if episode_number % self.params.update_freq == 0:
                    self.running_reward = (self.reward_sum
                                           if self.running_reward is None else
                                           self.running_reward * 0.99 +
                                           self.reward_sum * 0.01)
                    print(
                        'Current Episode {} Average reward for episode {:.2f}.  Total average reward {:.2f}.'
                        .format(episode_number,
                                self.reward_sum // self.params.update_freq,
                                self.running_reward //
                                self.params.update_freq))
                    self.reward_sum = 0
                    time.sleep(0.5)

                self.state = self.env.reset()
                state_input = self.state.reshape([1, self.params.input_dim])
                self.global_step += 1
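
Every example on this page constructs some variant of OrnsteinUhlenbeckActionNoise, but the exact class differs per project (some call .noise() or .sample(), one passes only the action dimension). For reference, a minimal sketch in the style of the OpenAI Baselines implementation; the theta, dt, and x0 defaults below are assumptions:

import numpy as np

class OrnsteinUhlenbeckActionNoise:
    """Temporally correlated exploration noise:
    dx = theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, 1)."""

    def __init__(self, mu, sigma, theta=0.15, dt=1e-2, x0=None):
        self.mu, self.sigma = mu, sigma
        self.theta, self.dt, self.x0 = theta, dt, x0
        self.reset()

    def __call__(self):
        # One Euler-Maruyama step of the OU process.
        x = (self.x_prev
             + self.theta * (self.mu - self.x_prev) * self.dt
             + self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape))
        self.x_prev = x
        return x

    def noise(self):
        # Alias used by some of the snippets elsewhere on this page.
        return self()

    sample = noise

    def reset(self):
        self.x_prev = self.x0 if self.x0 is not None else np.zeros_like(self.mu)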
Example #17
0
def run(env_id, seed, noise_type, layer_norm, evaluation, memory_limit,
        **kwargs):
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)
    print("rank: %d" % (rank))
    env = gym.make(env_id)
    env = bench.Monitor(
        env,
        logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))

    if evaluation and rank == 0:
        eval_env = gym.make(env_id)
        eval_env = bench.Monitor(eval_env,
                                 os.path.join(logger.get_dir(), "gym_eval"))
        env = bench.Monitor(env, None)
    else:
        eval_env = None

    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]

    for current_noise_type in noise_type.split(","):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == "none":
            pass
        elif "adaptive-param" in current_noise_type:
            _, stddev = current_noise_type.split("_")
            param_noise = AdaptiveParamNoiseSpec(
                initial_stddev=float(stddev),
                desired_action_stddev=float(stddev))
        elif "normal" in current_noise_type:
            _, stddev = current_noise_type.split("_")
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                             sigma=float(stddev) *
                                             np.ones(nb_actions))
        elif "ou" in current_noise_type.split("_"):
            _, stddev = current_noise_type.split("_")
            action_noise = OrnsteinUhlenbeckActionNoise(
                mu=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError(
                'unknown noise type "{}"'.format(current_noise_type))
    print(type(memory_limit), memory_limit)
    memory = Memory(limit=int(memory_limit),
                    action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)

    seed = seed + 1000000 * rank
    logger.info("rank {} : seed={}, logdir={}".format(rank, seed,
                                                      logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    if rank == 0:
        start_time = time.time()

    if option == 1:  # 'option' is presumably defined at module scope and selects the training routine
        training.train(env=env,
                       eval_env=eval_env,
                       param_noise=param_noise,
                       action_noise=action_noise,
                       actor=actor,
                       critic=critic,
                       memory=memory,
                       **kwargs)
    elif option == 2:
        training_reward_shaping.train(env=env,
                                      eval_env=eval_env,
                                      param_noise=param_noise,
                                      action_noise=action_noise,
                                      actor=actor,
                                      critic=critic,
                                      memory=memory,
                                      **kwargs)
    env.close()
    if eval_env is not None:
        eval_env.close()
    if rank == 0:
        logger.info("total runtime: {}s".format(time.time() - start_time))
class DDPGAgent(object):
    def __init__(self, state_size, action_size, action_bound_high,
                 action_bound_low, imitation_data_path):

        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.state_size = state_size
        self.action_bound_high = torch.Tensor([action_bound_high]).to(device)
        self.action_bound_low = torch.Tensor([action_bound_low]).to(device)
        self.action_size = action_size

        self.buffer = EfficientReplayMemory(Parameters.BUFFER_SIZE,
                                            self.state_size, self.action_size)
        self.imitation_buffer = EfficientReplayMemory(
            Parameters.IMITATION_BUFFER_SIZE, self.state_size,
            self.action_size)
        self.imitation_buffer.load_memory(imitation_data_path)
        self.imitation_lambda = Parameters.IMITATION_LAMBDA

        # Actor
        self.policy_function = Policy(self.state_size, self.action_size,
                                      self.action_bound_high)
        self.policy_function_target = Policy(self.state_size, self.action_size,
                                             self.action_bound_high)
        self.policy_function_noisy = Policy(self.state_size, self.action_size,
                                            self.action_bound_high)
        self.policy_function_optim = Adam(self.policy_function.parameters(),
                                          lr=Parameters.ACTOR_LEARNING_RATE)
        self.imitation_optimizer = Adam(self.policy_function.parameters(),
                                        lr=self.imitation_lambda)

        # critic 1 (q-value)
        self.q_function = QFunction(self.state_size, self.action_size)
        self.q_function_target = QFunction(self.state_size, self.action_size)
        self.q_function_optim = Adam(self.q_function.parameters(),
                                     lr=Parameters.CRITIC_LEARNING_RATE)

        # Noise parameters
        self.action_noise = OrnsteinUhlenbeckActionNoise(self.action_size)
        self.desired_action_std = Parameters.DESIRED_ACTION_STD
        self.current_noise_std = Parameters.INITIAL_NOISE_STD
        self.coefficient = Parameters.ADAPT_COEFFICIENT

        # hyperparameters
        self.gamma = Parameters.GAMMA
        self.tau = Parameters.TAU

        self.hard_update_network(self.policy_function_target,
                                 self.policy_function)
        self.hard_update_network(self.q_function_target, self.q_function)

    def soft_update_network(self, target, source):
        for target_parameters, source_parameters in zip(
                target.parameters(), source.parameters()):
            target_parameters.data.copy_(target_parameters.data *
                                         (1.0 - self.tau) +
                                         source_parameters.data * self.tau)

    def hard_update_network(self, target, source):
        target.load_state_dict(source.state_dict())

    def chose_action(self, state, exploration=True):
        self.policy_function.eval()

        if exploration and Parameters.PARAMETER_NOISE:
            action = self.policy_function_noisy((Variable(state)))
        else:
            action = self.policy_function((Variable(state)))

        self.policy_function.train()
        action = action.data

        if self.action_noise is not None and exploration:
            action += torch.Tensor(self.action_noise.sample())

        return action.clamp(-1, 1)

    def store_buffer_transition(self, state, action, mask, next_state, reward):
        self.buffer.push(state, action, reward, next_state, mask)

    def smooth_l1_loss(self, input, target, beta=1, size_average=True):
        """
        Re-implementation of PyTorch's smooth_l1_loss; written because the
        PyTorch version available at the time was buggy.
        """
        n = torch.abs(input - target)
        cond = n < beta
        loss = torch.where(cond, 0.5 * n**2 / beta, n - 0.5 * beta)
        if size_average:
            return loss.mean()
        return loss.sum()

    def train(self):
        # sample batch and train
        state_batch, action_batch, reward_batch, next_state_batch, mask_batch = self.buffer.sample(
            Parameters.BATCH_SIZE)
        loss_imitation = 0

        state_batch = Variable(state_batch)
        action_batch = Variable(action_batch)
        reward_batch = Variable(reward_batch)
        mask_batch = Variable(mask_batch)
        next_state_batch = Variable(next_state_batch)

        # train the critic (Q-function)
        next_actions = self.policy_function_target(next_state_batch)
        next_q_values = self.q_function_target(next_state_batch, next_actions)
        expected_q_values = reward_batch + (self.gamma * mask_batch *
                                            next_q_values)

        self.q_function_optim.zero_grad()
        predicted_q_values = self.q_function(state_batch, action_batch)
        #q_value_loss = F.smooth_l1_loss(predicted_q_values, expected_q_values)
        #q_value_loss = F.smooth_l1_loss(expected_q_values, predicted_q_values)
        q_value_loss = self.smooth_l1_loss(expected_q_values,
                                           predicted_q_values)
        #q_value_loss = (predicted_q_values - expected_q_values).pow(2).mean()
        q_value_loss.backward()
        self.q_function_optim.step()

        # train the policy
        self.policy_function_optim.zero_grad()

        q_value_prediction = self.q_function(state_batch,
                                             self.policy_function(state_batch))

        # maximize the Q value for the chosen action
        policy_loss = -q_value_prediction
        policy_loss = policy_loss.mean()
        policy_loss.backward()
        self.policy_function_optim.step()

        if Parameters.USE_IMITATION_LEARNING:
            state_batch_imitation, action_batch_imitation, _, _, _ = self.imitation_buffer.sample(
                Parameters.IMITATION_BATCH_SIZE)

            action_batch_imitation = Variable(action_batch_imitation,
                                              requires_grad=True)
            state_batch_imitation = Variable(state_batch_imitation,
                                             requires_grad=True)
            predicted_actions = self.chose_action(state_batch_imitation, False)
            q_value_prediction = self.q_function(state_batch_imitation,
                                                 predicted_actions)
            q_value_imitation = self.q_function(state_batch_imitation,
                                                action_batch_imitation)

            # Only try to learn the actions that were actually better than the current policy
            imitation_mask = (q_value_imitation > q_value_prediction)

            self.imitation_optimizer.zero_grad()

            loss_imitation = ((predicted_actions - action_batch_imitation) *
                              imitation_mask.float()).pow(2).mean()
            loss_imitation.backward()

            self.imitation_optimizer.step()

        # update the target networks
        self.update_networks()

        return q_value_loss.item(), policy_loss.item()

    def update_networks(self):
        self.soft_update_network(self.policy_function_target,
                                 self.policy_function)
        self.soft_update_network(self.q_function_target, self.q_function)

    def noise_actor_parameters(self):
        """
        Apply dynamic noise to the actor network PARAMETERS for better exploration.
        See:
        https://github.com/openai/baselines/blob/master/baselines/ddpg/noise.py
        https://blog.openai.com/better-exploration-with-parameter-noise/
        """
        self.hard_update_network(self.policy_function_noisy,
                                 self.policy_function)
        params = self.policy_function_noisy.state_dict()
        for key in params:
            if 'ln' in key:
                continue  # skip LayerNorm parameters
            param = params[key]
            param += (torch.randn(param.shape) * self.current_noise_std).to(
                self.policy_function_noisy.device)

    def adapt_parameter_noise(self, states, actions):
        """
        Adapt the rate of noise dynamically according to a specified target.
        See:
        https://github.com/openai/baselines/blob/master/baselines/ddpg/noise.py
        https://blog.openai.com/better-exploration-with-parameter-noise/
        """
        states = torch.cat(states, 0)
        unperturbed_actions = self.chose_action(states, False)
        perturbed_actions = torch.cat(actions, 0)

        # calculate the Euclidean distance between the two sets of actions:
        mean_diff = np.mean(np.square(
            (perturbed_actions - unperturbed_actions).numpy()),
                            axis=0)
        distance = sqrt(np.mean(mean_diff))

        # adapt the standard deviation of the parameter noise
        if distance > self.desired_action_std:
            self.current_noise_std /= self.coefficient
        else:
            self.current_noise_std *= self.coefficient

    def save_models(self, path="./"):
        torch.save(self.policy_function.state_dict(), path + "actor.pt")
        torch.save(self.q_function.state_dict(), path + "critic.pt")
        print("Models saved successfully")

    def load_models(self, path="./"):
        if isfile(path + "actor.pt"):
            self.policy_function.load_state_dict(torch.load(path + "actor.pt"))
            self.q_function.load_state_dict(torch.load(path + "critic.pt"))
            self.policy_function_target.load_state_dict(
                self.policy_function.state_dict())
            self.q_function_target.load_state_dict(
                self.q_function.state_dict())
            print("Models loaded succesfully")
        else:
            print("No model to load")
Example #19
0
    def __init__(self,
                 session,
                 state_shape,
                 action_shape,
                 action_bound,
                 learning_rate,
                 tau,
                 loss_mask=True):
        """ Initialize actor and target networks and update methods. """
        self.session = session
        self.state_shape = state_shape
        self.action_shape = action_shape
        self.action_bound = action_bound
        self.learning_rate = learning_rate
        self.tau = tau

        self.hidden_1_size = 400
        self.hidden_2_size = 300

        self.batch_size = tf.placeholder(tf.int32, shape=[], name='batch_act')
        self.trace_length = tf.placeholder(tf.int32, name='trace_act')
        self.phase = tf.placeholder(tf.bool, name='phase_act')

        # Initialize additive Ornstein-Uhlenbeck noise
        self.OU_noise = \
            OrnsteinUhlenbeckActionNoise(mu=np.zeros(self.action_shape))

        # Initialize actor network
        self.inputs, self.out, self.scaled_out, self.lstm_state, \
            self.lstm_init_state = self.create_actor_network()
        self.network_params = tf.trainable_variables()

        # Initialize target actor network
        self.target_inputs, self.target_out, self.target_scaled_out, \
            self.target_lstm_state, self.target_lstm_init_state = \
                self.create_actor_network(prefix='tar_')
        self.target_network_params = \
            tf.trainable_variables()[len(self.network_params):]

        # Define target update op
        self.update_target_network_params = \
            [self.target_network_params[i].assign(
            tf.multiply(self.network_params[i], self.tau) +
            tf.multiply(self.target_network_params[i], 1.0 - self.tau))
            for i in range(len(self.target_network_params))]

        # Define ops for getting necessary gradients
        self.action_gradient = \
            tf.placeholder(tf.float32, [None, self.action_shape])

        if loss_mask:
            # Mask first half of losses for each trace, per Lample & Chaplot (2016)
            self.maskA = tf.zeros([self.batch_size, self.trace_length // 2])
            self.maskB = tf.ones([self.batch_size, self.trace_length // 2])
            self.mask = tf.concat([self.maskA, self.maskB], 1)
            self.mask = tf.reshape(self.mask, [-1])
            self.action_gradient_adjusted = self.action_gradient * self.mask
        else:
            self.action_gradient_adjusted = self.action_gradient

        self.unnormalized_actor_gradients = tf.gradients(
            self.scaled_out, self.network_params,
            -self.action_gradient_adjusted)
        self.actor_gradients = list(
            map(lambda x: tf.div(x, tf.cast(self.batch_size, tf.float32)),
                self.unnormalized_actor_gradients))

        # Define optimization op
        # TODO: Only update BN params when needed instead of all the time!
        # update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        # with tf.control_dependencies(update_ops):
        self.optimize = tf.train.AdamOptimizer(self.learning_rate).\
            apply_gradients(zip(self.actor_gradients, self.network_params))

        self.num_trainable_vars = \
            len(self.network_params) + len(self.target_network_params)
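
The maskA/maskB construction above implements the trace masking of Lample & Chaplot (2016): only the second half of each sampled trace contributes to the actor update, giving the LSTM a burn-in period. A small numpy illustration of the same mask (batch and trace sizes are arbitrary):

import numpy as np

batch_size, trace_length = 2, 8
mask = np.concatenate([np.zeros((batch_size, trace_length // 2)),
                       np.ones((batch_size, trace_length // 2))], axis=1).reshape(-1)
# mask == [0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1]; multiplying it into the action
# gradients zeroes the contribution of the first half of every trace.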
Example #20
0
class DDPG(object):

    def __init__(self, gamma, tau, num_inputs, env, device, results_path=None):

        self.gamma = gamma
        self.tau = tau
        self.min_action, self.max_action = env.action_range()
        self.device = device
        self.num_actions = env.action_space()
        self.noise_stddev = 0.3

        self.results_path = results_path
        self.checkpoint_path = os.path.join(self.results_path, 'checkpoint/')
        os.makedirs(self.checkpoint_path, exist_ok=True)

        # Define the actor
        self.actor = Actor(num_inputs, self.num_actions).to(device)
        self.actor_target = Actor(num_inputs, self.num_actions).to(device)

        # Define the critic
        self.critic = Critic(num_inputs, self.num_actions).to(device)
        self.critic_target = Critic(num_inputs, self.num_actions).to(device)

        # Define the optimizers for both networks
        self.actor_optimizer = Adam(self.actor.parameters(), lr=1e-4)  # optimizer for the actor network
        self.critic_optimizer = Adam(self.critic.parameters(), lr=1e-4, weight_decay=0.002)  # optimizer for the critic network

        self.hard_swap()

        self.ou_noise = OrnsteinUhlenbeckActionNoise(
            mu=np.zeros(self.num_actions),
            sigma=float(self.noise_stddev) * np.ones(self.num_actions))
        self.ou_noise.reset()

    def eval_mode(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic_target.eval()
        self.critic.eval()

    def train_mode(self):
        self.actor.train()
        self.actor_target.train()
        self.critic_target.train()
        self.critic.train()


    def get_action(self, state, episode, action_noise=True):
        x = state.to(self.device)

        # Get the continous action value to perform in the env
        self.actor.eval()  # Sets the actor in evaluation mode
        mu = self.actor(x)
        self.actor.train()  # Sets the actor in training mode
        mu = mu.data

        # During training we add noise for exploration
        if action_noise:
            # Scale the OU noise down as training progresses and clamp it to [0, 0.1].
            # (Spinning Up notes that plain Gaussian noise works just as well:
            # https://spinningup.openai.com/en/latest/algorithms/ddpg.html)
            noise = torch.Tensor(self.ou_noise.noise()).to(self.device) * 1.0 / (1.0 + 0.1 * episode)
            noise = noise.clamp(0, 0.1)
            mu = mu + noise

        # Clip the output according to the action space of the env
        mu = mu.clamp(self.min_action,self.max_action)

        return mu

    def update_params(self, batch):
        # Get tensors from the batch
        state_batch = torch.cat(batch.state).to(self.device)
        action_batch = torch.cat(batch.action).to(self.device)
        reward_batch = torch.cat(batch.reward).to(self.device)
        done_batch = torch.cat(batch.done).to(self.device)
        next_state_batch = torch.cat(batch.next_state).to(self.device)

        # Get the actions and the state values to compute the targets
        next_action_batch = self.actor_target(next_state_batch)
        next_state_action_values = self.critic_target(next_state_batch, next_action_batch.detach())

        # Compute the target
        reward_batch = reward_batch.unsqueeze(1)
        done_batch = done_batch.unsqueeze(1)
        expected_values = reward_batch + (1.0 - done_batch) * self.gamma * next_state_action_values

        # Update the critic network
        self.critic_optimizer.zero_grad()
        state_action_batch = self.critic(state_batch, action_batch)
        value_loss = F.mse_loss(state_action_batch, expected_values.detach())
        value_loss.backward()
        self.critic_optimizer.step()

        # Update the actor network
        self.actor_optimizer.zero_grad()
        policy_loss = -self.critic(state_batch, self.actor(state_batch))
        policy_loss = policy_loss.mean()
        policy_loss.backward()
        for param in self.actor.parameters():
            param.grad.data.clamp_(-1, 1)
        self.actor_optimizer.step()

        # Update the target networks
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

        return value_loss.item(), policy_loss.item()
    
    def hard_swap(self):
        # Make sure both targets are with the same weight
        hard_update(self.actor_target, self.actor)
        hard_update(self.critic_target, self.critic)

    def store_model(self):
        print("Storing model at: ", self.checkpoint_path)
        checkpoint = {
            'actor': self.actor.state_dict(),
            'actor_optim': self.actor_optimizer.state_dict(),
            'critic': self.critic.state_dict(),
            'critic_optim': self.critic_optimizer.state_dict()
        }
        torch.save(checkpoint, os.path.join(self.checkpoint_path, 'checkpoint.pth') )

    def load_model(self):
        files = os.listdir(self.checkpoint_path)
        if files:
            print("Loading models checkpoints!")
            model_dicts = torch.load(os.path.join(self.checkpoint_path, 'checkpoint.pth'),map_location=self.device)
            self.actor.load_state_dict(model_dicts['actor'])
            self.actor_optimizer.load_state_dict(model_dicts['actor_optim'])
            self.critic.load_state_dict(model_dicts['critic'])
            self.critic_optimizer.load_state_dict(model_dicts['critic_optim'])
        else:
            print("Checkpoints not found!")
Example #21
0
def standalone_headless_isolated(pq, cq, plock):
    # locking to prevent mixed-up printing.
    plock.acquire()
    print('starting headless...', pq, cq)
    try:
        import traceback
        from osim.env import RunEnv
        e = RunEnv(visualize=False, max_obstacles=0)
        # bind_alternative_pelvis_judgement(e)
        # use_alternative_episode_length(e)
    except Exception as e:
        print('error on start of standalone')
        traceback.print_exc()

        plock.release()
        return
    else:
        plock.release()

    def report(e):
        # a way to report errors ( since you can't just throw them over a pipe )
        # e should be a string
        print('(standalone) got error!!!')
        # conn.send(('error',e))
        # conn.put(('error',e))
        cq.put(('error', e))

    def floatify(n_p):
        return [float(n_p[i]) for i in range(len(n_p))]

    try:
        previous_o = None
        nb_actions = e.action_space.shape[-1]
        action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions),
                                                    sigma=0.3 *
                                                    np.ones(nb_actions))
        while True:
            # msg = conn.recv()
            # msg = conn.get()
            msg = pq.get()
            # messages should be tuples,
            # msg[0] should be string

            # isinstance is dangerous, commented out
            # if not isinstance(msg,tuple):
            #     raise Exception('pipe message received by headless is not a tuple')

            if msg[0] == 'reset':  #or (previous_o==None and msg[0]=='step'):
                o = e.reset(difficulty=0)
                o = floatify(o)
                o_processed = generate_observation(o, o)
                previous_o = o
                cq.put(o_processed)

            elif msg[0] == 'step':
                actions = msg[1]
                noisy_action = np.array(actions) + action_noise()
                o, r, d, i = e.step(noisy_action)
                o = floatify(o)  # floatify the observation
                o_processed = generate_observation(o, previous_o)
                previous_o = o
                cq.put((o_processed, r, d, i))
            elif msg[0] == 'action_space':
                a_s = e.action_space
                r_a_s = (a_s.low.tolist(), a_s.high.tolist(), a_s.shape)
                cq.put(r_a_s)
            elif msg[0] == 'observation_space':
                o_s = get_observation_space()
                r_o_s = (o_s['low'].tolist(), o_s['high'].tolist(),
                         o_s['shape'])
                cq.put(r_o_s)
            else:
                cq.close()
                pq.close()
                del e
                break
    except Exception as e:
        traceback.print_exc()
        report(str(e))

    return  # end process
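
The worker above speaks a simple tuple protocol over the pq/cq queues. A hypothetical parent-side sketch (the process and queue setup below are assumptions, not part of the original project):

import multiprocessing as mp

def start_worker():
    pq, cq = mp.Queue(1), mp.Queue(1)  # parent -> child, child -> parent
    plock = mp.Lock()
    proc = mp.Process(target=standalone_headless_isolated, args=(pq, cq, plock))
    proc.daemon = True
    proc.start()
    return pq, cq, proc

pq, cq, proc = start_worker()
pq.put(('reset',))
obs = cq.get()                      # processed first observation
pq.put(('step', [0.05] * 18))       # RunEnv expects 18 muscle activations
obs, reward, done, info = cq.get()
pq.put(('exit',))                   # any unrecognized message shuts the worker down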
Example #22
0
def learn(
        env,
        seed=None,
        total_timesteps=1e6,
        nb_epochs=None,  # with default settings, perform 1M steps total
        nb_rollout_steps=100,
        max_ep_len=250,
        reward_scale=1.0,
        render=False,
        render_eval=False,
        noise_type='adaptive-param_0.2',
        normalize_returns=False,
        normalize_observations=True,
        critic_l2_reg=1e-2,
        actor_lr=1e-4,
        critic_lr=1e-3,
        popart=False,
        gamma=0.99,
        clip_norm=None,
        start_steps=10000,
        nb_train_steps=50,  # per epoch cycle and MPI worker,
        nb_eval_steps=100,
        nb_log_steps=None,
        nb_save_steps=None,
        batch_size=64,  # per MPI worker
        polyak=0.01,
        action_range=(-250.0, 250.0),
        observation_range=(-5.0, 5.0),
        target_noise=0.2,
        noise_clip=0.5,
        policy_delay=2,
        eval_env=None,
        load_path=None,
        save_dir=None,
        **network_kwargs):

    set_global_seeds(seed)

    if MPI is not None:
        rank = MPI.COMM_WORLD.Get_rank()
    else:
        rank = 0

    memory = Memory(limit=int(1e6))

    network_spec = [{
        'layer_type': 'dense',
        'units': int(256),
        'activation': 'relu',
        'nodes_in': ['main'],
        'nodes_out': ['main']
    }, {
        'layer_type': 'dense',
        'units': int(128),
        'activation': 'relu',
        'nodes_in': ['main'],
        'nodes_out': ['main']
    }, {
        'layer_type': 'dense',
        'units': int(1),
        'activation': 'tanh',
        'nodes_in': ['main'],
        'nodes_out': ['main']
    }]

    vnetwork_spec = [{
        'layer_type': 'concat',
        'nodes_in': ['action_movement', 'observation_self'],
        'nodes_out': ['main']
    }, {
        'layer_type': 'dense',
        'units': int(256),
        'activation': 'relu',
        'nodes_in': ['main'],
        'nodes_out': ['main']
    }, {
        'layer_type': 'dense',
        'units': int(128),
        'activation': 'relu',
        'nodes_in': ['main'],
        'nodes_out': ['main']
    }, {
        'layer_type': 'dense',
        'units': int(1),
        'activation': '',
        'nodes_in': ['main'],
        'nodes_out': ['main']
    }]

    network = Td3Policy(scope="td3",
                        ob_space=env.observation_space,
                        ac_space=env.action_space,
                        network_spec=network_spec,
                        v_network_spec=vnetwork_spec,
                        stochastic=False,
                        reuse=False,
                        build_act=True,
                        trainable_vars=None,
                        not_trainable_vars=None,
                        gaussian_fixed_var=False,
                        weight_decay=0.0,
                        ema_beta=0.99999,
                        normalize_observations=normalize_observations,
                        normalize_returns=normalize_returns,
                        observation_range=observation_range,
                        action_range=action_range,
                        target_noise=target_noise,
                        noise_clip=noise_clip)

    target_network = Td3Policy(scope="target",
                               ob_space=env.observation_space,
                               ac_space=env.action_space,
                               network_spec=network_spec,
                               v_network_spec=vnetwork_spec,
                               stochastic=False,
                               reuse=False,
                               build_act=True,
                               trainable_vars=None,
                               not_trainable_vars=None,
                               gaussian_fixed_var=False,
                               weight_decay=0.0,
                               ema_beta=0.99999,
                               normalize_observations=normalize_observations,
                               normalize_returns=normalize_returns,
                               observation_range=observation_range,
                               action_range=action_range,
                               target_noise=target_noise,
                               noise_clip=noise_clip,
                               isTarget=True)

    action_noise = None
    param_noise = None
    if noise_type is not None:
        for current_noise_type in noise_type.split(','):
            current_noise_type = current_noise_type.strip()
            if current_noise_type == 'none':
                pass
            elif 'adaptive-param' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                param_noise = AdaptiveParamNoiseSpec(
                    initial_stddev=float(stddev),
                    desired_action_stddev=float(stddev))
            elif 'normal' in current_noise_type:
                action_noise = dict()
                for k, v in env.action_space.spaces.items():
                    act_size = v.spaces[0].shape[-1]
                    _, stddev = current_noise_type.split('_')
                    action_noise[k] = NormalActionNoise(mu=np.zeros(act_size),
                                                        sigma=float(stddev) *
                                                        np.ones(act_size))
            elif 'ou' in current_noise_type:
                action_noise = dict()
                for k, v in env.action_space.spaces.items():
                    act_size = v.spaces[0].shape[-1]
                    _, stddev = current_noise_type.split('_')
                    action_noise[k] = OrnsteinUhlenbeckActionNoise(
                        mu=np.zeros(act_size),
                        sigma=float(stddev) * np.ones(act_size))
            else:
                raise RuntimeError(
                    'unknown noise type "{}"'.format(current_noise_type))

    max_action = action_range[1]
    logger.info(
        'scaling actions by {} before executing in env'.format(max_action))

    agent = TD3(env,
                network,
                target_network,
                memory,
                env.action_space,
                env.observation_space,
                steps_per_epoch=nb_rollout_steps,
                epochs=nb_epochs,
                gamma=gamma,
                polyak=polyak,
                actor_lr=actor_lr,
                critic_lr=critic_lr,
                batch_size=batch_size,
                start_steps=start_steps,
                action_noise=action_noise,
                target_noise=target_noise,
                noise_clip=noise_clip,
                policy_delay=policy_delay)

    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    sess = U.get_session()

    saver = functools.partial(save_variables, sess=sess)
    loader = functools.partial(load_variables, sess=sess)
    if load_path != None:
        loader(load_path)

    # Prepare everything.
    agent.initialize(sess)
    sess.graph.finalize()

    agent.reset()
    obs = env.reset()
    if eval_env is not None:
        eval_obs = eval_env.reset()
    nenvs = env.num_envs
    n_agents = obs['observation_self'].shape[0]

    episode_reward = np.zeros((nenvs, n_agents), dtype=np.float32)  #vector
    episode_step = np.zeros(nenvs, dtype=int)  # vector
    episodes = 0  #scalar
    t = 0  # scalar

    epoch = 0

    start_time = time.time()

    epoch_episode_rewards = []
    epoch_episode_steps = []
    epoch_actions = []
    epoch_qs = []
    epoch_episodes = 0

    for t in range(int(total_timesteps)):
        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards, 
        use the learned policy (with some noise, via act_noise). 
        """
        if t > start_steps:
            action, q, _, _ = agent.step(obs, apply_noise=True, compute_Q=True)
            nenvs_actions = []
            for i in range(nenvs):
                nenv_action = {
                    'action_movement':
                    action['action_movement'][i * n_agents:(i + 1) * n_agents]
                }
                nenvs_actions.append(nenv_action)
        else:
            action, q = env.action_space.sample(), None
            nenvs_actions = []
            for i in range(nenvs):
                nenv_action = {
                    'action_movement':
                    action['action_movement'][i * n_agents:(i + 1) *
                                              n_agents][0]
                }
                nenvs_actions.append(nenv_action)

        new_obs, r, done, info = env.step(nenvs_actions)

        episode_reward += r
        episode_step += 1

        for d in range(len(done)):
            done[d] = False if episode_step[d] == max_ep_len else done[d]

        epoch_actions.append(action)
        epoch_qs.append(q)
        agent.store_transition(
            obs, action, r, new_obs,
            done)  #the batched data will be unrolled in memory.py's append.

        obs = new_obs

        for d in range(len(done)):
            if done[d]:
                # Episode done.
                epoch_episode_rewards.append(episode_reward[d])
                episode_rewards_history.append(episode_reward[d])
                epoch_episode_steps.append(episode_step[d])
                episode_reward[d] = 0.
                episode_step[d] = 0
                epoch_episodes += 1
                episodes += 1
                if nenvs == 1:
                    agent.reset()

        episode_actor_losses = []
        episode_critic_losses = []
        episode_critic = []
        episode_critic_twin = []
        if any(done) or (episode_step[0] == max_ep_len):
            """
            Perform all TD3 updates at the end of the trajectory
            (in accordance with source code of TD3 published by
            original authors).
            """
            for j in range(episode_step[0]):
                critic_loss, critic, critic_twin, actor_loss = agent.train(
                    episode_step[0])

                episode_critic_losses.append(critic_loss)
                episode_critic.append(critic)
                episode_critic_twin.append(critic_twin)
                if actor_loss is not None:
                    episode_actor_losses.append(actor_loss)

            obs, r, done, episode_reward, episode_step = env.reset(
            ), 0, False, np.zeros((nenvs, n_agents),
                                  dtype=np.float32), np.zeros(nenvs, dtype=int)

        if nb_log_steps is not None and (t + 1) % nb_log_steps == 0:
            # Log stats.
            # XXX shouldn't call np.mean on variable length lists
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = stats.copy()
            combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
            combined_stats['rollout/return_std'] = np.std(
                epoch_episode_rewards)
            combined_stats['rollout/return_history'] = np.mean(
                episode_rewards_history)
            combined_stats['rollout/return_history_std'] = np.std(
                episode_rewards_history)
            combined_stats['rollout/episode_steps'] = np.mean(
                epoch_episode_steps)
            combined_stats['train/loss_actor'] = np.mean(episode_actor_losses)
            combined_stats['train/loss_critic'] = np.mean(
                episode_critic_losses)
            combined_stats['total/duration'] = duration
            combined_stats['total/steps_per_second'] = float(t) / float(
                duration)
            combined_stats['total/episodes'] = episodes
            combined_stats['rollout/episodes'] = epoch_episodes

            def as_scalar(x):
                if isinstance(x, np.ndarray):
                    assert x.size == 1
                    return x[0]
                elif np.isscalar(x):
                    return x
                else:
                    raise ValueError('expected scalar, got %s' % x)

            combined_stats_sums = np.array(
                [np.array(x).flatten()[0] for x in combined_stats.values()])
            # Total statistics.
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t
            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            if rank == 0:
                logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'),
                              'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'),
                              'wb') as f:
                        pickle.dump(eval_env.get_state(), f)

        if nb_save_steps != None and (t + 1) % nb_save_steps == 0:
            if save_dir == None:
                checkdir = osp.join(logger.get_dir(), 'checkpoints')
            else:
                checkdir = osp.join(save_dir, 'checkpoints')
            os.makedirs(checkdir, exist_ok=True)
            savepath = osp.join(checkdir, '%.5i' % t)
            print('Saving to', savepath)
            saver(savepath)

    return agent
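
The rollout loop above flattens the actions of all environments and agents into a single 'action_movement' array and then re-slices it per environment. A small numpy illustration of that indexing (the shapes are arbitrary assumptions):

import numpy as np

nenvs, n_agents, act_dim = 2, 3, 5  # illustrative shapes only
action = {'action_movement': np.arange(nenvs * n_agents * act_dim)
                               .reshape(nenvs * n_agents, act_dim)}
nenvs_actions = [{'action_movement':
                  action['action_movement'][i * n_agents:(i + 1) * n_agents]}
                 for i in range(nenvs)]
# nenvs_actions[0]['action_movement'] has shape (3, 5): the rows for env 0's agents;
# nenvs_actions[1]['action_movement'] holds the remaining three rows for env 1.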
Example #23
0
File: ddpg.py  Project: zhongjieGDUT/hcp
    def __init__(self, env, args):
        ob_space = env.observation_space
        goal_dim = env.goal_dim
        ob_dim = ob_space.shape[0]
        self.ob_dim = ob_dim
        self.ac_dim = ac_dim = 7
        self.goal_dim = goal_dim
        self.num_iters = args.num_iters
        self.random_prob = args.random_prob
        self.tau = args.tau
        self.reward_scale = args.reward_scale
        self.gamma = args.gamma

        self.log_interval = args.log_interval
        self.save_interval = args.save_interval
        self.rollout_steps = args.rollout_steps
        self.env = env
        self.batch_size = args.batch_size
        self.train_steps = args.train_steps
        self.closest_dist = np.inf
        self.warmup_iter = args.warmup_iter
        self.max_grad_norm = args.max_grad_norm
        self.use_her = args.her
        self.k_future = args.k_future
        self.model_dir = os.path.join(args.save_dir, 'model')
        self.pretrain_dir = args.pretrain_dir
        os.makedirs(self.model_dir, exist_ok=True)
        self.global_step = 0
        self.actor = Actor(ob_dim=ob_dim,
                           act_dim=ac_dim,
                           hid1_dim=args.hid1_dim,
                           hid2_dim=args.hid2_dim,
                           hid3_dim=args.hid3_dim,
                           init_method=args.init_method)
        self.critic = Critic(ob_dim=ob_dim,
                             act_dim=ac_dim,
                             hid1_dim=args.hid1_dim,
                             hid2_dim=args.hid2_dim,
                             hid3_dim=args.hid3_dim,
                             init_method=args.init_method)
        if args.resume or args.test or args.pretrain_dir is not None:
            self.load_model(args.resume_step, pretrain_dir=args.pretrain_dir)
        if not args.test:
            self.actor_target = Actor(ob_dim=ob_dim,
                                      act_dim=ac_dim,
                                      hid1_dim=args.hid1_dim,
                                      hid2_dim=args.hid2_dim,
                                      hid3_dim=args.hid3_dim,
                                      init_method=args.init_method)
            self.critic_target = Critic(ob_dim=ob_dim,
                                        act_dim=ac_dim,
                                        hid1_dim=args.hid1_dim,
                                        hid2_dim=args.hid2_dim,
                                        hid3_dim=args.hid3_dim,
                                        init_method=args.init_method)
            self.actor_optim = self.construct_optim(self.actor,
                                                    lr=args.actor_lr)
            cri_w_decay = args.critic_weight_decay
            self.critic_optim = self.construct_optim(self.critic,
                                                     lr=args.critic_lr,
                                                     weight_decay=cri_w_decay)
            self.hard_update(self.actor_target, self.actor)
            self.hard_update(self.critic_target, self.critic)

            self.actor_target.eval()
            self.critic_target.eval()
            if args.noise_type == 'ou_noise':
                mu = np.zeros(ac_dim)
                sigma = float(args.ou_noise_std) * np.ones(ac_dim)
                self.action_noise = OrnsteinUhlenbeckActionNoise(mu=mu,
                                                                 sigma=sigma)
            elif args.noise_type == 'uniform':
                low_limit = args.uniform_noise_low
                high_limit = args.uniform_noise_high
                dec_step = args.max_noise_dec_step
                self.action_noise = UniformNoise(low_limit=low_limit,
                                                 high_limit=high_limit,
                                                 dec_step=dec_step)

            elif args.noise_type == 'gaussian':
                mu = np.zeros(ac_dim)
                sigma = args.normal_noise_std * np.ones(ac_dim)
                self.action_noise = NormalActionNoise(mu=mu, sigma=sigma)

            self.memory = Memory(limit=int(args.memory_limit),
                                 action_shape=(int(ac_dim), ),
                                 observation_shape=(int(ob_dim), ))
            self.critic_loss = nn.MSELoss()
            self.ob_norm = args.ob_norm
            if self.ob_norm:
                self.obs_oms = OnlineMeanStd(shape=(1, ob_dim))
            else:
                self.obs_oms = None

        self.cuda()
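
UniformNoise above is project-specific rather than a Baselines class, and its sampling API is not shown here. A hedged sketch that is consistent with the constructor arguments used above, assuming dec_step gradually narrows the sampling range towards zero:

import numpy as np

class UniformNoise:
    """Assumed behaviour: uniform noise in [low, high] whose range decays by dec_step."""

    def __init__(self, low_limit, high_limit, dec_step):
        self.low_limit, self.high_limit, self.dec_step = low_limit, high_limit, dec_step
        self.reset()

    def sample(self, size=None):
        return np.random.uniform(self.low, self.high, size=size)

    def decay(self):
        # Shrink the range towards zero without letting the bounds cross it.
        self.low = min(self.low + self.dec_step, 0.0)
        self.high = max(self.high - self.dec_step, 0.0)

    def reset(self):
        self.low, self.high = self.low_limit, self.high_limit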