Example #1
    def __init__(self, gymEnvName, model_weights, k, intrinsic=False):

        self.env = gym.make(gymEnvName)

        g = tf.Graph()
        with g.as_default():
            model = AtariVisionModel(k, actiondim=(self.env.action_space.n, 1))
            model.sess.run(tf.initialize_all_variables())
            variables = TensorFlowVariables(model.loss, model.sess)

            variables.set_weights(model_weights)

        self.model = model
        self.trajectory_set = set()
        self.intrinsic = intrinsic

        self.real_action_space = self.env.action_space.n

        #print(self.real_action_space)

        self.action_space = spaces.Discrete(self.env.action_space.n + model.k)
        #print("####",self.env.action_space.n + model.k)
        self.obs = None
        self.done = False
        self.spec = self.env.spec
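
Most of the examples on this page follow the same pattern: build a model, create a TensorFlowVariables helper from a loss tensor and a session, and then move weights in and out as plain numpy dictionaries. Below is a minimal TF1-style sketch of that round trip with a toy loss; the toy graph is illustrative only, and the import path for TensorFlowVariables may differ between Ray versions.

import tensorflow as tf
from ray.experimental.tf_utils import TensorFlowVariables  # path may vary by Ray version

# Toy graph: a single weight matrix and a dummy loss.
x = tf.placeholder(tf.float32, [None, 4], name="x")
w = tf.get_variable("w", [4, 1])
loss = tf.reduce_mean(tf.square(tf.matmul(x, w)))

sess = tf.Session()
sess.run(tf.global_variables_initializer())

variables = TensorFlowVariables(loss, sess)
weights = variables.get_weights()   # dict mapping variable names to numpy arrays
variables.set_weights(weights)      # write (possibly updated) values back into the graph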
Example #2
File: policy.py Project: azahed98/ddo
 def initialize(self):
     with self.g.as_default():
         self.sess = tf.Session(graph=self.g,
                                config=tf.ConfigProto(
                                    intra_op_parallelism_threads=1,
                                    inter_op_parallelism_threads=2))
         self.variables = TensorFlowVariables(self.loss, self.sess)
         self.sess.run(tf.global_variables_initializer())
Example #3
    def __init__(self, computation_graph_args, sample_trajectory_args,
                 estimate_return_args):
        super().__init__(computation_graph_args, sample_trajectory_args,
                         estimate_return_args)
        # build computation graph
        self.build_computation_graph()

        # tensorflow: config, session, variable initialization
        self.init_tf_sess()

        self.variables = TensorFlowVariables(self.loss, self.sess)
Example #4
File: runA3CExp.py Project: azahed98/ddo
def runDDO(env_name="PongDeterministic-v3",
           num_options=2, 
           ddo_learning_rate=1e-3,
           steps_per_discovery=10000,
           rounds=1,
           num_demonstrations_per=100,
           ddo_max_iters=100,
           ddo_vq_iters=100,
           num_workers=1,
           JSD_weight=0,
           entropy_weight=0):

    g = tf.Graph()

    #initialize graph
    with g.as_default():
        a = AtariVisionModel(num_options, actiondim=(gym.make(env_name).action_space.n,1))
        variables = TensorFlowVariables(a.loss, a.sess)

        with tf.variable_scope("optimizer2"):
            opt = tf.train.AdamOptimizer(learning_rate=ddo_learning_rate)
            a.sess.run(tf.initialize_all_variables())

        weights = variables.get_weights()

    #run once to initialize
    env, policy = train(num_workers, env_name=env_name, model=weights, k=num_options, max_steps=1, intrinsic=False)
    trajs,_ = collect_demonstrations(env, policy, N=num_demonstrations_per)

    for i in range(rounds):

        with g.as_default():

            with tf.variable_scope("optimizer2"):

                vq = ddo_vq_iters
                if i > 0:
                    vq = 0

                a.train(opt, trajs, ddo_max_iters, vq)

            weights = variables.get_weights()


        env, policy = train(num_workers, policy=policy, env_name=env_name, model=weights, k=num_options, max_steps=steps_per_discovery)
        trajs, reward = collect_demonstrations(env, policy, N=num_demonstrations_per)

    return {'reward': reward, 'env': env_name, 'num_options': num_options, 'ddo': True, 'intrinsic': False}
Example #5
def gridWorldInit():

    t = tf.Graph()

    with t.as_default():
        m = GridWorldModel(2, statedim=(2,1))
        
        m.sess.run(tf.initialize_all_variables())

        variables = TensorFlowVariables(m.loss, m.sess)

    return m, m.opt, t, variables
Example #6
    def build_model(self, args, **kwargs):
        self.ppo_method = {
            'kl_pen': dict(name='kl_pen', kl_target=0.01, lam=0.5),  # KL penalty
            'clip': dict(name='clip', epsilon=0.2),  # Clipped surrogate objective, find this is better
        }.get(args.ppo_name)

        config = tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=True
        )

        config.gpu_options.allow_growth = True

        self.sess = tf.Session(config=config)
        self.tfs = tf.placeholder(tf.float32, [None, args.state_dim], 'state')

        # critic
        with tf.variable_scope('critic'):
            l1 = tf.layers.dense(self.tfs, 100, tf.nn.relu)
        self.v = tf.layers.dense(l1, 1)
        self.tfdc_r = tf.placeholder(tf.float32, [None, 1], 'discounted_r')
        self.advantage = self.tfdc_r - self.v
        self.closs = tf.reduce_mean(tf.square(self.advantage))
        self.critic_opt = tf.train.AdamOptimizer(args.critic_lr)
        self.c_grads = self.critic_opt.compute_gradients(self.closs)
        self.ctrain_op = self.critic_opt.apply_gradients(self.c_grads)
        self.c_vars = TensorFlowVariables(self.closs, self.sess)

        # actor
        pi, pi_params = self._build_anet('pi', trainable=True)
        oldpi, oldpi_params = self._build_anet('oldpi', trainable=False)
        with tf.variable_scope('sample_action'):
            self.sample_op = tf.squeeze(pi.sample(1), axis=0)  # choosing action
        with tf.variable_scope('update_oldpi'):
            self.update_oldpi_op = [oldp.assign(p) for p, oldp in zip(pi_params, oldpi_params)]

        self.tfa = tf.placeholder(tf.float32, [None, args.action_dim], 'action')
        self.tfadv = tf.placeholder(tf.float32, [None, 1], 'advantage')
        with tf.variable_scope('loss'):
            with tf.variable_scope('surrogate'):
                # ratio = tf.exp(pi.log_prob(self.tfa) - oldpi.log_prob(self.tfa))
                ratio = pi.prob(self.tfa) / (oldpi.prob(self.tfa) + 1e-5)
                surr = ratio * self.tfadv
            if args.ppo_name == 'kl_pen':
                self.tflam = tf.placeholder(tf.float32, None, 'lambda')
                kl = tf.distributions.kl_divergence(oldpi, pi)
                self.kl_mean = tf.reduce_mean(kl)
                self.aloss = -(tf.reduce_mean(surr - self.tflam * kl))
            else:  # clipping method, find this is better
                self.aloss = -tf.reduce_mean(tf.minimum(
                    surr,
                    tf.clip_by_value(ratio, 1. - self.ppo_method['epsilon'],
                                     1. + self.ppo_method['epsilon']) * self.tfadv))

        with tf.variable_scope('actor_train'):
            self.actor_opt = tf.train.AdamOptimizer(args.actor_lr)
            self.a_grads = self.actor_opt.compute_gradients(self.aloss)
            self.a_grads = [(t, v) for t, v in self.a_grads if t is not None]
            self.atrain_op = self.actor_opt.apply_gradients(self.a_grads)
            self.a_vars = TensorFlowVariables(self.aloss, self.sess)

        if args.log_dir:
            tf.summary.FileWriter(args.log_dir, self.sess.graph)
        self.sess.run(tf.global_variables_initializer())
Example #7
class PPO(object):
    """
    This PPO version is adapted from Mofan Zhou, University of Technology Sydney.
    """

    def __init__(self, args):
        self.args = args
        self.build_model(args)

    @staticmethod
    def add_args(parser):
        parser.add_argument('--state_dim', type=int, default=3)
        parser.add_argument('--action_dim', type=int, default=1)
        # parser.add_argument('--update_actor_steps', type=int, default=10)  # unused in ray
        # parser.add_argument('--update_critic_steps', type=int, default=10)  # unused in ray
        parser.add_argument('--actor_lr', type=float, default=1e-4)
        parser.add_argument('--critic_lr', type=float, default=2e-4)
        parser.add_argument('--clip_norm', type=float, default=5)
        parser.add_argument('--ppo_name', choices=['kl_pen', 'clip'], default='clip')
        parser.add_argument("--log_dir", type=str, default=None)

    def build_model(self, args, **kwargs):
        self.ppo_method = {
            'kl_pen': dict(name='kl_pen', kl_target=0.01, lam=0.5),  # KL penalty
            'clip': dict(name='clip', epsilon=0.2),  # Clipped surrogate objective, find this is better
        }.get(args.ppo_name)

        config = tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=True
        )

        config.gpu_options.allow_growth = True

        self.sess = tf.Session(config=config)
        self.tfs = tf.placeholder(tf.float32, [None, args.state_dim], 'state')

        # critic
        with tf.variable_scope('critic'):
            l1 = tf.layers.dense(self.tfs, 100, tf.nn.relu)
        self.v = tf.layers.dense(l1, 1)
        self.tfdc_r = tf.placeholder(tf.float32, [None, 1], 'discounted_r')
        self.advantage = self.tfdc_r - self.v
        self.closs = tf.reduce_mean(tf.square(self.advantage))
        self.critic_opt = tf.train.AdamOptimizer(args.critic_lr)
        self.c_grads = self.critic_opt.compute_gradients(self.closs)
        self.ctrain_op = self.critic_opt.apply_gradients(self.c_grads)
        self.c_vars = TensorFlowVariables(self.closs, self.sess)

        # actor
        pi, pi_params = self._build_anet('pi', trainable=True)
        oldpi, oldpi_params = self._build_anet('oldpi', trainable=False)
        with tf.variable_scope('sample_action'):
            self.sample_op = tf.squeeze(pi.sample(1), axis=0)  # choosing action
        with tf.variable_scope('update_oldpi'):
            self.update_oldpi_op = [oldp.assign(p) for p, oldp in zip(pi_params, oldpi_params)]

        self.tfa = tf.placeholder(tf.float32, [None, args.action_dim], 'action')
        self.tfadv = tf.placeholder(tf.float32, [None, 1], 'advantage')
        with tf.variable_scope('loss'):
            with tf.variable_scope('surrogate'):
                # ratio = tf.exp(pi.log_prob(self.tfa) - oldpi.log_prob(self.tfa))
                ratio = pi.prob(self.tfa) / (oldpi.prob(self.tfa) + 1e-5)
                surr = ratio * self.tfadv
            if args.ppo_name == 'kl_pen':
                self.tflam = tf.placeholder(tf.float32, None, 'lambda')
                kl = tf.distributions.kl_divergence(oldpi, pi)
                self.kl_mean = tf.reduce_mean(kl)
                self.aloss = -(tf.reduce_mean(surr - self.tflam * kl))
            else:  # clipping method, find this is better
                self.aloss = -tf.reduce_mean(tf.minimum(
                    surr,
                    tf.clip_by_value(ratio, 1. - self.ppo_method['epsilon'],
                                     1. + self.ppo_method['epsilon']) * self.tfadv))

        with tf.variable_scope('actor_train'):
            self.actor_opt = tf.train.AdamOptimizer(args.actor_lr)
            self.a_grads = self.actor_opt.compute_gradients(self.aloss)
            self.a_grads = [(t, v) for t, v in self.a_grads if t is not None]
            self.atrain_op = self.actor_opt.apply_gradients(self.a_grads)
            self.a_vars = TensorFlowVariables(self.aloss, self.sess)

        if args.log_dir:
            tf.summary.FileWriter(args.log_dir, self.sess.graph)
        self.sess.run(tf.global_variables_initializer())

    @ray.method(num_return_vals=2)
    def step(self, s, a, r, a_weights, c_weights):
        self.set_weights(a_weights, c_weights)

        self.sess.run(self.update_oldpi_op)
        adv = self.sess.run(self.advantage, {self.tfs: s, self.tfdc_r: r})

        # update actor
        a_grads = self.sess.run([grad[0] for grad in self.a_grads], {self.tfs: s, self.tfa: a, self.tfadv: adv})
        # update critic
        c_grads = self.sess.run([grad[0] for grad in self.c_grads], {self.tfs: s, self.tfdc_r: r})
        return a_grads, c_grads

    def _build_anet(self, name, trainable):
        with tf.variable_scope(name):
            l1 = tf.layers.dense(self.tfs, 100, tf.nn.relu, trainable=trainable)
            mu = 2 * tf.layers.dense(l1, self.args.action_dim, tf.nn.tanh, trainable=trainable)
            sigma = tf.layers.dense(l1, self.args.action_dim, tf.nn.softplus, trainable=trainable)
            norm_dist = tf.distributions.Normal(loc=mu, scale=sigma)
        params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=name)
        return norm_dist, params

    def choose_actions(self, s: np.ndarray) -> List[np.ndarray]:
        if s.ndim == 1:
            s = s[np.newaxis, :]
        bat_actions = self.sess.run(self.sample_op, {self.tfs: s})
        return [np.clip(a, -2, 2) for a in bat_actions]

    def get_v(self, s):
        if s.ndim < 2: s = s[np.newaxis, :]
        return self.sess.run(self.v, {self.tfs: s})[0, 0]

    @ray.method(num_return_vals=2)
    def get_weights(self):
        return self.a_vars.get_weights(), self.c_vars.get_weights()

    def set_weights(self, a_weights, c_weights):
        self.a_vars.set_weights(a_weights)
        self.c_vars.set_weights(c_weights)
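
The PPO class above only computes gradients in step() and exchanges parameters through get_weights()/set_weights(); the driver that wires it together is not shown. The following is a hypothetical driver sketch, not the project's actual training script: the worker count, the toy batch, and the way arguments are parsed are all assumptions.

import argparse
import numpy as np
import ray

ray.init()

# Parse the defaults defined in PPO.add_args (state_dim=3, action_dim=1).
parser = argparse.ArgumentParser()
PPO.add_args(parser)
args = parser.parse_args([])

# Run the agent as Ray actors; each actor holds its own TF graph and session.
RemotePPO = ray.remote(PPO)
learner = RemotePPO.remote(args)
worker = RemotePPO.remote(args)

# Toy batch with the default dimensions.
s = np.zeros((8, args.state_dim), np.float32)
a = np.zeros((8, args.action_dim), np.float32)
r = np.zeros((8, 1), np.float32)

# get_weights and step are declared with num_return_vals=2, so each call
# returns two object refs that ray.get resolves to (actor, critic) pairs.
a_w, c_w = ray.get(learner.get_weights.remote())
a_grads, c_grads = ray.get(worker.step.remote(s, a, r, a_w, c_w))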
Example #8
m = GridWorldModel(2, statedim=(2,1))
m.sess.run(tf.initialize_all_variables())

with tf.variable_scope("optimizer"):

    opt = m.opt

    #opt = tf.train.AdamOptimizer(learning_rate=0.001)
    
    m.train(opt, dataset, 1, 0)


    driver_gradients = opt.compute_gradients(m.loss)

    variables = TensorFlowVariables(m.loss, m.sess)

    vlist = [g for (g,v) in driver_gradients]
    
    update = opt.apply_gradients(driver_gradients)


    variables = TensorFlowVariables(m.loss, m.sess)

    for iter in range(0,1000):

        print("----Iteration---", iter)

        weights = ray.put(variables.get_weights())
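
The snippet stops right after putting the weights into Ray's object store. A hypothetical worker-side counterpart is sketched below; the remote function and its body are assumptions about how such a loop is typically completed, not code from the project.

# Ray resolves the ObjectRef argument to the underlying weights dict before the task runs.
@ray.remote
def rollout(weights):
    worker_model = GridWorldModel(2, statedim=(2, 1))
    worker_model.sess.run(tf.global_variables_initializer())
    worker_variables = TensorFlowVariables(worker_model.loss, worker_model.sess)
    worker_variables.set_weights(weights)   # load the driver's current parameters
    return worker_variables.get_weights()   # stand-in for real rollout results

result_id = rollout.remote(weights)         # 'weights' is the ObjectRef from ray.put above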

Example #9
class Parallel_Actor(Agent):
    def __init__(self, computation_graph_args, sample_trajectory_args,
                 estimate_return_args):
        super().__init__(computation_graph_args, sample_trajectory_args,
                         estimate_return_args)
        # build computation graph
        self.build_computation_graph()

        # tensorflow: config, session, variable initialization
        self.init_tf_sess()

        self.variables = TensorFlowVariables(self.loss, self.sess)

    def set_weights(self, weights):
        self.variables.set_weights(weights)

    def get_weights(self):
        weights = self.variables.get_weights()

        return weights

    def test_method(self, i):
        return i * 2

    def sample_trajectories_fake(self, itr, env, counter_actor):
        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            animate_this_episode = (len(paths) == 0 and (itr % 10 == 0)
                                    and self.animate)
            if hasattr(self, 'running_only') and self.animate:
                animate_this_episode = True


#            path = self.sample_trajectory(env, animate_this_episode)
#            paths.append(path)
#            timesteps_this_batch += pathlength(path)
            path_len = self.fake_sample_trajectory()
            counter_actor.increment_counter.remote(path_len)
            current_count = ray.get(counter_actor.return_count.remote())
            timesteps_this_batch += path_len
            time.sleep(0.3)
            #             if timesteps_this_batch > self.min_timesteps_per_batch:
            if current_count > self.min_timesteps_per_batch:
                print('the final pathlength from this worker is ' +
                      str(timesteps_this_batch))
                paths = 'filler'
                break
        return paths, timesteps_this_batch  #

    def sample_trajectories(self, itr, env, counter_actor):
        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            animate_this_episode = (len(paths) == 0 and (itr % 10 == 0)
                                    and self.animate)
            if hasattr(self, 'running_only') and self.animate:
                animate_this_episode = True

            path = self.sample_trajectory(env, animate_this_episode)
            path_len = pathlength(path)
            paths.append(path)
            timesteps_this_batch += path_len

            counter_actor.increment_counter.remote(path_len)
            current_count = ray.get(counter_actor.return_count.remote())

            if current_count > self.min_timesteps_per_batch:
                break
        return paths, timesteps_this_batch

    def fake_sample_trajectory(self):
        #         return randint(100)
        return 100
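
Both sampling loops above rely on a shared counter_actor whose implementation is not part of this example. A minimal sketch of such an actor is shown below; the method names match the calls above, but the class itself is an assumption.

import ray

@ray.remote
class CounterActor:
    # Shared timestep counter: every worker adds its path length, and all
    # workers stop sampling once the global total exceeds min_timesteps_per_batch.
    def __init__(self):
        self.count = 0

    def increment_counter(self, path_len):
        self.count += path_len

    def return_count(self):
        return self.count

counter_actor = CounterActor.remote()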
Example #10
    def __init__(self,
                 name,
                 args,
                 env_args,
                 buffer_args,
                 sess_config=None,
                 save=False,
                 log=False,
                 log_tensorboard=False,
                 log_params=False,
                 log_stats=False,
                 device=None):
        # hyperparameters
        self.algo = args['algorithm']
        self.gamma = args.setdefault('gamma', .99)
        self.update_step = 0
        self.max_action_repetitions = args.setdefault('max_action_repetitions',
                                                      1)

        # environment info
        env_args['gamma'] = self.gamma
        env_args['seed'] += 100
        self.eval_env = create_gym_env(env_args)
        env_args['seed'] -= 100
        env_args['log_video'] = False
        self.train_env = create_gym_env(env_args)
        self.state_shape = self.train_env.state_shape
        self.action_dim = self.train_env.action_dim

        # replay buffer hyperparameters
        buffer_args['n_steps'] = args['n_steps']
        buffer_args['gamma'] = args['gamma']
        buffer_args['batch_size'] = args['batch_size']
        self.buffer_type = buffer_args['type']
        if self.buffer_type == 'proportional':
            self.buffer = ProportionalPrioritizedReplay(
                buffer_args, self.state_shape, self.action_dim)
        elif self.buffer_type == 'uniform':
            self.buffer = UniformReplay(buffer_args, self.state_shape,
                                        self.action_dim)
        elif self.buffer_type == 'local':
            self.buffer = LocalBuffer(buffer_args, self.state_shape,
                                      self.action_dim)
        else:
            raise NotImplementedError('No buffer is constructed')

        # arguments for prioritized replay
        self.prio_alpha = float(buffer_args['alpha'])
        self.prio_epsilon = float(buffer_args['epsilon'])

        super().__init__(name,
                         args,
                         sess_config=sess_config,
                         save=save,
                         log=log,
                         log_tensorboard=log_tensorboard,
                         log_params=log_params,
                         log_stats=log_stats,
                         device=device)

        self._initialize_target_net()

        with self.graph.as_default():
            self.variables = TensorFlowVariables(self.loss, self.sess)
Example #11
    def __init__(self,
                 name,
                 args,
                 env_args,
                 sess_config=None,
                 save=False,
                 log=False,
                 log_tensorboard=False,
                 log_params=False,
                 log_stats=False,
                 device=None,
                 reuse=None,
                 graph=None):
        # hyperparameters
        self.gamma = args['gamma']
        self.gae_discount = self.gamma * args['lam']
        self.n_minibatches = args['n_minibatches']

        self.use_lstm = args['ac']['use_lstm']
        self.entropy_coef = args['ac']['entropy_coef']
        self.n_value_updates = args['ac']['n_value_updates']
        self.minibatch_idx = 0

        # environment info
        self.env_vec = create_gym_env(env_args)
        self.seq_len = self.env_vec.max_episode_steps

        self.buffer = PPOBuffer(env_args['n_workers'] * env_args['n_envs'],
                                self.seq_len, self.n_minibatches,
                                self.env_vec.state_shape, np.float32,
                                self.env_vec.action_shape, np.float32)

        super().__init__(name,
                         args,
                         sess_config=sess_config,
                         save=save,
                         log=log,
                         log_tensorboard=log_tensorboard,
                         log_params=log_params,
                         log_stats=log_stats,
                         device=device,
                         reuse=reuse,
                         graph=graph)

        self.schedule_lr = 'schedule_lr' in args and args['schedule_lr']
        if self.schedule_lr:
            self.actor_lr_scheduler = PiecewiseSchedule([(0, 1e-4),
                                                         (400000, 1e-4),
                                                         (600000, 5e-5)],
                                                        outside_value=5e-5)
            self.critic_lr_scheduler = PiecewiseSchedule([(0, 3e-4),
                                                          (400000, 3e-4),
                                                          (600000, 5e-5)],
                                                         outside_value=5e-5)

        if self.use_lstm:
            # don't distinguish lstm at training from that at running
            # since training is done after running
            self.last_lstm_state = None

        with self.graph.as_default():
            self.variables = TensorFlowVariables(
                [self.ac.policy_loss, self.ac.V_loss], self.sess)
Example #12
File: policy.py Project: azahed98/ddo
class Policy(object):
    """Policy base class"""
    def __init__(self, ob_space, ac_space, task, name="local"):
        self.local_steps = 0
        worker_device = "/job:localhost/replica:0/task:0/cpu:0"
        self.g = tf.Graph()
        with self.g.as_default(), tf.device(worker_device):
            with tf.variable_scope(name):
                self.setup_graph(ob_space, ac_space)
                assert all([
                    hasattr(self, attr)
                    for attr in ["vf", "logits", "x", "var_list"]
                ])
            print("Setting up loss")
            self.setup_loss(ac_space)
            self.initialize()

    def setup_graph(self):
        raise NotImplementedError

    def setup_loss(self, num_actions, summarize=True):
        self.ac = tf.placeholder(tf.float32, [None, num_actions], name="ac")
        self.adv = tf.placeholder(tf.float32, [None], name="adv")
        self.r = tf.placeholder(tf.float32, [None], name="r")

        log_prob_tf = tf.nn.log_softmax(self.logits)
        prob_tf = tf.nn.softmax(self.logits)

        # the "policy gradients" loss:  its derivative is precisely the policy gradient
        # notice that self.ac is a placeholder that is provided externally.
        # adv will contain the advantages, as calculated in process_rollout
        pi_loss = -tf.reduce_sum(
            tf.reduce_sum(log_prob_tf * self.ac, [1]) * self.adv)

        # loss of value function
        vf_loss = 0.5 * tf.reduce_sum(tf.square(self.vf - self.r))
        vf_loss = tf.Print(vf_loss, [vf_loss], "Value Fn Loss")
        entropy = -tf.reduce_sum(prob_tf * log_prob_tf)

        bs = tf.to_float(tf.shape(self.x)[0])
        self.loss = pi_loss + 0.5 * vf_loss - entropy * 0.01

        grads = tf.gradients(self.loss, self.var_list)
        self.grads, _ = tf.clip_by_global_norm(grads, 40.0)

        grads_and_vars = list(zip(self.grads, self.var_list))
        opt = tf.train.AdamOptimizer(1e-4)
        self._apply_gradients = opt.apply_gradients(grads_and_vars)

        if summarize:
            tf.summary.scalar("model/policy_loss", pi_loss / bs)
            tf.summary.scalar("model/value_loss", vf_loss / bs)
            tf.summary.scalar("model/entropy", entropy / bs)
            tf.summary.image("model/state", self.x)
            self.summary_op = tf.summary.merge_all()

    def initialize(self):
        with self.g.as_default():
            self.sess = tf.Session(graph=self.g,
                                   config=tf.ConfigProto(
                                       intra_op_parallelism_threads=1,
                                       inter_op_parallelism_threads=2))
            self.variables = TensorFlowVariables(self.loss, self.sess)
            self.sess.run(tf.global_variables_initializer())

    def model_update(self, grads):
        feed_dict = {self.grads[i]: grads[i] for i in range(len(grads))}
        self.sess.run(self._apply_gradients, feed_dict=feed_dict)

    def get_weights(self):
        with self.g.as_default():
            weights = self.variables.get_weights()
        return weights

    def set_weights(self, weights):
        with self.g.as_default():
            self.variables.set_weights(weights)

    def get_gradients(self, batch):
        raise NotImplementedError

    def get_vf_loss(self):
        raise NotImplementedError

    def act(self, ob):
        raise NotImplementedError

    def value(self, ob):
        raise NotImplementedError
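
Policy itself leaves gradient computation to subclasses, but the pieces above already define a complete parameter exchange: get_weights()/set_weights() move parameters between graphs, and model_update() applies externally computed numpy gradients by feeding them into the clipped-gradient tensors. A hypothetical sketch of how a driver and a worker policy could be wired together with these methods (the function and its arguments are illustrative assumptions):

def sync_and_apply(driver_policy, worker_policy, batch):
    # Worker pulls the latest parameters from the driver's graph.
    worker_policy.set_weights(driver_policy.get_weights())
    # Worker computes numpy gradients for its batch (implemented by a Policy subclass).
    grads = worker_policy.get_gradients(batch)
    # Driver applies them by feeding the gradient tensors and running the Adam update.
    driver_policy.model_update(grads)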