Example #1
    def __init__(
        self, ob_space, ac_space,
        horizon=2048,
        gamma=0.99, lam=0.95,
        train_epochs=10, batch_size=64,
        buffer_length=10,
        policy=None
        ):
        if policy is None:
            print('no policy designated, using default Policy')
            policy = Policy
        self.current_policy = policy(ob_space, ac_space)
        self.old_policy = policy(ob_space, ac_space)
        self.current_policy.actor.summary()
        self.current_policy.critic.summary()

        self.gamma, self.lam, self.horizon = gamma, lam, horizon
        self.train_epochs, self.batch_size = train_epochs, batch_size
        self.traj_buffer = traj_buffer(buffer_length)

        self.act, self.predict_value, self.train_for_one_step, self.assign_old_eq_new = self.build_functions()

        low, high = ac_space.low, ac_space.high
        self.action_bias = (high + low)/2.
        self.action_multiplier = high - self.action_bias

        # limit action into the range specified by the environment.
        # note: the two variants below shared one name, so only the second
        # (uniform-[0,1]) rescaling was actually used; the tanh variant is
        # kept here as a commented-out alternative.
        # def action_limiter(action): # assume input mean 0 std 1
        #     return np.tanh(action) * self.action_multiplier + self.action_bias
        def action_limiter(action): # assume input uniform [0,1]
            return (action * 2 - 1) * self.action_multiplier + self.action_bias
        self.action_limiter = action_limiter

        # logging of episodic reward.
        from plotter import interprocess_plotter as plotter
        self.plotter = plotter(2)

        # logging of actions. comment out if you don't have opencv
        if not hasattr(self,'wavegraph'):
            from winfrey import wavegraph
            # num_waves = self.outputdims*2+1
            num_waves = self.current_policy.ac_dims*2+1
            def rn():
                r = np.random.uniform()
                return 0.3+r*0.4
            colors = []
            for i in range(num_waves-1):
                color = [rn(),rn(),rn()]
                colors.append(color)
            colors.append([0.2,0.5,0.9])
            self.wavegraph = wavegraph(num_waves,'ac_mean/ac_sto/vf',np.array(colors))

            def loggraph(waves):
                wg = self.wavegraph
                wg.one(waves.reshape((-1,)))

            self.loggraph = loggraph
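
The rescaling done by action_limiter maps the policy's raw output into the environment's action range via action_bias and action_multiplier. A minimal standalone sketch of that arithmetic, assuming a hypothetical one-dimensional Box with low = -2 and high = 8 (the bounds are illustrative, not from the original code):

import numpy as np

low, high = np.array([-2.0]), np.array([8.0])   # hypothetical bounds
action_bias = (high + low) / 2.                 # -> 3.0, midpoint of the range
action_multiplier = high - action_bias          # -> 5.0, half-width of the range

def action_limiter(action):  # assume input uniform in [0, 1], as in the example above
    return (action * 2 - 1) * action_multiplier + action_bias

print(action_limiter(np.array([0.0])))   # [-2.]  lower bound
print(action_limiter(np.array([0.5])))   # [ 3.]  midpoint
print(action_limiter(np.array([1.0])))   # [ 8.]  upper bound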
Example #2
    def loggraph(self,waves):
        if not hasattr(self,'wavegraph'):
            def rn():
                r = np.random.uniform()
                return 0.2+r*0.4
            colors = []
            for i in range(len(waves)-1):
                color = [rn(),rn(),rn()]
                colors.append(color)
            colors.append([0.2,0.5,0.9])
            self.wavegraph = wavegraph(len(waves),'actions/noises/Q',np.array(colors))

        wg = self.wavegraph
        wg.one(waves.reshape((-1,)))
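
For context, a hedged sketch of how the waves vector passed to loggraph might be assembled, assuming an agent with two continuous actions; the names actions, noises and q_estimate are illustrative and not part of the original code:

import numpy as np

actions = np.array([0.3, -0.7])     # deterministic actor output
noises = np.array([0.05, -0.02])    # exploration noise added this step
q_estimate = np.array([1.8])        # critic's value for the chosen action

# layout matches the label 'actions/noises/Q': length = outputdims*2 + 1
waves = np.concatenate([actions, noises, q_estimate])
# agent.loggraph(waves)  # 'agent' would be an instance of the class above;
#                        # the first call lazily builds the wavegraph window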
Example #3
    def __init__(
        self,
        observation_space_dims,
        action_space,
        stack_factor=1,
        discount_factor=.99,  # gamma
        # train_skip_every=1,
        train_multiplier=1,
    ):
        self.rpm = rpm(1000000)  # 1M history
        self.plotter = plotter(num_lines=3)
        self.render = True
        self.training = True
        self.noise_source = one_fsq_noise()
        self.train_counter = 0
        # self.train_skip_every = train_skip_every
        self.train_multiplier = train_multiplier
        self.observation_stack_factor = stack_factor

        self.inputdims = observation_space_dims * self.observation_stack_factor
        # assume observation_space is continuous

        self.is_continuous = isinstance(action_space, Box)

        if self.is_continuous:  # if action space is continuous

            low = action_space.low
            high = action_space.high

            num_of_actions = action_space.shape[0]

            self.action_bias = high / 2. + low / 2.
            self.action_multiplier = high - self.action_bias

            # e.g. low,high = 2,7 -> bias = 4.5, mult = 2.5;
            # then [-1,1] * 2.5 + bias 4.5 -> [2,7]

            def clamper(actions):
                return np.clip(actions,
                               a_max=action_space.high,
                               a_min=action_space.low)

            self.clamper = clamper
        else:
            num_of_actions = action_space.n

            self.action_bias = .5
            self.action_multiplier = .5  # map (-1,1) into (0,1)

            def clamper(actions):
                return np.clip(actions, a_max=1., a_min=0.)

            self.clamper = clamper

        self.outputdims = num_of_actions
        self.discount_factor = discount_factor
        ids, ods = self.inputdims, self.outputdims
        print('inputdims:{}, outputdims:{}'.format(ids, ods))

        self.actor = self.create_actor_network(ids, ods)
        self.critic = self.create_critic_network(ids, ods)
        self.actor_target = self.create_actor_network(ids, ods)
        self.critic_target = self.create_critic_network(ids, ods)

        # print(self.actor.get_weights())
        # print(self.critic.get_weights())

        self.feed, self.joint_inference, sync_target = self.train_step_gen()

        sess = ct.get_session()
        sess.run(tf.global_variables_initializer())

        sync_target()

        import threading as th
        self.lock = th.Lock()

        if not hasattr(self, 'wavegraph'):
            num_waves = self.outputdims * 2 + 1

            def rn():
                r = np.random.uniform()
                return 0.2 + r * 0.4

            colors = []
            for i in range(num_waves - 1):
                color = [rn(), rn(), rn()]
                colors.append(color)
            colors.append([0.2, 0.5, 0.9])
            self.wavegraph = wavegraph(num_waves, 'actions/noises/Q',
                                       np.array(colors))
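
The bias/multiplier comment in the continuous branch can be checked numerically. A minimal sketch assuming a one-dimensional Box with low = 2 and high = 7, matching the figures used in that comment:

import numpy as np

low, high = np.array([2.0]), np.array([7.0])

action_bias = high / 2. + low / 2.       # (7 + 2) / 2 = 4.5
action_multiplier = high - action_bias   # 7 - 4.5 = 2.5

def clamper(actions):
    return np.clip(actions, a_min=low, a_max=high)

# an actor output in [-1, 1] lands exactly in [2, 7]:
for raw in (-1.0, 0.0, 1.0):
    env_action = raw * action_multiplier + action_bias
    print(raw, '->', clamper(env_action))   # -1 -> [2.], 0 -> [4.5], 1 -> [7.]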