def __init__(
    self,
    ob_space,
    ac_space,
    horizon=2048,
    gamma=0.99,
    lam=0.95,
    train_epochs=10,
    batch_size=64,
    buffer_length=10,
    policy=None,
):
    """Set up a PPO-style agent: policy networks, trajectory buffer, logging.

    Args:
        ob_space: observation space of the environment.
        ac_space: action space; must expose `low` and `high` bounds
            (a continuous Box-like space).
        horizon: environment steps collected per iteration.
        gamma: discount factor.
        lam: GAE lambda.
        train_epochs: optimization epochs per collected batch.
        batch_size: minibatch size used during training.
        buffer_length: capacity of the trajectory buffer.
        policy: policy class to instantiate; defaults to `Policy`.
    """
    if policy is None:
        print('no policy designated, use default Policy')
        policy = Policy

    # Two copies of the policy: `old_policy` is a snapshot the training
    # step compares against `current_policy` (see assign_old_eq_new).
    self.current_policy = policy(ob_space, ac_space)
    self.old_policy = policy(ob_space, ac_space)

    self.current_policy.actor.summary()
    self.current_policy.critic.summary()

    self.gamma, self.lam, self.horizon = gamma, lam, horizon
    self.train_epochs, self.batch_size = train_epochs, batch_size
    self.traj_buffer = traj_buffer(buffer_length)

    (self.act,
     self.predict_value,
     self.train_for_one_step,
     self.assign_old_eq_new) = self.build_functions()

    # Affine map from the network's nominal output range into the
    # environment's action range.
    low, high = ac_space.low, ac_space.high
    self.action_bias = (high + low) / 2.
    self.action_multiplier = high - self.action_bias

    # Limit action into the range specified by the environment.
    # NOTE(review): a previously defined tanh-based limiter (assuming
    # mean-0/std-1 input) was dead code — shadowed by this definition —
    # and has been removed. This one assumes the raw action is
    # uniform in [0, 1].
    def action_limiter(action):
        return (action * 2 - 1) * self.action_multiplier + self.action_bias
    self.action_limiter = action_limiter

    # Logging of episodic reward.
    from plotter import interprocess_plotter as plotter
    self.plotter = plotter(2)

    # Logging of actions. Comment out if you don't have opencv.
    if not hasattr(self, 'wavegraph'):
        from winfrey import wavegraph

        # One trace per action mean, one per stochastic action,
        # plus one for the value estimate.
        num_waves = self.current_policy.ac_dims * 2 + 1

        def rn():
            r = np.random.uniform()
            return 0.3 + r * 0.4

        colors = []
        for i in range(num_waves - 1):
            color = [rn(), rn(), rn()]
            colors.append(color)
        colors.append([0.2, 0.5, 0.9])  # distinct color for the value trace

        self.wavegraph = wavegraph(num_waves, 'ac_mean/ac_sto/vf', np.array(colors))

        def loggraph(waves):
            wg = self.wavegraph
            wg.one(waves.reshape((-1,)))

        self.loggraph = loggraph
def loggraph(self, waves):
    """Plot one frame of per-wave traces on the shared wavegraph.

    Lazily creates the wavegraph window on first use: one random color
    per trace plus a fixed color for the final (Q-value) trace.
    """
    if not hasattr(self, 'wavegraph'):
        def rn():
            return 0.2 + np.random.uniform() * 0.4

        colors = [[rn(), rn(), rn()] for _ in range(len(waves) - 1)]
        colors.append([0.2, 0.5, 0.9])  # fixed color for the Q trace
        self.wavegraph = wavegraph(len(waves), 'actions/noises/Q', np.array(colors))

    self.wavegraph.one(waves.reshape((-1,)))
def __init__(
    self,
    observation_space_dims,
    action_space,
    stack_factor=1,
    discount_factor=.99,  # gamma
    # train_skip_every=1,
    train_multiplier=1,
):
    """Set up a DDPG-style agent: replay memory, noise source, actor/critic
    networks with target copies, TF session state, and action plotting.

    Args:
        observation_space_dims: dimensionality of a single observation.
        action_space: gym-style action space; a `Box` is treated as
            continuous, anything else is assumed discrete with `.n` actions.
        stack_factor: number of observations stacked into one network input.
        discount_factor: reward discount (gamma).
        train_multiplier: number of training steps performed per feed.
    """
    self.rpm = rpm(1000000)  # 1M history
    self.plotter = plotter(num_lines=3)
    self.render = True
    self.training = True
    self.noise_source = one_fsq_noise()
    self.train_counter = 0
    # self.train_skip_every = train_skip_every
    self.train_multiplier = train_multiplier

    # Network input width = one observation times the stack depth.
    self.observation_stack_factor = stack_factor
    self.inputdims = observation_space_dims * self.observation_stack_factor
    # assume observation_space is continuous

    self.is_continuous = True if isinstance(action_space, Box) else False
    if self.is_continuous:
        # if action space is continuous
        low = action_space.low
        high = action_space.high
        num_of_actions = action_space.shape[0]

        # Affine map from the network's nominal [-1, 1] output into the
        # environment's [low, high] action range.
        self.action_bias = high / 2. + low / 2.
        self.action_multiplier = high - self.action_bias

        # say high,low -> [2,7], then bias -> 4.5
        # mult = 2.5. then [-1,1] multiplies 2.5 + bias 4.5 -> [2,7]

        def clamper(actions):
            # Clip to the environment's per-dimension bounds.
            return np.clip(actions, a_max=action_space.high, a_min=action_space.low)
        self.clamper = clamper
    else:
        num_of_actions = action_space.n
        self.action_bias = .5
        self.action_multiplier = .5  # map (-1,1) into (0,1)

        def clamper(actions):
            # Discrete case: keep action scores within [0, 1].
            return np.clip(actions, a_max=1., a_min=0.)
        self.clamper = clamper

    self.outputdims = num_of_actions
    self.discount_factor = discount_factor
    ids, ods = self.inputdims, self.outputdims
    print('inputdims:{}, outputdims:{}'.format(ids, ods))

    # Main networks plus target copies for stable TD targets.
    self.actor = self.create_actor_network(ids, ods)
    self.critic = self.create_critic_network(ids, ods)
    self.actor_target = self.create_actor_network(ids, ods)
    self.critic_target = self.create_critic_network(ids, ods)

    # print(self.actor.get_weights())
    # print(self.critic.get_weights())

    # Build the training/inference callables, then initialize variables
    # and copy main-network weights into the targets once.
    self.feed, self.joint_inference, sync_target = self.train_step_gen()

    sess = ct.get_session()
    sess.run(tf.global_variables_initializer())
    sync_target()

    # Guards shared state when acting and training run on separate threads.
    import threading as th
    self.lock = th.Lock()

    # Lazily create the action-plotting window: one trace per action,
    # one per noise channel, plus one for the Q value.
    if not hasattr(self, 'wavegraph'):
        num_waves = self.outputdims * 2 + 1

        def rn():
            r = np.random.uniform()
            return 0.2 + r * 0.4

        colors = []
        for i in range(num_waves - 1):
            color = [rn(), rn(), rn()]
            colors.append(color)
        colors.append([0.2, 0.5, 0.9])  # distinct color for the Q trace
        self.wavegraph = wavegraph(num_waves, 'actions/noises/Q', np.array(colors))