MAX_EPISODES = 600
MAX_EP_STEPS = 200
LR_A = 1e-4    # learning rate for actor
LR_C = 1e-4    # learning rate for critic
GAMMA = 0.9    # reward discount
REPLACE_ITER_A = 1100
REPLACE_ITER_C = 1000
MEMORY_CAPACITY = 5000
BATCH_SIZE = 16
VAR_MIN = 0.1
RENDER = True
LOAD = True
MODE = ['easy', 'hard']
n_model = 1

env = ArmEnv(mode=MODE[n_model])
STATE_DIM = env.state_dim
ACTION_DIM = env.action_dim
ACTION_BOUND = env.action_bound

# all placeholders for tf
with tf.name_scope('S'):
    S = tf.placeholder(tf.float32, shape=[None, STATE_DIM], name='s')
with tf.name_scope('R'):
    R = tf.placeholder(tf.float32, [None, 1], name='r')
with tf.name_scope('S_'):
    S_ = tf.placeholder(tf.float32, shape=[None, STATE_DIM], name='s_')


class Actor(object):
    def __init__(self, sess, action_dim, action_bound, learning_rate,
def train(config={}):
    tf.reset_default_graph()
    env = ArmEnv(mode='easy', should_random_target=True)
    ppo = PPO(config)
    all_ep_r = []
    lambdas = []
    should_render = 'should_render' in config.keys() and config['should_render']
    start = time.clock()
    for ep in tqdm(range(ppo.EP_MAX), desc='Training'):
        s = env.reset()
        buffer_s, buffer_a, buffer_r = [], [], []
        ep_r = 0
        for t in range(ppo.EP_LEN):    # in one episode
            if should_render:
                env.render()
            a = ppo.choose_action(s)
            s_, r, done = env.step(a)
            buffer_s.append(s)
            buffer_a.append(a)
            buffer_r.append((r + 8) / 8)    # normalize reward, found to be useful
            s = s_
            ep_r += r

            # update ppo
            if (t + 1) % ppo.BATCH == 0 or t == ppo.EP_LEN - 1:
                v_s_ = ppo.get_v(s_)
                discounted_r = []
                for r in buffer_r[::-1]:
                    v_s_ = r + ppo.GAMMA * v_s_
                    discounted_r.append(v_s_)
                discounted_r.reverse()
                bs, ba, br = np.vstack(buffer_s), np.vstack(buffer_a), np.array(discounted_r)[:, np.newaxis]
                buffer_s, buffer_a, buffer_r = [], [], []
                ppo.update(bs, ba, br)
        if ep == 0:
            all_ep_r.append(ep_r)
        else:
            all_ep_r.append(all_ep_r[-1] * 0.9 + ep_r * 0.1)
        print(
            'Ep: %i' % ep,
            "|Ep_r: %i" % ep_r,
            ("|Lam: %.4f" % ppo.METHOD['lam']) if ppo.METHOD['name'] == 'kl_pen' else '',
        )
        if ppo.METHOD['name'] == 'kl_pen':
            lambdas.append(ppo.METHOD['lam'])
    elapsed = time.clock() - start
    print('Train with method {} done!'.format(ppo.METHOD['name']))
    print('Time elapsed {}s'.format(elapsed))
    if 'should_save' in config and config['should_save']:
        ppo.save('arm')
    return {
        'method': ppo.METHOD['name'],
        'ep_r': all_ep_r,
        'lambda': lambdas,
        'time': elapsed,      # elapsed wall-clock time
        'config': config,     # config used for this run
    }
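# Illustrative usage only (not part of the original source): how train() above might be
# called. 'should_render' and 'should_save' are the config keys the function actually
# checks; any other keys are forwarded unchanged to the PPO constructor.
if __name__ == '__main__':
    result = train({'should_render': False, 'should_save': True})
    print('method: {}, episodes: {}, time: {:.1f}s'.format(
        result['method'], len(result['ep_r']), result['time']))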
def __init__(self, name, globalAC):
    self.env = ArmEnv(mode=MODE[n_model])
    self.name = name
    self.AC = ACNet(name, globalAC)
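# Illustrative sketch only: how the __init__ above is typically used in an A3C-style
# setup. The enclosing class name (Worker), its work() method, the ACNet constructor
# for the global net, GLOBAL_AC, and the thread count are all assumptions for this
# example, not part of the original snippet.
import threading

GLOBAL_AC = ACNet('Global_Net')                     # shared parameters (assumed constructor)
workers = [Worker('W_%i' % i, GLOBAL_AC) for i in range(4)]
threads = []
for worker in workers:
    t = threading.Thread(target=worker.work)        # work() assumed to run the rollout loop
    t.start()
    threads.append(t)
for t in threads:
    t.join()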
def __init__(self, wid):
    self.wid = wid
    self.env = ArmEnv(mode=MODE[n_model])
    self.ppo = GLOBAL_PPO
MAX_EPISODES = 1000
MAX_EP_STEPS = 500
LR_A = 1e-5    # learning rate for actor
LR_C = 1e-5    # learning rate for critic
GAMMA = 0.9    # reward discount
REPLACE_ITER_A = 1100
REPLACE_ITER_C = 1000
MEMORY_CAPACITY = 5000
BATCH_SIZE = 32
VAR_MIN = 0.1
RENDER = True
LOAD = False
MODE = ['easy', 'hard']
n_model = 1

env = ArmEnv()
# STATE_DIM = env.state_dim
# ACTION_DIM = env.action_dim
# ACTION_BOUND = env.action_bound
STATE_DIM = env.s.shape[0]
ACTION_DIM = env.joint.shape[0]
ACTION_BOUND = [-20, 20]

# all placeholders for tf
with tf.name_scope('S'):
    S = tf.placeholder(tf.float32, shape=[None, STATE_DIM], name='s')
with tf.name_scope('R'):
    R = tf.placeholder(tf.float32, [None, 1], name='r')
with tf.name_scope('S_'):
def train(config={}, use_unity_arm=False):
    tf.reset_default_graph()
    should_random_target = 'should_random_target' in config.keys() and config['should_random_target']
    env = UnityArmEnv() if use_unity_arm else ArmEnv(mode='easy', should_random_target=should_random_target)
    config['A_DIM'] = env.action_dim
    config['S_DIM'] = env.state_dim
    ppo = PPO(config)
    all_ep_r = []
    lambdas = []
    should_render = 'should_render' in config.keys() and config['should_render']
    start = time.clock()
    plot = Plot()
    for ep in range(ppo.EP_MAX):
        s = env.reset()
        buffer_s, buffer_a, buffer_r = [], [], []
        ep_r = 0
        for t in tqdm(range(ppo.EP_LEN), desc='Training EP-' + str(ep) + '/' + str(ppo.EP_MAX) + ': '):    # in one episode
            if should_render:
                env.render()
            a = ppo.choose_action(s)
            s_, r, done = env.step(a)
            buffer_s.append(s)
            buffer_a.append(a)
            buffer_r.append(r)
            s = s_
            ep_r += r

            # update ppo
            if (t + 1) % ppo.BATCH == 0 or t == ppo.EP_LEN - 1:
                v_s_ = ppo.get_v(s_)
                discounted_r = []
                for r in buffer_r[::-1]:
                    v_s_ = r + ppo.GAMMA * v_s_
                    discounted_r.append(v_s_)
                discounted_r.reverse()
                bs, ba, br = np.vstack(buffer_s), np.vstack(buffer_a), np.array(discounted_r)[:, np.newaxis]
                buffer_s, buffer_a, buffer_r = [], [], []
                # bs = (bs - bs.mean()) / (bs.std() + 1e-6)
                # br = (br - br.mean()) / (br.std() + 1e-6)
                ppo.update(bs, ba, br)
        if ep == 0:
            all_ep_r.append(ep_r)
        else:
            all_ep_r.append(all_ep_r[-1] * 0.9 + ep_r * 0.1)
        print('Current Reward: ', ep_r)
        plot.update(all_ep_r)
        if ppo.METHOD['name'] == 'kl_pen':
            lambdas.append(ppo.METHOD['lam'])
    elapsed = time.clock() - start
    print('Train with method {} done!'.format(ppo.METHOD['name']))
    print('Time elapsed {}s'.format(elapsed))
    return {
        'method': ppo.METHOD['name'],
        'ep_r': all_ep_r,
        'lambda': lambdas,
        'time': elapsed,      # elapsed wall-clock time
        'config': config,     # config used for this run
    }, ppo, env
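# Illustrative usage only (not part of the original source): because this variant of
# train() also returns the ppo agent and the env, a quick greedy roll-out can be run
# right after training to sanity-check the learned policy. The 200-step horizon below
# is an arbitrary choice for the example.
if __name__ == '__main__':
    result, ppo, env = train({'should_render': False}, use_unity_arm=False)
    s = env.reset()
    for _ in range(200):
        s, r, done = env.step(ppo.choose_action(s))
        if done:
            break
    print('Training took {:.1f}s, final smoothed reward: {:.2f}'.format(
        result['time'], result['ep_r'][-1]))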
    return np.array([1 if a == i else 0 for i in range(n)])


def discount_and_norm_rewards(episode_rewards, gamma):
    discounted_episode_rewards = np.zeros_like(episode_rewards)
    cumulative = 0
    for t in reversed(range(len(episode_rewards))):
        cumulative = cumulative * gamma + episode_rewards[t]
        discounted_episode_rewards[t] = cumulative
    return discounted_episode_rewards


env = ArmEnv(size_x=4,
             size_y=3,
             cubes_cnt=4,
             episode_max_length=2000,
             finish_reward=200,
             action_minus_reward=0,
             tower_target_size=3)

s = env.reset()
obs_len = len(s)

tf.reset_default_graph()

state = tf.placeholder('float32', shape=[None, obs_len], name="STATE")
actions = tf.squeeze(tf.placeholder('int32', name="ACTIONS"))
q = tf.placeholder('float32', name="Q")

inp = tf.layers.dense(state, 10, name="INPUT",
GAMMA = 0.9    # reward discount
REPLACE_ITER_A = 500
REPLACE_ITER_C = 500
MEMORY_CAPACITY = 10000
BATCH_SIZE = 64
TAU = 0.001    # soft replacement
VAR_MIN = 0.01
RENDER = False
LOAD = False
MODE = ['easy', 'hard']
SPARSE = True
n_model = 1
use_her = True
K = 4

env = ArmEnv(mode=MODE[n_model], sparse=SPARSE)
STATE_DIM = env.state_dim
ACTION_DIM = env.action_dim
ACTION_BOUND = env.action_bound


class Episode_experience():
    def __init__(self):
        self.memory = []

    def add(self, state, action, reward, next_state, done, goal):
        self.memory += [(state, action, reward, next_state, done, goal)]

    def clear(self):
        self.memory = []
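# Illustrative sketch only: one way the 'future' HER strategy could relabel an episode
# stored in Episode_experience before it is written to the DDPG replay buffer.
# compute_reward(next_state, goal) is an assumed helper (e.g. the sparse reward
# recomputed against the substitute goal), and reusing the raw next_state as the
# achieved goal is a simplification; K is the constant defined above.
import numpy as np

def her_relabel(episode, K, compute_reward):
    transitions = episode.memory
    relabeled = []
    for i, (s, a, r, s_, done, g) in enumerate(transitions):
        relabeled.append((s, a, r, s_, done, g))                   # keep the original goal
        future_ids = np.random.randint(i, len(transitions), size=K)
        for f in future_ids:                                       # K goals sampled from later steps
            new_g = transitions[f][3]                              # achieved state reused as goal
            new_r = compute_reward(s_, new_g)                      # reward recomputed for that goal
            relabeled.append((s, a, new_r, s_, done, new_g))
    return relabeled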