Example #1
MAX_EPISODES = 600
MAX_EP_STEPS = 200
LR_A = 1e-4  # learning rate for actor
LR_C = 1e-4  # learning rate for critic
GAMMA = 0.9  # reward discount
REPLACE_ITER_A = 1100
REPLACE_ITER_C = 1000
MEMORY_CAPACITY = 5000
BATCH_SIZE = 16
VAR_MIN = 0.1
RENDER = True
LOAD = True
MODE = ['easy', 'hard']
n_model = 1

env = ArmEnv(mode=MODE[n_model])
STATE_DIM = env.state_dim
ACTION_DIM = env.action_dim
ACTION_BOUND = env.action_bound

# all placeholder for tf
with tf.name_scope('S'):
    S = tf.placeholder(tf.float32, shape=[None, STATE_DIM], name='s')
with tf.name_scope('R'):
    R = tf.placeholder(tf.float32, [None, 1], name='r')
with tf.name_scope('S_'):
    S_ = tf.placeholder(tf.float32, shape=[None, STATE_DIM], name='s_')


class Actor(object):
    def __init__(self, sess, action_dim, action_bound, learning_rate,
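The REPLACE_ITER_A / REPLACE_ITER_C counters above control how often the DDPG target networks are hard-copied from the evaluation networks. A minimal sketch of that replacement op, assuming the usual eval/target variable-scope layout (the scope and counter names below are illustrative, not taken from this file):

import tensorflow as tf

def hard_replace_op(eval_scope, target_scope):
    # Build ops that copy every eval-network variable into the target network.
    # Scope names such as 'Actor/eval_net' / 'Actor/target_net' are assumptions.
    e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=eval_scope)
    t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=target_scope)
    return [tf.assign(t, e) for t, e in zip(t_params, e_params)]

# Sketch of use inside the learner (hypothetical counter):
# if t_replace_counter % REPLACE_ITER_A == 0:
#     sess.run(hard_replace_op('Actor/eval_net', 'Actor/target_net'))
# t_replace_counter += 1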
Example #2
def train(config={}):
    tf.reset_default_graph()
    env = ArmEnv(mode='easy', should_random_target=True)
    ppo = PPO(config)
    all_ep_r = []
    lambdas = []

    should_render = config.get('should_render', False)

    start = time.perf_counter()  # time.clock() was removed in Python 3.8

    for ep in tqdm(range(ppo.EP_MAX), desc='Training'):
        s = env.reset()
        buffer_s, buffer_a, buffer_r = [], [], []
        ep_r = 0
        for t in range(ppo.EP_LEN):    # in one episode
            if should_render:
                env.render()
            a = ppo.choose_action(s)
            s_, r, done = env.step(a)
            buffer_s.append(s)
            buffer_a.append(a)
            buffer_r.append((r+8)/8)    # normalize reward, found to be useful
            s = s_
            ep_r += r

            # update ppo
            if (t+1) % ppo.BATCH == 0 or t == ppo.EP_LEN-1:
                v_s_ = ppo.get_v(s_)
                discounted_r = []
                for r in buffer_r[::-1]:
                    v_s_ = r + ppo.GAMMA * v_s_
                    discounted_r.append(v_s_)
                discounted_r.reverse()

                bs = np.vstack(buffer_s)
                ba = np.vstack(buffer_a)
                br = np.array(discounted_r)[:, np.newaxis]
                buffer_s, buffer_a, buffer_r = [], [], []
                ppo.update(bs, ba, br)
        if ep == 0:
            all_ep_r.append(ep_r)
        else:
            all_ep_r.append(all_ep_r[-1]*0.9 + ep_r*0.1)
        print(
            'Ep: %i' % ep,
            "|Ep_r: %i" % ep_r,
            ("|Lam: %.4f" %
             ppo.METHOD['lam']) if ppo.METHOD['name'] == 'kl_pen' else '',
        )
        if ppo.METHOD['name'] == 'kl_pen':
            lambdas.append(ppo.METHOD['lam'])

    elapsed = time.perf_counter() - start

    print('Train with method {} done!'.format(ppo.METHOD['name']))
    print('Time elapsed {}s'.format(elapsed))

    if config.get('should_save'):
        ppo.save('arm')

    return {
        'method': ppo.METHOD['name'],
        'ep_r': all_ep_r,
        'lambda': lambdas,
        'time': elapsed, # elapsed time
        'config': config, # current config
    }
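The update block above computes bootstrapped discounted returns by scanning the reward buffer backwards from the critic's estimate of the final next state. The same computation, pulled out into a small standalone helper for clarity (a sketch, not part of the original file):

import numpy as np

def discounted_returns(rewards, bootstrap_value, gamma):
    # Return-to-go for each step, seeded with v(s_) of the last next-state.
    v = bootstrap_value
    out = []
    for r in reversed(rewards):
        v = r + gamma * v
        out.append(v)
    return np.array(out[::-1], dtype=np.float32)[:, np.newaxis]

# Equivalent to the loop over buffer_r[::-1] above, with
# bootstrap_value = ppo.get_v(s_) and gamma = ppo.GAMMA.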
Example #3
    def __init__(self, name, globalAC):
        self.env = ArmEnv(mode=MODE[n_model])
        self.name = name
        self.AC = ACNet(name, globalAC)
Example #4
    def __init__(self, wid):
        self.wid = wid
        self.env = ArmEnv(mode=MODE[n_model])
        self.ppo = GLOBAL_PPO
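Examples #3 and #4 are worker constructors from A3C/DPPO-style scripts: each worker owns a private ArmEnv but shares one global network (ACNet or GLOBAL_PPO). A minimal runnable sketch of that pattern with trivial stand-ins for the environment and policy (all names below are illustrative only):

import threading

class _StubPolicy:
    def choose_action(self, s):
        return 0  # placeholder action

class _StubEnv:
    def reset(self):
        return 0

    def step(self, a):
        return 0, 0.0, True  # next_state, reward, done

GLOBAL_POLICY = _StubPolicy()  # stands in for GLOBAL_PPO / the global ACNet

class Worker:
    def __init__(self, wid):
        self.wid = wid
        self.env = _StubEnv()        # each worker owns a private environment
        self.policy = GLOBAL_POLICY  # all workers share one policy object

    def work(self):
        s = self.env.reset()
        done = False
        while not done:
            s, r, done = self.env.step(self.policy.choose_action(s))

threads = [threading.Thread(target=Worker(i).work) for i in range(4)]
for t in threads:
    t.start()
for t in threads:
    t.join()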
Example #5
MAX_EPISODES = 1000
MAX_EP_STEPS = 500
LR_A = 1e-5  # learning rate for actor
LR_C = 1e-5  # learning rate for critic
GAMMA = 0.9  # reward discount
REPLACE_ITER_A = 1100
REPLACE_ITER_C = 1000
MEMORY_CAPACITY = 5000
BATCH_SIZE = 32
VAR_MIN = 0.1
RENDER = True
LOAD = False
MODE = ['easy', 'hard']
n_model = 1

env = ArmEnv()
# STATE_DIM = env.state_dim
# ACTION_DIM = env.action_dim
# ACTION_BOUND = env.action_bound

STATE_DIM = env.s.shape[0]
ACTION_DIM = env.joint.shape[0]

ACTION_BOUND = [-20, 20]

# all placeholder for tf
with tf.name_scope('S'):
    S = tf.placeholder(tf.float32, shape=[None, STATE_DIM], name='s')
with tf.name_scope('R'):
    R = tf.placeholder(tf.float32, [None, 1], name='r')
with tf.name_scope('S_'):
    S_ = tf.placeholder(tf.float32, shape=[None, STATE_DIM], name='s_')
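Unlike Example #1, this script hard-codes ACTION_BOUND = [-20, 20] instead of reading env.action_bound. A small sketch of how such a bound is typically applied together with Gaussian exploration noise in these DDPG scripts (the actor_output and var names are assumptions):

import numpy as np

def noisy_clipped_action(actor_output, var, action_bound=(-20, 20)):
    # Add Gaussian exploration noise, then clip into the valid action range.
    a = np.random.normal(actor_output, var)
    return np.clip(a, action_bound[0], action_bound[1])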
Example #6
def train(config=None, use_unity_arm=False):
    # use None instead of a mutable default: this function writes A_DIM/S_DIM into config
    config = {} if config is None else config
    tf.reset_default_graph()

    should_random_target = config.get('should_random_target', False)

    env = UnityArmEnv() if use_unity_arm else ArmEnv(
        mode='easy', should_random_target=should_random_target)
    config['A_DIM'] = env.action_dim
    config['S_DIM'] = env.state_dim
    ppo = PPO(config)
    all_ep_r = []
    lambdas = []

    should_render = config.get('should_render', False)

    start = time.perf_counter()  # time.clock() was removed in Python 3.8

    plot = Plot()

    for ep in range(ppo.EP_MAX):
        s = env.reset()
        buffer_s, buffer_a, buffer_r = [], [], []
        ep_r = 0
        for t in tqdm(range(ppo.EP_LEN),
                      desc='Training EP-{}/{}: '.format(ep, ppo.EP_MAX)):  # in one episode
            if should_render:
                env.render()
            a = ppo.choose_action(s)
            s_, r, done = env.step(a)
            buffer_s.append(s)
            buffer_a.append(a)
            buffer_r.append(r)  # raw reward; no normalization in this version
            s = s_
            ep_r += r

            # update ppo
            if (t + 1) % ppo.BATCH == 0 or t == ppo.EP_LEN - 1:
                v_s_ = ppo.get_v(s_)
                discounted_r = []
                for r in buffer_r[::-1]:
                    v_s_ = r + ppo.GAMMA * v_s_
                    discounted_r.append(v_s_)
                discounted_r.reverse()

                bs = np.vstack(buffer_s)
                ba = np.vstack(buffer_a)
                br = np.array(discounted_r)[:, np.newaxis]
                buffer_s, buffer_a, buffer_r = [], [], []
                # bs = (bs - bs.mean()) / (bs.std() + 1e-6)
                # br = (br - br.mean()) / (br.std() + 1e-6)
                ppo.update(bs, ba, br)
        if ep == 0:
            all_ep_r.append(ep_r)
        else:
            all_ep_r.append(all_ep_r[-1] * 0.9 + ep_r * 0.1)

        print('Current Reward: ', ep_r)
        plot.update(all_ep_r)

        if ppo.METHOD['name'] == 'kl_pen':
            lambdas.append(ppo.METHOD['lam'])

    elapsed = time.perf_counter() - start

    print('Train with method {} done!'.format(ppo.METHOD['name']))
    print('Time elapsed {}s'.format(elapsed))

    return {
        'method': ppo.METHOD['name'],
        'ep_r': all_ep_r,
        'lambda': lambdas,
        'time': elapsed,  # elapsed time
        'config': config,  # current config
    }, ppo, env
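A hypothetical call site for the train() function above; 'should_render' and 'should_random_target' are the only config keys this version actually reads, and the rest of the call is an assumption:

if __name__ == '__main__':
    # returns the result dict plus the trained PPO agent and the environment
    result, ppo, env = train({'should_render': False,
                              'should_random_target': True})
    print(result['method'], result['time'])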
Example #7
File: arm_new.py Project: rvtsukanov/AI
    return np.array([1 if a == i else 0 for i in range(n)])


def discount_and_norm_rewards(episode_rewards, gamma):
    discounted_episode_rewards = np.zeros_like(episode_rewards)
    cumulative = 0
    for t in reversed(range(len(episode_rewards))):
        cumulative = cumulative * gamma + episode_rewards[t]
        discounted_episode_rewards[t] = cumulative
    return discounted_episode_rewards


env = ArmEnv(size_x=4,
             size_y=3,
             cubes_cnt=4,
             episode_max_length=2000,
             finish_reward=200,
             action_minus_reward=0,
             tower_target_size=3)

s = env.reset()
obs_len = len(s)

tf.reset_default_graph()
state = tf.placeholder('float32', shape=[None, obs_len], name="STATE")
actions = tf.squeeze(tf.placeholder('int32', name="ACTIONS"))
q = tf.placeholder('float32', name="Q")

inp = tf.layers.dense(state,
                      10,
                      name="INPUT",
Example #8
GAMMA = 0.9  # reward discount
REPLACE_ITER_A = 500
REPLACE_ITER_C = 500
MEMORY_CAPACITY = 10000
BATCH_SIZE = 64
TAU = 0.001  # soft replacement
VAR_MIN = 0.01
RENDER = False
LOAD = False
MODE = ['easy', 'hard']
SPARSE = True
n_model = 1
use_her = True
K = 4

env = ArmEnv(mode=MODE[n_model], sparse=SPARSE)
STATE_DIM = env.state_dim

ACTION_DIM = env.action_dim
ACTION_BOUND = env.action_bound


class Episode_experience():
    def __init__(self):
        self.memory = []

    def add(self, state, action, reward, next_state, done, goal):
        self.memory += [(state, action, reward, next_state, done, goal)]

    def clear(self):
        self.memory = []
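With use_her = True and K = 4, an Episode_experience buffer like the one above is normally replayed with hindsight goals. A sketch of HER's 'future' strategy over such a buffer; compute_reward is an assumed helper that re-scores a transition against a substituted goal:

import random

def her_augment(episode, K, compute_reward):
    # episode: list of (state, action, reward, next_state, done, goal) tuples,
    # i.e. the contents of Episode_experience.memory.
    augmented = []
    for t, (s, a, r, s_, d, g) in enumerate(episode):
        augmented.append((s, a, r, s_, d, g))
        for _ in range(K):
            # sample an achieved goal from a later step of the same episode
            future = random.randint(t, len(episode) - 1)
            new_goal = episode[future][3]          # that step's next_state
            new_r = compute_reward(s_, new_goal)   # recompute the sparse reward
            augmented.append((s, a, new_r, s_, d, new_goal))
    return augmented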