Example #1
def main(_):
    global master_network
    global global_episodes

    reward_mode = None
    if len(sys.argv) < 2:
        # general params
        # training params
        # PICK ONE and comment others
        params = PARAMS['CE3-CE4']
        # params = PARAMS['CE3-CC4']
        # params = PARAMS['CC3-CE4']
        # params = PARAMS['CC3-CC4']
        # params = PARAMS['CE4']
        # params = PARAMS['CC4']
    else:
        setting = sys.argv[1]
        params = PARAMS[IDX_TO_PARAMS[int(setting) - 1]]
        print('training_scenario: {}, testing_scenario: {}'.format(
            params['train_scenario_name'], params['test_scenario_name']))
        reward_mode = sys.argv[2]

    use_physics = False
    num_training_iters = 100

    # RL specific settings
    params['data_dir'] = '../../OpenLockA3CResults/subjects/'
    params['train_attempt_limit'] = 300
    params['test_attempt_limit'] = 300
    params['use_physics'] = use_physics
    params['num_training_iters'] = num_training_iters
    params['reward_mode'] = reward_mode

    scenario = select_scenario(params['train_scenario_name'],
                               use_physics=use_physics)

    ENV_NAME = 'arm_lock-v0'

    env = gym.make(ENV_NAME)

    # create session/trial/experiment manager
    manager = SessionManager(env, params, human=False)
    manager.update_scenario(scenario)
    trial_selected = manager.run_trial_common_setup(
        scenario_name=params['train_scenario_name'],
        action_limit=params['train_action_limit'],
        attempt_limit=params['train_attempt_limit'])

    env.observation_space = ObservationSpace(len(scenario.levers))
    MODEL_DIR = manager.writer.subject_path + '/models'
    MONITOR_DIR = manager.writer.subject_path + '/monitor'

    STATE_DIM = env.observation_space.shape
    ACTION_DIM = len(env.action_space)

    # delete temporary env
    env.close()

    tf.reset_default_graph()

    if not os.path.exists(MODEL_DIR):
        os.makedirs(MODEL_DIR)

    with tf.device("/cpu:0"):
        np.random.seed(RANDOM_SEED)
        tf.set_random_seed(RANDOM_SEED)

        global_episodes = tf.Variable(0,
                                      dtype=tf.int32,
                                      name='global_episodes',
                                      trainable=False)
        trainer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE)
        master_network = AC_Network(STATE_DIM, ACTION_DIM, CELL_UNITS,
                                    'global', None)  # Generate global network
        num_workers = multiprocessing.cpu_count()  # Set workers to number of available CPU threads

        # For testing and visualisation we only need one worker
        if TEST_MODEL:
            num_workers = 1

        workers = []
        # Create worker classes
        for i in range(num_workers):
            workers.append(
                Worker(name=i,
                       s_size=STATE_DIM,
                       a_size=ACTION_DIM,
                       trainer=trainer,
                       model_path=MODEL_DIR,
                       global_episodes=global_episodes,
                       env_name=ENV_NAME,
                       seed=RANDOM_SEED,
                       test=TEST_MODEL,
                       cell_units=CELL_UNITS,
                       params=params))
        saver = tf.train.Saver(max_to_keep=5)

        # Gym monitor
        if not TEST_MODEL:
            env = workers[0].get_env()
            env = gym.wrappers.Monitor(env,
                                       MONITOR_DIR,
                                       video_callable=False,
                                       force=True)

    with tf.Session() as sess:
        coord = tf.train.Coordinator()
        if LOAD_MODEL or TEST_MODEL:
            print('Loading Model...')
            ckpt = tf.train.get_checkpoint_state(MODEL_DIR)
            saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            sess.run(tf.global_variables_initializer())

        if TEST_MODEL:
            env = workers[0].get_env()
            env = gym.wrappers.Monitor(env, MONITOR_DIR, force=True)
            workers[0].work(GAMMA, sess, coord, saver)
        else:
            # This is where the asynchronous magic happens.
            # Start the "work" process for each worker in a separate thread.
            print('Launching workers...')
            worker_threads = []
            for worker in workers:
                # Bind the current worker via a default argument so each thread
                # keeps its own worker (avoids the late-binding closure pitfall).
                worker_work = lambda worker=worker: worker.work(GAMMA, sess, coord, saver)
                t = threading.Thread(target=worker_work)
                t.start()
                worker_threads.append(t)
            coord.join(worker_threads)
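
A minimal sketch of how this entry point might be invoked; the script name and the example command line are assumptions, not part of the original code.

# Hypothetical module entry point, assuming the file is saved as a3c_train.py.
# main() ignores its positional argument and reads sys.argv itself:
#   python a3c_train.py <setting_index> <reward_mode>
#   argv[1]: 1-based index into IDX_TO_PARAMS (selects the train/test scenario pair)
#   argv[2]: reward-mode string stored in params['reward_mode']
if __name__ == '__main__':
    main(None)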
Example #2
class Worker():
    def __init__(self, name, s_size, a_size, trainer, model_path, global_episodes,
                 env_name, seed, test, cell_units, params, testing_trial=False):
        self.name = "worker_" + str(name)
        self.number = name
        self.model_path = model_path
        self.trainer = trainer
        self.global_episodes = global_episodes
        self.increment = self.global_episodes.assign_add(1)
        self.episode_rewards = []
        self.episode_lengths = []
        self.episode_mean_values = []
        self.summary_writer = tf.summary.FileWriter("train_" + str(self.number))
        self.is_test = test
        self.a_size = a_size
        self.params = params

        # Create the local copy of the network and the tensorflow op to copy global parameters to local network
        self.local_AC = AC_Network(s_size, a_size, cell_units, self.name, trainer)
        self.update_local_ops = update_target_graph('global', self.name)

        self.testing_trial = testing_trial
        if not self.testing_trial:
            self.scenario_name = params['train_scenario_name']
            self.attempt_limit = params['train_attempt_limit']
        else:
            self.scenario_name = params['test_scenario_name']
            self.attempt_limit = params['test_attempt_limit']

        self.scenario = select_scenario(self.scenario_name, params['use_physics'])
        env = gym.make(env_name)

        self.manager = SessionManager(env, params, human=False)
        self.manager.update_scenario(self.scenario)
        self.manager.env.reward_mode = params['reward_mode']

        self.trial_count = 0
        self.manager.env.seed(seed)

    def get_env(self):
        return self.manager.env

    def train(self, rollout, sess, gamma, r):
        rollout = np.array(rollout)
        states = rollout[:, 0]
        actions = rollout[:, 1]
        rewards = rollout[:, 2]
        values = rollout[:, 5]

        # Here we take the rewards and values from the rollout, and use them to
        # generate the advantage and discounted returns.
        rewards_list = np.asarray(rewards.tolist()+[r])*REWARD_FACTOR
        discounted_rewards = discounting(rewards_list, gamma)[:-1]

        # Advantage estimation
        # JS, P Moritz, S Levine, M Jordan, P Abbeel,
        # "High-dimensional continuous control using generalized advantage estimation."
        # arXiv preprint arXiv:1506.02438 (2015).
        values_list = np.asarray(values.tolist()+[r])*REWARD_FACTOR
        advantages = rewards + gamma * values_list[1:] - values_list[:-1]
        discounted_advantages = discounting(advantages, gamma)


        # Update the global network using gradients from loss
        # Generate network statistics to periodically save
        # sess.run(self.local_AC.reset_state_op)
        rnn_state = self.local_AC.state_init
        feed_dict = {self.local_AC.target_v: discounted_rewards,
                     self.local_AC.inputs: np.vstack(states),
                     self.local_AC.actions: np.vstack(actions),
                     self.local_AC.advantages: discounted_advantages,
                     self.local_AC.state_in[0]: rnn_state[0],
                     self.local_AC.state_in[1]: rnn_state[1]}
        v_l, p_l, e_l, g_n, v_n, _ = sess.run([self.local_AC.value_loss,
                                               self.local_AC.policy_loss,
                                               self.local_AC.entropy,
                                               self.local_AC.grad_norms,
                                               self.local_AC.var_norms,
                                               self.local_AC.apply_grads],
                                              feed_dict=feed_dict)
        return v_l / len(rollout), p_l / len(rollout), e_l / len(rollout), g_n, v_n

    def work(self, gamma, sess, coord, saver):
        episode_count = sess.run(self.global_episodes)
        total_steps = 0
        print("Starting worker " + str(self.number))
        with sess.as_default(), sess.graph.as_default():
            sess.run(self.update_local_ops)
            episode_buffer = []
            episode_mini_buffer = []
            episode_values = []
            episode_states = []
            episode_reward = 0
            episode_step_count = 0

            if not self.testing_trial:
                trial_selected = self.manager.run_trial_common_setup(
                    self.params['train_scenario_name'],
                    self.params['train_action_limit'],
                    self.params['train_attempt_limit'],
                    multithreaded=True)
            else:
                trial_selected = self.manager.run_trial_common_setup(
                    self.params['test_scenario_name'],
                    self.params['test_action_limit'],
                    self.params['test_attempt_limit'],
                    specified_trial='trial7',
                    multithreaded=True)

            self.manager.env.reset()
            while not coord.should_stop():

                # update trial if needed
                if (self.manager.env.attempt_count > self.attempt_limit
                        or self.manager.logger.cur_trial.success is True):
                    if not self.testing_trial:
                        trial_selected = self.manager.run_trial_common_setup(
                            self.params['train_scenario_name'],
                            self.params['train_action_limit'],
                            self.params['train_attempt_limit'],
                            multithreaded=True)
                    else:
                        trial_selected = self.manager.run_trial_common_setup(
                            self.params['test_scenario_name'],
                            self.params['test_action_limit'],
                            self.params['test_attempt_limit'],
                            specified_trial='trial7',
                            multithreaded=True)
                    print('scenario_name: {}, trial_count: {}, trial_name: {}'.format(
                        self.scenario_name, self.trial_count, trial_selected))
                    sess.run(self.update_local_ops)
                    episode_buffer = []
                    episode_mini_buffer = []
                    episode_values = []
                    episode_states = []
                    episode_reward = 0
                    episode_step_count = 0
                    self.trial_count += 1
                    self.manager.env.reset()

                # Restart environment
                done = False
                state = self.manager.env.reset()

                rnn_state = self.local_AC.state_init

                # Run an episode
                while not done:
                    episode_states.append(state)
                    if self.is_test:
                        self.manager.env.render()

                    # Get preferred action distribution
                    a_dist, v, rnn_state = sess.run(
                        [self.local_AC.policy, self.local_AC.value, self.local_AC.state_out],
                        feed_dict={self.local_AC.inputs: [state],
                                   self.local_AC.state_in[0]: rnn_state[0],
                                   self.local_AC.state_in[1]: rnn_state[1]})

                    a0 = weighted_pick(a_dist[0], 1) # Use stochastic distribution sampling
                    if self.is_test:
                        a0 = np.argmax(a_dist[0]) # Use maximum when testing
                    a = np.zeros(self.a_size)
                    a[a0] = 1

                    next_state, reward, done, opt = self.manager.env.step(np.argmax(a), multithreaded=False)

                    episode_reward += reward

                    episode_buffer.append([state, a, reward, next_state, done, v[0, 0]])
                    episode_mini_buffer.append([state, a, reward, next_state, done, v[0, 0]])

                    episode_values.append(v[0, 0])

                    # Train on mini batches from episode
                    if len(episode_mini_buffer) == MINI_BATCH and not self.is_test:
                        v1 = sess.run([self.local_AC.value],
                                      feed_dict={self.local_AC.inputs: [state],
                                                 self.local_AC.state_in[0]: rnn_state[0],
                                                 self.local_AC.state_in[1]: rnn_state[1]})
                        v_l, p_l, e_l, g_n, v_n = self.train(episode_mini_buffer, sess, gamma, v1[0][0])
                        episode_mini_buffer = []

                    # Set previous state for next step
                    state = next_state
                    total_steps += 1
                    episode_step_count += 1

                self.episode_rewards.append(episode_reward)
                self.episode_lengths.append(episode_step_count)
                self.episode_mean_values.append(np.mean(episode_values))

                if episode_count % 100 == 0 and episode_count % 1000 != 0 and not self.is_test:
                    mean_reward = np.mean(self.episode_rewards[-5:])
                    mean_length = np.mean(self.episode_lengths[-5:])
                    mean_value = np.mean(self.episode_mean_values[-5:])
                    summary = tf.Summary()
                    # tf.Summary's simple_value field only accepts floats, so the
                    # string-valued scenario/trial names cannot be logged this way.
                    summary.value.add(tag='trial count', simple_value=float(self.trial_count))
                    summary.value.add(tag='Perf/Reward', simple_value=float(mean_reward))
                    summary.value.add(tag='Perf/Length', simple_value=float(mean_length))
                    summary.value.add(tag='Perf/Value', simple_value=float(mean_value))
                    summary.value.add(tag='Losses/Value Loss', simple_value=float(v_l))
                    summary.value.add(tag='Losses/Policy Loss', simple_value=float(p_l))
                    summary.value.add(tag='Losses/Entropy', simple_value=float(e_l))
                    summary.value.add(tag='Losses/Grad Norm', simple_value=float(g_n))
                    summary.value.add(tag='Losses/Var Norm', simple_value=float(v_n))
                    self.summary_writer.add_summary(summary, episode_count)

                    self.summary_writer.flush()

                if self.name == 'worker_0':
                    if episode_count % 1000 == 0 and not self.is_test:
                        saver.save(sess, self.model_path + '/model-' + str(episode_count) + '.cptk')

                    print("| Reward: " + str(episode_reward), " | Episode", episode_count)
                    sess.run(self.increment) # Next global episode

                episode_count += 1
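
Both examples call helper functions that are not shown on this page (discounting and weighted_pick), along with module-level constants such as REWARD_FACTOR, MINI_BATCH, GAMMA and CELL_UNITS. The sketch below is a plausible implementation of the two helpers, consistent with how they are used above (a discounted cumulative sum, and sampling an index from an action-probability vector); the actual implementations in the source project may differ.

import numpy as np
import scipy.signal


def discounting(x, gamma):
    """Discounted cumulative sum: out[t] = sum_k gamma**k * x[t + k]."""
    return scipy.signal.lfilter([1], [1, -gamma], np.asarray(x)[::-1], axis=0)[::-1]


def weighted_pick(weights, n_picks):
    """Sample n_picks indices from a (possibly unnormalized) probability vector."""
    cumulative = np.cumsum(weights)
    total = np.sum(weights)
    return np.searchsorted(cumulative, np.random.rand(n_picks) * total)

With n_picks=1, weighted_pick returns a length-1 index array, which is compatible with how a[a0] = 1 and np.argmax(a) are used inside Worker.work above.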