Example #1
def main(_):
    # Dead code from an earlier version: restores the trainable variables of a
    # previous run's checkpoint (a standalone sketch follows this example).
    '''
    previous = tf.train.import_meta_graph(SAVE_DIR + '/model.ckpt.meta')
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        previous.restore(sess,tf.train.latest_checkpoint(SAVE_DIR+'/'))
        last_vars = tf.trainable_variables()
        data = sess.run(last_vars)
        print('Model Restored')
    '''
    tf.reset_default_graph()

    with tf.Session() as sess:
        env = Preon_env(opt.env_params)
        np.random.seed(RANDOM_SEED)
        tf.set_random_seed(RANDOM_SEED)

        state_dim = 9
        action_dim = 3
        goal_dim = 2

        actor = ActorNetwork(sess, state_dim, action_dim, goal_dim,
                             ACTOR_LEARNING_RATE, TAU, opt.env_params)
        critic = CriticNetwork(sess, state_dim, action_dim, goal_dim,
                               CRITIC_LEARNING_RATE, TAU,
                               actor.get_num_trainable_vars(), opt.env_params)

        if opt.train:
            train(sess, env, actor, critic, action_dim, goal_dim, state_dim)
        else:
            test(sess, env, actor, critic, action_dim, goal_dim, state_dim,
                 opt.test_goal)
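
The block commented out at the top of Example #1 loads the trainable variables of a previous run before the graph is reset. A minimal standalone sketch of that pattern is below; the helper name and the SAVE_DIR value are placeholders, not part of the original project.

import tensorflow as tf

SAVE_DIR = './checkpoints'  # placeholder path

def load_previous_weights(save_dir=SAVE_DIR):
    # Rebuild the saved graph from its meta file and return the numpy values
    # of all trainable variables of that graph.
    saver = tf.train.import_meta_graph(save_dir + '/model.ckpt.meta')
    with tf.Session() as sess:
        saver.restore(sess, tf.train.latest_checkpoint(save_dir + '/'))
        return sess.run(tf.trainable_variables())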
Example #2
def main(args):

    with tf.Session() as session:
        np.random.seed(int(args['random_seed']))
        tf.set_random_seed(int(args['random_seed']))

        # initialize ROS interface
        agent = fake.fake_agent()
        plant = fake.fake_plant()

        state_shape = agent.get_state_shape()
        action_shape = agent.get_action_shape()
        action_bound = agent.get_action_bound()

        # initialize function approximators
        actor_network = ActorNetwork(session,
                                     state_shape,
                                     action_shape,
                                     action_bound,
                                     float(args['actor_lr']),
                                     float(args['tau']),
                                     loss_mask=True)
        critic_network = CriticNetwork(session,
                                       state_shape,
                                       action_shape,
                                       float(args['critic_lr']),
                                       float(args['tau']),
                                       float(args['gamma']),
                                       actor_network.get_num_trainable_vars(),
                                       loss_mask=True)
        predictor_network = fake.fake_predictor()
        latent_network = fake.fake_latent()

        learn(session,
              actor_network,
              critic_network,
              predictor_network,
              agent,
              plant,
              latent_network=latent_network,
              buffer_size=int(args['buffer_size']),
              batch_size=int(args['batch_size']),
              trace_length=int(args['trace_length']),
              update_freq=int(args['update_freq']),
              pretrain_steps=int(args['pretrain_steps']),
              update_steps=int(args['update_steps']),
              max_episodes=int(args['max_episodes']),
              max_ep_steps=int(args['max_episode_len']),
              summary_dir=args['summary_dir'])
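
The fake.fake_agent() stub used in Example #2 is not shown. Judging only from the methods the example calls, it needs at least the interface sketched below; the class name, shapes, and bound values here are placeholders, not values from the original project.

import numpy as np

class FakeAgent(object):
    """Minimal stand-in exposing the three accessors Example #2 relies on."""

    def get_state_shape(self):
        # shape of the observation vector fed to the networks (placeholder)
        return (8,)

    def get_action_shape(self):
        # shape of the action produced by the actor (placeholder)
        return (2,)

    def get_action_bound(self):
        # symmetric bound used to scale the actor output (placeholder)
        return np.array([1.0, 1.0])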
Example #3
File: main.py  Project: yosider/RLSnipets
def main():
    with tf.Session() as sess:

        actor = ActorNetwork(sess, STATE_DIM, ACTION_DIM, ACTION_BOUND,
                             ACTOR_LEARNING_RATE, TAU, MINIBATCH_SIZE)
        critic = CriticNetwork(sess, STATE_DIM, ACTION_DIM,
                               CRITIC_LEARNING_RATE, TAU,
                               actor.get_num_trainable_vars())

        #actor_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(ACTION_DIM))

        #TODO: add Ornstein-Uhlenbeck noise (a generic sketch follows this example).

        sess.run(tf.global_variables_initializer())

        # initialize target net
        actor.update_target_network()
        critic.update_target_network()

        # initialize replay memory
        replay_buffer = ReplayBuffer(BUFFER_SIZE)

        # main loop.
        for ep in range(MAX_EPISODES):

            episode_reward = 0
            ep_batch_avg_q = 0

            s = ENV.reset()

            for step in range(MAX_EP_STEPS):

                a = actor.predict(np.reshape(s,
                                             (1, STATE_DIM)))  #+ actor_noise()
                s2, r, terminal, info = ENV.step(a[0])
                #print(s2)

                replay_buffer.add(np.reshape(s, (STATE_DIM,)),
                                  np.reshape(a, (ACTION_DIM,)),
                                  r,
                                  terminal,
                                  np.reshape(s2, (STATE_DIM,)))

                # Batch sampling.
                if replay_buffer.size() > MINIBATCH_SIZE and \
                    step % TRAIN_INTERVAL == 0:
                    s_batch, a_batch, r_batch, t_batch, s2_batch = \
                        replay_buffer.sample_batch(MINIBATCH_SIZE)

                    # Compute the target Q-values.
                    target_action = actor.predict_target(s2_batch)
                    target_q = critic.predict_target(s2_batch, target_action)

                    # Compute the critic's training targets (TD targets).
                    targets = []
                    for i in range(MINIBATCH_SIZE):
                        if t_batch[i]:
                            # terminal
                            targets.append(r_batch[i])
                        else:
                            targets.append(r_batch[i] + GAMMA * target_q[i])

                    # Train the critic.
                    # TODO: pred_q is computed on a random minibatch, not on this
                    # episode, so a statistic like episode_avg_max is not appropriate.
                    pred_q, _ = critic.train(
                        s_batch, a_batch,
                        np.reshape(targets, (MINIBATCH_SIZE, 1)))

                    # Train the actor.
                    a_outs = actor.predict(s_batch)
                    grads = critic.action_gradients(s_batch, a_outs)
                    #print(grads[0].shape)
                    #exit(1)
                    actor.train(s_batch, grads[0])

                    # Update target networks.
                    # Should this happen only once every few batches?
                    actor.update_target_network()
                    critic.update_target_network()

                    ep_batch_avg_q += np.mean(pred_q)

                s = s2
                episode_reward += r

                if terminal:
                    print('Episode:', ep, 'Reward:', episode_reward)
                    reward_log.append(episode_reward)
                    # max() guards against a terminal state on the very first step.
                    q_log.append(ep_batch_avg_q / max(step, 1))

                    break
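
Example #3 leaves exploration noise as a TODO. A standard Ornstein-Uhlenbeck process of the kind usually plugged into the commented-out actor_noise call is sketched below; this is the generic implementation, not code taken from this project, and the sigma/theta/dt defaults are conventional choices.

import numpy as np

class OrnsteinUhlenbeckActionNoise(object):
    # Temporally correlated noise:
    # x_{t+1} = x_t + theta * (mu - x_t) * dt + sigma * sqrt(dt) * N(0, 1)
    def __init__(self, mu, sigma=0.2, theta=0.15, dt=1e-2, x0=None):
        self.mu = mu
        self.sigma = sigma
        self.theta = theta
        self.dt = dt
        self.x0 = x0
        self.reset()

    def __call__(self):
        x = (self.x_prev
             + self.theta * (self.mu - self.x_prev) * self.dt
             + self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape))
        self.x_prev = x
        return x

    def reset(self):
        self.x_prev = self.x0 if self.x0 is not None else np.zeros_like(self.mu)

With this in place, the commented-out line actor_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(ACTION_DIM)) and the + actor_noise() term in the action computation can be re-enabled.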
Example #4
def main(_):
    opt = Parameters()
    np.random.seed(opt.seed)
    tf.set_random_seed(opt.seed)

    if opt.train:
        cluster = tf.train.ClusterSpec({
            "ps": opt.parameter_servers,
            "worker": opt.workers
        })
        server = tf.train.Server(cluster,
                                 job_name=FLAGS.job_name,
                                 task_index=FLAGS.task_index)

        if FLAGS.job_name == "ps":
            server.join()
        elif FLAGS.job_name == "worker":
            with tf.device(
                    tf.train.replica_device_setter(
                        worker_device="/job:worker/task:%d" % FLAGS.task_index,
                        cluster=cluster)):
                is_chief = (FLAGS.task_index == 0)
                # count the number of updates
                global_step = tf.get_variable(
                    'global_step', [],
                    initializer=tf.constant_initializer(0),
                    trainable=False)
                step_op = global_step.assign(global_step + 1)

                env = gym.make(opt.env_name)
                if is_chief:
                    env = wrappers.Monitor(env, './tmp/', force=True)

                if opt.env_name == 'MountainCarContinuous-v0':
                    observation_examples = np.array(
                        [env.observation_space.sample() for x in range(10000)])
                    scaler = StandardScaler()
                    scaler.fit(observation_examples)
                else:
                    scaler = None

                # Initialize replay memory
                replay_buffer = ReplayBuffer(opt.rm_size, opt.seed)

                state_dim = env.observation_space.shape[0]
                action_dim = env.action_space.shape[0]
                if abs(env.action_space.low[0]) == abs(
                        env.action_space.high[0]):
                    action_scale = abs(env.action_space.high[0])
                else:
                    print('Error: the action space of the current '
                          'environment is asymmetric!')
                    sys.exit()

                actor = ActorNetwork(state_dim, action_dim, action_scale,
                                     opt.actor_lr, opt.tau, scaler)
                critic = CriticNetwork(state_dim, action_dim,
                                       opt.critic_lr, opt.tau,
                                       actor.get_num_trainable_vars(), scaler)

                # Set up summary Ops
                train_ops, valid_ops, training_vars, valid_vars = build_summaries()

                init_op = tf.global_variables_initializer()

                # Add ops to save and restore all the variables.
                saver = tf.train.Saver(max_to_keep=5)

                if opt.continue_training:

                    def restore_model(sess):
                        actor.set_session(sess)
                        critic.set_session(sess)
                        saver.restore(
                            sess,
                            tf.train.latest_checkpoint(opt.save_dir + '/'))
                        actor.restore_params(tf.trainable_variables())
                        critic.restore_params(tf.trainable_variables())
                        print('***********************')
                        print('Model Restored')
                        print('***********************')
                else:

                    def restore_model(sess):
                        actor.set_session(sess)
                        critic.set_session(sess)
                        # Initialize target network weights
                        actor.update_target_network()
                        critic.update_target_network()
                        print('***********************')
                        print('Model Initialized')
                        print('***********************')

                #sv = tf.train.Supervisor(is_chief=is_chief, global_step=global_step, init_op=init_op, summary_op=None, saver=None, init_fn=restore_model)

                #with sv.prepare_or_wait_for_session(server.target) as sess:
                with tf.Session(server.target) as sess:
                    sess.run(init_op)
                    restore_model(sess)

                    writer = tf.summary.FileWriter(opt.summary_dir, sess.graph)

                    stats = []
                    for step in range(opt.max_episodes):
                        '''
                        if sv.should_stop():
                            break
                        '''

                        current_step = sess.run(global_step)
                        # Train normally
                        reward = train(sess, current_step, opt, env, actor,
                                       critic, train_ops, training_vars,
                                       replay_buffer, writer, is_chief)
                        stats.append(reward)

                        if np.mean(stats[-100:]) > 950 and len(stats) >= 101:
                            print(np.mean(stats[-100:]))
                            print("Solved.")
                            if is_chief:
                                save_model(sess, saver, opt, global_step)
                            break

                        if is_chief and step % opt.valid_freq == opt.valid_freq - 1:
                            #test_r = test(sess, current_step, opt, env, actor, critic, valid_ops, valid_vars, writer)
                            save_model(sess, saver, opt, global_step)

                        # Increase global_step
                        sess.run(step_op)

                #sv.stop()
                print('Done')

    else:  # For testing
        pass
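
Example #4 reads FLAGS.job_name and FLAGS.task_index, but the flag definitions and the entry point sit outside the snippet. A typical TF1 setup would look like the sketch below; the flag defaults and the launch commands in the comments are illustrative, not taken from the original project.

import tensorflow as tf

FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_string('job_name', 'worker', "Either 'ps' or 'worker'.")
tf.app.flags.DEFINE_integer('task_index', 0, 'Index of the task within its job.')

if __name__ == '__main__':
    # Launch one process per entry in opt.parameter_servers / opt.workers, e.g.
    #   python main.py --job_name=ps --task_index=0
    #   python main.py --job_name=worker --task_index=0
    tf.app.run()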