예제 #1
0
def initialize_agents(config):
    """Create the environments, the optional global network and the agent(s).

    One environment is created per ``config.num_agents``. When
    ``config.agent_type`` is ``"a3c"`` a network named ``"global"`` is
    instantiated before any agent is built.

    Args:
        config: Experiment configuration. This function reads ``num_agents``,
            ``agent_type``, ``network`` and ``agent`` from it.

    Returns:
        A list with one agent per environment when the agent type is
        ``"a3c"`` and ``FLAGS.task`` is none of ``"matrix"``, ``"option"``
        or ``"play_option"``. In every other case, a single agent attached
        to the first environment.
    """
    step_counter = tf.Variable(
        0, dtype=tf.int32, name='global_step', trainable=False)
    environments = [
        _create_environment(config) for _ in range(config.num_agents)
    ]
    first_env = environments[0]
    action_size = first_env.action_space.n
    nb_states = first_env.nb_states

    if config.agent_type == "a3c":
        # The returned object is not read again in this function; the call
        # is kept because constructing the network presumably registers the
        # shared variables -- TODO confirm against the network class.
        global_network = config.network(
            "global", config, action_size, nb_states)

    # These tasks always run with exactly one agent, whatever the agent type.
    single_agent_tasks = ("matrix", "option", "play_option")
    if FLAGS.task not in single_agent_tasks and config.agent_type == "a3c":
        return [
            config.agent(env, index, step_counter, config, FLAGS.task)
            for index, env in enumerate(environments)
        ]

    return config.agent(first_env, 0, step_counter, config, FLAGS.task)
예제 #2
0
def train(config, env_processes, logdir):
    """Estimate tabular successor features and print their singular values.

    Starting from the identity matrix, every state's successor-feature row
    is repeatedly replaced by ``one_hot(state) + discount * sf[next_state]``
    where ``next_state`` comes from a uniformly sampled action. Sweeps over
    all states continue while the largest per-row L1 change seen in a sweep
    is greater than ``theta`` (1). The singular values of the resulting
    matrix are printed; nothing is returned.

    Args:
        config: Experiment configuration. ``logdir``, ``stage_logdir`` and
            ``network_optimizer`` are written to it; ``discount`` is read.
        env_processes: Unused in this function.
        logdir: Base log directory; ``<logdir>/tabular_sf`` is created.
    """
    tf.reset_default_graph()
    sess = tf.Session()
    stage_logdir = os.path.join(logdir, "tabular_sf")
    tf.gfile.MakeDirs(stage_logdir)
    with sess:
        with tf.device("/cpu:0"):
            with config.unlocked:
                config.logdir = logdir
                config.stage_logdir = stage_logdir
                # Replace the optimizer name with the tf.train attribute of
                # that name.
                config.network_optimizer = getattr(tf.train,
                                                   config.network_optimizer)
                env = _create_environment(config)
                action_size = env.action_space.n
                nb_states = env.nb_states

                # One-hot state features. Built once: the matrix does not
                # depend on the sweep or on the state being updated.
                state_features = np.identity(nb_states)
                sf = np.eye(nb_states)

                delta = np.inf
                theta = 1
                while theta < delta:
                    delta = 0.0
                    for state in range(nb_states):
                        action = np.random.choice(range(action_size))
                        # The reward is not needed for successor features.
                        next_state, _ = env.get_next_state_and_reward(
                            state, action)
                        updated_row = (state_features[state] +
                                       config.discount * sf[next_state])
                        # Measure the change before overwriting the row.
                        row_change = np.sum(np.abs(sf[state] - updated_row))
                        delta = max(delta, row_change)
                        sf[state] = updated_row

                _, singular_values, _ = np.linalg.svd(sf)
                print(singular_values)
예제 #3
0
def train(config, env_processes, logdir):
    """Build the global network and agents, then run each agent in a thread.

    Variables are either restored from ``<FLAGS.load_from>/models`` (when
    ``FLAGS.load_from`` is set) or freshly initialised. One thread per agent
    runs ``agent.play(sess, coord, saver)``. If ``FLAGS.show_training`` is
    set, the calling thread renders every environment while the agents are
    running. The function returns once the coordinator has joined all
    agent threads.

    Args:
        config: Experiment configuration. ``logdir`` and
            ``network_optimizer`` are written to it; ``num_agents``,
            ``network`` and ``agent`` are read.
        env_processes: Unused in this function.
        logdir: Log directory stored on ``config``.
    """
    tf.reset_default_graph()
    sess = tf.Session()
    with sess:
        with tf.device("/cpu:0"):
            with config.unlocked:
                config.logdir = logdir
                # Replace the optimizer name with the tf.train attribute of
                # that name.
                config.network_optimizer = getattr(tf.train,
                                                   config.network_optimizer)
                global_step = tf.Variable(0,
                                          dtype=tf.int32,
                                          name='global_step',
                                          trainable=False)
                envs = [
                    _create_environment(config)
                    for _ in range(config.num_agents)
                ]
                action_size = envs[0].action_space.n
                # Not read again here; the call is kept because building the
                # network presumably creates the shared variables -- TODO
                # confirm against the network class.
                global_network = config.network("global", config, action_size)
                agents = [
                    config.agent(env, index, global_step, config)
                    for index, env in enumerate(envs)
                ]

            saver = loader = utility.define_saver(
                exclude=(r'.*_temporary/.*', ))
            if FLAGS.load_from is not None:
                sess.run(tf.global_variables_initializer())
                # NOTE(review): if no checkpoint exists under this directory
                # `ckpt` is presumably None and the next line fails -- confirm.
                ckpt = tf.train.get_checkpoint_state(
                    os.path.join(FLAGS.load_from, "models"))
                print("Loading Model from {}".format(
                    ckpt.model_checkpoint_path))
                loader.restore(sess, ckpt.model_checkpoint_path)
                sess.run(tf.local_variables_initializer())
            else:
                sess.run([
                    tf.global_variables_initializer(),
                    tf.local_variables_initializer()
                ])

            coord = tf.train.Coordinator()

            agent_threads = []
            for agent in agents:
                # Hand the bound method to the thread directly. A
                # `lambda: agent.play(...)` would look `agent` up only when
                # the thread runs, by which time the loop may already have
                # moved on to another agent.
                thread = threading.Thread(target=agent.play,
                                          args=(sess, coord, saver))
                thread.start()
                agent_threads.append(thread)

            if FLAGS.show_training:
                # Render only while there is something to show, so that the
                # join below is reached once the agents stop.
                while (not coord.should_stop()
                       and any(t.is_alive() for t in agent_threads)):
                    for env in envs:
                        env.render()

            coord.join(agent_threads)
예제 #4
0
def train(config, env_processes, logdir):
    """Restore a trained model and run the option agent's heat-map plotting.

    The model is restored from ``<logdir>/sf_repres/models``. A single
    option agent is built with the values returned by ``get_direction`` for
    ``<logdir>/sf_matrix``, and its ``plot_heatmap`` method runs in a worker
    thread that is joined before returning.

    Args:
        config: Experiment configuration. ``logdir``, ``stage_logdir``,
            ``matrix_stage_logdir`` and ``network_optimizer`` are written to
            it; ``network`` and ``option_agent`` are read.
        env_processes: Unused in this function.
        logdir: Base log directory; ``<logdir>/plot_sf_policy`` is created.
    """
    tf.reset_default_graph()
    sess = tf.Session()

    previous_stage_logdir = os.path.join(logdir, "sf_repres")
    matrix_stage_logdir = os.path.join(logdir, "sf_matrix")
    stage_logdir = os.path.join(logdir, "plot_sf_policy")
    tf.gfile.MakeDirs(stage_logdir)

    with sess, tf.device("/cpu:0"):
        with config.unlocked:
            config.logdir = logdir
            config.stage_logdir = stage_logdir
            config.matrix_stage_logdir = matrix_stage_logdir
            # Presumably eigenvalues and eigenvectors of the stored matrix
            # -- TODO confirm against get_direction.
            eig_vals, eig_vecs = get_direction(config.matrix_stage_logdir)
            # Replace the optimizer name with the tf.train attribute of
            # that name.
            config.network_optimizer = getattr(
                tf.train, config.network_optimizer)
            step_counter = tf.Variable(
                0, dtype=tf.int32, name='global_step', trainable=False)
            env = _create_environment(config)
            action_size = env.action_space.n
            global_network = config.network(
                "global", config, action_size, 5)
            global_network.option = FLAGS.option
            agent = config.option_agent(
                env, 0, step_counter, config, FLAGS.option, eig_vals,
                eig_vecs, FLAGS.flip_eigen, 5)

        excluded = (r'.*_temporary/.*', )
        saver = utility.define_saver(exclude=excluded)
        loader = utility.define_saver(exclude=excluded)

        # Initialise everything first, then overwrite with checkpoint values.
        sess.run(tf.global_variables_initializer())
        models_dir = os.path.join(previous_stage_logdir, "models")
        ckpt = tf.train.get_checkpoint_state(models_dir)
        print("Loading Model from {}".format(ckpt.model_checkpoint_path))
        loader.restore(sess, ckpt.model_checkpoint_path)
        sess.run(tf.local_variables_initializer())

        coord = tf.train.Coordinator()

        def _plot_worker():
            # Thread entry point: delegate to the agent's plotting routine.
            return agent.plot_heatmap(sess, coord, saver)

        plot_thread = threading.Thread(target=_plot_worker)
        plot_thread.start()
        agent_threads = [plot_thread]

        coord.join(agent_threads)
def train(config, env_processes, logdir):
    """Train linear successor-feature agents, or build the SF matrix.

    When ``FLAGS.task`` is ``"matrix"`` a single agent is created and its
    ``build_matrix1`` method runs in one worker thread. Otherwise one agent
    per environment is created and each runs ``play`` in its own thread.
    Variables are restored from ``<FLAGS.load_from>/linear_sf/models`` when
    ``FLAGS.resume`` is set and freshly initialised otherwise. The function
    returns after the coordinator has joined all worker threads.

    Args:
        config: Experiment configuration. ``logdir``, ``stage_logdir`` and
            ``network_optimizer`` are written to it; ``num_agents``,
            ``network`` and ``linear_sf_agent`` are read.
        env_processes: Unused in this function.
        logdir: Base log directory; ``<logdir>/linear_sf`` is created.
    """
    tf.reset_default_graph()
    sess = tf.Session()
    stage_logdir = os.path.join(logdir, "linear_sf")
    tf.gfile.MakeDirs(stage_logdir)
    build_matrix = FLAGS.task == "matrix"
    with sess:
        with tf.device("/cpu:0"):
            with config.unlocked:
                config.logdir = logdir
                config.stage_logdir = stage_logdir
                # Replace the optimizer name with the tf.train attribute of
                # that name.
                config.network_optimizer = getattr(tf.train,
                                                   config.network_optimizer)
                global_step = tf.Variable(0,
                                          dtype=tf.int32,
                                          name='global_step',
                                          trainable=False)
                envs = [
                    _create_environment(config)
                    for _ in range(config.num_agents)
                ]
                action_size = envs[0].action_space.n
                nb_states = envs[0].nb_states
                # Not read again here; the call is kept because building the
                # network presumably creates the shared variables -- TODO
                # confirm against the network class.
                global_network = config.network("global", config, action_size,
                                                nb_states)
                if build_matrix:
                    agent = config.linear_sf_agent(envs[0], 0, global_step,
                                                   config)
                else:
                    agents = [
                        config.linear_sf_agent(env, index, global_step,
                                               config)
                        for index, env in enumerate(envs)
                    ]

            saver = loader = utility.define_saver(
                exclude=(r'.*_temporary/.*', ))
            if FLAGS.resume:
                sess.run(tf.global_variables_initializer())
                ckpt = tf.train.get_checkpoint_state(
                    os.path.join(FLAGS.load_from, "linear_sf", "models"))
                print("Loading Model from {}".format(
                    ckpt.model_checkpoint_path))
                loader.restore(sess, ckpt.model_checkpoint_path)
                sess.run(tf.local_variables_initializer())
            else:
                sess.run([
                    tf.global_variables_initializer(),
                    tf.local_variables_initializer()
                ])

            coord = tf.train.Coordinator()

            # Bound methods are passed to the threads directly. A
            # `lambda: agent.play(...)` created inside the loop would look
            # `agent` up only when the thread runs, by which time the loop
            # may already have moved on to another agent.
            if build_matrix:
                targets = [agent.build_matrix1]
            else:
                targets = [worker.play for worker in agents]

            agent_threads = []
            for target in targets:
                thread = threading.Thread(target=target,
                                          args=(sess, coord, saver))
                thread.start()
                agent_threads.append(thread)

            coord.join(agent_threads)