Example #1
    def __init__(self, game, thread_id, optimizer, global_step):
        self.name = "worker_" + str(thread_id)
        self.thread_id = thread_id
        self.model_path = os.path.join(FLAGS.checkpoint_dir, FLAGS.model_name)
        self.optimizer = optimizer
        self.global_episode = global_step
        self.increment_global_episode = self.global_episode.assign_add(1)
        self.episode_rewards = []

        # if not FLAGS.train:
        self.episode_optimal_rewards = []
        self.episodes_suboptimal_arms = []

        self.episode_lengths = []
        self.episode_mean_values = []
        self.summary_writer = tf.summary.FileWriter(os.path.join(FLAGS.summaries_dir, FLAGS.model_name) + "/worker_" + str(self.thread_id))
        self.summary = tf.Summary()

        if FLAGS.use_conv:
            self.local_AC = ConvNetwork(self.name, optimizer, self.global_episode)
        else:
            self.local_AC = ACNetwork(self.name, optimizer, self.global_episode)

        self.update_local_vars = update_target_graph('global', self.name)
        self.env = game
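
A scope-based update_target_graph helper is assumed by every example on this page but never shown. A minimal sketch of what such a helper typically looks like (TF1 style; it assumes the source and target scopes create their trainable variables in the same order) is:

import tensorflow as tf

def update_target_graph(from_scope, to_scope):
    """Build ops that copy each trainable variable in from_scope onto the
    variable created at the same position inside to_scope."""
    from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, from_scope)
    to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, to_scope)
    return [to_var.assign(from_var)
            for from_var, to_var in zip(from_vars, to_vars)]

Callers either run the returned list directly (sess.run(self.update_local_vars)) or wrap it with tf.group(...), as Example #16 does.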
Example #2
    def __init__(self, game, sess, thread_id, nb_actions, optimizer, global_step):
        self.name = "worker_" + str(thread_id)
        self.thread_id = thread_id
        self.model_path = FLAGS.checkpoint_dir
        self.trainer = optimizer
        self.global_episode = global_step
        self.increment_global_episode = self.global_episode.assign_add(1)
        self.episode_rewards = []
        self.episode_lengths = []
        self.episode_mean_values = []

        self.sess = sess
        self.graph = sess.graph
        # self.summary_writer = tf.summary.FileWriter(FLAGS.summaries_dir + "/worker_" + str(self.thread_id), self.graph)
        self.summary_writer = tf.summary.FileWriter(FLAGS.summaries_dir + "/worker_" + str(self.thread_id))
        self.summary = tf.Summary()

        if FLAGS.lstm:
            self.local_AC = ACNetworkLSTM(self.name, nb_actions, optimizer)
        else:
            self.local_AC = ACNetwork(self.name, nb_actions, optimizer)

        self.update_local_ops = update_target_graph('global', self.name)

        self.actions = np.zeros([nb_actions])
        self.env = game
Example #3
    def __init__(self, game, sess, thread_id, nb_actions, optimizer,
                 global_step):
        self.name = "worker_" + str(thread_id)
        self.thread_id = thread_id
        self.model_path = FLAGS.checkpoint_dir
        self.trainer = optimizer
        self.global_episode = global_step
        self.increment_global_episode = self.global_episode.assign_add(1)
        self.episode_rewards = []
        self.episode_lengths = []
        self.episode_mean_values = []

        self.sess = sess
        self.graph = sess.graph
        # self.summary_writer = tf.summary.FileWriter(FLAGS.summaries_dir + "/worker_" + str(self.thread_id), self.graph)
        self.summary_writer = tf.summary.FileWriter(FLAGS.summaries_dir +
                                                    "/worker_" +
                                                    str(self.thread_id))
        self.summary = tf.Summary()

        if FLAGS.lstm:
            self.local_AC = ACNetworkLSTM(self.name, nb_actions, optimizer)
        else:
            self.local_AC = ACNetwork(self.name, nb_actions, optimizer)

        self.update_local_ops = update_target_graph('global', self.name)

        self.actions = np.zeros([nb_actions])
        self.env = game
Example #4
    def __init__(self, game, optimizer, global_step):
        self.name = "policy_eval"
        self.global_episode = global_step
        self.local_AC = FUNNetwork(self.name, optimizer, self.global_episode)

        self.update_local_ops = update_target_graph('global', self.name)
        self.summary_writer = tf.summary.FileWriter(FLAGS.summaries_dir + "/policy_eval")
        self.env = game
Example #5
    def __init__(self, game, optimizer, global_step):
        self.name = "policy_eval"
        self.global_episode = global_step
        self.local_AC = FUNNetwork(self.name, optimizer, self.global_episode)

        self.update_local_ops = update_target_graph('global', self.name)
        self.summary_writer = tf.summary.FileWriter(FLAGS.summaries_dir +
                                                    "/policy_eval")
        self.env = game
Example #6
    def __init__(self, game, optimizer, global_step):
        self.name = "policy_eval"
        if FLAGS.use_conv:
            self.local_AC = ConvNetwork(self.name, optimizer, global_step)
        else:
            self.local_AC = ACNetwork(self.name, optimizer, global_step)
        self.update_local_ops = update_target_graph('global', self.name)
        self.summary_writer = tf.summary.FileWriter(FLAGS.summaries_dir + "/policy_eval")
        self.env = game
        # self.actions = np.zeros([nb_actions])
        self.global_episode = global_step
Example #7
    def __init__(self, game, nb_actions, optimizer, global_step):
        self.name = "policy_eval"
        if FLAGS.lstm:
            self.local_AC = ACNetworkLSTM(self.name, nb_actions, optimizer)
        else:
            self.local_AC = ACNetwork(self.name, nb_actions, optimizer)
        self.update_local_ops = update_target_graph('global', self.name)
        self.summary_writer = tf.summary.FileWriter(FLAGS.summaries_dir +
                                                    "/policy_eval")
        self.env = game
        self.actions = np.zeros([nb_actions])
        self.global_episode = global_step
Example #8
def init_network(input_shape, action_size, model):
    if model == 'nature':
        qnet = network(input_shape, action_size, 'qnet')
        tnet = network(input_shape, action_size, 'tnet')
        update_ops = update_target_graph('qnet', 'tnet')
    elif model == 'gated':
        sys.path.append('../prototype8/gated')
        sys.path.append('../prototype8/')
        from gated_regularized_qnetwork import gated_regularized_qnetwork_visual_input
        from utils import update_target_graph_vars
        qnet = gated_regularized_qnetwork_visual_input(input_shape,
                                                       action_size)
        tnet = None
        update_ops = update_target_graph_vars(qnet.qnet_vars, qnet.tnet_vars)
    return qnet, tnet, update_ops
Example #9
def init_model(input_shape, action_size, latent_size, learning_rate, model):
    if model == 'gan':
        jqnet = joint_qnetwork(input_shape, action_size, latent_size,
                               learning_rate)
        update_ops = update_target_graph('qnet', 'target_qnet')
    elif model == 'gated':
        from gated.joint_dqn_gated import joint_dqn_gated
        from utils import update_target_graph_vars
        jqnet = joint_dqn_gated(input_shape, action_size, learning_rate)
        update_ops = update_target_graph_vars(jqnet.qnet_vars, jqnet.tnet_vars)
    elif model == 'gated_reg':
        from gated_regularized_qnetwork import gated_regularized_qnetwork
        from utils import update_target_graph_vars
        jqnet = gated_regularized_qnetwork(input_shape, action_size, 256)
        update_ops = update_target_graph_vars(jqnet.qnet_vars, jqnet.tnet_vars)
    return jqnet, update_ops
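
Examples #8 and #9 additionally import update_target_graph_vars, which receives two explicit variable lists instead of scope names (Example #19 calls update_target_graph the same way, passing shared_network.vars and network.vars). A minimal sketch under that assumption:

def update_target_graph_vars(from_vars, to_vars):
    """Build ops that copy each variable in from_vars onto the variable at
    the same index in to_vars."""
    assert len(from_vars) == len(to_vars)
    return [to_var.assign(from_var)
            for from_var, to_var in zip(from_vars, to_vars)]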
Example #10
    def __init__(self, game, thread_id, optimizer, global_step):
        self.name = "agent_" + str(thread_id)
        self.thread_id = thread_id
        self.model_path = os.path.join(FLAGS.checkpoint_dir, FLAGS.model_name)
        self.optimizer = optimizer
        self.global_episode = global_step
        self.increment_global_episode = self.global_episode.assign_add(1)
        self.episode_rewards = []

        self.episode_lengths = []
        self.episode_mean_w_values = []
        self.episode_mean_m_values = []
        self.summary_writer = tf.summary.FileWriter(
            os.path.join(FLAGS.summaries_dir, FLAGS.model_name) + "/agent_" + str(self.thread_id))
        self.summary = tf.Summary()

        self.local_AC = FUNNetwork(self.name, optimizer, self.global_episode)

        self.update_local_vars = update_target_graph('global', self.name)
        self.env = game
Example #11
    def __init__(self, game, thread_id, optimizer, global_step):
        self.name = "agent_" + str(thread_id)
        self.thread_id = thread_id
        self.model_path = os.path.join(FLAGS.checkpoint_dir, FLAGS.model_name)
        self.optimizer = optimizer
        self.global_episode = global_step
        self.increment_global_episode = self.global_episode.assign_add(1)
        self.episode_rewards = []

        self.episode_lengths = []
        self.episode_mean_w_values = []
        self.episode_mean_m_values = []
        self.summary_writer = tf.summary.FileWriter(
            os.path.join(FLAGS.summaries_dir, FLAGS.model_name) + "/agent_" +
            str(self.thread_id))
        self.summary = tf.Summary()

        self.local_AC = FUNNetwork(self.name, optimizer, self.global_episode)

        self.update_local_vars = update_target_graph('global', self.name)
        self.env = game
Example #12
    def __init__(self, game, name, a_size, state_size, trainer, model_path,
                 global_epss, data_path, num_units, network):
        self.name = "worker_" + str(name)
        self.number = name
        self.folder = data_path + '/trains/train_' + str(self.number)
        self.model_path = model_path
        self.trainer = trainer
        self.global_epss = global_epss
        self.increment = self.global_epss.assign_add(1)
        self.network = network
        self.eps_rewards = []
        self.eps_mean_values = []

        self.summary_writer = tf.summary.FileWriter(self.folder)

        # Create the local copy of the network and the tensorflow op
        # to copy global parameters to local network
        self.local_AC = AC_Network(a_size, state_size, self.name, trainer,
                                   num_units, network)
        self.update_local_ops = ut.update_target_graph('global', self.name)
        self.env = game
Example #13
    def __init__(self,
                 lr,
                 s_size,
                 action_size,
                 h_size,
                 scope,
                 gamma,
                 copy_from_scope=None):
        self._s_size = s_size
        self._action_size = action_size
        self._h_size = h_size
        self._gamma = gamma
        self._regularization_param = 0.001

        # Implementing F(state)=action
        self.state_in = tf.placeholder(shape=[None, self._s_size],
                                       dtype=tf.float32)
        self.reward_holder = tf.placeholder(shape=[None], dtype=tf.float32)
        self.action_holder = tf.placeholder(shape=[None], dtype=tf.int32)

        self.action_distribution = self._construct_policy_model(scope)

        taken_action_probability = BrainPG.get_decision_probability(
            self.action_holder, self.action_distribution)

        loss = -tf.reduce_mean(
            tf.log(taken_action_probability) * self.reward_holder)
        self.optimize = tf.train.RMSPropOptimizer(
            learning_rate=lr).minimize(loss)

        # Initialize Variables
        BrainPG.sess.run(
            tf.variables_initializer(
                tf.get_collection(tf.GraphKeys.VARIABLES, scope)))

        self.saver = tf.train.Saver(
            tf.get_collection(tf.GraphKeys.VARIABLES, scope))

        if copy_from_scope is not None:
            BrainPG.sess.run(utils.update_target_graph(copy_from_scope, scope))
Example #14
    def __init__(self, game, thread_id, optimizer, global_step, settings):
        self.name = "agent_" + str(thread_id)
        self.thread_id = thread_id
        self.model_path = settings["checkpoint_dir"]
        self.settings = settings
        self.optimizer = optimizer
        self.global_episode = global_step
        self.increment_global_episode = self.global_episode.assign_add(1)
        self.episode_rewards = []

        # if not FLAGS.train:
        self.episode_regrets = []
        self.episodes_suboptimal_arms = []

        self.episode_lengths = []
        self.episode_mean_values = []
        self.summary_writer = tf.summary.FileWriter(settings["summaries_dir"] + "/agent_" + str(self.thread_id))
        self.summary = tf.Summary()

        self.local_AC = ACNetwork(self.name, optimizer, self.global_episode)
        self.update_local_vars = update_target_graph('global', self.name)
        self.env = game
Example #15
    def __init__(self, game, thread_id, optimizer, global_step, settings):
        self.name = "agent_" + str(thread_id)
        self.thread_id = thread_id
        self.model_path = settings["checkpoint_dir"]
        self.settings = settings
        self.optimizer = optimizer
        self.global_episode = global_step
        self.increment_global_episode = self.global_episode.assign_add(1)
        self.episode_rewards = []

        # if not FLAGS.train:
        self.episode_regrets = []
        self.episodes_suboptimal_arms = []

        self.episode_lengths = []
        self.episode_mean_values = []
        self.summary_writer = tf.summary.FileWriter(settings["summaries_dir"] +
                                                    "/agent_" +
                                                    str(self.thread_id))
        self.summary = tf.Summary()

        self.local_AC = ACNetwork(self.name, optimizer, self.global_episode)
        self.update_local_vars = update_target_graph('global', self.name)
        self.env = game
Example #16
    def __init__(self, game, name, s_size, a_size, optimizer=None, model_path=None, global_episodes=None, play=False):
        self.s_size = s_size
        self.a_size = a_size

        self.summary_step = 3

        self.name = "worker_" + str(name)
        self.number = name

        self.episode_reward = []
        self.episode_episode_health = []
        self.episode_lengths = []
        self.episode_mean_values = []
        self.episode_health = []
        self.episode_kills = []

        # Create the local copy of the network and the tensorflow op to
        # copy global parameters to local network
        if not play:
            self.model_path = model_path
            self.trainer = optimizer
            self.global_episodes = global_episodes
            self.increment = self.global_episodes.assign_add(1)
            self.local_AC_network = network.ACNetwork(self.name, optimizer, play=play)
            self.summary_writer = tf.summary.FileWriter("./summaries/defend_the_center/agent_%s" % str(self.number))
            self.update_local_ops = tf.group(*utils.update_target_graph('global', self.name))
        else:
            self.local_AC_network = network.ACNetwork(self.name, optimizer, play=play)
        if not isinstance(game, DoomGame):
            raise TypeError("Type Error")

        # The Below code is related to setting up the Doom environment
        game = DoomGame()
        # game.set_doom_scenario_path('../scenarios/deadly_corridor.cfg')
        game.load_config("../scenarios/defend_the_center.cfg")
        # game.set_doom_map("map01")
        game.set_screen_resolution(ScreenResolution.RES_640X480)
        game.set_screen_format(ScreenFormat.RGB24)
        game.set_render_hud(False)
        game.set_render_crosshair(False)
        game.set_render_weapon(True)
        game.set_render_decals(False)
        game.set_render_particles(False)
        # Enables labeling of the in game objects.
        game.set_labels_buffer_enabled(True)
        game.add_available_button(Button.TURN_LEFT)
        game.add_available_button(Button.TURN_RIGHT)
        game.add_available_button(Button.ATTACK)
        game.add_available_game_variable(GameVariable.USER1)
        game.set_episode_timeout(2100)
        game.set_episode_start_time(5)
        game.set_window_visible(play)
        game.set_sound_enabled(False)
        game.set_living_reward(0)
        game.set_mode(Mode.PLAYER)
        if play:
            # game.add_game_args("+viz_render_all 1")
            game.set_render_hud(False)
            game.set_ticrate(35)
        game.init()
        self.env = game
        self.actions = self.button_combinations()
Example #17
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--environment", type=str, default='CartPole-v0')
    parser.add_argument("--action-size", type=int, default=2)
    parser.add_argument("--input-shape", type=list, default=[None, 4])
    parser.add_argument("--target-update-freq", type=int, default=200)
    parser.add_argument("--epsilon-max", type=float, default=1.)
    parser.add_argument("--epsilon-min", type=float, default=.01)
    parser.add_argument("--epsilon-decay", type=float, default=.001)

    parser.add_argument("--learning-rate", type=float, default=.99)
    parser.add_argument("--batch-size", type=int, default=64)
    parser.add_argument("--epochs", type=int, default=300)

    parser.add_argument("--replay-mem-size", type=int, default=1000000)
    args = parser.parse_args()

    env = gym.make(args.environment)
    args.action_size = env.action_space.n
    args.input_shape = [None] + list(env.observation_space.shape)

    print args

    # Epsilon parameter
    epsilon = args.epsilon_max

    # Replay memory
    memory = Memory(args.replay_mem_size)

    # Time step
    time_step = 0.

    # Initialize the agent
    qnet = qnetwork(input_shape=args.input_shape,
                    action_size=args.action_size,
                    scope='qnet')
    tnet = qnetwork(input_shape=args.input_shape,
                    action_size=args.action_size,
                    scope='tnet')
    update_ops = update_target_graph('qnet', 'tnet')

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for epoch in range(args.epochs):
            total_reward = 0
            state = env.reset()
            while True:
                #env.render()
                if np.random.rand() < epsilon:
                    action = np.random.randint(args.action_size)
                else:
                    action = qnet.act(sess, state)
                next_state, reward, done, _ = env.step(action)
                total_reward += reward

                # Add to memory
                memory.add([state, action, reward, next_state, done])

                # Reduce epsilon
                time_step += 1.
                epsilon = args.epsilon_min + (
                    args.epsilon_max - args.epsilon_min) * np.exp(
                        -args.epsilon_decay * time_step)

                # Training step
                batch = np.array(memory.sample(args.batch_size))
                qnet.train(sess, batch, args.learning_rate, tnet)

                # s <- s'
                state = np.copy(next_state)

                # Update target network
                if int(time_step) % args.target_update_freq == 0:
                    sess.run(update_ops)

                if done:
                    print 'epoch:', epoch, 'total_rewards:', total_reward
                    break
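
The replay buffer used in Examples #17, #20 and #25 is only touched through Memory(max_size), memory.add(transition) and memory.sample(batch_size); its implementation is not part of this page. A minimal deque-based sketch that satisfies that interface could be:

import random
from collections import deque

class Memory(object):
    """Fixed-size replay buffer exposing the add/sample interface used above."""

    def __init__(self, max_size):
        self.buffer = deque(maxlen=max_size)

    def add(self, transition):
        # transition is [state, action, reward, next_state, done]
        self.buffer.append(transition)

    def sample(self, batch_size):
        # Sample without replacement, capped at the current buffer size.
        return random.sample(list(self.buffer), min(batch_size, len(self.buffer)))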
Example #18
    def build_model(self):
        self.inputs = tf.placeholder(
            shape=[None, 1],
            dtype=tf.float32,
        )
        self.labels = tf.placeholder(
            shape=[None, 1],
            dtype=tf.float32,
        )
        self.task_amplitude = tf.placeholder(
            shape=None,
            dtype=tf.float32,
        )
        self.ep = tf.Variable(0,
                              dtype=tf.int32,
                              name='episodes',
                              trainable=False)
        self.inc_ep = self.ep.assign_add(1)

        network_names = ["meta", "learner"]
        self.outputs = {}
        for name in network_names:
            with tf.variable_scope(name):
                dense_1 = tf.layers.dense(
                    inputs=self.inputs,
                    units=self.hidden_1,
                    activation=tf.nn.relu,
                    kernel_initializer=tf.truncated_normal_initializer(
                        .0, .01),
                    name="dense_1",
                )
                dense_2 = tf.layers.dense(
                    inputs=dense_1,
                    units=self.hidden_2,
                    activation=tf.nn.relu,
                    kernel_initializer=tf.truncated_normal_initializer(
                        .0, .01),
                    name="dense_2",
                )
                self.outputs[name] = tf.layers.dense(
                    inputs=dense_2,
                    units=1,
                    activation=None,
                    kernel_initializer=tf.truncated_normal_initializer(
                        .0, .01),
                    name="output",
                )

        self.loss = tf.losses.mean_squared_error(
            self.labels / self.task_amplitude,
            self.outputs["learner"] / self.task_amplitude)

        self.optimize = tf.train.AdamOptimizer(learning_rate=1e-2,
                                               beta1=self.beta1).minimize(
                                                   self.loss)
        self.fresh_optimize = tf.train.AdamOptimizer(
            learning_rate=1e-2).minimize(self.loss)

        local_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                       self.name)
        self.learner_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                              "{}/learner".format(self.name))
        self.meta_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                           "{}/meta".format(self.name))
        self.gradients = tf.gradients(self.loss, self.learner_vars)
        self.reptile_grad = [
            self.meta_vars[i] - learner_var
            for i, learner_var in enumerate(self.learner_vars)
        ]
        self.update_meta = self.meta_trainer.apply_gradients(
            zip(self.reptile_grad, self.meta_vars))
        self.copy_meta_to_learner = update_target_graph(
            "{}/meta".format(self.name), "{}/learner".format(self.name))
Example #19
    def init_network(self):

        input_shape = self.feedback_size + (self.num_frames, )
        worker_device = "/job:worker/task:{}/cpu:0".format(self.agent_index)

        with tf.device(
                tf.train.replica_device_setter(1,
                                               worker_device=worker_device)):
            with tf.variable_scope("global"):
                if self.use_lstm is False:
                    self.shared_network = FFPolicy(input_shape,
                                                   len(self.actions),
                                                   self.network_type)
                else:
                    self.shared_network = LSTMPolicy(input_shape,
                                                     len(self.actions),
                                                     self.network_type)

                self.global_step = tf.get_variable(
                    "global_step",
                    shape=[],
                    initializer=tf.constant_initializer(0, dtype=tf.int32),
                    trainable=False,
                    dtype=tf.int32)
                self.best_score = tf.get_variable(
                    "best_score",
                    shape=[],
                    initializer=tf.constant_initializer(-1e2,
                                                        dtype=tf.float32),
                    trainable=False,
                    dtype=tf.float32)

        with tf.device(worker_device):
            with tf.variable_scope('local'):
                if self.use_lstm is False:
                    self.network = FFPolicy(input_shape, len(self.actions),
                                            self.network_type)
                else:
                    self.network = LSTMPolicy(input_shape, len(self.actions),
                                              self.network_type)
                # Sync params
                self.update_local_ops = update_target_graph(
                    self.shared_network.vars, self.network.vars)
                # Learning rate
                self.lr = tf.get_variable(name='lr',
                                          shape=[],
                                          initializer=tf.constant_initializer(
                                              self.learning_rate),
                                          trainable=False,
                                          dtype=tf.float32)
                self.t_lr = tf.placeholder(dtype=tf.float32,
                                           shape=[],
                                           name='new_lr')
                self.assign_lr_op = tf.assign(self.lr, self.t_lr)
                # Best score
                self.t_score = tf.placeholder(dtype=tf.float32,
                                              shape=[],
                                              name='new_score')
                self.assign_best_score_op = tf.assign(self.best_score,
                                                      self.t_score)
                # Build gradient_op
                self.increase_step = self.global_step.assign_add(1)
                gradients = self.network.build_gradient_op(clip_grad=40.0)
                # Additional summaries
                tf.summary.scalar("learning_rate",
                                  self.lr,
                                  collections=['a3c'])
                tf.summary.scalar("score", self.t_score, collections=['a3c'])
                tf.summary.scalar("best_score",
                                  self.best_score,
                                  collections=['a3c'])
                self.summary_op = tf.summary.merge_all('a3c')

        if self.shared_optimizer:
            with tf.device(
                    tf.train.replica_device_setter(
                        1, worker_device=worker_device)):
                with tf.variable_scope("global"):
                    optimizer = create_optimizer(self.update_method, self.lr,
                                                 self.rho,
                                                 self.rmsprop_epsilon)
                    self.train_op = optimizer.apply_gradients(
                        zip(gradients, self.shared_network.vars))
        else:
            with tf.device(worker_device):
                with tf.variable_scope('local'):
                    optimizer = create_optimizer(self.update_method, self.lr,
                                                 self.rho,
                                                 self.rmsprop_epsilon)
                    self.train_op = optimizer.apply_gradients(
                        zip(gradients, self.shared_network.vars))
Example #20
def DQN():
    parser = argparse.ArgumentParser()
    parser.add_argument("--environment", type=str, default='CartPole-v0')
    parser.add_argument("--action-size", type=int, default=2)
    parser.add_argument("--input-shape", type=list, default=[None, 4])
    parser.add_argument("--target-update-freq", type=int, default=200)
    parser.add_argument("--epsilon-max", type=float, default=1.)
    parser.add_argument("--epsilon-min", type=float, default=.01)
    parser.add_argument("--epsilon-decay", type=float, default=.001)

    parser.add_argument("--discount-factor", type=float, default=.99)
    parser.add_argument("--batch-size", type=int, default=64)
    parser.add_argument("--epochs", type=int, default=1000)

    parser.add_argument("--replay-mem-size", type=int, default=1000000)
    args = parser.parse_args()

    env = Environment()
    args.action_size = env.nActions
    args.input_shape = [None, env.stateShape]

    print args

    # Epsilon parameter
    epsilon = 0.1  # args.epsilon_max

    # Replay memory
    memory = Memory(args.replay_mem_size)

    # Time step
    time_step = 0.

    # Initialize the agent
    qnet = qnetwork(input_shape=args.input_shape,
                    action_size=args.action_size,
                    scope='qnet')
    tnet = qnetwork(input_shape=args.input_shape,
                    action_size=args.action_size,
                    scope='tnet')
    update_ops = update_target_graph('qnet', 'tnet')

    rewardHistory = np.zeros(args.epochs)
    env.render()

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for epoch in range(args.epochs):
            total_reward = 0
            state = env.reset()
            while (True):
                #env.render()
                if np.random.rand() < epsilon:
                    action = np.random.randint(args.action_size)
                else:
                    action = qnet.act(sess, state)
                [next_state, reward, done] = env.step(action)
                total_reward += reward
                rewardHistory[epoch] += reward

                # Add to memory
                memory.add([state, action, reward, next_state, done])

                # Reduce epsilon
                time_step += 1.
                #epsilon = args.epsilon_min + (args.epsilon_max - args.epsilon_min) * np.exp(-args.epsilon_decay * time_step)

                # Training step
                batch = np.array(memory.sample(args.batch_size))
                qnet.train(sess, batch, args.discount_factor, tnet)

                # s <- s'
                state = np.copy(next_state)

                # Update target network
                if int(time_step) % args.target_update_freq == 0:
                    sess.run(update_ops)

                if done:
                    print 'epoch:', epoch, 'total_rewards:', total_reward
                    break
        '''
        np.set_printoptions(threshold=np.nan)
        for v in range(-5, 5):
            policy = np.zeros((env.W, env.W), dtype='int')
            for x in range(env.W):
                for y in range(env.W):
                    policy[x,y] = qnet.act(sess, np.array([x,y,1,v]))
            print(policy)
        '''
        plt.xlabel('episode #')
        plt.ylabel('reward')
        plt.plot(rewardHistory)
        plt.savefig("DQN")
        plt.show()

        for epoch in range(10):
            total_reward = 0
            state = env.reset()
            while (True):
                env.render()
                action = qnet.act(sess, state)
                [next_state, reward, done] = env.step(action)
                total_reward += reward
                rewardHistory[epoch] += reward

                # Reduce epsilon
                time_step += 1.
                # s <- s'
                state = np.copy(next_state)

                if done:
                    print 'epoch:', epoch, 'total_rewards:', total_reward
                    break
Example #21
    sess.run(tf.global_variables_initializer())
    sess.run(tf.local_variables_initializer())

    print('Objective1_Value: %.4f\t Objective2_Value: %.4f\t' %
          evaluate(sess, model, train_set))
    sys.stdout.flush()
    lr = 1
    start_time = time.time()
    last_auc = 0.0

    for epoch in range(100):
        random.shuffle(train_set)
        random.shuffle(test_set)
        epoch_size = round(len(train_set) / batch_size)
        loss_sum = 0.0
        for _, uij in DataInput(train_set, batch_size):
            loss = model.train(sess, uij, lr)
            loss_sum += loss
        print('Epoch %d Train_Loss: %.4f' %
              (model.global_epoch_step.eval(), loss_sum))
        print('Epoch %d DONE\tCost time: %.2f' %
              (model.global_epoch_step.eval(), time.time() - start_time))
        print('Objective1_Value: %.4f\t Objective2_Value: %.4f\t' %
              evaluate(sess, model, train_set))
        #print('Objective1_Value: %.4f\t Objective2_Value: %.4f\t' % evaluate(sess, model, test_set))
        sys.stdout.flush()
        model.global_epoch_step_op.eval()
        if epoch % 5 == 0:
            update_target_graph('primary_dqn', 'target_dqn')

end_time = time.time()
Example #22
    def __init__(self, q_network, ob_space, ac_space, lr, max_grad_norm,
                 units_per_hlayer, activ_fcn, log_interval, logdir, batch_size,
                 trace_length, tau, update_interval, keep_model):
        self.logger = logging.getLogger(self.__module__ + "." +
                                        self.__class__.__name__)
        self.logger.info("Set up DQN learning agent")
        self.num_steps_trained = 0
        self.log_interval = log_interval

        sess = make_session()  # TODO add CPU config information

        # nbatch = batch_size
        self.global_step = tf.get_variable('global_step', [],
                                           tf.int32,
                                           tf.constant_initializer(
                                               0, tf.int32),
                                           trainable=False)

        # Targets in loss computation
        QT = tf.placeholder(shape=[batch_size * trace_length],
                            dtype=tf.float32,
                            name='QT')  # target Q values
        A = tf.placeholder(shape=[batch_size * trace_length],
                           dtype=tf.int32,
                           name='A')  # action indices

        eval_model = q_network(sess,
                               ob_space,
                               ac_space.n,
                               nbatch=1,
                               trace_length=1,
                               units_per_hlayer=units_per_hlayer,
                               scope='model',
                               reuse=False,
                               activ_fcn=activ_fcn)
        train_model = q_network(sess,
                                ob_space,
                                ac_space.n,
                                nbatch=batch_size,
                                trace_length=trace_length,
                                units_per_hlayer=units_per_hlayer,
                                scope='model',
                                reuse=True,
                                activ_fcn=activ_fcn)
        # target_model = TargetNetwork(sess, ob_space, ac_space.n)
        target_model = q_network(sess,
                                 ob_space,
                                 ac_space.n,
                                 nbatch=batch_size,
                                 trace_length=trace_length,
                                 units_per_hlayer=units_per_hlayer,
                                 scope='target',
                                 reuse=False,
                                 activ_fcn=activ_fcn)

        # Obtain loss by taking the mean of squares difference between the target and prediction Q values.
        actions_onehot = tf.one_hot(A, depth=ac_space.n, dtype=tf.float32)
        td_error = tf.losses.mean_squared_error(
            labels=QT,
            predictions=tf.squeeze(
                tf.matmul(tf.multiply(train_model.predQ, actions_onehot),
                          [[1.], [1.]])))
        loss = td_error
        params = tf.trainable_variables(
        )  # was set to 'model', but we would need model and target parameters
        optimizer = tf.train.AdamOptimizer(lr)
        gradients = optimizer.compute_gradients(loss)
        grads, variables = zip(*gradients)
        if max_grad_norm is not None:
            grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        _train = [
            optimizer.apply_gradients(grads),
            self.global_step.assign_add(update_interval)
        ]  # nbatch

        if log_interval > 0:
            for g, v in grads:
                if g is not None:
                    tf.summary.histogram(
                        "train/grads/%s-grad" % v.name.replace(':', '_'), g)
            for p in params:
                if p is not None:
                    tf.summary.histogram(
                        "train/params/%s" % p.name.replace(':', '_'),
                        p.value())
            tf.summary.scalar("train/vf_loss", loss)
            tf.summary.histogram("others/A", A)
            tf.summary.histogram("others/QT", QT)
            self.summary_step = tf.summary.merge_all()

        tf.add_to_collection('inputs', eval_model.X)
        tf.add_to_collection('predQ', eval_model.predQ)
        if eval_model.initial_state is not None:
            add_to_collection_rnn_state('state_in', eval_model.rnn_state_in)
            add_to_collection_rnn_state('state_out', eval_model.rnn_state_out)
        # tf.add_to_collection('step', eval_model.step)

        tf.global_variables_initializer().run(session=sess)

        def train(obs, actions, targets, states):
            """
            Updates the weights of the neural network, based on its targets, its
            predictions, its loss and its optimizer.

            Args:
                sess: TensorFlow session.
                obs: [current_observation] or observations of batch
                actions: [current_action] or actions of batch
                targets: [current_target] or targets of batch
            """
            feed_dict = {train_model.X: obs, A: actions, QT: targets}
            if states is not None:
                feed_dict[train_model.rnn_state_in] = states
            # evaluate the TF tensors and operations self.loss and self.train_step
            total_loss, _, global_step = sess.run(
                [loss, _train, self.global_step], feed_dict=feed_dict)
            if log_interval > 0 and (self.num_steps_trained % self.log_interval
                                     == 0):
                self.logger.info(
                    'Save summary of network weights, grads and losses.')
                summary_str = sess.run(self.summary_step, feed_dict)
                self.summary_writer.add_summary(
                    tf.Summary.FromString(summary_str), global_step)
            self.num_steps_trained += 1

            return total_loss

        saver = tf.train.Saver(max_to_keep=keep_model)

        def update_target(target_op_holder):
            for op in target_op_holder:
                sess.run(op)
            a = tf.trainable_variables()[0].eval(session=sess)
            b = tf.trainable_variables()[len(params) // 2].eval(session=sess)
            if not a.all() == b.all():
                print("Target Set Failed")

        def save(f_name):
            gs = sess.run(self.global_step)
            self.logger.info(
                'Save network parameters of model at global step %s' % gs)
            saver.save(sess, os.path.join(logdir, f_name), global_step=gs)

        def load(load_path):
            saver.restore(sess, load_path)

        def test_run(env, n_eps, n_pipes):
            self.logger.info('Evaluating current agent')
            ep_return = []
            ep_length = []
            for i in range(0, n_eps):  # TODO parallelize this here!
                obs = env.reset()
                obs = normalize_obs(obs)
                done = False
                if eval_model.initial_state is not None:
                    if len(eval_model.initial_state) > 1:
                        rnn_s_in = (np.zeros(
                            eval_model.initial_state[0].shape),
                                    np.zeros(eval_model.initial_state[1].shape)
                                    )  # init lstm cell vector
                    else:
                        rnn_s_in = np.zeros(eval_model.initial_state.shape
                                            )  # init gru cell vector
                total_return = 0
                total_length = 0

                while not done and (total_return < n_pipes):
                    if eval_model.initial_state is not None:
                        pQ, rnn_s_out = sess.run(
                            [eval_model.predQ, eval_model.rnn_state_out],
                            feed_dict={
                                eval_model.X: [obs],
                                eval_model.rnn_state_in: rnn_s_in
                            })
                    else:
                        pQ = sess.run([eval_model.predQ],
                                      feed_dict={eval_model.X: [obs]})
                    ac = np.argmax(pQ)
                    obs, reward, done, _ = env.step(ac)
                    obs = normalize_obs(obs)
                    total_length += 1
                    total_return += reward
                    if eval_model.initial_state is not None:
                        rnn_s_in = rnn_s_out
                self.logger.info('Episode %s: %s, %s' %
                                 (i, total_return, total_length))
                ep_length.append(total_length)
                ep_return.append(total_return)
            return ep_return

        self.train = train
        self.train_model = train_model
        self.step_model = eval_model
        self.target_model = target_model
        self.target_ops = update_target_graph(
            params, tau)  # TODO implement update_target_graph

        self.update_target = update_target

        self.step = eval_model.step
        self.predict = eval_model.predict
        self.step_initial_state = eval_model.initial_state
        self.train_initial_state = train_model.initial_state
        self.save = save
        self.load = load
        self.test_run = test_run
        self.sess = sess

        if log_interval > 0:
            self.summary_writer = tf.summary.FileWriter(
                logdir, graph_def=sess.graph_def)
        else:
            self.summary_writer = None
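
Example #22 is the only listing that passes a blend factor tau instead of a second scope name, and its own comment marks update_target_graph as still to be implemented. Given the surrounding code (the target network occupies the second half of tf.trainable_variables(), and update_target runs the returned ops one by one), a plausible soft-update sketch is:

def update_target_graph(tf_vars, tau):
    """Build ops that move each target variable (second half of tf_vars) a
    fraction tau of the way towards its online counterpart."""
    total_vars = len(tf_vars)
    op_holder = []
    for idx, var in enumerate(tf_vars[:total_vars // 2]):
        target_var = tf_vars[idx + total_vars // 2]
        op_holder.append(
            target_var.assign(tau * var.value() + (1. - tau) * target_var.value()))
    return op_holder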
Example #23
    def __init__(self,
                 game,
                 name,
                 optimizer=None,
                 model_path=None,
                 global_episodes=None,
                 play=False,
                 task_name='healthpack_simple'):
        self.task_name = task_name
        self.play = play
        self.summary_step = 3

        self.name = cfg.AGENT_PREFIX + str(name)
        self.number = name

        self.imitate_data = None

        self.last_total_health = 100.
        self.last_total_kills = 0.
        self.last_total_ammos = 0.
        self.img_shape = cfg.IMG_SHAPE

        self.episode_reward = []
        self.episode_lengths = []
        self.episode_mean_values = []
        self.episode_health = []
        self.episode_kills = []

        if not self.play:
            self.model_path = model_path
            self.trainer = optimizer
            self.global_episodes = global_episodes
            self.increment = self.global_episodes.assign_add(1)
            self.local_AC_network = network.ACNetwork(self.name,
                                                      optimizer,
                                                      play=self.play,
                                                      img_shape=cfg.IMG_SHAPE)
            self.summary_writer = tf.summary.FileWriter(
                "./summaries/%s/ag_%s" % (self.task_name, str(self.number)))
            # create a tensorflow op to copy weights from global network regularly when training
            self.update_local_ops = tf.group(
                *utils.update_target_graph('global', self.name))
        else:
            self.local_AC_network = network.ACNetwork(self.name,
                                                      optimizer,
                                                      play=self.play,
                                                      img_shape=cfg.IMG_SHAPE)
        if not isinstance(game, DoomGame):
            raise TypeError("Type Error")

        game = DoomGame()
        game.load_config(cfg.SCENARIO_PATH)
        game.set_doom_map("map01")
        game.set_screen_resolution(ScreenResolution.RES_640X480)
        game.set_screen_format(ScreenFormat.RGB24)
        game.set_render_hud(False)
        game.set_render_crosshair(False)
        game.set_render_weapon(True)
        game.set_render_decals(False)
        game.set_render_particles(True)
        # Enables labeling of the in game objects.
        game.set_labels_buffer_enabled(True)
        game.add_available_button(Button.MOVE_FORWARD)
        game.add_available_button(Button.MOVE_RIGHT)
        game.add_available_button(Button.MOVE_LEFT)
        game.add_available_button(Button.TURN_LEFT)
        game.add_available_button(Button.TURN_RIGHT)
        game.add_available_button(Button.ATTACK)
        game.add_available_button(Button.SPEED)
        game.add_available_game_variable(GameVariable.AMMO2)
        game.add_available_game_variable(GameVariable.HEALTH)
        game.add_available_game_variable(GameVariable.USER2)
        game.set_episode_timeout(2100)
        game.set_episode_start_time(5)
        game.set_window_visible(self.play)
        game.set_sound_enabled(False)
        game.set_living_reward(0)
        game.set_mode(Mode.PLAYER)
        if self.play:
            game.add_game_args("+viz_render_all 1")
            game.set_render_hud(False)
            game.set_ticrate(35)
        game.init()
        self.env = game
        self.actions = cfg.button_combinations()
Example #24
    def __init__(self,
                 game,
                 name,
                 optimizer=None,
                 model_path=None,
                 global_episodes=None,
                 play=False,
                 task_name='healthpack_simple'):
        self.task_name = task_name

        self.summary_step = 3

        self.name = "worker_" + str(name)
        self.number = name

        self.last_total_health = 100.
        self.img_shape = cfg.IMG_SHAPE

        self.episode_reward = []
        self.episode_episode_total_pickes = []
        self.episode_lengths = []
        self.episode_mean_values = []
        self.episode_health = []

        # Create the local copy of the network and the tensorflow op to
        # copy global parameters to local network
        if not play:
            self.model_path = model_path
            self.trainer = optimizer
            self.global_episodes = global_episodes
            self.increment = self.global_episodes.assign_add(1)
            self.local_AC_network = network.ACNetwork(self.name,
                                                      optimizer,
                                                      play=play,
                                                      img_shape=cfg.IMG_SHAPE)
            self.summary_writer = tf.summary.FileWriter(
                "./summaries/healthpack/train_health%s" % str(self.number))
            self.update_local_ops = tf.group(
                *utils.update_target_graph(self.task_name +
                                           '/global', self.task_name + '/' +
                                           self.name))
        else:
            self.local_AC_network = network.ACNetwork(self.name,
                                                      optimizer,
                                                      play=play,
                                                      img_shape=cfg.IMG_SHAPE)
        if not isinstance(game, DoomGame):
            raise TypeError("Type Error")

        # The Below code is related to setting up the Doom environment
        game = DoomGame()
        game.set_doom_scenario_path("../scenarios/{}".format(
            'health_gathering_supreme.wad' if cfg.
            IS_SUPREME_VERSION else 'health_gathering.wad'))

        game.set_doom_map("map01")
        game.set_screen_resolution(ScreenResolution.RES_640X480)
        game.set_screen_format(ScreenFormat.RGB24)
        game.set_render_hud(False)
        game.set_render_crosshair(False)
        game.set_render_weapon(True)
        game.set_render_decals(False)
        game.set_render_particles(True)
        # Enables labeling of the in game objects.
        game.set_labels_buffer_enabled(True)
        game.add_available_button(Button.TURN_LEFT)
        game.add_available_button(Button.TURN_RIGHT)
        game.add_available_button(Button.MOVE_FORWARD)
        game.add_available_game_variable(GameVariable.USER1)
        game.set_episode_timeout(2100)
        game.set_episode_start_time(5)
        game.set_window_visible(play)
        game.set_sound_enabled(False)
        game.set_living_reward(0)
        game.set_mode(Mode.PLAYER)
        if play:
            game.add_game_args("+viz_render_all 1")
            game.set_render_hud(False)
            game.set_ticrate(35)
        game.init()
        self.env = game
        self.actions = [
            list(perm)
            for perm in iter.product([False, True],
                                     repeat=game.get_available_buttons_size())
        ]
        self.actions.remove([True, True, True])
        self.actions.remove([True, True, False])
Example #25
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--env-interface", type=str, default='gym!atari')
    parser.add_argument("--environment", type=str, default='CartPole-v0')
    parser.add_argument("--action-size", type=int, default=2)
    parser.add_argument("--input-shape", type=list, default=[None, 4])
    parser.add_argument("--target-update-freq", type=int, default=200)
    parser.add_argument("--epsilon-max", type=float, default=1.)
    parser.add_argument("--epsilon-min", type=float, default=.01)
    parser.add_argument("--epsilon-decay", type=float, default=.001)

    parser.add_argument("--learning-rate", type=float, default=.99)
    parser.add_argument("--batch-size", type=int, default=64)
    parser.add_argument("--epochs", type=int, default=30000)

    parser.add_argument("--replay-mem-size", type=int, default=1000000)

    parser.add_argument("--K",
                        type=int,
                        default=1,
                        help='The number of steps to train the environment')
    parser.add_argument(
        "--L",
        type=int,
        default=1,
        help='The number of Q-learning steps for hypothetical rollouts')
    parser.add_argument("--latent-size",
                        type=int,
                        default=4,
                        help='Size of vector for Z')

    args = parser.parse_args()

    env = env_interface(args.env_interface,
                        args.environment,
                        pixel_feature=False,
                        render=True)

    #args.action_size = env.action_space.n
    args.action_size = env.action_size
    args.input_shape = [None] + list(env.obs_space_shape)

    print args

    # Other parameters
    epsilon = args.epsilon_max

    # Replay memory
    memory = Memory(args.replay_mem_size)

    # Time step
    time_step = 0.

    # Initialize the GANs
    cgan_state = CGAN(input_shape=args.input_shape,
                      action_size=args.action_size,
                      latent_size=args.latent_size,
                      gen_input_shape=args.input_shape)
    cgan_reward = CGAN(input_shape=args.input_shape,
                       action_size=args.action_size,
                       latent_size=args.latent_size,
                       gen_input_shape=[None, 1])

    qnet = qnetwork(input_shape=args.input_shape,
                    action_size=args.action_size,
                    scope='qnet')
    target_qnet = qnetwork(input_shape=args.input_shape,
                           action_size=args.action_size,
                           scope='target_qnet')
    update_ops = update_target_graph('qnet', 'target_qnet')

    rand_no = np.random.rand()
    #env = gym.wrappers.Monitor(env, '/tmp/cartpole-experiment-' + str(rand_no), force=True, video_callable=False)
    init = tf.initialize_all_variables()
    with tf.Session() as sess:
        sess.run(init)
        for epoch in range(args.epochs):
            total_reward = 0
            observation = env.reset()
            for t in range(1000000):
                #env.render()
                action = qnet.get_action(sess, observation)
                if np.random.rand() < epsilon:
                    #action = env.action_space.sample()
                    action = np.random.randint(args.action_size)
                observation1, reward, done, info = env.step(action)
                total_reward += reward

                # Add to memory
                memory.add([observation, action, reward, observation1, done])

                # Reduce epsilon
                time_step += 1.
                epsilon = args.epsilon_min + (
                    args.epsilon_max - args.epsilon_min) * np.exp(
                        -args.epsilon_decay * time_step)

                # Training step
                batch = np.array(memory.sample(args.batch_size))
                qnet.train(sess, batch, args.learning_rate, target_qnet)

                # Training step: environment model
                for k in range(args.K):
                    batch = np.array(memory.sample(args.batch_size))

                    states = np.vstack(batch[:, 0])
                    actions = np.array(batch[:, 1])
                    rewards = batch[:, 2]
                    states1 = np.vstack(batch[:, 3])

                    _, D_loss_state = sess.run(
                        [cgan_state.D_solver, cgan_state.D_loss],
                        feed_dict={
                            cgan_state.states: states,
                            cgan_state.actions: actions,
                            cgan_state.Z: sample_z(len(batch),
                                                   args.latent_size),
                            cgan_state.X: states1
                        })
                    _, G_loss_state = sess.run(
                        [cgan_state.G_solver, cgan_state.G_loss],
                        feed_dict={
                            cgan_state.states: states,
                            cgan_state.actions: actions,
                            cgan_state.Z: sample_z(len(batch),
                                                   args.latent_size)
                        })

                    _, D_loss_reward = sess.run(
                        [cgan_reward.D_solver, cgan_reward.D_loss],
                        feed_dict={
                            cgan_reward.states: states,
                            cgan_reward.actions: actions,
                            cgan_reward.Z: sample_z(len(batch),
                                                    args.latent_size),
                            cgan_reward.X: rewards[..., np.newaxis]
                        })
                    _, G_loss_reward = sess.run(
                        [cgan_reward.G_solver, cgan_reward.G_loss],
                        feed_dict={
                            cgan_reward.states: states,
                            cgan_reward.actions: actions,
                            cgan_reward.Z: sample_z(len(batch),
                                                    args.latent_size)
                        })
                    #print D_loss_state, G_loss_state, D_loss_reward, G_loss_state

                # Training step: imagination rollouts
                if time_step == 0.:
                    print "time_step 0 here"
                if time_step >= 0.:
                    for l in range(args.L):
                        batch = np.array(memory.sample(args.batch_size))
                        assert len(batch) > 0

                        states1 = np.vstack(batch[:, 3])
                        actions = np.random.randint(args.action_size,
                                                    size=len(batch))
                        dones = np.array([False] * len(batch))

                        G_sample_state = sess.run(cgan_state.G_sample,
                                                  feed_dict={
                                                      cgan_state.states:
                                                      states1,
                                                      cgan_state.actions:
                                                      actions,
                                                      cgan_state.Z:
                                                      sample_z(
                                                          len(batch),
                                                          args.latent_size)
                                                  })
                        G_sample_reward = sess.run(cgan_reward.G_sample,
                                                   feed_dict={
                                                       cgan_reward.states:
                                                       states1,
                                                       cgan_reward.actions:
                                                       actions,
                                                       cgan_reward.Z:
                                                       sample_z(
                                                           len(batch),
                                                           args.latent_size)
                                                   })
                        qnet.train(sess, None, args.learning_rate, target_qnet,
                                   states1, actions, G_sample_reward,
                                   G_sample_state, dones)

                # Set observation
                observation = observation1

                # Update?
                if int(time_step) % args.target_update_freq == 0:
                    #print "Updating target..."
                    sess.run(update_ops)

                if done:
                    print "Episode finished after {} timesteps".format(
                        t + 1), 'epoch', epoch, 'total_rewards', total_reward
                    break
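
Example #25 also depends on a sample_z(batch_size, latent_size) helper for the CGAN noise input. A common sketch, assuming uniform noise in [-1, 1] (a convention this page does not confirm):

import numpy as np

def sample_z(batch_size, latent_size):
    """Draw a [batch_size, latent_size] noise matrix for the generator input."""
    return np.random.uniform(-1., 1., size=[batch_size, latent_size])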