def __init__(self, game, thread_id, optimizer, global_step):
    self.name = "worker_" + str(thread_id)
    self.thread_id = thread_id
    self.model_path = os.path.join(FLAGS.checkpoint_dir, FLAGS.model_name)
    self.optimizer = optimizer
    self.global_episode = global_step
    self.increment_global_episode = self.global_episode.assign_add(1)
    self.episode_rewards = []
    # if not FLAGS.train:
    self.episode_optimal_rewards = []
    self.episodes_suboptimal_arms = []
    self.episode_lengths = []
    self.episode_mean_values = []
    self.summary_writer = tf.summary.FileWriter(
        os.path.join(FLAGS.summaries_dir, FLAGS.model_name) + "/worker_" + str(self.thread_id))
    self.summary = tf.Summary()
    if FLAGS.use_conv:
        self.local_AC = ConvNetwork(self.name, optimizer, self.global_episode)
    else:
        self.local_AC = ACNetwork(self.name, optimizer, self.global_episode)
    self.update_local_vars = update_target_graph('global', self.name)
    self.env = game

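# Every snippet in this collection relies on an `update_target_graph` helper
# that copies the trainable variables of one variable scope into another
# (global network -> local worker, or online Q-network -> target network).
# A minimal sketch, assuming TF 1.x and that both scopes hold variables in
# matching order; the exact helper in each repo may differ:
import tensorflow as tf

def update_target_graph(from_scope, to_scope):
    # Collect the trainable variables of both scopes, sorted by name so the
    # pairing is deterministic.
    from_vars = sorted(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, from_scope),
                       key=lambda v: v.name)
    to_vars = sorted(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, to_scope),
                     key=lambda v: v.name)
    # Return one assign op per variable; sess.run the list to sync the copies.
    return [to_var.assign(from_var) for from_var, to_var in zip(from_vars, to_vars)]
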
def __init__(self, game, sess, thread_id, nb_actions, optimizer, global_step):
    self.name = "worker_" + str(thread_id)
    self.thread_id = thread_id
    self.model_path = FLAGS.checkpoint_dir
    self.trainer = optimizer
    self.global_episode = global_step
    self.increment_global_episode = self.global_episode.assign_add(1)
    self.episode_rewards = []
    self.episode_lengths = []
    self.episode_mean_values = []
    self.sess = sess
    self.graph = sess.graph
    # self.summary_writer = tf.summary.FileWriter(FLAGS.summaries_dir + "/worker_" + str(self.thread_id), self.graph)
    self.summary_writer = tf.summary.FileWriter(FLAGS.summaries_dir + "/worker_" + str(self.thread_id))
    self.summary = tf.Summary()
    if FLAGS.lstm:
        self.local_AC = ACNetworkLSTM(self.name, nb_actions, optimizer)
    else:
        self.local_AC = ACNetwork(self.name, nb_actions, optimizer)
    self.update_local_ops = update_target_graph('global', self.name)
    self.actions = np.zeros([nb_actions])
    self.env = game

def __init__(self, game, optimizer, global_step):
    self.name = "policy_eval"
    self.global_episode = global_step
    self.local_AC = FUNNetwork(self.name, optimizer, self.global_episode)
    self.update_local_ops = update_target_graph('global', self.name)
    self.summary_writer = tf.summary.FileWriter(FLAGS.summaries_dir + "/policy_eval")
    self.env = game

def __init__(self, game, optimizer, global_step):
    self.name = "policy_eval"
    if FLAGS.use_conv:
        self.local_AC = ConvNetwork(self.name, optimizer, global_step)
    else:
        self.local_AC = ACNetwork(self.name, optimizer, global_step)
    self.update_local_ops = update_target_graph('global', self.name)
    self.summary_writer = tf.summary.FileWriter(FLAGS.summaries_dir + "/policy_eval")
    self.env = game
    # self.actions = np.zeros([nb_actions])
    self.global_episode = global_step

def __init__(self, game, nb_actions, optimizer, global_step):
    self.name = "policy_eval"
    if FLAGS.lstm:
        self.local_AC = ACNetworkLSTM(self.name, nb_actions, optimizer)
    else:
        self.local_AC = ACNetwork(self.name, nb_actions, optimizer)
    self.update_local_ops = update_target_graph('global', self.name)
    self.summary_writer = tf.summary.FileWriter(FLAGS.summaries_dir + "/policy_eval")
    self.env = game
    self.actions = np.zeros([nb_actions])
    self.global_episode = global_step

def init_network(input_shape, action_size, model):
    if model == 'nature':
        qnet = network(input_shape, action_size, 'qnet')
        tnet = network(input_shape, action_size, 'tnet')
        update_ops = update_target_graph('qnet', 'tnet')
    elif model == 'gated':
        sys.path.append('../prototype8/gated')
        sys.path.append('../prototype8/')
        from gated_regularized_qnetwork import gated_regularized_qnetwork_visual_input
        from utils import update_target_graph_vars
        qnet = gated_regularized_qnetwork_visual_input(input_shape, action_size)
        tnet = None
        update_ops = update_target_graph_vars(qnet.qnet_vars, qnet.tnet_vars)
    return qnet, tnet, update_ops

def init_model(input_shape, action_size, latent_size, learning_rate, model):
    if model == 'gan':
        jqnet = joint_qnetwork(input_shape, action_size, latent_size, learning_rate)
        update_ops = update_target_graph('qnet', 'target_qnet')
    elif model == 'gated':
        from gated.joint_dqn_gated import joint_dqn_gated
        from utils import update_target_graph_vars
        jqnet = joint_dqn_gated(input_shape, action_size, learning_rate)
        update_ops = update_target_graph_vars(jqnet.qnet_vars, jqnet.tnet_vars)
    elif model == 'gated_reg':
        from gated_regularized_qnetwork import gated_regularized_qnetwork
        from utils import update_target_graph_vars
        jqnet = gated_regularized_qnetwork(input_shape, action_size, 256)
        update_ops = update_target_graph_vars(jqnet.qnet_vars, jqnet.tnet_vars)
    return jqnet, update_ops

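# The gated variants above pass explicit variable lists instead of scope
# names. A minimal sketch of such an `update_target_graph_vars` helper,
# assuming the two lists are already aligned index-by-index:
def update_target_graph_vars(from_vars, to_vars):
    # One assign op per (source, target) pair; callers run the returned list.
    return [to_var.assign(from_var) for from_var, to_var in zip(from_vars, to_vars)]
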
def __init__(self, game, thread_id, optimizer, global_step):
    self.name = "agent_" + str(thread_id)
    self.thread_id = thread_id
    self.model_path = os.path.join(FLAGS.checkpoint_dir, FLAGS.model_name)
    self.optimizer = optimizer
    self.global_episode = global_step
    self.increment_global_episode = self.global_episode.assign_add(1)
    self.episode_rewards = []
    self.episode_lengths = []
    self.episode_mean_w_values = []
    self.episode_mean_m_values = []
    self.summary_writer = tf.summary.FileWriter(
        os.path.join(FLAGS.summaries_dir, FLAGS.model_name) + "/agent_" + str(self.thread_id))
    self.summary = tf.Summary()
    self.local_AC = FUNNetwork(self.name, optimizer, self.global_episode)
    self.update_local_vars = update_target_graph('global', self.name)
    self.env = game

def __init__(self, game, name, a_size, state_size, trainer, model_path,
             global_epss, data_path, num_units, network):
    self.name = "worker_" + str(name)
    self.number = name
    self.folder = data_path + '/trains/train_' + str(self.number)
    self.model_path = model_path
    self.trainer = trainer
    self.global_epss = global_epss
    self.increment = self.global_epss.assign_add(1)
    self.network = network
    self.eps_rewards = []
    self.eps_mean_values = []
    self.summary_writer = tf.summary.FileWriter(self.folder)
    # Create the local copy of the network and the tensorflow op
    # to copy global parameters to local network
    self.local_AC = AC_Network(a_size, state_size, self.name, trainer, num_units, network)
    self.update_local_ops = ut.update_target_graph('global', self.name)
    self.env = game

def __init__(self, lr, s_size, action_size, h_size, scope, gamma, copy_from_scope=None):
    self._s_size = s_size
    self._action_size = action_size
    self._h_size = h_size
    self._gamma = gamma
    self._regularization_param = 0.001
    # Implementing F(state) = action
    self.state_in = tf.placeholder(shape=[None, self._s_size], dtype=tf.float32)
    self.reward_holder = tf.placeholder(shape=[None], dtype=tf.float32)
    self.action_holder = tf.placeholder(shape=[None], dtype=tf.int32)
    self.action_distribution = self._construct_policy_model(scope)
    taken_action_probability = BrainPG.get_decision_probability(
        self.action_holder, self.action_distribution)
    # REINFORCE loss: negative log-probability of the taken actions,
    # weighted by the rewards.
    loss = -tf.reduce_mean(tf.log(taken_action_probability) * self.reward_holder)
    self.optimize = tf.train.RMSPropOptimizer(learning_rate=lr).minimize(loss)
    # Initialize variables
    BrainPG.sess.run(tf.variables_initializer(
        tf.get_collection(tf.GraphKeys.VARIABLES, scope)))
    self.saver = tf.train.Saver(tf.get_collection(tf.GraphKeys.VARIABLES, scope))
    if copy_from_scope is not None:
        BrainPG.sess.run(utils.update_target_graph(copy_from_scope, scope))

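# `BrainPG.get_decision_probability` is referenced above but not shown. A
# hypothetical sketch of what such a helper could look like (in the source it
# appears to be a static method on BrainPG): select, per sample, the
# probability the policy assigned to the action actually taken.
import tensorflow as tf

def get_decision_probability(action_holder, action_distribution):
    # action_distribution: [batch, n_actions] softmax output;
    # action_holder: [batch] int32 indices of the taken actions.
    indices = tf.stack([tf.range(tf.shape(action_holder)[0]), action_holder], axis=1)
    return tf.gather_nd(action_distribution, indices)
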
def __init__(self, game, thread_id, optimizer, global_step, settings):
    self.name = "agent_" + str(thread_id)
    self.thread_id = thread_id
    self.model_path = settings["checkpoint_dir"]
    self.settings = settings
    self.optimizer = optimizer
    self.global_episode = global_step
    self.increment_global_episode = self.global_episode.assign_add(1)
    self.episode_rewards = []
    # if not FLAGS.train:
    self.episode_regrets = []
    self.episodes_suboptimal_arms = []
    self.episode_lengths = []
    self.episode_mean_values = []
    self.summary_writer = tf.summary.FileWriter(settings["summaries_dir"] + "/agent_" + str(self.thread_id))
    self.summary = tf.Summary()
    self.local_AC = ACNetwork(self.name, optimizer, self.global_episode)
    self.update_local_vars = update_target_graph('global', self.name)
    self.env = game

def __init__(self, game, name, s_size, a_size, optimizer=None, model_path=None,
             global_episodes=None, play=False):
    self.s_size = s_size
    self.a_size = a_size
    self.summary_step = 3
    self.name = "worker_" + str(name)
    self.number = name
    self.episode_reward = []
    self.episode_episode_health = []
    self.episode_lengths = []
    self.episode_mean_values = []
    self.episode_health = []
    self.episode_kills = []
    # Create the local copy of the network and the tensorflow op to
    # copy global parameters to local network
    if not play:
        self.model_path = model_path
        self.trainer = optimizer
        self.global_episodes = global_episodes
        self.increment = self.global_episodes.assign_add(1)
        self.local_AC_network = network.ACNetwork(self.name, optimizer, play=play)
        self.summary_writer = tf.summary.FileWriter("./summaries/defend_the_center/agent_%s" % str(self.number))
        self.update_local_ops = tf.group(*utils.update_target_graph('global', self.name))
    else:
        self.local_AC_network = network.ACNetwork(self.name, optimizer, play=play)
    if not isinstance(game, DoomGame):
        raise TypeError("Type Error")
    # The below code is related to setting up the Doom environment
    game = DoomGame()
    # game.set_doom_scenario_path('../scenarios/deadly_corridor.cfg')
    game.load_config("../scenarios/defend_the_center.cfg")
    # game.set_doom_map("map01")
    game.set_screen_resolution(ScreenResolution.RES_640X480)
    game.set_screen_format(ScreenFormat.RGB24)
    game.set_render_hud(False)
    game.set_render_crosshair(False)
    game.set_render_weapon(True)
    game.set_render_decals(False)
    game.set_render_particles(False)
    # Enables labeling of the in-game objects.
    game.set_labels_buffer_enabled(True)
    game.add_available_button(Button.TURN_LEFT)
    game.add_available_button(Button.TURN_RIGHT)
    game.add_available_button(Button.ATTACK)
    game.add_available_game_variable(GameVariable.USER1)
    game.set_episode_timeout(2100)
    game.set_episode_start_time(5)
    game.set_window_visible(play)
    game.set_sound_enabled(False)
    game.set_living_reward(0)
    game.set_mode(Mode.PLAYER)
    if play:
        # game.add_game_args("+viz_render_all 1")
        game.set_render_hud(False)
        game.set_ticrate(35)
    game.init()
    self.env = game
    self.actions = self.button_combinations()

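# `self.button_combinations()` is defined elsewhere in the class. A
# hypothetical sketch of what it plausibly returns for the three buttons
# enabled above (TURN_LEFT, TURN_RIGHT, ATTACK), mirroring the
# itertools-based construction used by a later snippet in this collection:
import itertools

def button_combinations(self):
    # Every on/off combination of the three available buttons,
    # e.g. [False, True, False] presses only TURN_RIGHT.
    return [list(combo) for combo in itertools.product([False, True], repeat=3)]
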
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--environment", type=str, default='CartPole-v0')
    parser.add_argument("--action-size", type=int, default=2)
    parser.add_argument("--input-shape", type=list, default=[None, 4])
    parser.add_argument("--target-update-freq", type=int, default=200)
    parser.add_argument("--epsilon-max", type=float, default=1.)
    parser.add_argument("--epsilon-min", type=float, default=.01)
    parser.add_argument("--epsilon-decay", type=float, default=.001)
    # Note: despite its name, this value (default .99) is what qnet.train
    # receives in the position a later variant of this script calls
    # discount-factor.
    parser.add_argument("--learning-rate", type=float, default=.99)
    parser.add_argument("--batch-size", type=int, default=64)
    parser.add_argument("--epochs", type=int, default=300)
    parser.add_argument("--replay-mem-size", type=int, default=1000000)
    args = parser.parse_args()

    env = gym.make(args.environment)
    args.action_size = env.action_space.n
    args.input_shape = [None] + list(env.observation_space.shape)
    print args

    # Epsilon parameter
    epsilon = args.epsilon_max
    # Replay memory
    memory = Memory(args.replay_mem_size)
    # Time step
    time_step = 0.

    # Initialize the agent
    qnet = qnetwork(input_shape=args.input_shape, action_size=args.action_size, scope='qnet')
    tnet = qnetwork(input_shape=args.input_shape, action_size=args.action_size, scope='tnet')
    update_ops = update_target_graph('qnet', 'tnet')

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for epoch in range(args.epochs):
            total_reward = 0
            state = env.reset()
            while True:
                #env.render()
                # Epsilon-greedy action selection
                if np.random.rand() < epsilon:
                    action = np.random.randint(args.action_size)
                else:
                    action = qnet.act(sess, state)
                next_state, reward, done, _ = env.step(action)
                total_reward += reward
                # Add to memory
                memory.add([state, action, reward, next_state, done])
                # Reduce epsilon
                time_step += 1.
                epsilon = args.epsilon_min + (args.epsilon_max - args.epsilon_min) * np.exp(-args.epsilon_decay * time_step)
                # Training step
                batch = np.array(memory.sample(args.batch_size))
                qnet.train(sess, batch, args.learning_rate, tnet)
                # s <- s'
                state = np.copy(next_state)
                # Update target network
                if int(time_step) % args.target_update_freq == 0:
                    sess.run(update_ops)
                if done:
                    print 'epoch:', epoch, 'total_rewards:', total_reward
                    break

def build_model(self):
    self.inputs = tf.placeholder(shape=[None, 1], dtype=tf.float32)
    self.labels = tf.placeholder(shape=[None, 1], dtype=tf.float32)
    self.task_amplitude = tf.placeholder(shape=None, dtype=tf.float32)
    self.ep = tf.Variable(0, dtype=tf.int32, name='episodes', trainable=False)
    self.inc_ep = self.ep.assign_add(1)

    network_names = ["meta", "learner"]
    self.outputs = {}
    for name in network_names:
        with tf.variable_scope(name):
            dense_1 = tf.layers.dense(
                inputs=self.inputs,
                units=self.hidden_1,
                activation=tf.nn.relu,
                kernel_initializer=tf.truncated_normal_initializer(.0, .01),
                name="dense_1")
            dense_2 = tf.layers.dense(
                inputs=dense_1,
                units=self.hidden_2,
                activation=tf.nn.relu,
                kernel_initializer=tf.truncated_normal_initializer(.0, .01),
                name="dense_2")
            self.outputs[name] = tf.layers.dense(
                inputs=dense_2,
                units=1,
                activation=None,
                kernel_initializer=tf.truncated_normal_initializer(.0, .01),
                name="output")

    self.loss = tf.losses.mean_squared_error(
        self.labels / self.task_amplitude,
        self.outputs["learner"] / self.task_amplitude)
    self.optimize = tf.train.AdamOptimizer(learning_rate=1e-2, beta1=self.beta1).minimize(self.loss)
    self.fresh_optimize = tf.train.AdamOptimizer(learning_rate=1e-2).minimize(self.loss)

    local_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.name)
    self.learner_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "{}/learner".format(self.name))
    self.meta_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "{}/meta".format(self.name))
    self.gradients = tf.gradients(self.loss, self.learner_vars)
    self.reptile_grad = [
        self.meta_vars[i] - learner_var
        for i, learner_var in enumerate(self.learner_vars)
    ]
    self.update_meta = self.meta_trainer.apply_gradients(zip(self.reptile_grad, self.meta_vars))
    self.copy_meta_to_learner = update_target_graph(
        "{}/meta".format(self.name), "{}/learner".format(self.name))

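# Why `meta_vars - learner_vars` acts as a gradient above: feeding it to a
# plain SGD meta-trainer with step size eps yields
# meta <- meta - eps * (meta - learner), i.e. the meta weights move toward
# the adapted learner weights, which is the Reptile outer update. A minimal
# numeric sketch (eps = 0.1 is an assumed value, not from the source):
import numpy as np

meta, learner, eps = np.array([1.0, 2.0]), np.array([3.0, 0.0]), 0.1
meta = meta - eps * (meta - learner)  # -> [1.2, 1.8]: nudged toward learner
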
def init_network(self):
    input_shape = self.feedback_size + (self.num_frames,)
    worker_device = "/job:worker/task:{}/cpu:0".format(self.agent_index)
    with tf.device(tf.train.replica_device_setter(1, worker_device=worker_device)):
        with tf.variable_scope("global"):
            if self.use_lstm is False:
                self.shared_network = FFPolicy(input_shape, len(self.actions), self.network_type)
            else:
                self.shared_network = LSTMPolicy(input_shape, len(self.actions), self.network_type)
            self.global_step = tf.get_variable(
                "global_step", shape=[],
                initializer=tf.constant_initializer(0, dtype=tf.int32),
                trainable=False, dtype=tf.int32)
            self.best_score = tf.get_variable(
                "best_score", shape=[],
                initializer=tf.constant_initializer(-1e2, dtype=tf.float32),
                trainable=False, dtype=tf.float32)
    with tf.device(worker_device):
        with tf.variable_scope('local'):
            if self.use_lstm is False:
                self.network = FFPolicy(input_shape, len(self.actions), self.network_type)
            else:
                self.network = LSTMPolicy(input_shape, len(self.actions), self.network_type)
        # Sync params
        self.update_local_ops = update_target_graph(self.shared_network.vars, self.network.vars)
        # Learning rate
        self.lr = tf.get_variable(name='lr', shape=[],
                                  initializer=tf.constant_initializer(self.learning_rate),
                                  trainable=False, dtype=tf.float32)
        self.t_lr = tf.placeholder(dtype=tf.float32, shape=[], name='new_lr')
        self.assign_lr_op = tf.assign(self.lr, self.t_lr)
        # Best score
        self.t_score = tf.placeholder(dtype=tf.float32, shape=[], name='new_score')
        self.assign_best_score_op = tf.assign(self.best_score, self.t_score)
        # Build gradient op
        self.increase_step = self.global_step.assign_add(1)
        gradients = self.network.build_gradient_op(clip_grad=40.0)
    # Additional summaries
    tf.summary.scalar("learning_rate", self.lr, collections=['a3c'])
    tf.summary.scalar("score", self.t_score, collections=['a3c'])
    tf.summary.scalar("best_score", self.best_score, collections=['a3c'])
    self.summary_op = tf.summary.merge_all('a3c')
    if self.shared_optimizer:
        with tf.device(tf.train.replica_device_setter(1, worker_device=worker_device)):
            with tf.variable_scope("global"):
                optimizer = create_optimizer(self.update_method, self.lr,
                                             self.rho, self.rmsprop_epsilon)
                self.train_op = optimizer.apply_gradients(zip(gradients, self.shared_network.vars))
    else:
        with tf.device(worker_device):
            with tf.variable_scope('local'):
                optimizer = create_optimizer(self.update_method, self.lr,
                                             self.rho, self.rmsprop_epsilon)
                self.train_op = optimizer.apply_gradients(zip(gradients, self.shared_network.vars))

def DQN():
    parser = argparse.ArgumentParser()
    parser.add_argument("--environment", type=str, default='CartPole-v0')
    parser.add_argument("--action-size", type=int, default=2)
    parser.add_argument("--input-shape", type=list, default=[None, 4])
    parser.add_argument("--target-update-freq", type=int, default=200)
    parser.add_argument("--epsilon-max", type=float, default=1.)
    parser.add_argument("--epsilon-min", type=float, default=.01)
    parser.add_argument("--epsilon-decay", type=float, default=.001)
    parser.add_argument("--discount-factor", type=float, default=.99)
    parser.add_argument("--batch-size", type=int, default=64)
    parser.add_argument("--epochs", type=int, default=1000)
    parser.add_argument("--replay-mem-size", type=int, default=1000000)
    args = parser.parse_args()

    env = Environment()
    args.action_size = env.nActions
    args.input_shape = [None, env.stateShape]
    print args

    # Epsilon parameter
    epsilon = 0.1  # args.epsilon_max
    # Replay memory
    memory = Memory(args.replay_mem_size)
    # Time step
    time_step = 0.

    # Initialize the agent
    qnet = qnetwork(input_shape=args.input_shape, action_size=args.action_size, scope='qnet')
    tnet = qnetwork(input_shape=args.input_shape, action_size=args.action_size, scope='tnet')
    update_ops = update_target_graph('qnet', 'tnet')
    rewardHistory = np.zeros(args.epochs)
    env.render()
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for epoch in range(args.epochs):
            total_reward = 0
            state = env.reset()
            while True:
                #env.render()
                if np.random.rand() < epsilon:
                    action = np.random.randint(args.action_size)
                else:
                    action = qnet.act(sess, state)
                [next_state, reward, done] = env.step(action)
                total_reward += reward
                rewardHistory[epoch] += reward
                # Add to memory
                memory.add([state, action, reward, next_state, done])
                # Reduce epsilon
                time_step += 1.
                #epsilon = args.epsilon_min + (args.epsilon_max - args.epsilon_min) * np.exp(-args.epsilon_decay * time_step)
                # Training step
                batch = np.array(memory.sample(args.batch_size))
                qnet.train(sess, batch, args.discount_factor, tnet)
                # s <- s'
                state = np.copy(next_state)
                # Update target network
                if int(time_step) % args.target_update_freq == 0:
                    sess.run(update_ops)
                if done:
                    print 'epoch:', epoch, 'total_rewards:', total_reward
                    break
        '''
        np.set_printoptions(threshold=np.nan)
        for v in range(-5, 5):
            policy = np.zeros((env.W, env.W), dtype='int')
            for x in range(env.W):
                for y in range(env.W):
                    policy[x, y] = qnet.act(sess, np.array([x, y, 1, v]))
            print(policy)
        '''
        plt.xlabel('episode #')
        plt.ylabel('reward')
        plt.plot(rewardHistory)
        plt.savefig("DQN")
        plt.show()
        for epoch in range(10):
            total_reward = 0
            state = env.reset()
            while True:
                env.render()
                action = qnet.act(sess, state)
                [next_state, reward, done] = env.step(action)
                total_reward += reward
                rewardHistory[epoch] += reward
                # Reduce epsilon
                time_step += 1.
                # s <- s'
                state = np.copy(next_state)
                if done:
                    print 'epoch:', epoch, 'total_rewards:', total_reward
                    break

sess.run(tf.global_variables_initializer())
sess.run(tf.local_variables_initializer())
print('Objective1_Value: %.4f\t Objective2_Value: %.4f\t' % evaluate(sess, model, train_set))
sys.stdout.flush()

lr = 1
start_time = time.time()
last_auc = 0.0
for epoch in range(100):
    random.shuffle(train_set)
    random.shuffle(test_set)
    epoch_size = round(len(train_set) / batch_size)
    loss_sum = 0.0
    for _, uij in DataInput(train_set, batch_size):
        loss = model.train(sess, uij, lr)
        loss_sum += loss
    print('Epoch %d Train_Loss: %.4f' % (model.global_epoch_step.eval(), loss_sum))
    print('Epoch %d DONE\tCost time: %.2f' % (model.global_epoch_step.eval(), time.time() - start_time))
    print('Objective1_Value: %.4f\t Objective2_Value: %.4f\t' % evaluate(sess, model, train_set))
    #print('Objective1_Value: %.4f\t Objective2_Value: %.4f\t' % evaluate(sess, model, test_set))
    sys.stdout.flush()
    model.global_epoch_step_op.eval()
    if epoch % 5 == 0:
        # Building the copy ops only adds them to the graph; they must also
        # be run for the target network to actually get updated.
        sess.run(update_target_graph('primary_dqn', 'target_dqn'))
end_time = time.time()

def __init__(self, q_network, ob_space, ac_space, lr, max_grad_norm,
             units_per_hlayer, activ_fcn, log_interval, logdir, batch_size,
             trace_length, tau, update_interval, keep_model):
    self.logger = logging.getLogger(self.__module__ + "." + self.__class__.__name__)
    self.logger.info("Set up DQN learning agent")
    self.num_steps_trained = 0
    self.log_interval = log_interval

    sess = make_session()  # TODO add CPU config information
    # nbatch = batch_size
    self.global_step = tf.get_variable('global_step', [], tf.int32,
                                       tf.constant_initializer(0, tf.int32),
                                       trainable=False)

    # Targets in loss computation
    QT = tf.placeholder(shape=[batch_size * trace_length], dtype=tf.float32, name='QT')  # target Q values
    A = tf.placeholder(shape=[batch_size * trace_length], dtype=tf.int32, name='A')  # action indices

    eval_model = q_network(sess, ob_space, ac_space.n, nbatch=1, trace_length=1,
                           units_per_hlayer=units_per_hlayer, scope='model',
                           reuse=False, activ_fcn=activ_fcn)
    train_model = q_network(sess, ob_space, ac_space.n, nbatch=batch_size,
                            trace_length=trace_length, units_per_hlayer=units_per_hlayer,
                            scope='model', reuse=True, activ_fcn=activ_fcn)
    # target_model = TargetNetwork(sess, ob_space, ac_space.n)
    target_model = q_network(sess, ob_space, ac_space.n, nbatch=batch_size,
                             trace_length=trace_length, units_per_hlayer=units_per_hlayer,
                             scope='target', reuse=False, activ_fcn=activ_fcn)

    # Obtain loss by taking the mean of squares difference between the
    # target and prediction Q values.
    actions_onehot = tf.one_hot(A, depth=ac_space.n, dtype=tf.float32)
    # Masking predQ with the one-hot actions and multiplying by a column of
    # ones sums out the action axis, leaving Q(s, a) for the taken actions
    # (note this hard-codes ac_space.n == 2).
    td_error = tf.losses.mean_squared_error(
        labels=QT,
        predictions=tf.squeeze(tf.matmul(tf.multiply(train_model.predQ, actions_onehot), [[1.], [1.]])))
    loss = td_error

    params = tf.trainable_variables()  # was set to 'model', but we would need model and target parameters
    optimizer = tf.train.AdamOptimizer(lr)
    gradients = optimizer.compute_gradients(loss)
    grads, variables = zip(*gradients)
    if max_grad_norm is not None:
        grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
    grads = list(zip(grads, params))
    _train = [optimizer.apply_gradients(grads),
              self.global_step.assign_add(update_interval)]  # nbatch

    if log_interval > 0:
        for g, v in grads:
            if g is not None:
                tf.summary.histogram("train/grads/%s-grad" % v.name.replace(':', '_'), g)
        for p in params:
            if p is not None:
                tf.summary.histogram("train/params/%s" % p.name.replace(':', '_'), p.value())
        tf.summary.scalar("train/vf_loss", loss)
        tf.summary.histogram("others/A", A)
        tf.summary.histogram("others/QT", QT)
        self.summary_step = tf.summary.merge_all()

    tf.add_to_collection('inputs', eval_model.X)
    tf.add_to_collection('predQ', eval_model.predQ)
    if eval_model.initial_state is not None:
        add_to_collection_rnn_state('state_in', eval_model.rnn_state_in)
        add_to_collection_rnn_state('state_out', eval_model.rnn_state_out)
    # tf.add_to_collection('step', eval_model.step)
    tf.global_variables_initializer().run(session=sess)

    def train(obs, actions, targets, states):
        """
        Updates the weights of the neural network, based on its targets, its
        predictions, its loss and its optimizer.

        Args:
            obs: [current_observation] or observations of batch
            actions: [current_action] or actions of batch
            targets: [current_target] or targets of batch
        """
        feed_dict = {train_model.X: obs, A: actions, QT: targets}
        if states is not None:
            feed_dict[train_model.rnn_state_in] = states
        # Evaluate the TF tensors and operations self.loss and self.train_step
        total_loss, _, global_step = sess.run([loss, _train, self.global_step], feed_dict=feed_dict)
        if log_interval > 0 and (self.num_steps_trained % self.log_interval == 0):
            self.logger.info('Save summary of network weights, grads and losses.')
            summary_str = sess.run(self.summary_step, feed_dict)
            self.summary_writer.add_summary(tf.Summary.FromString(summary_str), global_step)
        self.num_steps_trained += 1
        return total_loss

    saver = tf.train.Saver(max_to_keep=keep_model)

    def update_target(target_op_holder):
        for op in target_op_holder:
            sess.run(op)
        a = tf.trainable_variables()[0].eval(session=sess)
        b = tf.trainable_variables()[len(params) // 2].eval(session=sess)
        # Sanity check that the first model variable matches the first target
        # variable. np.array_equal compares element-wise; the original
        # `a.all() == b.all()` only compared truthiness of the two arrays.
        if not np.array_equal(a, b):
            print("Target Set Failed")

    def save(f_name):
        gs = sess.run(self.global_step)
        self.logger.info('Save network parameters of model at global step %s' % gs)
        saver.save(sess, os.path.join(logdir, f_name), global_step=gs)

    def load(load_path):
        saver.restore(sess, load_path)

    def test_run(env, n_eps, n_pipes):
        self.logger.info('Evaluating current agent')
        ep_return = []
        ep_length = []
        for i in range(0, n_eps):  # TODO parallelize this here!
            obs = env.reset()
            obs = normalize_obs(obs)
            done = False
            if eval_model.initial_state is not None:
                if len(eval_model.initial_state) > 1:
                    rnn_s_in = (np.zeros(eval_model.initial_state[0].shape),
                                np.zeros(eval_model.initial_state[1].shape))  # init lstm cell vector
                else:
                    rnn_s_in = np.zeros(eval_model.initial_state.shape)  # init gru cell vector
            total_return = 0
            total_length = 0
            while not done and (total_return < n_pipes):
                if eval_model.initial_state is not None:
                    pQ, rnn_s_out = sess.run(
                        [eval_model.predQ, eval_model.rnn_state_out],
                        feed_dict={eval_model.X: [obs], eval_model.rnn_state_in: rnn_s_in})
                else:
                    pQ = sess.run([eval_model.predQ], feed_dict={eval_model.X: [obs]})
                ac = np.argmax(pQ)
                obs, reward, done, _ = env.step(ac)
                obs = normalize_obs(obs)
                total_length += 1
                total_return += reward
                if eval_model.initial_state is not None:
                    rnn_s_in = rnn_s_out
            self.logger.info('Episode %s: %s, %s' % (i, total_return, total_length))
            ep_length.append(total_length)
            ep_return.append(total_return)
        return ep_return

    self.train = train
    self.train_model = train_model
    self.step_model = eval_model
    self.target_model = target_model
    self.target_ops = update_target_graph(params, tau)  # TODO implement update_target_graph
    self.update_target = update_target
    self.step = eval_model.step
    self.predict = eval_model.predict
    self.step_initial_state = eval_model.initial_state
    self.train_initial_state = train_model.initial_state
    self.save = save
    self.load = load
    self.test_run = test_run
    self.sess = sess
    if log_interval > 0:
        self.summary_writer = tf.summary.FileWriter(logdir, graph_def=sess.graph_def)
    else:
        self.summary_writer = None

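# `update_target_graph(params, tau)` above takes a flat variable list plus a
# mixing factor and carries a TODO, which points at a soft (Polyak) update
# rather than the hard scope copy used elsewhere in this collection. A
# minimal sketch, under the assumption that the first half of `params`
# belongs to the 'model' scope and the second half to 'target' in matching
# order (the same indexing the sanity check in update_target uses):
def update_target_graph(tf_vars, tau):
    total_vars = len(tf_vars)
    op_holder = []
    for idx, var in enumerate(tf_vars[0:total_vars // 2]):
        target_var = tf_vars[idx + total_vars // 2]
        # target <- tau * model + (1 - tau) * target
        op_holder.append(target_var.assign(tau * var.value() + (1. - tau) * target_var.value()))
    return op_holder
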
def __init__(self, game, name, optimizer=None, model_path=None, global_episodes=None,
             play=False, task_name='healthpack_simple'):
    self.task_name = task_name
    self.play = play
    self.summary_step = 3
    self.name = cfg.AGENT_PREFIX + str(name)
    self.number = name
    self.imitate_data = None
    self.last_total_health = 100.
    self.last_total_kills = 0.
    self.last_total_ammos = 0.
    self.img_shape = cfg.IMG_SHAPE
    self.episode_reward = []
    self.episode_lengths = []
    self.episode_mean_values = []
    self.episode_health = []
    self.episode_kills = []
    if not self.play:
        self.model_path = model_path
        self.trainer = optimizer
        self.global_episodes = global_episodes
        self.increment = self.global_episodes.assign_add(1)
        self.local_AC_network = network.ACNetwork(self.name, optimizer, play=self.play,
                                                  img_shape=cfg.IMG_SHAPE)
        self.summary_writer = tf.summary.FileWriter(
            "./summaries/%s/ag_%s" % (self.task_name, str(self.number)))
        # create a tensorflow op to copy weights from global network regularly when training
        self.update_local_ops = tf.group(*utils.update_target_graph('global', self.name))
    else:
        self.local_AC_network = network.ACNetwork(self.name, optimizer, play=self.play,
                                                  img_shape=cfg.IMG_SHAPE)
    if not isinstance(game, DoomGame):
        raise TypeError("Type Error")
    game = DoomGame()
    game.load_config(cfg.SCENARIO_PATH)
    game.set_doom_map("map01")
    game.set_screen_resolution(ScreenResolution.RES_640X480)
    game.set_screen_format(ScreenFormat.RGB24)
    game.set_render_hud(False)
    game.set_render_crosshair(False)
    game.set_render_weapon(True)
    game.set_render_decals(False)
    game.set_render_particles(True)
    # Enables labeling of the in-game objects.
    game.set_labels_buffer_enabled(True)
    game.add_available_button(Button.MOVE_FORWARD)
    game.add_available_button(Button.MOVE_RIGHT)
    game.add_available_button(Button.MOVE_LEFT)
    game.add_available_button(Button.TURN_LEFT)
    game.add_available_button(Button.TURN_RIGHT)
    game.add_available_button(Button.ATTACK)
    game.add_available_button(Button.SPEED)
    game.add_available_game_variable(GameVariable.AMMO2)
    game.add_available_game_variable(GameVariable.HEALTH)
    game.add_available_game_variable(GameVariable.USER2)
    game.set_episode_timeout(2100)
    game.set_episode_start_time(5)
    game.set_window_visible(self.play)
    game.set_sound_enabled(False)
    game.set_living_reward(0)
    game.set_mode(Mode.PLAYER)
    if self.play:
        game.add_game_args("+viz_render_all 1")
        game.set_render_hud(False)
        game.set_ticrate(35)
    game.init()
    self.env = game
    self.actions = cfg.button_combinations()

def __init__(self, game, name, optimizer=None, model_path=None, global_episodes=None,
             play=False, task_name='healthpack_simple'):
    self.task_name = task_name
    self.summary_step = 3
    self.name = "worker_" + str(name)
    self.number = name
    self.last_total_health = 100.
    self.img_shape = cfg.IMG_SHAPE
    self.episode_reward = []
    self.episode_episode_total_pickes = []
    self.episode_lengths = []
    self.episode_mean_values = []
    self.episode_health = []
    # Create the local copy of the network and the tensorflow op to
    # copy global parameters to local network
    if not play:
        self.model_path = model_path
        self.trainer = optimizer
        self.global_episodes = global_episodes
        self.increment = self.global_episodes.assign_add(1)
        self.local_AC_network = network.ACNetwork(self.name, optimizer, play=play,
                                                  img_shape=cfg.IMG_SHAPE)
        self.summary_writer = tf.summary.FileWriter(
            "./summaries/healthpack/train_health%s" % str(self.number))
        self.update_local_ops = tf.group(*utils.update_target_graph(
            self.task_name + '/global', self.task_name + '/' + self.name))
    else:
        self.local_AC_network = network.ACNetwork(self.name, optimizer, play=play,
                                                  img_shape=cfg.IMG_SHAPE)
    if not isinstance(game, DoomGame):
        raise TypeError("Type Error")
    # The below code is related to setting up the Doom environment
    game = DoomGame()
    game.set_doom_scenario_path("../scenarios/{}".format(
        'health_gathering_supreme.wad' if cfg.IS_SUPREME_VERSION else 'health_gathering.wad'))
    game.set_doom_map("map01")
    game.set_screen_resolution(ScreenResolution.RES_640X480)
    game.set_screen_format(ScreenFormat.RGB24)
    game.set_render_hud(False)
    game.set_render_crosshair(False)
    game.set_render_weapon(True)
    game.set_render_decals(False)
    game.set_render_particles(True)
    # Enables labeling of the in-game objects.
    game.set_labels_buffer_enabled(True)
    game.add_available_button(Button.TURN_LEFT)
    game.add_available_button(Button.TURN_RIGHT)
    game.add_available_button(Button.MOVE_FORWARD)
    game.add_available_game_variable(GameVariable.USER1)
    game.set_episode_timeout(2100)
    game.set_episode_start_time(5)
    game.set_window_visible(play)
    game.set_sound_enabled(False)
    game.set_living_reward(0)
    game.set_mode(Mode.PLAYER)
    if play:
        game.add_game_args("+viz_render_all 1")
        game.set_render_hud(False)
        game.set_ticrate(35)
    game.init()
    self.env = game
    self.actions = [list(perm) for perm in
                    iter.product([False, True], repeat=game.get_available_buttons_size())]
    self.actions.remove([True, True, True])
    self.actions.remove([True, True, False])

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--env-interface", type=str, default='gym!atari')
    parser.add_argument("--environment", type=str, default='CartPole-v0')
    parser.add_argument("--action-size", type=int, default=2)
    parser.add_argument("--input-shape", type=list, default=[None, 4])
    parser.add_argument("--target-update-freq", type=int, default=200)
    parser.add_argument("--epsilon-max", type=float, default=1.)
    parser.add_argument("--epsilon-min", type=float, default=.01)
    parser.add_argument("--epsilon-decay", type=float, default=.001)
    parser.add_argument("--learning-rate", type=float, default=.99)
    parser.add_argument("--batch-size", type=int, default=64)
    parser.add_argument("--epochs", type=int, default=30000)
    parser.add_argument("--replay-mem-size", type=int, default=1000000)
    parser.add_argument("--K", type=int, default=1,
                        help='The number of steps to train the environment')
    parser.add_argument("--L", type=int, default=1,
                        help='The number of Q-learning steps for hypothetical rollouts')
    parser.add_argument("--latent-size", type=int, default=4,
                        help='Size of vector for Z')
    args = parser.parse_args()

    env = env_interface(args.env_interface, args.environment, pixel_feature=False, render=True)
    #args.action_size = env.action_space.n
    args.action_size = env.action_size
    args.input_shape = [None] + list(env.obs_space_shape)
    print args

    # Other parameters
    epsilon = args.epsilon_max
    # Replay memory
    memory = Memory(args.replay_mem_size)
    # Time step
    time_step = 0.

    # Initialize the GANs
    cgan_state = CGAN(input_shape=args.input_shape, action_size=args.action_size,
                      latent_size=args.latent_size, gen_input_shape=args.input_shape)
    cgan_reward = CGAN(input_shape=args.input_shape, action_size=args.action_size,
                       latent_size=args.latent_size, gen_input_shape=[None, 1])
    qnet = qnetwork(input_shape=args.input_shape, action_size=args.action_size, scope='qnet')
    target_qnet = qnetwork(input_shape=args.input_shape, action_size=args.action_size,
                           scope='target_qnet')
    update_ops = update_target_graph('qnet', 'target_qnet')

    rand_no = np.random.rand()
    #env = gym.wrappers.Monitor(env, '/tmp/cartpole-experiment-' + str(rand_no), force=True, video_callable=False)
    init = tf.initialize_all_variables()
    with tf.Session() as sess:
        sess.run(init)
        for epoch in range(args.epochs):
            total_reward = 0
            observation = env.reset()
            for t in range(1000000):
                #env.render()
                action = qnet.get_action(sess, observation)
                if np.random.rand() < epsilon:
                    #action = env.action_space.sample()
                    action = np.random.randint(args.action_size)
                observation1, reward, done, info = env.step(action)
                total_reward += reward
                # Add to memory
                memory.add([observation, action, reward, observation1, done])
                # Reduce epsilon
                time_step += 1.
                epsilon = args.epsilon_min + (args.epsilon_max - args.epsilon_min) * np.exp(-args.epsilon_decay * time_step)

                # Training step
                batch = np.array(memory.sample(args.batch_size))
                qnet.train(sess, batch, args.learning_rate, target_qnet)

                # Training step: environment model
                for k in range(args.K):
                    batch = np.array(memory.sample(args.batch_size))
                    states = np.vstack(batch[:, 0])
                    actions = np.array(batch[:, 1])
                    rewards = batch[:, 2]
                    states1 = np.vstack(batch[:, 3])
                    _, D_loss_state = sess.run(
                        [cgan_state.D_solver, cgan_state.D_loss],
                        feed_dict={cgan_state.states: states,
                                   cgan_state.actions: actions,
                                   cgan_state.Z: sample_z(len(batch), args.latent_size),
                                   cgan_state.X: states1})
                    _, G_loss_state = sess.run(
                        [cgan_state.G_solver, cgan_state.G_loss],
                        feed_dict={cgan_state.states: states,
                                   cgan_state.actions: actions,
                                   cgan_state.Z: sample_z(len(batch), args.latent_size)})
                    _, D_loss_reward = sess.run(
                        [cgan_reward.D_solver, cgan_reward.D_loss],
                        feed_dict={cgan_reward.states: states,
                                   cgan_reward.actions: actions,
                                   cgan_reward.Z: sample_z(len(batch), args.latent_size),
                                   cgan_reward.X: rewards[..., np.newaxis]})
                    _, G_loss_reward = sess.run(
                        [cgan_reward.G_solver, cgan_reward.G_loss],
                        feed_dict={cgan_reward.states: states,
                                   cgan_reward.actions: actions,
                                   cgan_reward.Z: sample_z(len(batch), args.latent_size)})
                    #print D_loss_state, G_loss_state, D_loss_reward, G_loss_state

                # Training step: imagination rollouts
                if time_step == 0.:
                    print "time_step 0 here"
                if time_step >= 0.:
                    for l in range(args.L):
                        batch = np.array(memory.sample(args.batch_size))
                        assert len(batch) > 0
                        states1 = np.vstack(batch[:, 3])
                        actions = np.random.randint(args.action_size, size=len(batch))
                        dones = np.array([False] * len(batch))
                        G_sample_state = sess.run(
                            cgan_state.G_sample,
                            feed_dict={cgan_state.states: states1,
                                       cgan_state.actions: actions,
                                       cgan_state.Z: sample_z(len(batch), args.latent_size)})
                        G_sample_reward = sess.run(
                            cgan_reward.G_sample,
                            feed_dict={cgan_reward.states: states1,
                                       cgan_reward.actions: actions,
                                       cgan_reward.Z: sample_z(len(batch), args.latent_size)})
                        qnet.train(sess, None, args.learning_rate, target_qnet,
                                   states1, actions, G_sample_reward, G_sample_state, dones)

                # Set observation
                observation = observation1
                # Update?
                if int(time_step) % args.target_update_freq == 0:
                    #print "Updating target..."
                    sess.run(update_ops)
                if done:
                    print "Episode finished after {} timesteps".format(t + 1), 'epoch', epoch, 'total_rewards', total_reward
                    break

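# `sample_z` is used above but not shown. A hypothetical sketch of the usual
# CGAN latent sampler: a [batch, latent_size] noise matrix, drawn here
# uniformly from [-1, 1] as in many GAN implementations.
import numpy as np

def sample_z(batch_size, latent_size):
    return np.random.uniform(-1., 1., size=[batch_size, latent_size])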