def __init__(self, learning_rate, memory_size, batch_size, sess,
             output_size):
    self.sess = sess

    # state_t: autoencoder over the current state
    self.encoder_input = tf.placeholder(
        tf.float32, shape=[None, n_features], name='encoder_input')
    self.encoder_output = mlp(inputs=self.encoder_input,
                              n_output=output_size,
                              scope='encoder_output',
                              hiddens=[32, 16, 8])
    self.decoder_output = mlp(inputs=self.encoder_output,
                              n_output=n_features,
                              scope='decoder_output',
                              hiddens=[8, 16, 32])
    self.encoder_output_ = tf.stop_gradient(self.decoder_output)

    # some constants
    self.learning_rate = learning_rate
    self.memory_size = memory_size
    self.batch_size = batch_size

    # memory buffer
    self.memory = Memory(self.memory_size)

    # training ops: reconstruction loss and its optimizer
    self.loss = tf.reduce_mean(
        tf.squared_difference(self.encoder_input, self.decoder_output))
    self.train = tf.train.AdamOptimizer(self.learning_rate).minimize(
        self.loss)
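# Illustrative sketch (not part of the original class): a minimal training
# step for the autoencoder built above. It assumes the Memory class exposes a
# sample(batch_size) method returning a [batch_size, n_features] array of
# stored states; adjust to the actual Memory interface.
def learn_sketch(self):
    batch = self.memory.sample(self.batch_size)
    loss, _ = self.sess.run([self.loss, self.train],
                            feed_dict={self.encoder_input: batch})
    return loss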
def __init__(self, learning_rate, memory_size, batch_size, sess,
             output_size):
    self.sess = sess

    # state_t
    self.encoder_input_t = tf.placeholder(
        tf.float32, shape=[None, n_features], name='encoder_input_t')
    self.encoder_output_t = mlp(inputs=self.encoder_input_t,
                                n_output=output_size,
                                scope='encoder_output_t',
                                hiddens=[16, 8])
    self.encoder_output_t_params = tf.get_collection(
        tf.GraphKeys.GLOBAL_VARIABLES, scope='encoder_output_t')
    self.decoder_output_t = mlp(inputs=self.encoder_output_t,
                                n_output=n_features,
                                scope='decoder_output_t')
    self.decoder_output_t_params = tf.get_collection(
        tf.GraphKeys.GLOBAL_VARIABLES, scope='decoder_output_t')
    self.encoder_output_t_ = tf.stop_gradient(self.encoder_output_t)

    # state_t+1 (tpo -> "time plus one")
    self.encoder_input_tpo = tf.placeholder(
        tf.float32, shape=[None, n_features], name='encoder_input_tpo')
    self.encoder_output_tpo = mlp(inputs=self.encoder_input_tpo,
                                  n_output=output_size,
                                  scope='encoder_output_tpo',
                                  hiddens=[16, 8])
    self.encoder_output_tpo_params = tf.get_collection(
        tf.GraphKeys.GLOBAL_VARIABLES, scope='encoder_output_tpo')
    self.decoder_output_tpo = mlp(inputs=self.encoder_output_tpo,
                                  n_output=n_features,
                                  scope='decoder_output_tpo')
    self.decoder_output_tpo_params = tf.get_collection(
        tf.GraphKeys.GLOBAL_VARIABLES, scope='decoder_output_tpo')
    self.encoder_output_tpo_ = tf.stop_gradient(self.encoder_output_tpo)

    # sync: copy the tpo-side parameters into the t-side networks
    self.sync_encoder = [
        tf.assign(x, y) for x, y in zip(self.encoder_output_t_params,
                                        self.encoder_output_tpo_params)
    ]
    self.sync_decoder = [
        tf.assign(x, y) for x, y in zip(self.decoder_output_t_params,
                                        self.decoder_output_tpo_params)
    ]

    # some constants
    self.learning_rate = learning_rate
    self.memory_size = memory_size
    self.batch_size = batch_size

    # memory buffer
    self.memory = Memory(self.memory_size)

    # training ops: one reconstruction loss per time step
    self.loss_0 = tf.reduce_mean(
        tf.squared_difference(self.encoder_input_t, self.decoder_output_t))
    self.loss_1 = tf.reduce_mean(
        tf.squared_difference(self.encoder_input_tpo,
                              self.decoder_output_tpo))
    self.train_0 = tf.train.AdamOptimizer(self.learning_rate).minimize(
        self.loss_0)
    self.train_1 = tf.train.AdamOptimizer(self.learning_rate).minimize(
        self.loss_1)
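# Illustrative sketch (not part of the original class): one possible training
# step for the two-stream autoencoder above, assuming Memory.sample(batch_size)
# returns a pair (states_t, states_tpo) of [batch_size, n_features] arrays.
# Running the sync ops afterwards overwrites the t-side encoder/decoder
# parameters with the tpo-side ones, as defined in __init__.
def learn_sketch(self):
    states_t, states_tpo = self.memory.sample(self.batch_size)
    self.sess.run([self.train_0, self.train_1],
                  feed_dict={self.encoder_input_t: states_t,
                             self.encoder_input_tpo: states_tpo})
    self.sess.run(self.sync_encoder)
    self.sess.run(self.sync_decoder)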
def __init__(self, learning_rate, memory_size, batch_size, sess):
    self.sess = sess

    self.common_encoder_input = tf.placeholder(
        tf.float32, shape=[None, n_features], name='common_encoder_input')
    self.common_encoder_output = mlp(inputs=self.common_encoder_input,
                                     n_output=n_features,
                                     scope='common_encoder_output',
                                     hiddens=[16, 8])
    self.common_decoder_output = mlp(inputs=self.common_encoder_output,
                                     n_output=n_features,
                                     scope='common_decoder_output')

    self.learning_rate = learning_rate
    self.memory_size = memory_size
    self.batch_size = batch_size
    self.memory = Memory(self.memory_size)

    self.loss = tf.reduce_mean(
        tf.squared_difference(self.common_encoder_input,
                              self.common_decoder_output))
    self.train = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)
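# Illustrative sketch (not part of the original class): encoding a batch of
# states with the shared encoder, e.g. to hand the compressed representation
# to a downstream agent. `states` is assumed to be a [batch, n_features] array.
def encode_sketch(self, states):
    return self.sess.run(self.common_encoder_output,
                         feed_dict={self.common_encoder_input: states})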
    intra_op_parallelism_threads=1,
))

# set saver
if SAVE:
    saver = tf.train.Saver()
if LOAD:
    model_file = tf.train.latest_checkpoint(LOAD_FILE_PATH)
    saver.restore(sess, model_file)
if RESULT_EXPORT:
    f = open('/~/result.txt', 'w')

# try to share some common layers: one evaluation stream and one target stream
common_eval_input = tf.placeholder(tf.float32, shape=[None, n_features],
                                   name='common_eval_input')
common_target_input = tf.placeholder(tf.float32, shape=[None, n_features],
                                     name='common_target_input')
common_eval_output = mlp(inputs=common_eval_input, n_output=64,
                         scope='common_eval_layer', hiddens=hiddens)
# the target layer reads the separate target-input placeholder (next states)
common_target_output = tf.stop_gradient(
    mlp(inputs=common_target_input, n_output=64,
        scope='common_target_layer', hiddens=hiddens))

# initialize the plot
# fig = plt.figure()
# ax = fig.add_subplot(1, 1, 1)
# ax.axis("equal")
# plt.ion()
# plt.ylim((0, 10))
# x = [0]
# y = [0]

# add agents
ais = []
for i in range(ai_number):
    ais.append(DQN(
def __init__(self,
             n_features,
             n_actions,
             model,
             scope,
             sess,
             order,
             hiddens,
             beta,
             C,
             common_eval_input,
             common_target_input,
             common_eval_output,
             common_target_output,
             learning_rate=1e-5,
             decay=0.99,
             memory_size=20000000,
             batch_size=100000,
             epsilon_decrement=0.0005,
             epsilon_lower=0.2):
    self.sess = sess
    self.scope = scope
    self.n_features = n_features
    self.batch_size = batch_size
    self.decay = decay
    self.model = model
    self.memory = Memory(memory_size)
    self.order = order
    self.beta = beta
    self.C = C
    self.learn_times = 0
    self.epsilon_lower = epsilon_lower
    self.epsilon_decrement = epsilon_decrement

    self.eval_input = tf.placeholder(tf.float32,
                                     shape=[None, self.n_features],
                                     name='eval_input')
    self.target_input = tf.placeholder(tf.float32,
                                       shape=[None, self.n_features],
                                       name='target_input')
    self.actions_selected = tf.placeholder(tf.int32, shape=[None],
                                           name='actions_selected')
    self.done = tf.placeholder(tf.float32, shape=[None], name='done')
    self.decays = tf.placeholder(tf.float32, shape=[None], name='decay')
    self.rewards = tf.placeholder(tf.float32, shape=[None], name='rewards')

    # about the encoder
    self.state_input_t = tf.placeholder(tf.float32,
                                        shape=[None, self.n_features],
                                        name='state_input_t')
    self.state_input_tpo = tf.placeholder(tf.float32,
                                          shape=[None, self.n_features],
                                          name='state_input_tpo')
    self.action_plus_state_input = tf.placeholder(
        tf.float32, shape=[None, self.n_features + 1],
        name='action_plus_state_input')

    # share the first layers
    self.common_eval_input = common_eval_input
    self.common_target_input = common_target_input
    self.common_eval_output = common_eval_output
    self.common_target_output = common_target_output

    with tf.variable_scope(self.scope):
        self._epsilon = tf.get_variable(name='epsilon',
                                        dtype=tf.float32,
                                        initializer=1.0)
        self._epsilon_decrement = tf.constant(epsilon_decrement)
        self.update_epsilon = tf.assign(
            self._epsilon, self._epsilon - self._epsilon_decrement)
        self.reset_epsilon = tf.assign(self._epsilon, 1)

        # self.eval_output = model(inputs=self.eval_input, n_output=n_actions,
        #                          scope='eval_net', hiddens=hiddens)
        # self.target_output = tf.stop_gradient(
        #     model(inputs=self.target_input, n_output=n_actions,
        #           scope='target_net', hiddens=hiddens))
        self.eval_output = model(inputs=self.common_eval_output,
                                 n_output=n_actions,
                                 scope='eval_net',
                                 hiddens=hiddens)
        self.target_output = tf.stop_gradient(
            model(inputs=self.common_target_output,
                  n_output=n_actions,
                  scope='target_net',
                  hiddens=hiddens))

        # about the encoder
        self.encoder_temp_t = mlp(inputs=self.state_input_t, n_output=64,
                                  scope='encoder_temp_t', hiddens=[32, 64])
        self.encoder_temp_tpo = tf.stop_gradient(
            mlp(inputs=self.state_input_tpo, n_output=64,
                scope='encoder_temp_tpo', hiddens=[32, 64]))
        self.encoder_output_t = mlp(inputs=self.encoder_temp_t,
                                    n_output=self.n_features,
                                    scope='encoder_t', hiddens=[64, 32])
        self.encoder_output_tpo = mlp(inputs=self.encoder_temp_tpo,
                                      n_output=self.n_features,
                                      scope='encoder_tpo', hiddens=[64, 32])
        self.predict_output = mlp(inputs=self.action_plus_state_input,
                                  n_output=64, scope='predict_output',
                                  hiddens=[64, 32])
        self.predict_mse = tf.reduce_sum(
            tf.square(self.encoder_temp_tpo -
                      self.predict_output)) * self.n_features
        self.emax = tf.get_variable(name='emax', dtype=tf.float32,
                                    initializer=1.0)
        self.update_emax = tf.assign(
            self.emax, tf.maximum(self.emax, self.predict_mse))
        self.e_normalize = tf.div(self.predict_mse, self.emax)

        self.encoder_loss = tf.reduce_sum(
            tf.square(self.state_input_t - self.encoder_output_t))
        self.train_encoder = tf.train.AdamOptimizer(
            learning_rate).minimize(self.encoder_loss)

        self.M_loss = self.predict_mse
        self.train_M = tf.train.AdamOptimizer(learning_rate).minimize(
            self.M_loss)

        self.eval_output_selected = tf.reduce_sum(
            self.eval_output * tf.one_hot(self.actions_selected, n_actions),
            axis=1)
        self.eval_output_target = self.rewards + self.decays * tf.reduce_max(
            self.target_output, axis=1) * (1. - self.done)
        self.loss = tf.reduce_mean(
            tf.squared_difference(self.eval_output_selected,
                                  self.eval_output_target))
        self.train = tf.train.AdamOptimizer(learning_rate).minimize(self.loss)

    self.eval_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                         scope=scope + '/eval_net')
    self.target_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                           scope=scope + '/target_net')
    self.update = [
        tf.assign(x, y)
        for x, y in zip(self.target_params, self.eval_params)
    ]

    self.sess.run(tf.global_variables_initializer())
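# Illustrative sketch (not part of the original class): epsilon-greedy action
# selection built on the tensors defined above. It assumes numpy is imported
# as np and that `state` is a single observation of shape [n_features]; the
# Q-values are read from eval_output, which is fed through the shared
# common_eval_input placeholder.
def choose_action_sketch(self, state):
    n_actions = self.eval_output.get_shape().as_list()[-1]
    epsilon = self.sess.run(self._epsilon)
    if np.random.uniform() < epsilon:
        return np.random.randint(n_actions)
    q_values = self.sess.run(
        self.eval_output,
        feed_dict={self.common_eval_input: state[np.newaxis, :]})
    return int(np.argmax(q_values))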
def main():
    model_choices = ["atari_deepmind", "cnn_to_lstm", "mlp", "lstm_to_mlp",
                     "cnn_to_lstm_new"]
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--env", help="environment ID",
                        default="BreakoutNoFrameskip-v4")
    parser.add_argument("--seed", help="RNG seed", type=int, default=0)
    parser.add_argument("--prioritized", type=int, default=1)
    parser.add_argument("--dueling", type=int, default=1)
    parser.add_argument("--num-timesteps", type=int, default=int(5 * 10e2))
    parser.add_argument("--learning-rate", type=float, default=1e-4)
    parser.add_argument("--batch-size", type=int, default=1)
    parser.add_argument("--buffer-size", type=int, default=int(1e6))
    parser.add_argument("--exploration_steps", type=float, default=1e6)
    parser.add_argument("--exploration_final_eps", type=float, default=0.1)
    parser.add_argument("--train-freq", type=int, default=4)
    parser.add_argument("--learning-starts", type=int, default=int(1e4))
    parser.add_argument("--target_network_update_freq", type=int,
                        default=int(1e3))
    parser.add_argument("--gamma", type=float, default=0.99)
    parser.add_argument("--model", type=str, choices=model_choices,
                        default="cnn_to_lstm_new")
    args = parser.parse_args()

    logger.configure(log_dir)
    set_global_seeds(args.seed)
    env = make_atari(args.env)
    env = bench.Monitor(env, logger.get_dir())
    env = deepq.wrap_atari_dqn(env)

    if args.model == "mlp":
        model = mlp(hiddens=[256, 256])
    elif args.model == "atari_deepmind":
        model = cnn_to_mlp(convs=[(16, 8, 4), (32, 4, 2)],
                           hiddens=[256],
                           duelings=bool(args.dueling))
    elif args.model == "cnn_to_lstm":
        model = cnn_to_lstm(convs=[(16, 8, 4), (32, 4, 2)],
                            lstm_hidden_size=512,
                            lstm_out_size=256,
                            hiddens=[256, 128],
                            batch_size=int(args.batch_size),
                            duelings=bool(args.dueling))
    elif args.model == "cnn_to_lstm_new":
        model = cnn_to_lstm_new(convs=[(16, 8, 4), (32, 4, 2)],
                                lstm_hidden_size=512,
                                lstm_out_size=256,
                                hiddens=[256, 128],
                                batch_size=int(args.batch_size),
                                duelings=bool(args.dueling))
    elif args.model == "lstm_to_mlp":
        model = lstm_to_mlp(lstm_hidden_size=512,
                            lstm_out_size=256,
                            hiddens=[256, 128],
                            batch_size=int(args.batch_size),
                            duelings=bool(args.dueling))

    act = deepq.learn(
        env,
        q_func=model,
        lr=args.learning_rate,
        max_timesteps=args.num_timesteps,
        buffer_size=int(args.buffer_size),
        exploration_fraction=(args.exploration_steps / args.num_timesteps),
        exploration_final_eps=args.exploration_final_eps,
        train_freq=args.train_freq,
        batch_size=int(args.batch_size),
        learning_starts=int(args.learning_starts),
        target_network_update_freq=int(args.target_network_update_freq),
        gamma=args.gamma,
        prioritized_replay=bool(args.prioritized)
    )

    # record the run configuration alongside the logs
    f = open(os.path.join(log_dir, "README.me"), "w")
    f.write("\tenv\t{}\n".format(args.env))
    f.write("\tmodel\t{}\n".format(args.model))
    f.write("\tprioritized\t{}\n".format(args.prioritized))
    f.write("\tdueling\t{}\n".format(args.dueling))
    f.write("\tlearning rate\t{}\n".format(args.learning_rate))
    f.write("\tbatch size\t{}\n".format(args.batch_size))
    f.write("\tmax timestep\t{}\n".format(args.num_timesteps))
    f.write("\tbuffer size\t{}\n".format(args.buffer_size))
    f.write("\texploration fraction\t{}\n".format(
        args.exploration_steps / args.num_timesteps))
    f.write("\texploration_final_eps\t{}\n".format(args.exploration_final_eps))
    f.write("\ttrain freq\t{}\n".format(args.train_freq))
    f.write("\tlearning starts\t{}\n".format(args.learning_starts))
    f.write("\ttarget network update freq\t{}\n".format(
        args.target_network_update_freq))
    f.close()

    act.save("log/{}.pkl".format(args.model + "_" +
                                 args.env.replace(" ", "_")))