def __init__(self, abs_size, num_actions, num_abstract_states, gamma=0.99,
             learning_rate=0.000002, replay_start_size=500,
             epsilon_start=1.0, epsilon_end=0.01, epsilon_steps=1000000,
             update_freq=4, target_copy_freq=30000,
             replay_memory_size=1000000, frame_history=4, batch_size=32,
             error_clip=1, restore_network_file=None, double=True):
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True
    self.sess = tf.Session(config=config)

    self.inp_actions = tf.placeholder(tf.float32, [None, num_actions])
    inp_shape = [None, 84, 84, frame_history]
    inp_dtype = 'uint8'
    assert type(inp_dtype) is str
    self.inp_frames = tf.placeholder(inp_dtype, inp_shape)
    self.inp_sp_frames = tf.placeholder(inp_dtype, inp_shape)
    self.inp_terminated = tf.placeholder(tf.bool, [None])
    self.inp_reward = tf.placeholder(tf.float32, [None])
    self.inp_mask = tf.placeholder(inp_dtype, [None, frame_history])
    self.inp_sp_mask = tf.placeholder(inp_dtype, [None, frame_history])
    # self.inp_q_choices = tf.placeholder(tf.int32, [None])
    self.inp_abs_state_init = tf.placeholder(tf.float32, [None, abs_size])
    self.inp_abs_state_goal = tf.placeholder(tf.float32, [None, abs_size])
    self.abs_neighbors = dict()
    self.gamma = gamma

    with tf.variable_scope('online'):
        mask_shape = [-1, 1, 1, frame_history]
        mask = tf.reshape(self.inp_mask, mask_shape)
        masked_input = self.inp_frames * mask
        self.q_online = construct_dqn_with_embedding_2_layer(
            masked_input, self.inp_abs_state_init, self.inp_abs_state_goal,
            frame_history, num_actions)
    with tf.variable_scope('target'):
        mask_shape = [-1, 1, 1, frame_history]
        sp_mask = tf.reshape(self.inp_sp_mask, mask_shape)
        masked_sp_input = self.inp_sp_frames * sp_mask
        self.q_target = construct_dqn_with_embedding_2_layer(
            masked_sp_input, self.inp_abs_state_init, self.inp_abs_state_goal,
            frame_history, num_actions)

    if double:
        with tf.variable_scope('online', reuse=True):
            self.q_online_prime = construct_dqn_with_embedding_2_layer(
                masked_sp_input, self.inp_abs_state_init,
                self.inp_abs_state_goal, frame_history, num_actions)
        # Double DQN: choose the argmax action with the online network,
        # evaluate it with the target network.
        self.maxQ = tf.gather_nd(
            self.q_target,
            tf.transpose(
                [tf.range(0, batch_size, dtype=tf.int32),
                 tf.cast(tf.argmax(self.q_online_prime, axis=1), tf.int32)],
                [1, 0]))
    else:
        self.maxQ = tf.reduce_max(self.q_target, axis=1)

    self.r = self.inp_reward
    use_backup = tf.cast(tf.logical_not(self.inp_terminated),
                         dtype=tf.float32)
    self.y = self.r + use_backup * gamma * self.maxQ
    self.delta = tf.reduce_sum(self.inp_actions * self.q_online,
                               axis=1) - self.y
    # Huber-style error: quadratic within error_clip, linear outside.
    self.error = tf.where(tf.abs(self.delta) < error_clip,
                          0.5 * tf.square(self.delta),
                          error_clip * tf.abs(self.delta))
    self.loss = tf.reduce_sum(self.error)
    self.g = tf.gradients(self.loss, self.q_online)
    optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate,
                                          decay=0.95, centered=True,
                                          epsilon=0.01)
    self.train_op = optimizer.minimize(self.loss,
                                       var_list=th.get_vars('online'))
    self.copy_op = th.make_copy_op('online', 'target')
    self.saver = tf.train.Saver(var_list=th.get_vars('online'))

    self.replay_buffer = replay_memory.ReplayMemory(
        (84, 84), abs_size, 'uint8', replay_memory_size, frame_history)
    self.frame_history = frame_history
    self.replay_start_size = replay_start_size
    self.epsilon = dict()
    self.epsilon_min = epsilon_end
    self.epsilon_steps = epsilon_steps
    self.epsilon_delta = (epsilon_start - self.epsilon_min) / self.epsilon_steps
    self.update_freq = update_freq
    self.target_copy_freq = target_copy_freq
    self.action_ticker = 1
    self.num_actions = num_actions
    self.batch_size = batch_size

    self.sess.run(tf.global_variables_initializer())
    if restore_network_file is not None:
        self.saver.restore(self.sess, restore_network_file)
        print('Restored network from file')
    self.sess.run(self.copy_op)

    self.cts = dict()
    self.encoding_func = toy_mr_encoder.encode_toy_mr_state
    self.beta = 0.05
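# --- Illustrative sketch (not part of the original class) ---
# Given the CTS density models in self.cts and the mixing weight
# self.beta above, a pseudo-count exploration bonus is conventionally
# computed as beta / sqrt(n_hat) (Bellemare et al., 2016). The
# `prob`/`update` methods and `encoded_state` below are hypothetical
# stand-ins for whatever interface the CTS model actually exposes.
import numpy as np

def pseudo_count_bonus(cts_model, encoded_state, beta=0.05):
    p = cts_model.prob(encoded_state)        # density before the update
    cts_model.update(encoded_state)
    p_prime = cts_model.prob(encoded_state)  # recoding probability
    if p_prime <= p:
        return 0.0
    n_hat = p * (1.0 - p_prime) / (p_prime - p)  # pseudo-count
    return beta / np.sqrt(max(n_hat, 1e-8))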
def __init__(self, dqn, num_actions, gamma=0.99, learning_rate=0.00025,
             replay_start_size=50000, epsilon_start=1.0, epsilon_end=0.01,
             epsilon_steps=1000000, update_freq=4, target_copy_freq=30000,
             replay_memory_size=1000000, frame_history=4, batch_size=32,
             error_clip=1, restore_network_file=None, double=True,
             use_mmc=True, max_mmc_path_length=1000, mmc_beta=0.1,
             state_encoder=None, bonus_beta=0.05, cts_size=None):
    self.dqn = dqn
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    self.sess = tf.Session(config=config)

    self.inp_actions = tf.placeholder(tf.float32, [None, num_actions])
    self.max_mmc_path_length = max_mmc_path_length
    self.mmc_beta = mmc_beta
    inp_shape = [None] + list(self.dqn.get_input_shape()) + [frame_history]
    inp_dtype = self.dqn.get_input_dtype()
    assert type(inp_dtype) is str
    self.inp_frames = tf.placeholder(inp_dtype, inp_shape)
    self.inp_sp_frames = tf.placeholder(inp_dtype, inp_shape)
    self.inp_terminated = tf.placeholder(tf.bool, [None])
    self.inp_reward = tf.placeholder(tf.float32, [None])
    self.inp_mmc_reward = tf.placeholder(tf.float32, [None])
    self.inp_mask = tf.placeholder(inp_dtype, [None, frame_history])
    self.inp_sp_mask = tf.placeholder(inp_dtype, [None, frame_history])
    self.gamma = gamma

    with tf.variable_scope('online'):
        mask_shape = [-1] + [1] * len(self.dqn.get_input_shape()) + [frame_history]
        mask = tf.reshape(self.inp_mask, mask_shape)
        masked_input = self.inp_frames * mask
        self.q_online = self.dqn.construct_q_network(masked_input)
    with tf.variable_scope('target'):
        mask_shape = [-1] + [1] * len(self.dqn.get_input_shape()) + [frame_history]
        sp_mask = tf.reshape(self.inp_sp_mask, mask_shape)
        masked_sp_input = self.inp_sp_frames * sp_mask
        self.q_target = self.dqn.construct_q_network(masked_sp_input)

    if double:
        with tf.variable_scope('online', reuse=True):
            self.q_online_prime = self.dqn.construct_q_network(masked_sp_input)
        self.maxQ = tf.gather_nd(
            self.q_target,
            tf.transpose(
                [tf.range(0, batch_size, dtype=tf.int32),
                 tf.cast(tf.argmax(self.q_online_prime, axis=1), tf.int32)],
                [1, 0]))
    else:
        self.maxQ = tf.reduce_max(self.q_target, axis=1)

    self.r = self.inp_reward
    use_backup = tf.cast(tf.logical_not(self.inp_terminated),
                         dtype=tf.float32)
    self.y = self.r + use_backup * gamma * self.maxQ
    self.delta_dqn = tf.reduce_sum(self.inp_actions * self.q_online,
                                   axis=1) - self.y
    self.error_dqn = tf.where(tf.abs(self.delta_dqn) < error_clip,
                              0.5 * tf.square(self.delta_dqn),
                              error_clip * tf.abs(self.delta_dqn))
    if use_mmc:
        # Mixed Monte-Carlo update: also regress Q toward the discounted
        # Monte-Carlo return supplied in inp_mmc_reward.
        self.delta_mmc = tf.reduce_sum(self.inp_actions * self.q_online,
                                       axis=1) - self.inp_mmc_reward
        self.error_mmc = tf.where(tf.abs(self.delta_mmc) < error_clip,
                                  0.5 * tf.square(self.delta_mmc),
                                  error_clip * tf.abs(self.delta_mmc))
        # self.delta = (1. - self.mmc_beta) * self.delta_dqn + self.mmc_beta * self.delta_mmc
        self.loss = ((1. - self.mmc_beta) * tf.reduce_sum(self.error_dqn) +
                     self.mmc_beta * tf.reduce_sum(self.error_mmc))
    else:
        self.loss = tf.reduce_sum(self.error_dqn)

    self.g = tf.gradients(self.loss, self.q_online)
    optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate,
                                          decay=0.95, centered=True,
                                          epsilon=0.01)
    self.train_op = optimizer.minimize(self.loss,
                                       var_list=th.get_vars('online'))
    self.copy_op = th.make_copy_op('online', 'target')
    self.saver = tf.train.Saver(var_list=th.get_vars('online'))

    self.use_mmc = use_mmc
    self.replay_buffer = ReplayMemory(self.dqn.get_input_shape(),
                                      self.dqn.get_input_dtype(),
                                      replay_memory_size, frame_history)
    if self.use_mmc:
        self.mmc_tracker = MMCPathTracker(self.replay_buffer,
                                          self.max_mmc_path_length,
                                          self.gamma)
    self.frame_history = frame_history
    self.replay_start_size = replay_start_size
    self.epsilon = epsilon_start
    self.epsilon_min = epsilon_end
    self.epsilon_steps = epsilon_steps
    self.epsilon_delta = (self.epsilon - self.epsilon_min) / self.epsilon_steps
    self.update_freq = update_freq
    self.target_copy_freq = target_copy_freq
    self.action_ticker = 1
    self.num_actions = num_actions
    self.batch_size = batch_size

    self.sess.run(tf.global_variables_initializer())
    if restore_network_file is not None:
        self.saver.restore(self.sess, restore_network_file)
        print('Restored network from file')
    self.sess.run(self.copy_op)

    self.cts_size = cts_size
    # Guard against the default cts_size=None, which cannot be unpacked.
    self.cts = cpp_cts.CPP_CTS(*cts_size) if cts_size is not None else None
    self.encoding_func = state_encoder
    self.bonus_beta = bonus_beta
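# --- Illustrative sketch (an assumption, not MMCPathTracker's code) ---
# The mixed Monte-Carlo loss above regresses Q toward inp_mmc_reward,
# the discounted return of the transition's path. A tracker supplying
# that value would compute, per completed path:
def discounted_returns(rewards, gamma=0.99):
    """G_t = sum_k gamma**k * r_{t+k} for every step t of one path."""
    returns = [0.0] * len(rewards)
    g = 0.0
    for t in reversed(range(len(rewards))):
        g = rewards[t] + gamma * g
        returns[t] = g
    return returns

# e.g. discounted_returns([0, 0, 1], gamma=0.5) == [0.25, 0.5, 1.0]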
def __init__(self, abs_size, num_actions, num_abstract_states, gamma=0.99,
             learning_rate=0.00025, replay_start_size=500,
             epsilon_start=1.0, epsilon_end=0.01, epsilon_steps=1000000,
             update_freq=4, target_copy_freq=30000,
             replay_memory_size=1000000, frame_history=4, batch_size=32,
             error_clip=1, restore_network_file=None, double=True,
             use_mmc=True, max_mmc_path_length=1000, mmc_beta=0.5,
             max_dqn_number=300, rmax_learner=None):
    self.rmax_learner = rmax_learner
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True
    self.sess = tf.Session(config=config)

    self.inp_actions = tf.placeholder(tf.float32, [None, num_actions])
    self.max_mmc_path_length = max_mmc_path_length
    self.mmc_beta = mmc_beta
    inp_shape = [None, 84, 84, frame_history]
    inp_dtype = 'uint8'
    assert type(inp_dtype) is str
    self.inp_frames = tf.placeholder(inp_dtype, inp_shape)
    self.inp_sp_frames = tf.placeholder(inp_dtype, inp_shape)
    self.inp_terminated = tf.placeholder(tf.bool, [None])
    self.inp_reward = tf.placeholder(tf.float32, [None])
    self.inp_mmc_reward = tf.placeholder(tf.float32, [None])
    self.inp_mask = tf.placeholder(inp_dtype, [None, frame_history])
    self.inp_sp_mask = tf.placeholder(inp_dtype, [None, frame_history])
    self.inp_dqn_numbers = tf.placeholder(tf.int32, [None])
    # self.inp_q_choices = tf.placeholder(tf.int32, [None])
    self.inp_abs_state_init = tf.placeholder(tf.float32, [None, abs_size])
    self.inp_abs_state_goal = tf.placeholder(tf.float32, [None, abs_size])
    self.abs_neighbors = dict()
    self.gamma = gamma
    self.max_dqn_number = max_dqn_number

    # Alternative Q-network constructors, kept for reference:
    # q_constructor = lambda inp: construct_q_network_weights(inp, self.inp_dqn_numbers, max_dqn_number, frame_history, num_actions)
    # q_constructor = lambda inp: construct_small_network_weights(inp, self.inp_dqn_numbers, max_dqn_number, frame_history, num_actions)
    # q_constructor = lambda inp: construct_dqn_with_subgoal_embedding(inp, self.inp_abs_state_init, self.inp_abs_state_goal, frame_history, num_actions)
    # q_constructor = lambda inp: construct_meta_dqn_network(inp, self.inp_abs_state_init, self.inp_abs_state_goal, frame_history, num_actions)
    q_constructor = lambda inp: construct_dqn_with_embedding_2_layer(
        inp, self.inp_abs_state_init, self.inp_abs_state_goal,
        frame_history, num_actions)

    with tf.variable_scope('online'):
        mask_shape = [-1, 1, 1, frame_history]
        mask = tf.reshape(self.inp_mask, mask_shape)
        masked_input = self.inp_frames * mask
        self.q_online = q_constructor(masked_input)
    with tf.variable_scope('target'):
        mask_shape = [-1, 1, 1, frame_history]
        sp_mask = tf.reshape(self.inp_sp_mask, mask_shape)
        masked_sp_input = self.inp_sp_frames * sp_mask
        self.q_target = q_constructor(masked_sp_input)

    if double:
        with tf.variable_scope('online', reuse=True):
            self.q_online_prime = q_constructor(masked_sp_input)
        self.maxQ = tf.gather_nd(
            self.q_target,
            tf.transpose(
                [tf.range(0, batch_size, dtype=tf.int32),
                 tf.cast(tf.argmax(self.q_online_prime, axis=1), tf.int32)],
                [1, 0]))
    else:
        self.maxQ = tf.reduce_max(self.q_target, axis=1)

    # Clip rewards to {-1, 0, 1}.
    self.r = tf.sign(self.inp_reward)
    use_backup = tf.cast(tf.logical_not(self.inp_terminated),
                         dtype=tf.float32)
    self.y = self.r + use_backup * gamma * self.maxQ
    self.delta_dqn = tf.reduce_sum(self.inp_actions * self.q_online,
                                   axis=1) - self.y
    self.error_dqn = tf.where(tf.abs(self.delta_dqn) < error_clip,
                              0.5 * tf.square(self.delta_dqn),
                              error_clip * tf.abs(self.delta_dqn))
    if use_mmc:
        self.delta_mmc = tf.reduce_sum(self.inp_actions * self.q_online,
                                       axis=1) - self.inp_mmc_reward
        self.error_mmc = tf.where(tf.abs(self.delta_mmc) < error_clip,
                                  0.5 * tf.square(self.delta_mmc),
                                  error_clip * tf.abs(self.delta_mmc))
        # self.delta = (1. - self.mmc_beta) * self.delta_dqn + self.mmc_beta * self.delta_mmc
        self.loss = ((1. - self.mmc_beta) * tf.reduce_sum(self.error_dqn) +
                     self.mmc_beta * tf.reduce_sum(self.error_mmc))
    else:
        self.loss = tf.reduce_sum(self.error_dqn)

    self.g = tf.gradients(self.loss, self.q_online)
    optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate,
                                          decay=0.95, centered=True,
                                          epsilon=0.01)
    self.train_op = optimizer.minimize(self.loss,
                                       var_list=th.get_vars('online'))
    self.copy_op = th.make_copy_op('online', 'target')
    self.saver = tf.train.Saver(var_list=th.get_vars('online'))

    self.use_mmc = use_mmc
    self.replay_buffer = ReplayMemory((84, 84), abs_size, 'uint8',
                                      replay_memory_size, frame_history)
    if self.use_mmc:
        self.mmc_tracker = MMCPathTracker(self.replay_buffer,
                                          self.max_mmc_path_length,
                                          self.gamma)
    self.frame_history = frame_history
    self.replay_start_size = replay_start_size
    self.epsilon = dict()
    self.epsilon_min = epsilon_end
    self.epsilon_steps = epsilon_steps
    self.epsilon_delta = (epsilon_start - self.epsilon_min) / self.epsilon_steps
    self.update_freq = update_freq
    self.target_copy_freq = target_copy_freq
    self.action_ticker = 1
    self.num_actions = num_actions
    self.batch_size = batch_size

    self.sess.run(tf.global_variables_initializer())
    if restore_network_file is not None:
        self.saver.restore(self.sess, restore_network_file)
        print('Restored network from file')
    self.sess.run(self.copy_op)

    ####################
    # Keeping track of progress of actions
    self.samples_per_option = 50
    self.state_samples_for_option = dict()
    self.option_action_ticker = dict()
    self.progress_sample_frequency = 1000
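# --- Illustrative sketch ---
# self.epsilon is a dict rather than a scalar, suggesting one epsilon
# per option. The key structure (an (init, goal) abstract-state pair)
# and the helper below are assumptions for illustration only.
def option_epsilon(learner, option_key, epsilon_start=1.0):
    if option_key not in learner.epsilon:
        learner.epsilon[option_key] = epsilon_start
    eps = learner.epsilon[option_key]
    # Linear decay toward epsilon_min, matching epsilon_delta above.
    learner.epsilon[option_key] = max(learner.epsilon_min,
                                      eps - learner.epsilon_delta)
    return eps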
    with tf.variable_scope('fc2_sigma'):
        fc2 = th.fully_connected(z, 50, tf.nn.elu)
    with tf.variable_scope('dec_sigma'):
        sigma_x = th.fully_connected(fc2, 11 * 11, lambda x: x)
    return mu_x, sigma_x


encoding_size = 50
batch_size = 32

inp_image = tf.placeholder(tf.float32, [None, 84, 84, 1])
with tf.variable_scope('encoder'):
    mu_z, sigma_z = make_encoder(inp_image, encoding_size)
    # Reparameterization trick: z = mu + sigma * eps with eps ~ N(0, I).
    z = sigma_z * tf.random_normal([batch_size, encoding_size]) + mu_z
with tf.variable_scope('decoder'):
    mu_x, sigma_x = make_decoder(z)

z_variance = tf.sqrt(tf.reduce_sum(tf.square(mu_z), axis=1))
# Simplifies to -0.5 * sum(mu_z**2): the negative KL divergence to
# N(0, I) when the encoder variance is treated as fixed at 1.
term1 = 0.5 * tf.reduce_sum(1 - tf.square(mu_z) - 1, axis=[1])
k = 84 * 84
# Gaussian reconstruction log-likelihood, up to constants.
term2 = -tf.reduce_sum(0.5 * tf.square(inp_image - mu_x), [1, 2, 3])
loss = -tf.reduce_mean(term1 + term2, axis=0)
# loss = tf.reduce_mean(tf.square(inp_image - mu_x))
train_op = tf.train.AdamOptimizer(learning_rate=0.0001).minimize(loss)
saver = tf.train.Saver(var_list=th.get_vars('encoder', 'decoder'))

config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
sess.run(tf.global_variables_initializer())
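# --- Illustrative sketch ---
# A minimal training loop for the VAE graph above. The random batch is
# a stand-in for real 84x84 grayscale frames; the checkpoint path is
# hypothetical.
import numpy as np

for step in range(10000):
    batch = np.random.rand(batch_size, 84, 84, 1).astype(np.float32)
    _, l = sess.run([train_op, loss], feed_dict={inp_image: batch})
    if step % 1000 == 0:
        print(step, l)
        saver.save(sess, './vae_net.ckpt')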
def __init__(self, abs_size, num_actions, num_abstract_states, gamma=0.99,
             learning_rate=0.00025, replay_start_size=5000,
             epsilon_start=1.0, epsilon_end=0.01, epsilon_steps=1000000,
             update_freq=4, target_copy_freq=30000,
             replay_memory_size=1000000, frame_history=4, batch_size=32,
             error_clip=1, restore_network_file=None, double=True,
             use_mmc=True, max_mmc_path_length=1000, mmc_beta=0.1,
             max_dqn_number=300, rmax_learner=None, encoding_func=None,
             bonus_beta=0.05):
    self.rmax_learner = rmax_learner
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True
    self.sess = tf.Session(config=config)

    self.inp_actions = tf.placeholder(tf.float32, [None, num_actions])
    self.max_mmc_path_length = max_mmc_path_length
    self.mmc_beta = mmc_beta
    inp_shape = [None, 84, 84, frame_history]
    inp_dtype = 'uint8'
    assert type(inp_dtype) is str
    self.inp_frames = tf.placeholder(inp_dtype, inp_shape)
    self.inp_sp_frames = tf.placeholder(inp_dtype, inp_shape)
    self.inp_terminated = tf.placeholder(tf.bool, [None])
    self.inp_reward = tf.placeholder(tf.float32, [None])
    self.inp_mmc_reward = tf.placeholder(tf.float32, [None])
    self.inp_mask = tf.placeholder(inp_dtype, [None, frame_history])
    self.inp_sp_mask = tf.placeholder(inp_dtype, [None, frame_history])
    self.inp_dqn_numbers = tf.placeholder(tf.int32, [None])
    # self.inp_q_choices = tf.placeholder(tf.int32, [None])
    self.abs_neighbors = dict()
    self.gamma = gamma
    self.max_dqn_number = max_dqn_number

    # Alternative Q-network constructors, kept for reference:
    # q_constructor = lambda inp: construct_q_network_weights(inp, self.inp_dqn_numbers, max_dqn_number, frame_history, num_actions)
    # q_constructor = lambda inp: construct_small_network_weights(inp, self.inp_dqn_numbers, max_dqn_number, frame_history, num_actions)
    # q_constructor = lambda inp: construct_dqn_with_embedding_2_layer(inp, self.inp_abs_state_init, self.inp_abs_state_goal, frame_history, num_actions)
    # q_constructor = lambda inp: construct_dqn_with_subgoal_embedding(inp, self.inp_abs_state_init, self.inp_abs_state_goal, frame_history, num_actions)
    # q_constructor = lambda inp: construct_meta_dqn_network(inp, self.inp_abs_state_init, self.inp_abs_state_goal, frame_history, num_actions)
    q_constructor = lambda inp: construct_q_network_weights_only_final(
        inp, self.inp_dqn_numbers, max_dqn_number, frame_history,
        num_actions)

    with tf.variable_scope('online'):
        mask_shape = [-1, 1, 1, frame_history]
        mask = tf.reshape(self.inp_mask, mask_shape)
        masked_input = self.inp_frames * mask
        self.q_online, self.q_online_explore = q_constructor(masked_input)
    with tf.variable_scope('target'):
        mask_shape = [-1, 1, 1, frame_history]
        sp_mask = tf.reshape(self.inp_sp_mask, mask_shape)
        masked_sp_input = self.inp_sp_frames * sp_mask
        self.q_target, self.q_target_explore = q_constructor(masked_sp_input)
    if double:
        with tf.variable_scope('online', reuse=True):
            self.q_online_prime, self.q_online_prime_explore = q_constructor(
                masked_sp_input)
    else:
        self.q_online_prime = None
        self.q_online_prime_explore = None

    self.loss = construct_q_loss(self.q_online, self.q_target,
                                 self.inp_actions, self.inp_reward,
                                 self.inp_terminated, self.q_online_prime,
                                 batch_size, gamma, error_clip,
                                 self.inp_mmc_reward, mmc_beta)
    # if True:  # If using explore/exploit nets
    #     self.loss_explore = construct_q_loss(
    #         self.q_online_explore, self.q_target_explore, self.inp_actions,
    #         self.inp_reward_explore, self.inp_terminated,
    #         self.q_online_prime_explore, batch_size, gamma, error_clip,
    #         self.inp_mmc_reward_explore, mmc_beta)

    optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate,
                                          decay=0.95, centered=True,
                                          epsilon=0.01)
    self.pre_gvs = optimizer.compute_gradients(
        self.loss, var_list=th.get_vars('online'))
    # Replace NaN gradients with zeros, then clip to [-10, 10].
    self.pre_gvs = [(tf.where(tf.is_nan(grad), tf.zeros_like(grad), grad),
                     var) for grad, var in self.pre_gvs]
    self.post_gvs = [(tf.clip_by_value(grad, -10., 10.), var)
                     for grad, var in self.pre_gvs]
    self.train_op = optimizer.apply_gradients(self.post_gvs)
    self.copy_op = th.make_copy_op('online', 'target')
    self.saver = tf.train.Saver(var_list=th.get_vars('online'))

    self.use_mmc = use_mmc
    self.replay_buffer = ReplayMemory((84, 84), abs_size, 'uint8',
                                      replay_memory_size, frame_history)
    if self.use_mmc:
        self.mmc_tracker = MMCPathTrackerExplore(self.replay_buffer,
                                                 self.max_mmc_path_length,
                                                 self.gamma)
    self.frame_history = frame_history
    self.replay_start_size = replay_start_size
    self.epsilon = dict()
    self.global_epsilon = epsilon_start
    self.epsilon_min = epsilon_end
    self.epsilon_steps = epsilon_steps
    self.epsilon_delta = (epsilon_start - self.epsilon_min) / self.epsilon_steps
    self.update_freq = update_freq
    self.target_copy_freq = target_copy_freq
    self.action_ticker = 1
    self.num_actions = num_actions
    self.batch_size = batch_size

    self.check_op = tf.add_check_numerics_ops()
    self.sess.run(tf.global_variables_initializer())
    if restore_network_file is not None:
        self.saver.restore(self.sess, restore_network_file)
        print('Restored network from file')
    self.sess.run(self.copy_op)

    self.encoding_func = encoding_func
    self.bonus_beta = bonus_beta
    self.reward_mult = 1.  # (10 * self.bonus_beta) / (1 - gamma)
    self.n_hat_tracker = dict()

    ####################
    # Keeping track of progress of actions
    self.samples_per_option = 50
    self.state_samples_for_option = dict()
    self.option_action_ticker = dict()
    self.progress_sample_frequency = 1000
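# --- Illustrative sketch ---
# construct_q_loss is called above but not defined in this section.
# The following is a reconstruction from the inline loss code of the
# other learners (double-DQN target, Huber clipping, MMC mixing); the
# real helper may differ.
import tensorflow as tf

def construct_q_loss(q_online, q_target, inp_actions, inp_reward,
                     inp_terminated, q_online_prime, batch_size, gamma,
                     error_clip, inp_mmc_reward, mmc_beta):
    if q_online_prime is not None:
        # Double DQN: argmax from the online net, value from the target net.
        maxQ = tf.gather_nd(
            q_target,
            tf.transpose([tf.range(0, batch_size, dtype=tf.int32),
                          tf.cast(tf.argmax(q_online_prime, axis=1),
                                  tf.int32)], [1, 0]))
    else:
        maxQ = tf.reduce_max(q_target, axis=1)
    use_backup = tf.cast(tf.logical_not(inp_terminated), tf.float32)
    y = inp_reward + use_backup * gamma * maxQ
    q_a = tf.reduce_sum(inp_actions * q_online, axis=1)
    huber = lambda d: tf.where(tf.abs(d) < error_clip,
                               0.5 * tf.square(d),
                               error_clip * tf.abs(d))
    return ((1. - mmc_beta) * tf.reduce_sum(huber(q_a - y)) +
            mmc_beta * tf.reduce_sum(huber(q_a - inp_mmc_reward)))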
def __init__(self, dqn, num_actions, max_vae_loss_buffer_size=10000,
             variance_max=3.0, gamma=0.99, learning_rate=0.00025,
             replay_start_size=1000, epsilon_start=0.1, epsilon_end=0.1,
             epsilon_steps=1000000, update_freq=4, target_copy_freq=30000,
             replay_memory_size=1000000, frame_history=4, batch_size=32,
             error_clip=1, restore_network_file=None, double=True):
    self.dqn = dqn
    self.variance_max = variance_max
    self.max_vae_loss_buffer = deque(maxlen=max_vae_loss_buffer_size)
    self.min_vae_loss_buffer = deque(maxlen=max_vae_loss_buffer_size)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    self.sess = tf.Session(config=config)

    self.inp_actions = tf.placeholder(tf.float32, [None, num_actions])
    inp_shape = [None] + list(self.dqn.get_input_shape()) + [frame_history]
    inp_dtype = self.dqn.get_input_dtype()
    assert type(inp_dtype) is str
    self.inp_frames = tf.placeholder(inp_dtype, inp_shape)
    self.inp_sp_frames = tf.placeholder(inp_dtype, inp_shape)
    self.inp_terminated = tf.placeholder(tf.bool, [None])
    self.inp_reward = tf.placeholder(tf.float32, [None])
    self.inp_mask = tf.placeholder(inp_dtype, [None, frame_history])
    self.inp_sp_mask = tf.placeholder(inp_dtype, [None, frame_history])
    self.gamma = gamma

    with tf.variable_scope('online'):
        mask_shape = [-1] + [1] * len(self.dqn.get_input_shape()) + [frame_history]
        mask = tf.reshape(self.inp_mask, mask_shape)
        masked_input = self.inp_frames * mask
        self.q_online = self.dqn.construct_q_network(masked_input)
    with tf.variable_scope('target'):
        mask_shape = [-1] + [1] * len(self.dqn.get_input_shape()) + [frame_history]
        sp_mask = tf.reshape(self.inp_sp_mask, mask_shape)
        masked_sp_input = self.inp_sp_frames * sp_mask
        self.q_target = self.dqn.construct_q_network(masked_sp_input)

    if double:
        with tf.variable_scope('online', reuse=True):
            self.q_online_prime = self.dqn.construct_q_network(masked_sp_input)
        self.maxQ = tf.gather_nd(
            self.q_target,
            tf.transpose(
                [tf.range(0, batch_size, dtype=tf.int32),
                 tf.cast(tf.argmax(self.q_online_prime, axis=1), tf.int32)],
                [1, 0]))
    else:
        self.maxQ = tf.reduce_max(self.q_target, axis=1)

    self.r = self.inp_reward
    use_backup = tf.cast(tf.logical_not(self.inp_terminated),
                         dtype=tf.float32)
    self.y = self.r + use_backup * gamma * self.maxQ
    self.delta = tf.reduce_sum(self.inp_actions * self.q_online,
                               axis=1) - self.y
    self.error = tf.where(tf.abs(self.delta) < error_clip,
                          0.5 * tf.square(self.delta),
                          error_clip * tf.abs(self.delta))
    self.loss = tf.reduce_sum(self.error)
    self.g = tf.gradients(self.loss, self.q_online)
    optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate,
                                          decay=0.95, centered=True,
                                          epsilon=0.01)
    self.train_op = optimizer.minimize(self.loss,
                                       var_list=th.get_vars('online'))
    self.copy_op = th.make_copy_op('online', 'target')
    self.saver = tf.train.Saver(var_list=th.get_vars('online'))

    self.replay_buffer = ReplayMemory(self.dqn.get_input_shape(),
                                      self.dqn.get_input_dtype(),
                                      replay_memory_size, frame_history)
    self.frame_history = frame_history
    self.replay_start_size = replay_start_size
    self.epsilon = epsilon_start
    self.epsilon_min = epsilon_end
    self.epsilon_steps = epsilon_steps
    self.epsilon_delta = (self.epsilon - self.epsilon_min) / self.epsilon_steps
    self.update_freq = update_freq
    self.target_copy_freq = target_copy_freq
    self.action_ticker = 1
    self.num_actions = num_actions
    self.batch_size = batch_size

    self.sess.run(tf.global_variables_initializer())
    # Restore the pre-trained VAE weights before any optional DQN restore.
    vae_network.saver.restore(self.sess, '../vae_net.ckpt')
    if restore_network_file is not None:
        self.saver.restore(self.sess, restore_network_file)
        print('Restored network from file')
    self.sess.run(self.copy_op)
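# --- Illustrative sketch ---
# Epsilon-greedy action selection using the fields set in __init__;
# `q_values` would come from a sess.run of self.q_online on the
# current frame stack.
import numpy as np

def epsilon_greedy_action(learner, q_values):
    # Linear decay toward epsilon_min, matching epsilon_delta above.
    learner.epsilon = max(learner.epsilon_min,
                          learner.epsilon - learner.epsilon_delta)
    if np.random.rand() < learner.epsilon:
        return np.random.randint(learner.num_actions)
    return int(np.argmax(q_values))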
def __init__(self, num_abstract_states, num_actions, gamma=0.9,
             learning_rate=0.00025, replay_start_size=32,
             epsilon_start=1.0, epsilon_end=0.1, epsilon_steps=10000,
             replay_memory_size=100, frame_history=1, batch_size=32,
             error_clip=1, abstraction_function=None,
             base_network_file=None):
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    self.num_abstract_states = num_abstract_states
    self.num_abstract_actions = num_abstract_states * (num_abstract_states - 1)
    self.frame_history = frame_history
    self.abstraction_function = abstraction_function
    self.sess = tf.Session(config=config)

    self.inp_actions = tf.placeholder(tf.float32,
                                      [None, self.num_abstract_actions])
    inp_shape = [None, 84, 84, self.frame_history]
    inp_dtype = 'uint8'
    assert type(inp_dtype) is str
    self.inp_frames = tf.placeholder(inp_dtype, inp_shape)
    self.inp_sp_frames = tf.placeholder(inp_dtype, inp_shape)
    self.inp_terminated = tf.placeholder(tf.bool, [None])
    self.inp_reward = tf.placeholder(tf.float32, [None])
    self.inp_mask = tf.placeholder(inp_dtype, [None, frame_history])

    # Convert the abstract-state indices sigma and sigma' to one-hot rows.
    self.inp_sigma = tf.placeholder(tf.uint8, [None])
    self.inp_sigma_onehot = tf.cast(
        tf.sparse_to_dense(
            tf.concat([tf.expand_dims(tf.range(0, batch_size), -1),
                       tf.expand_dims(tf.cast(self.inp_sigma, tf.int32), -1)],
                      axis=1),
            [batch_size, self.num_abstract_states], 1), tf.float32)
    self.inp_sigma_p = tf.placeholder(tf.uint8, [None])
    self.inp_sigma_p_onehot = tf.cast(
        tf.sparse_to_dense(
            tf.concat([tf.expand_dims(tf.range(0, batch_size), -1),
                       tf.expand_dims(tf.cast(self.inp_sigma_p, tf.int32), -1)],
                      axis=1),
            [batch_size, self.num_abstract_states], 1), tf.float32)
    self.inp_sp_mask = tf.placeholder(inp_dtype, [None, frame_history])
    self.gamma = gamma

    # actions_for_sigma[i, a] = 1 iff abstract action a starts in
    # abstract state i.
    self.actions_for_sigma = np.zeros(
        (self.num_abstract_states, self.num_abstract_actions),
        dtype=np.float32)
    for a in range(self.num_abstract_actions):
        i, j = flat_actions_to_state_pairs(a, num_abstract_states)
        self.actions_for_sigma[i, a] = 1

    self.visual_scope = 'visual'
    self.abstraction_scope = 'abstraction'
    with tf.variable_scope(self.visual_scope):
        # mask stuff here
        mask = tf.reshape(self.inp_mask, [-1, 1, 1, 1])
        masked_input = self.inp_frames * mask
        self.visual_output = hook_visual(masked_input, self.frame_history)
    with tf.variable_scope(self.abstraction_scope):
        self.sigma, self.sigma_probs = hook_abstraction(
            self.visual_output, self.num_abstract_states, batch_size,
            I=self.inp_sigma_onehot)
    with tf.variable_scope(self.abstraction_scope, reuse=True):
        # the one that samples
        self.sigma_query, self.sigma_query_probs = hook_abstraction(
            self.visual_output, self.num_abstract_states, 1)
    with tf.variable_scope(self.visual_scope, reuse=True):
        mask_sp = tf.reshape(self.inp_sp_mask, [-1, 1, 1, 1])
        masked_input_sp = self.inp_sp_frames * mask_sp
        self.visual_output_sp = hook_visual(masked_input_sp,
                                            self.frame_history)
    with tf.variable_scope(self.abstraction_scope, reuse=True):
        self.sigma_p, self.sigma_p_probs = hook_abstraction(
            self.visual_output_sp, self.num_abstract_states, batch_size,
            I=self.inp_sigma_p_onehot)

    self.possible_action_vector = tf.stop_gradient(
        valid_actions_for_sigma(self.actions_for_sigma, self.sigma,
                                self.num_abstract_actions))
    with tf.variable_scope('l1_online'):
        self.q_online = hook_l1(self.sigma, self.num_abstract_actions)
    with tf.variable_scope('l1_online', reuse=True):
        # Mask out invalid actions with -inf; -inf * 0 yields NaN for the
        # valid entries, so replace NaNs with zeros.
        self.possible_action_vector_query = -np.inf * (
            1 - valid_actions_for_sigma(self.actions_for_sigma,
                                        self.sigma_query,
                                        self.num_abstract_actions))
        self.possible_action_vector_query = tf.where(
            tf.is_nan(self.possible_action_vector_query),
            tf.zeros_like(self.possible_action_vector_query),
            self.possible_action_vector_query)
        self.q_online_query = self.possible_action_vector_query + hook_l1(
            self.sigma_query, self.num_abstract_actions)
    with tf.variable_scope('l1_online', reuse=True):
        self.possible_action_vector_prime = -np.inf * (
            1 - valid_actions_for_sigma(self.actions_for_sigma, self.sigma_p,
                                        self.num_abstract_actions))
        self.possible_action_vector_prime = tf.where(
            tf.is_nan(self.possible_action_vector_prime),
            tf.zeros_like(self.possible_action_vector_prime),
            self.possible_action_vector_prime)
        self.q_target = self.possible_action_vector_prime + hook_l1(
            self.sigma_p, self.num_abstract_actions)

    self.maxQ = tf.reduce_max(self.q_target, axis=1)
    self.r = tf.sign(self.inp_reward)
    use_backup = tf.cast(tf.logical_not(self.inp_terminated),
                         dtype=tf.float32)
    self.y = tf.stop_gradient(self.r + use_backup * gamma * self.maxQ)
    self.delta = tf.reduce_sum(self.inp_actions * self.q_online,
                               axis=1) - self.y
    self.error = tf.where(tf.abs(self.delta) < error_clip,
                          0.5 * tf.square(self.delta),
                          error_clip * tf.abs(self.delta))
    self.loss = tf.reduce_sum(self.error)
    optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate,
                                          decay=0.95, centered=True,
                                          epsilon=0.01)
    # TODO: add th.get_vars(self.visual_scope) + th.get_vars(self.abstraction_scope)
    if self.abstraction_function is None:
        self.train_op = optimizer.minimize(
            self.loss, var_list=th.get_vars('l1_online',
                                            self.abstraction_scope,
                                            self.visual_scope))
    else:
        self.train_op = optimizer.minimize(
            self.loss, var_list=th.get_vars('l1_online'))
    self.saver = tf.train.Saver(
        var_list=th.get_vars(self.visual_scope) +
        th.get_vars(self.abstraction_scope) + th.get_vars('l1_online') +
        th.get_vars('online'))

    self.replay_buffer = L1ReplayMemory((84, 84), np.uint8,
                                        replay_memory_size, 1)
    self.frame_history = frame_history
    self.replay_start_size = replay_start_size
    self.epsilon = epsilon_start
    self.epsilon_min = epsilon_end
    self.epsilon_steps = epsilon_steps
    self.epsilon_delta = (self.epsilon - self.epsilon_min) / self.epsilon_steps
    self.action_ticker = 1
    self.num_actions = num_actions
    self.batch_size = batch_size

    self.l0_learner = L0_Learner(
        self.sess, self.abstraction_scope, self.visual_scope, num_actions,
        self.num_abstract_actions, self.num_abstract_states,
        abstraction_function=self.abstraction_function,
        max_episode_steps=20, base_network_file=base_network_file)

    self.sess.run(tf.global_variables_initializer())
    if base_network_file is not None:
        self.l0_learner.base_network_saver.restore(self.sess,
                                                   base_network_file)
        print('Restored network from file')
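# --- Illustrative sketch ---
# flat_actions_to_state_pairs is used above but not defined in this
# section. With n abstract states there are n * (n - 1) abstract
# actions, one per ordered pair (i, j) with i != j; the row-major
# enumeration below is an assumption about the ordering.
def flat_actions_to_state_pairs(flat_index, num_states):
    pairs = [(i, j) for i in range(num_states)
             for j in range(num_states) if i != j]
    return pairs[flat_index]

# e.g. with num_states=3 the actions enumerate
# [(0, 1), (0, 2), (1, 0), (1, 2), (2, 0), (2, 1)]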
def __init__(self, sess, abstraction_scope, visual_scope, num_actions,
             num_abstract_actions, num_abstract_states, gamma=0.99,
             learning_rate=0.00025, replay_start_size=5000,
             epsilon_start=1.0, epsilon_end=0.1, epsilon_steps=1000000,
             update_freq=4, target_copy_freq=10000,
             replay_memory_size=1000000, frame_history=1, batch_size=32,
             error_clip=1, abstraction_function=None, max_episode_steps=-1,
             base_network_file=None):
    self.sess = sess
    self.num_abstract_actions = num_abstract_actions
    self.num_abstract_states = num_abstract_states
    self.num_actions = num_actions
    self.batch_size = batch_size
    self.gamma = gamma
    self.frame_history = frame_history
    self.abstraction_scope = abstraction_scope
    self.abstraction_function = abstraction_function

    self.inp_frames = tf.placeholder(tf.uint8,
                                     [None, 84, 84, self.frame_history])
    self.inp_sp_frames = tf.placeholder(tf.uint8,
                                        [None, 84, 84, self.frame_history])
    self.inp_terminated = tf.placeholder(tf.bool, [None])
    self.inp_reward = tf.placeholder(tf.float32, [None])
    self.inp_mask = tf.placeholder(tf.uint8, [None, frame_history])
    self.inp_sp_mask = tf.placeholder(tf.uint8, [None, frame_history])
    self.inp_actions = tf.placeholder(tf.float32, [None, num_actions])  # onehot vector
    # self.inp_sigma = tf.placeholder(tf.float32, [None, self.num_abstract_states])

    # reward_matrix[i, j, a]: reward for transitioning from abstract
    # state i to abstract state j under abstract action a.
    self.reward_matrix = -np.ones(
        (num_abstract_states, num_abstract_states, num_abstract_actions),
        dtype=np.float32)
    # make self transitions 0
    for i in range(num_abstract_states):
        self.reward_matrix[i, i, :] = 0
    # make goal transitions have reward 1
    for a in range(num_abstract_actions):
        i, j = flat_actions_to_state_pairs(a, num_abstract_states)
        self.reward_matrix[i, j, a] = 1

    self.actions_for_sigma = np.zeros(
        (num_abstract_states, num_abstract_actions), dtype=np.float32)
    for a in range(num_abstract_actions):
        i, j = flat_actions_to_state_pairs(a, num_abstract_states)
        self.actions_for_sigma[i, a] = 1

    # mask stuff here
    mask = tf.reshape(self.inp_mask, [-1, 1, 1, 1])
    masked_input = self.inp_frames * mask
    l0_vis_scope = 'l0_vis'
    with tf.variable_scope(l0_vis_scope):
        self.visual_output_base = hook_visual(masked_input,
                                              self.frame_history)
        self.visual_output = tf.stop_gradient(self.visual_output_base)
    with tf.variable_scope('online_base'):
        self.q_online_base = hook_base(self.visual_output_base,
                                       self.num_actions)
    with tf.variable_scope('online_1'):
        self.q_online_1 = hook_l0(self.visual_output, 1, self.num_actions)
    with tf.variable_scope('online_2'):
        self.q_online_2 = hook_l0(self.visual_output, 1, self.num_actions)
    self.q_online = tf.concat([self.q_online_1, self.q_online_2], axis=1)

    mask_sp = tf.reshape(self.inp_sp_mask, [-1, 1, 1, 1])
    masked_input_sp = self.inp_sp_frames * mask_sp
    l0_target_vis_scope = 'l0_target_vis'
    with tf.variable_scope(l0_target_vis_scope):
        self.visual_output_sp = hook_visual(masked_input_sp,
                                            self.frame_history)
    with tf.variable_scope('target_base'):
        self.q_target_base = hook_base(self.visual_output_sp,
                                       self.num_actions)
    with tf.variable_scope('target_1'):
        self.q_target_1 = hook_l0(self.visual_output_sp, 1,
                                  self.num_actions)
    with tf.variable_scope('target_2'):
        self.q_target_2 = hook_l0(self.visual_output_sp, 1,
                                  self.num_actions)
    self.q_target = tf.concat([self.q_target_1, self.q_target_2], axis=1)

    # with tf.variable_scope(visual_scope, reuse=True):
    #     # mask stuff here
    #     mask = tf.reshape(self.inp_mask, [-1, 1, 1, 1])
    #     masked_input = self.inp_frames * mask
    #     self.visual_output = hook_visual(masked_input, self.frame_history)
    #
    #     mask_sp = tf.reshape(self.inp_sp_mask, [-1, 1, 1, 1])
    #     masked_input_sp = self.inp_sp_frames * mask_sp
    #     self.visual_output_sp = hook_visual(masked_input_sp, self.frame_history)
    #
    # with tf.variable_scope('online'):
    #     self.q_online = hook_l0(self.visual_output, self.num_abstract_actions, self.num_actions)
    # with tf.variable_scope('target'):
    #     self.q_target = hook_l0(self.visual_output_sp, self.num_abstract_actions, self.num_actions)

    # TODO: set up double DQN for later experiments.
    # Q matrix is (num_abstract_actions, num_actions); the max over the
    # last axis gives the max-Q for each abstract action.
    self.maxQ = tf.reduce_max(self.q_target, axis=2)

    with tf.variable_scope(visual_scope, reuse=True):
        self.l1_visual_output = hook_visual(masked_input,
                                            self.frame_history)
        self.l1_visual_output_sp = hook_visual(masked_input_sp,
                                               self.frame_history)
    with tf.variable_scope(self.abstraction_scope, reuse=True):
        self.sigma = tf.stop_gradient(
            hook_abstraction(self.l1_visual_output, num_abstract_states,
                             batch_size)[0])
        self.sigma_p = tf.stop_gradient(
            hook_abstraction(self.l1_visual_output_sp, num_abstract_states,
                             batch_size)[0])
        self.sigma_query, self.sigma_query_probs = hook_abstraction(
            self.l1_visual_output, self.num_abstract_states, 1)

    self.r = tf.reduce_sum(
        tf.reshape(self.sigma_p, [-1, 1, num_abstract_states, 1]) *
        tf.reshape(self.sigma, [-1, num_abstract_states, 1, 1]) *
        tf.reshape(self.reward_matrix,
                   [1, num_abstract_states, num_abstract_states,
                    num_abstract_actions]),
        axis=[1, 2])
    # Give a reward of -1 if reached a terminal state
    self.r = (self.r * tf.reshape(
        tf.cast(tf.logical_not(self.inp_terminated), dtype=tf.float32),
        [-1, 1])) + tf.reshape(
            tf.cast(self.inp_terminated, dtype=tf.float32) * -1, [-1, 1])

    self.use_backup = tf.cast(tf.logical_not(self.inp_terminated),
                              dtype=tf.float32) * tf.reduce_sum(
                                  self.sigma_p * self.sigma, axis=1)
    self.y = tf.stop_gradient(
        self.r + tf.reshape(self.use_backup, [-1, 1]) * gamma * self.maxQ)
    self.delta = tf.reduce_sum(
        tf.reshape(self.inp_actions, [-1, 1, num_actions]) * self.q_online,
        axis=2) - self.y
    valid_actions_mask = valid_actions_for_sigma(self.actions_for_sigma,
                                                 self.sigma,
                                                 self.num_abstract_actions)
    self.masked_delta = self.delta * valid_actions_mask
    self.error = tf.where(tf.abs(self.masked_delta) < error_clip,
                          0.5 * tf.square(self.masked_delta),
                          error_clip * tf.abs(self.masked_delta))

    # base dqn
    self.maxQ_base = tf.reduce_max(self.q_target_base, axis=1)
    self.r_base = tf.sign(self.inp_reward)
    use_backup_base = tf.cast(tf.logical_not(self.inp_terminated),
                              dtype=tf.float32)
    self.y_base = tf.stop_gradient(
        self.r_base + use_backup_base * gamma * self.maxQ_base)
    self.delta_base = tf.reduce_sum(self.inp_actions * self.q_online_base,
                                    axis=1) - self.y_base
    self.error_base = tf.where(tf.abs(self.delta_base) < error_clip,
                               0.5 * tf.square(self.delta_base),
                               error_clip * tf.abs(self.delta_base))

    self.loss = tf.reduce_sum(self.error) + tf.reduce_sum(self.error_base)
    self.g = tf.gradients(self.loss, self.q_online)
    optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate,
                                          decay=0.95, centered=True,
                                          epsilon=0.01)
    self.train_op = optimizer.minimize(
        self.loss, var_list=th.get_vars('online_1', 'online_2',
                                        'online_base', l0_vis_scope))
    self.copy_op = [
        th.make_copy_op('online_1', 'target_1'),
        th.make_copy_op('online_2', 'target_2'),
        th.make_copy_op(l0_vis_scope, l0_target_vis_scope),
        th.make_copy_op('online_base', 'target_base')
    ]

    self.replay_buffer = L1ReplayMemory((84, 84), 'uint8',
                                        replay_memory_size, frame_history)
    self.frame_history = frame_history
    self.replay_start_size = replay_start_size
    self.epsilon = epsilon_start
    self.epsilon_min = epsilon_end
    self.epsilon_steps = epsilon_steps
    self.epsilon_delta = (self.epsilon - self.epsilon_min) / self.epsilon_steps
    self.update_freq = update_freq
    self.target_copy_freq = target_copy_freq
    self.action_ticker = 1
    self.max_episode_steps = max_episode_steps
    self.num_actions = num_actions
    self.batch_size = batch_size
    self.base_network_saver = tf.train.Saver(
        var_list=th.get_vars('online_base', l0_vis_scope))
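# --- Illustrative sketch ---
# valid_actions_for_sigma is referenced throughout but not defined in
# this section. Since actions_for_sigma[i, a] = 1 exactly when abstract
# action a starts in abstract state i, a batch of one-hot sigma rows
# can be mapped to its per-sample action mask with a matrix product;
# this is an assumed reconstruction, not the original helper.
import tensorflow as tf

def valid_actions_for_sigma(actions_for_sigma, sigma, num_abstract_actions):
    # sigma: [batch, num_abstract_states] one-hot rows.
    # returns: [batch, num_abstract_actions] 0/1 validity mask.
    return tf.matmul(sigma, actions_for_sigma)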