def __init__(self, rom_path=_default_rom_path, frame_skip=4, history_length=4,
             resize_mode='scale', resized_rows=84, resized_cols=84, crop_offset=8,
             display_screen=False, max_null_op=30,
             replay_memory_size=1000000, replay_start_size=100,
             death_end_episode=True):
    super(AtariGame, self).__init__()
    self.rng = get_numpy_rng()
    self.ale = ale_load_from_rom(rom_path=rom_path, display_screen=display_screen)
    self.start_lives = self.ale.lives()
    self.action_set = self.ale.getMinimalActionSet()
    self.resize_mode = resize_mode
    self.resized_rows = resized_rows
    self.resized_cols = resized_cols
    self.crop_offset = crop_offset
    self.frame_skip = frame_skip
    self.history_length = history_length
    self.max_null_op = max_null_op
    self.death_end_episode = death_end_episode
    self.screen_buffer_length = 2
    self.screen_buffer = numpy.empty((self.screen_buffer_length,
                                      self.ale.getScreenDims()[1],
                                      self.ale.getScreenDims()[0]),
                                     dtype='uint8')
    self.replay_memory = ReplayMemory(state_dim=(resized_rows, resized_cols),
                                      history_length=history_length,
                                      memory_size=replay_memory_size,
                                      replay_start_size=replay_start_size)
    self.start()
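For context alongside the snippet above, a minimal, hedged sketch of the kind of frame-based replay memory such an Atari wrapper assumes; the class name, `append` signature, and internals are illustrative only, not the actual ReplayMemory used above.

# Hedged sketch: a ring-buffer replay memory for resized screens. The
# constructor mirrors the keyword arguments used above (state_dim,
# history_length, memory_size, replay_start_size); everything else is assumed.
import numpy as np

class SimpleReplayMemory(object):
    def __init__(self, state_dim=(84, 84), history_length=4,
                 memory_size=1000000, replay_start_size=100):
        self.memory_size = memory_size
        self.history_length = history_length
        self.replay_start_size = replay_start_size
        self.frames = np.zeros((memory_size,) + state_dim, dtype=np.uint8)
        self.actions = np.zeros(memory_size, dtype=np.uint8)
        self.rewards = np.zeros(memory_size, dtype=np.float32)
        self.terminals = np.zeros(memory_size, dtype=np.bool_)
        self.top = 0   # next write position
        self.size = 0  # number of stored transitions

    def append(self, frame, action, reward, terminal):
        # overwrite the oldest slot once the buffer is full
        self.frames[self.top] = frame
        self.actions[self.top] = action
        self.rewards[self.top] = reward
        self.terminals[self.top] = terminal
        self.top = (self.top + 1) % self.memory_size
        self.size = min(self.size + 1, self.memory_size)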
def __init__(self, args):
    '''Constructor'''
    self.WARM_UP = 0
    self.QUALIFYING = 1
    self.RACE = 2
    self.UNKNOWN = 3
    self.stage = args.stage

    self.parser = msgParser.MsgParser()
    self.state = carState.CarState()
    self.control = carControl.CarControl()

    self.steers = [-1.0, -0.8, -0.6, -0.5, -0.4, -0.3, -0.2, -0.15, -0.1, -0.05,
                   0.0, 0.05, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.6, 0.8, 1.0]
    self.speeds = [-1.0, -0.5, 0.0, 0.5, 1.0]
    self.num_inputs = 19
    self.num_steers = len(self.steers)
    self.num_speeds = len(self.speeds)
    self.num_actions = self.num_steers + self.num_speeds

    self.net = DeepQNetwork(self.num_inputs, self.num_steers, self.num_speeds, args)
    self.mem = ReplayMemory(args.replay_size, self.num_inputs, args)
    self.minibatch_size = args.batch_size

    if args.load_replay:
        self.mem.load(args.load_replay)
    if args.load_weights:
        self.net.load_weights(args.load_weights)
    self.save_weights_prefix = args.save_weights_prefix
    self.save_interval = args.save_interval
    self.save_replay = args.save_replay

    self.enable_training = args.enable_training
    self.enable_exploration = args.enable_exploration

    self.save_csv = args.save_csv
    if self.save_csv:
        self.csv_file = open(args.save_csv, "wb")
        self.csv_writer = csv.writer(self.csv_file)
        self.csv_writer.writerow(['episode', 'distFormStart', 'distRaced', 'curLapTime',
                                  'lastLapTime', 'racePos', 'epsilon', 'replay_memory',
                                  'train_steps'])

    self.total_train_steps = 0
    self.exploration_decay_steps = args.exploration_decay_steps
    self.exploration_rate_start = args.exploration_rate_start
    self.exploration_rate_end = args.exploration_rate_end
    self.skip = args.skip
    self.show_sensors = args.show_sensors
    self.show_qvalues = args.show_qvalues

    self.episode = 0
    self.distances = []
    self.onRestart()

    if self.show_sensors:
        from sensorstats import Stats
        self.stats = Stats(inevery=8)
    if self.show_qvalues:
        from plotq import PlotQ
        self.plotq = PlotQ(self.num_steers, self.num_speeds)
def test_get_minibatch(self):
    replay_memory = ReplayMemory(None, self.use_gpu_replay_mem, self.max_replay_memory,
                                 self.train_batch_size, self.screen_history,
                                 self.screen_width, self.screen_height,
                                 self.minibatch_random, self.screen_order)
    for i in range(255):
        screen = np.zeros((self.screen_height, self.screen_width))
        screen.fill(i + 1)
        replay_memory.add(i + 1, 10 * (i + 1), screen, False)
        if i > self.train_batch_size + self.screen_history:
            prestates, actions, rewards, poststates, terminals = replay_memory.get_minibatch()
            for b in range(self.train_batch_size - 1):
                for h in range(self.screen_history - 1):
                    self.assertTrue(prestates[b + 1, 0, 0, h] < prestates[b, 0, 0, h])
                    self.assertTrue(prestates[b, 0, 0, h + 1] > prestates[b, 0, 0, h])
class TestBinaryHeap(unittest.TestCase):
    def setUp(self):
        self.heap = BinaryHeap()
        self.replayMemory = ReplayMemory(10, 32, 4, 84, 84)

    def test_Add(self):
        totalNo = 10
        for i in range(totalNo):
            state = np.zeros((84, 84), dtype=np.int)
            state.fill(i)
            td = i
            addedIndex = self.replayMemory.add(0, 0, state, 0)
            self.heap.add(addedIndex, td)

        for i in range(totalNo):
            topItem = self.heap.getTop()
            self.assertEqual(totalNo - i - 1, topItem[0])
            self.heap.remove(0)
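For orientation, a small hedged sketch of the behaviour the test above exercises: a max-heap keyed by TD error with `add`/`getTop`/`remove`. The real BinaryHeap used for prioritized replay also supports rebalancing and rank-based sampling; this is illustrative only.

# Hedged sketch of a TD-error max-heap with the add/getTop/remove interface
# used in the test above. Implementation details are assumptions.
import heapq

class TinyMaxHeap(object):
    def __init__(self):
        self._items = []  # entries are (-td, replay_index) so heapq acts as a max-heap

    def add(self, replay_index, td):
        heapq.heappush(self._items, (-td, replay_index))

    def getTop(self):
        neg_td, replay_index = self._items[0]
        return (-neg_td, replay_index)  # (priority, replay index), largest TD first

    def remove(self, _index):
        heapq.heappop(self._items)  # drop the current top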
def _train_minibatch(self, minibatch_size):
    if self.replay_memory.size() < minibatch_size:
        return

    # Sample a minibatch from replay memory
    non_terminal_minibatch, terminal_minibatch = \
        self.replay_memory.get_minibatch(minibatch_size)
    non_terminal_minibatch, terminal_minibatch = \
        list(non_terminal_minibatch), list(terminal_minibatch)

    # Compute max q-values for the non-terminal next states based
    # on the target network
    next_states = list(ReplayMemory.get_next_states(non_terminal_minibatch))
    q_values = self._predict_q_values(next_states, use_target_network=True)
    max_q_values = q_values.max(axis=1)

    # Gradient descent
    feed_dict = self._get_minibatch_feed_dict(
        max_q_values,
        non_terminal_minibatch,
        terminal_minibatch,
    )
    if self._should_log_summary():
        _, summary = self.session.run(
            [self.network.train_op, self.network.summary_op],
            feed_dict=feed_dict,
        )
        self.summary_writer.add_summary(summary, self.training_steps)
    else:
        self.session.run(self.network.train_op, feed_dict=feed_dict)
    self.training_steps += 1

    # Update the target network if needed
    self._update_target_network()
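The feed-dict builder above is not shown; a hedged sketch of the Q-learning target construction it implies follows. Terminal transitions keep the raw reward, non-terminal ones bootstrap with the target network's maximum; the transition field names and discount factor are assumptions.

# Hedged sketch of the implied target computation, not the actual
# _get_minibatch_feed_dict. `transition.reward`/`.action` and gamma are assumed.
import numpy as np

def build_q_targets(non_terminal_minibatch, terminal_minibatch, max_q_values, gamma=0.99):
    targets, actions = [], []
    # Non-terminal: r + gamma * max_a' Q_target(s', a')
    for transition, max_q in zip(non_terminal_minibatch, max_q_values):
        targets.append(transition.reward + gamma * max_q)
        actions.append(transition.action)
    # Terminal: just r
    for transition in terminal_minibatch:
        targets.append(transition.reward)
        actions.append(transition.action)
    return np.asarray(targets, dtype=np.float32), np.asarray(actions)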
if __name__ == "__main__":
    # MODEL = importlib.import_module(FLAGS.model_file)  # import network module
    # MODEL_FILE = os.path.join(BASE_DIR, 'models', FLAGS.model_file+'.py')

    ####### log writing
    FLAGS.LOG_DIR = FLAGS.LOG_DIR + '/' + FLAGS.task_name
    # FLAGS.CHECKPOINT_DIR = os.path.join(FLAGS.CHECKPOINT_DIR, FLAGS.task_name)
    # tf_util.mkdir(FLAGS.CHECKPOINT_DIR)

    if not FLAGS.is_training:
        agent = ActiveMVnet(FLAGS)
        senv = ShapeNetEnv(FLAGS)
        if FLAGS.pretrain_restore:
            restore_pretrain(agent)
        else:
            restore_from_iter(agent, FLAGS.test_iter)
        replay_mem = ReplayMemory(FLAGS)
        rollout_obj = Rollout(agent, senv, replay_mem, FLAGS)
        if FLAGS.test_random:
            test_random(agent, FLAGS.test_episode_num, replay_mem, FLAGS.test_iter, rollout_obj)
        elif FLAGS.test_oneway:
            test_oneway(agent, FLAGS.test_episode_num, replay_mem, FLAGS.test_iter, rollout_obj)
        else:
            test_active(agent, FLAGS.test_episode_num, replay_mem, FLAGS.test_iter, rollout_obj)
        sys.exit()
def __init__(self, args):
    '''Constructor'''
    self.WARM_UP = 0
    self.QUALIFYING = 1
    self.RACE = 2
    self.UNKNOWN = 3
    self.stage = args.stage

    self.parser = msgParser.MsgParser()
    self.state = carState.CarState()
    self.control = carControl.CarControl()

    self.steers = [-1.0, -0.8, -0.6, -0.5, -0.4, -0.3, -0.2, -0.15, -0.1, -0.05,
                   0.0, 0.05, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.6, 0.8, 1.0]
    self.speeds = [-1.0, -0.5, 0.0, 0.5, 1.0]
    self.num_inputs = 19
    self.num_steers = len(self.steers)
    self.num_speeds = len(self.speeds)
    self.num_actions = self.num_steers + self.num_speeds

    self.net = DeepQNetwork(self.num_inputs, self.num_steers, self.num_speeds, args)
    self.mem = ReplayMemory(args.replay_size, self.num_inputs, args)
    self.minibatch_size = args.batch_size

    if args.load_weights:
        self.net.load_weights(args.load_weights)
    self.save_weights_prefix = args.save_weights_prefix
    self.pretrained_network = args.pretrained_network

    self.steer_lock = 0.785398
    self.max_speed = 100

    self.algorithm = args.algorithm
    self.device = args.device
    self.mode = args.mode
    self.maxwheelsteps = args.maxwheelsteps

    self.enable_training = args.enable_training
    self.enable_exploration = args.enable_exploration

    self.total_train_steps = 0
    self.exploration_decay_steps = args.exploration_decay_steps
    self.exploration_rate_start = args.exploration_rate_start
    self.exploration_rate_end = args.exploration_rate_end

    self.show_sensors = args.show_sensors
    self.show_qvalues = args.show_qvalues

    self.episode = 0
    self.onRestart()

    if self.show_sensors:
        from sensorstats import Stats
        self.stats = Stats(inevery=8)
    if self.show_qvalues:
        from plotq import PlotQ
        self.plotq = PlotQ(self.num_steers, self.num_speeds)
    if self.device == 'wheel':
        from wheel import Wheel
        self.wheel = Wheel(args.joystick_nr, args.autocenter, args.gain,
                           args.min_force, args.max_force)
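Both driver constructors above store exploration_rate_start, exploration_rate_end and exploration_decay_steps; a hedged sketch of the linear epsilon decay these fields typically drive follows. The standalone function form and its pairing with total_train_steps are assumptions.

# Hedged sketch of linear epsilon decay, not the driver's actual method.
def linear_exploration_rate(step, rate_start, rate_end, decay_steps):
    if step < decay_steps:
        # interpolate linearly from the start rate down to the end rate
        return rate_start - step * (rate_start - rate_end) / decay_steps
    return rate_end

# e.g. epsilon = linear_exploration_rate(self.total_train_steps,
#                                        self.exploration_rate_start,
#                                        self.exploration_rate_end,
#                                        self.exploration_decay_steps)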
    limit = 4000
elif args.buffer_type == 'optimal_final':
    limit = 12000
else:
    limit = np.inf

# Agent
agent = SAC(env.observation_space.shape[0], env.action_space, args)

# TensorboardX
writer = SummaryWriter(logdir='runs/{}_SAC_{}_{}_{}'.format(
    datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"), args.env_name,
    args.policy, "autotune" if args.automatic_entropy_tuning else ""))

# Memory
memory = ReplayMemory(args.replay_size)

# Training Loop
total_numsteps = 0
updates = 0

for i_episode in itertools.count(1):
    episode_reward = 0
    episode_steps = 0
    done = False
    state = env.reset()

    while not done:
        if args.start_steps > total_numsteps:
            action = env.action_space.sample()  # Sample random action
        else:
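For reference, a hedged sketch of the kind of ReplayMemory(args.replay_size) this SAC loop expects: a capacity-bounded list with uniform sampling. The push/sample/__len__ names follow the common SAC reference implementation but are assumptions here.

# Hedged sketch of a uniform-sampling replay buffer; internals are illustrative.
import random
import numpy as np

class UniformReplayMemory(object):
    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer = []
        self.position = 0

    def push(self, state, action, reward, next_state, mask):
        # grow until capacity, then overwrite the oldest entry
        if len(self.buffer) < self.capacity:
            self.buffer.append(None)
        self.buffer[self.position] = (state, action, reward, next_state, mask)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        state, action, reward, next_state, mask = map(np.stack, zip(*batch))
        return state, action, reward, next_state, mask

    def __len__(self):
        return len(self.buffer)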
default="INFO", help="Log level.") args = parser.parse_args() logger = logging.getLogger() logger.setLevel(args.log_level) if args.random_seed: random.seed(args.random_seed) # instantiate classes env = GymEnvironment(args.game, args) logger.info("Using Gym Environment") net = DeepQNetwork(env.numActions(), args) statistics = Statistics(net) mem = ReplayMemory(args.replay_size, args) agent = DqnAgent(env, mem, net, args, statistics=statistics) if args.load_weights: logger.info("Loading weights from %s" % args.load_weights) net.load_weights(args.load_weights) if args.play_games: logger.info("Playing for {} game(s)".format(args.play_games)) agent.play(args.play_games) sys.exit() for epoch in xrange(args.start_epoch, args.epochs): logger.info("Epoch #{}/{}".format(epoch + 1, args.epochs)) if args.train_steps:
def _initialize(self, game=None, network_args=None, actions=None, name=None, net_type="dqn", # TODO change to the actual class name? reshaped_x=None, reshaped_y=None, skiprate=3, history_length=4, batchsize=64, update_pattern=(1, 1), replay_memory_size=10000, backprop_start_step=10000, start_epsilon=1.0, end_epsilon=0.1, epsilon_decay_start_step=50000, epsilon_decay_steps=100000, reward_scale=1.0, # TODO useless? melt_steps=10000, shaping_on=False, count_time=False, one_hot_time=False, count_time_interval=1, count_time_max=2100, use_game_variables=True, rearrange_misc=False, remember_n_actions=4, one_hot_nactions=False, misc_scale=None, # TODO seems useless results_file=None, params_file=None, config_file=None, no_timeout_terminal=False # TODO seems useless ): if game is not None: self.game = game self.config_file = None elif config_file is not None: self.config_file = config_file self.game = initialize_doom(self.config_file) else: raise Exception("No game, no config file. Dunno how to initialize doom.") if network_args is None: network_args = dict() if count_time: self.count_time = bool(count_time) if self.count_time: self.one_hot_time = one_hot_time self.count_time_max = int(count_time_max) self.count_time_interval = int(count_time_interval) if one_hot_time: self.count_time_len = int(self.count_time_max / self.count_time_interval) else: self.count_time_len = 1 else: self.count_time_len = 0 self.count_time = False self.name = name if reward_scale is not None: self.reward_scale = reward_scale else: self.reward_scale = 1.0 self.rearrange_misc = rearrange_misc self.batchsize = batchsize self.history_length = max(history_length, 1) self.update_pattern = update_pattern self.epsilon = max(min(start_epsilon, 1.0), 0.0) self.end_epsilon = min(max(end_epsilon, 0.0), self.epsilon) self.epsilon_decay_steps = epsilon_decay_steps self.epsilon_decay_stride = (self.epsilon - end_epsilon) / epsilon_decay_steps self.epsilon_decay_start = epsilon_decay_start_step self.skiprate = max(skiprate, 0) self.shaping_on = shaping_on self.steps = 0 self.melt_steps = melt_steps self.backprop_start_step = max(backprop_start_step, batchsize) self.one_hot_nactions = one_hot_nactions self.no_timeout_terminal = no_timeout_terminal if results_file: self.results_file = results_file else: self.results_file = "results/" + name + ".res" if params_file: self.params_file = params_file else: self.params_file = "params/" + name if self.game.get_available_game_variables_size() > 0 and use_game_variables: self.use_game_variables = True else: self.use_game_variables = False self.last_shaping_reward = 0 self.learning_mode = True if actions is None: self.actions = generate_default_actions(self.game) else: self.actions = actions self.actions_num = len(self.actions) self.actions_stats = np.zeros([self.actions_num], np.int) # changes img_shape according to the history size self.channels = self.game.get_screen_channels() if self.history_length > 1: self.channels *= self.history_length if reshaped_x is None: x = self.game.get_screen_width() y = self.game.get_screen_height() scale_x = scale_y = 1.0 else: x = reshaped_x scale_x = float(x) / self.game.get_screen_width() if reshaped_y is None: y = int(self.game.get_screen_height() * scale_x) scale_y = scale_x else: y = reshaped_y scale_y = float(y) / self.game.get_screen_height() img_shape = [self.channels, y, x] # TODO check if it is slow (it seems that no) if scale_x == 1 and scale_y == 1: def convert(img): img = img.astype(np.float32) / 255.0 return img else: def convert(img): img = 
img.astype(np.float32) / 255.0 new_image = np.ndarray([img.shape[0], y, x], dtype=img.dtype) for i in xrange(img.shape[0]): # new_image[i] = skimage.transform.resize(img[i], (y,x), preserve_range=True) new_image[i] = cv2.resize(img[i], (x, y), interpolation=cv2.INTER_AREA) return new_image self.convert_image = convert if self.use_game_variables: single_state_misc_len = int(self.game.get_available_game_variables_size() + self.count_time_len) else: single_state_misc_len = int(self.count_time_len) self.single_state_misc_len = single_state_misc_len self.remember_n_actions = remember_n_actions total_misc_len = int(single_state_misc_len * self.history_length) if remember_n_actions > 0: self.remember_n_actions = remember_n_actions if self.one_hot_nactions: self.action_len = int(2 ** floor(log(len(self.actions), 2))) else: self.action_len = len(self.actions[0]) self.last_action = np.zeros([self.action_len], dtype=np.float32) self.last_n_actions = np.zeros([remember_n_actions * self.action_len], dtype=np.float32) total_misc_len += len(self.last_n_actions) if total_misc_len > 0: self.misc_state_included = True self.current_misc_state = np.zeros(total_misc_len, dtype=np.float32) if single_state_misc_len > 0: if misc_scale is not None: self.misc_scale = np.array(misc_scale, dtype=np.float32) else: self.misc_scale = None else: self.misc_state_included = False state_format = dict() state_format["s_img"] = img_shape state_format["s_misc"] = total_misc_len self.replay_memory = ReplayMemory(state_format, replay_memory_size, batchsize) network_args["state_format"] = state_format network_args["actions_number"] = len(self.actions) if net_type in ("dqn", None, ""): self.approximator = approximators.DQN(**network_args) elif net_type in ["duelling", "dueling"]: self.approximator = approximators.DuelingDQN(**network_args) else: if locate('approximators.' + net_type) is not None: self.approximator = locate('approximators.' + net_type)(**network_args) else: raise Exception("Unsupported approximator type.") self.current_image_state = np.zeros(img_shape, dtype=np.float32)
def train(self, num_run=1):
    in_ts = time.time()
    for i_run in range(num_run):
        self.logger.important(f"START TRAINING RUN {i_run}")

        # Make the environment
        # Set Seed for repeatability
        torch.manual_seed(self.seed + i_run)
        np.random.seed(self.seed + i_run)
        self.env.seed(self.seed + i_run)
        self.env.action_space.np_random.seed(self.seed + i_run)

        # Setup TensorboardX
        writer_train = SummaryWriter(log_dir='runs/' + self.folder + 'run_' + str(i_run) + '/train')
        writer_test = SummaryWriter(log_dir='runs/' + self.folder + 'run_' + str(i_run) + '/test')

        # Setup Replay Memory
        memory = ReplayMemory(self.replay_size)

        # TRAINING LOOP
        total_numsteps = updates = running_episode_reward = running_episode_reward_100 = 0
        rewards = []
        i_episode = 0
        last_episode_steps = 0
        while True:
            self.env.stop_all_motors()
            while self.env.is_human_controlled():
                continue
            if self.env.is_forget_enabled():
                self.restore_model()
                memory.forget_last(last_episode_steps)
                i_episode -= 1
                self.logger.info("Last Episode Forgotten")
            if self.env.is_test_phase():
                self.test_phase(i_run, i_episode, writer_test)
                continue
            if i_episode > self.num_episode:
                break

            self.backup_model()
            self.logger.important(f"START EPISODE {i_episode}")
            ts = time.time()
            episode_reward = episode_steps = 0
            done = False
            info = {'undo': False}
            state = self.env.reset()
            state_buffer = None
            if self.pics:
                state_buffer = StateBuffer(self.state_buffer_size, state)
                state = state_buffer.get_state()
            critic_1_loss_acc = critic_2_loss_acc = policy_loss_acc = ent_loss_acc = alpha_acc = 0

            while not done:
                if self.pics:
                    writer_train.add_image('episode_{}'.format(str(i_episode)),
                                           state_buffer.get_tensor(), episode_steps)
                if len(memory) < self.warm_up_steps:
                    action = self.env.action_space.sample()
                else:
                    action = self.select_action(state)  # Sample action from policy

                if len(memory) > self.batch_size:
                    # Number of updates per step in environment
                    for i in range(self.updates_per_step):
                        # Update parameters of all the networks
                        critic_1_loss, critic_2_loss, policy_loss, ent_loss, alpha = self.update_parameters(
                            memory, self.batch_size, updates)
                        critic_1_loss_acc += critic_1_loss
                        critic_2_loss_acc += critic_2_loss
                        policy_loss_acc += policy_loss
                        ent_loss_acc += ent_loss
                        alpha_acc += alpha
                        updates += 1

                next_state, reward, done, info = self.env.step(action)  # Step
                if self.pics:
                    state_buffer.push(next_state)
                    next_state = state_buffer.get_state()
                episode_steps += 1
                total_numsteps += 1
                episode_reward += reward
                mask = 1 if done else float(not done)
                memory.push(state, action, reward, next_state, mask)  # Append transition to memory
                state = next_state

            last_episode_steps = episode_steps
            i_episode += 1
            rewards.append(episode_reward)
            running_episode_reward += (episode_reward - running_episode_reward) / i_episode
            if len(rewards) < 100:
                running_episode_reward_100 = running_episode_reward
            else:
                last_100 = rewards[-100:]
                running_episode_reward_100 = np.array(last_100).mean()

            writer_train.add_scalar('loss/critic_1', critic_1_loss_acc / episode_steps, i_episode)
            writer_train.add_scalar('loss/critic_2', critic_2_loss_acc / episode_steps, i_episode)
            writer_train.add_scalar('loss/policy', policy_loss_acc / episode_steps, i_episode)
            writer_train.add_scalar('loss/entropy_loss', ent_loss_acc / episode_steps, i_episode)
            writer_train.add_scalar('entropy_temperature/alpha', alpha_acc / episode_steps, i_episode)
            writer_train.add_scalar('reward/train', episode_reward, i_episode)
            writer_train.add_scalar('reward/running_mean', running_episode_reward, i_episode)
            writer_train.add_scalar('reward/running_mean_last_100', running_episode_reward_100, i_episode)
            self.logger.info(
                "Ep. {}/{}, t {}, r_t {}, 100_mean {}, time_spent {}s | {}s ".format(
                    i_episode, self.num_episode, episode_steps, round(episode_reward, 2),
                    round(running_episode_reward_100, 2), round(time.time() - ts, 2),
                    str(datetime.timedelta(seconds=time.time() - in_ts))))
    self.env.close()
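The loop above relies on a StateBuffer when observations are images; a hedged sketch of that role follows. The real class also exposes get_tensor() for TensorBoard; the names and internals here are assumptions.

# Hedged sketch: keep the last `size` frames and expose them as one stacked state.
import numpy as np
from collections import deque

class SimpleStateBuffer(object):
    def __init__(self, size, first_frame):
        # start the buffer filled with copies of the first observation
        self.frames = deque([first_frame] * size, maxlen=size)

    def push(self, frame):
        self.frames.append(frame)

    def get_state(self):
        # oldest-to-newest stack along a new leading axis
        return np.stack(self.frames, axis=0)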
class neonDQN(object): def __init__(self, input_shape, action_space): self._debug = 0 self.mode = 'train' self.input_shape = input_shape self.action_space = action_space self.prev_action = action_space.sample() self.action_space_size = action_space.n self.steps = 0 self.prelearning_steps = 50000 #50000 self.total_steps = 10000 #1000000 self.history_length = input_shape[0] self.history_step = 0 self.observation_buffer = np.zeros(input_shape) # self.prev_state = np.zeros(input_shape[1:]) # learning related self.learning_rate = 0.00025 self.rmsprop_gamma2 = 1 # experience replay related self.memoryIdx = 0 self.memoryFillCount = 0 self.memoryLimit = 50000 #1000000 self.sampleSize = 32 self.states = np.zeros((self.memoryLimit, ) + self.input_shape[1:], dtype='uint8') self.actions = np.zeros((self.memoryLimit, ), dtype='uint8') self.rewards = np.zeros((self.memoryLimit, )) self.nextStates = np.zeros_like(self.states, dtype='uint8') self.dones = np.zeros_like(self.actions, dtype='bool') # target network update related self.targetNetC = 4 #10000 # Q learning related self.gamma = 0.99 #build Q-learning networks print "building network......" self.args = self.generate_parameter() self.net = self.build_network(self.args) self.mem = ReplayMemory(self.memoryLimit, self.args) np.set_printoptions(precision=4, suppress=True) def act(self, observation): observation = self.preprocess_state(observation) self.observation_buffer[:-1, ...] = self.observation_buffer[1:, ...] self.observation_buffer[-1, ...] = observation if self.mode == 'train': epsilon = max( 0.1, 1 - max(self.steps - self.prelearning_steps, 0) / self.total_steps) elif self.mode == 'test': epsilon = .05 else: assert False action = self.choose_action(self.observation_buffer, epsilon) return action def observe(self, state, action, reward, nextState, done): if self.mode == 'test': return state = self.preprocess_state(state) # self.prev_state = state nextState = self.preprocess_state(nextState) # self.prev_state = nextState self.steps += 1 # ========================================================== # plt.figure(2) # plt.subplot(3, 1, 1) # plt.imshow(state) # plt.title("action: " + str(action) + "reward: " + str(reward) # + "done: " + str(done)) # plt.colorbar() # plt.subplot(3, 1, 2) # plt.imshow(nextState) # plt.subplot(3, 1, 3) # plt.imshow(nextState.astype('int16') - state) # plt.colorbar() # plt.show() # ========================================================== self.putInMemory(state, action, reward, nextState, done) # ========================================================== self.mem.add(action, reward, nextState, done) # ========================================================== if self.steps - self.prelearning_steps > 0: # learning starts # state, action, reward, nextState, done = self.sampleFromMemory() # ========================================================== state, action, reward, nextState, done = self.mem.getMinibatch() # ========================================================== self.train(state, action, reward, nextState, done) def preprocess_state(self, state): # state_resize = imresize(state, (84, 84, 3)) # state_resize_gray = np.mean(state_resize, axis=2) # max_state = np.maximum(prev_state, state_resize_gray) # return max_state.astype('uint8') state = cv2.resize(cv2.cvtColor(state, cv2.COLOR_RGB2GRAY), self.input_shape[1:]) return state def putInMemory(self, state, action, reward, nextState, done): memoryIdx = self.memoryIdx self.states[memoryIdx, ...] = state self.actions[memoryIdx, ...] 
= action self.rewards[memoryIdx, ...] = reward self.nextStates[memoryIdx, ...] = nextState self.dones[memoryIdx, ...] = done self.memoryIdx += 1 self.memoryFillCount = max(self.memoryFillCount, self.memoryIdx) assert self.memoryFillCount <= self.memoryLimit self.memoryIdx = self.memoryIdx % self.memoryLimit def sampleFromMemory(self): # sampleIdx = np.random.permutation(self.memoryLimit) # sampleIdx = sampleIdx[:self.sampleSize] # # state = np.zeros((self.sampleSize,) + self.states.shape[1:]) # action = np.zeros((self.sampleSize,) + self.actions.shape[1:], dtype='int') # reward = np.zeros((self.sampleSize,) + self.rewards.shape[1:]) # nextState = np.zeros((self.sampleSize,) + self.nextStates.shape[1:]) # done = np.zeros((self.sampleSize,) + self.dones.shape[1:], dtype='int') # # for i in xrange(self.sampleSize): # state[i] = self.states[sampleIdx[i]] # action[i] = self.actions[sampleIdx[i]] # reward[i] = self.rewards[sampleIdx[i]] # nextState[i] = self.nextStates[sampleIdx[i]] # done[i] = self.dones[sampleIdx[i]] # # return state, action, reward, nextState, done #================================================================================================== state = np.zeros( (self.sampleSize, self.history_length) + self.states.shape[1:], dtype='uint8') nextState = np.zeros( (self.sampleSize, self.history_length) + self.nextStates.shape[1:], dtype='uint8') indexes = [] while len(indexes) < self.sampleSize: # find random index while True: # sample one index (ignore states wraping over index = random.randint(self.history_length - 1, self.memoryFillCount - 1) # if wraps over current pointer, then get new one if index >= self.memoryIdx and index - (self.history_length - 1) < self.memoryIdx: continue # if wraps over episode end, then get new one # NB! poststate (last screen) can be terminal state! if self.dones[(index - self.history_length + 1):index].any(): continue # if (self.rewards[(index - self.history_length + 1):index] != 0).any(): # continue # otherwise use this index break # NB! having index first is fastest in C-order matrices assert index >= self.history_length - 1 assert index <= self.memoryLimit - 1 state[len(indexes), ...] = self.states[(index - (self.history_length - 1)):(index + 1), ...] nextState[len(indexes), ...] = self.nextStates[( index - (self.history_length - 1)):(index + 1), ...] 
indexes.append(index) # copy actions, rewards and terminals with direct slicing action = self.actions[indexes] reward = self.rewards[indexes] done = self.dones[indexes] return state, action, reward, nextState, done def build_network(self, args): net = DeepQNetwork(self.action_space_size, args) return net def choose_action(self, state, epsilon): if np.random.rand() < epsilon: return self.action_space.sample() else: return self.greedy(state) def greedy(self, state): # predict the Q values at current state state = state[np.newaxis, :] #replicate by batch_size state = np.tile(state, (self.sampleSize, 1, 1, 1)) # ====================================================== q = self.net.predict(state) #====================================================== # q = self._network_forward(self.network, state) # ====================================================== q = q[0, :] # return the index of maximum Q value return np.argmax(q) def _network_forward(self, net, state): assert state.shape[0] == self.sampleSize assert state.shape[1] == self.input_shape[0] state = state / 255.0 arg_arrays = net.arg_dict train_iter = mx.io.NDArrayIter(data=state, batch_size=state.shape[0]) data = arg_arrays[train_iter.provide_data[0][0]] q = [] for batch in train_iter: # Copy data to executor input. Note the [:]. data[:] = batch.data[0] self.network.forward(is_train=False) q = self.network.outputs[0] return q.asnumpy() def train(self, state, action, reward, nextState, done): epoch = 0 minibatch = state, action, reward, nextState, done self.net.train(minibatch, epoch) # reward = np.clip(reward, -1, 1) # # # future_Qvalue = self._network_forward(self.targetNetwork, nextState) # future_reward = np.max(future_Qvalue, axis=1) # future_reward = future_reward[:, np.newaxis] # # nonzero_reward_list = np.nonzero(reward) # # reward += (1-done)*self.gamma*future_reward # reward += (1-abs(reward))*self.gamma*future_reward # # target_reward = self._network_forward(self.network, state) # old_target_reward = copy.deepcopy(target_reward) # for i in xrange(self.sampleSize): # # target_reward[i][action[i]] = reward[i] # # clip error to [-1, 1], Mnih 2015 Nature # target_reward[i][action[i]] = max(min(reward[i], target_reward[i][action[i]]+1), target_reward[i][action[i]]-1) # # #======================================================================= # if self._debug: # print "reward:", reward.transpose() # print "future_reward:", future_reward.transpose() # print "action:", action.transpose() # print "done: ", done.transpose() # figure_id = 0 # for batch_i in nonzero_reward_list[0]: # if 1: #reward[batch_i, ...] 
!= 0: # figure_id += 1 # plt.figure(figure_id) # for plot_i in range(0, self.history_length): # plt.subplot(3, self.history_length, plot_i + 1) # plt.imshow(state[batch_i, plot_i, ...]) # plt.title("action: " + str(action[batch_i, ...]) + "reward: " + str(reward[batch_i, ...]) # + "done: " + str(done[batch_i, ...])) # plt.colorbar() # # plt.subplot(3, self.history_length, plot_i + 1 + self.history_length) # plt.imshow(nextState[batch_i, plot_i, ...]) # # plt.subplot(3, self.history_length, plot_i + 1 + self.history_length * 2) # plt.imshow(nextState[batch_i, plot_i, ...].astype('int16') - state[batch_i, plot_i, ...]) # if plot_i == 0: # plt.title("reward: " + str(reward[batch_i, ...]) # + " target reward: " + str(target_reward[batch_i, ...]) # + " old reward: " + str(old_target_reward[batch_i, ...])) # plt.colorbar() # # plt.show() # # raw_input() # #======================================================================= # # train_data = state / 255.0 # train_label = target_reward # # # # First we get handle to input arrays # arg_arrays = self.network.arg_dict # batch_size = self.sampleSize # train_iter = mx.io.NDArrayIter(data=train_data, label=train_label, batch_size=batch_size, shuffle=False) # # val_iter = mx.io.NDArrayIter(data=val_data, label=val_label, batch_size=batch_size) # data = arg_arrays[train_iter.provide_data[0][0]] # label = arg_arrays[train_iter.provide_label[0][0]] # # # opt = mx.optimizer.RMSProp( # # learning_rate= self.learning_rate, # # gamma2 = self.rmsprop_gamma2) # # opt = mx.optimizer.Adam( # learning_rate=self.learning_rate) # # updater = mx.optimizer.get_updater(opt) # # # Finally we need a metric to print out training progress # metric = mx.metric.MSE() # # # Training loop begines # train_iter.reset() # metric.reset() # # for batch in train_iter: # # Copy data to executor input. Note the [:]. # data[:] = batch.data[0] # label[:] = batch.label[0] # # # Forward # self.network.forward(is_train=True) # # # You perform operations on exe.outputs here if you need to. # # For example, you can stack a CRF on top of a neural network. # # # Backward # self.network.backward() # # # Update # for i, pair in enumerate(zip(self.network.arg_arrays, self.network.grad_arrays)): # weight, grad = pair # updater(i, grad, weight) # metric.update(batch.label, self.network.outputs) # # if self.steps % 1000 == 0: # print 'steps:', self.steps, 'metric:', metric.get() # print 'network.outputs:', self.network.outputs[0].asnumpy() # print 'label:', batch.label[0].asnumpy() # # np.set_printoptions(precision=4) # print 'delta: ', (batch.label[0].asnumpy() - self.network.outputs[0].asnumpy()) # # t = 0 # # metric.reset() # # for batch in val_iter: # # # Copy data to executor input. Note the [:]. # # data[:] = batch.data[0] # # label[:] = batch.label[0] # # # # # Forward # # self.network.forward(is_train=False) # # metric.update(batch.label, self.network.outputs) # # t += 1 # # if t % 50 == 0: # # print 'epoch:', epoch, 'test iter:', t, 'metric:', metric.get() # # #======================================================================== # #sync target-network with network as mentioned in Mnih et al. 
Nature 2015 if self.steps % self.targetNetC == 0: self.net.update_target_network() # self.targetNetwork.copy_params_from(self.network.arg_dict, self.network.aux_dict) # Basic Conv + BN + ReLU factory def ConvFactory(self, data, num_filter, kernel, stride=(1, 1), pad=(0, 0), act_type="relu"): # there is an optional parameter ```wrokshpace``` may influece convolution performance # default, the workspace is set to 256(MB) # you may set larger value, but convolution layer only requires its needed but not exactly # MXNet will handle reuse of workspace without parallelism conflict conv = mx.symbol.Convolution(data=data, workspace=256, num_filter=num_filter, kernel=kernel, stride=stride, pad=pad) # bn = mx.symbol.BatchNorm(data=conv) act = mx.symbol.Activation(data=conv, act_type=act_type) return act def generate_parameter(self): def str2bool(v): return v.lower() in ("yes", "true", "t", "1") parser = argparse.ArgumentParser() envarg = parser.add_argument_group('Environment') envarg.add_argument( "--game", default="Catcher-v0", help= "ROM bin file or env id such as Breakout-v0 if training with Open AI Gym." ) envarg.add_argument( "--environment", choices=["ale", "gym"], default="ale", help="Whether to train agent using ALE or OpenAI Gym.") envarg.add_argument( "--display_screen", type=str2bool, default=False, help="Display game screen during training and testing.") # envarg.add_argument("--sound", type=str2bool, default=False, help="Play (or record) sound.") envarg.add_argument( "--frame_skip", type=int, default=4, help="How many times to repeat each chosen action.") envarg.add_argument( "--repeat_action_probability", type=float, default=0, help= "Probability, that chosen action will be repeated. Otherwise random action is chosen during repeating." ) envarg.add_argument("--minimal_action_set", dest="minimal_action_set", type=str2bool, default=True, help="Use minimal action set.") envarg.add_argument( "--color_averaging", type=str2bool, default=True, help="Perform color averaging with previous frame.") envarg.add_argument("--screen_width", type=int, default=64, help="Screen width after resize.") envarg.add_argument("--screen_height", type=int, default=64, help="Screen height after resize.") envarg.add_argument( "--record_screen_path", default="./", help= "Record game screens under this path. Subfolder for each game is created." 
) envarg.add_argument("--record_sound_filename", default="./", help="Record game sound in this file.") memarg = parser.add_argument_group('Replay memory') memarg.add_argument("--replay_size", type=int, default=50000, help="Maximum size of replay memory.") memarg.add_argument("--history_length", type=int, default=4, help="How many screen frames form a state.") netarg = parser.add_argument_group('Deep Q-learning network') netarg.add_argument("--learning_rate", type=float, default=0.00025, help="Learning rate.") netarg.add_argument("--discount_rate", type=float, default=0.99, help="Discount rate for future rewards.") netarg.add_argument("--batch_size", type=int, default=32, help="Batch size for neural network.") netarg.add_argument('--optimizer', choices=['rmsprop', 'adam', 'adadelta'], default='rmsprop', help='Network optimization algorithm.') netarg.add_argument( "--decay_rate", type=float, default=0.95, help="Decay rate for RMSProp and Adadelta algorithms.") netarg.add_argument( "--clip_error", type=float, default=1, help= "Clip error term in update between this number and its negative.") netarg.add_argument("--min_reward", type=float, default=-1, help="Minimum reward.") netarg.add_argument("--max_reward", type=float, default=1, help="Maximum reward.") netarg.add_argument("--batch_norm", type=str2bool, default=False, help="Use batch normalization in all layers.") # netarg.add_argument("--rescale_r", type=str2bool, help="Rescale rewards.") # missing: bufferSize=512,valid_size=500,min_reward=-1,max_reward=1 neonarg = parser.add_argument_group('Neon') neonarg.add_argument('--backend', choices=['cpu', 'gpu'], default='gpu', help='backend type') neonarg.add_argument('--device_id', type=int, default=0, help='gpu device id (only used with GPU backend)') neonarg.add_argument( '--datatype', choices=['float16', 'float32', 'float64'], default='float32', help= 'default floating point precision for backend [f64 for cpu only]') neonarg.add_argument( '--stochastic_round', const=True, type=int, nargs='?', default=False, help= 'use stochastic rounding [will round to BITS number of bits if specified]' ) antarg = parser.add_argument_group('Agent') antarg.add_argument("--exploration_rate_start", type=float, default=1, help="Exploration rate at the beginning of decay.") antarg.add_argument("--exploration_rate_end", type=float, default=0.1, help="Exploration rate at the end of decay.") antarg.add_argument( "--exploration_decay_steps", type=float, default=10000, help="How many steps to decay the exploration rate.") antarg.add_argument("--exploration_rate_test", type=float, default=0.05, help="Exploration rate used during testing.") antarg.add_argument( "--train_frequency", type=int, default=4, help="Perform training after this many game steps.") antarg.add_argument( "--train_repeat", type=int, default=1, help="Number of times to sample minibatch during training.") antarg.add_argument( "--target_steps", type=int, default=4, help= "Copy main network to target network after this many game steps.") antarg.add_argument( "--random_starts", type=int, default=30, help= "Perform max this number of dummy actions after game restart, to produce more random game dynamics." ) nvisarg = parser.add_argument_group('Visualization') nvisarg.add_argument( "--visualization_filters", type=int, default=4, help="Number of filters to visualize from each convolutional layer." 
) nvisarg.add_argument("--visualization_file", default="tmp", help="Write layer visualization to this file.") mainarg = parser.add_argument_group('Main loop') mainarg.add_argument( "--random_steps", type=int, default=50000, help= "Populate replay memory with random steps before starting learning." ) mainarg.add_argument("--train_steps", type=int, default=250000, help="How many training steps per epoch.") mainarg.add_argument("--test_steps", type=int, default=125000, help="How many testing steps after each epoch.") mainarg.add_argument("--epochs", type=int, default=200, help="How many epochs to run.") mainarg.add_argument( "--start_epoch", type=int, default=0, help= "Start from this epoch, affects exploration rate and names of saved snapshots." ) mainarg.add_argument( "--play_games", type=int, default=0, help="How many games to play, suppresses training and testing.") mainarg.add_argument("--load_weights", help="Load network from file.") mainarg.add_argument( "--save_weights_prefix", help= "Save network to given file. Epoch and extension will be appended." ) mainarg.add_argument("--csv_file", help="Write training progress to this file.") comarg = parser.add_argument_group('Common') comarg.add_argument("--random_seed", type=int, help="Random seed for repeatable experiments.") comarg.add_argument( "--log_level", choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], default="INFO", help="Log level.") args = parser.parse_args() return args
class Agent: def __init__(self, dimO, dimA): dimA = list(dimA) dimO = list(dimO) nets = nets_dm # init replay memory self.rm = ReplayMemory(rm_size, dimO, dimA, dtype=np.__dict__[rm_dtype]) # own replay memory self.replay_memory = deque(maxlen=rm_size) # start tf session self.sess = tf.Session(config=tf.ConfigProto( inter_op_parallelism_threads=threads, log_device_placement=False, allow_soft_placement=True)) # create tf computational graph # self.theta_p = nets.theta_p(dimO, dimA) self.theta_q = nets.theta_q(dimO, dimA) self.theta_pt, update_pt = exponential_moving_averages(self.theta_p, tau) self.theta_qt, update_qt = exponential_moving_averages(self.theta_q, tau) obs = tf.placeholder(tf.float32, [None] + dimO, "obs") act_test, sum_p = nets.policy(obs, self.theta_p) # explore noise_init = tf.zeros([1] + dimA) noise_var = tf.Variable(noise_init) self.ou_reset = noise_var.assign(noise_init) noise = noise_var.assign_sub((ou_theta) * noise_var - tf.random_normal(dimA, stddev=ou_sigma)) act_expl = act_test + noise # test q, sum_q = nets.qfunction(obs, act_test, self.theta_q, name= 'q_mu_of_s') # training # policy loss meanq = tf.reduce_mean(q, 0) wd_p = tf.add_n([pl2 * tf.nn.l2_loss(var) for var in self.theta_p]) # weight decay loss_p = -meanq + wd_p # policy optimization optim_p = tf.train.AdamOptimizer(learning_rate=lrp) grads_and_vars_p = optim_p.compute_gradients(loss_p, var_list=self.theta_p) optimize_p = optim_p.apply_gradients(grads_and_vars_p) with tf.control_dependencies([optimize_p]): train_p = tf.group(update_pt) # q optimization act_train = tf.placeholder(tf.float32, [FLAGS.bsize] + dimA, "act_train") rew = tf.placeholder(tf.float32, [FLAGS.bsize], "rew") obs2 = tf.placeholder(tf.float32, [FLAGS.bsize] + dimO, "obs2") term2 = tf.placeholder(tf.bool, [FLAGS.bsize], "term2") # q q_train, sum_qq = nets.qfunction(obs, act_train, self.theta_q, name= 'qs_a') # q targets act2, sum_p2 = nets.policy(obs2, theta=self.theta_pt) q2, sum_q2 = nets.qfunction(obs2, act2, theta=self.theta_qt, name='qsprime_aprime') q_target = tf.stop_gradient(tf.select(term2, rew, rew + discount * q2)) # q_target = tf.stop_gradient(rew + discount * q2) # q loss td_error = q_train - q_target ms_td_error = tf.reduce_mean(tf.square(td_error), 0) wd_q = tf.add_n([ql2 * tf.nn.l2_loss(var) for var in self.theta_q]) # weight decay loss_q = ms_td_error + wd_q # q optimization optim_q = tf.train.AdamOptimizer(learning_rate=lrq) grads_and_vars_q = optim_q.compute_gradients(loss_q, var_list=self.theta_q) optimize_q = optim_q.apply_gradients(grads_and_vars_q) with tf.control_dependencies([optimize_q]): train_q = tf.group(update_qt) # logging log_obs = [] if dimO[0] > 20 else [tf.histogram_summary("obs/" + str(i), obs[:, i]) for i in range(dimO[0])] log_act = [] if dimA[0] > 20 else [tf.histogram_summary("act/inf" + str(i), act_test[:, i]) for i in range(dimA[0])] log_act2 = [] if dimA[0] > 20 else [tf.histogram_summary("act/train" + str(i), act_train[:, i]) for i in range(dimA[0])] log_misc = [sum_p, sum_qq, tf.histogram_summary("td_error", td_error)] log_grad = [grad_histograms(grads_and_vars_p), grad_histograms(grads_and_vars_q)] log_noise = [tf.histogram_summary('noise', noise_var)] log_train = log_obs + log_act + log_act2 + log_misc + log_grad + log_noise merged = tf.merge_all_summaries() # initialize tf log writer self.writer = tf.train.SummaryWriter(FLAGS.outdir + "/tf", self.sess.graph, flush_secs=20) # init replay memory for recording episodes max_ep_length = 10000 self.rm_log = ReplayMemory(max_ep_length, dimO, dimA, 
rm_dtype) # tf functions with self.sess.as_default(): self.act_test = Fun(obs, act_test) self._act_expl = Fun(obs, act_expl) self._reset = Fun([], self.ou_reset) self._train_q = Fun([obs, act_train, rew, obs2, term2], [train_q], log_train, self.writer) self._train_p = Fun([obs], [train_p]) self._train_p = Fun([obs], [train_p], log_obs, self.writer) self._train = Fun([obs, act_train, rew, obs2, term2], [train_p, train_q], merged, self.writer) # initialize tf variables self.saver = tf.train.Saver(max_to_keep=1) ckpt = tf.train.latest_checkpoint(FLAGS.outdir + "/tf") if ckpt: self.saver.restore(self.sess, ckpt) else: self.sess.run(tf.initialize_all_variables()) self.sess.graph.finalize() self.t = 0 # global training time (number of observations) def reset(self, obs): self._reset() self.observation = obs # initial observation def act(self, test=False): obs = np.expand_dims(self.observation, axis=0) action = self.act_test(obs) if test else self._act_expl(obs) self.action = np.atleast_1d(np.squeeze(action, axis=0)) # TODO: remove this hack return self.action def observe(self, rew, term, obs2, test=False, perform_trainstep= True): obs1 = self.observation self.observation = obs2 # train if not test: self.t = self.t + 1 self.rm.enqueue(obs1, term, self.action, rew) self.replay_memory.append((obs1, self.action, rew, obs2, term)) if self.t > FLAGS.warmup: # print('warmed up') if perform_trainstep: self.train() # elif FLAGS.warmq and self.rm.n > 1000: # # Train Q on warmup # obs, act, rew, ob2, term2, info = self.rm.minibatch(size=FLAGS.bsize) # self._train_q(obs, act, rew, ob2, term2, log=(np.random.rand() < FLAGS.log), global_step=self.t) # save parameters etc. # if (self.t+45000) % 50000 == 0: # TODO: correct # s = self.saver.save(self.sess,FLAGS.outdir+"f/tf/c",self.t) # print("DDPG Checkpoint: " + s) def train(self): # obs, act, rew, ob2, term2, info = self.rm.minibatch(size=FLAGS.bsize) obs, act, rew, ob2, term2, = self.get_train_batch() log = (np.random.rand() < FLAGS.log) if FLAGS.async: self._train(obs, act, rew, ob2, term2, log=log, global_step=self.t) else: self._train_q(obs, act, rew, ob2, term2, log=log, global_step=self.t) self._train_p(obs, log=log, global_step=self.t) def write_scalar(self, tag, val): s = tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=val)]) self.writer.add_summary(s, self.t) def __del__(self): self.sess.close() def get_train_batch(self): #selecting transitions randomly from the replay memory: indices = np.random.randint(0, len(self.replay_memory), [FLAGS.bsize]) transition_batch = [self.replay_memory[i] for i in indices] states = np.asarray([transition_batch[i][0].squeeze() for i in range(FLAGS.bsize)]) actions = np.asarray([transition_batch[i][1] for i in range(FLAGS.bsize)]) rewards = np.asarray([transition_batch[i][2] for i in range(FLAGS.bsize)]) states_prime = np.asarray([transition_batch[i][3].squeeze() for i in range(FLAGS.bsize)]) term2 = np.asarray([transition_batch[i][4] for i in range(FLAGS.bsize)]) return states, actions, rewards, states_prime, term2
def __init__(self, cfg, restore=False): sess_config = tf.ConfigProto(allow_soft_placement=True) sess_config.gpu_options.allow_growth = True sess_config.gpu_options.per_process_gpu_memory_fraction = 0.4 self.sess = tf.Session(config=sess_config) self.cfg = cfg assert cfg.gan == 'ls' or cfg.gan == 'w' self.dir = os.path.join('models', cfg.name) self.image_dir = os.path.join(self.dir, 'images-' + cfg.name.replace('/', '-')) self.dump_dir = os.path.join(self.dir, 'dump-' + cfg.name.replace('/', '-')) if not os.path.exists(self.dir): os.makedirs(self.dir) if not os.path.exists(self.dump_dir): os.makedirs(self.dump_dir) if not os.path.exists(self.image_dir): os.makedirs(self.image_dir) if not restore: self.backup_scripts() self.tee = Tee(os.path.join(self.dir, 'log.txt')) self.is_train = tf.placeholder(tf.int32, shape=[], name='is_train') self.is_training = tf.equal(self.is_train, 1) self.memory = ReplayMemory(cfg, load=not restore) self.z = self.memory.z self.real_data = self.memory.real_data self.real_data_feature = self.memory.real_data_feature self.fake_input = self.memory.fake_input self.fake_input_feature = self.memory.fake_input_feature self.states = self.memory.states self.ground_truth = self.memory.ground_truth self.progress = self.memory.progress self.surrogate_loss_addition = 0 with tf.variable_scope('generator'): fake_output, self.generator_debug_output, self.generator_debugger = cfg.generator( [self.fake_input, self.z, self.states], is_train=self.is_train, progress=self.progress, cfg=cfg) self.fake_output, self.new_states, self.surrogate_loss_addition, self.penalty = fake_output self.fake_output_feature = self.fake_input_feature self.memory.fake_output_feature = self.fake_output_feature self.memory.fake_output = self.fake_output print(cfg.critic) self.real_logit, self.real_embeddings, self.test_real_gradients = cfg.critic( images=self.real_data, cfg=cfg, is_train=self.is_training) self.fake_logit, self.fake_embeddings, self.test_fake_gradients = cfg.critic( images=self.fake_output, cfg=cfg, reuse=True, is_train=self.is_training) self.fake_input_logit, self.fake_input_embeddings, _ = cfg.critic( images=self.fake_input, cfg=cfg, reuse=True, is_train=self.is_training) print('real_logit', self.real_logit.shape) with tf.variable_scope('rl_value'): print('self.states', self.states.shape) print('self.new_states', self.new_states.shape) self.old_value, _, _ = cfg.value(images=self.fake_input, states=self.states, cfg=cfg, reuse=False, is_train=self.is_training) self.new_value, _, _ = cfg.value(images=self.fake_output, states=self.new_states, cfg=cfg, reuse=True, is_train=self.is_training) stopped = self.new_states[:, STATE_STOPPED_DIM:STATE_STOPPED_DIM + 1] clear_final = tf.cast( self.new_states[:, STATE_STEP_DIM:STATE_STEP_DIM + 1] > self.cfg.maximum_trajectory_length, tf.float32) print('clear final', clear_final.shape) print('new_value', self.new_value.shape) self.new_value = self.new_value * (1.0 - clear_final) # Reward: the bigger, the better if cfg.supervised: self.raw_reward = (cfg.all_reward + (1 - cfg.all_reward) * stopped) * ( -self.fake_logit) else: if cfg.gan == 'ls': self.raw_reward = (cfg.all_reward + (1 - cfg.all_reward) * stopped) * ( 1 - (self.fake_logit - 1)**2) else: self.raw_reward = (cfg.all_reward + (1 - cfg.all_reward) * stopped) * ( self.fake_logit - tf.stop_gradient(self.fake_input_logit) ) * cfg.critic_logit_multiplier self.reward = self.raw_reward if cfg.use_penalty: self.reward -= self.penalty print('new_states_slice', self.new_states) print('new_states_slice', 
self.new_states[:, STATE_REWARD_DIM:STATE_REWARD_DIM + 1]) print('fake_logit', self.fake_logit.shape) self.exp_moving_average = tf.train.ExponentialMovingAverage( decay=0.99, zero_debias=True) # TD learning print('reward', self.reward.shape) # If it stops, future return should be zero self.q_value = self.reward + ( 1.0 - stopped) * cfg.discount_factor * self.new_value print('q', self.q_value.shape) self.advantage = tf.stop_gradient(self.q_value) - self.old_value self.v_loss = tf.reduce_mean(self.advantage**2, axis=(0, 1)) if cfg.gan == 'ls': print('** LSGAN') self.c_loss = tf.reduce_mean(self.fake_logit**2) + tf.reduce_mean( (self.real_logit - 1)**2) if cfg.use_TD: routine_loss = -self.q_value * self.cfg.parameter_lr_mul advantage = -self.advantage else: routine_loss = -self.reward advantage = -self.reward print('routine_loss', routine_loss.shape) print('pg_loss', self.surrogate_loss_addition.shape) assert len(routine_loss.shape) == len( self.surrogate_loss_addition.shape) self.g_loss = tf.reduce_mean(routine_loss + self.surrogate_loss_addition * tf.stop_gradient(advantage)) self.emd = self.c_loss self.c_average = tf.constant(0, dtype=tf.float32) else: print('** WGAN') self.c_loss = tf.reduce_mean(self.fake_logit - self.real_logit) if cfg.use_TD: routine_loss = -self.q_value * self.cfg.parameter_lr_mul advantage = -self.advantage else: routine_loss = -self.reward advantage = -self.reward print('routine_loss', routine_loss.shape) print('pg_loss', self.surrogate_loss_addition.shape) assert len(routine_loss.shape) == len( self.surrogate_loss_addition.shape) self.g_loss = tf.reduce_mean(routine_loss + self.surrogate_loss_addition * tf.stop_gradient(advantage)) self.emd = -self.c_loss self.c_average = tf.reduce_mean(self.fake_logit + self.real_logit) * 0.5 update_average = self.exp_moving_average.apply([self.c_average]) self.c_average_smoothed = self.exp_moving_average.average( self.c_average) self.centered_fake_logit = self.fake_logit - self.c_average_smoothed self.fake_gradients = tf.gradients(self.fake_logit, [ self.fake_output, ])[0] # Critic gradient norm and penalty alpha_dist = tf.contrib.distributions.Uniform(low=0., high=1.) 
alpha = alpha_dist.sample((cfg.batch_size, 1, 1, 1)) interpolated = self.real_data + alpha * (self.fake_output - self.real_data) inte_logit, inte_embeddings, _ = cfg.critic(images=interpolated, cfg=cfg, reuse=True, is_train=self.is_training) gradients = tf.gradients(inte_logit, [ interpolated, ])[0] gradient_norm = tf.sqrt(1e-6 + tf.reduce_sum(gradients**2, axis=[1, 2, 3])) gradient_penalty = cfg.gradient_penalty_lambda * tf.reduce_mean( tf.maximum(gradient_norm - 1.0, 0.0)**2) _ = tf.summary.scalar("grad_penalty_loss", gradient_penalty) self.critic_gradient_norm = tf.reduce_mean(gradient_norm) _ = tf.summary.scalar("grad_norm", self.critic_gradient_norm) if cfg.gan == 'w': if cfg.gradient_penalty_lambda > 0: print('** Using gradient penalty') self.c_loss += gradient_penalty else: gradient_norm = tf.sqrt( tf.reduce_sum(self.fake_gradients**2, axis=[1, 2, 3])) self.critic_gradient_norm = tf.reduce_mean(gradient_norm) print('** NOT using gradient penalty') _ = tf.summary.scalar("g_loss", self.g_loss) _ = tf.summary.scalar("neg_c_loss", -self.c_loss) _ = tf.summary.scalar("EMD", self.emd) self.theta_g = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='generator') self.theta_c = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='critic') self.theta_v = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='rl_value') print('# variables') print(' generator:', len(self.theta_g)) print(' value:', len(self.theta_v)) print(' critic:', len(self.theta_c)) self.lr_g = tf.placeholder(dtype=tf.float32, shape=[], name='lr_g') self.lr_c = tf.placeholder(dtype=tf.float32, shape=[], name='lr_c') # Optimizer for Value estimator, use the same lr as g self.counter_v = tf.Variable(trainable=False, initial_value=0, dtype=tf.int32) self.opt_v = ly.optimize_loss(loss=self.v_loss, learning_rate=self.cfg.value_lr_mul * self.lr_g, optimizer=cfg.generator_optimizer, variables=self.theta_v, global_step=self.counter_v, summaries=['gradient_norm']) # Optimize for Generator (Actor) self.counter_g = tf.Variable(trainable=False, initial_value=0, dtype=tf.int32) self.opt_g = ly.optimize_loss(loss=self.g_loss, learning_rate=self.lr_g, optimizer=cfg.generator_optimizer, variables=self.theta_g, global_step=self.counter_g, summaries=['gradient_norm']) # Optimize for Discriminator (critic in WGAN or discriminator in LSGAN) self.counter_c = tf.Variable(trainable=False, initial_value=0, dtype=tf.int32) if not self.cfg.supervised: self.opt_c = ly.optimize_loss(loss=self.c_loss, learning_rate=self.lr_c, optimizer=cfg.critic_optimizer, variables=self.theta_c, global_step=self.counter_c, summaries=['gradient_norm']) if cfg.gan == 'w' and cfg.gradient_penalty_lambda <= 0: print( '** make sure your NN input has mean 0, as biases will also be clamped.' ) # Merge the clip operations on critic variables # For WGAN clipped_var_c = [ tf.assign( var, tf.clip_by_value(var, -self.cfg.clamp_critic, self.cfg.clamp_critic)) for var in self.theta_c ] with tf.control_dependencies([self.opt_c]): self.opt_c = tf.tuple(clipped_var_c) with tf.control_dependencies([self.opt_c]): self.opt_c = tf.group(update_average) self.saver = tf.train.Saver( max_to_keep=1) # save all checkpoints max_to_keep=None self.sess.run(tf.global_variables_initializer()) self.merged_all = tf.summary.merge_all() self.summary_writer = tf.summary.FileWriter(self.dir, self.sess.graph) if not restore: self.fixed_feed_dict_random = self.memory.get_feed_dict( self.cfg.num_samples) self.high_res_nets = {}
MAX_YAW = 2 * np.pi
MAX_X = 20
MAX_Y = 20
max_lidar_value = 14
THRESHOLD_DISTANCE_2_GOAL = 0.2 / max(MAX_X, MAX_Y)
UPDATE_EVERY = 5

count = 0
total_numsteps = 0
updates = 0
num_goal_reached = 0
done = False
i_episode = 1
episode_reward = 0
max_ep_reward = 0
episode_steps = 0

memory = ReplayMemory(args.replay_size, args.seed)


class DeepracerGym(gym.Env):
    def __init__(self, target_point):
        super(DeepracerGym, self).__init__()
        n_actions = 2  # velocity, steering
        metadata = {'render.modes': ['console']}
        # self.action_space = spaces.Discrete(n_actions)
        self.action_space = spaces.Box(np.array([0., -1.]), np.array([1., 1.]),
                                       dtype=np.float32)  # speed and steering
        # self.pose_observation_space = spaces.Box(np.array([-1., -1., -1.]), np.array([1., 1., 1.]), dtype=np.float32)
        # self.lidar_observation_space = spaces.Box(0, 1., shape=(720,), dtype=np.float32)
        # self.observation_space = spaces.Tuple((self.pose_observation_space, self.lidar_observation_space))
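A hedged usage sketch of how the module-level `memory` and counters above are typically filled and consumed in a SAC-style step loop with this environment; the agent methods (select_action, update_parameters), the push signature, and the helper name are assumptions, not part of this file.

# Hedged sketch only; run_episode and the agent interface are hypothetical.
def run_episode(env, agent, batch_size, start_steps):
    global total_numsteps, updates, episode_reward, episode_steps, done
    state = env.reset()
    done = False
    while not done:
        if total_numsteps < start_steps:
            action = env.action_space.sample()      # warm-up: random actions
        else:
            action = agent.select_action(state)     # policy actions afterwards
        next_state, reward, done, _ = env.step(action)
        memory.push(state, action, reward, next_state, float(not done))
        if len(memory) > batch_size and total_numsteps % UPDATE_EVERY == 0:
            agent.update_parameters(memory, batch_size, updates)
            updates += 1
        state = next_state
        total_numsteps += 1
        episode_steps += 1
        episode_reward += reward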
) # Merge the clip operations on critic variables # For WGAN clipped_var_c = [ tf.assign( var, tf.clip_by_value(var, -self.cfg.clamp_critic, self.cfg.clamp_critic)) for var in self.theta_c ] with tf.control_dependencies([self.opt_c]): self.opt_c = tf.tuple(clipped_var_c) with tf.control_dependencies([self.opt_c]): self.opt_c = tf.group(update_average) self.saver = tf.train.Saver( max_to_keep=1) # save all checkpoints max_to_keep=None self.sess.run(tf.global_variables_initializer()) self.merged_all = tf.summary.merge_all() self.summary_writer = tf.summary.FileWriter(self.dir, self.sess.graph) if not restore: self.fixed_feed_dict_random = self.memory.get_feed_dict( self.cfg.num_samples) self.high_res_nets = {} def get_training_feed_dict_and_states(self, iter): feed_dict, features = self.memory.get_feed_dict_and_states( self.cfg.batch_size) feed_dict[self.lr_g] = self.cfg.lr_g(iter) feed_dict[self.lr_c] = self.cfg.lr_c(iter) feed_dict[self.is_train] = 1 return feed_dict, features def get_replay_feed_dict(self, iter): feed_dict = self.memory.get_replay_feed_dict(self.cfg.batch_size) feed_dict[self.lr_c] = self.cfg.lr_c(iter) feed_dict[self.is_train] = 1 return feed_dict def train(self): start_t = time.time() g_loss_pool = [] v_loss_pool = [] emd_pool = [] # critic gradient (critic logit w.r.t. critic input image) norm cgn = 0 for iter in range(self.cfg.max_iter_step + 1): progress = float(iter) / self.cfg.max_iter_step iter_start_time = time.time() run_options = tf.RunOptions() run_metadata = tf.RunMetadata() if self.cfg.gan == 'w' and (iter < self.cfg.critic_initialization or iter % 500 == 0): citers = 100 else: citers = self.cfg.citers if iter == 0: # Make sure there are terminating states giters = 100 else: giters = self.cfg.giters # Update generator actor/critic for j in range(giters): feed_dict, features = self.get_training_feed_dict_and_states( iter) if iter == 0: feed_dict[self.lr_g] = 0 feed_dict[self.progress] = progress _, g_loss, v_loss, fake_output, new_states = self.sess.run( [(self.opt_g, self.opt_v), self.g_loss, self.v_loss, self.fake_output, self.new_states], feed_dict=feed_dict, options=run_options, run_metadata=run_metadata) if self.cfg.supervised: ground_truth = feed_dict[self.ground_truth] else: ground_truth = None self.memory.replace_memory( self.memory.images_and_states_to_records( fake_output, new_states, features, ground_truth=ground_truth)) v_loss_pool.append(v_loss) g_loss_pool.append(g_loss) if iter % self.cfg.summary_freq == 0 and j == 0: merged = self.sess.run(self.merged_all, feed_dict=feed_dict, options=run_options, run_metadata=run_metadata) self.summary_writer.add_summary(merged, iter) self.summary_writer.add_run_metadata( run_metadata, 'critic_metadata {}'.format(iter), iter) merged = [] # Update GAN discriminator ('critic' for WGAN) for j in range(citers): feed_dict = self.get_replay_feed_dict(iter) if not self.cfg.supervised: # update discriminator only if it is unsupervised _, emd, cgn = self.sess.run( [self.opt_c, self.emd, self.critic_gradient_norm], feed_dict=feed_dict) emd_pool.append(emd) if merged: self.summary_writer.add_summary(merged, iter) self.summary_writer.add_run_metadata( run_metadata, 'generator_metadata {}'.format(iter), iter) # Visualizations if self.cfg.realtime_vis or iter % self.cfg.write_image_interval == 0: self.visualize(iter) v_loss_pool = v_loss_pool[-self.cfg.median_filter_size:] g_loss_pool = g_loss_pool[-self.cfg.median_filter_size:] emd_pool = emd_pool[-self.cfg.median_filter_size:] if (iter + 1) % 500 == 0: 
self.saver.save(self.sess, os.path.join(self.dir, "model.ckpt"), global_step=(iter + 1)) if iter % 100 == 0: eta = (time.time() - start_t) / (iter + 1) / 3600 * ( self.cfg.max_iter_step - iter) tot_time = (time.time() - start_t) / (iter + 1) / 3600 * ( self.cfg.max_iter_step) if iter < 500: eta = tot_time = 0 print('#--------------------------------------------') print('# Task: %s ela. %.2f min ETA: %.1f/%.1f h' % (self.cfg.name, (time.time() - start_t) / 60.0, eta, tot_time)) self.memory.debug() if iter % 10 == 0: print( 'it%6d,%5.0f ms/it, g_loss=%.2f, v_loss=%.2f, EMD=%.3f, cgn=%.2f' % (iter, 1000 * (time.time() - iter_start_time), np.median(g_loss_pool), np.median(v_loss_pool), np.median(emd_pool), cgn)) def restore(self, ckpt): self.saver.restore(self.sess, os.path.join(self.dir, "model.ckpt-%s" % ckpt)) def gradient_processor(self, grads): if self.cfg.gan == 'ls': # We show negative grad. (since we are minimizing) real_grads = [] for g in grads: if (abs(np.mean(g) - 1)) > 0.001: real_grads.append(g) return -grads / np.std(real_grads) * 0.2 + 0.5 else: return 10 * grads + 0.5 def visualize(self, iter): progress = float(iter) / self.cfg.max_iter_step lower_regions = [] pool_images, pool_states, pool_features = self.memory.records_to_images_states_features( self.memory.image_pool[:self.cfg.num_samples]) if self.cfg.supervised: gt0 = [x[1] for x in pool_images] pool_images = [x[0] for x in pool_images] else: gt0 = None lower_regions.append(pool_images) # Generated data feed_dict = merge_dict(self.fixed_feed_dict_random, { self.is_train: self.cfg.test_random_walk, self.progress: progress }) eval_images = [] eval_states = [] gt1 = self.fixed_feed_dict_random[self.ground_truth] for i in range(self.cfg.test_steps): output_images, output_states = self.sess.run( [self.fake_output, self.new_states], feed_dict=feed_dict) feed_dict[self.fake_input] = output_images feed_dict[self.states] = output_states eval_images.append(output_images) eval_states.append(output_states) best_outputs = [] best_indices = [] for i in range(self.cfg.num_samples): best_index = self.cfg.test_steps - 1 for j in range(self.cfg.test_steps): if eval_states[j][i][STATE_REWARD_DIM] > 0: best_index = j break best_image = eval_images[best_index][i] best_indices.append(best_index + 1) best_outputs.append(best_image) lower_regions.append(best_outputs) # Real data lower_regions.append(self.fixed_feed_dict_random[self.real_data]) if self.cfg.vis_draw_critic_scores: lower_regions[0] = self.draw_critic_scores(lower_regions[0], ground_truth=gt0) lower_regions[1] = self.draw_critic_scores(lower_regions[1], ground_truth=gt1) if not self.cfg.supervised: lower_regions[2] = self.draw_critic_scores(lower_regions[2]) for img, state in zip(lower_regions[0], pool_states): cv2.putText(img, str(state), (4, 33), cv2.FONT_HERSHEY_SIMPLEX, 0.25, (1.0, 0.0, 0.0)) for img, ind in zip(lower_regions[1], best_indices): cv2.putText(img, str(ind), (23, 23), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (1.0, 0.0, 0.0)) lower_regions = list(map(make_image_grid, lower_regions)) seperator = np.ones( (lower_regions[0].shape[0], 16, lower_regions[0].shape[2]), dtype=np.float32) lower_region = np.hstack([ lower_regions[0], seperator, lower_regions[1], seperator, lower_regions[2] ]) upper_region = np.ones_like(lower_region) per_row = lower_region.shape[1] // (self.generator_debugger.width + 4) # The upper part h, w = self.cfg.source_img_size, self.cfg.source_img_size images = [] debug_plots = [] gradients = [] rows = lower_region.shape[0] // (h + 2) // 3 groups_per_row = 
per_row // (self.cfg.test_steps + 1) per_row = (self.cfg.test_steps + 1) * groups_per_row gts = [] for j in range(min(self.cfg.num_samples, rows * groups_per_row)): if self.cfg.supervised: img_gt = self.memory.get_next_RAW( 1, test=self.cfg.vis_step_test)[0][0] img, gt = img_gt[0], img_gt[1] else: img = self.memory.get_next_RAW(1)[0][0] gt = None # z is useless at test time... images_, debug_plots_, gradients_ = self.draw_steps( img, ground_truth=gt, is_train=self.cfg.test_random_walk, progress=progress) images += images_ if self.cfg.supervised: gts += [gt] * len(images_) gradients_ = [gt] * len(images_) debug_plots += debug_plots_ gradients += gradients_ if not self.cfg.supervised: gradients = self.gradient_processor(np.stack(gradients, axis=0)) pad = 0 for i in range(rows): for j in range(per_row): start_x, start_y = pad + 3 * i * (h + 2), pad + j * (w + 4) index = i * per_row + j if index < len(images): upper_region[start_x:start_x + h, start_y:start_y + w] = images[index] upper_region[start_x + h + 1:start_x + h * 2 + 1, start_y:start_y + w] = gradients[index] upper_region[start_x + 2 * (h + 1):start_x + h * 3 + 2, start_y:start_y + w] = debug_plots[index] seperator = np.ones((16, upper_region.shape[1], upper_region.shape[2]), dtype=np.float32) upper_region = np.vstack([seperator, upper_region, seperator]) img = np.vstack([upper_region, lower_region]) if self.cfg.realtime_vis: cv2.imshow('vis', img[:, :, ::-1]) cv2.waitKey(20) if iter % self.cfg.write_image_interval == 0: fn = os.path.join(self.image_dir, '%06d.png' % iter) cv2.imwrite(fn, img[:, :, ::-1] * 255.0) def draw_value_reward_score(self, img, value, reward, score): img = img.copy() # Average with 0.5 for semi-transparent background img[:14] = img[:14] * 0.5 + 0.25 img[50:] = img[50:] * 0.5 + 0.25 if self.cfg.gan == 'ls': red = -np.tanh(float(score) / 1) * 0.5 + 0.5 else: red = -np.tanh(float(score) / 10.0) * 0.5 + 0.5 top = '%+.2f %+.2f' % (value, reward) cv2.putText(img, top, (3, 7), cv2.FONT_HERSHEY_SIMPLEX, 0.25, (1.0, 1.0 - red, 1.0 - red)) score = '%+.3f' % score cv2.putText(img, score, (10, 60), cv2.FONT_HERSHEY_SIMPLEX, 0.35, (1.0, 1.0 - red, 1.0 - red)) return img def draw_steps(self, img, progress, ground_truth=None, is_train=0): images = [] debug_plots = [] gradients = [] states = self.memory.get_initial_states(self.cfg.batch_size) tmp_fake_output = [img] * self.cfg.batch_size tmp_fake_output = np.stack(tmp_fake_output, axis=0) initial_value, initial_score = self.sess.run( [self.new_value[0], self.centered_fake_logit[0]], feed_dict={ self.fake_output: tmp_fake_output, self.new_states: states, self.progress: progress }) images.append( self.draw_value_reward_score(img, initial_value, 0, initial_score)) debug_plots.append(img * 0 + 1) # z is useless at test time... 
gradients.append(img * 0 + 1) for k in range(self.cfg.test_steps): feed_dict = { self.fake_input: [img] * self.cfg.batch_size, self.real_data: [img] * self.cfg.batch_size, self.z: self.memory.get_noise(self.cfg.batch_size), self.is_train: is_train, self.states: states, self.progress: progress } if self.cfg.supervised: feed_dict[self.ground_truth] = [ground_truth] feed_dict[self.progress] = progress debug_info, img, grad, new_state, new_value, score, reward = self.sess.run( [ self.generator_debug_output, self.fake_output[0], self.fake_gradients[0], self.new_states, self.new_value[0], self.centered_fake_logit[0], self.reward[0] ], feed_dict=feed_dict) debug_plot = self.generator_debugger(debug_info) images.append( self.draw_value_reward_score(img, new_value, reward, score)) gradients.append(grad) debug_plots.append(debug_plot) states = new_state if states[0, STATE_STOPPED_DIM] > 0: break for k in range(len(images), self.cfg.test_steps + 1): images.append(img * 0 + 1) gradients.append(img * 0 + 1) debug_plots.append(img * 0 + 1) return images, debug_plots, gradients def draw_critic_scores(self, images, ground_truth=None): # We do not care about states here, so that value drawn may not make sense. images = list(images) original_len = len(images) if len(images) < self.cfg.batch_size: images += [images[0]] * (self.cfg.batch_size - len(images)) states = self.memory.get_initial_states(self.cfg.batch_size) # indexs = self.memory.get_random_indexs(self.cfg,batch_size) images = np.stack(images, axis=0) if self.cfg.supervised: # TODO feed_dict = { self.real_data: images, self.fake_input: images, self.ground_truth: ground_truth, self.new_states: states, self.states: states, self.is_train: 0 } else: feed_dict = { self.fake_output: images, self.real_data: images, } if self.cfg.gan == 'ls': logit = self.fake_logit else: logit = self.centered_fake_logit scores = self.sess.run(logit, feed_dict=feed_dict) if self.cfg.supervised: scores = np.sqrt(scores) * 100.0 ret = [] for i in range(len(images)): img, score = images[i].copy(), scores[i] # Average with 0.5 for semi-transparent background img[50:] = img[50:] * 0.5 + 0.25 if self.cfg.gan == 'ls': red = -np.tanh(float(score) / 1) * 0.5 + 0.5 else: red = -np.tanh(float(score) / 10.0) * 0.5 + 0.5 score = '%+.3f' % score cv2.putText(img, score, (10, 60), cv2.FONT_HERSHEY_SIMPLEX, 0.35, (1.0, 1.0 - red, 1.0 - red)) ret.append(img) return ret[:original_len] def backup_scripts(self): script_dir = os.path.join(self.dir, 'scripts') try: os.mkdir(script_dir) except Exception as e: pass for fn in os.listdir('.'): if fn.endswith('.py'): shutil.copy(fn, script_dir) print('Scripts are backed up. 
Initializing network...') def get_high_resolution_net(self, res): if res not in self.high_res_nets: print('Creating high_res_network for ', res) net = Dict() net.high_res_input = tf.placeholder( tf.float32, shape=(None, res[0], res[1], self.cfg.real_img_channels), name='highres_in') net.fake_input = self.fake_input net.fake_input_feature = self.fake_input_feature net.real_data = self.real_data net.z = self.z net.is_train = self.is_train net.states = self.states with tf.variable_scope('generator', reuse=True): fake_output, net.generator_debug_output, net.generator_debugger = self.cfg.generator( [net.fake_input, net.z, net.states], is_train=net.is_train, cfg=self.cfg, high_res=net.high_res_input, progress=0) net.fake_output, net.new_states, net.high_res_output = fake_output net.fake_logit, net.fake_embeddings, _ = self.cfg.critic( images=net.fake_output, cfg=self.cfg, reuse=True, is_train=False) self.high_res_nets[res] = net return self.high_res_nets[res] def eval(self, spec_files=None, output_dir='./outputs', step_by_step=False, show_linear=True, show_input=True): from util import get_image_center if output_dir is not None: try: os.mkdir(output_dir) except: pass print(spec_files) # Use a fixed noise batch_size = 1 for fn in spec_files: print('Processing input {}'.format(fn)) from util import read_tiff16, linearize_ProPhotoRGB if fn.endswith('.tif') or fn.endswith('.tiff'): image = read_tiff16(fn) high_res_image = linearize_ProPhotoRGB(image) else: # TODO: deal with png and jpeg files better - they are probably not RAW. print( 'Warning: sRGB color space jpg and png images may not work perfectly. See README for details. (image {})' .format(fn)) image = cv2.imread(fn)[:, :, ::-1] if image.dtype == np.uint8: image = image / 255.0 if image.dtype == np.uint16: image = image / 65535.0 else: print( 'image data type {} is not supported. Please email Yuanming Hu.' .format(image.dtype)) high_res_image = np.power(image, 2.2) # Linearize sRGB high_res_image /= 2 * high_res_image.max( ) # Mimic RAW exposure # Uncomment to bypass preprocessing # high_res_image = image noises = [ self.memory.get_noise(batch_size) for _ in range(self.cfg.test_steps) ] fn = fn.split('/')[-1] def get_dir(): if output_dir is not None: d = output_dir else: d = self.dump_dir return d try: os.mkdir(get_dir()) except: pass def show_and_save(x, img): img = img[:, :, ::-1] #cv2.imshow(x, img) cv2.imwrite(os.path.join(get_dir(), fn + '.' 
+ x + '.png'), img * 255.0) #if os.path.exists(os.path.join(get_dir(), fn + '.retouched.png')): # print('Skipping', fn) # continue high_res_input = high_res_image low_res_img = cv2.resize(get_image_center(high_res_image), dsize=(64, 64)) res = high_res_input.shape[:2] net = self.get_high_resolution_net(res) low_res_img_trajs = [low_res_img] low_res_images = [low_res_img] states = self.memory.get_initial_states(batch_size) high_res_output = high_res_input masks = [] decisions = [] operations = [] debug_info_list = [] tmp_fake_input = low_res_images * batch_size tmp_fake_input = np.array(tmp_fake_input) print(tmp_fake_input.shape) for i in range(self.cfg.test_steps): feed_dict = { net.fake_input: low_res_images * batch_size, net.z: noises[i], net.is_train: 0, net.states: states, net.high_res_input: [high_res_output] * batch_size } new_low_res_images, new_scores, new_states, new_high_res_output, debug_info = self.sess.run( [ net.fake_output[0], net.fake_logit[0], net.new_states[0], net.high_res_output[0], net.generator_debug_output ], feed_dict=feed_dict) low_res_img_trajs.append(new_low_res_images) low_res_images = [new_low_res_images] # print('new_states', new_states.shape) states = [new_states] * batch_size debug_info_list.append(debug_info) debug_plots = self.generator_debugger(debug_info, combined=False) decisions.append(debug_plots[0]) operations.append(debug_plots[1]) masks.append(debug_plots[2]) high_res_output = new_high_res_output if states[0][STATE_STOPPED_DIM] > 0: break if step_by_step: show_and_save('intermediate%02d' % i, high_res_output) linear_high_res = high_res_input # Max to white, and then gamma correction high_res_input = (high_res_input / high_res_input.max())**(1 / 2.4) # Save linear if show_linear: show_and_save('linear', linear_high_res) # Save corrected if show_input: show_and_save('input_tone_mapped', high_res_input) # Save retouched show_and_save('retouched', high_res_output) # Steps & debugging information with open(os.path.join(get_dir(), fn + '_debug.pkl'), 'wb') as f: pickle.dump(debug_info_list, f) padding = 4 patch = 64 grid = patch + padding steps = len(low_res_img_trajs) fused = np.ones(shape=(grid * 4, grid * steps, 3), dtype=np.float32) for i in range(len(low_res_img_trajs)): sx = grid * i sy = 0 fused[sy:sy + patch, sx:sx + patch] = cv2.resize( low_res_img_trajs[i], dsize=(patch, patch), interpolation=cv2.cv2.INTER_NEAREST) for i in range(len(low_res_img_trajs) - 1): sx = grid * i + grid // 2 sy = grid fused[sy:sy + patch, sx:sx + patch] = cv2.resize( decisions[i], dsize=(patch, patch), interpolation=cv2.cv2.INTER_NEAREST) sy = grid * 2 - padding // 2 fused[sy:sy + patch, sx:sx + patch] = cv2.resize( operations[i], dsize=(patch, patch), interpolation=cv2.cv2.INTER_NEAREST) sy = grid * 3 - padding fused[sy:sy + patch, sx:sx + patch] = cv2.resize( masks[i], dsize=(patch, patch), interpolation=cv2.cv2.INTER_NEAREST) # Save steps show_and_save('steps', fused)
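# Hedged sketch (not from the original code): the value-learning targets used by the
# GAN's RL value head above, written with plain NumPy so the shapes are easy to follow.
# It mirrors q_value = reward + (1 - stopped) * discount * new_value,
# advantage = stop_gradient(q_value) - old_value, and the squared-error value loss.
# The function name and the default discount are illustrative assumptions.
import numpy as np

def td_targets(reward, stopped, old_value, new_value, discount=0.95):
    # Future return is cut off once the trajectory has stopped.
    q_value = reward + (1.0 - stopped) * discount * new_value
    advantage = q_value - old_value  # stop_gradient is implicit when working in NumPy
    v_loss = np.mean(advantage ** 2)
    return q_value, advantage, v_loss

q, adv, v_loss = td_targets(reward=np.array([[0.3]]),
                            stopped=np.array([[0.0]]),
                            old_value=np.array([[0.1]]),
                            new_value=np.array([[0.2]]))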
class AtariGame(Game): def __init__(self, rom_path=_default_rom_path, frame_skip=4, history_length=4, resize_mode='scale', resized_rows=84, resized_cols=84, crop_offset=8, display_screen=False, max_null_op=30, replay_memory_size=1000000, replay_start_size=100, death_end_episode=True): super(AtariGame, self).__init__() self.rng = get_numpy_rng() self.ale = ale_load_from_rom(rom_path=rom_path, display_screen=display_screen) self.start_lives = self.ale.lives() self.action_set = self.ale.getMinimalActionSet() self.resize_mode = resize_mode self.resized_rows = resized_rows self.resized_cols = resized_cols self.crop_offset = crop_offset self.frame_skip = frame_skip self.history_length = history_length self.max_null_op = max_null_op self.death_end_episode = death_end_episode self.screen_buffer_length = 2 self.screen_buffer = numpy.empty((self.screen_buffer_length, self.ale.getScreenDims()[1], self.ale.getScreenDims()[0]), dtype='uint8') self.replay_memory = ReplayMemory(state_dim=(resized_rows, resized_cols), history_length=history_length, memory_size=replay_memory_size, replay_start_size=replay_start_size) self.start() def start(self): self.ale.reset_game() null_op_num = self.rng.randint(self.screen_buffer_length, max(self.max_null_op + 1, self.screen_buffer_length + 1)) for i in range(null_op_num): self.ale.act(0) self.ale.getScreenGrayscale(self.screen_buffer[i % self.screen_buffer_length, :, :]) self.total_reward = 0 self.episode_reward = 0 self.episode_step = 0 self.max_episode_step = DEFAULT_MAX_EPISODE_STEP self.start_lives = self.ale.lives() def force_restart(self): self.start() self.replay_memory.clear() def begin_episode(self, max_episode_step=DEFAULT_MAX_EPISODE_STEP): """ Begin an episode of a game instance. We can play the game for a maximum of `max_episode_step` and after that, we are forced to restart """ if self.episode_step > self.max_episode_step or self.ale.game_over(): self.start() else: for i in range(self.screen_buffer_length): self.ale.act(0) self.ale.getScreenGrayscale(self.screen_buffer[i % self.screen_buffer_length, :, :]) self.max_episode_step = max_episode_step self.start_lives = self.ale.lives() self.episode_reward = 0 self.episode_step = 0 @property def episode_terminate(self): termination_flag = self.ale.game_over() or self.episode_step >= self.max_episode_step if self.death_end_episode: return (self.ale.lives() < self.start_lives) or termination_flag else: return termination_flag @property def state_enabled(self): return self.replay_memory.size >= self.replay_memory.history_length def get_observation(self): image = self.screen_buffer.max(axis=0) if 'crop' == self.resize_mode: original_rows, original_cols = image.shape new_resized_rows = int(round( float(original_rows) * self.resized_cols / original_cols)) resized = cv2.resize(image, (self.resized_cols, new_resized_rows), interpolation=cv2.INTER_LINEAR) crop_y_cutoff = new_resized_rows - self.crop_offset - self.resized_rows img = resized[crop_y_cutoff: crop_y_cutoff + self.resized_rows, :] return img else: return cv2.resize(image, (self.resized_cols, self.resized_rows), interpolation=cv2.INTER_LINEAR) def play(self, a): assert not self.episode_terminate,\ "Warning, the episode seems to have terminated. " \ "We need to call either game.begin_episode(max_episode_step) to continue a new " \ "episode or game.start() to force restart." 
self.episode_step += 1 reward = 0.0 action = self.action_set[a] for i in range(self.frame_skip): reward += self.ale.act(action) self.ale.getScreenGrayscale(self.screen_buffer[i % self.screen_buffer_length, :, :]) self.total_reward += reward self.episode_reward += reward ob = self.get_observation() terminate_flag = self.episode_terminate self.replay_memory.append(ob, a, numpy.clip(reward, -1, 1), terminate_flag) return reward, terminate_flag
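# Hedged usage sketch (not part of the original class): a random-policy episode that
# exercises the AtariGame interface above -- begin_episode(), episode_terminate,
# play() and the attached replay_memory. The function name and the episode length
# are placeholders.
def run_random_episode(game, max_episode_step=1000):
    game.begin_episode(max_episode_step=max_episode_step)
    while not game.episode_terminate:
        action_index = game.rng.randint(len(game.action_set))
        reward, terminated = game.play(action_index)
    return game.episode_reward, game.replay_memory.size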
class QEngine: def __init__(self, **kwargs): self.setup = kwargs self._initialize(**kwargs) del kwargs["game"] def _prepare_for_save(self): self.setup["epsilon"] = self._epsilon self.setup["steps"] = self._steps self.setup["skiprate"] = self._skiprate # TODO why the f**k isn't it in init? def _initialize(self, game, network_args=None, actions=None, history_length=4, batchsize=64, update_pattern=(1, 1), replay_memory_size=10000, backprop_start_step=10000, start_epsilon=1.0, end_epsilon=0.1, epsilon_decay_start_step=50000, epsilon_decay_steps=100000, reward_scale=1.0, use_game_variables=True, misc_scale=None, reshaped_x=None, reshaped_y=None, skiprate=4, shaping_on=False, count_states=False, name=None, net_type="cnn", melt_steps=10000, remember_n_actions=0): if network_args is None: network_args = dict() if count_states is not None: self._count_states = bool(count_states) self.name = name self._reward_scale = reward_scale self._game = game self._batchsize = batchsize self._history_length = max(history_length, 1) self._update_pattern = update_pattern self._epsilon = max(min(start_epsilon, 1.0), 0.0) self._end_epsilon = min(max(end_epsilon, 0.0), self._epsilon) self._epsilon_decay_steps = epsilon_decay_steps self._epsilon_decay_stride = (self._epsilon - end_epsilon) / epsilon_decay_steps self._epsilon_decay_start = epsilon_decay_start_step self._skiprate = max(skiprate, 0) self._shaping_on = shaping_on self._steps = 0 self._melt_steps = melt_steps self._backprop_start_step = max(backprop_start_step, batchsize) self._use_game_variables = use_game_variables self._last_action_index = 0 if self._shaping_on: self._last_shaping_reward = 0 self.learning_mode = True if actions is None: self._actions = generate_default_actions(game) else: self._actions = actions self._actions_num = len(self._actions) self._actions_stats = np.zeros([self._actions_num], np.int) # changes img_shape according to the history size self._channels = game.get_screen_channels() if self._history_length > 1: self._channels *= self._history_length if reshaped_x is None: x = game.get_screen_width() y = game.get_screen_height() scale_x = scale_y = 1.0 else: x = reshaped_x scale_x = float(x) / game.get_screen_width() if reshaped_y is None: y = int(game.get_screen_height() * scale_x) scale_y = scale_x else: y = reshaped_y scale_y = float(y) / game.get_screen_height() img_shape = [self._channels, y, x] # TODO check if it is slow (it seems that no) if scale_x == 1 and scale_y == 1: def convert(img): img = img.astype(np.float32) / 255.0 return img else: def convert(img): img = img.astype(np.float32) / 255.0 new_image = np.ndarray([img.shape[0], y, x], dtype=img.dtype) for i in xrange(img.shape[0]): # new_image[i] = skimage.transform.resize(img[i], (y,x), preserve_range=True) new_image[i] = cv2.resize(img[i], (x, y), interpolation=cv2.INTER_AREA) return new_image self._convert_image = convert if self._use_game_variables: single_state_misc_len = game.get_available_game_variables_size() + int(self._count_states) else: single_state_misc_len = int(self._count_states) self._single_state_misc_len = single_state_misc_len self._remember_n_actions = remember_n_actions if remember_n_actions > 0: self._remember_n_actions = remember_n_actions self._action_len = len(self._actions[0]) self._last_n_actions = np.zeros([remember_n_actions * self._action_len], dtype=np.float32) self._total_misc_len = single_state_misc_len * self._history_length + len(self._last_n_actions) self._last_action_index = 0 else: self._total_misc_len = single_state_misc_len * 
self._history_length if self._total_misc_len > 0: self._misc_state_included = True self._current_misc_state = np.zeros(self._total_misc_len, dtype=np.float32) if single_state_misc_len > 0: self._state_misc_buffer = np.zeros(single_state_misc_len, dtype=np.float32) if misc_scale is not None: self._misc_scale = np.array(misc_scale, dtype=np.float32) else: self._misc_scale = None else: self._misc_state_included = False state_format = dict() state_format["s_img"] = img_shape state_format["s_misc"] = self._total_misc_len self._transitions = ReplayMemory(state_format, replay_memory_size, batchsize) network_args["state_format"] = state_format network_args["actions_number"] = len(self._actions) if net_type in ("dqn", None, ""): self._evaluator = DQN(**network_args) elif net_type == "duelling": self._evaluator = DuellingDQN(**network_args) else: print "Unsupported evaluator type." exit(1) # TODO throw. . .? self._current_image_state = np.zeros(img_shape, dtype=np.float32) def _update_state(self): raw_state = self._game.get_state() img = self._convert_image(raw_state.image_buffer) state_misc = None if self._single_state_misc_len > 0: state_misc = self._state_misc_buffer if self._use_game_variables: game_variables = raw_state.game_variables.astype(np.float32) state_misc[0:len(game_variables)] = game_variables if self._count_states: state_misc[-1] = raw_state.number if self._misc_scale is not None: state_misc = state_misc * self._misc_scale if self._history_length > 1: pure_channels = self._channels / self._history_length self._current_image_state[0:-pure_channels] = self._current_image_state[pure_channels:] self._current_image_state[-pure_channels:] = img if self._single_state_misc_len > 0: misc_len = len(state_misc) hist = self._history_length self._current_misc_state[0:(hist - 1) * misc_len] = self._current_misc_state[misc_len:hist * misc_len] self._current_misc_state[(hist - 1) * misc_len:hist * misc_len] = state_misc else: self._current_image_state[:] = img if self._single_state_misc_len > 0: self._current_misc_state[0:len(state_misc)] = state_misc if self._remember_n_actions: self._last_n_actions[:-self._action_len] = self._last_n_actions[self._action_len:] self._last_n_actions[-self._action_len:] = self._actions[self._last_action_index] self._current_misc_state[-len(self._last_n_actions):] = self._last_n_actions def new_episode(self, update_state=False): self._game.new_episode() self.reset_state() self._last_shaping_reward = 0 if update_state: self._update_state() # Return current state including history def _current_state(self): if self._misc_state_included: s = [self._current_image_state, self._current_misc_state] else: s = [self._current_image_state] return s # Return current state's COPY including history. def _current_state_copy(self): if self._misc_state_included: s = [self._current_image_state.copy(), self._current_misc_state.copy()] else: s = [self._current_image_state.copy()] return s # Sets the whole state to zeros. 
def reset_state(self): self._current_image_state.fill(0.0) self._last_action_index = 0 if self._misc_state_included: self._current_misc_state.fill(0.0) if self._remember_n_actions > 0: self._last_n_actions.fill(0) def make_step(self): self._update_state() # TODO Check if not making the copy still works a = self._evaluator.estimate_best_action(self._current_state_copy()) self._actions_stats[a] += 1 self._game.make_action(self._actions[a], self._skiprate + 1) self._last_action_index = a def make_sleep_step(self, sleep_time=1 / 35.0): self._update_state() a = self._evaluator.estimate_best_action(self._current_state_copy()) self._actions_stats[a] += 1 self._game.set_action(self._actions[a]) self._last_action_index = a for i in xrange(self._skiprate): self._game.advance_action(1, False, True) sleep(sleep_time) self._game.advance_action() sleep(sleep_time) # Performs a learning step according to epsilon-greedy policy. # The step spans self._skiprate +1 actions. def make_learning_step(self): self._steps += 1 # epsilon decay if self._steps > self._epsilon_decay_start and self._epsilon > self._end_epsilon: self._epsilon = max(self._epsilon - self._epsilon_decay_stride, 0) # Copy because state will be changed in a second s = self._current_state_copy(); # With probability epsilon choose a random action: if self._epsilon >= random.random(): a = random.randint(0, len(self._actions) - 1) else: a = self._evaluator.estimate_best_action(s) self._actions_stats[a] += 1 # make action and get the reward self._last_action_index = a r = self._game.make_action(self._actions[a], self._skiprate + 1) r = np.float32(r) if self._shaping_on: sr = np.float32(doom_fixed_to_double(self._game.get_game_variable(GameVariable.USER1))) r += sr - self._last_shaping_reward self._last_shaping_reward = sr r *= self._reward_scale # update state s2 accordingly if self._game.is_episode_finished(): # terminal state s2 = None self._transitions.add_transition(s, a, s2, r, terminal=True) else: self._update_state() s2 = self._current_state() self._transitions.add_transition(s, a, s2, r, terminal=False) # Perform q-learning once for a while if self._transitions.size >= self._backprop_start_step and self._steps % self._update_pattern[0] == 0: for a in xrange(self._update_pattern[1]): self._evaluator.learn(self._transitions.get_sample()) # Melt the network sometimes if self._steps % self._melt_steps == 0: self._evaluator.melt() # Adds a transition to the bank. def add_transition(self, s, a, s2, r, terminal): self._transitions.add_transition(s, a, s2, r, terminal) # Runs a single episode in current mode. 
It ignores the mode if learn==true/false def run_episode(self, sleep_time=0): self.new_episode() if sleep_time == 0: while not self._game.is_episode_finished(): self.make_step() else: while not self._game.is_episode_finished(): self.make_sleep_step(sleep_time) return np.float32(self._game.get_total_reward()) # Utility stuff def get_actions_stats(self, clear=False, norm=True): stats = self._actions_stats.copy() if norm: stats = stats / np.float32(self._actions_stats.sum()) stats[stats == 0.0] = -1 stats = np.around(stats, 3) if clear: self._actions_stats.fill(0) return stats def get_steps(self): return self._steps def get_epsilon(self): return self._epsilon def get_network(self): return self._evaluator.network def set_epsilon(self, eps): self._epsilon = eps def set_skiprate(self, skiprate): self._skiprate = max(skiprate, 0) def get_skiprate(self): return self._skiprate # Saves network weights to a file def save_params(self, filename, quiet=False): if not quiet: print "Saving network weights to " + filename + "..." self._prepare_for_save() params = get_all_param_values(self._evaluator.network) pickle.dump(params, open(filename, "wb")) if not quiet: print "Saving finished." # Loads network weights from the file def load_params(self, filename, quiet=False): if not quiet: print "Loading network weights from " + filename + "..." params = pickle.load(open(filename, "rb")) set_all_param_values(self._evaluator.network, params) set_all_param_values(self._evaluator.frozen_network, params) if not quiet: print "Loading finished." # Loads the whole engine with params from file @staticmethod def load(game, filename, quiet=False): if not quiet: print "Loading qengine from " + filename + "..." params = pickle.load(open(filename, "rb")) qengine_args = params[0] network_params = params[1] steps = qengine_args["steps"] epsilon = qengine_args["epsilon"] del (qengine_args["epsilon"]) del (qengine_args["steps"]) qengine_args["game"] = game qengine = QEngine(**qengine_args) set_all_param_values(qengine._evaluator.network, network_params) set_all_param_values(qengine._evaluator.frozen_network, network_params) if not quiet: print "Loading finished." qengine._steps = steps qengine._epsilon = epsilon return qengine # Saves the whole engine with params to a file def save(self, filename, quiet=False): if not quiet: print "Saving qengine to " + filename + "..." self._prepare_for_save() network_params = get_all_param_values(self._evaluator.network) params = [self.setup, network_params] pickle.dump(params, open(filename, "wb")) if not quiet: print "Saving finished."
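# Hedged usage sketch (not in the original source): the learning loop implied by the
# QEngine methods above -- new_episode() followed by make_learning_step() until the
# underlying game reports the episode as finished. The function name and the episode
# count are placeholders; game and engine are assumed to be constructed elsewhere.
def train_qengine(game, engine, episodes=100):
    rewards = []
    for _ in range(episodes):
        engine.new_episode(update_state=True)
        while not game.is_episode_finished():
            engine.make_learning_step()
        rewards.append(game.get_total_reward())
    return rewards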
# Environment
env = gym.make(config['env_name'])
torch.manual_seed(config['seed'])
np.random.seed(config['seed'])
random.seed(config['seed'])
env.seed(config['seed'])
env.action_space.np_random.seed(config['seed'])
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Agent
agent = SAC(env.observation_space.shape[0], env.action_space, config)

# Memory
memory = ReplayMemory(config['replay_size'])

# Training Loop
total_numsteps = 0
updates = 0
test_step = 10000

for i_episode in itertools.count(1):
    episode_reward = 0
    episode_steps = 0
    done = False
    state = env.reset()
    acc_log_alpha = 0.

    while not done:
        if config['start_steps'] > total_numsteps:
class DQN(): def __init__(self, env_type, state_dims, num_actions): if env_type == EnvTypes.ATARI: state_size = [state_dims[0], state_dims[1]*FRAME_STACK, state_dims[2]] elif env_type == EnvTypes.STANDARD: state_size = state_dims self.replay_memory = ReplayMemory(REPLAY_MEMORY_CAPACITY, state_size) self.exploration = 1.0 self.train_iter = 0 self.env_type = env_type if env_type == EnvTypes.ATARI: buffer_size = FRAME_STACK*FRAME_SKIP self.observation_buffer = [np.zeros((state_dims[0], state_dims[1], state_dims[2])) for _ in range(buffer_size)] else: self.observation_buffer = [np.zeros((state_dims[0]))] self.config = tf.ConfigProto() self.config.gpu_options.per_process_gpu_memory_fraction = GPU_MEMORY_FRACTION self.sess = tf.Session(config=self.config) # build q network self.dqn_vars = dict() with tf.variable_scope(DQN_SCOPE): if env_type == EnvTypes.ATARI: self.x, self.initial_layers = self.add_atari_layers(state_dims, self.dqn_vars) elif env_type == EnvTypes.STANDARD: self.x, self.initial_layers = self.add_standard_layers(state_dims, self.dqn_vars) # add final hidden layers self.hid = fc(self.initial_layers, 128, HIDDEN, var_dict=self.dqn_vars) self.q = fc(self.hid, num_actions, OUTPUT, var_dict=self.dqn_vars, activation=False) tf.histogram_summary('q_values', self.q) # build target network self.target_vars = dict() with tf.variable_scope(TARGET_SCOPE): if env_type == EnvTypes.ATARI: self.t_x, self.t_initial_layers = self.add_atari_layers(state_dims, self.target_vars) elif env_type == EnvTypes.STANDARD: self.t_x, self.t_initial_layers = self.add_standard_layers(state_dims, self.target_vars) self.t_hid = fc(self.t_initial_layers, 128, HIDDEN, var_dict=self.target_vars) self.t_q = fc(self.t_hid, num_actions, OUTPUT, var_dict=self.target_vars, activation=False) tf.histogram_summary('target_q_values', self.t_q) # add weight transfer operations from primary dqn network to target network self.assign_ops = [] with tf.variable_scope(TRANSFER_SCOPE): for variable in self.dqn_vars.keys(): target_variable = TARGET_SCOPE + variable[len(DQN_SCOPE):] decay = tf.mul(1 - TAU, self.target_vars[target_variable]) update = tf.mul(TAU, self.dqn_vars[variable]) new_target_weight = tf.add(decay, update) target_assign = self.target_vars[target_variable].assign(new_target_weight) self.assign_ops.append(target_assign) # build dqn evaluation with tf.variable_scope(EVALUATION_SCOPE): # one-hot action selection self.action = tf.placeholder(tf.int32, shape=[None]) self.action_one_hot = tf.one_hot(self.action, num_actions) # reward self.reward = tf.placeholder(tf.float32, shape=[None, 1]) # terminal state self.nonterminal = tf.placeholder(tf.float32, shape=[None, 1]) self.target = tf.add(self.reward, tf.mul(GAMMA, tf.mul(self.nonterminal, tf.reduce_max(self.t_q, 1, True)))) self.predict = tf.reduce_sum(tf.mul(self.action_one_hot, self.q), 1, True) self.error = tf.reduce_mean(mse(self.predict, self.target)) tf.scalar_summary('error', self.error) val_print = tf.Print(self.error, [self.predict, self.target]) self.optimize = tf.train.RMSPropOptimizer(ALPHA, decay=RMS_DECAY, momentum=MOMENTUM, epsilon=EPSILON).minimize(self.error, var_list=self.dqn_vars.values()) # write out the graph and summaries for tensorboard self.summaries = tf.merge_all_summaries() if os.path.isdir(TENSORBOARD_GRAPH_DIR): shutil.rmtree(TENSORBOARD_GRAPH_DIR) self.writer = tf.train.SummaryWriter(TENSORBOARD_GRAPH_DIR, self.sess.graph) # initialize variables self.sess.run(tf.initialize_all_variables()) # create saver self.saver = tf.train.Saver() def 
add_atari_layers(self, dims, var_dict): x = tf.placeholder(tf.float32, shape=[None, dims[0], dims[1]*FRAME_STACK, 1]) conv1 = conv2d(x, 8, 4, 32, CONV1, var_dict=var_dict) conv2 = conv2d(conv1, 4, 2, 64, CONV2, var_dict=var_dict) conv3 = conv2d(conv2, 3, 1, 64, CONV3, var_dict=var_dict) conv_shape = conv3.get_shape().as_list() flatten = [-1, conv_shape[1]*conv_shape[2]*conv_shape[3]] return x, tf.reshape(conv3, flatten) def add_standard_layers(self, dims, var_dict): x = tf.placeholder(tf.float32, shape=[None, dims[0]]) fc1 = fc(x, 256, FC, var_dict=var_dict) return x, fc1 def process_observation(self, observation): if self.env_type == EnvTypes.ATARI: # convert to normalized luminance and downscale observation = downscale(rgb_to_luminance(observation), 2) # push the new observation onto the buffer self.observation_buffer.pop(len(self.observation_buffer)-1) self.observation_buffer.insert(0, observation) def _get_stacked_state(self): stacked_state = self.observation_buffer[0] for i in range(1, FRAME_STACK): stacked_state = np.hstack((stacked_state, self.observation_buffer[i*FRAME_SKIP])) return stacked_state def _predict(self): if self.env_type == EnvTypes.ATARI: state = self._get_stacked_state() else: state = self.observation_buffer[0] state = np.expand_dims(state, axis=0) return np.argmax(self.sess.run(self.q, feed_dict={self.x: state})) def training_predict(self, env, observation): self.process_observation(observation) # select action according to epsilon-greedy policy if random.random() < self.exploration: action = env.action_space.sample() else: action = self._predict() self.exploration = max(self.exploration - EXPLORATION_DECAY, FINAL_EXPLORATION) return action def testing_predict(self, observation): self.process_observation(observation) return self._predict() def notify_state_transition(self, action, reward, done): if self.env_type == EnvTypes.ATARI: state = self._get_stacked_state() else: state = self.observation_buffer[0] self.replay_memory.add_state_transition(state, action, reward, done) if done: # flush the observation buffer for i in range(len(self.observation_buffer)): self.observation_buffer[i] = np.zeros(self.observation_buffer[i].shape) def batch_train(self, save_dir): # sample batch from replay memory state, action, reward, terminal, newstate = self.replay_memory.sample(BATCH_SIZE) reward = np.expand_dims(reward, axis=1) terminal = np.expand_dims(terminal, axis=1) nonterminal = 1 - terminal # update target network weights self.sess.run(self.assign_ops) # run neural network training step if self.train_iter % SUMMARY_PERIOD == 0: summary, _ = self.sess.run([self.summaries, self.optimize], feed_dict={self.x:state, self.t_x:newstate, self.action:action, self.reward:reward, self.nonterminal:nonterminal}) self.writer.add_summary(summary, self.train_iter) else: self.sess.run(self.optimize, feed_dict={self.x:state, self.t_x:newstate, self.action:action, self.reward:reward, self.nonterminal:nonterminal}) # save the dqn if save_dir is not None and self.train_iter % SAVE_CHECKPOINT_PERIOD == 0: self.save_algorithm(save_dir) self.train_iter += 1 def save_algorithm(self, save_dir): # create directory tree for saving the algorithm checkpoint_dir = save_dir + "/save_{}".format(self.train_iter) os.mkdir(checkpoint_dir) model_file = checkpoint_dir + "/model.ckpt" print("Saving algorithm to {}".format(checkpoint_dir)) t = time.time() self.saver.save(self.sess, model_file) print("Completed saving in {} seconds".format(time.time() - t)) def restore_algorithm(self, restore_dir): self.train_iter 
= int(restore_dir[restore_dir.rfind("save_") + len("save_"):]) self.saver.restore(self.sess, restore_dir + "/model.ckpt")
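# Hedged sketch (not in the original source): how the DQN class above is typically
# driven against a gym environment, using only the methods it exposes --
# training_predict(), notify_state_transition() and batch_train(). The function name,
# episode count and warm-up threshold are placeholders.
def run_training(env, dqn, episodes=100, warm_up=1000, save_dir=None):
    steps = 0
    for _ in range(episodes):
        observation = env.reset()
        done = False
        while not done:
            action = dqn.training_predict(env, observation)
            observation, reward, done, _ = env.step(action)
            dqn.notify_state_transition(action, reward, done)
            steps += 1
            # Only start gradient updates once some experience has accumulated.
            if steps > warm_up:
                dqn.batch_train(save_dir)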
class DoubleDQNAgent():
    def __init__(self, state_size, action_size):
        # for rendering
        self.render = False
        self.state_size = state_size
        self.action_size = action_size
        # hyperparams for estimator
        self.gamma = 0.95
        self.lr = 0.001
        self.replay_memory_size = 50000
        self.epsilon = 1.0
        self.epsilon_min = 0.000001
        self.explore_steps = 3000
        self.epsilon_decay = (self.epsilon - self.epsilon_min) / self.explore_steps
        self.batch_size = 32
        self.replay_memory_init_size = 1000
        # Estimators
        self.q_estimator = DQNEstimator(state_size, action_size)
        self.target_estimator = DQNEstimator(state_size, action_size)
        self.optimizer = optim.SGD(self.q_estimator.parameters(), lr=self.lr)
        # memory
        self.memory = ReplayMemory(self.replay_memory_size)

    def update_target_estimator(self):
        self.target_estimator.load_state_dict(self.q_estimator.state_dict())

    def get_action(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        else:
            state = Variable(torch.from_numpy(state)).float()
            q_values = self.q_estimator(state)
            _, best_action = torch.max(q_values, dim=1)
            return int(best_action)

    def train(self):
        # epsilon decay
        if self.epsilon > self.epsilon_min:
            self.epsilon -= self.epsilon_decay
        # fetch samples from memory
        batch = self.memory.sample(self.batch_size)
        batch = np.array(batch).transpose()
        # stack all the states
        states = np.vstack(batch[0])
        actions = torch.LongTensor(list(batch[1]))
        rewards = torch.FloatTensor(list(batch[2]))
        # stack all the next states
        next_states = np.vstack(batch[3])
        dones = batch[4]
        dones = dones.astype(int)
        # actions one hot encoding
        actions_one_hot = F.one_hot(actions, num_classes=self.action_size)
        actions_one_hot = torch.FloatTensor(actions_one_hot.float())
        actions_one_hot = Variable(actions_one_hot)
        # Forward prop
        states = torch.FloatTensor(states)
        states = Variable(states)
        preds = self.q_estimator(states)
        # get current action value
        preds = torch.sum(torch.mul(preds, actions_one_hot), dim=1)
        # Double DQN
        next_states = torch.FloatTensor(next_states)
        next_states = Variable(next_states)
        next_action_values = self.q_estimator(next_states)
        best_actions = torch.argmax(next_action_values, dim=1)
        q_values_next_target = self.target_estimator(next_states)
        dones = torch.FloatTensor(dones)
        target = rewards + (1 - dones) * self.gamma * q_values_next_target[
            np.arange(self.batch_size), best_actions]
        target = Variable(target)
        loss = F.mse_loss(preds, target).mean()
        # zero out accumulated grads
        self.optimizer.zero_grad()
        # back prop
        loss.backward()
        self.optimizer.step()
        return loss.item()
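# Hedged illustration (not in the original code) of the difference between the vanilla
# DQN target and the Double DQN target computed in train() above, written with plain
# torch tensors; all names below are illustrative only.
import torch

def dqn_target(rewards, dones, gamma, q_target_next):
    # Vanilla DQN: the target network both selects and evaluates the next action.
    return rewards + (1 - dones) * gamma * q_target_next.max(dim=1)[0]

def double_dqn_target(rewards, dones, gamma, q_online_next, q_target_next):
    # Double DQN: the online network selects the action, the target network evaluates it.
    best_actions = torch.argmax(q_online_next, dim=1)
    batch_index = torch.arange(q_target_next.size(0))
    return rewards + (1 - dones) * gamma * q_target_next[batch_index, best_actions]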
env = NormalizedActions(gym.make(args.env_name))
writer = SummaryWriter()

env.seed(args.seed)
torch.manual_seed(args.seed)
np.random.seed(args.seed)

if args.algo == "NAF":
    agent = NAF(args.gamma, args.tau, args.hidden_size,
                env.observation_space.shape[0], env.action_space)
else:
    agent = DDPG(args.gamma, args.tau, args.hidden_size,
                 env.observation_space.shape[0], env.action_space)

memory = ReplayMemory(args.replay_size)

ounoise = OUNoise(env.action_space.shape[0]) if args.ou_noise else None
param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.05,
                                     desired_action_stddev=args.noise_scale,
                                     adaptation_coefficient=1.05) if args.param_noise else None

rewards = []
total_numsteps = 0
updates = 0

for i_episode in range(args.num_episodes):
    state = torch.Tensor([env.reset()])

    if args.ou_noise:
        ounoise.scale = (args.noise_scale - args.final_noise_scale) * max(0, args.exploration_end - i_episode) / args.exploration_end + args.final_noise_scale
class DQNAgent(): def __init__(self, state_size, action_size): self.state_size = state_size self.action_size = action_size # visualising training self.render = False self.load_model = False # hyperparams for estimator self.gamma = 0.95 self.lr = 0.001 self.replay_memory_size = 50000 self.epsilon = 1.0 self.min_epsilon = 0.000001 self.explore_step = 3000 self.epsilon_decay = (self.epsilon - self.min_epsilon) / self.explore_step self.batch_size = 32 self.replay_memory_init_size = 500 self.update_target_model_every = 1000 # Replay Memory self.memory = ReplayMemory(self.replay_memory_size) # create estimator and target estimators self.q_estimator = DQNEstimator(state_size, action_size) self.target_estimator = DQNEstimator(state_size, action_size) self.optimizer = optim.Adam(self.q_estimator.parameters(), lr=self.lr) # initialize target estimator # TODO: copy q_estimator weights to target model if self.load_model: # TODO: Load saved Q estimator pass def update_target_estimator(self): self.target_estimator.load_state_dict(self.q_estimator.state_dict()) def get_action(self, state): # random action if np.random.rand() <= self.epsilon: return random.randrange(self.action_size) else: # greedy action state = Variable(torch.from_numpy(state)).float() q_values = self.q_estimator(state) _, best_action = torch.max(q_values, dim=1) return int(best_action) def train_network(self): # epsilon decay if self.epsilon > self.min_epsilon: self.epsilon -= self.epsilon_decay # fetch samples samples = self.memory.sample(self.batch_size) samples = np.array(samples).transpose() # create batches of states, actions, rewards, next_states, done # stack all the states states = np.vstack(samples[0]) actions = torch.LongTensor(list(samples[1])) rewards = torch.FloatTensor(list(samples[2])) next_states = Variable(torch.FloatTensor(np.vstack(samples[3]))) is_dones = samples[4] is_dones = torch.FloatTensor(is_dones.astype(int)) # forward propagation Q_network for current states states = torch.Tensor(states) states = Variable(states).float() preds = self.q_estimator(states) # onehot encoding actions actions_one_hot = F.one_hot(actions, num_classes=self.action_size) actions_one_hot = torch.FloatTensor(actions_one_hot.float()) actions_one_hot = Variable(actions_one_hot) # get current actions' action value preds = torch.sum(torch.mul(preds, actions_one_hot), dim=1) # Q function of next state nex_state_preds = self.target_estimator(next_states).data # calculate Q-Learning target target = rewards + (1 - is_dones) * self.gamma * torch.max( nex_state_preds, dim=1)[0] target = Variable(target) # calculate mse loss (preds and targets) loss = F.mse_loss(preds, target).mean() # backward propagation self.optimizer.zero_grad() loss.backward() self.optimizer.step() return loss.item()
def setUp(self):
    self.heap = BinaryHeap()
    self.replayMemory = ReplayMemory(10, 32, 4, 84, 84)
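# The snippets in this file construct several project-specific ReplayMemory classes
# with different constructor signatures. As a point of reference only, here is a
# minimal, self-contained uniform replay buffer with the add/sample interface these
# agents assume; it is an illustrative sketch, not any of the classes used above.
import random
from collections import deque

class MinimalReplayMemory(object):
    def __init__(self, capacity):
        # Oldest transitions are evicted automatically once capacity is reached.
        self.buffer = deque(maxlen=capacity)

    def add(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        # Uniform sampling without replacement from the stored transitions.
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        return len(self.buffer)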
class Driver(object): ''' A driver object for the SCRC ''' def __init__(self, args): '''Constructor''' self.WARM_UP = 0 self.QUALIFYING = 1 self.RACE = 2 self.UNKNOWN = 3 self.stage = args.stage self.parser = msgParser.MsgParser() self.state = carState.CarState() self.control = carControl.CarControl() self.steers = [-1.0, -0.8, -0.6, -0.5, -0.4, -0.3, -0.2, -0.15, -0.1, -0.05, 0.0, 0.05, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.6, 0.8, 1.0] self.speeds = [-1.0, -0.5, 0.0, 0.5, 1.0] self.num_inputs = 19 self.num_steers = len(self.steers) self.num_speeds = len(self.speeds) self.num_actions = self.num_steers + self.num_speeds self.net = DeepQNetwork(self.num_inputs, self.num_steers, self.num_speeds, args) self.mem = ReplayMemory(args.replay_size, self.num_inputs, args) self.minibatch_size = args.batch_size if args.load_replay: self.mem.load(args.load_replay) if args.load_weights: self.net.load_weights(args.load_weights) self.save_weights_prefix = args.save_weights_prefix self.save_interval = args.save_interval self.save_replay = args.save_replay self.enable_training = args.enable_training self.enable_exploration = args.enable_exploration self.save_csv = args.save_csv if self.save_csv: self.csv_file = open(args.save_csv, "wb") self.csv_writer = csv.writer(self.csv_file) self.csv_writer.writerow(['episode', 'distFormStart', 'distRaced', 'curLapTime', 'lastLapTime', 'racePos', 'epsilon', 'replay_memory', 'train_steps']) self.total_train_steps = 0 self.exploration_decay_steps = args.exploration_decay_steps self.exploration_rate_start = args.exploration_rate_start self.exploration_rate_end = args.exploration_rate_end self.skip = args.skip self.show_sensors = args.show_sensors self.show_qvalues = args.show_qvalues self.episode = 0 self.distances = [] self.onRestart() if self.show_sensors: from sensorstats import Stats self.stats = Stats(inevery=8) if self.show_qvalues: from plotq import PlotQ self.plotq = PlotQ(self.num_steers, self.num_speeds) def init(self): '''Return init string with rangefinder angles''' self.angles = [0 for x in range(19)] for i in range(5): self.angles[i] = -90 + i * 15 self.angles[18 - i] = 90 - i * 15 for i in range(5, 9): self.angles[i] = -20 + (i-5) * 5 self.angles[18 - i] = 20 - (i-5) * 5 return self.parser.stringify({'init': self.angles}) def getState(self): #state = np.array([self.state.getSpeedX() / 200.0, self.state.getAngle(), self.state.getTrackPos()]) #state = np.array(self.state.getTrack() + [self.state.getSpeedX()]) / 200.0 state = np.array(self.state.getTrack()) / 200.0 assert state.shape == (self.num_inputs,) return state def getReward(self, terminal): if terminal: reward = -1000 else: dist = self.state.getDistFromStart() if self.prev_dist is not None: reward = max(0, dist - self.prev_dist) * 10 assert reward >= 0, "reward: %f" % reward else: reward = 0 self.prev_dist = dist #reward -= self.state.getTrackPos() #print "reward:", reward return reward def getTerminal(self): return np.all(np.array(self.state.getTrack()) == -1) def getEpsilon(self): # calculate decaying exploration rate if self.total_train_steps < self.exploration_decay_steps: return self.exploration_rate_start - self.total_train_steps * (self.exploration_rate_start - self.exploration_rate_end) / self.exploration_decay_steps else: return self.exploration_rate_end def drive(self, msg): # parse incoming message self.state.setFromMsg(msg) # show sensors if self.show_sensors: self.stats.update(self.state) # training if self.enable_training and self.mem.count >= self.minibatch_size: minibatch = 
self.mem.getMinibatch() self.net.train(minibatch) self.total_train_steps += 1 #print "total_train_steps:", self.total_train_steps # skip frame and use the same action as previously if self.skip > 0: self.frame = (self.frame + 1) % self.skip if self.frame != 0: return self.control.toMsg() # fetch state, calculate reward and terminal indicator state = self.getState() terminal = self.getTerminal() reward = self.getReward(terminal) #print "reward:", reward # store new experience in replay memory if self.enable_training and self.prev_state is not None and self.prev_steer is not None and self.prev_speed is not None: self.mem.add(self.prev_state, self.prev_steer, self.prev_speed, reward, state, terminal) # if terminal state (out of track), then restart game if terminal: #print "terminal state, restarting" self.control.setMeta(1) return self.control.toMsg() else: self.control.setMeta(0) # choose actions for wheel and speed epsilon = self.getEpsilon() if self.enable_exploration and random.random() < epsilon: #print "random move" steer = random.randrange(self.num_steers) #speed = random.randrange(self.num_speeds) speed = random.randint(2, self.num_speeds-1) else: # use broadcasting to efficiently produce minibatch of desired size minibatch = state + np.zeros((self.minibatch_size, 1)) Q = self.net.predict(minibatch) assert Q.shape == (self.minibatch_size, self.num_actions), "Q.shape: %s" % str(Q.shape) #print "steer Q: ", Q[0,:self.num_steers] #print "speed Q:", Q[0,-self.num_speeds:] steer = np.argmax(Q[0, :self.num_steers]) speed = np.argmax(Q[0, -self.num_speeds:]) if self.show_qvalues: self.plotq.update(Q[0]) #print "steer:", steer, "speed:", speed # gears are always automatic gear = self.gear() # set actions self.setSteerAction(steer) self.setGearAction(gear) self.setSpeedAction(speed) # remember state and actions self.prev_state = state self.prev_steer = steer self.prev_speed = speed #print "total_train_steps:", self.total_train_steps, "mem_count:", self.mem.count #print "reward:", reward, "epsilon:", epsilon return self.control.toMsg() def gear(self): rpm = self.state.getRpm() gear = self.state.getGear() if self.prev_rpm == None: up = True else: if (self.prev_rpm - rpm) < 0: up = True else: up = False if up and rpm > 7000 and gear < 6: gear += 1 if not up and rpm < 3000 and gear > 0: gear -= 1 return gear def setSteerAction(self, steer): assert 0 <= steer <= self.num_steers self.control.setSteer(self.steers[steer]) def setGearAction(self, gear): assert -1 <= gear <= 6 self.control.setGear(gear) def setSpeedAction(self, speed): assert 0 <= speed <= self.num_speeds accel = self.speeds[speed] if accel >= 0: #print "accel", accel self.control.setAccel(accel) self.control.setBrake(0) else: #print "brake", -accel self.control.setAccel(0) self.control.setBrake(-accel) def onShutDown(self): if self.save_weights_prefix: self.net.save_weights(self.save_weights_prefix + "_" + str(self.episode) + ".pkl") if self.save_replay: self.mem.save(self.save_replay) if self.save_csv: self.csv_file.close() def onRestart(self): self.prev_rpm = None self.prev_dist = None self.prev_state = None self.prev_steer = None self.prev_speed = None self.frame = -1 if self.episode > 0: dist = self.state.getDistRaced() self.distances.append(dist) epsilon = self.getEpsilon() print "Episode:", self.episode, "\tDistance:", dist, "\tMax:", max(self.distances), "\tMedian10:", np.median(self.distances[-10:]), \ "\tEpsilon:", epsilon, "\tReplay memory:", self.mem.count if self.save_weights_prefix and self.save_interval > 0 and 
self.episode % self.save_interval == 0: self.net.save_weights(self.save_weights_prefix + "_" + str(self.episode) + ".pkl") #self.mem.save(self.save_weights_prefix + "_" + str(self.episode) + "_replay.pkl") if self.save_csv: self.csv_writer.writerow([ self.episode, self.state.getDistFromStart(), self.state.getDistRaced(), self.state.getCurLapTime(), self.state.getLastLapTime(), self.state.getRacePos(), epsilon, self.mem.count, self.total_train_steps ]) self.csv_file.flush() self.episode += 1
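The driver's getEpsilon() anneals exploration linearly from exploration_rate_start to exploration_rate_end over exploration_decay_steps training steps. A minimal standalone sketch of that schedule (the default values here are illustrative, not the script's actual arguments):

def linear_epsilon(step, start=1.0, end=0.1, decay_steps=100000):
    # Linear anneal: interpolate from start to end, then hold at end.
    if step < decay_steps:
        return start - step * (start - end) / decay_steps
    return end

for s in (0, 50000, 100000, 200000):
    print(s, linear_epsilon(s))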
self.position.unsqueeze(0)) else: reward = 0 next_state = None return cur_state, next_state, self.to_variable(np.array([reward])) def to_variable(self, x): if torch.cuda.is_available(): return Variable(torch.from_numpy(x).float()).cuda() else: return Variable(torch.from_numpy(x).float()) if __name__ == '__main__': env = Environment(test=False, init_position=np.array([0, 1])) replay_memory = ReplayMemory(100) max_val = 0 min_val = env.wealth while True: action = np.random.randint(3) cur_state, next_state, reward = env.step(action) replay_memory.push(cur_state, action, next_state, reward) if next_state is None: break max_val = max(max_val, env.wealth) min_val = min(min_val, env.wealth) print(max_val, min_val) # transitions = replay_memory.sample(1) # Transition(*zip(*transitions)) # batch = Transition(*zip(*transitions))
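The commented-out lines assume a buffer with push() and sample() and a Transition namedtuple that can be re-zipped into a batch. A minimal sketch of such a buffer, assuming the (state, action, next_state, reward) field order implied by the push() call above:

import random
from collections import namedtuple, deque

# Hypothetical minimal replay buffer matching the push()/sample() usage above.
Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))

class SimpleReplayMemory:
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)  # oldest transitions are dropped first

    def push(self, *args):
        self.buffer.append(Transition(*args))

    def sample(self, batch_size):
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        return len(self.buffer)

memory = SimpleReplayMemory(100)
memory.push([0.0], 1, [0.1], 0.5)
transitions = memory.sample(1)
batch = Transition(*zip(*transitions))  # column-wise batch, as in the commented code
print(batch)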
class Agent: def __init__(self, max_memory, batch_size, action_size, atom_size, input_size, kernel_size): self.z = np.linspace(V_MIN, V_MAX, ATOM_SIZE) self.action_size = action_size self.epsilon = EPSILON self.batch_size = batch_size self.atom_size = atom_size self.memory = ReplayMemory(max_memory) self.brain = RainbowDQN(action_size=action_size, atom_size=atom_size, input_size=input_size, kernel_size=kernel_size) self.target_brain = RainbowDQN(action_size=action_size, atom_size=atom_size, input_size=input_size, kernel_size=kernel_size) self.target_brain.load_state_dict(self.brain.state_dict()) self.optim = optim.Adam(self.brain.parameters(), lr=0.001) def step(self, state_input): probs = self.brain(state_input) best_action = self.select_best_action(probs) return best_action def select_best_action(self, probs): numpy_probs = self.variable_to_numpy(probs) z_probs = np.multiply(numpy_probs, self.z) best_action = np.sum(z_probs, axis=1).argmax() # best_action = np.argmax(numpy_probs, axis=1) return best_action def store_states(self, states, best_action, reward, done, next_states): td = self.calculate_td(states, best_action, reward, done, next_states) self.memory.add_memory(states, best_action, reward, done, next_states, td=td) def variable_to_numpy(self, probs): # probs is a list of softmax prob numpy_probs = probs.data.numpy() return numpy_probs #TODO find out why td does not get -100 reward def calculate_td(self, states, best_action, reward, done, next_states): probs = self.brain(states) numpy_probs = self.variable_to_numpy(probs) # states_prob = np.multiply(numpy_probs, self.z) # states_q_value = np.sum(states_prob, axis=1)[best_action] states_q_value = numpy_probs[0][best_action] next_probs = self.brain(next_states) numpy_next_probs = self.variable_to_numpy(next_probs) # next_states_prob = np.multiply(numpy_next_probs, self.z) # max_next_states_q_value = np.sum(next_states_prob, axis=1).max() max_next_states_q_value = np.max(numpy_next_probs, axis=1)[0] if done: td = reward - states_q_value else: td = (reward + gamma * max_next_states_q_value) - states_q_value return abs(td) def learn(self): # make sure that there is at least an amount of batch_size before training it if self.memory.count < self.batch_size: return tree_indexes, tds, batches = self.memory.get_memory(self.batch_size) total_loss = None for index, batch in enumerate(batches): # fixme fix this None type if batch is None: continue state_input = batch[0] best_action = batch[1] reward = batch[2] done = batch[3] next_state_input = batch[4] current_q = self.brain(state_input) next_best_action = self.step(next_state_input) # max_current_q = torch.max(current_q) next_z_prob = self.target_brain(next_state_input) next_z_prob = self.variable_to_numpy(next_z_prob) # target = reward + (1 - done) * gamma * next_z_prob.data[0][next_best_action] # target = Variable(torch.FloatTensor([target])) #TODO finish single dqn with per target_z_prob = np.zeros([self.action_size, ATOM_SIZE], dtype=np.float32) if done: Tz = min(V_MAX, max(V_MIN, reward)) b = (Tz - V_MIN) / (self.z[1] - self.z[0]) m_l = math.floor(b) m_u = math.ceil(b) target_z_prob[best_action][m_l] += (m_u - b) target_z_prob[best_action][m_u] += (b - m_l) else: for z_index in range(len(next_z_prob)): Tz = min(V_MAX, max(V_MIN, reward + gamma * self.z[z_index])) b = (Tz - V_MIN) / (self.z[1] - self.z[0]) m_l = math.floor(b) m_u = math.ceil(b) target_z_prob[best_action][m_l] += next_z_prob[next_best_action][z_index] * (m_u - b) target_z_prob[best_action][m_u] += 
next_z_prob[next_best_action][z_index] * (b - m_l) target_z_prob = Variable(torch.from_numpy(target_z_prob)) # backward propagate output_prob = self.brain(state_input)[0] loss = -torch.sum(target_z_prob * torch.log(output_prob + 1e-8)) # loss = F.mse_loss(max_current_q, target) total_loss = loss if total_loss is None else total_loss + loss # update td td = self.calculate_td(state_input, best_action, reward, done, next_state_input) tds[index] = td self.optim.zero_grad() total_loss.backward() self.optim.step() # load brain to target brain self.target_brain.load_state_dict(self.brain.state_dict()) self.memory.update_memory(tree_indexes, tds)
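The Tz / b / m_l / m_u block above projects the Bellman-updated return distribution back onto the fixed support self.z. A standalone NumPy sketch of that projection, simplified relative to the agent's loop and explicitly handling the m_l == m_u case that the inline code leaves implicit:

import numpy as np

def project_distribution(next_prob, reward, done, z, gamma=0.99):
    # next_prob: target-network probabilities over atoms for the chosen next action.
    v_min, v_max = z[0], z[-1]
    dz = z[1] - z[0]
    target = np.zeros_like(z)

    def add_mass(Tz, p):
        Tz = min(v_max, max(v_min, Tz))      # clip the updated return to the support
        b = (Tz - v_min) / dz
        l, u = int(np.floor(b)), int(np.ceil(b))
        if l == u:                           # b landed exactly on an atom
            target[l] += p
        else:
            target[l] += p * (u - b)
            target[u] += p * (b - l)

    if done:
        add_mass(reward, 1.0)                # terminal: all mass at the clipped reward
    else:
        for j, p in enumerate(next_prob):
            add_mass(reward + gamma * z[j], p)
    return target

z = np.linspace(-10, 10, 51)
proj = project_distribution(np.full(51, 1.0 / 51), reward=1.0, done=False, z=z)
print(proj.sum())  # ~1.0: the projection preserves total probability mass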
def __init__(self, dimO, dimA): dimA = list(dimA) dimO = list(dimO) nets = ddpg_nets_dm tau = FLAGS.tau discount = FLAGS.discount pl2norm = FLAGS.pl2norm l2norm = FLAGS.l2norm plearning_rate = FLAGS.prate learning_rate = FLAGS.rate outheta = FLAGS.outheta ousigma = FLAGS.ousigma # init replay memory self.rm = ReplayMemory(FLAGS.rmsize, dimO, dimA) # start tf session self.sess = tf.Session( config=tf.ConfigProto(inter_op_parallelism_threads=FLAGS.thread, log_device_placement=False, allow_soft_placement=True, gpu_options=tf.GPUOptions( per_process_gpu_memory_fraction=0.1))) # create tf computational graph # self.theta_p = nets.theta_p(dimO, dimA, FLAGS.l1size, FLAGS.l2size) self.theta_q = nets.theta_q(dimO, dimA, FLAGS.l1size, FLAGS.l2size) self.theta_pt, update_pt = exponential_moving_averages( self.theta_p, tau) self.theta_qt, update_qt = exponential_moving_averages( self.theta_q, tau) obs = tf.placeholder(tf.float32, [None] + dimO, "obs") act_test = nets.policy(obs, self.theta_p) # explore noise_init = tf.zeros([1] + dimA) noise_var = tf.Variable(noise_init) self.ou_reset = noise_var.assign(noise_init) noise = noise_var.assign_sub((outheta) * noise_var - tf.random_normal(dimA, stddev=ousigma)) act_expl = act_test + noise # test q = nets.qfunction(obs, act_test, self.theta_q) # training # q optimization act_train = tf.placeholder(tf.float32, [FLAGS.bsize] + dimA, "act_train") rew = tf.placeholder(tf.float32, [FLAGS.bsize], "rew") obs2 = tf.placeholder(tf.float32, [FLAGS.bsize] + dimO, "obs2") term2 = tf.placeholder(tf.bool, [FLAGS.bsize], "term2") # policy loss act_train_policy = nets.policy(obs, self.theta_p) q_train_policy = nets.qfunction(obs, act_train_policy, self.theta_q) meanq = tf.reduce_mean(q_train_policy, 0) wd_p = tf.add_n([pl2norm * tf.nn.l2_loss(var) for var in self.theta_p]) # weight decay loss_p = -meanq + wd_p # policy optimization optim_p = tf.train.AdamOptimizer(learning_rate=plearning_rate, epsilon=1e-4) grads_and_vars_p = optim_p.compute_gradients(loss_p, var_list=self.theta_p) optimize_p = optim_p.apply_gradients(grads_and_vars_p) with tf.control_dependencies([optimize_p]): train_p = tf.group(update_pt) # q q_train = nets.qfunction(obs, act_train, self.theta_q) # q targets act2 = nets.policy(obs2, theta=self.theta_pt) q2 = nets.qfunction(obs2, act2, theta=self.theta_qt) q_target = tf.stop_gradient(tf.select(term2, rew, rew + discount * q2)) # q_target = tf.stop_gradient(rew + discount * q2) # q loss td_error = q_train - q_target ms_td_error = tf.reduce_mean(tf.square(td_error), 0) wd_q = tf.add_n([l2norm * tf.nn.l2_loss(var) for var in self.theta_q]) # weight decay loss_q = ms_td_error + wd_q # q optimization optim_q = tf.train.AdamOptimizer(learning_rate=learning_rate, epsilon=1e-4) grads_and_vars_q = optim_q.compute_gradients(loss_q, var_list=self.theta_q) optimize_q = optim_q.apply_gradients(grads_and_vars_q) with tf.control_dependencies([optimize_q]): train_q = tf.group(update_qt) summary_writer = tf.train.SummaryWriter( os.path.join(FLAGS.outdir, 'board'), self.sess.graph) summary_list = [] summary_list.append( tf.scalar_summary('Qvalue', tf.reduce_mean(q_train))) summary_list.append(tf.scalar_summary('loss', ms_td_error)) summary_list.append(tf.scalar_summary('reward', tf.reduce_mean(rew))) # tf functions with self.sess.as_default(): self._act_test = Fun(obs, act_test) self._act_expl = Fun(obs, act_expl) self._reset = Fun([], self.ou_reset) self._train = Fun([obs, act_train, rew, obs2, term2], [train_p, train_q, loss_q], summary_list, summary_writer) # 
initialize tf variables self.saver = tf.train.Saver(max_to_keep=1) ckpt = tf.train.latest_checkpoint(FLAGS.outdir + "/tf") if ckpt: self.saver.restore(self.sess, ckpt) else: self.sess.run(tf.initialize_all_variables()) self.sess.graph.finalize() self.t = 0 # global training time (number of observations)
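The exploration branch above maintains Ornstein-Uhlenbeck noise inside the graph via noise_var.assign_sub(outheta * noise - random_normal(stddev=ousigma)). A NumPy sketch of the same mean-reverting process (the theta/sigma values are illustrative, not the FLAGS defaults):

import numpy as np

class OUNoise:
    def __init__(self, dim, theta=0.15, sigma=0.2):
        self.theta, self.sigma = theta, sigma
        self.state = np.zeros(dim)

    def reset(self):
        self.state[:] = 0.0

    def sample(self):
        # x <- x - (theta * x - sigma * N(0, 1)): mean-reverting noise, mirrors assign_sub
        self.state -= self.theta * self.state - self.sigma * np.random.randn(*self.state.shape)
        return self.state

noise = OUNoise(dim=2)
print([noise.sample() for _ in range(3)])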
class Agent: def __init__(self, dimO, dimA): dimA, dimO = dimA[0], dimO[0] self.dimA = dimA self.dimO = dimO tau = FLAGS.tau discount = FLAGS.discount l2norm = FLAGS.l2norm learning_rate = FLAGS.rate outheta = FLAGS.outheta ousigma = FLAGS.ousigma if FLAGS.icnn_opt == 'adam': self.opt = self.adam elif FLAGS.icnn_opt == 'bundle_entropy': self.opt = self.bundle_entropy else: raise RuntimeError("Unrecognized ICNN optimizer: "+FLAGS.icnn_opt) self.rm = ReplayMemory(FLAGS.rmsize, dimO, dimA) self.sess = tf.Session(config=tf.ConfigProto( inter_op_parallelism_threads=FLAGS.thread, log_device_placement=False, allow_soft_placement=True, gpu_options=tf.GPUOptions(allow_growth=True))) self.noise = np.zeros(self.dimA) obs = tf.placeholder(tf.float32, [None, dimO], "obs") act = tf.placeholder(tf.float32, [None, dimA], "act") rew = tf.placeholder(tf.float32, [None], "rew") with tf.variable_scope('q'): negQ = self.negQ(obs, act) negQ_entr = negQ - entropy(act) q = -negQ q_entr = -negQ_entr act_grad, = tf.gradients(negQ, act) act_grad_entr, = tf.gradients(negQ_entr, act) obs_target = tf.placeholder(tf.float32, [None, dimO], "obs_target") act_target = tf.placeholder(tf.float32, [None, dimA], "act_target") term_target = tf.placeholder(tf.bool, [None], "term_target") with tf.variable_scope('q_target'): negQ_target = self.negQ(obs_target, act_target) negQ_entr_target = negQ_target - entropy(act_target) act_target_grad, = tf.gradients(negQ_target, act_target) act_entr_target_grad, = tf.gradients(negQ_entr_target, act_target) q_target = -negQ_target q_target_entr = -negQ_entr_target if FLAGS.icnn_opt == 'adam': y = tf.where(term_target, rew, rew + discount * q_target_entr) y = tf.maximum(q_entr - 1., y) y = tf.minimum(q_entr + 1., y) y = tf.stop_gradient(y) td_error = q_entr - y elif FLAGS.icnn_opt == 'bundle_entropy': q_target = tf.where(term2, rew, rew + discount * q2_entropy) q_target = tf.maximum(q_entropy - 1., q_target) q_target = tf.minimum(q_entropy + 1., q_target) q_target = tf.stop_gradient(q_target) td_error = q_entropy - q_target else: raise RuntimeError("Needs checking.") ms_td_error = tf.reduce_mean(tf.square(td_error), 0) regLosses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES, scope='q/') loss_q = ms_td_error + l2norm*tf.reduce_sum(regLosses) self.theta_ = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='q/') self.theta_cvx_ = [v for v in self.theta_ if 'proj' in v.name and 'W:' in v.name] self.makeCvx = [v.assign(tf.abs(v)) for v in self.theta_cvx_] self.proj = [v.assign(tf.maximum(v, 0)) for v in self.theta_cvx_] # self.proj = [v.assign(tf.abs(v)) for v in self.theta_cvx_] self.theta_target_ = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='q_target/') update_target = [theta_target_i.assign_sub(tau*(theta_target_i-theta_i)) for theta_i, theta_target_i in zip(self.theta_, self.theta_target_)] optim_q = tf.train.AdamOptimizer(learning_rate=learning_rate) grads_and_vars_q = optim_q.compute_gradients(loss_q) optimize_q = optim_q.apply_gradients(grads_and_vars_q) summary_writer = tf.summary.FileWriter(os.path.join(FLAGS.outdir, 'board'), self.sess.graph) if FLAGS.icnn_opt == 'adam': tf.summary.scalar('Qvalue', tf.reduce_mean(q)) elif FLAGS.icnn_opt == 'bundle_entropy': tf.summary.scalar('Qvalue', tf.reduce_mean(q_entr)) tf.summary.scalar('loss', ms_td_error) tf.summary.scalar('reward', tf.reduce_mean(rew)) merged = tf.summary.merge_all # tf functions with self.sess.as_default(): self._train = Fun([obs, act, rew, obs_target, act_target, term_target], [optimize_q, 
update_target, loss_q], merged, summary_writer) self._fg = Fun([obs, act], [negQ, act_grad]) self._fg_target = Fun([obs_target, act_target], [negQ_target, act_target_grad]) self._fg_entr = Fun([obs, act], [negQ_entr, act_grad_entr]) self._fg_entr_target = Fun([obs_target, act_target], [negQ_entr_target, act_entr_target_grad]) # initialize tf variables self.saver = tf.train.Saver(max_to_keep=1) ckpt = tf.train.latest_checkpoint(FLAGS.outdir + "/tf") if ckpt: self.saver.restore(self.sess, ckpt) else: self.sess.run(tf.initialize_all_variables()) self.sess.run(self.makeCvx) self.sess.run([theta_target_i.assign(theta_i) for theta_i, theta_target_i in zip(self.theta_, self.theta_target_)]) self.sess.graph.finalize() self.t = 0 # global training time (number of observations) def bundle_entropy(self, func, obs): act = np.ones((obs.shape[0], self.dimA)) * 0.5 def fg(x): value, grad = func(obs, 2 * x - 1) grad *= 2 return value, grad act = bundle_entropy.solveBatch(fg, act)[0] act = 2 * act - 1 return act def adam(self, func, obs, plot=False): # if npr.random() < 1./20: # plot = True b1 = 0.9 b2 = 0.999 lam = 0.5 eps = 1e-8 alpha = 0.01 nBatch = obs.shape[0] act = np.zeros((nBatch, self.dimA)) m = np.zeros_like(act) v = np.zeros_like(act) b1t, b2t = 1., 1. act_best, a_diff, f_best = [None]*3 hist = {'act': [], 'f': [], 'g': []} for i in range(1000): f, g = func(obs, act) if plot: hist['act'].append(act.copy()) hist['f'].append(f) hist['g'].append(g) if i == 0: act_best = act.copy() f_best = f.copy() else: prev_act_best = act_best.copy() I = (f < f_best) act_best[I] = act[I] f_best[I] = f[I] a_diff_i = np.mean(np.linalg.norm(act_best - prev_act_best, axis=1)) a_diff = a_diff_i if a_diff is None \ else lam*a_diff + (1.-lam)*a_diff_i # print(a_diff_i, a_diff, np.sum(f)) if a_diff < 1e-3 and i > 5: #print(' + Adam took {} iterations'.format(i)) if plot: self.adam_plot(func, obs, hist) return act_best m = b1 * m + (1. - b1) * g v = b2 * v + (1. 
- b2) * (g * g) b1t *= b1 b2t *= b2 mhat = m/(1.-b1t) vhat = v/(1.-b2t) act -= alpha * mhat / (np.sqrt(v) + eps) # act = np.clip(act, -1, 1) act = np.clip(act, -1.+1e-8, 1.-1e-8) #print(' + Warning: Adam did not converge.') if plot: self.adam_plot(func, obs, hist) return act_best def adam_plot(self, func, obs, hist): hist['act'] = np.array(hist['act']).T hist['f'] = np.array(hist['f']).T hist['g'] = np.array(hist['g']).T if self.dimA == 1: xs = np.linspace(-1.+1e-8, 1.-1e-8, 100) ys = [func(obs[[0],:], [[xi]])[0] for xi in xs] fig = plt.figure() plt.plot(xs, ys) plt.plot(hist['act'][0,0,:], hist['f'][0,:], label='Adam') plt.legend() fname = os.path.join(FLAGS.outdir, 'adamPlt.png') #print("Saving Adam plot to {}".format(fname)) plt.savefig(fname) plt.close(fig) elif self.dimA == 2: assert(False) else: xs = npr.uniform(-1., 1., (5000, self.dimA)) ys = np.array([func(obs[[0],:], [xi])[0] for xi in xs]) epi = np.hstack((xs, ys)) pca = PCA(n_components=2).fit(epi) W = pca.components_[:,:-1] xs_proj = xs.dot(W.T) fig = plt.figure() X = Y = np.linspace(xs_proj.min(), xs_proj.max(), 100) Z = griddata(xs_proj[:,0], xs_proj[:,1], ys.ravel(), X, Y, interp='linear') plt.contourf(X, Y, Z, 15) plt.colorbar() adam_x = hist['act'][:,0,:].T adam_x = adam_x.dot(W.T) plt.plot(adam_x[:,0], adam_x[:,1], label='Adam', color='k') plt.legend() fname = os.path.join(FLAGS.outdir, 'adamPlt.png') #print("Saving Adam plot to {}".format(fname)) plt.savefig(fname) plt.close(fig) def reset(self, obs): self.noise = np.zeros(self.dimA) self.observation = obs # initial observation def act(self, test=False): with self.sess.as_default(): #print('--- Selecting action, test={}'.format(test)) obs = np.expand_dims(self.observation, axis=0) if FLAGS.icnn_opt == 'adam': f = self._fg_entr # f = self._fg elif FLAGS.icnn_opt == 'bundle_entropy': f = self._fg else: raise RuntimeError("Unrecognized ICNN optimizer: "+FLAGS.icnn_opt) tflearn.is_training(False) action = self.opt(f, obs) tflearn.is_training(not test) if not test: self.noise -= FLAGS.outheta*self.noise - \ FLAGS.ousigma*npr.randn(self.dimA) action += self.noise action = np.clip(action, -1, 1) self.action = np.atleast_1d(np.squeeze(action, axis=0)) return self.action def observe(self, rew, term, obs2, test=False): obs1 = self.observation self.observation = obs2 # train if not test: self.t = self.t + 1 self.rm.enqueue(obs1, term, self.action, rew) if self.t > FLAGS.warmup: for i in range(FLAGS.iter): loss = self.train() def train(self): with self.sess.as_default(): obs, act, rew, ob2, term2, info = self.rm.minibatch(size=FLAGS.bsize) if FLAGS.icnn_opt == 'adam': # f = self._opt_train_entr f = self._fg_entr_target # f = self._fg_target elif FLAGS.icnn_opt == 'bundle_entropy': f = self._fg_target else: raise RuntimeError("Unrecognized ICNN optimizer: "+FLAGS.icnn_opt) #print('--- Optimizing for training') tflearn.is_training(False) act2 = self.opt(f, ob2) tflearn.is_training(True) _, _, loss = self._train(obs, act, rew, ob2, act2, term2, log=FLAGS.summary, global_step=self.t) self.sess.run(self.proj) return loss def negQ(self, x, y, reuse=False): szs = [FLAGS.l1size, FLAGS.l2size] assert(len(szs) >= 1) fc = tflearn.fully_connected bn = tflearn.batch_normalization lrelu = tflearn.activations.leaky_relu if reuse: tf.get_variable_scope().reuse_variables() nLayers = len(szs) us = [] zs = [] z_zs = [] z_ys = [] z_us = [] reg = 'L2' prevU = x for i in range(nLayers): with tf.variable_scope('u'+str(i)) as s: u = fc(prevU, szs[i], reuse=reuse, scope=s, regularizer=reg) if i < 
nLayers-1: u = tf.nn.relu(u) if FLAGS.icnn_bn: u = bn(u, reuse=reuse, scope=s, name='bn') variable_summaries(u, suffix='u{}'.format(i)) us.append(u) prevU = u prevU, prevZ = x, y for i in range(nLayers+1): sz = szs[i] if i < nLayers else 1 z_add = [] if i > 0: with tf.variable_scope('z{}_zu_u'.format(i)) as s: zu_u = fc(prevU, szs[i-1], reuse=reuse, scope=s, activation='relu', bias=True, regularizer=reg, bias_init=tf.constant_initializer(1.)) variable_summaries(zu_u, suffix='zu_u{}'.format(i)) with tf.variable_scope('z{}_zu_proj'.format(i)) as s: z_zu = fc(tf.multiply(prevZ, zu_u), sz, reuse=reuse, scope=s, bias=False, regularizer=reg) variable_summaries(z_zu, suffix='z_zu{}'.format(i)) z_zs.append(z_zu) z_add.append(z_zu) with tf.variable_scope('z{}_yu_u'.format(i)) as s: yu_u = fc(prevU, self.dimA, reuse=reuse, scope=s, bias=True, regularizer=reg, bias_init=tf.constant_initializer(1.)) variable_summaries(yu_u, suffix='yu_u{}'.format(i)) with tf.variable_scope('z{}_yu'.format(i)) as s: z_yu = fc(tf.multiply(y, yu_u), sz, reuse=reuse, scope=s, bias=False, regularizer=reg) z_ys.append(z_yu) variable_summaries(z_yu, suffix='z_yu{}'.format(i)) z_add.append(z_yu) with tf.variable_scope('z{}_u'.format(i)) as s: z_u = fc(prevU, sz, reuse=reuse, scope=s, bias=True, regularizer=reg, bias_init=tf.constant_initializer(0.)) variable_summaries(z_u, suffix='z_u{}'.format(i)) z_us.append(z_u) z_add.append(z_u) z = tf.add_n(z_add) variable_summaries(z, suffix='z{}_preact'.format(i)) if i < nLayers: # z = tf.nn.relu(z) z = lrelu(z, alpha=FLAGS.lrelu) variable_summaries(z, suffix='z{}_act'.format(i)) zs.append(z) prevU = us[i] if i < nLayers else None prevZ = z z = tf.reshape(z, [-1], name='energies') return z def __del__(self): self.sess.close()
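update_target above performs a soft (Polyak) update of the target parameters, theta_target <- theta_target - tau * (theta_target - theta), which is the same as (1 - tau) * theta_target + tau * theta. A minimal NumPy sketch of that rule:

import numpy as np

def soft_update(target_params, online_params, tau=0.001):
    # In-place update, mirroring the assign_sub op built in the graph above.
    for t, o in zip(target_params, online_params):
        t -= tau * (t - o)

target = [np.ones(3)]
online = [np.zeros(3)]
soft_update(target, online, tau=0.5)
print(target[0])  # -> [0.5 0.5 0.5]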
class DQN(object): """ A starter class to implement the Deep Q Network algorithm TODOs specify the main areas where logic needs to be added. If you get an error a Box2D error using the pip version try installing from source: > git clone https://github.com/pybox2d/pybox2d > pip install -e . """ def __init__(self, env): self.env = env self.sess = tf.Session() # A few starter hyperparameters self.batch_size = 512 self.gamma = 0.9 # If using e-greedy exploration self.eps_start = 0.9 self.eps_end = 0.05 self.eps_decay = 5000 # in learning steps # If using a target network self.clone_steps = 200 self.eps_value_list = np.linspace(self.eps_start, self.eps_end, self.eps_decay) self.eps_value = self.eps_start # memory self.replay_memory = ReplayMemory(100000) # Perhaps you want to have some samples in the memory before starting to train? self.min_replay_size = 10000 self.cost_his = [] self.eps_his = [] self.reward_his = [] self.state_space_size = self.env.observation_space.shape[0] self.action_space_size = self.env.action_space.n self.lr = 0.001 self.learn_step = 0 # define yours training operations here... self.observation_input = tf.placeholder( tf.float32, shape=[None, self.state_space_size]) self.observation_input_ = tf.placeholder(tf.float32, [None, self.state_space_size]) self.build_model(self.observation_input) t_params = tf.get_collection('target_net_params') e_params = tf.get_collection('eval_net_params') self.replace_target_op = [ tf.assign(t, e) for t, e in zip(t_params, e_params) ] # define your update operations here... self.num_episodes = 5000 self.num_steps = 0 self.saver = tf.train.Saver(tf.trainable_variables()) self.sess.run(tf.global_variables_initializer()) def build_model(self, observation_input, scope='train'): """ TODO: Define the tensorflow model Hint: You will need to define and input placeholder and output Q-values Currently returns an op that gives all zeros. 
""" self.q_target = tf.placeholder(tf.float32, [None, self.action_space_size]) with tf.variable_scope('eval_net'): collection_names = [ 'eval_net_params', tf.GraphKeys.GLOBAL_VARIABLES ] l1_size = 10 w_initializer = tf.random_normal_initializer(0., 0.3) b_initializer = tf.constant_initializer(0.1) with tf.variable_scope('l1'): w1 = tf.get_variable('w1', [self.state_space_size, l1_size], initializer=w_initializer, collections=collection_names) b1 = tf.get_variable('b1', [1, l1_size], initializer=b_initializer, collections=collection_names) l1 = tf.nn.relu(tf.matmul(self.observation_input, w1) + b1) with tf.variable_scope('l2'): w2 = tf.get_variable('w2', [l1_size, self.action_space_size], initializer=w_initializer, collections=collection_names) b2 = tf.get_variable('b2', [1, self.action_space_size], initializer=b_initializer, collections=collection_names) self.q_eval = tf.matmul(l1, w2) + b2 with tf.variable_scope('loss'): self.loss = tf.losses.huber_loss(self.q_target, self.q_eval) with tf.variable_scope('train'): self._train_op = tf.train.RMSPropOptimizer(self.lr).minimize( self.loss) with tf.variable_scope('target_net'): collection_names = [ 'target_net_params', tf.GraphKeys.GLOBAL_VARIABLES ] with tf.variable_scope('l1'): w1 = tf.get_variable('w1', [self.state_space_size, l1_size], initializer=w_initializer, collections=collection_names) b1 = tf.get_variable('b1', [1, l1_size], initializer=b_initializer, collections=collection_names) l1 = tf.nn.relu(tf.matmul(self.observation_input_, w1) + b1) with tf.variable_scope('l2'): w2 = tf.get_variable('w2', [l1_size, self.action_space_size], initializer=w_initializer, collections=collection_names) b2 = tf.get_variable('b2', [1, self.action_space_size], initializer=b_initializer, collections=collection_names) self.q_next = tf.matmul(l1, w2) + b2 def _reshape_state(self, state): return state.reshape(1, len(state)) def select_action(self, obs, evaluation_mode=False): """ TODO: Select an action given an observation using your model. This should include any exploration strategy you wish to implement If evaluation_mode=True, then this function should behave as if training is finished. This may be reducing exploration, etc. Currently returns a random action. 
""" observation = obs[np.newaxis, :] if evaluation_mode: actions_value = self.sess.run( self.q_eval, feed_dict={self.observation_input: observation}) action = np.argmax(actions_value) return action if np.random.uniform() > self.eps_value: #print('Exploiting') actions_value = self.sess.run( self.q_eval, feed_dict={self.observation_input: observation}) action = np.argmax(actions_value) else: #print('Exploring') action = self.env.action_space.sample() return action def update(self): """ TODO: Implement the functionality to update the network according to the Q-learning rule """ if self.learn_step % self.clone_steps == 0: self.sess.run(self.replace_target_op) batch_memory = self.replay_memory.sample(self.batch_size) observation_input_ = np.concatenate( [[transition.next_state for transition in batch_memory]], axis=1) observation_input = np.concatenate( [[transition.state for transition in batch_memory]], axis=1) q_next, q_eval = self.sess.run( [self.q_next, self.q_eval], feed_dict={ self.observation_input_: observation_input_, self.observation_input: observation_input, }) q_target = q_eval.copy() batch_index = np.arange(self.batch_size, dtype=np.int32) batch_action = np.asarray( [transition.action for transition in batch_memory]) batch_reward = np.asarray( [transition.reward for transition in batch_memory]) eval_act_index = batch_action reward = batch_reward q_target[batch_index, eval_act_index] = reward + self.gamma * np.max(q_next, axis=1) _, self.cost = self.sess.run([self._train_op, self.loss], feed_dict={ self.observation_input: observation_input, self.q_target: q_target }) self.cost_his.append(self.cost) if self.learn_step < self.eps_decay: self.eps_value = self.eps_value_list[self.learn_step % self.eps_decay] self.eps_his.append(self.eps_value) self.learn_step += 1 def plot_loss(self): import matplotlib.pyplot as plt f, axarr = plt.subplots(2, sharex=True) axarr[0].plot(np.arange(len(self.cost_his)), self.cost_his) axarr[0].set_title('Learning Curve') axarr[1].plot(np.arange(len(self.cost_his)), self.eps_his) axarr[0].ylabel('Cost') axarr[1].ylabel('Epsilon') plt.xlabel('training steps') plt.show() def train(self): """ The training loop. This runs a single episode. TODO: Implement the following as desired: 1. Storing transitions to the ReplayMemory 2. Updating the network at some frequency 3. Backing up the current parameters to a reference, target network """ done = False obs = env.reset() while not done: # self.eps_value = self.eps_value_list[eps_counter] action = self.select_action(obs, evaluation_mode=False) next_obs, reward, done, info = env.step(action) self.replay_memory.push(obs, action, next_obs, reward, done) if (self.num_steps > self.min_replay_size) and (self.num_steps % 50 == 0): self.update() obs = next_obs self.num_steps += 1 def eval(self, save_snapshot=True): """ Run an evaluation episode, this will call """ total_reward = 0.0 ep_steps = 0 done = False obs = env.reset() print(self.eps_value) while not done: env.render() action = self.select_action(obs, evaluation_mode=True) obs, reward, done, info = env.step(action) total_reward += reward print("Evaluation episode: ", total_reward) if save_snapshot: print("Saving state with Saver") self.saver.save(self.sess, 'models/dqn-model', global_step=self.num_episodes)
adv_ph = tf.placeholder(dtype=tf.float32, shape=(None, )) ret_ph = tf.placeholder(dtype=tf.float32, shape=(None, )) logp_old_ph = tf.placeholder(dtype=tf.float32, shape=(None, )) # Main outputs from computation graph pi, logp, logp_pi, v = mlp_actor_critic(x_ph, a_ph) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph] # Every step, get: action, value, and logprob get_action_ops = [pi, v, logp_pi] # Experience buffer local_steps_per_epoch = int(args.steps / num_procs()) memory = ReplayMemory(obs_dim, act_dim, local_steps_per_epoch, args.gamma, args.lam) # Count variables var_counts = tuple(count_vars(scope) for scope in ['pi', 'v']) # Objective functions ratio = tf.exp(logp - logp_old_ph) # pi(a|s) / pi_old(a|s) min_adv = tf.where(adv_ph > 0, (1 + args.clip_ratio) * adv_ph, (1 - args.clip_ratio) * adv_ph) actor_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv)) critic_loss = tf.reduce_mean((ret_ph - v)**2) # Info (useful to watch during learning) approx_kl = tf.reduce_mean( logp_old_ph - logp) # a sample estimate for KL-divergence, easy to compute approx_ent = tf.reduce_mean(
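The objective above is the PPO clipped surrogate, written with the min_adv trick rather than an explicit clip on the ratio. A NumPy sketch of the same loss:

import numpy as np

def ppo_clip_loss(logp, logp_old, adv, clip_ratio=0.2):
    ratio = np.exp(logp - logp_old)                    # pi(a|s) / pi_old(a|s)
    # Equivalent to min(ratio * adv, clip(ratio, 1-eps, 1+eps) * adv)
    min_adv = np.where(adv > 0, (1 + clip_ratio) * adv, (1 - clip_ratio) * adv)
    return -np.mean(np.minimum(ratio * adv, min_adv))

print(ppo_clip_loss(np.array([-0.1, -2.0]), np.array([-0.2, -1.0]), np.array([1.0, -1.0])))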
def train(active_mv): senv = ShapeNetEnv(FLAGS) replay_mem = ReplayMemory(FLAGS) #### for debug #a = np.array([[1,0,1],[0,0,0]]) #b = np.array([[1,0,1],[0,1,0]]) #print('IoU: {}'.format(replay_mem.calu_IoU(a, b))) #sys.exit() #### for debug log_string('====== Starting burning in memories ======') burn_in(senv, replay_mem) log_string('====== Done. {} trajectories burnt in ======'.format( FLAGS.burn_in_length)) #epsilon = FLAGS.init_eps K_single = np.asarray([[420.0, 0.0, 112.0], [0.0, 420.0, 112.0], [0.0, 0.0, 1]]) K_list = np.tile(K_single[None, None, ...], (1, FLAGS.max_episode_length, 1, 1)) ### burn in(pretrain) for MVnet if FLAGS.burn_in_iter > 0: for i in xrange(FLAGS.burn_in_iter): if not FLAGS.random_pretrain: mvnet_input = replay_mem.get_batch_list(FLAGS.batch_size) else: mvnet_input = replay_mem.get_batch_list_random( senv, FLAGS.batch_size) tic = time.time() out_stuff = active_mv.run_step(mvnet_input, mode='burnin', is_training=True) summs_burnin = burnin_log(i, out_stuff, time.time() - tic) for summ in summs_burnin: active_mv.train_writer.add_summary(summ, i) rollout_obj = Rollout(active_mv, senv, replay_mem, FLAGS) for i_idx in xrange(FLAGS.max_iter): t0 = time.time() rollout_obj.go(i_idx, verbose=True, add_to_mem=True, is_train=True) t1 = time.time() replay_mem.enable_gbl() mvnet_input = replay_mem.get_batch_list(FLAGS.batch_size) t2 = time.time() out_stuff = active_mv.run_step(mvnet_input, mode='train', is_training=True) replay_mem.disable_gbl() t3 = time.time() train_log(i_idx, out_stuff, (t0, t1, t2, t3)) active_mv.train_writer.add_summary(out_stuff.merged_train, i_idx) if i_idx % FLAGS.save_every_step == 0 and i_idx > 0: save(active_mv, i_idx, i_idx, i_idx) if i_idx % FLAGS.test_every_step == 0 and i_idx > 0: print('Evaluating active policy') evaluate(active_mv, FLAGS.test_episode_num, replay_mem, i_idx, rollout_obj, mode='active') print('Evaluating random policy') evaluate(active_mv, FLAGS.test_episode_num, replay_mem, i_idx, rollout_obj, mode='random')
class Driver(object): ''' A driver object for the SCRC ''' def __init__(self, args): '''Constructor''' self.WARM_UP = 0 self.QUALIFYING = 1 self.RACE = 2 self.UNKNOWN = 3 self.stage = args.stage self.parser = msgParser.MsgParser() self.state = carState.CarState() self.control = carControl.CarControl() self.steers = [-1.0, -0.8, -0.6, -0.5, -0.4, -0.3, -0.2, -0.15, -0.1, -0.05, 0.0, 0.05, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.6, 0.8, 1.0] self.speeds = [-1.0, -0.5, 0.0, 0.5, 1.0] self.num_inputs = 19 self.num_steers = len(self.steers) self.num_speeds = len(self.speeds) self.num_actions = self.num_steers + self.num_speeds self.net = DeepQNetwork(self.num_inputs, self.num_steers, self.num_speeds, args) self.mem = ReplayMemory(args.replay_size, self.num_inputs, args) self.minibatch_size = args.batch_size if args.load_weights: self.net.load_weights(args.load_weights) self.save_weights_prefix = args.save_weights_prefix self.pretrained_network = args.pretrained_network self.steer_lock = 0.785398 self.max_speed = 100 self.algorithm = args.algorithm self.device = args.device self.mode = args.mode self.maxwheelsteps = args.maxwheelsteps self.enable_training = args.enable_training self.enable_exploration = args.enable_exploration self.total_train_steps = 0 self.exploration_decay_steps = args.exploration_decay_steps self.exploration_rate_start = args.exploration_rate_start self.exploration_rate_end = args.exploration_rate_end self.show_sensors = args.show_sensors self.show_qvalues = args.show_qvalues self.episode = 0 self.onRestart() if self.show_sensors: from sensorstats import Stats self.stats = Stats(inevery=8) if self.show_qvalues: from plotq import PlotQ self.plotq = PlotQ(self.num_steers, self.num_speeds) if self.device == 'wheel': from wheel import Wheel self.wheel = Wheel(args.joystick_nr, args.autocenter, args.gain, args.min_force, args.max_force) def init(self): '''Return init string with rangefinder angles''' self.angles = [0 for x in range(19)] for i in range(5): self.angles[i] = -90 + i * 15 self.angles[18 - i] = 90 - i * 15 for i in range(5, 9): self.angles[i] = -20 + (i-5) * 5 self.angles[18 - i] = 20 - (i-5) * 5 return self.parser.stringify({'init': self.angles}) def getState(self): #state = np.array([self.state.getSpeedX() / 200.0, self.state.getAngle(), self.state.getTrackPos()]) #state = np.array(self.state.getTrack() + [self.state.getSpeedX()]) / 200.0 state = np.array(self.state.getTrack()) / 200.0 assert state.shape == (self.num_inputs,) return state def getReward(self, terminal): if terminal: reward = -1000 else: dist = self.state.getDistFromStart() if self.prev_dist is not None: reward = max(0, dist - self.prev_dist) * 10 assert reward >= 0, "reward: %f" % reward else: reward = 0 self.prev_dist = dist #reward -= self.state.getTrackPos() #print "reward:", reward return reward def getTerminal(self): return np.all(np.array(self.state.getTrack()) == -1) def getEpsilon(self): # calculate decaying exploration rate if self.total_train_steps < self.exploration_decay_steps: return self.exploration_rate_start - self.total_train_steps * (self.exploration_rate_start - self.exploration_rate_end) / self.exploration_decay_steps else: return self.exploration_rate_end def drive(self, msg): # parse incoming message self.state.setFromMsg(msg) # show sensors if self.show_sensors: self.stats.update(self.state) # fetch state, calculate reward and terminal indicator state = self.getState() terminal = self.getTerminal() reward = self.getReward(terminal) #print "reward:", reward # store new experience in 
replay memory if self.enable_training and self.prev_state is not None and self.prev_steer is not None and self.prev_speed is not None: self.mem.add(self.prev_state, self.prev_steer, self.prev_speed, reward, state, terminal) # if terminal state (out of track), then restart game if terminal: print "terminal state, restarting" self.control.setMeta(1) return self.control.toMsg() else: self.control.setMeta(0) # choose actions for wheel and speed if self.enable_exploration and random.random() < self.getEpsilon(): #print "random move" steer = random.randrange(self.num_steers) #speed = random.randrange(self.num_speeds) speed = random.randint(2, self.num_speeds-1) elif self.algorithm == 'network': # use broadcasting to efficiently produce minibatch of desired size minibatch = state + np.zeros((self.minibatch_size, 1)) Q = self.net.predict(minibatch) assert Q.shape == (self.minibatch_size, self.num_actions), "Q.shape: %s" % str(Q.shape) #print "steer Q: ", Q[0,:21] #print "speed Q:", Q[0,-5:] steer = np.argmax(Q[0, :self.num_steers]) speed = np.argmax(Q[0, -self.num_speeds:]) if self.show_qvalues: self.plotq.update(Q[0]) elif self.algorithm == 'hardcoded': steer = self.getSteerAction(self.steer()) speed = self.getSpeedActionAccel(self.speed()) else: assert False, "Unknown algorithm" #print "steer:", steer, "speed:", speed # gears are always automatic gear = self.gear() # check for manual override # might be partial, so we always need to choose algorithmic actions first events = self.wheel.getEvents() if self.mode == 'override' and self.wheel.supportsDrive(): # wheel for event in events: if self.wheel.isWheelMotion(event): self.wheelsteps = self.maxwheelsteps if self.wheelsteps > 0: wheel = self.wheel.getWheel() steer = self.getSteerAction(wheel) self.wheelsteps -= 1 # gas pedal accel = self.wheel.getAccel() if accel > 0: speed = self.getSpeedActionAccel(accel) # brake pedal brake = self.wheel.getBrake() if brake > 0: speed = self.getSpeedActionBrake(brake) # check for wheel buttons always, not only in override mode for event in events: if self.wheel.isButtonDown(event, 2): self.algorithm = 'network' self.mode = 'override' self.wheel.generateForce(0) print "Switched to network algorithm" elif self.wheel.isButtonDown(event, 3): self.net.load_weights(self.pretrained_network) self.algorithm = 'network' self.mode = 'ff' self.enable_training = False print "Switched to pretrained network" elif self.wheel.isButtonDown(event, 4): self.enable_training = not self.enable_training print "Switched training", "ON" if self.enable_training else "OFF" elif self.wheel.isButtonDown(event, 5): self.algorithm = 'hardcoded' self.mode = 'ff' print "Switched to hardcoded algorithm" elif self.wheel.isButtonDown(event, 6): self.enable_exploration = not self.enable_exploration self.mode = 'override' self.wheel.generateForce(0) print "Switched exploration", "ON" if self.enable_exploration else "OFF" elif self.wheel.isButtonDown(event, 7): self.mode = 'ff' if self.mode == 'override' else 'override' if self.mode == 'override': self.wheel.generateForce(0) print "Switched force feedback", "ON" if self.mode == 'ff' else "OFF" elif self.wheel.isButtonDown(event, 0) or self.wheel.isButtonDown(event, 8): gear = max(-1, gear - 1) elif self.wheel.isButtonDown(event, 1) or self.wheel.isButtonDown(event, 9): gear = min(6, gear + 1) # set actions self.setSteerAction(steer) self.setGearAction(gear) self.setSpeedAction(speed) # turn wheel using force feedback if self.mode == 'ff' and self.wheel.supportsForceFeedback(): wheel = 
self.wheel.getWheel() self.wheel.generateForce(self.control.getSteer()-wheel) # remember state and actions self.prev_state = state self.prev_steer = steer self.prev_speed = speed # training if self.enable_training and self.mem.count >= self.minibatch_size: minibatch = self.mem.getMinibatch() self.net.train(minibatch) self.total_train_steps += 1 #print "total_train_steps:", self.total_train_steps #print "total_train_steps:", self.total_train_steps, "mem_count:", self.mem.count return self.control.toMsg() def setSteerAction(self, steer): self.control.setSteer(self.steers[steer]) def setGearAction(self, gear): assert -1 <= gear <= 6 self.control.setGear(gear) def setSpeedAction(self, speed): accel = self.speeds[speed] if accel >= 0: #print "accel", accel self.control.setAccel(accel) self.control.setBrake(0) else: #print "brake", -accel self.control.setAccel(0) self.control.setBrake(-accel) def getSteerAction(self, wheel): steer = np.argmin(np.abs(np.array(self.steers) - wheel)) return steer def getSpeedActionAccel(self, accel): speed = np.argmin(np.abs(np.array(self.speeds) - accel)) return speed def getSpeedActionBrake(self, brake): speed = np.argmin(np.abs(np.array(self.speeds) + brake)) return speed def steer(self): angle = self.state.angle dist = self.state.trackPos steer = (angle - dist*0.5)/self.steer_lock return steer def gear(self): rpm = self.state.getRpm() gear = self.state.getGear() if self.prev_rpm == None: up = True else: if (self.prev_rpm - rpm) < 0: up = True else: up = False if up and rpm > 7000: gear += 1 if not up and rpm < 3000: gear -= 1 return gear def speed(self): speed = self.state.getSpeedX() accel = self.prev_accel if speed < self.max_speed: accel += 0.1 if accel > 1: accel = 1.0 else: accel -= 0.1 if accel < 0: accel = 0.0 self.prev_accel = accel return accel def onShutDown(self): pass def onRestart(self): if self.mode == 'ff': self.wheel.generateForce(0) self.prev_rpm = None self.prev_accel = 0 self.prev_dist = None self.prev_state = None self.prev_steer = None self.prev_speed = None self.wheelsteps = 0 if self.save_weights_prefix and self.episode > 0: self.net.save_weights(self.save_weights_prefix + "_" + str(self.episode) + ".pkl") self.episode += 1 print "Episode", self.episode
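getSteerAction() and getSpeedActionAccel() map a continuous wheel or pedal reading onto the nearest discrete action by absolute distance. A small standalone sketch (the bin list here is shortened for illustration; the driver uses 21 steering bins):

import numpy as np

steers = [-1.0, -0.5, 0.0, 0.5, 1.0]  # illustrative subset of the driver's steering bins

def nearest_action(value, bins):
    # Index of the discrete action closest to the continuous reading.
    return int(np.argmin(np.abs(np.array(bins) - value)))

print(nearest_action(0.3, steers))  # -> 3, the index of 0.5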
epsilon_by_frame = lambda frame_idx: epsilon_final + ( epsilon_start - epsilon_final) * math.exp(-1. * frame_idx / epsilon_decay) # Worker Process Queues output_queue = mp.Queue(maxsize=args.pop) params_queue = mp.Queue(maxsize=args.pop) elite_queue = mp.Queue(maxsize=int(2 * args.pop)) # Agent agent = SAC(STATE_DIM, ACTION_DIM, args) sac_episodes = args.sac_episodes # Memory memory = ReplayMemory(args.replay_size) processes = [] elite_list = [] # Training Loop total_numsteps = 0 updates = 0 time_list = [] max_rewards = [] min_rewards = [] avg_rewards = [] noise_mut = [] total_time = 0 critic_loss = 0 # Create and start the processes
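epsilon_by_frame above decays exploration exponentially from epsilon_start toward epsilon_final with time constant epsilon_decay. A standalone version with illustrative constants (the script takes these from its arguments):

import math

def epsilon_by_frame(frame_idx, epsilon_start=1.0, epsilon_final=0.05, epsilon_decay=30000):
    # Exponential decay toward the floor value epsilon_final.
    return epsilon_final + (epsilon_start - epsilon_final) * math.exp(-frame_idx / epsilon_decay)

for f in (0, 30000, 300000):
    print(f, round(epsilon_by_frame(f), 3))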
env = gym.make('MinitaurBulletEnv-v0') torch.manual_seed(seed) np.random.seed(seed) env.seed(seed) max_steps = env._max_episode_steps print('max_steps: ', max_steps) batch_size=128 ## 512 LEARNING_RATE=0.0001 start_steps=10000 ## Steps sampling random actions replay_size=1000000 ## size of replay buffer agent = soft_actor_critic_agent(env.observation_space.shape[0], env.action_space, \ hidden_size=256, seed=seed, lr=LEARNING_RATE, gamma=0.99, tau=0.005, alpha=0.2) memory = ReplayMemory(replay_size) print('device: ', device) print('learning rate: ', LEARNING_RATE) state_dim = env.observation_space.shape[0] action_dim = env.action_space.shape[0] print('state_dim: ',state_dim, ', action_dim: ', action_dim) threshold = env.spec.reward_threshold print('threshold: ', threshold) scores, avg_scores, avg_numm_steps = sac_train(max_steps=max_steps) reward_round = round(np.max(scores), 2)
class GaussianDQN(Agent): def __init__(self, approximator, policy, mdp_info, batch_size, target_update_frequency, initial_replay_size, max_replay_size, fit_params=None, approximator_params=None, clip_reward=True, update_type='weighted', delta=0.1, store_prob=False, q_max=100, max_spread=None): self._fit_params = dict() if fit_params is None else fit_params self._batch_size = batch_size self._clip_reward = clip_reward self._target_update_frequency = target_update_frequency self.update_type = update_type self.delta = delta self.standard_bound = norm.ppf(1 - self.delta, loc=0, scale=1) self.store_prob = store_prob self.q_max = q_max self.max_spread = max_spread self._replay_memory = ReplayMemory(initial_replay_size, max_replay_size) self._n_updates = 0 self._epsilon = 1e-7 apprx_params_train = deepcopy(approximator_params) apprx_params_train['name'] = 'train' apprx_params_target = deepcopy(approximator_params) apprx_params_target['name'] = 'target' self.approximator = Regressor(approximator, **apprx_params_train) self.target_approximator = Regressor(approximator, **apprx_params_target) policy.set_q(self.approximator) self.target_approximator.model.set_weights( self.approximator.model.get_weights()) super(GaussianDQN, self).__init__(policy, mdp_info) @staticmethod def _compute_prob_max(mean_list, sigma_list): n_actions = len(mean_list) lower_limit = mean_list - 8 * sigma_list upper_limit = mean_list + 8 * sigma_list epsilon = 1e2 n_trapz = 100 x = np.zeros(shape=(n_trapz, n_actions)) y = np.zeros(shape=(n_trapz, n_actions)) integrals = np.zeros(n_actions) for j in range(n_actions): if sigma_list[j] < epsilon: p = 1 for k in range(n_actions): if k != j: p *= norm.cdf(mean_list[j], loc=mean_list[k], scale=sigma_list[k]) integrals[j] = p else: x[:, j] = np.linspace(lower_limit[j], upper_limit[j], n_trapz) y[:, j] = norm.pdf(x[:, j], loc=mean_list[j], scale=sigma_list[j]) for k in range(n_actions): if k != j: y[:, j] *= norm.cdf(x[:, j], loc=mean_list[k], scale=sigma_list[k]) integrals[j] = (upper_limit[j] - lower_limit[j]) / ( 2 * (n_trapz - 1)) * (y[0, j] + y[-1, j] + 2 * np.sum(y[1:-1, j])) # print(np.sum(integrals)) # assert np.isclose(np.sum(integrals), 1) with np.errstate(divide='raise'): try: return integrals / np.sum(integrals) except FloatingPointError: print(integrals) print(mean_list) print(sigma_list) input() def fit(self, dataset): mask = np.ones((len(dataset), 2)) self._replay_memory.add(dataset, mask) if self._replay_memory.initialized: state, action, reward, next_state, absorbing, _, mask = \ self._replay_memory.get(self._batch_size) if self._clip_reward: reward = np.clip(reward, -1, 1) q_next, sigma_next, prob_explore = self._next_q( next_state, absorbing) q = reward + self.mdp_info.gamma * q_next sigma = self.mdp_info.gamma * sigma_next stacked = np.stack([q, sigma]) self.approximator.fit(state, action, stacked, prob_exploration=prob_explore, **self._fit_params) self._n_updates += 1 if self._n_updates % self._target_update_frequency == 0: self._update_target() def _update_target(self): """ Update the target network. """ self.target_approximator.model.set_weights( self.approximator.model.get_weights()) def _next_q(self, next_state, absorbing): """ Args: next_state (np.ndarray): the states where next action has to be evaluated; absorbing (np.ndarray): the absorbing flag for the states in `next_state`. Returns: Maximum action-value for each state in `next_state`. 
""" q_and_sigma = self.target_approximator.predict(next_state).squeeze() q = q_and_sigma[0, :, :] sigma = q_and_sigma[1, :, :] for i in range(q.shape[0]): if absorbing[i]: q[i] *= 0 sigma[i] *= self._epsilon max_q = np.zeros((q.shape[0])) max_sigma = np.zeros((q.shape[0])) probs = [] prob_explore = np.zeros(q.shape[0]) for i in range(q.shape[0]): # for each batch means = q[i, :] sigmas = sigma[i, :] prob = GaussianDQN._compute_prob_max(means, sigmas) probs.append(prob) prob_explore[i] = 1. - np.max(prob) if self.update_type == 'mean': best_actions = np.argmax(q, axis=1) for i in range(q.shape[0]): max_q[i] = q[i, best_actions[i]] max_sigma[i] = sigma[i, best_actions[i]] elif self.update_type == 'weighted': for i in range(q.shape[0]): # for each batch means = q[i, :] sigmas = sigma[i, :] prob = probs[i] max_q[i] = np.sum(means * prob) max_sigma[i] = np.sum(sigmas * prob) elif self.update_type == 'optimistic': for i in range(q.shape[0]): # for each batch means = q[i, :] sigmas = sigma[i, :] bounds = sigmas * self.standard_bound + means bounds = np.clip(bounds, -self.q_max, self.q_max) next_index = np.random.choice( np.argwhere(bounds == np.max(bounds)).ravel()) max_q[i] = q[i, next_index] max_sigma[i] = sigma[i, next_index] else: raise ValueError("Update type not implemented") return max_q, max_sigma, np.mean(prob_explore) def draw_action(self, state): action = super(GaussianDQN, self).draw_action(np.array(state)) return action def episode_start(self): return
def main(): parser = argparse.ArgumentParser() parser.add_argument('--num-envs', type=int, default=1) parser.add_argument('--t-max', type=int, default=1) parser.add_argument('--learning-rate', type=float, default=0.0002) parser.add_argument('--seed', type=int, default=0) parser.add_argument('--steps-per-epoch', type=int, default=100000) parser.add_argument('--testing', type=int, default=0) parser.add_argument('--continue-training', type=int, default=0) parser.add_argument('--epoch-num', type=int, default=40) parser.add_argument('--start-epoch', type=int, default=20) parser.add_argument('--testing-epoch', type=int, default=3) parser.add_argument('--save-log', type=str, default='basic/log') parser.add_argument('--signal-num', type=int, default=4) parser.add_argument('--toxin', type=int, default=0) parser.add_argument('--a1-AC-folder', type=str, default='basic/a1_Qnet') parser.add_argument('--eps-start', type=float, default=1.0) parser.add_argument('--replay-start-size', type=int, default=50000) parser.add_argument('--decay-rate', type=int, default=500000) parser.add_argument('--replay-memory-size', type=int, default=1000000) parser.add_argument('--eps-min', type=float, default=0.05) rewards = { "positive": 1.0, "negative": -1.0, "tick": -0.002, "loss": -2.0, "win": 2.0 } args = parser.parse_args() config = Config(args) q_ctx = config.ctx steps_per_epoch = args.steps_per_epoch np.random.seed(args.seed) start_epoch = args.start_epoch testing_epoch = args.testing_epoch save_log = args.save_log epoch_num = args.epoch_num epoch_range = range(epoch_num) toxin = args.toxin a1_Qnet_folder = args.a1_AC_folder freeze_interval = 10000 update_interval = 5 replay_memory_size = args.replay_memory_size discount = 0.99 replay_start_size = args.replay_start_size history_length = 1 eps_start = args.eps_start eps_min = args.eps_min eps_decay = (eps_start - eps_min) / args.decay_rate eps_curr = eps_start freeze_interval /= update_interval minibatch_size = 32 testing = args.testing testing = True if testing == 1 else False continue_training = args.continue_training continue_training = True if continue_training == 1 else False game = HunterWorld(width=256, height=256, num_preys=10, draw=False, num_hunters=2, num_toxins=toxin) env = PLE(game, fps=30, force_fps=True, display_screen=False, reward_values=rewards, resized_rows=80, resized_cols=80, num_steps=2) replay_memory = ReplayMemory(state_dim=(148, ), action_dim=(2, ), history_length=history_length, memory_size=replay_memory_size, replay_start_size=replay_start_size, state_dtype='float32') action_set = env.get_action_set() action_map1 = [] for action in action_set[0].values(): action_map1.append(action) action_map2 = [] for action in action_set[1].values(): action_map2.append(action) action_num = len(action_map1) target1 = Qnetwork(actions_num=8, q_ctx=q_ctx, isTrain=False, batch_size=1, dir=dir, folder=a1_Qnet_folder) target32 = Qnetwork(actions_num=8, q_ctx=q_ctx, isTrain=False, batch_size=32, dir=dir, folder=a1_Qnet_folder) Qnet = Qnetwork(actions_num=8, q_ctx=q_ctx, isTrain=True, batch_size=32, dir=dir, folder=a1_Qnet_folder) if testing: env.force_fps = False env.game.draw = True env.display_screen = True Qnet.load_params(testing_epoch) elif continue_training: epoch_range = range(start_epoch, epoch_num + start_epoch) Qnet.load_params(start_epoch - 1) logging_config(logging, dir, save_log, file_name) else: logging_config(logging, dir, save_log, file_name) copyTargetQNetwork(Qnet.model, target1.model) copyTargetQNetwork(Qnet.model, target32.model) 
logging.info('args=%s' % args) logging.info('config=%s' % config.__dict__) print_params(logging, Qnet.model) training_steps = 0 total_steps = 0 for epoch in epoch_range: steps_left = steps_per_epoch episode = 0 epoch_reward = 0 start = time.time() env.reset_game() while steps_left > 0: episode += 1 episode_loss = 0.0 episode_q_value = 0.0 episode_update_step = 0 episode_action_step = 0 episode_reward = 0 episode_step = 0 collisions = 0.0 time_episode_start = time.time() env.reset_game() while not env.game_over(): if replay_memory.size >= history_length and replay_memory.size > replay_start_size: do_exploration = (np.random.rand() < eps_curr) eps_curr = max(eps_curr - eps_decay, eps_min) if do_exploration: action1 = np.random.randint(action_num) action2 = np.random.randint(action_num) else: current_state = replay_memory.latest_slice() state = nd.array( current_state.reshape((1, ) + current_state.shape), ctx=q_ctx) target1.model.forward(mx.io.DataBatch([state], [])) q_value = target1.model.get_outputs()[0].asnumpy()[0] action1 = numpy.argmax(q_value[0:4]) action2 = numpy.argmax(q_value[4:8]) episode_q_value += q_value[action1] episode_q_value += q_value[action2 + 4] episode_action_step += 1 else: action1 = np.random.randint(action_num) action2 = np.random.randint(action_num) next_ob, reward, terminal_flag = env.act( [action_map1[action1], action_map2[action2]]) reward = np.sum(reward) replay_memory.append( np.array(next_ob).flatten(), [action1, action2], reward, terminal_flag) total_steps += 1 episode_reward += reward if reward < 0: collisions += 1 episode_step += 1 if total_steps % update_interval == 0 and replay_memory.size > replay_start_size: training_steps += 1 state_batch, actions, rewards, nextstate_batch, terminate_flags = replay_memory.sample( batch_size=minibatch_size) state_batch = nd.array(state_batch, ctx=q_ctx) actions_batch1 = nd.array(actions[:, 0], ctx=q_ctx) actions_batch2 = nd.array(actions[:, 1], ctx=q_ctx) target32.model.forward( mx.io.DataBatch([nd.array(nextstate_batch, ctx=q_ctx)], [])) Qvalue = target32.model.get_outputs()[0].asnumpy() y_batch1 = rewards + np.max(Qvalue[:, 0:4], axis=1) * ( 1.0 - terminate_flags) * discount y_batch2 = rewards + np.max(Qvalue[:, 4:8], axis=1) * ( 1.0 - terminate_flags) * discount y_batch1 = nd.array(y_batch1, ctx=q_ctx) y_batch2 = nd.array(y_batch2, ctx=q_ctx) Qnet.model.forward(mx.io.DataBatch([ state_batch, actions_batch1, y_batch1, actions_batch2 + 4, y_batch2 ], []), is_train=True) Qnet.model.backward() Qnet.model.update() if training_steps % 10 == 0: out = Qnet.model.get_outputs() loss1 = 0.5 * nd.square( nd.choose_element_0index(out[0], actions_batch1) - y_batch1) loss2 = 0.5 * nd.square( nd.choose_element_0index(out[1], actions_batch2) - y_batch2) episode_loss += nd.sum(loss1).asnumpy() episode_loss += nd.sum(loss2).asnumpy() episode_update_step += 1 if training_steps % freeze_interval == 0: copyTargetQNetwork(Qnet.model, target1.model) copyTargetQNetwork(Qnet.model, target32.model) steps_left -= episode_step time_episode_end = time.time() epoch_reward += episode_reward info_str = "Epoch:%d, Episode:%d, Steps Left:%d/%d/%d, Reward:%f, fps:%f, Exploration:%f" \ % (epoch, episode, steps_left, episode_step, steps_per_epoch, episode_reward, episode_step / (time_episode_end - time_episode_start), eps_curr) info_str += ", Collision:%f/%d " % (collisions / episode_step, collisions) if episode_update_step > 0: info_str += ", Avg Loss:%f/%d" % ( episode_loss / episode_update_step, episode_update_step) if episode_action_step > 0: 
info_str += ", Avg Q Value:%f/%d " % ( episode_q_value / episode_action_step, episode_action_step) if episode % 1 == 0: print info_str logging.info(info_str) end = time.time() fps = steps_per_epoch / (end - start) Qnet.save_params(epoch) print "Epoch:%d, FPS:%f, Avg Reward: %f/%d" % ( epoch, fps, epoch_reward / float(episode), episode)
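# --- Added sketch (not from the original source) ---
# In the two-hunter script above a single Q-network emits 8 outputs that are
# split into two 4-action heads (indices 0-3 drive hunter 1, indices 4-7 drive
# hunter 2), and each head receives its own one-step target built from the
# shared reward. A small numpy restatement of that target computation:
import numpy as np

def two_head_targets(q_next, rewards, terminals, discount=0.99):
    # q_next: [batch, 8] target-network Q-values at the next state;
    # rewards/terminals: [batch] arrays (terminals as 0/1 flags).
    y1 = rewards + np.max(q_next[:, 0:4], axis=1) * (1.0 - terminals) * discount
    y2 = rewards + np.max(q_next[:, 4:8], axis=1) * (1.0 - terminals) * discount
    return y1, y2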
def train(args, net, env): # Begin tf session with tf.Session() as sess: # Initialize variables tf.global_variables_initializer().run() saver = tf.train.Saver(tf.global_variables(), max_to_keep=5) # load from previous save if len(args.ckpt_name) > 0: saver.restore(sess, os.path.join(args.save_dir, args.ckpt_name)) # Load data shift = sess.run(net.shift) scale = sess.run(net.scale) shift_u = sess.run(net.shift_u) scale_u = sess.run(net.scale_u) replay_memory = ReplayMemory(args, shift, scale, shift_u, scale_u, env, net, sess) # Store normalization parameters sess.run(tf.assign(net.shift, replay_memory.shift_x)) sess.run(tf.assign(net.scale, replay_memory.scale_x)) sess.run(tf.assign(net.shift_u, replay_memory.shift_u)) sess.run(tf.assign(net.scale_u, replay_memory.scale_u)) #Function to evaluate loss on validation set def val_loss(kl_weight): replay_memory.reset_batchptr_val() loss = 0.0 for b in range(replay_memory.n_batches_val): # Get inputs batch_dict = replay_memory.next_batch_val() x = batch_dict["states"] u = batch_dict['inputs'] # Construct inputs for network feed_in = {} feed_in[net.x] = np.reshape( x, (2 * args.batch_size * args.seq_length, args.state_dim)) feed_in[net.u] = u if args.kl_weight > 0.0: feed_in[net.kl_weight] = kl_weight else: feed_in[net.kl_weight] = 1.0 # Find loss feed_out = net.cost cost = sess.run(feed_out, feed_in) loss += cost return loss / replay_memory.n_batches_val # Initialize variable to track validation score over time old_score = 1e9 count_decay = 0 decay_epochs = [] # Define temperature for annealing kl_weight T = args.anneal_time * replay_memory.n_batches_train count = 0 # Loop over epochs for e in range(args.num_epochs): visualize_predictions(args, sess, net, replay_memory, env, e) # Initialize loss loss = 0.0 rec_loss = 0.0 kl_loss = 0.0 loss_count = 0 replay_memory.reset_batchptr_train() # Loop over batches for b in range(replay_memory.n_batches_train): start = time.time() count += 1 # Update kl_weight if e < args.start_kl: kl_weight = 1e-3 else: count += 1 kl_weight = min(args.kl_weight, 1e-3 + args.kl_weight * count / float(T)) # Get inputs batch_dict = replay_memory.next_batch_train() x = batch_dict["states"] u = batch_dict['inputs'] # Construct inputs for network feed_in = {} feed_in[net.x] = np.reshape( x, (2 * args.batch_size * args.seq_length, args.state_dim)) feed_in[net.u] = u feed_in[net.kl_weight] = kl_weight # Find loss and perform training operation feed_out = [ net.cost, net.loss_reconstruction, net.kl_loss, net.train ] out = sess.run(feed_out, feed_in) # Update and display cumulative losses loss += out[0] rec_loss += out[1] kl_loss += out[2] loss_count += 1 end = time.time() # Print loss if (e * replay_memory.n_batches_train + b) % 100 == 0 and b > 0: print("{}/{} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}" \ .format(e * replay_memory.n_batches_train + b, args.num_epochs * replay_memory.n_batches_train, e, loss/loss_count, end - start)) print("{}/{} (epoch {}), rec_loss = {:.3f}, time/batch = {:.3f}" \ .format(e * replay_memory.n_batches_train + b, args.num_epochs * replay_memory.n_batches_train, e, rec_loss/loss_count, end - start)) print("{}/{} (epoch {}), kl_loss = {:.3f}, time/batch = {:.3f}" \ .format(e * replay_memory.n_batches_train + b, args.num_epochs * replay_memory.n_batches_train, e, kl_loss/loss_count, end - start)) print('') loss = 0.0 rec_loss = 0.0 kl_loss = 0.0 loss_count = 0 # Evaluate loss on validation set score = val_loss(args.kl_weight * (e >= args.start_kl)) print('Validation Loss: 
{0:f}'.format(score)) # Set learning rate if (old_score - score) < 0.01 and e != args.start_kl: count_decay += 1 decay_epochs.append(e) if len(decay_epochs) >= 3 and np.sum( np.diff(decay_epochs)[-2:]) == 2: break print('setting learning rate to ', args.learning_rate * (args.decay_rate**count_decay)) sess.run( tf.assign( net.learning_rate, args.learning_rate * (args.decay_rate**count_decay))) if args.learning_rate * (args.decay_rate**count_decay) < 1e-5: break print('learning rate is set to ', args.learning_rate * (args.decay_rate**count_decay)) old_score = score # Save model every epoch checkpoint_path = os.path.join(args.save_dir, args.save_name + '.ckpt') saver.save(sess, checkpoint_path, global_step=e) print("model saved to {}".format(checkpoint_path))
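# --- Added sketch (not from the original source) ---
# The loop above anneals the KL weight: it is held at 1e-3 for the first
# args.start_kl epochs and then grows linearly with the number of batch
# updates until it reaches args.kl_weight, over
# T = args.anneal_time * n_batches_train updates. A self-contained
# restatement of that schedule, handy for inspecting it in isolation:
def kl_weight_schedule(count, epoch, kl_max, anneal_time, n_batches_train, start_kl):
    # count: number of batch updates seen so far once annealing has begun.
    if epoch < start_kl:
        return 1e-3
    T = anneal_time * n_batches_train
    return min(kl_max, 1e-3 + kl_max * count / float(T))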
def _initialize(self, game, network_args=None, actions=None, history_length=4, batchsize=64, update_pattern=(1, 1), replay_memory_size=10000, backprop_start_step=10000, start_epsilon=1.0, end_epsilon=0.1, epsilon_decay_start_step=50000, epsilon_decay_steps=100000, reward_scale=1.0, use_game_variables=True, misc_scale=None, reshaped_x=None, reshaped_y=None, skiprate=4, shaping_on=False, count_states=False, name=None, net_type="cnn", melt_steps=10000, remember_n_actions=0): if network_args is None: network_args = dict() if count_states is not None: self._count_states = bool(count_states) self.name = name self._reward_scale = reward_scale self._game = game self._batchsize = batchsize self._history_length = max(history_length, 1) self._update_pattern = update_pattern self._epsilon = max(min(start_epsilon, 1.0), 0.0) self._end_epsilon = min(max(end_epsilon, 0.0), self._epsilon) self._epsilon_decay_steps = epsilon_decay_steps self._epsilon_decay_stride = (self._epsilon - end_epsilon) / epsilon_decay_steps self._epsilon_decay_start = epsilon_decay_start_step self._skiprate = max(skiprate, 0) self._shaping_on = shaping_on self._steps = 0 self._melt_steps = melt_steps self._backprop_start_step = max(backprop_start_step, batchsize) self._use_game_variables = use_game_variables self._last_action_index = 0 if self._shaping_on: self._last_shaping_reward = 0 self.learning_mode = True if actions is None: self._actions = generate_default_actions(game) else: self._actions = actions self._actions_num = len(self._actions) self._actions_stats = np.zeros([self._actions_num], np.int) # changes img_shape according to the history size self._channels = game.get_screen_channels() if self._history_length > 1: self._channels *= self._history_length if reshaped_x is None: x = game.get_screen_width() y = game.get_screen_height() scale_x = scale_y = 1.0 else: x = reshaped_x scale_x = float(x) / game.get_screen_width() if reshaped_y is None: y = int(game.get_screen_height() * scale_x) scale_y = scale_x else: y = reshaped_y scale_y = float(y) / game.get_screen_height() img_shape = [self._channels, y, x] # TODO check if it is slow (it seems that no) if scale_x == 1 and scale_y == 1: def convert(img): img = img.astype(np.float32) / 255.0 return img else: def convert(img): img = img.astype(np.float32) / 255.0 new_image = np.ndarray([img.shape[0], y, x], dtype=img.dtype) for i in xrange(img.shape[0]): # new_image[i] = skimage.transform.resize(img[i], (y,x), preserve_range=True) new_image[i] = cv2.resize(img[i], (x, y), interpolation=cv2.INTER_AREA) return new_image self._convert_image = convert if self._use_game_variables: single_state_misc_len = game.get_available_game_variables_size() + int(self._count_states) else: single_state_misc_len = int(self._count_states) self._single_state_misc_len = single_state_misc_len self._remember_n_actions = remember_n_actions if remember_n_actions > 0: self._remember_n_actions = remember_n_actions self._action_len = len(self._actions[0]) self._last_n_actions = np.zeros([remember_n_actions * self._action_len], dtype=np.float32) self._total_misc_len = single_state_misc_len * self._history_length + len(self._last_n_actions) self._last_action_index = 0 else: self._total_misc_len = single_state_misc_len * self._history_length if self._total_misc_len > 0: self._misc_state_included = True self._current_misc_state = np.zeros(self._total_misc_len, dtype=np.float32) if single_state_misc_len > 0: self._state_misc_buffer = np.zeros(single_state_misc_len, dtype=np.float32) if misc_scale is not None: 
self._misc_scale = np.array(misc_scale, dtype=np.float32) else: self._misc_scale = None else: self._misc_state_included = False state_format = dict() state_format["s_img"] = img_shape state_format["s_misc"] = self._total_misc_len self._transitions = ReplayMemory(state_format, replay_memory_size, batchsize) network_args["state_format"] = state_format network_args["actions_number"] = len(self._actions) if net_type in ("dqn", None, ""): self._evaluator = DQN(**network_args) elif net_type == "duelling": self._evaluator = DuellingDQN(**network_args) else: print "Unsupported evaluator type." exit(1) # TODO throw. . .? self._current_image_state = np.zeros(img_shape, dtype=np.float32)
"input_shape": meta_controller_input_shape } controller_hparams = { "learning_rate": learning_rate, "epsilon": 1, "action_dim": env.action_space.n, "input_shape": controller_input_shape } controller = Controller(sess, controller_hparams) meta_controller = MetaController(sess, meta_controller_hparams) ''' Initialize the replay buffers ''' d1 = ReplayMemory(name="controller", buffer_capacity=256, storage_capacity=4096, obs_shape=controller_input_shape) d2 = ReplayMemory(name="metacontroller", buffer_capacity=256, storage_capacity=4096, obs_shape=meta_controller_input_shape) #Storing performance performanceDf = pd.DataFrame( columns=["episode", "intrinsic_reward", "goal_x", "goal_y", "training"]) if not os.path.exists("results"): os.makedirs("results") ''' Pre-training step. Iterate over subgoals randomly and train controller to achieve subgoals '''
class DQLearner(interfaces.LearningAgent): def __init__(self, dqn, num_actions, gamma=0.99, learning_rate=0.00025, replay_start_size=50000, epsilon_start=1.0, epsilon_end=0.01, epsilon_steps=1000000, update_freq=4, target_copy_freq=30000, replay_memory_size=1000000, frame_history=4, batch_size=32, error_clip=1, restore_network_file=None, double=True): self.dqn = dqn config = tf.ConfigProto() config.gpu_options.allow_growth = True self.sess = tf.Session(config=config) self.inp_actions = tf.placeholder(tf.float32, [None, num_actions]) inp_shape = [None] + list(self.dqn.get_input_shape()) + [frame_history] inp_dtype = self.dqn.get_input_dtype() assert type(inp_dtype) is str self.inp_frames = tf.placeholder(inp_dtype, inp_shape) self.inp_sp_frames = tf.placeholder(inp_dtype, inp_shape) self.inp_terminated = tf.placeholder(tf.bool, [None]) self.inp_reward = tf.placeholder(tf.float32, [None]) self.inp_mask = tf.placeholder(inp_dtype, [None, frame_history]) self.inp_sp_mask = tf.placeholder(inp_dtype, [None, frame_history]) self.gamma = gamma with tf.variable_scope('online'): mask_shape = [-1] + [1] * len(self.dqn.get_input_shape()) + [ frame_history ] mask = tf.reshape(self.inp_mask, mask_shape) masked_input = self.inp_frames * mask self.q_online = self.dqn.construct_q_network(masked_input) with tf.variable_scope('target'): mask_shape = [-1] + [1] * len(self.dqn.get_input_shape()) + [ frame_history ] sp_mask = tf.reshape(self.inp_sp_mask, mask_shape) masked_sp_input = self.inp_sp_frames * sp_mask self.q_target = self.dqn.construct_q_network(masked_sp_input) if double: with tf.variable_scope('online', reuse=True): self.q_online_prime = self.dqn.construct_q_network( masked_sp_input) self.maxQ = tf.gather_nd( self.q_target, tf.transpose([ tf.range(0, 32, dtype=tf.int32), tf.cast(tf.argmax(self.q_online_prime, axis=1), tf.int32) ], [1, 0])) else: self.maxQ = tf.reduce_max(self.q_target, reduction_indices=1) self.r = tf.sign(self.inp_reward) use_backup = tf.cast(tf.logical_not(self.inp_terminated), dtype=tf.float32) self.y = self.r + use_backup * gamma * self.maxQ self.delta = tf.reduce_sum(self.inp_actions * self.q_online, reduction_indices=1) - self.y self.error = tf.where( tf.abs(self.delta) < error_clip, 0.5 * tf.square(self.delta), error_clip * tf.abs(self.delta)) self.loss = tf.reduce_sum(self.error) self.g = tf.gradients(self.loss, self.q_online) optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate, decay=0.95, centered=True, epsilon=0.01) self.train_op = optimizer.minimize(self.loss, var_list=th.get_vars('online')) self.copy_op = th.make_copy_op('online', 'target') self.saver = tf.train.Saver(var_list=th.get_vars('online')) self.replay_buffer = ReplayMemory(self.dqn.get_input_shape(), self.dqn.get_input_dtype(), replay_memory_size, frame_history) self.frame_history = frame_history self.replay_start_size = replay_start_size self.epsilon = epsilon_start self.epsilon_min = epsilon_end self.epsilon_steps = epsilon_steps self.epsilon_delta = (self.epsilon - self.epsilon_min) / self.epsilon_steps self.update_freq = update_freq self.target_copy_freq = target_copy_freq self.action_ticker = 1 self.num_actions = num_actions self.batch_size = batch_size self.sess.run(tf.initialize_all_variables()) if restore_network_file is not None: self.saver.restore(self.sess, restore_network_file) print('Restored network from file') self.sess.run(self.copy_op) def update_q_values(self): S1, A, R, S2, T, M1, M2 = self.replay_buffer.sample(self.batch_size) Aonehot = np.zeros((self.batch_size, 
self.num_actions), dtype=np.float32) Aonehot[list(range(len(A))), A] = 1 [_, loss, q_online, maxQ, q_target, r, y, error, delta, g] = self.sess.run( [ self.train_op, self.loss, self.q_online, self.maxQ, self.q_target, self.r, self.y, self.error, self.delta, self.g ], feed_dict={ self.inp_frames: S1, self.inp_actions: Aonehot, self.inp_sp_frames: S2, self.inp_reward: R, self.inp_terminated: T, self.inp_mask: M1, self.inp_sp_mask: M2 }) return loss def run_learning_episode(self, environment, max_episode_steps=100000): episode_steps = 0 total_reward = 0 for steps in range(max_episode_steps): if environment.is_current_state_terminal(): break state = environment.get_current_state() if np.random.uniform(0, 1) < self.epsilon: action = np.random.choice( environment.get_actions_for_state(state)) else: action = self.get_action(state) if self.replay_buffer.size() > self.replay_start_size: self.epsilon = max(self.epsilon_min, self.epsilon - self.epsilon_delta) state, action, reward, next_state, is_terminal = environment.perform_action( action) total_reward += reward self.replay_buffer.append(state[-1], action, reward, next_state[-1], is_terminal) if (self.replay_buffer.size() > self.replay_start_size) and ( self.action_ticker % self.update_freq == 0): loss = self.update_q_values() if (self.action_ticker - self.replay_start_size) % self.target_copy_freq == 0: self.sess.run(self.copy_op) self.action_ticker += 1 episode_steps += 1 return episode_steps, total_reward def get_action(self, state): size = list(np.array(list(range(len(self.dqn.get_input_shape())))) + 1) state_input = np.transpose(state, size + [0]) [q_values] = self.sess.run( [self.q_online], feed_dict={ self.inp_frames: [state_input], self.inp_mask: np.ones((1, self.frame_history), dtype=np.float32) }) return np.argmax(q_values[0]) def save_network(self, file_name): self.saver.save(self.sess, file_name)
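# --- Added sketch (not from the original source) ---
# With double=True, the DQLearner above forms the double-DQN target: the next
# action is chosen by the online network but evaluated by the target network,
# and rewards are clipped with sign(), i.e.
# y = sign(r) + gamma * (1 - terminal) * Q_target(s', argmax_a Q_online(s', a)).
# A small numpy restatement of that target:
import numpy as np

def double_dqn_targets(q_online_sp, q_target_sp, rewards, terminals, gamma=0.99):
    # q_online_sp, q_target_sp: [batch, num_actions] Q-values at the next state
    # from the online and the target network respectively; terminals: bool array.
    best_actions = np.argmax(q_online_sp, axis=1)
    bootstrap = q_target_sp[np.arange(len(best_actions)), best_actions]
    return np.sign(rewards) + (1.0 - terminals.astype(np.float32)) * gamma * bootstrap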
def __init__(self, dimO, dimA): dimA = list(dimA) dimO = list(dimO) nets = nets_dm # init replay memory self.rm = ReplayMemory(rm_size, dimO, dimA, dtype=np.__dict__[rm_dtype]) # own replay memory self.replay_memory = deque(maxlen=rm_size) # start tf session self.sess = tf.Session(config=tf.ConfigProto( inter_op_parallelism_threads=threads, log_device_placement=False, allow_soft_placement=True)) # create tf computational graph # self.theta_p = nets.theta_p(dimO, dimA) self.theta_q = nets.theta_q(dimO, dimA) self.theta_pt, update_pt = exponential_moving_averages(self.theta_p, tau) self.theta_qt, update_qt = exponential_moving_averages(self.theta_q, tau) obs = tf.placeholder(tf.float32, [None] + dimO, "obs") act_test, sum_p = nets.policy(obs, self.theta_p) # explore noise_init = tf.zeros([1] + dimA) noise_var = tf.Variable(noise_init) self.ou_reset = noise_var.assign(noise_init) noise = noise_var.assign_sub((ou_theta) * noise_var - tf.random_normal(dimA, stddev=ou_sigma)) act_expl = act_test + noise # test q, sum_q = nets.qfunction(obs, act_test, self.theta_q, name= 'q_mu_of_s') # training # policy loss meanq = tf.reduce_mean(q, 0) wd_p = tf.add_n([pl2 * tf.nn.l2_loss(var) for var in self.theta_p]) # weight decay loss_p = -meanq + wd_p # policy optimization optim_p = tf.train.AdamOptimizer(learning_rate=lrp) grads_and_vars_p = optim_p.compute_gradients(loss_p, var_list=self.theta_p) optimize_p = optim_p.apply_gradients(grads_and_vars_p) with tf.control_dependencies([optimize_p]): train_p = tf.group(update_pt) # q optimization act_train = tf.placeholder(tf.float32, [FLAGS.bsize] + dimA, "act_train") rew = tf.placeholder(tf.float32, [FLAGS.bsize], "rew") obs2 = tf.placeholder(tf.float32, [FLAGS.bsize] + dimO, "obs2") term2 = tf.placeholder(tf.bool, [FLAGS.bsize], "term2") # q q_train, sum_qq = nets.qfunction(obs, act_train, self.theta_q, name= 'qs_a') # q targets act2, sum_p2 = nets.policy(obs2, theta=self.theta_pt) q2, sum_q2 = nets.qfunction(obs2, act2, theta=self.theta_qt, name='qsprime_aprime') q_target = tf.stop_gradient(tf.select(term2, rew, rew + discount * q2)) # q_target = tf.stop_gradient(rew + discount * q2) # q loss td_error = q_train - q_target ms_td_error = tf.reduce_mean(tf.square(td_error), 0) wd_q = tf.add_n([ql2 * tf.nn.l2_loss(var) for var in self.theta_q]) # weight decay loss_q = ms_td_error + wd_q # q optimization optim_q = tf.train.AdamOptimizer(learning_rate=lrq) grads_and_vars_q = optim_q.compute_gradients(loss_q, var_list=self.theta_q) optimize_q = optim_q.apply_gradients(grads_and_vars_q) with tf.control_dependencies([optimize_q]): train_q = tf.group(update_qt) # logging log_obs = [] if dimO[0] > 20 else [tf.histogram_summary("obs/" + str(i), obs[:, i]) for i in range(dimO[0])] log_act = [] if dimA[0] > 20 else [tf.histogram_summary("act/inf" + str(i), act_test[:, i]) for i in range(dimA[0])] log_act2 = [] if dimA[0] > 20 else [tf.histogram_summary("act/train" + str(i), act_train[:, i]) for i in range(dimA[0])] log_misc = [sum_p, sum_qq, tf.histogram_summary("td_error", td_error)] log_grad = [grad_histograms(grads_and_vars_p), grad_histograms(grads_and_vars_q)] log_noise = [tf.histogram_summary('noise', noise_var)] log_train = log_obs + log_act + log_act2 + log_misc + log_grad + log_noise merged = tf.merge_all_summaries() # initialize tf log writer self.writer = tf.train.SummaryWriter(FLAGS.outdir + "/tf", self.sess.graph, flush_secs=20) # init replay memory for recording episodes max_ep_length = 10000 self.rm_log = ReplayMemory(max_ep_length, dimO, dimA, rm_dtype) # 
tf functions with self.sess.as_default(): self.act_test = Fun(obs, act_test) self._act_expl = Fun(obs, act_expl) self._reset = Fun([], self.ou_reset) self._train_q = Fun([obs, act_train, rew, obs2, term2], [train_q], log_train, self.writer) self._train_p = Fun([obs], [train_p], log_obs, self.writer) self._train = Fun([obs, act_train, rew, obs2, term2], [train_p, train_q], merged, self.writer) # initialize tf variables self.saver = tf.train.Saver(max_to_keep=1) ckpt = tf.train.latest_checkpoint(FLAGS.outdir + "/tf") if ckpt: self.saver.restore(self.sess, ckpt) else: self.sess.run(tf.initialize_all_variables()) self.sess.graph.finalize() self.t = 0 # global training time (number of observations)
import gym from network import Network from agent import Agent from replay_memory import ReplayMemory NUM_EPISODE = 1000 RENDER = False REWARD_SUM_QUEUE_SIZE = 100 MEMORY_SIZE = 2000 TRAIN_START = 1000 if __name__ == "__main__": env = gym.make('CartPole-v0') network = Network("cpu:0") agent = Agent(network) replay_memory = ReplayMemory(MEMORY_SIZE) reward_sum_queue = [] reward_sum_history = [] reward_sum_avg_history = [] for n_episode in range(NUM_EPISODE): state = env.reset() done = False reward_sum = 0.0 while not done: if RENDER: env.render() action = agent.get_action(state) next_state, reward, done, _ = env.step(action)
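# --- Added sketch (not from the original source) ---
# The CartPole script above is cut off inside its episode loop. A plausible
# shape for the remainder of that loop is sketched below; replay_memory.add(),
# replay_memory.sample(), len(replay_memory) and agent.train() are assumed
# names, not the confirmed API of this project's ReplayMemory/Agent classes.
for n_episode in range(NUM_EPISODE):
    state = env.reset()
    done = False
    reward_sum = 0.0
    while not done:
        action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action)
        replay_memory.add(state, action, reward, next_state, done)  # assumed signature
        state = next_state
        reward_sum += reward
        if len(replay_memory) >= TRAIN_START:                       # assumes __len__ is defined
            agent.train(replay_memory.sample())                     # assumed training call
    reward_sum_queue.append(reward_sum)
    last = reward_sum_queue[-REWARD_SUM_QUEUE_SIZE:]
    reward_sum_history.append(reward_sum)
    reward_sum_avg_history.append(sum(last) / len(last))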
class QEngine: def __init__(self, **kwargs): self.setup = kwargs self._initialize(**kwargs) if "game" in kwargs: del kwargs["game"] def _prepare_for_save(self): self.setup["epsilon"] = self.epsilon self.setup["steps"] = self.steps self.setup["skiprate"] = self.skiprate # TODO why isn't it in init? # There was some reason but can't remember it now. def _initialize(self, game=None, network_args=None, actions=None, name=None, net_type="dqn", # TODO change to the actual class name? reshaped_x=None, reshaped_y=None, skiprate=3, history_length=4, batchsize=64, update_pattern=(1, 1), replay_memory_size=10000, backprop_start_step=10000, start_epsilon=1.0, end_epsilon=0.1, epsilon_decay_start_step=50000, epsilon_decay_steps=100000, reward_scale=1.0, # TODO useless? melt_steps=10000, shaping_on=False, count_time=False, one_hot_time=False, count_time_interval=1, count_time_max=2100, use_game_variables=True, rearrange_misc=False, remember_n_actions=4, one_hot_nactions=False, misc_scale=None, # TODO seems useless results_file=None, params_file=None, config_file=None, no_timeout_terminal=False # TODO seems useless ): if game is not None: self.game = game self.config_file = None elif config_file is not None: self.config_file = config_file self.game = initialize_doom(self.config_file) else: raise Exception("No game, no config file. Dunno how to initialize doom.") if network_args is None: network_args = dict() if count_time: self.count_time = bool(count_time) if self.count_time: self.one_hot_time = one_hot_time self.count_time_max = int(count_time_max) self.count_time_interval = int(count_time_interval) if one_hot_time: self.count_time_len = int(self.count_time_max / self.count_time_interval) else: self.count_time_len = 1 else: self.count_time_len = 0 self.count_time = False self.name = name if reward_scale is not None: self.reward_scale = reward_scale else: self.reward_scale = 1.0 self.rearrange_misc = rearrange_misc self.batchsize = batchsize self.history_length = max(history_length, 1) self.update_pattern = update_pattern self.epsilon = max(min(start_epsilon, 1.0), 0.0) self.end_epsilon = min(max(end_epsilon, 0.0), self.epsilon) self.epsilon_decay_steps = epsilon_decay_steps self.epsilon_decay_stride = (self.epsilon - end_epsilon) / epsilon_decay_steps self.epsilon_decay_start = epsilon_decay_start_step self.skiprate = max(skiprate, 0) self.shaping_on = shaping_on self.steps = 0 self.melt_steps = melt_steps self.backprop_start_step = max(backprop_start_step, batchsize) self.one_hot_nactions = one_hot_nactions self.no_timeout_terminal = no_timeout_terminal if results_file: self.results_file = results_file else: self.results_file = "results/" + name + ".res" if params_file: self.params_file = params_file else: self.params_file = "params/" + name if self.game.get_available_game_variables_size() > 0 and use_game_variables: self.use_game_variables = True else: self.use_game_variables = False self.last_shaping_reward = 0 self.learning_mode = True if actions is None: self.actions = generate_default_actions(self.game) else: self.actions = actions self.actions_num = len(self.actions) self.actions_stats = np.zeros([self.actions_num], np.int) # changes img_shape according to the history size self.channels = self.game.get_screen_channels() if self.history_length > 1: self.channels *= self.history_length if reshaped_x is None: x = self.game.get_screen_width() y = self.game.get_screen_height() scale_x = scale_y = 1.0 else: x = reshaped_x scale_x = float(x) / self.game.get_screen_width() if reshaped_y is None: y = 
int(self.game.get_screen_height() * scale_x) scale_y = scale_x else: y = reshaped_y scale_y = float(y) / self.game.get_screen_height() img_shape = [self.channels, y, x] # TODO check if it is slow (it seems that no) if scale_x == 1 and scale_y == 1: def convert(img): img = img.astype(np.float32) / 255.0 return img else: def convert(img): img = img.astype(np.float32) / 255.0 new_image = np.ndarray([img.shape[0], y, x], dtype=img.dtype) for i in xrange(img.shape[0]): # new_image[i] = skimage.transform.resize(img[i], (y,x), preserve_range=True) new_image[i] = cv2.resize(img[i], (x, y), interpolation=cv2.INTER_AREA) return new_image self.convert_image = convert if self.use_game_variables: single_state_misc_len = int(self.game.get_available_game_variables_size() + self.count_time_len) else: single_state_misc_len = int(self.count_time_len) self.single_state_misc_len = single_state_misc_len self.remember_n_actions = remember_n_actions total_misc_len = int(single_state_misc_len * self.history_length) if remember_n_actions > 0: self.remember_n_actions = remember_n_actions if self.one_hot_nactions: self.action_len = int(2 ** floor(log(len(self.actions), 2))) else: self.action_len = len(self.actions[0]) self.last_action = np.zeros([self.action_len], dtype=np.float32) self.last_n_actions = np.zeros([remember_n_actions * self.action_len], dtype=np.float32) total_misc_len += len(self.last_n_actions) if total_misc_len > 0: self.misc_state_included = True self.current_misc_state = np.zeros(total_misc_len, dtype=np.float32) if single_state_misc_len > 0: if misc_scale is not None: self.misc_scale = np.array(misc_scale, dtype=np.float32) else: self.misc_scale = None else: self.misc_state_included = False state_format = dict() state_format["s_img"] = img_shape state_format["s_misc"] = total_misc_len self.replay_memory = ReplayMemory(state_format, replay_memory_size, batchsize) network_args["state_format"] = state_format network_args["actions_number"] = len(self.actions) if net_type in ("dqn", None, ""): self.approximator = approximators.DQN(**network_args) elif net_type in ["duelling", "dueling"]: self.approximator = approximators.DuelingDQN(**network_args) else: if locate('approximators.' + net_type) is not None: self.approximator = locate('approximators.' 
+ net_type)(**network_args) else: raise Exception("Unsupported approximator type.") self.current_image_state = np.zeros(img_shape, dtype=np.float32) def _update_state(self): raw_state = self.game.get_state() img = self.convert_image(raw_state.image_buffer) state_misc = None if self.single_state_misc_len > 0: state_misc = np.zeros(self.single_state_misc_len, dtype=np.float32) if self.use_game_variables: game_variables = raw_state.game_variables.astype(np.float32) state_misc[0:len(game_variables)] = game_variables count_time_start = len(game_variables) else: count_time_start = 0 if self.count_time: raw_time = raw_state.number processed_time = int(min(self.count_time_max, raw_time) / self.count_time_interval) if self.one_hot_time: num_one_hot = processed_time - 1 state_number = np.zeros([self.count_time_len], dtype=np.float32) state_number[num_one_hot] = 1 ''' # TODO make it available in options # HACK1 that uses health and count as one hot at once hp = int(raw_state.game_variables[0]) state = raw_time state_number = np.zeros([self.count_time_len], dtype=np.float32) state_number[hp - 1] = 1 state_number[99 + state] = 1 # HACK1 ends ''' ''' # TODO make it available in options # HACK2 that uses health as one hot hp = int(raw_state.game_variables[0]) state_number = np.zeros([self.count_time_len], dtype=np.float32) state_number[hp - 1] = 1 # HACK2 ends ''' else: state_number = processed_time state_misc[count_time_start:] = state_number if self.misc_scale is not None: state_misc = state_misc * self.misc_scale if self.history_length > 1: pure_channels = self.channels / self.history_length self.current_image_state[0:-pure_channels] = self.current_image_state[pure_channels:] self.current_image_state[-pure_channels:] = img if self.single_state_misc_len > 0: misc_len = len(state_misc) hist_len = self.history_length # TODO don't move count_time when it's one hot - it's useless and performance drops slightly if self.rearrange_misc: for i in xrange(misc_len): cms_part = self.current_misc_state[i * hist_len:(i + 1) * hist_len] cms_part[0:hist_len - 1] = cms_part[1:] cms_part[-1] = state_misc[i] else: cms = self.current_misc_state cms[0:(hist_len - 1) * misc_len] = cms[misc_len:hist_len * misc_len] cms[(hist_len - 1) * misc_len:hist_len * misc_len] = state_misc else: self.current_image_state[:] = img if self.single_state_misc_len > 0: self.current_misc_state[0:len(state_misc)] = state_misc if self.remember_n_actions: self.last_n_actions[:-self.action_len] = self.last_n_actions[self.action_len:] self.last_n_actions[-self.action_len:] = self.last_action self.current_misc_state[-len(self.last_n_actions):] = self.last_n_actions def new_episode(self, update_state=False): self.game.new_episode() self.reset_state() self.last_shaping_reward = 0 if update_state: self._update_state() def set_last_action(self, index): if self.one_hot_nactions: self.last_action.fill(0) self.last_action[index] = 1 else: self.last_action[:] = self.actions[index] # Return current state including history def _current_state(self): if self.misc_state_included: s = [self.current_image_state, self.current_misc_state] else: s = [self.current_image_state] return s # Return current state's COPY including history. def _current_state_copy(self): if self.misc_state_included: s = [self.current_image_state.copy(), self.current_misc_state.copy()] else: s = [self.current_image_state.copy()] return s # Sets the whole state to zeros. 
def reset_state(self): self.current_image_state.fill(0.0) if self.misc_state_included: self.current_misc_state.fill(0.0) if self.remember_n_actions > 0: self.set_last_action(0) self.last_n_actions.fill(0) def make_step(self): self._update_state() # TODO Check if not making the copy still works a = self.approximator.estimate_best_action(self._current_state_copy()) self.actions_stats[a] += 1 self.game.make_action(self.actions[a], self.skiprate + 1) if self.remember_n_actions: self.set_last_action(a) def make_sleep_step(self, sleep_time=1 / 35.0): self._update_state() a = self.approximator.estimate_best_action(self._current_state_copy()) self.actions_stats[a] += 1 self.game.set_action(self.actions[a]) if self.remember_n_actions: self.set_last_action(a) for i in xrange(self.skiprate): self.game.advance_action(1, False, True) sleep(sleep_time) self.game.advance_action() sleep(sleep_time) def check_timeout(self): return (self.game.get_episode_time() - self.game.get_episode_start_time() >= self.game.get_episode_timeout()) # Performs a learning step according to epsilon-greedy policy. # The step spans self.skiprate +1 actions. def make_learning_step(self): self.steps += 1 # epsilon decay if self.steps > self.epsilon_decay_start and self.epsilon > self.end_epsilon: self.epsilon = max(self.epsilon - self.epsilon_decay_stride, 0) # Copy because state will be changed in a second s = self._current_state_copy(); # With probability epsilon choose a random action: if self.epsilon >= random.random(): a = random.randint(0, len(self.actions) - 1) else: a = self.approximator.estimate_best_action(s) self.actions_stats[a] += 1 # make action and get the reward if self.remember_n_actions: self.set_last_action(a) r = self.game.make_action(self.actions[a], self.skiprate + 1) r = np.float32(r) if self.shaping_on: sr = np.float32(doom_fixed_to_double(self.game.get_game_variable(GameVariable.USER1))) r += sr - self.last_shaping_reward self.last_shaping_reward = sr r *= self.reward_scale # update state s2 accordingly and add transition if self.game.is_episode_finished(): if (not self.no_timeout_terminal) or (not self.check_timeout()): s2 = None self.replay_memory.add_transition(s, a, s2, r, terminal=True) else: self._update_state() s2 = self._current_state() self.replay_memory.add_transition(s, a, s2, r, terminal=False) # Perform q-learning once for a while if self.replay_memory.size >= self.backprop_start_step and self.steps % self.update_pattern[0] == 0: for a in xrange(self.update_pattern[1]): self.approximator.learn(self.replay_memory.get_sample()) # Melt the network sometimes if self.steps % self.melt_steps == 0: self.approximator.melt() # Runs a single episode in current mode. 
It ignores the mode if learn==true/false def run_episode(self, sleep_time=0): self.new_episode() if sleep_time == 0: while not self.game.is_episode_finished(): self.make_step() else: while not self.game.is_episode_finished(): self.make_sleep_step(sleep_time) return np.float32(self.game.get_total_reward()) # Utility stuff def get_actions_stats(self, clear=False, norm=True): stats = self.actions_stats.copy() if norm: stats = stats / np.float32(self.actions_stats.sum()) stats[stats == 0.0] = -1 stats = np.around(stats, 3) if clear: self.actions_stats.fill(0) return stats def get_steps(self): return self.steps def get_epsilon(self): return self.epsilon def get_network(self): return self.approximator.network def set_epsilon(self, eps): self.epsilon = eps def set_skiprate(self, skiprate): self.skiprate = max(skiprate, 0) def get_skiprate(self): return self.skiprate def get_mean_loss(self): return self.approximator.get_mean_loss() # Saves network weights to a file def save_params(self, filename, quiet=False): if not quiet: print "Saving network weights to " + filename + "..." self._prepare_for_save() params = get_all_param_values(self.approximator.network) pickle.dump(params, open(filename, "wb")) if not quiet: print "Saving finished." # Loads network weights from the file def load_params(self, filename, quiet=False): if not quiet: print "Loading network weights from " + filename + "..." params = pickle.load(open(filename, "rb")) set_all_param_values(self.approximator.network, params) set_all_param_values(self.approximator.frozen_network, params) if not quiet: print "Loading finished." # Loads the whole engine with params from file def get_network_architecture(self): return get_all_param_values(self.get_network()) def print_setup(self): print "\nNetwork architecture:" for p in self.get_network_architecture(): print p.shape print "\n*** Engine setup ***" for k in self.setup.keys(): if k == "network_args": print"network_args:" net_args = self.setup[k] for k2 in net_args.keys(): print "\t", k2, ":", net_args[k2] else: print k, ":", self.setup[k] @staticmethod def load(filename, game=None, config_file=None, quiet=False): if not quiet: print "Loading qengine from " + filename + "..." params = pickle.load(open(filename, "rb")) qengine_args = params[0] network_weights = params[1] steps = qengine_args["steps"] epsilon = qengine_args["epsilon"] del (qengine_args["epsilon"]) del (qengine_args["steps"]) if game is None: if config_file is not None: game = initialize_doom(config_file) qengine_args["config_file"] = config_file elif "config_file" in qengine_args and qengine_args["config_file"] is not None: game = initialize_doom(qengine_args["config_file"]) else: raise Exception("No game, no config file. Dunno how to initialize doom.") else: qengine_args["config_file"] = None qengine_args["game"] = game qengine = QEngine(**qengine_args) set_all_param_values(qengine.approximator.network, network_weights) set_all_param_values(qengine.approximator.frozen_network, network_weights) if not quiet: print "Loading finished." qengine.steps = steps qengine.epsilon = epsilon return qengine # Saves the whole engine with params to a file def save(self, filename=None, quiet=False): if filename is None: filename = self.params_file if not quiet: print "Saving qengine to " + filename + "..." self._prepare_for_save() network_params = get_all_param_values(self.approximator.network) params = [self.setup, network_params] pickle.dump(params, open(filename, "wb")) if not quiet: print "Saving finished."
def train(active_mv): senv = ShapeNetEnv(FLAGS) replay_mem = ReplayMemory(FLAGS) log_string('====== Starting burning in memories ======') burn_in(senv, replay_mem) log_string('====== Done. {} trajectories burnt in ======'.format( FLAGS.burn_in_length)) rollout_obj = Rollout(active_mv, senv, replay_mem, FLAGS) # burn in(pretrain) for MVnet if FLAGS.burn_in_iter > 0: for i in range(FLAGS.burnin_start_iter, FLAGS.burnin_start_iter + FLAGS.burn_in_iter): rollout_obj.go(i, verbose=True, add_to_mem=True, mode=FLAGS.burnin_mode, is_train=True) mvnet_input = replay_mem.get_batch_list(FLAGS.batch_size) tic = time.time() out_stuff = run_step(mvnet_input, mode='burnin', is_training=True) if (i + 1 ) % FLAGS.save_every_step == 0 and i > FLAGS.burnin_start_iter: save_pretrain(active_mv, i + 1) if (((i + 1) % FLAGS.test_every_step == 0 and i > FLAGS.burnin_start_iter) or (FLAGS.eval0 and i == FLAGS.burnin_start_iter)): evaluate_burnin( active_mv, FLAGS.test_episode_num, replay_mem, i + 1, rollout_obj, mode=FLAGS.burnin_mode, override_mvnet_input=(batch_to_single_mvinput(mvnet_input) if FLAGS.reproj_mode else None)) for i_idx in range(FLAGS.max_iter): t0 = time.time() if np.random.uniform() < FLAGS.epsilon: rollout_obj.go(i_idx, verbose=True, add_to_mem=True, mode=FLAGS.explore_mode, is_train=True) else: rollout_obj.go(i_idx, verbose=True, add_to_mem=True, is_train=True) t1 = time.time() mvnet_input = replay_mem.get_batch_list(FLAGS.batch_size) t2 = time.time() out_stuff = active_mv.run_step(mvnet_input, mode='train', is_training=True) t3 = time.time() train_log(i_idx, out_stuff, (t0, t1, t2, t3)) if (i_idx + 1) % FLAGS.save_every_step == 0 and i_idx > 0: save(active_mv, i_idx + 1, i_idx + 1, i_idx + 1) if (i_idx + 1) % FLAGS.test_every_step == 0 and i_idx > 0: print('Evaluating active policy') evaluate(active_mv, FLAGS.test_episode_num, replay_mem, i_idx + 1, rollout_obj, mode='active') print('Evaluating random policy') evaluate(active_mv, FLAGS.test_episode_num, replay_mem, i_idx + 1, rollout_obj, mode='oneway')
def train(params): # Load Atari rom and prepare ALE environment atari = GymEnvironment(params.random_start_wait, params.show_game) # Initialize two Q-Value Networks one for training and one for target prediction dqn_train = DeepQNetwork( params=params, num_actions=atari.num_actions, network_name="qnetwork-train", trainable=True ) # Q-Network for predicting target Q-values dqn_target= DeepQNetwork( params=params, num_actions=atari.num_actions, network_name="qnetwork-target", trainable=False ) # Initialize replay memory for storing experience to sample batches from replay_mem = ReplayMemory(params.replay_capacity, params.batch_size) # Small structure for storing the last four screens history = ScreenHistory(params) # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it replay_mem_dump = os.path.abspath(os.path.join(params.output_dir, "replay_memory.hdf5")) checkpoint_dir = os.path.abspath(os.path.join(params.output_dir, "checkpoints")) checkpoint_prefix = os.path.join(checkpoint_dir, "model") if not os.path.exists(checkpoint_dir): os.makedirs(checkpoint_dir) train_step = 0 count_actions = np.zeros(atari.num_actions) # Count per action (only greedy) count_act_random = 0 # Count of random actions count_act_greedy = 0 # Count of greedy actions # Histories of qvalues and loss for running average qvalues_hist = collections.deque([0]*params.interval_summary, maxlen=params.interval_summary) loss_hist = collections.deque([10]*params.interval_summary, maxlen=params.interval_summary) # Time measurements dt_batch_gen = collections.deque([0]*10, maxlen=10) dt_optimization = collections.deque([0]*10, maxlen=10) dt_train_total = collections.deque([0]*10, maxlen=10) # Optionally load pre-initialized replay memory from disk if params.replay_mem_dump is not None and params.is_train: print("Loading pre-initialized replay memory from HDF5 file.") replay_mem.load(params.replay_mem_dump) # Initialize a new game and store the screens in the history reward, screen, is_terminal = atari.new_random_game() for _ in xrange(params.history_length): history.add(screen) # Initialize the TensorFlow session gpu_options = tf.GPUOptions( per_process_gpu_memory_fraction=0.4 ) with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess: # Initialize the TensorFlow session init = tf.initialize_all_variables() sess.run(init) # Only save trainable variables and the global step to disk tf_vars_to_save = tf.trainable_variables() + [dqn_train.global_step] saver = tf.train.Saver(tf_vars_to_save, max_to_keep=40) if params.model_file is not None: # Load pre-trained model from disk saver.restore(sess, params.model_file) train_step, learning_rate = sess.run([dqn_train.global_step, dqn_train.learning_rate]) print("Restarted training from model file. Step = %06i, Learning Rate = %.5f" % (train_step, learning_rate)) # Initialize summary writer dqn_train.build_summary_writer(sess) # Initialize the target Q-Network fixed with the same weights update_target_network(sess, "qnetwork-train", "qnetwork-target") for step in xrange(params.num_steps): replay_mem_size = replay_mem.num_examples() if params.is_train and replay_mem_size < params.train_start and step % 1000 == 0: print("Initializing replay memory %i/%i" % (step, params.train_start)) # Epsilon Greedy Exploration: with the probability of epsilon # choose a random action, otherwise go greedy with the action # having the maximal Q-value. 
Note the minimum episolon of 0.1 if params.is_train: epsilon = max(0.1, 1.0-float(train_step*params.train_freq) / float(params.epsilon_step)) else: epsilon = 0.05 ################################################################ ####################### SELECT A MOVE ########################## ################################################################ # Either choose a random action or predict the action using the Q-network do_random_action = (random.random() < epsilon) if do_random_action or (replay_mem_size < params.train_start and params.is_train): action_id = random.randrange(atari.num_actions) count_act_random += 1 else: # Get the last screens from the history and perform # feed-forward through the network to compute Q-values feed_dict = { dqn_train.pl_screens: history.get() } qvalues = sess.run(dqn_train.qvalues, feed_dict=feed_dict) # Choose the best action based on the approximated Q-values qvalue_max = np.max(qvalues[0]) action_id = np.argmax(qvalues[0]) count_act_greedy += 1 count_actions[action_id] += 1 qvalues_hist.append(qvalue_max) ################################################################ ####################### PLAY THE MOVE ########################## ################################################################ # Play the selected action (either random or predicted) on the Atari game # Note that the action is performed for k = 4 frames (frame skipping) cumulative_reward, screen, is_terminal = atari.act(action_id) # Perform reward clipping and add the example to the replay memory cumulative_reward = min(+1.0, max(-1.0, cumulative_reward)) # Add the screen to short term history and replay memory history.add(screen) # Add experience to replay memory if params.is_train: replay_mem.add(action_id, cumulative_reward, screen, is_terminal) # Check if we are game over, and if yes, initialize a new game if is_terminal: reward, screen, is_terminal = atari.new_random_game() replay_mem.add(0, reward, screen, is_terminal) history.add(screen) ################################################################ ###################### TRAINING MODEL ########################## ################################################################ if params.is_train and step > params.train_start and step % params.train_freq == 0: t1 = time.time() # Prepare batch and train the network # TODO: set actions with terminal == 1 to reward = -1 ?? screens_in, actions, rewards, screens_out, terminals = replay_mem.sample_batch() dt_batch_gen.append(time.time() - t1) t2 = time.time() # Compute the target rewards from the previously fixed network # Note that the forward run is performed on the output screens. qvalues_target = sess.run( dqn_target.qvalues, feed_dict={ dqn_target.pl_screens: screens_out } ) # Inputs for trainable Q-network feed_dict = { dqn_train.pl_screens : screens_in, dqn_train.pl_actions : actions, dqn_train.pl_rewards : rewards, dqn_train.pl_terminals : terminals, dqn_train.pl_qtargets : np.max(qvalues_target, axis=1), } # Actual training operation _, loss, train_step = sess.run([dqn_train.train_op, dqn_train.loss, dqn_train.global_step], feed_dict=feed_dict) t3 = time.time() dt_optimization.append(t3 - t2) dt_train_total.append(t3 - t1) # Running average of the loss loss_hist.append(loss) # Check if the returned loss is not NaN if np.isnan(loss): print("[%s] Training failed with loss = NaN." 
% datetime.now().strftime("%Y-%m-%d %H:%M")) # Once every n = 10000 frames update the Q-network for predicting targets if train_step % params.network_update_rate == 0: print("[%s] Updating target network." % datetime.now().strftime("%Y-%m-%d %H:%M")) update_target_network(sess, "qnetwork-train", "qnetwork-target") ################################################################ ####################### MODEL EVALUATION ####################### ################################################################ if params.is_train and train_step % params.eval_frequency == 0: eval_total_reward = 0 eval_num_episodes = 0 eval_num_rewards = 0 eval_episode_max_reward = 0 eval_episode_reward = 0 eval_actions = np.zeros(atari.num_actions) # Initialize new game without random start moves reward, screen, terminal = atari.new_game() for _ in range(4): history.add(screen) for eval_step in range(params.eval_steps): if random.random() < params.eval_epsilon: # Random action action_id = random.randrange(atari.num_actions) else: # Greedy action # Get the last screens from the history and perform # feed-forward through the network to compute Q-values feed_dict_eval = { dqn_train.pl_screens: history.get() } qvalues = sess.run(dqn_train.qvalues, feed_dict=feed_dict_eval) # Choose the best action based on the approximated Q-values qvalue_max = np.max(qvalues[0]) action_id = np.argmax(qvalues[0]) # Keep track of how many of each action is performed eval_actions[action_id] += 1 # Perform the action reward, screen, terminal = atari.act(action_id) history.add(screen) eval_episode_reward += reward if reward > 0: eval_num_rewards += 1 if terminal: eval_total_reward += eval_episode_reward eval_episode_max_reward = max(eval_episode_reward, eval_episode_max_reward) eval_episode_reward = 0 eval_num_episodes += 1 reward, screen, terminal = atari.new_game() for _ in range(4): history.add(screen) # Send statistics about the environment to TensorBoard eval_update_ops = [ dqn_train.eval_rewards.assign(eval_total_reward), dqn_train.eval_num_rewards.assign(eval_num_rewards), dqn_train.eval_max_reward.assign(eval_episode_max_reward), dqn_train.eval_num_episodes.assign(eval_num_episodes), dqn_train.eval_actions.assign(eval_actions / np.sum(eval_actions)) ] sess.run(eval_update_ops) summaries = sess.run(dqn_train.eval_summary_op, feed_dict=feed_dict) dqn_train.train_summary_writer.add_summary(summaries, train_step) print("[%s] Evaluation Summary" % datetime.now().strftime("%Y-%m-%d %H:%M")) print(" Total Reward: %i" % eval_total_reward) print(" Max Reward per Episode: %i" % eval_episode_max_reward) print(" Num Episodes: %i" % eval_num_episodes) print(" Num Rewards: %i" % eval_num_rewards) ################################################################ ###################### PRINTING / SAVING ####################### ################################################################ # Write a training summary to disk if params.is_train and train_step % params.interval_summary == 0: avg_dt_batch_gen = sum(dt_batch_gen) / float(len(dt_batch_gen)) avg_dt_optimization = sum(dt_optimization) / float(len(dt_optimization)) avg_dt_total = sum(dt_train_total) / float(len(dt_train_total)) # print("Avg. Time Batch Preparation: %.3f seconds" % avg_dt_batch_gen) # print("Avg. Time Train Operation: %.3f seconds" % avg_dt_train_op) # print("Avg. 
Time Total per Batch: %.3f seconds (%.2f samples/second)" % # (avg_dt_total, (1.0/avg_dt_total)*params.batch_size)) # Send statistics about the environment to TensorBoard update_game_stats_ops = [ dqn_train.avg_reward_per_game.assign(atari.avg_reward_per_episode()), dqn_train.max_reward_per_game.assign(atari.max_reward_per_episode), dqn_train.avg_moves_per_game.assign(atari.avg_steps_per_episode()), dqn_train.total_reward_replay.assign(replay_mem.total_reward()), dqn_train.num_games_played.assign(atari.episode_number), dqn_train.actions_random.assign(count_act_random), dqn_train.actions_greedy.assign(count_act_greedy), dqn_train.runtime_batch.assign(avg_dt_batch_gen), dqn_train.runtime_train.assign(avg_dt_optimization), dqn_train.runtime_total.assign(avg_dt_total), dqn_train.samples_per_second.assign((1.0/avg_dt_total)*params.batch_size) ] sess.run(update_game_stats_ops) # Build and save summaries summaries = sess.run(dqn_train.train_summary_op, feed_dict=feed_dict) dqn_train.train_summary_writer.add_summary(summaries, train_step) avg_qvalue = avg_loss = 0 for i in xrange(len(qvalues_hist)): avg_qvalue += qvalues_hist[i] avg_loss += loss_hist[i] avg_qvalue /= float(len(qvalues_hist)) avg_loss /= float(len(loss_hist)) format_str = "[%s] Step %06i, ReplayMemory = %i, Epsilon = %.4f, "\ "Episodes = %i, Avg.Reward = %.2f, Max.Reward = %.2f, Avg.QValue = %.4f, Avg.Loss = %.6f" print(format_str % (datetime.now().strftime("%Y-%m-%d %H:%M"), train_step, replay_mem.num_examples(), epsilon, atari.episode_number, atari.avg_reward_per_episode(), atari.max_reward_per_episode, avg_qvalue, avg_loss)) # For debugging purposes, dump the batch to disk #print("[%s] Writing batch images to file (debugging)" % # datetime.now().strftime("%Y-%m-%d %H:%M")) #batch_output_dir = os.path.join(params.output_dir, "batches/%06i/" % train_step) #replay_mem.write_batch_to_disk(batch_output_dir, screens_in, actions, rewards, screens_out) # Write model checkpoint to disk if params.is_train and train_step % params.interval_checkpoint == 0: path = saver.save(sess, checkpoint_prefix, global_step=train_step) print("[%s] Saving TensorFlow model checkpoint to disk." % datetime.now().strftime("%Y-%m-%d %H:%M")) # Dump the replay memory to disk # TODO: fix this! # print("[%s] Saving replay memory to disk." % # datetime.now().strftime("%Y-%m-%d %H:%M")) # replay_mem.save(replay_mem_dump) sum_actions = float(reduce(lambda x, y: x+y, count_actions)) action_str = "" for action_id, action_count in enumerate(count_actions): action_perc = action_count/sum_actions if not sum_actions == 0 else 0 action_str += "<%i, %s, %i, %.2f> " % \ (action_id, atari.action_to_string(action_id), action_count, action_perc) format_str = "[%s] Q-Network Actions Summary: NumRandom: %i, NumGreedy: %i, %s" print(format_str % (datetime.now().strftime("%Y-%m-%d %H:%M"), count_act_random, count_act_greedy, action_str)) print("Finished training Q-network.")
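# --- Added sketch (not from the original source) ---
# Action selection in the training loop above is epsilon-greedy: during
# training epsilon decays linearly from 1.0 down to a floor of 0.1 over
# params.epsilon_step steps (scaled by the training frequency), and during
# evaluation it is fixed at 0.05. A restatement of that schedule:
def epsilon_schedule(train_step, train_freq, epsilon_step, is_train=True):
    if not is_train:
        return 0.05
    return max(0.1, 1.0 - float(train_step * train_freq) / float(epsilon_step))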
class Agent:

    def __init__(self, dimO, dimA,
                 nets=nets_dm,
                 tau=.001,
                 discount=.99,
                 pl2=.0,
                 ql2=.01,
                 lrp=.0001,
                 lrq=.001,
                 ou_theta=0.15,
                 ou_sigma=0.2,
                 rm_size=500000,
                 rm_dtype='float32',
                 mb_size=32,
                 threads=4,
                 **kwargs):
        # NOTE: this agent is written against the pre-1.0 TensorFlow API
        # (tf.select, tf.histogram_summary, tf.train.SummaryWriter,
        # tf.initialize_all_variables).
        dimA = list(dimA)
        dimO = list(dimO)

        # init replay memory
        self.rm = ReplayMemory(rm_size, dimO, dimA, dtype=np.__dict__[rm_dtype])
        self.mb_size = mb_size

        # start tf session
        self.sess = tf.Session(config=tf.ConfigProto(
            inter_op_parallelism_threads=threads,
            log_device_placement=False,
            allow_soft_placement=True))

        # create tf computational graph
        self.theta_p = nets.theta_p(dimO, dimA)
        self.theta_q = nets.theta_q(dimO, dimA)
        self.theta_pt, update_pt = exponential_moving_averages(self.theta_p, tau)
        self.theta_qt, update_qt = exponential_moving_averages(self.theta_q, tau)

        obs = tf.placeholder(tf.float32, [None] + dimO, "obs")
        act_test, sum_p = nets.policy(obs, self.theta_p)

        # explore (Ornstein-Uhlenbeck exploration noise)
        noise_init = tf.zeros([1] + dimA)
        noise_var = tf.Variable(noise_init)
        self.ou_reset = noise_var.assign(noise_init)
        noise = noise_var.assign_sub(
            (ou_theta) * noise_var - tf.random_normal(dimA, stddev=ou_sigma))
        act_expl = act_test + noise

        # test
        q, sum_q = nets.qfunction(obs, act_test, self.theta_q)

        # training
        # policy loss
        meanq = tf.reduce_mean(q, 0)
        wd_p = tf.add_n([pl2 * tf.nn.l2_loss(var) for var in self.theta_p])  # weight decay
        loss_p = -meanq + wd_p

        # policy optimization
        optim_p = tf.train.AdamOptimizer(learning_rate=lrp)
        grads_and_vars_p = optim_p.compute_gradients(loss_p, var_list=self.theta_p)
        optimize_p = optim_p.apply_gradients(grads_and_vars_p)
        with tf.control_dependencies([optimize_p]):
            train_p = tf.group(update_pt)

        # q optimization
        act_train = tf.placeholder(tf.float32, [None] + dimA, "act_train")
        rew = tf.placeholder(tf.float32, [None], "rew")
        obs2 = tf.placeholder(tf.float32, [None] + dimO, "obs2")
        term2 = tf.placeholder(tf.bool, [None], "term2")

        # q
        q, sum_qq = nets.qfunction(obs, act_train, self.theta_q)

        # q targets
        act2, sum_p2 = nets.policy(obs2, theta=self.theta_pt)
        q2, sum_q2 = nets.qfunction(obs2, act2, theta=self.theta_qt)
        q_target = tf.stop_gradient(tf.select(term2, rew, rew + discount * q2))
        # = tf.stop_gradient(rew + discount * q2)

        # q loss
        mb_td_error = tf.square(q - q_target)
        mean_td_error = tf.reduce_mean(mb_td_error, 0)
        wd_q = tf.add_n([ql2 * tf.nn.l2_loss(var) for var in self.theta_q])  # weight decay
        loss_q = mean_td_error + wd_q

        # q optimization
        optim_q = tf.train.AdamOptimizer(learning_rate=lrq)
        grads_and_vars_q = optim_q.compute_gradients(loss_q, var_list=self.theta_q)
        optimize_q = optim_q.apply_gradients(grads_and_vars_q)
        with tf.control_dependencies([optimize_q]):
            train_q = tf.group(update_qt)

        # logging
        log_obs = [] if dimO[0] > 20 else [tf.histogram_summary("obs/" + str(i), obs[:, i])
                                           for i in range(dimO[0])]
        log_act = [] if dimA[0] > 20 else [tf.histogram_summary("act/inf" + str(i), act_test[:, i])
                                           for i in range(dimA[0])]
        log_act2 = [] if dimA[0] > 20 else [tf.histogram_summary("act/train" + str(i), act_train[:, i])
                                            for i in range(dimA[0])]
        log_misc = [sum_p, sum_qq, tf.histogram_summary("qfunction/td_error", mb_td_error)]
        log_grad = [grad_histograms(grads_and_vars_p), grad_histograms(grads_and_vars_q)]
        log_train = log_obs + log_act + log_act2 + log_misc + log_grad

        # initialize tf log writer
        self.writer = tf.train.SummaryWriter("./tf", self.sess.graph, flush_secs=20)

        # init replay memory for recording episodes
        max_ep_length = 10000
        self.rm_log = ReplayMemory(max_ep_length, dimO, dimA, rm_dtype)

        # tf functions
        with self.sess.as_default():
            self._act_test = Fun(obs, act_test)
            self._act_expl = Fun(obs, act_expl)
            self._reset = Fun([], self.ou_reset)
            self._train = Fun([obs, act_train, rew, obs2, term2], [train_p, train_q],
                              log_train, self.writer)

        # initialize tf variables
        self.saver = tf.train.Saver(max_to_keep=1)
        ckpt = tf.train.latest_checkpoint("./tf")
        if ckpt:
            self.saver.restore(self.sess, ckpt)
        else:
            self.sess.run(tf.initialize_all_variables())

        self.sess.graph.finalize()

        self.t = 0  # global training time (number of observations)

    def reset(self, obs):
        self._reset()
        self.observation = np.squeeze(obs)  # initial observation

    def act(self, test=False, logging=False):
        obs = np.expand_dims(self.observation, axis=0)
        action = self._act_test(obs) if test else self._act_expl(obs)
        self.action = np.atleast_1d(np.squeeze(action, axis=0))  # TODO: remove this hack
        return self.action

    def observe(self, rew, term, obs2, test=False):
        rew = self.reward(rew)  # internal reward  # TODO: outsource
        if not test:
            self.t = self.t + 1
            self.rm.enqueue(self.observation, term, self.action, rew)
            # save parameters etc.
            if (self.t + 45000) % 50000 == 0:  # TODO: correct
                s = self.saver.save(self.sess, "./tf/c", self.t)
                print("DDPG Checkpoint: " + s)
        self.observation = np.squeeze(obs2)  # current observation <- obs2
        return rew

    def train(self, logging=False):
        obs, act, rew, obs2, term2, info = self.rm.minibatch(size=self.mb_size)
        self._train(obs, act, rew, obs2, term2, log=logging, global_step=self.t)

    def reward(self, external_reward, logging=False):
        """Calculate internal reward: external reward plus an action-magnitude penalty."""
        ra = -.1 * np.mean(np.square(self.action))
        rint = external_reward + ra
        if logging:
            self.write_scalar('reward/ext', external_reward)
            self.write_scalar('reward/a', ra)
            self.write_scalar('reward/rint', rint)
        return rint

    def write_scalar(self, tag, val):
        s = tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=val)])
        self.writer.add_summary(s, self.t)

    def __del__(self):
        self.sess.close()
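
# ----------------------------------------------------------------------------
# Illustrative sketch (not part of the original agent code): the Agent above
# builds its DDPG target networks via exponential_moving_averages(theta, tau),
# whose definition is not included in this excerpt. Judging from how its
# return value is unpacked (shadow variables, update op), one plausible
# implementation based on tf.train.ExponentialMovingAverage is:

import tensorflow as tf

def exponential_moving_averages(theta, tau=0.001):
    """Return shadow copies of the variables in `theta` plus an op that moves
    each shadow a fraction `tau` towards its source variable (soft update)."""
    ema = tf.train.ExponentialMovingAverage(decay=1.0 - tau)
    # ema.apply creates one shadow variable per source variable and returns an
    # op that, when run, performs shadow += tau * (source - shadow).
    update_op = ema.apply(theta)
    averages = [ema.average(v) for v in theta]
    return averages, update_op

# Running update_op after every optimizer step (as done via
# tf.control_dependencies in __init__ above) keeps the target policy and the
# target Q-function slowly tracking their online counterparts, which is the
# soft target update used by DDPG.
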