class GameManager():
    def __init__(self):
        # Init game state
        self.episode = 0.0
        self.win_counter = 0.0
        self.state = CardGameState(self)
        self.brain = DDQN()
        self.episode_reward = 0
        self.game_history = list()

    def update(self, dt):
        pass

    def auto_play(self):
        while self.episode < MAX_EPISODES:
            action = self.brain.get_action(self.state)
            action_to_store = np.zeros(3)
            action_to_store[action] = 1
            self.state.process(action)

            # receive game result
            reward = self.state.reward
            done = self.state.terminal
            self.episode_reward += reward

            self.brain.train(self.state, self.state.s_t, action_to_store,
                             reward, self.state.s_t1, done)

            self.state.t += 1
            self.state.update()

            if done:
                self.episode += 1
                win_rate = 0.0
                if self.episode_reward == 1:
                    self.game_history.append(1)
                else:
                    self.game_history.append(0)

                if len(self.game_history) < GAME_HISTORY_SIZE:
                    win_rate = np.sum(self.game_history) / float(len(self.game_history)) * 100.0
                else:
                    self.game_history.pop(0)
                    win_rate = np.sum(self.game_history) / GAME_HISTORY_SIZE * 100.0

                print("Episode {} | Win Rate = {}".format(self.episode, win_rate))
                self.brain.write_summary(win_rate, self.episode)
                self.episode_reward = 0
                self.state.reset()
class Tester():
    def __init__(self, render_flag):
        self.model = DDQN(36, 36)
        self.render_flag = render_flag
        self.width = 6
        self.height = 6
        self.env = MineSweeper(self.width, self.height, 6)
        if self.render_flag:
            self.renderer = Render(self.env.state)
        self.load_models(20000)

    def get_action(self, state):
        state = state.flatten()
        mask = (1 - self.env.fog).flatten()
        action = self.model.act(state, mask)
        return action

    def load_models(self, number):
        path = "./pre-trained/ddqn_dnn" + str(number) + ".pth"
        checkpoint = torch.load(path)
        self.model.load_state_dict(checkpoint['current_state_dict'])
        self.model.epsilon = 0

    def do_step(self, action):
        i = action // self.width
        j = action % self.width
        if self.render_flag:
            self.renderer.state = self.env.state
            self.renderer.draw()
            self.renderer.bugfix()
        next_state, terminal, reward = self.env.choose(i, j)
        return next_state, terminal, reward
class DDQNRunner(Runner):
    def __init__(self, env_name, algo_params, runner_params):
        super(DDQNRunner, self).__init__(env_name, 'DDQN', algo_params, runner_params)

    def _before_sim_loop(self):
        n_state = self._env.observation_space.shape[0]
        n_action = self._env.action_space.n
        self._algo = DDQN(n_state, n_action, self._algo_params)
        self._algo.update_net()
        self._score = 0.0
        self._score_sum = 0.0

    def _episode_sim(self, n_epi):
        s = self._env.reset()
        done = False
        self._score = 0.0
        n_step = 0

        if self._train:
            self._algo.epsilon = max(0.01, self._algo.start_epsilon - 0.01*(n_epi/200))
        else:
            self._algo.epsilon = 0.0

        while not done:
            a = self._algo.sample_action(torch.from_numpy(s).float())
            s_prime, r, done, info = self._step_wrapper(self._env.step(a))

            if self._train:
                self._algo.append_data((s, a, r/self._reward_scale, s_prime, done))
            if self._save_step_log:
                self._write_step_log(n_step, n_epi, s, a, r, done)

            s = s_prime
            self._score += r
            n_step += 1

        self._score_sum += self._score

    def _after_sim(self, n_epi, print_log, cond_check):
        super()._after_sim(n_epi, print_log, cond_check)

        if not self._done and self._train:
            if self._algo.buffer_size() > self._algo.n_train_start:
                self._algo.train_net()
            if n_epi % self._algo.update_interval == 0:
                self._algo.update_net()

    def _print_log(self, n_epi, avg_score):
        super()._print_log(n_epi, avg_score)
        print(f"n_buffer : {self._algo.buffer_size()}, "
              + f"eps : {self._algo.epsilon*100:.1f}%")
def main():
    value_function = Sequential(
        Linear(in_features=4, out_features=128), ReLU(),
        Linear(in_features=128, out_features=128), ReLU(),
        Linear(in_features=128, out_features=32), ReLU(),
        Linear(in_features=32, out_features=2)
    ).to(torch.device("cuda:0"))

    optimizer = RMSprop(params=value_function.parameters(), alpha=0.95, lr=0.0001)

    agent = DDQN(
        value_function=value_function,
        optimizer=optimizer,
        lr_scheduler=LambdaLR(optimizer=optimizer, lr_lambda=lambda e: max(0.9999 ** e, 0.1)),
        gamma=0.95,
        epsilon_fn=lambda x: 0.9999 ** x,
        replay_buffer_size=10000,
        replay_batch_size=128,
        start_training_at=1024,
        unfreeze_freq=64,
        device=torch.device("cuda:0"),
        verbose=True
    )

    run_ddqn(agent, render=True)
def _initialise_ddqn(self):
    """
    Initialise the DDQN
    :return: None
    """
    self._ddqn2 = DDQN(state_size=len(self._columns),
                       action_size=len(self._columns),
                       seed=0,
                       technique=self._rl_technique)
def main():
    if len(sys.argv) != 2:
        print('usage: python ' + sys.argv[0] + ' [weights_path]')
        exit(0)
    weights_path = sys.argv[1]

    env = gym.make(GYM)
    env = gym.wrappers.Monitor(env, "./video", force=True)
    input_shape = env.observation_space.shape[0]
    output_shape = env.action_space.n
    print('environment: in: ({}) out: ({})'.format(input_shape, output_shape))

    ddqn = DDQN(input_shape, output_shape)
    if os.path.exists(weights_path):
        ddqn.load_weights(weights_path)

    state = env.reset()
    state = np.expand_dims(state, 0)
    tot_reward = 0
    for _ in range(1000):
        env.render()
        q_values = ddqn.predict(state)
        action = np.argmax(q_values)
        next_state, reward, done, info = env.step(action)
        next_state = np.expand_dims(next_state, 0)
        state = next_state
        tot_reward += reward
        if done:
            break

    env.close()
    print('total reward: {}'.format(tot_reward))
def run_ddqn(agent: DDQN, render: bool = True):
    env = gym.make("CartPole-v1")
    draw = env.render if render else lambda: ...

    # Train forever.
    while True:
        next_state = env.reset()
        reward = 0
        done = False
        while True:
            action = agent.train_step(state=next_state, reward=reward, episode_ended=done)
            if done:
                break
            next_state, reward, done, info = env.step(action)
            draw()
def main():
    env = gym.make("VideoPinball-ram-v0")
    state_size = env.observation_space.shape[0]
    num_actions = env.action_space.n

    # Initialize model
    dqn_model = DQN(state_size, num_actions)
    ddqn_model = DDQN(state_size, num_actions)

    # TODO:
    # 1) Train your model for 650 episodes, passing in the environment and the agent.
    # 2) Append the total reward of the episode into a list keeping track of all of the rewards.
    # 3) After training, print the average of the last 50 rewards you've collected.
    dqn_rwds = []
    ddqn_rwds = []
    print('start train')
    num_games = 100
    for i in range(num_games):
        if i % 10 == 0:
            print('step:', i)
        ddqn_rwd = generate_trajectory(env, ddqn_model)
        ddqn_rwds.append(ddqn_rwd)
        dqn_rwd = generate_trajectory(env, dqn_model)
        dqn_rwds.append(dqn_rwd)

    env.close()
    print("DQN rewards")
    print(dqn_rwds)
    print("DDQN Rewards")
    print(ddqn_rwds)

    # TODO: Visualize your rewards.
    visualize_data(np.array(dqn_rwds), np.array(ddqn_rwds))
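A minimal sketch (not part of the original snippet) of the last-50 average that the TODO above asks for, assuming the dqn_rwds and ddqn_rwds lists built in main():

    # Hypothetical follow-up to the TODO: report the mean of the last 50 episode rewards.
    print("DQN avg reward (last 50):", np.mean(dqn_rwds[-50:]))
    print("DDQN avg reward (last 50):", np.mean(ddqn_rwds[-50:]))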
def main():
    if len(sys.argv) != 3:
        print('usage: python ' + sys.argv[0] + ' epochs [weights_path]')
        exit(0)
    epochs = int(sys.argv[1])
    weights_path = sys.argv[2]

    env = gym.make(GYM)
    input_shape = env.observation_space.shape[0]
    output_shape = env.action_space.n
    print('environment: in: ({}) out: ({})'.format(input_shape, output_shape))

    ddqn = DDQN(input_shape, output_shape)
    if os.path.exists(weights_path):
        ddqn.load_weights(weights_path)
    optimizer = tf.train.AdamOptimizer()

    # init training
    replay_buffer = Replay_buffer(REPLAY_BUFFER_SIZE)
    target_network = DDQN(input_shape, output_shape)
    target_network.set_weights(ddqn.get_weights())
    target_reset_count = 0
    train_counter = 0
    e_counter = 0
    epsilon_explore = E_START
    loss_value = 0
    rewards_x_epoch = []
    e_x_time = []
    loss_x_time = []

    for epc in range(epochs):
        state = env.reset()
        state = np.expand_dims(state, 0)
        tot_reward = 0
        for _ in range(1000):
            # action selection
            if random.random() <= epsilon_explore:
                action = random.randint(0, output_shape - 1)
            else:
                q_values = ddqn.predict(state)
                action = np.argmax(q_values)

            # simulation
            next_state, reward, done, info = env.step(action)
            next_state = np.expand_dims(next_state, 0)
            replay_buffer.add((state, action, reward, next_state, 0 if done else 1))
            state = next_state
            tot_reward += reward

            # training
            train_counter += 1
            if train_counter > TRAINING_START and train_counter % TRAINING_FREQ == 0:
                (batch_states, batch_actions, batch_s_t1,
                 batch_rewards, batch_final) = replay_buffer.sample(BATCH_SIZE)
                with tf.GradientTape() as tape:
                    # actual prediction
                    action_indexes = tf.stack([tf.range(BATCH_SIZE, dtype=tf.int64), batch_actions], axis=1)
                    y_prediction = tf.gather_nd(ddqn(batch_states), action_indexes)

                    # targets
                    amax = tf.argmax(ddqn(batch_s_t1), axis=1)
                    amax = tf.stack([tf.range(BATCH_SIZE, dtype=tf.int64), amax], axis=1)
                    batch_target_y = target_network(batch_s_t1)
                    target_expected_rewards = tf.gather_nd(batch_target_y, amax)
                    y_target = batch_rewards + (DISCOUNT * target_expected_rewards * batch_final)

                    # loss
                    loss_value = tf.reduce_mean(tf.pow(y_target - y_prediction, 2))

                grads = tape.gradient(loss_value, ddqn.trainable_variables)
                optimizer.apply_gradients(zip(grads, ddqn.trainable_variables))

            if train_counter > TRAINING_START:
                e_counter += 1
                epsilon_explore = get_epsilon(e_counter)
                if e_counter > E_RESET_FREQ:
                    e_counter = 0
                e_x_time.append(epsilon_explore)
                loss_x_time.append(float(loss_value))

            target_reset_count += 1
            if target_reset_count == TARGET_RESET_FREQ:
                target_network.set_weights(ddqn.get_weights())
                target_reset_count = 0

            if done:
                break

        rewards_x_epoch.append(tot_reward)
        if epc % 10 == 0:
            print('[{:2.1f}%], e: {:5.4f} - loss: {:10.6f} - last episode reward: {}'.format(
                (epc * 100) / epochs, epsilon_explore, float(loss_value), tot_reward))

    ddqn.save_weights(weights_path, save_format='h5')

    with open('rewards.csv', 'w') as f:
        for rew in rewards_x_epoch:
            f.write("%s," % rew)
    with open('epsilon.csv', 'w') as f:
        for e in e_x_time:
            f.write("%s," % e)
    with open('loss.csv', 'w') as f:
        for l in loss_x_time:
            f.write("%s," % l)

    # let's try it
    obs = env.reset()
    obs = np.expand_dims(obs, 0)
    for _ in range(1000):
        env.render()
        q_values = ddqn.predict(obs)
        action = np.argmax(q_values)
        obs, reward, done, info = env.step(action)
        obs = np.expand_dims(obs, 0)
        if done:
            break

    env.close()
# set up environment tools
USE_GPU = torch.cuda.is_available()
mod_action_space = [2, 3, 4, 5]
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
env = Env(device)
agent = Agent(eps=dum_val, eps_min=dum_val, eps_max=dum_val, eps_decay=dum_val,
              num_actions=len(mod_action_space), device=device)
agent.turn_eps_off()
stack = Frstack(initial_frame=env.state)

# create policy net and load saved weights
policy_net = DDQN(NUM_FRAMES, len(mod_action_space))
if USE_GPU:
    policy_net.cuda()


def test():
    policy_net.load_state_dict(torch.load(POLICY_NET_PATH))
    policy_net.eval()
    print("testing...")

    all_rewards = []
    all_images = []
    for episode in range(NUM_TEST_EPISODES):
        env.reset()
        episode_reward = 0
        pass

    # Custom method for saving own metrics
    # Creates writer, writes custom metrics and closes writer
    def update_stats(self, **stats):
        self._write_logs(stats, self.step)

    def _write_logs(self, logs, index):
        with self.writer.as_default():
            for name, value in logs.items():
                tf.summary.scalar(name, value, step=index)
                self.step += 1
                self.writer.flush()


agent = DDQN(10, (env.OBSERVATION_SPACE_VALUES))
agent.load_weights(MODEL_FILE)

tensorboard = ModifiedTensorBoard(
    log_dir="logs/{}-{}".format(MODEL_NAME, int(time.time())))

# Iterate over episodes
for episode in tqdm(range(1, EPISODES + 1), ascii=True, unit='episodes'):
    # Update tensorboard step every episode
    # Restarting episode - reset episode reward and step number
    episode_reward = 0
    step = 1

    # Reset environment and get initial state
    current_state = env.reset()
env = gym.make('Pendulum-v0')
env = env.unwrapped
env.seed(1)

action_space = 11
n_features = 3

memory0 = Memory(n_features, MEMORY_CAPACITY, n_features * 2 + 2, BATCH_SIZE)
memory1 = Memory(n_features, MEMORY_CAPACITY, n_features * 2 + 2, BATCH_SIZE)

sess0 = tf.Session()
sess1 = tf.Session()

dqn = [
    DDQN(action_space, n_features, memory0, name='dqn0',
         learning_rate=LEARNING_RATE, e_greedy_increment=0.001,
         double_q=False, sess=sess0),
    DDQN(action_space, n_features, memory1, name='dqn1',
         learning_rate=LEARNING_RATE, e_greedy_increment=0.001,
         double_q=False, sess=sess1)
]

sess0.run(tf.global_variables_initializer())
sess1.run(tf.global_variables_initializer())
class Agent(RLGlueAgent):
    def __init__(self):
        self.last_action = Action()
        self.time_step = 0
        self.total_time_step = 0
        self.episode_step = 0
        self.populating_phase = False
        self.model_save_interval = 30

        # Switch learning phase / evaluation phase
        self.policy_frozen = False

        self.ddqn = DDQN()
        self.state = np.zeros((config.rl_agent_history_length,
                               config.ale_screen_channels,
                               config.ale_scaled_screen_size[1],
                               config.ale_scaled_screen_size[0]),
                              dtype=np.float32)
        self.exploration_rate = self.ddqn.exploration_rate
        self.exploration_rate_for_evaluation = 0.05
        self.last_observed_screen = None

    def preprocess_screen(self, observation):
        screen_width = config.ale_screen_size[0]
        screen_height = config.ale_screen_size[1]
        new_width = config.ale_scaled_screen_size[0]
        new_height = config.ale_scaled_screen_size[1]
        if len(observation.intArray) == 100928:
            observation = np.asarray(observation.intArray[128:], dtype=np.uint8).reshape(
                (screen_width, screen_height, 3))
            observation = spm.imresize(observation, (new_height, new_width))
            # Clip the pixel value to be between 0 and 1
            if config.ale_screen_channels == 1:
                # Convert RGB to Luminance
                observation = np.dot(observation[:, :, :], [0.299, 0.587, 0.114])
                observation = observation.reshape((new_height, new_width, 1))
            observation = observation.transpose(2, 0, 1) / 255.0
            observation /= (np.max(observation) + 1e-5)
        else:
            # Greyscale
            if config.ale_screen_channels == 3:
                raise Exception("You forgot to add --send_rgb option when you run ALE.")
            observation = np.asarray(observation.intArray[128:]).reshape(
                (screen_width, screen_height))
            observation = spm.imresize(observation, (new_height, new_width))
            # Clip the pixel value to be between 0 and 1
            observation = observation.reshape((1, new_height, new_width)) / 255.0
            observation /= (np.max(observation) + 1e-5)

        observed_screen = observation
        if self.last_observed_screen is not None:
            observed_screen = np.maximum(observation, self.last_observed_screen)
        self.last_observed_screen = observation
        return observed_screen

    def agent_init(self, taskSpecString):
        pass

    def reshape_state_to_conv_input(self, state):
        return state.reshape((1,
                              config.rl_agent_history_length * config.ale_screen_channels,
                              config.ale_scaled_screen_size[1],
                              config.ale_scaled_screen_size[0]))

    def dump_result(self, reward, q_max=None, q_min=None):
        if self.time_step % 50 == 0:
            if self.policy_frozen is False:
                print "time_step:", self.time_step,
                print "reward:", reward,
                print "eps:", self.exploration_rate,
                if q_min is None:
                    print ""
                else:
                    print "Q ::",
                    print "max:", q_max,
                    print "min:", q_min

    def dump_state(self, state=None, prefix=""):
        if state is None:
            state = self.state
        state = self.reshape_state_to_conv_input(state)
        for h in xrange(config.rl_agent_history_length):
            start = h * config.ale_screen_channels
            end = start + config.ale_screen_channels
            image = state[0, start:end, :, :]
            if config.ale_screen_channels == 1:
                image = image.reshape((image.shape[1], image.shape[2]))
            elif config.ale_screen_channels == 3:
                image = image.transpose(1, 2, 0)
            image = np.uint8(image * 255.0)
            image = Image.fromarray(image)
            image.save(("%sstate-%d.png" % (prefix, h)))

    def learn(self, reward, epsode_ends=False):
        if self.policy_frozen is False:
            self.ddqn.store_transition_in_replay_memory(
                self.reshape_state_to_conv_input(self.last_state),
                self.last_action.intArray[0],
                reward,
                self.reshape_state_to_conv_input(self.state),
                epsode_ends)
            if self.total_time_step <= config.rl_replay_start_size:
                # A uniform random policy is run for 'replay_start_size' frames before learning starts
                # (the agent moves around randomly to accumulate experience)
                print "Initial exploration before learning starts:", "%d/%d" % (
                    self.total_time_step, config.rl_replay_start_size)
                self.populating_phase = True
                self.exploration_rate = config.rl_initial_exploration
            else:
                self.populating_phase = False
                self.ddqn.decrease_exploration_rate()
                self.exploration_rate = self.ddqn.exploration_rate

            if self.total_time_step % (config.rl_action_repeat * config.rl_update_frequency) == 0 and self.total_time_step != 0:
                self.ddqn.replay_experience()

            if self.total_time_step % config.rl_target_network_update_frequency == 0 and self.total_time_step != 0:
                print "Target has been updated."
                self.ddqn.update_target()

    def agent_start(self, observation):
        print "Episode", self.episode_step, "::", "total_time_step:",
        if self.total_time_step > 1000:
            print int(self.total_time_step / 1000), "K"
        else:
            print self.total_time_step

        observed_screen = self.preprocess_screen(observation)
        self.state[0] = observed_screen

        return_action = Action()
        action, q_max, q_min = self.ddqn.eps_greedy(
            self.reshape_state_to_conv_input(self.state), self.exploration_rate)
        return_action.intArray = [action]

        self.last_action = copy.deepcopy(return_action)
        self.last_state = self.state.copy()
        return return_action

    def agent_step(self, reward, observation):
        observed_screen = self.preprocess_screen(observation)
        self.state = np.roll(self.state, 1, axis=0)
        self.state[0] = observed_screen

        ########################### DEBUG ###############################
        # if self.total_time_step % 500 == 0 and self.total_time_step != 0:
        #     self.dump_state()

        self.learn(reward)

        return_action = Action()
        q_max = None
        q_min = None
        if self.time_step % config.rl_action_repeat == 0:
            action, q_max, q_min = self.ddqn.eps_greedy(
                self.reshape_state_to_conv_input(self.state), self.exploration_rate)
        else:
            action = self.last_action.intArray[0]
        return_action.intArray = [action]

        self.dump_result(reward, q_max, q_min)

        if self.policy_frozen is False:
            self.last_action = copy.deepcopy(return_action)
            self.last_state = self.state.copy()
            self.time_step += 1
            self.total_time_step += 1

        return return_action

    def agent_end(self, reward):
        self.learn(reward, epsode_ends=True)

        # [Optional]
        ## Visualizing the results
        self.dump_result(reward)

        if self.policy_frozen is False:
            self.time_step = 0
            self.total_time_step += 1
            self.episode_step += 1

    def agent_cleanup(self):
        pass

    def agent_message(self, inMessage):
        if inMessage.startswith("freeze_policy"):
            self.policy_frozen = True
            self.exploration_rate = self.exploration_rate_for_evaluation
            return "The policy was freezed."
        if inMessage.startswith("unfreeze_policy"):
            self.policy_frozen = False
            self.exploration_rate = self.ddqn.exploration_rate
            return "The policy was unfreezed."
        if inMessage.startswith("save_model"):
            if self.populating_phase is False:
                self.ddqn.save()
            return "The model was saved."
env.seed(args.seed)
np.random.seed(args.seed)

obs_shape_list = env.observation_space.shape
action_shape = env.action_space.n

if len(obs_shape_list) > 1:
    shapes = [(80, 80), (1, ), (1, ), (1, ), (80, 80)]
    dtypes = [np.uint8, np.uint8, np.float32, np.bool, np.uint8]
    model_type = "CNN"
else:
    shapes = [(obs_shape_list[0], ), (1, ), (1, ), (1, ), (obs_shape_list[0], )]
    dtypes = [np.float32, np.uint8, np.float32, np.bool, np.float32]
    model_type = "DNN"

qnet = DDQN(shapes[0] + (args.frames, ), action_shape, model_type, args)
if args.predictor:
    pred = Predictor(shapes[0], action_shape, args)

kws = ['obs', 'action', 'reward', 'done', 'new_obs']
memory = FullReplayMemory(args.buffer_size, kws, shapes, dtypes)

writer = tf.summary.create_file_writer("logs/{}_{}".format(
    args.scenario, datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")))

total_numsteps = 0
timestep = 0
t_start = time.time()
epsilon = args.epsilon
# total_parameters = np.sum([np.prod(v.get_shape().as_list()) for v in qnet.q1.trainable_variables])

with writer.as_default():
We will ignore actions 0 and 1.
'''

mod_action_space = [2, 3, 4, 5]
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
env = Env(device)
agent = Agent(eps=EPS_MAX, eps_min=EPS_MIN, eps_max=EPS_MAX, eps_decay=EPS_DECAY,
              num_actions=len(mod_action_space), device=device)
memory = PriorityReplayBuffer(MEMORY_SIZE)
stack = Frstack(initial_frame=env.state)

# initialize policy and target network
policy_net = DDQN(NUM_FRAMES, len(mod_action_space))
target_net = DDQN(NUM_FRAMES, len(mod_action_space))
if USE_GPU:
    policy_net.cuda()
    target_net.cuda()
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

# TODO: consider RMSProp vs Adam - DeepMind paper uses RMSProp
optimizer = optim.Adam(params=policy_net.parameters(), lr=ALPHA)


def experience_replay():
    # experience tuple - (state, action, next_state, reward, done)
    batch, idxs, is_weights = memory.sample(BATCH_SIZE)
    batch = list(zip(*batch))
n_actions = env.action_space.n
n_features = env.state.shape[0]
print('actions=', n_actions, 'n_features=', n_features)

memory0 = Memory(n_features, MEMORY_CAPACITY, n_features * 2 + 2, BATCH_SIZE)
memory1 = Memory(n_features, MEMORY_CAPACITY, n_features * 2 + 2, BATCH_SIZE)

sess0 = tf.Session()
sess1 = tf.Session()

dqn = [
    DDQN(n_actions, n_features, memory0, name='dqn0',
         learning_rate=LEARNING_RATE, reward_delay=GAMMA,
         replace_target_iter=200, double_q=False, sess=sess0),
    DDQN(n_actions, n_features, memory1, name='dqn1',
         learning_rate=LEARNING_RATE, reward_delay=GAMMA,
         replace_target_iter=200, double_q=False, sess=sess1)
]
env = gym.make('Pendulum-v0')
env = env.unwrapped
env.seed(1)

action_space = 11
n_features = 3

memory0 = Memory(n_features, MEMORY_CAPACITY, n_features * 2 + 2, BATCH_SIZE)
memory1 = Memory(n_features, MEMORY_CAPACITY, n_features * 2 + 2, BATCH_SIZE)

sess = tf.Session()
with tf.variable_scope('dqn'):
    dqn = DDQN(n_actions=action_space, n_features=n_features, memory=memory0,
               name='dqn', learning_rate=LEARNING_RATE,
               e_greedy_increment=0.001, double_q=False, sess=sess)

with tf.variable_scope('ddqn'):
    ddqn = DDQN(n_actions=action_space, n_features=n_features, memory=memory1,
                name='ddqn', learning_rate=LEARNING_RATE,
                e_greedy_increment=0.001, double_q=True, sess=sess)

sess.run(tf.global_variables_initializer())
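The only behavioural difference the double_q flag selects is how the TD target is formed. A minimal NumPy sketch of the two variants (illustrative only, not the DDQN class's internal code; q_eval_next, q_target_next, rewards, gamma and batch_size are assumed names):

    # Illustrative batched target computation for DQN vs. double DQN.
    # q_eval_next:   Q(s', .) from the online/eval network,  shape (batch, n_actions)
    # q_target_next: Q(s', .) from the target network,       shape (batch, n_actions)
    batch_index = np.arange(batch_size)
    if double_q:
        # Double DQN: the online network selects the action, the target network evaluates it.
        best_actions = np.argmax(q_eval_next, axis=1)
        target_q = rewards + gamma * q_target_next[batch_index, best_actions]
    else:
        # Vanilla DQN: max over the target network's own estimates.
        target_q = rewards + gamma * np.max(q_target_next, axis=1)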
class Driver():
    def __init__(self, width, height, bomb_no, render_flag):
        self.width = width
        self.height = height
        self.bomb_no = bomb_no
        self.box_count = width * height
        self.env = MineSweeper(self.width, self.height, self.bomb_no)

        self.current_model = DDQN(self.box_count, self.box_count)
        self.target_model = DDQN(self.box_count, self.box_count)
        self.target_model.eval()

        self.optimizer = torch.optim.Adam(self.current_model.parameters(), lr=0.003, weight_decay=1e-5)
        self.scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=2000, gamma=0.95)
        self.target_model.load_state_dict(self.current_model.state_dict())

        self.buffer = Buffer(100000)
        self.gamma = 0.99
        self.render_flag = render_flag
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.90
        self.reward_threshold = 0.12
        self.reward_step = 0.01
        self.batch_size = 4096
        self.tau = 5e-5
        self.log = open("./Logs/ddqn_log.txt", 'w')

        if self.render_flag:
            self.Render = Render(self.env.state)

    def load_models(self, number):
        path = "./pre-trained/ddqn_dnn" + str(number) + ".pth"
        weights = torch.load(path)
        self.current_model.load_state_dict(weights['current_state_dict'])
        self.target_model.load_state_dict(weights['target_state_dict'])
        self.optimizer.load_state_dict(weights['optimizer_state_dict'])
        self.current_model.epsilon = weights['epsilon']

    ### Get an action from the DDQN model by supplying it State and Mask
    def get_action(self, state, mask):
        state = state.flatten()
        mask = mask.flatten()
        action = self.current_model.act(state, mask)
        return action

    ### Does the action and returns Next State, If terminal, Reward, Next Mask
    def do_step(self, action):
        i = int(action / self.width)
        j = action % self.width
        if self.render_flag:
            self.Render.state = self.env.state
            self.Render.draw()
            self.Render.bugfix()
        next_state, terminal, reward = self.env.choose(i, j)
        next_fog = 1 - self.env.fog
        return next_state, terminal, reward, next_fog

    ### Reward Based Epsilon Decay
    def epsilon_update(self, avg_reward):
        if avg_reward > self.reward_threshold:
            self.current_model.epsilon = max(self.epsilon_min, self.current_model.epsilon * self.epsilon_decay)
            self.reward_threshold += self.reward_step

    def TD_Loss(self):
        ### Samples batch from buffer memory
        state, action, mask, reward, next_state, next_mask, terminal = self.buffer.sample(self.batch_size)

        ### Converts the variables to tensors for processing by DDQN
        state = Variable(FloatTensor(float32(state)))
        mask = Variable(FloatTensor(float32(mask)))
        next_state = FloatTensor(float32(next_state))
        action = LongTensor(float32(action))
        next_mask = FloatTensor(float32(next_mask))
        reward = FloatTensor(reward)
        done = FloatTensor(terminal)

        ### Predicts Q value for present and next state with current and target model
        q_values = self.current_model(state, mask)
        next_q_values = self.target_model(next_state, next_mask)

        # Calculates Loss:
        #   If not Terminal: Loss = (reward + gamma * Q_val(next_state)) - Q_val(current_state)
        #   If Terminal:     Loss = reward - Q_val(current_state)
        q_value = q_values.gather(1, action.unsqueeze(1)).squeeze(1)
        next_q_value = next_q_values.max(1)[0]
        expected_q_value = reward + self.gamma * next_q_value * (1 - done)
        loss = (q_value - expected_q_value.detach()).pow(2).mean()
        loss_print = loss.item()

        # Propagates the Loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.scheduler.step()

        # Soft-updates the target network towards the current model
        for target_param, local_param in zip(self.target_model.parameters(), self.current_model.parameters()):
            target_param.data.copy_(self.tau * local_param.data + (1.0 - self.tau) * target_param.data)

        return loss_print

    def save_checkpoints(self, batch_no):
        path = "./pre-trained/ddqn_dnn" + str(batch_no) + ".pth"
        torch.save({
            'epoch': batch_no,
            'current_state_dict': self.current_model.state_dict(),
            'target_state_dict': self.target_model.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'epsilon': self.current_model.epsilon
        }, path)

    def save_logs(self, batch_no, avg_reward, loss, wins):
        res = [
            str(batch_no),
            "\tAvg Reward: ", str(avg_reward),
            "\t Loss: ", str(loss),
            "\t Wins: ", str(wins),
            "\t Epsilon: ", str(self.current_model.epsilon)
        ]
        log_line = " ".join(res)
        print(log_line)
        self.log.write(log_line + "\n")
        self.log.flush()
def main(_):
    #env = gym.make("Frostbite-v0")
    env = gym.make("MsPacman-v0")
    n_s = env.observation_space.shape[0]
    n_a = env.action_space.n
    pre = Preprocessor()

    with tf.Session() as sess:
        dqn = DDQN(input_shape=[FLAGS.batch_size, 84, 84, 4], action_n=n_a, N=FLAGS.N)
        #dqn = DQN(input_shape=[FLAGS.batch_size, n_s], action_n=n_a)
        global_step = 0
        saver = tf.train.Saver()

        if FLAGS.restore and os.path.exists("./data/model.ckpt"):
            saver.restore(sess, "./data/model.ckpt")
            #Rs = np.loadtxt("R.csv", delimiter=',')
        else:
            sess.run(tf.global_variables_initializer())

        for episode in range(FLAGS.episode):
            obs = env.reset()
            #s = env.reset()
            pre.init(obs)
            done = False
            step = 0
            limit = env.spec.tags.get("wrapper_config.TimeLimit.max_episode_steps")
            s = pre.state

            while not done and step < limit:
                # epsilon decay
                epsilon = 1.0 if global_step < FLAGS.replay_start_size else \
                    max(FLAGS.min_epsilon, np.interp(
                        global_step, [0, FLAGS.decay], [1.0, FLAGS.min_epsilon]))

                # epsilon greedy
                if global_step < FLAGS.replay_start_size or np.random.rand() < epsilon:
                    a = env.action_space.sample()
                else:
                    a = dqn.greedy(s[np.newaxis], sess)

                obs, r, done, _ = env.step(a)
                s_ = pre.get_state(obs)
                #s_, r, done, _ = env.step(a)

                dqn.set_exp((s, a, r*FLAGS.reward_scale, done, s_))
                s = s_

                if global_step >= FLAGS.replay_start_size:
                    dqn.update(sess)
                    if global_step % FLAGS.sync_freq == 0:
                        dqn.update_target(sess)

                step += 1
                global_step += 1

            if FLAGS.save and episode % FLAGS.save_freq == 0:
                saver.save(sess, "./checkpoint/model.ckpt", global_step=global_step)

            # Evaluation
            if episode % FLAGS.eval == 0:
                obs = env.reset()
                pre.init(obs)
                done = False
                s = pre.state
                R = 0
                step = 0
                epsilon = 0.01

                while not done and step < limit:
                    if np.random.rand() < epsilon:
                        a = env.action_space.sample()
                    else:
                        a = dqn.greedy(s[np.newaxis], sess)

                    obs, r, done, _ = env.step(a)
                    s_ = pre.get_state(obs)
                    s = s_
                    if FLAGS.render:
                        env.render()
                    R += r
                    step += 1

                print("epoch:{}, step:{}, R:{}".format(episode, global_step, R))
                with open('R.csv', 'a') as f:
                    f.write("{},".format(R))

            gc.collect()
pre_train_steps = 10000
max_epLength = args.max_episode_length
# load previous saved model
load_model = args.load_model
# location of model
path = args.model_path
# rate to update target network
tau = 0.001
reward_exit_arena = args.exit_reward

tf.reset_default_graph()

# init DDQNs
n_actions = [env.action_space[i].n for i in range(env.n)]
state_sizes = [env.observation_space[i].shape[0] for i in range(env.n)]
mainQN = [DDQN(n_actions[i], state_sizes[i]) for i in range(env.n)]
targetQN = [DDQN(n_actions[i], state_sizes[i]) for i in range(env.n)]

# init tensorflow
init = tf.global_variables_initializer()
saver = tf.train.Saver()
trainables = tf.trainable_variables()
targetOps = updateTargetGraph(trainables, tau)

# assign experience buffer for each agent
experiences = [experience_buffer() for i in range(env.n)]

# chance of random actions
if args.testing:
    e = 0.1
    pre_train_steps = 0
print(env.observation_space)
print(env.observation_space.high)
print(env.observation_space.low)

# RL = DQN(s_dim=env.observation_space.shape[0],
#          a_dim=env.action_space.n,
#          learning_rate=0.01,
#          e_greedy=0.9,
#          replace_target_iter=100,
#          memory_size=2000,
#          e_greedy_increment=0.001)

RL = DDQN(s_dim=env.observation_space.shape[0],
          a_dim=env.action_space.n,
          learning_rate=0.001,
          e_greedy=0.9,
          replace_target_iter=300,
          memory_size=3000,
          e_greedy_increment=0.0002)

total_steps = 0
total_reward = []

for i_episode in range(15):
    s = env.reset()
    ep_r = 0
    while True:
        env.render()
        a = RL.choose_action(s)
        s_, r, done, info = env.step(a)
import gym

from dqn import DQN
from ddqn import DDQN
from dueling_ddqn import DuelingDDQN
from noisy_dqn import NoisyDQN
from categorical_dqn import CategoricalDQN
from rainbow import Rainbow
from utils.dqn_runner import vector_train
from utils.dqn_runner import evaluate

if __name__ == "__main__":
    env = gym.vector.make("CartPole-v1", num_envs=4, asynchronous=True)
    agent = DDQN(env.single_observation_space, env.single_action_space)
    returns = vector_train(agent, env, 50000, 450)

    eval_env = gym.make("CartPole-v1")
    evaluate(agent, eval_env, 1, True)