def __init__(self):
    """Build the training harness: CLI args, env pool, epsilon schedule, replay buffer, session."""
    cli_args = agent.parse_args()
    self.args = cli_args
    self.ep = EnvPool(cli_args.env, self.args.env_size)
    # Two-stage annealing: 1.0 -> 0.1 over the first 1e6 steps, then 0.1 -> 0.05
    # over the remaining steps up to 1e7. Second list element is a counter/index.
    schedule = MultiStageEpsilon([
        LinearAnnealEpsilon(1.0, 0.1, int(1e6)),
        LinearAnnealEpsilon(0.1, 0.05, int(1e7 - 1e6)),
    ])
    self.eps = [schedule, 0]
    self.replay = ReplayBuffer(cli_args.replay_buffer_size)
    # 84*84*4*2 + 8 ~= bytes per transition (two stacked frames + scalars) — rough estimate only.
    main_logger.info("Replay Buffer Max Size: {}B".format(
        pretty_num(cli_args.replay_buffer_size * (84 * 84 * 4 * 2 + 8), True)))
    self.sess = agent.make_session()
    self.sess.__enter__()  # keep the session's context open for this object's lifetime
    agent.setup(self.ep.action_num, self.replay)
    self.train_epi = 0
    self.max_reward = agent.score
def __init__(self, env, learning_rate=1e-3, seed=1234, gamma=0.99,
             max_eps=1.0, min_eps=0.1, render=False, print_freq=1,
             load_path=None, save_path=None, batch_size=32,
             log_dir='logs/train', max_steps=100000, buffer_capacity=None,
             max_episode_len=None, eps_decay_rate=-1e-4,
             target_update_freq=1000, ):
    """Set up a DQN learner for a gym-like env: networks, optimizer, buffer, summaries.

    Args:
        env: gym-like environment with a discrete action space.
        learning_rate: Adam step size for the Q network.
        seed: seed for TF and NumPy RNGs.
        gamma: reward discount factor.
        max_eps / min_eps / eps_decay_rate: exponential epsilon schedule.
        render: True to render during training.
        print_freq: log every this many episodes.
        load_path / save_path: optional weight checkpoint locations.
        batch_size: minibatch size for updates.
        log_dir: TensorBoard summary directory.
        max_steps: total environment steps to sample.
        buffer_capacity: replay buffer size.
        max_episode_len: episode cap; defaults to the env spec's limit.
        target_update_freq: steps between target-network syncs.
    """
    # Seed both frameworks before building the networks so initialization is reproducible.
    tf.random.set_seed(seed)
    np.random.seed(seed)

    # Training hyperparameters.
    self.gamma = gamma
    self.render = render
    self.batch_size = batch_size
    self.print_freq = print_freq
    self.q_lr = learning_rate
    self.max_eps = max_eps
    self.min_eps = min_eps
    self.eps_decay_rate = eps_decay_rate

    # Experience replay and step budget.
    self.buffer = ReplayBuffer(buffer_capacity)
    self.max_steps = max_steps
    self.target_update = target_update_freq

    # Online / target networks and optimizer.
    self.model = QNetwork(env.action_space.n, name='q_network')
    self.target = QNetwork(env.action_space.n, name='target_network')
    self.optimizer = tf.keras.optimizers.Adam(learning_rate=self.q_lr)
    self.summary_writer = tf.summary.create_file_writer(log_dir)

    # Environment bookkeeping.
    self.env = env
    self.max_episode_len = max_episode_len or self.env.spec.max_episode_steps
    self.rewards = []
    self.save_path = save_path
    if load_path is not None:
        self.model.load_weights(load_path)
def __init__(self, api, network_class, sess, save_path, history_size=15,
             restore_path=None, verbose=False, train=False, test=False):
    """Agent backed by a neural network plus a small replay buffer.

    Args:
        api: emulator API handle, forwarded to the base Agent.
        network_class: callable building the network (sess, save_path, ...).
        sess: TF session the network runs in.
        save_path: checkpoint save location for the network.
        history_size: number of previous board states stacked as input.
        restore_path: optional checkpoint to restore from.
        verbose: verbose flag for the base Agent.
        train: True to enable replay-buffer training.
        test: True for evaluation mode (no exploration, no training warm-up).
    """
    super(NeuralNetworkAgent, self).__init__(api, verbose=verbose)
    # currently 7500 w/ 1000

    # Network and experience store.
    self.network = network_class(sess, save_path,
                                 restore_path=restore_path,
                                 hist_size=history_size)
    self.replay_buffer = ReplayBuffer(max_size=2500)
    self.train = train
    self.history_size = history_size

    # Internal bookkeeping for the emulator loop.
    self.launched = False
    self.placed_move = False
    self.ctr = 0
    self.restart_game = 1
    self.game_restarted = True
    self.show_board = False
    self.last_move = -2
    self.start_state = np.zeros((20, 10, 1))
    self.possible_moves = [-1, 0, 6, 7]

    # In test mode: training is considered done, and exploration is disabled.
    self.training_begun = True if test else False
    self.epsilon = 0 if test else 1.
    self.decay = 0.999
    self.test = test
    self.prev_states = [self.start_state] * self.history_size
def __init__(self, state_size, action_size, seed):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        seed (int): random seed
    """
    self.state_size = state_size
    self.action_size = action_size
    # NOTE(review): random.seed() returns None, so self.seed is always None;
    # the call still seeds the global RNG as intended.
    self.seed = random.seed(seed)

    # Local (trained) and target (slow-moving) Q-Networks.
    self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
    self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
    self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

    # Replay memory.
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
    # Time step counter (for updating every UPDATE_EVERY steps).
    self.t_step = 0
plot_episode_rewards = [] # 이건 에피소드 받은 리워드 ( 에이전트 동안 받은 개별 리워드 다 더한 값) plot_episode_valid_steps = [] # 에피소드별 action 요청이 하나라도 들어온 step 카운트 plot_episode_count_requested_agent = np.asarray( [0] * N_AGENTS) # 에이전트별 요청받은 에이전트 대수 기록 plot_episode_requested_agents = np.asarray([0] * N_AGENTS) plot_count_per_actions = np.asarray([0] * N_ACTION) args = get_common_args() args = qmix_args(args) policy = QMIX(args) agents = Agents(args, policy) env = elevator.ElevatorEnv(SCREEN_WIDTH, SCREEN_HEIGHT, False) worker = RolloutWorker(env, agents, args) buffer = ReplayBuffer(args) plt.figure() plt.axis([0, args.n_epoch, 0, 100]) win_rates = [] episode_rewards = [] train_steps = 0 save_path = args.result_dir + '/' + current os.makedirs(save_path, exist_ok=True) for epoch in range(args.n_epoch): episodes = [] for e in range(args.n_episodes): episode, episode_reward, episode_count_per_actions, episode_episode_requested_agents, episode_episode_count_requested_agent = worker.generate_episode( e)
def main(args):
    """Constrained batch RL: alternate a best-response policy oracle with
    exponentiated-gradient updates of the constraint multipliers (lambdas)."""
    constraints = np.array([1,0])
    # NOTE(review): data files are loaded from hard-coded relative paths.
    train_data = pickle.load(open("paths.5.half.pkl", "rb"))
    train_data2 = [RLPath2(path, compute_g) for path in tqdm(train_data)]
    dataset = ReplayBuffer(10000000)
    for path in tqdm(train_data2):
        dataset.store(path)
    init_states = pickle.load(open("init_states606.pkl", "rb"))
    # The incoming `args` parameter is shadowed by a fixed experiment config.
    args = {
        "env" : "LunarLanderContinuous-v2",
        "train" : True,
        "test" : False,
        "max_iter" : 2,
        "test_episodes" : 1,
        "output_dir" : "output",
        "output_iters" : 10,
        "gpu" : "0",
        "visualize" : False
    }
    args = Namespace(**args)
    best_response_algorithm = BestResponse(args)
    # Exponentiated-gradient online learner over the constraint multipliers.
    lambda_bound = 30
    eta = 1
    starting_lambda = [1, 100]
    online_convex_algorithm = ExponentiatedGradient(
        lambda_bound, len(constraints), eta=eta, starting_lambda=starting_lambda)
    # Off-policy evaluation (FQE) settings.
    discount = 0.95
    state_size = 8
    action_size = 2
    lr = 0.001
    fqe_epochs = 100
    fqe_batches = 3
    fitted_off_policy_evaluation_algorithm = FittedQEvaluation(discount, state_size, action_size, lr, epochs=fqe_epochs, batches=fqe_batches)
    init_seed = 606
    num_paths = 2
    exact_policy_algorithm = ExactPolicyEvaluator(discount, init_seed, num_paths, compute_g)
    problem = OptProblem(constraints, dataset, init_states, best_response_algorithm, online_convex_algorithm, fitted_off_policy_evaluation_algorithm, exact_policy_algorithm, lambda_bound, max_iterations=10)
    lambdas = []
    policies = []
    iteration = 0
    while not problem.is_over():
        iteration += 1
        for i in range(1):
            print('*' * 20)
            print('Iteration %s, %s' % (iteration, i))
            if len(lambdas) == 0:
                # first iteration
                lambdas.append(online_convex_algorithm.get())
                print('lambda_{0}_{2} = {1}'.format(iteration, lambdas[-1], i))
            else:
                # all other iterations
                lambda_t = problem.online_algo()
                lambdas.append(lambda_t)
                print('lambda_{0}_{3} = online-algo(pi_{1}_{3}) = {2}'.format(iteration, iteration-1, lambdas[-1], i))
            lambda_t = lambdas[-1]
            pi_t = problem.best_response(lambda_t)
            values = []
            # policies.append(pi_t)
            problem.update(pi_t, values, iteration)
            # Evaluate C(pi_t), G(pi_t) and save
tf.summary.scalar('agent' + str(i) + '_reward_l100_mean', reward_100[i])
    for i in range(3)
]
# --- Evaluation run: restore pretrained MADDPG weights and roll out ---
sess = tf.Session()
sess.run(tf.global_variables_initializer())
# Sync all three agents' actor/critic target networks from their online networks.
sess.run([
    agent1_actor_target_init, agent1_critic_target_init,
    agent2_actor_target_init, agent2_critic_target_init,
    agent3_actor_target_init, agent3_critic_target_init
])
saver.restore(sess, './weight_single/210000.cptk')
summary_writer = tf.summary.FileWriter('./test_three_summary',
                                       graph=tf.get_default_graph())
agent1_memory = ReplayBuffer(100000)
agent2_memory = ReplayBuffer(100000)
agent3_memory = ReplayBuffer(100000)
e = 1
reward_100_list = [[], [], []]
for i in range(1000000):
    # Reset the environment every 1000 steps (episodic rollouts).
    if i % 1000 == 0:
        o_n = env.reset()
    agent1_action, agent2_action, agent3_action = get_agents_action(
        o_n, sess, noise_rate=0.1)
    env.render()
# --- Training run: MADDPG with three agents, summaries every 1000 steps ---
# Placeholder-ish variables used only to feed mean rewards into summaries.
reward_1000 = [tf.Variable(0, dtype=tf.float32) for i in range(3)]
reward_1000_op = [tf.summary.scalar('agent' + str(i) + '_reward_l1000_mean',
                                    reward_1000[i]) for i in range(3)]
config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = gpu_fraction
sess = tf.Session(config=config)
sess.run(tf.global_variables_initializer())
# Sync all three agents' actor/critic target networks from their online networks.
sess.run([agent1_actor_target_init, agent1_critic_target_init,
          agent2_actor_target_init, agent2_critic_target_init,
          agent3_actor_target_init, agent3_critic_target_init])
summary_writer = tf.summary.FileWriter('./three_summary',
                                       graph=tf.get_default_graph())
agent1_memory = ReplayBuffer(100000)
agent2_memory = ReplayBuffer(100000)
agent3_memory = ReplayBuffer(100000)
e = 1
reward_100_list = [[], [], []]
for i in range(1000000):
    # Every 1000 steps: reset the env and log each agent's mean recent reward.
    if i % 1000 == 0:
        o_n = env.reset()
        for agent_index in range(3):
            summary_writer.add_summary(
                sess.run(reward_1000_op[agent_index],
                         {reward_1000[agent_index]: np.mean(reward_100_list[agent_index])}),
                i // 1000)
    agent1_action, agent2_action, agent3_action = get_agents_action(o_n, sess, noise_rate=0.2)
    # Pack each agent's 2-D action into the env's 5-slot action format.
    a = [[0, i[0][0], 0, i[0][1], 0] for i in [agent1_action, agent2_action, agent3_action]]
def playGame():
    """Train (or test) a DQN variant on a Flappy-Bird-style emulator.

    Supports optional double DQN, dueling architecture, and prioritized
    replay, selected via command-line flags.
    """
    args = parse_args()
    # Near-greedy behavior in test mode.
    args.initial_eps = 0.0001 if args.test else args.initial_eps
    if args.double:
        save_dir = "02DoubleDQN/" if not args.dueling else "02DoubleDuelingDQN/"
    else:
        save_dir = "01DQN/" if not args.dueling else "01DuelingDQN/"
    print("double:{}, dueling:{}, prioritized:{}\n".format(
        args.double, args.dueling, args.prioritized))
    sess = tf.InteractiveSession()
    # placeholders
    s = tf.placeholder("float", [None, 80, 80, 4], name="state")
    target = tf.placeholder("float", [None], name="target")
    action = tf.placeholder("float", [None, args.n_actions],
                            name="action")  # actions taken: [0, 1] or [1, 0]
    # -----dueling---------
    q_func = model(s, args.n_actions, scope="q_func") if not args.dueling else dueling_model(
        s, args.n_actions, scope="q_func")
    # -----dueling---------
    # -----double---------
    if args.double:
        q_func_vars = scope_vars("q_func")
        # target q network evaluation
        q_target = model(
            s, args.n_actions, scope="q_target") if not args.dueling else dueling_model(
                s, args.n_actions, scope="q_target")
        q_target_vars = scope_vars("q_target")
    # -----double---------
    # define the cost function
    readout_action = tf.reduce_sum(tf.multiply(q_func, action), axis=1)
    td_errors = target - readout_action
    cost = tf.reduce_mean(tf.square(td_errors))
    train_step = tf.train.AdamOptimizer(args.lr).minimize(cost)
    # open up a game state to communicate with emulator
    game_state = game.GameState()
    # -----prioritized replay---------
    # initialize replay memory
    if args.prioritized:
        replay_buffer = PrioritizedReplayBuffer(args.replay_buffer_size,
                                                alpha=args.prioritized_alpha)
        beta_schedule = LinearSchedule(args.prioritized_beta_iter,
                                       initial_p=args.prioritized_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(args.replay_buffer_size)
    # -----prioritized replay---------
    '''
    printing
    a_file = open("logs_" + args.game + "/readout.txt", 'w')
    h_file = open("logs_" + args.game + "/hidden.txt", 'w')
    '''
    # get the first state by doing nothing and preprocess the image to 80x80x4
    do_nothing = np.zeros(args.n_actions)
    do_nothing[0] = 1
    x_t, r_0, terminal = game_state.frame_step(do_nothing)
    x_t = cv2.cvtColor(cv2.resize(x_t, (80, 80)), cv2.COLOR_BGR2GRAY)
    ret, x_t = cv2.threshold(x_t, 1, 255, cv2.THRESH_BINARY)
    s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)  # s_t : 80 * 80 * 4
    # load networks
    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())
    checkpoint = tf.train.get_checkpoint_state("saved_networks/" + save_dir)
    already_trained = 0
    if checkpoint and checkpoint.model_checkpoint_path:
        # Recover the step count from the checkpoint filename ('...dqn-<step>').
        already_trained = checkpoint.model_checkpoint_path
        already_trained = int(already_trained[already_trained.find('dqn-') + 4:])
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("Successfully loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old network weights")
    # start training
    EpsilonSchedule = LinearSchedule(args.explore, args.final_eps, args.initial_eps)
    t = already_trained
    epsilon = EpsilonSchedule.value(t)
    while "flappy bird" != "angry bird":  # intentionally infinite loop
        #-----double---------
        # whether update q_target
        if args.double and t % args.target_update_freq == 0:
            sess.run(update_target(q_func_vars, q_target_vars))
        # -----double---------
        # choose an action epsilon greedily
        Q_t = q_func.eval(feed_dict={s: [s_t]})[0]
        a_t = np.zeros([args.n_actions])
        action_index = 0
        if t % args.frame_per_action == 0:
            action_index = random.randrange(
                args.n_actions) if random.random() < epsilon else np.argmax(
                    Q_t)
        a_t[action_index] = 1
        # run the selected action and observe next state and reward
        x_t1_colored, r_t, terminal = game_state.frame_step(a_t)
        s_t1 = preprocess(s_t, x_t1_colored)
        # store the transition in D
        replay_buffer.add(s_t, a_t, r_t, s_t1, terminal)
        # only scale down epsilon if done observing
        if t > args.observe:
            epsilon = EpsilonSchedule.value(t - args.observe)
        # only train if done observing
        if t > args.observe + already_trained:
            # -----prioritized replay---------
            # sample a minibatch to train on
            if args.prioritized:
                experience = replay_buffer.sample(
                    args.batch_size,
                    beta=beta_schedule.value(t - args.observe - already_trained))
                (s_j_batch, a_batch, r_batch, s_j1_batch, done_batch, weights,
                 batch_idxes) = experience
            else:
                s_j_batch, a_batch, r_batch, s_j1_batch, done_batch = replay_buffer.sample(
                    args.batch_size)
            # -----prioritized replay---------
            target_batch = []
            # -----double---------
            Q_j1_batch = q_target.eval(
                feed_dict={s: s_j1_batch}) if args.double else q_func.eval(
                    feed_dict={s: s_j1_batch})
            # -----double---------
            for i in range(0, args.batch_size):
                terminal = done_batch[i]
                # if terminal, only equals reward
                if terminal:
                    target_batch.append(r_batch[i])
                else:
                    target_batch.append(r_batch[i] + args.gamma * np.max(Q_j1_batch[i]))
            # -----prioritized replay---------
            if args.prioritized:
                # Refresh priorities with the new absolute TD errors.
                td_errs = td_errors.eval(feed_dict={
                    target: target_batch,
                    action: a_batch,
                    s: s_j_batch
                })
                new_priorities = np.abs(td_errs) + args.prioritized_eps
                replay_buffer.update_priorities(batch_idxes, new_priorities)
            # -----prioritized replay---------
            # perform gradient step
            train_step.run(feed_dict={
                target: target_batch,
                action: a_batch,
                s: s_j_batch
            })
        # update the old values
        s_t = s_t1
        t += 1
        # save
        if t % args.save_freq == 0:
            saver.save(sess,
                       "saved_networks/" + save_dir + args.game + '-dqn',
                       global_step=t)
        # display
        if t <= args.observe:
            state = "observe"
        elif t > args.observe and t <= args.observe + args.explore:
            state = "explore"
        else:
            state = "train"
        info_expr = 'TIMESTEP:{}, STATE:{}, EPSILON:{:6f}, ACTION{}, REWARD:{}, Q_MAX:{}'
        print(
            info_expr.format(t, state, epsilon, action_index, r_t, np.max(Q_t)))
        # write info to files
        '''
for i in range(num_agents)
]
# --- Multi-agent training setup (variable agent count), summaries every 100 episodes ---
config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = gpu_fraction
sess = tf.Session(config=config)
sess.run(tf.global_variables_initializer())
# Sync every agent's actor/critic target networks from its online networks.
sess.run(
    [agent_actor_target_init_list[:], agent_critic_target_init_list[:]])
summary_writer = tf.summary.FileWriter('./VUE_summary',
                                       graph=tf.get_default_graph())
# One replay buffer per agent.
for i in range(num_agents):
    mem = ReplayBuffer(10000)
    memory.append(mem)
# for every 100 step, check the rewards
reward_100_list = np.zeros([100, 1], dtype=float)
sum_r = 0.
for i in range(1, Episode + 1):
    print(str(i) + "번째 에피소드 시작..")
    # Every 100th episode: reset the env and log each agent's mean recent reward.
    if i % 100 == 0:
        print(str(i) + "번째 에피소드. 환경 리셋.(100 배수)")
        o_n = env.reset()
        for agent_index in range(num_agents):
            summary_writer.add_summary(
                sess.run(
                    reward_100_op[agent_index],
                    {
                        reward_100[agent_index]:
class TrainDQN:
    """Deep Q-learning trainer for a gym-like environment (TF1 graph mode)."""

    def __init__(self, env, sess,
                 learning_rate=1e-3,
                 seed=1234,
                 gamma=0.99,
                 max_eps=1.0,
                 min_eps=0.1,
                 render=False,
                 print_freq=20,
                 load_path=None,
                 save_path=None,
                 batch_size=32,
                 log_dir='logs/train',
                 max_steps=100000,
                 buffer_capacity=None,
                 max_episode_len=2000,
                 eps_decay_rate=-0.0001,
                 target_update_freq=1000,
                 ):
        """Trains an openai gym-like environment with deep q learning.

        Args:
            env: gym.Env where our agent resides
            seed: Random seed for reproducibility
            gamma: Discount factor
            max_eps: Starting exploration factor
            min_eps: Exploration factor to decay towards
            max_episode_len: Maximum length of an individual episode
            render: True to render the environment, else False
            print_freq: Displays logging information every 'print_freq' episodes
            load_path: (str) Path to load existing model from
            save_path: (str) Path to save model during training
            max_steps: maximum number of times to sample the environment
            buffer_capacity: How many state, action, next state, reward
                tuples the replay buffer should store
            max_episode_len: Maximum number of timesteps in an episode
            eps_decay_rate: lambda parameter in exponential decay for epsilon
            target_update_fraction: Fraction of max_steps update the target
                network
        """
        np.random.seed(seed)
        self.sess = sess
        self.env = env
        self.input_dim = env.observation_space.shape[0]
        self.output_dim = env.action_space.n
        self.max_steps = max_steps
        self.max_eps = max_eps
        self.min_eps = min_eps
        self.eps_decay_rate = eps_decay_rate
        self.max_episode_len = max_episode_len
        self.render = render
        self.print_freq = print_freq
        self.rewards = []
        self.metrics = []
        self.save_path = save_path
        self.load_path = load_path
        self.batch_size = batch_size
        self.num_updates = 0
        self.gamma = gamma
        # Buffer defaults to half the step budget when no capacity is given.
        self.buffer = ReplayBuffer(capacity=max_steps // 2 if buffer_capacity is None else buffer_capacity)
        self.target_update_freq = target_update_freq
        self.learning_rate = learning_rate
        with tf.variable_scope('q_network'):
            self.q_network = QNetworkBuilder(self.input_dim, self.output_dim,
                                             (64,))
        with tf.variable_scope('target_network'):
            self.target_network = QNetworkBuilder(self.input_dim, self.output_dim,
                                                  (64,))
        # Ops copying online-network weights into the target network.
        self.update_target_network = [old.assign(new) for (new, old) in
                                      zip(tf.trainable_variables('q_network'),
                                          tf.trainable_variables('target_network'))]
        if self.load_path is not None:
            self.load()
        self.add_summaries(log_dir)

    def add_summaries(self, log_dir):
        # Scalar summaries for loss and mean predicted value, merged into one op.
        tf.summary.scalar('Loss', self.q_network.loss, )
        tf.summary.scalar('Mean Estimated Value',
                          tf.reduce_mean(self.q_network.output_pred))
        # Merge all the summaries and write them out to log_dir
        self.merged = tf.summary.merge_all()
        self.train_writer = tf.summary.FileWriter(log_dir, self.sess.graph)

    def learn(self):
        """Learns via Deep-Q-Networks (DQN)"""
        obs = self.env.reset()
        mean_reward = None
        total_reward = 0
        ep = 0
        ep_len = 0
        rand_actions = 0
        for t in range(self.max_steps):
            # weight decay from https://jaromiru.com/2016/10/03/lets-make-a-dqn-implementation/
            eps = self.min_eps + (self.max_eps - self.min_eps) * np.exp(
                self.eps_decay_rate * t)
            if self.render:
                self.env.render()
            # Take exploratory action with probability epsilon
            if np.random.uniform() < eps:
                action = self.env.action_space.sample()
                rand_actions += 1
            else:
                action = self.act(obs)
            # Execute action in emulator and observe reward and next state
            new_obs, reward, done, info = self.env.step(action)
            total_reward += reward
            # Store transition s_t, a_t, r_t, s_t+1 in replay buffer
            self.buffer.add((obs, action, reward, new_obs, done))
            # Perform learning step
            self.update()
            obs = new_obs
            ep_len += 1
            if done or ep_len >= self.max_episode_len:
                # print("Episode Length:", ep_len)
                # print(f"Episode {ep} Reward:{total_reward}")
                # print(f"Random Action Percent: {rand_actions/ep_len}")
                ep += 1
                ep_len = 0
                rand_actions = 0
                self.rewards.append(total_reward)
                total_reward = 0
                obs = self.env.reset()
                if ep % self.print_freq == 0 and ep > 0:
                    new_mean_reward = np.mean(self.rewards[-self.print_freq - 1:])
                    print(f"-------------------------------------------------------")
                    print(f"Mean {self.print_freq} Episode Reward: {new_mean_reward}")
                    print(f"Exploration fraction: {eps}")
                    print(f"Total Episodes: {ep}")
                    print(f"Total timesteps: {t}")
                    print(f"-------------------------------------------------------")
                    # Add reward summary
                    summary = tf.Summary()
                    summary.value.add(tag=f'Mean {self.print_freq} Episode Reward',
                                      simple_value=new_mean_reward)
                    summary.value.add(tag=f'Epsilon', simple_value=eps)
                    self.train_writer.add_summary(summary, self.num_updates)
                    # Model saving inspired by Open AI Baseline implementation
                    if (mean_reward is None or new_mean_reward >= mean_reward) and self.save_path is not None:
                        print(f"Saving model due to mean reward increase:{mean_reward} -> {new_mean_reward}")
                        print(f'Location: {self.save_path}')
                        # save_path = f"{self.save_path}_model"
                        self.save()
                        mean_reward = new_mean_reward

    def act(self, observation):
        """Takes an action given the observation.

        Args:
            observation: observation from the environment

        Returns:
            integer index of the selected action
        """
        pred = self.sess.run([self.q_network.output_pred],
                             feed_dict={self.q_network.input_ph:
                                        np.reshape(observation, (1, self.input_dim))})
        return np.argmax(pred)

    def update(self):
        """Applies gradients to the Q network computed from a minibatch of self.batch_size."""
        if self.batch_size <= self.buffer.size():
            self.num_updates += 1
            # Update the Q network with model parameters from the target network
            if self.num_updates % self.target_update_freq == 0:
                self.sess.run(self.update_target_network)
                print('Updated Target Network')
            # Sample random minibatch of transitions from the replay buffer
            sample = self.buffer.sample(self.batch_size)
            states, action, reward, next_states, done = sample
            # Calculate discounted predictions for the subsequent states using target network
            next_state_pred = self.gamma * self.sess.run(
                self.target_network.output_pred,
                feed_dict={self.target_network.input_ph: next_states}, )
            # Adjust the targets for non-terminal states
            reward = reward.reshape(len(reward), 1)
            targets = reward
            loc = np.argwhere(done != True).flatten()
            if len(loc) > 0:
                max_q = np.amax(next_state_pred, axis=1)
                targets[loc] = np.add(
                    targets[loc],
                    max_q[loc].reshape(max_q[loc].shape[0], 1),
                    casting='unsafe')
            # Update discount factor and train model on batch
            _, loss = self.sess.run([self.q_network.opt, self.q_network.loss],
                                    feed_dict={self.q_network.input_ph: states,
                                               self.q_network.target_ph: targets.flatten(),
                                               self.q_network.action_indices_ph: action})

    def save(self):
        """Saves the Q network."""
        self.q_network.saver.save(self.sess, self.save_path)

    def load(self):
        """Loads the Q network."""
        self.q_network.saver.restore(self.sess, self.save_path)

    def plot_rewards(self, path=None):
        """Plots rewards per episode.

        Args:
            path: Location to save the rewards plot. If None, image will be
                displayed with plt.show()
        """
        plt.plot(self.rewards)
        plt.xlabel('Episode')
        plt.ylabel('Reward')
        if path is None:
            plt.show()
        else:
            plt.savefig(path)
            plt.close('all')
reward_1000_op = [tf.summary.scalar('agent' + str(i) + '_reward_l1000_mean',
                                    reward_1000[i]) for i in range(3)]
config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = gpu_fraction
sess = tf.Session(config=config)
# NOTE(review): this second tf.Session() immediately overwrites the configured
# session created just above, so the GPU memory-fraction setting is silently
# discarded and the first session is leaked. One of these two lines is almost
# certainly unintended — confirm which session was meant to be used.
sess = tf.Session()
sess.run(tf.global_variables_initializer())
# Sync all three agents' actor/critic target networks from their online networks.
sess.run([agent1_actor_target_init, agent1_critic_target_init,
          agent2_actor_target_init, agent2_critic_target_init,
          agent3_actor_target_init, agent3_critic_target_init])
summary_writer = tf.summary.FileWriter('./three_ma_summary',
                                       graph=tf.get_default_graph())
agent1_memory = ReplayBuffer(100000)
agent2_memory = ReplayBuffer(100000)
agent3_memory = ReplayBuffer(100000)
# e = 1
reward_100_list = [[], [], []]
for i in range(1000000):
    # Every 1000 steps: reset the env and log each agent's mean recent reward.
    if i % 1000 == 0:
        o_n = env.reset()
        for agent_index in range(3):
            summary_writer.add_summary(
                sess.run(reward_1000_op[agent_index],
                         {reward_1000[agent_index]: np.mean(reward_100_list[agent_index])}),
                i // 1000)
    agent1_action, agent2_action, agent3_action = get_agents_action(o_n, sess, noise_rate=0.2)
    # Pack each agent's 2-D action into the env's 5-slot action format.
    a = [[0, i[0][0], 0, i[0][1], 0] for i in [agent1_action, agent2_action, agent3_action]]
class NeuralNetworkAgent(Agent):
    """NES Tetris agent (Python 2) that learns move selection with a neural
    network, experience replay, and epsilon-greedy exploration."""

    def __init__(self, api, network_class, sess, save_path, history_size=15,
                 restore_path=None, verbose=False, train=False, test=False):
        super(NeuralNetworkAgent, self).__init__(api, verbose=verbose)
        # currently 7500 w/ 1000
        # Network
        self.network = network_class(sess, save_path,
                                     restore_path=restore_path,
                                     hist_size=history_size)
        self.replay_buffer = ReplayBuffer(max_size=2500)
        self.train = train
        self.history_size = history_size
        # Internal
        self.launched = False
        self.placed_move = False
        self.ctr = 0
        self.restart_game = 1
        self.game_restarted = True
        self.show_board = False
        self.last_move = -2
        self.start_state = np.zeros((20, 10, 1))
        self.possible_moves = [-1, 0, 6, 7]
        # Test mode disables exploration and skips the training warm-up.
        self.training_begun = False if not test else True
        self.epsilon = 1. if not test else 0
        self.decay = 0.999
        self.test = test
        self.prev_states = [self.start_state] * self.history_size

    def _controller_listener(self):
        # 0x0042 = current piece id, 0x0048 = game state (emulator RAM addresses).
        piece_id = self.api.peekCPU(0x0042)
        game_state = self.api.peekCPU(0x0048)
        if piece_id != 19 and game_state == 1:
            # Train
            if self.train and self.replay_buffer.size(
            ) > 250 and not self.test:
                batch = self.replay_buffer.sample(batch_sz=250)
                self.network.train(batch)
                self.training_begun = True
                # Decay exploration, floored at 1%.
                self.epsilon *= self.decay
                if self.epsilon < 0.010:
                    self.epsilon = 0.010
            if not self.placed_move:  # and (random_move >= 0 or self.restart_game > 0):
                # os.system('clear')
                print '--------------'
                is_random = False
                move = None
                # Epsilon-greedy: random move until training has begun, then network prediction.
                if np.random.random() < self.epsilon or not self.training_begun:
                    move = np.random.choice(self.possible_moves)
                    is_random = True
                else:
                    tensor = np.dstack([self.grid] + self.prev_states)
                    pred = self.network.predict(tensor)[0]
                    move = self.possible_moves[pred]
                if self.restart_game > 0:
                    # Press start to restart; -2 marks "no gameplay move".
                    self.api.writeGamepad(0, 3, True)
                    self.restart_game -= 1
                    move = -2
                else:
                    if move >= 0:
                        self.api.writeGamepad(0, move, True)
                self.placed_move = True
                self.show_board = True
                if self.last_move != -2 and piece_id != 19:
                    print 'Random:', is_random
                    S = self.grid.copy()
                    self._update_board(self.api.peekCPU(0x0042))
                    board = self._simulate_piece_drop(self.api.peekCPU(0x0042))
                    n_empty = self._count_empty(self.grid)
                    n_holes = self._count_holes(self.grid)
                    height = self._count_height(board)
                    levelness = self._determine_levelness(board)
                    A = self.last_move
                    # R = self._count_total() + self._get_score() - n_empty
                    #R = (-50 * height) + (-20 * n_holes) + (self._get_score())
                    # Reward shaping: keep the stack low, avoid holes, stay level.
                    if height <= 2:
                        R = 1000
                    else:
                        R = -200 * height
                    R += -20 * n_holes + 10 * levelness  # 10 * self._get_score()
                    SP = self.grid.copy()
                    self.prev_states.insert(0, S)
                    print np.dstack(self.prev_states).shape
                    self.replay_buffer.add(
                        np.dstack(self.prev_states),
                        self.possible_moves.index(A), R,
                        np.dstack([SP] + self.prev_states[:self.history_size]))
                    self.prev_states = self.prev_states[:self.history_size]
                    print self.epsilon
                    self._print_transition(S, A, board, R)
                self.last_move = move
        else:
            self.placed_move = False

    def _frame_render_finished(self):
        """
        Renders the board the the current piece

        TODO: do this lazily, so we aren't calling read too often O_o
        """
        # To make things easier, we're going to modify the next piece drop
        # Always drop a certain type of block (currently square).
        self.api.writeCPU(0x00bf, 0x0a)
        piece_id = self.api.peekCPU(0x0042)
        game_state = self.api.peekCPU(0x0048)
        # Restart the game
        if piece_id == 19 and (game_state == 10 or game_state == 0):
            self.prev_states = [self.start_state] * self.history_size
            self.game_restarted = True
            self.restart_game = 1
            return
        # Probably a line clear... Skip
        if piece_id == 19 and game_state != 1:
            return

    def _piece_update(self, access_type, address, value):
        """
        Can be used to control the piece being dropped
        """
        if self.api.readCPU(0x0048) == 1:
            return 0x0a
        return value

    def agent_name(self):
        return 'NeuralNetworkAgent'
class DQN:
    """Deep Q-learning agent for a gym-like environment (TF2 eager/function mode)."""

    def __init__(self, env, learning_rate=1e-3, seed=1234, gamma=0.99,
                 max_eps=1.0, min_eps=0.1, render=False, print_freq=1,
                 load_path=None, save_path=None, batch_size=32,
                 log_dir='logs/train', max_steps=100000,
                 buffer_capacity=None, max_episode_len=None,
                 eps_decay_rate=-1e-4, target_update_freq=1000, ):
        # Seed TF and NumPy for reproducibility before building networks.
        tf.random.set_seed(seed)
        np.random.seed(seed)
        self.gamma = gamma
        self.render = render
        self.batch_size = batch_size
        self.print_freq = print_freq
        self.q_lr = learning_rate
        self.max_eps = max_eps
        self.min_eps = min_eps
        self.eps_decay_rate = eps_decay_rate
        self.buffer = ReplayBuffer(buffer_capacity)
        self.max_steps = max_steps
        self.target_update = target_update_freq
        # Online and target Q networks.
        self.model = QNetwork(env.action_space.n, name='q_network')
        self.target = QNetwork(env.action_space.n, name='target_network')
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=self.q_lr)
        self.summary_writer = tf.summary.create_file_writer(log_dir)
        self.env = env
        # Fall back to the env spec's episode cap when none is given.
        self.max_episode_len = max_episode_len if max_episode_len else self.env.spec.max_episode_steps
        self.rewards = []
        self.save_path = save_path
        if load_path is not None:
            self.model.load_weights(load_path)

    def act(self, state):
        # Greedy action w.r.t. the online network.
        return np.argmax(self.model(state))

    @tf.function
    def train_step(self, states, indices, targets):
        """
        Performs a single step of gradient descent on the Q network

        Args:
            states: numpy array of states with shape (batch size, state dim)
            indices: list indices of the selected actions
            targets: targets for computing the MSE loss
        """
        with tf.GradientTape() as tape:
            action_values = tf.gather_nd(self.model(states), indices)
            mse_loss = tf.keras.losses.MeanSquaredError()(action_values, targets)
        gradients = tape.gradient(mse_loss, self.model.trainable_variables)
        self.optimizer.apply_gradients(
            zip(gradients, self.model.trainable_variables))
        # Log training information
        with self.summary_writer.as_default():
            tf.summary.scalar('MSE Loss', mse_loss,
                              step=self.optimizer.iterations)
            tf.summary.scalar('Estimated Q Value',
                              tf.reduce_mean(action_values),
                              step=self.optimizer.iterations)

    def update(self):
        """
        Computes the target for the MSE loss and calls the tf.function for
        gradient descent
        """
        if len(self.buffer) >= self.batch_size:
            # Sample random minibatch of N transitions
            states, actions, rewards, next_states, dones = self.buffer.sample(
                self.batch_size)
            # Adjust the targets for non-terminal states
            next_state_pred = self.target(next_states)
            targets = rewards + self.gamma * next_state_pred.numpy().max(
                axis=1) * (1 - dones)
            batch_range = tf.range(start=0, limit=actions.shape[0])
            indices = tf.stack((batch_range, actions), axis=1)
            # update critic by minimizing the MSE loss
            self.train_step(states, indices, targets)

    def learn(self):
        """Learns via Deep-Q-Networks (DQN)"""
        obs = self.env.reset()
        total_reward = 0
        ep = 0
        ep_len = 0
        rand_actions = 0
        mean_reward = None
        for t in range(self.max_steps):
            # Periodically sync target network from the online network.
            if t % self.target_update == 0:
                copy_weights(self.model.variables, self.target.variables)
            # weight decay from https://jaromiru.com/2016/10/03/lets-make-a-dqn-implementation/
            eps = self.min_eps + (self.max_eps - self.min_eps) * np.exp(
                self.eps_decay_rate * t)
            if self.render:
                self.env.render()
            # Take exploratory action with probability epsilon
            if np.random.uniform() < eps:
                action = self.env.action_space.sample()
                rand_actions += 1
            else:
                action = self.act(np.expand_dims(obs, axis=0))
            # Execute action in emulator and observe reward and next state
            new_obs, reward, done, info = self.env.step(action)
            total_reward += reward
            # Store transition s_t, a_t, r_t, s_t+1 in replay buffer
            self.buffer.add((obs, action, reward, new_obs, done))
            # Perform learning step
            self.update()
            obs = new_obs
            ep_len += 1
            if done or ep_len >= self.max_episode_len:
                with self.summary_writer.as_default():
                    ep += 1
                    self.rewards.append(total_reward)
                    total_reward = 0
                    obs = self.env.reset()
                    if ep % self.print_freq == 0 and ep > 0:
                        new_mean_reward = np.mean(
                            self.rewards[-self.print_freq - 1:])
                        print(
                            f"-------------------------------------------------------"
                        )
                        print(
                            f"Mean {self.print_freq} Episode Reward: {new_mean_reward}"
                        )
                        print(f"Exploration fraction: {rand_actions / ep_len}")
                        print(f"Total Episodes: {ep}")
                        print(f"Total timesteps: {t}")
                        print(
                            f"-------------------------------------------------------"
                        )
                        tf.summary.scalar(
                            f'Mean {self.print_freq} Episode Reward',
                            new_mean_reward, step=t)
                        tf.summary.scalar(f'Epsilon', eps, step=t)
                        # Model saving inspired by Open AI Baseline implementation
                        if (mean_reward is None or new_mean_reward >= mean_reward) and self.save_path is not None:
                            print(
                                f"Saving model due to mean reward increase:{mean_reward} -> {new_mean_reward}"
                            )
                            print(f'Location: {self.save_path}')
                            mean_reward = new_mean_reward
                            self.model.save_weights(self.save_path)
                ep_len = 0
                rand_actions = 0
def play(train_indicator):
    """Run DDPG episodes against the TORCS environment.

    Args:
        train_indicator: truthy to train the actor/critic each step,
            falsy to only drive with the current policy.

    Side effects: creates a TF session, loads/saves network weights under
    data/, and appends per-episode statistics to data/train_stat.txt.
    """
    buffer_size = 100000
    batch_size = 32
    gamma = 0.99  # discount factor
    tau = 0.001  # Target Network HyperParameter
    lra = 0.0001  # Learning rate for Actor
    lrc = 0.001  # Learning rate for Critic
    ou_sigma = 0.3
    action_dim = 1  # Steering angle
    state_dim = 21  # num of sensors input
    episodes_num = 2000
    max_steps = 100000
    step = 0

    train_stat_file = "data/train_stat.txt"
    actor_weights_file = "data/actor.h5"
    critic_weights_file = "data/critic.h5"

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    tf_session = tf.Session(config=config)
    keras_backend.set_session(tf_session)

    actor = ActorNetwork(tf_session=tf_session, state_size=state_dim,
                         action_size=action_dim, hidden_units=(300, 600),
                         tau=tau, lr=lra)
    critic = CriticNetwork(tf_session=tf_session, state_size=state_dim,
                           action_size=action_dim, hidden_units=(300, 600),
                           tau=tau, lr=lrc)
    buffer = ReplayBuffer(buffer_size)
    # noise function for exploration
    ou = OrnsteinUhlenbeckActionNoise(mu=np.zeros(action_dim),
                                      sigma=ou_sigma * np.ones(action_dim))
    # Torcs environment - throttle and gear change controlled by client
    env = TorcsEnv(vision=False, throttle=False, gear_change=False)

    try:
        actor.model.load_weights(actor_weights_file)
        critic.model.load_weights(critic_weights_file)
        actor.target_model.load_weights(actor_weights_file)
        critic.target_model.load_weights(critic_weights_file)
        print("Weights loaded successfully")
    # Fixed: was a bare `except:`, which also swallowed KeyboardInterrupt
    # and SystemExit. Missing/corrupt weight files are expected on a fresh
    # run, so a best-effort load is kept.
    except Exception:
        print("Cannot load weights")

    try:
        for i in range(episodes_num):
            print("Episode : %s Replay buffer %s" % (i, len(buffer)))

            if i % 3 == 0:
                ob = env.reset(
                    relaunch=True
                )  # relaunch TORCS every 3 episode because of the memory leak error
            else:
                ob = env.reset()

            # 21 len state dimensions - https://arxiv.org/abs/1304.1672
            state = np.hstack((ob.angle, ob.track, ob.trackPos))
            total_reward = 0.

            for j in range(max_steps):
                loss = 0
                # predict and add noise
                action_predicted = actor.model.predict(
                    state.reshape(1, state.shape[0])) + ou()

                observation, reward, done, info = env.step(action_predicted[0])
                state1 = np.hstack(
                    (observation.angle, observation.track, observation.trackPos))
                buffer.add((state, action_predicted[0], reward, state1,
                            done))  # add replay buffer

                # batch update
                batch = buffer.get_batch(batch_size)
                states = np.asarray([e[0] for e in batch])
                actions = np.asarray([e[1] for e in batch])
                rewards = np.asarray([e[2] for e in batch])
                new_states = np.asarray([e[3] for e in batch])
                dones = np.asarray([e[4] for e in batch])

                # Vectorized TD target: y = r + gamma * Q'(s', mu'(s')) for
                # non-terminal s', y = r at terminals. (Replaces the old
                # per-sample loop that also oddly seeded y_t from the actions.)
                target_q_values = critic.target_model.predict(
                    [new_states, actor.target_model.predict(new_states)])
                y_t = rewards.reshape(-1, 1) + gamma * target_q_values * (
                    1 - dones.reshape(-1, 1))

                if train_indicator:
                    loss += critic.model.train_on_batch([states, actions], y_t)
                    # Deterministic policy gradient: d Q/d a evaluated at the
                    # actor's own actions, then backpropagated into the actor.
                    a_for_grad = actor.model.predict(states)
                    grads = critic.get_gradients(states, a_for_grad)
                    actor.train(states, grads)
                    actor.train_target_model()
                    critic.train_target_model()

                total_reward += reward
                state = state1

                print("Episode %s - Step %s - Action %s - Reward %s" %
                      (i, step, action_predicted[0][0], reward))

                step += 1
                if done:
                    break

            if i % 3 == 0 and train_indicator:
                print("Saving weights...")
                actor.model.save_weights(actor_weights_file, overwrite=True)
                critic.model.save_weights(critic_weights_file, overwrite=True)

            tm = time.strftime("%Y-%m-%d %H:%M:%S")
            episode_stat = "%s -th Episode. %s total steps. Total reward: %s. Time %s" % (
                i, step, total_reward, tm)
            print(episode_stat)
            with open(train_stat_file, "a") as outfile:
                outfile.write(episode_stat + "\n")
    finally:
        # Always shut down the external TORCS process, even if an episode
        # raises (previously env.end() was skipped on any exception).
        env.end()
class Game(object):
    """Training driver wiring an agent, an environment pool and a replay buffer.

    Lifecycle (see run()): prefill the buffer with random play, then alternate
    train/evaluate iterations, checkpointing the agent on new best evaluation
    reward.
    """

    def __init__(self):
        self.args = args = agent.parse_args()
        self.ep = EnvPool(args.env, self.args.env_size)
        # eps[0]: two-stage annealed exploration schedule used while training;
        # eps[1]: constant 0 epsilon, i.e. greedy play, used for evaluation.
        self.eps = [
            MultiStageEpsilon([
                LinearAnnealEpsilon(1.0, 0.1, int(1e6)),
                LinearAnnealEpsilon(0.1, 0.05, int(1e7 - 1e6))
            ]), 0
        ]
        self.replay = ReplayBuffer(args.replay_buffer_size)
        # 84*84*4*2 + 8: rough per-transition byte cost (two stacked frames
        # plus scalars) — presumably Atari-style 84x84x4 observations; verify.
        main_logger.info("Replay Buffer Max Size: {}B".format(
            pretty_num(args.replay_buffer_size * (84 * 84 * 4 * 2 + 8), True)))
        self.sess = agent.make_session()
        self.sess.__enter__()
        agent.setup(self.ep.action_num, self.replay)
        self.train_epi = 0
        self.max_reward = agent.score

    def _store(self, obs, action, reward, done, obs_):
        """Store one batched transition (all envs in the pool) in the replay buffer."""
        # Fixed: was a list comprehension executed purely for its side
        # effects, which builds and throws away a list of Nones.
        for i in range(self.ep.size):
            self.replay.add(obs[i], action[i], reward[i], float(done[i]),
                            obs_[i])

    def random(self):
        """Prefill half the replay buffer with uniformly random actions."""
        random_step = self.args.replay_buffer_size // 2
        obs = self.ep.reset()
        with tqdm(total=random_step, desc="random", ascii=True) as t:
            while t.n < random_step:
                action, (obs_, reward, done, info) = self.ep.random()
                self._store(obs, action, reward, done, obs_)
                obs, info = self.ep.auto_reset()
                t.update(self.ep.size)
        total_epi = sum(len(info[i]['rewards']) for i in range(self.ep.size))
        mean_reward = np.mean([
            np.mean(info[i]['rewards']) for i in range(self.ep.size)
            if info[i]['rewards']
        ])
        record = Record()
        record.add_key_value('Phase', 'Random')
        record.add_key_value('Episodes', pretty_num(total_epi))
        record.add_key_value('Mean Reward', np.round(mean_reward, 2))
        main_logger.info("\n" + record.dumps())
        # NOTE(review): falsy check means a saved score of exactly 0 (or None)
        # is replaced by the random baseline — confirm this is intended.
        if not self.max_reward:
            self.max_reward = mean_reward

    def train(self):
        """Run one training iteration of 250k environment steps."""
        train_step = 250000
        self.ep.reset_state()
        obs = self.ep.reset()
        with tqdm(total=train_step, desc="Train", ascii=True) as t:
            while t.n < train_step:
                # Global step for the epsilon schedule spans all iterations.
                action = agent.take_action(
                    obs, self.eps[0].get(self.train_epi * train_step + t.n))
                obs_, reward, done, info = self.ep.step(action)
                self._store(obs, action, reward, done, obs_)
                obs, info = self.ep.auto_reset()
                if t.n % self.args.target_update_freq == 0:
                    agent.update_target()
                if t.n % self.args.learning_freq == 0:
                    agent.train(self.ep.size)
                t.update(self.ep.size)
        self.train_epi += 1
        completion = np.round(self.train_epi / self.args.num_iters, 2)
        total_epi = sum(len(info[i]['rewards']) for i in range(self.ep.size))
        mean_reward = np.mean([
            np.mean(info[i]['rewards'][-100:]) for i in range(self.ep.size)
            if info[i]['rewards']
        ])
        record = Record()
        record.add_key_value('Phase', 'Train')
        record.add_key_value('% Completion', completion)
        record.add_key_value('Episodes', pretty_num(total_epi))
        record.add_key_value(
            '% Exploration',
            np.round(self.eps[0].get(self.train_epi * train_step) * 100, 2))
        record.add_key_value('Reward (100 epi mean)', np.round(mean_reward, 2))
        main_logger.info("\n" + record.dumps())

    def test(self):
        """Evaluate greedily for 200k steps; checkpoint on a new best reward."""
        test_step = 200000
        self.ep.reset_state()
        obs = self.ep.reset()
        with tqdm(total=test_step, desc="Evaluation", ascii=True) as t:
            while t.n < test_step:
                # eps[1] == 0: fully greedy policy during evaluation.
                action = agent.take_action(obs, self.eps[1])
                self.ep.step(action)
                obs, info = self.ep.auto_reset()
                t.update(self.ep.size)
        total_epi = sum(len(info[i]['rewards']) for i in range(self.ep.size))
        mean_reward = np.mean([
            np.mean(info[i]['rewards']) for i in range(self.ep.size)
            if info[i]['rewards']
        ])
        record = Record()
        record.add_key_value('Phase', 'Evaluation')
        record.add_key_value('Episodes', pretty_num(total_epi))
        record.add_key_value('Mean Reward', np.round(mean_reward, 2))
        main_logger.info("\n" + record.dumps())
        if self.max_reward < mean_reward:
            self.max_reward = mean_reward
            agent.score = mean_reward
            agent.save_model()

    def run(self):
        """Full schedule: random prefill, then num_iters x (train + test)."""
        self.random()
        for i in range(self.args.num_iters):
            self.train()
            self.test()
        self.exit()

    def exit(self):
        """Release the environment pool."""
        self.ep.close()
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network: local is trained, target provides stable TD targets.
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        """Record one transition and learn every UPDATE_EVERY calls."""
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        # Inference only: switch to eval mode and disable autograd.
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # get targets: r + gamma * max_a' Q_target(s', a'), masked at terminals
        self.qnetwork_target.eval()
        with torch.no_grad():
            Q_targets_next = torch.max(self.qnetwork_target.forward(next_states),
                                       dim=1,
                                       keepdim=True)[0]
            # Fixed: previously used the module-level constant GAMMA, silently
            # ignoring the `gamma` parameter callers pass in.
            Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # get outputs: Q(s, a) for the actions actually taken
        self.qnetwork_local.train()
        Q_expected = self.qnetwork_local.forward(states).gather(1, actions)

        # compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # clear gradients
        self.optimizer.zero_grad()
        # update weights local network
        loss.backward()
        # take one SGD step
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
def __init__(self, env, sess,
             learning_rate=1e-3,
             seed=1234,
             gamma=0.99,
             max_eps=1.0,
             min_eps=0.1,
             render=False,
             print_freq=20,
             load_path=None,
             save_path=None,
             batch_size=32,
             log_dir='logs/train',
             max_steps=100000,
             buffer_capacity=None,
             max_episode_len=2000,
             eps_decay_rate=-0.0001,
             target_update_freq=1000,
             ):
    """Trains an openai gym-like environment with deep q learning.

    Args:
        env: gym.Env where our agent resides
        sess: tf.Session used to build and run the graph
        learning_rate: step size for the Q-network optimizer
        seed: Random seed for reproducibility
        gamma: Discount factor
        max_eps: Starting exploration factor
        min_eps: Exploration factor to decay towards
        render: True to render the environment, else False
        print_freq: Displays logging information every 'print_freq' episodes
        load_path: (str) Path to load existing model from
        save_path: (str) Path to save model during training
        batch_size: minibatch size sampled from the replay buffer
        log_dir: directory for TensorBoard summaries
        max_steps: maximum number of times to sample the environment
        buffer_capacity: How many state, action, next state, reward tuples
            the replay buffer should store (defaults to max_steps // 2)
        max_episode_len: Maximum number of timesteps in an episode
        eps_decay_rate: lambda parameter in exponential decay for epsilon
        target_update_freq: steps between target-network synchronizations
    """
    np.random.seed(seed)
    self.sess = sess
    self.env = env
    # Flat observation vector assumed (Box observation space with 1-D shape)
    # — TODO confirm for this environment.
    self.input_dim = env.observation_space.shape[0]
    self.output_dim = env.action_space.n
    self.max_steps = max_steps
    self.max_eps = max_eps
    self.min_eps = min_eps
    self.eps_decay_rate = eps_decay_rate
    self.max_episode_len = max_episode_len
    self.render = render
    self.print_freq = print_freq
    self.rewards = []
    self.metrics = []
    self.save_path = save_path
    self.load_path = load_path
    self.batch_size = batch_size
    self.num_updates = 0
    self.gamma = gamma
    # Default buffer size is half the total step budget.
    self.buffer = ReplayBuffer(capacity=max_steps //
                               2 if buffer_capacity is None else buffer_capacity)
    self.target_update_freq = target_update_freq
    self.learning_rate = learning_rate
    # TF1-style graph construction: separate variable scopes so the two
    # networks' variables can be matched up by scope name below.
    with tf.variable_scope('q_network'):
        self.q_network = QNetworkBuilder(self.input_dim, self.output_dim,
                                         (64,))
    with tf.variable_scope('target_network'):
        self.target_network = QNetworkBuilder(self.input_dim, self.output_dim,
                                              (64,))
    # Ops that copy online-network weights into the target network:
    # zip yields (q_var, target_var), unpacked as (new, old), so each
    # target var is assigned its corresponding q_network var.
    self.update_target_network = [old.assign(new) for (new, old) in
                                  zip(tf.trainable_variables('q_network'),
                                      tf.trainable_variables('target_network'))]
    if self.load_path is not None:
        self.load()
    self.add_summaries(log_dir)
def train(conf, env, model, num_episodes=500, batch_size=100,
          buffer_size=10000):
    """Train `model` on `env` with experience replay and epsilon-greedy exploration.

    Args:
        conf: config object; reads discount_rate/initial_eps/decay_factor and
            records buffer_size/batch_size back onto it.
        env: gym-like environment (reset/step/close; 4 discrete actions).
        model: Keras-like Q-model with predict/fit.
        num_episodes: number of episodes to run.
        batch_size: replay minibatch size.
        buffer_size: replay buffer capacity.

    Side effects: logs per-episode metrics to wandb and closes `env`.
    """
    conf.buffer_size = buffer_size
    conf.batch_size = batch_size
    replay_buffer = ReplayBuffer(size=buffer_size)
    discount_rate = conf.discount_rate
    eps = conf.initial_eps
    decay_factor = conf.decay_factor

    for episode in range(num_episodes):
        print("Episode {}".format(episode))
        observation = env.reset()
        # Exponential epsilon decay, once per episode.
        eps *= decay_factor
        done = False
        total_food = 0
        step = 0
        while not done:
            model_input = np.array([observation])
            prediction = model.predict(model_input)
            if np.random.random() < eps:
                # 4 discrete actions (presumably up/down/left/right — verify).
                action = np.random.randint(0, 4)
                was_random = True
            else:
                action = np.argmax(prediction)
                was_random = False
            debugger.print_step_before_move(step, observation, prediction,
                                            action, was_random)
            debugger.render_env_until_key_press(env)

            new_observation, reward, done, _ = env.step(action)
            replay_buffer.add(observation, action, reward, new_observation,
                              float(done))

            # Replay update. NOTE: targets bootstrap from the online model
            # itself (no separate target network).
            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                batch_size)
            labels = model.predict(obses_t)
            # Vectorized TD targets: r + gamma * max_a' Q(s', a'), with the
            # bootstrap term zeroed at terminal states. (Replaces the old
            # per-sample Python loop; also drops the unused
            # `weights, batch_idxes` locals and commented-out dead code.)
            targets = discount_rate * np.max(model.predict(obses_tp1), axis=1)
            targets[dones.astype(bool)] = 0
            targets += rewards
            labels[np.arange(len(actions)), actions] = targets
            model.fit(obses_t, labels, epochs=1, verbose=0)

            if reward > 0:
                total_food += 1
            step += 1
            observation = new_observation

        wandb.log({
            'episode': episode,
            'total_food': total_food,
            'eps': eps,
            'lifetime': step
        })
        print('Score: {}'.format(total_food))
        print()
    env.close()