def __init__(self, sess, state_size, action_size):
    self.sess = sess
    self.state_size = state_size
    self.action_size = action_size

    # hyper parameters
    self.batch_size = 32
    self.discount_factor = 0.99
    self.learning_rate = 0.00025

    # epsilon
    self.s_epsilon = 1.0
    self.e_epsilon = 0.01
    self.n_epsilon_decay = 100000
    self.epsilon = self.s_epsilon

    # replay buffer
    self.buffer = ReplayBuffer(50000)

    # placeholders
    self.actions = tf.placeholder(tf.int32, shape=None)
    self.targets = tf.placeholder(tf.float32, shape=None)

    # networks
    self.policy_net = DQN({})
    self.target_net = DQN({})
    self.sess.run(tf.global_variables_initializer())
    self.update_target_network()

    # optimizer
    self.loss_op, self.train_op = self._build_op()
def __init__(self, name, num_episodes=500):
    self.name = name
    self.num_episodes = num_episodes
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    self.env = gym.make(name).unwrapped
    self.env.reset()
    self.env_w = EnvWrapper(self.env, self.device)
    self.cfg = Config()
    self.cfg.n_actions = self.env.action_space.n
    self.cfg.policy_net = DQN(self.env_w.screen_height, self.env_w.screen_width,
                              self.cfg.n_actions).to(self.device)
    self.cfg.target_net = DQN(self.env_w.screen_height, self.env_w.screen_width,
                              self.cfg.n_actions).to(self.device)
    self.agent = Agent(self.env, self.env_w, self.device, self.cfg)
def __init__(self, name, isBot):
    self.name = name
    self.isBot = isBot
    if not self.isBot:
        self.chosenAction = 0
        self.defineKeyboardListener()
    self.initializeProperties()
    self.QNetwork = DQN("QN{}".format(name), self.miniBatchSize)
    self.TDTarget = DQN("TD{}".format(name), self.miniBatchSize)
    self.sess = tf.Session()
    self.QNetwork.setSess(self.sess)
    self.TDTarget.setSess(self.sess)
    self.sess.run(tf.global_variables_initializer())
    self.synchronise()
def test():
    print('Waking up the brain cells..')
    sess = tf.Session()

    game = Game(SCREEN_WIDTH, SCREEN_HEIGHT, show_game=True)
    brain = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, NUM_ACTION)

    saver = tf.train.Saver()
    ckpt = tf.train.get_checkpoint_state('model')
    saver.restore(sess, ckpt.model_checkpoint_path)

    for episode in range(MAX_EPISODE):
        terminal = False
        total_reward = 0

        state = game.reset()
        brain.init_state(state)

        while not terminal:
            action = brain.get_action()

            state, reward, terminal = game.step(action)
            total_reward += reward

            brain.remember(state, action, reward, terminal)

            time.sleep(0.3)

        print('Games played: %d Score: %d' % (episode + 1, total_reward))
class RobotController(CmdVelPublisher, ImageSubscriber, object):
    """
    Brain of the robot, which inherits from ImageSubscriber and CmdVelPublisher,
    and initializes a network. Determines from ImageSubscriber how to move, which
    is executed in CmdVelPublisher, and these movements are optimized and learned
    through self.network
    """

    def __init__(self):
        # init ROS node
        rospy.init_node('robot_control')

        # super calls to parent classes
        super(RobotController, self).__init__()

        # initializes the network with starting parameters
        self.network = DQN(.0003, .1, .25)
        self.network.start()

    def robot_control(self, action):
        """
        Given an action, executes the specified behavior on the robot.
        action: 0 = forward, 1 = leftTurn, 2 = rightTurn, 3 = stop
        """
        try:
            if action < 0 or action > 3:
                raise ValueError("Action is invalid")
            self.state[action].__call__()
        except:
            # make robot stop
            print("Invalid action - stopping robot")
            self.state[3].__call__()

        self.sendMessage()
        rospy.sleep(.1)           # use desired action for 0.1 second

        self.state[3].__call__()  # set robot to stop for .1 second
        self.sendMessage()
        rospy.sleep(.1)

    def run(self):
        """ The main run loop """
        r = rospy.Rate(10)
        while not rospy.is_shutdown():
            if self.cv_image is not None:
                # visualizes the binary image
                cv2.imshow('video_window', self.binary_image)
                cv2.waitKey(5)

                # feeds the binary image into the network to receive an action
                # with corresponding Q-values
                a, Q = self.network.feed_forward(self.binary_image)

                # moves based on the most probable action
                self.robot_control(a[0])

                # updates the network parameters based on what happened after
                # the action step
                self.network.update(self.binary_image)
            r.sleep()
        self.network.stop()
def __init__(self, state_size, action_size, seed):
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(seed)

    # Initialize Q-Networks
    self.qnetwork_local = DQN(state_size, action_size, seed).to(device)
    self.qnetwork_target = DQN(state_size, action_size, seed).to(device)

    # Initialize optimizer
    self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

    # Initialize replay memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

    # Initialize time step (for updating every UPDATE_EVERY steps)
    self.t_step = 0
EPS_DECAY = 50000
TARGET_UPDATE = 10
LR = 0.005
test_time = False
n_steps = 8

n_actions = env.action_space.n
img_height = 64
img_width = 64

policy_net = None
network_path = "target_net.pt"
if os.path.exists(network_path):
    policy_net = torch.load(network_path)
    print("successfully loaded existing network from file: " + network_path)
else:
    policy_net = DQN(img_height, img_width, n_actions)
target_net = DQN(img_height, img_width, n_actions).to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

optimizer = optim.RMSprop(policy_net.parameters(), lr=LR)
memory = ReplayMemory(10000)

steps_done = 0

logfile = "train_log.txt"
with open(logfile, "w+") as f:
    f.write("CS4803 MineRL Project Logs:\n")


def append_log(s):
    with open(logfile, "a") as f:
        f.write(s + "\n")


def state_from_obs(obs):
def trainD(file_name="Distral_2col_SQL",
           list_of_envs=[GridworldEnv(5), GridworldEnv(4), GridworldEnv(6)],
           batch_size=128, gamma=0.999, alpha=0.8, beta=5, eps_start=0.9,
           eps_end=0.05, eps_decay=5, is_plot=False, num_episodes=200,
           max_num_steps_per_episode=1000, learning_rate=0.001,
           memory_replay_size=10000, memory_policy_size=1000):
    """
    Soft Q-learning training routine. Returns rewards and durations logs.
    """
    num_actions = list_of_envs[0].action_space.n
    input_size = list_of_envs[0].observation_space.shape[0]
    num_envs = len(list_of_envs)
    policy = PolicyNetwork(input_size, num_actions)
    models = [DQN(input_size, num_actions) for _ in range(0, num_envs)]
    memories = [
        ReplayMemory(memory_replay_size, memory_policy_size)
        for _ in range(0, num_envs)
    ]

    optimizers = [
        optim.Adam(model.parameters(), lr=learning_rate) for model in models
    ]
    policy_optimizer = optim.Adam(policy.parameters(), lr=learning_rate)

    episode_durations = [[] for _ in range(num_envs)]
    episode_rewards = [[] for _ in range(num_envs)]

    steps_done = np.zeros(num_envs)
    episodes_done = np.zeros(num_envs)
    current_time = np.zeros(num_envs)

    # Initialize environments
    states = []
    for env in list_of_envs:
        states.append(
            torch.from_numpy(env.reset()).type(torch.FloatTensor).view(
                -1, input_size))

    while np.min(episodes_done) < num_episodes:
        # TODO: add max_num_steps_per_episode
        # Optimization is given by an alternating minimization scheme:
        #   1. do the step for each env
        #   2. do one optimization step for each env using "soft-q-learning".
        #   3. do one optimization step for the policy
        for i_env, env in enumerate(list_of_envs):
            # select an action
            action = select_action(states[i_env], policy, models[i_env],
                                   num_actions, eps_start, eps_end, eps_decay,
                                   episodes_done[i_env], alpha, beta)

            steps_done[i_env] += 1
            current_time[i_env] += 1
            next_state_tmp, reward, done, _ = env.step(action[0, 0])
            reward = Tensor([reward])

            # Observe new state
            next_state = torch.from_numpy(next_state_tmp).type(
                torch.FloatTensor).view(-1, input_size)

            if done:
                next_state = None

            # Store the transition in memory
            time = Tensor([current_time[i_env]])
            memories[i_env].push(states[i_env], action, next_state, reward,
                                 time)

            # Perform one step of the optimization (on the target network)
            optimize_model(policy, models[i_env], optimizers[i_env],
                           memories[i_env], batch_size, alpha, beta, gamma)

            # Update state
            states[i_env] = next_state

            # Check if agent reached target
            if done or current_time[i_env] >= max_num_steps_per_episode:
                if episodes_done[i_env] <= num_episodes:
                    print("ENV:", i_env, "iter:", episodes_done[i_env],
                          "\treward:{0:.2f}".format(env.episode_total_reward),
                          "\tit:", current_time[i_env], "\texp_factor:",
                          eps_end + (eps_start - eps_end) *
                          math.exp(-1. * episodes_done[i_env] / eps_decay))
                episode_rewards[i_env].append(env.episode_total_reward)
                episodes_done[i_env] += 1
                episode_durations[i_env].append(current_time[i_env])
                current_time[i_env] = 0
                states[i_env] = torch.from_numpy(env.reset()).type(
                    torch.FloatTensor).view(-1, input_size)

                if is_plot:
                    plot_rewards(episode_rewards, i_env)

        # Perform one step of the optimization on the distilled policy
        optimize_policy(policy, policy_optimizer, memories, batch_size,
                        num_envs, gamma, alpha, beta)

    print('Complete')
    env.render(close=True)
    env.close()

    ## Store Results
    np.save(file_name + '-rewards', episode_rewards)
    np.save(file_name + '-durations', episode_durations)

    return models, policy, episode_rewards, episode_durations
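# Hedged usage sketch (not part of the original source): one way trainD above
# could be invoked and its return values consumed. The arguments simply restate
# the function's own defaults; GridworldEnv comes from the same project.
models, policy, rewards, durations = trainD(
    file_name="Distral_2col_SQL",
    list_of_envs=[GridworldEnv(5), GridworldEnv(4), GridworldEnv(6)],
    num_episodes=200,
    is_plot=False)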
class DQNAgent():
    """Deep Q-learning agent."""

    # def __init__(self,
    #              env, device=DEVICE, summary_writer=writer,  # noqa
    #              hyperparameters=DQN_HYPERPARAMS):           # noqa

    rewards = []
    total_reward = 0
    birth_time = 0
    n_iter = 0
    n_games = 0
    ts_frame = 0
    ts = time.time()

    # Memory = namedtuple(
    #     'Memory', ['obs', 'action', 'new_obs', 'reward', 'done'],
    #     verbose=False, rename=False)
    Memory = namedtuple('Memory',
                        ['obs', 'action', 'new_obs', 'reward', 'done'],
                        rename=False)

    def __init__(self, env, hyperparameters, device, summary_writer=None):
        """Set parameters, initialize network."""
        state_space_shape = env.observation_space.shape
        action_space_size = env.action_space.n
        self.env = env

        self.online_network = DQN(state_space_shape,
                                  action_space_size).to(device)
        self.target_network = DQN(state_space_shape,
                                  action_space_size).to(device)
        # XXX maybe not really necessary?
        self.update_target_network()

        self.experience_replay = None

        self.accumulated_loss = []
        self.device = device

        self.optimizer = optim.Adam(self.online_network.parameters(),
                                    lr=hyperparameters['learning_rate'])

        self.double_DQN = hyperparameters['double_DQN']

        # Discount factor
        self.gamma = hyperparameters['gamma']

        # XXX ???
        self.n_multi_step = hyperparameters['n_multi_step']
        self.replay_buffer = ReplayBuffer(hyperparameters['buffer_capacity'],
                                          hyperparameters['n_multi_step'],
                                          hyperparameters['gamma'])

        self.birth_time = time.time()

        self.iter_update_target = hyperparameters['n_iter_update_target']
        self.buffer_start_size = hyperparameters['buffer_start_size']

        self.summary_writer = summary_writer

        # Greedy search hyperparameters
        self.epsilon_start = hyperparameters['epsilon_start']
        self.epsilon = hyperparameters['epsilon_start']
        self.epsilon_decay = hyperparameters['epsilon_decay']
        self.epsilon_final = hyperparameters['epsilon_final']

    def get_max_action(self, obs):
        '''
        Forward pass of the NN to obtain the action for the given observation
        '''
        # convert the observation to a tensor
        state_t = torch.tensor(np.array([obs])).to(self.device)
        # forward pass
        q_values_t = self.online_network(state_t)
        # get the index of the maximum output (i.e. the best action to take)
        _, act_t = torch.max(q_values_t, dim=1)
        return int(act_t.item())

    def act(self, obs):
        '''
        Greedy action outputted by the NN in the CentralControl
        '''
        return self.get_max_action(obs)

    def act_eps_greedy(self, obs):
        '''
        Epsilon-greedy action
        '''
        # In case of a noisy net, it takes a greedy action
        # if self.noisy_net:
        #     return self.act(obs)

        if np.random.random() < self.epsilon:
            return self.env.action_space.sample()
        else:
            return self.act(obs)

    def update_target_network(self):
        """Update target network weights with current online network values."""
        self.target_network.load_state_dict(self.online_network.state_dict())

    def set_optimizer(self, learning_rate):
        self.optimizer = optim.Adam(self.online_network.parameters(),
                                    lr=learning_rate)

    def sample_and_optimize(self, batch_size):
        '''
        Sample batch_size memories from the buffer and optimize on them
        '''
        # This is the part that waits until there is enough experience
        if len(self.replay_buffer) > self.buffer_start_size:
            # sample
            mini_batch = self.replay_buffer.sample(batch_size)
            # optimize
            # l_loss = self.cc.optimize(mini_batch)
            l_loss = self.optimize(mini_batch)
            self.accumulated_loss.append(l_loss)

        # update target NN
        if self.n_iter % self.iter_update_target == 0:
            self.update_target_network()

    def optimize(self, mini_batch):
        '''
        Optimize the NN
        '''
        # reset the grads
        self.optimizer.zero_grad()
        # calculate the loss of the mini batch
        loss = self._calculate_loss(mini_batch)
        loss_v = loss.item()

        # do backpropagation
        loss.backward()
        # one step of optimization
        self.optimizer.step()

        return loss_v

    def _calculate_loss(self, mini_batch):
        '''
        Calculate the mini batch's MSE loss.
        It also supports the double DQN version
        '''
        states, actions, next_states, rewards, dones = mini_batch

        # convert the data to tensors
        states_t = torch.as_tensor(states, device=self.device)
        next_states_t = torch.as_tensor(next_states, device=self.device)
        actions_t = torch.as_tensor(actions, device=self.device)
        rewards_t = torch.as_tensor(rewards, dtype=torch.float32,
                                    device=self.device)
        done_t = torch.as_tensor(dones, dtype=torch.uint8,
                                 device=self.device)  # noqa

        # Value of the action taken previously (recorded in actions_t)
        # in state_t
        state_action_values = self.online_network(states_t).gather(
            1, actions_t[:, None]).squeeze(-1)
        # NB gather is a differentiable function

        # Next state value with Double DQN. (i.e. get the value predicted
        # by the target nn, of the best action predicted by the online nn)
        if self.double_DQN:
            double_max_action = self.online_network(next_states_t).max(1)[1]
            double_max_action = double_max_action.detach()
            target_output = self.target_network(next_states_t)
            # NB: [:, None] adds an extra dimension
            next_state_values = torch.gather(
                target_output, 1, double_max_action[:, None]).squeeze(-1)
        # Next state value in the normal configuration
        else:
            next_state_values = self.target_network(next_states_t).max(1)[0]

        next_state_values = next_state_values.detach()  # No backprop

        # Use the Bellman equation
        expected_state_action_values = rewards_t + \
            (self.gamma ** self.n_multi_step) * next_state_values

        # compute the loss
        return nn.MSELoss()(state_action_values,
                            expected_state_action_values)

    def reset_stats(self):
        '''
        Reset the agent's statistics
        '''
        self.rewards.append(self.total_reward)
        self.total_reward = 0
        self.accumulated_loss = []
        self.n_games += 1

    def add_env_feedback(self, obs, action, new_obs, reward, done):
        '''
        Acquire a new feedback from the environment. The feedback is
        constituted by the new observation, the reward and the done boolean.
        '''
        # Create the new memory and update the buffer
        new_memory = self.Memory(obs=obs, action=action, new_obs=new_obs,
                                 reward=reward, done=done)
        # Append it to the replay buffer
        self.replay_buffer.append(new_memory)

        # update the variables
        self.n_iter += 1
        # TODO check this...
        # decrease epsilon
        self.epsilon = max(
            self.epsilon_final,
            self.epsilon_start - self.n_iter / self.epsilon_decay)
        self.total_reward += reward

    def print_info(self):
        '''
        Print information about the agent
        '''
        fps = (self.n_iter - self.ts_frame) / (time.time() - self.ts)
        # TODO replace with proper logger
        print('%d %d rew:%d mean_rew:%.2f eps:%.2f, fps:%d, loss:%.4f' %
              (self.n_iter, self.n_games, self.total_reward,
               np.mean(self.rewards[-40:]), self.epsilon, fps,
               np.mean(self.accumulated_loss)))

        self.ts_frame = self.n_iter
        self.ts = time.time()

        if self.summary_writer is not None:
            self.summary_writer.add_scalar('reward', self.total_reward,
                                           self.n_games)
            self.summary_writer.add_scalar('mean_reward',
                                           np.mean(self.rewards[-40:]),
                                           self.n_games)
            self.summary_writer.add_scalar('10_mean_reward',
                                           np.mean(self.rewards[-10:]),
                                           self.n_games)
            self.summary_writer.add_scalar('epsilon', self.epsilon,
                                           self.n_games)
            self.summary_writer.add_scalar('loss',
                                           np.mean(self.accumulated_loss),
                                           self.n_games)
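# Hedged usage sketch (not part of the original source): driving DQNAgent via
# its own API (act_eps_greedy, add_env_feedback, sample_and_optimize, print_info,
# reset_stats). The environment id, batch size and hyperparameter values below
# are illustrative assumptions; the dictionary keys are exactly the ones the
# constructor above reads.
import gym
import torch

hyperparameters = {
    'learning_rate': 1e-4, 'double_DQN': True, 'gamma': 0.99,
    'n_multi_step': 2, 'buffer_capacity': 100000,
    'n_iter_update_target': 1000, 'buffer_start_size': 10000,
    'epsilon_start': 1.0, 'epsilon_decay': 100000, 'epsilon_final': 0.02,
}

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
env = gym.make('CartPole-v1')  # placeholder environment
agent = DQNAgent(env, hyperparameters, device)

obs = env.reset()
for _ in range(50000):
    action = agent.act_eps_greedy(obs)
    new_obs, reward, done, _ = env.step(action)
    agent.add_env_feedback(obs, action, new_obs, reward, done)
    agent.sample_and_optimize(batch_size=32)
    if done:
        agent.print_info()
        agent.reset_stats()
        obs = env.reset()
    else:
        obs = new_obs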
class AgentCartpole:
    def __init__(self, p):
        self.p = p
        self.target_dqn = DQN(self.p['HIDDEN_DIM'])
        self.eval_dqn = DQN(self.p['HIDDEN_DIM'])
        self.memory = ReplayMemory(self.p['MEMORY_SIZE'], [4])
        self.optimizer = torch.optim.Adam(self.eval_dqn.parameters(),
                                          self.p['LEARNING_RATE'])

        try:
            self.eval_dqn.load_state_dict(torch.load("Model/eval_dqn.data"))
            self.target_dqn.load_state_dict(torch.load("Model/eval_dqn.data"))
            print("Data has been loaded successfully")
        except Exception:
            print("No data existing")

    def act(self, state):
        r = random.random()
        if r > self.p['EPSILON']:
            x = torch.FloatTensor(state)
            q_value = self.eval_dqn(x)
            action = torch.argmax(q_value).item()
            return action
        else:
            action = random.randint(0, self.p['N_ACTIONS'] - 1)
            return action

    def learn(self):
        if self.memory.index < self.p['BATCH_SIZE']:
            return

        # Get the state dicts of the evaluation and target networks
        # (the original read both from eval_dqn, which made ALPHA a no-op)
        eval_dict = self.eval_dqn.state_dict()
        target_dict = self.target_dqn.state_dict()

        # Soft-update the parameters of the target DQN
        for w in eval_dict:
            target_dict[w] = (1 - self.p['ALPHA']) * target_dict[w] \
                + self.p['ALPHA'] * eval_dict[w]
        self.target_dqn.load_state_dict(target_dict)

        # Get a sample of size BATCH
        batch_state, batch_action, batch_next_state, batch_reward, batch_done = \
            self.memory.pop(self.p['BATCH_SIZE'])

        # Update the threshold for the act() method every time the agent learns
        if self.p["EPSILON"] > self.p["EPSILON_MIN"]:
            self.p["EPSILON"] *= self.p["EPSILON_DECAY"]

        loss = nn.MSELoss()

        # Compute q values for the current evaluation
        q_eval = self.eval_dqn(batch_state).gather(
            1, batch_action.long().unsqueeze(1)).reshape([self.p["BATCH_SIZE"]])

        # Compute the next state q values
        q_next = self.target_dqn(batch_next_state).detach()

        # Compute the target q values
        q_target = batch_reward + \
            q_next.max(1)[0].reshape([self.p["BATCH_SIZE"]]) * self.p["GAMMA"]

        self.optimizer.zero_grad()
        l = loss(q_eval, q_target)
        l.backward()
        self.optimizer.step()

    def random(self):
        env = gym.make('CartPole-v1')
        env = env.unwrapped
        env.reset()
        rewards = []

        while True:
            env.render()
            # the original called env.action_space.pop(...), which does not exist
            # on a gym action space; sample() draws a random action instead
            action = env.action_space.sample()
            observation, reward, done, info = env.step(action)
            rewards.append(reward)
            if done:
                break

        env.close()

        plt.ylabel("Rewards")
        plt.xlabel("Nb interactions")
        plt.plot(rewards)
        plt.grid()
        plt.show()

    def dqn_cartpole(self):
        env = gym.make('CartPole-v1')
        env = env.unwrapped
        rewards = []

        for i in range(self.p['N_EPISODE']):
            state = env.reset()
            rewards.append(0)
            for s in range(self.p['N_STEPS']):
                # env.render()
                action = self.act(state)
                n_state, reward, done, _ = env.step(action)
                if done:
                    reward = -1
                rewards[-1] += reward
                self.memory.push(state, action, n_state, reward, done)
                self.learn()
                state = n_state
            print('Episode : ', i, ', Rewards : ', rewards[-1])
            # Save the eval model after each episode
            torch.save(self.eval_dqn.state_dict(), "Model/eval_dqn.data")

        # Display result
        n = 50
        res = sum(([a] * n
                   for a in [sum(rewards[i:i + n]) // n
                             for i in range(0, len(rewards), n)]), [])
        print(rewards)
        plt.ylabel("Rewards")
        plt.xlabel("Episode")
        plt.plot(rewards)
        plt.plot(res)
        plt.grid()
        plt.legend(['Rewards per episode', 'Last 50 runs average'])
        plt.show()
        env.close()
def test_ppo(args=get_args()):
    args.cfg_path = f"maps/{args.task}.cfg"
    args.wad_path = f"maps/{args.task}.wad"
    args.res = (args.skip_num, 84, 84)
    env = Env(args.cfg_path, args.frames_stack, args.res)
    args.state_shape = args.res
    args.action_shape = env.action_space.shape or env.action_space.n
    # should be N_FRAMES x H x W
    print("Observations shape:", args.state_shape)
    print("Actions shape:", args.action_shape)
    # make environments
    train_envs = ShmemVectorEnv([
        lambda: Env(args.cfg_path, args.frames_stack, args.res)
        for _ in range(args.training_num)
    ])
    test_envs = ShmemVectorEnv([
        lambda: Env(args.cfg_path, args.frames_stack, args.res, args.save_lmp)
        for _ in range(min(os.cpu_count() - 1, args.test_num))
    ])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # define model
    net = DQN(*args.state_shape,
              args.action_shape,
              device=args.device,
              features_only=True,
              output_dim=args.hidden_size)
    actor = Actor(net, args.action_shape, device=args.device,
                  softmax_output=False)
    critic = Critic(net, device=args.device)
    optim = torch.optim.Adam(ActorCritic(actor, critic).parameters(), lr=args.lr)

    lr_scheduler = None
    if args.lr_decay:
        # decay learning rate to 0 linearly
        max_update_num = np.ceil(
            args.step_per_epoch / args.step_per_collect) * args.epoch

        lr_scheduler = LambdaLR(
            optim, lr_lambda=lambda epoch: 1 - epoch / max_update_num)

    # define policy
    def dist(p):
        return torch.distributions.Categorical(logits=p)

    policy = PPOPolicy(actor,
                       critic,
                       optim,
                       dist,
                       discount_factor=args.gamma,
                       gae_lambda=args.gae_lambda,
                       max_grad_norm=args.max_grad_norm,
                       vf_coef=args.vf_coef,
                       ent_coef=args.ent_coef,
                       reward_normalization=args.rew_norm,
                       action_scaling=False,
                       lr_scheduler=lr_scheduler,
                       action_space=env.action_space,
                       eps_clip=args.eps_clip,
                       value_clip=args.value_clip,
                       dual_clip=args.dual_clip,
                       advantage_normalization=args.norm_adv,
                       recompute_advantage=args.recompute_adv).to(args.device)
    if args.icm_lr_scale > 0:
        feature_net = DQN(*args.state_shape,
                          args.action_shape,
                          device=args.device,
                          features_only=True,
                          output_dim=args.hidden_size)
        action_dim = np.prod(args.action_shape)
        feature_dim = feature_net.output_dim
        icm_net = IntrinsicCuriosityModule(feature_net.net,
                                           feature_dim,
                                           action_dim,
                                           device=args.device)
        icm_optim = torch.optim.Adam(icm_net.parameters(), lr=args.lr)
        policy = ICMPolicy(policy, icm_net, icm_optim, args.icm_lr_scale,
                           args.icm_reward_scale,
                           args.icm_forward_loss_weight).to(args.device)
    # load a previous policy
    if args.resume_path:
        policy.load_state_dict(
            torch.load(args.resume_path, map_location=args.device))
        print("Loaded agent from: ", args.resume_path)
    # replay buffer: `save_last_obs` and `stack_num` can be removed together
    # when you have enough RAM
    buffer = VectorReplayBuffer(args.buffer_size,
                                buffer_num=len(train_envs),
                                ignore_obs_next=True,
                                save_only_last_obs=True,
                                stack_num=args.frames_stack)
    # collector
    train_collector = Collector(policy, train_envs, buffer,
                                exploration_noise=True)
    test_collector = Collector(policy, test_envs, exploration_noise=True)
    # log
    log_name = 'ppo_icm' if args.icm_lr_scale > 0 else 'ppo'
    log_path = os.path.join(args.logdir, args.task, log_name)
    writer = SummaryWriter(log_path)
    writer.add_text("args", str(args))
    logger = TensorboardLogger(writer)

    def save_best_fn(policy):
        torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))

    def stop_fn(mean_rewards):
        if env.spec.reward_threshold:
            return mean_rewards >= env.spec.reward_threshold
        elif 'Pong' in args.task:
            return mean_rewards >= 20
        else:
            return False

    # watch agent's performance
    def watch():
        print("Setup test envs ...")
        policy.eval()
        test_envs.seed(args.seed)
        if args.save_buffer_name:
            print(f"Generate buffer with size {args.buffer_size}")
            buffer = VectorReplayBuffer(args.buffer_size,
                                        buffer_num=len(test_envs),
                                        ignore_obs_next=True,
                                        save_only_last_obs=True,
                                        stack_num=args.frames_stack)
            collector = Collector(policy, test_envs, buffer,
                                  exploration_noise=True)
            result = collector.collect(n_step=args.buffer_size)
            print(f"Save buffer into {args.save_buffer_name}")
            # Unfortunately, pickle will cause oom with 1M buffer size
            buffer.save_hdf5(args.save_buffer_name)
        else:
            print("Testing agent ...")
            test_collector.reset()
            result = test_collector.collect(n_episode=args.test_num,
                                            render=args.render)
        rew = result["rews"].mean()
        lens = result["lens"].mean() * args.skip_num
        print(f'Mean reward (over {result["n/ep"]} episodes): {rew}')
        print(f'Mean length (over {result["n/ep"]} episodes): {lens}')

    if args.watch:
        watch()
        exit(0)

    # test train_collector and start filling replay buffer
    train_collector.collect(n_step=args.batch_size * args.training_num)
    # trainer
    result = onpolicy_trainer(policy,
                              train_collector,
                              test_collector,
                              args.epoch,
                              args.step_per_epoch,
                              args.repeat_per_collect,
                              args.test_num,
                              args.batch_size,
                              step_per_collect=args.step_per_collect,
                              stop_fn=stop_fn,
                              save_best_fn=save_best_fn,
                              logger=logger,
                              test_in_train=False)

    pprint.pprint(result)
    watch()
class tennis:
    def __init__(self, fps=50):
        self.GeneralReward = False
        self.net = Network(150, 450, 150, 650)
        self.updateRewardA = 0
        self.updateRewardB = 0
        self.updateIter = 0
        self.lossA = 0
        self.lossB = 0
        self.restart = False
        self.iteration = 0
        self.AgentA = DQN()
        self.AgentB = DQN()

        # Testing
        self.net = Network(150, 450, 150, 650)
        self.NetworkA = self.net.network(300, ysource=80, Ynew=650)  # Network A
        self.NetworkB = self.net.network(200, ysource=650, Ynew=80)  # Network B

        pygame.init()
        self.BLACK = (0, 0, 0)
        self.myFontA = pygame.font.SysFont("Times New Roman", 25)
        self.myFontB = pygame.font.SysFont("Times New Roman", 25)
        self.myFontIter = pygame.font.SysFont('Times New Roman', 25)
        self.FPS = fps
        self.fpsClock = pygame.time.Clock()
        self.nextplayer = np.random.choice(['A', 'B'])

    def setWindow(self):
        # set up the window
        self.DISPLAYSURF = pygame.display.set_mode((600, 750), 0, 32)
        pygame.display.set_caption('REINFORCEMENT LEARNING (DQN) - TABLE TENNIS')

        # set up the colors
        self.BLACK = (0, 0, 0)
        self.WHITE = (255, 255, 255)
        self.RED = (255, 0, 0)
        self.GREEN = (0, 255, 0)
        self.BLUE = (0, 0, 255)
        return

    def display(self):
        self.setWindow()
        self.DISPLAYSURF.fill(self.WHITE)
        pygame.draw.rect(self.DISPLAYSURF, self.BLACK, (50, 100, 500, 550))
        pygame.draw.rect(self.DISPLAYSURF, self.RED, (50, 365, 500, 20))
        return

    def reset(self):
        return

    def evaluate_state_from_last_coordinate(self, c):
        """
        cmax: 550
        cmin: 50

        c will definitely be between 50 and 550.
        """
        if c >= 50 and c <= 550:
            return int(c / 50 - 1)
        else:
            return 0

    def evaluate_action(self, diff):
        if int(diff) <= 50:
            return True
        else:
            return False

    def randomVal(self, action):
        "action is a probability of values between 0 and 1"
        val = (action * 500) + 50
        return val

    def play(self, action, count=0, play='A'):
        # play = A implies compute player A's next play.
        # play = B implies compute player B's next play.
        if play == 'A':
            # playerA should play
            if count == 0:
                self.NetworkA = self.net.network(
                    self.ballx, ysource=80, Ynew=650)  # Network A
                self.bally = self.NetworkA[1][count]
                self.ballx = self.NetworkA[0][count]
                if self.GeneralReward == True:
                    self.playerax = self.randomVal(action)
                else:
                    self.playerax = self.ballx
            else:
                self.ballx = self.NetworkA[0][count]
                self.bally = self.NetworkA[1][count]

            obsOne = self.evaluate_state_from_last_coordinate(
                int(self.ballx))     # last state of the ball
            obsTwo = self.evaluate_state_from_last_coordinate(
                int(self.playerbx))  # evaluate player bx
            diff = np.abs(self.ballx - self.playerbx)
            obs = obsTwo
            reward = self.evaluate_action(diff)
            done = True
            info = str(diff)
        else:
            # playerB should play
            if count == 0:
                self.NetworkB = self.net.network(
                    self.ballx, ysource=650, Ynew=80)  # Network B
                self.bally = self.NetworkB[1][count]
                self.ballx = self.NetworkB[0][count]
                if self.GeneralReward == True:
                    self.playerbx = self.randomVal(action)
                else:
                    self.playerbx = self.ballx
            else:
                self.ballx = self.NetworkB[0][count]
                self.bally = self.NetworkB[1][count]

            obsOne = self.evaluate_state_from_last_coordinate(
                int(self.ballx))     # last state of the ball
            obsTwo = self.evaluate_state_from_last_coordinate(
                int(self.playerax))  # evaluate player ax
            diff = np.abs(self.ballx - self.playerax)
            obs = obsTwo
            reward = self.evaluate_action(diff)
            done = True
            info = str(diff)

        return obs, reward, done, info

    def computeLoss(self, reward, loss='A'):
        # loss = A implies compute player A's loss; otherwise compute player B's loss.
        if loss == 'A':
            if reward == 0:
                self.lossA += 1
            else:
                self.lossA += 0
        else:
            if reward == 0:
                self.lossB += 1
            else:
                self.lossB += 0
        return

    def execute(self, state, iteration, count, player='A'):
        if player == 'B':
            stateB = state
            # Online DQN evaluates what to do
            try:
                q_valueB = self.AgentB.model.predict([stateB])
            except:
                q_valueB = 0
            actionB = self.AgentB.epsilon_greedy(q_valueB, iteration)

            # Online DQN plays
            obsB, rewardB, doneB, infoB = self.play(action=actionB,
                                                    count=count, play='B')
            next_stateB = actionB

            # Let's memorize what just happened
            self.AgentB.replay_memory.append(
                (stateB, actionB, rewardB, next_stateB, 1.0 - doneB))
            stateB = next_stateB

            output = (q_valueB, actionB, obsB, rewardB, doneB, infoB,
                      next_stateB, actionB, stateB)
        else:
            stateA = state
            # Online DQN evaluates what to do
            # arr = np.array([stateA])
            try:
                # the original predicted with AgentB and stateB here, which is
                # undefined in this branch; player A's own network and state are used
                q_valueA = self.AgentA.model.predict([stateA])
            except:
                q_valueA = 0
            actionA = self.AgentA.epsilon_greedy(q_valueA, iteration)

            # Online DQN plays
            obsA, rewardA, doneA, infoA = self.play(action=actionA,
                                                    count=count, play='A')
            next_stateA = actionA

            # Let's memorize what just happened
            self.AgentA.replay_memory.append(
                (stateA, actionA, rewardA, next_stateA, 1.0 - doneA))
            stateA = next_stateA

            output = (q_valueA, actionA, obsA, rewardA, doneA, infoA,
                      next_stateA, actionA, stateA)
        return output

    def trainOnlineDQN(self, player='A'):
        if player == 'A':
            X_state_val, X_action_val, rewards, X_next_state_val, continues = (
                self.AgentA.sample_memories(self.AgentA.batch_size))
            arr = [X_next_state_val]
            next_q_values = self.AgentA.model.predict(arr)
            max_next_q_values = np.max(next_q_values, axis=1, keepdims=True)
            y_val = rewards + continues * self.AgentA.discount_rate * max_next_q_values

            # Train the online DQN
            self.AgentA.model.fit(X_state_val,
                                  tf.keras.utils.to_categorical(
                                      X_next_state_val, num_classes=10),
                                  verbose=0)
        else:
            X_state_val, X_action_val, rewards, X_next_state_val, continues = (
                self.AgentB.sample_memories(self.AgentB.batch_size))
            arr = [X_next_state_val]
            next_q_values = self.AgentB.model.predict(arr)
            max_next_q_values = np.max(next_q_values, axis=1, keepdims=True)
            y_val = rewards + continues * self.AgentB.discount_rate * max_next_q_values

            # Train the online DQN
            self.AgentB.model.fit(X_state_val,
                                  tf.keras.utils.to_categorical(
                                      X_next_state_val, num_classes=10),
                                  verbose=0)
        return True

    def show_board(self):
        self.display()
        # CHECK BALL MOVEMENT
        self.DISPLAYSURF.blit(self.PLAYERA, (self.playerax, 50))
        self.DISPLAYSURF.blit(self.PLAYERB, (self.playerbx, 650))
        self.DISPLAYSURF.blit(self.ball, (self.ballx, self.bally))
        self.DISPLAYSURF.blit(self.randNumLabelA, (20, 15))
        self.DISPLAYSURF.blit(self.randNumLabelB, (450, 15))
        pygame.display.update()
        self.fpsClock.tick(self.FPS)

        for event in pygame.event.get():
            if event.type == QUIT:
                # self.AgentA.model.save('models/AgentA.h5')
                # self.AgentB.model.save('models/AgentB.h5')
                pygame.quit()
                sys.exit()
        return

    def step(self, action):
        # stepOutput: reward, next_state, done
        # action represents the next player to play; action can be
        # {playerA: 0, playerB: 1}

        # display team players
        self.PLAYERA = pygame.image.load('Images/padB.png')
        self.PLAYERA = pygame.transform.scale(self.PLAYERA, (50, 50))
        self.PLAYERB = pygame.image.load('Images/padA.png')
        self.PLAYERB = pygame.transform.scale(self.PLAYERB, (50, 50))
        self.ball = pygame.image.load('Images/ball.png')
        self.ball = pygame.transform.scale(self.ball, (15, 15))

        self.playerax = 150
        self.playerbx = 250
        self.ballx = 250
        self.bally = 300

        # player A starts by playing with state 0
        obsA, rewardA, doneA, infoA = 0, False, False, ''
        obsB, rewardB, doneB, infoB = 0, False, False, ''
        state = 0
        stateA = 0
        stateB = 0
        next_stateA = 0
        next_stateB = 0
        iteration = self.iteration
        actionA = 0
        actionB = 0
        restart = False

        self.display()
        self.randNumLabelA = self.myFontA.render(
            'Score A: ' + str(self.updateRewardA), 1, self.BLACK)
        self.randNumLabelB = self.myFontB.render(
            'Score B: ' + str(self.updateRewardB), 1, self.BLACK)

        nextplayer = self.nextplayer

        if self.nextplayer == 'A':
            for count in range(50):
                if count == 0:
                    output = self.execute(state, iteration, count,
                                          player=nextplayer)
                    q_valueA, actionA, obsA, rewardA, doneA, infoA, \
                        next_stateA, actionA, stateA = output
                    state = next_stateA
                elif count == 49:
                    output = self.execute(state, iteration, count, player='A')
                    q_valueA, actionA, obsA, rewardA, doneA, infoA, \
                        next_stateA, actionA, stateA = output
                    state = next_stateA

                    self.updateRewardA += rewardA
                    self.computeLoss(rewardA, loss='A')

                    # restart the game if player A fails to get the ball,
                    # and let B start the game
                    if rewardA == 0:
                        self.restart = True
                        time.sleep(0.5)
                        self.nextplayer = 'B'
                        self.GeneralReward = False
                    else:
                        self.restart = False
                        self.GeneralReward = True

                    # Sample memories and use the target DQN to produce
                    # the target Q-Value
                    self.trainOnlineDQN(player='A')
                    self.nextplayer = 'B'
                    self.updateIter += 1
                else:
                    output = self.execute(state, iteration, count, player='A')
                    q_valueA, actionA, obsA, rewardA, doneA, infoA, \
                        next_stateA, actionA, stateA = output
                    state = next_stateA

                stepOutput = rewardA, next_stateA, doneA
                self.show_board()
        else:
            for count in range(50):
                if count == 0:
                    output = self.execute(state, iteration, count, player='B')
                    q_valueB, actionB, obsB, rewardB, doneB, infoB, \
                        next_stateB, actionB, stateB = output
                    state = next_stateB
                elif count == 49:
                    output = self.execute(state, iteration, count, player='B')
                    q_valueB, actionB, obsB, rewardB, doneB, infoB, \
                        next_stateB, actionB, stateB = output
                    state = next_stateB

                    self.updateRewardB += rewardB
                    self.computeLoss(rewardB, loss='B')

                    # restart the game if player B fails to get the ball,
                    # and let A start the game
                    if rewardB == 0:
                        self.restart = True
                        time.sleep(0.5)
                        self.GeneralReward = False
                        self.nextplayer = 'A'
                    else:
                        self.restart = False
                        self.GeneralReward = True

                    # Sample memories and use the target DQN to produce
                    # the target Q-Value
                    self.trainOnlineDQN(player='B')
                    self.nextplayer = 'A'
                    self.updateIter += 1
                # evaluate B
                else:
                    output = self.execute(state, iteration, count, player='B')
                    q_valueB, actionB, obsB, rewardB, doneB, infoB, \
                        next_stateB, actionB, stateB = output
                    state = next_stateB

                # the original returned the A-side variables here, which are
                # never set in this branch; B's result is returned instead
                stepOutput = rewardB, next_stateB, doneB
                self.show_board()

        self.iteration += 1  # keep track of the total number of iterations conducted
        return stepOutput
def trainSQL0(file_name="SQL0", env=GridworldEnv(1), batch_size=128,
              gamma=0.999, beta=5, eps_start=0.9, eps_end=0.05, eps_decay=1000,
              is_plot=False, num_episodes=200, max_num_steps_per_episode=1000,
              learning_rate=0.0001, memory_replay_size=10000, n_step=10,
              target_update=10):
    """
    Soft Q-learning training routine when an observation vector is the input.
    Returns rewards and durations logs.
    """
    num_actions = env.action_space.n
    input_size = env.observation_space.shape[0]
    model = DQN(input_size, num_actions)
    target_model = DQN(input_size, num_actions)
    target_model.load_state_dict(model.state_dict())

    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    # optimizer = optim.RMSprop(model.parameters(), )

    use_cuda = torch.cuda.is_available()
    if use_cuda:
        model.cuda()

    memory = ReplayMemory(memory_replay_size, n_step, gamma)

    episode_durations = []
    mean_durations = []
    episode_rewards = []
    mean_rewards = []

    steps_done, t = 0, 0

    for i_episode in range(num_episodes):
        if i_episode % 20 == 0:
            clear_output()
        if i_episode != 0:
            print("Cur episode:", i_episode, "steps done:",
                  episode_durations[-1], "exploration factor:",
                  eps_end + (eps_start - eps_end) *
                  math.exp(-1. * steps_done / eps_decay),
                  "reward:", env.episode_total_reward)

        # Initialize the environment and state
        state = torch.from_numpy(env.reset()).type(torch.FloatTensor).view(
            -1, input_size)

        for t in count():
            # Select and perform an action
            action = select_action(state, model, num_actions, eps_start,
                                   eps_end, eps_decay, steps_done)
            next_state_tmp, reward, done, _ = env.step(action[0, 0])
            reward = Tensor([reward])

            # Observe new state
            next_state = torch.from_numpy(next_state_tmp).type(
                torch.FloatTensor).view(-1, input_size)

            if done:
                next_state = None

            # Store the transition in memory
            memory.push(model, target_model, state, action, next_state, reward)

            # Move to the next state
            state = next_state
            # plot_state(state)
            # env.render()

            # Perform one step of the optimization (on the target network)
            optimize_model(model, target_model, optimizer, memory, batch_size,
                           gamma, beta)  #### Difference w.r.t DQN

            if done or t + 1 >= max_num_steps_per_episode:
                episode_durations.append(t + 1)
                episode_rewards.append(env.episode_total_reward)
                ##### Modify for OpenAI envs such as CartPole
                if is_plot:
                    plot_durations(episode_durations, mean_durations)
                    plot_rewards(episode_rewards, mean_rewards)
                steps_done += 1
                break

        if i_episode % target_update == 0 and i_episode != 0:
            target_model.load_state_dict(model.state_dict())

    print('Complete')
    env.render(close=True)
    env.close()
    if is_plot:
        plt.ioff()
        plt.show()

    ## Store Results
    np.save(file_name + '-sql0-rewards', episode_rewards)
    np.save(file_name + '-sql0-durations', episode_durations)

    return model, episode_rewards, episode_durations
def train():
    print('Waking up the brain cells..')
    sess = tf.Session()

    game = Game(SCREEN_WIDTH, SCREEN_HEIGHT, show_game=False)
    brain = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, NUM_ACTION)

    rewards = tf.placeholder(tf.float32, [None])
    tf.summary.scalar('avg.reward/ep.', tf.reduce_mean(rewards))

    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())

    writer = tf.summary.FileWriter('logs', sess.graph)
    summary_merged = tf.summary.merge_all()

    brain.update_target_network()

    epsilon = 1.0
    time_step = 0
    total_reward_list = []

    # Start the game.
    for episode in range(MAX_EPISODE):
        terminal = False
        total_reward = 0

        state = game.reset()
        brain.init_state(state)

        while not terminal:
            if np.random.rand() < epsilon:
                action = random.randrange(NUM_ACTION)
            else:
                action = brain.get_action()

            if episode > OBSERVE:
                epsilon -= 1 / 1000

            state, reward, terminal = game.step(action)
            total_reward += reward

            brain.remember(state, action, reward, terminal)

            if time_step > OBSERVE and time_step % TRAIN_INTERVAL == 0:
                brain.train()

            if time_step % TARGET_UPDATE_INTERVAL == 0:
                brain.update_target_network()

            time_step += 1

        print('Games played: %d Score: %d' % (episode + 1, total_reward))

        total_reward_list.append(total_reward)

        if episode % 10 == 0:
            summary = sess.run(summary_merged,
                               feed_dict={rewards: total_reward_list})
            writer.add_summary(summary, time_step)
            total_reward_list = []

        if episode % 100 == 0:
            saver.save(sess, 'model/dqn.ckpt', global_step=time_step)
class Player:
    def __init__(self, name):
        self.name = name
        self.initializeProperties()
        self.QNetwork = DQN(self.imageSize, "QN", self.miniBatchSize)
        self.TDTarget = DQN(self.imageSize, "TD", self.miniBatchSize)
        self.sess = tf.Session()
        self.QNetwork.setSess(self.sess)
        self.TDTarget.setSess(self.sess)
        self.sess.run(tf.global_variables_initializer())
        self.synchronise()

    def initializeProperties(self):
        # Q Network constants
        self.imageSize = 80
        self.synchronisationPeriod = 500
        # Constants
        self.explorationRate = 0.999
        # Behaviour when playing & training
        self.trainable = True
        self.exploiting = False
        # Statistics
        self.score = 0
        # Training
        self.trainingData = []
        self.maxBatchSize = 10000  # trainingData will not have more than maxBatchSize elements
        self.miniBatchSize = 32
        self.miniBatch = []
        self.startTraining = 1000  # training happens iff we have more than startTraining data in trainingData
        print("Properties initialized")

    def training(self, step):
        if not self.trainable or len(self.trainingData) < self.startTraining:
            return
        if step % self.synchronisationPeriod == 0:
            self.synchronise()
        self.miniBatch = random.sample(self.trainingData, self.miniBatchSize)
        states, actions, rewards, nextStates = zip(*self.miniBatch)
        output = self.TDTarget.computeTarget(nextStates, rewards)
        self.QNetwork.training(states, output, actions)

    def play(self):
        if self.exploiting or random.random() > self.explorationRate:
            return self.QNetwork.evaluate(self.buffer)
        else:
            return int(random.random() < 0.9)

    def updateConstants(self, learningRate=None, explorationRate=None):
        self.QNetwork.updateConstants(learningRate)
        if not isinstance(explorationRate, type(None)):
            self.explorationRate = explorationRate

    def resetStats(self):
        self.score = 0

    def updateStats(self, reward):
        if reward == 1:
            self.score += 1

    def displayStats(self):
        # print("{} victories & {} defeats".format(self.gamesWon, self.gamesLost))
        print(self.score)

    def addStateSequence(self, action, reward, nS):
        # nS = np.transpose(nS, [1, 2, 0])
        if self.trainable:
            self.trainingData.append([self.buffer, action, reward, nS])
            while len(self.trainingData) > self.maxBatchSize:
                del self.trainingData[0]
        self.buffer = nS

    def saveQNetwork(self, path, global_step=None):
        self.QNetwork.saveQNetwork(path, global_step)

    def restoreQNetwork(self, path, global_step=None):
        self.QNetwork.restoreQNetwork(path, global_step)

    def setBehaviour(self, isTraining):
        self.trainable = isTraining
        self.exploiting = not isTraining

    def synchronise(self):
        e1_params = [t for t in tf.trainable_variables()
                     if t.name.startswith(self.QNetwork.scope)]
        e1_params = sorted(e1_params, key=lambda v: v.name)
        e2_params = [t for t in tf.trainable_variables()
                     if t.name.startswith(self.TDTarget.scope)]
        e2_params = sorted(e2_params, key=lambda v: v.name)
        update_ops = []
        for e1_v, e2_v in zip(e1_params, e2_params):
            op = e2_v.assign(e1_v)
            update_ops.append(op)
        self.sess.run(update_ops)
class Agent:
    def __init__(
        self,
        state_size,
        action_size,
        n_agents,
        buffer_size: int = 1e5,
        batch_size: int = 256,
        gamma: float = 0.995,
        tau: float = 1e-3,
        learning_rate: float = 7e-4,
        update_every: int = 4,
    ):
        """
        Initialize DQN agent using the agent-experience buffer

        Args:
            state_size (int): Size of the state observation returned by the environment
            action_size (int): Action space size
            n_agents (int): Number of agents in the environment
            buffer_size (int): Desired total experience buffer size
            batch_size (int): Mini-batch size
            gamma (float): Discount factor
            tau (float): For soft update of target parameters
            learning_rate (float): Learning rate
            update_every (int): Number of steps before target network update
        """
        self.state_size = state_size
        self.action_size = action_size
        self.n_agents = n_agents

        # Q-Networks
        self.policy_net = DQN(state_size, action_size).to(device)
        self.target_net = DQN(state_size, action_size).to(device)

        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=learning_rate)
        self.memory = AgentReplayMemory(buffer_size, n_agents, state_size, device)
        self.t_step = 0
        self.update_every = update_every
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau

    def step(self, states, actions, rewards, next_steps, done):
        self.memory.push_agent_actions(states, actions, rewards, next_steps, done)
        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0:
            if self.memory.at_capacity():
                experience = self.memory.sample(self.batch_size)
                self.learn(experience, self.gamma)

    def act(self, states, eps=0):
        states = torch.from_numpy(states).float().to(device)
        self.policy_net.eval()
        with torch.no_grad():
            action_values = self.policy_net(states)
        self.policy_net.train()

        r = np.random.random(size=self.n_agents)
        action_values = np.argmax(action_values.cpu().data.numpy(), axis=1)
        random_choices = np.random.randint(0, self.action_size, size=self.n_agents)
        return np.where(r > eps, action_values, random_choices)

    def learn(self, experiences, gamma):
        states, actions, rewards, next_states, dones = experiences
        criterion = torch.nn.MSELoss()
        self.policy_net.train()
        self.target_net.eval()

        # shape of output from the model (batch_size, action_dim) = (64, 4)
        predicted_targets = self.policy_net(states).gather(1, actions)

        with torch.no_grad():
            # .detach() -> Returns a new Tensor, detached from the current graph.
            labels_next = self.target_net(next_states).detach().max(1)[0].unsqueeze(1)

        labels = rewards + (gamma * labels_next * (1 - dones))

        loss = criterion(predicted_targets, labels).to(device)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.policy_net, self.target_net, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """
        Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Args:
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1 - tau) * target_param.data)
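# Hedged usage sketch (not part of the original source): one way to drive the
# multi-agent Agent above. The environment is faked with random arrays because
# the source does not show the wrapper it was written for; state and action
# sizes are illustrative assumptions.
import numpy as np

state_size, action_size, n_agents = 37, 4, 2
agent = Agent(state_size, action_size, n_agents, batch_size=64)

eps = 1.0
for episode in range(10):
    states = np.random.randn(n_agents, state_size).astype(np.float32)  # fake reset
    for t in range(100):
        actions = agent.act(states, eps)                       # epsilon-greedy per agent
        next_states = np.random.randn(n_agents, state_size).astype(np.float32)
        rewards = np.zeros(n_agents, dtype=np.float32)         # placeholder feedback
        dones = np.zeros(n_agents, dtype=np.float32)
        agent.step(states, actions, rewards, next_states, dones)
        states = next_states
    eps = max(0.01, eps * 0.995)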
class TrainNQL: def __init__(self, epi, cfg=dcfg, validation=False): #cpu or cuda torch.cuda.empty_cache() self.device = cfg.device #torch.device("cuda" if torch.cuda.is_available() else "cpu") self.state_dim = cfg.proc_frame_size #State dimensionality 84x84. self.state_size = cfg.state_size #self.t_steps= tsteps self.t_eps = cfg.t_eps self.minibatch_size = cfg.minibatch_size # Q-learning parameters self.discount = cfg.discount #Discount factor. self.replay_memory = cfg.replay_memory self.bufferSize = cfg.bufferSize self.target_q = cfg.target_q self.validation = validation if (validation): self.episode = epi else: self.episode = int(epi) - 1 self.cfg = cfg modelGray = 'results/ep' + str(self.episode) + '/modelGray.net' modelDepth = 'results/ep' + str(self.episode) + '/modelDepth.net' tModelGray = 'results/ep' + str(self.episode) + '/tModelGray.net' tModelDepth = 'results/ep' + str(self.episode) + '/tModelDepth.net' if os.path.exists(modelGray) and os.path.exists(modelDepth): print("Loading model") self.gray_policy_net = torch.load(modelGray).to(self.device) self.gray_target_net = torch.load(tModelGray).to(self.device) self.depth_policy_net = torch.load(modelDepth).to(self.device) self.depth_target_net = torch.load(tModelDepth).to(self.device) else: print("New model") self.gray_policy_net = DQN(noutputs=cfg.noutputs, nfeats=cfg.nfeats, nstates=cfg.nstates, kernels=cfg.kernels, strides=cfg.strides, poolsize=cfg.poolsize).to(self.device) self.gray_target_net = DQN(noutputs=cfg.noutputs, nfeats=cfg.nfeats, nstates=cfg.nstates, kernels=cfg.kernels, strides=cfg.strides, poolsize=cfg.poolsize).to(self.device) self.depth_policy_net = DQN(noutputs=cfg.noutputs, nfeats=cfg.nfeats, nstates=cfg.nstates, kernels=cfg.kernels, strides=cfg.strides, poolsize=cfg.poolsize).to(self.device) self.depth_target_net = DQN(noutputs=cfg.noutputs, nfeats=cfg.nfeats, nstates=cfg.nstates, kernels=cfg.kernels, strides=cfg.strides, poolsize=cfg.poolsize).to(self.device) if not validation and self.target_q and self.episode % self.target_q == 0: print("cloning") self.depth_policy_net = DQN(noutputs=cfg.noutputs, nfeats=cfg.nfeats, nstates=cfg.nstates, kernels=cfg.kernels, strides=cfg.strides, poolsize=cfg.poolsize).to(self.device) self.depth_target_net = DQN(noutputs=cfg.noutputs, nfeats=cfg.nfeats, nstates=cfg.nstates, kernels=cfg.kernels, strides=cfg.strides, poolsize=cfg.poolsize).to(self.device) self.gray_target_net.load_state_dict(self.gray_target_net.state_dict()) self.gray_target_net.eval() self.depth_target_net.load_state_dict( self.depth_target_net.state_dict()) self.depth_target_net.eval() self.gray_optimizer = optim.RMSprop(self.gray_policy_net.parameters()) self.depth_optimizer = optim.RMSprop( self.depth_policy_net.parameters()) self.memory = ReplayMemory(self.replay_memory) def get_tensor_from_image(self, file): convert = T.Compose([ T.ToPILImage(), T.Resize((self.state_dim, self.state_dim), interpolation=Image.BILINEAR), T.ToTensor() ]) screen = Image.open(file) screen = np.ascontiguousarray(screen, dtype=np.float32) / 255 screen = torch.from_numpy(screen) screen = convert(screen).unsqueeze(0).to(self.device) return screen def get_data(self, episode, tsteps): #images=torch.Tensor(tsteps,self.state_size,self.state_dim,self.state_dim).to(self.device) #depths=torch.Tensor(tsteps,self.state_size,self.state_dim,self.state_dim).to(self.device) images = [] depths = [] dirname_rgb = 'dataset/RGB/ep' + str(episode) dirname_dep = 'dataset/Depth/ep' + str(episode) for step in range(tsteps): 
#proc_image=torch.Tensor(self.state_size,self.state_dim,self.state_dim).to(self.device) #proc_depth=torch.Tensor(self.state_size,self.state_dim,self.state_dim).to(self.device) proc_image = [] proc_depth = [] dirname_rgb = 'dataset/RGB/ep' + str(episode) dirname_dep = 'dataset/Depth/ep' + str(episode) for i in range(self.state_size): grayfile = dirname_rgb + '/image_' + str(step + 1) + '_' + str( i + 1) + '.png' depthfile = dirname_dep + '/depth_' + str( step + 1) + '_' + str(i + 1) + '.png' #proc_image[i] = self.get_tensor_from_image(grayfile) #proc_depth[i] = self.get_tensor_from_image(depthfile) proc_image.append(grayfile) proc_depth.append(depthfile) #images[step]=proc_image #depths[step]=proc_depth images.append(proc_image) depths.append(proc_depth) return images, depths def load_data(self): rewards = torch.load('files/reward_history.dat') actions = torch.load('files/action_history.dat') ep_rewards = torch.load('files/ep_rewards.dat') print("Loading images") best_scores = range(len(actions)) buffer_selection_mode = 'default' if (buffer_selection_mode == 'success_handshake'): eps_values = [] for i in range(len(actions)): hspos = 0 hsneg = 0 for step in range(len(actions[i])): if (len(actions[i]) > 0): if actions[i][step] == 3: if rewards[i][step] > 0: hspos = hspos + 1 elif rewards[i][step] == -0.1: hsneg = hsneg + 1 accuracy = float(((hspos) / (hspos + hsneg))) eps_values.append(accuracy) best_scores = np.argsort(eps_values) for i in best_scores: print('Ep: ', i + 1) dirname_gray = 'dataset/RGB/ep' + str(i + 1) dirname_dep = 'dataset/Depth/ep' + str(i + 1) files = [] if (os.path.exists(dirname_gray)): files = os.listdir(dirname_gray) k = 0 for file in files: if re.match(r"image.*\.png", file): k = k + 1 k = int(k / 8) while (k % 4 != 0): k = k - 1 if (k > self.bufferSize): k = self.bufferSize print(k) #os.system("free -h") #with torch.no_grad(): images, depths = self.get_data(i + 1, k) print("Loading done") for step in range(k - 1): #print(len(rewards),i) #print(len(rewards[i]), step) reward = self.cfg.neutral_reward if rewards[i][step] >= 1: reward = self.cfg.hs_success_reward elif rewards[i][step] < 0: reward = self.cfg.hs_fail_reward reward = torch.tensor([reward], device=self.device) action = torch.tensor([[actions[i][step]]], device=self.device, dtype=torch.long) #image = images[step].unsqueeze(0).to(self.device) #depth = depths[step].unsqueeze(0).to(self.device) #next_image = images[step+1].unsqueeze(0).to(self.device) #next_depth = depths[step+1].unsqueeze(0).to(self.device) image = images[step] depth = depths[step] next_image = images[step + 1] next_depth = depths[step + 1] self.memory.push(image, depth, action, next_image, next_depth, reward) #print("Memory size: ",getsizeof(self.memory)) #torch.cuda.empty_cache() def train(self): if len(self.memory) < self.minibatch_size: return for i in range(0, len(self.memory), self.minibatch_size): #transitions = self.memory.sample(self.minibatch_size) transitions = self.memory.pull(self.minibatch_size) print('Batch train: ' + str(int(i / self.minibatch_size) + 1) + "/" + str(int(len(self.memory) / self.minibatch_size) + 1)) aux_transitions = [] for t in transitions: proc_sgray = torch.Tensor(self.state_size, self.state_dim, self.state_dim).to(self.device) proc_sdepth = torch.Tensor(self.state_size, self.state_dim, self.state_dim).to(self.device) proc_next_sgray = torch.Tensor(self.state_size, self.state_dim, self.state_dim).to(self.device) proc_next_sdepth = torch.Tensor(self.state_size, self.state_dim, self.state_dim).to(self.device) 
                count = 0
                for sgray, sdepth, next_sgray, next_sdepth in zip(
                        t.sgray, t.sdepth, t.next_sgray, t.next_sdepth):
                    proc_sgray[count] = self.get_tensor_from_image(sgray)
                    proc_sdepth[count] = self.get_tensor_from_image(sdepth)
                    proc_next_sgray[count] = self.get_tensor_from_image(next_sgray)
                    proc_next_sdepth[count] = self.get_tensor_from_image(next_sdepth)
                    count += 1
                proc_sgray = proc_sgray.unsqueeze(0).to(self.device)
                proc_sdepth = proc_sdepth.unsqueeze(0).to(self.device)
                proc_next_sgray = proc_next_sgray.unsqueeze(0).to(self.device)
                proc_next_sdepth = proc_next_sdepth.unsqueeze(0).to(self.device)
                # ('sgray', 'sdepth', 'action', 'next_sgray', 'next_sdepth', 'reward')
                one_transition = Transition(proc_sgray, proc_sdepth, t.action,
                                            proc_next_sgray, proc_next_sdepth, t.reward)
                aux_transitions.append(one_transition)
            transitions = aux_transitions

            # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
            # detailed explanation). This converts a batch-array of Transitions
            # into a Transition of batch-arrays.
            batch = Transition(*zip(*transitions))
            # print(batch.sgray)

            # Compute a mask of non-final states and concatenate the batch elements
            # (a final state would've been the one after which simulation ended).
            gray_non_final_mask = torch.tensor(tuple(map(lambda s: s is not None, batch.next_sgray)),
                                               device=self.device, dtype=torch.bool)
            gray_non_final_next_states = torch.cat([s for s in batch.next_sgray if s is not None])
            depth_non_final_mask = torch.tensor(tuple(map(lambda s: s is not None, batch.next_sdepth)),
                                                device=self.device, dtype=torch.bool)
            depth_non_final_next_states = torch.cat([s for s in batch.next_sdepth if s is not None])

            sgray_batch = torch.cat(batch.sgray)
            sdepth_batch = torch.cat(batch.sdepth)
            action_batch = torch.cat(batch.action)
            reward_batch = torch.cat(batch.reward)

            # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
            # columns of actions taken. These are the actions which would've been taken
            # for each batch state according to the policy net.
            sgray_action_values = self.gray_policy_net(sgray_batch).gather(1, action_batch)
            sdepth_action_values = self.depth_policy_net(sdepth_batch).gather(1, action_batch)

            # Compute V(s_{t+1}) for all next states.
            # Expected values of actions for non_final_next_states are computed based
            # on the "older" target_net; selecting their best reward with max(1)[0].
            # This is merged based on the mask, such that we'll have either the expected
            # state value or 0 in case the state was final.
            next_sgray_values = torch.zeros(self.minibatch_size, device=self.device)
            next_sgray_values[gray_non_final_mask] = self.gray_target_net(
                gray_non_final_next_states).max(1)[0].detach()
            next_sdepth_values = torch.zeros(self.minibatch_size, device=self.device)
            next_sdepth_values[depth_non_final_mask] = self.depth_target_net(
                depth_non_final_next_states).max(1)[0].detach()

            # Compute the expected Q values
            expected_sgray_action_values = (next_sgray_values * self.discount) + reward_batch
            expected_sdepth_action_values = (next_sdepth_values * self.discount) + reward_batch

            # Compute Huber loss
            gray_loss = F.smooth_l1_loss(sgray_action_values,
                                         expected_sgray_action_values.unsqueeze(1))
            depth_loss = F.smooth_l1_loss(sdepth_action_values,
                                          expected_sdepth_action_values.unsqueeze(1))

            # Optimize the model
            self.gray_optimizer.zero_grad()
            gray_loss.backward()
            for param in self.gray_policy_net.parameters():
                param.grad.data.clamp_(-1, 1)
            self.gray_optimizer.step()

            # Optimize the model
            self.depth_optimizer.zero_grad()
            depth_loss.backward()
            for param in self.depth_policy_net.parameters():
                param.grad.data.clamp_(-1, 1)
            self.depth_optimizer.step()
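A minimal sketch of how TrainNQL might be driven per training episode, assuming the dcfg config used above; `episode` is a placeholder counter, and the save paths simply mirror the ones the constructor loads from. This driver is not part of the original file.

# Hypothetical driver loop (episode is a placeholder variable).
trainer = TrainNQL(epi=episode, cfg=dcfg)
trainer.load_data()
trainer.train()
save_dir = 'results/ep' + str(episode)
os.makedirs(save_dir, exist_ok=True)
torch.save(trainer.gray_policy_net, save_dir + '/modelGray.net')
torch.save(trainer.depth_policy_net, save_dir + '/modelDepth.net')
torch.save(trainer.gray_target_net, save_dir + '/tModelGray.net')
torch.save(trainer.depth_target_net, save_dir + '/tModelDepth.net')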
def trainD(file_name="Distral_1col", list_of_envs=[GridworldEnv(4), GridworldEnv(5)],
           batch_size=128, gamma=0.999, alpha=0.9, beta=5, eps_start=0.9, eps_end=0.05,
           eps_decay=5, is_plot=False, num_episodes=200, max_num_steps_per_episode=1000,
           learning_rate=0.001, memory_replay_size=10000, memory_policy_size=1000):
    """
    Soft Q-learning training routine. Returns rewards and durations logs.
    Plots the environment screen.
    """
    num_actions = list_of_envs[0].action_space.n
    num_envs = len(list_of_envs)
    policy = PolicyNetwork(num_actions)
    models = [DQN(num_actions) for _ in range(0, num_envs)]  ### Add torch.nn.ModuleList (?)
    memories = [ReplayMemory(memory_replay_size, memory_policy_size)
                for _ in range(0, num_envs)]

    use_cuda = torch.cuda.is_available()
    if use_cuda:
        policy.cuda()
        for model in models:
            model.cuda()

    optimizers = [optim.Adam(model.parameters(), lr=learning_rate) for model in models]
    policy_optimizer = optim.Adam(policy.parameters(), lr=learning_rate)
    # optimizer = optim.RMSprop(model.parameters(), )

    episode_durations = [[] for _ in range(num_envs)]
    episode_rewards = [[] for _ in range(num_envs)]

    steps_done = np.zeros(num_envs)
    episodes_done = np.zeros(num_envs)
    current_time = np.zeros(num_envs)

    # Initialize environments
    for env in list_of_envs:
        env.reset()

    while np.min(episodes_done) < num_episodes:
        # TODO: add max_num_steps_per_episode
        # Optimization is given by an alternating minimization scheme:
        #   1. do the step for each env
        #   2. do one optimization step for each env using "soft-q-learning".
        #   3. do one optimization step for the policy
        for i_env, env in enumerate(list_of_envs):
            # print("Cur episode:", i_episode, "steps done:", steps_done,
            #       "exploration factor:", eps_end + (eps_start - eps_end) *
            #       math.exp(-1. * steps_done / eps_decay))
            # last_screen = env.current_grid_map
            current_screen = get_screen(env)
            state = current_screen  # - last_screen

            # Select and perform an action
            action = select_action(state, policy, models[i_env], num_actions,
                                   eps_start, eps_end, eps_decay,
                                   episodes_done[i_env], alpha, beta)
            steps_done[i_env] += 1
            current_time[i_env] += 1
            _, reward, done, _ = env.step(action[0, 0])
            reward = Tensor([reward])

            # Observe new state
            last_screen = current_screen
            current_screen = get_screen(env)
            if not done:
                next_state = current_screen  # - last_screen
            else:
                next_state = None

            # Store the transition in memory
            time = Tensor([current_time[i_env]])
            memories[i_env].push(state, action, next_state, reward, time)

            # Perform one step of the optimization (on the target network)
            optimize_model(policy, models[i_env], optimizers[i_env],
                           memories[i_env], batch_size, alpha, beta, gamma)

            if done:
                print("ENV:", i_env, "iter:", episodes_done[i_env],
                      "\treward:", env.episode_total_reward,
                      "\tit:", current_time[i_env], "\texp_factor:",
                      eps_end + (eps_start - eps_end) * math.exp(-1. * episodes_done[i_env] / eps_decay))
                env.reset()
                episodes_done[i_env] += 1
                episode_durations[i_env].append(current_time[i_env])
                current_time[i_env] = 0
                episode_rewards[i_env].append(env.episode_total_reward)
                if is_plot:
                    plot_rewards(episode_rewards, i_env)

        optimize_policy(policy, policy_optimizer, memories, batch_size,
                        num_envs, gamma)

    print('Complete')
    env.render(close=True)
    env.close()
    if is_plot:
        plt.ioff()
        plt.show()

    ## Store Results
    np.save(file_name + '-distral-2col-rewards', episode_rewards)
    np.save(file_name + '-distral-2col-durations', episode_durations)

    return models, policy, episode_rewards, episode_durations
    return gray


# epsilon greedy
def pick_action(observation, net):
    if random.random() < epsilon:
        return random.randint(0, num_actions - 1)
    action = torch.argmax(net(torch.tensor(observation).float().unsqueeze(0)))
    return action


net = DQN()
net.load_state_dict(torch.load("model.h5", map_location="cpu"))
criterion = nn.MSELoss()
optimizer = optim.Adam(net.parameters(), lr=0.01)
starttime = time.time()
buffer = collections.deque(maxlen=N)
lr = 1e-3

for i in range(num_episodes):
    observation = env.reset()
    observation = preprocess(observation)
    observation = [observation, observation, observation, observation]
    j = 0
    while True:
def trainDQN(file_name="DQN", env=GridworldEnv(1), batch_size=128, gamma=0.999,
             eps_start=0.9, eps_end=0.05, eps_decay=1000, is_plot=False,
             num_episodes=500, max_num_steps_per_episode=1000,
             learning_rate=0.0001, memory_replay_size=10000):
    """
    DQN training routine. Returns rewards and durations logs.
    Plots the environment screen.
    """
    if is_plot:
        env.reset()
        plt.ion()
        plt.figure()
        plt.imshow(get_screen(env).cpu().squeeze(0).squeeze(0).numpy(),
                   interpolation='none')
        plt.title("")
        plt.draw()
        plt.pause(0.00001)

    num_actions = env.action_space.n
    model = DQN(num_actions)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    use_cuda = torch.cuda.is_available()
    if use_cuda:
        model.cuda()

    memory = ReplayMemory(memory_replay_size)

    episode_durations = []
    mean_durations = []
    episode_rewards = []
    mean_rewards = []

    steps_done = 0  # total steps
    for i_episode in range(num_episodes):
        if i_episode % 20 == 0:
            clear_output()
            print("Cur episode:", i_episode, "steps done:", steps_done,
                  "exploration factor:",
                  eps_end + (eps_start - eps_end) * math.exp(-1. * steps_done / eps_decay))
        # Initialize the environment and state
        env.reset()
        # last_screen = env.current_grid_map  # (1, 1, 8, 8)
        current_screen = get_screen(env)
        state = current_screen  # - last_screen
        for t in count():
            # Select and perform an action
            action = select_action(state, model, num_actions,
                                   eps_start, eps_end, eps_decay, steps_done)
            steps_done += 1
            _, reward, done, _ = env.step(action[0, 0])
            reward = Tensor([reward])

            # Observe new state
            last_screen = current_screen
            current_screen = get_screen(env)
            if not done:
                next_state = current_screen  # - last_screen
            else:
                next_state = None

            # Store the transition in memory
            memory.push(state, action, next_state, reward)

            # Move to the next state
            state = next_state
            # plot_state(state)
            # env.render()

            # Perform one step of the optimization (on the target network)
            optimize_model(model, optimizer, memory, batch_size, gamma)

            if done or t + 1 >= max_num_steps_per_episode:
                episode_durations.append(t + 1)
                episode_rewards.append(env.episode_total_reward)
                if is_plot:
                    plot_durations(episode_durations, mean_durations)
                    plot_rewards(episode_rewards, mean_rewards)
                break

    print('Complete')
    env.render(close=True)
    env.close()
    if is_plot:
        plt.ioff()
        plt.show()

    ## Store Results
    np.save(file_name + '-dqn-rewards', episode_rewards)
    np.save(file_name + '-dqn-durations', episode_durations)

    return model, episode_rewards, episode_durations
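The select_action helper is not shown in this file; the sketch below is a hedged guess at the standard epsilon-greedy version implied by the exploration-factor formula printed above. It assumes math, random, and torch are already imported; the body is an assumption, not the project's actual helper.

# Hypothetical epsilon-greedy action selection matching the signature used in trainDQN.
def select_action(state, model, num_actions, eps_start, eps_end, eps_decay, steps_done):
    # Same annealed epsilon as in the progress printout above.
    eps_threshold = eps_end + (eps_start - eps_end) * math.exp(-1. * steps_done / eps_decay)
    if random.random() > eps_threshold:
        with torch.no_grad():
            # Greedy action: index of the largest Q-value, shaped (1, 1) as env.step(action[0, 0]) expects.
            return model(state).max(1)[1].view(1, 1)
    return torch.tensor([[random.randrange(num_actions)]], dtype=torch.long)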
def trainD(file_name="Distral_1col", list_of_envs=[GridworldEnv(4), GridworldEnv(5)],
           batch_size=128, gamma=0.999, alpha=0.9, beta=5, eps_start=0.9, eps_end=0.05,
           eps_decay=5, is_plot=False, num_episodes=200, max_num_steps_per_episode=1000,
           learning_rate=0.001, memory_replay_size=10000, memory_policy_size=1000):
    """
    Soft Q-learning training routine. Returns rewards and durations logs.
    Plots the environment screen.
    """
    # action dimension
    num_actions = list_of_envs[0].action_space.n
    # total envs
    num_envs = len(list_of_envs)
    # pi_0
    policy = PolicyNetwork(num_actions)
    # Q value, one per environment, used to calculate A_i
    models = [DQN(num_actions) for _ in range(0, num_envs)]  ### Add torch.nn.ModuleList (?)
    # replay buffer for each env
    memories = [ReplayMemory(memory_replay_size, memory_policy_size)
                for _ in range(0, num_envs)]

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    # device = "cpu"
    print(device)

    # model
    policy = policy.to(device)
    for i in range(len(models)):
        models[i] = models[i].to(device)

    # optimizer for every Q model
    optimizers = [optim.Adam(model.parameters(), lr=learning_rate) for model in models]
    # optimizer for the policy
    policy_optimizer = optim.Adam(policy.parameters(), lr=learning_rate)
    # optimizer = optim.RMSprop(model.parameters(), )

    # info lists for each environment
    episode_durations = [[] for _ in range(num_envs)]  # list of local steps
    episode_rewards = [[] for _ in range(num_envs)]    # list of lists of episode rewards

    episodes_done = np.zeros(num_envs)  # episode count per env
    steps_done = np.zeros(num_envs)     # global timesteps for each env
    current_time = np.zeros(num_envs)   # local timesteps for each env

    # Initialize environments
    for env in list_of_envs:
        env.reset()

    while np.min(episodes_done) < num_episodes:
        policy.train()
        for model in models:
            model.train()

        # TODO: add max_num_steps_per_episode
        # Optimization is given by an alternating minimization scheme:
        #   1. do the step for each env
        #   2. do one optimization step for each env using "soft-q-learning".
        #   3. do one optimization step for the policy

        # 1. do the step for each env
        for i_env, env in enumerate(list_of_envs):
            # print("Cur episode:", i_episode, "steps done:", steps_done,
            #       "exploration factor:", eps_end + (eps_start - eps_end) *
            #       math.exp(-1. * steps_done / eps_decay))
            # last_screen = env.current_grid_map

            # =========== update step info begin ========================
            current_screen = get_screen(env)
            # state
            state = current_screen  # - last_screen
            # action chosen by pi_1 ~ pi_i
            action = select_action(state, policy, models[i_env], num_actions,
                                   eps_start, eps_end, eps_decay,
                                   episodes_done[i_env], alpha, beta, device)
            # global steps
            steps_done[i_env] += 1
            # local steps
            current_time[i_env] += 1
            # reward
            _, reward, done, _ = env.step(action[0, 0])
            reward = Tensor([reward])

            # next state
            last_screen = current_screen
            current_screen = get_screen(env)
            if not done:
                next_state = current_screen  # - last_screen
            else:
                next_state = None

            # add to buffer
            time = Tensor([current_time[i_env]])
            memories[i_env].push(state, action, next_state, reward, time)

            # 2. do one optimization step for each env using "soft-q-learning".
            # Perform one step of the optimization (on the target network)
            optimize_model(policy, models[i_env], optimizers[i_env],
                           memories[i_env], batch_size, alpha, beta, gamma, device)
            # =========== update step info end ========================

            # =========== update episode info begin ====================
            if done:
                print("ENV:", i_env, "iter:", episodes_done[i_env],
                      "\treward:", env.episode_total_reward,
                      "\tit:", current_time[i_env], "\texp_factor:",
                      eps_end + (eps_start - eps_end) * math.exp(-1. * episodes_done[i_env] / eps_decay))
                # reset env
                env.reset()
                # episode count
                episodes_done[i_env] += 1
                # append this episode's local timesteps to the per-env list
                episode_durations[i_env].append(current_time[i_env])
                # reset local timesteps
                current_time[i_env] = 0
                # append total episode reward to the list
                episode_rewards[i_env].append(env.episode_total_reward)
                if is_plot:
                    plot_rewards(episode_rewards, i_env)
            # =========== update episode info end ====================

        # 3. do one optimization step for the policy
        # after all envs have performed one step, optimize the policy
        optimize_policy(policy, policy_optimizer, memories, batch_size,
                        num_envs, gamma, device)

    print('Complete')
    env.render(close=True)
    env.close()
    if is_plot:
        plt.ioff()
        plt.show()

    ## Store Results
    np.save(file_name + '-distral-2col-rewards', episode_rewards)
    np.save(file_name + '-distral-2col-durations', episode_durations)

    return models, policy, episode_rewards, episode_durations
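A minimal usage sketch for the routine above (not part of the original): the two Gridworld tasks mirror the defaults, and the remaining argument values are illustrative.

# Hypothetical invocation of the Distral training routine.
models, policy, rewards, durations = trainD(
    file_name="Distral_2col",
    list_of_envs=[GridworldEnv(4), GridworldEnv(5)],
    num_episodes=200,
    is_plot=False)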
def test_dqn(args=get_args()):
    env = make_atari_env(args)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.env.action_space.shape or env.env.action_space.n
    # should be N_FRAMES x H x W
    print("Observations shape:", args.state_shape)
    print("Actions shape:", args.action_shape)
    # make environments
    train_envs = SubprocVectorEnv(
        [lambda: make_atari_env(args) for _ in range(args.training_num)])
    test_envs = SubprocVectorEnv(
        [lambda: make_atari_env_watch(args) for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    train_envs.seed(args.seed)
    test_envs.seed(args.seed)
    # define model
    net = DQN(*args.state_shape, args.action_shape, args.device).to(args.device)
    optim = torch.optim.Adam(net.parameters(), lr=args.lr)
    # define policy
    policy = DQNPolicy(net, optim, args.gamma, args.n_step,
                       target_update_freq=args.target_update_freq)
    # load a previous policy
    if args.resume_path:
        policy.load_state_dict(torch.load(args.resume_path, map_location=args.device))
        print("Loaded agent from: ", args.resume_path)
    # replay buffer: `save_last_obs` and `stack_num` can be removed together
    # when you have enough RAM
    buffer = VectorReplayBuffer(args.buffer_size, buffer_num=len(train_envs),
                                ignore_obs_next=True, save_only_last_obs=True,
                                stack_num=args.frames_stack)
    # collector
    train_collector = Collector(policy, train_envs, buffer, exploration_noise=True)
    test_collector = Collector(policy, test_envs, exploration_noise=True)
    # log
    log_path = os.path.join(args.logdir, args.task, 'dqn')
    writer = SummaryWriter(log_path)
    writer.add_text("args", str(args))
    logger = BasicLogger(writer)

    def save_fn(policy):
        torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))

    def stop_fn(mean_rewards):
        if env.env.spec.reward_threshold:
            return mean_rewards >= env.spec.reward_threshold
        elif 'Pong' in args.task:
            return mean_rewards >= 20
        else:
            return False

    def train_fn(epoch, env_step):
        # nature DQN setting, linear decay in the first 1M steps
        if env_step <= 1e6:
            eps = args.eps_train - env_step / 1e6 * \
                (args.eps_train - args.eps_train_final)
        else:
            eps = args.eps_train_final
        policy.set_eps(eps)
        logger.write('train/eps', env_step, eps)

    def test_fn(epoch, env_step):
        policy.set_eps(args.eps_test)

    # watch agent's performance
    def watch():
        print("Setup test envs ...")
        policy.eval()
        policy.set_eps(args.eps_test)
        test_envs.seed(args.seed)
        if args.save_buffer_name:
            print(f"Generate buffer with size {args.buffer_size}")
            buffer = VectorReplayBuffer(args.buffer_size, buffer_num=len(test_envs),
                                        ignore_obs_next=True, save_only_last_obs=True,
                                        stack_num=args.frames_stack)
            collector = Collector(policy, test_envs, buffer)
            result = collector.collect(n_step=args.buffer_size)
            print(f"Save buffer into {args.save_buffer_name}")
            # Unfortunately, pickle will cause oom with 1M buffer size
            buffer.save_hdf5(args.save_buffer_name)
        else:
            print("Testing agent ...")
            test_collector.reset()
            result = test_collector.collect(n_episode=args.test_num, render=args.render)
        pprint.pprint(result)

    if args.watch:
        watch()
        exit(0)

    # test train_collector and start filling replay buffer
    train_collector.collect(n_step=args.batch_size * args.training_num)
    # trainer
    result = offpolicy_trainer(policy, train_collector, test_collector, args.epoch,
                               args.step_per_epoch, args.step_per_collect, args.test_num,
                               args.batch_size, train_fn=train_fn, test_fn=test_fn,
                               stop_fn=stop_fn, save_fn=save_fn, logger=logger,
                               update_per_step=args.update_per_step, test_in_train=False)

    pprint.pprint(result)
    watch()
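get_args() is defined elsewhere in this script; the sketch below reconstructs an argparse parser only from the attributes the function above actually reads. The flag names follow from that usage, but every default value is an illustrative assumption, not the script's real configuration.

# Hypothetical get_args(); defaults are placeholders.
import argparse

def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--task', type=str, default='PongNoFrameskip-v4')
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--lr', type=float, default=0.0001)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--n-step', type=int, default=3)
    parser.add_argument('--target-update-freq', type=int, default=500)
    parser.add_argument('--buffer-size', type=int, default=100000)
    parser.add_argument('--frames-stack', type=int, default=4)
    parser.add_argument('--training-num', type=int, default=10)
    parser.add_argument('--test-num', type=int, default=10)
    parser.add_argument('--eps-train', type=float, default=1.0)
    parser.add_argument('--eps-train-final', type=float, default=0.05)
    parser.add_argument('--eps-test', type=float, default=0.005)
    parser.add_argument('--epoch', type=int, default=100)
    parser.add_argument('--step-per-epoch', type=int, default=100000)
    parser.add_argument('--step-per-collect', type=int, default=10)
    parser.add_argument('--update-per-step', type=float, default=0.1)
    parser.add_argument('--batch-size', type=int, default=32)
    parser.add_argument('--logdir', type=str, default='log')
    parser.add_argument('--render', type=float, default=0.)
    parser.add_argument('--resume-path', type=str, default=None)
    parser.add_argument('--save-buffer-name', type=str, default=None)
    parser.add_argument('--watch', default=False, action='store_true')
    parser.add_argument('--device', type=str,
                        default='cuda' if torch.cuda.is_available() else 'cpu')
    return parser.parse_args()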
class Player:
    def __init__(self, name, isBot):
        self.name = name
        self.isBot = isBot
        if not self.isBot:
            self.chosenAction = 0
            self.defineKeyboardListener()
        self.initializeProperties()
        self.QNetwork = DQN("QN{}".format(name), self.miniBatchSize)
        self.TDTarget = DQN("TD{}".format(name), self.miniBatchSize)
        self.sess = tf.Session()
        self.QNetwork.setSess(self.sess)
        self.TDTarget.setSess(self.sess)
        self.sess.run(tf.global_variables_initializer())
        self.synchronise()

    def initializeProperties(self):
        self.synchronisationPeriod = 100
        self.explorationRate = 0.999
        # Behaviour when playing & training
        self.trainable = True
        self.exploiting = False
        # Statistics
        self.gamesWon = 0
        self.gamesLost = 0
        # Training
        self.trainingData = []
        self.maxBatchSize = 50000  # trainingData will not have more than maxBatchSize elements
        self.miniBatchSize = 32
        self.miniBatch = []
        self.startTraining = 1000  # training happens iff we have more than startTraining entries in trainingData
        print("Properties initialized")

    def defineKeyboardListener(self):
        def on_press(key):
            try:
                if key == Key.up:
                    self.chosenAction = 1
                elif key == Key.down:
                    self.chosenAction = 2
                else:
                    self.chosenAction = 0
            except AttributeError:
                self.chosenAction = 0

        def on_release(key):
            self.chosenAction = 0
            if key == keyboard.Key.esc:
                # Stop listener
                return False

        self.listener = keyboard.Listener(on_press=on_press, on_release=on_release)
        self.listener.start()

    def training(self, step):
        if not self.trainable or len(self.trainingData) < self.startTraining:
            return
        if step % self.synchronisationPeriod == 0:
            self.synchronise()
        self.miniBatch = random.sample(self.trainingData, self.miniBatchSize)
        states, actions, rewards, nextStates = zip(*self.miniBatch)
        output = self.TDTarget.computeTarget(nextStates, rewards)
        self.QNetwork.training(states, output, actions)

    def play(self):
        if self.isBot:
            if self.exploiting or random.random() > self.explorationRate:
                return self.QNetwork.evaluate(self.buffer)
            else:
                return random.randint(0, 1)
        else:
            return self.chosenAction

    def updateConstants(self, learningRate=None, explorationRate=None):
        self.QNetwork.updateConstants(learningRate)
        if not isinstance(explorationRate, type(None)):
            self.explorationRate = explorationRate

    def resetStats(self):
        self.gamesWon = 0
        self.gamesLost = 0

    def updateStats(self, reward):
        if reward == 1:
            self.gamesWon += 1
        elif reward == -1:
            self.gamesLost += 1

    def displayStats(self):
        # print("{} victories & {} defeats".format(self.gamesWon, self.gamesLost))
        print(self.gamesWon, self.gamesLost)

    def addStateSequence(self, action, reward, nextState):
        if self.trainable:
            self.trainingData.append([self.buffer, action, reward, nextState])
            while len(self.trainingData) > self.maxBatchSize:
                self.trainingData.pop(0)
        self.buffer = nextState

    def saveQNetwork(self, path, global_step=None):
        self.QNetwork.saveQNetwork(path, global_step)

    def restoreQNetwork(self, path, global_step=None):
        self.QNetwork.restoreQNetwork(path, global_step)

    def setBehaviour(self, isTraining):
        self.trainable = isTraining
        self.exploiting = not isTraining

    def synchronise(self):
        e1_params = [t for t in tf.trainable_variables()
                     if t.name.startswith(self.QNetwork.scope)]
        e1_params = sorted(e1_params, key=lambda v: v.name)
        e2_params = [t for t in tf.trainable_variables()
                     if t.name.startswith(self.TDTarget.scope)]
        e2_params = sorted(e2_params, key=lambda v: v.name)
        update_ops = []
        for e1_v, e2_v in zip(e1_params, e2_params):
            op = e2_v.assign(e1_v)
            update_ops.append(op)
        self.sess.run(update_ops)
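A hedged sketch of driving a bot Player inside a game loop, based only on the methods defined above; `game`, its API, the reward convention, and the checkpoint path are placeholder assumptions and are not part of the original code.

# Hypothetical driver: `game` and its methods are placeholders.
botA = Player("A", isBot=True)
botA.setBehaviour(isTraining=True)
botA.buffer = game.initialState()           # play()/addStateSequence() read self.buffer
for step in range(100000):
    action = botA.play()
    nextState, reward = game.step(action)   # placeholder environment call
    botA.addStateSequence(action, reward, nextState)
    botA.updateStats(reward)
    botA.training(step)
    if step % 10000 == 0:
        botA.saveQNetwork("checkpoints/playerA", global_step=step)
botA.displayStats()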