def __init__(self, env_name, gamma, nstep, target_update_period, n_frames):
    self.env_name = env_name
    self.gamma = gamma
    self.nstep = nstep
    self.action_space = gym.make(env_name).action_space.n
    self.qnet = DuelingQNetwork(action_space=self.action_space)
    self.target_qnet = DuelingQNetwork(action_space=self.action_space)
    self.target_update_period = target_update_period
    self.n_frames = n_frames
    # self.optimizer = tf.keras.optimizers.Adam(lr=0.0001)
    self.optimizer = tf.keras.optimizers.RMSprop(learning_rate=0.00025 / 4,
                                                 rho=0.95, momentum=0.0,
                                                 epsilon=1.5e-07, centered=True)
    self.update_count = 0
def __init__(self, state_size, action_size, seed, use_is=True):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        seed (int): random seed
        use_is (bool): whether to apply importance-sampling weights when
            learning from prioritized samples
    """
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(seed)
    # Q-Network
    self.qnetwork_local = DuelingQNetwork(state_size, action_size, seed).to(device)
    self.qnetwork_target = DuelingQNetwork(state_size, action_size, seed).to(device)
    self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
    # Replay memory
    self.memory = PrioritizedExperienceReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
    # Initialize time step (for updating every UPDATE_EVERY steps)
    self.t_step = 0
    self.use_is = use_is
def __init__(self, state_size, action_size, seed):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        seed (int): random seed
    """
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(seed)
    # Q-Network
    self.qnetwork_local = DuelingQNetwork(state_size, action_size, seed).to(device)
    self.qnetwork_target = DuelingQNetwork(state_size, action_size, seed).to(device)
    # Use the optim package to define an Optimizer that will update the weights of
    # the model for us. Here we use Adam; the optim package contains many other
    # optimization algorithms.
    self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
    # Replay memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
    # Initialize time step (for updating every UPDATE_EVERY steps)
    self.t_step = 0
def __init__(self, state_size, action_size, params, seed=None, model='dqn'):
    self.seed = seed
    if seed:
        random.seed(seed)
        np.random.seed(seed)  # keep NumPy's RNG in sync with the same seed
    self.params = params
    self.state_size = state_size
    self.action_size = action_size
    self.eps = self.params['EPS']
    # Memory to learn from.
    self.memory = ReplayBuffer(memory_size=self.params['BUFFER_SIZE'],
                               sample_size=self.params['BATCH_SIZE'])
    # Network
    if model == 'dqn':
        # Vanilla DQN
        self.target = QNetwork(state_size=state_size, action_size=action_size, seed=seed).to(device)
        self.local = QNetwork(state_size=state_size, action_size=action_size, seed=seed).to(device)
    elif model == 'ddqn':
        # Dueling DQN
        self.target = DuelingQNetwork(state_size=state_size, action_size=action_size, seed=seed).to(device)
        self.local = DuelingQNetwork(state_size=state_size, action_size=action_size, seed=seed).to(device)
    self.optimizer = torch.optim.Adam(self.local.parameters(), lr=self.params['LR'])
    self.t_step = 0
def __init__(self, state_size, action_size, num_episodes, seed):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        num_episodes (int): number of training episodes
        seed (int): random seed
    """
    self.state_size = state_size
    self.action_size = action_size
    self.seed = seed
    # Q-Network
    self.qnetwork_local = DuelingQNetwork(state_size, action_size, seed).to(device)
    self.qnetwork_target = DuelingQNetwork(state_size, action_size, seed).to(device)
    self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
    # Replay memory: anneal the importance-sampling exponent beta towards 1 over training
    self.anneal_beta = (1. - BETA) / num_episodes
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed, ALPHA, BETA)
    # Initialize time step (for updating every UPDATE_EVERY steps)
    self.t_step = 0
    self.t_learning_step = 0
def __init__(self, state_size, action_size, double_dqn, dueling, seed):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        double_dqn (bool): True to use Double DQN targets, else vanilla DQN
        dueling (bool): True to use the dueling network architecture
        seed (int): random seed
    """
    self.state_size = state_size
    self.action_size = action_size
    self.double_dqn = double_dqn
    self.seed = random.seed(seed)
    self.dueling = dueling
    # Q-Network
    if dueling:
        self.qnetwork_local = DuelingQNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = DuelingQNetwork(state_size, action_size, seed).to(device)
    else:
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
    self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
    # Replay memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
    # Initialize time step (for updating every UPDATE_EVERY steps)
    self.t_step = 0
def __init__(self, state_size, action_size, seed):  # , writer):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        seed (int): random seed
    """
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(seed)
    # Q-Network
    self.qnetwork_local = DuelingQNetwork(state_size, action_size, seed).to(device)
    self.qnetwork_target = DuelingQNetwork(state_size, action_size, seed).to(device)
    self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
    # Prioritised replay memory (swapped in for the plain ReplayBuffer below)
    # self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
    self.memory = PrioritisedReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, ALPHA, EPSILON)
    # Initialize time step (for updating every UPDATE_EVERY steps)
    self.t_step = 0
    self.beta = BETA_START
def __init__(self, state_size, action_size, seed, use_double_dqn, use_dueling_dqn):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        seed (int): random seed
        use_double_dqn (bool): True to use Double DQN targets
        use_dueling_dqn (bool): True to use the dueling network architecture
    """
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(seed)
    self.use_double_dqn = use_double_dqn
    if use_dueling_dqn:
        # Dueling Q-Network
        self.qnetwork_local = DuelingQNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = DuelingQNetwork(state_size, action_size, seed).to(device)
    else:
        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
    self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
    # Replay memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
    # Initialize time step (for updating every UPDATE_NETWORK_EVERY steps)
    self.t_step = 0
def __init__(self, state_size, action_size, seed, network="Dueling", stepkey="Double"):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        seed (int): random seed
        network (str): "Dueling", "Convolutional", or anything else for a plain QNetwork
        stepkey (str): "Double" to use Double DQN targets
    """
    print("Architecture: " + str(network) + " " + str(stepkey) + " QN")
    self.stepkey = stepkey
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(seed)
    # Q-Network
    if network == "Dueling":
        self.qnetwork_local = DuelingQNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = DuelingQNetwork(state_size, action_size, seed).to(device)
    elif network == "Convolutional":
        self.qnetwork_local = ConvolutionalDuelingQNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = ConvolutionalDuelingQNetwork(state_size, action_size, seed).to(device)
    else:
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
    print(self.qnetwork_local)
    self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
    # Replay memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
    # Initialize time step (for updating every UPDATE_EVERY steps)
    self.t_step = 0
def __init__(self, state_size, action_size, seed, ddqn=False, dueling=False,
             init_td=1e-5, prioritize_weight=0.0, beta_scheduler=None):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        seed (int): random seed
        ddqn (bool): True to use Double DQN targets
        dueling (bool): True to use the dueling network architecture
        init_td (float): initial TD-error assigned to new transitions
        prioritize_weight (float): priority exponent; 0.0 disables prioritized replay
        beta_scheduler: schedule for the importance-sampling exponent beta
    """
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(seed)
    # Q-Network
    if not dueling:
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
    else:
        self.qnetwork_local = DuelingQNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = DuelingQNetwork(state_size, action_size, seed).to(device)
    self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
    # Replay memory
    if prioritize_weight != 0.0:
        self.memory = PrioritizedReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed,
                                              prioritize_weight, beta_scheduler)
    else:
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
    self.init_td = init_td
    self.prioritize_weight = prioritize_weight
    # Initialize time step (for updating every few steps)
    self.t_step = 0
    self.ddqn = ddqn
def __init__(self, state_size, action_size, seed, prioritized=False):
    """Dueling Q-network agent."""
    super().__init__(state_size, action_size, seed)
    # Dueling Q-Network
    self.qnetwork_local = DuelingQNetwork(state_size, action_size, seed).to(device)  # use GPU or not
    self.qnetwork_target = DuelingQNetwork(state_size, action_size, seed).to(device)
    self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
def __init__(self, seed, **kwargs):
    """Initialize an Agent object.

    Params
    ======
        seed (int): random seed
        **kwargs: hyper-parameters (network_args, buffer_size, batch_size, gamma,
            tau, update_every, lr, double_q, dueling, ray_layer); module-level
            defaults are used for anything not supplied
    """
    self.seed = random.seed(seed)
    # Hyper-parameters
    self.network_args = kwargs.get('network_args', {})
    self.buffer_size = kwargs.get('buffer_size', BUFFER_SIZE)
    self.batch_size = kwargs.get('batch_size', BATCH_SIZE)
    self.gamma = kwargs.get('gamma', GAMMA)
    self.tau = kwargs.get('tau', TAU)
    self.update_every = kwargs.get('update_every', UPDATE_EVERY)
    self.lr = kwargs.get('lr', LR)
    self.double_q = kwargs.get('double_q', False)
    self.dueling = kwargs.get('dueling', False)
    self.ray_layer = kwargs.get('ray_layer', False)
    # Q-Network
    if self.dueling:
        if self.ray_layer:
            self.qnetwork_local = DuelingQNetworkWithRayLayer(seed, **self.network_args).to(device)
            self.qnetwork_target = DuelingQNetworkWithRayLayer(seed, **self.network_args).to(device)
        else:
            self.qnetwork_local = DuelingQNetwork(seed, **self.network_args).to(device)
            self.qnetwork_target = DuelingQNetwork(seed, **self.network_args).to(device)
    else:
        if self.ray_layer:
            self.qnetwork_local = QNetworkWithRayLayer(seed, **self.network_args).to(device)
            self.qnetwork_target = QNetworkWithRayLayer(seed, **self.network_args).to(device)
        else:
            self.qnetwork_local = QNetwork(seed, **self.network_args).to(device)
            self.qnetwork_target = QNetwork(seed, **self.network_args).to(device)
    self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=self.lr)
    # Replay memory
    self.memory = ReplayBuffer(ACTION_SIZE, self.buffer_size, self.batch_size, seed)
    # Initialize time step (for updating every update_every steps)
    self.t_step = 0
def __init__(self, state_size, action_size, seed, double_dqn=True,
             priority_replay=True, dueling_network=True):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        seed (int): random seed
        double_dqn (bool): True to use Double DQN targets
        priority_replay (bool): True to use prioritized experience replay
        dueling_network (bool): True to use the dueling network architecture
    """
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(seed)
    self.B = B_START
    self.double_dqn = double_dqn
    self.priority_replay = priority_replay
    self.dueling_network = dueling_network
    # Q-Network
    if self.dueling_network:
        self.qnetwork_local = DuelingQNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = DuelingQNetwork(state_size, action_size, seed).to(device)
    else:
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
    self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
    # Replay memory
    if self.priority_replay:
        self.memory = PrioritizedReplayBuffer(state_size, BUFFER_SIZE, BATCH_SIZE, seed, use_rank=False)
    else:
        self.memory = ReplayBuffer(state_size, BUFFER_SIZE, BATCH_SIZE, seed)
    # Initialize time step (for updating every UPDATE_EVERY steps)
    self.t_step = 0
def __init__(self, state_size, action_size, seed, lr_decay_rate=0.999):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        seed (int): random seed
        lr_decay_rate (float): per-step decay factor for the learning-rate scheduler
    """
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(seed)
    # Q-Network
    self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
    self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
    # self.qnetwork_target.eval()  # No need to compute gradients
    print(self.qnetwork_target)
    self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
    # Decay the learning rate by lr_decay_rate each scheduler step
    self.lr_scheduler = optim.lr_scheduler.ExponentialLR(self.optimizer, lr_decay_rate)
    # Replay memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
    # Initialize time step (for updating every UPDATE_EVERY steps)
    self.t_step = 0
def __init__(self, state_size, action_size, seed, DDQN=False, PRB=False, Dueling=False):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        seed (int): random seed
        DDQN (bool): apply the Double DQN algorithm
        PRB (bool): use a prioritized replay buffer
        Dueling (bool): use a dueling network architecture
    """
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(seed)
    self.DDQN = DDQN
    self.PRB = PRB
    # Q-Network
    if Dueling:
        self.qnetwork_local = DuelingQNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = DuelingQNetwork(state_size, action_size, seed).to(device)
    else:
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
    self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
    # Replay memory
    if self.PRB:
        self.memory = PrioritizedReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed,
                                              ALPHA, BETA_START, BETA_INCREASE)
    else:
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
    # Initialize time step (for updating every UPDATE_EVERY steps)
    self.t_step = 0
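# The prioritized buffers above (with their ALPHA, BETA_START, BETA_INCREASE
# constants) all rely on the same two formulas from Schaul et al. (2016). Below is
# a minimal sketch of that math; the class and its internals are illustrative
# assumptions, not any of the actual buffers these agents use.
import numpy as np

class PERMathSketch:
    def __init__(self, alpha=0.6, beta_start=0.4, beta_increase=1e-3):
        self.alpha = alpha                  # priority exponent: 0 means uniform sampling
        self.beta = beta_start              # IS-correction exponent, annealed towards 1
        self.beta_increase = beta_increase

    def probabilities(self, priorities):
        # P(i) = p_i^alpha / sum_k p_k^alpha
        scaled = np.asarray(priorities, dtype=np.float64) ** self.alpha
        return scaled / scaled.sum()

    def is_weights(self, probs, indices):
        # w_i = (N * P(i))^-beta, normalized by the max weight for stability
        n = len(probs)
        weights = (n * probs[indices]) ** (-self.beta)
        self.beta = min(1.0, self.beta + self.beta_increase)  # anneal beta
        return weights / weights.max()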
def __init__(self, state_size, action_size, seed):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        seed (int): random seed
    """
    super(DuelingAgent, self).__init__(state_size, action_size, seed)
    # Q-Network
    self.qnetwork_local = DuelingQNetwork(state_size, action_size, seed).to(device)
    self.qnetwork_target = DuelingQNetwork(state_size, action_size, seed).to(device)
    self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
def __init__(self, state_size, action_size, seed):
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(seed)
    # Q-Network (no separate target network in this variant)
    self.model = DuelingQNetwork(state_size, action_size, seed).to(device)
    # self.qnetwork_target = DuelingQNetwork(state_size, action_size, seed).to(device)
    # for target_param, param in zip(self.qnetwork_local.parameters(), self.qnetwork_target.parameters()):
    #     target_param.data.copy_(param)
    self.optimizer = optim.Adam(self.model.parameters(), lr=LR)
    # Replay memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
    # Initialize time step (for updating every UPDATE_EVERY steps)
    self.t_step = 0
def __init__(self, state_size, action_size, seed, args):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        seed (int): random seed
        args: parsed command-line arguments (double_dqn, dueling_dqn, lr,
            buffer_size, batch_size)
    """
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(seed)
    self.double_dqn = args.double_dqn
    self.dueling_dqn = args.dueling_dqn
    self.args = args
    # The two variants are mutually exclusive here
    assert not (self.double_dqn and self.dueling_dqn)
    if self.double_dqn:
        print("Implementing Double DQN!")
    elif self.dueling_dqn:
        print("Implementing Dueling DQN!")
    else:
        print("Implementing DQN")
    # Q-Network
    if self.dueling_dqn:
        self.qnetwork_local = DuelingQNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = DuelingQNetwork(state_size, action_size, seed).to(device)
    else:
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
    self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=self.args.lr)
    # Replay memory
    self.memory = ReplayBuffer(action_size, args.buffer_size, args.batch_size, seed)
    # Initialize time step (for updating every UPDATE_EVERY steps)
    self.t_step = 0
def __init__(self, state_size, action_size, seed, hidden_sizes=[64, 64], flavor='plain'):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        seed (int): random seed
        hidden_sizes (list): number of neurons in each hidden layer
        flavor (str): flavor of the network - plain, double, dueling, double-dueling
    """
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(seed)
    self.hidden_sizes = hidden_sizes
    self.flavor = flavor
    # Q-Network
    if self.flavor in ('plain', 'double'):
        self.qnetwork_local = QNetwork(state_size, action_size, seed, hidden_sizes).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed, hidden_sizes).to(device)
    # Dueling Q-Network
    if self.flavor in ('dueling', 'double-dueling'):
        self.qnetwork_local = DuelingQNetwork(state_size, action_size, seed, hidden_sizes).to(device)
        self.qnetwork_target = DuelingQNetwork(state_size, action_size, seed, hidden_sizes).to(device)
    self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
    # Replay memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
    # Initialize time step (for updating every UPDATE_EVERY steps)
    self.t_step = 0
def __init__(self, id, state_size, action_size, seed, use_double=False,
             use_prio=False, use_dueling=False):
    """Initialize an Agent object.

    Params
    ======
        id (int): id used to identify the agent
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        seed (int): random seed
        use_double (bool): use the Double DQN algorithm
        use_prio (bool): use prioritized experience replay
        use_dueling (bool): use the dueling DQN architecture
    """
    self.state_size = state_size
    self.action_size = action_size
    self.id = id
    self.use_double = use_double
    self.use_prio = use_prio
    self.use_dueling = use_dueling
    self.seed = random.seed(seed)
    self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # Q-Network
    if use_dueling:
        self.qnetwork_local = DuelingQNetwork(state_size, action_size, seed).to(self.device)
        self.qnetwork_target = DuelingQNetwork(state_size, action_size, seed).to(self.device)
    else:
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(self.device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(self.device)
    self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
    # Replay memory
    if use_prio:
        self.memory = NaivePrioritizedReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                                   seed, PRIO_ALPHA, PRIO_EPSILON)
    else:
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
    # Initialize time step (for updating every UPDATE_EVERY steps)
    self.t_step = 0
def _create_nn(self, nn_type, state_size, action_size, seed, device):
    if nn_type == 'noisydueling':
        self._sample_noise = True
        return NoisyDuelingQNetwork(state_size, action_size, seed, device=device).to(device)
    elif nn_type == 'dueling':
        return DuelingQNetwork(state_size, action_size, seed).to(device)
    elif nn_type == 'q':
        return QNetwork(state_size, action_size, seed).to(device)
    else:
        raise ValueError(
            "Unknown NN type %r - must be one of 'noisydueling', 'dueling' or 'q'" % nn_type)
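# The 'noisydueling' branch above implies noisy layers in place of epsilon-greedy
# exploration (Fortunato et al., 2017). A hedged sketch of the factorised-Gaussian
# NoisyLinear layer such a network might be built from; this class is illustrative,
# not the one the factory actually instantiates:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

class NoisyLinear(nn.Module):
    def __init__(self, in_features, out_features, sigma0=0.5):
        super().__init__()
        self.in_features, self.out_features = in_features, out_features
        self.weight_mu = nn.Parameter(torch.empty(out_features, in_features))
        self.weight_sigma = nn.Parameter(torch.empty(out_features, in_features))
        self.bias_mu = nn.Parameter(torch.empty(out_features))
        self.bias_sigma = nn.Parameter(torch.empty(out_features))
        self.register_buffer('weight_eps', torch.zeros(out_features, in_features))
        self.register_buffer('bias_eps', torch.zeros(out_features))
        bound = 1 / math.sqrt(in_features)
        nn.init.uniform_(self.weight_mu, -bound, bound)
        nn.init.uniform_(self.bias_mu, -bound, bound)
        nn.init.constant_(self.weight_sigma, sigma0 / math.sqrt(in_features))
        nn.init.constant_(self.bias_sigma, sigma0 / math.sqrt(in_features))
        self.sample_noise()

    def sample_noise(self):
        # factorised noise: f(x) = sign(x) * sqrt(|x|)
        f = lambda x: x.sign() * x.abs().sqrt()
        eps_in, eps_out = f(torch.randn(self.in_features)), f(torch.randn(self.out_features))
        self.weight_eps.copy_(torch.outer(eps_out, eps_in))
        self.bias_eps.copy_(eps_out)

    def forward(self, x):
        # perturb weights and biases with the current noise sample
        return F.linear(x,
                        self.weight_mu + self.weight_sigma * self.weight_eps,
                        self.bias_mu + self.bias_sigma * self.bias_eps)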
def __init__(self, state_size, action_size, mem_length=100000, ddqn=True):
    self.gamma = 0.99
    self.batch_size = 64
    self.action_size = action_size
    self.ddqn = ddqn
    self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    if ddqn:
        self.model = DuelingQNetwork(state_size, action_size).to(self.device)
        self.target_model = DuelingQNetwork(state_size, action_size).to(self.device)
        self.optimizer = optim.Adam(self.model.parameters(), lr=5e-4)
        self.experience = self.ddqn_experience
    else:
        self.model = QNetwork(state_size, action_size).to(self.device)
        self.optimizer = optim.Adam(self.model.parameters(), lr=5e-4)
        self.experience = self.dqn_experience
    # Replay memory
    self.memory = deque(maxlen=mem_length)
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        # Q-Network
        self.qnetwork_local = DuelingQNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = DuelingQNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)
        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get a random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.0):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()
        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tensors
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences
        # Double DQN: the local network picks the next action...
        next_action = self.qnetwork_local(next_states).detach().argmax(1).unsqueeze(1)
        # ...and the target network estimates the value of that action
        Q_targets_next = self.qnetwork_target(next_states).gather(1, next_action)
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)
        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
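# None of the PyTorch snippets define DuelingQNetwork itself. Below is a minimal
# sketch consistent with the (state_size, action_size, seed) constructor they all
# call; the hidden width and the mean-advantage aggregation follow Wang et al.
# (2016) and are assumptions, not the authors' actual architecture.
import torch
import torch.nn as nn
import torch.nn.functional as F

class DuelingQNetwork(nn.Module):
    def __init__(self, state_size, action_size, seed, hidden=64):
        super().__init__()
        self.seed = torch.manual_seed(seed)
        self.fc1 = nn.Linear(state_size, hidden)         # shared trunk
        self.value = nn.Linear(hidden, 1)                # state-value stream V(s)
        self.advantage = nn.Linear(hidden, action_size)  # advantage stream A(s, a)

    def forward(self, state):
        x = F.relu(self.fc1(state))
        v, a = self.value(x), self.advantage(x)
        # Q(s, a) = V(s) + A(s, a) - mean_a A(s, a); subtracting the mean
        # makes the V/A decomposition identifiable
        return v + a - a.mean(dim=1, keepdim=True)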
class DDDQNPolicy(Policy):
    """Dueling Double DQN policy."""

    def __init__(self, state_size, action_size, parameters, evaluation_mode=False):
        self.evaluation_mode = evaluation_mode
        self.state_size = state_size
        self.action_size = action_size
        self.double_dqn = True
        self.hidsize = 1
        if not evaluation_mode:
            self.hidsize = parameters.hidden_size
            self.buffer_size = parameters.buffer_size
            self.batch_size = parameters.batch_size
            self.update_every = parameters.update_every
            self.learning_rate = parameters.learning_rate
            self.tau = parameters.tau
            self.gamma = parameters.gamma
            self.buffer_min_size = parameters.buffer_min_size
        # Device
        if parameters.use_gpu and torch.cuda.is_available():
            self.device = torch.device("cuda:0")
            print(" Using GPU")
        else:
            self.device = torch.device("cpu")
            print(" Using CPU")
        # Q-Network
        self.qnetwork_local = DuelingQNetwork(state_size, action_size,
                                              hidsize1=self.hidsize,
                                              hidsize2=self.hidsize).to(self.device)
        if not evaluation_mode:
            self.qnetwork_target = copy.deepcopy(self.qnetwork_local)
            self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=self.learning_rate)
            self.memory = ReplayBuffer(action_size, self.buffer_size, self.batch_size, self.device)
            self.t_step = 0
            self.loss = 0.0

    def act(self, state, eps=0.):
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()
        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def step(self, state, action, reward, next_state, done):
        assert not self.evaluation_mode, "Policy has been initialized for evaluation only."
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)
        # Learn every update_every time steps.
        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0:
            # If enough samples are available in memory, get a random subset and learn
            if len(self.memory) > self.buffer_min_size and len(self.memory) > self.batch_size:
                self._learn()

    def _learn(self):
        experiences = self.memory.sample()
        states, actions, rewards, next_states, dones = experiences
        # Get expected Q values from local model
        q_expected = self.qnetwork_local(states).gather(1, actions)
        if self.double_dqn:
            # Double DQN: the local net selects the action, the target net evaluates it
            q_best_action = self.qnetwork_local(next_states).max(1)[1]
            q_targets_next = self.qnetwork_target(next_states).gather(1, q_best_action.unsqueeze(-1))
        else:
            # DQN
            q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(-1)
        # Compute Q targets for current states
        q_targets = rewards + (self.gamma * q_targets_next * (1 - dones))
        # Compute loss
        self.loss = F.mse_loss(q_expected, q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        self.loss.backward()
        self.optimizer.step()
        # Update target network
        self._soft_update(self.qnetwork_local, self.qnetwork_target, self.tau)

    def _soft_update(self, local_model, target_model, tau):
        # Soft update model parameters:
        # θ_target = τ*θ_local + (1 - τ)*θ_target
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

    def save(self, filename):
        torch.save(self.qnetwork_local.state_dict(), filename + ".local")
        torch.save(self.qnetwork_target.state_dict(), filename + ".target")

    def load(self, filename):
        if os.path.exists(filename + ".local"):
            self.qnetwork_local.load_state_dict(
                torch.load(filename + ".local", map_location=torch.device('cpu')))
            print('local')
        if os.path.exists(filename + ".target"):
            self.qnetwork_target.load_state_dict(
                torch.load(filename + ".target", map_location=torch.device('cpu')))
            print('target')
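# A hedged usage sketch for DDDQNPolicy. The Namespace fields mirror the
# attributes its constructor reads; the CartPole environment, the hyper-parameter
# values, and the single-loop driver are placeholders, and DuelingQNetwork /
# ReplayBuffer must already be defined as above.
from argparse import Namespace
import gym

params = Namespace(hidden_size=128, buffer_size=100_000, batch_size=64,
                   update_every=4, learning_rate=5e-4, tau=1e-3, gamma=0.99,
                   buffer_min_size=1_000, use_gpu=False)

env = gym.make("CartPole-v1")
policy = DDDQNPolicy(env.observation_space.shape[0], env.action_space.n, params)

state = env.reset()
for t in range(1000):
    action = policy.act(state, eps=0.1)                   # epsilon-greedy action
    next_state, reward, done, _ = env.step(action)
    policy.step(state, action, reward, next_state, done)  # store and maybe learn
    state = env.reset() if done else next_state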
class Actor:

    def __init__(self, pid, env_name, epsilon, alpha, buffer_size, n_frames,
                 gamma, nstep, reward_clip):
        self.pid = pid
        self.env = gym.make(env_name)
        self.epsilon = epsilon
        self.gamma = gamma
        self.alpha = alpha
        self.n_frames = n_frames
        self.action_space = self.env.action_space.n
        self.frames = collections.deque(maxlen=n_frames)
        self.nstep = nstep
        self.buffer_size = buffer_size
        self.local_buffer = LocalReplayBuffer(reward_clip=reward_clip, gamma=gamma, nstep=nstep)
        self.local_qnet = DuelingQNetwork(action_space=self.action_space)
        self.episode_steps = 0
        self.episode_rewards = 0
        self.lives = 5  #: Breakout only
        self.define_network()

    def define_network(self):
        #: hide GPU from remote actor
        tf.config.set_visible_devices([], 'GPU')
        #: define by run
        frame = preprocess_frame(self.env.reset())
        for _ in range(self.n_frames):
            self.frames.append(frame)
        state = np.stack(self.frames, axis=2)[np.newaxis, ...]
        self.local_qnet(state)

    def rollout(self, current_weights):
        tf.config.set_visible_devices([], 'GPU')
        self.local_qnet.set_weights(current_weights)
        for _ in range(self.buffer_size):
            state = np.stack(self.frames, axis=2)[np.newaxis, ...]
            action = self.local_qnet.sample_action(state, self.epsilon)
            next_frame, reward, done, info = self.env.step(action)
            self.episode_steps += 1
            self.episode_rewards += reward
            self.frames.append(preprocess_frame(next_frame))
            next_state = np.stack(self.frames, axis=2)[np.newaxis, ...]
            if self.lives != info["ale.lives"]:
                #: treat loss of life as episode end
                transition = (state, action, reward, next_state, True)
                self.lives = info["ale.lives"]
            else:
                transition = (state, action, reward, next_state, done)
            self.local_buffer.push(transition)
            if done:
                print(self.pid, self.episode_steps, self.episode_rewards,
                      round(self.epsilon, 3))
                self.episode_steps = 0
                self.episode_rewards = 0
                self.lives = 5
                frame = preprocess_frame(self.env.reset())
                for _ in range(self.n_frames):
                    self.frames.append(frame)

        experiences = self.local_buffer.pull()

        states = np.vstack([exp.state for exp in experiences]).astype(np.float32)
        actions = np.vstack([exp.action for exp in experiences]).astype(np.float32)
        rewards = np.array([exp.reward for exp in experiences]).reshape(-1, 1)
        next_states = np.vstack([exp.next_state for exp in experiences]).astype(np.float32)
        dones = np.array([exp.done for exp in experiences]).reshape(-1, 1)

        #: compute initial priorities from the n-step TD error
        next_actions, next_qvalues = self.local_qnet.sample_actions(next_states)
        next_actions_onehot = tf.one_hot(next_actions, self.action_space)
        max_next_qvalues = tf.reduce_sum(next_qvalues * next_actions_onehot,
                                         axis=1, keepdims=True)
        TQ = rewards + self.gamma ** self.nstep * (1 - dones) * max_next_qvalues
        qvalues = self.local_qnet(states)
        actions_onehot = tf.one_hot(actions.flatten().astype(np.int32), self.action_space)
        Q = tf.reduce_sum(qvalues * actions_onehot, axis=1, keepdims=True)
        priorities = ((np.abs(TQ - Q) + 0.001) ** self.alpha).flatten()

        experiences = [zlib.compress(pickle.dumps(exp)) for exp in experiences]

        return priorities, experiences, self.pid
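# The Actor's target above, TQ = r + gamma**nstep * (1 - done) * max_a' Q(s', a'),
# only works if LocalReplayBuffer has already folded the intermediate rewards into
# each transition's reward. A sketch of that n-step reward, with the function name
# and the buffer's internals assumed:
def nstep_reward(rewards, gamma):
    """r^(n) = sum_{k=0}^{n-1} gamma^k * r_{t+k} over the last n raw rewards."""
    return sum((gamma ** k) * r for k, r in enumerate(rewards))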
class RemoteTestActor:

    def __init__(self, env_name, epsilon=0.05, n_frames=4):
        self.env_name = env_name
        self.env = gym.make(env_name)
        self.action_space = self.env.action_space.n
        self.epsilon = epsilon
        self.n_frames = n_frames
        self.frames = collections.deque(maxlen=n_frames)
        self.qnet = DuelingQNetwork(action_space=self.action_space)
        self.define_network()

    def define_network(self):
        #: hide GPU from remote actor
        tf.config.set_visible_devices([], 'GPU')
        #: define by run
        frame = preprocess_frame(self.env.reset())
        for _ in range(self.n_frames):
            self.frames.append(frame)
        state = np.stack(self.frames, axis=2)[np.newaxis, ...]
        self.qnet(state)

    def get_layers(self, idx):
        return self.qnet.layers[idx:]

    def play(self, current_weights, epsilon=0.01):
        tf.config.set_visible_devices([], 'GPU')
        self.qnet.set_weights(current_weights)
        episode_steps, episode_rewards = 0, 0
        frame = preprocess_frame(self.env.reset())
        for _ in range(self.n_frames):
            self.frames.append(frame)
        done = False
        while not done:
            state = np.stack(self.frames, axis=2)[np.newaxis, ...]
            action = self.qnet.sample_action(state, epsilon=epsilon)
            next_frame, reward, done, _ = self.env.step(action)
            self.frames.append(preprocess_frame(next_frame))
            episode_steps += 1
            episode_rewards += reward
            if episode_steps > 1000 and episode_rewards < 10:
                break
        return episode_steps, episode_rewards

    def play_with_video(self, checkpoint_path, monitor_dir, epsilon=0.01):
        monitor_dir = Path(monitor_dir)
        if monitor_dir.exists():
            shutil.rmtree(monitor_dir)
        monitor_dir.mkdir()
        env = gym.wrappers.Monitor(gym.make(self.env_name), monitor_dir,
                                   force=True, video_callable=(lambda ep: True))
        frame = preprocess_frame(env.reset())
        frames = collections.deque([frame] * self.n_frames, maxlen=self.n_frames)
        state = np.stack(frames, axis=2)[np.newaxis, ...]
        self.qnet(state)
        self.qnet.load_weights(checkpoint_path)
        episode_steps, episode_rewards = 0, 0
        done = False
        while not done:
            #: step the monitor-wrapped env (not self.env) so the video records
            state = np.stack(frames, axis=2)[np.newaxis, ...]
            action = self.qnet.sample_action(state, epsilon)
            next_frame, reward, done, _ = env.step(action)
            frames.append(preprocess_frame(next_frame))
            episode_steps += 1
            episode_rewards += reward
        return episode_rewards
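# The TF2 actors likewise assume a DuelingQNetwork with an action_space-keyword
# constructor, build-on-first-call semantics, and a sample_action helper. A
# minimal Keras sketch under those assumptions (the conv trunk and hidden width
# are illustrative; a batched sample_actions would be analogous):
import numpy as np
import tensorflow as tf
import tensorflow.keras.layers as kl

class DuelingQNetwork(tf.keras.Model):
    def __init__(self, action_space):
        super().__init__()
        self.action_space = action_space
        self.trunk = tf.keras.Sequential([
            kl.Conv2D(32, 8, strides=4, activation="relu"),
            kl.Conv2D(64, 4, strides=2, activation="relu"),
            kl.Conv2D(64, 3, strides=1, activation="relu"),
            kl.Flatten(),
            kl.Dense(512, activation="relu")])
        self.value = kl.Dense(1)                 # V(s) stream
        self.advantage = kl.Dense(action_space)  # A(s, a) stream

    def call(self, x):
        x = self.trunk(x)
        v, a = self.value(x), self.advantage(x)
        # Q(s, a) = V(s) + A(s, a) - mean_a A(s, a)
        return v + a - tf.reduce_mean(a, axis=1, keepdims=True)

    def sample_action(self, state, epsilon):
        # epsilon-greedy over the network's greedy action
        if np.random.random() < epsilon:
            return np.random.randint(self.action_space)
        return int(tf.argmax(self(state), axis=1)[0])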