def __init__(self, gamma, epsilon, lr, n_actions, input_dims, mem_size,
             expert_mem_size, batch_size, n_step, lam_n_step, lam_sup, lam_L2,
             eps_min=0.01, eps_dec=5e-7, replace=1000, chkpt_dir='tmp/dqn',
             algo=None, env_name=None):
    self.gamma = gamma
    self.epsilon = epsilon
    self.lr = lr
    self.input_dims = input_dims
    self.n_actions = n_actions
    self.batch_size = batch_size
    self.eps_min = eps_min
    self.eps_dec = eps_dec
    self.replace_target_cntr = replace
    self.action_space = [i for i in range(self.n_actions)]
    self.learn_step_counter = 0
    self.n_step = n_step
    self.lam_n_step = lam_n_step
    self.lam_sup = lam_sup
    self.lam_L2 = lam_L2

    self.memory = ReplayBuffer(mem_size, input_dims, n_actions)
    self.demo_memory = DemoReplayBuffer(expert_mem_size, input_dims, n_actions)

    self.q_eval = DeepQNetwork(self.lr, self.lam_L2, self.n_actions,
                               input_dims=self.input_dims,
                               model_name=env_name + "_" + algo + "_q_eval",
                               model_dir=chkpt_dir)
    self.q_next = DeepQNetwork(self.lr, self.lam_L2, self.n_actions,
                               input_dims=self.input_dims,
                               model_name=env_name + "_" + algo + "_q_next",
                               model_dir=chkpt_dir)
def __init__(self, replay_buffer_len=5000, learning_rate=0.001, epsilon=1, verbose=0):
    self.gamma = 1
    self.epsilon = epsilon
    self.min_epsilon = 0.1
    self.delta_epsilon = 0.025
    self.epsilon_update_freq = 5000
    self.lr = learning_rate
    self.target_update_freq = 250
    self.Optimizer = keras.optimizers.RMSprop(learning_rate=self.lr)
    self.model = keras.Sequential(
        [
            keras.layers.Flatten(input_shape=[5, 5, 1]),
            keras.layers.Dense(units=5, name="layer1"),
        ]
    )
    self.replay_buffer = ReplayBuffer(size=replay_buffer_len, frame_history_len=1)
    self.mini_batch_size = 32
    if verbose > 0:
        self.model.summary()
    self.target_model = keras.models.clone_model(self.model)
    self.update_target_weights()
    self.update_steps = 0
    self.total_reward = 0
def __init__(self, env, q_net, loss_func, opt, lr=0.00025, imsize=(84, 84),
             gamma=0.99, tau=0.001, buffer_size=1e6, log_dir=None, weight_dir=None):
    self.env = env
    self.q_net = q_net.type(dtype)
    self.target_q_net = copy.deepcopy(q_net).type(dtype)
    self.loss_func = loss_func
    self.opt = opt(self.q_net.parameters(), lr)
    self.gamma = gamma
    self.tau = tau
    self.buffer_size = buffer_size
    self.n_action_space = env.action_space.n
    self._state_size = env.observation_space.shape
    self._imsize = imsize
    self.train_reward_list = []
    self.test_reward_list = []
    self.train_error_list = []
    self._buffer = ReplayBuffer([1, ], self._state_size, imsize, buffer_size)
    self.log_dir = log_dir if log_dir is not None else "./logs/"
    self.weight_dir = weight_dir if weight_dir is not None else "./checkpoints/"
def __init__(self, state_size, action_size, random_seed):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        random_seed (int): random seed
    """
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(random_seed)

    # Actor Network (w/ Target Network)
    self.actor_local = Actor(state_size, action_size, random_seed).to(device)
    self.actor_target = Actor(state_size, action_size, random_seed).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

    # Critic Network (w/ Target Network)
    self.critic_local = Critic(state_size, action_size, random_seed).to(device)
    self.critic_target = Critic(state_size, action_size, random_seed).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC,
                                       weight_decay=WEIGHT_DECAY)

    # Noise process
    self.noise = OUNoise(action_size)

    # Replay memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
def __init__(self, hyperparameters):
    """
    Args:
        hyperparameters (dict): a dictionary of hyperparameters
        discounted_returns: None
    """
    # Extract hyperparameters
    self.lr = hyperparameters['learning_rate']
    self.discount = hyperparameters['discount_rate']
    self.num_batch_transitions = hyperparameters['num_batch_transitions']
    self.state_dim = hyperparameters['state_dim']
    self.action_dim = hyperparameters['action_dim']
    self.total_train_steps = hyperparameters['total_train_steps']
    self.max_episode_length = hyperparameters['max_episode_length']
    self.num_train_epochs = hyperparameters['num_train_epochs']
    self.device = hyperparameters['device']

    # Initialize actor/critic networks
    self.actor = Actor(self.state_dim, self.action_dim)
    self.critic = Critic(self.state_dim, self.action_dim)
    self.actor_optim = optim.Adam(self.actor.parameters(), lr=self.lr)
    self.critic_optim = optim.Adam(self.critic.parameters(), lr=self.lr)

    # Initialize replay buffer and environment
    self.replay_buffer = ReplayBuffer(self.num_batch_transitions)
    self.enviroment = KukaGymEnv(renders=True)
def __init__(self, agent_list, action_size, learn_period=10, learn_sampling_num=20,
             buffer_size=int(1e6), batch_size=128, random_seed=0):
    super().__init__()

    if len(agent_list) == 0:
        raise Exception('len(agent_list) = 0')

    self.agent_list = agent_list
    self.learn_period = learn_period
    self.learn_sampling_num = learn_sampling_num
    self.batch_size = batch_size
    self.memory = ReplayBuffer(action_size, buffer_size, batch_size, random_seed, device)
    self.time_step = 0

    # debugging constant
    self.__debug_num_agents = len(agent_list)
    self.__debug_state_size = agent_list[0].state_size
    self.__debug_action_size = agent_list[0].action_size
class DQNAgent(nn.Module):
    def __init__(self, state_dim: int, action_dim: int, hidden_sizes: list = [128, 128],
                 activation=nn.ReLU, buffer_size: int = 1000000, batch_size: int = 32,
                 lr: float = 1e-4, gamma: float = 0.95, theta: float = 0.05):
        super(DQNAgent, self).__init__()
        self.q_net = mlp([state_dim] + hidden_sizes + [action_dim], activation=activation)
        self.target_net = mlp([state_dim] + hidden_sizes + [action_dim], activation=activation)
        self.target_net.load_state_dict(self.q_net.state_dict())
        self.buffer = ReplayBuffer(buffer_size)
        self.batch_size = batch_size
        self.optimizer = Adam(self.q_net.parameters(), lr=lr)
        self.gamma = gamma
        self.theta = theta

    def forward(self, x):
        return self.q_net(x)

    def save_memory(self, ex):
        self.buffer.push(ex)

    def train(self, k=4, max_norm=5.):
        losses = []
        for _ in range(k):
            experiences = self.buffer.sample(self.batch_size)
            s, a, r, t, mask = get_batch(experiences)
            next_q = self.target_net(t).max(-1, keepdim=True)[0]
            target = r + self.gamma * mask * next_q.detach()
            pred = self.q_net(s).gather(-1, a)
            loss = F.mse_loss(pred, target)
            self.optimizer.zero_grad()
            loss.backward()
            clip_grad_norm_(self.q_net.parameters(), max_norm)
            self.optimizer.step()
            losses.append(loss.item())
        self.target_update()
        return np.mean(losses)

    def train_start(self):
        return len(self.buffer) >= self.batch_size

    def target_update(self):
        for target, param in zip(self.target_net.parameters(), self.q_net.parameters()):
            target.data = (1 - self.theta) * target.data + self.theta * param.data

#%%
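# A minimal driver sketch for the DQNAgent above, assuming a Gym-style discrete-action
# environment and that ReplayBuffer.push / get_batch agree on (state, action, reward,
# next_state, done) tuples. The epsilon value and loop structure are illustrative, not
# taken from the original code.
import random

import torch


def run_training_loop(agent, env, episodes=200, eps=0.1):
    for _ in range(episodes):
        state = env.reset()
        done = False
        while not done:
            # epsilon-greedy action selection over the online Q-network
            if random.random() < eps:
                action = env.action_space.sample()
            else:
                with torch.no_grad():
                    q_values = agent(torch.as_tensor(state, dtype=torch.float32))
                action = int(q_values.argmax().item())
            next_state, reward, done, _ = env.step(action)
            agent.save_memory((state, action, reward, next_state, done))
            state = next_state
            # start updating once the buffer holds at least one batch
            if agent.train_start():
                agent.train()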
def train_agent(path, env, agent, seed=0, num_episodes=100, num_steps=100,
                batch_size=128, replay_buffer_size=1000000):
    if not os.path.isdir(path):
        os.makedirs(path)
    os.chdir(path)

    env.seed(seed)
    random.seed(seed)
    pickle.dump(agent.policy_net, open('first_policy.pickle', 'wb'))

    replay_buffer = ReplayBuffer(replay_buffer_size)
    rewards = []
    max_angle = []
    ave_angle = []

    for episode in range(num_episodes):
        state = env.reset()
        episode_reward = 0
        max_th = 0
        ave_th = 0
        for step in range(num_steps):
            action = agent.policy_net.get_action(state) + np.array([0.0])
            next_state, reward, done, _ = env.step(action)
            replay_buffer.push(state, action, reward, next_state, done)
            if len(replay_buffer) > batch_size:
                agent.train_step(replay_buffer=replay_buffer, batch_size=batch_size)
            state = next_state
            episode_reward += reward
            th = np.arccos(state[0]) * np.sign(state[1])
            max_th = max(max_th, abs(th))
            ave_th += abs(th)
        rewards.append(episode_reward)
        max_angle.append(max_th)
        ave_angle.append(ave_th / num_steps)

    pickle.dump(agent.policy_net, open('last_policy.pickle', 'wb'))
    pickle.dump(rewards, open('rewards.pickle', 'wb'))
    pickle.dump(max_angle, open('max_angle.pickle', 'wb'))
    pickle.dump(ave_angle, open('ave_angle.pickle', 'wb'))

    plt.figure(figsize=(10, 6))
    plt.plot(rewards)
    plt.title('Reward vs Episode')
    plt.savefig('rewards.png', dpi=100)
    plt.close()
def __init__(
    self,
    nc: int,
    nz: int,
    ngf: int,
    ndf: int,
    ng_blocks: int,
    nd_layers: int,
    ksize_d: int,
    norm_type: str,
    lambda_A: float,
    lambda_B: float,
    lambda_idt: float,
) -> None:
    """Construct CycleGAN.

    Parameters:
    -----------
        nc: the number of image channels
        nz: size of z latent vector
        ngf: size of feature maps in generator
        ndf: size of feature maps in discriminator
        ng_blocks: the number of residual blocks
        nd_layers: the number of conv layers in the discriminator
        ksize_d: kernel size of conv layers in the discriminator
        norm_type: normalization layer type, `batch` | `instance`
        lambda_A: forward cycle loss weight
        lambda_B: backward cycle loss weight
        lambda_idt: identity loss weight
    """
    super(CycleGAN, self).__init__()

    # Generators
    self.G_AB = ResidualGenerator(nz, nc, ngf, norm_type, ng_blocks)
    init_weights(self.G_AB)
    self.G_BA = ResidualGenerator(nz, nc, ngf, norm_type, ng_blocks)
    init_weights(self.G_BA)

    # Discriminators
    self.D_A = PatchDiscriminator(nc, ksize_d, ndf, nd_layers, norm_type)
    init_weights(self.D_A)
    self.D_B = PatchDiscriminator(nc, ksize_d, ndf, nd_layers, norm_type)
    init_weights(self.D_B)

    # Replay buffers for generated (fake) images
    self.replay_buffer = {"fake_A": ReplayBuffer(), "fake_B": ReplayBuffer()}

    # Optimizers
    self.optimizers = {}
    # Schedulers
    self.schedulers = {}
    # Criterions
    self.criterions = {"gan": None, "cycle": None, "idt": None}
    # Loss weights
    self.lambdas = {"A": lambda_A, "B": lambda_B, "idt": lambda_idt}
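# The fake_A / fake_B buffers above follow the CycleGAN idea of an image pool: generated
# images are mixed with previously generated ones before being fed to the discriminators,
# which reduces oscillation during training. A minimal sketch of such a pool is below;
# the actual ReplayBuffer class used by this model may differ in details.
import random

import torch


class ImagePool:
    """Keep up to max_size previously generated images and return a mix of old and new ones."""

    def __init__(self, max_size: int = 50):
        self.max_size = max_size
        self.images = []

    def push_and_pop(self, batch: torch.Tensor) -> torch.Tensor:
        out = []
        for image in batch:
            image = image.unsqueeze(0)
            if len(self.images) < self.max_size:
                # pool not full yet: store the new image and return it unchanged
                self.images.append(image)
                out.append(image)
            elif random.random() > 0.5:
                # return a randomly chosen old image and replace it with the new one
                idx = random.randint(0, self.max_size - 1)
                out.append(self.images[idx].clone())
                self.images[idx] = image
            else:
                # return the new image without storing it
                out.append(image)
        return torch.cat(out, dim=0)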
def __init__(self, gamma, epsilon, lr, n_actions, input_dims, mem_size, mem_alpha,
             batch_size, beta, beta_max, beta_increment, eps_min=0.01, eps_dec=5e-7,
             replace=1000, chkpt_dir='tmp/dqn', algo=None, env_name=None):
    self.gamma = gamma
    self.epsilon = epsilon
    self.lr = lr
    self.beta = beta
    self.beta_max = beta_max
    self.beta_increment = beta_increment
    self.input_dims = input_dims
    self.n_actions = n_actions
    self.batch_size = batch_size
    self.eps_min = eps_min
    self.eps_dec = eps_dec
    self.replace_target_cntr = replace
    self.action_space = [i for i in range(self.n_actions)]
    self.learn_step_counter = 0

    self.memory = ReplayBuffer(mem_size, input_dims, n_actions, mem_alpha)

    self.q_eval = DeepQNetwork(self.lr, self.n_actions, input_dims=self.input_dims,
                               model_name=env_name + "_" + algo + "_q_eval",
                               model_dir=chkpt_dir)
    self.q_next = DeepQNetwork(self.lr, self.n_actions, input_dims=self.input_dims,
                               model_name=env_name + "_" + algo + "_q_next",
                               model_dir=chkpt_dir)
def __init__(self, state_size, action_size, actor_lr, critic_lr, random_seed,
             mu, theta, sigma, buffer_size, batch_size, epsilon_start, epsilon_min,
             epsilon_decay, gamma, tau, n_time_steps, n_learn_updates, device):
    self.state_size = state_size
    self.action_size = action_size
    self.actor_lr = actor_lr
    self.critic_lr = critic_lr

    # Actor Network (w/ Target Network)
    self.actor_local = Actor(state_size, action_size, name="Actor_local")
    self.actor_target = Actor(state_size, action_size, name="Actor_target")
    self.actor_optimizer = Adam(learning_rate=self.actor_lr)

    # Critic Network (w/ Target Network)
    self.critic_local = Critic(state_size, action_size, name="Critic_local")
    self.critic_target = Critic(state_size, action_size, name="Critic_target")
    self.critic_optimizer = Adam(learning_rate=self.critic_lr)

    # Initialize target model parameters with local model parameters
    self.critic_target.model.set_weights(self.critic_local.model.get_weights())
    self.actor_target.model.set_weights(self.actor_local.model.get_weights())

    # Noise process
    self.noise = GaussianNoise(action_size, random_seed, mu, sigma)
    self.epsilon = epsilon_start
    self.epsilon_min = epsilon_min
    self.epsilon_decay = epsilon_decay

    # Replay memory
    self.batch_size = int(batch_size)
    self.buffer_size = int(buffer_size)
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size, random_seed)

    # Algorithm parameters
    self.gamma = gamma  # discount factor
    self.tau = tau      # for soft update of target parameters
    self.n_time_steps = n_time_steps        # number of time steps before updating network parameters
    self.n_learn_updates = n_learn_updates  # number of updates per learning step

    # Device
    self.device = device

    tf.keras.backend.clear_session()
def __init__(self, state_size, action_size, num_agents, random_seed):
    """Initialize an MADDPG Agent object.

    Params
    ======
        :param state_size: dimension of each state
        :param action_size: dimension of each action
        :param num_agents: number of inner agents
        :param random_seed: random seed
    """
    super().__init__(state_size, action_size, num_agents, random_seed)
    self.state_size = state_size
    self.action_size = action_size
    self.num_agents = num_agents
    self.seed = random.seed(random_seed)

    self.actors_local = []
    self.actors_target = []
    self.actor_optimizers = []
    self.critics_local = []
    self.critics_target = []
    self.critic_optimizers = []

    for i in range(num_agents):
        # Actor Network (w/ Target Network)
        self.actors_local.append(Actor(state_size, action_size, random_seed).to(device))
        self.actors_target.append(Actor(state_size, action_size, random_seed).to(device))
        self.actor_optimizers.append(
            optim.Adam(self.actors_local[i].parameters(), lr=LR_ACTOR))

        # Critic Network (w/ Target Network)
        self.critics_local.append(
            Critic(num_agents * state_size, num_agents * action_size, random_seed).to(device))
        self.critics_target.append(
            Critic(num_agents * state_size, num_agents * action_size, random_seed).to(device))
        self.critic_optimizers.append(
            optim.Adam(self.critics_local[i].parameters(), lr=LR_CRITIC,
                       weight_decay=WEIGHT_DECAY))

    # Noise process for each agent
    self.noise = OUNoise((num_agents, action_size), random_seed)

    # Replay memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)

    # debugging variables
    self.step_count = 0
    self.mse_error_list = []
def train_init(self):
    """Initialize the replay buffer and bookkeeping variables used for training Q."""
    self.replay_buffer = ReplayBuffer(self.config.buffer_size, self.config.state_history)
    self.max_q_values = deque(maxlen=1000)
def train(self):
    start = time.time()
    replay_buffer = ReplayBuffer(self.config.replay_buffer_size, self.observation_dim,
                                 self.action_dim, self.env.n)
    # replay_buffer = Memory(self.config.replay_buffer_size)
    self.current_obs_n = self.env.reset()
    self.current_episode_length = 0

    for t in range(self.config.num_batches):
        self.current_batch_num = t
        samples = self.sample_n(self.env, replay_buffer, self.config.train_freq,
                                self.config.batch_size)
        for i in range(self.env.n):
            agent_net = self.agent_networks[i]
            if t % self.config.eval_freq == 0:
                agent_net.adapt_param_noise(samples)
            agent_net.train_for_batch_samples(samples, agents_list=self.agent_networks)

        # periodically do a test run to evaluate policies so far
        if t % self.config.eval_freq == 0:
            self.logger.info("Batch " + str(t) + ":")
            self.test_run(self.env, self.config.eval_episodes)

    self.logger.info("- Training all done.")
    self.logger.info("Total training time: " + str(time.time() - start) + " seconds.")
def run_training(args):
    env = gym.make(args.env)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])
    print('States: %i' % (state_dim))
    print('Actions: %i' % (action_dim))

    # TODO
    scale = max_action * np.ones(action_dim)

    agent = TD3Agent(state_dim, action_dim, scale,
                     '_'.join([args.model_path, args.date]))
    replay_buffer = ReplayBuffer(state_dim, action_dim,
                                 buffer_size=args.buffer_size,
                                 batch_size=args.batch_size)
    logger = Logger(log_path='_'.join([args.log_path, args.date]))

    run_train_loop(args, env, agent, replay_buffer, logger)
def __init__(self, color: Color, model_name: str, train_policy: TrainablePolicy,
             immediate_reward: Reward, final_reward: Reward, board_size: int,
             discount_factor: float = 1.0) -> None:
    super().__init__(color)
    self.weights_path: str = f'weights\\{model_name}_{self.color.name}'
    self.train_policy: TrainablePolicy = train_policy
    self.test_policy: OptimalTrainablePolicy = OptimalTrainablePolicy(board_size)
    self.immediate_reward: Reward = immediate_reward
    self.final_reward: Reward = final_reward
    self.board_size = board_size
    self.discount_factor: float = discount_factor
    self.replay_buffer: ReplayBuffer = ReplayBuffer((board_size**2 - 4) // 2)
    self.train_mode: Union[bool, None] = None

    try:
        # create new model
        self.dnn: Sequential = self.create_model()
        # load existing weights
        self.load_weights()
    except:
        # create new model
        self.dnn: Sequential = self.create_model()
        # save initial weights
        self.save_weights()
def __init__(self, color: Color, immediate_reward: ImmediateReward = None, board_size: int = 8):
    super().__init__(color, immediate_reward)
    self.board_size: int = board_size
    self.episode_rewards = []
    self.training_errors = []
    self.train_mode = False
    self.replay_buffer = ReplayBuffer(size=int(10e5))
def __init__(self, task):
    self.task = task
    self.state_size = task.state_size
    self.action_size = task.action_size
    self.action_low = task.action_low
    self.action_high = task.action_high
    self.action_range = self.action_high - self.action_low

    # Actor (Policy) Model
    self.actor_local = Actor(self.state_size, self.action_size,
                             self.action_low, self.action_high)
    self.actor_target = Actor(self.state_size, self.action_size,
                              self.action_low, self.action_high)

    # Critic (Value) Model
    self.critic_local = Critic(self.state_size, self.action_size)
    self.critic_target = Critic(self.state_size, self.action_size)

    # Initialize target model parameters with local model parameters
    self.critic_target.model.set_weights(self.critic_local.model.get_weights())
    self.actor_target.model.set_weights(self.actor_local.model.get_weights())

    # Noise process
    self.exploration_mu = 0
    self.exploration_theta = 0.15
    self.exploration_sigma = 0.2 * (self.action_range)
    self.noise = OUNoise(self.action_size, self.exploration_mu,
                         self.exploration_theta, self.exploration_sigma)

    # Replay memory
    self.buffer_size = 100000
    self.batch_size = 64
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    # Algorithm parameters (CartPole)
    # self.gamma = 0.99  # discount factor
    # self.tau = 0.01    # for soft update of target parameters

    # Algorithm parameters (Quadcopter)
    self.gamma = 0.99  # discount factor
    self.tau = 0.01    # for soft update of target parameters
def __init__(self, task):
    self.task = task
    self.state_size = task.state_size
    self.action_size = task.action_size
    self.action_low = task.action_low
    self.action_high = task.action_high

    # learning rate
    actor_learning_rate = 0.0001
    critic_learning_rate = 0.001

    # Actor (Policy) Model
    self.actor_local = Actor(self.state_size, self.action_size, self.action_low,
                             self.action_high, actor_learning_rate)
    self.actor_target = Actor(self.state_size, self.action_size, self.action_low,
                              self.action_high, actor_learning_rate)

    # Critic (Value) Model
    self.critic_local = Critic(self.state_size, self.action_size, critic_learning_rate)
    self.critic_target = Critic(self.state_size, self.action_size, critic_learning_rate)

    # Initialize target model parameters with local model parameters
    self.critic_target.model.set_weights(self.critic_local.model.get_weights())
    self.actor_target.model.set_weights(self.actor_local.model.get_weights())

    # Noise process
    self.exploration_mu = 0
    self.exploration_theta = 0.15
    self.exploration_sigma = 0.2
    self.noise = OUNoise(self.action_size, self.exploration_mu,
                         self.exploration_theta, self.exploration_sigma)

    # Replay memory
    self.buffer_size = 100000
    self.batch_size = 64
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    # Algorithm parameters
    self.gamma = 0.99  # discount factor
    self.tau = 0.001   # for soft update of target parameters

    # Score tracker
    self.score = -np.inf
    self.best_score = -np.inf
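# Both task-based DDPG agents above keep a tau attribute "for soft update of target
# parameters", but the update itself is not part of these constructors. A minimal sketch of
# how such an update is commonly written for Keras models (hypothetical helper; the original
# classes may implement it differently):
def soft_update(local_model, target_model, tau):
    """Blend local weights into target weights: target = tau * local + (1 - tau) * target."""
    local_weights = local_model.get_weights()
    target_weights = target_model.get_weights()
    new_weights = [tau * lw + (1.0 - tau) * tw
                   for lw, tw in zip(local_weights, target_weights)]
    target_model.set_weights(new_weights)


# Example call inside a learn step, using the wrapped Keras models of the agents above:
# soft_update(self.critic_local.model, self.critic_target.model, self.tau)
# soft_update(self.actor_local.model, self.actor_target.model, self.tau)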
def __init__(self, state_size, action_size, seed, checkpoint=None):
    """Constructor.

    :param state_size:
    :param action_size:
    :param seed:
    :param checkpoint: if running from a checkpoint
    """
    self.state_size = state_size
    self.action_size = action_size
    self.seed = np.random.seed(seed)

    # As for any DQN implementation, we create a local and a target network.
    # In this case we use the DuelingDQN implementation for both networks.
    self.qnetwork_local = DuelingDQNetwork(state_size, action_size, seed,
                                           fc1_units=FC1_UNITS, fc2_units=FC2_UNITS).to(device)
    self.qnetwork_target = DuelingDQNetwork(state_size, action_size, seed,
                                            fc1_units=FC1_UNITS, fc2_units=FC2_UNITS).to(device)
    self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

    if checkpoint:
        # If we have a checkpoint, load its state into the networks and the optimizer.
        print('Using Checkpoint...')
        self.qnetwork_local.load_state_dict(checkpoint['local_state_dict'])
        self.qnetwork_target.load_state_dict(checkpoint['target_state_dict'])
        self.optimizer.load_state_dict(checkpoint['optimizer'])

    # Replay memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
    # Initialize time step (for updating every UPDATE_EVERY steps)
    self.t_step = 0
def setup_ddpg(self, args):
    sess = self.sess
    tf.set_random_seed(int(args['random_seed']))

    # Fetch environment state and action space properties
    state_dim = self.env.observation_space["observation"].shape[0]
    action_dim = self.env.action_space.shape[0]
    action_bound = self.env.action_space.high
    # Ensure action bound is symmetric
    assert (all(self.env.action_space.high - self.env.action_space.low))

    self.actor = ActorNetwork(sess, state_dim, action_dim, action_bound,
                              float(args['actor_lr']), float(args['tau']),
                              int(args['minibatch_size']))
    self.critic = CriticNetwork(sess, state_dim, action_dim,
                                float(args['critic_lr']), float(args['tau']),
                                float(args['gamma']),
                                self.actor.get_num_trainable_vars())
    self.actor_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(action_dim))

    # Set up summary Ops
    self.summary_ops, self.summary_vars = build_summaries()

    sess.run(tf.global_variables_initializer())

    # Initialize target network weights
    self.actor.update_target_network()
    self.critic.update_target_network()

    # Initialize replay memory
    self.replay_buffer = ReplayBuffer(int(args['buffer_size']), int(args['random_seed']))

    # Needed to enable BatchNorm.
    # This hurts the performance on Pendulum but could be useful
    # in other environments.
    tflearn.is_training(True)
def __init__(self, state_size, action_size, seed, gamma=0.99, step_size=1, dueling_dqn=False):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        seed (int): random seed
    """
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(seed)

    # Q-Network
    if dueling_dqn:
        print("Use dueling dqn")
        self.qnetwork_local = NoisyDuelingDQN(state_size, action_size, seed).to(device)
        self.qnetwork_target = NoisyDuelingDQN(state_size, action_size, seed).to(device)
    else:
        print("Use non-dueling dqn")
        self.qnetwork_local = DQN(state_size, action_size, seed).to(device)
        self.qnetwork_target = DQN(state_size, action_size, seed).to(device)

    self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

    # Replay memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
    # Initialize time step (for updating every UPDATE_EVERY steps)
    self.t_step = 0
    self.gamma = gamma
    self.step_size = step_size
def evaluate(self, env=None, num_episodes=None):
    """Evaluation with same procedure as the training."""
    # log our activity only if default call
    if num_episodes is None:
        self.logger.info("Evaluating...")

    # arguments defaults
    if num_episodes is None:
        num_episodes = self.config.num_episodes_test
    if env is None:
        env = self.env

    # replay memory to play
    replay_buffer = ReplayBuffer(self.config.buffer_size, self.config.state_history)
    rewards = []

    for i in range(num_episodes):
        total_reward = 0
        state = env.reset()
        state = state.reshape([1, -1, 1])
        while True:
            if self.config.render_test:
                env.render()

            # store last state in buffer
            idx = replay_buffer.store_frame(state)
            q_input = replay_buffer.encode_recent_observation()

            action = self.get_action(q_input)

            # perform action in env
            new_state, reward, done, info = env.step(action)

            # store in replay memory
            replay_buffer.store_effect(idx, action, reward, done)
            state = new_state
            state = state.reshape([1, -1, 1])

            # count reward
            total_reward += reward
            if done:
                break

        # updates to perform at the end of an episode
        rewards.append(total_reward)

    avg_reward = np.mean(rewards)
    sigma_reward = np.sqrt(np.var(rewards) / len(rewards))

    if num_episodes >= 1:
        msg = "Average reward: {:04.2f} +/- {:04.2f}".format(avg_reward, sigma_reward)
        self.logger.info(msg)

    return avg_reward
def __init__(self, state_size, action_size, num_agents, random_seed):
    """Initialize a DDPG Agent object.

    :param state_size (int): dimension of each state
    :param action_size (int): dimension of each action
    :param num_agents (int): number of agents in the environment to use DDPG
    :param random_seed (int): random seed
    """
    super().__init__(state_size, action_size, num_agents, random_seed)
    self.state_size = state_size
    self.action_size = action_size
    self.num_agents = num_agents
    self.seed = random.seed(random_seed)

    # Actor Network (w/ Target Network)
    self.actor_local = Actor(state_size, action_size, random_seed).to(device)
    self.actor_target = Actor(state_size, action_size, random_seed).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

    # Critic Network (w/ Target Network)
    self.critic_local = Critic(state_size, action_size, random_seed).to(device)
    self.critic_target = Critic(state_size, action_size, random_seed).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC,
                                       weight_decay=WEIGHT_DECAY)

    # Noise process for each agent
    self.noise = OUNoise((num_agents, action_size), random_seed)

    # Replay memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)

    # debug of the MSE critic loss
    self.mse_error_list = []
def __init__(self, state_size, action_size, args, device):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        args: configuration arguments (agent type, hidden size, seed, lr, buffer/batch size)
        device: torch device used for the networks
    """
    self.state_size = state_size
    self.action_size = action_size
    self.hidden_size = args.hidden_size
    self.seed = args.seed
    self.args = args
    self.device = device

    assert self.args.agent in ['dqn', 'double_dqn', 'dueling_dqn'], \
        "invalid agent name"
    if self.args.agent == "double_dqn":
        print("Implementing Double DQN!")
    elif self.args.agent == "dueling_dqn":
        print("Implementing Dueling DQN!")
    else:
        print("Implementing DQN")

    # Q-Network
    if self.args.agent == "dueling_dqn":
        self.qnetwork_local = DuelingQNetwork(state_size, action_size,
                                              self.hidden_size, self.seed).to(device)
        self.qnetwork_target = DuelingQNetwork(state_size, action_size,
                                               self.hidden_size, self.seed).to(device)
    else:
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       self.hidden_size, self.seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        self.hidden_size, self.seed).to(device)

    print("Agent Architecture")
    print(self.qnetwork_local)

    self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=self.args.lr)

    # Replay memory
    self.memory = ReplayBuffer(action_size, args.buffer_size, args.batch_size,
                               self.seed, self.device)
    # Initialize time step (for updating every UPDATE_EVERY steps)
    self.t_step = args.update_frequency
def __init__(self, state_size, action_size, seed):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        seed (int): random seed
    """
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(seed)

    # Q-Network
    self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
    self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
    self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

    # Replay memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
    # Initialize time step (for updating every UPDATE_EVERY steps)
    self.t_step = 0
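# Many of the agents above construct ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
# without showing its definition. A minimal sketch of what such a uniform-sampling buffer
# typically looks like (returning PyTorch tensors); the implementations actually paired with
# these agents may differ in details such as device placement.
import random
from collections import deque, namedtuple

import numpy as np
import torch


class ReplayBuffer:
    """Fixed-size buffer that stores experience tuples and samples them uniformly at random."""

    def __init__(self, action_size, buffer_size, batch_size, seed):
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple(
            "Experience", field_names=["state", "action", "reward", "next_state", "done"])
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        """Add a new experience to memory."""
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        """Randomly sample a batch of experiences and stack them into tensors."""
        experiences = random.sample(self.memory, k=self.batch_size)
        states = torch.from_numpy(np.vstack([e.state for e in experiences])).float()
        actions = torch.from_numpy(np.vstack([e.action for e in experiences])).long()
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences])).float()
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences])).float()
        dones = torch.from_numpy(
            np.vstack([e.done for e in experiences]).astype(np.uint8)).float()
        return states, actions, rewards, next_states, dones

    def __len__(self):
        """Return the current number of stored experiences."""
        return len(self.memory)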
def init_agent(self, id_, game_type):
    super(DQNAgent, self).init_agent(id_, game_type)

    # Assume the graph has been constructed.
    # Create a tf Session and run initializer of variables.
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    self._session = tf.Session(config=tf_config)

    # Tensorboard
    self._add_summary()

    # Initialize all variables.
    init = tf.global_variables_initializer()
    self._session.run(init)

    # Synchronise q and target_q networks.
    self._session.run(self._update_target_op)

    # for saving networks weights
    self._saver = tf.train.Saver()

    # Initialize replay buffer and variables.
    self._train_replay_buffer = ReplayBuffer(self._config.buffer_size,
                                             self._config.state_history)
    self._train_rewards = deque(maxlen=self._config.num_episodes_test)
    self._train_max_q_values = deque(maxlen=1000)
    self._train_q_values = deque(maxlen=1000)
    self._init_averages()

    self._time_step = 0
    self._progress_bar = Progbar(target=self._config.nsteps_train)
    self._has_episode_started = False

    if not self._train_from_scratch:
        self._load()
def run_episode(
    env,
    q_func,
    replay_buffer_size=1000000,
    frame_history_len=4,
    game=None,
):
    assert type(env.observation_space) == gym.spaces.Box
    assert type(env.action_space) == gym.spaces.Discrete

    if len(env.observation_space.shape) == 1:
        input_arg = env.observation_space.shape[0]
    else:
        img_h, img_w, img_c = env.observation_space.shape
        input_arg = frame_history_len * img_c
    num_actions = env.action_space.n

    Q = q_func(input_arg, num_actions).type(dtype)
    Q.load_state_dict(torch.load("./models/PAL_{}.pth".format(game),
                                 map_location=lambda storage, loc: storage))

    replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len)

    all_obs = []
    last_obs = env.reset()
    for t in count():
        last_idx = replay_buffer.store_frame(last_obs)
        recent_observations = replay_buffer.encode_recent_observation()
        all_obs.append(recent_observations)

        torch_obs = torch.from_numpy(recent_observations).type(dtype).unsqueeze(0) / 255.0
        with torch.no_grad():
            Qvals = Q(torch_obs).data[0]
        max2val, max2idx = Qvals.topk(2)
        action = max2idx[0]

        obs, reward, done, _ = env.step(action)
        env.render()
        replay_buffer.store_effect(last_idx, action, reward, done)

        if done:
            break
        last_obs = obs

    return all_obs
def __init__(self, env, params):
    self.env = env
    self.params = params
    self.epsilon = self.params["epsilon_start"]
    self.replay_buffer = ReplayBuffer(int(self.params["buffer_size"]))
    self.n_actions = self.env.action_space.n

    if torch.cuda.is_available():
        self.device = "cuda:0"
    else:
        self.device = "cpu"

    self.Q = CNN(self.n_actions).to(self.device)
    self.Q_target = CNN(self.n_actions).to(self.device)
    self.optimizer = optim.RMSprop(self.Q.parameters(), lr=2.5e-4)