def Load_Cfagent(defaults):
    with Load(defaults["load_name"], num=defaults['num']) as load:
        collector, env, mover, teleporter, CFagent = load.items(Collector, Game, Mover, Teleporter, CFAgent)
        buffer = ReplayBuffer(**defaults)
        CFbuffer = CFReplayBuffer(**defaults)
        with Save(env, collector, mover, teleporter, CFagent, **defaults) as save:
            intervention_idx, modified_board = teleporter.pre_process(env)
            dones = CFagent.pre_process(env)
            CF_dones, cfs = None, None
            CFagent.CF_count = 0
            for frame in loop(env, collector, save, teleporter):
                CFagent.counterfact(env, dones, teleporter, CF_dones, cfs)
                modified_board = teleporter.interveen(env.board, intervention_idx, modified_board)
                actions = mover(modified_board)
                observations, rewards, dones, info = env.step(actions)
                modified_board, modified_rewards, modified_dones, teleport_rewards, intervention_idx = teleporter.modify(observations, rewards, dones, info)
                buffer.teleporter_save_data(teleporter.boards, observations, teleporter.interventions, teleport_rewards, dones, intervention_idx)
                mover.learn(modified_board, actions, modified_rewards, modified_dones)
                board_before, board_after, intervention, tele_rewards, tele_dones = buffer.sample_data()
                teleporter.learn(board_after, intervention, tele_rewards, tele_dones, board_before)
                collector.collect([rewards, modified_rewards, teleport_rewards], [dones, modified_dones])
                CF_dones, cfs = CFagent.counterfact_check(dones, env, **defaults)
                CFbuffer.CF_save_data(CFagent.boards, observations, CFagent.counterfactuals, rewards, dones, CF_dones)
                CFboard, CFobs, cf, CFrewards, CFdones1 = CFbuffer.sample_data()
                CFagent.learn(CFobs, cf, CFrewards, CFdones1, CFboard)
def __init__(self, state_size, action_size, seed, model=QNetwork):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        seed (int): random seed
        model (object): model to use

    Returns
    =======
        None
    """
    self.state_size = state_size
    self.action_size = action_size
    self.seed = seed

    # Q-Network
    self.qnetwork_local = model(state_size, action_size, seed).to(device)
    self.qnetwork_target = model(state_size, action_size, seed).to(device)
    self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=hyperparameters["lr"])

    # Replay memory
    self.memory = ReplayBuffer(action_size, hyperparameters["buffer_size"], hyperparameters["batch_size"], seed, device)

    # Initialize time step (for updating every hyperparameters["update_every"] steps)
    self.t_step = 0

    # Init tracking of params
    wandb.login()
    wandb.init(project=project_name, name=name, config=hyperparameters)
    jovian.log_hyperparams(hyperparameters)
def __init__(self, task):
    self.task = task
    self.state_size = task.state_size
    self.action_size = task.action_size
    self.action_low = task.action_low
    self.action_high = task.action_high

    # Actor (Policy) Model
    self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
    self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

    # Critic (Value) Model
    self.critic_local = Critic(self.state_size, self.action_size)
    self.critic_target = Critic(self.state_size, self.action_size)

    # Initialize target model parameters with local model parameters
    self.critic_target.model.set_weights(self.critic_local.model.get_weights())
    self.actor_target.model.set_weights(self.actor_local.model.get_weights())

    # Noise process
    self.exploration_mu = 0
    self.exploration_theta = 0.15
    self.exploration_sigma = 0.2
    self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

    # Replay memory
    self.buffer_size = 100000
    self.batch_size = 64
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    # Algorithm parameters
    self.gamma = 0.99  # discount factor
    self.tau = 0.1     # for soft update of target parameters
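# Several of the DDPG-style agents in this section build an OUNoise exploration process
# from (size, mu, theta, sigma), but the class itself is not included. The following is a
# minimal sketch of a standard Ornstein-Uhlenbeck process under that assumed constructor;
# it is illustrative only and not the exact implementation used by the snippets above.
import numpy as np


class OUNoiseSketch:
    """Hypothetical Ornstein-Uhlenbeck process: dx = theta * (mu - x) + sigma * N(0, 1)."""

    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        # return the internal state to the long-run mean
        self.state = np.copy(self.mu)

    def sample(self):
        # mean-reverting drift plus Gaussian diffusion
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.standard_normal(self.mu.shape)
        self.state = self.state + dx
        return self.state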
def __init__(self, state_size, action_size, agent_id):
    self.state_size = state_size
    self.action_size = action_size
    self.seed = args['seed']
    self.device = args['device']
    # self.args = args

    # Actor network (local and target)
    self.actor_network = ActorNetwork(state_size, action_size).to(self.device)
    self.actor_target = ActorNetwork(state_size, action_size).to(self.device)
    self.actor_optimizer = optim.Adam(self.actor_network.parameters(), lr=args['LR_ACTOR'])

    # Model takes too long to run --> load model weights from previous run (took > 24 hours on my machine)
    # if not agent_id:
    #     self.actor_network.load_state_dict(torch.load(args['agent_p0_path']), strict=False)
    #     self.actor_target.load_state_dict(torch.load(args['agent_p0_path']), strict=False)
    # else:
    #     self.actor_network.load_state_dict(torch.load(args['agent_p1_path']), strict=False)
    #     self.actor_target.load_state_dict(torch.load(args['agent_p1_path']), strict=False)

    # Replay memory
    self.memory = ReplayBuffer(action_size, args['BUFFER_SIZE'], args['BATCH_SIZE'], self.device, self.seed)

    # Noise process
    self.noise = OUNoise(action_size, self.seed)

    # Initialize time step (for updating every UPDATE_EVERY steps)
    self.t_step = 0
    self.mCriticLoss = 0
    self.actorLoss = 0
def get_replay_buffer(self, gamma, env):
    total_score, steps, n = 0, 0, 0
    replay_buffer = ReplayBuffer()
    state = self.state_modifier.apply(env.reset())
    while steps < self.steps:
        self._episodes += 1
        n += 1
        if n == 1:
            print("0 state value {}".format(self.critic.get_values(state).detach()[0]))
        score = 0
        while True:  # timelimits
            if self.render:
                env.render()
            action = self.actor.get_action(state).detach()
            next_state, reward, done, tl, _ = env.step(action)
            next_state = self.state_modifier.apply(next_state)
            if tl == 1:
                reward += self.critic.get_values(next_state).detach()[0] * gamma
            score += reward
            replay_buffer.append(state, action, reward, done == 1)
            state = next_state
            total_score, steps = total_score + reward, steps + 1
            if done == 1:
                break
    print("episodes: {}, score: {}, avg steps: {}, avg reward {}".format(
        self._episodes, total_score / n, steps / n, total_score / steps))
    return replay_buffer, total_score / n
def train(self, transitions: int, sigma_max: float = 1., sigma_min: float = 0.,
          buffer_size: int = 10000, batch_size: int = 128, progress_upd_step: int = None,
          start_training: int = 1000, shaping_coef: float = 300.):
    history = ReplayBuffer(buffer_size)
    progress_upd_step = progress_upd_step if progress_upd_step else transitions // 100

    log = {
        "alpha": self.alpha,
        "gamma": self.gamma,
        "sigma_max": sigma_max,
        "sigma_min": sigma_min,
        "buffer_size": buffer_size,
        "batch_size": batch_size,
        "tau": self.tau,
        "shaping_coef": shaping_coef,
        "step": [],
        "reward_mean": [],
        "reward_std": []
    }

    state = self.reset()
    t = tqdm(range(transitions))
    for i in t:
        sigma = sigma_max - (sigma_max - sigma_min) * i / transitions

        action = self.act(state)
        noise = np.random.normal(scale=sigma, size=action.shape)
        action = np.clip(action + noise, -1, 1)

        next_state, reward, done, _ = self.env.step(action)
        reward += shaping_coef * (self.gamma * np.abs(next_state[1]) - np.abs(state[1]))
        done_ = next_state[0] >= 0.5

        history.add((state, action, next_state, reward, done_))
        state = self.reset() if done else next_state

        if i > start_training:
            batch = history.sample(batch_size)
            self.update_critic(batch)
            self.update_actor(batch)

        if (i + 1) % progress_upd_step == 0:
            reward_mean, reward_std = self.evaluate_policy()
            log["step"].append(i)
            log["reward_mean"].append(reward_mean)
            log["reward_std"].append(reward_std)
            t.set_description(f"step: {i + 1} | Rmean = {reward_mean:0.4f} | Rstd = {reward_std:0.4f}")

    return log
def __init__(self, env, sess, low_action_bound_list, high_action_bound_list):
    self.env = env
    self.sess = sess
    self.low_action_bound_list = low_action_bound_list  # depends on the env
    self.high_action_bound_list = high_action_bound_list
    self.action_range_bound = [hi - lo for hi, lo in zip(self.high_action_bound_list, self.low_action_bound_list)]
    self.learning_rate = 0.0001  # TODO move these to configs
    self.epsilon = 1.0
    self.epsilon_min = 0.1
    self.epsilon_decay = 1e-6
    self.gamma = 0.99
    self.tau = 0.001
    self.buffer_size = 1000000
    self.batch_size = 128
    self.theta = 0.15
    self.ou = 0
    self.sigma = 0.3
    self.state_dim = self.env.observation_space.shape[0]
    self.action_dim = len(self.low_action_bound_list)  # self.env.action_space, make this into input
    self.continuous_action_space = True

    # Initialize replay buffer
    self.replay_buffer = ReplayBuffer(self.buffer_size)

    # Creating ACTOR model
    actor_ = Actor(self.state_dim, self.action_dim, self.learning_rate)
    self.actor_state_input, self.actor_model = actor_.create_actor_model()
    _, self.target_actor_model = actor_.create_actor_model()
    self.actor_critic_grad = tf.placeholder(tf.float32, [None, self.action_dim])
    actor_model_weights = self.actor_model.trainable_weights
    self.actor_grads = tf.gradients(self.actor_model.output, actor_model_weights, -self.actor_critic_grad)
    grads = zip(self.actor_grads, actor_model_weights)
    self.optimize = tf.train.AdamOptimizer(self.learning_rate).apply_gradients(grads)

    # Creating CRITIC model
    critic_ = Critic(self.state_dim, self.action_dim, self.learning_rate)
    self.critic_state_input, self.critic_action_input, self.critic_model = critic_.create_critic_model()
    _, _, self.target_critic_model = critic_.create_critic_model()
    self.critic_grads = tf.gradients(self.critic_model.output, self.critic_action_input)

    self.noise = OrnsteinUhlenbeckProcess(size=self.action_dim)
    self.noise.reset()

    self.sess.run(tf.initialize_all_variables())
def metateleport(defaults):
    collector = Collector(**defaults)
    env = Game(**defaults)
    mover = Mover(env, _extra_dim=1, **defaults)
    teleporter1 = Teleporter(env, _extra_dim=1, **defaults)
    teleporter2 = MetaTeleporter(env, **defaults)
    buffer1 = ReplayBuffer(**defaults)
    buffer2 = ReplayBuffer(**defaults)
    with Save(env, collector, mover, teleporter1, teleporter2, **defaults) as save:
        intervention_idx2, modified_board2 = teleporter2.pre_process(env)
        intervention_idx1, _ = teleporter1.pre_process(env)
        for frame in loop(env, collector, save, teleporter1, teleporter2):
            modified_board2 = teleporter2.interveen(env.board, intervention_idx2, modified_board2)
            modified_board1 = teleporter1.interveen(env.board, intervention_idx1, modified_board2)
            actions = mover(modified_board1)
            observations, rewards, dones, info = env.step(actions)
            modified_board1, modified_board2, modified_rewards1, modified_rewards2, modified_dones1, modified_dones2, tele_rewards, intervention_idx1, intervention_idx2 = teleporter2.metamodify(observations, rewards, dones, info, teleporter1.interventions)
            buffer1.teleporter_save_data(teleporter1.boards, modified_board2, teleporter1.interventions, modified_rewards2, modified_dones2, intervention_idx1)
            buffer2.teleporter_save_data(teleporter2.boards, observations, teleporter2.interventions, tele_rewards, dones, intervention_idx2)
            mover.learn(modified_board1, actions, modified_rewards1, modified_dones1)
            board_before, board_after, intervention, tel_rewards, tele_dones = buffer1.sample_data()
            teleporter1.learn(board_after, intervention, tel_rewards, tele_dones, board_before)
            board_before, board_after, intervention, tel_rewards, tele_dones = buffer2.sample_data()
            teleporter2.learn(board_after, intervention, tel_rewards, tele_dones, board_before)
            collector.collect([rewards, modified_rewards1, modified_rewards2, tele_rewards], [dones, modified_dones1, modified_dones2])
def train(self, transitions: int, eps_max: float = 0.5, eps_min: float = 0.,
          buffer_size: int = 10000, batch_size: int = 128, shaping_coef: float = 300.,
          progress_upd_step: int = None, start_training: int = 10000):
    history = ReplayBuffer(size=buffer_size)
    progress_upd_step = progress_upd_step if progress_upd_step else transitions // 100

    log = {
        "alpha": self.alpha,
        "gamma": self.gamma,
        "buffer_size": buffer_size,
        "batch_size": batch_size,
        "tau": self.tau,
        "shaping_coef": shaping_coef,
        "eps_max": eps_max,
        "eps_min": eps_min,
        "step": [],
        "reward_mean": [],
        "reward_std": []
    }

    state = self.reset()
    t = tqdm(range(transitions))
    for i in t:
        eps = eps_max - (eps_max - eps_min) * i / transitions

        if random() < eps:
            action = self.env.action_space.sample()
        else:
            action = self.act(state)

        next_state, reward, done, _ = self.env.step(action)
        reward += shaping_coef * (self.gamma * np.abs(next_state[1]) - np.abs(state[1]))
        done_ = next_state[0] >= 0.5

        history.add((state, action, next_state, reward, done_))
        state = self.reset() if done else next_state

        if i > start_training:
            self.update(history.sample(batch_size))

        if (i + 1) % progress_upd_step == 0:
            reward_mean, reward_std = self.evaluate_policy()
            log["step"].append(i)
            log["reward_mean"].append(reward_mean)
            log["reward_std"].append(reward_std)
            t.set_description(f"step: {i + 1} | Rmean = {reward_mean:0.4f} | Rstd = {reward_std:0.4f}")

    return log
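# The two train() loops above construct a ReplayBuffer with a size argument and call
# add() / sample(batch_size) on it, but the buffer class is defined elsewhere. The
# sketch below shows a minimal buffer with that assumed interface; the real class in
# the source projects may store transitions differently.
import random
from collections import deque


class SimpleReplayBuffer:
    """Hypothetical fixed-size buffer of (state, action, next_state, reward, done) tuples."""

    def __init__(self, size: int = 10000):
        self.buffer = deque(maxlen=size)

    def add(self, transition):
        # transition is a tuple such as (state, action, next_state, reward, done)
        self.buffer.append(transition)

    def sample(self, batch_size: int):
        # uniform sampling without replacement from the stored transitions
        return random.sample(list(self.buffer), min(batch_size, len(self.buffer)))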
def CFagent(defaults):
    env = Game(**defaults)
    mover = Mover(env, _extra_dim=1, **defaults)
    teleporter = Teleporter(env, **defaults)
    buffer = ReplayBuffer(**defaults)
    CFagent = CFAgent(env, **defaults)
    CFbuffer = CFReplayBuffer(**defaults)
    collector = Collector(**defaults)
    with Save(env, collector, mover, teleporter, CFagent, **defaults) as save:
        intervention_idx, modified_board = teleporter.pre_process(env)
        dones = CFagent.pre_process(env)
        CF_dones, cfs = None, None
        for frame in loop(env, collector, save, teleporter):
            CFagent.counterfact(env, dones, teleporter, CF_dones, cfs)
            modified_board = teleporter.interveen(env.board, intervention_idx, modified_board)
            actions = mover(modified_board)
            observations, rewards, dones, info = env.step(actions)
            modified_board, modified_rewards, modified_dones, teleport_rewards, intervention_idx = teleporter.modify(observations, rewards, dones, info)
            buffer.teleporter_save_data(teleporter.boards, observations, teleporter.interventions, teleport_rewards, dones, intervention_idx)
            mover.learn(modified_board, actions, modified_rewards, modified_dones)
            board_before, board_after, intervention, tele_rewards, tele_dones = buffer.sample_data()
            teleporter.learn(board_after, intervention, tele_rewards, tele_dones, board_before)
            collector.collect([rewards, modified_rewards, teleport_rewards], [dones, modified_dones])
            CF_dones, cfs = CFagent.counterfact_check(dones, env, **defaults)
            CFbuffer.CF_save_data(CFagent.boards, observations, CFagent.counterfactuals, rewards, dones, CF_dones)
            CFboard, CFobs, cf, CFrewards, CFdones1 = CFbuffer.sample_data()
            CFagent.learn(CFobs, cf, CFrewards, CFdones1, CFboard)
def __init__(self, name, args, sess=None, reuse=False, log_tensorboard=True, save=True):
    self.learn_steps = 0

    # hyperparameters
    self.gamma = args[name]['gamma']
    self.tau = args[name]['tau']
    self.init_noise_sigma = args[name]['init_noise_sigma']
    self.noise_decay = args[name]['noise_decay']

    # replay buffer
    self.buffer = ReplayBuffer(sample_size=args['batch_size'], max_len=args[name]['buffer_size'])

    super(DDPG, self).__init__(name, args, sess=sess, reuse=reuse, build_graph=True,
                               log_tensorboard=log_tensorboard, save=save)

    self._initialize_target_net()
def __init__(self, state_size, action_size, fc1_units=256, fc2_units=128, device=torch.device('cpu')):
    """DQN agent.

    Args:
        state_size (int): dimension of each state
        action_size (int): dimension of each action (or the number of action choices)
        fc1_units (int): number of units in the first hidden layer
        fc2_units (int): number of units in the second hidden layer
        device (torch.device): device used for tensors
    """
    self.state_size = state_size
    self.action_size = action_size
    self.device = device

    # Q-Network
    self.qnetwork_local = QNetwork(state_size, action_size, fc1_units=fc1_units, fc2_units=fc2_units).to(self.device)
    self.qnetwork_target = QNetwork(state_size, action_size, fc1_units=fc1_units, fc2_units=fc2_units).to(self.device)
    self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

    # Initialize qnetwork_target parameters to qnetwork_local
    self.soft_update(self.qnetwork_local, self.qnetwork_target, 1)

    # Replay memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, device=self.device)

    # Initialize the time step counter (for updating every UPDATE_EVERY steps)
    self.t_step = 0
def __init__(self, env):
    # Hyperparameters
    self.GAMMA = 0.95
    self.BATCH_SIZE = 64
    self.BUFFER_SIZE = 20000
    self.ACTOR_LEARNING_RATE = 0.0001
    self.CRITIC_LEARNING_RATE = 0.001
    self.TAU = 0.001

    self.env = env
    # get state dimension
    self.state_dim = env.observation_space.shape[0]
    # get action dimension
    self.action_dim = env.action_space.shape[0]
    # get action bound
    self.action_bound = env.action_space.high[0]

    ## create actor and critic networks
    self.actor = Actor(self.state_dim, self.action_dim, self.action_bound, self.TAU, self.ACTOR_LEARNING_RATE)
    self.critic = Critic(self.state_dim, self.action_dim, self.TAU, self.CRITIC_LEARNING_RATE)

    ## initialize replay buffer
    self.buffer = ReplayBuffer(self.BUFFER_SIZE)

    # save the results
    self.save_epi_reward = []
def __init__(self, DQNType, input_shape, replaybuffersize=100000, input_preprocess=[]):
    super().__init__(MOVEMENTS.COMPLEX)
    self.memory = ReplayBuffer(replaybuffersize)
    self.train_network = DQNType(input_shape, len(self.movements))
    self.target_network = self.train_network.clone_model()
    self.input_preprocess = input_preprocess

    ## Initialize
    self.counter = 0
    self.epsilon = 1

    ## hyperparameters
    self.hyperparams = {
        "burn_in": 10000,
        "copy_each": 5000,
        "learn_each": 1,
        "save_each": 5000,
        "final_epsilon": 0.1,
        "epsilon_decay_rate": 0.99998,
        "batch_size": 32,
        "gamma": 0.99
    }
def __init__(self, state_size, action_size, device, buffer_size=int(1e5), batch_size=64,
             gamma=0.99, tau=1e-3, lr=5e-4, update_every=4):
    self.state_size = state_size
    self.action_size = action_size
    self.device = device
    self.buffer_size = buffer_size
    self.batch_size = batch_size
    self.gamma = gamma
    self.tau = tau
    self.lr = lr
    self.update_every = update_every

    # model settings
    self.qnet_local = Model(state_size, action_size).to(self.device)
    self.qnet_target = Model(state_size, action_size).to(self.device)
    self.optimizer = optim.Adam(self.qnet_local.parameters(), lr=self.lr)

    # replay buffer settings
    self.replay_buffer = ReplayBuffer(self.buffer_size, self.batch_size)
    self.update_step = 0
def __init__(self, state_size, action_size, seed, hidden_layers=[64, 64], buffer_size=int(1e5),
             batch_size=64, gamma=0.99, tau=1e-3, learning_rate=5e-4, update_every=4,
             head_name="DuelingDQN", head_scale="max"):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        seed (int): random seed
        hidden_layers (list of int; optional): number of nodes in each hidden layer
        buffer_size (int; optional): replay buffer size
        batch_size (int; optional): minibatch size
        gamma (float; optional): discount factor
        tau (float; optional): for soft update of target parameters
        learning_rate (float; optional): learning rate
        update_every (int; optional): how often to update the network
    """
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(seed)
    self.buffer_size = buffer_size
    self.batch_size = batch_size
    self.gamma = gamma
    self.tau = tau
    self.lr = learning_rate
    self.update_every = update_every

    # detect GPU device
    self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # Assign model parameters and assign device
    model_params = [state_size, action_size, seed, hidden_layers, head_name, head_scale]
    self.qnetwork_local = QNetwork(*model_params).to(self.device)
    self.qnetwork_target = QNetwork(*model_params).to(self.device)

    # Set up optimizer
    self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=self.lr)

    # Initialize Replay memory
    self.memory = ReplayBuffer(action_size, self.buffer_size, self.batch_size, seed, self.device)

    # Initialize time step (for updating every self.update_every steps)
    self.t_step = 0
def __init__(self, device, state_size, n_agents, action_size, random_seed,
             buffer_size, batch_size, gamma, TAU, lr_actor, lr_critic, weight_decay,
             learn_interval, learn_num, ou_sigma, ou_theta, checkpoint_folder='./'):
    # Set Computational device
    self.DEVICE = device

    # Init State, action and agent dimensions
    self.state_size = state_size
    self.n_agents = n_agents
    self.action_size = action_size
    self.seed = random.seed(random_seed)
    self.l_step = 0
    self.log_interval = 200

    # Init Hyperparameters
    self.BUFFER_SIZE = buffer_size
    self.BATCH_SIZE = batch_size
    self.GAMMA = gamma
    self.TAU = TAU
    self.LR_ACTOR = lr_actor
    self.LR_CRITIC = lr_critic
    self.WEIGHT_DECAY = weight_decay
    self.LEARN_INTERVAL = learn_interval
    self.LEARN_NUM = learn_num

    # Init Actor Network (w/ Target Network)
    self.actor_local = Actor(state_size, action_size, random_seed).to(device)
    self.actor_target = Actor(state_size, action_size, random_seed).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr_actor)

    # Init Critic Network (w/ Target Network)
    self.critic_local = Critic(state_size, action_size, random_seed).to(device)
    self.critic_target = Critic(state_size, action_size, random_seed).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=lr_critic, weight_decay=weight_decay)

    # Init Noise Process
    self.noise = OUNoise((n_agents, action_size), random_seed, mu=0., theta=ou_theta, sigma=ou_sigma)

    # Init Replay Memory
    self.memory = ReplayBuffer(device, action_size, buffer_size, batch_size, random_seed)
def __init__(self, num_agents, local_obs_dim, local_action_size, global_obs_dim, global_action_size,
             discount_factor=0.95, tau=0.02, device=device, random_seed=4, lr_critic=1.0e-4, weight_decay=0.0):
    super(MADDPG, self).__init__()

    # parameter configuration
    self.num_agents = num_agents
    self.device = device
    self.discount_factor = discount_factor
    self.tau = tau
    self.global_action_size = global_action_size
    self.global_obs_dim = global_obs_dim
    torch.manual_seed(random_seed)
    random.seed(random_seed)
    self.random_seed = random_seed
    self.weight_decay = weight_decay

    # define actors
    self.actors = [
        DDPGActor(num_agents, local_obs_dim, local_action_size, global_obs_dim, global_action_size, device=device)
        for _ in range(num_agents)
    ]

    # define centralized critic
    self.critic = Critic(global_obs_dim, global_action_size, self.random_seed).to(self.device)
    self.target_critic = Critic(global_obs_dim, global_action_size, self.random_seed).to(self.device)
    hard_update(self.target_critic, self.critic)
    self.critic_optimizer = Adam(self.critic.parameters(), lr=lr_critic, weight_decay=self.weight_decay)

    # noise coef
    self.noise_coef = 1.0
    self.noise_coef_decay = 1e-6

    # Replay memory
    self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, random_seed)
def __init__(self, state_size, action_size, num_agents, random_seed=0, params=params):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        num_agents (int): number of agents
        random_seed (int): random seed
    """
    self.state_size = state_size
    self.action_size = action_size
    self.num_agents = num_agents
    self.seed = random.seed(random_seed)
    self.params = params

    # Actor (Policy) Network (w/ Target Network)
    self.actor_local = Actor(state_size, action_size, random_seed).to(self.params['DEVICE'])
    self.actor_target = Actor(state_size, action_size, random_seed).to(self.params['DEVICE'])
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.params['LR_ACTOR'])

    # Critic (Value) Network (w/ Target Network)
    self.critic_local = Critic(state_size, action_size, random_seed).to(self.params['DEVICE'])
    self.critic_target = Critic(state_size, action_size, random_seed).to(self.params['DEVICE'])
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                       lr=self.params['LR_CRITIC'],
                                       weight_decay=self.params['WEIGHT_DECAY'])

    # Initialize target and local to same weights
    self.hard_update(self.actor_local, self.actor_target)
    self.hard_update(self.critic_local, self.critic_target)

    # Noise process
    self.noise = OUNoise(action_size, random_seed)

    # Replay memory
    self.memory = ReplayBuffer(action_size, self.params['BUFFER_SIZE'], self.params['BATCH_SIZE'], random_seed)
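# The actor-critic agents above call hard_update(...) and soft_update(...) helpers that
# are defined elsewhere in their repositories (sometimes as methods, sometimes as free
# functions). A minimal PyTorch sketch of the usual Polyak-averaging versions follows;
# the exact names and signatures in the original code bases may differ.
import torch


def hard_update(target: torch.nn.Module, source: torch.nn.Module) -> None:
    """Copy source parameters into target (equivalent to a soft update with tau = 1)."""
    target.load_state_dict(source.state_dict())


def soft_update(target: torch.nn.Module, source: torch.nn.Module, tau: float) -> None:
    """Blend target towards source: theta_target <- tau * theta_source + (1 - tau) * theta_target."""
    with torch.no_grad():
        for t_param, s_param in zip(target.parameters(), source.parameters()):
            t_param.mul_(1.0 - tau).add_(tau * s_param)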
def train_network(config: MuZeroConfig, storage: SharedStorage, replay_buffer: ReplayBuffer):
    network = storage.latest_network()  # recover the latest network to be updated
    learning_rate = config.lr_init * config.lr_decay_rate ** (network.training_steps() / config.lr_decay_steps)
    network.optimiser.learning_rate = learning_rate

    for i in range(config.training_steps + 1):
        if i % config.checkpoint_interval == 0:
            storage.save_network(network.training_steps(), network)
        batch = replay_buffer.sample_batch(config.num_unroll_steps, config.td_steps, config.prediction_interval)
        l = network.update_weights(batch, config.weight_decay, config.hidden_state_dampen)
        if i % 100 == 0:
            print((i, l))

    storage.save_network(network.training_steps(), network)
    return i
def __init__(self, state_size, action_size, seed, hidden_layers=[64, 64], drop_p=0.3,
             with_dueling=False, isDDQN=False):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        seed (int): random seed
        hidden_layers (array): number of nodes in each hidden layer
        drop_p (float [0-1]): probability of dropping nodes (dropout)
        with_dueling (boolean): if True, the network is a dueling network
        isDDQN (boolean): if True, double DQN is used
    """
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(seed)

    # Q-Network
    self.qnetwork_local = QNetwork(state_size, action_size, seed, hidden_layers=hidden_layers,
                                   drop_p=drop_p, dueling=with_dueling).to(device)
    self.qnetwork_target = QNetwork(state_size, action_size, seed, hidden_layers=hidden_layers,
                                    drop_p=drop_p, dueling=with_dueling).to(device)
    self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

    # Replay memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

    # Initialize time step (for updating every UPDATE_EVERY steps)
    self.t_step = 0

    # Parameter instance of DDQN
    self.isDDQN = isDDQN
def __init__(self, env, gamma, batch_size, buffer_size, lr_rate, tau):
    self.env = env
    self.state_dim = env.observation_space.shape[0]
    self.action_dim = env.action_space.shape[0]
    self.action_bound = env.action_space.high[0]
    self.gamma = gamma
    self.batch_size = batch_size
    self.buffer_size = buffer_size

    self.actor = Actor(self.state_dim, self.action_dim, self.action_bound, lr_rate[0], tau)
    self.critic = Critic(self.state_dim, self.action_dim, lr_rate[1], tau)

    self.buffer = ReplayBuffer(self.buffer_size)
    self.save_epi_reward = []
def __init__(self, state_size, action_size, seed, GAMMA=GAMMA, TAU=TAU, LR=LR,
             UPDATE_EVERY=UPDATE_EVERY, BUFFER_SIZE=BUFFER_SIZE, BATCH_SIZE=BATCH_SIZE):
    """Initialize the agent.

    ==========
    PARAMETERS
    ==========
    state_size (int) = observation dimension of the environment
    action_size (int) = dimension of each action
    seed (int) = random seed
    """
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(seed)
    self.gamma = GAMMA
    self.tau = TAU
    self.lr = LR
    self.update_every = UPDATE_EVERY
    self.buffer_size = BUFFER_SIZE
    self.batch_size = BATCH_SIZE
    self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # instantiate local (online) and target networks for weight updates
    self.qnetwork_local = QNetwork(state_size, action_size, seed).to(self.device)
    self.qnetwork_target = QNetwork(state_size, action_size, seed).to(self.device)
    self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=self.lr)

    # create a replay buffer
    self.memory = ReplayBuffer(action_size, self.buffer_size, self.batch_size, seed, self.device)

    # time step counter for updating the target network every UPDATE_EVERY steps
    self.t_step = 0
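# The DQN-style __init__ methods above keep a t_step counter "for updating every
# UPDATE_EVERY steps" but the companion step()/act() methods are not shown. Below is a
# minimal sketch of that common pattern, written as unbound methods; the method names,
# the memory.add()/sample() interface, and the learn() signature are assumptions rather
# than the originals.
import random

import numpy as np
import torch


def step(self, state, action, reward, next_state, done):
    # store the transition, then learn from a sampled batch every update_every steps
    self.memory.add(state, action, reward, next_state, done)
    self.t_step = (self.t_step + 1) % self.update_every
    if self.t_step == 0 and len(self.memory) > self.batch_size:
        experiences = self.memory.sample()
        self.learn(experiences, self.gamma)


def act(self, state, eps=0.0):
    # epsilon-greedy action selection from the local Q-network
    state_t = torch.from_numpy(np.asarray(state, dtype=np.float32)).unsqueeze(0).to(self.device)
    self.qnetwork_local.eval()
    with torch.no_grad():
        action_values = self.qnetwork_local(state_t)
    self.qnetwork_local.train()
    if random.random() > eps:
        return int(action_values.argmax(dim=1).item())
    return random.randrange(self.action_size)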
def __init__(self, env=gym.make('Pendulum-v0'), s_dim=2, a_dim=1, gamma=0.99, episodes=100,
             tau=0.001, buffer_size=1e06, minibatch_size=64, actor_lr=0.001, critic_lr=0.001,
             save_name='final_weights', render=False):
    self.save_name = save_name
    self.render = render
    self.env = env
    self.upper_bound = env.action_space.high[0]
    self.lower_bound = env.action_space.low[0]
    self.EPISODES = episodes
    self.MAX_TIME_STEPS = 200
    self.s_dim = s_dim
    self.a_dim = a_dim
    self.GAMMA = gamma
    self.TAU = tau
    self.buffer_size = buffer_size
    self.minibatch_size = minibatch_size
    self.actor_lr = actor_lr
    self.critic_lr = critic_lr

    self.ou_noise = OUNoise(mean=np.zeros(1))

    self.actor = Actor(self.s_dim, self.a_dim).model()
    self.target_actor = Actor(self.s_dim, self.a_dim).model()
    self.actor_opt = tf.keras.optimizers.Adam(learning_rate=self.actor_lr)
    self.target_actor.set_weights(self.actor.get_weights())

    self.critic = Critic(self.s_dim, self.a_dim).model()
    self.critic_opt = tf.keras.optimizers.Adam(learning_rate=self.critic_lr)
    self.target_critic = Critic(self.s_dim, self.a_dim).model()
    self.target_critic.set_weights(self.critic.get_weights())

    self.replay_buffer = ReplayBuffer(self.buffer_size)
def __init__(self, env, state_dim, action_dim):
    self.name = 'DDPG'
    self.env = env
    self.state_dim = state_dim
    self.action_dim = action_dim

    self.AE = Actor(state_dim, action_dim).cuda()
    self.CE = Critic(state_dim, action_dim).cuda()
    self.AT = Actor(state_dim, action_dim).cuda()
    self.CT = Critic(state_dim, action_dim).cuda()

    self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)
    self.time_step = 0

    self.AE.load_state_dict(torch.load(MODEL_DIR + '/obs/actor_340000.pkl'))
    # self.AT.load_state_dict(torch.load(MODEL_DIR+'/actor_280000.pkl'))
    # self.CE.load_state_dict(torch.load(MODEL_DIR+'/critic_280000.pkl'))
    # self.CT.load_state_dict(torch.load(MODEL_DIR+'/critic_280000.pkl'))

    self.optimizer_a = torch.optim.Adam(self.AE.parameters(), lr=1e-4)
    self.optimizer_c = torch.optim.Adam(self.CE.parameters(), lr=1e-4)
def __init__(self, env, batch_size, mem_size, discount, actor_params, critic_params):
    self._batch_size = batch_size
    self._mem_size = mem_size
    self._discount = discount

    self._sess = tensorflow.Session()
    k_backend.set_session(self._sess)

    self._env = env
    self._state_dim = env.observation_space.shape[0]
    self._action_dim = env.action_space.shape[0]
    self._action_min = env.action_space.low
    self._action_max = env.action_space.high
    self._state_min = env.observation_space.low
    self._state_max = env.observation_space.high

    self._actor = Actor(self._sess, self._state_dim, self._action_dim,
                        self._action_min, self._action_max, actor_params)
    self._critic = Critic(self._sess, 0.5, self._state_dim, self._action_dim, critic_params)

    self._memory = ReplayBuffer(mem_size)
def __init__(self, state_size, action_size, random_seed):
    """Initialize the model with arguments as follows:

    ARGUMENTS
    =========
    - state_size (int) = dimension of input space
    - action_size (int) = dimension of action space
    - random_seed (int) = random seed

    Returns
    =======
    - best learned action to take after Actor-Critic Learning
    """
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(random_seed)

    # create noise
    self.noise = OUNoise(action_size, random_seed)
    self.noise_decay = NOISE_DECAY

    # create memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed, device)

    # Actor Networks (local online net + target net)
    self.actor_local = Actor(state_size, action_size, random_seed).to(device)
    self.actor_target = Actor(state_size, action_size, random_seed).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

    # Critic Networks (local online net + target net)
    self.critic_local = Critic(state_size, action_size, random_seed).to(device)
    self.critic_target = Critic(state_size, action_size, random_seed).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

    # instantiate online and target networks with same weights
    self.soft_update(self.actor_local, self.actor_target, 1)
    self.soft_update(self.critic_local, self.critic_target, 1)

    self.learn_counter = 0
def teleport(defaults):
    collector = Collector(**defaults)
    env = Game(**defaults)
    mover = Mover(env, _extra_dim=1, **defaults)
    teleporter = Teleporter(env, **defaults)
    buffer = ReplayBuffer(**defaults)
    with Save(env, collector, mover, teleporter, **defaults) as save:
        intervention_idx, modified_board = teleporter.pre_process(env)
        for frame in loop(env, collector, save, teleporter):
            modified_board = teleporter.interveen(env.board, intervention_idx, modified_board)
            actions = mover(modified_board)
            observations, rewards, dones, info = env.step(actions)
            modified_board, modified_rewards, modified_dones, teleport_rewards, intervention_idx = teleporter.modify(observations, rewards, dones, info)
            buffer.teleporter_save_data(teleporter.boards, observations, teleporter.interventions, teleport_rewards, dones, intervention_idx)
            mover.learn(modified_board, actions, modified_rewards, modified_dones)
            board_before, board_after, intervention, tele_rewards, tele_dones = buffer.sample_data()
            teleporter.learn(board_after, intervention, tele_rewards, tele_dones, board_before)
            collector.collect([rewards, modified_rewards, teleport_rewards], [dones, modified_dones])
def __init__(self, action_shape, model_structure, agent_hyperparams, dueling, double):
    self.device = torch.device(agent_hyperparams["device"])
    self.action_shape = action_shape
    self.dueling = dueling
    self.double = double

    if self.dueling:
        prime = model_structure[0]
        value = model_structure[1]
        advantage = model_structure[2]
        self.local_model = DuelingQ(prime, value, advantage).to(self.device)
        self.target_model = DuelingQ(prime, value, advantage).to(self.device)
        self.target_model.load_state_dict(self.local_model.state_dict())
    else:
        self.local_model = Model(model_structure).to(self.device)
        self.target_model = Model(model_structure).to(self.device)
        self.target_model.load_state_dict(self.local_model.state_dict())

    self.optimizer = optim.RMSprop(self.local_model.parameters(), lr=agent_hyperparams['lr'])

    self.replay_buffer = ReplayBuffer(agent_hyperparams['memory_size'],
                                      agent_hyperparams['batch_size'],
                                      agent_hyperparams['greedy_coeff'],
                                      agent_hyperparams['default_priority'],
                                      agent_hyperparams['shed_amount'])

    self.eps = agent_hyperparams['eps']
    self.alpha = agent_hyperparams['alpha']
    self.gamma = agent_hyperparams['gamma']
    self.beta = agent_hyperparams['beta']
    self.eps_decay = agent_hyperparams['eps_decay']
    self.alpha_decay = agent_hyperparams['alpha_decay']
    self.gamma_decay = agent_hyperparams['gamma_decay']
    self.beta_decay = agent_hyperparams['beta_decay']
    self.min_eps = agent_hyperparams['min_eps']
    self.min_alpha = agent_hyperparams['min_alpha']
    self.min_gamma = agent_hyperparams['min_gamma']
    self.min_beta = agent_hyperparams['min_beta']
class Agent:
    def __init__(self, input_dim, output_dim, tau=0.001, gamma=0.99, train_batch_size=640):
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.tau = tau
        self.gamma = gamma
        self.train_batch_size = train_batch_size

        self.main_critic = Critic(input_dim, output_dim, tau, gamma)
        self.target_critic = Critic(input_dim, output_dim, tau, gamma)
        self.main_actor = Actor(input_dim, output_dim, tau, gamma)
        self.target_actor = Actor(input_dim, output_dim, tau, gamma)

        self.target_critic.model.set_weights(self.main_critic.model.get_weights())
        self.target_actor.model.set_weights(self.main_actor.model.get_weights())

        self.memory = ReplayBuffer(batch_size=train_batch_size)

    def get_action(self, state):
        return self.main_actor.get_action(state)

    def train(self):
        data = self.memory.sample()
        states = np.vstack([e.state for e in data if e is not None])
        actions = np.array([e.action for e in data if e is not None]).astype(np.float32).reshape(-1, self.output_dim)
        rewards = np.array([e.reward for e in data if e is not None]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in data if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in data if e is not None])

        actions_next = self.target_actor.model.predict_on_batch(next_states)
        Q_targets_next = self.target_critic.model.predict_on_batch([next_states, actions_next])
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)

        self.main_critic.train(states, actions, Q_targets)

        action_gradients = np.reshape(self.main_critic.get_gradient(states, actions), (-1, self.output_dim))
        self.main_actor.train(states, action_gradients)

        self.target_actor.model = self.main_actor.soft_update(self.target_actor.model)
        self.target_critic.model = self.main_critic.soft_update(self.target_critic.model)