def __init__(self, input_dims, env, n_actions):
    self.memory = ReplayBuffer(input_dims)
    self.n_actions = n_actions
    self.actor_nn = ActorNetwork(input_dims, n_actions=n_actions,
                                 name=Constants.env_id + '_actor',
                                 max_action=env.action_space.n)
    self.critic_local_1_nn = CriticNetwork(input_dims, n_actions=n_actions,
                                           name=Constants.env_id + '_critic_local_1')
    self.critic_local_2_nn = CriticNetwork(input_dims, n_actions=n_actions,
                                           name=Constants.env_id + '_critic_local_2')
    self.critic_target_1_nn = CriticNetwork(input_dims, n_actions=n_actions,
                                            name=Constants.env_id + '_critic_target_1')
    self.critic_target_2_nn = CriticNetwork(input_dims, n_actions=n_actions,
                                            name=Constants.env_id + '_critic_target_2')
    self.value_nn = ValueNetwork(input_dims, name=Constants.env_id + '_value')
    self.target_value_nn = ValueNetwork(input_dims, name=Constants.env_id + '_target_value')
    self.update_network_parameters(tau=1)

def __init__(self, alpha, beta, input_dims, tau, env, env_id, gamma=0.99,
             n_actions=2, max_size=1000000, layer1_size=256, layer2_size=256,
             batch_size=100, reward_scale=2):
    self.gamma = gamma
    self.tau = tau
    self.memory = ReplayBuffer(max_size, input_dims, n_actions)
    self.batch_size = batch_size
    self.n_actions = n_actions
    self.actor = ActorNetwork(alpha, input_dims, layer1_size, layer2_size,
                              n_actions=n_actions, name=env_id + '_actor',
                              max_action=env.action_space.high)
    self.critic_1 = CriticNetwork(beta, input_dims, layer1_size, layer2_size,
                                  n_actions=n_actions, name=env_id + '_critic_1')
    self.critic_2 = CriticNetwork(beta, input_dims, layer1_size, layer2_size,
                                  n_actions=n_actions, name=env_id + '_critic_2')
    self.value = ValueNetwork(beta, input_dims, layer1_size, layer2_size,
                              name=env_id + '_value')
    self.target_value = ValueNetwork(beta, input_dims, layer1_size, layer2_size,
                                     name=env_id + '_target_value')
    self.scale = reward_scale
    self.update_network_parameters(tau=1)

def __init__(self, config: Config):
    self.config = config
    self.is_training = True

    if self.config.prioritized_replay:
        self.buffer = PrioritizedReplayBuffer(
            self.config.max_buff,
            alpha=self.config.prioritized_replay_alpha)
        if self.config.prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = self.config.frames
        else:
            prioritized_replay_beta_iters = self.config.prioritized_replay_beta_iters
        self.beta_schedule = LinearSchedule(
            prioritized_replay_beta_iters,
            initial_p=self.config.prioritized_replay_beta0,
            final_p=1.0)
    else:
        self.buffer = ReplayBuffer(self.config.max_buff)
        self.beta_schedule = None

    self.model = CnnDQN(self.config.state_shape, self.config.action_dim)
    self.target_model = CnnDQN(self.config.state_shape, self.config.action_dim)
    self.target_model.load_state_dict(self.model.state_dict())
    self.model_optim = Adam(self.model.parameters(), lr=self.config.learning_rate)

    if self.config.use_cuda:
        self.cuda()

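# NOTE: LinearSchedule (used above to anneal the prioritized-replay beta toward 1.0)
# is not defined in these snippets. A minimal sketch assuming the common
# baselines-style interface, not this project's own implementation:
class LinearSchedule:
    """Linearly interpolate from initial_p to final_p over schedule_timesteps, then hold final_p."""

    def __init__(self, schedule_timesteps, initial_p=1.0, final_p=1.0):
        self.schedule_timesteps = schedule_timesteps
        self.initial_p = initial_p
        self.final_p = final_p

    def value(self, t):
        # Fraction of the schedule completed so far, capped at 1.0.
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)
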
def __init__(self, num_agents, state_size, action_size, hidden_layers, seed,
             gamma=GAMMA, tau=TAU, lr_actor=LR_ACTOR, lr_critic=LR_CRITIC,
             weight_decay=WEIGHT_DECAY, buffer_size=BUFFER_SIZE, batch_size=BATCH_SIZE):
    """Initialize MADDPG agent."""
    super(MADDPG, self).__init__()
    self.seed = random.seed(seed)
    self.num_agents = num_agents
    self.state_size = state_size
    self.action_size = action_size
    self.gamma = gamma
    self.tau = tau
    self.lr_actor = lr_actor
    self.lr_critic = lr_critic
    self.weight_decay = weight_decay
    self.buffer_size = buffer_size
    self.batch_size = batch_size
    self.agents = [DDPGAgent(state_size, action_size, hidden_layers, gamma,
                             tau, lr_actor, lr_critic, weight_decay, seed)
                   for _ in range(num_agents)]
    self.replay_buffer = ReplayBuffer(num_agents, buffer_size, batch_size)

def __init__(self, net, o_dim, a_dim, lr=1e-3, batch_size=16, algorithm="ddqn",
             gamma=0.99, tau=1e-3, buffer_size=int(1e6)):
    """
    o_dim: observation space dim (or # of channels)
    a_dim: action space dimension
    """
    self.o_dim = o_dim
    self.a_dim = a_dim
    self.lr = lr
    self.batch_size = batch_size
    self.gamma = gamma
    self.tau = tau
    self.buffer_size = buffer_size

    if algorithm.lower() in ("dqn",):
        self.algorithm = "dqn"
    elif algorithm.lower() in ("ddqn", "double dqn", "doubledqn"):
        self.algorithm = "ddqn"
    else:
        raise TypeError("cannot recognize algorithm")

    # assumed: the original snippet uses self.device without defining it here
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    self.buffer = ReplayBuffer(buffer_size, batch_size)
    self.online_net = net(o_dim, a_dim).to(self.device)
    self.target_net = net(o_dim, a_dim).to(self.device)
    self.optimizer = optim.Adam(self.online_net.parameters(), lr=lr)

def __init__(self, state_size, action_size, random_seed):
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random_seed

    # ------------------ actor ------------------ #
    self.actor_local = Actor(state_size, action_size, random_seed).to(device)
    self.actor_target = Actor(state_size, action_size, random_seed).to(device)

    # ------------------ critic ----------------- #
    self.critic_local = Critic(state_size, action_size, random_seed).to(device)
    self.critic_target = Critic(state_size, action_size, random_seed).to(device)

    # ------------------ optimizers ------------- #
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC)

    # ------------------ initialize target networks ------------------ #
    self.soft_update(self.critic_local, self.critic_target, 1)
    self.soft_update(self.actor_local, self.actor_target, 1)

    self.t_step = 0

    # Noise process
    self.noise = OUNoise(action_size, random_seed)

    # Replay Buffer
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, device, random_seed)

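# NOTE: soft_update is called above (with tau=1) but is not part of these snippets.
# A minimal sketch of the usual Polyak-averaging helper, given here as an assumption:
import torch.nn as nn

def soft_update(local_model: nn.Module, target_model: nn.Module, tau: float) -> None:
    """theta_target <- tau * theta_local + (1 - tau) * theta_target.

    With tau=1 (as in the __init__ above) this hard-copies the online weights
    into the target network.
    """
    for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
        target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
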
def __init__(self, env_id, action_space, action_bound):
    self.env_id = env_id
    self.action_space = action_space
    self.action_bound = action_bound
    self.env = gym.make(self.env_id)
    self.replay_buffer = ReplayBuffer(max_len=self.MAX_EXPERIENCES)
    self.policy = GaussianPolicy(action_space=self.action_space,
                                 action_bound=self.action_bound)
    self.dualqnet = DualQNetwork()
    self.target_dualqnet = DualQNetwork()
    self.log_alpha = tf.Variable(0.)  #: log_alpha = 0, i.e. alpha = exp(log_alpha) = 1
    self.alpha_optimizer = tf.keras.optimizers.Adam(3e-4)
    self.target_entropy = -0.5 * self.action_space
    self.global_steps = 0
    self._initialize_weights()

def __init__(self, alpha=0.0003, beta=0.0003, input_dims=[8], env=None,
             gamma=0.99, n_actions=2, max_size=1000000, tau=0.005,
             ent_alpha=0.0001, batch_size=256, reward_scale=2,
             layer1_size=256, layer2_size=256, chkpt_dir='tmp/sac'):
    self.gamma = gamma
    self.tau = tau
    self.memory = ReplayBuffer(max_size, input_dims, n_actions)
    self.batch_size = batch_size
    self.n_actions = n_actions
    self.ent_alpha = ent_alpha
    self.reward_scale = reward_scale

    self.actor = ActorNetwork(alpha, input_dims, n_actions=n_actions,
                              fc1_dims=layer1_size, fc2_dims=layer2_size,
                              name='actor', chkpt_dir=chkpt_dir)
    self.critic_1 = CriticNetwork(beta, input_dims, n_actions=n_actions,
                                  fc1_dims=layer1_size, fc2_dims=layer2_size,
                                  name='critic_1', chkpt_dir=chkpt_dir)
    self.critic_2 = CriticNetwork(beta, input_dims, n_actions=n_actions,
                                  fc1_dims=layer1_size, fc2_dims=layer2_size,
                                  name='critic_2', chkpt_dir=chkpt_dir)
    self.target_critic_1 = CriticNetwork(beta, input_dims, n_actions=n_actions,
                                         fc1_dims=layer1_size, fc2_dims=layer2_size,
                                         name='target_critic_1', chkpt_dir=chkpt_dir)
    self.target_critic_2 = CriticNetwork(beta, input_dims, n_actions=n_actions,
                                         fc1_dims=layer1_size, fc2_dims=layer2_size,
                                         name='target_critic_2', chkpt_dir=chkpt_dir)

    self.update_network_parameters(tau=1)

def __init__(self, state_shape, action_size, seed, cnn=False):
    """Initialize an Agent object.

    Params
    ======
        state_shape (tuple): shape of each state
        action_size (int): dimension of each action
        seed (int): random seed
        cnn (bool): whether to use a convolutional NN
    """
    self.state_shape = state_shape
    self.action_size = action_size
    self.seed = random.seed(seed)
    self.cnn = cnn

    if cnn:
        self.qnetwork_local = QNetworkFullyConvolutional(
            state_shape, action_size, seed).to(device)
        self.qnetwork_target = QNetworkFullyConvolutional(
            state_shape, action_size, seed).to(device)
    else:
        self.qnetwork_local = QNetworkFullyConnected(
            state_shape, action_size, seed).to(device)
        self.qnetwork_target = QNetworkFullyConnected(
            state_shape, action_size, seed).to(device)

    self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

    # Replay memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
    # Initialize time step (for updating every UPDATE_EVERY steps)
    self.t_step = 0

def __init__(self, env, model, target_model, config, name_agent="dqn"):
    self.name_agent = name_agent
    self.dim_space = env.observation_space.shape[0]
    self.nb_actions = env.action_space.n
    self.epsilon = config.epsilon_start
    self.epsilon_final = config.epsilon_final
    self.epsilon_start = config.epsilon_start
    self.epsilon_decay = config.epsilon_decay
    self.gamma = config.gamma
    self.replay_buffer = ReplayBuffer(10000, config.batch_size)
    self.environment = env
    self.batch_size = config.batch_size
    self.update_nb_iter = config.update_nb_iter
    # q0: online Q-network
    self.model = model
    # q0_barre ("q0 bar"): target Q-network
    self.target_model = target_model
    self.optimizer = optim.Adam(self.model.parameters(), lr=config.learning_rate)
    # self.loss_data = []
    self.rewards = []

def __init__(self, config: Config):
    self.config = config
    self.is_training = True
    # self.buffer = deque(maxlen=self.config.max_buff)
    self.buffer = ReplayBuffer(self.config.max_buff)

    self.actor = Actor(self.config.state_dim, self.config.action_dim, self.config.max_action)
    self.actor_target = Actor(self.config.state_dim, self.config.action_dim, self.config.max_action)
    self.actor_target.load_state_dict(self.actor.state_dict())
    self.actor_optimizer = Adam(self.actor.parameters(), lr=self.config.learning_rate)

    self.critic_1 = Critic(self.config.state_dim, self.config.action_dim)
    self.critic_1_target = Critic(self.config.state_dim, self.config.action_dim)
    self.critic_1_target.load_state_dict(self.critic_1.state_dict())
    self.critic_1_optimizer = Adam(self.critic_1.parameters(), lr=self.config.learning_rate)

    self.critic_2 = Critic(self.config.state_dim, self.config.action_dim)
    self.critic_2_target = Critic(self.config.state_dim, self.config.action_dim)
    self.critic_2_target.load_state_dict(self.critic_2.state_dict())
    self.critic_2_optimizer = Adam(self.critic_2.parameters(), lr=self.config.learning_rate)

    self.MseLoss = nn.MSELoss()

    if self.config.use_cuda:
        self.cuda()

def __init__(self, n_agents, act_spcs, ob_spcs, writer, args):
    self.args = args
    self.memory = ReplayBuffer(args.buffer_length, n_agents, device)
    # self.memory = ReplayMemory(args.buffer_length, n_agents, device)
    self.use_maddpg = args.algo == "maddpg"
    self.use_sac = args.use_sac
    self.use_td3 = args.use_td3
    self.use_single_q = args.single_q
    self.all_obs = args.all_obs
    self.n_agents = n_agents
    self.act_spcs = act_spcs
    self.ob_spcs = ob_spcs

    qnet_actspcs = [np.sum(self.act_spcs) if self.use_maddpg else self.act_spcs[i]
                    for i in range(n_agents)]
    qnet_obspcs = [np.sum(self.ob_spcs) if self.use_maddpg else self.ob_spcs[i]
                   for i in range(n_agents)]

    if self.use_sac and not self.use_td3:
        self.agents = [SAC_agent(self.act_spcs[i],
                                 qnet_obspcs[i] if self.all_obs else self.ob_spcs[i],
                                 qnet_obspcs[i], qnet_actspcs[i])
                       for i in range(n_agents)]
    elif self.use_td3:
        self.agents = [TD3_agent(self.act_spcs[i],
                                 qnet_obspcs[i] if self.all_obs else self.ob_spcs[i],
                                 qnet_obspcs[i], qnet_actspcs[i])
                       for i in range(n_agents)]
    else:
        self.agents = [DDPG_agent(self.act_spcs[i],
                                  qnet_obspcs[i] if self.all_obs else self.ob_spcs[i],
                                  qnet_obspcs[i], qnet_actspcs[i])
                       for i in range(n_agents)]

    self.n_steps = 0
    self.n_updates = 0
    self.writer = writer
    self.criterion = nn.MSELoss()
    self.sac_alpha = args.sac_alpha
    self.agent_actions = [[] for i in range(self.n_agents)]

def __init__(self, alpha=0.0003, beta=0.0003, input_dims=[8], env=None,
             gamma=0.99, n_actions=2, max_size=1000000, tau=0.005,
             layer1_size=256, layer2_size=256, batch_size=256, reward_scale=2):
    self.gamma = gamma
    self.tau = tau
    self.memory = ReplayBuffer(max_size, input_dims, n_actions)
    self.batch_size = batch_size
    self.n_actions = n_actions

    self.actor = ActorNetwork(n_actions=n_actions, name='actor',
                              max_action=env.action_space.high)
    self.critic_1 = CriticNetwork(n_actions=n_actions, name='critic_1')
    self.critic_2 = CriticNetwork(n_actions=n_actions, name='critic_2')
    self.value = ValueNetwork(name='value')
    self.target_value = ValueNetwork(name='target_value')

    self.actor.compile(optimizer=Adam(learning_rate=alpha))
    self.critic_1.compile(optimizer=Adam(learning_rate=beta))
    self.critic_2.compile(optimizer=Adam(learning_rate=beta))
    self.value.compile(optimizer=Adam(learning_rate=beta))
    self.target_value.compile(optimizer=Adam(learning_rate=beta))

    self.scale = reward_scale
    self.update_network_parameters(tau=1)

def __init__(self, num_agents, x_dim, o_dim, a_dim, lr_actor=1e-3, lr_critic=1e-3,
             batch_size=16, gamma=0.99, tau=0.001, buffer_size=int(1e5), seed=1234):
    self.num_agents = num_agents
    self.x_dim = x_dim
    self.o_dim = o_dim
    self.a_dim = a_dim
    self.lr_actor = lr_actor
    self.lr_critic = lr_critic
    self.batch_size = batch_size
    self.gamma = gamma
    self.tau = tau
    self.buffer_size = buffer_size
    self.seed = seed
    self.buffer = ReplayBuffer(buffer_size, batch_size, seed)
    self.agents = [DDPGAgent(num_agents, id, x_dim, o_dim, a_dim,
                             lr_actor, lr_critic, gamma, seed)
                   for id in range(num_agents)]

def __init__(self, input_dims, alpha=0.001, beta=0.002, env=None, gamma=0.99,
             n_actions=2, max_size=1000000, tau=0.005, fc1=400, fc2=300,
             batch_size=64, noise=0.1):
    self.gamma = gamma
    self.tau = tau
    self.memory = ReplayBuffer(max_size, input_dims, n_actions)
    self.batch_size = batch_size
    self.n_actions = n_actions
    self.noise = noise
    self.max_action = env.action_space.high[0]
    self.min_action = env.action_space.low[0]

    self.actor = ActorNetwork(n_actions=n_actions, name='actor')
    self.critic = CriticNetwork(name='critic')
    self.target_actor = ActorNetwork(n_actions=n_actions, name='target_actor')
    self.target_critic = CriticNetwork(name='target_critic')

    self.actor.compile(optimizer=Adam(learning_rate=alpha))
    self.critic.compile(optimizer=Adam(learning_rate=beta))
    self.target_actor.compile(optimizer=Adam(learning_rate=alpha))
    self.target_critic.compile(optimizer=Adam(learning_rate=beta))

    self.update_network_parameters(tau=1)

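# NOTE: the Keras-based agents above call self.update_network_parameters(tau=1) to
# hard-copy online weights into their target networks. The method itself is not in
# these snippets; the function below is a sketch of the usual weight blending,
# an assumption rather than the original code.
import tensorflow as tf

def update_network_parameters(online_net: tf.keras.Model, target_net: tf.keras.Model, tau: float) -> None:
    """target <- tau * online + (1 - tau) * target, variable by variable (tau=1 is a hard copy)."""
    for target_var, online_var in zip(target_net.weights, online_net.weights):
        target_var.assign(tau * online_var + (1.0 - tau) * target_var)

# Hypothetical usage: update_network_parameters(self.actor, self.target_actor, tau=1)
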
def __init__(self, alpha=3e-4, beta=3e-4, input_dims=[8], env=None, gamma=0.99,
             n_actions=2, max_size=1000000, tau=5e-3, fc1_dim=256, fc2_dim=256,
             batch_size=256, reward_scale=2):
    self.gamma = gamma
    self.tau = tau
    self.memory = ReplayBuffer(max_size, input_dims, n_actions)
    self.batch_size = batch_size
    self.n_actions = n_actions

    self.actor = ActorNetwork(alpha, input_dims, n_actions, env.action_space.high)
    self.critic1 = CriticNetwork(beta, input_dims, n_actions, name='critic1')
    self.critic2 = CriticNetwork(beta, input_dims, n_actions, name='critic2')
    self.value = ValueNetwork(beta, input_dims, name='value')
    self.target_value = ValueNetwork(beta, input_dims, name='target_value')

    self.scale = reward_scale
    self.update_network_parameters(tau=1)

def __init__(self,
             actor,
             critic,
             reward_fun,
             gamma=0.99,
             tau=0.005,
             # policy_noise=0.2, noise_clip=0.5,
             policy_freq=2,
             max_buffer_size=1e6,
             batch_size=64,
             lr=3e-4):
    self._actor = actor
    self._actor_target = copy.deepcopy(self._actor)
    self._actor_optimizer = torch.optim.Adam(self._actor.parameters(), lr=lr)

    self._critic = critic
    self._critic_target = copy.deepcopy(self._critic)
    self._critic_loss = nn.MSELoss()
    self._critic_optimizer = torch.optim.Adam(self._critic.parameters(), lr=lr)

    self.reward_fun = reward_fun
    self._gamma = gamma
    self._tau = tau
    self._policy_freq = policy_freq
    self._rbuffer_max_size = max_buffer_size
    self._replay_buffer = ReplayBuffer(self._rbuffer_max_size)
    self._batch_size = batch_size
    self._steps = 0
    self._run = 0

def __init__(self, args):
    """Initialize the agent.

    Args:
        args: object holding the hyperparameters listed below.
    """
    self.state_size = args.state_size
    self.action_size = args.action_size
    self.bs = args.bs
    self.gamma = args.gamma
    self.epsilon = args.epsilon
    self.tau = args.tau
    self.discrete = args.discrete
    self.randomer = OUNoise(args.action_size)
    self.buffer = ReplayBuffer(args.max_buff)

    self.actor = Actor(self.state_size, self.action_size)
    self.actor_target = Actor(self.state_size, self.action_size)
    self.actor_opt = AdamW(self.actor.parameters(), args.lr_actor)

    self.critic = Critic(self.state_size, self.action_size)
    self.critic_target = Critic(self.state_size, self.action_size)
    self.critic_opt = AdamW(self.critic.parameters(), args.lr_critic)

    hard_update(self.actor_target, self.actor)
    hard_update(self.critic_target, self.critic)

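# NOTE: OUNoise is used above (and in the DDPG snippet earlier) but not defined in
# these snippets. A minimal sketch of standard Ornstein-Uhlenbeck exploration noise;
# the constructor signature is an assumption and varies between snippets (some also
# pass a seed):
import numpy as np

class OUNoise:
    """Temporally correlated exploration noise for continuous actions."""

    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        # Restart the process at the mean.
        self.state = self.mu.copy()

    def sample(self):
        # dx = theta * (mu - x) + sigma * N(0, 1)
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.standard_normal(len(self.state))
        self.state = self.state + dx
        return self.state
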
def __init__(self, state_size=37, action_size=4, gamma=0.99, lr=0.001, update_every=5):
    """Initializes the model.
    ----
    @param:
    1. state_size: number of input states.
    2. action_size: number of actions.
    3. gamma: discounted return rate.
    4. lr: learning rate for the model.
    5. update_every: update target_model every X time steps.
    """
    self.state_size = state_size
    self.action_size = action_size
    self.gamma = gamma  # discounted return rate

    # Q-networks: two DQNs (double Q-learning via a fixed Q-target network)
    self.qnetwork_local = DQNetwork()
    self.qnetwork_target = DQNetwork()

    # define the optimizer
    self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lr)

    # replay memory
    self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE)
    self.update_every = update_every
    self.target_update_counter = 0

class MADDPGAgent():
    def __init__(self, seed, checkpoint_filename=None):
        self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, DEVICE, seed)
        self.t = 0
        self.agents = [
            DDPGAgent(index, NUM_AGENTS, seed, DEVICE)
            for index in range(NUM_AGENTS)
        ]
        if checkpoint_filename:
            for i, to_load in enumerate(self.agents):
                actor_file = torch.load(
                    f"{os.getcwd()}/models/{checkpoint_filename}_actor_{i}.weights",
                    map_location=DEVICE)
                critic_file = torch.load(
                    f"{os.getcwd()}/models/{checkpoint_filename}_critic_{i}.weights",
                    map_location=DEVICE)
                to_load.actor_local.load_state_dict(actor_file)
                to_load.actor_target.load_state_dict(actor_file)
                to_load.critic_local.load_state_dict(critic_file)
                to_load.critic_target.load_state_dict(critic_file)
            print(f'Files loaded with prefix {checkpoint_filename}')

    def step(self, all_states, all_actions, all_rewards, all_next_states, all_dones):
        all_states = all_states.reshape(1, -1)
        all_next_states = all_next_states.reshape(1, -1)
        self.memory.add(all_states, all_actions, all_rewards, all_next_states, all_dones)
        self.t = (self.t + 1) % UPDATE_FREQUENCY
        if self.t == 0 and (len(self.memory) > BATCH_SIZE):
            experiences = [self.memory.sample() for _ in range(NUM_AGENTS)]
            self.learn(experiences, GAMMA)

    def act(self, all_states, random):
        all_actions = []
        for agent, state in zip(self.agents, all_states):
            action = agent.act(state, random=random)
            all_actions.append(action)
        return np.array(all_actions).reshape(1, -1)

    def learn(self, experiences, gamma):
        all_actions = []
        all_next_actions = []
        for i, agent in enumerate(self.agents):
            states, _, _, next_states, _ = experiences[i]
            agent_id = torch.tensor([i]).to(DEVICE)
            state = states.reshape(-1, 2, 24).index_select(1, agent_id).squeeze(1)
            next_state = next_states.reshape(-1, 2, 24).index_select(1, agent_id).squeeze(1)
            all_actions.append(agent.actor_local(state))
            all_next_actions.append(agent.actor_target(next_state))
        for i, agent in enumerate(self.agents):
            agent.learn(i, experiences[i], gamma, all_next_actions, all_actions)

def __init__(self, state_size, action_size, num_agents, random_seed):
    self.agents = [
        DDPGAgent(state_size, action_size, random_seed)
        for _ in range(num_agents)
    ]
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, device, random_seed)
    self.t_step = 0

def __init__(self, state_size, action_size, num_agents):
    self.state_size = state_size
    self.action_size = action_size
    self.num_agents = num_agents
    self.whole_action_dim = self.action_size * self.num_agents
    self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE)  # Replay memory
    self.maddpg_agents = [DDPG(state_size, action_size, num_agents),
                          DDPG(state_size, action_size, num_agents)]  # create agents
    self.episodes_before_training = EPISODES_BEFORE_TRAINING

def __init__(self, state_size, action_size, num_agents=2, eps_before_train=500,
             gamma=0.99, batch_size=128, buffer_size=int(1e5), lr_actor=1e-4,
             lr_critic=1e-3, weight_decay=0, tau=1e-3, noise_weight=1.0,
             noise_decay=0.999998, noise_min=1e-3, seed=0, device="cuda:0"):
    # (self, state_size, action_size, num_agents=2, random_seed=1, lr_actor=2e-4,
    #  lr_critic=1e-3, weight_decay=0, tau=2e-3, device=device)
    torch.manual_seed(seed)
    np.random.seed(seed)
    self.state_size = state_size
    self.action_size = action_size
    self.num_agents = num_agents
    self.action_dim = action_size * num_agents
    self.eps_before_train = eps_before_train
    self.gamma = gamma
    self.batch_size = batch_size
    self.buffer_size = buffer_size
    self.lr_actor = lr_actor
    self.lr_critic = lr_critic
    self.weight_decay = weight_decay
    self.tau = tau
    self.noise_weight = noise_weight
    self.noise_decay = noise_decay
    self.noise_min = noise_min
    self.device = device
    self.i_episode = 0

    self.agents = [
        DDPG(self.state_size, self.action_size, self.num_agents,
             random_seed=2 * i * seed, lr_actor=self.lr_actor,
             lr_critic=self.lr_critic, weight_decay=self.weight_decay,
             tau=self.tau, device=self.device)
        for i in range(self.num_agents)
    ]
    self.memory = ReplayBuffer(self.action_size, self.buffer_size, seed)

def __init__(self, env_name, num_quantiles=32, fqf_factor=0.000001*0.1,
             ent_coef=0.001, state_embedding_dim=3136, quantile_embedding_dim=64,
             gamma=0.99, n_frames=4, batch_size=32, buffer_size=1000000,
             update_period=8, target_update_period=10000):
    self.env_name = env_name
    self.num_quantiles = num_quantiles
    self.state_embedding_dim = state_embedding_dim
    self.quantile_embedding_dim = quantile_embedding_dim
    self.k = 1.0
    self.ent_coef = ent_coef
    self.n_frames = n_frames
    self.action_space = gym.make(self.env_name).action_space.n

    self.fqf_network = FQFNetwork(
        action_space=self.action_space,
        num_quantiles=self.num_quantiles,
        state_embedding_dim=self.state_embedding_dim,
        quantile_embedding_dim=self.quantile_embedding_dim)
    self.target_fqf_network = FQFNetwork(
        action_space=self.action_space,
        num_quantiles=self.num_quantiles,
        state_embedding_dim=self.state_embedding_dim,
        quantile_embedding_dim=self.quantile_embedding_dim)
    self._define_network()

    self.optimizer = tf.keras.optimizers.Adam(
        learning_rate=0.00015, epsilon=0.01/32)
    #: fpl = fraction proposal layer
    self.optimizer_fpl = tf.keras.optimizers.Adam(
        learning_rate=0.00005 * fqf_factor, epsilon=0.0003125)

    self.gamma = gamma
    self.replay_buffer = ReplayBuffer(max_len=buffer_size)
    self.batch_size = batch_size
    self.update_period = update_period
    self.target_update_period = target_update_period
    self.steps = 0

def reset_parameters(self):
    self._q.reset_parameters()
    self._q_target.reset_parameters()
    hard_update(self._q_target, self._q)
    self._pi.reset_parameters()
    if self._use_rbuffer:
        self._replay_buffer = ReplayBuffer(self._rbuffer_max_size)

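# NOTE: hard_update is used here and in the DDPG snippet above but is not defined in
# these snippets. A minimal sketch, assuming the conventional PyTorch helper:
import torch.nn as nn

def hard_update(target: nn.Module, source: nn.Module) -> None:
    """Copy every parameter of `source` into `target` (the tau=1 case of a soft update)."""
    for target_param, source_param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(source_param.data)
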
def __init__(
        self,
        policy,
        env,
        gamma,
        learning_rate,
        buffer_size,
        exploration_type,
        exploration_frac,
        exploration_ep,
        exploration_initial_eps,
        exploration_final_eps,
        double_q,
        policy_kwargs,
        seed,
        intent):
    super(TabularRLModel, self).__init__(
        policy=policy, env=env, policy_kwargs=policy_kwargs, seed=seed)

    self.gamma = gamma
    self.learning_rate = learning_rate
    self.buffer_size = buffer_size
    self.exploration_type = exploration_type
    self.exploration_frac = exploration_frac
    self.exploration_ep = exploration_ep
    self.exploration_initial_eps = exploration_initial_eps
    self.exploration_final_eps = exploration_final_eps
    self.double_q = double_q
    self.intent = intent

    # self.policy_kwargs = {} if policy_kwargs is None else policy_kwargs
    # self.policy = policy(self.observation_space, self.action_space, intent=True)
    self.policy_kwargs = get_default_args(self.policy)
    self.policy_kwargs['ob_space'] = self.observation_space
    self.policy_kwargs['ac_space'] = self.action_space
    self.policy_kwargs['intent'] = self.intent
    if policy_kwargs is not None:
        for key, val in policy_kwargs.items():
            self.policy_kwargs[key] = val
    # self.policy_kwargs['transform_func'] = transform_func

    # if policy_kwargs is None:
    #     self.policy = policy(self.observation_space, self.action_space,
    #                          intent=True, device=self.device)
    # else:
    self.policy = policy(**self.policy_kwargs)

    if self.buffer_size is None:
        self.replay_buffer = None
    else:
        self.replay_buffer = ReplayBuffer(self.buffer_size)

def __init__(self, load_from=None, will_train=True):
    self.env = TorcsEnv(
        path='/usr/local/share/games/torcs/config/raceman/quickrace.xml')
    self.args = SAC_args()
    self.buffer = ReplayBuffer(self.args.buffer_size)

    action_dim = self.env.action_space.shape[0]
    state_dim = self.env.observation_space.shape[0]
    hidden_dim = 256
    self.action_size = action_dim
    self.state_size = state_dim

    self.value_net = ValueNetwork(state_dim, hidden_dim).to(self.args.device)
    self.target_value_net = ValueNetwork(state_dim, hidden_dim).to(self.args.device)
    self.soft_q_net1 = SoftQNetwork(state_dim, action_dim, hidden_dim).to(self.args.device)
    self.soft_q_net2 = SoftQNetwork(state_dim, action_dim, hidden_dim).to(self.args.device)
    self.policy_net = PolicyNetwork(state_dim, action_dim, hidden_dim).to(self.args.device)

    self.target_value_net.load_state_dict(self.value_net.state_dict())

    self.value_criterion = nn.MSELoss()
    self.soft_q_loss1 = nn.MSELoss()
    self.soft_q_loss2 = nn.MSELoss()

    self.value_opt = optim.Adam(self.value_net.parameters(), lr=self.args.lr)
    self.soft_q_opt1 = optim.Adam(self.soft_q_net1.parameters(), lr=self.args.lr)
    self.soft_q_opt2 = optim.Adam(self.soft_q_net2.parameters(), lr=self.args.lr)
    self.policy_opt = optim.Adam(self.policy_net.parameters(), lr=self.args.lr)

    if will_train:
        current_time = time.strftime('%d-%b-%y-%H.%M.%S', time.localtime())
        self.plot_folder = f'plots/{current_time}'
        self.model_save_folder = f'model/{current_time}'
        make_sure_dir_exists(self.plot_folder)
        make_sure_dir_exists(self.model_save_folder)
        self.cp = Checkpoint(self.model_save_folder)

    if load_from is not None:
        try:
            self.load_checkpoint(load_from)
        except FileNotFoundError:
            print(f'{load_from} not found. Running default.')
    else:
        print('Starting from scratch.')

def __init__(self, config: Config):
    self.config = config
    self.is_training = True
    self.buffer = ReplayBuffer(self.config.max_buff)
    self.model = DQN(self.config.state_dim, self.config.action_dim).cuda()
    self.model_optim = Adam(self.model.parameters(), lr=self.config.learning_rate)
    if self.config.use_cuda:
        self.cuda()

def test_append(self):
    count = 100
    start_length = count // 2
    max_length = count
    buffer = ReplayBuffer(start_length=start_length, max_length=max_length)
    for append_count in range(max_length * 2):
        buffer.append(append_count)
        self.assertEqual(len(buffer.buffer), min(append_count + 1, max_length),
                         "Incorrect buffer size.")
        self.assertEqual(buffer.buffer[0], max(0, (append_count + 1) - max_length),
                         "Incorrect first value.")
        self.assertEqual(buffer.buffer[-1], append_count,
                         "Incorrect last value.")

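# NOTE: the test above assumes a ReplayBuffer whose constructor takes start_length and
# max_length and which exposes its storage as .buffer. A minimal sketch that would satisfy
# those assertions (an assumption, not the implementation under test):
from collections import deque

class ReplayBuffer:
    """FIFO buffer with fixed capacity; start_length marks when sampling may begin."""

    def __init__(self, start_length=None, max_length=10000):
        self.start_length = start_length if start_length is not None else max_length
        self.max_length = max_length
        self.buffer = deque(maxlen=max_length)  # oldest entries are dropped automatically

    def append(self, item):
        self.buffer.append(item)

    def ready(self):
        # True once enough experience has been collected to start sampling.
        return len(self.buffer) >= self.start_length
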
class MADDPG():
    def __init__(self, num_agents, state_size, action_size, random_seed):
        """Initialize multiple agents, each with its own actor-critic networks,
        sharing a single replay buffer to learn from experience."""
        self.num_agents = num_agents
        self.agents = []
        for _ in range(num_agents):
            agent = Agent(state_size, action_size, random_seed)
            self.agents.append(agent)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)

    def act(self, states, add_noise=True):
        clipped_actions = []
        for state, agent in zip(states, self.agents):
            clipped_actions.append(agent.act(state, add_noise))
        return clipped_actions

    def reset(self):
        for agent in self.agents:
            agent.reset()

    def learn(self, experiences, gamma):
        for agent in self.agents:
            agent.learn(experiences, gamma)

    def saveCheckPoints(self):
        for i, agent in enumerate(self.agents):
            torch.save(agent.actor_local.state_dict(),
                       f"checkpoints/actor_agent_{i}.pth")
            torch.save(agent.critic_local.state_dict(),
                       f"checkpoints/critic_agent_{i}.pth")

    def loadCheckPoints(self):
        for i, agent in enumerate(self.agents):
            agent.actor_local.load_state_dict(
                torch.load(f"checkpoints/actor_agent_{i}.pth"))
            agent.critic_local.load_state_dict(
                torch.load(f"checkpoints/critic_agent_{i}.pth"))

    def step(self, states, actions, rewards, next_states, dones):
        """Save experience in replay memory, and use random samples from the buffer to learn."""
        # Save experience / reward
        for i in range(self.num_agents):
            self.memory.add(states[i], actions[i], rewards[i], next_states[i], dones[i])

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            for agent in self.agents:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

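# NOTE: the agents above rely on a shared ReplayBuffer exposing add(), sample() and
# __len__(); the exact constructor signature differs between the snippets. A minimal
# uniform-sampling sketch under those assumptions (not the original implementation):
import random
from collections import deque, namedtuple

import numpy as np
import torch

class ReplayBuffer:
    """Fixed-size buffer of experience tuples with uniform random minibatch sampling."""

    def __init__(self, action_size, buffer_size, batch_size, seed):
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple("Experience",
                                     ["state", "action", "reward", "next_state", "done"])
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        experiences = random.sample(self.memory, k=self.batch_size)
        states = torch.from_numpy(np.vstack([e.state for e in experiences])).float()
        actions = torch.from_numpy(np.vstack([e.action for e in experiences])).float()
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences])).float()
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences])).float()
        dones = torch.from_numpy(np.vstack([e.done for e in experiences]).astype(np.uint8)).float()
        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.memory)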