def assemble_training(seed, weights=None, lr=cfg.LEARNING_RATE, er=cfg.EPS_START):
    """
    Configure everything needed to start training.

    The `weights` parameter points at a saved checkpoint and is used to
    resume training from those weights. This function wraps the environment
    with all the preprocessing steps and sets up the policy and the replay
    buffer.
    """
    if weights:
        checkpoint = torch.load(weights)
        env = getWrappedEnv(seed=checkpoint["info"]["seed"])
        dqn = DuelingDQN(env, lr=lr)
        eval_net = DuelingDQN(env)
        load_checkpoint(dqn, weights, dqn.device)
        load_checkpoint(eval_net, weights, dqn.device)
        policy = eGreedyPolicyDecay(env, seed, checkpoint["info"]["er"], er,
                                    cfg.EPS_END, cfg.DECAY_STEPS, dqn)
        buffer = ReplayBuffer(seed=seed)
        agent = DDQNAgent(dqn, eval_net, policy, buffer)
        with open(checkpoint["info"]["buffer"], "rb") as f:
            preloaded_buffer = pickle.load(f)
        agent.buffer = preloaded_buffer
        print(
            "Resume training at Episode", checkpoint["info"]["episodes"],
            "after", checkpoint["info"]["frames"], "frames.\n",
            "Learning rate is", checkpoint["info"]["lr"],
            "\nExploration rate is", checkpoint["info"]["er"],
        )
        return (env, agent, checkpoint["info"]["episodes"],
                checkpoint["info"]["frames"])
    env = getWrappedEnv(seed=seed)
    dqn = DuelingDQN(env, lr=lr)
    eval_net = DuelingDQN(env)
    policy = eGreedyPolicyDecay(env, seed, er, er, cfg.EPS_END,
                                cfg.DECAY_STEPS, dqn)
    buffer = ReplayBuffer(seed=seed)
    agent = DDQNAgent(dqn, eval_net, policy, buffer)
    return env, agent, 0, 0
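# Usage sketch (assumed, not from the source): a fresh run returns zeroed
# episode/frame counters; passing a checkpoint path (hypothetical below)
# resumes from the stored state instead.
env, agent, start_episode, start_frame = assemble_training(seed=42)
# env, agent, start_episode, start_frame = assemble_training(
#     seed=42, weights="checkpoints/run1.pt")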
def __init__(self, input_dims, env, n_actions):
    self.memory = ReplayBuffer(input_dims)
    self.n_actions = n_actions
    self.actor_nn = ActorNetwork(input_dims, n_actions=n_actions,
                                 name=Constants.env_id + '_actor',
                                 max_action=env.action_space.n)
    self.critic_local_1_nn = CriticNetwork(input_dims, n_actions=n_actions,
                                           name=Constants.env_id + '_critic_local_1')
    self.critic_local_2_nn = CriticNetwork(input_dims, n_actions=n_actions,
                                           name=Constants.env_id + '_critic_local_2')
    self.critic_target_1_nn = CriticNetwork(input_dims, n_actions=n_actions,
                                            name=Constants.env_id + '_critic_target_1')
    self.critic_target_2_nn = CriticNetwork(input_dims, n_actions=n_actions,
                                            name=Constants.env_id + '_critic_target_2')
    self.value_nn = ValueNetwork(input_dims, name=Constants.env_id + '_value')
    self.target_value_nn = ValueNetwork(input_dims,
                                        name=Constants.env_id + '_target_value')
    self.update_network_parameters(tau=1)
def __init__(self, alpha, beta, input_dims, tau, env, env_id, gamma=0.99,
             n_actions=2, max_size=1000000, layer1_size=256, layer2_size=256,
             batch_size=100, reward_scale=2):
    self.gamma = gamma
    self.tau = tau
    self.memory = ReplayBuffer(max_size, input_dims, n_actions)
    self.batch_size = batch_size
    self.n_actions = n_actions
    self.actor = ActorNetwork(alpha, input_dims, layer1_size, layer2_size,
                              n_actions=n_actions, name=env_id + '_actor',
                              max_action=env.action_space.high)
    self.critic_1 = CriticNetwork(beta, input_dims, layer1_size, layer2_size,
                                  n_actions=n_actions, name=env_id + '_critic_1')
    self.critic_2 = CriticNetwork(beta, input_dims, layer1_size, layer2_size,
                                  n_actions=n_actions, name=env_id + '_critic_2')
    self.value = ValueNetwork(beta, input_dims, layer1_size, layer2_size,
                              name=env_id + '_value')
    self.target_value = ValueNetwork(beta, input_dims, layer1_size,
                                     layer2_size, name=env_id + '_target_value')
    self.scale = reward_scale
    self.update_network_parameters(tau=1)
def __init__(self, config: Config):
    self.config = config
    self.is_training = True

    if self.config.prioritized_replay:
        self.buffer = PrioritizedReplayBuffer(
            self.config.max_buff,
            alpha=self.config.prioritized_replay_alpha)
        # anneal beta over the whole run when no horizon is given explicitly
        prioritized_replay_beta_iters = self.config.prioritized_replay_beta_iters
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = self.config.frames
        self.beta_schedule = LinearSchedule(
            prioritized_replay_beta_iters,
            initial_p=self.config.prioritized_replay_beta0,
            final_p=1.0)
    else:
        self.buffer = ReplayBuffer(self.config.max_buff)
        self.beta_schedule = None

    self.model = CnnDQN(self.config.state_shape, self.config.action_dim)
    self.target_model = CnnDQN(self.config.state_shape, self.config.action_dim)
    self.target_model.load_state_dict(self.model.state_dict())
    self.model_optim = Adam(self.model.parameters(), lr=self.config.learning_rate)

    if self.config.use_cuda:
        self.cuda()
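# Sketch of how the beta schedule above is typically consumed (an assumption
# following the OpenAI-baselines convention, where PrioritizedReplayBuffer
# .sample(batch_size, beta) also returns importance-sampling weights and tree
# indices): beta anneals toward 1.0 so the IS weights fully correct the
# sampling bias by the end of training. `frame_idx` is a hypothetical counter.
def sample_batch(agent, frame_idx, batch_size):
    if agent.beta_schedule is not None:
        beta = agent.beta_schedule.value(frame_idx)
        return agent.buffer.sample(batch_size, beta)
    return agent.buffer.sample(batch_size)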
def __init__(self, alpha=3e-4, beta=3e-4, input_dims=[8], env=None,
             gamma=0.99, n_actions=2, max_size=1000000, tau=5e-3,
             fc1_dim=256, fc2_dim=256, batch_size=256, reward_scale=2):
    self.gamma = gamma
    self.tau = tau
    self.memory = ReplayBuffer(max_size, input_dims, n_actions)
    self.batch_size = batch_size
    self.n_actions = n_actions
    self.actor = ActorNetwork(alpha, input_dims, n_actions,
                              env.action_space.high)
    self.critic1 = CriticNetwork(beta, input_dims, n_actions, name='critic1')
    self.critic2 = CriticNetwork(beta, input_dims, n_actions, name='critic2')
    self.value = ValueNetwork(beta, input_dims, name='value')
    self.target_value = ValueNetwork(beta, input_dims, name='target_value')
    self.scale = reward_scale
    self.update_network_parameters(tau=1)
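# Several of these agents end construction with update_network_parameters(tau=1),
# which hard-copies the online value network into its target. A minimal PyTorch
# sketch of that method under the usual Polyak-averaging convention (an
# assumption; the source class body is not shown):
def update_network_parameters(self, tau=None):
    if tau is None:
        tau = self.tau
    for target_param, param in zip(self.target_value.parameters(),
                                   self.value.parameters()):
        # tau=1 copies the weights outright; tau << 1 tracks them slowly
        target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)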
def __init__(self, env_id, action_space, action_bound):
    self.env_id = env_id
    self.action_space = action_space
    self.action_bound = action_bound
    self.env = gym.make(self.env_id)
    self.replay_buffer = ReplayBuffer(max_len=self.MAX_EXPERIENCES)
    self.policy = GaussianPolicy(action_space=self.action_space,
                                 action_bound=self.action_bound)
    self.dualqnet = DualQNetwork()
    self.target_dualqnet = DualQNetwork()
    self.log_alpha = tf.Variable(0.)  # log_alpha = 0, i.e. alpha = 1
    self.alpha_optimizer = tf.keras.optimizers.Adam(3e-4)
    self.target_entropy = -0.5 * self.action_space
    self.global_steps = 0
    self._initialize_weights()
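# log_alpha above starts at 0 (alpha = exp(0) = 1) and is trained so that the
# policy's entropy approaches target_entropy. A TensorFlow sketch of the
# standard SAC temperature loss (an assumption; the source's update step is
# not shown). `logpi` would be the log-probability of freshly sampled actions.
import tensorflow as tf

def update_alpha(self, logpi):
    with tf.GradientTape() as tape:
        # lowers alpha when policy entropy exceeds the target, raises it otherwise
        alpha_loss = -tf.reduce_mean(
            self.log_alpha * tf.stop_gradient(logpi + self.target_entropy))
    grad = tape.gradient(alpha_loss, [self.log_alpha])
    self.alpha_optimizer.apply_gradients(zip(grad, [self.log_alpha]))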
def __init__(self, state_size=37, action_size=4, gamma=0.99, lr=0.001,
             update_every=5):
    """
    Initializes the model.
    ----
    @param:
    1. state_size: size of the input state space.
    2. action_size: size of the action space.
    3. gamma: discounted return rate.
    4. lr: learning rate for the model.
    5. update_every: update target_model every X time-steps.
    """
    self.state_size = state_size
    self.action_size = action_size
    self.gamma = gamma  # define discounted return
    # Q-networks: the two DQNs used for double Q-learning via a fixed Q-target
    self.qnetwork_local = DQNetwork()
    self.qnetwork_target = DQNetwork()
    # define the optimizer
    self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lr)
    # replay memory
    self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE)
    self.update_every = update_every
    self.target_update_counter = 0
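# Every snippet in this section instantiates a ReplayBuffer, but none defines
# one. A minimal uniform buffer consistent with the ReplayBuffer(BUFFER_SIZE,
# BATCH_SIZE) call above (a sketch under assumed semantics, not the project's
# actual class):
import random
from collections import deque

class UniformReplayBuffer:
    def __init__(self, buffer_size, batch_size):
        self.memory = deque(maxlen=buffer_size)  # oldest transitions evicted first
        self.batch_size = batch_size

    def add(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def sample(self):
        # uniform sampling without replacement
        return random.sample(self.memory, self.batch_size)

    def __len__(self):
        return len(self.memory)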
def __init__(self, num_agents, state_size, action_size, hidden_layers, seed,
             gamma=GAMMA, tau=TAU, lr_actor=LR_ACTOR, lr_critic=LR_CRITIC,
             weight_decay=WEIGHT_DECAY, buffer_size=BUFFER_SIZE,
             batch_size=BATCH_SIZE):
    """Initialize MADDPG agent."""
    super(MADDPG, self).__init__()
    self.seed = random.seed(seed)
    self.num_agents = num_agents
    self.state_size = state_size
    self.action_size = action_size
    self.gamma = gamma
    self.tau = tau
    self.lr_actor = lr_actor
    self.lr_critic = lr_critic
    self.weight_decay = weight_decay
    self.buffer_size = buffer_size
    self.batch_size = batch_size
    self.agents = [DDPGAgent(state_size, action_size, hidden_layers, gamma,
                             tau, lr_actor, lr_critic, weight_decay, seed)
                   for _ in range(num_agents)]
    self.replay_buffer = ReplayBuffer(num_agents, buffer_size, batch_size)
def __init__(self, actor, critic, reward_fun, gamma=0.99, tau=0.005,
             # policy_noise=0.2, noise_clip=0.5,
             policy_freq=2, max_buffer_size=1e6, batch_size=64, lr=3e-4):
    self._actor = actor
    self._actor_target = copy.deepcopy(self._actor)
    self._actor_optimizer = torch.optim.Adam(self._actor.parameters(), lr=lr)
    self._critic = critic
    self._critic_target = copy.deepcopy(self._critic)
    self._critic_loss = nn.MSELoss()
    self._critic_optimizer = torch.optim.Adam(self._critic.parameters(), lr=lr)
    self.reward_fun = reward_fun
    self._gamma = gamma
    self._tau = tau
    self._policy_freq = policy_freq
    self._rbuffer_max_size = max_buffer_size
    self._replay_buffer = ReplayBuffer(self._rbuffer_max_size)
    self._batch_size = batch_size
    self._steps = 0
    self._run = 0
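# policy_freq above is TD3's delayed-update interval: the critic is trained
# every step, while the actor and both target networks are refreshed only on
# every policy_freq-th step. A sketch of that gate (an assumption; the
# source's training step is not shown):
def _actor_update_due(self):
    # _steps is incremented once per critic update
    return self._steps % self._policy_freq == 0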
def __init__(self, args):
    """
    Init function.

    Args:
        args: configuration object holding the hyperparameters used below.
    """
    self.state_size = args.state_size
    self.action_size = args.action_size
    self.bs = args.bs
    self.gamma = args.gamma
    self.epsilon = args.epsilon
    self.tau = args.tau
    self.discrete = args.discrete
    self.randomer = OUNoise(args.action_size)
    self.buffer = ReplayBuffer(args.max_buff)

    self.actor = Actor(self.state_size, self.action_size)
    self.actor_target = Actor(self.state_size, self.action_size)
    self.actor_opt = AdamW(self.actor.parameters(), args.lr_actor)

    self.critic = Critic(self.state_size, self.action_size)
    self.critic_target = Critic(self.state_size, self.action_size)
    self.critic_opt = AdamW(self.critic.parameters(), args.lr_critic)

    hard_update(self.actor_target, self.actor)
    hard_update(self.critic_target, self.critic)
def __init__(self, alpha=0.0003, beta=0.0003, input_dims=[8], env=None,
             gamma=0.99, n_actions=2, max_size=1000000, tau=0.005,
             layer1_size=256, layer2_size=256, batch_size=256, reward_scale=2):
    self.gamma = gamma
    self.tau = tau
    self.memory = ReplayBuffer(max_size, input_dims, n_actions)
    self.batch_size = batch_size
    self.n_actions = n_actions

    self.actor = ActorNetwork(n_actions=n_actions, name='actor',
                              max_action=env.action_space.high)
    self.critic_1 = CriticNetwork(n_actions=n_actions, name='critic_1')
    self.critic_2 = CriticNetwork(n_actions=n_actions, name='critic_2')
    self.value = ValueNetwork(name='value')
    self.target_value = ValueNetwork(name='target_value')

    self.actor.compile(optimizer=Adam(learning_rate=alpha))
    self.critic_1.compile(optimizer=Adam(learning_rate=beta))
    self.critic_2.compile(optimizer=Adam(learning_rate=beta))
    self.value.compile(optimizer=Adam(learning_rate=beta))
    self.target_value.compile(optimizer=Adam(learning_rate=beta))

    self.scale = reward_scale
    self.update_network_parameters(tau=1)
def __init__(self, config: Config):
    self.config = config
    self.is_training = True
    # self.buffer = deque(maxlen=self.config.max_buff)
    self.buffer = ReplayBuffer(self.config.max_buff)

    self.actor = Actor(self.config.state_dim, self.config.action_dim,
                       self.config.max_action)
    self.actor_target = Actor(self.config.state_dim, self.config.action_dim,
                              self.config.max_action)
    self.actor_target.load_state_dict(self.actor.state_dict())
    self.actor_optimizer = Adam(self.actor.parameters(),
                                lr=self.config.learning_rate)

    self.critic_1 = Critic(self.config.state_dim, self.config.action_dim)
    self.critic_1_target = Critic(self.config.state_dim, self.config.action_dim)
    self.critic_1_target.load_state_dict(self.critic_1.state_dict())
    self.critic_1_optimizer = Adam(self.critic_1.parameters(),
                                   lr=self.config.learning_rate)

    self.critic_2 = Critic(self.config.state_dim, self.config.action_dim)
    self.critic_2_target = Critic(self.config.state_dim, self.config.action_dim)
    self.critic_2_target.load_state_dict(self.critic_2.state_dict())
    self.critic_2_optimizer = Adam(self.critic_2.parameters(),
                                   lr=self.config.learning_rate)

    self.MseLoss = nn.MSELoss()

    if self.config.use_cuda:
        self.cuda()
def __init__(self, input_dims, alpha=0.001, beta=0.002, env=None,
             gamma=0.99, n_actions=2, max_size=1000000, tau=0.005,
             fc1=400, fc2=300, batch_size=64, noise=0.1):
    self.gamma = gamma
    self.tau = tau
    self.memory = ReplayBuffer(max_size, input_dims, n_actions)
    self.batch_size = batch_size
    self.n_actions = n_actions
    self.noise = noise
    self.max_action = env.action_space.high[0]
    self.min_action = env.action_space.low[0]

    self.actor = ActorNetwork(n_actions=n_actions, name='actor')
    self.critic = CriticNetwork(name='critic')
    self.target_actor = ActorNetwork(n_actions=n_actions, name='target_actor')
    self.target_critic = CriticNetwork(name='target_critic')

    self.actor.compile(optimizer=Adam(learning_rate=alpha))
    self.critic.compile(optimizer=Adam(learning_rate=beta))
    self.target_actor.compile(optimizer=Adam(learning_rate=alpha))
    self.target_critic.compile(optimizer=Adam(learning_rate=beta))

    self.update_network_parameters(tau=1)
def __init__(self, alpha=0.0003, beta=0.0003, input_dims=[8], env=None,
             gamma=0.99, n_actions=2, max_size=1000000, tau=0.005,
             ent_alpha=0.0001, batch_size=256, reward_scale=2,
             layer1_size=256, layer2_size=256, chkpt_dir='tmp/sac'):
    self.gamma = gamma
    self.tau = tau
    self.memory = ReplayBuffer(max_size, input_dims, n_actions)
    self.batch_size = batch_size
    self.n_actions = n_actions
    self.ent_alpha = ent_alpha
    self.reward_scale = reward_scale

    self.actor = ActorNetwork(alpha, input_dims, n_actions=n_actions,
                              fc1_dims=layer1_size, fc2_dims=layer2_size,
                              name='actor', chkpt_dir=chkpt_dir)
    self.critic_1 = CriticNetwork(beta, input_dims, n_actions=n_actions,
                                  fc1_dims=layer1_size, fc2_dims=layer2_size,
                                  name='critic_1', chkpt_dir=chkpt_dir)
    self.critic_2 = CriticNetwork(beta, input_dims, n_actions=n_actions,
                                  fc1_dims=layer1_size, fc2_dims=layer2_size,
                                  name='critic_2', chkpt_dir=chkpt_dir)
    self.target_critic_1 = CriticNetwork(beta, input_dims, n_actions=n_actions,
                                         fc1_dims=layer1_size,
                                         fc2_dims=layer2_size,
                                         name='target_critic_1',
                                         chkpt_dir=chkpt_dir)
    self.target_critic_2 = CriticNetwork(beta, input_dims, n_actions=n_actions,
                                         fc1_dims=layer1_size,
                                         fc2_dims=layer2_size,
                                         name='target_critic_2',
                                         chkpt_dir=chkpt_dir)

    self.update_network_parameters(tau=1)
def __init__(self, state_shape, action_size, seed, cnn=False):
    """Initialize an Agent object.

    Params
    ======
        state_shape (tuple): shape of each state
        action_size (int): dimension of each action
        seed (int): random seed
        cnn (bool): whether to use a convolutional NN
    """
    self.state_shape = state_shape
    self.action_size = action_size
    self.seed = random.seed(seed)
    self.cnn = cnn

    if cnn:
        self.qnetwork_local = QNetworkFullyConvolutional(
            state_shape, action_size, seed).to(device)
        self.qnetwork_target = QNetworkFullyConvolutional(
            state_shape, action_size, seed).to(device)
    else:
        self.qnetwork_local = QNetworkFullyConnected(
            state_shape, action_size, seed).to(device)
        self.qnetwork_target = QNetworkFullyConnected(
            state_shape, action_size, seed).to(device)
    self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

    # Replay memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
    # Initialize time step (for updating every UPDATE_EVERY steps)
    self.t_step = 0
def __init__(self, env, model, target_model, config, name_agent="dqn"):
    self.name_agent = name_agent
    self.dim_space = env.observation_space.shape[0]
    self.nb_actions = env.action_space.n
    self.epsilon = config.epsilon_start
    self.epsilon_final = config.epsilon_final
    self.epsilon_start = config.epsilon_start
    self.epsilon_decay = config.epsilon_decay
    self.gamma = config.gamma
    self.replay_buffer = ReplayBuffer(10000, config.batch_size)
    self.environment = env
    self.batch_size = config.batch_size
    self.update_nb_iter = config.update_nb_iter
    # online network (q0)
    self.model = model
    # target network (q0_bar)
    self.target_model = target_model
    self.optimizer = optim.Adam(self.model.parameters(),
                                lr=config.learning_rate)
    # self.loss_data = []
    self.rewards = []
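# The epsilon_start / epsilon_final / epsilon_decay fields above suggest the
# usual exponential annealing schedule for epsilon-greedy exploration. A
# sketch of that schedule (an assumption; the source's decay code is not
# shown), where epsilon_decay acts as a time constant in frames:
import math

def epsilon_by_frame(self, frame_idx):
    return self.epsilon_final + (self.epsilon_start - self.epsilon_final) * \
        math.exp(-frame_idx / self.epsilon_decay)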
def __init__(self, n_agents, act_spcs, ob_spcs, writer, args):
    self.args = args
    self.memory = ReplayBuffer(args.buffer_length, n_agents, device)
    # self.memory = ReplayMemory(args.buffer_length, n_agents, device)
    self.use_maddpg = args.algo == "maddpg"
    self.use_sac = args.use_sac
    self.use_td3 = args.use_td3
    self.use_single_q = args.single_q
    self.all_obs = args.all_obs
    self.n_agents = n_agents
    self.act_spcs = act_spcs
    self.ob_spcs = ob_spcs
    # MADDPG critics are centralized, so they see the joint action and
    # observation spaces; the decentralized variants see only their own
    qnet_actspcs = [np.sum(self.act_spcs) if self.use_maddpg
                    else self.act_spcs[i] for i in range(n_agents)]
    qnet_obspcs = [np.sum(self.ob_spcs) if self.use_maddpg
                   else self.ob_spcs[i] for i in range(n_agents)]
    if self.use_sac and not self.use_td3:
        self.agents = [SAC_agent(self.act_spcs[i],
                                 qnet_obspcs[i] if self.all_obs else self.ob_spcs[i],
                                 qnet_obspcs[i], qnet_actspcs[i])
                       for i in range(n_agents)]
    elif self.use_td3:
        self.agents = [TD3_agent(self.act_spcs[i],
                                 qnet_obspcs[i] if self.all_obs else self.ob_spcs[i],
                                 qnet_obspcs[i], qnet_actspcs[i])
                       for i in range(n_agents)]
    else:
        self.agents = [DDPG_agent(self.act_spcs[i],
                                  qnet_obspcs[i] if self.all_obs else self.ob_spcs[i],
                                  qnet_obspcs[i], qnet_actspcs[i])
                       for i in range(n_agents)]
    self.n_steps = 0
    self.n_updates = 0
    self.writer = writer
    self.criterion = nn.MSELoss()
    self.sac_alpha = args.sac_alpha
    self.agent_actions = [[] for i in range(self.n_agents)]
def __init__(self, num_agents, x_dim, o_dim, a_dim, lr_actor=1e-3,
             lr_critic=1e-3, batch_size=16, gamma=0.99, tau=0.001,
             buffer_size=int(1e5), seed=1234):
    self.num_agents = num_agents
    self.x_dim = x_dim
    self.o_dim = o_dim
    self.a_dim = a_dim
    self.lr_actor = lr_actor
    self.lr_critic = lr_critic
    self.batch_size = batch_size
    self.gamma = gamma
    self.tau = tau
    self.buffer_size = buffer_size
    self.seed = seed
    self.buffer = ReplayBuffer(buffer_size, batch_size, seed)
    self.agents = [DDPGAgent(num_agents, agent_id, x_dim, o_dim, a_dim,
                             lr_actor, lr_critic, gamma, seed)
                   for agent_id in range(num_agents)]
def __init__(self, net, o_dim, a_dim, lr=1e-3, batch_size=16,
             algorithm="ddqn", gamma=0.99, tau=1e-3, buffer_size=int(1e6)):
    """
    o_dim: observation space dim (or # of channels)
    a_dim: action space dimension
    """
    self.o_dim = o_dim
    self.a_dim = a_dim
    self.lr = lr
    self.batch_size = batch_size
    self.gamma = gamma
    self.tau = tau
    self.buffer_size = buffer_size
    if algorithm.lower() in ("dqn",):
        self.algorithm = "dqn"
    elif algorithm.lower() in ("ddqn", "double dqn", "doubledqn"):
        self.algorithm = "ddqn"
    else:
        raise ValueError("cannot recognize algorithm")
    self.buffer = ReplayBuffer(buffer_size, batch_size)
    # self.device is assumed to be defined on the class
    self.online_net = net(o_dim, a_dim).to(self.device)
    self.target_net = net(o_dim, a_dim).to(self.device)
    self.optimizer = optim.Adam(self.online_net.parameters(), lr=lr)
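# The algorithm flag above decides how bootstrap targets are built. A PyTorch
# sketch of the difference (an assumption; the source's learn step is not
# shown): plain DQN lets the target net both select and evaluate the next
# action, while double DQN selects with the online net and evaluates with the
# target net, which reduces overestimation bias.
import torch

def compute_targets(self, rewards, next_states, dones):
    with torch.no_grad():
        if self.algorithm == "dqn":
            next_q = self.target_net(next_states).max(dim=1)[0]
        else:  # "ddqn"
            best = self.online_net(next_states).argmax(dim=1, keepdim=True)
            next_q = self.target_net(next_states).gather(1, best).squeeze(1)
    return rewards + self.gamma * next_q * (1 - dones)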
def __init__(self, state_size, action_size, random_seed):
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random_seed

    # ------------------ actor ------------------ #
    self.actor_local = Actor(state_size, action_size, random_seed).to(device)
    self.actor_target = Actor(state_size, action_size, random_seed).to(device)

    # ------------------ critic ----------------- #
    self.critic_local = Critic(state_size, action_size, random_seed).to(device)
    self.critic_target = Critic(state_size, action_size, random_seed).to(device)

    # ------------------ optimizers ------------- #
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC)

    # ------------- initialize target networks ------------- #
    self.soft_update(self.critic_local, self.critic_target, 1)
    self.soft_update(self.actor_local, self.actor_target, 1)

    self.t_step = 0

    # Noise process
    self.noise = OUNoise(action_size, random_seed)

    # Replay Buffer
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, device,
                               random_seed)
def __init__(self, state_size, action_size, num_agents):
    self.state_size = state_size
    self.action_size = action_size
    self.num_agents = num_agents
    self.whole_action_dim = self.action_size * self.num_agents
    self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE)  # Replay memory
    # create one DDPG agent per player
    self.maddpg_agents = [DDPG(state_size, action_size, num_agents)
                          for _ in range(num_agents)]
    self.episodes_before_training = EPISODES_BEFORE_TRAINING
def __init__(self, state_size, action_size, num_agents, random_seed):
    self.agents = [DDPGAgent(state_size, action_size, random_seed)
                   for _ in range(num_agents)]
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, device,
                               random_seed)
    self.t_step = 0
def reset_parameters(self):
    self._q.reset_parameters()
    self._q_target.reset_parameters()
    hard_update(self._q_target, self._q)
    self._pi.reset_parameters()
    if self._use_rbuffer:
        self._replay_buffer = ReplayBuffer(self._rbuffer_max_size)
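# hard_update is called in several snippets here but never defined. A sketch
# of the conventional PyTorch helper (an assumption; the argument order
# follows the hard_update(target, source) calls above):
def hard_update(target, source):
    # overwrite every target parameter with its online counterpart
    for t, s in zip(target.parameters(), source.parameters()):
        t.data.copy_(s.data)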
def __init__(self, env_name, num_quantiles=32, fqf_factor=0.000001*0.1,
             ent_coef=0.001, state_embedding_dim=3136,
             quantile_embedding_dim=64, gamma=0.99, n_frames=4,
             batch_size=32, buffer_size=1000000, update_period=8,
             target_update_period=10000):
    self.env_name = env_name
    self.num_quantiles = num_quantiles
    self.state_embedding_dim = state_embedding_dim
    self.quantile_embedding_dim = quantile_embedding_dim
    self.k = 1.0
    self.ent_coef = ent_coef
    self.n_frames = n_frames
    self.action_space = gym.make(self.env_name).action_space.n
    self.fqf_network = FQFNetwork(
        action_space=self.action_space,
        num_quantiles=self.num_quantiles,
        state_embedding_dim=self.state_embedding_dim,
        quantile_embedding_dim=self.quantile_embedding_dim)
    self.target_fqf_network = FQFNetwork(
        action_space=self.action_space,
        num_quantiles=self.num_quantiles,
        state_embedding_dim=self.state_embedding_dim,
        quantile_embedding_dim=self.quantile_embedding_dim)
    self._define_network()
    self.optimizer = tf.keras.optimizers.Adam(
        learning_rate=0.00015, epsilon=0.01/32)
    # fpl: fraction proposal layer
    self.optimizer_fpl = tf.keras.optimizers.Adam(
        learning_rate=0.00005 * fqf_factor, epsilon=0.0003125)
    self.gamma = gamma
    self.replay_buffer = ReplayBuffer(max_len=buffer_size)
    self.batch_size = batch_size
    self.update_period = update_period
    self.target_update_period = target_update_period
    self.steps = 0
def __init__(self, state_size, action_size, num_agents=2,
             eps_before_train=500, gamma=0.99, batch_size=128,
             buffer_size=int(1e5), lr_actor=1e-4, lr_critic=1e-3,
             weight_decay=0, tau=1e-3, noise_weight=1.0,
             noise_decay=0.999998, noise_min=1e-3, seed=0, device="cuda:0"):
    torch.manual_seed(seed)
    np.random.seed(seed)
    self.state_size = state_size
    self.action_size = action_size
    self.num_agents = num_agents
    self.action_dim = action_size * num_agents
    self.eps_before_train = eps_before_train
    self.gamma = gamma
    self.batch_size = batch_size
    self.buffer_size = buffer_size
    self.lr_actor = lr_actor
    self.lr_critic = lr_critic
    self.weight_decay = weight_decay
    self.tau = tau
    self.noise_weight = noise_weight
    self.noise_decay = noise_decay
    self.noise_min = noise_min
    self.device = device
    self.i_episode = 0
    self.agents = [DDPG(self.state_size, self.action_size, self.num_agents,
                        random_seed=2 * i * seed,
                        lr_actor=self.lr_actor, lr_critic=self.lr_critic,
                        weight_decay=self.weight_decay, tau=self.tau,
                        device=self.device)
                   for i in range(self.num_agents)]
    self.memory = ReplayBuffer(self.action_size, self.buffer_size, seed)
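# The noise_weight / noise_decay / noise_min fields above imply multiplicative
# annealing of the exploration-noise scale, floored at noise_min. A one-line
# sketch of that update (an assumption; the source's act() is not shown):
def decay_noise(self):
    self.noise_weight = max(self.noise_min,
                            self.noise_weight * self.noise_decay)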
def __init__(self, policy, env, gamma, learning_rate, buffer_size,
             exploration_type, exploration_frac, exploration_ep,
             exploration_initial_eps, exploration_final_eps, double_q,
             policy_kwargs, seed, intent):
    super(TabularRLModel, self).__init__(
        policy=policy, env=env, policy_kwargs=policy_kwargs, seed=seed)

    self.gamma = gamma
    self.learning_rate = learning_rate
    self.buffer_size = buffer_size
    self.exploration_type = exploration_type
    self.exploration_frac = exploration_frac
    self.exploration_ep = exploration_ep
    self.exploration_initial_eps = exploration_initial_eps
    self.exploration_final_eps = exploration_final_eps
    self.double_q = double_q
    self.intent = intent

    # start from the policy's default kwargs, then overlay any user overrides
    self.policy_kwargs = get_default_args(self.policy)
    self.policy_kwargs['ob_space'] = self.observation_space
    self.policy_kwargs['ac_space'] = self.action_space
    self.policy_kwargs['intent'] = self.intent
    if policy_kwargs is not None:
        for key, val in policy_kwargs.items():
            self.policy_kwargs[key] = val

    self.policy = policy(**self.policy_kwargs)

    if self.buffer_size is None:
        self.replay_buffer = None
    else:
        self.replay_buffer = ReplayBuffer(self.buffer_size)
def __init__(self, load_from=None, will_train=True):
    self.env = TorcsEnv(
        path='/usr/local/share/games/torcs/config/raceman/quickrace.xml')
    self.args = SAC_args()
    self.buffer = ReplayBuffer(self.args.buffer_size)

    action_dim = self.env.action_space.shape[0]
    state_dim = self.env.observation_space.shape[0]
    hidden_dim = 256
    self.action_size = action_dim
    self.state_size = state_dim

    self.value_net = ValueNetwork(state_dim, hidden_dim).to(self.args.device)
    self.target_value_net = ValueNetwork(state_dim, hidden_dim).to(self.args.device)
    self.soft_q_net1 = SoftQNetwork(state_dim, action_dim, hidden_dim).to(self.args.device)
    self.soft_q_net2 = SoftQNetwork(state_dim, action_dim, hidden_dim).to(self.args.device)
    self.policy_net = PolicyNetwork(state_dim, action_dim, hidden_dim).to(self.args.device)

    self.target_value_net.load_state_dict(self.value_net.state_dict())

    self.value_criterion = nn.MSELoss()
    self.soft_q_loss1 = nn.MSELoss()
    self.soft_q_loss2 = nn.MSELoss()

    self.value_opt = optim.Adam(self.value_net.parameters(), lr=self.args.lr)
    self.soft_q_opt1 = optim.Adam(self.soft_q_net1.parameters(), lr=self.args.lr)
    self.soft_q_opt2 = optim.Adam(self.soft_q_net2.parameters(), lr=self.args.lr)
    self.policy_opt = optim.Adam(self.policy_net.parameters(), lr=self.args.lr)

    if will_train:
        current_time = time.strftime('%d-%b-%y-%H.%M.%S', time.localtime())
        self.plot_folder = f'plots/{current_time}'
        self.model_save_folder = f'model/{current_time}'
        make_sure_dir_exists(self.plot_folder)
        make_sure_dir_exists(self.model_save_folder)
        self.cp = Checkpoint(self.model_save_folder)

    if load_from is not None:
        try:
            self.load_checkpoint(load_from)
        except FileNotFoundError:
            print(f'{load_from} not found. Running default.')
    else:
        print('Starting from scratch.')
def test_append(self):
    count = 100
    start_length = count // 2
    max_length = count
    buffer = ReplayBuffer(start_length=start_length, max_length=max_length)
    for append_count in range(max_length * 2):
        buffer.append(append_count)
        self.assertEqual(len(buffer.buffer),
                         min(append_count + 1, max_length),
                         "Incorrect buffer size.")
        self.assertEqual(buffer.buffer[0],
                         max(0, (append_count + 1) - max_length),
                         "Incorrect first value.")
        self.assertEqual(buffer.buffer[-1], append_count,
                         "Incorrect last value.")
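# A minimal buffer that satisfies test_append above (a sketch inferred from
# the assertions, not the project's actual ReplayBuffer): start_length is
# accepted but only max_length affects append/eviction behavior here.
from collections import deque

class ReplayBuffer:
    def __init__(self, start_length, max_length):
        self.start_length = start_length  # e.g. minimum fill before sampling
        self.buffer = deque(maxlen=max_length)  # oldest entries evicted first

    def append(self, item):
        self.buffer.append(item)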
def __init__(self, config: Config):
    self.config = config
    self.is_training = True
    self.buffer = ReplayBuffer(self.config.max_buff)

    # the model is moved to the GPU by self.cuda() only when use_cuda is set
    self.model = DQN(self.config.state_dim, self.config.action_dim)
    self.model_optim = Adam(self.model.parameters(), lr=self.config.learning_rate)

    if self.config.use_cuda:
        self.cuda()
def reset_parameters(self):
    self._actor.reset_parameters()
    self._actor_target.reset_parameters()
    self._critic.reset_parameters()
    self._critic_target.reset_parameters()
    hard_update(self._actor_target, self._actor)
    hard_update(self._critic_target, self._critic)
    self._steps = 0
    self._replay_buffer = ReplayBuffer(self._rbuffer_max_size)