Example #1
def assemble_training(seed,
                      weights=None,
                      lr=cfg.LEARNING_RATE,
                      er=cfg.EPS_START):
    """
    Configure everything needed to start the training. The parameter weights is used to continue training 
    and set the weights. This function wraps the environment with all the preprocessing steps, sets the 
    type of policy and the Replay Buffer.
    """
    if weights:
        checkpoint = torch.load(weights)
        env = getWrappedEnv(seed=checkpoint["info"]["seed"])
        dqn = DuelingDQN(env, lr=lr)
        eval_net = DuelingDQN(env)

        load_checkpoint(dqn, weights, dqn.device)
        load_checkpoint(eval_net, weights, dqn.device)

        policy = eGreedyPolicyDecay(env, seed, checkpoint["info"]["er"], er,
                                    cfg.EPS_END, cfg.DECAY_STEPS, dqn)
        buffer = ReplayBuffer(seed=seed)
        agent = DDQNAgent(dqn, eval_net, policy, buffer)
        with open(checkpoint["info"]["buffer"], "rb") as f:
            preloaded_buffer = pickle.load(f)
        agent.buffer = preloaded_buffer
        print(
            "Resume training at Episode",
            checkpoint["info"]["episodes"],
            "after",
            checkpoint["info"]["frames"],
            "frames.\n",
            "Learning rate is",
            checkpoint["info"]["lr"],
            "\nExploration rate is",
            checkpoint["info"]["er"],
        )
        return (env, agent, checkpoint["info"]["episodes"],
                checkpoint["info"]["frames"])

    env = getWrappedEnv(seed=seed)
    dqn = DuelingDQN(env, lr=lr)
    eval_net = DuelingDQN(env)

    policy = eGreedyPolicyDecay(env, seed, er, er, cfg.EPS_END,
                                cfg.DECAY_STEPS, dqn)
    buffer = ReplayBuffer(seed=seed)
    agent = DDQNAgent(dqn, eval_net, policy, buffer)
    return env, agent, 0, 0
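
For context, the sketch below shows how a helper like `assemble_training` might be driven from a training entry point. Only `assemble_training` and its four return values come from the example above; the loop, the episode budget, and the `agent.act()` call are hypothetical.

# Hypothetical driver; only assemble_training() and its return values come from
# the example above. agent.act() is an assumed DDQNAgent method.
def main(resume_from=None):
    env, agent, start_episode, start_frame = assemble_training(
        seed=42, weights=resume_from)

    for episode in range(start_episode, start_episode + 100):  # arbitrary budget
        state = env.reset()
        done = False
        while not done:
            action = agent.act(state)                   # assumed agent API
            state, reward, done, info = env.step(action)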
Example #2
    def __init__(self, input_dims, env, n_actions):
        self.memory = ReplayBuffer(input_dims)
        self.n_actions = n_actions

        self.actor_nn = ActorNetwork(input_dims,
                                     n_actions=n_actions,
                                     name=Constants.env_id + '_actor',
                                     max_action=env.action_space.n)
        self.critic_local_1_nn = CriticNetwork(input_dims,
                                               n_actions=n_actions,
                                               name=Constants.env_id +
                                               '_critic_local_1')
        self.critic_local_2_nn = CriticNetwork(input_dims,
                                               n_actions=n_actions,
                                               name=Constants.env_id +
                                               '_critic_local_2')
        self.critic_target_1_nn = CriticNetwork(input_dims,
                                                n_actions=n_actions,
                                                name=Constants.env_id +
                                                '_critic_target_1')
        self.critic_target_2_nn = CriticNetwork(input_dims,
                                                n_actions=n_actions,
                                                name=Constants.env_id +
                                                '_critic_target_2')
        self.value_nn = ValueNetwork(input_dims,
                                     name=Constants.env_id + '_value')
        self.target_value_nn = ValueNetwork(input_dims,
                                            name=Constants.env_id +
                                            '_target_value')
        self.update_network_parameters(tau=1)
Example #3
    def __init__(self, alpha, beta, input_dims, tau, env,
            env_id, gamma=0.99, 
            n_actions=2, max_size=1000000, layer1_size=256,
            layer2_size=256, batch_size=100, reward_scale=2):
        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.batch_size = batch_size
        self.n_actions = n_actions

        self.actor = ActorNetwork(alpha, input_dims, layer1_size,
                                  layer2_size, n_actions=n_actions,
                                  name=env_id+'_actor', 
                                  max_action=env.action_space.high)
        self.critic_1 = CriticNetwork(beta, input_dims, layer1_size,
                                      layer2_size, n_actions=n_actions,
                                      name=env_id+'_critic_1')
        self.critic_2 = CriticNetwork(beta, input_dims, layer1_size,
                                      layer2_size, n_actions=n_actions,
                                      name=env_id+'_critic_2')
       
        self.value = ValueNetwork(beta, input_dims, layer1_size,
                                  layer2_size, name=env_id+'_value')
        self.target_value = ValueNetwork(beta, input_dims, layer1_size,
                                         layer2_size, name=env_id+'_target_value')

        self.scale = reward_scale
        self.update_network_parameters(tau=1)
Example #4
    def __init__(self, config: Config):
        self.config = config
        self.is_training = True
        if self.config.prioritized_replay:
            self.buffer = PrioritizedReplayBuffer(
                self.config.max_buff,
                alpha=self.config.prioritized_replay_alpha)
            prioritized_replay_beta_iters = self.config.prioritized_replay_beta_iters
            if prioritized_replay_beta_iters is None:
                # anneal beta over the whole run if no schedule length is given
                prioritized_replay_beta_iters = self.config.frames
            self.beta_schedule = LinearSchedule(
                prioritized_replay_beta_iters,
                initial_p=self.config.prioritized_replay_beta0,
                final_p=1.0)
        else:
            self.buffer = ReplayBuffer(self.config.max_buff)
            self.beta_schedule = None

        self.model = CnnDQN(self.config.state_shape, self.config.action_dim)
        self.target_model = CnnDQN(self.config.state_shape,
                                   self.config.action_dim)
        self.target_model.load_state_dict(self.model.state_dict())
        self.model_optim = Adam(self.model.parameters(),
                                lr=self.config.learning_rate)

        if self.config.use_cuda:
            self.cuda()
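
The prioritized branch above only builds the beta schedule; it does not show how beta is consumed. The sketch below follows the OpenAI Baselines convention for `LinearSchedule.value()` and `PrioritizedReplayBuffer.sample(batch_size, beta)`, which this config-driven agent may or may not match, and `self.config.batch_size` is assumed to exist.

    # Hedged sketch of how the beta schedule is typically consumed per update.
    def learn_step(self, frame_idx):
        if self.beta_schedule is not None:
            beta = self.beta_schedule.value(frame_idx)   # anneals beta0 -> 1.0
            # prioritized sampling also returns IS weights and sample indices
            *batch, weights, idxes = self.buffer.sample(
                self.config.batch_size, beta=beta)
        else:
            batch = self.buffer.sample(self.config.batch_size)
            weights, idxes = None, None
        # ...compute TD errors, scale the loss by `weights`, then call
        # self.buffer.update_priorities(idxes, new_priorities) in the PER case.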
Example #5
    def __init__(self,
                 alpha=3e-4,
                 beta=3e-4,
                 input_dims=[8],
                 env=None,
                 gamma=0.99,
                 n_actions=2,
                 max_size=1000000,
                 tau=5e-3,
                 fc1_dim=256,
                 fc2_dim=256,
                 batch_size=256,
                 reward_scale=2):
        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.batch_size = batch_size
        self.n_actions = n_actions

        self.actor = ActorNetwork(alpha, input_dims, n_actions,
                                  env.action_space.high)
        self.critic1 = CriticNetwork(beta,
                                     input_dims,
                                     n_actions,
                                     name='critic1')
        self.critic2 = CriticNetwork(beta,
                                     input_dims,
                                     n_actions,
                                     name='critic2')
        self.value = ValueNetwork(beta, input_dims, name='value')
        self.target_value = ValueNetwork(beta, input_dims, name='target_value')

        self.scale = reward_scale
        self.update_network_parameters(tau=1)
Example #6
    def __init__(self, env_id, action_space, action_bound):

        self.env_id = env_id
        self.action_space = action_space
        self.action_bound = action_bound

        self.env = gym.make(self.env_id)
        self.replay_buffer = ReplayBuffer(max_len=self.MAX_EXPERIENCES)

        self.policy = GaussianPolicy(action_space=self.action_space,
                                     action_bound=self.action_bound)
        self.duqlqnet = DualQNetwork()
        self.target_dualqnet = DualQNetwork()

        self.log_alpha = tf.Variable(0.)  #: alpha = exp(log_alpha) = 1 initially
        self.alpha_optimizer = tf.keras.optimizers.Adam(3e-4)
        self.target_entropy = -0.5 * self.action_space

        self.global_steps = 0

        self._initialize_weights()
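
`log_alpha`, its dedicated Adam optimizer, and `target_entropy` above are the ingredients of SAC's automatic entropy-temperature tuning. A minimal sketch of the usual update follows; `self.policy.sample(states)` returning `(actions, logp)` is an assumption about the GaussianPolicy API, not something shown in the example.

    def update_alpha(self, states):
        # Sketch of the standard SAC temperature step (assumed, not shown above):
        # J(alpha) = E[-alpha * (log pi(a|s) + target_entropy)]
        _, logp = self.policy.sample(states)             # assumed policy API
        with tf.GradientTape() as tape:
            alpha_loss = tf.reduce_mean(
                -tf.exp(self.log_alpha) * tf.stop_gradient(logp + self.target_entropy))
        grad = tape.gradient(alpha_loss, [self.log_alpha])
        self.alpha_optimizer.apply_gradients(zip(grad, [self.log_alpha]))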
Example #7
    def __init__(self,
                 state_size=37,
                 action_size=4,
                 gamma=0.99,
                 lr=0.001,
                 update_every=5):
        """
        Initializes the model.
        ----
        @param:
        1. state_size: size of input # of states.
        2. action_size: size of # of actions.
        3. gamma: discounted return rate.
        4. lr: learning rate for the model.
        5. update_every: update target_model every X time-steps.
        """
        self.state_size = state_size
        self.action_size = action_size

        self.gamma = gamma  #define dicsounted return

        #Q-network : defines the 2 DQN (using doubling Q-learning architecture via fixed Q target)
        self.qnetwork_local = DQNetwork()
        self.qnetwork_target = DQNetwork()

        #define the optimizer
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lr)

        #replay memory
        self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE)
        self.update_every = update_every
        self.target_update_counter = 0
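
The `update_every` counter above implies the usual fixed-target schedule. A hedged sketch of a matching `step()` follows; the `memory.add`/`memory.sample` signatures and the `learn()` helper are assumptions, only the attributes come from `__init__`.

    def step(self, state, action, reward, next_state, done):
        self.memory.add(state, action, reward, next_state, done)   # assumed signature
        self.target_update_counter += 1
        if self.target_update_counter % self.update_every == 0 and \
                len(self.memory) >= BATCH_SIZE:
            self.learn(self.memory.sample())                       # TD update (assumed)
            # refresh the fixed Q-target network
            self.qnetwork_target.load_state_dict(self.qnetwork_local.state_dict())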
Example #8
    def __init__(self,
                 num_agents,
                 state_size,
                 action_size,
                 hidden_layers,
                 seed,
                 gamma=GAMMA,
                 tau=TAU,
                 lr_actor=LR_ACTOR,
                 lr_critic=LR_CRITIC,
                 weight_decay=WEIGHT_DECAY,
                 buffer_size=BUFFER_SIZE,
                 batch_size=BATCH_SIZE):
        """Initialize MADDPG agent."""
        super(MADDPG, self).__init__()

        self.seed = random.seed(seed)

        self.num_agents = num_agents
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = gamma
        self.tau = tau
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic
        self.weight_decay = weight_decay
        self.buffer_size = buffer_size
        self.batch_size = batch_size

        self.agents = [DDPGAgent(state_size, action_size, hidden_layers, gamma, \
                                 tau, lr_actor, lr_critic, weight_decay, seed) \
                                     for _ in range(num_agents)]

        self.replay_buffer = ReplayBuffer(num_agents, buffer_size, batch_size)
Example #9
	def __init__(self, actor, critic, reward_fun, gamma=0.99, tau=0.005, # policy_noise=0.2, noise_clip=0.5,
				 policy_freq=2, max_buffer_size=1e6, batch_size=64, lr=3e-4
				 ):

		self._actor = actor
		self._actor_target = copy.deepcopy(self._actor)
		self._actor_optimizer = torch.optim.Adam(self._actor.parameters(), lr=lr)

		self._critic = critic
		self._critic_target = copy.deepcopy(self._critic)
		self._critic_loss = nn.MSELoss()
		self._critic_optimizer = torch.optim.Adam(self._critic.parameters(), lr=lr)

		self.reward_fun = reward_fun

		self._gamma = gamma
		self._tau = tau
		self._policy_freq = policy_freq

		self._rbuffer_max_size = max_buffer_size
		self._replay_buffer = ReplayBuffer(self._rbuffer_max_size)
		self._batch_size = batch_size

		self._steps = 0
		self._run = 0
Example #10
    def __init__(self, args):
        """
            init function
            Args:
                - args: class with args parameter
        """
        self.state_size = args.state_size
        self.action_size = args.action_size
        self.bs = args.bs
        self.gamma = args.gamma
        self.epsilon = args.epsilon
        self.tau = args.tau
        self.discrete = args.discrete
        self.randomer = OUNoise(args.action_size)
        self.buffer = ReplayBuffer(args.max_buff)

        self.actor = Actor(self.state_size, self.action_size)
        self.actor_target = Actor(self.state_size, self.action_size)
        self.actor_opt = AdamW(self.actor.parameters(), args.lr_actor)

        self.critic = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)
        self.critic_opt = AdamW(self.critic.parameters(), args.lr_critic)

        hard_update(self.actor_target, self.actor)
        hard_update(self.critic_target, self.critic)
Example #11
    def __init__(self,
                 alpha=0.0003,
                 beta=0.0003,
                 input_dims=[8],
                 env=None,
                 gamma=0.99,
                 n_actions=2,
                 max_size=1000000,
                 tau=0.005,
                 layer1_size=256,
                 layer2_size=256,
                 batch_size=256,
                 reward_scale=2):
        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.batch_size = batch_size
        self.n_actions = n_actions

        self.actor = ActorNetwork(n_actions=n_actions,
                                  name='actor',
                                  max_action=env.action_space.high)
        self.critic_1 = CriticNetwork(n_actions=n_actions, name='critic_1')
        self.critic_2 = CriticNetwork(n_actions=n_actions, name='critic_2')
        self.value = ValueNetwork(name='value')
        self.target_value = ValueNetwork(name='target_value')

        self.actor.compile(optimizer=Adam(learning_rate=alpha))
        self.critic_1.compile(optimizer=Adam(learning_rate=beta))
        self.critic_2.compile(optimizer=Adam(learning_rate=beta))
        self.value.compile(optimizer=Adam(learning_rate=beta))
        self.target_value.compile(optimizer=Adam(learning_rate=beta))

        self.scale = reward_scale
        self.update_network_parameters(tau=1)
Example #12
    def __init__(self, config: Config):
        self.config = config
        self.is_training = True
        # self.buffer = deque(maxlen=self.config.max_buff)
        self.buffer = ReplayBuffer(self.config.max_buff)

        self.actor = Actor(self.config.state_dim, self.config.action_dim,
                           self.config.max_action)
        self.actor_target = Actor(self.config.state_dim,
                                  self.config.action_dim,
                                  self.config.max_action)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = Adam(self.actor.parameters(),
                                    lr=self.config.learning_rate)

        self.critic_1 = Critic(self.config.state_dim, self.config.action_dim)
        self.critic_1_target = Critic(self.config.state_dim,
                                      self.config.action_dim)
        self.critic_1_target.load_state_dict(self.critic_1.state_dict())
        self.critic_1_optimizer = Adam(self.critic_1.parameters(),
                                       lr=self.config.learning_rate)

        self.critic_2 = Critic(self.config.state_dim, self.config.action_dim)
        self.critic_2_target = Critic(self.config.state_dim,
                                      self.config.action_dim)
        self.critic_2_target.load_state_dict(self.critic_2.state_dict())
        self.critic_2_optimizer = Adam(self.critic_2.parameters(),
                                       lr=self.config.learning_rate)

        self.MseLoss = nn.MSELoss()

        if self.config.use_cuda:
            self.cuda()
Example #13
    def __init__(self,
                 input_dims,
                 alpha=0.001,
                 beta=0.002,
                 env=None,
                 gamma=0.99,
                 n_actions=2,
                 max_size=1000000,
                 tau=0.005,
                 fc1=400,
                 fc2=300,
                 batch_size=64,
                 noise=0.1):
        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.batch_size = batch_size
        self.n_actions = n_actions
        self.noise = noise
        self.max_action = env.action_space.high[0]
        self.min_action = env.action_space.low[0]

        self.actor = ActorNetwork(n_actions=n_actions, name='actor')
        self.critic = CriticNetwork(name='critic')
        self.target_actor = ActorNetwork(n_actions=n_actions,
                                         name='target_actor')
        self.target_critic = CriticNetwork(name='target_critic')

        self.actor.compile(optimizer=Adam(learning_rate=alpha))
        self.critic.compile(optimizer=Adam(learning_rate=beta))
        self.target_actor.compile(optimizer=Adam(learning_rate=alpha))
        self.target_critic.compile(optimizer=Adam(learning_rate=beta))

        self.update_network_parameters(tau=1)
Example #14
    def __init__(self, alpha=0.0003, beta=0.0003, input_dims=[8], env=None,
                 gamma=0.99, n_actions=2, max_size=1000000, tau=0.005,
                 ent_alpha=0.0001, batch_size=256, reward_scale=2,
                 layer1_size=256, layer2_size=256, chkpt_dir='tmp/sac'):
        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.batch_size = batch_size
        self.n_actions = n_actions
        self.ent_alpha = ent_alpha
        self.reward_scale = reward_scale

        self.actor = ActorNetwork(alpha, input_dims, n_actions=n_actions,
                                  fc1_dims=layer1_size, fc2_dims=layer2_size,
                                  name='actor', chkpt_dir=chkpt_dir)

        self.critic_1 = CriticNetwork(beta, input_dims, n_actions=n_actions,
                                      fc1_dims=layer1_size, fc2_dims=layer2_size,
                                      name='critic_1', chkpt_dir=chkpt_dir)
        self.critic_2 = CriticNetwork(beta, input_dims, n_actions=n_actions,
                                      fc1_dims=layer1_size, fc2_dims=layer2_size,
                                      name='critic_2', chkpt_dir=chkpt_dir)
        self.target_critic_1 = CriticNetwork(beta, input_dims, n_actions=n_actions,
                                             fc1_dims=layer1_size, fc2_dims=layer2_size,
                                             name='target_critic_1', chkpt_dir=chkpt_dir)
        self.target_critic_2 = CriticNetwork(beta, input_dims, n_actions=n_actions,
                                             fc1_dims=layer1_size, fc2_dims=layer2_size,
                                             name='target_critic_2', chkpt_dir=chkpt_dir)

        self.update_network_parameters(tau=1)
Example #15
    def __init__(self, state_shape, action_size, seed, cnn=False):
        """Initialize an Agent object.

        Params
        ======
            state_shape: shape (dimensions) of each state
            action_size (int): dimension of each action
            seed (int): random seed
            cnn (bool): whether to use convolutional NN
        """
        self.state_shape = state_shape
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.cnn = cnn

        if cnn:
            self.qnetwork_local = QNetworkFullyConvolutional(
                state_shape, action_size, seed).to(device)
            self.qnetwork_target = QNetworkFullyConvolutional(
                state_shape, action_size, seed).to(device)
        else:
            self.qnetwork_local = QNetworkFullyConnected(
                state_shape, action_size, seed).to(device)
            self.qnetwork_target = QNetworkFullyConnected(
                state_shape, action_size, seed).to(device)

        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
Example #16
    def __init__(self, env, model, target_model, config, name_agent="dqn"):

        self.name_agent = name_agent

        self.dim_space = env.observation_space.shape[0]
        self.nb_actions = env.action_space.n
        self.epsilon = config.epsilon_start

        self.epsilon_final = config.epsilon_final
        self.epsilon_start = config.epsilon_start
        self.epsilon_decay = config.epsilon_decay

        self.gamma = config.gamma
        self.replay_buffer = ReplayBuffer(10000, config.batch_size)
        self.environment = env
        self.batch_size = config.batch_size
        self.update_nb_iter = config.update_nb_iter

        # q0 (online network)
        self.model = model
        # q0 bar (target network)
        self.target_model = target_model
        self.optimizer = optim.Adam(self.model.parameters(),
                                    lr=config.learning_rate)

        # training logs
        self.loss_data = []
        self.rewards = []
Example #17
    def __init__(self, n_agents, act_spcs, ob_spcs, writer, args):
        self.args = args
        self.memory = ReplayBuffer(args.buffer_length, n_agents, device)
        # self.memory = ReplayMemory(args.buffer_length, n_agents, device)
        self.use_maddpg = args.algo == "maddpg"
        self.use_sac = args.use_sac
        self.use_td3 = args.use_td3
        self.use_single_q = args.single_q
        self.all_obs = args.all_obs
        self.n_agents = n_agents
        self.act_spcs = act_spcs
        self.ob_spcs = ob_spcs
        qnet_actspcs = [np.sum(self.act_spcs) if self.use_maddpg else self.act_spcs[i]
                        for i in range(n_agents)]
        qnet_obspcs = [np.sum(self.ob_spcs) if self.use_maddpg else self.ob_spcs[i]
                       for i in range(n_agents)]
        if self.use_sac and not self.use_td3:
            self.agents = [SAC_agent(self.act_spcs[i],
                                     qnet_obspcs[i] if self.all_obs else self.ob_spcs[i],
                                     qnet_obspcs[i], qnet_actspcs[i])
                           for i in range(n_agents)]
        elif self.use_td3:
            self.agents = [TD3_agent(self.act_spcs[i],
                                     qnet_obspcs[i] if self.all_obs else self.ob_spcs[i],
                                     qnet_obspcs[i], qnet_actspcs[i])
                           for i in range(n_agents)]
        else:
            self.agents = [DDPG_agent(self.act_spcs[i],
                                      qnet_obspcs[i] if self.all_obs else self.ob_spcs[i],
                                      qnet_obspcs[i], qnet_actspcs[i])
                           for i in range(n_agents)]
        self.n_steps = 0
        self.n_updates = 0
        self.writer = writer
        self.criterion = nn.MSELoss()
        self.sac_alpha = args.sac_alpha
        self.agent_actions = [[] for i in range(self.n_agents)]
Example #18
    def __init__(self,
                 num_agents,
                 x_dim,
                 o_dim,
                 a_dim,
                 lr_actor=1e-3,
                 lr_critic=1e-3,
                 batch_size=16,
                 gamma=0.99,
                 tau=0.001,
                 buffer_size=int(1e5),
                 seed=1234):

        self.num_agents = num_agents
        self.x_dim = x_dim
        self.o_dim = o_dim
        self.a_dim = a_dim
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.buffer_size = buffer_size
        self.seed = seed

        self.buffer = ReplayBuffer(buffer_size, batch_size, seed)
        self.agents = [DDPGAgent(num_agents, id, x_dim, o_dim, a_dim, lr_actor, lr_critic, gamma, seed) \
                       for id in range(num_agents)]
Example #19
    def __init__(self,
                 net,
                 o_dim,
                 a_dim,
                 lr=1e-3,
                 batch_size=16,
                 algorithm="ddqn",
                 gamma=0.99,
                 tau=1e-3,
                 buffer_size=int(1e6)):
        """
        o_dim: observation space dim (or # of channels)
        a_dim: action space dimension
        """
        self.o_dim = o_dim
        self.a_dim = a_dim
        self.lr = lr
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.buffer_size = buffer_size

        if algorithm.lower() in ("dqn"):
            self.algorithm = "dqn"
        elif algorithm.lower() in ("ddqn", "double dqn", "doubledqn"):
            self.algorithm = "ddqn"
        else:
            raise TypeError("cannot recognize algorithm")

        self.buffer = ReplayBuffer(buffer_size, batch_size)

        self.online_net = net(o_dim, a_dim).to(self.device)
        self.target_net = net(o_dim, a_dim).to(self.device)

        self.optimizer = optim.Adam(self.online_net.parameters(), lr=lr)
Example #20
    def __init__(self, state_size, action_size, random_seed):

        self.state_size = state_size
        self.action_size = action_size
        self.seed = random_seed

        # ------------------ actor ------------------ #
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)

        # ------------------ critic ----------------- #
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)

        # ------------------ optimizers ------------- #
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC)

        # ----------------------- initialize target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, 1)
        self.soft_update(self.actor_local, self.actor_target, 1)
        self.t_step = 0

        # Noise process
        self.noise = OUNoise(action_size, random_seed)
        # Replay Buffer
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   device, random_seed)
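
This agent, like Examples #3, #5 and #11, performs a soft update with a coefficient of 1 right after construction. The sketch below shows the usual Polyak update such a `soft_update` performs; with tau=1 the target becomes an exact copy of the local network, which is why it is called once at initialization. The body is an assumption that mirrors the call signature above.

    def soft_update(self, local_model, target_model, tau):
        # theta_target <- tau * theta_local + (1 - tau) * theta_target
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)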
Example #21
    def __init__(self, state_size, action_size, num_agents):
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.whole_action_dim = self.action_size * self.num_agents
        self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE)  # Replay memory
        self.maddpg_agents = [DDPG(state_size, action_size, num_agents),
                              DDPG(state_size, action_size, num_agents)]  # create agents
        self.episodes_before_training = EPISODES_BEFORE_TRAINING
Example #22
    def __init__(self, state_size, action_size, num_agents, random_seed):
        self.agents = [
            DDPGAgent(state_size, action_size, random_seed)
            for _ in range(num_agents)
        ]
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   device, random_seed)
        self.t_step = 0
Example #23
    def reset_parameters(self):
        self._q.reset_parameters()
        self._q_target.reset_parameters()

        hard_update(self._q_target, self._q)

        self._pi.reset_parameters()
        if self._use_rbuffer:
            self._replay_buffer = ReplayBuffer(self._rbuffer_max_size)
Example #24
    def __init__(self, env_name,
                 num_quantiles=32, fqf_factor=0.000001*0.1, ent_coef=0.001,
                 state_embedding_dim=3136, quantile_embedding_dim=64,
                 gamma=0.99, n_frames=4, batch_size=32,
                 buffer_size=1000000,
                 update_period=8,
                 target_update_period=10000):

        self.env_name = env_name
        self.num_quantiles = num_quantiles
        self.state_embedding_dim = state_embedding_dim
        self.quantile_embedding_dim = quantile_embedding_dim
        self.k = 1.0
        self.ent_coef = ent_coef
        self.n_frames = n_frames
        self.action_space = gym.make(self.env_name).action_space.n

        self.fqf_network = FQFNetwork(
            action_space=self.action_space,
            num_quantiles=self.num_quantiles,
            state_embedding_dim=self.state_embedding_dim,
            quantile_embedding_dim=self.quantile_embedding_dim)

        self.target_fqf_network = FQFNetwork(
            action_space=self.action_space,
            num_quantiles=self.num_quantiles,
            state_embedding_dim=self.state_embedding_dim,
            quantile_embedding_dim=self.quantile_embedding_dim)

        self._define_network()

        self.optimizer = tf.keras.optimizers.Adam(
            learning_rate=0.00015, epsilon=0.01/32)

        # fpl: fraction proposal layer
        self.optimizer_fpl = tf.keras.optimizers.Adam(
            learning_rate=0.00005 * fqf_factor,
            epsilon=0.0003125)

        self.gamma = gamma
        self.replay_buffer = ReplayBuffer(max_len=buffer_size)
        self.batch_size = batch_size
        self.update_period = update_period
        self.target_update_period = target_update_period
        self.steps = 0
Example #25
    def __init__(self,
                 state_size,
                 action_size,
                 num_agents=2,
                 eps_before_train=500,
                 gamma=0.99,
                 batch_size=128,
                 buffer_size=int(1e5),
                 lr_actor=1e-4,
                 lr_critic=1e-3,
                 weight_decay=0,
                 tau=1e-3,
                 noise_weight=1.0,
                 noise_decay=0.999998,
                 noise_min=1e-3,
                 seed=0,
                 device="cuda:0"):

        # (self, state_size, action_size, num_agents=2, random_seed=1, lr_actor=2e-4, lr_critic=1e-3,
        #          weight_decay=0, tau=2e-3, device=device)

        torch.manual_seed(seed)
        np.random.seed(seed)

        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.action_dim = action_size * num_agents

        self.eps_before_train = eps_before_train
        self.gamma = gamma
        self.batch_size = batch_size
        self.buffer_size = buffer_size

        self.lr_actor = lr_actor
        self.lr_critic = lr_critic
        self.weight_decay = weight_decay
        self.tau = tau

        self.noise_weight = noise_weight
        self.noise_decay = noise_decay
        self.noise_min = noise_min
        self.device = device
        self.i_episode = 0

        self.agents = [
            DDPG(self.state_size,
                 self.action_size,
                 self.num_agents,
                 random_seed=2 * i * seed,
                 lr_actor=self.lr_actor,
                 lr_critic=self.lr_critic,
                 weight_decay=self.weight_decay,
                 tau=self.tau,
                 device=self.device) for i in range(self.num_agents)
        ]
        self.memory = ReplayBuffer(self.action_size, self.buffer_size, seed)
Example #26
    def __init__(
        self,
        policy,
        env,
        gamma,
        learning_rate,
        buffer_size,
        exploration_type,
        exploration_frac,
        exploration_ep,
        exploration_initial_eps,
        exploration_final_eps,
        double_q,
        policy_kwargs,
        seed,
        intent
        ):

        super(TabularRLModel, self).__init__(
            policy=policy,
            env=env, 
            policy_kwargs=policy_kwargs,
            seed=seed)

        self.gamma = gamma
        self.learning_rate = learning_rate
        self.buffer_size = buffer_size
        self.exploration_type = exploration_type
        self.exploration_frac = exploration_frac
        self.exploration_ep = exploration_ep
        self.exploration_initial_eps = exploration_initial_eps
        self.exploration_final_eps = exploration_final_eps
        self.double_q = double_q
        self.intent = intent
        # self.policy_kwargs = {} if policy_kwargs is None else policy_kwargs
        # self.policy = policy(self.observation_space, self.action_space, intent=True)

        self.policy_kwargs = get_default_args(self.policy)
        self.policy_kwargs['ob_space'] = self.observation_space
        self.policy_kwargs['ac_space'] = self.action_space
        self.policy_kwargs['intent'] = self.intent

        if policy_kwargs is not None:
            for key, val in policy_kwargs.items():
                self.policy_kwargs[key] = val
        # self.policy_kwargs['transform_func'] = transform_func

        # if policy_kwargs is None:
        #     self.policy = policy(self.observation_space, self.action_space,
        #                          intent=True, device=self.device)
        # else:
        self.policy = policy(**self.policy_kwargs)

        if self.buffer_size is None:
            self.replay_buffer = None
        else:
            self.replay_buffer = ReplayBuffer(self.buffer_size)
Example #27
    def __init__(self, load_from=None, will_train=True):
        self.env = TorcsEnv(
            path='/usr/local/share/games/torcs/config/raceman/quickrace.xml')
        self.args = SAC_args()
        self.buffer = ReplayBuffer(self.args.buffer_size)

        action_dim = self.env.action_space.shape[0]
        state_dim = self.env.observation_space.shape[0]
        hidden_dim = 256

        self.action_size = action_dim
        self.state_size = state_dim

        self.value_net = ValueNetwork(state_dim,
                                      hidden_dim).to(self.args.device)
        self.target_value_net = ValueNetwork(state_dim,
                                             hidden_dim).to(self.args.device)

        self.soft_q_net1 = SoftQNetwork(state_dim, action_dim,
                                        hidden_dim).to(self.args.device)
        self.soft_q_net2 = SoftQNetwork(state_dim, action_dim,
                                        hidden_dim).to(self.args.device)

        self.policy_net = PolicyNetwork(state_dim, action_dim,
                                        hidden_dim).to(self.args.device)

        self.target_value_net.load_state_dict(self.value_net.state_dict())

        self.value_criterion = nn.MSELoss()
        self.soft_q_loss1 = nn.MSELoss()
        self.soft_q_loss2 = nn.MSELoss()

        self.value_opt = optim.Adam(self.value_net.parameters(),
                                    lr=self.args.lr)
        self.soft_q_opt1 = optim.Adam(self.soft_q_net1.parameters(),
                                      lr=self.args.lr)
        self.soft_q_opt2 = optim.Adam(self.soft_q_net2.parameters(),
                                      lr=self.args.lr)
        self.policy_opt = optim.Adam(self.policy_net.parameters(),
                                     lr=self.args.lr)

        if will_train:
            current_time = time.strftime('%d-%b-%y-%H.%M.%S', time.localtime())
            self.plot_folder = f'plots/{current_time}'
            self.model_save_folder = f'model/{current_time}'
            make_sure_dir_exists(self.plot_folder)
            make_sure_dir_exists(self.model_save_folder)
            self.cp = Checkpoint(self.model_save_folder)

        if load_from is not None:
            try:
                self.load_checkpoint(load_from)
            except FileNotFoundError:
                print(f'{load_from} not found. Running default.')
        else:
            print('Starting from scratch.')
Example #28
    def test_append(self):
        count = 100
        start_length = count // 2
        max_length = count
        buffer = ReplayBuffer(start_length=start_length, max_length=max_length)
        for append_count in range(max_length * 2):
            buffer.append(append_count)
            self.assertEqual(len(buffer.buffer), min(append_count + 1, max_length),
                             "Incorrect buffer size.")
            self.assertEqual(buffer.buffer[0], max(0, (append_count + 1) - max_length),
                             "Incorrect first value.")
            self.assertEqual(buffer.buffer[-1], append_count, "Incorrect last value.")
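
The test above pins down the FIFO semantics expected of this ReplayBuffer: an internal `.buffer` and an `append()` that evicts the oldest item once `max_length` is reached. Below is a minimal buffer that would satisfy it; treating `start_length` as a warm-up threshold is an assumption, since the test never exercises it.

from collections import deque

class ReplayBuffer:
    """Minimal FIFO buffer consistent with test_append (a sketch, not the tested code)."""

    def __init__(self, start_length=0, max_length=10000):
        self.start_length = start_length        # assumed: minimum fill before sampling
        self.buffer = deque(maxlen=max_length)  # oldest entries are evicted first

    def append(self, experience):
        self.buffer.append(experience)

    def is_ready(self):
        return len(self.buffer) >= self.start_length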
Example #29
    def __init__(self, config: Config):
        self.config = config
        self.is_training = True
        self.buffer = ReplayBuffer(self.config.max_buff)

        # device placement is handled by self.cuda() below when use_cuda is set
        self.model = DQN(self.config.state_dim, self.config.action_dim)
        self.model_optim = Adam(self.model.parameters(), lr=self.config.learning_rate)

        if self.config.use_cuda:
            self.cuda()
Example #30
	def reset_parameters(self):
		self._actor.reset_parameters()
		self._actor_target.reset_parameters()
		self._critic.reset_parameters()
		self._critic_target.reset_parameters()

		hard_update(self._actor_target, self._actor)
		hard_update(self._critic_target, self._critic)

		self._steps = 0
		self._replay_buffer = ReplayBuffer(self._rbuffer_max_size)