Example #1
    def __init__(self,
                 gamma,
                 epsilon,
                 lr,
                 n_actions,
                 input_dims,
                 mem_size,
                 expert_mem_size,
                 batch_size,
                 n_step,
                 lam_n_step,
                 lam_sup,
                 lam_L2,
                 eps_min=0.01,
                 eps_dec=5e-7,
                 replace=1000,
                 chkpt_dir='tmp/dqn',
                 algo=None,
                 env_name=None):

        self.gamma = gamma
        self.epsilon = epsilon
        self.lr = lr

        self.input_dims = input_dims
        self.n_actions = n_actions
        self.batch_size = batch_size

        self.eps_min = eps_min
        self.eps_dec = eps_dec

        self.replace_target_cntr = replace
        self.action_space = [i for i in range(self.n_actions)]
        self.learn_step_counter = 0

        self.n_step = n_step

        self.lam_n_step = lam_n_step
        self.lam_sup = lam_sup
        self.lam_L2 = lam_L2

        self.memory = ReplayBuffer(mem_size, input_dims, n_actions)
        self.demo_memory = DemoReplayBuffer(expert_mem_size, input_dims,
                                            n_actions)

        self.q_eval = DeepQNetwork(self.lr,
                                   self.lam_L2,
                                   self.n_actions,
                                   input_dims=self.input_dims,
                                   model_name=env_name + "_" + algo +
                                   "_q_eval",
                                   model_dir=chkpt_dir)

        self.q_next = DeepQNetwork(self.lr,
                                   self.lam_L2,
                                   self.n_actions,
                                   input_dims=self.input_dims,
                                   model_name=env_name + "_" + algo +
                                   "_q_next",
                                   model_dir=chkpt_dir)
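Note: Examples #1 and #10 construct the buffer as ReplayBuffer(mem_size, input_dims, n_actions); the actual class differs per repository and is not shown here. The snippet below is only a minimal sketch of a buffer with that constructor signature, and the store_transition / sample_buffer method names are assumptions for illustration, not taken from the examples.

import numpy as np

class SimpleReplayBuffer:
    """Illustrative sketch only: a uniform circular replay buffer with the
    (mem_size, input_dims, n_actions) constructor signature used above."""

    def __init__(self, mem_size, input_dims, n_actions):
        self.mem_size = mem_size
        self.n_actions = n_actions  # kept only to mirror the signature above
        self.mem_cntr = 0
        self.state_memory = np.zeros((mem_size, *input_dims), dtype=np.float32)
        self.new_state_memory = np.zeros((mem_size, *input_dims), dtype=np.float32)
        self.action_memory = np.zeros(mem_size, dtype=np.int64)
        self.reward_memory = np.zeros(mem_size, dtype=np.float32)
        self.terminal_memory = np.zeros(mem_size, dtype=np.bool_)

    def store_transition(self, state, action, reward, state_, done):
        # overwrite the oldest transition once the buffer is full
        idx = self.mem_cntr % self.mem_size
        self.state_memory[idx] = state
        self.new_state_memory[idx] = state_
        self.action_memory[idx] = action
        self.reward_memory[idx] = reward
        self.terminal_memory[idx] = done
        self.mem_cntr += 1

    def sample_buffer(self, batch_size):
        # uniform sampling; assumes at least batch_size transitions are stored
        max_mem = min(self.mem_cntr, self.mem_size)
        batch = np.random.choice(max_mem, batch_size, replace=False)
        return (self.state_memory[batch], self.action_memory[batch],
                self.reward_memory[batch], self.new_state_memory[batch],
                self.terminal_memory[batch])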
Example #2
    def __init__(self, replay_buffer_len=5000, learning_rate=0.001, epsilon=1, verbose=0):
        self.gamma = 1
        self.epsilon = epsilon
        self.min_epsilon = 0.1
        self.delta_epsilon = 0.025
        self.epsilon_update_freq = 5000

        self.lr = learning_rate
        self.target_update_freq = 250
        self.Optimizer = keras.optimizers.RMSprop(
            learning_rate=self.lr)

        self.model = keras.Sequential(
            [
                keras.layers.Flatten(input_shape=[5, 5, 1]),
                keras.layers.Dense(
                    units=5, name="layer1"),
            ]
        )

        self.replay_buffer = ReplayBuffer(
            size=replay_buffer_len, frame_history_len=1)
        self.mini_batch_size = 32

        if verbose > 0:
            self.model.summary()

        self.target_model = keras.models.clone_model(self.model)
        self.update_target_weights()

        self.update_steps = 0
        self.total_reward = 0
Example #3
    def __init__(self,
                 env,
                 q_net,
                 loss_func,
                 opt,
                 lr=0.00025,
                 imsize=(84, 84),
                 gamma=0.99,
                 tau=0.001,
                 buffer_size=1e6,
                 log_dir=None,
                 weight_dir=None):
        self.env = env
        self.q_net = q_net.type(dtype)
        self.target_q_net = copy.deepcopy(q_net).type(dtype)
        self.loss_func = loss_func
        self.opt = opt(self.q_net.parameters(), lr)
        self.gamma = gamma
        self.tau = tau
        self.buffer_size = buffer_size

        self.n_action_space = env.action_space.n
        self._state_size = env.observation_space.shape
        self._imsize = imsize

        self.train_reward_list = []
        self.test_reward_list = []
        self.train_error_list = []
        self._buffer = ReplayBuffer([
            1,
        ], self._state_size, imsize, buffer_size)

        self.log_dir = log_dir if log_dir is not None else "./logs/"
        self.weight_dir = weight_dir if weight_dir is not None else "./checkpoints/"
Example #4
    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
Example #5
 def __init__(self, hyperparameters):
     """
     Args:
         hyperparameters (dict): a dictionary of hyperparameters
     Returns:
         None
     """
     # Extract hyperparameters
     self.lr = hyperparameters['learning_rate']
     self.discount = hyperparameters['discount_rate']
     self.num_batch_transitions = hyperparameters['num_batch_transitions']
     self.state_dim = hyperparameters['state_dim']
     self.action_dim = hyperparameters['action_dim']
     self.total_train_steps = hyperparameters['total_train_steps']
     self.max_episode_length = hyperparameters['max_episode_length']
     self.num_train_epochs = hyperparameters['num_train_epochs']
     self.device = hyperparameters['device']
     # Initialize actor/critic networks
     self.actor = Actor(self.state_dim, self.action_dim)
     self.critic = Critic(self.state_dim, self.action_dim)
     self.actor_optim = optim.Adam(self.actor.parameters(), lr=self.lr)
     self.critic_optim = optim.Adam(self.critic.parameters(), lr=self.lr)
     # Initialize replay buffer and environment
     self.replay_buffer = ReplayBuffer(self.num_batch_transitions)
     self.environment = KukaGymEnv(renders=True)
Example #6
    def __init__(self,
                 agent_list,
                 action_size,
                 learn_period=10,
                 learn_sampling_num=20,
                 buffer_size=int(1e6),
                 batch_size=128,
                 random_seed=0):
        super().__init__()

        if len(agent_list) == 0:
            raise Exception('len(agent_list) = 0')

        self.agent_list = agent_list

        self.learn_period = learn_period
        self.learn_sampling_num = learn_sampling_num

        self.batch_size = batch_size

        self.memory = ReplayBuffer(action_size, buffer_size, batch_size,
                                   random_seed, device)

        self.time_step = 0

        # debugging constant
        self.__debug_num_agents = len(agent_list)
        self.__debug_state_size = agent_list[0].state_size
        self.__debug_action_size = agent_list[0].action_size
Example #7
class DQNAgent(nn.Module):
    def __init__(self,
                 state_dim: int,
                 action_dim: int,
                 hidden_sizes: list = [128, 128],
                 activation=nn.ReLU,
                 buffer_size: int = 1000000,
                 batch_size: int = 32,
                 lr: float = 1e-4,
                 gamma: float = 0.95,
                 theta: float = 0.05):
        super(DQNAgent, self).__init__()
        self.q_net = mlp([state_dim] + hidden_sizes + [action_dim],
                         activation=activation)
        self.target_net = mlp([state_dim] + hidden_sizes + [action_dim],
                              activation=activation)
        self.target_net.load_state_dict(self.q_net.state_dict())
        self.buffer = ReplayBuffer(buffer_size)
        self.batch_size = batch_size
        self.optimizer = Adam(self.q_net.parameters(), lr=lr)
        self.gamma = gamma
        self.theta = theta

    def forward(self, x):
        return self.q_net(x)

    def save_memory(self, ex):
        self.buffer.push(ex)

    def train(self, k=4, max_norm=5.):
        losses = []
        for _ in range(k):
            experiences = self.buffer.sample(self.batch_size)
            s, a, r, t, mask = get_batch(experiences)
            next_q = self.target_net(t).max(-1, keepdim=True)[0]
            target = r + self.gamma * mask * next_q.detach()
            pred = self.q_net(s).gather(-1, a)
            loss = F.mse_loss(pred, target)
            self.optimizer.zero_grad()
            loss.backward()
            clip_grad_norm_(self.q_net.parameters(), max_norm)
            self.optimizer.step()
            losses.append(loss.item())
        self.target_update()
        return np.mean(losses)

    def train_start(self):
        return (len(self.buffer) >= self.batch_size)

    def target_update(self):
        for target, param in zip(self.target_net.parameters(),
                                 self.q_net.parameters()):
            target.data = (1 -
                           self.theta) * target.data + self.theta * param.data


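Note: Example #7 is the most self-contained snippet here, so a minimal driver loop is sketched below. The classic gym API (reset returning a state, step returning a 4-tuple), the epsilon-greedy policy, and the (state, action, reward, next_state, mask) tuple layout expected by get_batch are assumptions, not part of the original example.

import random
import torch

def run_dqn(env, agent, episodes=100, epsilon=0.1):
    # minimal driver for the DQNAgent above (classic gym API assumed);
    # the experience tuple layout must match whatever get_batch() expects
    for _ in range(episodes):
        state, done = env.reset(), False
        while not done:
            # epsilon-greedy action selection over the online Q-network
            if random.random() < epsilon:
                action = env.action_space.sample()
            else:
                with torch.no_grad():
                    q_values = agent(torch.as_tensor(state, dtype=torch.float32))
                action = int(q_values.argmax().item())
            next_state, reward, done, _ = env.step(action)
            # mask = 1 for non-terminal transitions, matching r + gamma * mask * max Q'
            agent.save_memory((state, action, reward, next_state, 1.0 - float(done)))
            state = next_state
            if agent.train_start():
                agent.train()  # samples k batches and soft-updates the target net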
Example #8
def train_agent(path,
                env,
                agent,
                seed=0,
                num_episodes=100,
                num_steps=100,
                batch_size=128,
                replay_buffer_size=1000000):

    if not os.path.isdir(path):
        os.makedirs(path)
    os.chdir(path)

    env.seed(seed)
    random.seed(seed)

    with open('first_policy.pickle', 'wb') as f:
        pickle.dump(agent.policy_net, f)

    replay_buffer = ReplayBuffer(replay_buffer_size)

    rewards = []
    max_angle = []
    ave_angle = []
    for episode in range(num_episodes):
        state = env.reset()
        episode_reward = 0
        max_th = 0
        ave_th = 0
        for step in range(num_steps):
            action = agent.policy_net.get_action(state) + np.array([0.0])
            next_state, reward, done, _ = env.step(action)
            replay_buffer.push(state, action, reward, next_state, done)

            if len(replay_buffer) > batch_size:
                agent.train_step(replay_buffer=replay_buffer,
                                 batch_size=batch_size)

            state = next_state
            episode_reward += reward
            th = np.arccos(state[0]) * np.sign(state[1])
            max_th = max(max_th, abs(th))
            ave_th += abs(th)

        rewards.append(episode_reward)
        max_angle.append(max_th)
        ave_angle.append(ave_th / num_steps)

    with open('last_policy.pickle', 'wb') as f:
        pickle.dump(agent.policy_net, f)
    with open('rewards.pickle', 'wb') as f:
        pickle.dump(rewards, f)
    with open('max_angle.pickle', 'wb') as f:
        pickle.dump(max_angle, f)
    with open('ave_angle.pickle', 'wb') as f:
        pickle.dump(ave_angle, f)

    plt.figure(figsize=(10, 6))
    plt.plot(rewards)
    plt.title('Reward vs Episode')
    plt.savefig('rewards.png', dpi=100)
    plt.close()
Example #9
    def __init__(
        self,
        nc: int,
        nz: int,
        ngf: int,
        ndf: int,
        ng_blocks: int,
        nd_layers: int,
        ksize_d: int,
        norm_type: str,
        lambda_A: float,
        lambda_B: float,
        lambda_idt: float,
    ) -> None:
        """Construct CycleGAN.

        Parameters:
        -----------
            nc:          the number of image channels
            nz:          size of z latent vector
            ngf:         size of feature maps in generator
            ndf:         size of feature maps in discriminator
            ng_blocks:   the number of Residual blocks
            nd_layers:   the number of conv layers in the discriminator
            ksize_d:        kernel size of conv layer in the discriminator
            norm_type:   normalization layer type `batch` | `instance`
            lambda_A:    forward cycle loss weight
            lambda_B:    backward cycle loss weight
            lambda_idt:  identity loss weight
        """
        super(CycleGAN, self).__init__()
        # Generators
        self.G_AB = ResidualGenerator(nz, nc, ngf, norm_type, ng_blocks)
        init_weights(self.G_AB)
        self.G_BA = ResidualGenerator(nz, nc, ngf, norm_type, ng_blocks)
        init_weights(self.G_BA)
        # Discriminators
        self.D_A = PatchDiscriminator(nc, ksize_d, ndf, nd_layers, norm_type)
        init_weights(self.D_A)
        self.D_B = PatchDiscriminator(nc, ksize_d, ndf, nd_layers, norm_type)
        init_weights(self.D_B)
        # Replay Buffer
        self.replay_buffer = {
            "fake_A": ReplayBuffer(),
            "fake_B": ReplayBuffer()
        }
        # Optimizers
        self.optimizers = {}
        # Schedulers
        self.schedulers = {}
        # Criterions
        self.criterions = {"gan": None, "cycle": None, "idt": None}
        # Loss weights
        self.lambdas = {"A": lambda_A, "B": lambda_B, "idt": lambda_idt}
Example #10
    def __init__(self,
                 gamma,
                 epsilon,
                 lr,
                 n_actions,
                 input_dims,
                 mem_size,
                 mem_alpha,
                 batch_size,
                 beta,
                 beta_max,
                 beta_increment,
                 eps_min=0.01,
                 eps_dec=5e-7,
                 replace=1000,
                 chkpt_dir='tmp/dqn',
                 algo=None,
                 env_name=None):

        self.gamma = gamma
        self.epsilon = epsilon
        self.lr = lr

        self.beta = beta
        self.beta_max = beta_max
        self.beta_increment = beta_increment

        self.input_dims = input_dims
        self.n_actions = n_actions
        self.batch_size = batch_size

        self.eps_min = eps_min
        self.eps_dec = eps_dec

        self.replace_target_cntr = replace
        self.action_space = [i for i in range(self.n_actions)]
        self.learn_step_counter = 0

        self.memory = ReplayBuffer(mem_size, input_dims, n_actions, mem_alpha)

        self.q_eval = DeepQNetwork(self.lr,
                                   self.n_actions,
                                   input_dims=self.input_dims,
                                   model_name=env_name + "_" + algo +
                                   "_q_eval",
                                   model_dir=chkpt_dir)

        self.q_next = DeepQNetwork(self.lr,
                                   self.n_actions,
                                   input_dims=self.input_dims,
                                   model_name=env_name + "_" + algo +
                                   "_q_next",
                                   model_dir=chkpt_dir)
Example #11
    def __init__(self, state_size, action_size, actor_lr, critic_lr,
                 random_seed, mu, theta, sigma, buffer_size, batch_size,
                 epsilon_start, epsilon_min, epsilon_decay, gamma, tau,
                 n_time_steps, n_learn_updates, device):

        self.state_size = state_size
        self.action_size = action_size

        self.actor_lr = actor_lr
        self.critic_lr = critic_lr

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, name="Actor_local")
        self.actor_target = Actor(state_size, action_size, name="Actor_target")
        self.actor_optimizer = Adam(learning_rate=self.actor_lr)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size,
                                   action_size,
                                   name="Critic_local")
        self.critic_target = Critic(state_size,
                                    action_size,
                                    name="Critic_target")
        self.critic_optimizer = Adam(learning_rate=self.critic_lr)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.noise = GaussianNoise(action_size, random_seed, mu, sigma)
        self.epsilon = epsilon_start
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay

        # Replay memory
        self.batch_size = int(batch_size)
        self.buffer_size = int(buffer_size)
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size,
                                   random_seed)

        # Algorithm parameters
        self.gamma = gamma  # discount factor
        self.tau = tau  # for soft update of target parameters
        self.n_time_steps = n_time_steps  # number of time steps before updating network parameters
        self.n_learn_updates = n_learn_updates  # number of updates per learning step

        # Device
        self.device = device

        tf.keras.backend.clear_session()
Example #12
    def __init__(self, state_size, action_size, num_agents, random_seed):
        """Initialize an MADDPG Agent object.
        Params
        ======
            :param state_size: dimension of each state
            :param action_size: dimension of each action
            :param num_agents: number of inner agents
            :param random_seed: random seed
        """
        super().__init__(state_size, action_size, num_agents, random_seed)
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = random.seed(random_seed)

        self.actors_local = []
        self.actors_target = []
        self.actor_optimizers = []
        self.critics_local = []
        self.critics_target = []
        self.critic_optimizers = []
        for i in range(num_agents):
            # Actor Network (w/ Target Network)
            self.actors_local.append(
                Actor(state_size, action_size, random_seed).to(device))
            self.actors_target.append(
                Actor(state_size, action_size, random_seed).to(device))
            self.actor_optimizers.append(
                optim.Adam(self.actors_local[i].parameters(), lr=LR_ACTOR))
            # Critic Network (w/ Target Network)
            self.critics_local.append(
                Critic(num_agents * state_size, num_agents * action_size,
                       random_seed).to(device))
            self.critics_target.append(
                Critic(num_agents * state_size, num_agents * action_size,
                       random_seed).to(device))
            self.critic_optimizers.append(
                optim.Adam(self.critics_local[i].parameters(),
                           lr=LR_CRITIC,
                           weight_decay=WEIGHT_DECAY))

        # Noise process for each agent
        self.noise = OUNoise((num_agents, action_size), random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

        # debugging variables
        self.step_count = 0
        self.mse_error_list = []
Example #13
    def train_init(self):
        """
        Performs training of Q

        Args:
            exp_schedule: Exploration instance s.t.
                exp_schedule.get_action(best_action) returns an action
            lr_schedule: Schedule for learning rate
        """

        # initialize replay buffer and variables
        self.replay_buffer = ReplayBuffer(self.config.buffer_size,
                                          self.config.state_history)
        self.max_q_values = deque(maxlen=1000)
Example #14
    def train(self):
        start = time.time()
        replay_buffer = ReplayBuffer(self.config.replay_buffer_size,
                                     self.observation_dim, self.action_dim,
                                     self.env.n)
        # replay_buffer = Memory(self.config.replay_buffer_size)
        self.current_obs_n = self.env.reset()
        self.current_episode_length = 0
        for t in range(self.config.num_batches):
            self.current_batch_num = t

            samples = self.sample_n(self.env, replay_buffer,
                                    self.config.train_freq,
                                    self.config.batch_size)

            for i in range(self.env.n):
                agent_net = self.agent_networks[i]
                if t % self.config.eval_freq == 0:
                    agent_net.adapt_param_noise(samples)
                agent_net.train_for_batch_samples(
                    samples, agents_list=self.agent_networks)

            # periodically do a test run to evaluate policies so far
            if t % self.config.eval_freq == 0:
                self.logger.info("Batch " + str(t) + ":")
                self.test_run(self.env, self.config.eval_episodes)

        self.logger.info("- Training all done.")
        self.logger.info("Total training time: " + str(time.time() - start) +
                         " seconds.")
Example #15
def run_training(args):
    env = gym.make(args.env) 
    state_dim = env.observation_space.shape[0] 
    action_dim = env.action_space.shape[0] 
    max_action = float(env.action_space.high[0])
    
    print('States: %i'%(state_dim))
    print('Actions: %i'%(action_dim))

    # TODO
    scale = max_action * np.ones(action_dim)

    agent = TD3Agent(
        state_dim, 
        action_dim, 
        scale, 
        '_'.join([args.model_path, args.date])
        )

    replay_buffer = ReplayBuffer(
        state_dim, 
        action_dim, 
        buffer_size=args.buffer_size,
        batch_size=args.batch_size
        )

    logger = Logger(log_path='_'.join([args.log_path, args.date]))

    run_train_loop(args, env, agent, replay_buffer, logger)
Example #16
    def __init__(self,
                 color: Color,
                 model_name: str,
                 train_policy: TrainablePolicy,
                 immediate_reward: Reward,
                 final_reward: Reward,
                 board_size: int,
                 discount_factor: float = 1.0) -> None:
        super().__init__(color)

        self.weights_path: str = f'weights\\{model_name}_{self.color.name}'
        self.train_policy: TrainablePolicy = train_policy
        self.test_policy: OptimalTrainablePolicy = OptimalTrainablePolicy(
            board_size)
        self.immediate_reward: Reward = immediate_reward
        self.final_reward: Reward = final_reward
        self.board_size = board_size
        self.discount_factor: float = discount_factor

        self.replay_buffer: ReplayBuffer = ReplayBuffer(
            (board_size**2 - 4) // 2)
        self.train_mode: Union[bool, None] = None

        try:
            # create a new model and load existing weights
            self.dnn: Sequential = self.create_model()
            self.load_weights()
        except:
            # loading failed (e.g. no saved weights yet): rebuild the model
            self.dnn: Sequential = self.create_model()
            # and save its initial weights
            self.save_weights()
Example #17
	def __init__(self, color: Color, immediate_reward: ImmediateReward = None, board_size: int = 8):
		super().__init__(color, immediate_reward)
		self.board_size: int = board_size
		self.episode_rewards = []
		self.training_errors = []
		self.train_mode = False
		self.replay_buffer = ReplayBuffer(size=int(10e5))
Example #18
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high
        self.action_range = self.action_high - self.action_low

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2 * (self.action_range)
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters (CartPole)
        # self.gamma = 0.99  # discount factor
        # self.tau = 0.01  # for soft update of target parameters

        # Algorithm parameters (Quadcopter)
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters
Example #19
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high
        
        # learning rate
        actor_learning_rate = 0.0001
        critic_learning_rate = 0.001

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high, actor_learning_rate)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high, actor_learning_rate)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size, critic_learning_rate)
        self.critic_target = Critic(self.state_size, self.action_size, critic_learning_rate)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.001  # for soft update of target parameters
        
        # Score tracker
        self.score = -np.inf
        self.best_score = -np.inf
Example #20
    def __init__(self, state_size, action_size, seed, checkpoint=None):
        """
        Constructor

        :param state_size:
        :param action_size:
        :param seed:
        :param checkpoint: if running from a checkpoint
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = np.random.seed(seed)

        # As in any DQN implementation, we create a local and a target network.
        # In this case we use the Dueling DQN implementation for both networks.

        self.qnetwork_local = DuelingDQNetwork(state_size,
                                               action_size,
                                               seed,
                                               fc1_units=FC1_UNITS,
                                               fc2_units=FC2_UNITS).to(device)
        self.qnetwork_target = DuelingDQNetwork(state_size,
                                                action_size,
                                                seed,
                                                fc1_units=FC1_UNITS,
                                                fc2_units=FC2_UNITS).to(device)

        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        if checkpoint:
            # If we have a checkpoint, we load its state into the networks and the optimizer
            print('Using Checkpoint...')
            self.qnetwork_local.load_state_dict(checkpoint['local_state_dict'])
            self.qnetwork_target.load_state_dict(
                checkpoint['target_state_dict'])
            self.optimizer.load_state_dict(checkpoint['optimizer'])

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
Example #21
    def setup_ddpg(self, args):
        sess = self.sess
        tf.set_random_seed(int(args['random_seed']))

        # Fetch environment state and action space properties
        state_dim = self.env.observation_space["observation"].shape[0]
        action_dim = self.env.action_space.shape[0]
        action_bound = self.env.action_space.high

        # Ensure action bound is symmetric around zero
        assert all(self.env.action_space.high == -self.env.action_space.low)

        self.actor = ActorNetwork(sess, state_dim, action_dim, action_bound,
                                  float(args['actor_lr']), float(args['tau']),
                                  int(args['minibatch_size']))

        self.critic = CriticNetwork(sess, state_dim, action_dim,
                                    float(args['critic_lr']),
                                    float(args['tau']), float(args['gamma']),
                                    self.actor.get_num_trainable_vars())

        self.actor_noise = OrnsteinUhlenbeckActionNoise(
            mu=np.zeros(action_dim))

        # Set up summary Ops
        self.summary_ops, self.summary_vars = build_summaries()

        sess.run(tf.global_variables_initializer())

        # Initialize target network weights
        self.actor.update_target_network()
        self.critic.update_target_network()

        # Initialize replay memory
        self.replay_buffer = ReplayBuffer(int(args['buffer_size']),
                                          int(args['random_seed']))

        # Needed to enable BatchNorm.
        # This hurts the performance on Pendulum but could be useful
        # in other environments.
        tflearn.is_training(True)
Example #22
 def __init__(self,
              state_dim: int,
              action_dim: int,
              hidden_sizes: list = [128, 128],
              activation=nn.ReLU,
              buffer_size: int = 1000000,
              batch_size: int = 32,
              lr: float = 1e-4,
              gamma: float = 0.95,
              theta: float = 0.05):
     super(DQNAgent, self).__init__()
     self.q_net = mlp([state_dim] + hidden_sizes + [action_dim],
                      activation=activation)
     self.target_net = mlp([state_dim] + hidden_sizes + [action_dim],
                           activation=activation)
     self.target_net.load_state_dict(self.q_net.state_dict())
     self.buffer = ReplayBuffer(buffer_size)
     self.batch_size = batch_size
     self.optimizer = Adam(self.q_net.parameters(), lr=lr)
     self.gamma = gamma
     self.theta = theta
Example #23
    def __init__(self,
                 state_size,
                 action_size,
                 seed,
                 gamma=0.99,
                 step_size=1,
                 dueling_dqn=False):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        if dueling_dqn:
            print("Use dueling dqn")
            self.qnetwork_local = NoisyDuelingDQN(state_size, action_size,
                                                  seed).to(device)
            self.qnetwork_target = NoisyDuelingDQN(state_size, action_size,
                                                   seed).to(device)
        else:
            print("Use non-dueling dqn")
            self.qnetwork_local = DQN(state_size, action_size, seed).to(device)
            self.qnetwork_target = DQN(state_size, action_size,
                                       seed).to(device)

        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
        self.gamma = gamma
        self.step_size = step_size
Example #24
    def evaluate(self, env=None, num_episodes=None):
        """
        Evaluation with same procedure as the training
        """
        # log our activity only if default call
        if num_episodes is None:
            self.logger.info("Evaluating...")

        # arguments defaults
        if num_episodes is None:
            num_episodes = self.config.num_episodes_test

        if env is None:
            env = self.env

        # replay memory to play
        replay_buffer = ReplayBuffer(self.config.buffer_size,
                                     self.config.state_history)
        rewards = []

        for i in range(num_episodes):
            total_reward = 0
            state = env.reset()
            state = state.reshape([1, -1, 1])

            while True:
                if self.config.render_test:
                    env.render()

                # store last state in buffer
                idx = replay_buffer.store_frame(state)
                q_input = replay_buffer.encode_recent_observation()

                action = self.get_action(q_input)

                # perform action in env
                new_state, reward, done, info = env.step(action)

                # store in replay memory
                replay_buffer.store_effect(idx, action, reward, done)
                state = new_state
                state = state.reshape([1, -1, 1])

                # count reward
                total_reward += reward
                if done:
                    break

            # updates to perform at the end of an episode
            rewards.append(total_reward)

        avg_reward = np.mean(rewards)
        sigma_reward = np.sqrt(np.var(rewards) / len(rewards))

        if num_episodes >= 1:
            msg = "Average reward: {:04.2f} +/- {:04.2f}".format(
                avg_reward, sigma_reward)
            self.logger.info(msg)

        return avg_reward
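Note: Examples #13, #24, #28 and #29 all assume a frame-history buffer exposing store_frame, encode_recent_observation and store_effect. The class below is only an illustrative sketch of how those three calls fit together; the real buffer implementations are more memory-efficient and handle episode boundaries.

import numpy as np

class FrameHistoryBuffer:
    """Illustrative sketch only: a buffer exposing the store_frame /
    encode_recent_observation / store_effect interface used above."""

    def __init__(self, size, history):
        self.size, self.history = size, history
        self.frames, self.actions, self.rewards, self.dones = [], [], [], []

    def store_frame(self, frame):
        # drop the oldest entry once the buffer is full, then append the new frame;
        # the returned index is only valid until the next store_frame call
        if len(self.frames) >= self.size:
            for buf in (self.frames, self.actions, self.rewards, self.dones):
                buf.pop(0)
        self.frames.append(np.asarray(frame))
        self.actions.append(None)
        self.rewards.append(0.0)
        self.dones.append(False)
        return len(self.frames) - 1

    def encode_recent_observation(self):
        # stack the last `history` frames along the channel axis, zero-padding
        # at the start when fewer frames are available
        recent = self.frames[-self.history:]
        pad = [np.zeros_like(recent[0])] * (self.history - len(recent))
        return np.concatenate(pad + recent, axis=-1)

    def store_effect(self, idx, action, reward, done):
        # record the outcome of acting on the frame stored at `idx`
        self.actions[idx] = action
        self.rewards[idx] = reward
        self.dones[idx] = done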
Example #25
    def __init__(self, state_size, action_size, num_agents, random_seed):
        """
        Initialize a DDPG Agent object.
            :param state_size (int): dimension of each state
            :param action_size (int): dimension of each action
            :param num_agents (int): number of agents in the environment to use ddpg
            :param random_seed (int): random seed
        """
        super().__init__(state_size, action_size, num_agents, random_seed)
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process for each agent
        self.noise = OUNoise((num_agents, action_size), random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

        # debug of the MSE critic loss
        self.mse_error_list = []
Example #26
    def __init__(self, state_size, action_size, args, device):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            args: parsed arguments carrying the hyperparameters used below
                  (agent type, hidden_size, seed, lr, buffer_size, batch_size, update_frequency)
            device: torch device on which the networks are placed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.hidden_size = args.hidden_size
        self.seed = args.seed
        self.args = args
        self.device = device
        assert self.args.agent in ['dqn', 'double_dqn', 'dueling_dqn'],\
                "invalid agent name"
        if self.args.agent == "double_dqn":
            print("Implementing Double DQN!")
        elif self.args.agent == "dueling_dqn":
            print("Implementing Dueling DQN!")
        else:
            print("Implementing DQN")

        # Q-Network
        if self.args.agent == "dueling_dqn":
            self.qnetwork_local = DuelingQNetwork(state_size, action_size, self.hidden_size, self.seed).to(device)
            self.qnetwork_target = DuelingQNetwork(state_size, action_size, self.hidden_size, self.seed).to(device)
        else:
            self.qnetwork_local = QNetwork(state_size, action_size, self.hidden_size, self.seed).to(device)
            self.qnetwork_target = QNetwork(state_size, action_size, self.hidden_size, self.seed).to(device)
        print("Agent Architecture")
        print(self.qnetwork_local)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=self.args.lr)

        # Replay memory
        self.memory = ReplayBuffer(action_size, args.buffer_size, args.batch_size, self.seed, self.device)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = args.update_frequency
Example #27
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
Example #28
    def init_agent(self, id_, game_type):
        super(DQNAgent, self).init_agent(id_, game_type)

        # Assume the graph has been constructed.
        # Create a tf Session and run initializer of variables.
        tf_config = tf.ConfigProto()
        tf_config.gpu_options.allow_growth = True
        self._session = tf.Session(config=tf_config)

        # Tensorboard
        self._add_summary()

        # Initialize all variables.
        init = tf.global_variables_initializer()
        self._session.run(init)

        # Synchronise q and target_q networks.
        self._session.run(self._update_target_op)

        # for saving networks weights
        self._saver = tf.train.Saver()

        # Initialize replay buffer and variables.
        self._train_replay_buffer = ReplayBuffer(self._config.buffer_size, self._config.state_history)
        self._train_rewards = deque(maxlen=self._config.num_episodes_test)
        self._train_max_q_values = deque(maxlen=1000)
        self._train_q_values = deque(maxlen=1000)
        self._init_averages()

        self._time_step = 0
        self._progress_bar = Progbar(target=self._config.nsteps_train)

        self._has_episode_started = False

        if not self._train_from_scratch:
            self._load()
Example #29
def run_episode(
    env,
    q_func,
    replay_buffer_size=1000000,
    frame_history_len=4,
    game=None,
    ):

    assert type(env.observation_space) == gym.spaces.Box
    assert type(env.action_space)      == gym.spaces.Discrete


    if len(env.observation_space.shape) == 1:
        input_arg = env.observation_space.shape[0]
    else:
        img_h, img_w, img_c = env.observation_space.shape
        input_arg = frame_history_len * img_c
    num_actions = env.action_space.n


    Q = q_func(input_arg, num_actions).type(dtype)
    Q.load_state_dict(torch.load("./models/PAL_{}.pth".format(game), map_location=lambda storage, loc: storage))

    replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len)


    all_obs = []

    last_obs = env.reset()

    for t in count():

        last_idx = replay_buffer.store_frame(last_obs)
        recent_observations = replay_buffer.encode_recent_observation()
        all_obs.append(recent_observations)


        torch_obs = torch.from_numpy(recent_observations).type(dtype).unsqueeze(0) / 255.0
        with torch.no_grad():
            Qvals = Q(torch_obs).data[0]
        max2val, max2idx = Qvals.topk(2)
        action = max2idx[0]
        

        obs, reward, done, _ = env.step(action)
        env.render()
        replay_buffer.store_effect(last_idx, action, reward, done)

        if done:
            break
        last_obs = obs
    
    return all_obs
Example #30
    def __init__(self, env, params):

        self.env = env
        self.params = params
        self.epsilon = self.params["epsilon_start"]
        self.replay_buffer = ReplayBuffer(int(self.params["buffer_size"]))
       
        self.n_actions = self.env.action_space.n
        
        if torch.cuda.is_available():  
            self.device = "cuda:0" 
        else:  
            self.device = "cpu" 
            
        self.Q = CNN(self.n_actions).to(self.device)
        self.Q_target = CNN(self.n_actions).to(self.device)
        
        self.optimizer = optim.RMSprop(self.Q.parameters(), lr=2.5e-4)
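Note: Example #30 only shows the constructor. A single learning step wiring its Q, Q_target, replay_buffer and optimizer together might look like the sketch below; the sample() return layout, the Huber loss, and the gamma value are assumptions rather than part of the example.

import torch
import torch.nn.functional as F

def learn_step(agent, batch_size=32, gamma=0.99):
    # assumed sample() layout: (states, actions, rewards, next_states, dones)
    states, actions, rewards, next_states, dones = agent.replay_buffer.sample(batch_size)
    states = torch.as_tensor(states, dtype=torch.float32, device=agent.device)
    actions = torch.as_tensor(actions, dtype=torch.int64, device=agent.device)
    rewards = torch.as_tensor(rewards, dtype=torch.float32, device=agent.device)
    next_states = torch.as_tensor(next_states, dtype=torch.float32, device=agent.device)
    dones = torch.as_tensor(dones, dtype=torch.float32, device=agent.device)

    # TD target computed from the frozen target network
    with torch.no_grad():
        next_q = agent.Q_target(next_states).max(dim=1)[0]
        target = rewards + gamma * (1.0 - dones) * next_q

    # Q-value of the action actually taken
    pred = agent.Q(states).gather(1, actions.unsqueeze(1)).squeeze(1)

    loss = F.smooth_l1_loss(pred, target)
    agent.optimizer.zero_grad()
    loss.backward()
    agent.optimizer.step()
    return loss.item()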