Example #1
 def __init__(self,
              optimizer_spec,
              num_goal=81,
              num_action=81,
              replay_memory_size=10000,
              subgoals=81,
              screen_size=(500, 500),
              batch_size=128):
     ###############
     # BUILD MODEL #
     ###############
     self.num_goal = num_goal
     self.num_action = num_action
     self.batch_size = batch_size
     # Construct meta-controller and controller
     self.meta_controller = MetaController().type(dtype)
     self.target_meta_controller = MetaController().type(dtype)
     self.controller = Controller().type(dtype)
     self.target_controller = Controller().type(dtype)
     # Construct the optimizers for meta-controller and controller
     self.meta_optimizer = optimizer_spec.constructor(self.meta_controller.parameters(), **optimizer_spec.kwargs)
     self.ctrl_optimizer = optimizer_spec.constructor(self.controller.parameters(), **optimizer_spec.kwargs)
     # Construct the replay memory for meta-controller and controller
     self.meta_replay_memory = ReplayMemory(replay_memory_size)
     self.ctrl_replay_memory = ReplayMemory(replay_memory_size)
     self.subgoals = subgoals
     self.screen_size = screen_size
     self.idx_2_action = self.action_dict()
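
Example #1 above (and Example #2 below) unpacks an optimizer_spec object through optimizer_spec.constructor(params, **optimizer_spec.kwargs). A minimal sketch of such a spec, assuming a simple namedtuple container with an Adam default; the OptimizerSpec name and the learning rate are illustrative, not taken from the original code:

from collections import namedtuple

import torch.optim as optim

# Hypothetical container: anything exposing .constructor and .kwargs works here.
OptimizerSpec = namedtuple("OptimizerSpec", ["constructor", "kwargs"])

# Usage mirroring optimizer_spec.constructor(model.parameters(), **optimizer_spec.kwargs)
adam_spec = OptimizerSpec(constructor=optim.Adam, kwargs=dict(lr=1e-4))
# meta_optimizer = adam_spec.constructor(meta_controller.parameters(), **adam_spec.kwargs)
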
Example #2
 def __init__(self,
              optimizer_spec,
              num_goal=6,
              num_action=2,
              replay_memory_size=10000,
              batch_size=128):
     ###############
     # BUILD MODEL #
     ###############
     self.num_goal = num_goal
     self.num_action = num_action
     self.batch_size = batch_size
     # Construct meta-controller and controller
     self.meta_controller = MetaController().type(dtype)
     self.target_meta_controller = MetaController().type(dtype)
     self.controller = Controller().type(dtype)
     self.target_controller = Controller().type(dtype)
     # Construct the optimizers for meta-controller and controller
     self.meta_optimizer = optimizer_spec.constructor(
         self.meta_controller.parameters(), **optimizer_spec.kwargs)
     self.ctrl_optimizer = optimizer_spec.constructor(
         self.controller.parameters(), **optimizer_spec.kwargs)
     # Construct the replay memory for meta-controller and controller
     self.meta_replay_memory = ReplayMemory(replay_memory_size)
     self.ctrl_replay_memory = ReplayMemory(replay_memory_size)
Example #3
    def __init__(self, args):

        super(BasePGQLearner, self).__init__(args)

        self.q_update_counter = 0
        self.replay_size = args.replay_size
        self.pgq_fraction = args.pgq_fraction
        self.batch_update_size = args.batch_update_size
        scope_name = 'local_learning_{}'.format(self.actor_id)
        conf_learning = {'name': scope_name,
                         'input_shape': self.input_shape,
                         'num_act': self.num_actions,
                         'args': args}

        with tf.device('/cpu:0'):
            self.local_network = PolicyValueNetwork(conf_learning)
        with tf.device('/gpu:0'), tf.variable_scope('', reuse=True):
            self.batch_network = PolicyValueNetwork(conf_learning)
            self._build_q_ops()

        self.reset_hidden_state()
        self.replay_memory = ReplayMemory(
            self.replay_size,
            self.local_network.get_input_shape(),
            self.num_actions)
            
        if self.is_master():
            var_list = self.local_network.params
            self.saver = tf.train.Saver(var_list=var_list, max_to_keep=3, 
                                        keep_checkpoint_every_n_hours=2)
Example #4
    def __init__(self):
        super(DQNDoubleQAgent, self).__init__()
        self.training = False
        self.max_frames = 2000000
        self._epsilon = Epsilon(start=1.0, end=0.1, update_increment=0.0001)
        self.gamma = 0.99
        self.train_q_per_step = 4
        self.train_q_batch_size = 256
        self.steps_before_training = 10000
        self.target_q_update_frequency = 50000

        self._Q_weights_path = "./data/SC2DoubleQAgent"
        self._Q = DQNCNN()
        if os.path.isfile(self._Q_weights_path):
            self._Q.load_state_dict(torch.load(self._Q_weights_path))
            print("Loading weights:", self._Q_weights_path)
        self._Qt = copy.deepcopy(self._Q)
        self._Q.cuda()
        self._Qt.cuda()
        self._optimizer = optim.Adam(self._Q.parameters(), lr=1e-8)
        self._criterion = nn.MSELoss()
        self._memory = ReplayMemory(100000)

        self._loss = deque(maxlen=1000)
        self._max_q = deque(maxlen=1000)
        self._action = None
        self._screen = None
        self._fig = plt.figure()
        self._plot = [plt.subplot(2, 2, i + 1) for i in range(4)]

        self._screen_size = 28
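
Example #4 keeps an online network _Q and a frozen copy _Qt, which is the setup Double DQN needs. A minimal sketch of the Double-DQN target computation under those assumptions; tensor names, shapes, and the helper itself are illustrative, and the agent's actual training step is not shown above:

import torch

def double_dqn_targets(q_net, target_net, next_states, rewards, dones, gamma=0.99):
    """Double DQN: the online network picks the next action, the target network scores it."""
    with torch.no_grad():
        next_actions = q_net(next_states).argmax(dim=1, keepdim=True)        # action selection
        next_q = target_net(next_states).gather(1, next_actions).squeeze(1)  # action evaluation
        return rewards + gamma * next_q * (1.0 - dones)                      # zero bootstrap at terminals
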
Example #5
    def __init__(self, args):
        self.args = args

        super(AElearner, self).__init__(args)
        self.cts_eta = args.cts_eta
        self.cts_beta = args.cts_beta
        self.ae_delta = args.ae_delta
        self.batch_size = args.batch_update_size
        self.replay_memory = ReplayMemory(
            args.replay_size,
            self.local_network_upper.get_input_shape(),
            # self.local_network.get_input_shape(),
            self.num_actions)
        # initializes the density model (chooses how many steps per update)
        # 20 * Q target update steps
        self._init_density_model(args)
        # computes the loss
        self._double_dqn_op()
        self.which_net_to_update_counter = 0
        self.ae_counter = 0
        self.epsilon_greedy_counter = 0
        self.total_ae_counter = 0
        self.total_epsilon_greedy_counter = 0
        self.q_values_upper_max = []
        self.q_values_lower_max = []
        self.ae_valid_actions = True
        self.action_meanings = self.emulator.env.unwrapped.get_action_meanings()
        self.minimized_actions_counter = {
            value: 0
            for value in self.action_meanings
        }
        print(self.minimized_actions_counter)
Example #6
 def __init__(self,
              actor_optimizer_spec,
              critic_optimizer_spec,
              num_feature,
              num_action,
              replay_memory_size=1000000,
              batch_size=64,
              tau=0.001):
     ###############
     # BUILD MODEL #
     ###############
     self.num_feature = num_feature
     self.num_action = num_action
     self.batch_size = batch_size
     self.tau = tau
     # Construct actor and critic
     self.actor = Actor(num_feature, num_action).type(dtype)
     self.target_actor = Actor(num_feature, num_action).type(dtype)
     self.critic = Critic(num_feature, num_action).type(dtype)
     self.target_critic = Critic(num_feature, num_action).type(dtype)
     # Construct the optimizers for actor and critic
     self.actor_optimizer = actor_optimizer_spec.constructor(
         self.actor.parameters(), **actor_optimizer_spec.kwargs)
     self.critic_optimizer = critic_optimizer_spec.constructor(
         self.critic.parameters(), **critic_optimizer_spec.kwargs)
     # Construct the replay memory
     self.replay_memory = ReplayMemory(replay_memory_size)
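
Example #6 stores tau=0.001 next to target copies of the actor and critic, which points to Polyak-averaged (soft) target updates. A minimal sketch of such an update for any pair of nn.Modules; the helper name is an assumption:

import torch

def soft_update(target, source, tau):
    """Polyak averaging: target <- tau * source + (1 - tau) * target."""
    with torch.no_grad():
        for t_param, s_param in zip(target.parameters(), source.parameters()):
            t_param.mul_(1.0 - tau).add_(s_param, alpha=tau)

# e.g. soft_update(self.target_critic, self.critic, self.tau) after each critic step
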
Example #7
    def __init__(self, config):
        self.config = config

        self.logger = logging.getLogger("DQNAgent")

        # define models (policy and target)
        self.policy_model = DQN(self.config)
        self.target_model = DQN(self.config)

        # define memory
        self.memory = ReplayMemory(self.config)

        # define loss
        self.loss = HuberLoss()

        # define optimizer
        self.optim = torch.optim.RMSprop(self.policy_model.parameters())

        # define environment
        self.env = gym.make('CartPole-v0').unwrapped
        self.cartpole = CartPoleEnv(self.config.screen_width)

        # initialize counter
        self.current_episode = 0
        self.current_iteration = 0
        self.episode_durations = []

        self.batch_size = self.config.batch_size

        # set cuda flag
        self.is_cuda = torch.cuda.is_available()
        if self.is_cuda and not self.config.cuda:
            self.logger.info(
                "WARNING: You have a CUDA device, so you should probably enable CUDA"
            )

        self.cuda = self.is_cuda & self.config.cuda

        if self.cuda:
            self.logger.info("Program will run on *****GPU-CUDA***** ")
            print_cuda_statistics()
            self.device = torch.device("cuda")
            torch.cuda.set_device(self.config.gpu_device)
        else:
            self.logger.info("Program will run on *****CPU***** ")
            self.device = torch.device("cpu")

        self.policy_model = self.policy_model.to(self.device)
        self.target_model = self.target_model.to(self.device)
        self.loss = self.loss.to(self.device)

        # Initialize Target model with policy model state dict
        self.target_model.load_state_dict(self.policy_model.state_dict())
        self.target_model.eval()

        # Summary Writer
        self.summary_writer = SummaryWriter(log_dir=self.config.summary_dir,
                                            comment='DQN')
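
Example #7 instantiates a HuberLoss() module and later moves it to the device. PyTorch's nn.SmoothL1Loss is the Huber loss with delta = 1, so a minimal wrapper consistent with that usage might look like the sketch below; the original HuberLoss implementation is not shown, so this is an assumption:

import torch.nn as nn

class HuberLoss(nn.Module):
    """Thin wrapper around SmoothL1Loss, i.e. the Huber loss with delta = 1."""

    def __init__(self):
        super().__init__()
        self.loss = nn.SmoothL1Loss()

    def forward(self, prediction, target):
        return self.loss(prediction, target)
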
Example #8
    def __init__(self, args):
        self.final_epsilon = args.final_epsilon
        super(PseudoCountQLearner, self).__init__(args)

        self.cts_eta = args.cts_eta
        self.cts_beta = args.cts_beta
        self.batch_size = args.batch_update_size
        self.replay_memory = ReplayMemory(args.replay_size)

        self._init_density_model(args)
        self._double_dqn_op()
Example #9
 def test_zero_step(self):
   self.memory = ReplayMemory(capacity=10, multi_step_n=0)
   for i in range(5):
     a = Transition([0, 1, 2, i], 0, [4, 5, 6, i*i], 1, False)
     self.memory.push(a)
   final = Transition([0, 1, 2, 10], 0, [4, 5, 6, 100], 10, True)
   self.memory.push(final)
   self.assertEqual(self.memory.memory[0].r, 1)
   self.assertEqual(self.memory.memory[3].r, 1)
   self.assertEqual(self.memory.memory[4].r, 1)
   self.assertEqual(self.memory.memory[5].r, 10)
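
The test above pins down part of the ReplayMemory(capacity, multi_step_n) contract: with multi_step_n=0, pushed transitions keep their rewards, and memory is indexable in insertion order. A minimal sketch consistent with just this test; the field names and the circular-buffer overwrite are assumptions:

from collections import namedtuple

# Assumed field order, matching Transition(state, action, next_state, reward, done) in the test.
Transition = namedtuple("Transition", ["s", "a", "s1", "r", "done"])

class ReplayMemory:
    def __init__(self, capacity, multi_step_n=0):
        self.capacity = capacity
        self.multi_step_n = multi_step_n
        self.memory = []
        self.position = 0

    def push(self, transition):
        # With multi_step_n == 0 the transition is stored unchanged; n-step
        # reward accumulation would hook in here for multi_step_n > 0.
        if len(self.memory) < self.capacity:
            self.memory.append(transition)
        else:
            self.memory[self.position] = transition
        self.position = (self.position + 1) % self.capacity
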
Example #10
    def __init__(self, args):
        super(PseudoCountQLearner, self).__init__(args)

        self.cts_eta = .9
        self.batch_size = 32
        self.replay_memory = ReplayMemory(args.replay_size)

        # more Cython tuning could be useful here
        self.density_model = CTSDensityModel(height=args.cts_rescale_dim,
                                             width=args.cts_rescale_dim,
                                             num_bins=args.cts_bins,
                                             beta=0.05)
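
The CTSDensityModel above is the kind of model used to turn a recoding probability into a pseudo-count and an exploration bonus, as in Bellemare et al. (2016). A rough sketch of that conversion; the function below is illustrative and is not the original model's API, and the 0.01 offset and the division guard are assumptions:

import math

def exploration_bonus(rho, rho_prime, beta=0.05):
    """Pseudo-count bonus in the style of Bellemare et al. (2016).

    rho:       probability the model assigned to the frame before observing it
    rho_prime: recoding probability, i.e. the probability after one update on that frame
    """
    pseudo_count = rho * (1.0 - rho_prime) / max(rho_prime - rho, 1e-12)
    return beta / math.sqrt(pseudo_count + 0.01)
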
Example #11
    def __init__(self, args):
        self.args = args
        super(PseudoCountQLearner, self).__init__(args)

        self.cts_eta = args.cts_eta
        self.cts_beta = args.cts_beta
        self.batch_size = args.batch_update_size
        self.replay_memory = ReplayMemory(args.replay_size,
                                          self.local_network.get_input_shape(),
                                          self.num_actions)

        self._init_density_model(args)
        self._double_dqn_op()
Example #12
    def __init__(self, args):
        self.args = args
        super(PseudoCountQLearner, self).__init__(args)

        self.cts_eta = args.cts_eta
        self.cts_beta = args.cts_beta
        self.batch_size = args.batch_update_size
        self.replay_memory = ReplayMemory(args.replay_size,
                                          self.local_network.get_input_shape(),
                                          self.num_actions)
        # initializes the density model (chooses how many steps per update)
        # 20 * Q target update steps
        self._init_density_model(args)
        # computes the loss
        self._double_dqn_op()
Example #13
    def _build_q_ops(self):
        # pgq specific initialization
        self.pgq_fraction = self.pgq_fraction
        self.batch_size = self.batch_update_size
        self.replay_memory = ReplayMemory(self.replay_size)
        self.q_tilde = self.batch_network.beta * (
            self.batch_network.log_output_layer_pi +
            tf.expand_dims(self.batch_network.output_layer_entropy,
                           1)) + self.batch_network.output_layer_v

        self.Qi, self.Qi_plus_1 = tf.split(axis=0,
                                           num_or_size_splits=2,
                                           value=self.q_tilde)
        self.V, _ = tf.split(axis=0,
                             num_or_size_splits=2,
                             value=self.batch_network.output_layer_v)
        self.log_pi, _ = tf.split(
            axis=0,
            num_or_size_splits=2,
            value=tf.expand_dims(self.batch_network.log_output_selected_action,
                                 1))
        self.R = tf.placeholder('float32', [None], name='1-step_reward')

        self.terminal_indicator = tf.placeholder(tf.float32, [None],
                                                 name='terminal_indicator')
        self.max_TQ = self.gamma * tf.reduce_max(
            self.Qi_plus_1, 1) * (1 - self.terminal_indicator)
        self.Q_a = tf.reduce_sum(
            self.Qi * tf.split(axis=0,
                               num_or_size_splits=2,
                               value=self.batch_network.selected_action_ph)[0],
            1)

        self.q_objective = -self.pgq_fraction * tf.reduce_mean(
            tf.stop_gradient(self.R + self.max_TQ - self.Q_a) *
            (self.V[:, 0] + self.log_pi[:, 0]))

        self.V_params = self.batch_network.params
        self.q_gradients = tf.gradients(self.q_objective, self.V_params)

        if self.batch_network.clip_norm_type == 'global':
            self.q_gradients = tf.clip_by_global_norm(
                self.q_gradients, self.batch_network.clip_norm)[0]
        elif self.batch_network.clip_norm_type == 'local':
            self.q_gradients = [
                tf.clip_by_norm(g, self.batch_network.clip_norm)
                for g in self.q_gradients
            ]
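
The graph above builds the PGQ(L)-style Q-learning correction of O'Donoghue et al.: the batch stacks s_t in its first half and s_{t+1} in its second, Q-tilde is recovered from the policy and value heads, and the stopped one-step TD error scales a V + log-pi term. A NumPy restatement of the same arithmetic, purely for readability; names and shapes are assumptions:

import numpy as np

def pgq_q_objective(q_tilde, v, log_pi_a, selected_actions, rewards,
                    terminals, gamma, pgq_fraction):
    """NumPy mirror of the TF ops above.

    q_tilde:            (2N, num_actions) -- first N rows are s_t, last N are s_{t+1}
    v:                  (2N, 1)           -- value head
    log_pi_a:           (2N, 1)           -- log-probability of the selected action
    selected_actions:   (2N, num_actions) -- one-hot selected actions
    rewards, terminals: (N,)
    """
    n = rewards.shape[0]
    q_t, q_tp1 = q_tilde[:n], q_tilde[n:]
    max_tq = gamma * q_tp1.max(axis=1) * (1.0 - terminals)
    q_a = (q_t * selected_actions[:n]).sum(axis=1)
    td_error = rewards + max_tq - q_a  # treated as a constant (tf.stop_gradient)
    return -pgq_fraction * np.mean(td_error * (v[:n, 0] + log_pi_a[:n, 0]))
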
Example #14
 def __init__(self, env, args, device='cpu'):
     """
     Instantiate an NEC Agent
     ----------
     env: gym.Env
         gym environment to train on
     args: args class from argparser
         args come from train.py; see train.py for help with each arg
     device: string
         'cpu' or 'cuda:0' depending on use_cuda flag from train.py
     """
     self.environment_type = args.environment_type
     self.env = env
     self.device = device
     # Hyperparameters
     self.epsilon = args.initial_epsilon
     self.final_epsilon = args.final_epsilon
     self.epsilon_decay = args.epsilon_decay
     self.gamma = args.gamma
     self.N = args.N
     # Transition queue and replay memory
     self.transition_queue = []
     self.replay_every = args.replay_every
     self.replay_buffer_size = args.replay_buffer_size
     self.replay_memory = ReplayMemory(self.replay_buffer_size)
     # CNN for state embedding network
     self.frames_to_stack = args.frames_to_stack
     self.embedding_size = args.embedding_size
     self.in_height = args.in_height
     self.in_width = args.in_width
     self.cnn = CNN(self.frames_to_stack, self.embedding_size,
                    self.in_height, self.in_width).to(self.device)
     # Differentiable Neural Dictionary (DND): one for each action
     self.kernel = inverse_distance
     self.num_neighbors = args.num_neighbors
     self.max_memory = args.max_memory
     self.lr = args.lr
     self.dnd_list = []
     for i in range(env.action_space.n):
         self.dnd_list.append(
             DND(self.kernel, self.num_neighbors, self.max_memory,
                 args.optimizer, self.lr))
     # Optimizer for state embedding CNN
     self.q_lr = args.q_lr
     self.batch_size = args.batch_size
     self.optimizer = get_optimizer(args.optimizer, self.cnn.parameters(),
                                    self.lr)
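
The DNDs above are keyed on an inverse_distance kernel. In Neural Episodic Control (Pritzel et al., 2017) that kernel is k(h, h_i) = 1 / (||h - h_i||^2 + delta); a minimal PyTorch sketch under that assumption, with an illustrative delta default:

import torch

def inverse_distance(h, h_i, delta=1e-3):
    """NEC-style kernel: 1 / (||h - h_i||^2 + delta)."""
    return 1.0 / (torch.sum((h - h_i) ** 2, dim=-1) + delta)
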
Example #15
 def __init__(self,
              environment_name="CartPole-v1",
              replay_memory_size=10000,
              action_threshold=0.7,
              batch_size=64,
              gamma=0.9):
     self.environment = gym.make(environment_name)
     state = self.environment.reset()
     self.state_shape = state.shape
     self.action_space = self.environment.action_space.n
     self.replay_memory = ReplayMemory(self.state_shape,
                                       capacity=replay_memory_size)
     self.model = self.build_network()
     self.target_model = self.build_network()
     self.action_threshold = action_threshold
     self.batch_size = batch_size
     self.gamma = gamma
Example #16
    def __init__(
        self,
        state_size,
        n_actions,
        args,
        device=torch.device("cuda" if torch.cuda.is_available() else "cpu")):
        self.device = device

        # Exploration / Exploitation params.
        self.steps_done = 0
        self.eps_threshold = 1
        self.eps_start = args.eps_start
        self.eps_end = args.eps_end
        self.eps_decay = args.eps_decay

        # RL params
        self.target_update = args.target_update
        self.discount = args.discount

        # Env params
        self.n_actions = n_actions
        self.state_size = state_size

        # Deep q networks params
        self.layers = args.layers
        self.batch_size = args.batch_size
        self.policy_net = DQN(state_size, n_actions,
                              layers=self.layers).to(self.device).float()
        self.target_net = None
        self.grad_clip = args.grad_clip

        if str(args.optimizer).lower() == 'adam':
            self.optimizer = optim.Adam(self.policy_net.parameters())
        elif str(args.optimizer).lower() == 'rmsprop':
            self.optimizer = optim.RMSprop(self.policy_net.parameters())
        else:
            raise NotImplementedError

        self.memory = ReplayMemory(args.replay_size)

        # Performance buffers.
        self.rewards_list = []
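
Example #16 tracks steps_done, eps_start, eps_end, and eps_decay, the fields used by the exponentially decaying epsilon schedule from the standard PyTorch DQN tutorial. A minimal sketch of action selection in that style; how the original agent's select_action uses these fields is an assumption:

import math
import random

import torch

def select_action(policy_net, state, steps_done, n_actions,
                  eps_start=0.9, eps_end=0.05, eps_decay=200):
    """Epsilon-greedy with an exponentially decaying threshold."""
    eps_threshold = eps_end + (eps_start - eps_end) * math.exp(-steps_done / eps_decay)
    if random.random() > eps_threshold:
        with torch.no_grad():
            return policy_net(state).argmax(dim=1).item()  # greedy action
    return random.randrange(n_actions)                     # random exploration
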
Example #17
 def __init__(self,
              environment_name="Acrobot-v1",
              replay_memory_size=10000,
              action_threshold=0.7,
              batch_size=64,
              gamma=0.9):
     super(MotionAthlete,
           self).__init__(environment_name, replay_memory_size,
                          action_threshold, batch_size, gamma)
     self.environment.close()
     del self.environment
     self.environment = EnvironmentWrapper(environment_name)
     frame = self.environment.reset()
     frame_shape = frame.shape
     self.motion_tracer = MotionTracer(frame_shape=frame_shape)
     self.state_shape = self.motion_tracer.state_shape
     self.replay_memory = ReplayMemory(self.state_shape,
                                       capacity=replay_memory_size)
     del self.model
     del self.target_model
     self.model = self.build_network()
     self.target_model = self.build_network()
Example #18
def train_model(env,
                conv_layers,
                learning_rate=5e-4,
                total_timesteps=100000,
                buffer_size=50000,
                exploration_fraction=0.1,
                exploration_final_eps=0.02,
                train_freq=1,
                batch_size=32,
                print_freq=1,
                checkpoint_freq=100000,
                checkpoint_path=None,
                learning_starts=1000,
                gamma=1.0,
                target_network_update_freq=500,
                double_dqn=False,
                **network_kwargs) -> tf.keras.Model:
    """Train a DQN model.

    Parameters
    -------
    env: gym.Env
        openai gym
    conv_layers: list
        a list of triples that defines the conv network
    learning_rate: float
        learning rate for adam optimizer
    total_timesteps: int
        number of env steps to run the environment
    buffer_size: int
        size of the replay buffer
    exploration_fraction: float
        fraction of entire training period over which the exploration rate is annealed
    exploration_final_eps: float
        final value of random action probability
    train_freq: int
        update the model every train_freq steps.
    batch_size: int
        size of a batched sampled from replay buffer for training
    print_freq: int
        how often to print out training progress
        set to None to disable printing
    checkpoint_freq: int
        how often to store a checkpoint during training
    checkpoint_path: str
        the fs path for storing the checkpoints
    learning_starts: int
        how many steps of the model to collect transitions for before learning starts
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps.
    double_dqn: bool
        specifies if double q-learning is used during training
    Returns
    -------
    dqn: an instance of tf.Module that contains the trained model
    """
    q_func = build_dueling_q_func(conv_layers, **network_kwargs)

    dqn = DeepQ(model_builder=q_func,
                observation_shape=env.observation_space.shape,
                num_actions=env.action_space.n,
                learning_rate=learning_rate,
                gamma=gamma,
                double_dqn=double_dqn)

    manager = None
    if checkpoint_path is not None:
        load_path = osp.expanduser(checkpoint_path)
        ckpt = tf.train.Checkpoint(model=dqn.q_network)
        manager = tf.train.CheckpointManager(ckpt, load_path, max_to_keep=5)
        ckpt.restore(manager.latest_checkpoint)
        print("Restoring from {}".format(manager.latest_checkpoint))

    current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    train_log_dir = 'logs/gradient_tape/' + current_time + '/train'
    train_summary_writer = tf.summary.create_file_writer(train_log_dir)

    # Create the replay buffer
    replay_buffer = ReplayMemory(buffer_size)
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(total_timesteps=int(exploration_fraction *
                                                     total_timesteps),
                                 initial_prob=1.0,
                                 final_prob=exploration_final_eps)

    dqn.update_target()

    episode_rewards = [0.0]
    obs = env.reset()

    obs = np.expand_dims(np.array(obs), axis=0)

    for t in range(total_timesteps):
        update_eps = exploration.step_to(t)

        action, _, _, _ = dqn.step(tf.constant(obs), update_eps=update_eps)
        action = action[0].numpy()

        new_obs, reward, done, _ = env.step(action)
        # Store transition in the replay buffer.
        new_obs = np.expand_dims(np.array(new_obs), axis=0)
        replay_buffer.add(obs[0], action, reward, new_obs[0], float(done))
        obs = new_obs

        episode_rewards[-1] += reward
        if done:
            obs = env.reset()
            obs = np.expand_dims(np.array(obs), axis=0)
            episode_rewards.append(0.0)

        if t > learning_starts and t % train_freq == 0:
            # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                batch_size)
            weights, _ = tf.ones_like(rewards), None

            td_loss = dqn.train(obses_t, actions, rewards, obses_tp1, dones,
                                weights)

        if t > learning_starts and t % target_network_update_freq == 0:
            # Update target network every target_network_update_freq steps
            dqn.update_target()

        reward_100_mean = np.round(np.mean(episode_rewards[-101:-1]), 1)
        number_episodes = len(episode_rewards) - 1
        if done and print_freq is not None and number_episodes % print_freq == 0:
            format_str = "Steps: {}, Episodes: {}, 100 ep reward average: {}, Reward: {}, Epsilon-greedy %explore: {}"
            print(
                format_str.format(t, number_episodes, reward_100_mean,
                                  episode_rewards[-2],
                                  int(100 * exploration.value(t))))

            with train_summary_writer.as_default():
                tf.summary.scalar('loss',
                                  dqn.train_loss_metrics.result(),
                                  step=t)
                tf.summary.scalar('reward', episode_rewards[-2], step=t)

        if checkpoint_path is not None and t % checkpoint_freq == 0:
            manager.save()

        # Every training step, reset the loss metric
        dqn.train_loss_metrics.reset_states()

    return dqn.q_network
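
train_model relies on a LinearSchedule exposing both step_to(t) and value(t). A minimal sketch that anneals the exploration probability linearly from initial_prob to final_prob over total_timesteps, consistent with how it is called above; the original class may differ, e.g. step_to might keep internal state:

class LinearSchedule:
    """Linear annealing from initial_prob to final_prob over total_timesteps."""

    def __init__(self, total_timesteps, initial_prob=1.0, final_prob=0.02):
        self.total_timesteps = total_timesteps
        self.initial_prob = initial_prob
        self.final_prob = final_prob

    def value(self, t):
        fraction = min(float(t) / self.total_timesteps, 1.0)
        return self.initial_prob + fraction * (self.final_prob - self.initial_prob)

    # In the training loop above, step_to(t) is used like value(t).
    step_to = value
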
Example #19
        torch.cuda.manual_seed(args.seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

    # Define and build DDPG agent
    hidden_size = tuple(args.hidden_size)
    agent = DDPG(args.gamma,
                 args.tau,
                 hidden_size,
                 env.observation_space.shape[0],
                 env.action_space,
                 checkpoint_dir=checkpoint_dir
                 )

    # Initialize replay memory
    memory = ReplayMemory(int(args.replay_size))

    # Initialize OU-Noise
    nb_actions = env.action_space.shape[-1]
    ou_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions),
                                            sigma=float(args.noise_stddev) * np.ones(nb_actions))

    # Define counters and other variables
    start_step = 0
    # timestep = start_step
    if args.load_model:
        # Load agent if necessary
        start_step, memory = agent.load_checkpoint()
    timestep = start_step // 10000 + 1
    rewards, policy_losses, value_losses, mean_test_rewards = [], [], [], []
    epoch = 0
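
The DDPG setup above draws exploration noise from OrnsteinUhlenbeckActionNoise(mu, sigma). A minimal sketch of an OU process with that constructor signature; theta, dt, and the reset behaviour are assumptions, and the original class may differ:

import numpy as np

class OrnsteinUhlenbeckActionNoise:
    """Temporally correlated noise: x += theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, 1)."""

    def __init__(self, mu, sigma, theta=0.15, dt=1e-2):
        self.mu = mu
        self.sigma = sigma
        self.theta = theta
        self.dt = dt
        self.reset()

    def reset(self):
        self.x_prev = np.zeros_like(self.mu)

    def __call__(self):
        self.x_prev = (self.x_prev
                       + self.theta * (self.mu - self.x_prev) * self.dt
                       + self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape))
        return self.x_prev
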
Example #20
    def __init__(self,
               env,
               embedding_network,
               replay_memory=ReplayMemory(100000),
               initial_epsilon=1.0,
               final_epsilon=0.01,
               epsilon_decay=0.99,
               batch_size=8,
               sgd_lr=1e-6,
               q_lr=0.01,
               gamma=0.99,
               lookahead_horizon=100,
               update_period=4,
               kernel=inverse_distance,
               num_neighbors=50,
               max_memory=500000):
        '''
    Instantiate an NEC Agent

    Parameters
    ----------
    env: gym.Env
      gym environment to train on
    embedding_network: torch.nn.Module
      Model to extract the embedding from a state
    replay_memory: ReplayMemory
      Replay memory to sample from for embedding network updates
    initial_epsilon: float
      Initial epsilon for epsilon greedy search
    epsilon_decay: float
      Exponential decay factor for epsilon
    batch_size: int
      Batch size to sample from the replay memory
    sgd_lr: float
      Learning rate to use for RMSProp updates to the embedding network and DND
    q_lr: float
      Learning rate to use for Q-updates on DND updates
    gamma: float
      Discount factor
    lookahead_horizon: int
      Lookahead horizon to use for N-step Q-value estimates
    update_period: int
      Inverse of rate at which embedding network gets updated
      i.e. if 1 then update after every timestep, if 16 then update every 16 timesteps, etc.
    kernel: (torch.autograd.Variable, torch.autograd.Variable) => (torch.autograd.Variable)
      Kernel function to use for DND lookups
    num_neighbors: int
      Number of neighbors to return in K-NN lookups in DND
    max_memory: int
      Maximum number of key-value pairs to store in each DND
        '''

        self.env = env
        self.embedding_network = embedding_network

    
        self.replay_memory = replay_memory
        self.epsilon = initial_epsilon
        self.final_epsilon = final_epsilon
        self.epsilon_decay = epsilon_decay
        self.batch_size = batch_size
        self.q_lr = q_lr
        self.gamma = gamma
        self.lookahead_horizon = lookahead_horizon
        self.update_period = update_period
    
        self.transition_queue = []
        self.optimizer = optim.RMSprop(
            self.embedding_network.parameters(), lr=sgd_lr)
        self.dnd_list = [DND(kernel, num_neighbors, max_memory, sgd_lr)
                         for _ in range(env.action_space_n)]
Example #21
 def setUp(self):
   self.memory = ReplayMemory(capacity=10)
Example #22
    def __init__(self,
                 actor_optimizer_spec,
                 critic_optimizer_spec,
                 num_feature,
                 num_action,
                 net_type,
                 replay_memory_size=1000000,
                 batch_size=64,
                 tau=0.001):
        ###############
        # BUILD MODEL #
        ###############
        self.num_feature = num_feature
        self.num_action = num_action
        self.batch_size = batch_size
        self.tau = tau
        # Construct actor and critic

        if net_type == 0:
            self.actor = MLPA(input_size=num_feature,
                              output_size=num_action,
                              hidden_size=(400, 300),
                              n_layers=2,
                              tanh_flag=1).type(dtype)
            self.target_actor = MLPA(input_size=num_feature,
                                     output_size=num_action,
                                     hidden_size=(400, 300),
                                     n_layers=2,
                                     tanh_flag=1).type(dtype)
            self.critic = MLPC(input_size_state=num_feature,
                               input_size_action=num_action,
                               output_size=1,
                               hidden_size=(400, 300),
                               n_layers=2).type(dtype)
            self.target_critic = MLPC(input_size_state=num_feature,
                                      input_size_action=num_action,
                                      output_size=1,
                                      hidden_size=(400, 300),
                                      n_layers=2).type(dtype)
        elif net_type == 1:
            self.actor = MLPA(input_size=num_feature + 1,
                              output_size=num_action,
                              hidden_size=(400, 300),
                              n_layers=2,
                              tanh_flag=1).type(dtype)
            self.target_actor = MLPA(input_size=num_feature + 1,
                                     output_size=num_action,
                                     hidden_size=(400, 300),
                                     n_layers=2,
                                     tanh_flag=1).type(dtype)
            self.critic = MLPC(input_size_state=num_feature + 1,
                               input_size_action=num_action,
                               output_size=1,
                               hidden_size=(400, 300),
                               n_layers=2).type(dtype)
            self.target_critic = MLPC(input_size_state=num_feature + 1,
                                      input_size_action=num_action,
                                      output_size=1,
                                      hidden_size=(400, 300),
                                      n_layers=2).type(dtype)
        elif net_type == 2:
            self.actor = PMLPA(input_size=num_feature,
                               output_size=num_action,
                               hidden_size=(400, 300),
                               dtype=dtype,
                               n_layers=2,
                               tanh_flag=1).type(dtype)
            self.target_actor = PMLPA(input_size=num_feature,
                                      output_size=num_action,
                                      hidden_size=(400, 300),
                                      dtype=dtype,
                                      n_layers=2,
                                      tanh_flag=1).type(dtype)
            self.critic = PMLPC(input_size_state=num_feature,
                                input_size_action=num_action,
                                output_size=1,
                                hidden_size=(400, 300),
                                dtype=dtype,
                                n_layers=2).type(dtype)
            self.target_critic = PMLPC(input_size_state=num_feature,
                                       input_size_action=num_action,
                                       output_size=1,
                                       hidden_size=(400, 300),
                                       dtype=dtype,
                                       n_layers=2).type(dtype)

        # Construct the optimizers for actor and critic
        self.actor_optimizer = actor_optimizer_spec.constructor(
            self.actor.parameters(), **actor_optimizer_spec.kwargs)
        self.critic_optimizer = critic_optimizer_spec.constructor(
            self.critic.parameters(), **critic_optimizer_spec.kwargs)
        # Construct the replay memory
        self.replay_memory = ReplayMemory(replay_memory_size)
Example #23
    def __init__(self,
                 env,
                 embedding_network,
                 replay_memory=ReplayMemory(500000),
                 epsilon_schedule=epsilon_schedule,
                 batch_size=8,
                 sgd_learning_rate=1e-2,
                 q_learning_rate=0.5,
                 gamma=0.99,
                 lookahead_horizon=100,
                 update_period=4,
                 kernel=inverse_distance,
                 num_neighbors=50,
                 max_memory=125000,
                 warmup_period=1000,
                 test_period=10):
        """
    Instantiate an NEC Agent

    Parameters
    ----------
    env: gym.Env
      gym environment to train on
    embedding_network: torch.nn.Module
      Model to extract the embedding from a state
    replay_memory: ReplayMemory
      Replay memory to sample from for embedding network updates
    epsilon_schedule: (int) => (float)
      Function that determines the epsilon for epsilon-greedy exploration from the timestep t
    batch_size: int
      Batch size to sample from the replay memory
    sgd_learning_rate: float
      Learning rate to use for RMSProp updates to the embedding network
    q_learning_rate: float
      Learning rate to use for Q-updates on DND updates
    gamma: float
      Discount factor
    lookahead_horizon: int
      Lookahead horizon to use for N-step Q-value estimates
    update_period: int
      Inverse of rate at which embedding network gets updated
      i.e. if 1 then update after every timestep, if 16 then update every 16 timesteps, etc.
    kernel: (torch.autograd.Variable, torch.autograd.Variable) => (torch.autograd.Variable)
      Kernel function to use for DND lookups
    num_neighbors: int
      Number of neighbors to return in K-NN lookups in DND
    max_memory: int
      Maximum number of key-value pairs to store in DND
    warmup_period: int
      Number of timesteps to act randomly before learning
    test_period: int
      Number of episodes between each test iteration
    """

        self.env = env
        self.embedding_network = embedding_network
        if use_cuda:
            self.embedding_network.cuda()

        self.replay_memory = replay_memory
        self.epsilon_schedule = epsilon_schedule
        self.batch_size = batch_size
        self.q_learning_rate = q_learning_rate
        self.gamma = gamma
        self.lookahead_horizon = lookahead_horizon
        self.update_period = update_period
        self.warmup_period = warmup_period
        self.test_period = test_period

        self.transition_queue = []
        self.optimizer = optim.RMSprop(self.embedding_network.parameters(),
                                       lr=sgd_learning_rate)

        state_dict = self.embedding_network.state_dict()
        self.dnd_list = [
            DND(kernel, num_neighbors, max_memory,
                state_dict[next(reversed(state_dict))].size()[0])
            for _ in range(env.action_space.n)
        ]
Example #24
    def __init__(self, args):

        super(BasePGQLearner, self).__init__(args)

        # args.entropy_regularisation_strength = 0.0
        conf_learning = {
            'name': 'local_learning_{}'.format(self.actor_id),
            'input_shape': self.input_shape,
            'num_act': self.num_actions,
            'args': args
        }

        self.local_network = PolicyValueNetwork(conf_learning)
        self.reset_hidden_state()

        if self.is_master():
            var_list = self.local_network.params
            self.saver = tf.train.Saver(var_list=var_list,
                                        max_to_keep=3,
                                        keep_checkpoint_every_n_hours=2)

        # pgq specific initialization
        self.batch_size = 32
        self.pgq_fraction = args.pgq_fraction
        self.replay_memory = ReplayMemory(args.replay_size)
        self.q_tilde = self.local_network.beta * (
            self.local_network.log_output_layer_pi +
            tf.expand_dims(self.local_network.output_layer_entropy,
                           1)) + self.local_network.output_layer_v

        self.Qi, self.Qi_plus_1 = tf.split(axis=0,
                                           num_or_size_splits=2,
                                           value=self.q_tilde)
        self.V, _ = tf.split(axis=0,
                             num_or_size_splits=2,
                             value=self.local_network.output_layer_v)
        self.log_pi, _ = tf.split(
            axis=0,
            num_or_size_splits=2,
            value=tf.expand_dims(self.local_network.log_output_selected_action,
                                 1))
        self.R = tf.placeholder('float32', [None], name='1-step_reward')

        self.terminal_indicator = tf.placeholder(tf.float32, [None],
                                                 name='terminal_indicator')
        self.max_TQ = self.gamma * tf.reduce_max(
            self.Qi_plus_1, 1) * (1 - self.terminal_indicator)
        self.Q_a = tf.reduce_sum(
            self.Qi * tf.split(axis=0,
                               num_or_size_splits=2,
                               value=self.local_network.selected_action_ph)[0],
            1)

        self.q_objective = -self.pgq_fraction * tf.reduce_mean(
            tf.stop_gradient(self.R + self.max_TQ - self.Q_a) *
            (self.V[:, 0] + self.log_pi[:, 0]))

        self.V_params = self.local_network.params
        self.q_gradients = tf.gradients(self.q_objective, self.V_params)

        if self.local_network.clip_norm_type == 'global':
            self.q_gradients = tf.clip_by_global_norm(
                self.q_gradients, self.local_network.clip_norm)[0]
        elif self.local_network.clip_norm_type == 'local':
            self.q_gradients = [
                tf.clip_by_norm(g, self.local_network.clip_norm)
                for g in self.q_gradients
            ]

        if (self.optimizer_mode == "local"):
            if (self.optimizer_type == "rmsprop"):
                self.batch_opt_st = np.ones(size, dtype=ctypes.c_float)
            else:
                self.batch_opt_st = np.zeros(size, dtype=ctypes.c_float)
        elif (self.optimizer_mode == "shared"):
            self.batch_opt_st = args.batch_opt_state