Example #1
 def __init__(self,
              actor_optimizer_spec,
              critic_optimizer_spec,
              num_feature,
              num_action,
              replay_memory_size=1000000,
              batch_size=64,
              tau=0.001):
     ###############
     # BUILD MODEL #
     ###############
     self.num_feature = num_feature
     self.num_action = num_action
     self.batch_size = batch_size
     self.tau = tau
     # Construct actor and critic
     self.actor = Actor(num_feature, num_action).type(dtype)
     self.target_actor = Actor(num_feature, num_action).type(dtype)
     self.critic = Critic(num_feature, num_action).type(dtype)
     self.target_critic = Critic(num_feature, num_action).type(dtype)
     # Construct the optimizers for actor and critic
     self.actor_optimizer = actor_optimizer_spec.constructor(
         self.actor.parameters(), **actor_optimizer_spec.kwargs)
     self.critic_optimizer = critic_optimizer_spec.constructor(
         self.critic.parameters(), **critic_optimizer_spec.kwargs)
     # Construct the replay memory
     self.replay_memory = ReplayMemory(replay_memory_size)
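
The tau stored above is the soft-update rate for the target networks. The update method itself is not part of this snippet, so the following is only a sketch of the Polyak-averaging step such a parameter usually drives; the helper name soft_update and the calls in the trailing comments are assumptions, not code from this example.

def soft_update(target_net, source_net, tau):
    # theta_target <- tau * theta_source + (1 - tau) * theta_target, parameter-wise
    for target_param, param in zip(target_net.parameters(), source_net.parameters()):
        target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)

# e.g. after each learning step:
# soft_update(self.target_actor, self.actor, self.tau)
# soft_update(self.target_critic, self.critic, self.tau)
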
Example #2
    def __init__(self, args):
        self.args = args

        super(AElearner, self).__init__(args)
        self.cts_eta = args.cts_eta
        self.cts_beta = args.cts_beta
        self.ae_delta = args.ae_delta
        self.batch_size = args.batch_update_size
        self.replay_memory = ReplayMemory(
            args.replay_size,
            self.local_network_upper.get_input_shape(),
            # self.local_network.get_input_shape(),
            self.num_actions)
        # init density model (chooses how many steps between updates)
        # 20 * q target update steps
        self._init_density_model(args)
        #computes loss
        self._double_dqn_op()
        self.which_net_to_update_counter = 0
        self.ae_counter = 0
        self.epsilon_greedy_counter = 0
        self.total_ae_counter = 0
        self.total_epsilon_greedy_counter = 0
        self.q_values_upper_max = []
        self.q_values_lower_max = []
        self.ae_valid_actions = True
        self.action_meanings = self.emulator.env.unwrapped.get_action_meanings(
        )
        self.minimized_actions_counter = {
            value: 0
            for value in self.action_meanings
        }
        print(self.minimized_actions_counter)
Example #3
    def __init__(self, args):

        super(BasePGQLearner, self).__init__(args)

        self.q_update_counter = 0
        self.replay_size = args.replay_size
        self.pgq_fraction = args.pgq_fraction
        self.batch_update_size = args.batch_update_size
        scope_name = 'local_learning_{}'.format(self.actor_id)
        conf_learning = {'name': scope_name,
                         'input_shape': self.input_shape,
                         'num_act': self.num_actions,
                         'args': args}

        with tf.device('/cpu:0'):
            self.local_network = PolicyValueNetwork(conf_learning)
        with tf.device('/gpu:0'), tf.variable_scope('', reuse=True):
            self.batch_network = PolicyValueNetwork(conf_learning)
            self._build_q_ops()

        self.reset_hidden_state()
        self.replay_memory = ReplayMemory(
            self.replay_size,
            self.local_network.get_input_shape(),
            self.num_actions)
            
        if self.is_master():
            var_list = self.local_network.params
            self.saver = tf.train.Saver(var_list=var_list, max_to_keep=3, 
                                        keep_checkpoint_every_n_hours=2)
Example #4
 def __init__(self,
              optimizer_spec,
              num_goal=6,
              num_action=2,
              replay_memory_size=10000,
              batch_size=128):
     ###############
     # BUILD MODEL #
     ###############
     self.num_goal = num_goal
     self.num_action = num_action
     self.batch_size = batch_size
     # Construct meta-controller and controller
     self.meta_controller = MetaController().type(dtype)
     self.target_meta_controller = MetaController().type(dtype)
     self.controller = Controller().type(dtype)
     self.target_controller = Controller().type(dtype)
     # Construct the optimizers for meta-controller and controller
     self.meta_optimizer = optimizer_spec.constructor(
         self.meta_controller.parameters(), **optimizer_spec.kwargs)
     self.ctrl_optimizer = optimizer_spec.constructor(
         self.controller.parameters(), **optimizer_spec.kwargs)
     # Construct the replay memory for meta-controller and controller
     self.meta_replay_memory = ReplayMemory(replay_memory_size)
     self.ctrl_replay_memory = ReplayMemory(replay_memory_size)
Example #5
 def __init__(self,
              optimizer_spec,
              num_goal=81,
              num_action=81,
              replay_memory_size=10000,
              subgoals=81,
              screen_size=(500, 500),
              batch_size=128):
     ###############
     # BUILD MODEL #
     ###############
     self.num_goal = num_goal
     self.num_action = num_action
     self.batch_size = batch_size
     # Construct meta-controller and controller
     self.meta_controller = MetaController().type(dtype)
     self.target_meta_controller = MetaController().type(dtype)
     self.controller = Controller().type(dtype)
     self.target_controller = Controller().type(dtype)
     # Construct the optimizers for meta-controller and controller
     self.meta_optimizer = optimizer_spec.constructor(self.meta_controller.parameters(), **optimizer_spec.kwargs)
     self.ctrl_optimizer = optimizer_spec.constructor(self.controller.parameters(), **optimizer_spec.kwargs)
     # Construct the replay memory for meta-controller and controller
     self.meta_replay_memory = ReplayMemory(replay_memory_size)
     self.ctrl_replay_memory = ReplayMemory(replay_memory_size)
     self.subgoals = subgoals
     self.screen_size = screen_size
     self.idx_2_action = self.action_dict()
Example #6
    def __init__(self):
        super(DQNDoubleQAgent, self).__init__()
        self.training = False
        self.max_frames = 2000000
        self._epsilon = Epsilon(start=1.0, end=0.1, update_increment=0.0001)
        self.gamma = 0.99
        self.train_q_per_step = 4
        self.train_q_batch_size = 256
        self.steps_before_training = 10000
        self.target_q_update_frequency = 50000

        self._Q_weights_path = "./data/SC2DoubleQAgent"
        self._Q = DQNCNN()
        if os.path.isfile(self._Q_weights_path):
            self._Q.load_state_dict(torch.load(self._Q_weights_path))
            print("Loading weights:", self._Q_weights_path)
        self._Qt = copy.deepcopy(self._Q)
        self._Q.cuda()
        self._Qt.cuda()
        self._optimizer = optim.Adam(self._Q.parameters(), lr=1e-8)
        self._criterion = nn.MSELoss()
        self._memory = ReplayMemory(100000)

        self._loss = deque(maxlen=1000)
        self._max_q = deque(maxlen=1000)
        self._action = None
        self._screen = None
        self._fig = plt.figure()
        self._plot = [plt.subplot(2, 2, i + 1) for i in range(4)]

        self._screen_size = 28
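
This agent keeps an online network _Q and a frozen copy _Qt, the setup for a double-Q target (the class is named DQNDoubleQAgent). The training step itself is not shown, so the sketch below only illustrates that target under assumed batch tensors; the function name and argument shapes are assumptions.

import torch

def double_q_target(q_net, target_net, reward, next_state, done, gamma=0.99):
    # Double DQN: the online net picks the greedy action, the target net evaluates it.
    with torch.no_grad():
        best_action = q_net(next_state).argmax(dim=1, keepdim=True)
        next_q = target_net(next_state).gather(1, best_action).squeeze(1)
        return reward + gamma * next_q * (1.0 - done.float())
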
Example #7
    def __init__(self, config):
        self.config = config

        self.logger = logging.getLogger("DQNAgent")

        # define models (policy and target)
        self.policy_model = DQN(self.config)
        self.target_model = DQN(self.config)

        # define memory
        self.memory = ReplayMemory(self.config)

        # define loss
        self.loss = HuberLoss()

        # define optimizer
        self.optim = torch.optim.RMSprop(self.policy_model.parameters())

        # define environment
        self.env = gym.make('CartPole-v0').unwrapped
        self.cartpole = CartPoleEnv(self.config.screen_width)

        # initialize counter
        self.current_episode = 0
        self.current_iteration = 0
        self.episode_durations = []

        self.batch_size = self.config.batch_size

        # set cuda flag
        self.is_cuda = torch.cuda.is_available()
        if self.is_cuda and not self.config.cuda:
            self.logger.info(
                "WARNING: You have a CUDA device, so you should probably enable CUDA"
            )

        self.cuda = self.is_cuda & self.config.cuda

        if self.cuda:
            self.logger.info("Program will run on *****GPU-CUDA***** ")
            print_cuda_statistics()
            self.device = torch.device("cuda")
            torch.cuda.set_device(self.config.gpu_device)
        else:
            self.logger.info("Program will run on *****CPU***** ")
            self.device = torch.device("cpu")

        self.policy_model = self.policy_model.to(self.device)
        self.target_model = self.target_model.to(self.device)
        self.loss = self.loss.to(self.device)

        # Initialize Target model with policy model state dict
        self.target_model.load_state_dict(self.policy_model.state_dict())
        self.target_model.eval()

        # Summary Writer
        self.summary_writer = SummaryWriter(log_dir=self.config.summary_dir,
                                            comment='DQN')
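
The companion select_action for this agent (repeated as Example #26 at the end of this listing, where it is cut off mid-formula) anneals eps_threshold between config.eps_start and config.eps_end. As a hedged reference only, the schedule below is the standard exponential decay from the common PyTorch DQN recipe; the default values are illustrative and not taken from this repository.

import math

def epsilon_by_step(step, eps_start=0.9, eps_end=0.05, eps_decay=200):
    # Exponential annealing from eps_start toward eps_end as the step count grows.
    return eps_end + (eps_start - eps_end) * math.exp(-step / eps_decay)
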
Example #8
    def __init__(self, args):
        self.final_epsilon = args.final_epsilon
        super(PseudoCountQLearner, self).__init__(args)

        self.cts_eta = args.cts_eta
        self.cts_beta = args.cts_beta
        self.batch_size = args.batch_update_size
        self.replay_memory = ReplayMemory(args.replay_size)

        self._init_density_model(args)
        self._double_dqn_op()
Example #9
 def test_zero_step(self):
   self.memory = ReplayMemory(capacity=10, multi_step_n=0)
   for i in range(5):
     a = Transition([0, 1, 2, i], 0, [4, 5, 6, i*i], 1, False)
     self.memory.push(a)
   final = Transition([0, 1, 2, 10], 0, [4, 5, 6, 100], 10, True)
   self.memory.push(final)
   self.assertEqual(self.memory.memory[0].r, 1)
   self.assertEqual(self.memory.memory[3].r, 1)
   self.assertEqual(self.memory.memory[4].r, 1)
   self.assertEqual(self.memory.memory[5].r, 10)
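
The test above pins down the multi_step_n=0 behaviour: rewards are stored exactly as pushed. For context, one plausible implementation of the n-step case folds each newly pushed reward back into the previous n entries; everything below (the Transition stand-in, gamma, and the in-place update) is an assumption for illustration, not the ReplayMemory under test.

from dataclasses import dataclass

@dataclass
class _Transition:
    r: float  # minimal stand-in carrying only the reward field the test inspects

def fold_back(buffer, new_reward, n, gamma=0.99):
    # Add the discounted new reward into the previous n stored transitions.
    for k in range(1, min(n, len(buffer)) + 1):
        buffer[-k].r += (gamma ** k) * new_reward
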
Example #10
    def __init__(self, args):
        super(PseudoCountQLearner, self).__init__(args)

        self.cts_eta = .9
        self.batch_size = 32
        self.replay_memory = ReplayMemory(args.replay_size)

        # more cython tuning could be useful here
        self.density_model = CTSDensityModel(height=args.cts_rescale_dim,
                                             width=args.cts_rescale_dim,
                                             num_bins=args.cts_bins,
                                             beta=0.05)
Example #11
    def __init__(self, args):
        self.args = args
        super(PseudoCountQLearner, self).__init__(args)

        self.cts_eta = args.cts_eta
        self.cts_beta = args.cts_beta
        self.batch_size = args.batch_update_size
        self.replay_memory = ReplayMemory(args.replay_size,
                                          self.local_network.get_input_shape(),
                                          self.num_actions)

        self._init_density_model(args)
        self._double_dqn_op()
Example #12
    def __init__(self, args):
        self.args = args
        super(PseudoCountQLearner, self).__init__(args)

        self.cts_eta = args.cts_eta
        self.cts_beta = args.cts_beta
        self.batch_size = args.batch_update_size
        self.replay_memory = ReplayMemory(args.replay_size,
                                          self.local_network.get_input_shape(),
                                          self.num_actions)
        # init density model (chooses how many steps between updates)
        # 20 * q target update steps
        self._init_density_model(args)
        #computes loss
        self._double_dqn_op()
Example #13
    def _build_q_ops(self):
        # pgq specific initialization
        self.pgq_fraction = self.pgq_fraction
        self.batch_size = self.batch_update_size
        self.replay_memory = ReplayMemory(self.replay_size)
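        # The matching batch update (see batch_q_update in Example #19) feeds input_ph with
        # np.vstack([s_i, s_f]), so the axis-0 splits below separate the values for s_i
        # (first half of the batch) from the values for s_{i+1} (second half).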
        self.q_tilde = self.batch_network.beta * (
            self.batch_network.log_output_layer_pi +
            tf.expand_dims(self.batch_network.output_layer_entropy,
                           1)) + self.batch_network.output_layer_v

        self.Qi, self.Qi_plus_1 = tf.split(axis=0,
                                           num_or_size_splits=2,
                                           value=self.q_tilde)
        self.V, _ = tf.split(axis=0,
                             num_or_size_splits=2,
                             value=self.batch_network.output_layer_v)
        self.log_pi, _ = tf.split(
            axis=0,
            num_or_size_splits=2,
            value=tf.expand_dims(self.batch_network.log_output_selected_action,
                                 1))
        self.R = tf.placeholder('float32', [None], name='1-step_reward')

        self.terminal_indicator = tf.placeholder(tf.float32, [None],
                                                 name='terminal_indicator')
        self.max_TQ = self.gamma * tf.reduce_max(
            self.Qi_plus_1, 1) * (1 - self.terminal_indicator)
        self.Q_a = tf.reduce_sum(
            self.Qi * tf.split(axis=0,
                               num_or_size_splits=2,
                               value=self.batch_network.selected_action_ph)[0],
            1)

        self.q_objective = -self.pgq_fraction * tf.reduce_mean(
            tf.stop_gradient(self.R + self.max_TQ - self.Q_a) *
            (self.V[:, 0] + self.log_pi[:, 0]))

        self.V_params = self.batch_network.params
        self.q_gradients = tf.gradients(self.q_objective, self.V_params)

        if self.batch_network.clip_norm_type == 'global':
            self.q_gradients = tf.clip_by_global_norm(
                self.q_gradients, self.batch_network.clip_norm)[0]
        elif self.batch_network.clip_norm_type == 'local':
            self.q_gradients = [
                tf.clip_by_norm(g, self.batch_network.clip_norm)
                for g in self.q_gradients
            ]
Example #14
File: NEC.py Project: jlrussin/RL_project
 def __init__(self, env, args, device='cpu'):
     """
     Instantiate an NEC Agent

      Parameters
      ----------
     env: gym.Env
         gym environment to train on
     args: args class from argparser
          args come from train.py: see train.py for help with each arg
     device: string
         'cpu' or 'cuda:0' depending on use_cuda flag from train.py
     """
     self.environment_type = args.environment_type
     self.env = env
     self.device = device
     # Hyperparameters
     self.epsilon = args.initial_epsilon
     self.final_epsilon = args.final_epsilon
     self.epsilon_decay = args.epsilon_decay
     self.gamma = args.gamma
     self.N = args.N
     # Transition queue and replay memory
     self.transition_queue = []
     self.replay_every = args.replay_every
     self.replay_buffer_size = args.replay_buffer_size
     self.replay_memory = ReplayMemory(self.replay_buffer_size)
     # CNN for state embedding network
     self.frames_to_stack = args.frames_to_stack
     self.embedding_size = args.embedding_size
     self.in_height = args.in_height
     self.in_width = args.in_width
     self.cnn = CNN(self.frames_to_stack, self.embedding_size,
                    self.in_height, self.in_width).to(self.device)
     # Differentiable Neural Dictionary (DND): one for each action
     self.kernel = inverse_distance
     self.num_neighbors = args.num_neighbors
     self.max_memory = args.max_memory
     self.lr = args.lr
     self.dnd_list = []
     for i in range(env.action_space.n):
         self.dnd_list.append(
             DND(self.kernel, self.num_neighbors, self.max_memory,
                 args.optimizer, self.lr))
     # Optimizer for state embedding CNN
     self.q_lr = args.q_lr
     self.batch_size = args.batch_size
     self.optimizer = get_optimizer(args.optimizer, self.cnn.parameters(),
                                    self.lr)
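
The dnd_list, the inverse_distance kernel, and num_neighbors above make up NEC's episodic value lookup. The sketch below shows what that lookup typically computes, a kernel-weighted average of the stored values at the k nearest keys; the function names, tensor shapes, and delta default are assumptions, not this project's DND API.

import torch

def inverse_distance_kernel(h, keys, delta=1e-3):
    # k(h, h_i) = 1 / (||h - h_i||^2 + delta)
    return 1.0 / (torch.sum((h - keys) ** 2, dim=-1) + delta)

def dnd_q_value(h, keys, values, num_neighbors=50):
    # Q(s, a): kernel-weighted average over the k nearest keys of one action's DND.
    w = inverse_distance_kernel(h, keys)
    top_w, idx = torch.topk(w, min(num_neighbors, keys.shape[0]))
    top_w = top_w / top_w.sum()
    return torch.sum(top_w * values[idx])
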
Example #15
 def __init__(self,
              environment_name="CartPole-v1",
              replay_memory_size=10000,
              action_threshold=0.7,
              batch_size=64,
              gamma=0.9):
     self.environment = gym.make(environment_name)
     state = self.environment.reset()
     self.state_shape = state.shape
     self.action_space = self.environment.action_space.n
     self.replay_memory = ReplayMemory(self.state_shape,
                                       capacity=replay_memory_size)
     self.model = self.build_network()
     self.target_model = self.build_network()
     self.action_threshold = action_threshold
     self.batch_size = batch_size
     self.gamma = gamma
Example #16
    def __init__(
        self,
        state_size,
        n_actions,
        args,
        device=torch.device("cuda" if torch.cuda.is_available() else "cpu")):
        self.device = device

        # Exploration / Exploitation params.
        self.steps_done = 0
        self.eps_threshold = 1
        self.eps_start = args.eps_start
        self.eps_end = args.eps_end
        self.eps_decay = args.eps_decay

        # RL params
        self.target_update = args.target_update
        self.discount = args.discount

        # Env params
        self.n_actions = n_actions
        self.state_size = state_size

        # Deep q networks params
        self.layers = args.layers
        self.batch_size = args.batch_size
        self.policy_net = DQN(state_size, n_actions,
                              layers=self.layers).to(self.device).float()
        self.target_net = None
        self.grad_clip = args.grad_clip

        if str(args.optimizer).lower() == 'adam':
            self.optimizer = optim.Adam(self.policy_net.parameters())
        elif str(args.optimizer).lower() == 'rmsprop':
            self.optimizer = optim.RMSprop(self.policy_net.parameters())
        else:
            raise NotImplementedError

        self.memory = ReplayMemory(args.replay_size)

        # Performance buffers.
        self.rewards_list = []
Example #17
 def __init__(self,
              environment_name="Acrobot-v1",
              replay_memory_size=10000,
              action_threshold=0.7,
              batch_size=64,
              gamma=0.9):
     super(MotionAthlete,
           self).__init__(environment_name, replay_memory_size,
                          action_threshold, batch_size, gamma)
     self.environment.close()
     del self.environment
     self.environment = EnvironmentWrapper(environment_name)
     frame = self.environment.reset()
     frame_shape = frame.shape
     self.motion_tracer = MotionTracer(frame_shape=frame_shape)
     self.state_shape = self.motion_tracer.state_shape
     self.replay_memory = ReplayMemory(self.state_shape,
                                       capacity=replay_memory_size)
     del self.model
     del self.target_model
     self.model = self.build_network()
     self.target_model = self.build_network()
Example #18
class BasePGQLearner(BaseA3CLearner):
    def __init__(self, args):

        super(BasePGQLearner, self).__init__(args)

        # args.entropy_regularisation_strength = 0.0
        conf_learning = {
            'name': 'local_learning_{}'.format(self.actor_id),
            'input_shape': self.input_shape,
            'num_act': self.num_actions,
            'args': args
        }

        self.local_network = PolicyValueNetwork(conf_learning)
        self.reset_hidden_state()

        if self.is_master():
            var_list = self.local_network.params
            self.saver = tf.train.Saver(var_list=var_list,
                                        max_to_keep=3,
                                        keep_checkpoint_every_n_hours=2)

        # pgq specific initialization
        self.batch_size = 32
        self.pgq_fraction = args.pgq_fraction
        self.replay_memory = ReplayMemory(args.replay_size)
        self.q_tilde = self.local_network.beta * (
            self.local_network.log_output_layer_pi +
            tf.expand_dims(self.local_network.output_layer_entropy,
                           1)) + self.local_network.output_layer_v

        self.Qi, self.Qi_plus_1 = tf.split(axis=0,
                                           num_or_size_splits=2,
                                           value=self.q_tilde)
        self.V, _ = tf.split(axis=0,
                             num_or_size_splits=2,
                             value=self.local_network.output_layer_v)
        self.log_pi, _ = tf.split(
            axis=0,
            num_or_size_splits=2,
            value=tf.expand_dims(self.local_network.log_output_selected_action,
                                 1))
        self.R = tf.placeholder('float32', [None], name='1-step_reward')

        self.terminal_indicator = tf.placeholder(tf.float32, [None],
                                                 name='terminal_indicator')
        self.max_TQ = self.gamma * tf.reduce_max(
            self.Qi_plus_1, 1) * (1 - self.terminal_indicator)
        self.Q_a = tf.reduce_sum(
            self.Qi * tf.split(axis=0,
                               num_or_size_splits=2,
                               value=self.local_network.selected_action_ph)[0],
            1)

        self.q_objective = -self.pgq_fraction * tf.reduce_mean(
            tf.stop_gradient(self.R + self.max_TQ - self.Q_a) *
            (self.V[:, 0] + self.log_pi[:, 0]))

        self.V_params = self.local_network.params
        self.q_gradients = tf.gradients(self.q_objective, self.V_params)

        if self.local_network.clip_norm_type == 'global':
            self.q_gradients = tf.clip_by_global_norm(
                self.q_gradients, self.local_network.clip_norm)[0]
        elif self.local_network.clip_norm_type == 'local':
            self.q_gradients = [
                tf.clip_by_norm(g, self.local_network.clip_norm)
                for g in self.q_gradients
            ]

        if (self.optimizer_mode == "local"):
            if (self.optimizer_type == "rmsprop"):
                self.batch_opt_st = np.ones(size, dtype=ctypes.c_float)
            else:
                self.batch_opt_st = np.zeros(size, dtype=ctypes.c_float)
        elif (self.optimizer_mode == "shared"):
            self.batch_opt_st = args.batch_opt_state

    def apply_batch_q_update(self):
        s_i, a_i, r_i, s_f, is_terminal = self.replay_memory.sample_batch(
            self.batch_size)

        batch_grads, max_TQ, Q_a = self.session.run(
            [self.q_gradients, self.max_TQ, self.Q_a],
            feed_dict={
                self.R: r_i,
                self.local_network.selected_action_ph: np.vstack([a_i, a_i]),
                self.local_network.input_ph: np.vstack([s_i, s_f]),
                self.terminal_indicator: is_terminal.astype(np.int),
            })
        # print 'max_TQ={}, Q_a={}'.format(max_TQ[:5], Q_a[:5])

        self._apply_gradients_to_shared_memory_vars(batch_grads,
                                                    opt_st=self.batch_opt_st)

    def softmax(self, x, temperature):
        x /= temperature
        exp_x = np.exp(x - np.max(x))

        return exp_x / exp_x.sum()
Example #19
class BasePGQLearner(BaseA3CLearner):
    def __init__(self, args):

        super(BasePGQLearner, self).__init__(args)

        self.q_update_counter = 0
        self.replay_size = args.replay_size
        self.pgq_fraction = args.pgq_fraction
        self.batch_update_size = args.batch_update_size
        scope_name = 'local_learning_{}'.format(self.actor_id)
        conf_learning = {'name': scope_name,
                         'input_shape': self.input_shape,
                         'num_act': self.num_actions,
                         'args': args}

        with tf.device('/cpu:0'):
            self.local_network = PolicyValueNetwork(conf_learning)
        with tf.device('/gpu:0'), tf.variable_scope('', reuse=True):
            self.batch_network = PolicyValueNetwork(conf_learning)
            self._build_q_ops()

        self.reset_hidden_state()
        self.replay_memory = ReplayMemory(
            self.replay_size,
            self.local_network.get_input_shape(),
            self.num_actions)
            
        if self.is_master():
            var_list = self.local_network.params
            self.saver = tf.train.Saver(var_list=var_list, max_to_keep=3, 
                                        keep_checkpoint_every_n_hours=2)


    def _build_q_ops(self):
        # pgq specific initialization
        self.pgq_fraction = self.pgq_fraction
        self.batch_size = self.batch_update_size
        self.q_tilde = self.batch_network.beta * (
            self.batch_network.log_output_layer_pi
            + tf.expand_dims(self.batch_network.output_layer_entropy, 1)
        ) + self.batch_network.output_layer_v

        self.Qi, self.Qi_plus_1 = tf.split(axis=0, num_or_size_splits=2, value=self.q_tilde)
        self.V, _ = tf.split(axis=0, num_or_size_splits=2, value=self.batch_network.output_layer_v)
        self.log_pi, _ = tf.split(axis=0, num_or_size_splits=2, value=tf.expand_dims(self.batch_network.log_output_selected_action, 1))
        self.R = tf.placeholder('float32', [None], name='1-step_reward')

        self.terminal_indicator = tf.placeholder(tf.float32, [None], name='terminal_indicator')
        self.max_TQ = self.gamma*tf.reduce_max(self.Qi_plus_1, 1) * (1 - self.terminal_indicator)
        self.Q_a = tf.reduce_sum(self.Qi * tf.split(axis=0, num_or_size_splits=2, value=self.batch_network.selected_action_ph)[0], 1)

        self.q_objective = - self.pgq_fraction * tf.reduce_mean(tf.stop_gradient(self.R + self.max_TQ - self.Q_a) * (0.5 * self.V[:, 0] + self.log_pi[:, 0]))

        self.V_params = self.batch_network.params
        self.q_gradients = tf.gradients(self.q_objective, self.V_params)
        self.q_gradients = self.batch_network._clip_grads(self.q_gradients)


    def batch_q_update(self):
        if len(self.replay_memory) < self.replay_memory.maxlen//10:
            return

        s_i, a_i, r_i, s_f, is_terminal = self.replay_memory.sample_batch(self.batch_size)

        batch_grads = self.session.run(
            self.q_gradients,
            feed_dict={
                self.R: r_i,
                self.batch_network.selected_action_ph: np.vstack([a_i, a_i]),
                self.batch_network.input_ph: np.vstack([s_i, s_f]),
                self.terminal_indicator: is_terminal.astype(np.int),
            }
        )
        self.apply_gradients_to_shared_memory_vars(batch_grads)
Example #20
    def __init__(self, args):

        super(BasePGQLearner, self).__init__(args)

        # args.entropy_regularisation_strength = 0.0
        conf_learning = {
            'name': 'local_learning_{}'.format(self.actor_id),
            'input_shape': self.input_shape,
            'num_act': self.num_actions,
            'args': args
        }

        self.local_network = PolicyValueNetwork(conf_learning)
        self.reset_hidden_state()

        if self.is_master():
            var_list = self.local_network.params
            self.saver = tf.train.Saver(var_list=var_list,
                                        max_to_keep=3,
                                        keep_checkpoint_every_n_hours=2)

        # pgq specific initialization
        self.batch_size = 32
        self.pgq_fraction = args.pgq_fraction
        self.replay_memory = ReplayMemory(args.replay_size)
        self.q_tilde = self.local_network.beta * (
            self.local_network.log_output_layer_pi +
            tf.expand_dims(self.local_network.output_layer_entropy,
                           1)) + self.local_network.output_layer_v

        self.Qi, self.Qi_plus_1 = tf.split(axis=0,
                                           num_or_size_splits=2,
                                           value=self.q_tilde)
        self.V, _ = tf.split(axis=0,
                             num_or_size_splits=2,
                             value=self.local_network.output_layer_v)
        self.log_pi, _ = tf.split(
            axis=0,
            num_or_size_splits=2,
            value=tf.expand_dims(self.local_network.log_output_selected_action,
                                 1))
        self.R = tf.placeholder('float32', [None], name='1-step_reward')

        self.terminal_indicator = tf.placeholder(tf.float32, [None],
                                                 name='terminal_indicator')
        self.max_TQ = self.gamma * tf.reduce_max(
            self.Qi_plus_1, 1) * (1 - self.terminal_indicator)
        self.Q_a = tf.reduce_sum(
            self.Qi * tf.split(axis=0,
                               num_or_size_splits=2,
                               value=self.local_network.selected_action_ph)[0],
            1)

        self.q_objective = -self.pgq_fraction * tf.reduce_mean(
            tf.stop_gradient(self.R + self.max_TQ - self.Q_a) *
            (self.V[:, 0] + self.log_pi[:, 0]))

        self.V_params = self.local_network.params
        self.q_gradients = tf.gradients(self.q_objective, self.V_params)

        if self.local_network.clip_norm_type == 'global':
            self.q_gradients = tf.clip_by_global_norm(
                self.q_gradients, self.local_network.clip_norm)[0]
        elif self.local_network.clip_norm_type == 'local':
            self.q_gradients = [
                tf.clip_by_norm(g, self.local_network.clip_norm)
                for g in self.q_gradients
            ]

        if (self.optimizer_mode == "local"):
            if (self.optimizer_type == "rmsprop"):
                self.batch_opt_st = np.ones(size, dtype=ctypes.c_float)
            else:
                self.batch_opt_st = np.zeros(size, dtype=ctypes.c_float)
        elif (self.optimizer_mode == "shared"):
            self.batch_opt_st = args.batch_opt_state
Example #21
class PseudoCountQLearner(ValueBasedLearner, DensityModelMixin):
    """
    Based on DQN+CTS model from the paper 'Unifying Count-Based Exploration and Intrinsic Motivation' (https://arxiv.org/abs/1606.01868)
    Presently the implementation differs from the paper in that the novelty bonuses are computed online rather than by computing the
    prediction gains after the model has been updated with all frames from the episode. Async training with different final epsilon values
    tends to produce better results than just using a single actor-learner.
    """
    def __init__(self, args):
        self.args = args
        super(PseudoCountQLearner, self).__init__(args)

        self.cts_eta = args.cts_eta
        self.cts_beta = args.cts_beta
        self.batch_size = args.batch_update_size
        self.replay_memory = ReplayMemory(args.replay_size,
                                          self.local_network.get_input_shape(),
                                          self.num_actions)

        self._init_density_model(args)
        self._double_dqn_op()

    def generate_final_epsilon(self):
        if self.num_actor_learners == 1:
            return self.args.final_epsilon
        else:
            return super(PseudoCountQLearner, self).generate_final_epsilon()

    def _get_summary_vars(self):
        q_vars = super(PseudoCountQLearner, self)._get_summary_vars()

        bonus_q05 = tf.Variable(0., name='novelty_bonus_q05')
        s1 = tf.summary.scalar('Novelty_Bonus_q05_{}'.format(self.actor_id),
                               bonus_q05)
        bonus_q50 = tf.Variable(0., name='novelty_bonus_q50')
        s2 = tf.summary.scalar('Novelty_Bonus_q50_{}'.format(self.actor_id),
                               bonus_q50)
        bonus_q95 = tf.Variable(0., name='novelty_bonus_q95')
        s3 = tf.summary.scalar('Novelty_Bonus_q95_{}'.format(self.actor_id),
                               bonus_q95)

        augmented_reward = tf.Variable(0., name='augmented_episode_reward')
        s4 = tf.summary.scalar(
            'Augmented_Episode_Reward_{}'.format(self.actor_id),
            augmented_reward)

        return q_vars + [bonus_q05, bonus_q50, bonus_q95, augmented_reward]

    #TODO: refactor to make this cleaner
    def prepare_state(self, state, total_episode_reward, steps_at_last_reward,
                      ep_t, episode_ave_max_q, episode_over, bonuses,
                      total_augmented_reward):
        # Start a new game on reaching terminal state
        if episode_over:
            T = self.global_step.value() * self.max_local_steps
            t = self.local_step
            e_prog = float(t) / self.epsilon_annealing_steps
            episode_ave_max_q = episode_ave_max_q / float(ep_t)
            s1 = "Q_MAX {0:.4f}".format(episode_ave_max_q)
            s2 = "EPS {0:.4f}".format(self.epsilon)

            self.scores.insert(0, total_episode_reward)
            if len(self.scores) > 100:
                self.scores.pop()

            logger.info('T{0} / STEP {1} / REWARD {2} / {3} / {4}'.format(
                self.actor_id, T, total_episode_reward, s1, s2))
            logger.info(
                'ID: {0} -- RUNNING AVG: {1:.0f} ± {2:.0f} -- BEST: {3:.0f}'.
                format(
                    self.actor_id,
                    np.array(self.scores).mean(),
                    2 * np.array(self.scores).std(),
                    max(self.scores),
                ))

            self.log_summary(
                total_episode_reward,
                episode_ave_max_q,
                self.epsilon,
                np.percentile(bonuses, 5),
                np.percentile(bonuses, 50),
                np.percentile(bonuses, 95),
                total_augmented_reward,
            )

            state = self.emulator.get_initial_state()
            ep_t = 0
            total_episode_reward = 0
            episode_ave_max_q = 0
            episode_over = False

        return (state, total_episode_reward, steps_at_last_reward, ep_t,
                episode_ave_max_q, episode_over)

    def _double_dqn_op(self):
        q_local_action = tf.cast(
            tf.argmax(self.local_network.output_layer, axis=1), tf.int32)
        q_target_max = utils.ops.slice_2d(
            self.target_network.output_layer,
            tf.range(0, self.batch_size),
            q_local_action,
        )
        self.one_step_reward = tf.placeholder(tf.float32,
                                              self.batch_size,
                                              name='one_step_reward')
        self.is_terminal = tf.placeholder(tf.bool,
                                          self.batch_size,
                                          name='is_terminal')

        self.y_target = self.one_step_reward + self.cts_eta*self.gamma*q_target_max \
            * (1 - tf.cast(self.is_terminal, tf.float32))

        self.double_dqn_loss = self.local_network._value_function_loss(
            self.local_network.q_selected_action -
            tf.stop_gradient(self.y_target))

        self.double_dqn_grads = tf.gradients(self.double_dqn_loss,
                                             self.local_network.params)

    # def batch_update(self):
    #     if len(self.replay_memory) < self.replay_memory.maxlen//10:
    #         return

    #     s_i, a_i, r_i, s_f, is_terminal = self.replay_memory.sample_batch(self.batch_size)

    #     feed_dict={
    #         self.one_step_reward: r_i,
    #         self.target_network.input_ph: s_f,
    #         self.local_network.input_ph: np.vstack([s_i, s_f]),
    #         self.local_network.selected_action_ph: np.vstack([a_i, a_i]),
    #         self.is_terminal: is_terminal
    #     }
    #     grads = self.session.run(self.double_dqn_grads, feed_dict=feed_dict)
    #     self.apply_gradients_to_shared_memory_vars(grads)

    def batch_update(self):
        if len(self.replay_memory) < self.replay_memory.maxlen // 10:
            return

        s_i, a_i, r_i, s_f, is_terminal = self.replay_memory.sample_batch(
            self.batch_size)

        feed_dict = {
            self.local_network.input_ph: s_f,
            self.target_network.input_ph: s_f,
            self.is_terminal: is_terminal,
            self.one_step_reward: r_i,
        }
        y_target = self.session.run(self.y_target, feed_dict=feed_dict)

        feed_dict = {
            self.local_network.input_ph: s_i,
            self.local_network.target_ph: y_target,
            self.local_network.selected_action_ph: a_i
        }
        grads = self.session.run(self.local_network.get_gradients,
                                 feed_dict=feed_dict)
        self.apply_gradients_to_shared_memory_vars(grads)

    def train(self):
        """ Main actor learner loop for n-step Q learning. """
        logger.debug("Actor {} resuming at Step {}, {}".format(
            self.actor_id, self.global_step.value(), time.ctime()))

        s = self.emulator.get_initial_state()

        s_batch = list()
        a_batch = list()
        y_batch = list()
        bonuses = deque(maxlen=1000)
        episode_over = False

        t0 = time.time()
        global_steps_at_last_record = self.global_step.value()
        while (self.global_step.value() < self.max_global_steps):
            # # Sync local learning net with shared mem
            # self.sync_net_with_shared_memory(self.local_network, self.learning_vars)
            # self.save_vars()
            rewards = list()
            states = list()
            actions = list()
            max_q_values = list()
            local_step_start = self.local_step
            total_episode_reward = 0
            total_augmented_reward = 0
            episode_ave_max_q = 0
            ep_t = 0

            while not episode_over:
                # Sync local learning net with shared mem
                self.sync_net_with_shared_memory(self.local_network,
                                                 self.learning_vars)
                self.save_vars()

                # Choose next action and execute it
                a, q_values = self.choose_next_action(s)

                new_s, reward, episode_over = self.emulator.next(a)
                total_episode_reward += reward
                max_q = np.max(q_values)

                current_frame = new_s[..., -1]
                bonus = self.density_model.update(current_frame)
                bonuses.append(bonus)

                # Rescale or clip immediate reward
                reward = self.rescale_reward(
                    self.rescale_reward(reward) + bonus)
                total_augmented_reward += reward
                ep_t += 1

                rewards.append(reward)
                states.append(s)
                actions.append(a)
                max_q_values.append(max_q)

                s = new_s
                self.local_step += 1
                episode_ave_max_q += max_q

                global_step, _ = self.global_step.increment()

                if global_step % self.q_target_update_steps == 0:
                    self.update_target()
                if global_step % self.density_model_update_steps == 0:
                    self.write_density_model()

                # Sync local tensorflow target network params with shared target network params
                if self.target_update_flags.updated[self.actor_id] == 1:
                    self.sync_net_with_shared_memory(self.target_network,
                                                     self.target_vars)
                    self.target_update_flags.updated[self.actor_id] = 0
                if self.density_model_update_flags.updated[self.actor_id] == 1:
                    self.read_density_model()
                    self.density_model_update_flags.updated[self.actor_id] = 0

                if self.local_step % self.q_update_interval == 0:
                    self.batch_update()

                if self.is_master() and (self.local_step % 500 == 0):
                    bonus_array = np.array(bonuses)
                    steps = global_step - global_steps_at_last_record
                    global_steps_at_last_record = global_step

                    logger.debug(
                        'Mean Bonus={:.4f} / Max Bonus={:.4f} / STEPS/s={}'.
                        format(bonus_array.mean(), bonus_array.max(),
                               steps / float(time.time() - t0)))
                    t0 = time.time()

            else:
                #compute monte carlo return
                mc_returns = np.zeros((len(rewards), ), dtype=np.float32)
                running_total = 0.0
                for i, r in enumerate(reversed(rewards)):
                    running_total = r + self.gamma * running_total
                    mc_returns[len(rewards) - i - 1] = running_total

                mixed_returns = self.cts_eta * np.asarray(rewards) + (
                    1 - self.cts_eta) * mc_returns

                #update replay memory
                states.append(new_s)
                episode_length = len(rewards)
                for i in range(episode_length):
                    self.replay_memory.append(states[i], actions[i],
                                              mixed_returns[i],
                                              i + 1 == episode_length)

            s, total_episode_reward, _, ep_t, episode_ave_max_q, episode_over = \
                self.prepare_state(s, total_episode_reward, self.local_step, ep_t, episode_ave_max_q, episode_over, bonuses, total_augmented_reward)
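
The class docstring above notes that the novelty bonuses are the density model's online prediction gains. The sketch below only illustrates the pseudo-count bonus formula from the cited paper (Bellemare et al., 2016, https://arxiv.org/abs/1606.01868); the internals of CTSDensityModel.update are not shown in this listing, and the beta default here is illustrative.

import math

def pseudo_count_bonus(log_p_before, log_p_after, beta=0.05):
    # Prediction gain of the density model on the observed frame.
    prediction_gain = max(0.0, log_p_after - log_p_before)
    # Approximate pseudo-count: N ~ 1 / (exp(PG) - 1), infinite when the model gains nothing.
    pseudo_count = 1.0 / math.expm1(prediction_gain) if prediction_gain > 0 else float('inf')
    # Exploration bonus beta / sqrt(N + 0.01).
    return beta / math.sqrt(pseudo_count + 0.01)
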
Example #22
        torch.cuda.manual_seed(args.seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

    # Define and build DDPG agent
    hidden_size = tuple(args.hidden_size)
    agent = DDPG(args.gamma,
                 args.tau,
                 hidden_size,
                 env.observation_space.shape[0],
                 env.action_space,
                 checkpoint_dir=checkpoint_dir
                 )

    # Initialize replay memory
    memory = ReplayMemory(int(args.replay_size))

    # Initialize OU-Noise
    nb_actions = env.action_space.shape[-1]
    ou_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions),
                                            sigma=float(args.noise_stddev) * np.ones(nb_actions))

    # Define counters and other variables
    start_step = 0
    # timestep = start_step
    if args.load_model:
        # Load agent if necessary
        start_step, memory = agent.load_checkpoint()
    timestep = start_step // 10000 + 1
    rewards, policy_losses, value_losses, mean_test_rewards = [], [], [], []
    epoch = 0
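
OrnsteinUhlenbeckActionNoise is constructed above with only mu and sigma; the sketch below is a minimal version of the process such a class typically implements. The theta and dt defaults are assumptions, and the class name is hypothetical.

import numpy as np

class OUNoiseSketch:
    # x <- x + theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, 1)
    def __init__(self, mu, sigma, theta=0.15, dt=1e-2):
        self.mu = np.asarray(mu, dtype=np.float64)
        self.sigma = np.asarray(sigma, dtype=np.float64)
        self.theta, self.dt = theta, dt
        self.x = np.zeros_like(self.mu)

    def __call__(self):
        self.x = (self.x + self.theta * (self.mu - self.x) * self.dt
                  + self.sigma * np.sqrt(self.dt) * np.random.randn(*self.mu.shape))
        return self.x
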
Example #23
class hDQN():
    """
    The Hierarchical-DQN Agent
    Parameters
    ----------
        optimizer_spec: OptimizerSpec
            Specifying the constructor and kwargs, as well as learning rate schedule
            for the optimizer
        num_goal: int
            The number of goal that agent can choose from
        num_action: int
            The number of action that agent can choose from
        replay_memory_size: int
            How many memories to store in the replay memory.
        batch_size: int
            How many transitions to sample each time experience is replayed.
    """
    def __init__(self,
                 optimizer_spec,
                 num_goal=6,
                 num_action=2,
                 replay_memory_size=10000,
                 batch_size=128):
        ###############
        # BUILD MODEL #
        ###############
        self.num_goal = num_goal
        self.num_action = num_action
        self.batch_size = batch_size
        # Construct meta-controller and controller
        self.meta_controller = MetaController().type(dtype)
        self.target_meta_controller = MetaController().type(dtype)
        self.controller = Controller().type(dtype)
        self.target_controller = Controller().type(dtype)
        # Construct the optimizers for meta-controller and controller
        self.meta_optimizer = optimizer_spec.constructor(
            self.meta_controller.parameters(), **optimizer_spec.kwargs)
        self.ctrl_optimizer = optimizer_spec.constructor(
            self.controller.parameters(), **optimizer_spec.kwargs)
        # Construct the replay memory for meta-controller and controller
        self.meta_replay_memory = ReplayMemory(replay_memory_size)
        self.ctrl_replay_memory = ReplayMemory(replay_memory_size)

    def get_intrinsic_reward(self, goal, state):
        return 1.0 if goal == state else 0.0

    def select_goal(self, state, epsilon):
        sample = random.random()
        if sample > epsilon:
            state = torch.from_numpy(state).type(dtype)
            # Inference only: torch.no_grad() replaces the deprecated volatile=True flag
            # and avoids saving the history
            with torch.no_grad():
                return self.meta_controller(
                    Variable(state)).data.max(1)[1].cpu()
        else:
            return torch.IntTensor([random.randrange(self.num_goal)])

    def select_action(self, joint_state_goal, epsilon):
        sample = random.random()
        if sample > epsilon:
            joint_state_goal = torch.from_numpy(joint_state_goal).type(dtype)
            # Inference only: torch.no_grad() replaces the deprecated volatile=True flag
            # and avoids saving the history
            with torch.no_grad():
                return self.controller(
                    Variable(joint_state_goal)).data.max(1)[1].cpu()
        else:
            return torch.IntTensor([random.randrange(self.num_action)])

    def update_meta_controller(self, gamma=1.0):
        if len(self.meta_replay_memory) < self.batch_size:
            return
        state_batch, goal_batch, next_state_batch, ex_reward_batch, done_mask = \
            self.meta_replay_memory.sample(self.batch_size)
        state_batch = Variable(torch.from_numpy(state_batch).type(dtype))
        goal_batch = Variable(torch.from_numpy(goal_batch).long())
        next_state_batch = Variable(
            torch.from_numpy(next_state_batch).type(dtype))
        ex_reward_batch = Variable(
            torch.from_numpy(ex_reward_batch).type(dtype))
        not_done_mask = Variable(torch.from_numpy(1 - done_mask)).type(dtype)
        if USE_CUDA:
            goal_batch = goal_batch.cuda()
        # Compute current Q values: the meta_controller takes only the state and outputs a value for every state-goal pair.
        # We select the Q value of the goal that was chosen.
        current_Q_values = self.meta_controller(state_batch).gather(
            1, goal_batch.unsqueeze(1))
        # Compute next Q value based on which goal gives max Q values
        # Detach from the current graph since we don't want gradients for next Q to be propagated
        next_max_q = self.target_meta_controller(
            next_state_batch).detach().max(1)[0]
        next_Q_values = not_done_mask * next_max_q
        # Compute the target of the current Q values
        target_Q_values = ex_reward_batch + (gamma * next_Q_values)
        # Compute Bellman error (using Huber loss)
        loss = F.smooth_l1_loss(current_Q_values.view(-1), target_Q_values)

        # Copy Q to target Q before updating parameters of Q
        self.target_meta_controller.load_state_dict(
            self.meta_controller.state_dict())
        # Optimize the model
        self.meta_optimizer.zero_grad()
        loss.backward()
        for param in self.meta_controller.parameters():
            param.grad.data.clamp_(-1, 1)
        self.meta_optimizer.step()

    def update_controller(self, gamma=1.0):
        if len(self.ctrl_replay_memory) < self.batch_size:
            return
        state_goal_batch, action_batch, next_state_goal_batch, in_reward_batch, done_mask = \
            self.ctrl_replay_memory.sample(self.batch_size)
        state_goal_batch = Variable(
            torch.from_numpy(state_goal_batch).type(dtype))
        action_batch = Variable(torch.from_numpy(action_batch).long())
        next_state_goal_batch = Variable(
            torch.from_numpy(next_state_goal_batch).type(dtype))
        in_reward_batch = Variable(
            torch.from_numpy(in_reward_batch).type(dtype))
        not_done_mask = Variable(torch.from_numpy(1 - done_mask)).type(dtype)
        if USE_CUDA:
            action_batch = action_batch.cuda()
        # Compute current Q values: the controller takes only (state, goal) and outputs a value for every (state, goal)-action pair.
        # We select the Q value of the action that was taken.
        current_Q_values = self.controller(state_goal_batch).gather(
            1, action_batch.unsqueeze(1))
        # Compute next Q value based on which action gives max Q values
        # Detach from the current graph since we don't want gradients for next Q to be propagated
        next_max_q = self.target_controller(
            next_state_goal_batch).detach().max(1)[0]
        next_Q_values = not_done_mask * next_max_q
        # Compute the target of the current Q values
        target_Q_values = in_reward_batch + (gamma * next_Q_values)
        # Compute Bellman error (using Huber loss)
        loss = F.smooth_l1_loss(current_Q_values.view(-1), target_Q_values)

        # Copy Q to target Q before updating parameters of Q
        self.target_controller.load_state_dict(self.controller.state_dict())
        # Optimize the model
        self.ctrl_optimizer.zero_grad()
        loss.backward()
        for param in self.controller.parameters():
            param.grad.data.clamp_(-1, 1)
        self.ctrl_optimizer.step()
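
hDQN above only defines goal/action selection and the two update rules; the loop that fills the two replay memories lives in a training script that is not part of this example. The outline below is therefore assumption-heavy: the gym-style env, the packing of states and goals into float32 arrays, and the ReplayMemory.push(...) signature are all guesses for illustration only.

import numpy as np

def hdqn_episode(agent, env, epsilon):
    state, done = env.reset(), False
    while not done:
        goal = agent.select_goal(np.array([[state]], dtype=np.float32), epsilon).item()
        start_state, extrinsic, intrinsic = state, 0.0, 0.0
        while not done and intrinsic == 0.0:
            joint = np.array([[state, goal]], dtype=np.float32)  # controller sees (state, goal)
            action = agent.select_action(joint, epsilon).item()
            next_state, reward, done, _ = env.step(action)
            intrinsic = agent.get_intrinsic_reward(goal, next_state)
            agent.ctrl_replay_memory.push(joint, action,
                                          np.array([[next_state, goal]], dtype=np.float32),
                                          intrinsic, done)
            agent.update_controller()
            extrinsic += reward
            state = next_state
        agent.meta_replay_memory.push(np.array([[start_state]], dtype=np.float32), goal,
                                      np.array([[state]], dtype=np.float32), extrinsic, done)
        agent.update_meta_controller()
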
Example #24
File: nec_agent.py Project: zjyyysj/mgh-hf
    def __init__(self,
               env,
               embedding_network,
               replay_memory=ReplayMemory(100000),
               initial_epsilon=1.0,
               final_epsilon=0.01,
               epsilon_decay=0.99,
               batch_size=8,
               sgd_lr=1e-6,
               q_lr=0.01,
               gamma=0.99,
               lookahead_horizon=100,
               update_period=4,
               kernel=inverse_distance,
               num_neighbors=50,
               max_memory=500000):
        '''
    Instantiate an NEC Agent

    Parameters
    ----------
    env: gym.Env
      gym environment to train on
    embedding_network: torch.nn.Module
      Model to extract the embedding from a state
    replay_memory: ReplayMemory
      Replay memory to sample from for embedding network updates
    initial_epsilon: float
      Initial epsilon for epsilon greedy search
    epsilon_decay: float
      Exponential decay factor for epsilon
    batch_size: int
      Batch size to sample from the replay memory
    sgd_lr: float
      Learning rate to use for RMSProp updates to the embedding network and DND
    q_lr: float
      Learning rate to use for Q-updates on DND updates
    gamma: float
      Discount factor
    lookahead_horizon: int
      Lookahead horizon to use for N-step Q-value estimates
    update_period: int
      Inverse of rate at which embedding network gets updated
      i.e. if 1 then update after every timestep, if 16 then update every 16 timesteps, etc.
    kernel: (torch.autograd.Variable, torch.autograd.Variable) => (torch.autograd.Variable)
      Kernel function to use for DND lookups
    num_neighbors: int
      Number of neighbors to return in K-NN lookups in DND
    max_memory: int
      Maximum number of key-value pairs to store in each DND
        '''

        self.env = env
        self.embedding_network = embedding_network

    
        self.replay_memory = replay_memory
        self.epsilon = initial_epsilon
        self.final_epsilon = final_epsilon
        self.epsilon_decay = epsilon_decay
        self.batch_size = batch_size
        self.q_lr = q_lr
        self.gamma = gamma
        self.lookahead_horizon = lookahead_horizon
        self.update_period = update_period
    
        self.transition_queue = []
        self.optimizer = optim.RMSprop(
            self.embedding_network.parameters(), lr=sgd_lr)
        self.dnd_list = [DND(kernel, num_neighbors, max_memory, sgd_lr)
                     for _ in range(env.action_space_n)]
Example #25
class Athlete(object):
    def __init__(self,
                 environment_name="CartPole-v1",
                 replay_memory_size=10000,
                 action_threshold=0.7,
                 batch_size=64,
                 gamma=0.9):
        self.environment = gym.make(environment_name)
        state = self.environment.reset()
        self.state_shape = state.shape
        self.action_space = self.environment.action_space.n
        self.replay_memory = ReplayMemory(self.state_shape,
                                          capacity=replay_memory_size)
        self.model = self.build_network()
        self.target_model = self.build_network()
        self.action_threshold = action_threshold
        self.batch_size = batch_size
        self.gamma = gamma

    def build_network(self) -> tf.keras.Model:
        raise NotImplementedError()

    def choose_action(self, state: np.ndarray, threshold: float):
        if random.random() > threshold:
            # pick a random action
            action = random.randint(0, self.action_space - 1)
        else:
            # pick the action from the model
            results = self.model.predict(state.reshape([1] +
                                                       list(state.shape)))
            action = np.argmax(results, 1)[0]
        return action

    def simulate(self, action_threshold: float):
        state = self.environment.reset()
        while not self.replay_memory.is_full:
            action = self.choose_action(state, action_threshold)
            state_after, reward, done, _ = self.environment.step(action)
            self.replay_memory.add(state, action, reward, done, state_after)
            state = state_after
            if done:
                state = self.environment.reset()

        return True

    def train(self, epoch=100, model_prefix="saved_models/model"):
        model_prefix = model_prefix + ".epoch_{}.score_{}.h5"
        self.model.compile(optimizer=tf.keras.optimizers.Adam(lr=0.01),
                           loss=tf.losses.mean_squared_error)
        for i in range(epoch):
            print("Epoch {} running:...".format(i))
            self.target_model.set_weights(self.model.get_weights())
            self.replay_memory.reset()
            self.simulate(self.action_threshold)
            self.replay_memory.compute_estimated_q(self.target_model,
                                                   self.gamma)
            num_batches = self.replay_memory.length / self.batch_size
            for j in range(int(num_batches)):
                states, actions, rewards, dones, next_states, estimated_q = self.replay_memory.random_batch(
                    self.batch_size)
                self.model.fit(states, estimated_q, epochs=1, verbose=0)

            if i % 5 == 0:
                score = self.estimate_model(self.model, render=False)
                model_path = model_prefix.format(i, score)
                print("Saving model: {} ...".format(model_path))
                self.model.save(model_prefix.format(i, score))

    def estimate_model(self, model=None, model_path="", render=True):
        if not model:
            model: tf.keras.Model = tf.keras.models.load_model(model_path)
        state = self.environment.reset()
        reward_count = 0
        while True:
            action = model.predict(
                state.reshape([
                    1,
                ] + list(self.state_shape)))
            print(state)
            print(action)
            action = np.argmax(action, 1)[0]
            print(action)
            if render:
                time.sleep(0.05)
                self.environment.render()
            state_after, reward, done, _ = self.environment.step(action)
            reward_count += reward
            if done:
                break
            state = state_after

        print("Steps taken: ", reward_count)
        return reward_count

    def score_model(self, model=None, model_path="", num_iteration=10):
        if not model:
            model: tf.keras.Model = tf.keras.models.load_model(model_path)

        scores = []
        for i in range(num_iteration):
            score = self.estimate_model(model)
            scores.append(score)
        avg_score = sum(scores) / num_iteration
        return avg_score
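
Athlete.build_network above is the hook a concrete agent must override so that __init__ can build self.model and self.target_model. The sketch below shows one such override sized from the attributes the base class already sets; the subclass name and layer widths are illustrative, not from the project.

import tensorflow as tf

class CartPoleAthlete(Athlete):
    def build_network(self) -> tf.keras.Model:
        # Small fully connected Q-network: state in, one Q-value per action out.
        return tf.keras.Sequential([
            tf.keras.layers.Dense(64, activation="relu", input_shape=self.state_shape),
            tf.keras.layers.Dense(64, activation="relu"),
            tf.keras.layers.Dense(self.action_space, activation="linear"),
        ])
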
Example #26
class DQNAgent:
    def __init__(self, config):
        self.config = config

        self.logger = logging.getLogger("DQNAgent")

        # define models (policy and target)
        self.policy_model = DQN(self.config)
        self.target_model = DQN(self.config)

        # define memory
        self.memory = ReplayMemory(self.config)

        # define loss
        self.loss = HuberLoss()

        # define optimizer
        self.optim = torch.optim.RMSprop(self.policy_model.parameters())

        # define environment
        self.env = gym.make('CartPole-v0').unwrapped
        self.cartpole = CartPoleEnv(self.config.screen_width)

        # initialize counter
        self.current_episode = 0
        self.current_iteration = 0
        self.episode_durations = []

        self.batch_size = self.config.batch_size

        # set cuda flag
        self.is_cuda = torch.cuda.is_available()
        if self.is_cuda and not self.config.cuda:
            self.logger.info(
                "WARNING: You have a CUDA device, so you should probably enable CUDA"
            )

        self.cuda = self.is_cuda and self.config.cuda

        if self.cuda:
            self.logger.info("Program will run on *****GPU-CUDA***** ")
            print_cuda_statistics()
            self.device = torch.device("cuda")
            torch.cuda.set_device(self.config.gpu_device)
        else:
            self.logger.info("Program will run on *****CPU***** ")
            self.device = torch.device("cpu")

        self.policy_model = self.policy_model.to(self.device)
        self.target_model = self.target_model.to(self.device)
        self.loss = self.loss.to(self.device)

        # Initialize Target model with policy model state dict
        self.target_model.load_state_dict(self.policy_model.state_dict())
        self.target_model.eval()

        # Summary Writer
        self.summary_writer = SummaryWriter(log_dir=self.config.summary_dir,
                                            comment='DQN')

    def load_checkpoint(self, file_name):
        filename = self.config.checkpoint_dir + file_name
        try:
            self.logger.info("Loading checkpoint '{}'".format(filename))
            checkpoint = torch.load(filename)

            self.current_episode = checkpoint['episode']
            self.current_iteration = checkpoint['iteration']
            self.policy_model.load_state_dict(checkpoint['state_dict'])
            self.optim.load_state_dict(checkpoint['optimizer'])

            self.logger.info(
                "Checkpoint loaded successfully from '{}' at (epoch {}) at (iteration {})\n"
                .format(self.config.checkpoint_dir, checkpoint['episode'],
                        checkpoint['iteration']))
        except OSError as e:
            self.logger.info(
                "No checkpoint exists from '{}'. Skipping...".format(
                    self.config.checkpoint_dir))
            self.logger.info("**First time to train**")

    def save_checkpoint(self, file_name="checkpoint.pth.tar", is_best=0):
        state = {
            'episode': self.current_episode,
            'iteration': self.current_iteration,
            'state_dict': self.policy_model.state_dict(),
            'optimizer': self.optim.state_dict(),
        }
        # Save the state
        torch.save(state, self.config.checkpoint_dir + file_name)
        # If it is the best copy it to another file 'model_best.pth.tar'
        if is_best:
            shutil.copyfile(self.config.checkpoint_dir + file_name,
                            self.config.checkpoint_dir + 'model_best.pth.tar')

    def run(self):
        """
        This function runs the operator (i.e. the training loop).
        :return:
        """
        try:
            self.train()

        except KeyboardInterrupt:
            self.logger.info("You have entered CTRL+C.. Wait to finalize")

    def select_action(self, state):
        """
        Epsilon-greedy action selection: it either uses the policy model to choose an action or samples one uniformly at random.
        :param state: current state of the environment
        :return:
        """
        if self.cuda:
            state = state.cuda()
        sample = random.random()
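        # Exponentially decaying epsilon:
        # eps(t) = eps_end + (eps_start - eps_end) * exp(-t / eps_decay)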
        eps_threshold = self.config.eps_end + (
            self.config.eps_start - self.config.eps_end) * math.exp(
                -1. * self.current_iteration / self.config.eps_decay)
        self.current_iteration += 1
        if sample > eps_threshold:
            with torch.no_grad():
                return self.policy_model(state).max(1)[1].view(1,
                                                               1)  # size (1,1)
        else:
            return torch.tensor([[random.randrange(2)]],
                                device=self.device,
                                dtype=torch.long)

    def optimize_policy_model(self):
        """
        performs a single step of optimization for the policy model
        :return:
        """
        if self.memory.length() < self.batch_size:
            return
        # sample a batch
        transitions = self.memory.sample_batch(self.batch_size)

        one_batch = Transition(*zip(*transitions))

        # create a mask of non-final states
        non_final_mask = torch.tensor(tuple(
            map(lambda s: s is not None, one_batch.next_state)),
                                      device=self.device,
                                      dtype=torch.bool)  # [128]
        non_final_next_states = torch.cat([
            s for s in one_batch.next_state if s is not None
        ])  # [< 128, 3, 40, 80]

        # concatenate all batch elements into one
        state_batch = torch.cat(one_batch.state)  # [128, 3, 40, 80]
        action_batch = torch.cat(one_batch.action)  # [128, 1]
        reward_batch = torch.cat(one_batch.reward)  # [128]

        state_batch = state_batch.to(self.device)
        non_final_next_states = non_final_next_states.to(self.device)

        curr_state_values = self.policy_model(state_batch)  # [128, 2]
        curr_state_action_values = curr_state_values.gather(
            1, action_batch)  # [128, 1]

        # Get V(s_{t+1}) for all next states. By definition we set V(s)=0 if s is a terminal state.
        next_state_values = torch.zeros(self.batch_size,
                                        device=self.device)  # [128]
        next_state_values[non_final_mask] = self.target_model(
            non_final_next_states).max(1)[0].detach()  # [< 128]

        # Get the expected Q values
        expected_state_action_values = (
            next_state_values * self.config.gamma) + reward_batch  # [128]
        # compute loss: temporal difference error
        loss = self.loss(curr_state_action_values,
                         expected_state_action_values.unsqueeze(1))

        # optimizer step
        self.optim.zero_grad()
        loss.backward()
        for param in self.policy_model.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optim.step()

        return loss

    def train(self):
        """
        Training loop based on the number of episodes
        :return:
        """
        for episode in tqdm(
                range(self.current_episode, self.config.num_episodes)):
            self.current_episode = episode
            # reset environment
            self.env.reset()
            self.train_one_epoch()
            # The target network has its weights kept frozen most of the time
            if self.current_episode % self.config.target_update == 0:
                self.target_model.load_state_dict(
                    self.policy_model.state_dict())

        self.env.render()
        self.env.close()

    def train_one_epoch(self):
        """
        One episode of training: at each step it samples an action, observes the next screen, and optimizes the model once.
        :return:
        """
        episode_duration = 0
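        # A state is the difference between two consecutive rendered screens,
        # so it carries motion information.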
        prev_frame = self.cartpole.get_screen(self.env)
        curr_frame = self.cartpole.get_screen(self.env)
        # get state
        curr_state = curr_frame - prev_frame

        while True:
            episode_duration += 1
            # select action
            action = self.select_action(curr_state)
            # perform action and get reward
            _, reward, done, _ = self.env.step(action.item())

            reward = torch.Tensor([reward]).to(self.device)

            prev_frame = curr_frame
            curr_frame = self.cartpole.get_screen(self.env)
            # assign next state
            if done:
                next_state = None
            else:
                next_state = curr_frame - prev_frame

            # add this transition into memory
            self.memory.push_transition(curr_state, action, next_state, reward)

            curr_state = next_state

            # Policy model optimization step
            curr_loss = self.optimize_policy_model()
            if curr_loss is not None:
                if self.cuda:
                    curr_loss = curr_loss.cpu()
                self.summary_writer.add_scalar("Temporal Difference Loss",
                                               curr_loss.detach().numpy(),
                                               self.current_iteration)
            # check if done
            if done:
                break

        self.summary_writer.add_scalar("Training Episode Duration",
                                       episode_duration, self.current_episode)

    def validate(self):
        pass

    def finalize(self):
        """
        Finalize all operations of the two main components of the process: the operator and the data loader.
        :return:
        """
        self.logger.info(
            "Please wait while finalizing the operation.. Thank you")
        self.save_checkpoint()
        self.summary_writer.export_scalars_to_json("{}all_scalars.json".format(
            self.config.summary_dir))
        self.summary_writer.close()
Example #27
0
class MotionAthlete(Athlete):
    def __init__(self,
                 environment_name="Acrobot-v1",
                 replay_memory_size=10000,
                 action_threshold=0.7,
                 batch_size=64,
                 gamma=0.9):
        super(MotionAthlete,
              self).__init__(environment_name, replay_memory_size,
                             action_threshold, batch_size, gamma)
        self.environment.close()
        del self.environment
        self.environment = EnvironmentWrapper(environment_name)
        frame = self.environment.reset()
        frame_shape = frame.shape
        self.motion_tracer = MotionTracer(frame_shape=frame_shape)
        self.state_shape = self.motion_tracer.state_shape
        self.replay_memory = ReplayMemory(self.state_shape,
                                          capacity=replay_memory_size)
        del self.model
        del self.target_model
        self.model = self.build_network()
        self.target_model = self.build_network()

    def simulate(self, action_threshold: float):
        print("Simulating...")
        frame = self.environment.reset()
        self.motion_tracer.reset()
        self.motion_tracer.add_frame(frame)
        while not self.replay_memory.is_full:
            state = self.motion_tracer.get_state()
            action = self.choose_action(state, action_threshold)
            frame_after, reward, done, _ = self.environment.step(action)
            self.motion_tracer.add_frame(frame_after)
            state_next = self.motion_tracer.get_state()
            self.replay_memory.add(state, action, reward, done, state_next)
            if done:
                frame = self.environment.reset()
                self.motion_tracer.reset()
                self.motion_tracer.add_frame(frame)
        print("Simulation finished")

        return True

    def estimate_model(self, model=None, model_path="", render=True):
        if not model:
            model: tf.keras.Model = tf.keras.models.load_model(model_path)
        frame = self.environment.reset()
        self.motion_tracer.reset()
        self.motion_tracer.add_frame(frame)
        state = self.motion_tracer.get_state()
        reward_count = 0
        step_count = 0
        while True:
            step_count += 1
            action = model.predict(
                state.reshape([
                    1,
                ] + list(self.state_shape)))
            print(frame)
            print(action)
            action = np.argmax(action, 1)[0]
            print(action)
            if render:
                time.sleep(0.05)
                self.environment.render()
            frame_after, reward, done, _ = self.environment.step(action)
            reward_count += reward
            if done:
                break
            self.motion_tracer.add_frame(frame_after)
            state = self.motion_tracer.get_state()

        print("Total reward: ", reward_count)
        print("Total step: ", step_count)
        return reward_count
class PseudoCountQLearner(ValueBasedLearner):
    def __init__(self, args):
        super(PseudoCountQLearner, self).__init__(args)

        self.cts_eta = .9
        self.batch_size = 32
        self.replay_memory = ReplayMemory(args.replay_size)

        # more Cython tuning could be useful here
        self.density_model = CTSDensityModel(height=args.cts_rescale_dim,
                                             width=args.cts_rescale_dim,
                                             num_bins=args.cts_bins,
                                             beta=0.05)

    def generate_final_epsilon(self):
        return 0.1

    def _get_summary_vars(self):
        q_vars = super(PseudoCountQLearner, self)._get_summary_vars()

        bonus_q25 = tf.Variable(0., name='novelty_bonus_q25')
        s1 = tf.summary.scalar('Novelty_Bonus_q25_{}'.format(self.actor_id),
                               bonus_q25)
        bonus_q50 = tf.Variable(0., name='novelty_bonus_q50')
        s2 = tf.summary.scalar('Novelty_Bonus_q50_{}'.format(self.actor_id),
                               bonus_q50)
        bonus_q75 = tf.Variable(0., name='novelty_bonus_q75')
        s3 = tf.summary.scalar('Novelty_Bonus_q75_{}'.format(self.actor_id),
                               bonus_q75)

        return q_vars + [bonus_q25, bonus_q50, bonus_q75]

    def prepare_state(self, state, total_episode_reward, steps_at_last_reward,
                      ep_t, episode_ave_max_q, episode_over, bonuses):
        # prevent the agent from getting stuck
        reset_game = False
        if (self.local_step - steps_at_last_reward > 5000
                or (self.emulator.get_lives() == 0
                    and self.emulator.game not in ONE_LIFE_GAMES)):

            steps_at_last_reward = self.local_step
            episode_over = True
            reset_game = True

        # Start a new game on reaching terminal state
        if episode_over:
            T = self.global_step.value()
            t = self.local_step
            e_prog = float(t) / self.epsilon_annealing_steps
            episode_ave_max_q = episode_ave_max_q / float(ep_t)
            s1 = "Q_MAX {0:.4f}".format(episode_ave_max_q)
            s2 = "EPS {0:.4f}".format(self.epsilon)

            self.scores.insert(0, total_episode_reward)
            if len(self.scores) > 100:
                self.scores.pop()

            logger.info('T{0} / STEP {1} / REWARD {2} / {3} / {4}'.format(
                self.actor_id, T, total_episode_reward, s1, s2))
            logger.info(
                'ID: {0} -- RUNNING AVG: {1:.0f} ± {2:.0f} -- BEST: {3:.0f}'.
                format(
                    self.actor_id,
                    np.array(self.scores).mean(),
                    2 * np.array(self.scores).std(),
                    max(self.scores),
                ))

            if self.is_master() and self.is_train:
                stats = [
                    total_episode_reward,
                    episode_ave_max_q,
                    self.epsilon,
                    np.percentile(bonuses, 25),
                    np.percentile(bonuses, 50),
                    np.percentile(bonuses, 75),
                ]
                feed_dict = {
                    self.summary_ph[i]: stats[i]
                    for i in range(len(stats))
                }
                res = self.session.run(self.update_ops + [self.summary_op],
                                       feed_dict=feed_dict)
                self.summary_writer.add_summary(res[-1],
                                                self.global_step.value())

            if reset_game or self.emulator.game in ONE_LIFE_GAMES:
                state = self.emulator.get_initial_state()

            ep_t = 0
            total_episode_reward = 0
            episode_ave_max_q = 0
            episode_over = False

        return state, total_episode_reward, steps_at_last_reward, ep_t, episode_ave_max_q, episode_over

    def batch_update(self):
        if len(self.replay_memory) < self.batch_size:
            return

        s_i, a_i, r_i, s_f, is_terminal = self.replay_memory.sample_batch(
            self.batch_size)

        q_target_values = self.session.run(
            self.target_network.output_layer,
            feed_dict={self.target_network.input_ph: s_f})
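        # One-step target scaled by cts_eta; r_i already holds the eta-mixed
        # Monte-Carlo return stored at the end of each episode in _run().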
        y_target = r_i + self.cts_eta * self.gamma * q_target_values.max(
            axis=1) * (1 - is_terminal.astype(np.int32))

        feed_dict = {
            self.local_network.input_ph: s_i,
            self.local_network.target_ph: y_target,
            self.local_network.selected_action_ph: a_i
        }
        grads = self.session.run(self.local_network.get_gradients,
                                 feed_dict=feed_dict)

        self.apply_gradients_to_shared_memory_vars(grads)

    def _run(self):
        """ Main actor learner loop for n-step Q learning. """
        if not self.is_train:
            return self.test()

        logger.debug("Actor {} resuming at Step {}, {}".format(
            self.actor_id, self.global_step.value(), time.ctime()))

        s = self.emulator.get_initial_state()

        s_batch = []
        a_batch = []
        y_batch = []
        bonuses = deque(maxlen=100)

        exec_update_target = False
        total_episode_reward = 0
        episode_ave_max_q = 0
        episode_over = False
        qmax_down = 0
        qmax_up = 0
        prev_qmax = -10 ** 6
        low_qmax = 0
        ep_t = 0

        t0 = time.time()
        while (self.global_step.value() < self.max_global_steps):
            # Sync local learning net with shared mem
            self.sync_net_with_shared_memory(self.local_network,
                                             self.learning_vars)
            self.save_vars()

            rewards = []
            states = []
            actions = []
            local_step_start = self.local_step

            while not episode_over:
                # Choose next action and execute it
                a, readout_t = self.choose_next_action(s)

                new_s, reward, episode_over = self.emulator.next(a)
                total_episode_reward += reward

                current_frame = new_s[..., -1]
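                # Pseudo-count exploration bonus from the CTS density model,
                # computed on the most recent frame of the stacked observation.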
                bonus = self.density_model.update(current_frame)
                bonuses.append(bonus)

                if self.is_master() and (self.local_step % 200 == 0):
                    bonus_array = np.array(bonuses)
                    logger.debug(
                        'Mean Bonus={:.4f} / Max Bonus={:.4f} / STEPS/s={}'.
                        format(bonus_array.mean(), bonus_array.max(),
                               100. / (time.time() - t0)))
                    t0 = time.time()

                # Rescale or clip immediate reward
                reward = self.rescale_reward(
                    self.rescale_reward(reward) + bonus)
                ep_t += 1

                rewards.append(reward)
                states.append(s)
                actions.append(a)

                s = new_s
                self.local_step += 1
                episode_ave_max_q += np.max(readout_t)

                global_step, update_target = self.global_step.increment(
                    self.q_target_update_steps)

                if update_target:
                    update_target = False
                    exec_update_target = True

                if self.local_step % 4 == 0:
                    self.batch_update()

                self.local_network.global_step = global_step

            else:
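                # Episode finished without a break, so the while-else runs:
                # compute discounted Monte-Carlo returns and blend them with the
                # bonus-augmented rewards, R_mix = cts_eta * r + (1 - cts_eta) * G.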
                mc_returns = list()
                running_total = 0.0
                for r in reversed(rewards):
                    running_total = r + self.gamma * running_total
                    mc_returns.insert(0, running_total)

                mixed_returns = self.cts_eta * np.array(rewards) + (
                    1 - self.cts_eta) * np.array(mc_returns)

                states.append(new_s)
                episode_length = len(rewards)
                for i in range(episode_length):
                    self.replay_memory.append(
                        (states[i], actions[i], mixed_returns[i],
                         states[i + 1], i + 1 == episode_length))

            if exec_update_target:
                self.update_target()
                exec_update_target = False
                # Sync local tensorflow target network params with shared target network params
                if self.target_update_flags.updated[self.actor_id] == 1:
                    self.sync_net_with_shared_memory(self.target_network,
                                                     self.target_vars)
                    self.target_update_flags.updated[self.actor_id] = 0

            s, total_episode_reward, _, ep_t, episode_ave_max_q, episode_over = \
                self.prepare_state(s, total_episode_reward, self.local_step, ep_t, episode_ave_max_q, episode_over, bonuses)
Example #29
0
    def __init__(self,
                 env,
                 embedding_network,
                 replay_memory=ReplayMemory(500000),
                 epsilon_schedule=epsilon_schedule,
                 batch_size=8,
                 sgd_learning_rate=1e-2,
                 q_learning_rate=0.5,
                 gamma=0.99,
                 lookahead_horizon=100,
                 update_period=4,
                 kernel=inverse_distance,
                 num_neighbors=50,
                 max_memory=125000,
                 warmup_period=1000,
                 test_period=10):
        """
    Instantiate an NEC Agent

    Parameters
    ----------
    env: gym.Env
      gym environment to train on
    embedding_network: torch.nn.Module
      Model to extract the embedding from a state
    replay_memory: ReplayMemory
      Replay memory to sample from for embedding network updates
    epsilon_schedule: (int) => (float)
      Function that determines the epsilon for epsilon-greedy exploration from the timestep t
    batch_size: int
      Batch size to sample from the replay memory
    sgd_learning_rate: float
      Learning rate to use for RMSProp updates to the embedding network
    q_learning_rate: float
      Learning rate to use for Q-updates on DND updates
    gamma: float
      Discount factor
    lookahead_horizon: int
      Lookahead horizon to use for N-step Q-value estimates
    update_period: int
      Inverse of rate at which embedding network gets updated
      i.e. if 1 then update after every timestep, if 16 then update every 16 timesteps, etc.
    kernel: (torch.autograd.Variable, torch.autograd.Variable) => (torch.autograd.Variable)
      Kernel function to use for DND lookups
    num_neighbors: int
      Number of neighbors to return in K-NN lookups in DND
    max_memory: int
      Maximum number of key-value pairs to store in DND
    warmup_period: int
      Number of timesteps to act randomly before learning
    test_period: int
      Number of episodes between each test iteration
    """

        self.env = env
        self.embedding_network = embedding_network
        if use_cuda:
            self.embedding_network.cuda()

        self.replay_memory = replay_memory
        self.epsilon_schedule = epsilon_schedule
        self.batch_size = batch_size
        self.q_learning_rate = q_learning_rate
        self.gamma = gamma
        self.lookahead_horizon = lookahead_horizon
        self.update_period = update_period
        self.warmup_period = warmup_period
        self.test_period = test_period

        self.transition_queue = []
        self.optimizer = optim.RMSprop(self.embedding_network.parameters(),
                                       lr=sgd_learning_rate)

        state_dict = self.embedding_network.state_dict()
        self.dnd_list = [
            DND(kernel, num_neighbors, max_memory,
                state_dict[next(reversed(state_dict))].size()[0])
            for _ in range(env.action_space.n)
        ]
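# Aside (not part of the example above): a minimal, standalone sketch of how the
# DND key size is inferred in the constructor. The last entry of the embedding
# network's state_dict is normally the final layer's bias, and its length gives
# the embedding (key) dimensionality. The small torch MLP below is a hypothetical
# stand-in for embedding_network.
import torch.nn as nn

embedding_network = nn.Sequential(
    nn.Linear(4, 64),
    nn.ReLU(),
    nn.Linear(64, 32),  # 32-dimensional embeddings -> 32-dimensional DND keys
)

state_dict = embedding_network.state_dict()
key_size = state_dict[next(reversed(state_dict))].size()[0]
print(key_size)  # prints 32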
Example #30
0
class DQNDoubleQAgent(BaseAgent):
    def __init__(self):
        super(DQNDoubleQAgent, self).__init__()
        self.training = False
        self.max_frames = 2000000
        self._epsilon = Epsilon(start=1.0, end=0.1, update_increment=0.0001)
        self.gamma = 0.99
        self.train_q_per_step = 4
        self.train_q_batch_size = 256
        self.steps_before_training = 10000
        self.target_q_update_frequency = 50000

        self._Q_weights_path = "./data/SC2DoubleQAgent"
        self._Q = DQNCNN()
        if os.path.isfile(self._Q_weights_path):
            self._Q.load_state_dict(torch.load(self._Q_weights_path))
            print("Loading weights:", self._Q_weights_path)
        self._Qt = copy.deepcopy(self._Q)
        self._Q.cuda()
        self._Qt.cuda()
        self._optimizer = optim.Adam(self._Q.parameters(), lr=1e-8)
        self._criterion = nn.MSELoss()
        self._memory = ReplayMemory(100000)

        self._loss = deque(maxlen=1000)
        self._max_q = deque(maxlen=1000)
        self._action = None
        self._screen = None
        self._fig = plt.figure()
        self._plot = [plt.subplot(2, 2, i + 1) for i in range(4)]

        self._screen_size = 28

    def get_env_action(self, action, obs):
        action = np.unravel_index(action,
                                  [1, self._screen_size, self._screen_size])
        target = [action[2], action[1]]
        command = _MOVE_SCREEN  #action[0]   # removing unit selection out of the equation
        # if command == 0:
        #   command = _SELECT_POINT
        # else:
        #   command = _MOVE_SCREEN

        if command in obs.observation["available_actions"]:
            return actions.FunctionCall(command, [[0], target])
        else:
            return actions.FunctionCall(_NO_OP, [])

    def get_action(self, s):
        """
        :param s: obs.observation["screen"]
        :returns: argmax action (flat index over the screen)
        """
        # greedy
        if np.random.rand() > self._epsilon.value():
            # print("greedy action")
            s = Variable(torch.from_numpy(s).cuda())
            s = s.unsqueeze(0).float()
            self._action = self._Q(s).squeeze().cpu().data.numpy()
            return self._action.argmax()
        # explore
        else:
            # print("random choice")
            # action = np.random.choice([0, 1])
            action = 0
            target = np.random.randint(0, self._screen_size, size=2)
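            # Encode (command, y, x) as one flat index; get_env_action undoes
            # this with np.unravel_index.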
            return action * self._screen_size * self._screen_size + target[
                0] * self._screen_size + target[1]

    def select_friendly_action(self, obs):
        player_relative = obs.observation["screen"][_PLAYER_RELATIVE]
        friendly_y, friendly_x = (
            player_relative == _PLAYER_FRIENDLY).nonzero()
        target = [int(friendly_x.mean()), int(friendly_y.mean())]
        return actions.FunctionCall(_SELECT_POINT, [[0], target])

    def train(self, env, training=True):
        self._epsilon.isTraining = training
        self.run_loop(env, self.max_frames)
        if self._epsilon.isTraining:
            torch.save(self._Q.state_dict(), self._Q_weights_path)

    def run_loop(self, env, max_frames=0):
        """A run loop to have agents and an environment interact."""
        total_frames = 0
        start_time = time.time()

        action_spec = env.action_spec()
        observation_spec = env.observation_spec()

        self.setup(observation_spec, action_spec)

        try:
            while True:
                obs = env.reset()[0]
                # remove unit selection from the equation by selecting the friendly on every new game.
                select_friendly = self.select_friendly_action(obs)
                obs = env.step([select_friendly])[0]
                # distance = self.get_reward(obs.observation["screen"])

                self.reset()

                while True:
                    total_frames += 1

                    self._screen = obs.observation["screen"][5]
                    s = np.expand_dims(obs.observation["screen"][5], 0)
                    # plt.imshow(s[5])
                    # plt.pause(0.00001)
                    if max_frames and total_frames >= max_frames:
                        print("max frames reached")
                        return
                    if obs.last():
                        print("total frames:", total_frames, "Epsilon:",
                              self._epsilon.value())
                        self._epsilon.increment()
                        break

                    action = self.get_action(s)
                    env_actions = self.get_env_action(action, obs)
                    obs = env.step([env_actions])[0]

                    r = obs.reward
                    s1 = np.expand_dims(obs.observation["screen"][5], 0)
                    done = r > 0
                    if self._epsilon.isTraining:
                        transition = Transition(s, action, s1, r, done)
                        self._memory.push(transition)

                    if total_frames % self.train_q_per_step == 0 and total_frames > self.steps_before_training and self._epsilon.isTraining:
                        self.train_q()
                        # pass

                    if total_frames % self.target_q_update_frequency == 0 and total_frames > self.steps_before_training and self._epsilon.isTraining:
                        self._Qt = copy.deepcopy(self._Q)
                        self.show_chart()

                    if total_frames % 1000 == 0 and total_frames > self.steps_before_training and self._epsilon.isTraining:
                        self.show_chart()

                    if not self._epsilon.isTraining and total_frames % 3 == 0:
                        self.show_chart()

        except KeyboardInterrupt:
            pass
        finally:
            print("finished")
            elapsed_time = time.time() - start_time
            print("Took %.3f seconds for %s steps: %.3f fps" %
                  (elapsed_time, total_frames, total_frames / elapsed_time))

    def get_reward(self, s):
        player_relative = s[_PLAYER_RELATIVE]
        neutral_y, neutral_x = (player_relative == _PLAYER_NEUTRAL).nonzero()
        neutral_target = [int(neutral_x.mean()), int(neutral_y.mean())]
        friendly_y, friendly_x = (
            player_relative == _PLAYER_FRIENDLY).nonzero()
        if len(friendly_y) == 0 or len(friendly_x) == 0:  # no friendly units on screen
            return 0
        friendly_target = [int(friendly_x.mean()), int(friendly_y.mean())]

        distance_2 = (neutral_target[0] - friendly_target[0])**2 + (
            neutral_target[1] - friendly_target[1])**2
        distance = math.sqrt(distance_2)
        return -distance

    def show_chart(self):
        self._plot[0].clear()
        self._plot[0].set_xlabel('Last 1000 Training Cycles')
        self._plot[0].set_ylabel('Loss')
        self._plot[0].plot(list(self._loss))

        self._plot[1].clear()
        self._plot[1].set_xlabel('Last 1000 Training Cycles')
        self._plot[1].set_ylabel('Max Q')
        self._plot[1].plot(list(self._max_q))

        self._plot[2].clear()
        self._plot[2].set_title("screen")
        self._plot[2].imshow(self._screen)

        self._plot[3].clear()
        self._plot[3].set_title("action")
        self._plot[3].imshow(self._action)
        plt.pause(0.00001)

    def train_q(self):
        if self.train_q_batch_size >= len(self._memory):
            return

        s, a, s_1, r, done = self._memory.sample(self.train_q_batch_size)
        s = Variable(torch.from_numpy(s).cuda()).float()
        a = Variable(torch.from_numpy(a).cuda()).long()
        s_1 = Variable(torch.from_numpy(s_1).cuda(), volatile=True).float()
        r = Variable(torch.from_numpy(r).cuda()).float()
        done = Variable(torch.from_numpy(1 - done).cuda()).float()

        # Q_sa = r + gamma * max(Q_s'a')
        Q = self._Q(s)
        Q = Q.view(self.train_q_batch_size, -1)
        Q = Q.gather(1, a)

        Qt = self._Qt(s_1).view(self.train_q_batch_size, -1)

        # double Q
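        # Double DQN: the online network picks the argmax action for s_1,
        # the (frozen) target network Qt evaluates it.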
        best_action = self._Q(s_1).view(self.train_q_batch_size,
                                        -1).max(dim=1, keepdim=True)[1]
        y = r + done * self.gamma * Qt.gather(1, best_action)
        # Q
        # y = r + done * self.gamma * Qt.max(dim=1)[0].unsqueeze(1)

        y.volatile = False

        loss = self._criterion(Q, y)
        self._loss.append(loss.sum().cpu().data.numpy())
        self._max_q.append(Q.max().cpu().data.numpy()[0])
        self._optimizer.zero_grad()  # zero the gradient buffers
        loss.backward()
        self._optimizer.step()