Example #1
    def agent_init(self, **options):
        self.use_gpu = options['use_gpu']
        self.depth_image_dim = options['depth_image_dim']
        self.q_net_input_dim = self.image_feature_dim * self.image_feature_count + self.depth_image_dim

        if os.path.exists(self.cnn_feature_extractor):
            print("loading... " + self.cnn_feature_extractor)
            with open(self.cnn_feature_extractor, 'rb') as f:
                self.feature_extractor = pickle.load(f)
            print("done")
        else:
            print('there is no chainer alexnet model file ',
                  self.cnn_feature_extractor)
            print('making chainer model from ', self.model)
            print('this process takes tens of minutes.')
            self.feature_extractor = CnnFeatureExtractor(
                self.use_gpu, self.model, self.model_type,
                self.image_feature_dim)
            pickle.dump(self.feature_extractor,
                        open(self.cnn_feature_extractor, 'wb'))
            print("pickle.dump finished")

        self.time = 0
        self.epsilon = 1.0  # Initial exploration rate
        self.q_net = QNet(self.use_gpu, self.actions, self.q_net_input_dim)
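The load-or-build caching of the pickled feature extractor above can be factored into a small helper. This is only an illustrative sketch; the load_or_build name and the context managers are not part of the original example:

import os
import pickle

def load_or_build(path, build):
    """Return the object pickled at `path`, building and caching it on a miss."""
    if os.path.exists(path):
        with open(path, 'rb') as f:
            return pickle.load(f)
    obj = build()
    with open(path, 'wb') as f:
        pickle.dump(obj, f)
    return obj

# e.g. feature_extractor = load_or_build('alexnet_feature_extractor.pickle',
#                                        lambda: CnnFeatureExtractor(...))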
Example #2
    def agent_init(self, **options):
        self.use_gpu = options['use_gpu']
        self.agent_count = options['agent_count']
        self.image_count = options['rgb_image_count']
        self.depth_image_dim = options['depth_image_dim']
        self.ir_idm = options['ir_dim']
        self.ground_dim = options['ground_dim']
        self.compass_dim = options['compass_dim']
        self.target_dim = options['target_dim']
        self.model = options['model']

        self.cnn_input_dim = self.image_dim * self.image_count
        self.feature_dim = self.image_feature_dim * self.image_feature_count
        self.other_input_dim = self.depth_image_dim + self.ir_idm + self.ground_dim + self.compass_dim + self.target_dim

        self.time = 1
        self.epsilon = 1.0
        self.avgloss_log_file = self.avgloss_log + "avg_loss.log"

        if self.model != 'None':
            self.policy_frozen = False
            self.epsilon = 0.5

        self.q_net = QNet(self.use_gpu, self.actions, self.cnn_input_dim,
                          self.feature_dim, self.agent_count,
                          self.other_input_dim, self.model)
Example #3
    def __init__(self, g_list, test_g_list, env):
        self.g_list = g_list
        if test_g_list is None:
            self.test_g_list = g_list
        else:
            self.test_g_list = test_g_list

        self.env = env
        self.net = QNet()
        self.old_net = QNet()
        self.optimizer = optim.Adam(self.net.parameters(),
                                    lr=cmd_args.learning_rate)

        if cmd_args.ctx == 'gpu':
            self.net = self.net.cuda()
            self.old_net = self.old_net.cuda()

        self.eps_start = 1.0
        self.eps_end = 1.0
        self.eps_step = 10000
        self.burn_in = 100  # number of iterations for the first set of simulations (initial burn-in of the memory)?
        self.step = 0

        self.best_eval = None
        self.pos = 0
        self.sample_idxes = list(range(len(g_list)))
        random.shuffle(self.sample_idxes)
        self.take_snapshot()
Example #4
    def agent_init(self, **options):
        self.use_gpu = options['use_gpu']
        #self.depth_image_dim = options['depth_image_dim']
        self.q_net_input_dim = self.image_feature_dim * self.image_feature_count #+ self.depth_image_dim

        if os.path.exists(self.cnn_feature_extractor):
            print("loading... " + self.cnn_feature_extractor),
            self.feature_extractor = pickle.load(open(self.cnn_feature_extractor))
            print("done")
        else:
            self.feature_extractor = CnnFeatureExtractor(self.use_gpu, self.model, self.model_type, self.image_feature_dim)
            pickle.dump(self.feature_extractor, open(self.cnn_feature_extractor, 'wb'))
            print("pickle.dump finished")

        self.time = 0
        self.epsilon = 1.0  # Initial exploration rate
        self.q_net = QNet(self.use_gpu, self.actions, self.q_net_input_dim)
Example #5
    def agent_init(self, **options):
        try:
            self.image_count = options['image_count']
            self.depth_image_dim = options['depth_image_dim']
            self.use_gpu = options['use_gpu']
            self.test = options['test']
            self.folder = options["folder"]  #save_modelで使う->self.
            model_num = options['model_num']

            self.q_net_input_dim = self.image_feature_dim * self.image_count + self.depth_image_dim * self.image_count

            if os.path.exists(self.cnn_feature_extractor):
                print("loading... " + self.cnn_feature_extractor),
                self.feature_extractor = pickle.load(
                    open(self.cnn_feature_extractor))
                print("done")

            else:
                self.feature_extractor = CnnFeatureExtractor(
                    self.use_gpu, self.model, self.model_type,
                    self.image_feature_dim)
                pickle.dump(self.feature_extractor,
                            open(self.cnn_feature_extractor, 'wb'))
                print("pickle.dump finished")

            self.q_net = QNet(self.use_gpu, self.actions, self.q_net_input_dim)

            self.time = model_num + 1  # to prevent save and load from happening at the same time
            if (self.test):
                self.epsilon = 0.0
            else:
                non_exploration = max(
                    self.time - self.q_net.initial_exploration, 0)
                self.epsilon = max(1.0 - non_exploration * self.epsilon_delta,
                                   self.min_eps)
            print "epsilon = ", self.epsilon

            if (self.test or model_num > 0):
                self.q_net.load_model(self.folder, model_num)
        except:
            import traceback
            import sys
            traceback.print_exc()
            sys.exit()
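For reference, the epsilon schedule shared by these agents (flat during initial exploration, then decayed linearly by epsilon_delta down to min_eps, as in the resume logic of Example #5 above) can be sketched on its own; the default constants below are only illustrative:

def decayed_epsilon(step, initial_exploration=10 ** 4,
                    epsilon_delta=1.0 / 10 ** 4.4, min_eps=0.1):
    """Epsilon after `step` time steps: 1.0 during initial exploration,
    then linear decay down to min_eps."""
    non_exploration = max(step - initial_exploration, 0)
    return max(1.0 - non_exploration * epsilon_delta, min_eps)

# e.g. decayed_epsilon(0) == 1.0, and epsilon bottoms out at min_eps after
# roughly initial_exploration + 0.9 / epsilon_delta steps.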
Example #6
    def agent_init(self, **options):
        self.use_gpu = options['use_gpu']
        self.pad_state_dim = options['pad_states_dim']
        self.q_net_input_dim = self.image_feature_dim + self.pad_state_dim

        if os.path.exists(self.cnn_feature_extractor):
            print("loading... " + self.cnn_feature_extractor),
            self.feature_extractor = pickle.load(
                open(self.cnn_feature_extractor))
        else:
            print("pickle.dump start")
            self.feature_extractor = CnnFeatureExtractor(
                self.use_gpu, self.model, self.model_type,
                self.image_feature_dim)
            pickle.dump(self.feature_extractor,
                        open(self.cnn_feature_extractor, 'wb'))
            print("pickle.dump finished")

        self.time = 0
        self.epsilon = 1.0  # Initial exploration rate
        self.q_net = QNet(self.use_gpu, self.num_of_action_type,
                          self.num_of_pad, self.q_net_input_dim)
Example #7
    def __init__(self,
                 obs_dims,
                 act_dim,
                 lr=1e-3,
                 gamma=0.99,
                 replay_buffer_size=10000,
                 batch_size=64,
                 epsilon_min=0.01,
                 epsilon_dec=5e-5,
                 target_update_frequency=64):
        self.buffer = ReplayBuffer(replay_buffer_size, obs_dims)
        self.batch_size = batch_size
        self.q_eval = QNet(obs_dims, act_dim)
        self.q_target = QNet(obs_dims, act_dim)
        self.obs_dims = obs_dims
        self.act_dim = act_dim
        self.learn_ctr = 0
        self.target_update_frequency = target_update_frequency
        self.gamma = gamma
        self.epsilon = 1
        self.epsilon_min = epsilon_min
        self.epsilon_dec = epsilon_dec
        self.optimizer = torch.optim.Adam(self.q_eval.parameters(), lr=lr)
        self.loss_fn = torch.nn.MSELoss()
Example #8
    def agent_init(self, **options):
        self.use_gpu = options['use_gpu']
        self.depth_image_dim = options['depth_image_dim']
        self.q_net_input_dim = self.image_feature_dim * self.image_feature_count + self.depth_image_dim

        if os.path.exists(self.cnn_feature_extractor):
            print("loading... " + self.cnn_feature_extractor),
            self.feature_extractor = pickle.load(open(self.cnn_feature_extractor))
            print("done")
        else:
            self.feature_extractor = CnnFeatureExtractor(self.use_gpu, self.model, self.model_type, self.image_feature_dim)
            pickle.dump(self.feature_extractor, open(self.cnn_feature_extractor, 'wb'))
            print("pickle.dump finished")

        self.time = 0
        self.epsilon = 1.0  # Initial exploration rate
        self.q_net = QNet(self.use_gpu, self.actions, self.q_net_input_dim)
Example #9
    def agent_init(self, **options):
        self.use_gpu = options['use_gpu']
        self.pad_state_dim = options['pad_states_dim']
        self.q_net_input_dim = self.image_feature_dim + self.pad_state_dim

        if os.path.exists(self.cnn_feature_extractor):
            print("loading... " + self.cnn_feature_extractor),
            self.feature_extractor = pickle.load(open(self.cnn_feature_extractor))
        else:
            print("pickle.dump start")
            self.feature_extractor = CnnFeatureExtractor(self.use_gpu, self.model, self.model_type, self.image_feature_dim)
            pickle.dump(self.feature_extractor, open(self.cnn_feature_extractor, 'wb'))
            print("pickle.dump finished")

        self.time = 0
        self.epsilon = 1.0  # Initial exploration rate
        self.q_net = QNet(self.use_gpu, self.num_of_action_type, self.num_of_pad, self.q_net_input_dim)
Example #10
class CnnDqnAgent(object):
    policy_frozen = False
    epsilon_delta = 1.0 / (6 * 10**4)
    min_eps = 0.1

    actions = [0, 1, 2, 3, 4, 5, 6, 7]

    image_feature_dim = 14 * 14
    image_feature_count = 32
    image_dim = 128 * 128
    avgloss_log = '/home/ohk/Documents/playground/Assets/log/'

    def _observation_to_state_cnn(self, observation):
        temp = []
        for i in range(len(observation["image"])):
            temp.append(np.r_[observation["image"][i]])
        return np.r_[temp]

    def _observation_to_state_other(self, observation):
        temp = []
        # change in another network structure
        for i in range(len(observation["ir"])):
            temp.append(np.r_[observation["ir"][i], observation["compass"][i],
                              observation["target"][i]])
        return np.r_[temp]

    def _reshape_for_cnn(self, state, hist_size, x, y):

        state_ = np.zeros((self.agent_count, 3 * hist_size, 128, 128),
                          dtype=np.float32)

        for i in range(self.agent_count):
            if hist_size == 1:
                state_[i] = state[i][0].transpose(2, 0, 1)
            elif hist_size == 2:
                state_[i] = np.c_[state[i][0], state[i][1]].transpose(2, 0, 1)
            elif hist_size == 4:
                state_[i] = np.c_[state[i][0], state[i][1], state[i][2],
                                  state[i][3]].transpose(2, 0, 1)

        return state_

    def agent_init(self, **options):
        self.use_gpu = options['use_gpu']
        self.agent_count = options['agent_count']
        self.image_count = options['rgb_image_count']
        self.depth_image_dim = options['depth_image_dim']
        self.ir_idm = options['ir_dim']
        self.ground_dim = options['ground_dim']
        self.compass_dim = options['compass_dim']
        self.target_dim = options['target_dim']
        self.model = options['model']

        self.cnn_input_dim = self.image_dim * self.image_count
        self.feature_dim = self.image_feature_dim * self.image_feature_count
        self.other_input_dim = self.depth_image_dim + self.ir_idm + self.ground_dim + self.compass_dim + self.target_dim

        self.time = 1
        self.epsilon = 1.0
        self.avgloss_log_file = self.avgloss_log + "avg_loss.log"

        if self.model != 'None':
            self.policy_frozen = False
            self.epsilon = 0.5

        self.q_net = QNet(self.use_gpu, self.actions, self.cnn_input_dim,
                          self.feature_dim, self.agent_count,
                          self.other_input_dim, self.model)

    def agent_start(self, observation, reward):
        obs_cnn_array = self._observation_to_state_cnn(observation)
        obs_other_array = self._observation_to_state_other(observation)

        # Initialize State
        self.state_cnn = np.zeros(
            (self.agent_count, self.q_net.hist_size, 128, 128, 3),
            dtype=np.uint8)

        for i in range(self.agent_count):
            self.state_cnn[i][self.q_net.hist_size - 1] = obs_cnn_array[i]
        state_cnn_ = self._reshape_for_cnn(self.state_cnn,
                                           self.q_net.hist_size, 128, 128)
        state_cnn_ /= 255.0

        self.state_other = np.zeros(
            (self.agent_count, self.q_net.hist_size, self.other_input_dim),
            dtype=np.uint8)
        for i in range(self.agent_count):
            self.state_other[i][self.q_net.hist_size - 1] = obs_other_array[i]
        state_other_ = np.asanyarray(self.state_other.reshape(
            self.agent_count, self.q_net.hist_size * self.other_input_dim),
                                     dtype=np.float32)
        state_other_ /= 255.0

        if self.use_gpu >= 0:
            state_cnn_ = cuda.to_gpu(state_cnn_)
            state_other_ = cuda.to_gpu(state_other_)

        if self.policy_frozen is False:  # Learning ON/OFF
            if self.q_net.initial_exploration <= self.time:
                self.epsilon -= self.epsilon_delta
                if self.epsilon < self.min_eps:
                    self.epsilon = self.min_eps
                eps = self.epsilon
                print(("\naTraining Now. Time step : %d Epsilon : %.6f" %
                       (self.time, eps)))
            else:  # Initial Exploration Phase
                eps = 1.0
                print(("\naInitial Exploration S : %d/%d Epsilon : %.6f" %
                       (self.time, self.q_net.initial_exploration, eps)))

        # Generate an Action e-greedy
        action, q_now = self.q_net.e_greedy(state_cnn_, state_other_,
                                            self.epsilon, reward)

        # Update for next step
        self.last_action = action.copy()
        self.last_state_cnn = self.state_cnn.copy()
        self.last_state_other = self.state_other.copy()

        del state_cnn_, state_other_, obs_cnn_array, obs_other_array
        gc.collect()

        self.time += 1

        return action, q_now

    def agent_step(self, reward, observation):
        obs_cnn_array = self._observation_to_state_cnn(observation)
        obs_other_array = self._observation_to_state_other(observation)

        #         img = observation["image"][0]
        #         img.save("img.png")

        # Compose State : 4-step sequential observation
        for i in range(self.agent_count):
            if self.q_net.hist_size == 4:
                self.state_cnn[i] = np.asanyarray([
                    self.state_cnn[i][1], self.state_cnn[i][2],
                    self.state_cnn[i][3], obs_cnn_array[i]
                ],
                                                  dtype=np.uint8)
                if (obs_other_array.size != 0):
                    self.state_other[i] = np.asanyarray([
                        self.state_other[i][1], self.state_other[i][2],
                        self.state_other[i][3], obs_other_array[i]
                    ],
                                                        dtype=np.uint8)
            elif self.q_net.hist_size == 2:
                self.state_cnn[i] = np.asanyarray(
                    [self.state_cnn[i][1], obs_cnn_array[i]], dtype=np.uint8)
                if (obs_other_array.size != 0):
                    self.state_other[i] = np.asanyarray(
                        [self.state_other[i][1], obs_other_array[i]],
                        dtype=np.uint8)
            elif self.q_net.hist_size == 1:
                self.state_cnn[i] = np.asanyarray([obs_cnn_array[i]],
                                                  dtype=np.uint8)
                if (obs_other_array.size != 0):
                    self.state_other[i] = np.asanyarray([obs_other_array[i]],
                                                        dtype=np.uint8)
            else:
                print("self.DQN.hist_size err")

        state_cnn_ = self._reshape_for_cnn(self.state_cnn,
                                           self.q_net.hist_size, 128, 128)
        state_cnn_ /= 255.0

        state_other_ = np.asanyarray(self.state_other.reshape(
            self.agent_count, self.q_net.hist_size * self.other_input_dim),
                                     dtype=np.float32)
        state_other_ /= 255.0

        if self.use_gpu >= 0:
            state_cnn_ = cuda.to_gpu(state_cnn_)
            state_other_ = cuda.to_gpu(state_other_)

        # Exploration decays along the time sequence
        if self.policy_frozen is False:  # Learning ON/OFF
            if self.q_net.initial_exploration <= self.time:
                self.epsilon -= self.epsilon_delta
                if self.epsilon < self.min_eps:
                    self.epsilon = self.min_eps
                eps = self.epsilon
                print(("\nbTraining Now. Time step : %d Epsilon : %.6f" %
                       (self.time, eps)))
            else:  # Initial Exploration Phase
                eps = 1.0
                print(("\nInitial Exploration : %d/%d Epsilon : %.6f" %
                       (self.time, self.q_net.initial_exploration, eps)))
        else:  # Evaluation
            eps = 0.05
            print(("\nPolicy is Frozen. Time step : %d Epsilon : %.6f" %
                   (self.time, eps)))

        # Generate an Action by e-greedy action selection
        action, q_now = self.q_net.e_greedy(state_cnn_, state_other_, eps,
                                            reward)

        del state_cnn_, state_other_, obs_cnn_array, obs_other_array
        gc.collect()

        return action, eps, q_now

    def agent_step_update(self, reward, action, eps):
        # Learning Phase
        if self.policy_frozen is False:  # Learning ON/OFF
            self.q_net.stock_experience(self.time, self.last_state_cnn,
                                        self.last_state_other,
                                        self.last_action, reward,
                                        self.state_cnn, self.state_other,
                                        False)
            self.q_net.experience_replay(self.time)

        if self.policy_frozen is False:
            self.last_action = action.copy()  # copy.deepcopy(action)
            self.last_state_cnn = self.state_cnn.copy()
            self.last_state_other = self.state_other.copy()

        self.time += 1

    def agent_end(self, reward):  # Episode Terminated
        print(('episode finished. Time step : %d' % self.time))

        print(("agent"), end=' ')
        for i in range(self.agent_count):
            print(("[%02d]        ( )reward(%06.2f)" % (i, reward[i])),
                  end=' ')
            if i % 5 == 4:
                print(("\n     "), end=' ')

        # Learning Phase
        if self.policy_frozen is False:  # Learning ON/OFF
            self.q_net.stock_experience(self.time, self.last_state_cnn,
                                        self.last_state_other,
                                        self.last_action, reward,
                                        self.last_state_cnn,
                                        self.last_state_other, True)
            self.q_net.experience_replay(self.time)
        avg_episode_loss = 0
        if self.q_net.time_of_episode != 0:
            avg_episode_loss = self.q_net.loss_per_episode / self.q_net.time_of_episode
        self.q_net.loss_per_episode = 0
        self.q_net.time_of_episode = 0
        with open(self.avgloss_log_file, 'a') as the_file:
            the_file.write(str(self.time) + "," + str(avg_episode_loss) + "\n")
        # Time count
#         if self.policy_frozen is False:
        self.time += 1
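The agents in Example #10 keep a hist_size frame history per agent and hand the CNN a channels-first tensor of shape (3 * hist_size, 128, 128), scaled to [0, 1]. Below is a standalone sketch of that stacking step with random data; it is illustrative only and not the project's exact _reshape_for_cnn helper:

import numpy as np

# Stack the last hist_size RGB frames (H, W, 3) into one channels-first
# array of shape (3 * hist_size, H, W) and scale it for the CNN input.
hist_size, h, w = 4, 128, 128
frames = [np.random.randint(0, 256, (h, w, 3), dtype=np.uint8)
          for _ in range(hist_size)]
stacked = np.concatenate([f.transpose(2, 0, 1) for f in frames], axis=0)
stacked = stacked.astype(np.float32) / 255.0
assert stacked.shape == (3 * hist_size, h, w)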
Example #11
class CnnDqnAgent(object):
    policy_frozen = False
    epsilon_delta = 1.0 / 10 ** 4.4
    min_eps = 0.1

    actions = [0, 1, 2]

    cnn_feature_extractor = 'alexnet_feature_extractor.pickle'
    model = 'bvlc_alexnet.caffemodel'
    model_type = 'alexnet'
    image_feature_dim = 256 * 6 * 6
    image_feature_count = 1

    def _observation_to_featurevec(self, observation):
        # TODO clean
        if self.image_feature_count == 1:
            return np.r_[self.feature_extractor.feature(observation["image"][0]),
                         observation["depth"][0]]
        elif self.image_feature_count == 4:
            return np.r_[self.feature_extractor.feature(observation["image"][0]),
                         self.feature_extractor.feature(observation["image"][1]),
                         self.feature_extractor.feature(observation["image"][2]),
                         self.feature_extractor.feature(observation["image"][3]),
                         observation["depth"][0],
                         observation["depth"][1],
                         observation["depth"][2],
                         observation["depth"][3]]
        else:
            print("not supported: number of camera")

    def agent_init(self, **options):
        self.use_gpu = options['use_gpu']
        self.depth_image_dim = options['depth_image_dim']
        self.q_net_input_dim = self.image_feature_dim * self.image_feature_count + self.depth_image_dim

        if os.path.exists(self.cnn_feature_extractor):
            print("loading... " + self.cnn_feature_extractor),
            self.feature_extractor = pickle.load(open(self.cnn_feature_extractor))
            print("done")
        else:
            self.feature_extractor = CnnFeatureExtractor(self.use_gpu, self.model, self.model_type, self.image_feature_dim)
            pickle.dump(self.feature_extractor, open(self.cnn_feature_extractor, 'wb'))
            print("pickle.dump finished")

        self.time = 0
        self.epsilon = 1.0  # Initial exploration rate
        self.q_net = QNet(self.use_gpu, self.actions, self.q_net_input_dim)

    def agent_start(self, observation):
        obs_array = self._observation_to_featurevec(observation)

        # Initialize State
        self.state = np.zeros((self.q_net.hist_size, self.q_net_input_dim), dtype=np.uint8)
        self.state[0] = obs_array
        state_ = np.asanyarray(self.state.reshape(1, self.q_net.hist_size, self.q_net_input_dim), dtype=np.float32)
        if self.use_gpu >= 0:
            state_ = cuda.to_gpu(state_)

        # Generate an Action e-greedy
        action, q_now = self.q_net.e_greedy(state_, self.epsilon)
        return_action = action

        # Update for next step
        self.last_action = copy.deepcopy(return_action)
        self.last_state = self.state.copy()
        self.last_observation = obs_array

        return return_action

    def agent_step(self, reward, observation):
        obs_array = self._observation_to_featurevec(observation)

        #obs_processed = np.maximum(obs_array, self.last_observation)  # Take maximum from two frames

        # Compose State : 4-step sequential observation
        if self.q_net.hist_size == 4:
            self.state = np.asanyarray([self.state[1], self.state[2], self.state[3], obs_array], dtype=np.uint8)
        elif self.q_net.hist_size == 2:
            self.state = np.asanyarray([self.state[1], obs_array], dtype=np.uint8)
        elif self.q_net.hist_size == 1:
            self.state = np.asanyarray([obs_array], dtype=np.uint8)
        else:
            print("self.DQN.hist_size err")

        state_ = np.asanyarray(self.state.reshape(1, self.q_net.hist_size, self.q_net_input_dim), dtype=np.float32)
        if self.use_gpu >= 0:
            state_ = cuda.to_gpu(state_)

        # Exploration decays along the time sequence
        if self.policy_frozen is False:  # Learning ON/OFF
            if self.q_net.initial_exploration < self.time:
                self.epsilon -= self.epsilon_delta
                if self.epsilon < self.min_eps:
                    self.epsilon = self.min_eps
                eps = self.epsilon
            else:  # Initial Exploration Phase
                print("Initial Exploration : %d/%d steps" % (self.time, self.q_net.initial_exploration))
                eps = 1.0
        else:  # Evaluation
            print("Policy is Frozen")
            eps = 0.05

        # Generate an Action by e-greedy action selection
        action, q_now = self.q_net.e_greedy(state_, eps)

        return action, eps, q_now, obs_array

    def agent_step_update(self, reward, action, eps, q_now, obs_array):
        # Learning Phase
        if self.policy_frozen is False:  # Learning ON/OFF
            self.q_net.stock_experience(self.time, self.last_state, self.last_action, reward, self.state, False)
            self.q_net.experience_replay(self.time)

        # Target model update
        if self.q_net.initial_exploration < self.time and np.mod(self.time, self.q_net.target_model_update_freq) == 0:
            print("Model Updated")
            self.q_net.target_model_update()

        # Simple text based visualization
        if self.use_gpu >= 0:
            q_max = np.max(q_now.get())
        else:
            q_max = np.max(q_now)

        print('Step:%d  Action:%d  Reward:%.1f  Epsilon:%.6f  Q_max:%3f' % (
            self.time, self.q_net.action_to_index(action), reward, eps, q_max))

        # Updates for next step
        self.last_observation = obs_array

        if self.policy_frozen is False:
            self.last_action = copy.deepcopy(action)
            self.last_state = self.state.copy()
            self.time += 1

    def agent_end(self, reward):  # Episode Terminated
        print('episode finished. Reward:%.1f / Epsilon:%.6f' % (reward, self.epsilon))

        # Learning Phase
        if self.policy_frozen is False:  # Learning ON/OFF
            self.q_net.stock_experience(self.time, self.last_state, self.last_action, reward, self.last_state,
                                        True)
            self.q_net.experience_replay(self.time)

        # Target model update
        if self.q_net.initial_exploration < self.time and np.mod(self.time, self.q_net.target_model_update_freq) == 0:
            print("Model Updated")
            self.q_net.target_model_update()

        # Time count
        if self.policy_frozen is False:
            self.time += 1
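The q_net.e_greedy call that drives action selection in these examples is defined elsewhere in the project. A minimal standalone sketch of epsilon-greedy selection over a vector of Q-values (NumPy only; the names are illustrative):

import numpy as np

def epsilon_greedy(q_values, epsilon, rng=np.random):
    """Return a random action index with probability epsilon, else the greedy one."""
    if rng.rand() < epsilon:
        return int(rng.randint(len(q_values)))
    return int(np.argmax(q_values))

# e.g. epsilon_greedy(np.array([0.1, 0.5, 0.2]), epsilon=0.05) almost always returns 1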
Example #12
class DqnAgent():
    def __init__(self,
                 obs_dims,
                 act_dim,
                 lr=1e-3,
                 gamma=0.99,
                 replay_buffer_size=10000,
                 batch_size=64,
                 epsilon_min=0.01,
                 epsilon_dec=5e-5,
                 target_update_frequency=64):
        self.buffer = ReplayBuffer(replay_buffer_size, obs_dims)
        self.batch_size = batch_size
        self.q_eval = QNet(obs_dims, act_dim)
        self.q_target = QNet(obs_dims, act_dim)
        self.obs_dims = obs_dims
        self.act_dim = act_dim
        self.learn_ctr = 0
        self.target_update_frequency = target_update_frequency
        self.gamma = gamma
        self.epsilon = 1
        self.epsilon_min = epsilon_min
        self.epsilon_dec = epsilon_dec
        self.optimizer = torch.optim.Adam(self.q_eval.parameters(), lr=lr)
        self.loss_fn = torch.nn.MSELoss()

    def update_target(self):
        if self.learn_ctr % self.target_update_frequency == 0:
            self.q_target.load_state_dict(self.q_eval.state_dict())

    def decrement_epsilon(self):
        if self.epsilon > self.epsilon_min:
            self.epsilon = self.epsilon - self.epsilon_dec

    def choose_action(self, obs):
        if np.random.sample() < self.epsilon:
            return np.random.randint(self.act_dim)
        else:
            obs = torch.tensor(np.expand_dims(obs, axis=0), dtype=torch.float)
            return torch.argmax(self.q_eval(obs)).item()

    def store_transition(self, obs, act, rew, _obs, done):
        self.buffer.push(obs, act, rew, _obs, done)

    def sample_replay_buffer(self):
        return self.buffer.sample(self.batch_size)

    def learn(self):
        self.optimizer.zero_grad()
        obs, act, rew, _obs, done = self.sample_replay_buffer()
        obs = torch.tensor(obs, dtype=torch.float)
        act = torch.tensor(act, dtype=torch.long)
        rew = torch.tensor(rew, dtype=torch.float)  # keep rewards as floats (they may be fractional)
        _obs = torch.tensor(_obs, dtype=torch.float)
        done = torch.tensor(done, dtype=torch.long)
        idxs = torch.tensor(np.arange(self.batch_size), dtype=torch.long)
        q_pred = self.q_eval(obs)[idxs, act]
        q_next = self.q_target(_obs).max(dim=1)[0]
        q_target = rew + (1 - done) * self.gamma * q_next
        loss = self.loss_fn(q_target, q_pred)
        loss.backward()
        self.optimizer.step()
        self.update_target()
        self.decrement_epsilon()
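A hypothetical driver loop for the DqnAgent above, shown only as a sketch: it assumes ReplayBuffer and QNet from the surrounding project and a Gym-style environment whose reset() returns an observation and whose step(action) returns (obs, reward, done, info):

def run_episode(agent, env):
    """Run one episode, storing transitions and learning once a batch is available."""
    obs = env.reset()
    done, steps = False, 0
    while not done:
        act = agent.choose_action(obs)
        next_obs, rew, done, _ = env.step(act)
        agent.store_transition(obs, act, rew, next_obs, done)
        steps += 1
        if steps >= agent.batch_size:  # conservative: enough transitions for one batch
            agent.learn()
        obs = next_obs
    return steps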
Example #13
class Agent(object):
    def __init__(self, g_list, test_g_list, env):
        self.g_list = g_list
        if test_g_list is None:
            self.test_g_list = g_list
        else:
            self.test_g_list = test_g_list

        self.env = env
        self.net = QNet()
        self.old_net = QNet()
        self.optimizer = optim.Adam(self.net.parameters(),
                                    lr=cmd_args.learning_rate)

        if cmd_args.ctx == 'gpu':
            self.net = self.net.cuda()
            self.old_net = self.old_net.cuda()

        self.eps_start = 1.0
        self.eps_end = 1.0
        self.eps_step = 10000
        self.burn_in = 100  # number of iterations for the first set of simulations (initial burn-in of the memory)?
        self.step = 0

        self.best_eval = None
        self.pos = 0
        self.sample_idxes = list(range(len(g_list)))
        random.shuffle(self.sample_idxes)
        self.take_snapshot()

    def take_snapshot(self):
        self.old_net.load_state_dict(self.net.state_dict())

    # type = 0 for add, 1 for subtract
    def make_actions(self, greedy=True, _type=0):
        self.eps = self.eps_end + max(
            0., (self.eps_start - self.eps_end) *
            (self.eps_step - max(0., self.step)) / self.eps_step)

        cur_state = self.env.getStateRef()

        actions, q_arrs = self.net(cur_state,
                                   None,
                                   greedy_acts=True,
                                   _type=_type)

        q_vals = []

        for i in range(len(q_arrs)):
            tmp = q_arrs[i].numpy()
            tmp = tmp[actions[i]][0]
            q_vals.append(tmp)

        return actions, q_vals

    def run_simulation(self):

        self.env.setup(g_list)
        avg_rewards = []

        t_a, t_s = 0, 0

        for asdf in range(GLOBAL_EPISODE_STEPS):

            if asdf % 2 == 0:
                assert self.env.first_nodes == None

            for i in range(len(self.g_list)):

                g = self.g_list[i].to_networkx()

                con_nodes = list(set(list(sum(g.edges, ()))))
                for j in range(20):
                    if (j not in con_nodes):
                        rand_num = np.random.randint(0, 20)
                        g.add_edge(j, rand_num)
                        self.env.added_edges.append((j, rand_num))

                self.g_list[i] = S2VGraph(g, label=self.g_list[i].label)

            action_type = (asdf % 4) // 2

            # get Actions
            list_at, _ = self.make_actions(_type=action_type)

            # save State
            list_st = self.env.cloneState()

            cur_state = self.env.getStateRef()

            _, predicted_Q = self.net(cur_state,
                                      None,
                                      greedy_acts=False,
                                      _type=action_type)

            # get Rewards
            if self.env.first_nodes is not None:
                rewards = self.env.get_rewards(list_at, _type=action_type)
                avg_rewards.append(sum(rewards) / len(rewards))
            else:
                rewards = [0] * len(g_list)

            # Update graph to get S'
            self.env.step(list_at, _type=action_type)

            # get next state
            if env.isTerminal():
                s_prime = None
            else:
                s_prime = self.env.cloneState()

            # get S'and A' values
            try:
                sprime_at, q_primes = self.make_actions(_type=action_type)

            except:
                continue

            # Calculate Q(S', A')
            actual_Q = torch.Tensor(rewards) + torch.Tensor(q_primes)

            # Pass loss to network
            loss = F.mse_loss(predicted_Q, actual_Q)
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

        return avg_rewards

    def train(self):

        # set up progress bar
        pbar = tqdm(range(GLOBAL_NUM_STEPS), unit='steps')
        avgs = []
        # for each iteration
        for self.step in pbar:
            # run simulation
            # side effects?
            avgs += self.run_simulation()
            #print("tmp: ", tmp)
            #avg_reward_step.append(sum(tmp)/len(tmp))
            #plt.plot(tmp)
            #plt.show()
            #plt.savefig('test.png')
        print("avgs: ", avgs)
        mov_avg = np.convolve(np.array(avgs), np.ones(4), 'valid') / 4
        print("mov avg: ", list(mov_avg))
        print(type(mov_avg))
        print(mov_avg.shape)
        plt.clf()
        plt.plot(list(mov_avg))
        plt.title('running average of average rewards')

        plt.savefig("Results.png")

        plt.show()
Example #14
class CnnDqnAgent(object):
    def __init__(self):
        super(CnnDqnAgent, self).__init__()
        self.policy_frozen = False
        self.epsilon_delta = 1.0 / 10**4.4
        self.min_eps = 0.1
        self.actions = [0, 1, 2]

        self.cnn_feature_extractor = 'alexnet_feature_extractor.pickle'
        self.model = 'bvlc_alexnet.caffemodel'
        self.model_type = 'alexnet'
        self.image_feature_dim = 256 * 6 * 6
        self.image_feature_count = 1

        self.prediction_update_tick = 0

    def _observation_to_featurevec(self, observation):
        feature_image = [
            self.feature_extractor(observation["image"][i])
            for i in range(self.image_feature_count)
        ]
        return np.concatenate(feature_image + observation["depth"])

    def agent_init(self, **options):
        self.use_gpu = options['use_gpu']
        self.depth_image_dim = options['depth_image_dim']
        self.q_net_input_dim = self.image_feature_dim * self.image_feature_count + self.depth_image_dim

        if os.path.exists(self.cnn_feature_extractor):
            print("loading... " + self.cnn_feature_extractor)
            with open(self.cnn_feature_extractor, 'rb') as f:
                self.feature_extractor = pickle.load(f)
            print("done")
        else:
            print('there is no chainer alexnet model file ',
                  self.cnn_feature_extractor)
            print('making chainer model from ', self.model)
            print('this process takes tens of minutes.')
            self.feature_extractor = CnnFeatureExtractor(
                self.use_gpu, self.model, self.model_type,
                self.image_feature_dim)
            pickle.dump(self.feature_extractor,
                        open(self.cnn_feature_extractor, 'wb'))
            print("pickle.dump finished")

        self.time = 0
        self.epsilon = 1.0  # Initial exploration rate
        self.q_net = QNet(self.use_gpu, self.actions, self.q_net_input_dim)

    def agent_start(self, observation):
        # Initialize State
        self.state = np.zeros((self.q_net.hist_size, self.q_net_input_dim),
                              dtype=np.float32)

        new_feature_vec = self._observation_to_featurevec(observation)
        self.state[0, :] = new_feature_vec

        # Generate an Action e-greedy
        state_ = np.expand_dims(self.state, 0)
        if self.use_gpu >= 0:
            state_ = cuda.to_gpu(state_, device=self.use_gpu)
        action, _, deg_intereset = self.q_net.e_greedy(state_, self.epsilon)
        return_action = action

        # Update for next step
        self.last_action = copy.deepcopy(return_action)
        self.last_state = self.state.copy()
        self.last_observation = new_feature_vec

        return return_action, deg_intereset

    def agent_step(self, reward, observation):
        new_feature_vec = self._observation_to_featurevec(observation)
        past_states = self.state[0:-1, :]
        self.state[0, :] = new_feature_vec
        self.state[1:, :] = past_states

        # Exploration decays along the time sequence
        state_ = np.expand_dims(self.state, 0)
        if self.use_gpu >= 0:
            state_ = cuda.to_gpu(state_, device=self.use_gpu)

        if self.policy_frozen is False:  # Learning ON/OFF
            if self.q_net.initial_exploration < self.time:
                self.epsilon -= self.epsilon_delta
                if self.epsilon < self.min_eps:
                    self.epsilon = self.min_eps
                eps = self.epsilon
            else:  # Initial Exploration Phase
                print("Initial Exploration : %d/%d steps" %
                      (self.time, self.q_net.initial_exploration))
                eps = 1.0
        else:  # Evaluation
            print("Policy is Frozen")
            eps = 0.05

        # Generate an Action by e-greedy action selection
        action, q_now, deg_intereset = self.q_net.e_greedy(state_, eps)

        return action, eps, q_now, new_feature_vec, deg_intereset

    def agent_step_update(self, reward, action, eps, q_now, new_feature_vec,
                          deg_intereset):
        # Learning Phase
        if self.policy_frozen is False:  # Learning ON/OFF
            self.q_net.stock_experience(self.time, self.last_state,
                                        self.last_action, reward, self.state,
                                        False)
            self.q_net.experience_replay(self.time)

        self.prediction_update_tick += 1
        if self.prediction_update_tick >= 10:
            self.prediction_update_tick = 0
            print('prediction update')
            self.q_net.prediction_update()

        # Target model update
        if self.q_net.initial_exploration < self.time and np.mod(
                self.time, self.q_net.target_model_update_freq) == 0:
            print("Model Updated")
            self.q_net.target_model_update()

        # Simple text based visualization
        if self.use_gpu >= 0:
            q_max = np.max(q_now.get())
        else:
            q_max = np.max(q_now)

        print(
            'Step:%d  Action:%d  Reward:%.1f  Epsilon:%.6f  Q_max:%3f def_interest:%3f'
            % (self.time, self.q_net.action_to_index(action), reward, eps,
               q_max, deg_intereset))

        # Updates for next step
        self.last_observation = new_feature_vec

        if self.policy_frozen is False:
            self.last_action = copy.deepcopy(action)
            self.last_state = self.state.copy()
            self.time += 1

    def agent_end(self, reward):  # Episode Terminated
        print('episode finished. Reward:%.1f / Epsilon:%.6f' %
              (reward, self.epsilon))

        # Learning Phase
        if self.policy_frozen is False:  # Learning ON/OFF
            self.q_net.stock_experience(self.time, self.last_state,
                                        self.last_action, reward,
                                        self.last_state, True)
            self.q_net.experience_replay(self.time)

        # Target model update
        if self.q_net.initial_exploration < self.time and np.mod(
                self.time, self.q_net.target_model_update_freq) == 0:
            print("Model Updated")
            self.q_net.target_model_update()

        # Time count
        if self.policy_frozen is False:
            self.time += 1
Example #15
class CnnDqnAgent(object):
    policy_frozen = False
    epsilon_delta = 1.0 / 10 ** 4.4  # print '%.10f' % (1.0 / 10 ** 4.4) -> 0.0000398107
    min_eps = 0.1

    actions = [0, 1, 2]

    cnn_feature_extractor = 'alexnet_feature_extractor.pickle'
    model = 'bvlc_alexnet.caffemodel'
    model_type = 'alexnet'
    image_feature_dim = 256 * 6 * 6
    image_feature_count = 1
    actions_evaluate = deque(maxlen=4)  # sliding window of the last four actions

    def _observation_to_featurevec(self, observation):
        # TODO clean
        if self.image_feature_count == 1:
            #print observation["image"][0].shape, type(observation["image"][0])#会error因为不是np所以没有shape
            #print self.feature_extractor.feature(observation["image"][0]).shape#,\#返回的是1D的256*6*6
            #observation["depth"][0].shape
            return np.r_[self.feature_extractor.feature(observation["image"][0])]
                         #, observation["depth"][0]]
        elif self.image_feature_count == 4:
            return np.r_[self.feature_extractor.feature(observation["image"][0]),
                         self.feature_extractor.feature(observation["image"][1]),
                         self.feature_extractor.feature(observation["image"][2]),
                         self.feature_extractor.feature(observation["image"][3])]#
                         # observation["depth"][0],
                         # observation["depth"][1],
                         # observation["depth"][2],
                         # observation["depth"][3]]
        else:
            print("not supported: number of camera")

    def agent_init(self, **options):
        self.use_gpu = options['use_gpu']
        #self.depth_image_dim = options['depth_image_dim']
        self.q_net_input_dim = self.image_feature_dim * self.image_feature_count #+ self.depth_image_dim

        if os.path.exists(self.cnn_feature_extractor):
            print("loading... " + self.cnn_feature_extractor),
            self.feature_extractor = pickle.load(open(self.cnn_feature_extractor))
            print("done")
        else:
            self.feature_extractor = CnnFeatureExtractor(self.use_gpu, self.model, self.model_type, self.image_feature_dim)
            pickle.dump(self.feature_extractor, open(self.cnn_feature_extractor, 'wb'))
            print("pickle.dump finished")

        self.time = 0
        self.epsilon = 1.0  # Initial exploration rate
        self.q_net = QNet(self.use_gpu, self.actions, self.q_net_input_dim)

    def agent_start(self, observation):
        obs_array = self._observation_to_featurevec(observation)  # features are extracted up front and joined with np.r_

        # Initialize State
        self.state = np.zeros((self.q_net.hist_size, self.q_net_input_dim), dtype=np.uint8)
        self.state[0] = obs_array
        state_ = np.asanyarray(self.state.reshape(1, self.q_net.hist_size, self.q_net_input_dim), dtype=np.float32)
        if self.use_gpu >= 0:
            state_ = cuda.to_gpu(state_)

        # Generate an Action e-greedy
        action, q_now = self.q_net.e_greedy(state_, self.epsilon)
        return_action = action
        print(return_action, type(return_action))  # e.g. Random 2 <type 'int'>
        # Update for next step
        self.last_action = copy.deepcopy(return_action)
        self.last_state = self.state.copy()  # becomes the start of the next state
        self.last_observation = obs_array

        return return_action

    def agent_step(self, reward, observation):
        obs_array = self._observation_to_featurevec(observation)  # features are extracted up front and joined with np.r_

        #obs_processed = np.maximum(obs_array, self.last_observation)  # Take maximum from two frames

        # Compose State : 4-step sequential observation
        if self.q_net.hist_size == 4:
            self.state = np.asanyarray([self.state[1], self.state[2], self.state[3], obs_array], dtype=np.uint8)
        elif self.q_net.hist_size == 2:
            self.state = np.asanyarray([self.state[1], obs_array], dtype=np.uint8)
        elif self.q_net.hist_size == 1:
            self.state = np.asanyarray([obs_array], dtype=np.uint8)
        else:
            print("self.DQN.hist_size err")

        state_ = np.asanyarray(self.state.reshape(1, self.q_net.hist_size, self.q_net_input_dim), dtype=np.float32)
        # state_ converts self.state (uint8, built as np.asanyarray([obs_array], dtype=np.uint8)) into a float32 array of shape (1, 1, 256*6*6)
        if self.use_gpu >= 0:
            state_ = cuda.to_gpu(state_)

        # Exploration decays along the time sequence
        if self.policy_frozen is False:  # Learning ON/OFF
            if self.q_net.initial_exploration < self.time:  # once self.time exceeds initial_exploration (e.g. 10000 steps)
                self.epsilon -= self.epsilon_delta          # start decaying eps gradually
                if self.epsilon < self.min_eps:   # if eps has decayed below the preset minimum,
                    self.epsilon = self.min_eps   # clamp it to min_eps = 0.1
                eps = self.epsilon  # self.epsilon starts at 1.0 (line 61); used in q_net.py line 160: if np.random.rand() < epsilon:
            else:  # Initial Exploration Phase
                print("Initial Exploration : %d/%d steps" % (self.time, self.q_net.initial_exploration))
                eps = 1.0  # prints the current step out of the exploration budget (e.g. Initial Exploration : 173/1000 steps)
        else:  # Evaluation
            print("Policy is Frozen")
            eps = 0.05

        # Generate an Action by e-greedy action selection
        action, q_now = self.q_net.e_greedy(state_, eps)  # called with the 3-D array state_ and eps
        return action, eps, q_now, obs_array
        # server.py line 120: self.agent.agent_step_update(reward, action, eps, q_now, obs_array)

    def agent_step_update(self, reward, action, eps, q_now, obs_array):
        # Learning Phase
        if self.policy_frozen is False:  # Learning ON/OFF
            self.q_net.stock_experience(self.time, self.last_state, self.last_action, reward, self.state, False)
            print "-------------------  Real Index%d" % (self.q_net.data_index)#%int
            self.actions_evaluate.append(self.last_action)
            if self.actions_evaluate[-1] == self.actions.index(2) and reward >= 1.0 and len(self.actions_evaluate) == 4:
                if [self.actions_evaluate[i]for i in xrange(3)] == ([1, 0, 1]or[1, 0, 1]):
                    index = np.asanyarray(self.q_net.data_index, dtype=np.int8)
                    for i in xrange(1, len(self.actions_evaluate)+1):
                        self.q_net.d[2][index - i] -= 0.5
      #-----#   self.action_evaluate = deque()----------------------------------------------!!!!!!!!!!!!!!!!!!!!!!!!!!
            self.q_net.experience_replay(self.time)

        # Target model update
        if self.q_net.initial_exploration < self.time and np.mod(self.time, self.q_net.target_model_update_freq) == 0:
            print("Model Updated")
            self.q_net.target_model_update()

        # Simple text based visualization
        if self.use_gpu >= 0:
            q_max = np.max(q_now.get())
        else:
            q_max = np.max(q_now)

        print('Step:%d  Action:%d  Reward:%.1f  Epsilon:%.6f  Q_max:%3f' % (
            self.time, self.q_net.action_to_index(action), reward, eps, q_max))
        # e.g. Step:92  Action:0  Reward:0.0  Epsilon:1.000000  Q_max:0.000000
        # Updates for next step
        self.last_observation = obs_array

        if self.policy_frozen is False:
            self.last_action = copy.deepcopy(action)
            self.last_state = self.state.copy()
            self.time += 1

    def agent_end(self, reward):  # Episode Terminated!!
        print('episode finished. Reward:%.1f / Epsilon:%.6f' % (reward, self.epsilon))

        # Learning Phase
        if self.policy_frozen is False:  # Learning ON/OFF
            self.q_net.stock_experience(self.time, self.last_state, self.last_action, reward, self.last_state,
                                        True)

            self.q_net.experience_replay(self.time)

        # Target model update
        if self.q_net.initial_exploration < self.time and np.mod(self.time, self.q_net.target_model_update_freq) == 0:
            print("Model Updated")
            self.q_net.target_model_update()

        # Time count
        if self.policy_frozen is False:
            self.time += 1
Example #16
class CnnDqnAgent(object):
    policy_frozen = False
    epsilon_delta = 1.0 / 10**4.4
    min_eps = 0.1

    actions = [0, 1, 2]

    cnn_feature_extractor = 'alexnet_feature_extractor.pickle'
    model = 'bvlc_alexnet.caffemodel'
    model_type = 'alexnet'
    image_feature_dim = 256 * 6 * 6
    image_feature_count = 1

    def _observation_to_featurevec(self, observation):
        # TODO clean
        if self.image_feature_count == 1:
            return np.r_[
                self.feature_extractor.feature(observation["image"][0]),
                observation["depth"][0]]
        elif self.image_feature_count == 4:
            return np.r_[
                self.feature_extractor.feature(observation["image"][0]),
                self.feature_extractor.feature(observation["image"][1]),
                self.feature_extractor.feature(observation["image"][2]),
                self.feature_extractor.feature(observation["image"][3]),
                observation["depth"][0], observation["depth"][1],
                observation["depth"][2], observation["depth"][3]]
        else:
            print("not supported: number of camera")

    def agent_init(self, **options):
        self.use_gpu = options['use_gpu']
        self.depth_image_dim = options['depth_image_dim']
        self.q_net_input_dim = self.image_feature_dim * self.image_feature_count + self.depth_image_dim

        if os.path.exists(self.cnn_feature_extractor):
            print("loading... " + self.cnn_feature_extractor),
            self.feature_extractor = pickle.load(
                open(self.cnn_feature_extractor))
            print("done")
        else:
            self.feature_extractor = CnnFeatureExtractor(
                self.use_gpu, self.model, self.model_type,
                self.image_feature_dim)
            pickle.dump(self.feature_extractor,
                        open(self.cnn_feature_extractor, 'wb'))
            print("pickle.dump finished")

        self.time = 0
        self.epsilon = 1.0  # Initial exploration rate
        self.q_net = QNet(self.use_gpu, self.actions, self.q_net_input_dim)
        #self.state = np.zeros((self.q_net.hist_size, self.q_net_input_dim), dtype=np.float32)
        #self.last_state = np.zeros((self.q_net.hist_size, self.q_net_input_dim), dtype=np.float32)

    def agent_start(self, observation):
        obs_array = self._observation_to_featurevec(observation)

        # Initialize State
        #self.state = np.zeros((self.q_net.hist_size, self.q_net_input_dim), dtype=np.uint8)
        self.state = np.zeros((self.q_net.hist_size, self.q_net_input_dim),
                              dtype=np.float32)
        self.state[0] = obs_array
        state_ = np.asanyarray(self.state[0].reshape(1, self.q_net_input_dim),
                               dtype=np.float32)
        if self.use_gpu >= 0:
            state_ = cuda.to_gpu(state_)
        # reset lstm state
        self.q_net.action_model.reset()
        self.q_net.scene_model.reset()

        # Generate an Action e-greedy
        action, q_now = self.q_net.e_greedy(state_, self.epsilon)
        return_action = action

        # Update for next step
        self.last_action = copy.deepcopy(return_action)
        self.last_state = self.state.copy()
        self.last_observation = obs_array

        return return_action

    def agent_step(self, reward, observation):
        obs_array = self._observation_to_featurevec(observation)

        #obs_processed = np.maximum(obs_array, self.last_observation)  # Take maximum from two frames

        # Compose State : 4-step sequential observation
        #if self.q_net.hist_size == 4:
        #    self.state = np.asanyarray([self.state[1], self.state[2], self.state[3], obs_array], dtype=np.uint8)
        #elif self.q_net.hist_size == 2:
        #    self.state = np.asanyarray([self.state[1], obs_array], dtype=np.uint8)
        #elif self.q_net.hist_size == 1:
        #    self.state = np.asanyarray([obs_array], dtype=np.uint8)
        #else:
        #    print("self.DQN.hist_size err")

        self.state = np.vstack((self.state, obs_array))  # append obs_array as a new row (the result must be assigned)
        #self.state = np.asanyarray(self.state[len(self.state) - self.q_net.hist_size:len(self.state)], dtype=np.uint8)
        self.state = np.asanyarray(
            self.state[len(self.state) - self.q_net.hist_size:len(self.state)],
            dtype=np.float32)

        state_ = np.asanyarray(self.state[self.q_net.hist_size - 1].reshape(
            1, self.q_net_input_dim),
                               dtype=np.float32)
        if self.use_gpu >= 0:
            state_ = cuda.to_gpu(state_)

        # Exploration decays along the time sequence
        if self.policy_frozen is False:  # Learning ON/OFF
            if self.q_net.initial_exploration < self.time:
                self.epsilon -= self.epsilon_delta
                if self.epsilon < self.min_eps:
                    self.epsilon = self.min_eps
                eps = self.epsilon
            else:  # Initial Exploration Phase
                print("Initial Exploration : %d/%d steps" %
                      (self.time, self.q_net.initial_exploration))
                eps = 1.0
        else:  # Evaluation
            print("Policy is Frozen")
            eps = 0.05

        last_state_ = np.asanyarray(self.state[self.q_net.hist_size -
                                               2].reshape(
                                                   1, self.q_net_input_dim),
                                    dtype=np.float32)
        #last_state_ = np.asanyarray(self.last_state[self.q_net.hist_size-1].reshape(1, self.q_net_input_dim), dtype=np.float32)
        if self.use_gpu >= 0:
            last_state_ = cuda.to_gpu(last_state_)
        # Generate an Action by e-greedy action selection
        action, q_now, interest = self.q_net.e_greedy_with_interest(
            state_, eps, last_state_)

        print("interest is %f" % interest)

        return action, eps, q_now, obs_array, interest

    def agent_step_update(self, reward, action, eps, q_now, obs_array):
        # Learning Phase
        if self.policy_frozen is False:  # Learning ON/OFF
            self.q_net.stock_experience(self.time, self.last_state,
                                        self.last_action, reward, self.state,
                                        False)
            self.q_net.experience_replay(self.time)

        # Target model update
        if self.q_net.initial_exploration < self.time and np.mod(
                self.time, self.q_net.target_model_update_freq) == 0:
            print("Model Updated")
            self.q_net.target_model_update()

        # Simple text based visualization
        if self.use_gpu >= 0:
            q_max = np.max(q_now.get())
        else:
            q_max = np.max(q_now)

        print('Step:%d  Action:%d  Reward:%.1f  Epsilon:%.6f  Q_max:%3f' %
              (self.time, self.q_net.action_to_index(action), reward, eps,
               q_max))

        # Updates for next step
        self.last_observation = obs_array

        if self.policy_frozen is False:
            self.last_action = copy.deepcopy(action)
            self.last_state = self.state.copy()
            self.time += 1

    def agent_end(self, reward):  # Episode Terminated
        print('episode finished. Reward:%.1f / Epsilon:%.6f' %
              (reward, self.epsilon))

        # Learning Phase
        if self.policy_frozen is False:  # Learning ON/OFF
            self.q_net.stock_experience(self.time, self.last_state,
                                        self.last_action, reward,
                                        self.last_state, True)
            self.q_net.experience_replay(self.time)

        # Target model update
        if self.q_net.initial_exploration < self.time and np.mod(
                self.time, self.q_net.target_model_update_freq) == 0:
            print("Model Updated")
            self.q_net.target_model_update()

        # Time count
        if self.policy_frozen is False:
            self.time += 1
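
The example above differs from the plainer agents later in this listing mainly in that agent_step also returns an interest value computed by e_greedy_with_interest from the previous state. Below is a minimal sketch of how a caller would consume that 5-tuple and pass the relevant parts back to agent_step_update; the (reward, observation) arguments to agent_step and the get_reward_and_observation() helper are assumptions for illustration, not part of the source.

# Sketch only: consuming the 5-tuple returned by the interest-based agent_step above.
# `agent` is an instance of the class above; get_reward_and_observation() is a
# hypothetical environment hook, and the (reward, observation) signature of
# agent_step is assumed from the later examples in this listing.
reward, observation = get_reward_and_observation()
action, eps, q_now, obs_array, interest = agent.agent_step(reward, observation)
print("interest for this step: %f" % interest)  # the agent itself also prints it
agent.agent_step_update(reward, action, eps, q_now, obs_array)
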
Example #17
    def agent_init(self):
        self.q_net = QNet(self.use_gpu, self.q_net_input_dim, self.agent_id)
Example #18
class CnnDqnAgent(object):
    policy_frozen = False  # set to True to stop learning and only run the policy
    epsilon_delta = 1.0 / 10**4.4
    min_eps = 0.1

    # press, up, down, left, right, none
    num_of_action_type = 6
    num_of_pad = 5

    cnn_feature_extractor = 'alexnet_feature_extractor.pickle'
    model = 'bvlc_alexnet.caffemodel'
    model_type = 'alexnet'
    image_feature_dim = 256 * 6 * 6

    def agent_init(self, **options):
        self.use_gpu = options['use_gpu']
        self.pad_state_dim = options['pad_states_dim']
        self.q_net_input_dim = self.image_feature_dim + self.pad_state_dim

        if os.path.exists(self.cnn_feature_extractor):
            print("loading... " + self.cnn_feature_extractor),
            self.feature_extractor = pickle.load(
                open(self.cnn_feature_extractor, 'rb'))
        else:
            print("pickle.dump start")
            self.feature_extractor = CnnFeatureExtractor(
                self.use_gpu, self.model, self.model_type,
                self.image_feature_dim)
            pickle.dump(self.feature_extractor,
                        open(self.cnn_feature_extractor, 'wb'))
            print("pickle.dump finished")

        self.time = 0
        self.epsilon = 1.0  # Initial exploration rate
        self.q_net = QNet(self.use_gpu, self.num_of_action_type,
                          self.num_of_pad, self.q_net_input_dim)

    def agent_start(self, observation):
        obs_array = np.r_[self.feature_extractor.feature(observation["image"]),
                          observation["pad_states"]]

        # Initialize State
        self.state = np.zeros((self.q_net.hist_size, self.q_net_input_dim),
                              dtype=np.uint8)
        self.state[0] = obs_array
        state_ = np.asanyarray(self.state.reshape(1, self.q_net.hist_size,
                                                  self.q_net_input_dim),
                               dtype=np.float32)
        if self.use_gpu >= 0:
            state_ = cuda.to_gpu(state_)

        # Generate an Action e-greedy
        action, q_now = self.q_net.e_greedy(state_, self.epsilon)
        return_action = action

        # Update for next step
        self.last_action = copy.deepcopy(return_action)
        self.last_state = self.state.copy()
        self.last_observation = obs_array

        return return_action

    def agent_step(self, reward, observation):
        obs_array = np.r_[self.feature_extractor.feature(observation["image"]),
                          observation["pad_states"]]

        #obs_processed = np.maximum(obs_array, self.last_observation)  # Take maximum from two frames

        # Compose State : 4-step sequential observation
        if self.q_net.hist_size == 4:
            self.state = np.asanyarray(
                [self.state[1], self.state[2], self.state[3], obs_array],
                dtype=np.uint8)
        elif self.q_net.hist_size == 2:
            self.state = np.asanyarray([self.state[1], obs_array],
                                       dtype=np.uint8)
        elif self.q_net.hist_size == 1:
            self.state = np.asanyarray([obs_array], dtype=np.uint8)
        else:
            print("self.DQN.hist_size err")

        state_ = np.asanyarray(self.state.reshape(1, self.q_net.hist_size,
                                                  self.q_net_input_dim),
                               dtype=np.float32)
        if self.use_gpu >= 0:
            state_ = cuda.to_gpu(state_)

        # Exploration decays along the time sequence
        if self.policy_frozen is False:  # Learning ON/OFF
            if self.q_net.initial_exploration < self.time:
                self.epsilon -= self.epsilon_delta
                if self.epsilon < self.min_eps:
                    self.epsilon = self.min_eps
                eps = self.epsilon
            else:  # Initial Exploration Phase
                print("Initial Exploration : %d/%d steps" %
                      (self.time, self.q_net.initial_exploration)),
                eps = 1.0
        else:  # Evaluation
            print("Policy is Frozen")
            eps = 0.05

        # Generate an Action by e-greedy action selection
        action, q_now = self.q_net.e_greedy(state_, eps)

        return action, eps, q_now, obs_array

    def agent_step_update(self, reward, action, eps, q_now, obs_array):
        # Learning Phase
        if self.policy_frozen is False:  # Learning ON/OFF
            self.q_net.stock_experience(self.time, self.last_state,
                                        self.last_action, reward, self.state,
                                        False)
            self.q_net.experience_replay(self.time)

        # Target model update
        if self.q_net.initial_exploration < self.time and np.mod(
                self.time, self.q_net.target_model_update_freq) == 0:
            print("Model Updated")
            self.q_net.target_model_update()

        # Simple text based visualization
        if self.use_gpu >= 0:
            q_max = np.max(q_now.get())
        else:
            q_max = np.max(q_now)

        print('Step:%d  Reward:%f  Epsilon:%.6f  Q_max:%3f' %
              (self.time, reward, eps, q_max))
        print('Action: {0}'.format(action))

        # Updates for next step
        self.last_observation = obs_array

        if self.policy_frozen is False:
            self.last_action = copy.deepcopy(action)
            self.last_state = self.state.copy()
            self.time += 1

    def agent_end(self, reward):  # Episode Terminated
        print('episode finished. Reward:%.1f / Epsilon:%.6f' %
              (reward, self.epsilon))

        # Learning Phase
        if self.policy_frozen is False:  # Learning ON/OFF
            self.q_net.stock_experience(self.time, self.last_state,
                                        self.last_action, reward,
                                        self.last_state, True)
            self.q_net.experience_replay(self.time)

        # Target model update
        if self.q_net.initial_exploration < self.time and np.mod(
                self.time, self.q_net.target_model_update_freq) == 0:
            print("Model Updated")
            self.q_net.target_model_update()

        # Time count
        if self.policy_frozen is False:
            self.time += 1
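
With epsilon_delta = 1.0 / 10**4.4 and min_eps = 0.1 as declared in the class above, epsilon anneals from 1.0 down to 0.1 over roughly 22,600 learning steps once the initial exploration phase ends (the length of that phase comes from QNet.initial_exploration and is not shown in this listing). A small, self-contained check of that arithmetic:

# Self-contained check of the epsilon schedule used by the agents above.
epsilon_delta = 1.0 / 10**4.4   # per-step decrement, as in the class constants
min_eps = 0.1                   # floor for epsilon

steps_to_floor = (1.0 - min_eps) / epsilon_delta
print("epsilon_delta = %.6e" % epsilon_delta)                             # ~3.98e-05
print("steps until epsilon reaches min_eps: %d" % round(steps_to_floor))  # ~22607
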
Example #19
class CnnDqnAgent(object):
    epsilon_delta = 1.0 / 10**4.4  # per-step decrement applied to epsilon
    min_eps = 0.1  # minimum value of epsilon

    actions = range(3)
    cnn_feature_extractor = 'alexnet_feature_extractor.pickle'  #1
    model = 'bvlc_alexnet.caffemodel'  #2
    model_type = 'alexnet'  #3
    image_feature_dim = 256 * 6 * 6
    image_count = 1

    def _observation_to_featurevec(self, observation):
        # TODO clean
        if self.image_count == 1:
            return np.r_[
                self.feature_extractor.feature(observation["image"][0]),
                observation["depth"][0]]
        elif self.image_count == 4:
            return np.r_[
                self.feature_extractor.feature(observation["image"][0]),
                self.feature_extractor.feature(observation["image"][1]),
                self.feature_extractor.feature(observation["image"][2]),
                self.feature_extractor.feature(observation["image"][3]),
                observation["depth"][0], observation["depth"][1],
                observation["depth"][2], observation["depth"][3]]
        else:
            print("not supported: number of camera")

    def agent_init(self, **options):
        try:
            self.image_count = options['image_count']
            self.depth_image_dim = options['depth_image_dim']
            self.use_gpu = options['use_gpu']
            self.test = options['test']
            self.folder = options["folder"]  #save_modelで使う->self.
            model_num = options['model_num']

            self.q_net_input_dim = self.image_feature_dim * self.image_count + self.depth_image_dim * self.image_count

            if os.path.exists(self.cnn_feature_extractor):
                print("loading... " + self.cnn_feature_extractor),
                self.feature_extractor = pickle.load(
                    open(self.cnn_feature_extractor, 'rb'))
                print("done")

            else:
                self.feature_extractor = CnnFeatureExtractor(
                    self.use_gpu, self.model, self.model_type,
                    self.image_feature_dim)
                pickle.dump(self.feature_extractor,
                            open(self.cnn_feature_extractor, 'wb'))
                print("pickle.dump finished")

            self.q_net = QNet(self.use_gpu, self.actions, self.q_net_input_dim)

            self.time = model_num + 1  # to prevent saving and loading from happening at the same step
            if (self.test):
                self.epsilon = 0.0
            else:
                non_exploration = max(
                    self.time - self.q_net.initial_exploration, 0)
                self.epsilon = max(1.0 - non_exploration * self.epsilon_delta,
                                   self.min_eps)
            print "epsilon = ", self.epsilon

            if (self.test or model_num > 0):
                self.q_net.load_model(self.folder, model_num)
        except:
            import traceback
            import sys
            traceback.print_exc()
            sys.exit()

    # methods for action selection and state update
    def agent_start(self, observation):
        try:
            obs_array = self._observation_to_featurevec(observation)
            # Initialize State
            self.state = np.zeros((self.q_net.hist_size, self.q_net_input_dim),
                                  dtype=np.uint8)
            self.state[0] = obs_array
            state_ = np.asanyarray(self.state.reshape(1, self.q_net.hist_size,
                                                      self.q_net_input_dim),
                                   dtype=np.float32)
            if self.use_gpu >= 0:
                state_ = cuda.to_gpu(state_)

            # Generate an Action e-greedy
            action, q_now = self.q_net.e_greedy(state_, self.epsilon)
            return_action = action

            # Update for next step
            self.last_action = copy.deepcopy(return_action)
            self.last_state = self.state.copy()
            return return_action
        except:
            import traceback
            import sys
            traceback.print_exc()
            sys.exit()

    # methods for action selection and state update
    def agent_step(self, observation):
        try:
            obs_array = self._observation_to_featurevec(observation)
            # Compose State : 4-step sequential observation
            if self.q_net.hist_size == 4:
                self.state = np.asanyarray(
                    [self.state[1], self.state[2], self.state[3], obs_array],
                    dtype=np.uint8)
            elif self.q_net.hist_size == 2:
                self.state = np.asanyarray([self.state[1], obs_array],
                                           dtype=np.uint8)
            elif self.q_net.hist_size == 1:
                self.state = np.asanyarray([obs_array], dtype=np.uint8)
            else:
                print("self.DQN.hist_size err")

            # when fed to the Q-function, the state is shaped (n_samples, hist_size, q_net_input_dim)
            state_ = np.asanyarray(self.state.reshape(1, self.q_net.hist_size,
                                                      self.q_net_input_dim),
                                   dtype=np.float32)
            if self.use_gpu >= 0:
                state_ = cuda.to_gpu(state_)

            # Exploration decays along the time sequence
            if self.test is False:  # Learning ON/OFF
                if self.q_net.initial_exploration < self.time:  # once time exceeds 1000 steps
                    self.epsilon -= self.epsilon_delta
                    if self.epsilon < self.min_eps:
                        self.epsilon = self.min_eps
                    eps = self.epsilon

                # act randomly for the first 1000 steps
                else:
                    print("Initial Exploration : %d/%d steps" %
                          (self.time, self.q_net.initial_exploration)),
                    eps = 1.0
            else:  # Evaluation
                print("Policy is Frozen")
                eps = 0.0

            # Generate an Action by e-greedy action selection
            action, q_now = self.q_net.e_greedy(state_, eps)
            return action, eps, q_now, obs_array
        except:
            import traceback
            import sys
            traceback.print_exc()
            sys.exit()

    # learning-related methods
    def agent_step_update(self, reward, action, eps, q_now):
        try:
            # Learning Phase
            if self.test is False:  # Learning ON/OFF
                self.q_net.stock_experience(self.time, self.last_state,
                                            self.last_action, reward,
                                            self.state, False)
                self.q_net.experience_replay(self.time)

            # Target model update
            if self.q_net.initial_exploration < self.time and np.mod(
                    self.time, self.q_net.target_model_update_freq) == 0:
                print("Model Updated")
                self.q_net.target_model_update()

            # Simple text based visualization
            if self.use_gpu >= 0:
                q_max = np.max(q_now.get())
            else:
                q_max = np.max(q_now)

            print('Step:%d  Action:%d  Reward:%.1f  Epsilon:%.6f  Q_max:%3f' %
                  (self.time, self.q_net.action_to_index(action), reward, eps,
                   q_max))

            if self.test is False:
                self.last_action = copy.deepcopy(action)
                self.last_state = self.state.copy()
                # save model
                if self.q_net.initial_exploration < self.time and np.mod(
                        self.time, self.q_net.save_model_freq) == 0:
                    print "------------------Save Model------------------"
                    self.q_net.save_model(self.folder, self.time)

            # Time count
            self.time += 1
        except:
            import traceback
            import sys
            traceback.print_exc()
            sys.exit()

    # learning-related methods
    def agent_end(self, reward):  # Episode Terminated
        try:
            print('episode finished. Reward:%.1f / Epsilon:%.6f' %
                  (reward, self.epsilon))
            # Learning Phase
            if self.test is False:  # Learning ON/OFF
                self.q_net.stock_experience(self.time, self.last_state,
                                            self.last_action, reward,
                                            self.last_state, True)
                self.q_net.experience_replay(self.time)

            # Target model update
            if self.q_net.initial_exploration < self.time and np.mod(
                    self.time, self.q_net.target_model_update_freq) == 0:
                print("Model Updated")
                self.q_net.target_model_update()

            if self.test is False:
                # Model Save
                if self.q_net.initial_exploration < self.time and np.mod(
                        self.time, self.q_net.save_model_freq) == 0:
                    print "------------------Save Model------------------"
                    self.q_net.save_model(self.folder, self.time)

            # Time count
            self.time += 1
        except:
            import traceback
            import sys
            traceback.print_exc()
            sys.exit()
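
For completeness, here is a sketch of a training driver for this last, resumable variant. The environment object (make_env(), env.reset(), env.act()) and the concrete option values are hypothetical stand-ins; only the agent_init option names and the agent_start / agent_step / agent_step_update / agent_end signatures follow the definitions above. Note that in this variant agent_step takes only the observation, and agent_step_update takes no obs_array.

# Hypothetical training driver for the agent above (a sketch, not part of the source).
agent = CnnDqnAgent()
agent.agent_init(image_count=1, depth_image_dim=32, use_gpu=-1,
                 test=False, folder="./models", model_num=0)  # illustrative option values

env = make_env()  # hypothetical environment returning {"image": [...], "depth": [...]} observations
for episode in range(100):
    observation = env.reset()
    action = agent.agent_start(observation)
    while True:
        reward, observation, done = env.act(action)  # hypothetical step API
        if done:
            agent.agent_end(reward)
            break
        action, eps, q_now, obs_array = agent.agent_step(observation)
        agent.agent_step_update(reward, action, eps, q_now)
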