# Imports assumed by the agent variants below; the local module paths for
# CnnFeatureExtractor and QNet follow the surrounding project layout.
import copy
import os
import pickle

import numpy as np
from chainer import cuda

from cnn_feature_extractor import CnnFeatureExtractor
from q_net import QNet


class CnnDqnAgent(object):
    policy_frozen = False
    epsilon_delta = 1.0 / 10 ** 4.4
    min_eps = 0.1
    actions = [0, 1, 2]

    cnn_feature_extractor = 'alexnet_feature_extractor.pickle'
    model = 'bvlc_alexnet.caffemodel'
    model_type = 'alexnet'
    image_feature_dim = 256 * 6 * 6
    image_feature_count = 1

    def _observation_to_featurevec(self, observation):
        # TODO clean
        if self.image_feature_count == 1:
            return np.r_[self.feature_extractor.feature(observation["image"][0]),
                         observation["depth"][0]]
        elif self.image_feature_count == 4:
            return np.r_[self.feature_extractor.feature(observation["image"][0]),
                         self.feature_extractor.feature(observation["image"][1]),
                         self.feature_extractor.feature(observation["image"][2]),
                         self.feature_extractor.feature(observation["image"][3]),
                         observation["depth"][0],
                         observation["depth"][1],
                         observation["depth"][2],
                         observation["depth"][3]]
        else:
            print("not supported: number of camera")

    def agent_init(self, **options):
        self.use_gpu = options['use_gpu']
        self.depth_image_dim = options['depth_image_dim']
        self.q_net_input_dim = self.image_feature_dim * self.image_feature_count + self.depth_image_dim

        if os.path.exists(self.cnn_feature_extractor):
            print("loading... " + self.cnn_feature_extractor),
            self.feature_extractor = pickle.load(open(self.cnn_feature_extractor))
            print("done")
        else:
            self.feature_extractor = CnnFeatureExtractor(self.use_gpu, self.model,
                                                         self.model_type, self.image_feature_dim)
            pickle.dump(self.feature_extractor, open(self.cnn_feature_extractor, 'w'))
            print("pickle.dump finished")

        self.time = 0
        self.epsilon = 1.0  # Initial exploration rate
        self.q_net = QNet(self.use_gpu, self.actions, self.q_net_input_dim)

    def agent_start(self, observation):
        obs_array = self._observation_to_featurevec(observation)

        # Initialize State
        self.state = np.zeros((self.q_net.hist_size, self.q_net_input_dim), dtype=np.uint8)
        self.state[0] = obs_array
        state_ = np.asanyarray(self.state.reshape(1, self.q_net.hist_size, self.q_net_input_dim),
                               dtype=np.float32)
        if self.use_gpu >= 0:
            state_ = cuda.to_gpu(state_)

        # Generate an Action e-greedy
        action, q_now = self.q_net.e_greedy(state_, self.epsilon)
        return_action = action

        # Update for next step
        self.last_action = copy.deepcopy(return_action)
        self.last_state = self.state.copy()
        self.last_observation = obs_array

        return return_action

    def agent_step(self, reward, observation):
        obs_array = self._observation_to_featurevec(observation)

        # obs_processed = np.maximum(obs_array, self.last_observation)  # Take maximum from two frames

        # Compose State : 4-step sequential observation
        if self.q_net.hist_size == 4:
            self.state = np.asanyarray([self.state[1], self.state[2], self.state[3], obs_array],
                                       dtype=np.uint8)
        elif self.q_net.hist_size == 2:
            self.state = np.asanyarray([self.state[1], obs_array], dtype=np.uint8)
        elif self.q_net.hist_size == 1:
            self.state = np.asanyarray([obs_array], dtype=np.uint8)
        else:
            print("self.DQN.hist_size err")

        state_ = np.asanyarray(self.state.reshape(1, self.q_net.hist_size, self.q_net_input_dim),
                               dtype=np.float32)
        if self.use_gpu >= 0:
            state_ = cuda.to_gpu(state_)

        # Exploration decays along the time sequence
        if self.policy_frozen is False:  # Learning ON/OFF
            if self.q_net.initial_exploration < self.time:
                self.epsilon -= self.epsilon_delta
                if self.epsilon < self.min_eps:
                    self.epsilon = self.min_eps
                eps = self.epsilon
            else:  # Initial Exploration Phase
                print("Initial Exploration : %d/%d steps" % (self.time, self.q_net.initial_exploration)),
                eps = 1.0
        else:  # Evaluation
            print("Policy is Frozen")
            eps = 0.05

        # Generate an Action by e-greedy action selection
        action, q_now = self.q_net.e_greedy(state_, eps)

        return action, eps, q_now, obs_array

    def agent_step_update(self, reward, action, eps, q_now, obs_array):
        # Learning Phase
        if self.policy_frozen is False:  # Learning ON/OFF
            self.q_net.stock_experience(self.time, self.last_state, self.last_action, reward,
                                        self.state, False)
            self.q_net.experience_replay(self.time)

        # Target model update
        if self.q_net.initial_exploration < self.time and np.mod(self.time, self.q_net.target_model_update_freq) == 0:
            print("Model Updated")
            self.q_net.target_model_update()

        # Simple text based visualization
        if self.use_gpu >= 0:
            q_max = np.max(q_now.get())
        else:
            q_max = np.max(q_now)

        print('Step:%d Action:%d Reward:%.1f Epsilon:%.6f Q_max:%3f' % (
            self.time, self.q_net.action_to_index(action), reward, eps, q_max))

        # Updates for next step
        self.last_observation = obs_array

        if self.policy_frozen is False:
            self.last_action = copy.deepcopy(action)
            self.last_state = self.state.copy()
            self.time += 1

    def agent_end(self, reward):  # Episode Terminated
        print('episode finished. Reward:%.1f / Epsilon:%.6f' % (reward, self.epsilon))

        # Learning Phase
        if self.policy_frozen is False:  # Learning ON/OFF
            self.q_net.stock_experience(self.time, self.last_state, self.last_action, reward,
                                        self.last_state, True)
            self.q_net.experience_replay(self.time)

        # Target model update
        if self.q_net.initial_exploration < self.time and np.mod(self.time, self.q_net.target_model_update_freq) == 0:
            print("Model Updated")
            self.q_net.target_model_update()

        # Time count
        if self.policy_frozen is False:
            self.time += 1
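# A minimal sketch of how the callbacks above are expected to be sequenced by
# the process that owns the environment (in the original project this is a
# socket server). The names env_reset and env_step, the tuple they return, and
# the option values in the usage comment are placeholders, not part of the
# class above.
def run_episode(agent, env_reset, env_step, max_steps=1000):
    observation = env_reset()                     # e.g. {"image": [...], "depth": [...]}
    action = agent.agent_start(observation)
    for _ in range(max_steps):
        reward, observation, episode_end = env_step(action)
        if episode_end:
            agent.agent_end(reward)
            break
        action, eps, q_now, obs_array = agent.agent_step(reward, observation)
        agent.agent_step_update(reward, action, eps, q_now, obs_array)

# Example wiring (illustrative values):
# agent = CnnDqnAgent()
# agent.agent_init(use_gpu=-1, depth_image_dim=32 * 32)
# run_episode(agent, env_reset, env_step)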
# This variant additionally needs:
from collections import deque


class CnnDqnAgent(object):
    policy_frozen = False
    epsilon_delta = 1.0 / 10 ** 4.4
    # print '%.10f' % (1.0 / 10 ** 4.4)
    # 0.0000398107170553496878006617676337697275812388397753238677978515625
    min_eps = 0.1
    actions = [0, 1, 2]

    cnn_feature_extractor = 'alexnet_feature_extractor.pickle'
    model = 'bvlc_alexnet.caffemodel'
    model_type = 'alexnet'
    image_feature_dim = 256 * 6 * 6
    image_feature_count = 1

    actions_evaluate = deque(maxlen=4)

    def _observation_to_featurevec(self, observation):
        # TODO clean
        if self.image_feature_count == 1:
            # print observation["image"][0].shape, type(observation["image"][0])
            #   -> errors because it is not a numpy array, so it has no shape
            # print self.feature_extractor.feature(observation["image"][0]).shape
            #   -> a 1-D vector of length 256 * 6 * 6
            # observation["depth"][0].shape
            return np.r_[self.feature_extractor.feature(observation["image"][0])]
            # , observation["depth"][0]]
        elif self.image_feature_count == 4:
            return np.r_[self.feature_extractor.feature(observation["image"][0]),
                         self.feature_extractor.feature(observation["image"][1]),
                         self.feature_extractor.feature(observation["image"][2]),
                         self.feature_extractor.feature(observation["image"][3])]
            # observation["depth"][0],
            # observation["depth"][1],
            # observation["depth"][2],
            # observation["depth"][3]]
        else:
            print("not supported: number of camera")

    def agent_init(self, **options):
        self.use_gpu = options['use_gpu']
        # self.depth_image_dim = options['depth_image_dim']
        self.q_net_input_dim = self.image_feature_dim * self.image_feature_count  # + self.depth_image_dim

        if os.path.exists(self.cnn_feature_extractor):
            print("loading... " + self.cnn_feature_extractor),
            self.feature_extractor = pickle.load(open(self.cnn_feature_extractor))
            print("done")
        else:
            self.feature_extractor = CnnFeatureExtractor(self.use_gpu, self.model,
                                                         self.model_type, self.image_feature_dim)
            pickle.dump(self.feature_extractor, open(self.cnn_feature_extractor, 'w'))
            print("pickle.dump finished")

        self.time = 0
        self.epsilon = 1.0  # Initial exploration rate
        self.q_net = QNet(self.use_gpu, self.actions, self.q_net_input_dim)

    def agent_start(self, observation):
        obs_array = self._observation_to_featurevec(observation)  # features extracted and concatenated with np.r_

        # Initialize State
        self.state = np.zeros((self.q_net.hist_size, self.q_net_input_dim), dtype=np.uint8)
        self.state[0] = obs_array
        state_ = np.asanyarray(self.state.reshape(1, self.q_net.hist_size, self.q_net_input_dim),
                               dtype=np.float32)
        if self.use_gpu >= 0:
            state_ = cuda.to_gpu(state_)

        # Generate an Action e-greedy
        action, q_now = self.q_net.e_greedy(state_, self.epsilon)
        return_action = action
        print return_action, type(return_action)  # e.g. "Random 2 <type 'int'>"

        # Update for next step
        self.last_action = copy.deepcopy(return_action)
        self.last_state = self.state.copy()  # becomes the starting point of the next state
        self.last_observation = obs_array

        return return_action

    def agent_step(self, reward, observation):
        obs_array = self._observation_to_featurevec(observation)  # features extracted and concatenated with np.r_

        # obs_processed = np.maximum(obs_array, self.last_observation)  # Take maximum from two frames

        # Compose State : 4-step sequential observation
        if self.q_net.hist_size == 4:
            self.state = np.asanyarray([self.state[1], self.state[2], self.state[3], obs_array],
                                       dtype=np.uint8)
        elif self.q_net.hist_size == 2:
            self.state = np.asanyarray([self.state[1], obs_array], dtype=np.uint8)
        elif self.q_net.hist_size == 1:
            self.state = np.asanyarray([obs_array], dtype=np.uint8)
        else:
            print("self.DQN.hist_size err")

        # state_ is self.state cast from uint8 to float32, reshaped to (1, hist_size, 256 * 6 * 6)
        state_ = np.asanyarray(self.state.reshape(1, self.q_net.hist_size, self.q_net_input_dim),
                               dtype=np.float32)
        if self.use_gpu >= 0:
            state_ = cuda.to_gpu(state_)

        # Exploration decays along the time sequence
        if self.policy_frozen is False:  # Learning ON/OFF
            if self.q_net.initial_exploration < self.time:  # e.g. 10000 < elapsed steps
                self.epsilon -= self.epsilon_delta  # start decaying epsilon gradually
                if self.epsilon < self.min_eps:  # if epsilon has decayed below the preset minimum,
                    self.epsilon = self.min_eps  # clamp it to min_eps = 0.1
                eps = self.epsilon
                # self.epsilon starts at 1.0; q_net.py line 160 uses it as: if np.random.rand() < epsilon
            else:  # Initial Exploration Phase
                # prints the current step out of the required exploration steps,
                # e.g. "Initial Exploration : 173/1000 steps"
                print("Initial Exploration : %d/%d steps" % (self.time, self.q_net.initial_exploration)),
                eps = 1.0
        else:  # Evaluation
            print("Policy is Frozen")
            eps = 0.05

        # Generate an Action by e-greedy action selection
        action, q_now = self.q_net.e_greedy(state_, eps)  # state_ is a 3-D array

        # called from server.py line 120 as:
        # self.agent.agent_step_update(reward, action, eps, q_now, obs_array)
        return action, eps, q_now, obs_array

    def agent_step_update(self, reward, action, eps, q_now, obs_array):
        # Learning Phase
        if self.policy_frozen is False:  # Learning ON/OFF
            self.q_net.stock_experience(self.time, self.last_state, self.last_action, reward,
                                        self.state, False)
            print "------------------- Real Index%d" % (self.q_net.data_index)

            self.actions_evaluate.append(self.last_action)
            if self.actions_evaluate[-1] == self.actions.index(2) and reward >= 1.0 and len(self.actions_evaluate) == 4:
                if [self.actions_evaluate[i] for i in xrange(3)] == [1, 0, 1]:
                    index = np.asanyarray(self.q_net.data_index, dtype=np.int8)
                    for i in xrange(1, len(self.actions_evaluate) + 1):
                        self.q_net.d[2][index - i] -= 0.5
            # self.action_evaluate = deque()

            self.q_net.experience_replay(self.time)

        # Target model update
        if self.q_net.initial_exploration < self.time and np.mod(self.time, self.q_net.target_model_update_freq) == 0:
            print("Model Updated")
            self.q_net.target_model_update()

        # Simple text based visualization
        if self.use_gpu >= 0:
            q_max = np.max(q_now.get())
        else:
            q_max = np.max(q_now)

        print('Step:%d Action:%d Reward:%.1f Epsilon:%.6f Q_max:%3f' % (
            self.time, self.q_net.action_to_index(action), reward, eps, q_max))
        # e.g. "Step:92 Action:0 Reward:0.0 Epsilon:1.000000 Q_max:0.000000"

        # Updates for next step
        self.last_observation = obs_array

        if self.policy_frozen is False:
            self.last_action = copy.deepcopy(action)
            self.last_state = self.state.copy()
            self.time += 1

    def agent_end(self, reward):  # Episode Terminated
        print('episode finished. Reward:%.1f / Epsilon:%.6f' % (reward, self.epsilon))

        # Learning Phase
        if self.policy_frozen is False:  # Learning ON/OFF
            self.q_net.stock_experience(self.time, self.last_state, self.last_action, reward,
                                        self.last_state, True)
            self.q_net.experience_replay(self.time)

        # Target model update
        if self.q_net.initial_exploration < self.time and np.mod(self.time, self.q_net.target_model_update_freq) == 0:
            print("Model Updated")
            self.q_net.target_model_update()

        # Time count
        if self.policy_frozen is False:
            self.time += 1
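# Standalone illustration (plain Python, no project dependencies) of the
# action-history check used in agent_step_update above: the last four actions
# are kept in a deque(maxlen=4), and when the window reads [1, 0, 1, 2] and the
# final reward is at least 1.0, the stored entries (apparently the reward
# column q_net.d[2] of the replay memory) are penalized. The action sequence
# below is made up for the demo.
from collections import deque

actions_evaluate = deque(maxlen=4)
for a in [2, 1, 0, 1, 2]:          # older actions fall out of the 4-wide window
    actions_evaluate.append(a)

reward = 1.0
if (len(actions_evaluate) == 4
        and actions_evaluate[-1] == 2
        and reward >= 1.0
        and list(actions_evaluate)[:3] == [1, 0, 1]):
    print("penalize the last four stored experiences")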
class CnnDqnAgent(object):
    policy_frozen = False
    epsilon_delta = 1.0 / 10 ** 4.4
    min_eps = 0.1
    actions = [0, 1, 2]

    cnn_feature_extractor = 'alexnet_feature_extractor.pickle'
    model = 'bvlc_alexnet.caffemodel'
    model_type = 'alexnet'
    image_feature_dim = 256 * 6 * 6
    image_feature_count = 1

    def _observation_to_featurevec(self, observation):
        # TODO clean
        if self.image_feature_count == 1:
            return np.r_[self.feature_extractor.feature(observation["image"][0]),
                         observation["depth"][0]]
        elif self.image_feature_count == 4:
            return np.r_[self.feature_extractor.feature(observation["image"][0]),
                         self.feature_extractor.feature(observation["image"][1]),
                         self.feature_extractor.feature(observation["image"][2]),
                         self.feature_extractor.feature(observation["image"][3]),
                         observation["depth"][0],
                         observation["depth"][1],
                         observation["depth"][2],
                         observation["depth"][3]]
        else:
            print("not supported: number of camera")

    def agent_init(self, **options):
        self.use_gpu = options['use_gpu']
        self.depth_image_dim = options['depth_image_dim']
        self.q_net_input_dim = self.image_feature_dim * self.image_feature_count + self.depth_image_dim

        if os.path.exists(self.cnn_feature_extractor):
            print("loading... " + self.cnn_feature_extractor),
            self.feature_extractor = pickle.load(open(self.cnn_feature_extractor))
            print("done")
        else:
            self.feature_extractor = CnnFeatureExtractor(self.use_gpu, self.model,
                                                         self.model_type, self.image_feature_dim)
            pickle.dump(self.feature_extractor, open(self.cnn_feature_extractor, 'w'))
            print("pickle.dump finished")

        self.time = 0
        self.epsilon = 1.0  # Initial exploration rate
        self.q_net = QNet(self.use_gpu, self.actions, self.q_net_input_dim)
        # self.state = np.zeros((self.q_net.hist_size, self.q_net_input_dim), dtype=np.float32)
        # self.last_state = np.zeros((self.q_net.hist_size, self.q_net_input_dim), dtype=np.float32)

    def agent_start(self, observation):
        obs_array = self._observation_to_featurevec(observation)

        # Initialize State
        # self.state = np.zeros((self.q_net.hist_size, self.q_net_input_dim), dtype=np.uint8)
        self.state = np.zeros((self.q_net.hist_size, self.q_net_input_dim), dtype=np.float32)
        self.state[0] = obs_array
        state_ = np.asanyarray(self.state[0].reshape(1, self.q_net_input_dim), dtype=np.float32)
        if self.use_gpu >= 0:
            state_ = cuda.to_gpu(state_)

        # reset lstm state
        self.q_net.action_model.reset()
        self.q_net.scene_model.reset()

        # Generate an Action e-greedy
        action, q_now = self.q_net.e_greedy(state_, self.epsilon)
        return_action = action

        # Update for next step
        self.last_action = copy.deepcopy(return_action)
        self.last_state = self.state.copy()
        self.last_observation = obs_array

        return return_action

    def agent_step(self, reward, observation):
        obs_array = self._observation_to_featurevec(observation)

        # obs_processed = np.maximum(obs_array, self.last_observation)  # Take maximum from two frames

        # Compose State : append the newest observation to the history buffer
        # if self.q_net.hist_size == 4:
        #     self.state = np.asanyarray([self.state[1], self.state[2], self.state[3], obs_array], dtype=np.uint8)
        # elif self.q_net.hist_size == 2:
        #     self.state = np.asanyarray([self.state[1], obs_array], dtype=np.uint8)
        # elif self.q_net.hist_size == 1:
        #     self.state = np.asanyarray([obs_array], dtype=np.uint8)
        # else:
        #     print("self.DQN.hist_size err")
        self.state = np.vstack((self.state, obs_array))
        # self.state = np.asanyarray(self.state[len(self.state) - self.q_net.hist_size:len(self.state)], dtype=np.uint8)
        self.state = np.asanyarray(self.state[len(self.state) - self.q_net.hist_size:len(self.state)],
                                   dtype=np.float32)
        state_ = np.asanyarray(self.state[self.q_net.hist_size - 1].reshape(1, self.q_net_input_dim),
                               dtype=np.float32)
        if self.use_gpu >= 0:
            state_ = cuda.to_gpu(state_)

        # Exploration decays along the time sequence
        if self.policy_frozen is False:  # Learning ON/OFF
            if self.q_net.initial_exploration < self.time:
                self.epsilon -= self.epsilon_delta
                if self.epsilon < self.min_eps:
                    self.epsilon = self.min_eps
                eps = self.epsilon
            else:  # Initial Exploration Phase
                print("Initial Exploration : %d/%d steps" % (self.time, self.q_net.initial_exploration)),
                eps = 1.0
        else:  # Evaluation
            print("Policy is Frozen")
            eps = 0.05

        last_state_ = np.asanyarray(self.state[self.q_net.hist_size - 2].reshape(1, self.q_net_input_dim),
                                    dtype=np.float32)
        # last_state_ = np.asanyarray(self.last_state[self.q_net.hist_size - 1].reshape(1, self.q_net_input_dim), dtype=np.float32)
        if self.use_gpu >= 0:
            last_state_ = cuda.to_gpu(last_state_)

        # Generate an Action by e-greedy action selection
        action, q_now, interest = self.q_net.e_greedy_with_interest(state_, eps, last_state_)
        print("interest is %f" % interest)

        return action, eps, q_now, obs_array, interest

    def agent_step_update(self, reward, action, eps, q_now, obs_array):
        # Learning Phase
        if self.policy_frozen is False:  # Learning ON/OFF
            self.q_net.stock_experience(self.time, self.last_state, self.last_action, reward,
                                        self.state, False)
            self.q_net.experience_replay(self.time)

        # Target model update
        if self.q_net.initial_exploration < self.time and np.mod(self.time, self.q_net.target_model_update_freq) == 0:
            print("Model Updated")
            self.q_net.target_model_update()

        # Simple text based visualization
        if self.use_gpu >= 0:
            q_max = np.max(q_now.get())
        else:
            q_max = np.max(q_now)

        print('Step:%d Action:%d Reward:%.1f Epsilon:%.6f Q_max:%3f' % (
            self.time, self.q_net.action_to_index(action), reward, eps, q_max))

        # Updates for next step
        self.last_observation = obs_array

        if self.policy_frozen is False:
            self.last_action = copy.deepcopy(action)
            self.last_state = self.state.copy()
            self.time += 1

    def agent_end(self, reward):  # Episode Terminated
        print('episode finished. Reward:%.1f / Epsilon:%.6f' % (reward, self.epsilon))

        # Learning Phase
        if self.policy_frozen is False:  # Learning ON/OFF
            self.q_net.stock_experience(self.time, self.last_state, self.last_action, reward,
                                        self.last_state, True)
            self.q_net.experience_replay(self.time)

        # Target model update
        if self.q_net.initial_exploration < self.time and np.mod(self.time, self.q_net.target_model_update_freq) == 0:
            print("Model Updated")
            self.q_net.target_model_update()

        # Time count
        if self.policy_frozen is False:
            self.time += 1
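# Quick arithmetic on the epsilon schedule shared by all of the variants in
# this file: with epsilon_delta = 1.0 / 10 ** 4.4 (about 3.98e-5 per step),
# decaying from 1.0 down to min_eps = 0.1 takes roughly 22,600 steps after the
# initial exploration phase ends.
epsilon_delta = 1.0 / 10 ** 4.4
min_eps = 0.1
steps_to_min = (1.0 - min_eps) / epsilon_delta
print("epsilon_delta = %.6g" % epsilon_delta)        # ~3.98107e-05
print("steps to reach min_eps: %d" % steps_to_min)   # ~22606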
class CnnDqnAgent(object):
    epsilon_delta = 1.0 / 10 ** 4.4  # amount epsilon decays per step
    min_eps = 0.1                    # minimum value of epsilon
    actions = range(3)

    cnn_feature_extractor = 'alexnet_feature_extractor.pickle'
    model = 'bvlc_alexnet.caffemodel'
    model_type = 'alexnet'
    image_feature_dim = 256 * 6 * 6
    image_count = 1

    def _observation_to_featurevec(self, observation):
        # TODO clean
        if self.image_count == 1:
            return np.r_[self.feature_extractor.feature(observation["image"][0]),
                         observation["depth"][0]]
        elif self.image_count == 4:
            return np.r_[self.feature_extractor.feature(observation["image"][0]),
                         self.feature_extractor.feature(observation["image"][1]),
                         self.feature_extractor.feature(observation["image"][2]),
                         self.feature_extractor.feature(observation["image"][3]),
                         observation["depth"][0],
                         observation["depth"][1],
                         observation["depth"][2],
                         observation["depth"][3]]
        else:
            print("not supported: number of camera")

    def agent_init(self, **options):
        try:
            self.image_count = options['image_count']
            self.depth_image_dim = options['depth_image_dim']
            self.use_gpu = options['use_gpu']
            self.test = options['test']
            self.folder = options["folder"]  # kept on self because save_model uses it
            model_num = options['model_num']
            self.q_net_input_dim = self.image_feature_dim * self.image_count + self.depth_image_dim * self.image_count

            if os.path.exists(self.cnn_feature_extractor):
                print("loading... " + self.cnn_feature_extractor),
                self.feature_extractor = pickle.load(open(self.cnn_feature_extractor))
                print("done")
            else:
                self.feature_extractor = CnnFeatureExtractor(self.use_gpu, self.model,
                                                             self.model_type, self.image_feature_dim)
                pickle.dump(self.feature_extractor, open(self.cnn_feature_extractor, 'w'))
                print("pickle.dump finished")

            self.q_net = QNet(self.use_gpu, self.actions, self.q_net_input_dim)

            self.time = model_num + 1  # offset so a save and a load never happen at the same step

            if (self.test):
                self.epsilon = 0.0
            else:
                non_exploration = max(self.time - self.q_net.initial_exploration, 0)
                self.epsilon = max(1.0 - non_exploration * self.epsilon_delta, self.min_eps)

            print "epsilon = ", self.epsilon

            if (self.test or model_num > 0):
                self.q_net.load_model(self.folder, model_num)
        except:
            import traceback
            import sys
            traceback.print_exc()
            sys.exit()

    # Methods that select actions and update the state
    def agent_start(self, observation):
        try:
            obs_array = self._observation_to_featurevec(observation)

            # Initialize State
            self.state = np.zeros((self.q_net.hist_size, self.q_net_input_dim), dtype=np.uint8)
            self.state[0] = obs_array
            state_ = np.asanyarray(self.state.reshape(1, self.q_net.hist_size, self.q_net_input_dim),
                                   dtype=np.float32)
            if self.use_gpu >= 0:
                state_ = cuda.to_gpu(state_)

            # Generate an Action e-greedy
            action, q_now = self.q_net.e_greedy(state_, self.epsilon)
            return_action = action

            # Update for next step
            self.last_action = copy.deepcopy(return_action)
            self.last_state = self.state.copy()

            return return_action
        except:
            import traceback
            import sys
            traceback.print_exc()
            sys.exit()

    # Methods that select actions and update the state
    def agent_step(self, observation):
        try:
            obs_array = self._observation_to_featurevec(observation)

            # Compose State : 4-step sequential observation
            if self.q_net.hist_size == 4:
                self.state = np.asanyarray([self.state[1], self.state[2], self.state[3], obs_array],
                                           dtype=np.uint8)
            elif self.q_net.hist_size == 2:
                self.state = np.asanyarray([self.state[1], obs_array], dtype=np.uint8)
            elif self.q_net.hist_size == 1:
                self.state = np.asanyarray([obs_array], dtype=np.uint8)
            else:
                print("self.DQN.hist_size err")

            # when fed into q_func the shape is (n_samples, hist_size, q_net_input_dim)
            state_ = np.asanyarray(self.state.reshape(1, self.q_net.hist_size, self.q_net_input_dim),
                                   dtype=np.float32)
            if self.use_gpu >= 0:
                state_ = cuda.to_gpu(state_)

            # Exploration decays along the time sequence
            if self.test is False:  # Learning ON/OFF
                if self.q_net.initial_exploration < self.time:  # once time exceeds initial_exploration (e.g. 1000)
                    self.epsilon -= self.epsilon_delta
                    if self.epsilon < self.min_eps:
                        self.epsilon = self.min_eps
                    eps = self.epsilon
                else:  # act randomly for the first initial_exploration steps
                    print("Initial Exploration : %d/%d steps" % (self.time, self.q_net.initial_exploration)),
                    eps = 1.0
            else:  # Evaluation
                print("Policy is Frozen")
                eps = 0.0

            # Generate an Action by e-greedy action selection
            action, q_now = self.q_net.e_greedy(state_, eps)

            return action, eps, q_now, obs_array
        except:
            import traceback
            import sys
            traceback.print_exc()
            sys.exit()

    # Learning methods
    def agent_step_update(self, reward, action, eps, q_now):
        try:
            # Learning Phase
            if self.test is False:  # Learning ON/OFF
                self.q_net.stock_experience(self.time, self.last_state, self.last_action, reward,
                                            self.state, False)
                self.q_net.experience_replay(self.time)

            # Target model update
            if self.q_net.initial_exploration < self.time and np.mod(self.time, self.q_net.target_model_update_freq) == 0:
                print("Model Updated")
                self.q_net.target_model_update()

            # Simple text based visualization
            if self.use_gpu >= 0:
                q_max = np.max(q_now.get())
            else:
                q_max = np.max(q_now)

            print('Step:%d Action:%d Reward:%.1f Epsilon:%.6f Q_max:%3f' % (
                self.time, self.q_net.action_to_index(action), reward, eps, q_max))

            if self.test is False:
                self.last_action = copy.deepcopy(action)
                self.last_state = self.state.copy()

                # save model
                if self.q_net.initial_exploration < self.time and np.mod(self.time, self.q_net.save_model_freq) == 0:
                    print "------------------Save Model------------------"
                    self.q_net.save_model(self.folder, self.time)

                # Time count
                self.time += 1
        except:
            import traceback
            import sys
            traceback.print_exc()
            sys.exit()

    # Learning methods
    def agent_end(self, reward):  # Episode Terminated
        try:
            print('episode finished. Reward:%.1f / Epsilon:%.6f' % (reward, self.epsilon))

            # Learning Phase
            if self.test is False:  # Learning ON/OFF
                self.q_net.stock_experience(self.time, self.last_state, self.last_action, reward,
                                            self.last_state, True)
                self.q_net.experience_replay(self.time)

            # Target model update
            if self.q_net.initial_exploration < self.time and np.mod(self.time, self.q_net.target_model_update_freq) == 0:
                print("Model Updated")
                self.q_net.target_model_update()

            if self.test is False:
                # Model Save
                if self.q_net.initial_exploration < self.time and np.mod(self.time, self.q_net.save_model_freq) == 0:
                    print "------------------Save Model------------------"
                    self.q_net.save_model(self.folder, self.time)

                # Time count
                self.time += 1
        except:
            import traceback
            import sys
            traceback.print_exc()
            sys.exit()
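# Sketch of the epsilon that the model-resuming variant above reconstructs in
# agent_init when continuing from a saved model. The model_num and
# initial_exploration values below are illustrative, not taken from the
# project.
epsilon_delta = 1.0 / 10 ** 4.4
min_eps = 0.1
initial_exploration = 1000     # assumed value of q_net.initial_exploration
model_num = 5000               # step count encoded in the loaded model file

time = model_num + 1
non_exploration = max(time - initial_exploration, 0)
epsilon = max(1.0 - non_exploration * epsilon_delta, min_eps)
print("resumed epsilon = %.6f" % epsilon)   # ~0.840717 for the values above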
class CnnDqnAgent(object):
    policy_frozen = False  # set to True to stop learning and only run the policy
    epsilon_delta = 1.0 / 10 ** 4.4
    min_eps = 0.1

    # press, up, down, left, right, none
    num_of_action_type = 6
    num_of_pad = 5

    cnn_feature_extractor = 'alexnet_feature_extractor.pickle'
    model = 'bvlc_alexnet.caffemodel'
    model_type = 'alexnet'
    image_feature_dim = 256 * 6 * 6

    def agent_init(self, **options):
        self.use_gpu = options['use_gpu']
        self.pad_state_dim = options['pad_states_dim']
        self.q_net_input_dim = self.image_feature_dim + self.pad_state_dim

        if os.path.exists(self.cnn_feature_extractor):
            print("loading... " + self.cnn_feature_extractor),
            self.feature_extractor = pickle.load(open(self.cnn_feature_extractor))
        else:
            print("pickle.dump start")
            self.feature_extractor = CnnFeatureExtractor(self.use_gpu, self.model,
                                                         self.model_type, self.image_feature_dim)
            pickle.dump(self.feature_extractor, open(self.cnn_feature_extractor, 'wb'))
            print("pickle.dump finished")

        self.time = 0
        self.epsilon = 1.0  # Initial exploration rate
        self.q_net = QNet(self.use_gpu, self.num_of_action_type, self.num_of_pad, self.q_net_input_dim)

    def agent_start(self, observation):
        obs_array = np.r_[self.feature_extractor.feature(observation["image"]),
                          observation["pad_states"]]

        # Initialize State
        self.state = np.zeros((self.q_net.hist_size, self.q_net_input_dim), dtype=np.uint8)
        self.state[0] = obs_array
        state_ = np.asanyarray(self.state.reshape(1, self.q_net.hist_size, self.q_net_input_dim),
                               dtype=np.float32)
        if self.use_gpu >= 0:
            state_ = cuda.to_gpu(state_)

        # Generate an Action e-greedy
        action, q_now = self.q_net.e_greedy(state_, self.epsilon)
        return_action = action

        # Update for next step
        self.last_action = copy.deepcopy(return_action)
        self.last_state = self.state.copy()
        self.last_observation = obs_array

        return return_action

    def agent_step(self, reward, observation):
        obs_array = np.r_[self.feature_extractor.feature(observation["image"]),
                          observation["pad_states"]]

        # obs_processed = np.maximum(obs_array, self.last_observation)  # Take maximum from two frames

        # Compose State : 4-step sequential observation
        if self.q_net.hist_size == 4:
            self.state = np.asanyarray([self.state[1], self.state[2], self.state[3], obs_array],
                                       dtype=np.uint8)
        elif self.q_net.hist_size == 2:
            self.state = np.asanyarray([self.state[1], obs_array], dtype=np.uint8)
        elif self.q_net.hist_size == 1:
            self.state = np.asanyarray([obs_array], dtype=np.uint8)
        else:
            print("self.DQN.hist_size err")

        state_ = np.asanyarray(self.state.reshape(1, self.q_net.hist_size, self.q_net_input_dim),
                               dtype=np.float32)
        if self.use_gpu >= 0:
            state_ = cuda.to_gpu(state_)

        # Exploration decays along the time sequence
        if self.policy_frozen is False:  # Learning ON/OFF
            if self.q_net.initial_exploration < self.time:
                self.epsilon -= self.epsilon_delta
                if self.epsilon < self.min_eps:
                    self.epsilon = self.min_eps
                eps = self.epsilon
            else:  # Initial Exploration Phase
                print("Initial Exploration : %d/%d steps" % (self.time, self.q_net.initial_exploration)),
                eps = 1.0
        else:  # Evaluation
            print("Policy is Frozen")
            eps = 0.05

        # Generate an Action by e-greedy action selection
        action, q_now = self.q_net.e_greedy(state_, eps)

        return action, eps, q_now, obs_array

    def agent_step_update(self, reward, action, eps, q_now, obs_array):
        # Learning Phase
        if self.policy_frozen is False:  # Learning ON/OFF
            self.q_net.stock_experience(self.time, self.last_state, self.last_action, reward,
                                        self.state, False)
            self.q_net.experience_replay(self.time)

        # Target model update
        if self.q_net.initial_exploration < self.time and np.mod(self.time, self.q_net.target_model_update_freq) == 0:
            print("Model Updated")
            self.q_net.target_model_update()

        # Simple text based visualization
        if self.use_gpu >= 0:
            q_max = np.max(q_now.get())
        else:
            q_max = np.max(q_now)

        print('Step:%d Reward:%f Epsilon:%.6f Q_max:%3f' % (self.time, reward, eps, q_max))
        print('Action: {0}'.format(action))

        # Updates for next step
        self.last_observation = obs_array

        if self.policy_frozen is False:
            self.last_action = copy.deepcopy(action)
            self.last_state = self.state.copy()
            self.time += 1

    def agent_end(self, reward):  # Episode Terminated
        print('episode finished. Reward:%.1f / Epsilon:%.6f' % (reward, self.epsilon))

        # Learning Phase
        if self.policy_frozen is False:  # Learning ON/OFF
            self.q_net.stock_experience(self.time, self.last_state, self.last_action, reward,
                                        self.last_state, True)
            self.q_net.experience_replay(self.time)

        # Target model update
        if self.q_net.initial_exploration < self.time and np.mod(self.time, self.q_net.target_model_update_freq) == 0:
            print("Model Updated")
            self.q_net.target_model_update()

        # Time count
        if self.policy_frozen is False:
            self.time += 1
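# Shape check for the observation vector built in agent_start/agent_step of the
# gamepad variant above: an AlexNet feature vector of length 256 * 6 * 6 is
# concatenated with the pad-state vector via np.r_. The pad_state_dim value
# below is only illustrative; the real value comes from options['pad_states_dim'].
import numpy as np

image_feature_dim = 256 * 6 * 6
pad_state_dim = 5 * 6                 # assumed: num_of_pad * num_of_action_type
image_feature = np.zeros(image_feature_dim, dtype=np.float32)
pad_states = np.zeros(pad_state_dim, dtype=np.float32)

obs_array = np.r_[image_feature, pad_states]
print(obs_array.shape)                # (9246,) = (256*6*6 + 30,)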
class CnnDqnAgent(object):
    def __init__(self):
        super(CnnDqnAgent, self).__init__()
        self.policy_frozen = False
        self.epsilon_delta = 1.0 / 10 ** 4.4
        self.min_eps = 0.1
        self.actions = [0, 1, 2]

        self.cnn_feature_extractor = 'alexnet_feature_extractor.pickle'
        self.model = 'bvlc_alexnet.caffemodel'
        self.model_type = 'alexnet'
        self.image_feature_dim = 256 * 6 * 6
        self.image_feature_count = 1
        self.prediction_update_tick = 0

    def _observation_to_featurevec(self, observation):
        feature_image = [self.feature_extractor(observation["image"][i])
                         for i in range(self.image_feature_count)]
        return np.concatenate(feature_image + observation["depth"])

    def agent_init(self, **options):
        self.use_gpu = options['use_gpu']
        self.depth_image_dim = options['depth_image_dim']
        self.q_net_input_dim = self.image_feature_dim * self.image_feature_count + self.depth_image_dim

        if os.path.exists(self.cnn_feature_extractor):
            print("loading... " + self.cnn_feature_extractor)
            with open(self.cnn_feature_extractor, 'rb') as f:
                self.feature_extractor = pickle.load(f)
            print("done")
        else:
            print('there is no chainer alexnet model file ', self.cnn_feature_extractor)
            print('making chainer model from ', self.model)
            print('this process takes tens of minutes.')
            self.feature_extractor = CnnFeatureExtractor(self.use_gpu, self.model,
                                                         self.model_type, self.image_feature_dim)
            pickle.dump(self.feature_extractor, open(self.cnn_feature_extractor, 'wb'))
            print("pickle.dump finished")

        self.time = 0
        self.epsilon = 1.0  # Initial exploration rate
        self.q_net = QNet(self.use_gpu, self.actions, self.q_net_input_dim)

    def agent_start(self, observation):
        # Initialize State
        self.state = np.zeros((self.q_net.hist_size, self.q_net_input_dim), dtype=np.float32)
        new_feature_vec = self._observation_to_featurevec(observation)
        self.state[0, :] = new_feature_vec

        # Generate an Action e-greedy
        state_ = np.expand_dims(self.state, 0)
        if self.use_gpu >= 0:
            state_ = cuda.to_gpu(state_, device=self.use_gpu)
        action, _, deg_intereset = self.q_net.e_greedy(state_, self.epsilon)
        return_action = action

        # Update for next step
        self.last_action = copy.deepcopy(return_action)
        self.last_state = self.state.copy()
        self.last_observation = new_feature_vec

        return return_action, deg_intereset

    def agent_step(self, reward, observation):
        new_feature_vec = self._observation_to_featurevec(observation)
        past_states = self.state[0:-1, :].copy()  # snapshot the older rows before overwriting row 0
        self.state[0, :] = new_feature_vec        # newest observation goes to row 0
        self.state[1:, :] = past_states           # older rows shift down by one

        # Exploration decays along the time sequence
        state_ = np.expand_dims(self.state, 0)
        if self.use_gpu >= 0:
            state_ = cuda.to_gpu(state_, device=self.use_gpu)

        if self.policy_frozen is False:  # Learning ON/OFF
            if self.q_net.initial_exploration < self.time:
                self.epsilon -= self.epsilon_delta
                if self.epsilon < self.min_eps:
                    self.epsilon = self.min_eps
                eps = self.epsilon
            else:  # Initial Exploration Phase
                print("Initial Exploration : %d/%d steps" % (self.time, self.q_net.initial_exploration)),
                eps = 1.0
        else:  # Evaluation
            print("Policy is Frozen")
            eps = 0.05

        # Generate an Action by e-greedy action selection
        action, q_now, deg_intereset = self.q_net.e_greedy(state_, eps)

        return action, eps, q_now, new_feature_vec, deg_intereset

    def agent_step_update(self, reward, action, eps, q_now, new_feature_vec, deg_intereset):
        # Learning Phase
        if self.policy_frozen is False:  # Learning ON/OFF
            self.q_net.stock_experience(self.time, self.last_state, self.last_action, reward,
                                        self.state, False)
            self.q_net.experience_replay(self.time)
            self.prediction_update_tick += 1
            if self.prediction_update_tick >= 10:
                self.prediction_update_tick = 0
                print('prediction update')
                self.q_net.prediction_update()

        # Target model update
        if self.q_net.initial_exploration < self.time and np.mod(self.time, self.q_net.target_model_update_freq) == 0:
            print("Model Updated")
            self.q_net.target_model_update()

        # Simple text based visualization
        if self.use_gpu >= 0:
            q_max = np.max(q_now.get())
        else:
            q_max = np.max(q_now)

        print('Step:%d Action:%d Reward:%.1f Epsilon:%.6f Q_max:%3f def_interest:%3f' % (
            self.time, self.q_net.action_to_index(action), reward, eps, q_max, deg_intereset))

        # Updates for next step
        self.last_observation = new_feature_vec

        if self.policy_frozen is False:
            self.last_action = copy.deepcopy(action)
            self.last_state = self.state.copy()
            self.time += 1

    def agent_end(self, reward):  # Episode Terminated
        print('episode finished. Reward:%.1f / Epsilon:%.6f' % (reward, self.epsilon))

        # Learning Phase
        if self.policy_frozen is False:  # Learning ON/OFF
            self.q_net.stock_experience(self.time, self.last_state, self.last_action, reward,
                                        self.last_state, True)
            self.q_net.experience_replay(self.time)

        # Target model update
        if self.q_net.initial_exploration < self.time and np.mod(self.time, self.q_net.target_model_update_freq) == 0:
            print("Model Updated")
            self.q_net.target_model_update()

        # Time count
        if self.policy_frozen is False:
            self.time += 1
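# Standalone demo of the newest-first rolling state buffer maintained in
# agent_step of the variant above; hist_size and the feature dimension are
# reduced here so the arrays are easy to inspect, and the data is synthetic.
import numpy as np

hist_size, feature_dim = 4, 3
state = np.zeros((hist_size, feature_dim), dtype=np.float32)

for t in range(1, 6):
    new_feature_vec = np.full(feature_dim, t, dtype=np.float32)
    past_states = state[0:-1, :].copy()   # snapshot before overwriting row 0
    state[0, :] = new_feature_vec         # newest observation goes to row 0
    state[1:, :] = past_states            # older rows shift down by one
    print(state[:, 0])                    # e.g. after t=3: [3. 2. 1. 0.]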