Example #1
    def agent_start(self, observation):
        self.lstm_State = make_initial_state(1045)
        for key, value in self.lstm_State.items():
            value.data = cuda.to_gpu(value.data)
        # Preprocess
        tmp = np.bitwise_and(np.asarray(observation.intArray[128:]).reshape([210, 160]), 0b0001111)  # Get Intensity from the observation
        obs_array = (spm.imresize(tmp, (110, 84)))[110-84-8:110-8, :]  # Scaling

        # Initialize State
        self.state = np.zeros((4, 84, 84), dtype=np.uint8)
        self.state[0] = obs_array
        state_ = cuda.to_gpu(np.asanyarray(self.state.reshape(1, 4, 84, 84), dtype=np.float32))

        # Generate an Action e-greedy
        returnAction = Action()
        action, Q_now = self.DQN.e_greedy(state_, self.epsilon)
        returnAction.intArray = [action]

        # Update for next step
        self.lastAction = copy.deepcopy(returnAction)
        self.last_state = self.state.copy()
        self.last_observation = obs_array

        return returnAction
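
The make_initial_state(1045) helper is not shown in this snippet. A minimal sketch of what it presumably does, following the zero-initialized LSTM state convention of the Chainer v1 char-rnn example (the key names c1/h1 and the single-layer layout are assumptions):

import numpy as np
from chainer import Variable

def make_initial_state(n_units, batchsize=1):
    # Zero-filled LSTM cell and hidden state Variables; the caller moves
    # each value.data to the GPU afterwards, as agent_start does above.
    return {name: Variable(np.zeros((batchsize, n_units), dtype=np.float32))
            for name in ('c1', 'h1')}
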
    def agent_step(self, reward, observation):

        # Preprocess
        tmp = np.bitwise_and(np.asarray(observation.intArray[128:]).reshape([210, 160]), 0b0001111)  # Get Intensity from the observation
        obs_array = (spm.imresize(tmp, (110, 84)))[110-84-8:110-8, :]  # Scaling
        obs_processed = np.maximum(obs_array, self.last_observation)  # Take maximum from two frames

        # Compose State : 4-step sequential observation
        self.state = np.asanyarray([self.state[1], self.state[2], self.state[3], obs_processed], dtype=np.uint8)
        state_ = cuda.to_gpu(np.asanyarray(self.state.reshape(1, 4, 84, 84), dtype=np.float32))
        
        state_for_lstm = Variable(cuda.to_gpu(np.asanyarray(self.last_state.reshape(1, 4, 84, 84), dtype=np.float32)))
        CNNout = self.DQN.Q_func_LSTM(state_for_lstm).reshape(1,64*7*7)
        now_CNN = self.DQN.Q_func_LSTM(Variable(state_)).reshape(1,64*7*7) 

        lstm_in = cuda.to_gpu(CNNout)
        
        returnAction = Action()                                                                                  
        action, Q_now = self.DQN.e_greedy(state_, self.epsilon)           
        returnAction.intArray = [action] 
        
        self.dqn_reward = reward
        # if dqn_reward is not 0, reset the LSTM state

        if self.dqn_reward != 0:
            reward = self.dqn_reward
            self.lstm_State=make_initial_state(1045)
            for key, value in self.lstm_State.items():
                value.data = cuda.to_gpu(value.data)
        else:
            LState = self.lstm_State
            self.lstm_State, self.lstm_s_dash = self.lstm_class.model_lstm.predict(lstm_in,LState)
            reward = lstm_loss - F.mean_squared_error(self.lstm_s_dash, Variable(cuda.to_gpu(now_CNN)))
            reward = reward.data.get()
            print reward


        # Exploration decays along the time sequence
        if self.policyFrozen is False:  # Learning ON/OFF
            if self.DQN.initial_exploration < self.time:
                self.epsilon -= 1.0/10**6
                if self.epsilon < 0.1:
                    self.epsilon = 0.1
                eps = self.epsilon
            else:  # Initial Exploration Phase
                print "Initial Exploration : %d/%d steps" % (self.time, self.DQN.initial_exploration)
                eps = 1.0
        else:  # Evaluation
                print "Policy is Frozen"
                eps = 0.05

        # Learning Phase
        if self.policyFrozen is False:  # Learning ON/OFF
            self.DQN.stockExperience(self.time, self.last_state, self.lastAction.intArray[0], reward, self.state, False, self.dqn_reward)
            self.DQN.experienceReplay(self.time)

        # Target model update
        if self.DQN.initial_exploration < self.time and np.mod(self.time, self.DQN.target_model_update_freq) == 0:
            print "########### MODEL UPDATED ######################"
            self.DQN.target_model_update()
            np.save('params/DQN_LSTM_epoch1/l4_W.npy',self.DQN.model.l4.W.get())
            np.save('params/DQN_LSTM_epoch1/l4_b.npy',self.DQN.model.l4.b.get())
            np.save('params/DQN_LSTM_epoch1/q_value_W.npy',self.DQN.model.q_value.W.get())
            np.save('params/DQN_LSTM_epoch1/q_value_b.npy',self.DQN.model.q_value.b.get())

        # Simple text based visualization
        print ' Time Step %d /   ACTION  %d  /   REWARD %.1f   / EPSILON  %.6f  /   Q_max  %3f' % (self.time, self.DQN.action_to_index(action), np.sign(reward), eps, np.max(Q_now.get()))

        # Updates for next step
        self.last_observation = obs_array


        if self.policyFrozen is False:
            self.lastAction = copy.deepcopy(returnAction)
            self.last_state = self.state.copy()
            self.time += 1

        return returnAction
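
The same preprocessing appears in agent_start and agent_step above. Pulled out as a standalone helper it is easier to read (a sketch only, reusing the exact calls from the code above and assuming spm is scipy.misc):

import numpy as np
import scipy.misc as spm

def preprocess_observation(observation):
    # Keep only the intensity bits of the raw 210x160 Atari screen.
    tmp = np.bitwise_and(
        np.asarray(observation.intArray[128:]).reshape([210, 160]), 0b0001111)
    # Downscale to 110x84 and crop the central 84x84 play area
    # (rows 110 - 84 - 8 through 110 - 8, i.e. 18..101).
    return spm.imresize(tmp, (110, 84))[110 - 84 - 8:110 - 8, :]
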
Example #4
    cuda.get_device(args.gpu).use()
    model.to_gpu()

# Set up the learning algorithm
optimizer = optimizers.RMSprop(lr=args.learning_rate,
                               alpha=args.decay_rate,
                               eps=1e-8)
optimizer.setup(model.collect_parameters())

whole_len = len(train_data)
whole_val_len = len(val_data)
epoch = 0
start_at = time.time()
cur_at = start_at
end_time = 0
state = make_initial_state(n_units)
train_loss_all = []
val_loss_all = []
iterations_count = 0

if args.gpu >= 0:
    loss = Variable(cuda.zeros(()))
    val_loss = Variable(cuda.zeros(()))
    for key, value in state.items():
        value.data = cuda.to_gpu(value.data)
else:
    loss = Variable(np.zeros((), dtype=np.float32))
    val_loss = Variable(np.zeros((), dtype=np.float32))

for i in xrange(whole_len * n_epochs):
    for j in xrange(0, len(train_data[i % whole_len]) - 1):
Example #5
model = LSTM(3136, n_units)
if args.gpu >= 0:
    cuda.get_device(args.gpu).use()
    model.to_gpu()

# Set up the learning algorithm
optimizer = optimizers.RMSprop(lr=args.learning_rate, alpha=args.decay_rate, eps=1e-8)
optimizer.setup(model.collect_parameters())

whole_len = len(train_data)
whole_val_len = len(val_data)
epoch = 0
start_at = time.time()
cur_at = start_at
end_time = 0
state = make_initial_state(n_units)
train_loss_all = []
val_loss_all = []
iterations_count = 0

if args.gpu >= 0:
    loss = Variable(cuda.zeros(()))
    val_loss = Variable(cuda.zeros(()))
    for key, value in state.items():
        value.data = cuda.to_gpu(value.data)
else:
    loss = Variable(np.zeros((), dtype=np.float32))
    val_loss = Variable(np.zeros((), dtype=np.float32))

for i in xrange(whole_len * n_epochs):
    for j in xrange(0, len(train_data[i % whole_len]) - 1):
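
Both training snippets above break off inside the nested loop. A minimal sketch of how the inner step could continue on the GPU path, assuming model.predict(x, state) returns (new_state, prediction) as used in agent_step above, that each train_data[k][j] is a 3136-dimensional CNN feature vector, that F is chainer.functions, and that bprop_len is a truncated-BPTT length defined elsewhere:

for i in xrange(whole_len * n_epochs):
    seq = train_data[i % whole_len]
    for j in xrange(0, len(seq) - 1):
        # Predict the next CNN feature vector from the current one.
        x = Variable(cuda.to_gpu(np.asarray(seq[j], dtype=np.float32).reshape(1, 3136)))
        t = Variable(cuda.to_gpu(np.asarray(seq[j + 1], dtype=np.float32).reshape(1, 3136)))
        state, y = model.predict(x, state)
        loss += F.mean_squared_error(y, t)
        iterations_count += 1

        # Truncated BPTT every bprop_len steps.
        if iterations_count % bprop_len == 0:
            optimizer.zero_grads()
            loss.backward()
            loss.unchain_backward()  # cut the computational history here
            optimizer.update()
            train_loss_all.append(float(cuda.to_cpu(loss.data)) / bprop_len)
            loss = Variable(cuda.zeros(()))
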
Example #6
    def agent_step(self, reward, observation):

        # Preprocess
        tmp = np.bitwise_and(
            np.asarray(observation.intArray[128:]).reshape([210, 160]),
            0b0001111)  # Get Intensity from the observation
        obs_array = spm.imresize(tmp, (110, 84))[110 - 84 - 8:110 - 8, :]  # Scaling
        obs_processed = np.maximum(
            obs_array, self.last_observation)  # Take maximum from two frames

        # Compose State : 4-step sequential observation
        self.state = np.asanyarray(
            [self.state[1], self.state[2], self.state[3], obs_processed],
            dtype=np.uint8)
        state_ = cuda.to_gpu(
            np.asanyarray(self.state.reshape(1, 4, 84, 84), dtype=np.float32))

        state_for_lstm = Variable(
            cuda.to_gpu(
                np.asanyarray(self.last_state.reshape(1, 4, 84, 84),
                              dtype=np.float32)))
        CNNout = self.DQN.Q_func_LSTM(state_for_lstm).reshape(1, 64 * 7 * 7)
        now_CNN = self.DQN.Q_func_LSTM(Variable(state_)).reshape(1, 64 * 7 * 7)

        lstm_in = cuda.to_gpu(CNNout)

        returnAction = Action()
        action, Q_now = self.DQN.e_greedy(state_, self.epsilon)
        returnAction.intArray = [action]

        self.dqn_reward = reward
        # if dqn_reward is not 0, reset the LSTM state

        if self.dqn_reward != 0:
            reward = self.dqn_reward
            self.lstm_State = make_initial_state(1045)
            for key, value in self.lstm_State.items():
                value.data = cuda.to_gpu(value.data)
        else:
            LState = self.lstm_State
            self.lstm_State, self.lstm_s_dash = self.lstm_class.model_lstm.predict(
                lstm_in, LState)
            reward = lstm_loss - F.mean_squared_error(
                self.lstm_s_dash, Variable(cuda.to_gpu(now_CNN)))
            reward = reward.data.get()
            print reward

        # Exploration decays along the time sequence
        if self.policyFrozen is False:  # Learning ON/OFF
            if self.DQN.initial_exploration < self.time:
                self.epsilon -= 1.0 / 10**6
                if self.epsilon < 0.1:
                    self.epsilon = 0.1
                eps = self.epsilon
            else:  # Initial Exploration Phase
                print "Initial Exploration : %d/%d steps" % (
                    self.time, self.DQN.initial_exploration)
                eps = 1.0
        else:  # Evaluation
            print "Policy is Frozen"
            eps = 0.05

        # Learning Phase
        if self.policyFrozen is False:  # Learning ON/OFF
            self.DQN.stockExperience(self.time, self.last_state,
                                     self.lastAction.intArray[0], reward,
                                     self.state, False, self.dqn_reward)
            self.DQN.experienceReplay(self.time)

        # Target model update
        if self.DQN.initial_exploration < self.time and np.mod(
                self.time, self.DQN.target_model_update_freq) == 0:
            print "########### MODEL UPDATED ######################"
            self.DQN.target_model_update()
            np.save('params/DQN_LSTM_epoch1/l4_W.npy',
                    self.DQN.model.l4.W.get())
            np.save('params/DQN_LSTM_epoch1/l4_b.npy',
                    self.DQN.model.l4.b.get())
            np.save('params/DQN_LSTM_epoch1/q_value_W.npy',
                    self.DQN.model.q_value.W.get())
            np.save('params/DQN_LSTM_epoch1/q_value_b.npy',
                    self.DQN.model.q_value.b.get())

        # Simple text based visualization
        print ' Time Step %d /   ACTION  %d  /   REWARD %.1f   / EPSILON  %.6f  /   Q_max  %3f' % (
            self.time, self.DQN.action_to_index(action), np.sign(reward), eps,
            np.max(Q_now.get()))

        # Updates for next step
        self.last_observation = obs_array

        if self.policyFrozen is False:
            self.lastAction = copy.deepcopy(returnAction)
            self.last_state = self.state.copy()
            self.time += 1

        return returnAction
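
The target-update branch above dumps four parameter arrays with np.save. A minimal sketch of restoring them into a model with the same l4 and q_value layers (the layer names are taken from the save paths above; in Chainer v1 a FunctionSet layer's W and b are plain arrays, so this should run before model.to_gpu()):

import numpy as np

def load_dqn_lstm_params(model, param_dir='params/DQN_LSTM_epoch1'):
    # Counterpart of the np.save calls in agent_step.
    model.l4.W = np.load('%s/l4_W.npy' % param_dir)
    model.l4.b = np.load('%s/l4_b.npy' % param_dir)
    model.q_value.W = np.load('%s/q_value_W.npy' % param_dir)
    model.q_value.b = np.load('%s/q_value_b.npy' % param_dir)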