def agent_start(self, observation):
    self.lstm_State = make_initial_state(1045)
    for key, value in self.lstm_State.items():
        value.data = cuda.to_gpu(value.data)

    # Preprocess
    tmp = np.bitwise_and(
        np.asarray(observation.intArray[128:]).reshape([210, 160]),
        0b0001111)  # Get Intensity from the observation
    obs_array = (spm.imresize(tmp, (110, 84)))[110 - 84 - 8:110 - 8, :]  # Scaling

    # Initialize State
    self.state = np.zeros((4, 84, 84), dtype=np.uint8)
    self.state[0] = obs_array
    state_ = cuda.to_gpu(
        np.asanyarray(self.state.reshape(1, 4, 84, 84), dtype=np.float32))

    # Generate an Action e-greedy
    returnAction = Action()
    action, Q_now = self.DQN.e_greedy(state_, self.epsilon)
    returnAction.intArray = [action]

    # Update for next step
    self.lastAction = copy.deepcopy(returnAction)
    self.last_state = self.state.copy()
    self.last_observation = obs_array

    return returnAction
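`make_initial_state` is called throughout this code but its definition is not part of the excerpt. Below is a minimal sketch of what the calls above assume it does, namely return a dict of zero-filled LSTM cell/hidden Variables that are then moved to the GPU. The key names and the two-layer layout follow the common chainer-char-rnn convention and are assumptions, not taken from the source.

import numpy as np
from chainer import Variable

def make_initial_state(n_units, batchsize=1, train=True):
    # Zero cell/hidden state for each LSTM layer; the key names are assumed.
    return {name: Variable(np.zeros((batchsize, n_units), dtype=np.float32),
                           volatile=not train)
            for name in ('c1', 'h1', 'c2', 'h2')}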
def agent_step(self, reward, observation):
    # Preprocess
    tmp = np.bitwise_and(
        np.asarray(observation.intArray[128:]).reshape([210, 160]),
        0b0001111)  # Get Intensity from the observation
    obs_array = (spm.imresize(tmp, (110, 84)))[110 - 84 - 8:110 - 8, :]  # Scaling
    obs_processed = np.maximum(obs_array, self.last_observation)  # Take maximum from two frames

    # Compose State: 4-step sequential observation
    self.state = np.asanyarray(
        [self.state[1], self.state[2], self.state[3], obs_processed],
        dtype=np.uint8)
    state_ = cuda.to_gpu(
        np.asanyarray(self.state.reshape(1, 4, 84, 84), dtype=np.float32))
    state_for_lstm = Variable(
        cuda.to_gpu(
            np.asanyarray(self.last_state.reshape(1, 4, 84, 84),
                          dtype=np.float32)))
    CNNout = self.DQN.Q_func_LSTM(state_for_lstm).reshape(1, 64 * 7 * 7)
    now_CNN = self.DQN.Q_func_LSTM(Variable(state_)).reshape(1, 64 * 7 * 7)
    lstm_in = cuda.to_gpu(CNNout)

    # Generate an Action e-greedy
    returnAction = Action()
    action, Q_now = self.DQN.e_greedy(state_, self.epsilon)
    returnAction.intArray = [action]

    self.dqn_reward = reward
    # If the extrinsic (game) reward is nonzero, use it and reset the LSTM state
    if self.dqn_reward != 0:
        reward = self.dqn_reward
        self.lstm_State = make_initial_state(1045)
        for key, value in self.lstm_State.items():
            value.data = cuda.to_gpu(value.data)
    else:
        # Otherwise use the LSTM's prediction error on the current CNN
        # features as an intrinsic reward
        LState = self.lstm_State
        self.lstm_State, self.lstm_s_dash = self.lstm_class.model_lstm.predict(
            lstm_in, LState)
        reward = lstm_loss - F.mean_squared_error(
            self.lstm_s_dash, Variable(cuda.to_gpu(now_CNN)))
        reward = reward.data.get()
        print reward

    # Exploration decays along the time sequence
    if self.policyFrozen is False:  # Learning ON/OFF
        if self.DQN.initial_exploration < self.time:
            self.epsilon -= 1.0 / 10**6
            if self.epsilon < 0.1:
                self.epsilon = 0.1
            eps = self.epsilon
        else:  # Initial Exploration Phase
            print "Initial Exploration : %d/%d steps" % (
                self.time, self.DQN.initial_exploration)
            eps = 1.0
    else:  # Evaluation
        print "Policy is Frozen"
        eps = 0.05

    # Learning Phase
    if self.policyFrozen is False:  # Learning ON/OFF
        self.DQN.stockExperience(self.time, self.last_state,
                                 self.lastAction.intArray[0], reward,
                                 self.state, False, self.dqn_reward)
        self.DQN.experienceReplay(self.time)

    # Target model update
    if self.DQN.initial_exploration < self.time and np.mod(
            self.time, self.DQN.target_model_update_freq) == 0:
        print "########### MODEL UPDATED ######################"
        self.DQN.target_model_update()
        np.save('params/DQN_LSTM_epoch1/l4_W.npy', self.DQN.model.l4.W.get())
        np.save('params/DQN_LSTM_epoch1/l4_b.npy', self.DQN.model.l4.b.get())
        np.save('params/DQN_LSTM_epoch1/q_value_W.npy',
                self.DQN.model.q_value.W.get())
        np.save('params/DQN_LSTM_epoch1/q_value_b.npy',
                self.DQN.model.q_value.b.get())

    # Simple text based visualization
    print ' Time Step %d / ACTION %d / REWARD %.1f / EPSILON %.6f / Q_max %3f' % (
        self.time, self.DQN.action_to_index(action), np.sign(reward), eps,
        np.max(Q_now.get()))

    # Updates for next step
    self.last_observation = obs_array
    if self.policyFrozen is False:
        self.lastAction = copy.deepcopy(returnAction)
        self.last_state = self.state.copy()
        self.time += 1

    return returnAction
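In `agent_step`, when the extrinsic (game) reward is zero, the agent substitutes an intrinsic reward of `lstm_loss - MSE(lstm_s_dash, now_CNN)`: the LSTM predicts the next frame's CNN features from the current ones, and its prediction error is compared against the baseline `lstm_loss`. The `LSTM` model itself is not shown in this excerpt; below is a minimal sketch of the `predict` interface the agent relies on, written against the old Chainer v1 FunctionSet API. The layer names, the two-layer layout, and the output linear layer are assumptions, not taken from the source.

import chainer.functions as F
from chainer import FunctionSet, Variable

class LSTM(FunctionSet):
    """Two-layer LSTM mapping current CNN features to predicted next features."""

    def __init__(self, n_in, n_units):
        super(LSTM, self).__init__(
            l1_x=F.Linear(n_in, 4 * n_units),
            l1_h=F.Linear(n_units, 4 * n_units),
            l2_x=F.Linear(n_units, 4 * n_units),
            l2_h=F.Linear(n_units, 4 * n_units),
            l3=F.Linear(n_units, n_in))

    def predict(self, x_data, state):
        # One recurrent step: consume the flattened CNN features (1 x 3136)
        # and return the updated state plus the predicted next features.
        x = Variable(x_data)
        h1_in = self.l1_x(x) + self.l1_h(state['h1'])
        c1, h1 = F.lstm(state['c1'], h1_in)
        h2_in = self.l2_x(h1) + self.l2_h(state['h2'])
        c2, h2 = F.lstm(state['c2'], h2_in)
        y = self.l3(h2)
        return {'c1': c1, 'h1': h1, 'c2': c2, 'h2': h2}, y

With n_units = 1045 this matches the `LSTM(3136, n_units)` constructor call in the training script below and the `make_initial_state(1045)` calls in the agent.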
model = LSTM(3136, n_units)
if args.gpu >= 0:
    cuda.get_device(args.gpu).use()
    model.to_gpu()

# Set up the learning algorithm (RMSprop)
optimizer = optimizers.RMSprop(lr=args.learning_rate,
                               alpha=args.decay_rate,
                               eps=1e-8)
optimizer.setup(model.collect_parameters())

whole_len = len(train_data)
whole_val_len = len(val_data)
epoch = 0
start_at = time.time()
cur_at = start_at
end_time = 0
state = make_initial_state(n_units)
train_loss_all = []
val_loss_all = []
iterations_count = 0

if args.gpu >= 0:
    loss = Variable(cuda.zeros(()))
    val_loss = Variable(cuda.zeros(()))
    for key, value in state.items():
        value.data = cuda.to_gpu(value.data)
else:
    loss = Variable(np.zeros((), dtype=np.float32))
    val_loss = Variable(np.zeros((), dtype=np.float32))

for i in xrange(whole_len * n_epochs):
    for j in xrange(0, len(train_data[i % whole_len]) - 1):
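The excerpt stops at the head of the inner loop. Below is a sketch of how a truncated-BPTT inner loop of this style typically continues, indented to sit inside the nested loops; it is purely an assumption, and `forward_one_step`, `bprop_len`, and `grad_clip` are illustrative names, not taken from the source.

        x_batch = train_data[i % whole_len][j]
        y_batch = train_data[i % whole_len][j + 1]
        if args.gpu >= 0:
            x_batch, y_batch = cuda.to_gpu(x_batch), cuda.to_gpu(y_batch)

        # One recurrent step; accumulate the prediction loss.
        state, loss_i = model.forward_one_step(x_batch, y_batch, state)
        loss += loss_i
        iterations_count += 1

        # Truncated BPTT: every bprop_len steps, backprop, clip and update.
        if iterations_count % bprop_len == 0:
            optimizer.zero_grads()
            loss.backward()
            loss.unchain_backward()  # cut the graph to bound memory use
            optimizer.clip_grads(grad_clip)
            optimizer.update()
            train_loss_all.append(float(cuda.to_cpu(loss.data)))
            loss = Variable(cuda.zeros(())) if args.gpu >= 0 else \
                Variable(np.zeros((), dtype=np.float32))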