import numpy
from collections import defaultdict

# NOTE: LogAgent, MultiLayerNeuralNetwork, LinearLayer and SigmoidLayer are
# assumed to be provided by other modules of this repository; their import
# paths are not visible in this file.


class QLearning(LogAgent):

    def __init__(self, input_number, output_number, dummy=False):
        LogAgent.__init__(self, input_number, output_number)
        self.greedy_rate = 0.3
        self.alpha = 0.5
        self.gamma = 0.9
        self.lastobs = None
        self.input_number = input_number
        self.output_number = output_number
        self.check_data = defaultdict(list)
        #self.q_mat = numpy.ones((input_number, output_number))
        # Previous architecture (state-only input, one Q-value per action):
        #self.mlnn = MultiLayerNeuralNetwork([self.input_number, 5, self.output_number], ...)
        # pybrain-style network: input is the state vector concatenated with a
        # one-hot action vector; output is a single Q-value.
        self.mlnn = MultiLayerNeuralNetwork(
            [self.input_number + self.output_number,
             self.input_number + self.output_number,
             1],
            threshold=1,
            start_learning_coef=0.02,
            sigmoid_alpha=1,
            print_error=False,
            mini_batch=50,
            epoch_limit=50,
            layer_type=[LinearLayer, SigmoidLayer, LinearLayer],
            rprop=False)

    def integrateObservation(self, state_vec):
        self.agent_observation(state_vec)
        self.lastobs = state_vec

    def giveReward(self, reward):
        self.agent_reward(reward)

    def getAction(self, input_list=None, greedy=True, q_value=False):
        if input_list is None:
            input_list = self.lastobs
        # With greedy_rate = 0.3 this branch takes a random action with
        # probability 1 - greedy_rate, i.e. the agent acts greedily 30% of the time.
        if greedy and self.greedy_rate < numpy.random.random():
            action = numpy.random.randint(self.output_number)
            q_val = 0
        else:
            # pybrain-style network: evaluate Q for each action, take the argmax.
            output_vec = self.get_q_values(input_list)
            action = list(output_vec).index(max(output_vec))
            q_val = max(output_vec)
        self.agent_action(action)
        if q_value:
            return action, q_val
        return action

    def convert_input(self, input_list, action):
        """Concatenate the state vector with a one-hot encoding of the action."""
        action_vec = numpy.array(
            [[1. if x == action else 0. for x in range(self.output_number)]])
        inp = numpy.append(input_list, action_vec)
        return numpy.array([inp])

    def get_q_values(self, input_list, action=None):
        if action is None:
            # Q-values of every action in this state.
            q_values = []
            for i in range(self.output_number):
                inp = self.convert_input(input_list, i)
                q_values.append(self.mlnn.predict(numpy.array(inp).ravel()))
            return q_values
        inp = self.convert_input(input_list, action)
        return self.mlnn.predict(numpy.array(inp).ravel())

    def reset(self):
        self.agent_reset()

    def learn(self, learn_count=5):
        train_data = self.history
        # Convert the history into training data for the neural network.
        input_data, output_data = self.change_format(train_data)
        # # Experiment 3
        # r = [{}, {}]
        # input_list = numpy.array([0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0])
        # r[0]['input'] = self.convert_input(input_list, 1)
        # r[1]['input'] = self.convert_input(input_list, 2)
        # r[0]['before'] = self.mlnn.predict(numpy.array(r[0]['input']).ravel())
        # r[1]['before'] = self.mlnn.predict(numpy.array(r[1]['input']).ravel())
        # # Experiment 1: inspect characteristics of the training data.
        # for i, d in enumerate(input_data):
        #     self.check_data[tuple(d.tolist())].append(output_data.tolist()[i][0])
        # print 'number of states : ', len(self.check_data)
        # for state, points in self.check_data.items():
        #     if numpy.average(points) > 0:
        #         print "%s %5d %10.4f, %10.4f " % (state, len(points), numpy.average(points), numpy.std(points))
        # Train the neural network.
        for i in range(learn_count):
            error_hist, valid_hist = self.mlnn.train_multi(input_data, output_data)
            self.train_error += [x[1] for x in error_hist]
            self.valid_error += [x[1] for x in valid_hist]
        # # Experiment 3
        # r[0]['target'] = self.check_data[tuple(numpy.array(r[0]['input']).ravel().tolist())]
        # r[1]['target'] = self.check_data[tuple(numpy.array(r[1]['input']).ravel().tolist())]
        # r[0]['after'] = self.mlnn.predict(numpy.array(r[0]['input']).ravel())
        # r[1]['after'] = self.mlnn.predict(numpy.array(r[1]['input']).ravel())
        # print 'before : ', r[0]['before'], r[1]['before']
        # print 'target : ', r[0]['target'], r[1]['target']
        # print 'after  : ', r[0]['after'], r[1]['after']
        return error_hist

    def change_format(self, train_data):
        """Build (state + one-hot action, target Q) pairs for the pybrain-style network."""
        train_data_input = []
        train_data_output = []
        lastexperience = None
        for data in train_data:
            if not lastexperience:
                lastexperience = data
                continue
            _observation = lastexperience['observation']
            _action = lastexperience['action']
            _reward = lastexperience['reward']
            # Q-value of the state before the action.
            before_q_value = self.get_q_values(_observation, _action)
            # Q-values of the state after the action.
            after_q_values = []
            for i in range(self.output_number):
                tmp = self.get_q_values(data['observation'], i)
                after_q_values.append(tmp)
            # # Experiment 2
            # if data['observation'].tolist() == [0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]:
            #     print before_q_value, after_q_values
            # Update the Q-value of the taken action (standard Q-learning target).
            before_q_value[0] += self.alpha * (
                _reward + self.gamma * max(after_q_values)[0] - before_q_value[0])
            input_data = numpy.array(
                self.convert_input(_observation, _action)).ravel()
            # For the next iteration.
            lastexperience = data
            train_data_input.append(input_data)
            train_data_output.append(numpy.array(before_q_value))
        return numpy.array(train_data_input), numpy.array(train_data_output)

    def change_format_old(self, train_data):
        """Old format: state-only input, full Q-vector output."""
        train_data_input = []
        train_data_output = []
        lastexperience = None
        for data in train_data:
            if not lastexperience:
                lastexperience = data
                continue
            _observation = lastexperience['observation']
            _action = lastexperience['action']
            _reward = lastexperience['reward']
            # Q-values of the state before the action.
            q_vec = self.get_q_values(_observation)
            # Max Q-value of the state after the action.
            move, q_value = self.getAction(data['observation'], q_value=True)
            # Update the Q-value of the taken action.
            q_vec[_action] += self.alpha * (_reward + self.gamma * q_value - q_vec[_action])
            input_data = numpy.array(_observation).ravel()
            # For the next iteration.
            lastexperience = data
            train_data_input.append(input_data)
            train_data_output.append(numpy.array(q_vec))
        return numpy.array(train_data_input), numpy.array(train_data_output)
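# ---------------------------------------------------------------------------
# The target construction in change_format above is the standard Q-learning
# update applied once per logged transition (s, a, r, s'):
#     Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))
# Below is a minimal self-contained sketch of that update; the function name
# and the defaults alpha=0.5, gamma=0.9 mirror the class above but are
# illustrative only, not part of this module.
def q_update(q_sa, reward, next_q_values, alpha=0.5, gamma=0.9):
    """Return the updated target for Q(s, a) given reward r and the Q(s', .) list."""
    return q_sa + alpha * (reward + gamma * max(next_q_values) - q_sa)

# Example: q_update(0.5, 1.0, [0.2, 0.8, 0.1])
#          == 0.5 + 0.5 * (1.0 + 0.9 * 0.8 - 0.5) == 1.11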
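# ---------------------------------------------------------------------------
# Hypothetical driver loop showing the intended call order: per step,
# integrateObservation -> getAction -> giveReward; per episode, learn().
# This is a sketch only: `env`, its reset()/step() API, and the layer sizes
# (12 inputs, 4 actions) are assumptions for illustration and are not defined
# in this module.
def run_episodes(env, episodes=100):
    agent = QLearning(input_number=12, output_number=4)  # sizes are assumptions
    for episode in range(episodes):
        agent.reset()
        obs, done = env.reset(), False            # assumed environment API
        while not done:
            agent.integrateObservation(numpy.array(obs))
            action = agent.getAction()            # epsilon-greedy over the network's Q-values
            obs, reward, done = env.step(action)  # assumed environment API
            agent.giveReward(reward)
        agent.learn(learn_count=5)                # refit the network on the logged history
    return agent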