Example #1
    def __init__(self, input_number, output_number, dummy=False):
        LogAgent.__init__(self, input_number, output_number)

        self.greedy_rate = 0.3
        self.alpha = 0.5
        self.ganmma = 0.9
        self.lastobs = None
        self.input_number = input_number
        self.output_number = output_number

        from collections import defaultdict
        self.check_data = defaultdict(list)

        #self.q_mat = numpy.ones((input_number, output_numbe))
        #self.mlnn = MultiLayerNeuralNetwork( [self.input_number, 5, self.output_number],
        # pybrain-style network
        self.mlnn = MultiLayerNeuralNetwork(
            [
                self.input_number + self.output_number,
                self.input_number + self.output_number, 1
            ],
            threshold=1,
            start_learning_coef=0.02,
            sigmoid_alpha=1,
            print_error=False,
            mini_batch=50,
            epoch_limit=50,
            layer_type=[LinearLayer, SigmoidLayer, LinearLayer],
            rprop=False)
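
Example #1 shows only the constructor: the network input is the state vector concatenated with a one-hot action vector (hence the first layer size of input_number + output_number), and the single output unit is read as Q(state, action). A minimal sketch of that input layout, assuming a hypothetical 12-element observation and 4 actions; it mirrors the convert_input helper shown in Example #3:

import numpy

input_number, output_number = 12, 4                   # hypothetical sizes
state = numpy.zeros(input_number)                     # observation vector
action = 2                                            # index of the chosen action
action_vec = numpy.array([1. if x == action else 0. for x in range(output_number)])
net_input = numpy.append(state, action_vec)           # length input_number + output_number
print(net_input.shape)                                # -> (16,)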
Example #3
class QLearning(LogAgent):
    def __init__(self, input_number, output_number, dummy=False):
        LogAgent.__init__(self, input_number, output_number)

        self.greedy_rate = 0.3
        self.alpha = 0.5
        self.ganmma = 0.9
        self.lastobs = None
        self.input_number = input_number
        self.output_number = output_number

        from collections import defaultdict
        self.check_data = defaultdict(list)

        #self.q_mat = numpy.ones((input_number, output_numbe))
        #self.mlnn = MultiLayerNeuralNetwork( [self.input_number, 5, self.output_number],
        # pybrain-style network
        self.mlnn = MultiLayerNeuralNetwork(
            [
                self.input_number + self.output_number,
                self.input_number + self.output_number, 1
            ],
            threshold=1,
            start_learning_coef=0.02,
            sigmoid_alpha=1,
            print_error=False,
            mini_batch=50,
            epoch_limit=50,
            layer_type=[LinearLayer, SigmoidLayer, LinearLayer],
            rprop=False)

    def integrateObservation(self, state_vec):
        self.agent_observation(state_vec)
        self.lastobs = state_vec
        return

    def giveReward(self, reward):
        self.agent_reward(reward)
        return

    def getAction(self, input_list=None, greedy=True, q_value=False):
        if input_list is None:
            input_list = self.lastobs

        if greedy and self.greedy_rate < numpy.random.random():
            action = numpy.random.randint(self.output_number, size=1)[0]
            q_vaue = 0
        else:
            # pybrain-style network
            output_vec = self.get_q_values(input_list)
            action = list(output_vec).index(max(output_vec))
            q_vaue = max(output_vec)

        self.agent_action(action)
        if q_value:
            return action, q_vaue
        else:
            return action

    def convert_input(self, input_list, action):
        action_vec = numpy.array(
            [[1. if x == action else 0. for x in range(self.output_number)]])
        inp = numpy.append(input_list, action_vec)
        return numpy.array([inp])

    def get_q_values(self, input_list, action=None):
        if action is None:
            q_values = []
            for i in range(self.output_number):
                inp = self.convert_input(input_list, i)
                q_values.append(self.mlnn.predict(numpy.array(inp).ravel()))
            return q_values
        else:
            inp = self.convert_input(input_list, action)
            return self.mlnn.predict(numpy.array(inp).ravel())

    def reset(self):
        self.agent_reset()
        return

    def learn(self, learn_count=5):
        train_data = self.history

        # Convert the history into neural-network training data
        input_data, output_data = self.change_format(train_data)

        # # Experiment 3
        # r = [{}, {}]
        # input_list= numpy.array([0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0])
        # r[0]['input'] = self.convert_input(input_list, 1)
        # r[1]['input'] = self.convert_input(input_list, 2)
        # r[0]['before'] = self.mlnn.predict(numpy.array(r[0]['input']).ravel())
        # r[1]['before'] = self.mlnn.predict(numpy.array(r[1]['input']).ravel())

        # # Experiment 1: check the characteristics of the training (teacher) data.
        # for i,d in enumerate(input_data):
        #     self.check_data[tuple(d.tolist())].append(output_data.tolist()[i][0])
        # print 'number of states : ', len(self.check_data)
        # for state, points in self.check_data.items():
        #     if numpy.average(points) > 0:
        #         print "%s %5d %10.4f, %10.4f " % (state, len(points), numpy.average(points), numpy.std(points) )

        # Train the neural network
        for i in range(learn_count):
            error_hist, valid_hist = self.mlnn.train_multi(
                input_data, output_data)
            self.train_error += [x[1] for x in error_hist]
            self.valid_error += [x[1] for x in valid_hist]

        # # Experiment 3
        # r[0]['target'] = self.check_data[tuple(numpy.array(r[0]['input']).ravel().tolist())]
        # r[1]['target'] = self.check_data[tuple(numpy.array(r[1]['input']).ravel().tolist())]
        # r[0]['after'] = self.mlnn.predict(numpy.array(r[0]['input']).ravel())
        # r[1]['after'] = self.mlnn.predict(numpy.array(r[1]['input']).ravel())
        # print 'before : ', r[0]['before'], r[1]['before']
        # print 'target : ', r[0]['target'], r[1]['target']
        # print 'after  : ', r[0]['after'], r[1]['after']

        return error_hist

    def change_format(self, train_data):
        """pybrain型network用"""
        train_data_input = []
        train_data_output = []
        lastexperience = None
        for data in train_data:
            if not lastexperience:
                lastexperience = data
                continue
            _observation = lastexperience['observation']
            _action = lastexperience['action']
            _reward = lastexperience['reward']

            # Q-value of the state before the action
            before_q_value = self.get_q_values(_observation, _action)

            # Q-values of the state after the action
            after_q_values = []
            for i in range(self.output_number):
                tmp = self.get_q_values(data['observation'], i)
                after_q_values.append(tmp)

            # # Experiment 2
            # if data['observation'].tolist() == [0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]:
            #     print before_q_value,after_q_values

            # Update the Q-value of the chosen action.
            #print self.alpha *(_reward + self.ganmma * max(after_q_values)[0] - before_q_value[0] )
            before_q_value[0] += self.alpha * (
                _reward + self.ganmma * max(after_q_values)[0] -
                before_q_value[0])

            #train_data_input.append(numpy.array(data['grid']).ravel())
            input_data = numpy.array(self.convert_input(_observation,
                                                        _action)).ravel()

            # for next
            lastexperience = data
            #print input_data, _reward,numpy.array(before_q_value)

            #
            #train_data_input.append(self.normalize_input(input_data))
            train_data_input.append(input_data)
            train_data_output.append(numpy.array(before_q_value))

        return numpy.array(train_data_input), numpy.array(train_data_output)

    def change_format_old(self, train_data):
        train_data_input = []
        train_data_output = []
        lastexperience = None
        for data in train_data:
            if not lastexperience:
                lastexperience = data
                continue
            _observation = lastexperience['observation']
            _action = lastexperience['action']
            _reward = lastexperience['reward']

            # Q-value of the state before the action
            q_vec = self.get_q_values(_observation)

            # Q-value of the state after the action
            move, q_value = self.getAction(data['observation'], q_value=True)

            # Update the Q-value of the chosen action.
            q_vec[_action] += self.alpha * (_reward + self.ganmma * q_value -
                                            q_vec[_action])

            #train_data_input.append(numpy.array(data['grid']).ravel())
            input_data = numpy.array(_observation).ravel()

            # for next
            lastexperience = data

            #
            #train_data_input.append(self.normalize_input(input_data))
            train_data_input.append(input_data)
            train_data_output.append(numpy.array(q_vec))

        return numpy.array(train_data_input), numpy.array(train_data_output)
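
change_format builds each training target with the standard Q-learning (TD) update: the current estimate Q(s, a) is moved toward r + gamma * max_a' Q(s', a') with step size alpha, and the result becomes the single-element target paired with convert_input(_observation, _action). A self-contained numeric sketch of that target computation, using hypothetical values and no network:

import numpy

alpha, gamma = 0.5, 0.9                               # matches self.alpha / self.ganmma in __init__
reward = 1.0                                          # hypothetical reward observed for (s, a)
before_q_value = 0.2                                  # current estimate Q(s, a)
after_q_values = numpy.array([0.1, 0.4, -0.3])        # Q(s', a') for each possible next action
target = before_q_value + alpha * (reward + gamma * after_q_values.max() - before_q_value)
print(target)                                         # 0.2 + 0.5 * (1.0 + 0.9 * 0.4 - 0.2) = 0.78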
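
For context, a hypothetical driver loop showing how the class's public interface fits together (integrateObservation, getAction, giveReward, reset, learn). This assumes the QLearning class above and its LogAgent / MultiLayerNeuralNetwork dependencies are importable; the observation and reward here are random stand-ins, and n_states / n_actions are made-up sizes:

import numpy

n_states, n_actions = 12, 4                           # hypothetical problem sizes
agent = QLearning(n_states, n_actions)

for episode in range(10):
    agent.reset()
    for step in range(20):
        state_vec = numpy.zeros(n_states)             # stand-in one-hot observation
        state_vec[numpy.random.randint(n_states)] = 1.0
        agent.integrateObservation(state_vec)         # record the state, set lastobs
        action = agent.getAction()                    # pick an action (random or argmax Q)
        reward = 1.0 if action == 0 else 0.0          # stand-in reward signal
        agent.giveReward(reward)                      # record the reward
    agent.learn(learn_count=5)                        # fit the Q-network on the accumulated history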