Python Walker.choose_action 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: walker

클래스/타입: Walker

메소드/함수: choose_action

hotexamples.com에서의 예제들: 3

Python Walker.choose_action - 3개의 예제를 찾았습니다. 아래는 오픈 소스 프로젝트에서 추출한 walker.Walker.choose_action의 실제 사용 예제 중 평점이 가장 높은 Python 예제들입니다. 예제를 평가해 주시면 예제 품질을 높이는 데 도움이 됩니다.

자주 사용되는 메소드들

보기 숨기기

Walker(30)

choose_action(3)

init(3)

__init__(3)

start(2)

find_shortest_path(2)

get_quad_list(1)

startWalking(1)

setMPD(1)

join(1)

get_steps(1)

getX(1)

get_position(1)

empty_step(1)

draw(1)

display(1)

death(1)

current_position(1)

c(1)

visitFunction(1)

예제 #1

파일 보기

class Game:
    def __init__(self):
        """Configure learning sizes, the sound source, the map tables and the walker.

        The map is described by three groups of tables: the grid coordinates
        used by ``play`` for its per-cell bookkeeping, the wall tables read by
        ``detect_invalids`` and the room / hall coordinate lists read by
        ``detect_which_room`` and ``detect_invalids``.
        """
        def span(lo, hi, stride):
            # np.arange unpacked into a plain list (elements stay numpy scalars)
            return list(np.arange(lo, hi, stride))

        # sizes handed to the Walker and the loop limits used by play()
        self.n_features, self.n_actions = 366, 8
        self.max_epoch, self.max_steps = 30, 100

        # sound source position
        # fixme
        self.src_pos_x, self.src_pos_y, self.src_pos_z = -3.0, 1.6, -4.0

        # grid step of the map
        # fixme, step length changed from 0.5m to 1m
        self.unit = 1.0
        self.room_grids_x = span(-3.0, 3.0 + self.unit, self.unit)
        self.room_grids_z = span(-4.0, 4.0 + self.unit, self.unit)

        # fixme, define wall and obstacles
        # wall_axis_z is looked up with a cell's x and lists the blocked z values
        self.wall_axis_z = {
            -4: span(-5.0, 6.0, 1.0),
            4: span(-5.0, 6.0, 1.0),
            0: [v for v in span(-5.0, 6.0, 1.0) if v != 0],
        }
        # wall_axis_x is looked up with a cell's z and lists the blocked x values
        self.wall_axis_x = {
            5: span(-4.0, 5.0, 1.0),
            1: [v for v in span(-4.0, 5.0, 1.0) if v not in (-2, 2)],
            -1: [v for v in span(-4.0, 5.0, 1.0) if v not in (-2, 2)],
            -5: span(-4.0, 5.0, 1.0),
        }

        # fixme, define checkpoints: room gates, hall center
        self.room_gates = [[-2.0, 1, -1.0], [2.0, 1, -1.0], [-2.0, 1, 1.0], [2.0, 1, 1.0]]
        self.hall_center = [[0, 0, 0]]

        # fixme, define room zone
        self.room1_x, self.room1_z = span(-3.5, 0, 0.5), span(-4.5, -1, 0.5)
        self.room2_x, self.room2_z = span(0.5, 4.0, 0.5), span(-4.5, -1, 0.5)
        self.room3_x, self.room3_z = span(-3.5, 0, 0.5), span(1.5, 5.0, 0.5)
        self.room4_x, self.room4_z = span(0.5, 4.0, 0.5), span(1.5, 5.0, 0.5)
        self.hall_x, self.hall_z = span(-3.5, 4.0, 0.5), span(-0.5, 1.0, 0.5)

        self.walker = Walker(self.n_features, self.n_actions)

    def detect_invalids(self, x, y, z, room):
        invalids = []
        directions = [[x, y, z - self.unit], [x + self.unit, y, z - self.unit],
                      [x + self.unit, y, z], [x + self.unit, y, z + self.unit],
                      [x, y, z + self.unit], [x - self.unit, y, z + self.unit],
                      [x - self.unit, y, z], [x - self.unit, y, z - self.unit]]

        for direction in directions:
            # along x axis, fix z, change x
            if self.wall_axis_x.get(direction[2]) is not None:
                if direction[0] in self.wall_axis_x[direction[2]]:
                    invalids.append(self.walker.action_labels.index(str(directions.index(direction) * 45)))

            # along z axis, fix x, change z
            if self.wall_axis_z.get(direction[0]) is not None:
                if direction[2] in self.wall_axis_z[direction[0]]:
                    invalids.append(self.walker.action_labels.index(str(directions.index(direction) * 45)))

        if room[4] is False:
            for direction in directions:
                if (direction[0] in self.room4_x and direction[2] in self.room4_z) or (
                        direction[0] == self.room_gates[3][0] and direction[2] == self.room_gates[3][2]):
                    invalids.append(self.walker.action_labels.index(str(directions.index(direction) * 45)))
        if room[3] is False:
            for direction in directions:
                if (direction[0] in self.room3_x and direction[2] in self.room3_z) or (
                        direction[0] == self.room_gates[2][0] and direction[2] == self.room_gates[2][2]):
                    invalids.append(self.walker.action_labels.index(str(directions.index(direction) * 45)))
        if room[2] is False:
            for direction in directions:
                if (direction[0] in self.room2_x and direction[2] in self.room2_z) or (
                        direction[0] == self.room_gates[1][0] and direction[2] == self.room_gates[1][2]):
                    invalids.append(self.walker.action_labels.index(str(directions.index(direction) * 45)))
        if room[1] is False:
            for direction in directions:
                if (direction[0] in self.room1_x and direction[2] in self.room1_z) or (
                        direction[0] == self.room_gates[0][0] and direction[2] == self.room_gates[0][2]):
                    invalids.append(self.walker.action_labels.index(str(directions.index(direction) * 45)))

        if room[0] is False:
            for direction in directions:
                if direction[0] in self.hall_x and direction[2] in self.hall_z:
                    invalids.append(self.walker.action_labels.index(str(directions.index(direction) * 45)))

        # todo, add some obstacles
        return invalids

    # fixme, return 1, 2, 3, 4 room, 0-hall
    def detect_which_room(self):
        if self.walker.pos_x in self.room1_x and self.walker.pos_z in self.room1_z:
            return 1
        elif self.walker.pos_x in self.room2_x and self.walker.pos_z in self.room2_z:
            return 2
        elif self.walker.pos_x in self.room3_x and self.walker.pos_z in self.room3_z:
            return 3
        elif self.walker.pos_x in self.room4_x and self.walker.pos_z in self.room4_z:
            return 4
        elif self.walker.pos_x in self.hall_x and self.walker.pos_z in self.hall_z:
            return 0
        else:
            return -1

    """
        based on guide path to learn actions:
        - learn: from inner room guide to gate; avoid obstacles
        - not learn: gate into inner room
        
        - reward: diff in angle
    """

    def learn_guide_actions(self, path, visit):
        a_his = None

        for pos in path:
            if path.index(pos) == len(path) - 2:
                break
            s = self.walker.observe_gcc_vector(pos[0], self.walker.pos_y, pos[1])
            s = np.array(s)[np.newaxis, :]

            pos_key = str(pos[0]) + "*" + str(pos[1])
            visit[pos_key] += 1

            pos_next = path[path.index(pos) + 1]
            s_ = self.walker.observe_gcc_vector(pos_next[1], self.walker.pos_y, pos_next[1])
            s_ = np.array(s_)[np.newaxis, :]

            # get action
            if pos_next[0] - pos[0] == 0 and pos_next[1] - pos[1] == -self.unit:
                a = 0
            elif pos_next[0] - pos[0] == self.unit and pos_next[1] - pos[1] == -self.unit:
                a = 1
            elif pos_next[0] - pos[0] == self.unit and pos_next[1] - pos[1] == 0:
                a = 2
            elif pos_next[0] - pos[0] == self.unit and pos_next[1] - pos[1] == self.unit:
                a = 3
            elif pos_next[0] - pos[0] == 0 and pos_next[1] - pos[1] == self.unit:
                a = 4
            elif pos_next[0] - pos[0] == -self.unit and pos_next[1] - pos[1] == self.unit:
                a = 5
            elif pos_next[0] - pos[0] == -self.unit and pos_next[1] - pos[1] == 0:
                a = 6
            elif pos_next[0] - pos[0] == -self.unit and pos_next[1] - pos[1] == -self.unit:
                a = 7
            else:
                print("Wrong action get from GUIDE path... ")
                a = None

            if a_his is None:
                a_his = a

            # get diff reward
            max_angle = max(float(self.walker.action_labels[a]), float(self.walker.action_labels[a_his]))
            min_angle = min(float(self.walker.action_labels[a]), float(self.walker.action_labels[a_his]))

            diff = min(abs(max_angle - min_angle), 360 - max_angle + min_angle)

            r = 1 - diff / 180

            pos_key = str(pos_next[0]) + "*" + str(pos_next[1])
            r -= (visit[pos_key]) * 0.2

            self.walker.learn(s, a, s_, r)
            a_his = a

    def play(self):
        records_step = []
        records_r = []

        """
            Begin epoch
        """
        for epoch in range(self.max_epoch):
            print("========== Epoch %d ======" % epoch)
            memory = collections.defaultdict(dict)
            visit = {}
            for i in self.room_grids_x:
                for j in self.room_grids_z:
                    visit[str(i) + "*" + str(j)] = 0
                    for k in self.walker.action_labels:
                        memory[str(i) + "*" + str(j)][k] = 0

            # init walker position
            # fixme, random choose
            self.walker.reset_walker_pos(2.0, 1, 3.0)
            DONE = False

            sum_reward = 0.0

            a_his = None

            # fixme, lock room zone and room gates
            ROOM = [None] * 5

            """
                Begin steps
            """
            for step in range(self.max_steps):
                print("************** step %d" % step)
                GUIDE = False

                print("x: " + str(self.walker.pos_x))
                print("z: " + str(self.walker.pos_z))

                s = self.walker.observe_gcc_vector(self.walker.pos_x, self.walker.pos_y, self.walker.pos_z)
                s = np.array(s)[np.newaxis, :]

                # fixme, judge: if walker in room, out or in.
                room_type = self.detect_which_room()

                # fixme, if already determine this room, not go out
                if ROOM[room_type] is not True:
                    # walker in room
                    if room_type in [1, 2, 3, 4]:
                        print("detect walker in room%d " % room_type)
                        # source is not in the room, GUIDE

                        # todo, give more obs about binary
                        if self.walker.sound_in_room(s) is False:
                            print("source is not in room%d" % room_type)

                            path = self.walker.find_shortest_path(self.walker.pos_x, self.walker.pos_z,
                                                                  self.room_gates[int("%d" % room_type) - 1][0],
                                                                  self.room_gates[int("%d" % room_type) - 1][2])

                            self.walker.reset_walker_pos(self.room_gates[int("%d" % room_type) - 1][0],
                                                         self.walker.pos_y,
                                                         self.room_gates[int("%d" % room_type) - 1][2])
                            print("guide to room gate %d " % room_type)

                            if room_type == 1 or room_type == 2:
                                self.walker.reset_walker_pos(self.walker.pos_x, self.walker.pos_y,
                                                             self.walker.pos_z + self.unit)
                            else:
                                self.walker.reset_walker_pos(self.walker.pos_x, self.walker.pos_y,
                                                             self.walker.pos_z - self.unit)

                            print("step further to the hall ")

                            # fixme, based on path generate experiences to learn
                            self.learn_guide_actions(path, visit)

                            ROOM[room_type] = False

                            GUIDE = True

                        # source in the room
                        else:
                            print("find source in room %d" % room_type)
                            ROOM[room_type] = True
                            HALL = False

                    # walker in the gate, GUIDE into room
                    elif room_type == -1:
                        f = 0
                        if self.walker.pos_x == self.room_gates[0][0] and self.walker.pos_z == self.room_gates[0][2]:
                            self.walker.reset_walker_pos(self.walker.pos_x, self.walker.pos_y,
                                                         self.walker.pos_z - self.unit)
                            f = 1

                        elif self.walker.pos_x == self.room_gates[1][0] and self.walker.pos_z == self.room_gates[1][2]:
                            self.walker.reset_walker_pos(self.walker.pos_x, self.walker.pos_y,
                                                         self.walker.pos_z - self.unit)
                            f = 2

                        elif self.walker.pos_x == self.room_gates[2][0] and self.walker.pos_z == self.room_gates[2][2]:
                            self.walker.reset_walker_pos(self.walker.pos_x, self.walker.pos_y,
                                                         self.walker.pos_z + self.unit)
                            f = 3

                        elif self.walker.pos_x == self.room_gates[3][0] and self.walker.pos_z == self.room_gates[3][2]:
                            self.walker.reset_walker_pos(self.walker.pos_x, self.walker.pos_y,
                                                         self.walker.pos_z + self.unit)
                            f = 4

                        print("detect walker in gate%d" % f)
                        print("step further into room%d" % f)
                        GUIDE = True

                    elif room_type == 0:
                        print("detect walker in the hall")
                        # todo, give more obs when walker in the hall

                    # else: walker in the hall
                    # if step == 0 \
                    #         or [self.walker.pos_x, self.walker.pos_y, self.walker.pos_z] in self.room_gates \
                    #         or [self.walker.pos_x, self.walker.pos_y, self.walker.pos_z] in self.hall_center:
                    #     fe = open('first_obs.pkl', 'rb')
                    #     obs = pickle.load(fe)
                    #
                    #     # ==================== Right obs
                    #     s_r = obs['right']
                    #     s_r = np.array(s_r)[np.newaxis, :]
                    #     a_r, p_r = self.walker.choose_action(s_r, [])
                    #     p_rr = [p_r[len(p_r) - 2], p_r[len(p_r) - 1]]
                    #     p_rr = np.append(p_rr, p_r[:len(p_r) - 2])
                    #
                    #     # ==================== Left obs
                    #     s_l = obs['left']
                    #     s_l = np.array(s_l)[np.newaxis, :]
                    #     a_l, p_l = self.walker.choose_action(s_l, [])
                    #     p_ll = [p_l[0], p_l[1]]
                    #     p_ll = np.append(p_l[2:], p_ll)
                    #
                    #     # ==================== Down obs
                    #     s_d = obs['down']
                    #     s_d = np.array(s_d)[np.newaxis, :]
                    #     a_d, p_d = self.walker.choose_action(s_d, [])
                    #     p_dd = [p_d[len(p_d) - 4], p_d[len(p_d) - 3], p_d[len(p_d) - 2], p_d[len(p_d) - 1]]
                    #     p_dd = np.append(p_dd, p_d[:len(p_d) - 4])
                    #
                    #     # ==================== Decide action
                    #     p_mix = [0] * self.n_actions
                    #     for i in range(self.n_actions):
                    #         if i not in invalids:
                    #             p_mix[i] = p[i] + p_rr[i] + p_ll[i] + p_dd[i]
                    #
                    #     p_mix = np.array(p_mix)
                    #     p_mix /= p_mix.sum()
                    #     a_mix = np.argmax(p_mix)
                    #
                    #     fe.close()
                    #
                    #     a = a_mix
                    #     a_his = a
                    #     p = p_mix
                    #     direction = self.walker.action_labels[a]

                    # if walker is guided to a new pos

                if GUIDE is True:
                    # fixme, init a_his if guide to a new pos
                    a_his = None
                    continue

                # detect walls and obstacles
                invalids = self.detect_invalids(self.walker.pos_x, self.walker.pos_y, self.walker.pos_z, ROOM)

                # fixme, cut down action space, but for the hall part allow more
                pos_key = str(self.walker.pos_x) + "*" + str(self.walker.pos_z)
                for i in memory[pos_key].keys():
                    if self.detect_which_room() == 0:
                        threshold = 5
                    else:
                        threshold = 2
                    if memory[pos_key][i] >= threshold:
                        invalids.append(self.walker.action_labels.index(i))
                visit[pos_key] += 1

                a, p = self.walker.choose_action(s, invalids)
                if a_his is None:
                    a_his = a

                direction = self.walker.action_labels[a]

                # print(p)
                print(direction)

                memory[pos_key][direction] += 1

                # step next
                if direction == '0':
                    self.walker.reset_walker_pos(self.walker.pos_x, self.walker.pos_y,
                                                 self.walker.pos_z - self.unit)
                elif direction == '45':
                    self.walker.reset_walker_pos(self.walker.pos_x + self.unit, self.walker.pos_y,
                                                 self.walker.pos_z - self.unit)
                elif direction == '90':
                    self.walker.reset_walker_pos(self.walker.pos_x + self.unit, self.walker.pos_y,
                                                 self.walker.pos_z)
                elif direction == '135':
                    self.walker.reset_walker_pos(self.walker.pos_x + self.unit, self.walker.pos_y,
                                                 self.walker.pos_z + self.unit)
                elif direction == '180':
                    self.walker.reset_walker_pos(self.walker.pos_x, self.walker.pos_y,
                                                 self.walker.pos_z + self.unit)
                elif direction == '225':
                    self.walker.reset_walker_pos(self.walker.pos_x - self.unit, self.walker.pos_y,
                                                 self.walker.pos_z + self.unit)
                elif direction == '270':
                    self.walker.reset_walker_pos(self.walker.pos_x - self.unit, self.walker.pos_y,
                                                 self.walker.pos_z)
                elif direction == '315':
                    self.walker.reset_walker_pos(self.walker.pos_x - self.unit, self.walker.pos_y,
                                                 self.walker.pos_z - self.unit)

                # fixme, don't have s_ when get source
                if self.walker.pos_x == self.src_pos_x and self.walker.pos_z == self.src_pos_z:
                    print("get source")
                    DONE = True
                    r = 5
                    s_ = np.array([0 for u in range(self.n_features)])[np.newaxis, :]

                else:
                    # fixme, rebuild reward function
                    # r = self.walker.observe_volume(self.walker.pos_x, self.walker.pos_y, self.walker.pos_z)
                    # r = 1 - abs((a + a_his) % self.n_actions - a_his) / (self.n_actions - 1)
                    pos_key = str(self.walker.pos_x) + "*" + str(self.walker.pos_z)

                    max_angle = max(float(self.walker.action_labels[a]), float(self.walker.action_labels[a_his]))
                    min_angle = min(float(self.walker.action_labels[a]), float(self.walker.action_labels[a_his]))

                    diff = min(abs(max_angle - min_angle), 360 - max_angle + min_angle)

                    r = 1 - diff / 180
                    r -= (visit[pos_key]) * 0.2

                    # # note action has been performed
                    # # fixme, give additional reward when in hall
                    if self.detect_which_room() == 0:
                        for i in range(1, 5):
                            path_temp = self.walker.find_shortest_path(self.walker.pos_x, self.walker.pos_z,
                                                                       self.room_gates[i - 1][0],
                                                                       self.room_gates[i - 1][2])
                            locals()['dis%d' % i] = len(path_temp) - 1

                        sum_dis = 0.0
                        # todo, need calculate for all grids in hall to get max num
                        max_dis = 12

                        for i in range(1, 5):
                            if ROOM[i] is None:
                                sum_dis += locals()['dis%d' % i]

                        # todo, reward should be diff for large distance
                        if sum_dis >= 10:
                            addition = 10
                        else:
                            addition = 0

                        r = 1 - (sum_dis + addition) / max_dis

                    # todo, give punishment when step into false Room
                    # will only first step to gate, then inner room guide until to hall
                    if self.walker.pos_x == 2 and self.walker.pos_z == -2:
                        r -= 1
                    if self.walker.pos_x == 2 and self.walker.pos_z == -1:
                        r -= 1

                    print("x: " + str(self.walker.pos_x))
                    print("z: " + str(self.walker.pos_z))
                    print("reward: " + str(r))
                    # give punishment if detect obstacles
                    # pub = self.detect_invalids(self.walker.pos_x, self.walker.pos_y, self.walker.pos_z)
                    # if len(pub) > 0:
                    #     r -= 0.5

                    s_ = self.walker.observe_gcc_vector(self.walker.pos_x, self.walker.pos_y, self.walker.pos_z)
                    s_ = np.array(s_)[np.newaxis, :]

                sum_reward += r
                a_his = a

                self.walker.learn(s, a, s_, r)
                if DONE:
                    break

            # evaluate epoch
            print("-----------------------------------" + str(sum_reward / step))

예제 #2

파일 보기

class Game:
    def __init__(self):
        """Configure learning sizes, the sound source, the grid and the walker."""
        # sizes handed to the Walker and the loop limits used by play()
        self.n_features, self.n_actions = 366, 8
        self.max_epoch = 100
        self.max_steps = 100

        # define sound source information
        self.src_pos_x, self.src_pos_y, self.src_pos_z = -3.0, 1.6, -3.0

        # sample as a grid map with 0.5m unit; the same coordinates are used
        # for both the x and the z axis
        self.unit = 0.5
        self.room_grids = list(np.arange(-3.5, 3.5 + self.unit, self.unit))

        self.walker = Walker(self.n_features, self.n_actions)

    def detect_invalids(self, x, y, z):
        invalids = []
        if x == 3.5:
            invalids.append(self.walker.action_labels.index('90'))
            invalids.append(self.walker.action_labels.index('45'))
            invalids.append(self.walker.action_labels.index('135'))
        if x == -3.5:
            invalids.append(self.walker.action_labels.index('270'))
            invalids.append(self.walker.action_labels.index('225'))
            invalids.append(self.walker.action_labels.index('315'))
        if z == 3.5:
            invalids.append(self.walker.action_labels.index('180'))
            invalids.append(self.walker.action_labels.index('135'))
            invalids.append(self.walker.action_labels.index('225'))
        if z == -3.5:
            invalids.append(self.walker.action_labels.index('0'))
            invalids.append(self.walker.action_labels.index('315'))
            invalids.append(self.walker.action_labels.index('45'))

        obstable_x = [-1, -0.5, 0, 0.5, 1]
        obstable_z = [-1, -0.5, 0, 0.5, 1]

        if x == 1.5 and z == 1.5:
            invalids.append(self.walker.action_labels.index('315'))
        elif x == 1.5 and z == 1:
            invalids.append(self.walker.action_labels.index('315'))
            invalids.append(self.walker.action_labels.index('270'))
        elif x == 1.5 and z in np.arange(-0.5, 1, 0.5):
            invalids.append(self.walker.action_labels.index('315'))
            invalids.append(self.walker.action_labels.index('270'))
            invalids.append(self.walker.action_labels.index('225'))
        elif x == 1.5 and z == -1:
            invalids.append(self.walker.action_labels.index('225'))
            invalids.append(self.walker.action_labels.index('270'))
        elif x == 1.5 and z == -1.5:
            invalids.append(self.walker.action_labels.index('225'))
        elif x == 1 and z == -1.5:
            invalids.append(self.walker.action_labels.index('225'))
            invalids.append(self.walker.action_labels.index('180'))
        elif x in np.arange(-0.5, 1, 0.5) and z == -1.5:
            invalids.append(self.walker.action_labels.index('225'))
            invalids.append(self.walker.action_labels.index('180'))
            invalids.append(self.walker.action_labels.index('135'))
        elif x == -1 and z == -1.5:
            invalids.append(self.walker.action_labels.index('180'))
            invalids.append(self.walker.action_labels.index('135'))
        elif x == -1.5 and z == -1.5:
            invalids.append(self.walker.action_labels.index('135'))
        elif x == -1.5 and z == -1:
            invalids.append(self.walker.action_labels.index('90'))
            invalids.append(self.walker.action_labels.index('135'))
        elif x == -1.5 and z in np.arange(-0.5, 1, 0.5):
            invalids.append(self.walker.action_labels.index('90'))
            invalids.append(self.walker.action_labels.index('135'))
            invalids.append(self.walker.action_labels.index('45'))
        elif x == -1.5 and z == 1:
            invalids.append(self.walker.action_labels.index('45'))
            invalids.append(self.walker.action_labels.index('90'))
        elif x == -1.5 and z == 1.5:
            invalids.append(self.walker.action_labels.index('45'))
        elif x == -1 and z == 1.5:
            invalids.append(self.walker.action_labels.index('0'))
            invalids.append(self.walker.action_labels.index('45'))
        elif x in np.arange(-0.5, 1, 0.5) and z == 1.5:
            invalids.append(self.walker.action_labels.index('315'))
            invalids.append(self.walker.action_labels.index('0'))
            invalids.append(self.walker.action_labels.index('45'))
        elif x == 1 and z == 1.5:
            invalids.append(self.walker.action_labels.index('315'))
            invalids.append(self.walker.action_labels.index('0'))

        # todo, abstract an obstacle
        return invalids

    def play(self):
        records_step = []
        records_r = []

        for epoch in range(self.max_epoch):
            print("========== Epoch %d ======" % epoch)
            memory = collections.defaultdict(dict)
            visit = {}
            for i in self.room_grids:
                for j in self.room_grids:
                    visit[str(i) + "*" + str(j)] = 0
                    for k in self.walker.action_labels:
                        memory[str(i) + "*" + str(j)][k] = 0

            # init walker position
            # fixme, random choose
            self.walker.reset_walker_pos(3.0, 1, 3.0)
            DONE = False
            sum_reward = 0.0

            a_his = None
            for step in range(self.max_steps):
                s = self.walker.observe_gcc_vector(self.walker.pos_x, self.walker.pos_y, self.walker.pos_z)
                s = np.array(s)[np.newaxis, :]

                # fixme, use grids to detect
                # fixme, cut action space
                invalids = self.detect_invalids(self.walker.pos_x, self.walker.pos_y, self.walker.pos_z)

                pos_key = str(self.walker.pos_x) + "*" + str(self.walker.pos_z)
                for i in memory[pos_key].keys():
                    if memory[pos_key][i] >= 2:
                        invalids.append(self.walker.action_labels.index(i))
                visit[pos_key] += 1

                a, p = self.walker.choose_action(s, invalids)

                # step next state
                direction = self.walker.action_labels[a]

                # fixme, for the first step, give more obs, argmax
                if step == 0:
                    fe = open('first_obs.pkl', 'rb')
                    obs = pickle.load(fe)

                    s_r = obs['right']
                    s_r = np.array(s_r)[np.newaxis, :]
                    a_r, p_r = self.walker.choose_action(s_r, [])
                    p_rr = [p_r[len(p_r) - 2], p_r[len(p_r) - 1]]
                    p_rr = np.append(p_rr, p_r[:len(p_r) - 2])

                    s_l = obs['left']
                    s_l = np.array(s_l)[np.newaxis, :]
                    a_l, p_l = self.walker.choose_action(s_l, [])
                    p_ll = [p_l[0], p_l[1]]
                    p_ll = np.append(p_l[2:], p_ll)

                    s_d = obs['down']
                    s_d = np.array(s_d)[np.newaxis, :]
                    a_d, p_d = self.walker.choose_action(s_d, [])
                    p_dd = [p_d[len(p_d) - 4], p_d[len(p_d) - 3], p_d[len(p_d) - 2], p_d[len(p_d) - 1]]
                    p_dd = np.append(p_dd, p_d[:len(p_d) - 4])

                    # fixme, define first step based on obs, do argmax
                    p_mix = [0] * self.n_actions
                    for i in range(self.n_actions):
                        if i not in invalids:
                            p_mix[i] = p[i] + p_rr[i] + p_ll[i] + p_dd[i]

                    p_mix = np.array(p_mix)
                    p_mix /= p_mix.sum()
                    a_mix = np.argmax(p_mix)

                    fe.close()

                    a = a_mix
                    a_his = a
                    p = p_mix
                    direction = self.walker.action_labels[a]

                # if epoch == 20:
                #     print(p)
                #     print(direction)

                memory[pos_key][direction] += 1

                if direction == '0':
                    self.walker.reset_walker_pos(self.walker.pos_x, self.walker.pos_y,
                                                 self.walker.pos_z - self.unit)
                elif direction == '45':
                    self.walker.reset_walker_pos(self.walker.pos_x + self.unit, self.walker.pos_y,
                                                 self.walker.pos_z - self.unit)
                elif direction == '90':
                    self.walker.reset_walker_pos(self.walker.pos_x + self.unit, self.walker.pos_y,
                                                 self.walker.pos_z)
                elif direction == '135':
                    self.walker.reset_walker_pos(self.walker.pos_x + self.unit, self.walker.pos_y,
                                                 self.walker.pos_z + self.unit)
                elif direction == '180':
                    self.walker.reset_walker_pos(self.walker.pos_x, self.walker.pos_y,
                                                 self.walker.pos_z + self.unit)
                elif direction == '225':
                    self.walker.reset_walker_pos(self.walker.pos_x - self.unit, self.walker.pos_y,
                                                 self.walker.pos_z + self.unit)
                elif direction == '270':
                    self.walker.reset_walker_pos(self.walker.pos_x - self.unit, self.walker.pos_y,
                                                 self.walker.pos_z)
                elif direction == '315':
                    self.walker.reset_walker_pos(self.walker.pos_x - self.unit, self.walker.pos_y,
                                                 self.walker.pos_z - self.unit)

                # fixme, don't have s_ when get source
                if self.walker.pos_x == self.src_pos_x and self.walker.pos_z == self.src_pos_z:
                    print("get source")
                    DONE = True
                    r = 5
                    s_ = np.array([0 for u in range(self.n_features)])[np.newaxis, :]

                else:
                    # fixme, rebuild reward function
                    # r = self.walker.observe_volume(self.walker.pos_x, self.walker.pos_y, self.walker.pos_z)
                    # r = 0
                    pos_key = str(self.walker.pos_x) + "*" + str(self.walker.pos_z)
                    # r /= (visit[pos_key] + 1)

                    max_angle = max(float(self.walker.action_labels[a]), float(self.walker.action_labels[a_his]))
                    min_angle = min(float(self.walker.action_labels[a]), float(self.walker.action_labels[a_his]))

                    diff = min(abs(max_angle - min_angle), 360 - max_angle + min_angle)

                    # r = 1 - abs((a + a_his) % self.n_actions - a_his) / (self.n_actions - 1)
                    r = 1 - diff / 180
                    r -= (visit[pos_key]) * 0.2

                    # todo, think about punishment
                    pub = self.detect_invalids(self.walker.pos_x, self.walker.pos_y, self.walker.pos_z)
                    if len(pub) > 0:
                        r -= 0.5

                    s_ = self.walker.observe_gcc_vector(self.walker.pos_x, self.walker.pos_y, self.walker.pos_z)
                    s_ = np.array(s_)[np.newaxis, :]

                sum_reward += r
                a_his = a

                self.walker.learn(s, a, s_, r)
                if DONE:
                    break

            # fixme, think about a new way to evaluate
            print(step)
            print(sum_reward / step)
            records_step.append(step)
            records_r.append(sum_reward / step)

            # overload now
            if epoch % 500 == 0 and epoch != 0:
                with open('save/rl_8x3x8_src_-3_1.6_-3/records_step', 'w') as f:
                    f.write(str(records_step))
                with open('save/rl_8x3x8_src_-3_1.6_-3/records_reward', 'w') as f:
                    f.write(str(records_r))

예제 #3

파일 보기

class Game:
    def __init__(self):
        """Set up episode limits, the sound source, the floor plan and the agent.

        All geometry is expressed on a grid whose step is ``self.unit``.
        Walls, room gates and room zones are stored as plain lists / dicts
        of coordinates that the other methods test membership against.
        """

        def span(start, stop, step, skip=()):
            # fresh list of np.arange samples, dropping the values in `skip`
            return [v for v in np.arange(start, stop, step) if v not in skip]

        # size of the learning problem and length of a run
        self.n_features, self.n_actions = 366, 8
        self.max_epoch, self.max_steps = 1000, 40

        # sound source position
        self.src_pos_x, self.src_pos_y, self.src_pos_z = -2.0, 1.6, -4.0

        # grid step length (1 m) and the grid points sampled along x and z
        self.unit = 1.0
        self.room_grids_x = list(np.arange(-3.0, 3.0 + self.unit, self.unit))
        self.room_grids_z = list(np.arange(-4.0, 4.0 + self.unit, self.unit))

        # walls and obstacles; the skipped coordinates are openings in a wall
        self.wall_axis_z = {
            -4: span(-5.0, 6.0, 1.0),
            4: span(-5.0, 6.0, 1.0),
            0: span(-5.0, 6.0, 1.0, skip=(0,)),
        }
        self.wall_axis_x = {
            5: span(-4.0, 5.0, 1.0),
            1: span(-4.0, 5.0, 1.0, skip=(-2, 2)),
            -1: span(-4.0, 5.0, 1.0, skip=(-2, 2)),
            -5: span(-4.0, 5.0, 1.0),
        }

        # checkpoints: one gate per room (rooms 1..4 in order) and the hall center
        self.room_gates = [[-2.0, 1, -1.0], [2.0, 1, -1.0], [-2.0, 1, 1.0], [2.0, 1, 1.0]]
        self.hall_center = [[0, 0, 0]]

        # room zones: x / z coordinates that belong to each room
        self.room1_x = list(np.arange(-3.5, 0, 0.5))
        self.room1_z = list(np.arange(-4.5, -1, 0.5))

        self.room2_x = list(np.arange(0.5, 4.0, 0.5))
        self.room2_z = list(np.arange(-4.5, -1, 0.5))

        self.room3_x = list(np.arange(-3.5, 0, 0.5))
        self.room3_z = list(np.arange(1.5, 5.0, 0.5))

        self.room4_x = list(np.arange(0.5, 4.0, 0.5))
        self.room4_z = list(np.arange(1.5, 5.0, 0.5))

        # hall zone
        self.hall_x = list(np.arange(-3.5, 4.0, 0.5))
        self.hall_z = list(np.arange(-0.5, 1.0, 0.5))

        self.walker = Walker(self.n_features, self.n_actions)

        # uniform belief over rooms 1..4 and a neutral likelihood for each
        self.BayeProb = dict.fromkeys(range(1, 5), 0.25)
        self.Pzx = dict.fromkeys(range(1, 5), 1)

    # invalid direction, False room and defined obstacles
    def detect_invalids(self, x, y, z, room):
        invalids = []
        directions = [[x, y, z - self.unit], [x + self.unit, y, z - self.unit],
                      [x + self.unit, y, z], [x + self.unit, y, z + self.unit],
                      [x, y, z + self.unit], [x - self.unit, y, z + self.unit],
                      [x - self.unit, y, z], [x - self.unit, y, z - self.unit]]

        for direction in directions:
            # along x axis, fix z, change x
            if self.wall_axis_x.get(direction[2]) is not None:
                if direction[0] in self.wall_axis_x[direction[2]]:
                    invalids.append(self.walker.action_labels.index(str(directions.index(direction) * 45)))

            # along z axis, fix x, change z
            if self.wall_axis_z.get(direction[0]) is not None:
                if direction[2] in self.wall_axis_z[direction[0]]:
                    invalids.append(self.walker.action_labels.index(str(directions.index(direction) * 45)))

        if room[4] is False:
            for direction in directions:
                if (direction[0] in self.room4_x and direction[2] in self.room4_z) or (
                        direction[0] == self.room_gates[3][0] and direction[2] == self.room_gates[3][2]):
                    invalids.append(self.walker.action_labels.index(str(directions.index(direction) * 45)))
        if room[3] is False:
            for direction in directions:
                if (direction[0] in self.room3_x and direction[2] in self.room3_z) or (
                        direction[0] == self.room_gates[2][0] and direction[2] == self.room_gates[2][2]):
                    invalids.append(self.walker.action_labels.index(str(directions.index(direction) * 45)))
        if room[2] is False:
            for direction in directions:
                if (direction[0] in self.room2_x and direction[2] in self.room2_z) or (
                        direction[0] == self.room_gates[1][0] and direction[2] == self.room_gates[1][2]):
                    invalids.append(self.walker.action_labels.index(str(directions.index(direction) * 45)))
        if room[1] is False:
            for direction in directions:
                if (direction[0] in self.room1_x and direction[2] in self.room1_z) or (
                        direction[0] == self.room_gates[0][0] and direction[2] == self.room_gates[0][2]):
                    invalids.append(self.walker.action_labels.index(str(directions.index(direction) * 45)))

        if room[0] is False:
            for direction in directions:
                if direction[0] in self.hall_x and direction[2] in self.hall_z:
                    invalids.append(self.walker.action_labels.index(str(directions.index(direction) * 45)))

        return invalids

    # return 1, 2, 3, 4 room, 0-hall
    def detect_which_room(self):
        if self.walker.pos_x in self.room1_x and self.walker.pos_z in self.room1_z:
            return 1
        elif self.walker.pos_x in self.room2_x and self.walker.pos_z in self.room2_z:
            return 2
        elif self.walker.pos_x in self.room3_x and self.walker.pos_z in self.room3_z:
            return 3
        elif self.walker.pos_x in self.room4_x and self.walker.pos_z in self.room4_z:
            return 4
        elif self.walker.pos_x in self.hall_x and self.walker.pos_z in self.hall_z:
            return 0
        else:
            return -1

    """
        based on GUIDE path to learn actions:
        - learn: from inner room guide to gate; avoid obstacles
        - not learn: gate into inner room

        - reward: diff in angle
    """

    def learn_guide_actions(self, path, visit):
        a_his = None

        for pos in path:
            if path.index(pos) == len(path) - 2:
                break
            s = self.walker.observe_gcc_vector(pos[0], self.walker.pos_y, pos[1])
            s = np.array(s)[np.newaxis, :]

            pos_key = str(pos[0]) + "*" + str(pos[1])
            visit[pos_key] += 1

            pos_next = path[path.index(pos) + 1]
            s_ = self.walker.observe_gcc_vector(pos_next[1], self.walker.pos_y, pos_next[1])
            s_ = np.array(s_)[np.newaxis, :]

            # get action
            if pos_next[0] - pos[0] == 0 and pos_next[1] - pos[1] == -self.unit:
                a = 0
            elif pos_next[0] - pos[0] == self.unit and pos_next[1] - pos[1] == -self.unit:
                a = 1
            elif pos_next[0] - pos[0] == self.unit and pos_next[1] - pos[1] == 0:
                a = 2
            elif pos_next[0] - pos[0] == self.unit and pos_next[1] - pos[1] == self.unit:
                a = 3
            elif pos_next[0] - pos[0] == 0 and pos_next[1] - pos[1] == self.unit:
                a = 4
            elif pos_next[0] - pos[0] == -self.unit and pos_next[1] - pos[1] == self.unit:
                a = 5
            elif pos_next[0] - pos[0] == -self.unit and pos_next[1] - pos[1] == 0:
                a = 6
            elif pos_next[0] - pos[0] == -self.unit and pos_next[1] - pos[1] == -self.unit:
                a = 7
            else:
                print("Wrong action get from GUIDE path... ")
                a = None

            if a_his is None:
                a_his = a

            # get diff reward
            max_angle = max(float(self.walker.action_labels[a]), float(self.walker.action_labels[a_his]))
            min_angle = min(float(self.walker.action_labels[a]), float(self.walker.action_labels[a_his]))

            diff = min(abs(max_angle - min_angle), 360 - max_angle + min_angle)

            r = 1 - diff / 180

            pos_key = str(pos_next[0]) + "*" + str(pos_next[1])
            r -= (visit[pos_key]) * 0.2

            self.walker.learn(s, a, s_, r)
            a_his = a

    """
        Try use different way to do
    """

    def calculate_cov(self, z_real, z_exp):
        length = len(z_real)

        y = np.array(z_real)
        x = np.array(z_exp)

        x_avg = np.average(x)
        y_avg = np.average(y)

        xy = [(x[i] - x_avg) * (y[i] - y_avg) for i in range(length)]
        cov_xy = np.sum(xy)

        pow_x = [pow(float(x[i] - x_avg), 2.0) for i in range(length)]
        theta_x = math.sqrt(np.sum(pow_x))

        pow_y = [pow(float(y[i] - y_avg), 2.0) for i in range(length)]
        theta_y = math.sqrt(np.sum(pow_y))

        r = cov_xy / (theta_x * theta_y)  # [-1, 1]
        pear = r + 1  # [0, 2]

        # the larger pear means more similar
        return pear

    # when observe 1-dim vector s, update its confidence to all rooms
    def update_bayesia(self):
        key = str(float(self.walker.pos_x)) + "_" + str(self.walker.pos_y) + "_" + str(
            float(self.walker.pos_z))

        z_real = self.walker.observe_volume(self.walker.pos_x, self.walker.pos_y, self.walker.pos_z)
        for room in range(1, 5):
            simu = open('simu_r%d_vol.pkl' % room, 'rb')
            exp = pickle.load(simu)
            simu.close()

            # if meet (2,3) center in room, just use previous pzx;
            if exp.get(key) is not None:
                z_exp = exp[key]

                # cov = self.calculate_cov(z_real, z_exp)

                # 7 is a constant value, need to measure before doing experiment
                diff_vol = 7 - np.abs(np.average(z_real) - np.average(z_exp))
                if self.walker.pos_x == 2 and self.walker.pos_z == -1 and room == 1:
                    diff_vol += 5

                # larger, more similar
                self.Pzx[room] = diff_vol

        # need one learning rate
        baye_sum = 0.
        for room in range(1, 5):
            self.BayeProb[room] *= self.Pzx[room]
            baye_sum += self.BayeProb[room]

        # rescale baye prob
        for room in range(1, 5):
            self.BayeProb[room] /= baye_sum

    def play(self):
        for epoch in range(self.max_epoch):
            print("========== Epoch %d ======" % epoch)

            # init historical track
            memory = collections.defaultdict(dict)
            visit = {}
            for i in self.room_grids_x:
                for j in self.room_grids_z:
                    visit[str(i) + "*" + str(j)] = 0
                    for k in self.walker.action_labels:
                        memory[str(i) + "*" + str(j)][k] = 0

            """
                Init part
            """
            self.walker.reset_walker_pos(2.0, 1, 4.0)
            DONE = False
            sum_reward = 0.0
            a_his = None
            ROOM = [None] * 5

            self.BayeProb = {1: 0.25, 2: 0.25, 3: 0.25, 4: 0.25}
            self.Pzx = {1: 1, 2: 1, 3: 1, 4: 1}

            """
                get first observation and update 
            """

            s = self.walker.observe_gcc_vector(self.walker.pos_x, self.walker.pos_y, self.walker.pos_z)
            self.update_bayesia()
            print(self.Pzx)
            print(self.BayeProb)
            s = np.array(s)[np.newaxis, :]

            for step in range(self.max_steps):
                # Note: always has a s (init or s = s_)
                print("************** step %d" % step)

                print("x: " + str(self.walker.pos_x))
                print("z: " + str(self.walker.pos_z))

                # todo, if use A* to guide in some situations, update Bayesia, update s, continue; learn if needed

                invalids = self.detect_invalids(self.walker.pos_x, self.walker.pos_y, self.walker.pos_z, ROOM)

                # todo, append invalids
                pos_key = str(self.walker.pos_x) + "*" + str(self.walker.pos_z)
                for i in memory[pos_key].keys():
                    if self.detect_which_room() == 0:
                        threshold = 10
                    else:
                        threshold = 2
                    if memory[pos_key][i] >= threshold:
                        invalids.append(self.walker.action_labels.index(i))

                a, p = self.walker.choose_action(s, invalids)

                if a_his is None:
                    a_his = a

                direction = self.walker.action_labels[a]
                memory[pos_key][direction] += 1
                visit[pos_key] += 1

                """
                    Apply movement
                """

                if direction == '0':
                    self.walker.reset_walker_pos(self.walker.pos_x, self.walker.pos_y,
                                                 self.walker.pos_z - self.unit)
                elif direction == '45':
                    self.walker.reset_walker_pos(self.walker.pos_x + self.unit, self.walker.pos_y,
                                                 self.walker.pos_z - self.unit)
                elif direction == '90':
                    self.walker.reset_walker_pos(self.walker.pos_x + self.unit, self.walker.pos_y,
                                                 self.walker.pos_z)
                elif direction == '135':
                    self.walker.reset_walker_pos(self.walker.pos_x + self.unit, self.walker.pos_y,
                                                 self.walker.pos_z + self.unit)
                elif direction == '180':
                    self.walker.reset_walker_pos(self.walker.pos_x, self.walker.pos_y,
                                                 self.walker.pos_z + self.unit)
                elif direction == '225':
                    self.walker.reset_walker_pos(self.walker.pos_x - self.unit, self.walker.pos_y,
                                                 self.walker.pos_z + self.unit)
                elif direction == '270':
                    self.walker.reset_walker_pos(self.walker.pos_x - self.unit, self.walker.pos_y,
                                                 self.walker.pos_z)
                elif direction == '315':
                    self.walker.reset_walker_pos(self.walker.pos_x - self.unit, self.walker.pos_y,
                                                 self.walker.pos_z - self.unit)

                print("apply movement: " + direction)

                """
                    Receive new state, Update Bayesian Probs
                    todo, compute new Bayesian Probs：p(z|x) * Bel(x t-1), update pzx and BayeProb
                    fixme, observe z with 366 dim,
                """

                # reach source
                if self.walker.pos_x == self.src_pos_x and self.walker.pos_z == self.src_pos_z:
                    print("get source")
                    DONE = True
                    r = 10
                    s_ = np.array([0 for u in range(self.n_features)])[np.newaxis, :]

                else:
                    s_ = self.walker.observe_gcc_vector(self.walker.pos_x, self.walker.pos_y, self.walker.pos_z)
                    self.update_bayesia()
                    print(self.Pzx)
                    print(self.BayeProb)
                    s_ = np.array(s_)[np.newaxis, :]

                    # use guide based on Prob
                    # if self.BayeProb[1] > 0.95 and self.detect_which_room() != 1:
                    #     print("guide to Room 1")
                    #     self.walker.reset_walker_pos(-2.0, 1, -2.0)
                    #     s = s_
                    #     a_his = a
                    #     continue

                    print("x: " + str(self.walker.pos_x))
                    print("z: " + str(self.walker.pos_z))

                    # todo, design reward feedback, [explore + entropy + angle_diff]
                    # pos_key = str(self.walker.pos_x) + "*" + str(self.walker.pos_z)
                    #
                    # max_angle = max(float(self.walker.action_labels[a]), float(self.walker.action_labels[a_his]))
                    # min_angle = min(float(self.walker.action_labels[a]), float(self.walker.action_labels[a_his]))
                    #
                    # diff = min(abs(max_angle - min_angle), 360 - max_angle + min_angle)
                    #
                    # r = 1 - diff / 180
                    # r -= (visit[pos_key]) * 0.2

                    # when walker is in room 4, aim to out room, ok and can converge
                    if self.detect_which_room() == 4 or (self.walker.pos_x == 2 and self.walker.pos_z == 1):
                        path_temp = self.walker.find_shortest_path(self.walker.pos_x, self.walker.pos_z,
                                                                   self.room_gates[3][0],
                                                                   self.room_gates[3][2])
                        dis = len(path_temp) - 1

                        # r = 3 - dis * (self.BayeProb[1] + self.BayeProb[2] + self.BayeProb[3])
                        r = 1 - dis * (self.BayeProb[1] + self.BayeProb[2] + self.BayeProb[3]) / 3

                    # when walker is in room 1, aim to reach sound, ok and can converge
                    elif self.detect_which_room() == 1 or (self.walker.pos_x == -2 and self.walker.pos_z == -1):
                        max_angle = max(float(self.walker.action_labels[a]), float(self.walker.action_labels[a_his]))
                        min_angle = min(float(self.walker.action_labels[a]), float(self.walker.action_labels[a_his]))

                        diff = min(abs(max_angle - min_angle), 360 - max_angle + min_angle)

                        r = 1 - diff / 180

                        pos_key = str(self.walker.pos_x) + "*" + str(self.walker.pos_z)
                        r -= (visit[pos_key]) * 0.2

                    # todo, adjust actions in hall and other rooms
                    else:
                        # todo, if set room with low prob to be false ?
                        ROOM[4] = False
                        ROOM[2] = False
                        for i in range(1, 5):
                            path_temp = self.walker.find_shortest_path(self.walker.pos_x, self.walker.pos_z,
                                                                       self.room_gates[i - 1][0],
                                                                       self.room_gates[i - 1][2])
                            locals()['dis%d' % i] = len(path_temp) - 1

                        sum_dis = 0.0
                        for i in range(1, 5):
                            sum_dis += locals()['dis%d' % i] * self.BayeProb[i]

                        # todo, reward should be diff for large distance, i.e., value of 4 need be careful

                        if sum_dis >= 4:
                            addition = 10
                        else:
                            addition = 0
                        r = 1 - (sum_dis + addition) / 4

                        # if self.detect_which_room() == 2:
                        #     r = -0.5
                        #
                        # if self.walker.pos_x == 3.0 and self.walker.pos_z == 0:
                        #     r = 0.1
                        # if self.walker.pos_x == 2.0 and self.walker.pos_z == 0:
                        #     r = 0.5
                        # if self.walker.pos_x == 1.0 and self.walker.pos_z == 0:
                        #     r = 0.7
                        # if self.walker.pos_x == 0.0 and self.walker.pos_z == 0:
                        #     r = 0.8
                        # if self.walker.pos_x == -1.0 and self.walker.pos_z == 0:
                        #     r = 0.9
                        # if self.walker.pos_x == -2.0 and self.walker.pos_z == 0:
                        #     r = 0.9
                        # if self.walker.pos_x == -3.0 and self.walker.pos_z == 0:
                        #     r = 0.6

                    # if self.detect_which_room() == 0:
                    # for i in range(1, 5):
                    #     path_temp = self.walker.find_shortest_path(self.walker.pos_x, self.walker.pos_z,
                    #                                                self.room_gates[i - 1][0],
                    #                                                self.room_gates[i - 1][2])
                    #     locals()['dis%d' % i] = len(path_temp) - 1
                    #
                    # sum_dis = 0.0
                    # # todo, need calculate for all grids in hall to get max num
                    # max_dis = 12
                    #
                    # for i in range(1, 5):
                    #     if ROOM[i] is None:
                    #         sum_dis += locals()['dis%d' % i] * self.BayeProb[i]
                    #
                    # print(sum_dis)
                    #
                    # # todo, reward should be diff for large distance
                    # if sum_dis >= 5:
                    #     addition = 10
                    # else:
                    #     addition = 0
                    #
                    # r = 1 - (sum_dis + addition) / max_dis

                    print("reward: " + str(r))

                """
                    Learn part, ready for next step
                """
                # todo, use multiple td_error backward
                self.walker.learn(s, a, s_, r)
                sum_reward += r
                a_his = a
                s = s_

                if DONE:
                    break