    def demo(self, demo_count=50):
        # Collects demo_count demonstration episodes via block_world.run_environment()
        # and serializes the recorded goal/action pairs for later supervised Q-learning.
        all_actions_taken = []
        for _ in range(demo_count):
            block_world = BlockWorld(self.states_x,
                                     self.states_y,
                                     self.blocks_count,
                                     1,
                                     record=False)
            goal = block_world.goal_config
            all_actions_taken.append({
                "goal": goal,
                "actions": block_world.run_environment()
            })
        RLTrainer.serialize_actions(all_actions_taken)
    def random_exploration2(self):
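        # Debug rollout: sample actions from get_random_action_from_prev_action
        # and render until block_world.get_reward() returns 0; no Q-values are learned here.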
        episode_count = 2
        prev_action = None
        for ep in range(episode_count):
            block_world = BlockWorld(self.states_x,
                                     self.states_y,
                                     self.blocks_count,
                                     1,
                                     record=True)
            print("Goal: ", [
                COLORS_STR[i] for stack in block_world.goal_config
                for i in stack
            ])
            while block_world.get_reward() != 0:
                block_world.pre_render()
                action, block_id = self.get_random_action_from_prev_action(
                    prev_action)
                print("Action chosen :", action, block_id)
                if action != Action.DROP and action != Action.PICK:
                    block_world.move_block_by_action(action, block_id)

                prev_action = action, block_id
                block_world.render()
    def q_learning_supervised(self):
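        # Replays recorded demonstrations (see demo/serialize_actions) and fills a
        # Q-table shaped q[state][(action, block_id)] with the standard update.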
        gamma = 0.5
        alpha = 0.5

        q = defaultdict(lambda: defaultdict(lambda: 0))
        demos = RLTrainer.deserialize_actions()

        for demo in demos:
            block_world = BlockWorld(self.states_x,
                                     self.states_y,
                                     self.blocks_count,
                                     1,
                                     record=False)
            block_world.create_goal(demo["goal"])
            for action in demo["actions"]:
                curr_state = BlockWorld.convert_state_dict_to_tuple(
                    action["state"])
                action, sel_id = BlockWorld.parse_action(action["action"])
                if action != Action.FINISHED:
                    block_id = sel_id if action == Action.PICK else curr_state[
                        -1]
                    next_state = block_world.get_next_state_based_on_state_tuple(
                        curr_state, (action, block_id))
                    new_reward = block_world.get_reward_for_state(
                        next_state, block_world.goal_config)
                    q_i = q[curr_state][(action, block_id)]
                    if len(q[next_state]) > 0:
                        max_q = max([
                            q[next_state][a_dash] for a_dash in q[next_state]
                        ])
                    else:
                        max_q = 0
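                    # Q-learning update:
                    #   Q(s, a) <- (1 - alpha) * Q(s, a) + alpha * (r + gamma * max_a' Q(s', a'))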
                    q[curr_state][(action, block_id)] = (
                        (1 - alpha) * q_i) + (alpha *
                                              (new_reward + gamma * max_q))
        return q
    def test_q_learning_real(self, q_old, starting_nu=0.1):
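        # Evaluation rollout with a previously learned Q-table (q_old); nu is
        # forwarded to get_next_action, and the loop stops once the combined
        # reward reaches 100 or iteration_count steps have elapsed.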
        nu = starting_nu
        block_world = BlockWorld(self.states_x,
                                 self.states_y,
                                 self.blocks_count,
                                 self.stack_count,
                                 record=False)
        cnt = 0
        while cnt < self.iteration_count:
            cnt += 1
            block_world.pre_render()
            curr_state = block_world.get_state_as_tuple_pramodith2()
            if self.debug and curr_state in q_old:
                print("Current State: %s" + str(curr_state), q_old[curr_state])
            action, block_id = self.get_next_action(curr_state, q_old, nu)
            if self.debug: print("Action: ", action, block_id)

            next_state = block_world.get_next_state_based_on_state_tuple(
                curr_state, (action, block_id))
            new_reward = block_world.get_reward_for_state(
                next_state, curr_state)
            new_reward += block_world.get_reward_for_state_action_pramodith(
                curr_state, next_state)
            print("Reward")
            print(new_reward)

            if new_reward >= 100:
                print("Converged in %d", cnt)
                return cnt

            print("q:", q_old.get(str(curr_state), None))
            block_world.update_state_from_tuple_pramodith(next_state)

            block_world.render()  # time.sleep(0.1)
        return cnt
    def q_learning_real(self,
                        starting_nu=0.0,
                        use_old=True,
                        record=False,
                        demo_id=1,
                        goal_config=None):
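        # Interactive Q-learning with a human in the loop: pygame events let an
        # expert pause with SPACE, pick a block with the mouse, and nudge it with
        # the arrow keys; otherwise the action comes from get_next_action.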
        alpha = 1
        gamma = 0.1
        action = None
        block_id = None  # initialized here so the expert-pause branch below cannot reference it before assignment
        picked = False
        paused = False
        user_choice = True
        user_motion_pick = False
        rendered_pick = True
        record_actions = {}
        if use_old:
            if demo_id == 1:
                q_old = Demonstrations.load_obj("q_table/q_3_blocks_all_goals")
            else:
                q_old = Demonstrations.load_obj("q_table/q_demo_" +
                                                str(demo_id - 1))
        else:
            q_old = {}
        nu = starting_nu
        block_world = BlockWorld(self.states_x,
                                 self.states_y,
                                 self.blocks_count,
                                 self.stack_count,
                                 record=False,
                                 goal_config=goal_config)
        if record:
            record_actions["starting_state"] = [
                (block_world.block_dict[i].rect.centerx,
                 block_world.block_dict[i].rect.centery)
                for i in range(self.blocks_count)
            ]
            record_actions["goal_config"] = [block_world.goal_config]
            record_actions["actions"] = []
        if self.debug:
            print("Goal: ", [[COLORS_STR[i] for i in stack if i >= 0]
                             for stack in block_world.goal_config])

        cnt = 0
        q = q_old.copy()
        while cnt < self.iteration_count:
            cnt += 1
            block_world.pre_render()
            curr_state = block_world.get_state_as_tuple_pramodith2()
            if curr_state not in q:
                q[curr_state] = {}
            if self.debug: print("Current State: ", curr_state)
            user_choice = True
            action = None
            user_motion_pick = False
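            # Poll pygame events while the expert has the run paused; any manual
            # pick/move overrides the policy's action for this step.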
            while user_choice or paused:
                time.sleep(0.5)
                for event in pygame.event.get():
                    if event.type == KEYDOWN:
                        print("")
                        if event.key == K_SPACE:
                            if not paused:
                                paused = True
                            else:
                                if block_id is not None:
                                    block_world.block_dict[block_id].surf.fill(
                                        COLORS[block_id])
                                user_motion_pick = False
                                paused = False
                                user_choice = False
                        if rendered_pick and paused:
                            print('Waiting for user correction.')
                            print(block_id)
                            if event.key == K_UP:
                                print("d")
                                user_choice = True
                                picked = False
                                user_motion_pick = True
                                action = Action.MOVE_UP
                            elif event.key == K_DOWN:
                                print("d")
                                user_choice = True
                                picked = False
                                user_motion_pick = True
                                action = Action.MOVE_DOWN
                            elif event.key == K_LEFT:
                                print("d")
                                user_choice = True
                                user_motion_pick = True
                                picked = False
                                action = Action.MOVE_LEFT
                            elif event.key == K_RIGHT:
                                print("d")
                                user_choice = True
                                user_motion_pick = True
                                picked = False
                                action = Action.MOVE_RIGHT
                    elif event.type == pygame.MOUSEBUTTONDOWN:
                        if paused:
                            pos = pygame.mouse.get_pos()
                            for block in block_world.block_dict.values():
                                if block.rect.collidepoint(pos):
                                    if block_id is not None:
                                        block_world.block_dict[
                                            block_id].surf.fill(
                                                COLORS[block_id])
                                    action = Action.PICK
                                    block_id = block.id
                                    user_choice = True
                                    picked = True
                                    block_world.block_dict[block_id].surf.fill(
                                        CLICKED_COLOR[block_id])
                                    rendered_pick = False
                                    break

                if not user_motion_pick and paused == False:
                    user_choice = False
                if paused == False or (not rendered_pick or user_motion_pick):
                    break

            if not user_choice:
                action, block_id = self.get_next_action(curr_state, q, nu)

                if record:
                    record_actions["actions"].append(
                        ('algorithm', action, block_id))
            else:
                if action == Action.PICK:
                    rendered_pick = True
                print("Skipping the model's choice to listen to the expert")
                if record and action:
                    record_actions["actions"].append(
                        ('user', action, block_id))

            if self.debug: print("Action: ", action, block_id)
            next_state = block_world.get_next_state_based_on_state_tuple(
                curr_state, (action, block_id))
            new_reward = block_world.get_reward_for_state(
                next_state, curr_state)
            new_reward += block_world.get_reward_for_state_action_pramodith(
                curr_state, next_state)
            if new_reward > 1 or new_reward < -1:
                if self.debug: print("next_state: ", next_state)
                if self.debug: print("new_reward: ", new_reward)

            if (action, block_id) in q[curr_state]:
                q_sa = q[curr_state][(action, block_id)]
            else:
                q_sa = 0
                q[curr_state][(action, block_id)] = 0

            if next_state in q and len(q[next_state]) > 0:
                max_q_dash_s_dash_a_dash = max(
                    [q[next_state][a_dash] for a_dash in q[next_state]])
            else:
                max_q_dash_s_dash_a_dash = 0
            if self.debug: print("max_q:", max_q_dash_s_dash_a_dash)
            if new_reward > 70:
                q[curr_state][(action, block_id)] = (
                    (1 - alpha) * q_sa) + (alpha * (new_reward))
                break
            else:
                q[curr_state][(
                    action,
                    block_id)] += alpha * (new_reward + gamma *
                                           (max_q_dash_s_dash_a_dash) - q_sa)

            if self.debug: print("q:", q[curr_state][(action, block_id)])

            block_world.update_state_from_tuple_pramodith(next_state)

            block_world.render()

            time.sleep(0.1)
        pygame.display.quit()
        Demonstrations.save_obj(q, "q_table/q_demo_" + str(demo_id))
        Demonstrations.save_obj(record_actions,
                                "state_action_recording/demo_" + str(demo_id))
    def random_exploration(self):
        # Random rollout that also writes crude one-step value estimates into a
        # flat Q dictionary keyed by (state, action).
        gamma = 0.1
        q = defaultdict(lambda: 0)
        episode_count = 2
        prev_action = None
        for ep in range(episode_count):
            block_world = BlockWorld(self.states_x,
                                     self.states_y,
                                     self.blocks_count,
                                     1,
                                     record=True)
            print("Goal: ", [
                COLORS_STR[i] for stack in block_world.goal_config
                for i in stack
            ])
            while block_world.get_reward() != 0:
                block_world.pre_render()
                state = block_world.get_state_as_tuple()
                action, block_id = self.get_random_action_from_prev_action(
                    prev_action)
                next_state = block_world.get_next_state_based_on_state_tuple(
                    state, (action, block_id))
                q_val = gamma * max([
                    q[next_state, b]
                    for b in self.get_allowed_actions_from_prev_action(
                        (action, block_id))
                ])
                q[(block_world.get_state_as_tuple(),
                   action)] = block_world.get_reward_for_state(
                       state, block_world.goal_config.tolist()) + q_val
                block_world.update_state_from_tuple(next_state)
                prev_action = action, block_id
                block_world.render()
    def q_learning_random(self):
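        # Baseline that follows get_next_action without updating any Q-values and
        # counts how many episodes reach the goal configuration.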
        episode_count = 100
        success_count = 0
        # q and nu are never defined in the original snippet; assume an empty
        # Q-table and a fully exploratory nu so get_next_action can be called.
        q = defaultdict(lambda: defaultdict(lambda: 0))
        nu = 1.0

        for ep in range(episode_count):
            block_world = BlockWorld(self.states_x,
                                     self.states_y,
                                     self.blocks_count,
                                     1,
                                     record=False)
            print("Goal: %s" % [
                COLORS_STR[i] for stack in block_world.goal_config
                for i in stack
            ])
            for iteration in range(5000):
                block_world.pre_render()

                curr_state = block_world.get_state_as_tuple()
                action, block_id = self.get_next_action(curr_state, q, nu)

                next_state = block_world.get_next_state_based_on_state_tuple(
                    curr_state, (action, block_id))
                is_goal_next = block_world.get_reward_for_state(
                    next_state, block_world.goal_config) == 0
                if self.debug:
                    print("Current State: ", curr_state, is_goal_next)

                block_world.update_state_from_tuple(next_state)

                block_world.render()
                if is_goal_next:
                    print("Goal State Reached!!! in %d iterations" % iteration)
                    success_count += 1
                    break

                if iteration % 100 == 1:
                    print(ep, iteration)
                if self.debug: print(iteration)
        print("success_count: ", success_count)
    def q_learning(self,
                   q=None,
                   starting_nu=1.0,
                   decay_nu=True,
                   decay_rate=0.9995):
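        # Plain tabular Q-learning; nu is reset to starting_nu each episode and,
        # when decay_nu is set, decayed by decay_rate after the first 50 iterations.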
        gamma = 0.1
        alpha = 1
        episode_count = 100
        if not q:
            q = defaultdict(lambda: defaultdict(lambda: 0))
        success_count = 0

        for ep in range(episode_count):
            block_world = BlockWorld(self.states_x,
                                     self.states_y,
                                     self.blocks_count,
                                     1,
                                     record=False)
            nu = starting_nu
            print("Goal: ", [
                COLORS_STR[i] for stack in block_world.goal_config
                for i in stack
            ])
            for iteration in range(self.iteration_count):
                block_world.pre_render()

                curr_state = block_world.get_state_as_tuple()
                if self.debug: print("Current State: ", curr_state)
                action, block_id = self.get_next_action(curr_state, q, nu)
                if self.debug: print("Action: ", action, block_id)

                next_state = block_world.get_next_state_based_on_state_tuple(
                    curr_state, (action, block_id))
                new_reward = block_world.get_reward_for_state(
                    next_state, block_world.goal_config)
                if self.debug: print("next_state: ", next_state)
                if self.debug: print("new_reward: ", new_reward)

                q_i = q[curr_state][(action, block_id)]

                if len(q[next_state]) > 0:
                    max_q = max(
                        [q[next_state][a_dash] for a_dash in q[next_state]])
                else:
                    max_q = 0

                if self.debug: print("max_q:", max_q)
                q[curr_state][(action, block_id)] = (
                    (1 - alpha) * q_i) + (alpha * (new_reward + gamma * max_q))
                if self.debug: print("q:", q[curr_state][(action, block_id)])

                block_world.update_state_from_tuple(next_state)

                block_world.render()
                if new_reward == 1:
                    print("Goal State Reached!!! in %d iterations" % iteration)
                    success_count += 1
                    break

                if decay_nu and iteration > 50:
                    nu = decay_rate * nu

                if iteration % 100 == 1:
                    print("EP[%d]It[%d]: Q[%d], nu:[%f]" %
                          (ep, iteration, len(q), nu))

        print("success_count: ", success_count)
    def q_learning_real(self, starting_nu=0.0, use_old=True):
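        # Non-interactive variant of q_learning_real: optionally warm-starts from a
        # saved Q-table ("Q/q_oracle"), learns for iteration_count steps, and saves
        # the updated table back to the same path.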
        alpha = 1
        gamma = 0.1
        actions_queue = deque(maxlen=5)
        state_distance_queue = deque(maxlen=6)
        converged = False
        if use_old:
            q_old = RLTrainer.load_obj("Q\q_oracle")
        else:
            q_old = {}

        nu = starting_nu
        block_world = BlockWorld(self.states_x,
                                 self.states_y,
                                 self.blocks_count,
                                 self.stack_count,
                                 record=False)
        if self.debug:
            print("Goal: ", [[COLORS_STR[i] for i in stack if i >= 0]
                             for stack in block_world.goal_config])
        state_s = State([[
            block_world.block_dict[i].rect.centerx,
            block_world.block_dict[i].rect.centery
        ] for i in range(self.blocks_count)],
                        selected_index=block_world.selected_block_id,
                        goal_config=block_world.goal_config,
                        screen_dims=(block_world.screen_width,
                                     block_world.screen_height))
        block_world.goal_loc = state_s.goal_positions

        remaining_prob = 1 - nu
        do_next = 0
        cnt = 0
        q = q_old.copy()
        while cnt < self.iteration_count:
            cnt += 1
            #while not converged:
            block_world.pre_render(True)

            curr_state = block_world.get_state_as_tuple_pramodith2()
            if curr_state not in q:
                q[curr_state] = {}
            #print("Current State: ", curr_state)
            '''
            if np.random.rand() < 0.1:
                state_s = State([[block_world.block_dict[i].rect.centerx, block_world.block_dict[i].rect.centery] for i in range(self.blocks_count)],
                    selected_index=block_world.selected_block_id, goal_config=block_world.goal_config,
                    screen_dims=(block_world.screen_width, block_world.screen_height))
                if state_s.goal_reached():
                    print("REACHED...")
                    break
                # block_world.goal_loc = state_s.goal_positions
                action, block_id = self.get_next_action_supervised_t(state_t=None, state_s=state_s, q=None, nu=0)
            else:
               
            
            if action==Action.PICK or action==Action.DROP:
                    actions_queue.append(0)
            else:
                    actions_queue.append(1)

            #action, block_id = self.get_next_action(curr_state, q, nu)


            
            state_distance_queue.append(curr_state[0])
            if len(state_distance_queue)==6:
                 if (len(set(list(state_distance_queue)[0::2]))==1 and len(set(list(state_distance_queue)[1::2]))==1) or do_next>0:
                    state_s = State([[block_world.block_dict[i].rect.centerx, block_world.block_dict[i].rect.centery] for i in range(self.blocks_count)],
                         selected_index=block_world.selected_block_id, goal_config=block_world.goal_config,
                         screen_dims=(block_world.screen_width, block_world.screen_height))
                    action, block_id = self.get_next_action_supervised_t(state_t=None, state_s=state_s, q=None, nu=0)

            '''
            action, block_id = self.get_next_action(curr_state, q, nu)
            if self.debug: print("Action: ", action, block_id)
            next_state = block_world.get_next_state_based_on_state_tuple(
                curr_state, (action, block_id))
            new_reward = block_world.get_reward_for_state(
                next_state, curr_state)
            new_reward += block_world.get_reward_for_state_action_pramodith(
                curr_state, next_state)
            if new_reward > 1 or new_reward < -1:
                if self.debug: print("next_state: ", next_state)
                if self.debug: print("new_reward: ", new_reward)

            # ever_seen_goal = ever_seen_goal or new_reward == 1
            if (action, block_id) in q[curr_state]:
                q_sa = q[curr_state][(action, block_id)]
            else:
                q_sa = 0
                q[curr_state][(action, block_id)] = 0

            if next_state in q and len(q[next_state]) > 0:
                max_q_dash_s_dash_a_dash = max(
                    [q[next_state][a_dash] for a_dash in q[next_state]])
            else:
                max_q_dash_s_dash_a_dash = 0
            if self.debug: print("max_q:", max_q_dash_s_dash_a_dash)
            if new_reward > 70:
                q[curr_state][(action, block_id)] = (
                    (1 - alpha) * q_sa) + (alpha * (new_reward))
                return
            else:
                q[curr_state][(
                    action,
                    block_id)] += alpha * (new_reward + gamma *
                                           (max_q_dash_s_dash_a_dash) - q_sa)

            if self.debug: print("q:", q[curr_state][(action, block_id)])

            block_world.update_state_from_tuple_pramodith(next_state)

            if cnt > 4000 and cnt % 250 == 0 and nu > 0.05:
                alpha -= 0.1
            # print(cnt)
            #    nu-=0.1
            #nu *= 0.9995

            block_world.render()

            # converged = ever_seen_goal and q == q_old
            #q_old = q
            # time.sleep(0.1)
        pygame.display.quit()
        #self.test_q_learning_real(q)
        RLTrainer.save_obj(q, "Q/q_oracle")
    def test_q_learning_real(self, q_old, starting_nu=0.1):
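        # Same evaluation rollout as above, but every rendered frame is saved as a
        # JPEG under sample_videos/ and the loop exits one step after a reward
        # above 70 is observed.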
        nu = starting_nu
        prev_reward = 0
        block_world = BlockWorld(self.states_x,
                                 self.states_y,
                                 self.blocks_count,
                                 self.stack_count,
                                 record=False)
        ever_seen_goal = False
        cnt = 0
        '''
        state_s = State(
            [[block_world.block_dict[i].rect.centerx, block_world.block_dict[i].rect.centery] for i in
             range(self.blocks_count)],
            block_world.selected_block_id, block_world.goal_config,screen_dims=(block_world.screen_width, block_world.screen_height))
        block_world.goal_loc = state_s.goal_positions
        '''
        while cnt < self.iteration_count:
            cnt += 1
            #q = q_old.copy()
            block_world.pre_render()

            curr_state = block_world.get_state_as_tuple_pramodith2()
            if self.debug and curr_state in q_old:
                print("Current State: %s" + str(curr_state), q_old[curr_state])
            #if np.random.rand() < 0.01:
            '''
            state_s = State([[block_world.block_dict[i].rect.centerx, block_world.block_dict[i].rect.centery] for i in
                             range(self.blocks_count)],
                            selected_index=block_world.selected_block_id, goal_config=block_world.goal_config,
                            screen_dims=(block_world.screen_width, block_world.screen_height))
            action, block_id = self.get_next_action_supervised_t(state_t=None, state_s=state_s, q=None, nu=0)
            '''
            action, block_id = self.get_next_action(curr_state, q_old, nu)
            if self.debug: print("Action: ", action, block_id)

            next_state = block_world.get_next_state_based_on_state_tuple(
                curr_state, (action, block_id))
            new_reward = block_world.get_reward_for_state(
                next_state, curr_state)
            new_reward += block_world.get_reward_for_state_action_pramodith(
                curr_state, next_state)
            # print("Reward")
            # print(new_reward)

            if new_reward > 70:
                prev_reward = 71
                print("Converged in %d", cnt)
                #return cnt
            #if self.debug:

            # print("q:", q_old.get(str(curr_state),None))
            block_world.update_state_from_tuple_pramodith(next_state)

            block_world.render()
            pygame.image.save(block_world.screen,
                              "sample_videos/3_block_" + str(cnt) + ".jpg")
            if prev_reward > 70:
                return

            time.sleep(0.1)
        return cnt