Example #1
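    # Normalize the combined depth image to [0, 1] and display it (fragment of a show_depth-style helper)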
    comb -= comb.min()
    comb /= (comb.max() + 1e-9)
    Presenter().show_image(comb,
                           "depth_alignment",
                           torch=False,
                           waitkey=1,
                           scale=4)


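# Teleoperation loop: iterate over the training instruction sets, let the operator
# drive the drone with keyboard commands, and accumulate the per-episode reward.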
for instruction_sets in train_instructions.values():
    for instruction_set in instruction_sets:
        env_id = instruction_set['env']
        print("pomdp_interface setting environment id:", env_id)
        set_current_env_id(env_id)
        env.set_environment(env_id, instruction_set["instructions"])
        env.reset(0)

        presenter = Presenter()
        cumulative_reward = 0
        while True:
            rate.sleep()
            action = teleoper.get_command()
            command = env.get_current_nl_command()
            state, reward, done = env.step(action)
            cumulative_reward += reward
            presenter.show_sample(state, action, cumulative_reward, command)
            #show_depth(state.image)
            if done:
                break
        print("Episode finished!")
Example #2
def automatic_demo():

    P.initialize_experiment()
    instruction_display = InstructionDisplay()

    rate = Rate(0.1)

    env = PomdpInterface(
        is_real=get_current_parameters()["Setup"]["real_drone"])
    train_instructions, dev_instructions, test_instructions, corpus = get_all_instructions()
    all_instr = {
        **train_instructions,
        **dev_instructions,
        **test_instructions
    }
    token2term, word2token = get_word_to_token_map(corpus)

    # Run on dev set
    interact_instructions = dev_instructions

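    # Restrict the demo to environments whose id falls within the configured range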
    env_range_start = get_current_parameters()["Setup"].get(
        "env_range_start", 0)
    env_range_end = get_current_parameters()["Setup"].get(
        "env_range_end", 10e10)
    interact_instructions = {
        k: v
        for k, v in interact_instructions.items()
        if env_range_start < k < env_range_end
    }

    model, _ = load_model(get_current_parameters()["Setup"]["model"])

    # Loop over the select few examples
    while True:

        for instruction_sets in interact_instructions.values():
            for set_idx, instruction_set in enumerate(instruction_sets):
                env_id = instruction_set['env']
                found_example = None
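                # `examples` is expected to be a module-level list of tuples whose first
                # element is an env_id and whose third element is the segment index to run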
                for example in examples:
                    if example[0] == env_id:
                        found_example = example
                if found_example is None:
                    continue
                env.set_environment(env_id, instruction_set["instructions"])

                presenter = Presenter()
                cumulative_reward = 0
                for seg_idx in range(len(instruction_set["instructions"])):
                    if seg_idx != found_example[2]:
                        continue

                    print(f"RUNNING ENV {env_id} SEG {seg_idx}")

                    real_instruction_str = instruction_set["instructions"][
                        seg_idx]["instruction"]
                    instruction_display.show_instruction(real_instruction_str)
                    valid_segment = env.set_current_segment(seg_idx)
                    if not valid_segment:
                        continue
                    state = env.reset(seg_idx)

                    for i in range(START_PAUSE):
                        instruction_display.tick()
                        time.sleep(1)

                    tok_instruction = tokenize_instruction(
                        real_instruction_str, word2token)
                    state = env.reset(seg_idx)
                    print(f"Executing: {real_instruction_str}")
                    while True:
                        instruction_display.tick()
                        rate.sleep()
                        action, internals = model.get_action(
                            state, tok_instruction)
                        state, reward, done, expired, oob = env.step(action)
                        cumulative_reward += reward
                        #presenter.show_sample(state, action, reward, cumulative_reward, real_instruction_str)
                        #show_depth(state.image)
                        if done:
                            break

                    for i in range(END_PAUSE):
                        instruction_display.tick()
                        time.sleep(1)
                    print("Segment finished!")
                    instruction_display.show_instruction("...")

            print("Env finished!")
Example #3
def interactive_demo():

    P.initialize_experiment()
    InteractAPI.launch_ui()

    rate = Rate(0.1)

    env = PomdpInterface(
        is_real=get_current_parameters()["Setup"]["real_drone"])
    train_instructions, dev_instructions, test_instructions, corpus = get_all_instructions()
    all_instr = {
        **train_instructions,
        **dev_instructions,
        **test_instructions
    }
    token2term, word2token = get_word_to_token_map(corpus)

    # Run on dev set
    interact_instructions = dev_instructions

    env_range_start = get_current_parameters()["Setup"].get(
        "env_range_start", 0)
    env_range_end = get_current_parameters()["Setup"].get(
        "env_range_end", 10e10)
    interact_instructions = {
        k: v
        for k, v in interact_instructions.items()
        if env_range_start < k < env_range_end
    }

    count = 0
    stuck_count = 0

    model, _ = load_model(get_current_parameters()["Setup"]["model"])

    InteractAPI.write_empty_instruction()
    InteractAPI.write_real_instruction("None")
    instruction_str = InteractAPI.read_instruction_file()
    print("Initial instruction: ", instruction_str)

    for instruction_sets in interact_instructions.values():
        for set_idx, instruction_set in enumerate(instruction_sets):
            env_id = instruction_set['env']
            env.set_environment(env_id, instruction_set["instructions"])

            presenter = Presenter()
            cumulative_reward = 0
            for seg_idx in range(len(instruction_set["instructions"])):

                print(f"RUNNING ENV {env_id} SEG {seg_idx}")

                real_instruction_str = instruction_set["instructions"][
                    seg_idx]["instruction"]
                InteractAPI.write_real_instruction(real_instruction_str)
                valid_segment = env.set_current_segment(seg_idx)
                if not valid_segment:
                    continue
                state = env.reset(seg_idx)

                keep_going = True
                while keep_going:
                    InteractAPI.write_real_instruction(real_instruction_str)

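                    # Poll the instruction file until the operator advances, resets, or enters a new instruction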
                    while True:
                        cv2.waitKey(200)
                        instruction = InteractAPI.read_instruction_file()
                        if instruction == "CMD: Next":
                            print("Advancing")
                            keep_going = False
                            InteractAPI.write_empty_instruction()
                            break
                        elif instruction == "CMD: Reset":
                            print("Resetting")
                            env.reset(seg_idx)
                            InteractAPI.write_empty_instruction()
                        elif len(instruction.split(" ")) > 1:
                            instruction_str = instruction
                            break

                    if not keep_going:
                        continue

                    env.override_instruction(instruction_str)
                    tok_instruction = tokenize_instruction(
                        instruction_str, word2token)

                    state = env.reset(seg_idx)
                    print(f"Executing: {instruction_str}")
                    while True:
                        rate.sleep()
                        action, internals = model.get_action(
                            state, tok_instruction)

                        state, reward, done, expired, oob = env.step(action)
                        cumulative_reward += reward
                        presenter.show_sample(state, action, reward,
                                              cumulative_reward,
                                              instruction_str)
                        #show_depth(state.image)
                        if done:
                            break
                    InteractAPI.write_empty_instruction()
                    print("Segment finished!")
        print("Env finished!")
Example #4
class SimplePolicyRoller:
    """
    Thin wrapper around the roll_out_policy function, which performs the policy rollout in the POMDP.
    It collects actions both from the user-provided policy and from the oracle (as labels) and accumulates a dataset.
    """
    def __init__(self,
                 instance_id=0,
                 real_drone=False,
                 policy=None,
                 oracle=None,
                 no_reward=False):

        self.presenter = Presenter()
        self.instance_id = instance_id

        self.word2token = None
        self.all_instructions = None
        (self.all_env_ids, self.all_instructions, self.corpus,
         self.token2term, self.word2token) = self.load_all_envs()

        self.env = PomdpInterface(instance_id=self.instance_id,
                                  is_real=real_drone)
        self.policy = policy
        self.oracle = oracle
        self.no_reward = no_reward

    def load_all_envs(self):
        train_i, dev_i, test_i, corpus = get_all_instructions()
        all_instructions = merge_instruction_sets(train_i, dev_i, test_i)
        token2term, word2token = get_word_to_token_map(corpus)
        env_ids = list(all_instructions.keys())
        return env_ids, all_instructions, corpus, token2term, word2token

    def tokenize_string(self, s):
        word_list = filter(None, s.split(" "))
        token_instruction = list(map(lambda w: self.word2token[w], word_list))
        return token_instruction

    def set_policy(self, policy):
        self.policy = policy

    def save_rollouts(self, rollouts, dataset_name):
        env_rollouts = {}
        for rollout in rollouts:
            env_id = rollout[0]["env_id"]
            if env_id not in env_rollouts:
                env_rollouts[env_id] = []
            env_rollouts[env_id] += rollout

        for env_id, rollouts in env_rollouts.items():
            # This saves just a single segment per environment, as opposed to all segments that the oracle saves. Problem?
            if len(rollouts) > 0:
                #pruned_rollouts = [prune_sample(s) for s in rollouts]
                save_dataset(dataset_name, rollouts, env_id=env_id, lock=True)
                #save_metadata(dataset_name, env_id, {"seg_ids": segments})

    def choose_action(self, pol_action, ref_action, dagger_beta):
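        # DAgger-style mixing: with probability dagger_beta execute the oracle's
        # reference action, otherwise execute the policy's own action.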
        use_expert = random.uniform(0, 1) < dagger_beta
        if use_expert:
            return ref_action
        else:
            return pol_action

    def sample_to_cpu(self, sample):
        for k, v in sample.items():
            if hasattr(v, "to") and isinstance(v.to, types.MethodType):
                sample[k] = v.to("cpu")

    def single_segment_rollout(self,
                               env_id,
                               set_idx,
                               seg_idx,
                               do_sample,
                               dagger_beta=0,
                               rl_rollout=True):
        instruction_sets = self.all_instructions[env_id][set_idx][
            'instructions']
        for instruction_set in instruction_sets:
            if instruction_set["seg_idx"] == seg_idx:
                break

        # TODO: Get rid of this idiocy:
        md.IS_ROLLOUT = True

        instruction_set = get_instruction_segment(
            env_id, set_idx, seg_idx, all_instr=self.all_instructions)

        self.env.set_environment(env_id,
                                 instruction_set=instruction_sets,
                                 fast=True)
        self.env.set_current_segment(seg_idx)

        self.policy.start_sequence()
        if hasattr(self.policy, "start_segment_rollout"):
            self.policy.start_segment_rollout(env_id, set_idx, seg_idx)
        if self.oracle:
            self.oracle.start_segment_rollout(env_id, set_idx, seg_idx)

        string_instruction = instruction_set["instruction"]
        end_idx = instruction_set["end_idx"]
        start_idx = instruction_set["start_idx"]
        token_instruction = self.tokenize_string(string_instruction)

        # TODO: Support oracle (including setCurrentSegment, and setting the path)
        rollout_sample = []

        # Reset the drone to the segment starting position:
        state = self.env.reset(seg_idx)

        first = True
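        # Roll out the policy step by step until the segment terminates, collecting one sample per step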
        while True:
            action, rl_stuff = self.policy.get_action(state,
                                                      token_instruction,
                                                      sample=do_sample,
                                                      rl_rollout=rl_rollout)

            if self.oracle:
                ref_action, _ = self.oracle.get_action(state,
                                                       token_instruction)
                exec_action = self.choose_action(action, ref_action,
                                                 dagger_beta)
            else:
                ref_action = action
                exec_action = action

            next_state, extrinsic_reward, done, expired, oob = self.env.step(
                exec_action)

            # Calculate intrinsic reward (I don't like that this delays the loop)
            if hasattr(self.policy,
                       "calc_intrinsic_rewards") and not self.no_reward:
                intrinsic_rewards = self.policy.calc_intrinsic_rewards(
                    next_state, action, done, first)
            else:
                intrinsic_rewards = {"x": 0}
            intrinsic_reward = sum(intrinsic_rewards.values())

            sample = {
                "instruction": string_instruction,
                "ref_action": ref_action,
                "pol_action": action,
                "action": exec_action,
                "state": state,
                "extrinsic_reward": extrinsic_reward,
                "intrinsic_reward": intrinsic_reward - (1.0 if oob else 0.0),
                "full_reward": extrinsic_reward + intrinsic_reward,
                "done": done,
                "expired": expired,
                "env_id": env_id,
                "set_idx": set_idx,
                "seg_idx": seg_idx,
            }
            sample = dict_merge(sample, rl_stuff)
            if not self.no_reward:
                sample = dict_merge(sample, intrinsic_rewards)
            rollout_sample.append(sample)

            # Multiprocessing has stopped playing nice with PyTorch cuda. Move sample to cpu first.
            if rl_rollout:
                self.sample_to_cpu(sample)

            state = next_state
            first = False
            if done:
                #print(f"Done! Last action: {exec_action}")
                break

        md.IS_ROLLOUT = False
        # Add discounted returns
        return rollout_sample

    def rollout_segments(self,
                         env_ids,
                         seg_ids,
                         policy_state,
                         sample,
                         dagger_beta=0,
                         save_dataset_name=None,
                         land_afterwards=False,
                         rl_rollout=True):
        if policy_state is not None:
            self.policy.set_policy_state(policy_state)

        data = []
        for env_id, seg_idx in zip(env_ids, seg_ids):
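            # Retry the segment rollout until it completes without an environment exception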
            done = False
            while not done:
                try:
                    seg_data = self.single_segment_rollout(
                        env_id, 0, seg_idx, sample, dagger_beta, rl_rollout)
                    done = True
                except PomdpInterface.EnvException as e:
                    continue
            data.append(seg_data)

        if save_dataset_name:
            self.save_rollouts(data, save_dataset_name)

        # Land the real drone if we have one.
        if land_afterwards:
            self.env.land()

        return data
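
A minimal usage sketch (not from the source repository): `my_policy` stands in for a policy object that implements start_sequence() and get_action() as SimplePolicyRoller expects, and "demo_rollouts" is a hypothetical dataset name.

roller = SimplePolicyRoller(instance_id=0, real_drone=False, policy=my_policy, no_reward=True)
rollouts = roller.rollout_segments(
    env_ids=[0, 1],             # environments to roll out in
    seg_ids=[0, 0],             # one segment id per environment
    policy_state=None,          # keep the policy's current parameters
    sample=False,               # act greedily instead of sampling actions
    save_dataset_name="demo_rollouts")  # hypothetical dataset name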