Example #1
    def __init__(self,
                 feature_store,
                 pano_caffe=None,
                 batch_size=100,
                 seed=10,
                 splits=['train'],
                 tokenizer=None,
                 name=None):
        self.env = EnvBatch(feature_store=feature_store, batch_size=batch_size)
        if feature_store:
            self.feature_size = self.env.feature_size
        else:
            self.feature_size = 2048
        self.data = []
        if tokenizer:
            self.tok = tokenizer
        scans = []
        for split in splits:
            for item in load_datasets([split]):
                # Split multiple instructions into separate entries
                for j, instr in enumerate(item['instructions']):
                    # if item['scan'] not in self.env.featurized_scans:   # For fast training
                    #     continue
                    new_item = dict(item)
                    new_item['instr_id'] = '%s_%d' % (item['path_id'], j)
                    new_item['instructions'] = instr
                    if tokenizer:
                        new_item['instr_encoding'] = tokenizer.encode_sentence(
                            instr)
                    # Keep the entry unless tokenization produced an invalid encoding.
                    if not tokenizer or new_item['instr_encoding'] is not None:
                        self.data.append(new_item)
                        scans.append(item['scan'])
        if name is None:
            self.name = splits[0] if len(splits) > 0 else "FAKE"
        else:
            self.name = name

        self.pano_caffe = pano_caffe

        self.scans = set(scans)
        self.splits = splits
        self.seed = seed
        random.seed(self.seed)
        random.shuffle(self.data)

        self.ix = 0
        self.batch_size = batch_size
        self._load_nav_graphs()

        self.angle_feature = utils.get_all_point_angle_feature()
        self.sim = utils.new_simulator()
        self.buffered_state_dict = {}

        # In the supervised setup, the fake data is simply the real data.
        self.fake_data = self.data
        print('R2RBatch loaded with %d instructions, using splits: %s' %
              (len(self.data), ",".join(splits)))
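
The core move in this constructor is flattening: each R2R item holds several instructions for one path, and the loop expands them into one entry per instruction before a seeded shuffle. A minimal, self-contained sketch of that pattern with made-up sample data (the flatten_instructions helper is hypothetical, not part of the codebase):

import random

def flatten_instructions(items):
    data = []
    for item in items:
        for j, instr in enumerate(item['instructions']):
            new_item = dict(item)
            new_item['instr_id'] = '%s_%d' % (item['path_id'], j)
            new_item['instructions'] = instr
            data.append(new_item)
    return data

items = [{'path_id': 4332, 'scan': 'scan_a',
          'instructions': ['Walk past the sofa.', 'Turn left at the sofa.']}]
data = flatten_instructions(items)
random.seed(10)       # same fixed seed as the constructors above
random.shuffle(data)
print([d['instr_id'] for d in data])   # e.g. ['4332_1', '4332_0']
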
Example #2
    def __init__(self,
                 feature_store,
                 pano_caffee=None,
                 batch_size=100,
                 seed=10,
                 splits=['train'],
                 tokenizer=None,
                 name=None):
        self.env = EnvBatch(feature_store=feature_store, batch_size=batch_size)
        if feature_store:
            self.feature_size = self.env.feature_size
        self.data = []
        self.configs = {}

        self.motion_indicator = {}
        self.landmark = {}

        if not name:
            configs = np.load(args.configpath + "configs_" + splits[0] +
                              ".npy",
                              allow_pickle=True).item()
            self.configs.update(configs)

        if tokenizer:
            self.tok = tokenizer
        scans = []
        for item in tqdm(load_datasets(splits)):
            # Split multiple instructions into separate entries
            for j, instr in enumerate(item['instructions']):
                if item['scan'] not in self.env.featurized_scans:  # For fast training
                    continue
                new_item = dict(item)
                new_item['instr_id'] = '%s_%d' % (item['path_id'], j)
                #new_item['instr_id'] = str(item['path_id'])
                if args.configuration and not name:
                    each_configuration_list = self.configs[str(
                        new_item['instr_id'])]

                    # each_configuration_list = get_configurations(instr)
                    # self.configs[str(new_item['instr_id'])] = each_configuration_list
                    for config_id, each_c in enumerate(
                            each_configuration_list):
                        #self.motion_indicator[str(new_item['instr_id']) + "_" + str(config_id)] = get_motion_indicator(each_c)
                        self.landmark[str(new_item['instr_id']) + "_" +
                                      str(config_id)] = get_landmark(
                                          each_c, whether_root=True)

                    new_item['configurations'] = each_configuration_list
                    configuration_length = len(each_configuration_list)
                    tmp_str = " Quan ".join(each_configuration_list) + " Quan"
                    new_item['instructions'] = tmp_str
                    if configuration_length:
                        self.data.append(
                            (len(new_item['configurations']), new_item))

                    if tokenizer:
                        if 'instr_encoding' not in item:  # we may already include 'instr_encoding' when generating synthetic instructions
                            new_item[
                                'instr_encoding'] = tokenizer.encode_sentence(
                                    tmp_str)

                else:
                    new_item['instructions'] = instr
                    if tokenizer:
                        new_item['instr_encoding'] = tokenizer.encode_sentence(
                            instr)
                    # Keep the entry unless tokenization produced an invalid encoding.
                    if not tokenizer or new_item['instr_encoding'] is not None:
                        self.data.append(new_item)
                scans.append(item['scan'])

        np.save(
            f"/VL/space/zhan1624/R2R-EnvDrop/r2r_src/components3/landmarks/landmark_{splits[0]}.npy",
            self.landmark)
        '''
        np.save(f"/VL/space/zhan1624/R2R-EnvDrop/r2r_src/components2/configs/configs_{splits[0]}.npy", self.configs)
        np.save(f"/VL/space/zhan1624/R2R-EnvDrop/r2r_src/components2/motion_indicator/motion_indicator_{splits[0]}.npy", self.motion_indicator)
        np.save(f"/VL/space/zhan1624/R2R-EnvDrop/r2r_src/components2/landmarks/landmark_{splits[0]}.npy", self.landmark)
        '''
        if name is None:
            self.name = splits[0] if len(splits) > 0 else "FAKE"
        else:
            self.name = name
        self.pano_caffee = pano_caffee

        self.scans = set(scans)
        self.splits = splits

        if args.configuration and not name:
            #     self.data.sort(key=lambda x: x[0])
            self.data = list(map(lambda item: item[1], self.data))
        self.seed = seed
        random.seed(self.seed)
        random.shuffle(self.data)

        self.ix = 0
        self.batch_size = batch_size
        self._load_nav_graphs()

        self.angle_feature, self.pano_angles = list(
            zip(*utils.get_all_point_angle_feature()))

        self.sim = utils.new_simulator()
        self.buffered_state_dict = {}

        # In the supervised setup, the fake data is simply the real data.
        self.fake_data = self.data
        print('R2RBatch loaded with %d instructions, using splits: %s' %
              (len(self.data), ",".join(splits)))
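
Example #2 additionally splits each instruction into precomputed sub-instruction "configurations", re-joins them with a "Quan" separator token, and stores entries as (length, item) pairs so they could be sorted by configuration count before being unwrapped. A self-contained sketch of just that bookkeeping, with made-up data:

configs = ['walk past the sofa', 'turn left at the door']
tmp_str = " Quan ".join(configs) + " Quan"
# -> 'walk past the sofa Quan turn left at the door Quan'

data = [(len(configs), {'instr_id': '4332_0', 'instructions': tmp_str})]
# data.sort(key=lambda x: x[0])   # length-sorting is left commented out above
data = list(map(lambda item: item[1], data))   # unwrap back to plain items
print(data[0]['instructions'])
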
Example #3
    def __init__(self,
                 feature_store,
                 candidate_store,
                 batch_size=100,
                 seed=10,
                 splits=['train'],
                 tokenizer=None,
                 name=None):
        self.env = EnvBatch(feature_store=feature_store,
                            candidate_store=candidate_store,
                            batch_size=batch_size)
        if feature_store:
            self.feature_size = self.env.feature_size
        self.data = []
        if tokenizer:
            self.tok = tokenizer
        scans = []
        for split in splits:
            for item in load_datasets([split]):
                # Split multiple instructions into separate entries
                for j, instr in enumerate(item['instructions']):
                    if item['scan'] not in self.env.featurized_scans:  # For fast training
                        continue
                    new_item = dict(item)
                    new_item['instr_id'] = '%s_%d' % (item['path_id'], j)
                    new_item['instructions'] = instr
                    if tokenizer:
                        new_item['instr_encoding'] = tokenizer.encode_sentence(
                            instr)
                    # Keep the entry unless tokenization produced an invalid encoding.
                    if not tokenizer or new_item['instr_encoding'] is not None:
                        self.data.append(new_item)
                        scans.append(item['scan'])
        if name is None:
            self.name = splits[0] if len(splits) > 0 else "FAKE"
        else:
            self.name = name

        self.scans = set(scans)
        self.splits = splits
        self.seed = seed
        random.seed(self.seed)
        random.shuffle(self.data)

        if args.filter != "":
            filter_name, percent = args.filter.split("_")
            percent = int(percent) / 100
            scan_list = list(self.scans)
            scan_list = sorted(scan_list)
            scan_num = len(scan_list)
            scan_num_in_use = int(scan_num * percent)
            scan_in_use = set(scan_list[:scan_num_in_use])
            data_in_use = [
                datum for datum in self.data if datum['scan'] in scan_in_use
            ]
            data_num_in_use = len(data_in_use)
            if self.name == 'train':
                if filter_name == 'env':
                    print("With the top %d scans and %d data" %
                          (scan_num_in_use, data_num_in_use))
                    print("With percent %0.4f and %0.4f" %
                          (scan_num_in_use / len(self.scans),
                           data_num_in_use / len(self.data)))
                    print(scan_in_use)
                    self.scans = scan_in_use
                    self.data = data_in_use
                    assert len(self.data) == data_num_in_use
                elif filter_name == 'data':
                    print("With the all %d scans and %d data" %
                          (len(self.scans), data_num_in_use))
                    self.data = self.data[:data_num_in_use]
                    for datum in self.data[:5]:
                        print(datum['instr_id'])
                    assert len(self.data) == data_num_in_use
            # elif self.name == 'aug':
            #     if filter_name == 'env':
            #         print("With the top %d scans and %d data" % (scan_num_in_use, data_num_in_use))
            #         print("With percent %0.4f and %0.4f" % (scan_num_in_use / len(self.scans), data_num_in_use / len(self.data)))
            #         print(scan_in_use)
            #         self.scans = scan_in_use
            #         self.data = data_in_use
            #         assert len(self.data) == data_num_in_use
            #     elif filter_name == 'data':
            #         print("With the all %d scans and %d data" % (len(self.scans), len(self.data)))

        self.ix = 0
        self.batch_size = batch_size
        self._load_nav_graphs()

        self.angle_feature = utils.get_all_point_angle_feature()
        self.sim = utils.new_simulator()
        self.buffered_state_dict = {}

        # In the supervised setup, the fake data is simply the real data.
        self.fake_data = self.data
        print('R2RBatch loaded with %d instructions, using splits: %s' %
              (len(self.data), ",".join(splits)))
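
The args.filter branch in Example #3 expects a value such as "env_50" or "data_50": the prefix picks what to prune and the suffix is a percentage. A self-contained sketch of the "env" case with made-up scan names (sorting the scan list first makes the selection deterministic):

filter_arg = "env_50"
filter_name, percent = filter_arg.split("_")
percent = int(percent) / 100                      # 0.5

scans = {'scanC', 'scanA', 'scanB', 'scanD'}
data = [{'scan': s} for s in ('scanA', 'scanB', 'scanC', 'scanD')]

scan_list = sorted(scans)                         # deterministic order
scan_in_use = set(scan_list[:int(len(scan_list) * percent)])
data_in_use = [d for d in data if d['scan'] in scan_in_use]
print(scan_in_use, len(data_in_use))              # e.g. {'scanA', 'scanB'} 2
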
Example #4
    def __init__(self,
                 feature_store,
                 batch_size=100,
                 seed=10,
                 splits=['train'],
                 tokenizer=None,
                 name=None):
        self.env = EnvBatch(feature_store=feature_store, batch_size=batch_size)
        if feature_store:
            self.feature_size = self.env.feature_size
        else:
            self.feature_size = 2048
        self.data = []
        if tokenizer:
            self.tok = tokenizer
        scans = []
        for split in splits:
            for i_item, item in enumerate(load_datasets([split])):
                if args.test_only and i_item == 64:
                    break
                if "/" in split:
                    try:
                        new_item = dict(item)
                        new_item['instr_id'] = item['path_id']
                        new_item['instructions'] = item['instructions'][0]
                        new_item['instr_encoding'] = item['instr_enc']
                        # Keep the entry only if the stored encoding is valid.
                        if new_item['instr_encoding'] is not None:
                            self.data.append(new_item)
                            scans.append(item['scan'])
                    except Exception:  # skip malformed augmented-data entries
                        continue
                else:
                    # Split multiple instructions into separate entries
                    for j, instr in enumerate(item['instructions']):
                        try:
                            new_item = dict(item)
                            new_item['instr_id'] = '%s_%d' % (item['path_id'],
                                                              j)
                            new_item['instructions'] = instr
                            # BERT tokenizer: tokenize, pad/truncate to maxInput, convert to ids
                            instr_tokens = tokenizer.tokenize(instr)
                            padded_instr_tokens, num_words = pad_instr_tokens(
                                instr_tokens, args.maxInput)
                            new_item[
                                'instr_encoding'] = tokenizer.convert_tokens_to_ids(
                                    padded_instr_tokens)

                            # Keep the entry only if the encoding is valid.
                            if new_item['instr_encoding'] is not None:
                                self.data.append(new_item)
                                scans.append(item['scan'])
                        except Exception:  # skip items that fail tokenization
                            continue

        if name is None:
            self.name = splits[0] if len(splits) > 0 else "FAKE"
        else:
            self.name = name

        self.scans = set(scans)
        self.splits = splits
        self.seed = seed
        random.seed(self.seed)
        random.shuffle(self.data)

        self.ix = 0
        self.batch_size = batch_size
        self._load_nav_graphs()

        self.angle_feature = utils.get_all_point_angle_feature()
        self.sim = utils.new_simulator()
        self.buffered_state_dict = {}

        # In the supervised setup, the fake data is simply the real data.
        self.fake_data = self.data
        print('R2RBatch loaded with %d instructions, using splits: %s' %
              (len(self.data), ",".join(splits)))
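
Example #4 swaps the custom tokenizer for a BERT-style one: tokenize, pad or truncate to args.maxInput, then map tokens to ids. The codebase's pad_instr_tokens is not shown, so the sketch below substitutes a hypothetical stand-in with the same call shape, plus a toy vocabulary in place of a real BERT tokenizer:

def pad_instr_tokens(instr_tokens, max_len, pad='[PAD]'):
    # Hypothetical stand-in: wrap with [CLS]/[SEP], then pad to max_len.
    tokens = ['[CLS]'] + instr_tokens[:max_len - 2] + ['[SEP]']
    num_words = len(tokens)
    return tokens + [pad] * (max_len - num_words), num_words

vocab = {'[PAD]': 0, '[UNK]': 100, '[CLS]': 101, '[SEP]': 102,
         'walk': 3328, 'left': 2187}
padded_instr_tokens, num_words = pad_instr_tokens(['walk', 'left'], 8)
instr_encoding = [vocab.get(t, vocab['[UNK]']) for t in padded_instr_tokens]
print(padded_instr_tokens)         # ['[CLS]', 'walk', 'left', '[SEP]', '[PAD]', ...]
print(instr_encoding, num_words)   # [101, 3328, 2187, 102, 0, 0, 0, 0] 4
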
Example #5
    def __init__(self, feature_store, batch_size=100, seed=10, splits=['train'], tokenizer=None,
                 path_type='planner_path', history='target', blind=False):
        self.buffered_state_dict = {}
        self.sim = utils.new_simulator()
        self.env = EnvBatch(feature_store=feature_store, batch_size=batch_size, blind=blind)
        self.data = []
        self.scans = []
        self.splits = splits
        for item in load_datasets(splits):
            double_num_dial = len(item['dialog_history'])
            dial = []
            dial_seps = []
            Last_QA = []
            QA_seps = []
            hist = []
            hist_enc = []
            target = [item['target']]
            tar_seps = ['<TAR>']
            # For every dialog history, stitch together a single instruction string.
            self.scans.append(item['scan'])
            new_item = dict(item)
            new_item['inst_idx'] = item['inst_idx']
            if history == 'none':  # no language input at all
                new_item['instructions'] = ''
                if tokenizer:
                    new_item['instr_encoding'] = tokenizer.encode_sentence('')
            elif history == 'target' or len(item['dialog_history']) == 0:  # Have to use target only if no dialog history.
                tar = item['target']
                new_item['instructions'] = '<TAR> ' + tar
                new_item['Last_QA'] = Last_QA
                new_item['tar'] = item['target']
                if tokenizer:
                    new_item['instr_encoding'] = tokenizer.encode_sentence([tar], seps=['<TAR>'])
                    Last_QA_enc = tokenizer.encode_dial(None, None)
                    new_item['Last_QA_enc'] = Last_QA_enc

                    for i in range(15):  # pad dialog history to a fixed 15 entries
                        hist_enc.append(tokenizer.encode_dial(None, None))
                        hist.append('<pad>')
                    new_item['hist_enc'] = np.array(hist_enc)
                    new_item['hist'] = hist

                    tar_enc = tokenizer.encode_sentence(target, seps=tar_seps)
                    new_item['tar_enc'] = tar_enc
            elif history == 'oracle_ans':
                ora_a = item['dialog_history'][-1]['message']  # i.e., the last oracle utterance.
                tar = item['target']
                new_item['instructions'] = '<ORA> ' + ora_a + ' <TAR> ' + tar
                if tokenizer:
                    new_item['instr_encoding'] = tokenizer.encode_sentence([ora_a, tar], seps=['<ORA>', '<TAR>'])
            elif history == 'nav_q_oracle_ans':
                nav_q = item['dialog_history'][-2]['message']
                ora_a = item['dialog_history'][-1]['message']
                tar = item['target']
                new_item['instructions'] = '<NAV> ' + nav_q + ' <ORA> ' + ora_a + ' <TAR> ' + tar
                if tokenizer:
                    qa_enc = tokenizer.encode_sentence([nav_q, ora_a, tar], seps=['<NAV>', '<ORA>', '<TAR>'])
                    new_item['instr_encoding'] = qa_enc
            elif history == 'all':
                dia_inst = ''
                sentences = []
                seps = []
                for numm, turn in enumerate(item['dialog_history']):
                    sentences.append(turn['message'])
                    sep = '<NAV>' if turn['role'] == 'navigator' else '<ORA>'
                    seps.append(sep)
                    dia_inst += sep + ' ' + turn['message'] + ' '
                    dial.append(turn['message'])
                    dial_seps.append(sep)
                    if numm == double_num_dial - 1:
                        Last_QA.append(dial[-2])
                        Last_QA.append(dial[-1])
                        QA_seps.append(seps[-2])
                        QA_seps.append(seps[-1])
                sentences.append(item['target'])
                seps.append('<TAR>')
                dia_inst += '<TAR> ' + item['target']
                new_item['instructions'] = dia_inst
                new_item['Last_QA'] = Last_QA
                new_item['hist'] = hist
                new_item['tar'] = item['target']

                if tokenizer:
                    dia_enc = tokenizer.encode_sentence(sentences, seps=seps)
                    new_item['instr_encoding'] = dia_enc
                    # Encode the last question/answer pair on its own.
                    Last_QA_enc = tokenizer.encode_dial(Last_QA, seps=QA_seps)
                    new_item['Last_QA_enc'] = Last_QA_enc

                    tar_enc = tokenizer.encode_dial(target, seps=tar_seps)
                    new_item['tar_enc'] = tar_enc

                    # Pair up all earlier Q/A turns; the final pair already sits in Last_QA.
                    dial_ix = 0
                    while dial_ix < (len(dial) - 2):
                        qa_list = []
                        qa_seps = []

                        qa_list.append(dial[dial_ix])
                        qa_list.append(dial[dial_ix + 1])
                        qa_seps.append(dial_seps[dial_ix])
                        qa_seps.append(dial_seps[dial_ix + 1])

                        hist.append(qa_list)
                        hist_enc.append(tokenizer.encode_dial(qa_list, seps=qa_seps))
                        dial_ix = dial_ix + 2
                    # Pad the turn-level history to a fixed length of 15 entries,
                    # then store it unconditionally so 'hist_enc' is always set.
                    if len(hist_enc) < 15:
                        for i in range(15 - len(hist_enc)):
                            hist.append('<pad>')
                            hist_enc.append(tokenizer.encode_dial(None, None))
                    new_item['hist'] = hist
                    new_item['hist_enc'] = hist_enc

            # If evaluating against 'trusted_path', we need to calculate the trusted path and instantiate it.
            if path_type == 'trusted_path' and 'test' not in splits:
                # The trusted path is either the planner_path or the player_path depending on whether the player_path
                # contains the planner_path goal (e.g., stricter planner oracle success of player_path
                # indicates we can 'trust' it, otherwise we fall back to the planner path for supervision).
                # Hypothesize that this will combine the strengths of good human exploration with the known good, if
                # short, routes the planner uses.
                planner_goal = item['planner_path'][-1]  # this could be length 1 if "plan" is to not move at all.
                if planner_goal in item['player_path'][1:]:  # player walked through planner goal (did not start on it)
                    new_item['trusted_path'] = item['player_path'][:]  # trust the player.
                else:
                    new_item['trusted_path'] = item['planner_path'][:]  # trust the planner.
            self.data.append(new_item)

        self.scans = set(self.scans)
        self.splits = splits
        self.seed = seed
        random.seed(self.seed)
        random.shuffle(self.data)
        self.ix = 0
        self.batch_size = batch_size
        self._load_nav_graphs()
        self.path_type = path_type
        self.angle_feature = utils.get_all_point_angle_feature()
        print('R2RBatch loaded with %d instructions, using splits: %s' %
              (len(self.data), ",".join(splits)))
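
The trusted_path rule at the end of Example #5 is worth isolating: trust the player's path only if it passes through the planner's goal after its starting node; otherwise fall back to the planner path for supervision. A self-contained sketch with made-up viewpoint ids:

item = {
    'planner_path': ['v0', 'v1', 'v2'],
    'player_path':  ['v0', 'v3', 'v2', 'v4'],
}
planner_goal = item['planner_path'][-1]          # 'v2'
if planner_goal in item['player_path'][1:]:      # player reached the goal
    trusted_path = item['player_path'][:]        # trust the player
else:
    trusted_path = item['planner_path'][:]       # fall back to the planner
print(trusted_path)                              # ['v0', 'v3', 'v2', 'v4']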