def load_vocabs(vocab_paths):
    vocab = io.load_json(vocab_paths['vocab_path'])
    char_vocab = io.load_json(vocab_paths['char_vocab_path'])
    anon_vocab = io.load_json(vocab_paths['anon_vocab_path'])
    anon_char_vocab = io.load_json(vocab_paths['anon_char_vocab_path'])

    return vocab, char_vocab, anon_vocab, anon_char_vocab
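A minimal usage sketch for load_vocabs, assuming the `paths` helper and the 'w2i'/'i2w' vocab layout used by the other examples on this page; `problem` is a placeholder identifier borrowed from those examples.
# Usage sketch (assumption: the vocab JSONs store 'w2i'/'i2w' maps as in the
# _load_metadata example further down; `paths` and `problem` come from there).
vocab_paths = paths.vocab_paths(problem, 'education')
vocab, char_vocab, anon_vocab, anon_char_vocab = load_vocabs(vocab_paths)

w2i, i2w = vocab['w2i'], vocab['i2w']        # token <-> index maps
anon_w2i = anon_vocab['w2i']                 # anonymised-token variant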
Example #2
	def loadSyntheticData(self):
		cache_file = os.path.join(self.CACHE_DIR, 'lsh_programs.pkl')
		if not self.evict_cache and os.path.isfile(cache_file):
			data = load_json(cache_file)
			prog_items = data['raw_programs']
			anon_progs = data['anon_programs']
		else:
			standard_path = self.sampledDataPath + '/standard/train' + SYNTH_NAME
			uniform_path = self.sampledDataPath + '/uniform/train' + SYNTH_NAME
			tempered_path = self.sampledDataPath + '/tempered/train' + SYNTH_NAME
			with open(standard_path, "rb") as f:
				standardDict = pickle.load(f)
			with open(uniform_path, "rb") as f:
				uniformDict = pickle.load(f)
			with open(tempered_path, "rb") as f:
				temperedDict = pickle.load(f)

			all_dicts = [standardDict, uniformDict, temperedDict]

			# this step is not stable across runs if the forest is cached,
			# so it needs to be cached too
			prog_items = list(standardDict.keys() | uniformDict.keys() | temperedDict.keys())
			anon_progs = [self.multi_dict_get(prog, all_dicts) for prog in prog_items]
			data = dict(raw_programs=prog_items, anon_programs=anon_progs)

			os.makedirs(self.CACHE_DIR, exist_ok=True)
			save_json(data, cache_file)

			# if we don't load the cache here, we should regenerate the forest too
			self.evict_cache = True

		return prog_items, anon_progs
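multi_dict_get is called as a method above but is not shown on this page; a plausible sketch of its behaviour (an assumption, not the original helper) returns the anonymised program from the first dictionary that contains it.
# Plausible sketch of the multi_dict_get behaviour (assumption: the real method
# is not shown in these examples). prog_items is the union of all dict keys, so
# every program appears in at least one of the dicts.
def multi_dict_get(prog, all_dicts):
    for d in all_dicts:
        if prog in d:
            return d[prog]
    raise KeyError(prog)  # unreachable for keys taken from the key union above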
def load_exp_data(all_exp_dir):
    params = []
    losses = []
    for exp_name in os.listdir(all_exp_dir):
        try:
            exp_dir = os.path.join(all_exp_dir, exp_name)
            config = load_config(os.path.join(exp_dir, 'config.json'))
            vec = config_to_vec(config)
            vec['exp_name'] = exp_name
            summaries = load_json(
                os.path.join(exp_dir, 'summaries', 'all_scalars.json'))
            k_loss = get_key_for_metric(summaries.keys(),
                                        'validation/loss/loss')
            if k_loss is None:
                print('Metric not found... skipping')
                continue
            loss = np.average([x[2] for x in summaries[k_loss][-5:]])

            params.append(vec)
            losses.append(loss)
        except FileNotFoundError:
            print('File not found... skipping')
            continue

    return params, losses
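get_key_for_metric is not defined in these examples; a minimal sketch (an assumption) consistent with the None check above scans the summary keys for one containing the metric name.
# Minimal sketch of get_key_for_metric (assumption: not shown in these examples).
# Returns the first summary key containing the metric name, or None when nothing
# matches -- the case load_exp_data skips over.
def get_key_for_metric(keys, metric_name):
    for key in keys:
        if metric_name in key:
            return key
    return None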
Example #4
    def LoadVocabulary(self, vocab_path):

        vocab_dict = io_utils.load_json(vocab_path)
        self.word_vocabulary = vocab_dict["vocabulary"]
        self.word_index = vocab_dict["word_index"]
        self.has_vocabulary = True
        """ TODO: should do this part?
    def __init__(self, config):
        super(self.__class__, self).__init__(config)

        # get options
        self.S = config.get("num_segment", 128)
        self.split = config.get("split", "train")
        self.data_dir = config.get("data_dir", "data/charades")
        self.feature_type = config.get("feature_type", "I3D")
        self.in_memory = config.get("in_memory", False)
        if self.feature_type == "I3D":
            self.feat_path = config.get(
                "video_feature_path",
                "data/charades/features/i3d_finetuned/{}.npy"
            )
        else:
            raise ValueError("Wrong feature_type")

        # cropping augmentation settings
        self.cropping_augmentation = config.get("cropping_augmentation", False)
        self.cropping_prob = config.get("cropping_prob", 0.5)
        self.cropping_factor = config.get("cropping_factor", 0.5)
        self.no_aug = False

        # get paths for proposals and captions
        paths = self._get_data_path(config)

        # create labels (or load existing one)
        ann_path = "data/charades/annotations/charades_sta_{}_pos.json".format(self.split)
        aux_ann_path = "data/charades/annotations/Charades_v1_{}.csv".format(self.split)
        self.anns, self.qids, self.vids = self._load_annotation(ann_path, aux_ann_path)
        if not self._exist_data(paths):
            self.generate_labels(config)

        # load features if use in_memory
        if self.in_memory:
            self.feats = {}
            for vid in tqdm(self.vids, desc="In-Memory: vid_feat"):
                self.feats[vid] = np.load(self.feat_path.format(vid)).squeeze()

            self.s_pos, self.e_pos, self.att_mask = {}, {}, {}
            grd_info = io_utils.load_hdf5(self.paths["grounding_info"], False)
            for k in tqdm(self.qids, desc="In-Memory: grounding"):
                self.s_pos[k] = grd_info["start_pos/"+k][()]
                self.e_pos[k] = grd_info["end_pos/"+k][()]
                self.att_mask[k] = grd_info["att_mask/"+k][()]

            self.query_labels = {}
            query_labels = h5py.File(self.paths["query_labels"], "r")
            for k in tqdm(self.qids, desc="In-Memory: query"):
                self.query_labels[k] = query_labels[k][:]

        # load query information
        query_info = io_utils.load_json(self.paths["query_info"])
        self.wtoi = query_info["wtoi"]
        self.itow = query_info["itow"]
        self.query_lengths = query_info["query_lengths"]

        self.batch_size = config.get("batch_size", 64)
        self.num_instances = len(self.qids)
        """
Example #6
    def __init__(self, config):
        super(self.__class__, self).__init__(config)

        # get options
        self.S = config.get("num_segment", 128)
        self.split = config.get("split", "train")
        self.data_dir = config.get("data_dir", "")
        self.feature_type = config.get("feature_type", "C3D")
        self.in_memory = config.get("in_memory", False)
        self.feat_hdf5 = config.get(
            "video_feature_path",
            "data/ActivityNet/feats/sub_activitynet_v1-3.c3d.hdf5")

        # cropping augmentation settings
        self.cropping_augmentation = config.get("cropping_augmentation", False)
        self.cropping_prob = config.get("cropping_prob", 0.5)
        self.cropping_factor = config.get("cropping_factor", 0.5)
        self.no_aug = False

        # get paths for proposals and captions
        paths = self._get_data_path(config)

        # create labels (or load existing one)
        ann_path = config.get(
            "annotation_path",
            "data/ActivityNet/captions/annotations/train.json")
        self.anns, self.qids, self.vids = self._load_annotation(ann_path)
        if not self._exist_data(paths):
            self.generate_labels(config)

        # load features if use in_memory
        if self.in_memory:
            self.feats = {}
            h = io_utils.load_hdf5(self.feat_hdf5, verbose=False)
            for k in tqdm(self.vids, desc="In-Memory: vid_feat"):
                self.feats[k] = h[k]["c3d_features"][:]

            self.s_pos, self.e_pos, self.att_mask = {}, {}, {}
            grd_info = io_utils.load_hdf5(self.paths["grounding_info"], False)
            for k in tqdm(self.qids, desc="In-Memory: grounding"):
                self.s_pos[k] = grd_info["start_pos/" + k][()]
                self.e_pos[k] = grd_info["end_pos/" + k][()]
                self.att_mask[k] = grd_info["att_mask/" + k][()]

            self.query_labels = {}
            query_labels = h5py.File(self.paths["query_labels"], "r")
            for k in tqdm(self.qids, desc="In-Memory: query"):
                self.query_labels[k] = query_labels[k][:]

        # load and prepare json files
        query_info = io_utils.load_json(self.paths["query_info"])
        self.wtoi = query_info["wtoi"]
        self.itow = query_info["itow"]
        self.query_lengths = query_info["query_lengths"]

        self.batch_size = config.get("batch_size", 64)
        self.num_instances = len(self.qids)
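Both dataset constructors above read their options from a plain config dict via config.get; a minimal illustrative config (keys taken from the code above, values are just the documented defaults) looks like this:
# Illustrative config for the constructors above; omitted keys fall back to the
# defaults shown in the code.
config = {
    "split": "train",
    "num_segment": 128,
    "feature_type": "C3D",          # or "I3D" for the Charades variant
    "in_memory": False,
    "cropping_augmentation": False,
    "cropping_prob": 0.5,
    "cropping_factor": 0.5,
    "batch_size": 64,
}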
    def _load_data(self):
        '''
            Loads all shard-independent data
        '''
        rv_info_list, metadata_dict, num_shards_list, shard_size_list, data_len_list = [], {}, [], [], []
        shard_num_to_sampling_strategy = []
        shard_num_to_sampling_shard_num = []

        for sampling_strategy in self.sampling_strategy_list:
            scene_paths = paths.scene_graph_data_paths(self.problem,
                                                       self.split,
                                                       sampling_strategy)

            for _, path in scene_paths.items():
                if not os.path.exists(path) and not os.path.exists(
                        path.format(0)):
                    if 'student' not in path:
                        raise RuntimeError(
                            "Data path does not exist: [{}]. Generate using preprocessing script"
                            .format(path))

            rv_info = io.load_json(scene_paths['rv_info_path'])
            metadata = io.load_json(scene_paths['metadata_path'])
            num_shards = metadata['num_shards']
            shard_size = metadata['shard_size']
            data_len = metadata['data_len']

            rv_info_list.append(rv_info)
            metadata_dict[sampling_strategy] = metadata
            num_shards_list.append(num_shards)
            shard_num_to_sampling_strategy.extend([sampling_strategy] *
                                                  num_shards)
            shard_num_to_sampling_shard_num.extend(range(num_shards))
            shard_size_list.append(shard_size)
            data_len_list.append(data_len)

        self.rv_info = rv_info_list[0]  # assume all of these are the same
        self.metadata_dict = metadata_dict
        self.num_shards = sum(num_shards_list)  # consider all shards
        self.shard_size_list = shard_size_list
        self.data_len = sum(data_len_list)
        self.shard_num_to_sampling_strategy = shard_num_to_sampling_strategy
        self.shard_num_to_sampling_shard_num = shard_num_to_sampling_shard_num
Example #8
    def _load_annotation(self, ann_path):
        """ Load annotations
        Args:
            ann_path: path(s) to annotations; list or string
        Returns:
            new_anns: loaded and preprocessed annotations
        """
        qid = 0
        new_anns = {}
        vids = []
        if isinstance(ann_path, list):
            # for validation annotation
            for ap in ann_path:
                anno = io_utils.load_json(ap)
                new_anns, qid, vids = self._preprocessing(anno, new_anns, qid, vids)
        else:
            # for train annotation
            anno = io_utils.load_json(ann_path)
            new_anns, qid, vids = self._preprocessing(anno, new_anns, qid, vids)

        return new_anns, list(new_anns.keys()), vids
def process_student_data(problem):
    # TODO: fix this, it's outdated

    rnn_paths = paths.rnn_data_paths(problem, 'train', 'education', 'standard')
    vocab_paths = paths.vocab_paths(problem, 'education')

    if not os.path.isfile(vocab_paths['vocab_path']):
        raise ValueError(
            'Run preprocessing script on rubric samples first to generate vocab file.'
        )

    vocab, char_vocab, anon_vocab, anon_char_vocab = load_vocabs(vocab_paths)
    metadata = io.load_json(rnn_paths['metadata_path'])

    # load training max-lengths
    max_len = metadata['max_len']
    char_max_len = metadata['char_max_len']
    anon_max_len = metadata['anon_max_len']
    anon_char_max_len = metadata['anon_char_max_len']

    programs, anon_programs = raw_student_data(problem)

    feat_programs, program_lengths, raw_programs = featurise_programs_rnn(
        programs, vocab, max_len)
    char_feat_programs, char_program_lengths, _ = featurise_programs_rnn(
        programs, char_vocab, char_max_len, character_level=True)

    anon_feat_programs, anon_program_lengths, anon_raw_programs = featurise_programs_rnn(
        anon_programs, anon_vocab, anon_max_len)
    anon_char_feat_programs, anon_char_program_lengths, _ = featurise_programs_rnn(
        anon_programs,
        anon_char_vocab,
        anon_char_max_len,
        character_level=True)

    program_mats = dict(programs=feat_programs, lengths=program_lengths)
    char_program_mats = dict(programs=char_feat_programs,
                             lengths=char_program_lengths)
    anon_program_mats = dict(programs=anon_feat_programs,
                             lengths=anon_program_lengths)
    anon_char_program_mats = dict(programs=anon_char_feat_programs,
                                  lengths=anon_char_program_lengths)

    io.save_pickle(raw_programs, rnn_paths['raw_student_programs_path'])
    io.savemat(char_program_mats, rnn_paths['student_char_programs_path'])
    io.savemat(program_mats, rnn_paths['student_programs_path'])

    io.save_pickle(anon_raw_programs,
                   rnn_paths['anon_raw_student_programs_path'])
    io.savemat(anon_char_program_mats,
               rnn_paths['anon_student_char_programs_path'])
    io.savemat(anon_program_mats, rnn_paths['anon_student_programs_path'])
    def _load_metadata(self):
        '''
            Loads all housekeeping data
        '''
        rnn_paths = paths.rnn_data_paths(self.problem, 'train', 'education', 'standard')
        vocab_paths = paths.vocab_paths(self.problem, 'education')

        for _, path in rnn_paths.items():
            if not os.path.exists(path) and not os.path.exists(path.format(0)):
                if 'student' not in path:
                    raise RuntimeError("Data path does not exist: [{}]. Generate using preprocessing script".format(path))

        metadata = io.load_json(rnn_paths['metadata_path'])
        self.max_len = metadata['max_len'] 
        self.char_max_len = metadata['char_max_len'] 
        self.anon_max_len = metadata['anon_max_len'] 
        self.anon_char_max_len = metadata['anon_char_max_len']

        self.vocab = io.load_json(vocab_paths['vocab_path'])
        self.w2i, self.i2w = self.vocab['w2i'], self.vocab['i2w']

        self.char_vocab = io.load_json(vocab_paths['char_vocab_path'])
        self.char_w2i, self.char_i2w = self.char_vocab['w2i'], self.char_vocab['i2w']

        assert self.char_w2i[PAD_TOKEN] == self.w2i[PAD_TOKEN]
        assert self.char_w2i[START_TOKEN] == self.w2i[START_TOKEN]
        assert self.char_w2i[END_TOKEN] == self.w2i[END_TOKEN]
        assert self.char_w2i[UNK_TOKEN] == self.w2i[UNK_TOKEN]

        self.anon_vocab = io.load_json(vocab_paths['anon_vocab_path'])
        self.anon_w2i, self.anon_i2w = self.anon_vocab['w2i'], self.anon_vocab['i2w']

        self.anon_char_vocab = io.load_json(vocab_paths['anon_char_vocab_path'])
        self.anon_char_w2i, self.anon_char_i2w = self.anon_char_vocab['w2i'], self.anon_char_vocab['i2w']

        assert self.anon_char_w2i[PAD_TOKEN] == self.anon_w2i[PAD_TOKEN]
        assert self.anon_char_w2i[START_TOKEN] == self.anon_w2i[START_TOKEN]
        assert self.anon_char_w2i[END_TOKEN] == self.anon_w2i[END_TOKEN]
        assert self.anon_char_w2i[UNK_TOKEN] == self.anon_w2i[UNK_TOKEN]
Example #11
    def __init__(self, config):

        # get config options
        print(json.dumps(config, indent=4))
        self.hdf5_path = utils.get_value_from_dict(config, "encoded_hdf5_path", \
                "data/CLEVR_v1.0/preprocess/encoded_qa/vocab_train_raw/" \
                + "all_questions_use_zero_token/qa_train.h5")
        self.json_path = utils.get_value_from_dict(config, "encoded_json_path", \
                "data/CLEVR_v1.0/preprocess/encoded_qa/vocab_train_raw/" \
                + "all_questions_use_zero_token/qa_train.json")
        self.img_size = utils.get_value_from_dict(config, "img_size", 224)
        self.batch_size = utils.get_value_from_dict(config, "batch_size", 32)
        self.use_img = utils.get_value_from_dict(config, "use_img", False)
        self.use_gpu = utils.get_value_from_dict(config, "use_gpu", True)
        if self.use_img:
            self.img_dir = utils.get_value_from_dict(config, "img_dir",
                                                     "data/CLEVR_v1.0/images")
            self.prepro = trn.Compose([
                trn.Resize(self.img_size),
                trn.CenterCrop(self.img_size),
                trn.ToTensor(),
                trn.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
            ])
        else:
            self.feat_dir = utils.get_value_from_dict(config, "feat_dir",
                                                      "data/CLEVR_v1.0/feats")

        # load hdf5 file including question_labels, question_length,
        # answer_labels
        hdf5_file = io_utils.load_hdf5(self.hdf5_path)
        self.max_time_steps = hdf5_file["question_labels"].shape[1]

        # load json file including wtoi, itow, atoi, itoa, splits, vocab_info,
        # question_ids, image_filenames
        self.json_file = io_utils.load_json(self.json_path)

        # set path of pre-computed assignments
        # NOTE: DEPRECATED
        self.assignment_path = utils.get_value_from_dict(
            config, "assignment_path", "")

        # set path of pre-computed logits of base models
        self.base_logits_path = utils.get_value_from_dict(
            config, "base_logits_path", "")

        self.fetching_answer_option = "simple"

        self.vis_mode = config.get("vis_mode", False)
def make_rnn_data(problem,
                  split,
                  domain='education',
                  sampling_strategy='standard'):
    rnn_paths = paths.rnn_data_paths(problem, split, domain, sampling_strategy)
    vocab_paths = paths.vocab_paths(problem, domain)
    os.makedirs(rnn_paths['data_path'], exist_ok=True)

    (counts_paths, labels_paths, rv_order_paths,
     tiers_paths, anon_mapping_paths, all_rvs_path) = \
         paths.raw_data_paths(problem, split, domain, sampling_strategy)
    n_shards = len(counts_paths)

    # get info that has to be collected across all shards
    max_lens = get_merged_info(counts_paths, labels_paths, rv_order_paths,
                               tiers_paths, anon_mapping_paths)
    vocab, char_vocab, anon_vocab, anon_char_vocab = load_vocabs(vocab_paths)
    max_len, char_max_len, anon_max_len, anon_char_max_len = max_lens

    all_rvs = io.load_json(all_rvs_path)
    rv_info = create_rv_info(all_rvs)
    # save all_rvs into rv_info
    rv_info['values'] = all_rvs

    data_len = 0
    shard_size = 0

    for i in range(n_shards):
        programs_i, anon_programs_i, labels_i, rv_order_i, tiers_i, _ = load_raw_rubric_data(
            counts_paths[i], labels_paths[i], rv_order_paths[i],
            tiers_paths[i], anon_mapping_paths[i])

        # assumes equally sized shards (except smaller remaining last one)
        shard_size = max(shard_size, len(programs_i))
        data_len += len(programs_i)

        feat_labels_i = featurise_labels(labels_i, rv_info, all_rvs)
        feat_rv_order_i, rv_order_lengths_i = featurise_rv_order(
            rv_order_i, rv_info)

        feat_programs_i, program_lengths_i = featurise_programs_rnn(
            programs_i, vocab, max_len)
        anon_feat_programs_i, anon_program_lengths_i = \
            featurise_programs_rnn(anon_programs_i, anon_vocab, anon_max_len)

        char_feat_programs_i, char_program_lengths_i = \
            featurise_programs_rnn(programs_i, char_vocab, char_max_len, character_level=True)

        anon_char_feat_programs_i, anon_char_program_lengths_i = \
            featurise_programs_rnn(anon_programs_i, anon_char_vocab, anon_char_max_len, character_level=True)

        program_mats_i = dict(programs=feat_programs_i,
                              lengths=program_lengths_i,
                              tiers=tiers_i)
        char_program_mats_i = dict(programs=char_feat_programs_i,
                                   lengths=char_program_lengths_i,
                                   tiers=tiers_i)
        anon_program_mats_i = dict(programs=anon_feat_programs_i,
                                   lengths=anon_program_lengths_i,
                                   tiers=tiers_i)
        anon_char_program_mats_i = dict(programs=anon_char_feat_programs_i,
                                        lengths=anon_char_program_lengths_i,
                                        tiers=tiers_i)
        rv_order_mats_i = dict(rv_orders=feat_rv_order_i,
                               lengths=rv_order_lengths_i)

        io.save_pickle(programs_i, rnn_paths['raw_programs_path'].format(i))
        io.savemat(program_mats_i, rnn_paths['feat_programs_path'].format(i))
        io.savemat(char_program_mats_i,
                   rnn_paths['char_feat_programs_path'].format(i))

        # TODO: save raw labels in raw_labels_path
        io.save_np(feat_labels_i, rnn_paths['feat_labels_path'].format(i))
        io.save_pickle(anon_programs_i,
                       rnn_paths['anon_raw_programs_path'].format(i))
        io.savemat(anon_program_mats_i,
                   rnn_paths['anon_feat_programs_path'].format(i))
        io.savemat(anon_char_program_mats_i,
                   rnn_paths['anon_char_feat_programs_path'].format(i))
        io.save_pickle(rv_order_i, rnn_paths['raw_rvOrder_path'].format(i))
        io.savemat(rv_order_mats_i, rnn_paths['feat_rvOrder_path'].format(i))

    io.save_json(rv_info, rnn_paths['rv_info_path'])

    metadata = dict(max_len=max_len,
                    char_max_len=char_max_len,
                    anon_max_len=anon_max_len,
                    anon_char_max_len=anon_char_max_len,
                    data_len=data_len,
                    num_shards=n_shards,
                    shard_size=shard_size)

    io.save_json(metadata, rnn_paths['metadata_path'])
def process_student_data(problem, account_for_counts=False):
    rnn_paths = paths.rnn_data_paths(problem, 'train', 'education', 'standard')
    os.makedirs(rnn_paths['student_data_path'], exist_ok=True)
    vocab_paths = paths.vocab_paths(problem, 'education')

    if not os.path.isfile(vocab_paths['vocab_path']):
        raise ValueError(
            'Run preprocessing script on rubric samples first to generate vocab file.'
        )

    vocab, char_vocab, anon_vocab, anon_char_vocab = load_vocabs(vocab_paths)
    metadata = io.load_json(rnn_paths['metadata_path'])

    # load training max-lengths
    # max_len = metadata['max_len']
    # char_max_len = metadata['char_max_len']
    # anon_max_len = metadata['anon_max_len']
    # anon_char_max_len = metadata['anon_char_max_len']

    # we do not want to load these from the metadata because some programs may be
    # longer than the ones seen in training; instead we recompute the maximum length
    programs, labels, zipfs, anon_programs = raw_student_data(
        problem, account_for_counts)

    # add 2 to include the start and end tokens
    max_len = max(len(x.split()) for x in programs) + 2
    char_max_len = max(len(x) for x in programs) + 2
    anon_max_len = max(len(x.split()) for x in anon_programs) + 2
    anon_char_max_len = max(len(x) for x in anon_programs) + 2

    feat_programs, program_lengths, raw_programs = featurise_programs_rnn(
        programs, vocab, max_len)
    char_feat_programs, char_program_lengths, _ = featurise_programs_rnn(
        programs, char_vocab, char_max_len, character_level=True)

    anon_feat_programs, anon_program_lengths, anon_raw_programs = featurise_programs_rnn(
        anon_programs, anon_vocab, anon_max_len)
    anon_char_feat_programs, anon_char_program_lengths, _ = featurise_programs_rnn(
        anon_programs,
        anon_char_vocab,
        anon_char_max_len,
        character_level=True)

    program_mats = dict(programs=feat_programs, lengths=program_lengths)
    char_program_mats = dict(programs=char_feat_programs,
                             lengths=char_program_lengths)
    anon_program_mats = dict(programs=anon_feat_programs,
                             lengths=anon_program_lengths)
    anon_char_program_mats = dict(programs=anon_char_feat_programs,
                                  lengths=anon_char_program_lengths)

    io.save_pickle(raw_programs, rnn_paths['raw_student_programs_path'])
    io.savemat(char_program_mats, rnn_paths['student_char_programs_path'])
    io.savemat(program_mats, rnn_paths['student_programs_path'])

    io.save_pickle(anon_raw_programs,
                   rnn_paths['anon_raw_student_programs_path'])
    io.savemat(anon_char_program_mats,
               rnn_paths['anon_student_char_programs_path'])
    io.savemat(anon_program_mats, rnn_paths['anon_student_programs_path'])

    io.save_np(labels, rnn_paths['feat_labels_path'])
    io.save_np(zipfs, rnn_paths['feat_zipfs_path'])
        offset_idx = render.index(to_find)
        end_idx = offset_idx + len(to_find)
        render = render[:offset_idx] + format_data[key] + render[end_idx:]
        for rv_set, (idx, n) in rv_data[key]:
            ret_idxs.append((rv_set, (idx + offset_idx, n)))


    if len(rvs) > 0:
        ret_idxs.append((rvs, (0, len(render))))
    return nonterminal, render, ret_idxs

def tagged_data(data):
    ret = []
    for d in tqdm(data):
        p = d[0]
        ret.append(conv_sample(d[1]))
    return ret

if __name__ == "__main__":
    import argparse
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('data_file', type=str, help='which results to process')
    args = arg_parser.parse_args()


    data = load_json(args.data_file)
    conv_data = tagged_data(data)

    save_json(conv_data, 'output.json')
Example #15
def make_scene_graph_data(device,
                          problem,
                          split,
                          sampling_strategy='standard',
                          use_resnet=False):
    data_paths = paths.scene_graph_data_paths(problem, split,
                                              sampling_strategy)
    os.makedirs(data_paths['data_path'], exist_ok=True)

    (counts_paths, labels_paths, images_paths, rv_order_paths,
     tiers_paths, all_rvs_path) = \
         paths.raw_scene_graph_data_paths(problem, split, sampling_strategy)
    n_shards = len(counts_paths)

    all_rvs = io.load_json(all_rvs_path)
    rv_info = create_rv_info(all_rvs)
    # save all_rvs into rv_info
    rv_info['values'] = all_rvs

    data_len = 0
    shard_size = 0

    if use_resnet:
        # load huge model :(
        print('loading deep net for feature extraction...')
        net, expected_input_dim = load_classification_model()
        net = net.to(device)
        image_transforms = load_data_transforms(expected_input_dim)

    for i in range(n_shards):
        scene_graphs_i, images_i, labels_i, rv_order_i, tiers_i, _ = load_raw_scene_graph_data(
            counts_paths[i], labels_paths[i], rv_order_paths[i],
            images_paths[i], tiers_paths[i])

        n_items_i = len(scene_graphs_i)

        if use_resnet:
            feat_images_i = featurise_images(images_i, net, device,
                                             image_transforms)
        else:
            feat_images_i = images_i

        # assumes equally sized shards (except smaller remaining last one)
        shard_size = max(shard_size, n_items_i)
        data_len += n_items_i

        feat_labels_i = featurise_labels(labels_i, rv_info, all_rvs)
        feat_rv_order_i, rv_order_lengths_i = featurise_rv_order(
            rv_order_i, rv_info)

        image_mats_i = dict(images=feat_images_i, tiers=tiers_i)
        rv_order_mats_i = dict(rv_orders=feat_rv_order_i,
                               lengths=rv_order_lengths_i)

        io.savemat(image_mats_i, data_paths['feat_images_path'].format(i))
        io.save_np(feat_labels_i, data_paths['feat_labels_path'].format(i))
        io.save_pickle(rv_order_i, data_paths['raw_rvOrder_path'].format(i))
        io.savemat(rv_order_mats_i, data_paths['feat_rvOrder_path'].format(i))

    io.save_json(rv_info, data_paths['rv_info_path'])

    metadata = dict(data_len=data_len,
                    num_shards=n_shards,
                    shard_size=shard_size)

    io.save_json(metadata, data_paths['metadata_path'])
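load_classification_model and featurise_images are not shown on this page; a minimal sketch of the feature-extraction step (an assumption, written against a generic torchvision-style model) pushes each image through the network with gradients disabled.
# Sketch of featurise_images (assumption: the real helper is not shown here).
# Applies the image transforms, runs the network without gradients, and stacks
# the resulting feature vectors.
import numpy as np
import torch

def featurise_images(images, net, device, image_transforms):
    net.eval()
    feats = []
    with torch.no_grad():
        for img in images:
            x = image_transforms(img).unsqueeze(0).to(device)  # add batch dim
            feats.append(net(x).squeeze(0).cpu().numpy())
    return np.stack(feats)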
Example #16
    def _load_data(self):
        '''
            Loads all shard-independent data
        '''
        rv_info_list, metadata_dict, num_shards_list, shard_size_list, data_len_list = [], {}, [], [], []
        w2i_list, i2w_list = [], []
        char_w2i_list, char_i2w_list = [], []
        anon_w2i_list, anon_i2w_list = [], []
        anon_char_w2i_list, anon_char_i2w_list = [], []
        shard_num_to_sampling_strategy = []
        shard_num_to_sampling_shard_num = []
        max_len_list, char_max_len_list = [], []
        anon_max_len_list, anon_char_max_len_list = [], []

        for sampling_strategy in self.sampling_strategy_list:
            rnn_paths = paths.rnn_data_paths(self.problem, self.split,
                                             self.domain, sampling_strategy)
            vocab_paths = paths.vocab_paths(self.problem, self.domain)
            """
            for _, path in rnn_paths.items():
                if not os.path.exists(path) and not os.path.exists(path.format(0)):
                    if 'student' not in path:
                        raise RuntimeError("Data path does not exist: [{}]. Generate using preprocessing script".format(path))
            """

            # contains w2i, i2w, num_categories for all rvs
            rv_info = io.load_json(rnn_paths['rv_info_path'])
            metadata = io.load_json(rnn_paths['metadata_path'])
            num_shards = metadata['num_shards']
            shard_size = metadata['shard_size']
            data_len = metadata['data_len']
            max_len = metadata['max_len']
            char_max_len = metadata['char_max_len']
            anon_max_len = metadata['anon_max_len']
            anon_char_max_len = metadata['anon_char_max_len']

            vocab = io.load_json(vocab_paths['vocab_path'])
            w2i, i2w = vocab['w2i'], vocab['i2w']

            char_vocab = io.load_json(vocab_paths['char_vocab_path'])
            char_w2i, char_i2w = char_vocab['w2i'], char_vocab['i2w']

            assert char_w2i[PAD_TOKEN] == w2i[PAD_TOKEN]
            assert char_w2i[START_TOKEN] == w2i[START_TOKEN]
            assert char_w2i[END_TOKEN] == w2i[END_TOKEN]
            assert char_w2i[UNK_TOKEN] == w2i[UNK_TOKEN]

            anon_vocab = io.load_json(vocab_paths['anon_vocab_path'])
            anon_w2i, anon_i2w = anon_vocab['w2i'], anon_vocab['i2w']

            anon_char_vocab = io.load_json(vocab_paths['anon_char_vocab_path'])
            anon_char_w2i, anon_char_i2w = anon_char_vocab[
                'w2i'], anon_char_vocab['i2w']

            assert anon_char_w2i[PAD_TOKEN] == anon_w2i[PAD_TOKEN]
            assert anon_char_w2i[START_TOKEN] == anon_w2i[START_TOKEN]
            assert anon_char_w2i[END_TOKEN] == anon_w2i[END_TOKEN]
            assert anon_char_w2i[UNK_TOKEN] == anon_w2i[UNK_TOKEN]

            rv_info_list.append(rv_info)
            metadata_dict[sampling_strategy] = metadata
            num_shards_list.append(num_shards)
            shard_num_to_sampling_strategy.extend([sampling_strategy] *
                                                  num_shards)
            shard_num_to_sampling_shard_num.extend(range(num_shards))
            shard_size_list.append(shard_size)
            data_len_list.append(data_len)
            w2i_list.append(w2i)
            i2w_list.append(i2w)
            char_w2i_list.append(char_w2i)
            char_i2w_list.append(char_i2w)
            anon_w2i_list.append(anon_w2i)
            anon_i2w_list.append(anon_i2w)
            anon_char_w2i_list.append(anon_char_w2i)
            anon_char_i2w_list.append(anon_char_i2w)
            max_len_list.append(max_len)
            char_max_len_list.append(char_max_len)
            anon_max_len_list.append(anon_max_len)
            anon_char_max_len_list.append(anon_char_max_len)

        self.rv_info = rv_info_list[0]  # assume all of these are the same
        self.metadata_dict = metadata_dict
        self.num_shards = sum(num_shards_list)  # consider all shards
        self.shard_size_list = shard_size_list
        self.data_len = sum(data_len_list)
        self.w2i = merge_dicts(*w2i_list)
        self.i2w = merge_dicts(*i2w_list)
        self.vocab = {'w2i': self.w2i, 'i2w': self.i2w}
        self.char_w2i = merge_dicts(*char_w2i_list)
        self.char_i2w = merge_dicts(*char_i2w_list)
        self.char_vocab = {'w2i': self.char_w2i, 'i2w': self.char_i2w}
        self.anon_w2i = merge_dicts(*anon_w2i_list)
        self.anon_i2w = merge_dicts(*anon_i2w_list)
        self.anon_vocab = {'w2i': self.anon_w2i, 'i2w': self.anon_i2w}
        self.anon_char_w2i = merge_dicts(*anon_char_w2i_list)
        self.anon_char_i2w = merge_dicts(*anon_char_i2w_list)
        self.anon_char_vocab = {
            'w2i': self.anon_char_w2i,
            'i2w': self.anon_char_i2w
        }
        self.shard_num_to_sampling_strategy = shard_num_to_sampling_strategy
        self.shard_num_to_sampling_shard_num = shard_num_to_sampling_shard_num
        self.max_len_list = max_len_list
        self.char_max_len_list = char_max_len_list
        self.anon_max_len_list = anon_max_len_list
        self.anon_char_max_len_list = anon_char_max_len_list
        # take max and we will need to pad to this size
        self.max_len = max(max_len_list)
        self.char_max_len = max(char_max_len_list)
        self.anon_max_len = max(anon_max_len_list)
        self.anon_char_max_len = max(anon_char_max_len_list)
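merge_dicts is used above to combine the per-strategy vocabularies but is not shown on this page; a minimal sketch (assumption: later dicts override earlier ones on duplicate keys) is:
# Minimal sketch of merge_dicts (assumption: not shown in these examples).
# Later dictionaries win on duplicate keys, which is harmless when all sampling
# strategies share the same vocabularies.
def merge_dicts(*dicts):
    merged = {}
    for d in dicts:
        merged.update(d)
    return merged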