def load_vocabs(vocab_paths):
    vocab = io.load_json(vocab_paths['vocab_path'])
    char_vocab = io.load_json(vocab_paths['char_vocab_path'])
    anon_vocab = io.load_json(vocab_paths['anon_vocab_path'])
    anon_char_vocab = io.load_json(vocab_paths['anon_char_vocab_path'])
    return vocab, char_vocab, anon_vocab, anon_char_vocab
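# Each vocab JSON above is expected to hold 'w2i'/'i2w' mappings (see
# _load_metadata below, which indexes vocab['w2i'] and vocab['i2w']). A
# minimal sketch of the assumed file layout -- the concrete token strings are
# illustrative only:
#
#   {
#       "w2i": {"<pad>": 0, "<start>": 1, "<end>": 2, "<unk>": 3, ...},
#       "i2w": {"0": "<pad>", "1": "<start>", "2": "<end>", "3": "<unk>", ...}
#   }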
def loadSyntheticData(self):
    cache_file = os.path.join(self.CACHE_DIR, 'lsh_programs.pkl')
    if not self.evict_cache and os.path.isfile(cache_file):
        data = load_json(cache_file)
        prog_items = data['raw_programs']
        anon_progs = data['anon_programs']
    else:
        standard_path = self.sampledDataPath + '/standard/train' + SYNTH_NAME
        uniform_path = self.sampledDataPath + '/uniform/train' + SYNTH_NAME
        tempered_path = self.sampledDataPath + '/tempered/train' + SYNTH_NAME
        standardDict = pickle.load(open(standard_path, "rb"))
        uniformDict = pickle.load(open(uniform_path, "rb"))
        temperedDict = pickle.load(open(tempered_path, "rb"))
        all_dicts = [standardDict, uniformDict, temperedDict]
        # this step is not stable across different runs, so if we cache the
        # forest, this result needs to be cached too
        prog_items = list(standardDict.keys() | uniformDict.keys() | temperedDict.keys())
        anon_progs = [self.multi_dict_get(prog, all_dicts) for prog in prog_items]
        data = dict(raw_programs=prog_items, anon_programs=anon_progs)
        os.makedirs(self.CACHE_DIR, exist_ok=True)
        save_json(data, cache_file)
        # if we don't load the cache here, we should regenerate the forest too
        self.evict_cache = True
    return prog_items, anon_progs
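# NOTE: `multi_dict_get` is used above but not defined in this file. A minimal
# sketch of the assumed behavior -- look the key up in each dict in turn and
# return the first hit:
#
#   def multi_dict_get(self, key, dicts):
#       for d in dicts:
#           if key in d:
#               return d[key]
#       raise KeyError(key)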
def load_exp_data(all_exp_dir):
    params = []
    losses = []
    for exp_name in os.listdir(all_exp_dir):
        try:
            exp_dir = os.path.join(all_exp_dir, exp_name)
            config = load_config(os.path.join(exp_dir, 'config.json'))
            vec = config_to_vec(config)
            vec['exp_name'] = exp_name
            summaries = load_json(
                os.path.join(exp_dir, 'summaries', 'all_scalars.json'))
            k_loss = get_key_for_metric(summaries.keys(), 'validation/loss/loss')
            if k_loss is None:
                print('Metric not found... skipping')
                continue
            # average the last five logged values of the validation loss
            loss = np.average([x[2] for x in summaries[k_loss][-5:]])
            params.append(vec)
            losses.append(loss)
        except FileNotFoundError:
            print('File not found... skipping')
            continue
    return params, losses
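# NOTE: `get_key_for_metric` is not defined here. A plausible sketch, assuming
# it returns the first summary key containing the metric name (and None when
# nothing matches, which the caller above checks for):
#
#   def get_key_for_metric(keys, metric):
#       for k in keys:
#           if metric in k:
#               return k
#       return None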
def LoadVocabulary(self, vocab_path):
    vocab_dict = io_utils.load_json(vocab_path)
    self.word_vocabulary = vocab_dict["vocabulary"]
    self.word_index = vocab_dict["word_index"]
    self.has_vocabulary = True
    # TODO: should do this part?
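# The vocab file consumed above is assumed to look roughly like this (value
# types are illustrative):
#
#   {
#       "vocabulary": ["a", "person", "opens", ...],
#       "word_index": {"a": 0, "person": 1, "opens": 2, ...}
#   }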
def __init__(self, config):
    super(self.__class__, self).__init__(config)

    # get options
    self.S = config.get("num_segment", 128)
    self.split = config.get("split", "train")
    self.data_dir = config.get("data_dir", "data/charades")
    self.feature_type = config.get("feature_type", "I3D")
    self.in_memory = config.get("in_memory", False)
    if self.feature_type == "I3D":
        self.feat_path = config.get(
            "video_feature_path",
            "data/charades/features/i3d_finetuned/{}.npy"
        )
    else:
        raise ValueError("Wrong feature_type")

    # cropping augmentation settings
    self.cropping_augmentation = config.get("cropping_augmentation", False)
    self.cropping_prob = config.get("cropping_prob", 0.5)
    self.cropping_factor = config.get("cropping_factor", 0.5)
    self.no_aug = False

    # get paths for proposals and captions
    paths = self._get_data_path(config)

    # create labels (or load existing ones)
    ann_path = "data/charades/annotations/charades_sta_{}_pos.json".format(self.split)
    aux_ann_path = "data/charades/annotations/Charades_v1_{}.csv".format(self.split)
    self.anns, self.qids, self.vids = self._load_annotation(ann_path, aux_ann_path)
    if not self._exist_data(paths):
        self.generate_labels(config)

    # load features into memory if in_memory is set
    if self.in_memory:
        self.feats = {}
        for vid in tqdm(self.vids, desc="In-Memory: vid_feat"):
            self.feats[vid] = np.load(self.feat_path.format(vid)).squeeze()

        self.s_pos, self.e_pos, self.att_mask = {}, {}, {}
        grd_info = io_utils.load_hdf5(self.paths["grounding_info"], False)
        for k in tqdm(self.qids, desc="In-Memory: grounding"):
            self.s_pos[k] = grd_info["start_pos/" + k][()]
            self.e_pos[k] = grd_info["end_pos/" + k][()]
            self.att_mask[k] = grd_info["att_mask/" + k][()]

        self.query_labels = {}
        query_labels = h5py.File(self.paths["query_labels"], "r")
        for k in tqdm(self.qids, desc="In-Memory: query"):
            self.query_labels[k] = query_labels[k][:]

    # load query information
    query_info = io_utils.load_json(self.paths["query_info"])
    self.wtoi = query_info["wtoi"]
    self.itow = query_info["itow"]
    self.query_lengths = query_info["query_lengths"]

    self.batch_size = config.get("batch_size", 64)
    self.num_instances = len(self.qids)
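# A minimal config for this dataset, assembled from the keys read above
# (values shown are the code's own defaults):
#
#   config = {
#       "num_segment": 128,
#       "split": "train",
#       "data_dir": "data/charades",
#       "feature_type": "I3D",
#       "in_memory": False,
#       "cropping_augmentation": False,
#       "cropping_prob": 0.5,
#       "cropping_factor": 0.5,
#       "batch_size": 64,
#   }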
def __init__(self, config):
    super(self.__class__, self).__init__(config)

    # get options
    self.S = config.get("num_segment", 128)
    self.split = config.get("split", "train")
    self.data_dir = config.get("data_dir", "")
    self.feature_type = config.get("feature_type", "C3D")
    self.in_memory = config.get("in_memory", False)
    self.feat_hdf5 = config.get(
        "video_feature_path",
        "data/ActivityNet/feats/sub_activitynet_v1-3.c3d.hdf5")

    # cropping augmentation settings
    self.cropping_augmentation = config.get("cropping_augmentation", False)
    self.cropping_prob = config.get("cropping_prob", 0.5)
    self.cropping_factor = config.get("cropping_factor", 0.5)
    self.no_aug = False

    # get paths for proposals and captions
    paths = self._get_data_path(config)

    # create labels (or load existing ones)
    ann_path = config.get(
        "annotation_path",
        "data/ActivityNet/captions/annotations/train.json")
    self.anns, self.qids, self.vids = self._load_annotation(ann_path)
    if not self._exist_data(paths):
        self.generate_labels(config)

    # load features into memory if in_memory is set
    if self.in_memory:
        self.feats = {}
        h = io_utils.load_hdf5(self.feat_hdf5, verbose=False)
        for k in tqdm(self.vids, desc="In-Memory: vid_feat"):
            self.feats[k] = h[k]["c3d_features"][:]

        self.s_pos, self.e_pos, self.att_mask = {}, {}, {}
        grd_info = io_utils.load_hdf5(self.paths["grounding_info"], False)
        for k in tqdm(self.qids, desc="In-Memory: grounding"):
            self.s_pos[k] = grd_info["start_pos/" + k][()]
            self.e_pos[k] = grd_info["end_pos/" + k][()]
            self.att_mask[k] = grd_info["att_mask/" + k][()]

        self.query_labels = {}
        query_labels = h5py.File(self.paths["query_labels"], "r")
        for k in tqdm(self.qids, desc="In-Memory: query"):
            self.query_labels[k] = query_labels[k][:]

    # load and prepare json files
    query_info = io_utils.load_json(self.paths["query_info"])
    self.wtoi = query_info["wtoi"]
    self.itow = query_info["itow"]
    self.query_lengths = query_info["query_lengths"]

    self.batch_size = config.get("batch_size", 64)
    self.num_instances = len(self.qids)
def _load_data(self):
    ''' Loads all shard-independent data '''
    rv_info_list, metadata_dict, num_shards_list, shard_size_list, data_len_list = [], {}, [], [], []
    shard_num_to_sampling_strategy = []
    shard_num_to_sampling_shard_num = []
    for sampling_strategy in self.sampling_strategy_list:
        scene_paths = paths.scene_graph_data_paths(self.problem, self.split,
                                                   sampling_strategy)
        for _, path in scene_paths.items():
            if not os.path.exists(path) and not os.path.exists(path.format(0)):
                if 'student' not in path:
                    raise RuntimeError(
                        "Data path does not exist: [{}]. Generate using preprocessing script"
                        .format(path))
        rv_info = io.load_json(scene_paths['rv_info_path'])
        metadata = io.load_json(scene_paths['metadata_path'])
        num_shards = metadata['num_shards']
        shard_size = metadata['shard_size']
        data_len = metadata['data_len']

        rv_info_list.append(rv_info)
        metadata_dict[sampling_strategy] = metadata
        num_shards_list.append(num_shards)
        shard_num_to_sampling_strategy.extend([sampling_strategy] * num_shards)
        shard_num_to_sampling_shard_num.extend(range(num_shards))
        shard_size_list.append(shard_size)
        data_len_list.append(data_len)

    self.rv_info = rv_info_list[0]  # assume all of these are the same
    self.metadata_dict = metadata_dict
    self.num_shards = sum(num_shards_list)  # consider all shards
    self.shard_size_list = shard_size_list
    self.data_len = sum(data_len_list)
    self.shard_num_to_sampling_strategy = shard_num_to_sampling_strategy
    self.shard_num_to_sampling_shard_num = shard_num_to_sampling_shard_num
def _load_annotation(self, ann_path):
    """ Load annotations
    Args:
        ann_path: path(s) to annotations; list or string
    Returns:
        new_anns: loaded and preprocessed annotations
    """
    qid = 0
    new_anns = {}
    vids = []
    if isinstance(ann_path, list):
        # for validation annotations
        for ap in ann_path:
            anno = io_utils.load_json(ap)
            new_anns, qid, vids = self._preprocessing(anno, new_anns, qid, vids)
    else:
        # for train annotations
        anno = io_utils.load_json(ann_path)
        new_anns, qid, vids = self._preprocessing(anno, new_anns, qid, vids)
    return new_anns, list(new_anns.keys()), vids
def process_student_data(problem):
    # TODO: fix this, it's outdated
    rnn_paths = paths.rnn_data_paths(problem, 'train', 'education', 'standard')
    vocab_paths = paths.vocab_paths(problem, 'education')
    if not os.path.isfile(vocab_paths['vocab_path']):
        raise ValueError(
            'Run preprocessing script on rubric samples first to generate vocab file.')
    vocab, char_vocab, anon_vocab, anon_char_vocab = load_vocabs(vocab_paths)
    metadata = io.load_json(rnn_paths['metadata_path'])

    # load training max-lengths
    max_len = metadata['max_len']
    char_max_len = metadata['char_max_len']
    anon_max_len = metadata['anon_max_len']
    anon_char_max_len = metadata['anon_char_max_len']

    programs, anon_programs = raw_student_data(problem)
    feat_programs, program_lengths, raw_programs = featurise_programs_rnn(
        programs, vocab, max_len)
    char_feat_programs, char_program_lengths, _ = featurise_programs_rnn(
        programs, char_vocab, char_max_len, character_level=True)
    anon_feat_programs, anon_program_lengths, anon_raw_programs = featurise_programs_rnn(
        anon_programs, anon_vocab, anon_max_len)
    anon_char_feat_programs, anon_char_program_lengths, _ = featurise_programs_rnn(
        anon_programs, anon_char_vocab, anon_char_max_len,
        character_level=True)

    program_mats = dict(programs=feat_programs, lengths=program_lengths)
    char_program_mats = dict(programs=char_feat_programs,
                             lengths=char_program_lengths)
    anon_program_mats = dict(programs=anon_feat_programs,
                             lengths=anon_program_lengths)
    anon_char_program_mats = dict(programs=anon_char_feat_programs,
                                  lengths=anon_char_program_lengths)

    io.save_pickle(raw_programs, rnn_paths['raw_student_programs_path'])
    io.savemat(char_program_mats, rnn_paths['student_char_programs_path'])
    io.savemat(program_mats, rnn_paths['student_programs_path'])
    io.save_pickle(anon_raw_programs,
                   rnn_paths['anon_raw_student_programs_path'])
    io.savemat(anon_char_program_mats,
               rnn_paths['anon_student_char_programs_path'])
    io.savemat(anon_program_mats, rnn_paths['anon_student_programs_path'])
def _load_metadata(self):
    ''' Loads all housekeeping data '''
    rnn_paths = paths.rnn_data_paths(self.problem, 'train', 'education',
                                     'standard')
    vocab_paths = paths.vocab_paths(self.problem, 'education')
    for _, path in rnn_paths.items():
        if not os.path.exists(path) and not os.path.exists(path.format(0)):
            if 'student' not in path:
                raise RuntimeError(
                    "Data path does not exist: [{}]. Generate using "
                    "preprocessing script".format(path))

    metadata = io.load_json(rnn_paths['metadata_path'])
    self.max_len = metadata['max_len']
    self.char_max_len = metadata['char_max_len']
    self.anon_max_len = metadata['anon_max_len']
    self.anon_char_max_len = metadata['anon_char_max_len']

    self.vocab = io.load_json(vocab_paths['vocab_path'])
    self.w2i, self.i2w = self.vocab['w2i'], self.vocab['i2w']
    self.char_vocab = io.load_json(vocab_paths['char_vocab_path'])
    self.char_w2i, self.char_i2w = self.char_vocab['w2i'], self.char_vocab['i2w']
    assert self.char_w2i[PAD_TOKEN] == self.w2i[PAD_TOKEN]
    assert self.char_w2i[START_TOKEN] == self.w2i[START_TOKEN]
    assert self.char_w2i[END_TOKEN] == self.w2i[END_TOKEN]
    assert self.char_w2i[UNK_TOKEN] == self.w2i[UNK_TOKEN]

    self.anon_vocab = io.load_json(vocab_paths['anon_vocab_path'])
    self.anon_w2i, self.anon_i2w = self.anon_vocab['w2i'], self.anon_vocab['i2w']
    self.anon_char_vocab = io.load_json(vocab_paths['anon_char_vocab_path'])
    self.anon_char_w2i, self.anon_char_i2w = self.anon_char_vocab['w2i'], self.anon_char_vocab['i2w']
    assert self.anon_char_w2i[PAD_TOKEN] == self.anon_w2i[PAD_TOKEN]
    assert self.anon_char_w2i[START_TOKEN] == self.anon_w2i[START_TOKEN]
    assert self.anon_char_w2i[END_TOKEN] == self.anon_w2i[END_TOKEN]
    assert self.anon_char_w2i[UNK_TOKEN] == self.anon_w2i[UNK_TOKEN]
def __init__(self, config):
    # get config options
    print(json.dumps(config, indent=4))
    self.hdf5_path = utils.get_value_from_dict(
        config, "encoded_hdf5_path",
        "data/CLEVR_v1.0/preprocess/encoded_qa/vocab_train_raw/"
        + "all_questions_use_zero_token/qa_train.h5")
    self.json_path = utils.get_value_from_dict(
        config, "encoded_json_path",
        "data/CLEVR_v1.0/preprocess/encoded_qa/vocab_train_raw/"
        + "all_questions_use_zero_token/qa_train.json")
    self.img_size = utils.get_value_from_dict(config, "img_size", 224)
    self.batch_size = utils.get_value_from_dict(config, "batch_size", 32)
    self.use_img = utils.get_value_from_dict(config, "use_img", False)
    self.use_gpu = utils.get_value_from_dict(config, "use_gpu", True)
    if self.use_img:
        self.img_dir = utils.get_value_from_dict(config, "img_dir",
                                                 "data/CLEVR_v1.0/images")
        self.prepro = trn.Compose([
            trn.Resize(self.img_size),
            trn.CenterCrop(self.img_size),
            trn.ToTensor(),
            trn.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ])
    else:
        self.feat_dir = utils.get_value_from_dict(config, "feat_dir",
                                                  "data/CLEVR_v1.0/feats")

    # load hdf5 file including question_labels, question_length,
    # answer_labels
    hdf5_file = io_utils.load_hdf5(self.hdf5_path)
    self.max_time_steps = hdf5_file["question_labels"].shape[1]

    # load json file including wtoi, itow, atoi, itoa, splits, vocab_info,
    # question_ids, image_filenames
    self.json_file = io_utils.load_json(self.json_path)

    # set path of pre-computed assignments
    # NOTE: DEPRECATED
    self.assignment_path = utils.get_value_from_dict(
        config, "assignment_path", "")

    # set path of pre-computed logits of base models
    self.base_logits_path = utils.get_value_from_dict(
        config, "base_logits_path", "")

    self.fetching_answer_option = "simple"
    self.vis_mode = config.get("vis_mode", False)
def make_rnn_data(problem, split, domain='education',
                  sampling_strategy='standard'):
    rnn_paths = paths.rnn_data_paths(problem, split, domain, sampling_strategy)
    vocab_paths = paths.vocab_paths(problem, domain)
    os.makedirs(rnn_paths['data_path'], exist_ok=True)

    (counts_paths, labels_paths, rv_order_paths, tiers_paths,
     anon_mapping_paths, all_rvs_path) = \
        paths.raw_data_paths(problem, split, domain, sampling_strategy)
    n_shards = len(counts_paths)

    # get info that has to be collected across all shards
    max_lens = get_merged_info(counts_paths, labels_paths, rv_order_paths,
                               tiers_paths, anon_mapping_paths)
    vocab, char_vocab, anon_vocab, anon_char_vocab = load_vocabs(vocab_paths)
    max_len, char_max_len, anon_max_len, anon_char_max_len = max_lens

    all_rvs = io.load_json(all_rvs_path)
    rv_info = create_rv_info(all_rvs)
    # save all_rvs into rv_info
    rv_info['values'] = all_rvs

    data_len = 0
    shard_size = 0
    for i in range(n_shards):
        programs_i, anon_programs_i, labels_i, rv_order_i, tiers_i, _ = load_raw_rubric_data(
            counts_paths[i], labels_paths[i], rv_order_paths[i],
            tiers_paths[i], anon_mapping_paths[i])
        # assumes equally sized shards (except a smaller remaining last one)
        shard_size = max(shard_size, len(programs_i))
        data_len += len(programs_i)

        feat_labels_i = featurise_labels(labels_i, rv_info, all_rvs)
        feat_rv_order_i, rv_order_lengths_i = featurise_rv_order(
            rv_order_i, rv_info)
        feat_programs_i, program_lengths_i = featurise_programs_rnn(
            programs_i, vocab, max_len)
        anon_feat_programs_i, anon_program_lengths_i = \
            featurise_programs_rnn(anon_programs_i, anon_vocab, anon_max_len)
        char_feat_programs_i, char_program_lengths_i = \
            featurise_programs_rnn(programs_i, char_vocab, char_max_len,
                                   character_level=True)
        anon_char_feat_programs_i, anon_char_program_lengths_i = \
            featurise_programs_rnn(anon_programs_i, anon_char_vocab,
                                   anon_char_max_len, character_level=True)

        program_mats_i = dict(programs=feat_programs_i,
                              lengths=program_lengths_i,
                              tiers=tiers_i)
        char_program_mats_i = dict(programs=char_feat_programs_i,
                                   lengths=char_program_lengths_i,
                                   tiers=tiers_i)
        anon_program_mats_i = dict(programs=anon_feat_programs_i,
                                   lengths=anon_program_lengths_i,
                                   tiers=tiers_i)
        anon_char_program_mats_i = dict(programs=anon_char_feat_programs_i,
                                        lengths=anon_char_program_lengths_i,
                                        tiers=tiers_i)
        rv_order_mats_i = dict(rv_orders=feat_rv_order_i,
                               lengths=rv_order_lengths_i)

        io.save_pickle(programs_i, rnn_paths['raw_programs_path'].format(i))
        io.savemat(program_mats_i, rnn_paths['feat_programs_path'].format(i))
        io.savemat(char_program_mats_i,
                   rnn_paths['char_feat_programs_path'].format(i))
        # TODO: save raw labels in raw_labels_path
        io.save_np(feat_labels_i, rnn_paths['feat_labels_path'].format(i))
        io.save_pickle(anon_programs_i,
                       rnn_paths['anon_raw_programs_path'].format(i))
        io.savemat(anon_program_mats_i,
                   rnn_paths['anon_feat_programs_path'].format(i))
        io.savemat(anon_char_program_mats_i,
                   rnn_paths['anon_char_feat_programs_path'].format(i))
        io.save_pickle(rv_order_i, rnn_paths['raw_rvOrder_path'].format(i))
        io.savemat(rv_order_mats_i, rnn_paths['feat_rvOrder_path'].format(i))

    io.save_json(rv_info, rnn_paths['rv_info_path'])
    metadata = dict(max_len=max_len,
                    char_max_len=char_max_len,
                    anon_max_len=anon_max_len,
                    anon_char_max_len=anon_char_max_len,
                    data_len=data_len,
                    num_shards=n_shards,
                    shard_size=shard_size)
    io.save_json(metadata, rnn_paths['metadata_path'])
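# A typical driver for the function above, assuming the raw shards already
# exist on disk (the split list is illustrative):
#
#   for split in ('train', 'val', 'test'):
#       make_rnn_data(problem, split, domain='education',
#                     sampling_strategy='standard')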
def process_student_data(problem, account_for_counts=False):
    rnn_paths = paths.rnn_data_paths(problem, 'train', 'education', 'standard')
    os.makedirs(rnn_paths['student_data_path'], exist_ok=True)
    vocab_paths = paths.vocab_paths(problem, 'education')
    if not os.path.isfile(vocab_paths['vocab_path']):
        raise ValueError(
            'Run preprocessing script on rubric samples first to generate vocab file.')
    vocab, char_vocab, anon_vocab, anon_char_vocab = load_vocabs(vocab_paths)
    metadata = io.load_json(rnn_paths['metadata_path'])

    # load training max-lengths
    # max_len = metadata['max_len']
    # char_max_len = metadata['char_max_len']
    # anon_max_len = metadata['anon_max_len']
    # anon_char_max_len = metadata['anon_char_max_len']
    # We do not want to load these from metadata because some student programs
    # may be longer than any seen in training. Instead, recompute the maximum
    # lengths over the student data below.
    programs, labels, zipfs, anon_programs = raw_student_data(
        problem, account_for_counts)
    # +2 to include start and end tokens
    max_len = max(len(x.split()) for x in programs) + 2
    char_max_len = max(len(x) for x in programs) + 2
    anon_max_len = max(len(x.split()) for x in anon_programs) + 2
    anon_char_max_len = max(len(x) for x in anon_programs) + 2

    feat_programs, program_lengths, raw_programs = featurise_programs_rnn(
        programs, vocab, max_len)
    char_feat_programs, char_program_lengths, _ = featurise_programs_rnn(
        programs, char_vocab, char_max_len, character_level=True)
    anon_feat_programs, anon_program_lengths, anon_raw_programs = featurise_programs_rnn(
        anon_programs, anon_vocab, anon_max_len)
    anon_char_feat_programs, anon_char_program_lengths, _ = featurise_programs_rnn(
        anon_programs, anon_char_vocab, anon_char_max_len,
        character_level=True)

    program_mats = dict(programs=feat_programs, lengths=program_lengths)
    char_program_mats = dict(programs=char_feat_programs,
                             lengths=char_program_lengths)
    anon_program_mats = dict(programs=anon_feat_programs,
                             lengths=anon_program_lengths)
    anon_char_program_mats = dict(programs=anon_char_feat_programs,
                                  lengths=anon_char_program_lengths)

    io.save_pickle(raw_programs, rnn_paths['raw_student_programs_path'])
    io.savemat(char_program_mats, rnn_paths['student_char_programs_path'])
    io.savemat(program_mats, rnn_paths['student_programs_path'])
    io.save_pickle(anon_raw_programs,
                   rnn_paths['anon_raw_student_programs_path'])
    io.savemat(anon_char_program_mats,
               rnn_paths['anon_student_char_programs_path'])
    io.savemat(anon_program_mats, rnn_paths['anon_student_programs_path'])
    io.save_np(labels, rnn_paths['feat_labels_path'])
    io.save_np(zipfs, rnn_paths['feat_zipfs_path'])
    offset_idx = render.index(to_find)
    end_idx = offset_idx + len(to_find)
    render = render[:offset_idx] + format_data[key] + render[end_idx:]
    for rv_set, (idx, n) in rv_data[key]:
        ret_idxs.append((rv_set, (idx + offset_idx, n)))
    if len(rvs) > 0:
        ret_idxs.append((rvs, (0, len(render))))
    return nonterminal, render, ret_idxs


def tagged_data(data):
    ret = []
    for d in tqdm(data):
        ret.append(conv_sample(d[1]))
    return ret


if __name__ == "__main__":
    import argparse
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('data_file', type=str,
                            help='which results to process')
    args = arg_parser.parse_args()
    data = load_json(args.data_file)
    conv_data = tagged_data(data)
    save_json(conv_data, 'output.json')
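# Example invocation (the script name is a placeholder); writes the converted
# samples to ./output.json:
#
#   python convert_tagged_data.py results.json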
def make_scene_graph_data(device, problem, split, sampling_strategy='standard',
                          use_resnet=False):
    data_paths = paths.scene_graph_data_paths(problem, split, sampling_strategy)
    os.makedirs(data_paths['data_path'], exist_ok=True)

    (counts_paths, labels_paths, images_paths, rv_order_paths, tiers_paths,
     all_rvs_path) = \
        paths.raw_scene_graph_data_paths(problem, split, sampling_strategy)
    n_shards = len(counts_paths)

    all_rvs = io.load_json(all_rvs_path)
    rv_info = create_rv_info(all_rvs)
    # save all_rvs into rv_info
    rv_info['values'] = all_rvs

    data_len = 0
    shard_size = 0

    if use_resnet:
        # load huge model :(
        print('loading deep net for feature extraction...')
        net, expected_input_dim = load_classification_model()
        net = net.to(device)
        image_transforms = load_data_transforms(expected_input_dim)

    for i in range(n_shards):
        scene_graphs_i, images_i, labels_i, rv_order_i, tiers_i, _ = load_raw_scene_graph_data(
            counts_paths[i], labels_paths[i], rv_order_paths[i],
            images_paths[i], tiers_paths[i])
        n_items_i = len(scene_graphs_i)

        if use_resnet:
            feat_images_i = featurise_images(images_i, net, device,
                                             image_transforms)
        else:
            feat_images_i = images_i

        # assumes equally sized shards (except a smaller remaining last one)
        shard_size = max(shard_size, n_items_i)
        data_len += n_items_i

        feat_labels_i = featurise_labels(labels_i, rv_info, all_rvs)
        feat_rv_order_i, rv_order_lengths_i = featurise_rv_order(
            rv_order_i, rv_info)

        image_mats_i = dict(images=feat_images_i, tiers=tiers_i)
        rv_order_mats_i = dict(rv_orders=feat_rv_order_i,
                               lengths=rv_order_lengths_i)

        io.savemat(image_mats_i, data_paths['feat_images_path'].format(i))
        io.save_np(feat_labels_i, data_paths['feat_labels_path'].format(i))
        io.save_pickle(rv_order_i, data_paths['raw_rvOrder_path'].format(i))
        io.savemat(rv_order_mats_i, data_paths['feat_rvOrder_path'].format(i))

    io.save_json(rv_info, data_paths['rv_info_path'])
    metadata = dict(data_len=data_len, num_shards=n_shards,
                    shard_size=shard_size)
    io.save_json(metadata, data_paths['metadata_path'])
def _load_data(self):
    ''' Loads all shard-independent data '''
    rv_info_list, metadata_dict, num_shards_list, shard_size_list, data_len_list = [], {}, [], [], []
    w2i_list, i2w_list = [], []
    char_w2i_list, char_i2w_list = [], []
    anon_w2i_list, anon_i2w_list = [], []
    anon_char_w2i_list, anon_char_i2w_list = [], []
    shard_num_to_sampling_strategy = []
    shard_num_to_sampling_shard_num = []
    max_len_list, char_max_len_list = [], []
    anon_max_len_list, anon_char_max_len_list = [], []
    for sampling_strategy in self.sampling_strategy_list:
        rnn_paths = paths.rnn_data_paths(self.problem, self.split,
                                         self.domain, sampling_strategy)
        vocab_paths = paths.vocab_paths(self.problem, self.domain)
        """
        for _, path in rnn_paths.items():
            if not os.path.exists(path) and not os.path.exists(path.format(0)):
                if 'student' not in path:
                    raise RuntimeError("Data path does not exist: [{}]. Generate using preprocessing script".format(path))
        """
        # contains w2i, i2w, num_categories for all rvs
        rv_info = io.load_json(rnn_paths['rv_info_path'])
        metadata = io.load_json(rnn_paths['metadata_path'])
        num_shards = metadata['num_shards']
        shard_size = metadata['shard_size']
        data_len = metadata['data_len']
        max_len = metadata['max_len']
        char_max_len = metadata['char_max_len']
        anon_max_len = metadata['anon_max_len']
        anon_char_max_len = metadata['anon_char_max_len']

        vocab = io.load_json(vocab_paths['vocab_path'])
        w2i, i2w = vocab['w2i'], vocab['i2w']
        char_vocab = io.load_json(vocab_paths['char_vocab_path'])
        char_w2i, char_i2w = char_vocab['w2i'], char_vocab['i2w']
        assert char_w2i[PAD_TOKEN] == w2i[PAD_TOKEN]
        assert char_w2i[START_TOKEN] == w2i[START_TOKEN]
        assert char_w2i[END_TOKEN] == w2i[END_TOKEN]
        assert char_w2i[UNK_TOKEN] == w2i[UNK_TOKEN]

        anon_vocab = io.load_json(vocab_paths['anon_vocab_path'])
        anon_w2i, anon_i2w = anon_vocab['w2i'], anon_vocab['i2w']
        anon_char_vocab = io.load_json(vocab_paths['anon_char_vocab_path'])
        anon_char_w2i, anon_char_i2w = anon_char_vocab['w2i'], anon_char_vocab['i2w']
        assert anon_char_w2i[PAD_TOKEN] == anon_w2i[PAD_TOKEN]
        assert anon_char_w2i[START_TOKEN] == anon_w2i[START_TOKEN]
        assert anon_char_w2i[END_TOKEN] == anon_w2i[END_TOKEN]
        assert anon_char_w2i[UNK_TOKEN] == anon_w2i[UNK_TOKEN]

        rv_info_list.append(rv_info)
        metadata_dict[sampling_strategy] = metadata
        num_shards_list.append(num_shards)
        shard_num_to_sampling_strategy.extend([sampling_strategy] * num_shards)
        shard_num_to_sampling_shard_num.extend(range(num_shards))
        shard_size_list.append(shard_size)
        data_len_list.append(data_len)
        w2i_list.append(w2i)
        i2w_list.append(i2w)
        char_w2i_list.append(char_w2i)
        char_i2w_list.append(char_i2w)
        anon_w2i_list.append(anon_w2i)
        anon_i2w_list.append(anon_i2w)
        anon_char_w2i_list.append(anon_char_w2i)
        anon_char_i2w_list.append(anon_char_i2w)
        max_len_list.append(max_len)
        char_max_len_list.append(char_max_len)
        anon_max_len_list.append(anon_max_len)
        anon_char_max_len_list.append(anon_char_max_len)

    self.rv_info = rv_info_list[0]  # assume all of these are the same
    self.metadata_dict = metadata_dict
    self.num_shards = sum(num_shards_list)  # consider all shards
    self.shard_size_list = shard_size_list
    self.data_len = sum(data_len_list)
    self.w2i = merge_dicts(*w2i_list)
    self.i2w = merge_dicts(*i2w_list)
    self.vocab = {'w2i': self.w2i, 'i2w': self.i2w}
    self.char_w2i = merge_dicts(*char_w2i_list)
    self.char_i2w = merge_dicts(*char_i2w_list)
    self.char_vocab = {'w2i': self.char_w2i, 'i2w': self.char_i2w}
    self.anon_w2i = merge_dicts(*anon_w2i_list)
    self.anon_i2w = merge_dicts(*anon_i2w_list)
    self.anon_vocab = {'w2i': self.anon_w2i, 'i2w': self.anon_i2w}
    self.anon_char_w2i = merge_dicts(*anon_char_w2i_list)
    self.anon_char_i2w = merge_dicts(*anon_char_i2w_list)
    self.anon_char_vocab = {'w2i': self.anon_char_w2i,
                            'i2w': self.anon_char_i2w}
    self.shard_num_to_sampling_strategy = shard_num_to_sampling_strategy
    self.shard_num_to_sampling_shard_num = shard_num_to_sampling_shard_num
    self.max_len_list = max_len_list
    self.char_max_len_list = char_max_len_list
    self.anon_max_len_list = anon_max_len_list
    self.anon_char_max_len_list = anon_char_max_len_list
    # take the max; we will need to pad to this size
    self.max_len = max(max_len_list)
    self.char_max_len = max(char_max_len_list)
    self.anon_max_len = max(anon_max_len_list)
    self.anon_char_max_len = max(anon_char_max_len_list)
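# NOTE: `merge_dicts` is used above but not defined in this file. It is
# assumed to be a plain left-to-right merge (later vocabularies win on key
# collisions):
#
#   def merge_dicts(*dicts):
#       merged = {}
#       for d in dicts:
#           merged.update(d)
#       return merged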