import os
import json
import math
from collections import defaultdict

# Helpers used below (get_config_dir, get_instruction_cache_dir, load_json,
# save_json, load_instruction_data_from_json, parse_dataset, augment_dataset,
# slice_list_tail, prune_ambiguous) and the parameter module P are assumed to
# be defined elsewhere in this package.


def load_train_dev_test_annotations():
    config_dir = get_config_dir()
    anno_set = P.get_current_parameters()["Environment"].get("annotation_set")
    if anno_set == "7000":
        train_data = load_json(os.path.join(config_dir, "train_annotations_7000.json"))
        dev_data = load_json(os.path.join(config_dir, "dev_annotations_7000.json"))
        test_data = load_json(os.path.join(config_dir, "test_annotations_7000.json"))
        return train_data, dev_data, test_data
    elif anno_set == "6000":
        train_data = load_json(os.path.join(config_dir, "train_annotations_6000.json"))
        dev_data = load_json(os.path.join(config_dir, "dev_annotations_6000.json"))
        test_data = load_json(os.path.join(config_dir, "test_annotations_6000.json"))
        return train_data, dev_data, test_data
    elif anno_set == "4000":
        data = load_json(os.path.join(config_dir, "annotation_results.json"))
        return data["train"], data["dev"], data["test"]
    else:
        raise ValueError(
            f"Unknown annotation set: {anno_set}. Expected one of: 4000, 6000, 7000")
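# The loader above assumes small JSON (de)serialization helpers. Their real
# implementations live elsewhere in this repository; a minimal sketch of what
# they might look like is below, for illustration only.

def load_json(path):
    # Read a JSON file and return the decoded object.
    with open(path, "r") as f:
        return json.load(f)


def save_json(obj, path):
    # Serialize obj to path as JSON.
    with open(path, "w") as f:
        json.dump(obj, f, indent=2)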
# A second variant of the same loader appears in this section: instead of the
# "annotation_set" parameter, it selects the 6000-annotation files via a
# module-level DATA_6000 flag.
def load_train_dev_test_annotations():
    config_dir = get_config_dir()
    if not DATA_6000:
        data = load_json(os.path.join(config_dir, "annotation_results.json"))
        return data["train"], data["dev"], data["test"]
    else:
        train_data = load_json(os.path.join(config_dir, "train_annotations_6000.json"))
        dev_data = load_json(os.path.join(config_dir, "dev_annotations_6000.json"))
        test_data = load_json(os.path.join(config_dir, "test_annotations_6000.json"))
        return train_data, dev_data, test_data
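# The two get_all_instructions variants below rely on module-level globals
# that are declared with `global` but never initialized in this section.
# Presumably they are set up near the top of the module; a sketch of
# plausible defaults (an assumption, not shown in the original source):
cache = None                      # newer variant: dict keyed by f"{split}-{min_len}-{max_len}"
loaded_train_instructions = None  # older variant: one cached split per global
loaded_dev_instructions = None
loaded_test_instructions = None
loaded_corpus = None              # shared: corpus from the last load
loaded_size = None                # shared: max_size the cache was built with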
def get_all_instructions(max_size=0, do_prune_ambiguous=False, full=False):
    #print("max_size:", max_size)

    # If instructions are already loaded in memory, return them
    global cache
    global loaded_corpus
    global loaded_size

    if full:
        min_augment_len = 1
    else:
        min_augment_len = P.get_current_parameters()["Setup"].get("min_augment_len", 1)
    max_augment_len = P.get_current_parameters()["Setup"].get("augment_len", 1)

    train_key = f"train-{min_augment_len}-{max_augment_len}"
    dev_key = f"dev-{min_augment_len}-{max_augment_len}"
    test_key = f"test-{min_augment_len}-{max_augment_len}"

    if cache is not None and train_key in cache:
        train_instructions = cache[train_key]
        dev_instructions = cache[dev_key]
        test_instructions = cache[test_key]
        corpus = loaded_corpus
    # Otherwise see if they have been pre-built in tmp files
    else:
        cache_dir = get_instruction_cache_dir()
        corpus_dir = get_config_dir()
        train_file = os.path.join(cache_dir, f"train_{min_augment_len}-{max_augment_len}.json")
        dev_file = os.path.join(cache_dir, f"dev_{min_augment_len}-{max_augment_len}.json")
        test_file = os.path.join(cache_dir, f"test_{min_augment_len}-{max_augment_len}.json")
        corpus_file = os.path.join(corpus_dir, "corpus.json")
        wfreq_file = os.path.join(corpus_dir, "word_freq.json")

        corpus_already_exists = False
        if os.path.isfile(corpus_file):
            with open(corpus_file, "r") as f:
                corpus = list(json.load(f))
            #print("corpus: ", len(corpus))
            corpus_already_exists = True

        # If they have been saved in tmp files, load them
        if os.path.isfile(train_file):
            train_instructions = load_instruction_data_from_json(train_file)
            dev_instructions = load_instruction_data_from_json(dev_file)
            test_instructions = load_instruction_data_from_json(test_file)
            assert corpus_already_exists, "Instruction data exists but corpus is gone!"
        # Otherwise rebuild instruction data from annotations
        else:
            print(f"REBUILDING INSTRUCTION DATA FOR SEGMENT LENGTHS: "
                  f"{min_augment_len} to {max_augment_len}!")
            print(f"USING OLD CORPUS: {corpus_already_exists}")
            os.makedirs(cache_dir, exist_ok=True)
            all_instructions, new_corpus = defaultdict(list), set()
            train_an, dev_an, test_an = load_train_dev_test_annotations()
            print("Loaded JSON Data")

            print("Parsing dataset")
            print("   train...")
            train_instructions, new_corpus, word_freq = parse_dataset(train_an, new_corpus)
            print("   dev...")
            dev_instructions, new_corpus, _ = parse_dataset(dev_an, new_corpus)
            print("   test...")
            test_instructions, new_corpus, _ = parse_dataset(test_an, new_corpus)

            print("Augmenting maybe?")
            train_instructions = augment_dataset(train_instructions,
                                                 merge_len=max_augment_len,
                                                 min_merge_len=min_augment_len)
            dev_instructions = augment_dataset(dev_instructions,
                                               merge_len=max_augment_len,
                                               min_merge_len=min_augment_len)
            test_instructions = augment_dataset(test_instructions,
                                                merge_len=max_augment_len,
                                                min_merge_len=min_augment_len)

            save_json(train_instructions, train_file)
            save_json(dev_instructions, dev_file)
            save_json(test_instructions, test_file)

            if not corpus_already_exists:
                corpus = new_corpus
                save_json(list(corpus), corpus_file)
                save_json(word_freq, wfreq_file)
            else:
                print("Warning! Regenerated pomdp, but kept the old corpus!")
            print("Saved instructions for quicker loading!")

        # Clip datasets to the provided size
        if max_size is not None and max_size > 0:
            num_train = int(math.ceil(max_size * 0.7))
            num_dev = int(math.ceil(max_size * 0.15))
            num_test = int(math.ceil(max_size * 0.15))
            train_instructions = slice_list_tail(train_instructions, num_train)
            dev_instructions = slice_list_tail(dev_instructions, num_dev)
            test_instructions = slice_list_tail(test_instructions, num_test)

        if do_prune_ambiguous:
            train_instructions = prune_ambiguous(train_instructions)
            dev_instructions = prune_ambiguous(dev_instructions)
            test_instructions = prune_ambiguous(test_instructions)

        #print("Corpus: ", len(corpus))
        #print("Loaded: ", len(train_instructions), len(dev_instructions), len(test_instructions))
        if cache is None:
            cache = {}
        cache[train_key] = train_instructions
        cache[dev_key] = dev_instructions
        cache[test_key] = test_instructions
        loaded_corpus = corpus
        loaded_size = max_size

    return train_instructions, dev_instructions, test_instructions, corpus
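# A hedged usage sketch (not part of the original module): how calling code
# might consume get_all_instructions. Note that the 0.7/0.15/0.15 clipping
# above makes max_size an approximate total across all three splits.
if __name__ == "__main__":
    train, dev, test, corpus = get_all_instructions(max_size=0, do_prune_ambiguous=False)
    print(f"Splits: {len(train)} train / {len(dev)} dev / {len(test)} test")
    print(f"Corpus size: {len(corpus)}")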
# An earlier variant of get_all_instructions, kept here for reference: it has
# no augmentation-length parameters, caches a single dataset in per-split
# globals rather than a keyed dict, and leaves augmentation commented out.
def get_all_instructions(max_size=0, do_prune_ambiguous=False):
    #print("max_size:", max_size)

    # If instructions are already loaded in memory, return them
    global loaded_train_instructions
    global loaded_test_instructions
    global loaded_dev_instructions
    global loaded_corpus
    global loaded_size

    if loaded_train_instructions is not None and loaded_size == max_size:
        train_instructions = loaded_train_instructions
        dev_instructions = loaded_dev_instructions
        test_instructions = loaded_test_instructions
        corpus = loaded_corpus
    # Otherwise see if they have been pre-built in tmp files
    else:
        cache_dir = get_instruction_cache_dir()
        corpus_dir = get_config_dir()
        train_file = os.path.join(cache_dir, "train.json")
        dev_file = os.path.join(cache_dir, "dev.json")
        test_file = os.path.join(cache_dir, "test.json")
        corpus_file = os.path.join(corpus_dir, "corpus.json")
        wfreq_file = os.path.join(corpus_dir, "word_freq.json")

        corpus_already_exists = False
        if os.path.isfile(corpus_file):
            with open(corpus_file, "r") as f:
                corpus = list(json.load(f))
            print("corpus: ", len(corpus))
            corpus_already_exists = True

        # If they have been saved in tmp files, load them
        if os.path.isfile(train_file):
            train_instructions = load_instruction_data_from_json(train_file)
            dev_instructions = load_instruction_data_from_json(dev_file)
            test_instructions = load_instruction_data_from_json(test_file)
        # Otherwise rebuild instruction data from annotations
        else:
            print("REBUILDING INSTRUCTION DATA! CORPUS WILL NOT BE VALID!")
            os.makedirs(cache_dir, exist_ok=True)
            all_instructions, corpus = defaultdict(list), set()
            train_an, dev_an, test_an = load_train_dev_test_annotations()
            print("Loaded JSON Data")
            train_instructions, corpus, word_freq = parse_dataset(train_an, corpus)
            dev_instructions, corpus, _ = parse_dataset(dev_an, corpus)
            test_instructions, corpus, _ = parse_dataset(test_an, corpus)
            #train_instructions = augment_dataset(train_instructions)
            #dev_instructions = augment_dataset(dev_instructions)
            #test_instructions = augment_dataset(test_instructions)
            save_json(train_instructions, train_file)
            save_json(dev_instructions, dev_file)
            save_json(test_instructions, test_file)
            if not corpus_already_exists:
                save_json(list(corpus), corpus_file)
                save_json(word_freq, wfreq_file)
            else:
                print("Warning! Regenerated pomdp, but kept the old corpus!")
            print("Saved instructions for quicker loading!")

        # Clip datasets to the provided size
        if max_size is not None and max_size > 0:
            num_train = int(math.ceil(max_size * 0.7))
            num_dev = int(math.ceil(max_size * 0.15))
            num_test = int(math.ceil(max_size * 0.15))
            train_instructions = slice_list_tail(train_instructions, num_train)
            dev_instructions = slice_list_tail(dev_instructions, num_dev)
            test_instructions = slice_list_tail(test_instructions, num_test)

        if do_prune_ambiguous:
            train_instructions = prune_ambiguous(train_instructions)
            dev_instructions = prune_ambiguous(dev_instructions)
            test_instructions = prune_ambiguous(test_instructions)

        #print("Corpus: ", len(corpus))
        #print("Loaded: ", len(train_instructions), len(dev_instructions), len(test_instructions))
        loaded_train_instructions = train_instructions
        loaded_dev_instructions = dev_instructions
        loaded_test_instructions = test_instructions
        loaded_corpus = corpus
        loaded_size = max_size

    return train_instructions, dev_instructions, test_instructions, corpus
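# slice_list_tail is called by both variants above but not defined in this
# section. Assuming the instruction sets behave like sequences, a minimal
# sketch consistent with its call sites (keep the first max_len items) is:
def slice_list_tail(data, max_len):
    # Clip everything past max_len; a no-op when data is already short enough.
    return data[:max_len]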