Example #1
def load_train_dev_test_annotations():
    config_dir = get_config_dir()
    anno_set = P.get_current_parameters()["Environment"].get("annotation_set")
    if anno_set == "7000":
        train_data = load_json(
            os.path.join(config_dir, "train_annotations_7000.json"))
        dev_data = load_json(
            os.path.join(config_dir, "dev_annotations_7000.json"))
        test_data = load_json(
            os.path.join(config_dir, "test_annotations_7000.json"))
        return train_data, dev_data, test_data
    elif anno_set == "6000":
        train_data = load_json(
            os.path.join(config_dir, "train_annotations_6000.json"))
        dev_data = load_json(
            os.path.join(config_dir, "dev_annotations_6000.json"))
        test_data = load_json(
            os.path.join(config_dir, "test_annotations_6000.json"))
        return train_data, dev_data, test_data
    elif anno_set == "4000":
        data = load_json(os.path.join(config_dir, "annotation_results.json"))
        return data["train"], data["dev"], data["test"]
    else:
        raise ValueError(
            f"Unknown annotation set: {anno_set}. Expected one of: 4000, 6000, 7000"
        )
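The "6000" and "7000" branches above differ only in the file-name suffix, so one way to express the same dispatch more compactly is to fold them into a single f-string lookup. This is only a sketch of that design choice, not code from the project; it assumes load_json, get_config_dir, and P behave exactly as in the snippet above.

def load_train_dev_test_annotations():
    config_dir = get_config_dir()
    anno_set = P.get_current_parameters()["Environment"].get("annotation_set")

    if anno_set == "4000":
        # The 4000 set ships as a single file with per-split keys.
        data = load_json(os.path.join(config_dir, "annotation_results.json"))
        return data["train"], data["dev"], data["test"]

    if anno_set in ("6000", "7000"):
        # The newer sets use one file per split, named by the set size.
        return tuple(
            load_json(os.path.join(config_dir, f"{split}_annotations_{anno_set}.json"))
            for split in ("train", "dev", "test"))

    raise ValueError(
        f"Unknown annotation set: {anno_set}. Expected one of: 4000, 6000, 7000")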
Example #2
File: instructions.py Project: dxsun/drif
def load_train_dev_test_annotations():
    config_dir = get_config_dir()
    if not DATA_6000:
        data = load_json(os.path.join(config_dir, "annotation_results.json"))
        return data["train"], data["dev"], data["test"]
    else:
        train_data = load_json(os.path.join(config_dir, "train_annotations_6000.json"))
        dev_data = load_json(os.path.join(config_dir, "dev_annotations_6000.json"))
        test_data = load_json(os.path.join(config_dir, "test_annotations_6000.json"))
        return train_data, dev_data, test_data
Example #3
def get_all_instructions(max_size=0, do_prune_ambiguous=False, full=False):
    #print("max_size:", max_size)

    # If instructions already loaded in memory, return them
    global cache
    global loaded_corpus
    global loaded_size

    if full:
        min_augment_len = 1
    else:
        min_augment_len = P.get_current_parameters()["Setup"].get(
            "min_augment_len", 1)
    max_augment_len = P.get_current_parameters()["Setup"].get("augment_len", 1)

    train_key = f"train-{min_augment_len}-{max_augment_len}"
    dev_key = f"dev-{min_augment_len}-{max_augment_len}"
    test_key = f"test-{min_augment_len}-{max_augment_len}"

    if cache is not None and train_key in cache:  # loaded_size == max_size:
        train_instructions = cache[train_key]
        dev_instructions = cache[dev_key]
        test_instructions = cache[test_key]
        corpus = loaded_corpus

    # Otherwise see if they've been pre-built in tmp files
    else:
        # Cache
        cache_dir = get_instruction_cache_dir()
        corpus_dir = get_config_dir()

        train_file = os.path.join(
            cache_dir, f"train_{min_augment_len}-{max_augment_len}.json")
        dev_file = os.path.join(
            cache_dir, f"dev_{min_augment_len}-{max_augment_len}.json")
        test_file = os.path.join(
            cache_dir, f"test_{min_augment_len}-{max_augment_len}.json")
        corpus_file = os.path.join(corpus_dir, "corpus.json")
        wfreq_file = os.path.join(corpus_dir, "word_freq.json")

        corpus_already_exists = False
        if os.path.isfile(corpus_file):
            with open(corpus_file, "r") as f:
                corpus = list(json.load(f))
                #print("corpus: ", len(corpus))
            corpus_already_exists = True

        # If they have been saved in tmp files, load them
        if os.path.isfile(train_file):
            train_instructions = load_instruction_data_from_json(train_file)
            dev_instructions = load_instruction_data_from_json(dev_file)
            test_instructions = load_instruction_data_from_json(test_file)
            assert corpus_already_exists, "Instruction data exists but corpus is gone!"

        # Otherwise rebuild instruction data from annotations
        else:
            print(
                f"REBUILDING INSTRUCTION DATA FOR SEGMENT LENGTHS: {min_augment_len} to {max_augment_len}!"
            )
            print(f"USING OLD CORPUS: {corpus_already_exists}")
            os.makedirs(cache_dir, exist_ok=True)

            all_instructions, new_corpus = defaultdict(list), set()

            train_an, dev_an, test_an = load_train_dev_test_annotations()

            print("Loaded JSON Data")

            print("Parsing dataset")
            print("    train...")
            train_instructions, new_corpus, word_freq = parse_dataset(
                train_an, new_corpus)
            print("    dev...")
            dev_instructions, new_corpus, _ = parse_dataset(dev_an, new_corpus)
            print("    test...")
            test_instructions, new_corpus, _ = parse_dataset(
                test_an, new_corpus)

            print("Augmenting maybe?")
            train_instructions = augment_dataset(train_instructions,
                                                 merge_len=max_augment_len,
                                                 min_merge_len=min_augment_len)
            dev_instructions = augment_dataset(dev_instructions,
                                               merge_len=max_augment_len,
                                               min_merge_len=min_augment_len)
            test_instructions = augment_dataset(test_instructions,
                                                merge_len=max_augment_len,
                                                min_merge_len=min_augment_len)

            save_json(train_instructions, train_file)
            save_json(dev_instructions, dev_file)
            save_json(test_instructions, test_file)

            if not corpus_already_exists:
                corpus = new_corpus
                save_json(list(corpus), corpus_file)
                save_json(word_freq, wfreq_file)
            else:
                print("Warning! Regenerated pomdp, but kept the old corpus!")

            print("Saved instructions for quicker loading!")

    # Clip datasets to the provided size
    if max_size is not None and max_size > 0:
        num_train = int(math.ceil(max_size * 0.7))
        num_dev = int(math.ceil(max_size * 0.15))
        num_test = int(math.ceil(max_size * 0.15))

        train_instructions = slice_list_tail(train_instructions, num_train)
        dev_instructions = slice_list_tail(dev_instructions, num_dev)
        test_instructions = slice_list_tail(test_instructions, num_test)

    if do_prune_ambiguous:
        train_instructions = prune_ambiguous(train_instructions)
        dev_instructions = prune_ambiguous(dev_instructions)
        test_instructions = prune_ambiguous(test_instructions)

    #print("Corpus: ", len(corpus))
    #print("Loaded: ", len(train_instructions), len(dev_instructions), len(test_instructions))
    if cache is None:
        cache = {}

    cache[train_key] = train_instructions
    cache[dev_key] = dev_instructions
    cache[test_key] = test_instructions
    loaded_corpus = corpus
    loaded_size = max_size

    return train_instructions, dev_instructions, test_instructions, corpus
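A possible call site for the function above, assuming it is imported alongside the Setup parameters it reads ("min_augment_len", "augment_len"); the exact contents of each split depend on parse_dataset and augment_dataset and are not shown here.

# First call: loads the per-length cache files if they exist, otherwise
# rebuilds them from the raw annotations and saves them for next time.
train, dev, test, corpus = get_all_instructions()

# A second call with unchanged Setup parameters is served from the in-memory
# cache keyed by f"train-{min_augment_len}-{max_augment_len}" etc.
train_again, _, _, _ = get_all_instructions()
assert train_again is train  # same cached object, nothing is re-parsed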
Example #4
File: instructions.py Project: dxsun/drif
def get_all_instructions(max_size=0, do_prune_ambiguous=False):
    #print("max_size:", max_size)

    # If instructions already loaded in memory, return them
    global loaded_train_instructions
    global loaded_test_instructions
    global loaded_dev_instructions
    global loaded_corpus
    global loaded_size
    if loaded_train_instructions is not None and loaded_size == max_size:
        train_instructions = loaded_train_instructions
        dev_instructions = loaded_dev_instructions
        test_instructions = loaded_test_instructions
        corpus = loaded_corpus

    # Otherwise see if they've been pre-built in tmp files
    else:
        # Cache
        cache_dir = get_instruction_cache_dir()
        corpus_dir = get_config_dir()

        train_file = os.path.join(cache_dir, "train.json")
        dev_file = os.path.join(cache_dir, "dev.json")
        test_file = os.path.join(cache_dir, "test.json")
        corpus_file = os.path.join(corpus_dir, "corpus.json")
        wfreq_file = os.path.join(corpus_dir, "word_freq.json")

        corpus_already_exists = False
        if os.path.isfile(corpus_file):
            with open(corpus_file, "r") as f:
                corpus = list(json.load(f))
                print("corpus: ", len(corpus))
            corpus_already_exists = True

        # If they have been saved in tmp files, load them
        if os.path.isfile(train_file):
            train_instructions = load_instruction_data_from_json(train_file)
            dev_instructions = load_instruction_data_from_json(dev_file)
            test_instructions = load_instruction_data_from_json(test_file)

        # Otherwise rebuild instruction data from annotations
        else:
            print("REBUILDING INSTRUCTION DATA! CORPUS WILL NOT BE VALID!")
            os.makedirs(cache_dir, exist_ok=True)

            all_instructions, corpus = defaultdict(list), set()

            train_an, dev_an, test_an = load_train_dev_test_annotations()

            print("Loaded JSON Data")

            train_instructions, corpus, word_freq = parse_dataset(train_an, corpus)
            dev_instructions, corpus, _ = parse_dataset(dev_an, corpus)
            test_instructions, corpus, _ = parse_dataset(test_an, corpus)

            #train_instructions = augment_dataset(train_instructions)
            #dev_instructions = augment_dataset(dev_instructions)
            #test_instructions = augment_dataset(test_instructions)

            save_json(train_instructions, train_file)
            save_json(dev_instructions, dev_file)
            save_json(test_instructions, test_file)

            if not corpus_already_exists:
                save_json(list(corpus), corpus_file)
                save_json(word_freq, wfreq_file)
            else:
                print("Warning! Regenerated pomdp, but kept the old corpus!")

            print("Saved instructions for quicker loading!")

    # Clip datasets to the provided size
    if max_size is not None and max_size > 0:
        num_train = int(math.ceil(max_size*0.7))
        num_dev = int(math.ceil(max_size*0.15))
        num_test = int(math.ceil(max_size*0.15))

        train_instructions = slice_list_tail(train_instructions, num_train)
        dev_instructions = slice_list_tail(dev_instructions, num_dev)
        test_instructions = slice_list_tail(test_instructions, num_test)

    if do_prune_ambiguous:
        train_instructions = prune_ambiguous(train_instructions)
        dev_instructions = prune_ambiguous(dev_instructions)
        test_instructions = prune_ambiguous(test_instructions)

    #print("Corpus: ", len(corpus))
    #print("Loaded: ", len(train_instructions), len(dev_instructions), len(test_instructions))
    loaded_train_instructions = train_instructions
    loaded_dev_instructions = dev_instructions
    loaded_test_instructions = test_instructions
    loaded_corpus = corpus
    loaded_size = max_size

    return train_instructions, dev_instructions, test_instructions, corpus
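Unlike Example #3, this older variant keys its in-memory reuse on the requested size rather than on the augmentation lengths, so the cached splits are only returned when max_size matches the previous call. A small sketch of that behaviour, with the call pattern assumed rather than taken from the project:

# Reuses in-memory data only while loaded_size == max_size.
full_train, _, _, _ = get_all_instructions()              # loads, sets loaded_size = 0
small_train, _, _, _ = get_all_instructions(max_size=50)  # size differs: falls back to
                                                          # the cache files (or a rebuild)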