# --- Example 1 (示例#1) ---
def load_DSTC2(args, tokenizer, test_flag=False, OOV=False, debugging=False):
    """Load the DSTC2 (dialog-bAbI task 6) dataset and build data loaders.

    Args:
        args: runtime configuration forwarded to ``get_loader``.
        tokenizer: tokenizer forwarded to ``generate_dataset``.
        test_flag: when True, build only the test split and no loaders.
        OOV: accepted for interface parity with ``load_BABI`` but unused
            here (task 6 has no OOV test file).
        debugging: forwarded to ``generate_dataset``.

    Returns:
        ``(None, None, test_dataset)`` when ``test_flag`` is set, otherwise
        ``(train_loader, valid_loader, test_loader)``.
    """
    if test_flag:
        test = generate_dataset(
            'data/dialog-bAbI-tasks/dialog-babi-task6tst.txt',
            tokenizer,
            debugging=debugging)
        return None, None, test

    train = generate_dataset(
        'data/dialog-bAbI-tasks/dialog-babi-task6trn.txt',
        tokenizer,
        debugging=debugging)
    dev = generate_dataset(
        'data/dialog-bAbI-tasks/dialog-babi-task6dev.txt',
        tokenizer,
        debugging=debugging)
    test = generate_dataset(
        'data/dialog-bAbI-tasks/dialog-babi-task6tst.txt',
        tokenizer,
        debugging=debugging)
    # Renamed from ``smd`` (copy-paste leftover from load_SMD) for clarity.
    data = {"train": train, "valid": dev, "test": test}
    train_loader, valid_loader, test_loader = get_loader(
        args, data, tokenizer)
    print(f"Max Len:{test_dataloader(args,train_loader)}")
    print(f"Max Len:{test_dataloader(args,valid_loader)}")
    print(f"Max Len:{test_dataloader(args,test_loader)}")
    return train_loader, valid_loader, test_loader
def load_DIALKG(args,
                tokenizer,
                test_flag=False,
                kb_percentage=0,
                debugging=False):
    """Load the OpenDialKG dataset, optionally augmented with generated dialogues.

    Args:
        args: runtime configuration; ``args.dataset_path`` is the data root.
        tokenizer: tokenizer forwarded to ``generate_dataset``.
        test_flag: when True, build only the test split and no loaders.
        kb_percentage: number of generated dialogues to sample into the
            training set (0 disables augmentation).
        debugging: forwarded to ``generate_dataset``.

    Returns:
        ``(None, None, test_dataset, None)`` when ``test_flag`` is set,
        otherwise ``(train_loader, valid_loader, test_loader)``.
    """
    if test_flag:
        test = generate_dataset(
            json.load(open(f'{args.dataset_path}/opendialkg/test.json')),
            tokenizer, debugging)
        return None, None, test, None

    train = generate_dataset(
        json.load(open(f'{args.dataset_path}/opendialkg/train.json')),
        tokenizer, debugging)
    dev = generate_dataset(
        json.load(open(f'{args.dataset_path}/opendialkg/validation.json')),
        tokenizer, debugging)
    test = generate_dataset(
        json.load(open(f'{args.dataset_path}/opendialkg/test.json')),
        tokenizer, debugging)
    # NOTE: ``data["train"]`` aliases ``train``, so the in-place ``train +=``
    # augmentation below is visible through this dict as well.
    data = {"train": train, "valid": dev, "test": test}

    print('Len train set: ', len(train))
    print('Len dev set: ', len(dev))
    print('Len test set: ', len(test))

    # Augment Knowledge based on number of iteration in kb_percentage
    if kb_percentage > 0:
        import glob  # local import: only needed for augmentation

        # BUGFIX: the original loop iterated over an undefined name
        # ``gen_dialogue_files`` and opened a path containing an undefined
        # ``random_seed`` (NameError whenever kb_percentage > 0). Collect
        # every generated-dialogue dump (one file per random seed) instead;
        # sorted() keeps the sampling pool order deterministic.
        gen_dialogues = []
        pattern = f'{args.dataset_path}/generated_dialogue_bs100_rs*.json'
        for gen_dialogue_file in sorted(glob.glob(pattern)):
            gen_dialogues += json.load(open(gen_dialogue_file, 'r'))

        random.seed(0)  # fixed seed: reproducible augmentation sample
        augment_data = random.sample(gen_dialogues, kb_percentage)
        augment = generate_dataset(augment_data, tokenizer, debugging)

        train += augment

    print('Len Train augmented: ', len(train))

    train_loader, valid_loader, test_loader = get_loader(
        args, data, tokenizer)
    print(f"Max Len:{test_dataloader(args,train_loader)}")
    print(f"Max Len:{test_dataloader(args,valid_loader)}")
    print(f"Max Len:{test_dataloader(args,test_loader)}")
    return train_loader, valid_loader, test_loader
# --- Example 3 (示例#3) ---
def load_MWOZ(args, tokenizer, test_flag=False, debugging=False):
    """Load MultiWOZ 2.1 and build data loaders.

    Args:
        args: runtime configuration forwarded to ``get_loader``.
        tokenizer: tokenizer forwarded to ``generate_dataset``.
        test_flag: when True, build only the test split and no loaders.
        debugging: forwarded to ``generate_dataset``.

    Returns:
        ``(None, None, test_dataset, None)`` when ``test_flag`` is set,
        otherwise ``(train_loader, valid_loader, None)``.
    """
    # Only read the files the chosen branch actually needs; the original
    # loaded all three JSON files even in test-only mode.
    if test_flag:
        test = json.load(open("data/MultiWOZ_2.1/test_data.json"))
        test = generate_dataset(test, tokenizer, debugging=debugging)
        return None, None, test, None

    train = json.load(open("data/MultiWOZ_2.1/train_data.json"))
    valid = json.load(open("data/MultiWOZ_2.1/valid_data.json"))
    test = json.load(open("data/MultiWOZ_2.1/test_data.json"))
    train = generate_dataset(train, tokenizer, debugging=debugging)
    dev = generate_dataset(valid, tokenizer, debugging=debugging)
    test = generate_dataset(test, tokenizer, debugging=debugging)
    dataset_dict = {"train": train, "valid": dev, "test": test}
    train_loader, valid_loader, test_loader = get_loader(args, dataset_dict, tokenizer)
    print(f"Max Len:{test_dataloader(args,train_loader)}")
    print(f"Max Len:{test_dataloader(args,valid_loader)}")
    print(f"Max Len:{test_dataloader(args,test_loader)}")
    # NOTE(review): returns None in place of test_loader, unlike load_SMD /
    # load_BABI — kept as-is for caller compatibility; confirm intent.
    return train_loader, valid_loader, None
# --- Example 4 (示例#4) ---
def load_SMD(args, tokenizer, test_flag=False, debugging=False, delex=False):
    """Load the SMD (KVR) dataset from preprocessed text files.

    Returns ``(test_dataset, None)`` when ``test_flag`` is set, otherwise
    ``(train_loader, valid_loader, test_loader)``.
    """
    base = "../../knowledge_embed/smd/SMD"
    if test_flag:
        # Test-only mode: build the test split without any loaders.
        return generate_dataset(f"{base}/test.txt", tokenizer, debugging), None

    smd = {
        "train": generate_dataset(f"{base}/train.txt", tokenizer, debugging),
        "valid": generate_dataset(f"{base}/dev.txt", tokenizer, debugging),
        "test": generate_dataset(f"{base}/test.txt", tokenizer, debugging),
    }
    train_loader, valid_loader, test_loader = get_loader(args, smd, tokenizer)
    for loader in (train_loader, valid_loader, test_loader):
        print(f"Max Len:{test_dataloader(args,loader)}")
    return train_loader, valid_loader, test_loader
# --- Example 5 (示例#5) ---
def load_BABI(args,
              tokenizer,
              test_flag=False,
              OOV=False,
              debugging=False,
              kb_percentage=0):
    """Load the dialog-bAbI task 5 dataset and build data loaders.

    ``OOV`` selects the out-of-vocabulary test file in test-only mode;
    ``kb_percentage > 0`` appends generated training dialogues.

    Returns ``(None, None, test_dataset)`` when ``test_flag`` is set,
    otherwise ``(train_loader, valid_loader, test_loader)``.
    """
    root = args.dataset_path
    if test_flag:
        # Pick the OOV variant of the test file when requested.
        test_file = (f'{root}/dialog-babi-task5tst-OOV.txt'
                     if OOV else f'{root}/dialog-babi-task5tst.txt')
        test = generate_dataset(test_file, tokenizer, debugging=debugging)
        return None, None, test

    train = generate_dataset(f'{root}/dialog-babi-task5trn.txt',
                             tokenizer,
                             debugging=debugging)
    if kb_percentage > 0:
        # Append generated dialogues to the training set.
        train += generate_dataset(
            f'{root}/gen-babi5-nk558-nd{kb_percentage}-rs0.txt',
            tokenizer,
            debugging=debugging)
    dev = generate_dataset(f'{root}/dialog-babi-task5dev.txt',
                           tokenizer,
                           debugging=debugging)
    test = generate_dataset(f'{root}/dialog-babi-task5tst.txt',
                            tokenizer,
                            debugging=debugging)
    smd = {"train": train, "valid": dev, "test": test}
    train_loader, valid_loader, test_loader = get_loader(args, smd, tokenizer)
    print(f"Max Len:{test_dataloader(args,train_loader)}")
    print(f"Max Len:{test_dataloader(args,valid_loader)}")
    print(f"Max Len:{test_dataloader(args,test_loader)}")
    return train_loader, valid_loader, test_loader
# --- Example 6 (示例#6) ---
def load_MWOZ_SINGLE(args, tokenizer, test_flag=False, debugging=False, kb_percentage=0):
    """Load single-domain MultiWOZ 2.1 splits and build data loaders.

    Args:
        args: runtime configuration; ``args.up_sampler`` enables domain
            up-sampling of the training set.
        tokenizer: tokenizer forwarded to ``generate_dataset``.
        test_flag: when True, build only the test split and no loaders.
        debugging: forwarded to ``generate_dataset``.
        kb_percentage: selects the ``*_augmented_{kb_percentage}_single.json``
            training files for non-taxi domains (0 disables augmentation).

    Returns:
        ``(None, None, test_dataset, None)`` when ``test_flag`` is set,
        otherwise ``(train_loader, valid_loader, None)``.
    """
    domains = ["train", "hotel", "attraction", "restaurant", "taxi"]

    def _load(split, domain, suffix="single"):
        # Load one domain file of a split and convert it into a dataset.
        path = f"data/MultiWOZ_2.1/{split}/{domain}_{suffix}.json"
        return generate_dataset(json.load(open(path)), tokenizer,
                                debugging=debugging, domain=domain)

    if test_flag:
        test = []
        for d in domains:
            test += _load("test", d)
        return None, None, test, None

    train = []
    for d in domains:
        train += _load("train", d)
        if d == "taxi" and args.up_sampler:
            # Double the taxi training data (up-sample the smallest domain).
            train += _load("train", d)
        if kb_percentage > 0 and d != "taxi":
            aug = f"augmented_{kb_percentage}_single"
            train += _load("train", d, aug)
            if args.up_sampler and d in ("attraction", "hotel"):
                # Triple attraction/hotel augmented data to balance domains.
                train += _load("train", d, aug)
                train += _load("train", d, aug)

    valid = []
    for d in domains:
        valid += _load("valid", d)

    test = []
    for d in domains:
        test += _load("test", d)

    dataset_dict = {"train": train, "valid": valid, "test": test}
    train_loader, valid_loader, test_loader = get_loader(args, dataset_dict, tokenizer)
    print(f"Max Len:{test_dataloader(args,train_loader)}")
    print(f"Max Len:{test_dataloader(args,valid_loader)}")
    print(f"Max Len:{test_dataloader(args,test_loader)}")
    # NOTE(review): returns None in place of test_loader, as in the original —
    # confirm callers expect this.
    return train_loader, valid_loader, None
def get_training_file_for_KB(args, indx, tokenizer):
    """Build a fine-tuning train loader for one SMD test dialogue file."""
    dialogue_path = f"data/SMD/test/dialog_{indx}.txt"
    train_kb = generate_dataset_FINETUNE(dialogue_path, tokenizer)
    # The same split feeds train/valid/test; only the train loader is used.
    splits = {"train": train_kb, "valid": train_kb, "test": train_kb}
    train_loader, _, _ = get_loader(args, splits, tokenizer)
    return train_loader