import json
import random


def load_DSTC2(args, tokenizer, test_flag=False, OOV=False, debugging=False):
    if test_flag:
        test = generate_dataset(
            'data/dialog-bAbI-tasks/dialog-babi-task6tst.txt',
            tokenizer, debugging=debugging)
        return None, None, test
    else:
        train = generate_dataset(
            'data/dialog-bAbI-tasks/dialog-babi-task6trn.txt',
            tokenizer, debugging=debugging)
        dev = generate_dataset(
            'data/dialog-bAbI-tasks/dialog-babi-task6dev.txt',
            tokenizer, debugging=debugging)
        test = generate_dataset(
            'data/dialog-bAbI-tasks/dialog-babi-task6tst.txt',
            tokenizer, debugging=debugging)
        data = {"train": train, "valid": dev, "test": test}
        train_loader, valid_loader, test_loader = get_loader(args, data, tokenizer)
        print(f"Max Len:{test_dataloader(args, train_loader)}")
        print(f"Max Len:{test_dataloader(args, valid_loader)}")
        print(f"Max Len:{test_dataloader(args, test_loader)}")
        return train_loader, valid_loader, test_loader

def load_DIALKG(args, tokenizer, test_flag=False, kb_percentage=0, debugging=False):
    if test_flag:
        test = generate_dataset(
            json.load(open(f'{args.dataset_path}/opendialkg/test.json')),
            tokenizer, debugging)
        return None, None, test, None
    else:
        train = generate_dataset(
            json.load(open(f'{args.dataset_path}/opendialkg/train.json')),
            tokenizer, debugging)
        dev = generate_dataset(
            json.load(open(f'{args.dataset_path}/opendialkg/validation.json')),
            tokenizer, debugging)
        test = generate_dataset(
            json.load(open(f'{args.dataset_path}/opendialkg/test.json')),
            tokenizer, debugging)
        data = {"train": train, "valid": dev, "test": test}
        print('Len train set: ', len(train))
        print('Len dev set: ', len(dev))
        print('Len test set: ', len(test))

        # Augment the training set with generated dialogues when kb_percentage > 0.
        if kb_percentage > 0:
            # Load the generated-dialogue files. NOTE: the seed list here is an
            # assumption; we expect one generated file per random seed.
            gen_dialogues = []
            for random_seed in [0]:
                gen_dialogues += json.load(open(
                    f'{args.dataset_path}/generated_dialogue_bs100_rs{random_seed}.json', 'r'))
            # Sample a fixed-size subset, seeded for reproducibility.
            random.seed(0)
            augment_data = random.sample(gen_dialogues, kb_percentage)
            augment = generate_dataset(augment_data, tokenizer, debugging)
            train += augment  # in-place extend, so data["train"] is updated too
            print('Len Train augmented: ', len(train))

        train_loader, valid_loader, test_loader = get_loader(args, data, tokenizer)
        print(f"Max Len:{test_dataloader(args, train_loader)}")
        print(f"Max Len:{test_dataloader(args, valid_loader)}")
        print(f"Max Len:{test_dataloader(args, test_loader)}")
        return train_loader, valid_loader, test_loader

def load_MWOZ(args, tokenizer, test_flag=False, debugging=False):
    if test_flag:
        test = json.load(open("data/MultiWOZ_2.1/test_data.json"))
        test = generate_dataset(test, tokenizer, debugging=debugging)
        return None, None, test, None
    else:
        train = json.load(open("data/MultiWOZ_2.1/train_data.json"))
        valid = json.load(open("data/MultiWOZ_2.1/valid_data.json"))
        test = json.load(open("data/MultiWOZ_2.1/test_data.json"))
        train = generate_dataset(train, tokenizer, debugging=debugging)
        dev = generate_dataset(valid, tokenizer, debugging=debugging)
        test = generate_dataset(test, tokenizer, debugging=debugging)
        dataset_dict = {"train": train, "valid": dev, "test": test}
        train_loader, valid_loader, test_loader = get_loader(args, dataset_dict, tokenizer)
        print(f"Max Len:{test_dataloader(args, train_loader)}")
        print(f"Max Len:{test_dataloader(args, valid_loader)}")
        print(f"Max Len:{test_dataloader(args, test_loader)}")
        # The test loader is only built for the max-length check above; test
        # data is obtained separately via test_flag=True.
        return train_loader, valid_loader, None

def load_SMD(args, tokenizer, test_flag=False, debugging=False, delex=False):
    if test_flag:
        test = generate_dataset("../../knowledge_embed/smd/SMD/test.txt", tokenizer, debugging)
        return test, None
    else:
        train = generate_dataset("../../knowledge_embed/smd/SMD/train.txt", tokenizer, debugging)
        dev = generate_dataset("../../knowledge_embed/smd/SMD/dev.txt", tokenizer, debugging)
        test = generate_dataset("../../knowledge_embed/smd/SMD/test.txt", tokenizer, debugging)
        smd = {"train": train, "valid": dev, "test": test}
        train_loader, valid_loader, test_loader = get_loader(args, smd, tokenizer)
        print(f"Max Len:{test_dataloader(args, train_loader)}")
        print(f"Max Len:{test_dataloader(args, valid_loader)}")
        print(f"Max Len:{test_dataloader(args, test_loader)}")
        return train_loader, valid_loader, test_loader

def load_BABI(args, tokenizer, test_flag=False, OOV=False, debugging=False, kb_percentage=0):
    if test_flag:
        if OOV:
            test = generate_dataset(
                f'{args.dataset_path}/dialog-babi-task5tst-OOV.txt',
                tokenizer, debugging=debugging)
        else:
            test = generate_dataset(
                f'{args.dataset_path}/dialog-babi-task5tst.txt',
                tokenizer, debugging=debugging)
        return None, None, test
    else:
        train = generate_dataset(
            f'{args.dataset_path}/dialog-babi-task5trn.txt',
            tokenizer, debugging=debugging)
        # Optionally append generated dialogues for knowledge-base augmentation.
        if kb_percentage > 0:
            train += generate_dataset(
                f'{args.dataset_path}/gen-babi5-nk558-nd{kb_percentage}-rs0.txt',
                tokenizer, debugging=debugging)
        dev = generate_dataset(
            f'{args.dataset_path}/dialog-babi-task5dev.txt',
            tokenizer, debugging=debugging)
        test = generate_dataset(
            f'{args.dataset_path}/dialog-babi-task5tst.txt',
            tokenizer, debugging=debugging)
        data = {"train": train, "valid": dev, "test": test}
        train_loader, valid_loader, test_loader = get_loader(args, data, tokenizer)
        print(f"Max Len:{test_dataloader(args, train_loader)}")
        print(f"Max Len:{test_dataloader(args, valid_loader)}")
        print(f"Max Len:{test_dataloader(args, test_loader)}")
        return train_loader, valid_loader, test_loader

def load_MWOZ_SINGLE(args, tokenizer, test_flag=False, debugging=False, kb_percentage=0):
    domains = ["train", "hotel", "attraction", "restaurant", "taxi"]
    if test_flag:
        test = []
        for d in domains:
            test += generate_dataset(
                json.load(open(f"data/MultiWOZ_2.1/test/{d}_single.json")),
                tokenizer, debugging=debugging, domain=d)
        return None, None, test, None
    else:
        train = []
        for d in domains:
            train += generate_dataset(
                json.load(open(f"data/MultiWOZ_2.1/train/{d}_single.json")),
                tokenizer, debugging=debugging, domain=d)
            if d == "taxi" and args.up_sampler:
                # Double the taxi training data to balance the domain distribution.
                train += generate_dataset(
                    json.load(open(f"data/MultiWOZ_2.1/train/{d}_single.json")),
                    tokenizer, debugging=debugging, domain=d)
            if kb_percentage > 0 and d != "taxi":
                train += generate_dataset(
                    json.load(open(f"data/MultiWOZ_2.1/train/{d}_augmented_{kb_percentage}_single.json")),
                    tokenizer, debugging=debugging, domain=d)
                if args.up_sampler and d in ("attraction", "hotel"):
                    # Triple the attraction and hotel augmented data to keep the
                    # domain distribution balanced.
                    train += generate_dataset(
                        json.load(open(f"data/MultiWOZ_2.1/train/{d}_augmented_{kb_percentage}_single.json")),
                        tokenizer, debugging=debugging, domain=d)
                    train += generate_dataset(
                        json.load(open(f"data/MultiWOZ_2.1/train/{d}_augmented_{kb_percentage}_single.json")),
                        tokenizer, debugging=debugging, domain=d)
        valid = []
        for d in domains:
            valid += generate_dataset(
                json.load(open(f"data/MultiWOZ_2.1/valid/{d}_single.json")),
                tokenizer, debugging=debugging, domain=d)
        test = []
        for d in domains:
            test += generate_dataset(
                json.load(open(f"data/MultiWOZ_2.1/test/{d}_single.json")),
                tokenizer, debugging=debugging, domain=d)
        dataset_dict = {"train": train, "valid": valid, "test": test}
        train_loader, valid_loader, test_loader = get_loader(args, dataset_dict, tokenizer)
        print(f"Max Len:{test_dataloader(args, train_loader)}")
        print(f"Max Len:{test_dataloader(args, valid_loader)}")
        print(f"Max Len:{test_dataloader(args, test_loader)}")
        return train_loader, valid_loader, None

def get_training_file_for_KB(args, indx, tokenizer):
    # Build a loader over a single SMD test dialogue for per-dialogue
    # fine-tuning; the same data is reused for the train/valid/test splits.
    train_kb = generate_dataset_FINETUNE(f"data/SMD/test/dialog_{indx}.txt", tokenizer)
    data = {"train": train_kb, "valid": train_kb, "test": train_kb}
    train_loader, _, _ = get_loader(args, data, tokenizer)
    return train_loader
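
# Usage sketch (illustrative only): the loaders expect an argparse-style `args`
# exposing the fields read above (e.g. `dataset_path`, `up_sampler`, plus
# whatever `get_loader` consumes) and a GPT-2 style tokenizer. The tokenizer
# class and checkpoint below are assumptions, not requirements of this module.
#
#     from transformers import GPT2Tokenizer
#     tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
#     train_loader, valid_loader, test_loader = load_SMD(args, tokenizer)
#     _, _, test = load_BABI(args, tokenizer, test_flag=True, OOV=True)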