def __init__(self, data_name):
    """Set up the slot dictionary, delexicalised dialogues, DB and buffers.

    Args:
        data_name: Stored unchanged on the instance as ``self.data_name``.
    """
    self.data_name = data_name
    self.slot_dict = delex.prepareSlotValuesIndependent()
    # Load inside a context manager so the file handle is closed
    # deterministically instead of being left to the garbage collector.
    with open('data/multi-woz/delex.json') as delex_file:
        self.delex_dialogues = json.load(delex_file)
    self.db = MultiWozDB()
    # Start empty; nothing in this constructor fills them.
    self.labels = list()
    self.hyps = list()
def main():
    """Split the dialogues into train/val/test JSON files.

    Reads ``data.json`` and ``dialogue_acts.json`` from the current
    directory, converts every dialogue log with ``print_data`` and writes the
    results to ``../data/train.json``, ``../data/val.json`` and
    ``../data/test.json``.  A dialogue goes to the test or validation split
    when its key appears in ``testListFile.json`` / ``valListFile.json``
    (one key per line) and to the training split otherwise.
    """
    with open('data.json') as f:
        whole_data = json.load(f)
    with open('dialogue_acts.json') as f:
        whole_act_data = json.load(f)

    dic = delexicalize.prepareSlotValuesIndependent()

    # Sets give O(1) membership tests in the loop below.  rstrip('\n')
    # (instead of dropping the last character unconditionally) keeps the
    # final entry intact when the file does not end with a newline.
    with open('testListFile.json') as fin:
        test_names = {line.rstrip('\n') for line in fin}
    with open('valListFile.json') as fin:
        val_names = {line.rstrip('\n') for line in fin}

    with open('../data/train.json', 'w') as f_train, \
            open('../data/val.json', 'w') as f_val, \
            open('../data/test.json', 'w') as f_test:
        train_turns = []
        val_turns = []
        test_turns = []
        total = len(whole_data)

        for num, k in enumerate(whole_data, 1):
            data = whole_data[k]['log']
            # The act annotations are keyed by the part of the dialogue key
            # before the first dot.
            turn = k.split('.')[0]
            act_data = whole_act_data[turn]
            record = {
                "file": turn,
                "info": print_data(data, act_data, dic),
            }

            if k in test_names:
                test_turns.append(record)
            elif k in val_names:
                val_turns.append(record)
            else:
                train_turns.append(record)

            print("Finished {}/{}".format(num, total))

        json.dump(train_turns, f_train, indent=2)
        json.dump(val_turns, f_val, indent=2)
        json.dump(test_turns, f_test, indent=2)
def main():
    """Rebuild the train/val/test JSON files, reusing previously saved acts.

    Reads ``data.json`` and ``dialogue_acts.json`` from the current
    directory, converts every dialogue log with ``print_data`` and then
    overwrites the ``act`` field of each produced item with the value stored
    at the same position in the backup files (``../data/*.bak.json``).  The
    results are written to ``../data/train.json``, ``../data/val.json`` and
    ``../data/test.json``; a dialogue goes to the test or validation split
    when its key appears in ``testListFile.json`` / ``valListFile.json``
    (one key per line) and to the training split otherwise.

    Raises:
        KeyError: If a dialogue has no entry in the act data or the backups.
        IndexError: If a backup entry has fewer items than ``print_data``
            returned for the same dialogue.
    """
    with open('data.json') as f:
        whole_data = json.load(f)
    with open('dialogue_acts.json') as f:
        whole_act_data = json.load(f)

    # Map file name -> previously generated info list.  The backups are read
    # in train, val, test order, so a later file wins on duplicate names.
    previous_data = {}
    backup_paths = (
        '../data/train.bak.json',
        '../data/val.bak.json',
        '../data/test.bak.json',
    )
    for backup_path in backup_paths:
        with open(backup_path) as f:
            for entry in json.load(f):
                previous_data[entry['file']] = entry['info']

    dic = delexicalize.prepareSlotValuesIndependent()

    # Sets give O(1) membership tests in the loop below.  rstrip('\n')
    # (instead of dropping the last character unconditionally) keeps the
    # final entry intact when the file does not end with a newline.
    with open('testListFile.json') as fin:
        test_names = {line.rstrip('\n') for line in fin}
    with open('valListFile.json') as fin:
        val_names = {line.rstrip('\n') for line in fin}

    with open('../data/train.json', 'w') as f_train, \
            open('../data/val.json', 'w') as f_val, \
            open('../data/test.json', 'w') as f_test:
        train_turns = []
        val_turns = []
        test_turns = []
        total = len(whole_data)

        for num, k in enumerate(whole_data, 1):
            data = whole_data[k]['log']
            # Act annotations and backups are keyed by the part of the
            # dialogue key before the first dot.
            turn = k.split('.')[0]
            act_data = whole_act_data[turn]
            ps = previous_data[turn]

            infos = print_data(data, act_data, dic)
            # Index into the backup (rather than zip) so that a backup that
            # is too short still fails loudly with IndexError.
            for j, info in enumerate(infos):
                info['act'] = ps[j]['act']

            record = {"file": turn, "info": infos}
            if k in test_names:
                test_turns.append(record)
            elif k in val_names:
                val_turns.append(record)
            else:
                train_turns.append(record)

            # The carriage return keeps the progress counter on one line.
            sys.stdout.write("Finished {}/{} \r".format(num, total))

        json.dump(train_turns, f_train, indent=2)
        json.dump(val_turns, f_val, indent=2)
        json.dump(test_turns, f_test, indent=2)
def __init__(self, data_name):
    """Set up the slot dictionary, delexicalised dialogues, DB and buffers.

    Args:
        data_name: Stored unchanged on the instance as ``self.data_name``.
    """
    self.data_name = data_name
    self.slot_dict = delex.prepareSlotValuesIndependent()
    # ``open`` replaces the Python 2-only ``file`` builtin (a NameError on
    # Python 3); the context manager closes the handle deterministically.
    with open('data/multi-woz/delex.json') as delex_file:
        self.delex_dialogues = json.load(delex_file)
    self.db = MultiWozDB()
    # Start empty; nothing in this constructor fills them.
    self.labels = list()
    self.hyps = list()
def createDelexData():
    """Build and save the delexicalised dialogues.

    Loads the delexicalisation dictionary, then goes through every dialogue
    in ``data/woz2/data.json`` whose name contains ``'WOZ'`` and does:
    1) data normalization
    2) delexicalization
    3) addition of database pointer
    4) saves the delexicalized data to ``data/delex.json``

    Returns:
        dict: Maps each processed dialogue name to its dialogue, whose turn
        texts have been replaced in place by the delexicalised sentences.
    """
    # create dictionary of delexicalised values that we will then search
    # against, order matters here!
    dic = delexicalize.prepareSlotValuesIndependent()
    delex_data = {}

    with open('data/woz2/data.json') as fin1:
        data = json.load(fin1)

    # Compiled once for all turns; raw string avoids the invalid-escape
    # warning that a plain '\d+' literal triggers on recent Python versions.
    digitpat = re.compile(r'\d+')

    for dialogue_name in tqdm(data):
        if 'WOZ' not in dialogue_name:
            continue

        dialogue = data[dialogue_name]

        for idx, turn in enumerate(dialogue['log']):
            # normalization, split and delexicalization of the sentence;
            # the split/join round trip collapses runs of whitespace
            sent = normalize(turn['text'])
            words = sent.split()
            sent = delexicalize.delexicalise(' '.join(words), dic)

            # any remaining run of digits becomes a generic count token
            sent = digitpat.sub('[value_count]', sent)

            # delexicalized sentence added to the dialogue
            dialogue['log'][idx]['text'] = sent

            if idx % 2 == 1:  # if it's a system turn
                # add database pointer; it is stored on the preceding turn
                pointer_vector = addDBPointer(turn)
                dialogue['log'][idx - 1]['db_pointer'] = pointer_vector.tolist()

        delex_data[dialogue_name] = dialogue

    with open('data/delex.json', 'w') as outfile:
        json.dump(delex_data, outfile)

    return delex_data
def createDelexData():
    """Build and save the delexicalised dialogues.

    Calls ``loadData()``, loads the delexicalisation dictionary, then goes
    through every dialogue in ``data/multi-woz/data.json`` and does:
    1) data normalization
    2) delexicalization (including reference numbers)
    3) addition of database and booking pointers
    4) saves the delexicalized data to ``data/multi-woz/delex.json``

    Returns:
        dict: Maps each dialogue name to its processed dialogue.
    """
    # download the data
    loadData()

    # create dictionary of delexicalised values that we will then search
    # against, order matters here!
    dic = delexicalize.prepareSlotValuesIndependent()
    delex_data = {}

    with open('data/multi-woz/data.json') as fin1:
        data = json.load(fin1)
    with open('data/multi-woz/dialogue_acts.json') as fin2:
        data2 = json.load(fin2)

    # Compiled once for all turns; raw string avoids the invalid-escape
    # warning that a plain '\d+' literal triggers on recent Python versions.
    digitpat = re.compile(r'\d+')

    for dialogue_name in tqdm(data):
        dialogue = data[dialogue_name]

        # 1-based counter handed to fixDelex alongside the 0-based turn index
        idx_acts = 1

        for idx, turn in enumerate(dialogue['log']):
            # normalization, split and delexicalization of the sentence;
            # the split/join round trip collapses runs of whitespace
            sent = normalize(turn['text'])
            words = sent.split()
            sent = delexicalize.delexicalise(' '.join(words), dic)

            # parsing reference number GIVEN belief state
            sent = delexicaliseReferenceNumber(sent, turn)

            # any remaining run of digits becomes a generic count token
            sent = digitpat.sub('[value_count]', sent)

            # delexicalized sentence added to the dialogue
            dialogue['log'][idx]['text'] = sent

            if idx % 2 == 1:  # if it's a system turn
                # add database pointer
                pointer_vector = addDBPointer(turn)
                # add booking pointer
                pointer_vector = addBookingPointer(dialogue, turn, pointer_vector)

                # the combined pointer is stored on the preceding turn
                dialogue['log'][idx - 1]['db_pointer'] = pointer_vector.tolist()

            # FIXING delexicalization:
            dialogue = fixDelex(dialogue_name, dialogue, data2, idx, idx_acts)
            idx_acts += 1

        delex_data[dialogue_name] = dialogue

    with open('data/multi-woz/delex.json', 'w') as outfile:
        json.dump(delex_data, outfile)

    return delex_data
def sub_func(entry):
    """Run ``print_data`` on one ``(data, act_data, key)`` work item.

    Args:
        entry: A three-item sequence. The first two items are forwarded to
            ``print_data``; the third is unpacked but not used here.

    Returns:
        Whatever ``print_data`` returns for this item.
    """
    dialogue_log, dialogue_acts, _unused_key = entry
    # A fresh slot-value dictionary is built on every call.
    slot_values = delexicalize.prepareSlotValuesIndependent()
    result = print_data(dialogue_log, dialogue_acts, slot_values)
    return result