Exemplo n.º 1
0
 def __init__(self, data_name):
     """Set up the state held by this object.

     Stores ``data_name``, builds the slot dictionary via
     ``delex.prepareSlotValuesIndependent()``, loads the delexicalized
     dialogues from ``data/multi-woz/delex.json``, creates a ``MultiWozDB``
     and initialises the empty ``labels`` / ``hyps`` lists.

     Args:
         data_name: Stored unchanged on ``self.data_name``.
     """
     self.data_name = data_name
     self.slot_dict = delex.prepareSlotValuesIndependent()
     # Use a context manager so the file handle is closed deterministically
     # instead of being left to the garbage collector.
     with open('data/multi-woz/delex.json') as delex_file:
         self.delex_dialogues = json.load(delex_file)
     self.db = MultiWozDB()
     self.labels = list()
     self.hyps = list()
Exemplo n.º 2
0
def main():
    """Split the dialogues in ``data.json`` into train/val/test JSON files.

    Reads ``data.json`` and ``dialogue_acts.json`` from the working directory,
    plus ``testListFile.json`` and ``valListFile.json`` (one dialogue name per
    line). Every dialogue is converted with ``print_data`` and stored as
    ``{"file": <name without extension>, "info": <print_data result>}`` in the
    test, val or train list depending on which list file names it (train is
    the fallback). The three lists are written to ``../data/train.json``,
    ``../data/val.json`` and ``../data/test.json``.
    """
    with open('data.json') as f:
        whole_data = json.load(f)

    with open('dialogue_acts.json') as f:
        whole_act_data = json.load(f)

    dic = delexicalize.prepareSlotValuesIndependent()

    # Strip only the line terminator: slicing with [:-1] would chop a real
    # character off the last name when the file lacks a trailing newline.
    # Sets give O(1) membership tests inside the per-dialogue loop.
    with open('testListFile.json') as fin:
        test_names = {line.rstrip('\n') for line in fin}

    with open('valListFile.json') as fin:
        val_names = {line.rstrip('\n') for line in fin}

    # Output files are opened up front (as before) so an unwritable target
    # fails before the long conversion loop starts.
    with open('../data/train.json', 'w') as f_train, \
            open('../data/val.json', 'w') as f_val, \
            open('../data/test.json', 'w') as f_test:
        train_turns = []
        val_turns = []
        test_turns = []
        total = len(whole_data)

        for num, k in enumerate(whole_data, 1):
            data = whole_data[k]['log']
            # Act annotations are keyed by the dialogue name without extension.
            turn = k.split('.')[0]
            act_data = whole_act_data[turn]

            record = {"file": turn, "info": print_data(data, act_data, dic)}
            if k in test_names:
                test_turns.append(record)
            elif k in val_names:
                val_turns.append(record)
            else:
                train_turns.append(record)

            print("Finished {}/{}".format(num, total))

        json.dump(train_turns, f_train, indent=2)
        json.dump(val_turns, f_val, indent=2)
        json.dump(test_turns, f_test, indent=2)
Exemplo n.º 3
0
def main():
    """Rebuild the train/val/test JSON files, reusing previously stored acts.

    Reads ``data.json`` and ``dialogue_acts.json`` from the working directory
    and the earlier outputs ``../data/{train,val,test}.bak.json``. Every
    dialogue is converted with ``print_data``; the ``'act'`` field of each
    resulting item is then overwritten with the ``'act'`` of the item at the
    same position in the backup for that dialogue. Dialogues are routed to the
    test, val or train list according to ``testListFile.json`` and
    ``valListFile.json`` (one dialogue name per line, train is the fallback)
    and written to ``../data/train.json``, ``../data/val.json`` and
    ``../data/test.json``.

    Raises:
        KeyError: If a dialogue is missing from the act data or the backups.
        IndexError: If a backup entry has fewer items than ``print_data``
            produced for the same dialogue.
    """
    with open('data.json') as f:
        whole_data = json.load(f)

    with open('dialogue_acts.json') as f:
        whole_act_data = json.load(f)

    # Merge the three backup splits into one lookup: file name -> info list.
    previous_data = {}
    for backup_path in ('../data/train.bak.json',
                        '../data/val.bak.json',
                        '../data/test.bak.json'):
        with open(backup_path) as f:
            for entry in json.load(f):
                previous_data[entry['file']] = entry['info']

    dic = delexicalize.prepareSlotValuesIndependent()

    # Strip only the line terminator: slicing with [:-1] would chop a real
    # character off the last name when the file lacks a trailing newline.
    # Sets give O(1) membership tests inside the per-dialogue loop.
    with open('testListFile.json') as fin:
        test_names = {line.rstrip('\n') for line in fin}

    with open('valListFile.json') as fin:
        val_names = {line.rstrip('\n') for line in fin}

    # Output files are opened up front (as before) so an unwritable target
    # fails before the long conversion loop starts.
    with open('../data/train.json', 'w') as f_train, \
            open('../data/val.json', 'w') as f_val, \
            open('../data/test.json', 'w') as f_test:
        train_turns = []
        val_turns = []
        test_turns = []
        total = len(whole_data)

        for num, k in enumerate(whole_data, 1):
            data = whole_data[k]['log']
            # Acts and backups are keyed by the dialogue name without extension.
            turn = k.split('.')[0]
            act_data = whole_act_data[turn]
            ps = previous_data[turn]

            infos = print_data(data, act_data, dic)
            # Index into the backup explicitly (rather than zip) so that a
            # length mismatch still raises instead of being silently ignored.
            for j, info in enumerate(infos):
                info['act'] = ps[j]['act']

            record = {"file": turn, "info": infos}
            if k in test_names:
                test_turns.append(record)
            elif k in val_names:
                val_turns.append(record)
            else:
                train_turns.append(record)

            # Carriage return keeps the progress counter on a single line.
            sys.stdout.write("Finished {}/{} \r".format(num, total))

        json.dump(train_turns, f_train, indent=2)
        json.dump(val_turns, f_val, indent=2)
        json.dump(test_turns, f_test, indent=2)
Exemplo n.º 4
0
    def __init__(self, data_name):
        """Set up the state held by this object.

        Stores ``data_name``, builds the slot dictionary via
        ``delex.prepareSlotValuesIndependent()``, loads the delexicalized
        dialogues from ``data/multi-woz/delex.json``, creates a ``MultiWozDB``
        and initialises the empty ``labels`` / ``hyps`` lists.

        Args:
            data_name: Stored unchanged on ``self.data_name``.
        """
        self.data_name = data_name
        self.slot_dict = delex.prepareSlotValuesIndependent()
        # `file()` exists only in Python 2; `open()` works on both major
        # versions, and the context manager closes the handle right away.
        # Alternative input used previously:
        # /home/bapeng/experiment/multiwoz2.1/MultiWOZ_2.1/data.json
        with open('data/multi-woz/delex.json') as delex_file:
            self.delex_dialogues = json.load(delex_file)

        self.db = MultiWozDB()
        self.labels = list()
        self.hyps = list()
Exemplo n.º 5
0
def createDelexData():
    """Main function of the script - loads delexical dictionary,
    goes through each dialogue and does:
    1) data normalization
    2) delexicalization
    3) addition of database pointer
    4) saves the delexicalized data

    Only dialogues whose name contains ``'WOZ'`` are processed. Input is read
    from ``data/woz2/data.json`` and the result is written to
    ``data/delex.json``.

    Returns:
        dict: Mapping of dialogue name to the delexicalized dialogue.
    """

    # create dictionary of delexicalied values that then we will search against, order matters here!
    dic = delexicalize.prepareSlotValuesIndependent()
    delex_data = {}

    # Context manager so the input handle is closed once parsing is done.
    with open('data/woz2/data.json') as fin1:
        data = json.load(fin1)

    # Compiled once, outside the loops; raw string avoids the invalid
    # escape-sequence warning that '\d' triggers on recent Python versions.
    digitpat = re.compile(r'\d+')

    for dialogue_name in tqdm(data):
        if 'WOZ' not in dialogue_name:
            continue
        dialogue = data[dialogue_name]

        for idx, turn in enumerate(dialogue['log']):
            # normalization, split and delexicalization of the sentence
            sent = normalize(turn['text'])

            # split + join collapses runs of whitespace to single spaces
            words = sent.split()
            sent = delexicalize.delexicalise(' '.join(words), dic)

            # changes to numbers only here
            sent = re.sub(digitpat, '[value_count]', sent)

            # delexicalized sentence added to the dialogue
            dialogue['log'][idx]['text'] = sent

            if idx % 2 == 1:  # if it's a system turn
                # add database pointer; it is stored on the preceding turn
                pointer_vector = addDBPointer(turn)
                dialogue['log'][idx - 1]['db_pointer'] = pointer_vector.tolist()

        delex_data[dialogue_name] = dialogue

    with open('data/delex.json', 'w') as outfile:
        json.dump(delex_data, outfile)

    return delex_data
Exemplo n.º 6
0
def createDelexData():
    """Main function of the script - loads delexical dictionary,
    goes through each dialogue and does:
    1) data normalization
    2) delexicalization
    3) addition of database pointer
    4) saves the delexicalized data

    Input is read from ``data/multi-woz/data.json`` and
    ``data/multi-woz/dialogue_acts.json`` (after calling ``loadData()``), and
    the result is written to ``data/multi-woz/delex.json``.

    Returns:
        dict: Mapping of dialogue name to the delexicalized dialogue.
    """
    # download the data
    loadData()

    # create dictionary of delexicalied values that then we will search against, order matters here!
    dic = delexicalize.prepareSlotValuesIndependent()
    delex_data = {}

    with open('data/multi-woz/data.json') as fin1:
        data = json.load(fin1)

    with open('data/multi-woz/dialogue_acts.json') as fin2:
        data2 = json.load(fin2)

    # Compiled once, outside the loops; raw string avoids the invalid
    # escape-sequence warning that '\d' triggers on recent Python versions.
    digitpat = re.compile(r'\d+')

    for dialogue_name in tqdm(data):
        dialogue = data[dialogue_name]

        for idx, turn in enumerate(dialogue['log']):
            # normalization, split and delexicalization of the sentence
            sent = normalize(turn['text'])

            # split + join collapses runs of whitespace to single spaces
            words = sent.split()
            sent = delexicalize.delexicalise(' '.join(words), dic)

            # parsing reference number GIVEN belief state
            sent = delexicaliseReferenceNumber(sent, turn)

            # changes to numbers only here
            sent = re.sub(digitpat, '[value_count]', sent)

            # delexicalized sentence added to the dialogue
            # (indexed through `dialogue` because fixDelex rebinds it below)
            dialogue['log'][idx]['text'] = sent

            if idx % 2 == 1:  # if it's a system turn
                # add database pointer
                pointer_vector = addDBPointer(turn)
                # add booking pointer
                pointer_vector = addBookingPointer(dialogue, turn,
                                                   pointer_vector)

                # the pointer is stored on the preceding turn
                dialogue['log'][idx - 1]['db_pointer'] = pointer_vector.tolist()

            # FIXING delexicalization:
            # the last argument is the 1-based turn counter (formerly the
            # hand-maintained `idx_acts`, which always equalled idx + 1)
            dialogue = fixDelex(dialogue_name, dialogue, data2, idx, idx + 1)

        delex_data[dialogue_name] = dialogue

    with open('data/multi-woz/delex.json', 'w') as outfile:
        json.dump(delex_data, outfile)

    return delex_data
Exemplo n.º 7
0
def sub_func(entry):
    """Run ``print_data`` on one work item.

    Args:
        entry: A 3-item sequence ``(data, act_data, k)``. The third item is
            unpacked but not used.

    Returns:
        Whatever ``print_data`` returns for the given data and act data, using
        a slot dictionary freshly built on every call.
    """
    log, acts, _unused_key = entry
    slot_values = delexicalize.prepareSlotValuesIndependent()
    result = print_data(log, acts, slot_values)
    return result