Exemplos de normalize em Python, exemplos de simulator.multiwoz_utils.utils.nlp.normalize em Python

Exemplo n.º 1

0

Exibir arquivo

Arquivo: delexicalize.py Projeto: qbetterk/user-simulator

def delexicaliseReferenceNumber(sent, turn):
    """Based on the belief state, we can find reference number that
    during data gathering was created randomly."""
    domains = [
        'restaurant', 'hotel', 'attraction', 'train', 'taxi', 'hospital'
    ]  # , 'police']
    if turn['metadata']:
        for domain in domains:
            if turn['metadata'][domain]['book']['booked']:
                for slot in turn['metadata'][domain]['book']['booked'][0]:
                    if slot == 'reference':
                        val = '[' + domain + '_' + slot + ']'
                    else:
                        val = '[' + domain + '_' + slot + ']'
                    key = normalize(
                        turn['metadata'][domain]['book']['booked'][0][slot])
                    sent = (' ' + sent + ' ').replace(' ' + key + ' ',
                                                      ' ' + val + ' ')

                    # try reference with hashtag
                    key = normalize(
                        "#" +
                        turn['metadata'][domain]['book']['booked'][0][slot])
                    sent = (' ' + sent + ' ').replace(' ' + key + ' ',
                                                      ' ' + val + ' ')

                    # try reference with ref#
                    key = normalize(
                        "ref#" +
                        turn['metadata'][domain]['book']['booked'][0][slot])
                    sent = (' ' + sent + ' ').replace(' ' + key + ' ',
                                                      ' ' + val + ' ')
    return sent

Exemplo n.º 2

0

Exibir arquivo

Arquivo: delexicalize.py Projeto: qbetterk/user-simulator

def delexicalize_one_sent0(sent, dic):
    sent, kv_dic = normalize(sent)

    words = sent.split()
    sent, kv_dic_tmp = delexicalize.delexicalise(' '.join(words), dic)

    kv_dic.update(kv_dic_tmp)
    # parsing reference number GIVEN belief state
    # sent = delexicaliseReferenceNumber(sent, turn)

    # changes to numbers only here
    # digitpat = re.compile('(?<!looking for)(?<=for) \d+ (?!of)|(?<=party of) \d+ | \d+ (?=people|person|of us)')
    # # digitpat = re.compile(' \d+ ')
    # value_count = re.findall(digitpat, sent)
    # while value_count:
    #
    #     index = sent.find(value_count[0])
    #
    #     if not delexicalize.check_balance(sent[:index]):
    #         # pdb.set_trace()
    #         value_count.pop(0)
    #         continue
    #
    #     sent = sent[:index] + \
    #            ' [value_count|' + value_count[0][1:-1] + '] ' + \
    #            sent[index + len(value_count[0]):]
    #     value_count = re.findall(digitpat, sent)

    digitpat = re.compile('\d+')
    digits = re.findall(digitpat, sent)
    if len(digits):
        kv_dic.update({'[value_count]': digits})
    sent = re.sub(digitpat, '[value_count]', sent)

    return sent, kv_dic

Exemplo n.º 3

0

Exibir arquivo

Arquivo: delexicalize.py Projeto: qbetterk/user-simulator

def delexicalize_one_sent(sent):
    def normalize(text):
        def insertSpace(token, text):
            sidx = 0
            while True:
                sidx = text.find(token, sidx)
                if sidx == -1:
                    break
                if sidx + 1 < len(text) and re.match('[0-9]', text[sidx - 1]) and \
                        re.match('[0-9]', text[sidx + 1]):
                    sidx += 1
                    continue
                if text[sidx - 1] != ' ':
                    text = text[:sidx] + ' ' + text[sidx:]
                    sidx += 1
                if sidx + len(token) < len(text) and text[sidx +
                                                          len(token)] != ' ':
                    text = text[:sidx + 1] + ' ' + text[sidx + 1:]
                sidx += 1
            return text

        # lower case every word
        text = text.lower()

        # replace white spaces in front and end
        text = re.sub(r'^\s*|\s*$', '', text)

        # hotel domain pfb30
        text = re.sub(r"b&b", "bed and breakfast", text)
        text = re.sub(r"b and b", "bed and breakfast", text)

        # normalize phone number
        ms = re.findall('\(?(\d{3})\)?[-.\s]?(\d{3})[-.\s]?(\d{4,5})', text)
        if ms:
            sidx = 0
            for m in ms:
                sidx = text.find(m[0], sidx)
                if text[sidx - 1] == '(':
                    sidx -= 1
                eidx = text.find(m[-1], sidx) + len(m[-1])
                text = text.replace(text[sidx:eidx], ''.join(m))

        # normalize postcode
        ms = re.findall(
            '([a-z]{1}[\. ]?[a-z]{1}[\. ]?\d{1,2}[, ]+\d{1}[\. ]?[a-z]{1}[\. ]?[a-z]{1}|[a-z]{2}\d{2}[a-z]{2})',
            text)
        if ms:
            sidx = 0
            for m in ms:
                sidx = text.find(m, sidx)
                eidx = sidx + len(m)
                text = text[:sidx] + re.sub('[,\. ]', '', m) + text[eidx:]

        # weird unicode bug
        text = re.sub(u"(\u2018|\u2019)", "'", text)

        text = ' ' + text + ' '
        # # replace time and and price
        timepat = re.compile(
            " \d{1,2}[:]\d{1,2}[ \.,\?]| \d{4}[ \.,\?]| \d{1,2}[ap][m\. ]+| \d{1,2} [ap][m\. ]+| \d{1,2}[:]\d{1,2}[ap]m[ \.,\?]"
        )
        # # some utterances just miss the ":"
        # timepat_noise = re.compile(" \d{4}[ \.,\?]")
        pricepat = re.compile("\d{1,3}[\.]\d{1,2}")

        value_time = re.findall(timepat, text)
        # pdb.set_trace()
        while value_time:
            index = text.find(value_time[0])
            text = text[:index] + \
                   ' [value_time|' + value_time[0][1:-1] + ']' + \
                   text[index + len(value_time[0]) - 1:]
            value_time = re.findall(timepat, text)

        value_price = re.findall(pricepat, text)

        if value_price:
            text = re.sub(pricepat, ' [value_price|' + value_price[0] + '] ',
                          text)

        text = text[1:-1]

        # replace st.
        text = text.replace(';', ',')
        text = re.sub('$\/', '', text)
        text = text.replace('/', ' and ')

        # replace other special characters
        text = text.replace('-', ' ')
        text = re.sub('[\"\<>@\(\)]', '', text)

        # insert white space before and after tokens:
        for token in ['?', '.', ',', '!']:
            text = insertSpace(token, text)

        # insert white space for 's
        text = insertSpace('\'s', text)

        # replace it's, does't, you'd ... etc
        text = re.sub('^\'', '', text)
        text = re.sub('\'$', '', text)
        text = re.sub('\'\s', ' ', text)
        text = re.sub('\s\'', ' ', text)

        fin = open(
            '/data/qkun/simulator/data/multiwoz-master/utils/mapping.pair')
        replacements = []
        for line in fin.readlines():
            tok_from, tok_to = line.replace('\n', '').split('\t')
            replacements.append((' ' + tok_from + ' ', ' ' + tok_to + ' '))

        for fromx, tox in replacements:
            text = ' ' + text + ' '
            text = text.replace(fromx, tox)[1:-1]

        # remove multiple spaces
        text = re.sub(' +', ' ', text)

        # concatenate numbers
        tmp = text
        tokens = text.split()
        i = 1
        while i < len(tokens):
            if re.match(u'^\d+$', tokens[i]) and \
                    re.match(u'\d+$', tokens[i - 1]):
                tokens[i - 1] += tokens[i]
                del tokens[i]
            else:
                i += 1
        text = ' '.join(tokens)

        return text

    def check_balance(string):
        # open_tup = tuple('[')
        # close_tup = tuple(']')
        # map = dict(zip(open_tup, close_tup))
        queue = 0

        for i in string:
            if i == '[':
                queue += 1
            elif i == ']':
                if not queue:
                    return False
                else:
                    queue -= 1
        if not queue:
            return True
        else:
            return False

    def delexicalise(utt, dictionary):
        for key, val in dictionary:
            utt = (' ' + utt + ' ')
            if key in utt:
                idx = 0
                while utt[idx:].find(' ' + key + ' ') != -1:
                    idx += utt[idx:].find(' ' + key + ' ')
                    # # to exclude the case that 'ask' is a verb
                    if key == 'ask' and idx > 2 and utt[idx - 2:idx] == ' i':
                        idx += 1
                        continue
                    if check_balance(utt[:idx]):
                        utt = utt[:
                                  idx] + ' ' + val[:
                                                   -1] + '|' + key + '] ' + utt[
                                                       idx + len(key) + 2:]
                        idx += len(key) + 4 + len(val[:-1])
                    else:
                        idx += len(key)
            utt = utt[1:-1]

        return utt

    def delex_people_count(sent):
        sent = ' ' + sent + ' '
        digitpat = re.compile(
            '(?<!looking for)(?<=for) \d+ (?!of)|(?<=party of) \d+ | \d+ (?=people|person|of us)'
        )
        value_count = re.findall(digitpat, sent)
        while value_count:
            index = sent.find(value_count[0])
            if not check_balance(sent[:index]):
                value_count.pop(0)
                continue

            sent = sent[:index] + \
                   ' [value_count|' + value_count[0][1:-1] + '] ' + \
                   sent[index + len(value_count[0]):]
            value_count = re.findall(digitpat, sent)
        sent = sent[1:-1]
        return sent

    # # # extract slots
    # # replace time, date, specific price
    sent = ' '.join(word_tokenize(sent))

    sent = normalize(sent)
    # pdb.set_trace()
    # # replace info in db
    db_entity_file = open(
        '/data/qkun/simulator/data/multiwoz-master/db_entity_file.pkl', 'rb')
    db_entity_list = pkl.load(db_entity_file)
    db_entity_file.close()
    sent = delexicalise(sent, db_entity_list)

    # # replace # of people for reservation
    sent = delex_people_count(sent)

    # # # replace and generate dic
    slotpat = re.compile('\[.*?\]')
    slots = re.findall(slotpat, sent)
    slot_dic = {}

    for slot in slots:
        if '|' in slot:
            [slot_name, slot_val] = slot[1:-1].split('|')
            slot_name = '[' + slot_name + ']'
            slot_dic[slot_name] = slot_val
            sent = sent.replace(slot, slot_name)

    sent = sent.replace('[', "").replace(']', "")
    return sent, slot_dic

Exemplo n.º 4

0

Exibir arquivo

Arquivo: delexicalize.py Projeto: qbetterk/user-simulator

def createDelexData():
    """Main function of the script - loads delexical dictionary,
    goes through each dialogue and does:
    1) data normalization
    2) delexicalization
    3) addition of database pointer
    4) saves the delexicalized data
    """
    # download the data
    loadData()

    # create dictionary of delexicalied values that then we will search against, order matters here!
    dic = delexicalize.prepareSlotValuesIndependent()

    fin1 = file('data/multi-woz/data.json')
    data = json.load(fin1)

    fin2 = file('data/multi-woz/dialogue_acts.json')
    data2 = json.load(fin2)

    for dialogue_name in tqdm(data):
        dialogue = data[dialogue_name]
        # print dialogue_name

        idx_acts = 1

        for idx, turn in enumerate(dialogue['log']):
            # normalization, split and delexicalization of the sentence
            sent = normalize(turn['text'])

            words = sent.split()
            sent = delexicalize.delexicalise(' '.join(words), dic)

            # parsing reference number GIVEN belief state
            sent = delexicaliseReferenceNumber(sent, turn)

            # changes to numbers only here
            digitpat = re.compile('\d+')
            sent = re.sub(digitpat, '[value_count]', sent)

            # delexicalized sentence added to the dialogue
            dialogue['log'][idx]['text'] = sent

            if idx % 2 == 1:  # if it's a system turn
                # add database pointer
                pointer_vector = addDBPointer(turn)
                # add booking pointer
                pointer_vector = addBookingPointer(dialogue, turn,
                                                   pointer_vector)

                # print pointer_vector
                dialogue['log'][idx -
                                1]['db_pointer'] = pointer_vector.tolist()

            # FIXING delexicalization:
            dialogue = fixDelex(dialogue_name, dialogue, data2, idx, idx_acts)
            idx_acts += 1

        delex_data[dialogue_name] = dialogue

    with open('data/multi-woz/delex.json', 'w') as outfile:
        json.dump(delex_data, outfile)

    return delex_data

Exemplo n.º 5

0

Exibir arquivo

    fin1 = file('data/multi-woz/data.json')
    data = json.load(fin1)

    fin2 = file('data/multi-woz/dialogue_acts.json')
    data2 = json.load(fin2)

    for dialogue_name in tqdm(data):
        dialogue = data[dialogue_name]
        #print dialogue_name

        idx_acts = 1

        for idx, turn in enumerate(dialogue['log']):
            # normalization, split and delexicalization of the sentence
            sent = normalize(turn['text'])

            words = sent.split()
            sent = delexicalize.delexicalise(' '.join(words), dic)

            # parsing reference number GIVEN belief state
            sent = delexicaliseReferenceNumber(sent, turn)

            # changes to numbers only here
            digitpat = re.compile('\d+')
            sent = re.sub(digitpat, '[value_count]', sent)

            # delexicalized sentence added to the dialogue
            dialogue['log'][idx]['text'] = sent

            if idx % 2 == 1:  # if it's a system turn

Exemplo n.º 6

0

Exibir arquivo

def queryResultVenues(domain, turn, real_belief=False):
    # query the db
    sql_query = "select * from {}".format(domain)

    if real_belief == True:
        items = turn.items()
    elif real_belief == 'tracking':
        for slot in turn[domain]:
            key = slot[0].split("-")[1]
            val = slot[0].split("-")[2]
            if key == "price range":
                key = "pricerange"
            elif key == "leave at":
                key = "leaveAt"
            elif key == "arrive by":
                key = "arriveBy"
            if val == "do n't care":
                pass
            else:
                if flag:
                    sql_query += " where "
                    val2 = val.replace("'", "''")
                    val2 = normalize(val2)
                    if key == 'leaveAt':
                        sql_query += key + " > " + r"'" + val2 + r"'"
                    elif key == 'arriveBy':
                        sql_query += key + " < " + r"'" + val2 + r"'"
                    else:
                        sql_query += r" " + key + "=" + r"'" + val2 + r"'"
                    flag = False
                else:
                    val2 = val.replace("'", "''")
                    val2 = normalize(val2)
                    if key == 'leaveAt':
                        sql_query += r" and " + key + " > " + r"'" + val2 + r"'"
                    elif key == 'arriveBy':
                        sql_query += r" and " + key + " < " + r"'" + val2 + r"'"
                    else:
                        sql_query += r" and " + key + "=" + r"'" + val2 + r"'"

            try:  # "select * from attraction  where name = 'queens college'"
                return dbs[domain].execute(sql_query).fetchall()
            except:
                return []  # TODO test it
        pass
    else:
        items = turn['metadata'][domain]['semi'].items()

    flag = True
    for key, val in items:
        if val == "" or val == "dontcare" or val == 'not mentioned' or val == "don't care" or val == "dont care" or val == "do n't care":
            pass
        else:
            if flag:
                sql_query += " where "
                val2 = val.replace("'", "''")
                val2 = normalize(val2)
                if key == 'leaveAt':
                    sql_query += r" " + key + " > " + r"'" + val2 + r"'"
                elif key == 'arriveBy':
                    sql_query += r" " + key + " < " + r"'" + val2 + r"'"
                else:
                    sql_query += r" " + key + "=" + r"'" + val2 + r"'"
                flag = False
            else:
                val2 = val.replace("'", "''")
                val2 = normalize(val2)
                if key == 'leaveAt':
                    sql_query += r" and " + key + " > " + r"'" + val2 + r"'"
                elif key == 'arriveBy':
                    sql_query += r" and " + key + " < " + r"'" + val2 + r"'"
                else:
                    sql_query += r" and " + key + "=" + r"'" + val2 + r"'"

    try:  # "select * from attraction  where name = 'queens college'"
        return dbs[domain].execute(sql_query).fetchall()
    except:
        return []  # TODO test it

Exemplo n.º 7

0

Exibir arquivo

def prepareSlotValuesIndependent():
    domains = [
        'restaurant', 'hotel', 'attraction', 'train', 'taxi', 'hospital',
        'police'
    ]
    requestables = ['phone', 'address', 'postcode', 'reference', 'id']
    dic = []
    dic_area = []
    dic_food = []
    dic_price = []

    # read databases
    for domain in domains:
        try:
            fin = file('db/' + domain + '_db.json')
            db_json = json.load(fin)
            fin.close()

            for ent in db_json:
                for key, val in ent.items():
                    if val == '?' or val == 'free':
                        pass
                    elif key == 'address':
                        dic.append((normalize(val),
                                    '[' + domain + '_' + 'address' + ']'))
                        if "road" in val:
                            val = val.replace("road", "rd")
                            dic.append((normalize(val),
                                        '[' + domain + '_' + 'address' + ']'))
                        elif "rd" in val:
                            val = val.replace("rd", "road")
                            dic.append((normalize(val),
                                        '[' + domain + '_' + 'address' + ']'))
                        elif "st" in val:
                            val = val.replace("st", "street")
                            dic.append((normalize(val),
                                        '[' + domain + '_' + 'address' + ']'))
                        elif "street" in val:
                            val = val.replace("street", "st")
                            dic.append((normalize(val),
                                        '[' + domain + '_' + 'address' + ']'))
                    elif key == 'name':
                        dic.append((normalize(val),
                                    '[' + domain + '_' + 'name' + ']'))
                        if "b & b" in val:
                            val = val.replace("b & b", "bed and breakfast")
                            dic.append((normalize(val),
                                        '[' + domain + '_' + 'name' + ']'))
                        elif "bed and breakfast" in val:
                            val = val.replace("bed and breakfast", "b & b")
                            dic.append((normalize(val),
                                        '[' + domain + '_' + 'name' + ']'))
                        elif "hotel" in val and 'gonville' not in val:
                            val = val.replace("hotel", "")
                            dic.append((normalize(val),
                                        '[' + domain + '_' + 'name' + ']'))
                        elif "restaurant" in val:
                            val = val.replace("restaurant", "")
                            dic.append((normalize(val),
                                        '[' + domain + '_' + 'name' + ']'))
                    elif key == 'postcode':
                        dic.append((normalize(val),
                                    '[' + domain + '_' + 'postcode' + ']'))
                    elif key == 'phone':
                        dic.append((val, '[' + domain + '_' + 'phone' + ']'))
                    elif key == 'trainID':
                        dic.append(
                            (normalize(val), '[' + domain + '_' + 'id' + ']'))
                    elif key == 'department':
                        dic.append((normalize(val),
                                    '[' + domain + '_' + 'department' + ']'))

                    # NORMAL DELEX
                    elif key == 'area':
                        dic_area.append((normalize(val),
                                         '[' + 'value' + '_' + 'area' + ']'))
                    elif key == 'food':
                        dic_food.append((normalize(val),
                                         '[' + 'value' + '_' + 'food' + ']'))
                    elif key == 'pricerange':
                        dic_price.append(
                            (normalize(val),
                             '[' + 'value' + '_' + 'pricerange' + ']'))
                    else:
                        pass
                    # TODO car type?
        except:
            pass

        if domain == 'hospital':
            dic.append(
                (normalize('Hills Rd'), '[' + domain + '_' + 'address' + ']'))
            dic.append((normalize('Hills Road'),
                        '[' + domain + '_' + 'address' + ']'))
            dic.append(
                (normalize('CB20QQ'), '[' + domain + '_' + 'postcode' + ']'))
            dic.append(('01223245151', '[' + domain + '_' + 'phone' + ']'))
            dic.append(('1223245151', '[' + domain + '_' + 'phone' + ']'))
            dic.append(('0122324515', '[' + domain + '_' + 'phone' + ']'))
            dic.append((normalize('Addenbrookes Hospital'),
                        '[' + domain + '_' + 'name' + ']'))

        elif domain == 'police':
            dic.append(
                (normalize('Parkside'), '[' + domain + '_' + 'address' + ']'))
            dic.append(
                (normalize('CB11JG'), '[' + domain + '_' + 'postcode' + ']'))
            dic.append(('01223358966', '[' + domain + '_' + 'phone' + ']'))
            dic.append(('1223358966', '[' + domain + '_' + 'phone' + ']'))
            dic.append((normalize('Parkside Police Station'),
                        '[' + domain + '_' + 'name' + ']'))

    # add at the end places from trains
    fin = file('db/' + 'train' + '_db.json')
    db_json = json.load(fin)
    fin.close()

    for ent in db_json:
        for key, val in ent.items():
            if key == 'departure' or key == 'destination':
                dic.append(
                    (normalize(val), '[' + 'value' + '_' + 'place' + ']'))

    # add specific values:
    for key in [
            'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday',
            'sunday'
    ]:
        dic.append((normalize(key), '[' + 'value' + '_' + 'day' + ']'))

    # more general values add at the end
    dic.extend(dic_area)
    dic.extend(dic_food)
    dic.extend(dic_price)

    return dic