示例#1
0
 def _delex_texts(self):
     """Build delexicalized copies of all buffered sentences.

     Every (sentence, DA) pair from `self._sents` and `self._das` is processed token by
     token: if `da.has_value(lemma)` yields a slot that is listed in `self._abst_slots`,
     the token is replaced by an 'X-<slot>' placeholder (form and lemma alike, the tag is
     kept). The resulting sentences are stored in `self._delexed_texts`, the corresponding
     delexicalization instructions (`Abst` objects) in `self._absts`."""
     self._delexed_texts = []
     self._absts = []
     for sent_no, (sent, da) in enumerate(zip(self._sents, self._das)):
         delexed = []
         instrs = []
         # replace the tokens that carry a value of a delexicalized slot
         for pos, (form, lemma, tag) in enumerate(sent):
             slot = da.has_value(lemma)
             if not slot or slot not in self._abst_slots:
                 # nothing to delexicalize here, copy the token over unchanged
                 delexed.append((form, lemma, tag))
                 continue
             placeholder = 'X-' + slot
             delexed.append((placeholder, placeholder, tag))
             instrs.append(Abst(slot, lemma, form, pos, pos + 1))
         # coordinated values are handled by a separate helper (works on both lists)
         self._delex_fix_coords(delexed, da, instrs)
         seen_slots = {instr.slot for instr in instrs}
         # log every delexicalized slot with a real value that has no instruction
         for dai in da:
             if dai.slot not in self._abst_slots:
                 continue
             if dai.value in [None, 'none', 'dont_care']:
                 continue
             if dai.slot in seen_slots:
                 continue
             da_str = unicode(da)
             words = " ".join([form for form, _, _ in sent])
             log_info("Cannot delexicalize slot  %s  at %d:\nDA: %s\nTx: %s\n"
                      % (dai.slot, sent_no, da_str, words))
         # store the results for this sentence
         self._delexed_texts.append(delexed)
         self._absts.append(instrs)
示例#2
0
    def _create_delex_texts(self):
        """Build delexicalized copies of all buffered sentences.

        Every (sentence, DA) pair from `self._texts` and `self._das` is processed token by
        token. A token is first looked up in `self._rev_sf_dict` (with standalone numbers
        replaced by '_'); if that fails, `da.has_value(lemma)` is used instead. Tokens that
        resolve to a slot from `self._abst_slots` whose DA value is a real value containing
        the found value are replaced by an 'X-<slot>' placeholder. Results are stored in
        `self._delex_texts`, the delexicalization instructions (`Abst` objects) in
        `self._absts`."""
        self._delex_texts = []
        self._absts = []
        for sent_no, (sent, da) in enumerate(zip(self._texts, self._das)):
            delexed = []
            instrs = []
            for pos, (form, lemma, tag) in enumerate(sent):
                # lookup key: lowercased form + lemma with standalone numbers masked by '_'
                key = (re.sub(r'( |^)[0-9]+( |$)', r'\1_\2', form.lower()),
                       re.sub(r'( |^)[0-9]+( |$)', r'\1_\2', lemma),
                       tag)
                slot, value = self._rev_sf_dict.get(key, (None, None))
                if not slot:
                    # not a known surface form: compare the lemma with the DA values directly
                    slot = da.has_value(lemma)
                    value = lemma
                else:
                    # known surface form: put the actual numbers back, one '_' at a time
                    for number in re.findall(r'(?: |^)([0-9]+)(?: |$)', lemma):
                        value = re.sub(r'_', number, value, count=1)

                # delexicalize only if the slot is wanted and the value agrees with the DA
                matches_da = False
                if slot and slot in self._abst_slots:
                    if da.value_for_slot(slot) not in [None, 'none', 'dont_care']:
                        matches_da = value in da.value_for_slot(slot)

                if matches_da:
                    placeholder = 'X-' + slot
                    delexed.append((placeholder, placeholder, tag))
                    instrs.append(Abst(slot, value, form, pos, pos + 1))
                else:
                    # leave the token untouched
                    delexed.append((form, lemma, tag))
            # coordinated values are handled by a separate helper (works on both lists)
            self._delex_fix_coords(delexed, da, instrs)
            seen_slots = {instr.slot for instr in instrs}
            # log every delexicalized slot with a real value that has no instruction
            for dai in da:
                if dai.slot not in self._abst_slots:
                    continue
                if dai.value in [None, 'none', 'dont_care']:
                    continue
                if dai.slot in seen_slots:
                    continue
                da_str = str(da)
                words = " ".join([form for form, _, _ in sent])
                log_info("Cannot delexicalize slot  %s  at %d:\nDA: %s\nTx: %s\n"
                         % (dai.slot, sent_no, da_str, words))
            # store the results for this sentence
            self._delex_texts.append(delexed)
            self._absts.append(instrs)
示例#3
0
def get_abstraction(text, conc_da, slot_names=False):
    """Replace '*SLOT' markers in the text by 'X' placeholders and describe the replacements.

    The text is passed through `tokenize` and split on single spaces. For every item of
    `conc_da` that has a slot, the first remaining token equal to '*' + the upper-cased slot
    name is replaced by 'X' (or 'X-slot_name' if `slot_names` is true) and an `Abst`
    instruction holding the item's slot, value and the token position is recorded.
    Items without a slot and items whose marker does not occur in the text are skipped.

    @param text: the text containing the '*SLOT' markers
    @param conc_da: the concrete DA (iterated for items with `slot` and `value` attributes)
    @param slot_names: use 'X-slot_name' placeholders instead of plain 'X'?
    @return: a tuple of the abstracted text (tokens joined by spaces) and the abstraction \
        instructions (converted to strings and joined by tabs)
    """
    abstr = []
    toks = tokenize(text).split(' ')

    for dai in conc_da:
        # a slot-less item has no marker to look for (and `None.upper()` would crash)
        if dai.slot is None:
            continue
        try:
            # only the lookup is expected to fail, so it is alone in the `try` body
            idx = toks.index('*' + dai.slot.upper())
        except ValueError:
            continue  # marker not present in the text
        toks[idx] = ('X-' + dai.slot) if slot_names else 'X'
        abstr.append(
            Abst(slot=dai.slot, value=dai.value, start=idx, end=idx + 1))

    return ' '.join(toks), "\t".join([unicode(a) for a in abstr])
示例#4
0
def delex_sent(da, conc, abst_slots, use_slot_names=True, delex_slot_names=False):
    """Delexicalize the given slots in a sentence, i.e. replace their values by placeholders.

    Note that if `conc` is a list of tokens, the list itself is modified and returned.

    @param da: concrete DA
    @param conc: concrete sentence text (string -- split only on single spaces, or list of tokens)
    @param abst_slots: a set of slots to be abstracted
    @param use_slot_names: boolean -- use slot names in the placeholders (X-slot), or just X \
        (X-value if `delex_slot_names` is set)?
    @param delex_slot_names: boolean -- also search for the slot names themselves (with \
        underscores turned into spaces) and replace them by 'X-slot'?
    @return: a tuple of the abstracted text (in the same format as conc), abstracted DA, \
        and abstraction instructions
    """
    def value_len(item):
        # sort key: length of the value, items without a value count as empty
        return len(item.value) if item.value is not None else 0

    is_string = isinstance(conc, basestring)
    toks = conc.split(' ') if is_string else conc
    absts = []
    abst_da = DA()
    toks_mask = [True] * len(toks)

    # Pass 1: locate the values (longest first, so that their substrings cannot block
    # them) and build the abstracted DA at the same time.
    for dai in sorted(da, key=value_len, reverse=True):
        # the abstracted DA starts out as a plain copy of each item
        abst_da.append(DAI(dai.da_type, dai.slot, dai.value))
        if dai.value is None:
            continue
        pos = find_value(dai.value, toks, toks_mask)
        if dai.slot not in abst_slots or dai.value == 'dont_care':
            continue
        # abstracted slot: put the placeholder into the DA copy and record the instruction
        # (this is done regardless of the position that the search returned)
        start, end = pos[0], pos[1]
        abst_da[-1].value = 'X-' + dai.slot
        absts.append(Abst(dai.slot, dai.value, surface_form=' '.join(toks[start:end]),
                          start=start, end=end))

    # Pass 2 (optional): locate the slot names, again longest first.
    if delex_slot_names:
        with_slot = [dai for dai in da if dai.slot is not None]
        for dai in sorted(with_slot, key=lambda item: len(item.slot), reverse=True):
            pos = find_value(dai.slot.replace('_', ' '), toks, toks_mask)
            if dai.slot not in abst_slots:
                continue
            start, end = pos[0], pos[1]
            absts.append(Abst(dai.slot, None, surface_form=' '.join(toks[start:end]),
                              start=start, end=end))

    # Pass 3: replace the located spans left to right; `shift` counts the tokens removed
    # so far, so that the original positions can be mapped onto the shrinking list.
    absts.sort(key=lambda instr: instr.start)
    shift = 0
    for abst in absts:
        # leave out everything that is not to be replaced or has a negative position
        if abst.slot not in abst_slots or abst.value == 'dont_care' or abst.start < 0:
            continue
        # pick the placeholder (X-slot for slot names; X-<slot> / X-value / X for values)
        if delex_slot_names and abst.value is None:
            placeholder = 'X-slot'
        elif use_slot_names:
            placeholder = 'X-' + abst.slot
        elif delex_slot_names:
            placeholder = 'X-value'
        else:
            placeholder = 'X'
        width = abst.end - abst.start
        new_start = abst.start - shift
        toks[new_start:new_start + width] = [placeholder]
        # the instruction now points to the single placeholder token
        abst.start = new_start
        abst.end = new_start + 1
        shift += width - 1

    return ' '.join(toks) if is_string else toks, abst_da, absts
示例#5
0
def delex_sent(da,
               sent,
               delex_slots,
               use_slot_names=True,
               delex_slot_names=False,
               repeated=False):
    """Delexicalize ("abstract") the given slots in the given sentence (replace them with X
    or X-slot_name).

    Note that if `sent` is a list of tokens, the list itself is modified and returned.

    @param da: concrete DA
    @param sent: lexicalized sentence text (string -- split only on single spaces, or list \
        of tokens)
    @param delex_slots: a set/frozenset of slots to be delexicalized, or a dict (with a set \
        of values to leave untouched for each slot)
    @param use_slot_names: boolean -- use slot names in the placeholders (X-slot), or just X \
        (X-value if `delex_slot_names` is set)?
    @param delex_slot_names: boolean -- also search for the slot names themselves (with \
        underscores turned into spaces) and replace them by 'X-slot'?
    @param repeated: boolean -- keep searching for further occurrences of each value until \
        the search returns (-1, -1), instead of stopping after the first search?
    @return: a tuple of the abstracted text (in the same format as sent), delexicalized DA, \
        and abstraction instructions
    """
    return_string = False
    if isinstance(sent, basestring):
        toks = sent.split(' ')
        return_string = True
    else:
        toks = sent
    # a plain set means "no exempt values"; frozensets are accepted as well, since they
    # would otherwise fail on the `delex_slots[slot]` lookups below
    if isinstance(delex_slots, (set, frozenset)):
        delex_slots = {slot: set() for slot in delex_slots}
    absts = []
    abst_da = DA()
    toks_mask = [True] * len(toks)

    # find all values in the sentence, building the delexicalized DA along the way
    # search first for longer values (so that substrings don't block them)
    for dai in sorted(da,
                      key=lambda dai: len(dai.value)
                      if dai.value is not None else 0,
                      reverse=True):
        # first, create the delexicalized (abstracted) DAI as the copy of the current DAI
        abst_da.append(DAI(dai.da_type, dai.slot, dai.value))
        if dai.value is None:
            continue

        # is this value to be delexicalized at all? (does not change during the search)
        to_delex = (dai.slot in delex_slots
                    and dai.value not in delex_slots[dai.slot]
                    and dai.value != 'dont_care')

        # search for the 1st or all occurrences
        found = 0
        pos = (-1, -1)
        while found < 1 or (repeated and pos != (-1, -1)):
            pos = find_value(dai.value, toks, toks_mask)
            # replace the value in the delexicalized DAI and save the abstraction
            # instruction; the result of the 1st search is saved even if it is (-1, -1),
            # further searches are saved only if they returned a real position
            if to_delex and (found == 0 or pos != (-1, -1)):
                abst_da[-1].value = 'X-' + dai.slot
                absts.append(
                    Abst(dai.slot,
                         dai.value,
                         surface_form=' '.join(toks[pos[0]:pos[1]]),
                         start=pos[0],
                         end=pos[1]))
            found += 1

    # optionally look for the slot names as well (longer names first)
    if delex_slot_names:
        for dai in sorted([dai for dai in da if dai.slot is not None],
                          key=lambda dai: len(dai.slot),
                          reverse=True):
            pos = find_value(dai.slot.replace('_', ' '), toks, toks_mask)
            if dai.slot in delex_slots:
                # slot-name instructions are marked by a `None` value
                absts.append(
                    Abst(dai.slot,
                         None,
                         surface_form=' '.join(toks[pos[0]:pos[1]]),
                         start=pos[0],
                         end=pos[1]))

    # go from the beginning of the sentence, replacing the values to be delexicalized
    absts.sort(key=lambda a: a.start)
    shift = 0  # number of tokens removed so far by multi-token replacements
    for abst in absts:
        # select only those that should actually be delexicalized on the output
        # (instructions with a negative start position are left out)
        if (abst.slot not in delex_slots
                or abst.value in delex_slots[abst.slot]
                or abst.value == 'dont_care' or abst.start < 0):
            continue
        # replace the text with the placeholder (X-slot/X-value, X-slot-name, X)
        if delex_slot_names and abst.value is None:
            toks[abst.start - shift:abst.end - shift] = ['X-slot']
        elif use_slot_names:
            toks[abst.start - shift:abst.end - shift] = ['X-' + abst.slot]
        else:
            toks[abst.start - shift:abst.end -
                 shift] = ['X' if not delex_slot_names else 'X-value']
        # update abstraction instruction indexes: they now point to the single
        # placeholder token in the shortened sentence
        shift_add = abst.end - abst.start - 1
        abst.start -= shift
        abst.end = abst.start + 1
        shift += shift_add

    return ' '.join(toks) if return_string else toks, abst_da, absts
示例#6
0
def abstract_sent(da, conc, abst_slots, slot_names):
    """Abstract the given slots in the given sentence (replace them with X).

    @param da: concrete DA
    @param conc: concrete sentence text (string, split on single spaces)
    @param abst_slots: a set of slots to be abstracted
    @param slot_names: boolean -- use slot names in the placeholders (X-slot), or just X?
    @return: a tuple of the abstracted text, abstracted DA, and abstraction instructions
    """
    toks = conc.split(' ')
    absts = []
    abst_da = DA()
    toks_mask = [True] * len(toks)

    # find all values in the sentence, building the abstracted DA along the way
    # search first for longer values (so that substrings don't block them)
    for dai in sorted(da,
                      key=lambda dai: len(dai.value)
                      if dai.value is not None else 0,
                      reverse=True):
        # first, create the 'abstracted' DAI as the copy of the current DAI
        abst_da.append(DAI(dai.da_type, dai.slot, dai.value))
        if dai.value is None:
            continue
        # try to find the value in the sentence (`find_substr` first, then
        # `find_substr_approx` as a fallback), with the tokens of previously
        # found values blanked out
        val_toks = dai.value.split(' ')
        masked_toks = [t if m else '' for t, m in zip(toks, toks_mask)]
        pos = find_substr(val_toks, masked_toks)
        if pos is None:
            pos = find_substr_approx(val_toks, masked_toks)
        if pos is not None:
            # mask found things so they're not found twice
            # (`range` instead of `xrange`, so that this also runs under Python 3)
            for idx in range(pos[0], pos[1]):
                toks_mask[idx] = False
        if pos is None or pos == (0, 0):  # default to -1 for unknown positions
            pos = -1, -1
        # if the value is to be abstracted, replace the value in the abstracted DAI
        # and save abstraction instruction (even if not found in the sentence)
        if dai.slot in abst_slots and dai.value != 'dont_care':
            abst_da[-1].value = 'X-' + dai.slot
            # save the abstraction instruction
            absts.append(Abst(dai.slot, dai.value, start=pos[0], end=pos[1]))

    # go from the beginning of the sentence, replacing the values to be abstracted
    absts.sort(key=lambda a: a.start)
    shift = 0  # number of tokens removed so far by multi-token replacements
    for abst in absts:
        # select only those that should actually be abstracted on the output
        # (instructions with a negative start position are left out)
        if abst.slot not in abst_slots or abst.value == 'dont_care' or abst.start < 0:
            continue
        # replace the text
        if slot_names:
            toks[abst.start - shift:abst.end - shift] = ['X-' + abst.slot]
        else:
            toks[abst.start - shift:abst.end - shift] = ['X']
        # update abstraction instruction indexes: they now point to the single
        # placeholder token in the shortened sentence
        shift_add = abst.end - abst.start - 1
        abst.start -= shift
        abst.end = abst.start + 1
        shift += shift_add

    return ' '.join(toks), abst_da, absts