def process_instance(da, conc, mr, multi_ref_id):
    original_da = deepcopy(da)
    # why do the das need to be sorted? This seems weird
    # Anyway, we checked it gets sorted in delex_sent anyway so nothing to
    # do about it until later
    da.sort()
    conc_das.append(da)

    text, da, abst = delex_sent(da, tokenize(conc), slots_to_abstract,
                                args.slot_names, repeated=True)
    # Originally we didn't want to lower case because it will make things
    # easier for udpipe later on, however ...
    # we changed our mind on this because the upper case characters are
    # messing with udpipe's ability to properly sentence tokenize.
    # we need underscores instead of dashes or else udpipe breaks it apart
    # text = re.sub(r"X-", r"X_", text)
    # Again running into problems with leaving x as a capital letter
    # and also with udpipe randomly segmenting it but sometimes not. We
    # really need to find a more reliable sentence tokenizer / word
    # tokenizer
    text = text.lower().replace('x-', 'x')
    # We're testing out making xnear upper case to see if it reduces the
    # incorrect dropping of it by the deep parser
    text = text.replace('xnear', 'Xnear')
    # detokenize some of the apostrophe stuff because udpipe does it
    # differently. Namely removing spaces between letters and apostrophes
    text = re.sub(find_apostrophes, r"\1\2", text)
    da.sort()
    da_keys[unicode(da)] = da_keys.get(unicode(da), 0) + 1
    das.append(da)
    concs.append(conc)
    absts.append(abst)
    texts.append(text)

    # now for our own bastardized sentence tokenization and human eval
    # required stuff
    this_conc_sents = sent_tokenize(conc)
    num_sents = len(this_conc_sents)
    this_delex_sents = []
    for i, this_conc_sent in enumerate(this_conc_sents):
        text, _, _ = delex_sent(original_da, tokenize(this_conc_sent),
                                slots_to_abstract, args.slot_names, repeated=True)
        text = text.lower().replace('x-', 'x')
        # We're testing out making xnear upper case to see if it reduces the
        # incorrect dropping of it by the deep parser
        text = text.replace('xnear', 'Xnear')
        # detokenize some of the apostrophe stuff because udpipe does it
        # differently. Namely removing spaces between letters and apostrophes
        text = re.sub(find_apostrophes, r"\1\2", text)
        this_delex_sents.append(text)
        # start appending the sentence specific ones
        sent_ids.append('_'.join([mr.replace(' ', ''), str(multi_ref_id), str(i)]))
        mrs_for_delex.append(mr)
    # now we're onto something else
    original_sents.append('\n'.join(this_conc_sents))
    delexicalised_sents.append('\n'.join(this_delex_sents))
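# `find_apostrophes` is referenced above but defined elsewhere in the module;
# a minimal sketch of a compatible pattern (the exact clitic list is an
# assumption), shown with the same r"\1\2" substitution used above:
import re

find_apostrophes = re.compile(r"(\w)\s+('(?:s|m|d|ll|re|ve)\b|n't\b)")

print(re.sub(find_apostrophes, r"\1\2", "it 's close and we are n't far"))
# -> "it's close and we aren't far"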
def process_instance(da, conc):
    da.sort()
    conc_das.append(da)
    text, da, abst = delex_sent(da, tokenize(conc), slots_to_abstract,
                                args.slot_names, repeated=True)
    text = text.lower().replace('x-', 'X-')  # lowercase all but placeholders
    da.sort()
    da_keys[unicode(da)] = da_keys.get(unicode(da), 0) + 1
    das.append(da)
    concs.append(conc)
    absts.append(abst)
    texts.append(text)
def process_instance(conc_da, conc):
    # sort the DA using the same order as in E2E NLG data
    conc_da.dais.sort(key=lambda dai: (['name', 'eat_type', 'food', 'price_range',
                                        'rating', 'area', 'family_friendly',
                                        'near'].index(dai.slot),
                                       dai.value))
    conc_das.append(conc_da)
    text, da, abst = delex_sent(conc_da, tokenize(conc), slots_to_abstract,
                                args.slot_names, repeated=True)
    text = text.lower().replace('x-', 'X-')  # lowercase all but placeholders
    da.dais.sort(key=lambda dai: (['name', 'eat_type', 'food', 'price_range',
                                   'rating', 'area', 'family_friendly',
                                   'near'].index(dai.slot),
                                  dai.value))
    da_keys[str(da)] = da_keys.get(str(da), 0) + 1
    das.append(da)
    concs.append(conc)
    absts.append(abst)
    texts.append(text)
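# The slot-order sort key above is written out twice; a sketch of factoring it
# into a module-level helper (the names E2E_SLOT_ORDER and e2e_sort_key are
# hypothetical; the slot order is copied from the function):
E2E_SLOT_ORDER = ['name', 'eat_type', 'food', 'price_range',
                  'rating', 'area', 'family_friendly', 'near']

def e2e_sort_key(dai):
    """Order DAIs by slot (E2E NLG data order), then by value."""
    return (E2E_SLOT_ORDER.index(dai.slot), dai.value)

# usage inside process_instance: conc_da.dais.sort(key=e2e_sort_key)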
def process_instance(da, conc):
    da.sort()
    conc_das.append(da)  # store the non-delexicalized version of the DA
    # delexicalize
    text, da, abst = delex_sent(da, conc, slots_to_abstract, args.slot_names)
    da.sort()  # delexicalization does not keep DAI order, need to sort again
    # store the DA
    text = fix_capitalization(text)
    conc = fix_capitalization(conc)
    da_keys[str(da)] = da_keys.get(str(da), 0) + 1
    das.append(da)
    concs.append(conc)
    absts.append(abst)
    texts.append(text)
def process_instance(da, conc):
    da.sort()
    conc_das.append(da)  # store the non-delexicalized version of the DA
    # delexicalize
    text, da, abst = delex_sent(da, conc, slots_to_abstract, args.slot_names)
    da.sort()  # delexicalization does not keep DAI order, need to sort again
    # store the DA
    text = fix_capitalization(text)
    conc = fix_capitalization(conc)
    da_keys[unicode(da)] = da_keys.get(unicode(da), 0) + 1
    das.append(da)
    concs.append(conc)
    absts.append(abst)
    texts.append(text)
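# `fix_capitalization` is not defined in the two variants above; a rough
# placeholder, assuming it lowercases text while keeping the "X-"
# delexicalization placeholders intact, mirroring the
# `.lower().replace('x-', 'X-')` step in the other versions of
# process_instance (this is a guess, not the repo's actual helper):
def fix_capitalization(text):
    return text.lower().replace('x-', 'X-')

print(fix_capitalization('X-name Serves British Food'))
# -> "X-name serves british food"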
def delexicalize_refs(mrs, refs, delex_slots, delex_output_file):
    delex_refs = []
    print('Delexicalizing...', end=' ', file=sys.stderr)
    for pos, (mr, ref) in enumerate(zip(mrs, refs)):
        delex_ref = []
        print(pos, end=' ', file=sys.stderr)
        sys.stderr.flush()
        for sent in ref:
            delex, _, absts = delex_sent(mr, [tok[0] for tok in sent],
                                         delex_slots, repeated=True)
            off = 0
            shift = 0
            delex_tagged = []
            for abst in absts:
                if abst.start == -1:
                    # skip delex instances not actually occurring in this sentence
                    continue
                delex_tagged.extend(sent[off + shift:abst.start + shift])
                # POS tag for all delex'd slots is usually NNP, except for
                # phone numbers and counts
                delex_pos = {'count': 'CD', 'phone': 'CD'}.get(abst.slot, 'NNP')
                delex_tagged.append([delex[abst.start], delex[abst.start], delex_pos])
                off = abst.end
                shift += abst.surface_form.count(' ')
            delex_tagged.extend(sent[off + shift:])
            delex_ref.append(delex_tagged)
        delex_refs.append(delex_ref)
    with codecs.open(delex_output_file + '.delex.txt', 'w', 'UTF-8') as fh:
        fh.write('\n'.join([' '.join([tok[0] for sent in ref for tok in sent])
                            for ref in delex_refs]))
    write_output(delex_refs, 'lca', delex_output_file + '.delex.tag.lca.txt')
    write_output(delex_refs, 'collins', delex_output_file + '.delex.tag.collins.txt')
    return delex_refs
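# Worked example of the off/shift bookkeeping above (all values made up):
# a 2-token surface form at original positions 1-2 collapses into a single
# placeholder in the delexicalized sentence, so indices into the original
# `sent` must be shifted right by surface_form.count(' ').
sent = ['near', 'The', 'Eagle', 'there', 'is', 'a', 'pub']   # original tokens
delex = ['near', 'X-name', 'there', 'is', 'a', 'pub']        # delexicalized
start, end, surface_form = 1, 2, 'The Eagle'                 # one abst record

shift = surface_form.count(' ')            # 1: placeholder covers 2 tokens
assert sent[:start] == delex[:start]       # prefix aligns without shifting
assert sent[end + shift:] == delex[end:]   # suffix aligns once shifted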
def create_fake_data(real_data, columns, score_type='nlg'):
    """Given some real data, create additional fake data, using human references
    and distorting them. Will start from scores provided, or default to best
    possible score.

    @param real_data: a real data set, as pd.DataFrame
    @param columns: list of columns for the fake data set
    @param score_type: switch between Likert scale 1-6 ('nlg') and HTER ('hter')
    @return: a fake data set, with the given columns, some of them empty
    """
    def target_score(src_score, distort_step):
        if score_type == 'hter':
            return src_score + distort_step
        elif score_type == 'rank':
            return 1.  # ignore scores for ranks
        return max(1, min(4., src_score - distort_step))

    normalize = False
    best_score = 6.
    num_steps = 4
    if score_type == 'hter':
        normalize = True
        best_score = 0.
        num_steps = 5
    elif score_type == 'rank':
        best_score = 1.

    fake_data = pd.DataFrame(index=np.arange(len(real_data) * (num_steps + 1)),
                             columns=columns)
    vocab = {}

    # add references as perfect data items
    for idx, row in enumerate(real_data.itertuples()):
        fake_data.loc[idx]['orig_ref'] = row.orig_ref
        fake_data.loc[idx]['system_ref'] = row.orig_ref
        fake_data.loc[idx]['mr'] = row.mr
        fake_data.loc[idx]['is_real'] = 0
        for quant in ['naturalness', 'quality', 'informativeness']:
            fake_data.loc[idx][quant] = (getattr(row, quant)
                                         if (hasattr(row, quant) and
                                             getattr(row, quant) is not None and
                                             not np.isnan(getattr(row, quant)))
                                         else best_score)
        for tok in tokenize(row.orig_ref).split(' '):
            vocab[tok] = vocab.get(tok, 0) + 1

    lexicalizer = Lexicalizer(cfg={'mode': 'tokens'})  # default lexicalizer
    vocab = build_vocab(vocab)

    for distort_step in xrange(1, num_steps + 1):
        for idx, row in enumerate(real_data.itertuples(),
                                  start=distort_step * len(real_data)):
            fake_data.loc[idx]['orig_ref'] = row.orig_ref
            fake_data.loc[idx]['mr'] = row.mr
            fake_data.loc[idx]['is_real'] = 0
            # delexicalize data
            da = DA.parse_cambridge_da(row.mr)
            sent, _, lex_instr = delex_sent(da, tokenize(row.orig_ref).split(' '),
                                            DELEX_SLOTS)
            ref_len = len(sent)
            # distort
            sent = distort_sent(sent, distort_step, vocab)
            # lexicalize again
            sent = lexicalizer._tree_to_sentence([(tok, None) for tok in sent],
                                                 lex_instr)
            fake_data.loc[idx]['system_ref'] = ' '.join(sent)
            for quant in ['naturalness', 'quality', 'informativeness']:
                score = (getattr(row, quant)
                         if (hasattr(row, quant) and
                             getattr(row, quant) is not None and
                             not np.isnan(getattr(row, quant)))
                         else best_score)
                score = target_score(score, distort_step)
                fake_data.loc[idx][quant] = (((score / ref_len) * 100)
                                             if normalize else score)

    return fake_data
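# Minimal usage sketch for create_fake_data (note the snippet is Python 2 --
# `xrange` -- and relies on module-level helpers such as DA.parse_cambridge_da,
# DELEX_SLOTS, distort_sent and build_vocab). The MR and reference strings are
# made up; the column names follow the fields the function reads and writes:
import pandas as pd

columns = ['mr', 'orig_ref', 'system_ref', 'is_real',
           'naturalness', 'quality', 'informativeness']
real_data = pd.DataFrame([{'mr': "inform(name='The Eagle',food='British')",
                           'orig_ref': 'The Eagle serves British food .',
                           'naturalness': 5., 'quality': 5.,
                           'informativeness': 6.}])
fake_data = create_fake_data(real_data, columns, score_type='nlg')
# -> 5 rows per real row: one undistorted reference + 4 distortion steps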
def preprocess_sent(da, sent, delex_slots, delex_slot_names):
    sent = tokenize(sent.lower()).split(' ')
    if delex_slots:
        return delex_sent(da, sent, delex_slots,
                          not delex_slot_names, delex_slot_names)[0]
    return sent
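# Usage sketch (the MR string and slot set are made up; the exact placeholder
# tokens in the output depend on the delex_sent version in use):
da = DA.parse_cambridge_da("inform(name='The Eagle',area='riverside')")
toks = preprocess_sent(da, 'The Eagle is in riverside .',
                       delex_slots={'name', 'area'}, delex_slot_names=False)
# `toks` is a list of lowercased tokens with the name and area values
# replaced by delexicalization placeholders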