def read_system_training_data(filename):
    """Read human references from a system training CSV and turn them into
    unpaired rating instances (marked as not real) for fake-pair creation."""
    insts = []
    for inst in pd.read_csv(filename, index_col=None, encoding='UTF-8').to_dict('records'):
        insts.append({'dataset': 'E2E',
                      'mr': DA.parse_diligent_da(inst['mr']).to_cambridge_da_string(),
                      'delex_mr': DA.parse_diligent_da(inst['mr']).get_delexicalized(
                          set(['name', 'near'])).to_cambridge_da_string(),
                      'system': 'HUMAN',
                      'system_ref': None,
                      'orig_ref': inst['ref'],
                      'informativeness': None,
                      'naturalness': None,
                      'quality': None,
                      'is_real': 0})
    log_info("Using %d different training human references to create fake pairs" % len(insts))
    return insts
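
# Hedged usage sketch (illustration only, hypothetical file name): the CSV read
# above needs at least an 'mr' column (E2E-style MR) and a 'ref' column (human
# reference text); everything else in the returned dicts is filled with
# constants or None.
def _example_read_system_training_data():
    insts = read_system_training_data('e2e-train-refs.csv')
    return [inst['orig_ref'] for inst in insts]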
def process_file(tagger_model, input_file):
    """Detokenize and morphologically tag system outputs from a TSV file
    (columns 'MR' and 'output'), write the tagged references in several
    formats, and return data statistics."""
    detok = Detokenizer()
    df = pd.read_csv(input_file, sep="\t", encoding="UTF-8")
    raw_mrs = list(df['MR'])
    raw_refs = [detok.detokenize(text) for text in list(df['output'])]
    mrs = [DA.parse_diligent_da(mr) for mr in raw_mrs]
    tagger = MorphoTagger(tagger_model)
    tagged_refs = [tagger.tag(line) for line in raw_refs]
    for ff in ['ngram', 'lca', 'collins']:
        write_output(tagged_refs, ff, re.sub(r'\.tsv', '.tag.%s.txt' % ff, input_file))
    stats = data_stats(mrs, tagged_refs, {'name': [], 'near': []}, re.sub(r'\.tsv', '', input_file))
    return stats
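
# Hedged usage sketch (illustration only, hypothetical paths): process_file()
# above expects a tab-separated file with 'MR' and 'output' columns plus a
# MorphoTagger model; it writes .tag.ngram/.tag.lca/.tag.collins files next to
# the input and returns the computed statistics.
def _example_process_file():
    return process_file('english.tagger', 'system-outputs.tsv')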
def convert(args):
    """Convert the INLG source CSV into the target TSV rating format,
    remapping columns and converting MRs to Cambridge-style DA strings."""
    src = pd.read_csv(args.src_file, index_col=None, encoding='utf-8')
    df = pd.DataFrame(index=np.arange(len(src)), columns=COLUMNS)
    for src_col, trg_col in COLUMN_MAP.iteritems():
        if isinstance(trg_col, list):
            for trg_col_ in trg_col:
                df[trg_col_] = src[src_col]
        else:
            df[trg_col] = src[src_col]
    df['mr'] = [DA.parse_diligent_da(da).to_cambridge_da_string() for da in src['mr']]
    df['is_real'] = np.ones(len(src), dtype=np.int32)
    df['dataset'] = ['INLG'] * len(src)
    df['system'] = ['human'] * len(src)
    df.to_csv(args.out_file, columns=COLUMNS, sep=b"\t", index=False, encoding='UTF-8')
def convert(args): """Main function – read in the CSV data and output TGEN-specific files.""" # find out which slots should be abstracted (from command-line argument) slots_to_abstract = set() if args.abstract is not None: slots_to_abstract.update(re.split(r'[, ]+', args.abstract)) # initialize storage conc_das = [] das = [] # abstracted DAs concs = [] # concrete sentences texts = [] # abstracted sentences absts = [] # abstraction descriptions # statistics about different DAs da_keys = {} insts = 0 def process_instance(da, conc): da.sort() conc_das.append(da) text, da, abst = delex_sent(da, tokenize(conc), slots_to_abstract, args.slot_names, repeated=True) text = text.lower().replace('x-', 'X-') # lowercase all but placeholders da.sort() da_keys[unicode(da)] = da_keys.get(unicode(da), 0) + 1 das.append(da) concs.append(conc) absts.append(abst) texts.append(text) # process the input data and store it in memory with open(args.in_file, 'r') as fh: csvread = csv.reader(fh, encoding='UTF-8') csvread.next() # skip header for mr, text, voice in csvread: da = DA.parse_diligent_da(mr, voice) process_instance(da, text) insts += 1 print 'Processed', insts, 'instances.' print '%d different DAs.' % len(da_keys) print '%.2f average DAIs per DA' % (sum([len(d) for d in das]) / float(len(das))) print 'Max DA len: %d, max text len: %d' % (max( [len(da) for da in das]), max([text.count(' ') + 1 for text in texts])) # for multi-ref mode, group by the same conc DA if args.multi_ref: groups = OrderedDict() for conc_da, da, conc, text, abst in zip(conc_das, das, concs, texts, absts): group = groups.get(unicode(conc_da), {}) group['da'] = da group['conc_da'] = conc_da group['abst'] = group.get('abst', []) + [abst] group['conc'] = group.get('conc', []) + [conc] group['text'] = group.get('text', []) + [text] groups[unicode(conc_da)] = group conc_das, das, concs, texts, absts = [], [], [], [], [] for group in groups.itervalues(): conc_das.append(group['conc_da']) das.append(group['da']) concs.append("\n".join(group['conc']) + "\n") texts.append("\n".join(group['text']) + "\n") absts.append("\n".join([ "\t".join([unicode(a) for a in absts_]) for absts_ in group['abst'] ]) + "\n") else: # convert abstraction instruction to string (coordinate output with multi-ref mode) absts = ["\t".join([unicode(a) for a in absts_]) for absts_ in absts] with codecs.open(args.out_name + '-das.txt', 'w', 'UTF-8') as fh: for da in das: fh.write(unicode(da) + "\n") with codecs.open(args.out_name + '-conc_das.txt', 'w', 'UTF-8') as fh: for conc_da in conc_das: fh.write(unicode(conc_da) + "\n") with codecs.open(args.out_name + '-conc.txt', 'w', 'UTF-8') as fh: for conc in concs: fh.write(conc + "\n") with codecs.open(args.out_name + '-abst.txt', 'w', 'UTF-8') as fh: for abst in absts: fh.write(abst + "\n") with codecs.open(args.out_name + '-text.txt', 'w', 'UTF-8') as fh: for text in texts: fh.write(text + "\n")
def convert(args): """Main function – read in the CSV data and output TGEN-specific files.""" # find out which slots should be abstracted (from command-line argument) slots_to_abstract = set() if args.abstract is not None: slots_to_abstract.update(re.split(r'[, ]+', args.abstract)) # initialize storage conc_das = [] das = [] # abstracted DAs concs = [] # concrete sentences texts = [] # abstracted sentences absts = [] # abstraction descriptions # statistics about different DAs da_keys = {} insts = 0 def process_instance(da, conc): da.sort() conc_das.append(da) text, da, abst = delex_sent(da, tokenize(conc), slots_to_abstract, args.slot_names, repeated=True) text = text.lower().replace('x-', 'X-') # lowercase all but placeholders da.sort() da_keys[unicode(da)] = da_keys.get(unicode(da), 0) + 1 das.append(da) concs.append(conc) absts.append(abst) texts.append(text) # process the input data and store it in memory with open(args.in_file, 'r') as fh: csvread = csv.reader(fh, encoding='UTF-8') csvread.next() # skip header for mr, text in csvread: da = DA.parse_diligent_da(mr) process_instance(da, text) insts += 1 print 'Processed', insts, 'instances.' print '%d different DAs.' % len(da_keys) print '%.2f average DAIs per DA' % (sum([len(d) for d in das]) / float(len(das))) print 'Max DA len: %d, max text len: %d' % (max([len(da) for da in das]), max([text.count(' ') + 1 for text in texts])) # for multi-ref mode, group by the same conc DA if args.multi_ref: groups = OrderedDict() for conc_da, da, conc, text, abst in zip(conc_das, das, concs, texts, absts): group = groups.get(unicode(conc_da), {}) group['da'] = da group['conc_da'] = conc_da group['abst'] = group.get('abst', []) + [abst] group['conc'] = group.get('conc', []) + [conc] group['text'] = group.get('text', []) + [text] groups[unicode(conc_da)] = group conc_das, das, concs, texts, absts = [], [], [], [], [] for group in groups.itervalues(): conc_das.append(group['conc_da']) das.append(group['da']) concs.append("\n".join(group['conc']) + "\n") texts.append("\n".join(group['text']) + "\n") absts.append("\n".join(["\t".join([unicode(a) for a in absts_]) for absts_ in group['abst']]) + "\n") else: # convert abstraction instruction to string (coordinate output with multi-ref mode) absts = ["\t".join([unicode(a) for a in absts_]) for absts_ in absts] with codecs.open(args.out_name + '-das.txt', 'w', 'UTF-8') as fh: for da in das: fh.write(unicode(da) + "\n") with codecs.open(args.out_name + '-conc_das.txt', 'w', 'UTF-8') as fh: for conc_da in conc_das: fh.write(unicode(conc_da) + "\n") with codecs.open(args.out_name + '-conc.txt', 'w', 'UTF-8') as fh: for conc in concs: fh.write(conc + "\n") with codecs.open(args.out_name + '-abst.txt', 'w', 'UTF-8') as fh: for abst in absts: fh.write(abst + "\n") with codecs.open(args.out_name + '-text.txt', 'w', 'UTF-8') as fh: for text in texts: fh.write(text + "\n")
def convert(args):
    """Read the 5-way system ranking CSV, build pairwise comparison instances
    (higher-valued output stored first), optionally drop ambiguous pairs and
    add fake pairs, and split the result into parts by delexicalized MR."""
    src = pd.read_csv(args.src_file, index_col=None, encoding='utf-8')
    data = []
    src_col = args.column
    trg_col = COLUMN_MAP[src_col[:3]]
    unique_mrs = set()

    for _, src_inst in src.iterrows():
        mr = DA.parse_diligent_da(src_inst['mr']).to_cambridge_da_string()
        delex_mr = DA.parse_diligent_da(src_inst['mr']).get_delexicalized(
            set(['name', 'near'])).to_cambridge_da_string()
        unique_mrs.add(delex_mr)
        syss = [{'sys': src_inst['sys%d' % i],
                 'ref': src_inst['ref%d' % i],
                 'val': src_inst['%s%d' % (src_col, i)]}
                for i in xrange(1, 6)]
        for sys1, sys2 in itertools.combinations(syss, 2):
            if sys1['val'] < sys2['val']:  # without loss of generality
                sys1, sys2 = sys2, sys1
            if sys1['val'] == sys2['val']:  # ignore those that are equal
                continue
            trg_inst = {'dataset': 'E2E',
                        'system': SYSTEMS_MAP[sys1['sys']],
                        'system2': SYSTEMS_MAP[sys2['sys']],
                        'orig_ref': None,
                        'mr': mr,
                        'delex_mr': delex_mr,
                        'system_ref': sys1['ref'],
                        'system_ref2': sys2['ref'],
                        'is_real': 1,
                        'informativeness': None,
                        'naturalness': None,
                        'quality': None}
            trg_inst[trg_col] = 1
            data.append(trg_inst)

    unique_mrs = sorted(list(unique_mrs))
    random.shuffle(unique_mrs)
    part_sizes = [int(p) for p in args.ratio.split(':')]
    part_sizes = [int(round(p * len(unique_mrs) / float(sum(part_sizes))))
                  for p in part_sizes]
    part_sizes[0] = len(unique_mrs) - sum(part_sizes[1:])
    part_labels = args.labels.split(':')
    part_start = 0
    log_info('Data sizes in MRs: %s' % ':'.join([str(p) for p in part_sizes]))

    # remove ambiguous instances
    if args.unambiguous:
        occs = Counter([(inst['mr'], inst['system'], inst['system2']) for inst in data])
        ambig = set()
        for mr, sys1, sys2 in occs.iterkeys():
            if occs.get((mr, sys2, sys1), 0) == occs[(mr, sys1, sys2)]:
                ambig.add((mr, sys1, sys2))
        uniq_data = []
        used_insts = set()
        for inst in data:
            mr, sys1, sys2 = inst['mr'], inst['system'], inst['system2']
            if (mr, sys1, sys2) in ambig or (mr, sys1, sys2) in used_insts:
                continue
            uniq_data.append(inst)
            used_insts.add((mr, sys1, sys2))
        data = uniq_data

    # mark down the configuration
    with codecs.open(os.path.join(args.out_path, 'config'), 'wb', encoding='UTF-8') as fh:
        fh.write(pprint.pformat(vars(args), indent=4, width=100))

    # split the output
    for part_no, (part_size, part_label) in enumerate(zip(part_sizes, part_labels)):
        part_mrs = set(unique_mrs[part_start:part_start + part_size])
        part_data = [inst for inst in data if inst['delex_mr'] in part_mrs]
        if args.shuffle:
            random.shuffle(part_data)
        part_df = pd.DataFrame(part_data)

        if part_no == 0 and args.fake_data:
            # create fake data
            indiv_sys_outputs = get_sys_outputs(part_data)
            if args.fake_data_from:
                indiv_sys_outputs.extend(read_system_training_data(args.fake_data_from))
            fake_insts = create_fake_data(pd.DataFrame.from_records(indiv_sys_outputs),
                                          part_df.columns, score_type='rank')
            fake_pairs = create_fake_pairs(fake_insts, len(indiv_sys_outputs))
            part_df = part_df.append(fake_pairs, sort=True)

        out_file = os.path.join(args.out_path, part_label + '.tsv')
        log_info('File: %s, total size %d' % (out_file, len(part_df)))
        part_df.to_csv(out_file, columns=COLUMNS, sep=b"\t", index=False, encoding='UTF-8')
        part_start += part_size
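
# Hedged sketch of the pairing logic used in convert() above (illustration only,
# plain dicts instead of the real CSV rows): each source row carries five ranked
# system outputs; all ten unordered pairs are generated, ties are dropped, and
# the output with the higher score value is always stored first.
import itertools

def _example_pairwise_ranking():
    syss = [{'sys': 'sys%d' % i, 'val': val}
            for i, val in enumerate([3, 1, 3, 5, 2], start=1)]
    pairs = []
    for sys1, sys2 in itertools.combinations(syss, 2):
        if sys1['val'] < sys2['val']:   # without loss of generality, sys1 has the higher value
            sys1, sys2 = sys2, sys1
        if sys1['val'] == sys2['val']:  # equal values carry no ranking information
            continue
        pairs.append((sys1['sys'], sys2['sys']))
    return pairs  # e.g. ('sys1', 'sys2'): sys1 scored 3, sys2 scored 1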
def parse_mr(mr_text):
    return DA.parse_diligent_da(mr_text).get_delexicalized(set(['name', 'near']))
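
# Hedged usage sketch for parse_mr() above (illustration only): the MR string is
# made up but follows the E2E-style slot[value] format parsed by
# DA.parse_diligent_da(); the 'name' and 'near' values come back delexicalized.
def _example_parse_mr():
    return parse_mr('name[The Eagle], near[Burger King]')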
def read_e2e_data():
    with codecs.open('data/e2e-refs.tag.ngram.txt', 'r', 'UTF-8') as fh:
        refs = [split_tags(inst.strip()) for inst in fh.readlines()]
    with codecs.open('data/e2e-mrs.txt', 'r', 'UTF-8') as fh:
        mrs = [DA.parse_diligent_da(mr) for mr in fh.readlines()]
    return mrs, refs
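
# Hedged usage sketch (illustration only): read_e2e_data() above loads the two
# files under data/ in parallel, so they are presumably line-aligned, i.e.
# mrs[i] is the parsed MR for the tagged reference refs[i].
def _example_read_e2e_data():
    mrs, refs = read_e2e_data()
    assert len(mrs) == len(refs)
    return mrs[0], refs[0]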
def convert(args): """Main function – read in the CSV data and output TGEN-specific files.""" # find out which slots should be abstracted (from command-line argument) slots_to_abstract = set() if args.abstract is not None: slots_to_abstract.update(re.split(r'[, ]+', args.abstract)) # initialize storage conc_das = [] das = [] # abstracted DAs concs = [] # concrete sentences texts = [] # abstracted sentences absts = [] # abstraction descriptions # statistics about different DAs da_keys = {} insts = 0 def process_instance(conc_da, conc): # sort the DA using the same order as in E2E NLG data conc_da.dais.sort(key=lambda dai: (['name', 'eat_type', 'food', 'price_range', 'rating', 'area', 'family_friendly', 'near'].index(dai.slot), dai.value)) conc_das.append(conc_da) text, da, abst = delex_sent(conc_da, tokenize(conc), slots_to_abstract, args.slot_names, repeated=True) text = text.lower().replace('x-', 'X-') # lowercase all but placeholders da.dais.sort(key=lambda dai: (['name', 'eat_type', 'food', 'price_range', 'rating', 'area', 'family_friendly', 'near'].index(dai.slot), dai.value)) da_keys[str(da)] = da_keys.get(str(da), 0) + 1 das.append(da) concs.append(conc) absts.append(abst) texts.append(text) # process the input data and store it in memory data = pd.read_csv(args.in_file, sep=',', encoding='UTF-8') data['mr'] = data['mr'].fillna('') for inst in data.itertuples(): da = DA.parse_diligent_da(inst.mr) process_instance(da, inst.ref) insts += 1 if insts % 100 == 0: print('%d...' % insts, end='', flush=True, file=sys.stderr) print('Processed', insts, 'instances.', file=sys.stderr) print('%d different DAs.' % len(da_keys), file=sys.stderr) print('%.2f average DAIs per DA' % (sum([len(d) for d in das]) / float(len(das))), file=sys.stderr) print('Max DA len: %d, max text len: %d' % (max([len(da) for da in das]), max([text.count(' ') + 1 for text in texts])), file=sys.stderr) # for multi-ref mode, group by the same conc DA if args.multi_ref: groups = OrderedDict() # keep the original order (by 1st occurrence of DA) for conc_da, da, conc, text, abst in zip(conc_das, das, concs, texts, absts): group = groups.get(str(conc_da), {}) group['da'] = da group['conc_da'] = conc_da group['abst'] = group.get('abst', []) + [abst] group['conc'] = group.get('conc', []) + [conc] group['text'] = group.get('text', []) + [text] groups[str(conc_da)] = group conc_das, das, concs, texts, absts = [], [], [], [], [] for group in groups.values(): conc_das.append(group['conc_da']) das.append(group['da']) concs.append("\n".join(group['conc']) + "\n") texts.append("\n".join(group['text']) + "\n") absts.append("\n".join(["\t".join([str(a) for a in absts_]) for absts_ in group['abst']]) + "\n") else: # convert abstraction instruction to string (coordinate output with multi-ref mode) absts = ["\t".join([str(a) for a in absts_]) for absts_ in absts] with codecs.open(args.out_name + '-das.txt', 'w', 'UTF-8') as fh: for da in das: fh.write(str(da) + "\n") with codecs.open(args.out_name + '-conc_das.txt', 'w', 'UTF-8') as fh: for conc_da in conc_das: fh.write(str(conc_da) + "\n") with codecs.open(args.out_name + '-conc.txt', 'w', 'UTF-8') as fh: for conc in concs: fh.write(conc + "\n") with codecs.open(args.out_name + '-abst.txt', 'w', 'UTF-8') as fh: for abst in absts: fh.write(abst + "\n") with codecs.open(args.out_name + '-text.txt', 'w', 'UTF-8') as fh: for text in texts: fh.write(text + "\n")
def convert(args): """Main function – read in the CSV data and output TGEN-specific files.""" # find out which slots should be abstracted (from command-line argument) slots_to_abstract = set() if args.abstract is not None: slots_to_abstract.update(re.split(r'[, ]+', args.abstract)) # initialize storage conc_das = [] das = [] # abstracted DAs concs = [] # concrete sentences texts = [] # abstracted sentences absts = [] # abstraction descriptions original_sents = [] delexicalised_sents = [] sent_ids = [] mrs_for_delex = [] # statistics about different DAs da_keys = {} insts = 0 find_apostrophes = r"([a-z])\s('[a-z]{1,2}\b)" def process_instance(da, conc, mr, multi_ref_id): original_da = deepcopy(da) # why do the das need to be sorted? This seems weird # Anyway, we checked it gets sorted in delex_sent anyway so nothing to # do about it until later da.sort() conc_das.append(da) text, da, abst = delex_sent(da, tokenize(conc), slots_to_abstract, args.slot_names, repeated=True) # Originall we didn't want to lower case because it will make things # easier for udpipe later on, however ... # we changed our mind on this because the upper case characters are # messing with udpipe's ability to properly sentence tokenize. # we need underscores instead of dashes or else udpipe breaks it apart # text = re.sub(r"X-", r"X_", text) # Again running into problems with leaving x and as a capital letter # and also with udpipe randomly segmenting it but sometimes not. We # really need to find a more reliable sentence tokenizer / word # tokenizer text = text.lower().replace('x-', 'x') # We're testing out making xnear upper case to see if it reduces the # incorrect dropping of it by the deep parser text = text.replace('xnear', 'Xnear') # detokenize some of the apostrophe stuff because udpipe does it # differently. Namely removing spaces between letters and apostrophes text = re.sub(find_apostrophes, r"\1\2", text) da.sort() da_keys[unicode(da)] = da_keys.get(unicode(da), 0) + 1 das.append(da) concs.append(conc) absts.append(abst) texts.append(text) # now for our own bastardized sentence tokenization and human eval # required stuff this_conc_sents = sent_tokenize(conc) num_sents = len(this_conc_sents) this_delex_sents = [] for i, this_conc_sent in enumerate(this_conc_sents): text, _, _ = delex_sent(original_da, tokenize(this_conc_sent), slots_to_abstract, args.slot_names, repeated=True) text = text.lower().replace('x-', 'x') # We're testing out making xnear upper case to see if it reduces the # incorrect dropping of it by the deep parser text = text.replace('xnear', 'Xnear') # detokenize some of the apostrophe stuff because udpipe does it # differently. Namely removing spaces between letters and apostrophes text = re.sub(find_apostrophes, r"\1\2", text) this_delex_sents.append(text) # start appending the sentence specific ones sent_ids.append('_'.join([mr.replace(' ', ''), str(multi_ref_id), str(i)])) mrs_for_delex.append(mr) # now we're onto something else original_sents.append('\n'.join(this_conc_sents)) delexicalised_sents.append('\n'.join(this_delex_sents)) # this_delex_sents = sent_tokenize(text) # num_sents = len(this_conc_sents) # if num_sents != len(this_delex_sents): # # this is very bad if this happens! 
# # import ipdb; ipdb.set_trace() # print '\n' # print this_conc_sents # print this_delex_sents # print '\nnext example' # original_sents.append('\n'.join(this_conc_sents)) # delexicalised_sents.append('\n'.join(this_delex_sents)) # for i in range(num_sents): # sent_ids.append('_'.join([mr.replace(' ', ''), str(multi_ref_id), # str(i)])) # mrs_for_delex.append(mr) # process the input data and store it in memory with open(args.in_file, 'r') as fh: csvread = csv.reader(fh, encoding='UTF-8') csvread.next() # skip header multi_ref_count = Counter() for mr, text in tqdm(csvread): multi_ref_count[mr] += 1 da = DA.parse_diligent_da(mr) process_instance(da, text, mr, multi_ref_count[mr]) insts += 1 print 'Processed', insts, 'instances.' print '%d different DAs.' % len(da_keys) print '%.2f average DAIs per DA' % (sum([len(d) for d in das]) / float(len(das))) print 'Max DA len: %d, max text len: %d' % (max([len(da) for da in das]), max([text.count(' ') + 1 for text in texts])) # for multi-ref mode, group by the same conc DA if args.multi_ref: groups = OrderedDict() for conc_da, da, conc, text, abst in zip(conc_das, das, concs, texts, absts): group = groups.get(unicode(conc_da), {}) group['da'] = da group['conc_da'] = conc_da group['abst'] = group.get('abst', []) + [abst] group['conc'] = group.get('conc', []) + [conc] group['text'] = group.get('text', []) + [text] groups[unicode(conc_da)] = group conc_das, das, concs, texts, absts = [], [], [], [], [] for group in groups.itervalues(): conc_das.append(group['conc_da']) das.append(group['da']) concs.append("\n".join(group['conc']) + "\n") texts.append("\n".join(group['text']) + "\n") absts.append("\n".join(["\t".join([unicode(a) for a in absts_]) for absts_ in group['abst']]) + "\n") else: # convert abstraction instruction to string (coordinate output with multi-ref mode) absts = ["\t".join([unicode(a) for a in absts_]) for absts_ in absts] with codecs.open(args.out_name + '-das.txt', 'w', 'UTF-8') as fh: for da in das: fh.write(unicode(da) + "\n") with codecs.open(args.out_name + '-conc_das.txt', 'w', 'UTF-8') as fh: for conc_da in conc_das: fh.write(unicode(conc_da) + "\n") with codecs.open(args.out_name + '-conc.txt', 'w', 'UTF-8') as fh: for conc in concs: fh.write(conc + "\n") with codecs.open(args.out_name + '-abst.txt', 'w', 'UTF-8') as fh: for abst in absts: fh.write(abst + "\n") # We join on double new lines so that udpipe will read them out as # different paragraphs with codecs.open(args.out_name + '-text.txt', 'w', 'UTF-8') as fh: for text in texts: fh.write(text + "\n\n") # here are all our new ones with codecs.open(args.out_name + '-orig_sents.txt', 'w', 'UTF-8') as fh: for this in original_sents: fh.write(this + "\n") # again gets a double new lines for processing with udpipe with codecs.open(args.out_name + '-delex_sents.txt', 'w', 'UTF-8') as fh: for this in delexicalised_sents: fh.write(this + "\n\n") with codecs.open(args.out_name + '-sent_ids.txt', 'w', 'UTF-8') as fh: for this in sent_ids: fh.write(this + "\n") with codecs.open(args.out_name + '-mrs_for_delex.txt', 'w', 'UTF-8') as fh: for this in mrs_for_delex: fh.write(this + "\n")