def augment_with_contrast_tgen(dataset, filename):
    """Augments the MRs with auxiliary tokens indicating a pair of slots that should be contrasted
    in the corresponding generated utterance. The output is in the format accepted by TGen.

    Writes a CSV with the original columns plus four indicator columns: contrast1/contrast2 for
    slot pairs with differing scalar values, and concession1/concession2 for pairs with equal ones.
    """
    contrast_connectors = ['but', 'however', 'yet']
    # NOTE(review): scalar_slots appears to map slot -> value -> numeric rank (see the
    # subtraction below) — confirm against get_scalar_slots().
    scalar_slots = get_scalar_slots()
    alignments = []
    contrasts = []

    print('Augmenting MRs with contrast in ' + str(filename))

    # Read in the data
    data_cont = data_loader.init_test_data(os.path.join(config.DATA_DIR, dataset, filename))
    mrs, utterances = data_cont['data']
    _, _, slot_sep, val_sep, val_sep_end = data_cont['separators']

    for i, mr in enumerate(mrs):
        mr_dict = OrderedDict()

        # Extract the slot-value pairs into a dictionary
        for slot_value in mr.split(slot_sep):
            slot, value, slot_orig, _ = data_loader.parse_slot_and_value(slot_value, val_sep, val_sep_end)
            mr_dict[slot] = value
            # Normalize the slot name in the MR string itself (replaces the original spelling)
            mrs[i] = mrs[i].replace(slot_orig, slot)

        alignments.append(find_alignment(utterances[i], mr_dict))

    for i in range(len(utterances)):
        # Default: no contrast/concession pair identified for this sample
        contrasts.append(['none', 'none', 'none', 'none'])
        for contrast_conn in contrast_connectors:
            contrast_pos = utterances[i].find(contrast_conn)
            if contrast_pos >= 0:
                slot_before = None
                value_before = None
                slot_after = None
                value_after = None

                # Find the nearest scalar slot mentioned before the connective, and the first
                # scalar slot mentioned after it (alignment entries are (position, slot, value))
                for pos, slot, value in alignments[i]:
                    if pos > contrast_pos:
                        # No scalar slot preceded the connective -> no pair to record
                        if not slot_before:
                            break
                        if slot in scalar_slots:
                            slot_after = slot
                            value_after = value
                            break
                    else:
                        if slot in scalar_slots:
                            slot_before = slot
                            value_before = value

                if slot_before and slot_after:
                    # Equal scalar ranks -> concession; differing ranks -> contrast
                    if scalar_slots[slot_before][value_before] - scalar_slots[slot_after][value_after] == 0:
                        contrasts[i][2] = slot_before
                        contrasts[i][3] = slot_after
                    else:
                        contrasts[i][0] = slot_before
                        contrasts[i][1] = slot_after

                # Only the first connective found in the utterance is considered
                break

    new_df = pd.DataFrame(columns=['mr', 'ref', 'contrast1', 'contrast2', 'concession1', 'concession2'])
    new_df['mr'] = mrs
    new_df['ref'] = utterances
    new_df['contrast1'] = [tup[0] for tup in contrasts]
    new_df['contrast2'] = [tup[1] for tup in contrasts]
    new_df['concession1'] = [tup[2] for tup in contrasts]
    new_df['concession2'] = [tup[3] for tup in contrasts]

    filename_out = ''.join(filename.split('.')[:-1]) + '_augm_contrast_tgen.csv'
    new_df.to_csv(os.path.join(config.DATA_DIR, dataset, filename_out), index=False, encoding='utf8')
def align_slots(dataset, filename):
    """Aligns slots of the MRs with their mentions in the corresponding utterances."""
    print('Aligning slots in ' + str(filename))

    # Load the samples together with the separator tokens of this dataset's MR format
    data_cont = data_loader.init_test_data(os.path.join(config.DATA_DIR, dataset, filename))
    mrs_orig, utterances_orig = data_cont['data']
    _, _, slot_sep, val_sep, val_sep_end = data_cont['separators']

    # Normalize the MRs and the utterances before attempting the alignment
    mrs = [data_loader.preprocess_mr(mr, data_cont['separators']) for mr in mrs_orig]
    utterances = [data_loader.preprocess_utterance(utt) for utt in utterances_orig]

    alignment_strings = []
    for mr, utt in zip(mrs, utterances):
        slot_dict = OrderedDict()

        # Collect the slot-value pairs of this MR
        for slot_value in mr.split(slot_sep):
            slot, value, _, _ = data_loader.parse_slot_and_value(slot_value, val_sep, val_sep_end)
            slot_dict[slot] = value

        # Render the alignment as a space-separated list of "(position: slot)" tokens
        alignment = find_alignment(utt, slot_dict)
        alignment_strings.append(' '.join('({0}: {1})'.format(pos, slot) for pos, slot, _ in alignment))

    # Emit the original samples annotated with their alignment strings
    new_df = pd.DataFrame({
        'mr': mrs_orig,
        'ref': utterances_orig,
        'alignment': alignment_strings,
    })

    filename_out = os.path.splitext(filename)[0] + ' [aligned].csv'
    new_df.to_csv(os.path.join(config.DATA_DIR, dataset, filename_out), index=False, encoding='utf8')
def score_slot_realizations(path, filename):
    """Analyzes unrealized and hallucinated slot mentions in the utterances.

    Writes a CSV with the original MR/utterance pairs plus an error count and a
    comma-separated list of the offending slots for each sample.
    """
    errors = []
    incorrect_slots = []

    print('Analyzing missing slot realizations and hallucinations in ' + str(filename))

    # Read in the data
    data_cont = data_loader.init_test_data(os.path.join(path, filename))
    dataset_name = data_cont['dataset_name']
    mrs_orig, utterances_orig = data_cont['data']
    _, _, slot_sep, val_sep, val_sep_end = data_cont['separators']

    # Preprocess the MRs and the utterances
    mrs = [data_loader.preprocess_mr(mr, data_cont['separators']) for mr in mrs_orig]
    utterances = [data_loader.preprocess_utterance(utt) for utt in utterances_orig]

    for i, mr in enumerate(mrs):
        mr_dict = OrderedDict()

        # Extract the slot-value pairs into a dictionary
        for slot_value in mr.split(slot_sep):
            slot, value, _, _ = data_loader.parse_slot_and_value(slot_value, val_sep, val_sep_end)
            mr_dict[slot] = value

        # TODO: get rid of this hack
        # Move the food-slot to the end of the dict (because of delexing)
        if 'food' in mr_dict:
            mr_dict.move_to_end('food')

        # Delexicalize the MR and the utterance
        utterances[i] = data_loader.delex_sample(mr_dict, utterances[i], dataset=dataset_name)

        # Count the missing and hallucinated slots in the utterance
        cur_errors, cur_incorrect_slots = count_errors(utterances[i], mr_dict)
        errors.append(cur_errors)
        incorrect_slots.append(', '.join(cur_incorrect_slots))

    new_df = pd.DataFrame(columns=['mr', 'ref', 'errors', 'incorrect slots'])
    new_df['mr'] = mrs_orig
    new_df['ref'] = utterances_orig
    new_df['errors'] = errors
    new_df['incorrect slots'] = incorrect_slots

    filename_out = os.path.splitext(filename)[0] + ' [errors].csv'
    new_df.to_csv(os.path.join(path, filename_out), index=False, encoding='utf8')
def analyze_contrast_relations(dataset, filename):
    """Identifies the slots involved in a contrast relation.

    For each sample, records the slot aligned immediately before and immediately after
    the first contrast connective found in the utterance, prints frequency distributions
    of those slots and slot pairs, and writes the per-sample results to a CSV file.
    """
    contrast_connectors = ['but', 'however', 'yet']
    slots_before = []
    slots_after = []

    print('Analyzing contrast relations in ' + str(filename))

    # Read in the data
    data_cont = data_loader.init_test_data(os.path.join(config.DATA_DIR, dataset, filename))
    mrs_orig, utterances_orig = data_cont['data']
    _, _, slot_sep, val_sep, val_sep_end = data_cont['separators']

    # Preprocess the MRs
    mrs = [data_loader.preprocess_mr(mr, data_cont['separators']) for mr in mrs_orig]

    for mr, utt in zip(mrs, utterances_orig):
        mr_dict = OrderedDict()

        # Extract the slot-value pairs into a dictionary
        # (the original slot/value spellings returned by the parser are not needed here)
        for slot_value in mr.split(slot_sep):
            slot, value, _, _ = data_loader.parse_slot_and_value(slot_value, val_sep, val_sep_end)
            mr_dict[slot] = value

        # Find the slot alignment
        alignment = find_alignment(utt, mr_dict)

        slot_before = None
        slot_after = None

        for contrast_conn in contrast_connectors:
            contrast_pos = utt.find(contrast_conn)
            if contrast_pos >= 0:
                slot_before = None
                slot_after = None

                # Walk the alignment until the first slot past the connective; slot_before
                # then holds the slot mentioned just before it
                for pos, slot, value in alignment:
                    slot_before = slot_after
                    slot_after = slot

                    if pos > contrast_pos:
                        break

                # Only the first connective found in the utterance is considered
                break

        slots_before.append(slot_before if slot_before is not None else '')
        slots_after.append(slot_after if slot_after is not None else '')

    # Calculate the frequency distribution of slots involved in a contrast relation
    contrast_slot_cnt = Counter()
    contrast_slot_cnt.update(slots_before + slots_after)
    del contrast_slot_cnt['']
    print('\n---- Slot distribution in contrast relations ----\n')
    print('\n'.join(slot + ': ' + str(freq) for slot, freq in contrast_slot_cnt.most_common()))

    # Calculate the frequency distribution of slot pairs involved in a contrast relation
    contrast_slot_cnt = Counter()
    slot_pairs = [tuple(sorted(slot_pair)) for slot_pair in zip(slots_before, slots_after) if slot_pair != ('', '')]
    contrast_slot_cnt.update(slot_pairs)
    print('\n---- Slot pair distribution in contrast relations ----\n')
    print('\n'.join(slot_pair[0] + ', ' + slot_pair[1] + ': ' + str(freq)
                    for slot_pair, freq in contrast_slot_cnt.most_common()))

    new_df = pd.DataFrame(columns=['mr', 'ref', 'slot before contrast', 'slot after contrast'])
    new_df['mr'] = mrs_orig
    new_df['ref'] = utterances_orig
    new_df['slot before contrast'] = slots_before
    new_df['slot after contrast'] = slots_after

    filename_out = os.path.splitext(filename)[0] + ' [contrast relations].csv'
    new_df.to_csv(os.path.join(config.DATA_DIR, dataset, filename_out), index=False, encoding='utf8')
def score_contrast(dataset, filename):
    """Determines whether the indicated contrast relation is correctly realized in the utterance.

    Writes a CSV with, per sample, whether a requested contrast was missed, whether it was
    realized incorrectly, and whether a contrast was requested at all.
    """
    contrast_connectors = ['but', 'however', 'yet']
    contrast_missed = []
    contrast_incorrectness = []
    contrast_total = []

    print('Analyzing contrast realizations in ' + str(filename))

    # Read in the data
    data_cont = data_loader.init_test_data(os.path.join(config.EVAL_DIR, dataset, filename))
    dataset_name = data_cont['dataset_name']
    mrs_orig, utterances_orig = data_cont['data']
    _, _, slot_sep, val_sep, val_sep_end = data_cont['separators']

    # Preprocess the MRs and the utterances
    mrs = [data_loader.preprocess_mr(mr, data_cont['separators']) for mr in mrs_orig]
    utterances = [data_loader.preprocess_utterance(utt) for utt in utterances_orig]

    for i, mr in enumerate(mrs):
        contrast_found = False
        contrast_correct = False
        contrast_slots = []
        mr_dict = OrderedDict()

        # Extract the slot-value pairs into a dictionary
        for slot_value in mr.split(slot_sep):
            slot, value, _, _ = data_loader.parse_slot_and_value(slot_value, val_sep, val_sep_end)

            # Extract slots to be contrasted (the auxiliary token's value lists the slot names)
            if slot in [config.CONTRAST_TOKEN, config.CONCESSION_TOKEN]:
                contrast_slots.extend(value.split())
            else:
                mr_dict[slot] = value

        # Delexicalize the MR and the utterance
        utterances[i] = data_loader.delex_sample(mr_dict, utterances[i], dataset=dataset_name)

        # Determine the slot alignment in the utterance
        alignment = find_alignment(utterances[i], mr_dict)

        contrast_total.append(1 if len(contrast_slots) > 0 else 0)

        if len(contrast_slots) > 0:
            for contrast_conn in contrast_connectors:
                contrast_pos = utterances[i].find(contrast_conn)
                if contrast_pos < 0:
                    continue

                # Positions of the first two contrasted slots realized in the utterance;
                # dist counts aligned slots seen after the left one (inclusive of the right)
                slot_left_pos = -1
                slot_right_pos = -1
                dist = 0

                contrast_found = True

                # Check whether the correct pair of slots was contrasted
                for pos, slot, _ in alignment:
                    if slot_left_pos > -1:
                        dist += 1
                    if slot in contrast_slots:
                        if slot_left_pos == -1:
                            slot_left_pos = pos
                        else:
                            slot_right_pos = pos
                            break

                if slot_left_pos > -1 and slot_right_pos > -1:
                    # Correct iff the connective lies between the two contrasted slots and
                    # they are close together (at most one aligned slot in between)
                    if slot_left_pos < contrast_pos < slot_right_pos and dist <= 2:
                        contrast_correct = True
                        break
        else:
            # No contrast was requested, so nothing can be missed or incorrect
            contrast_found = True
            contrast_correct = True

        contrast_missed.append(0 if contrast_found else 1)
        contrast_incorrectness.append(0 if contrast_correct else 1)

    new_df = pd.DataFrame(columns=['mr', 'ref', 'missed contrast', 'incorrect contrast', 'total contrast'])
    new_df['mr'] = mrs_orig
    new_df['ref'] = utterances_orig
    new_df['missed contrast'] = contrast_missed
    new_df['incorrect contrast'] = contrast_incorrectness
    new_df['total contrast'] = contrast_total

    filename_out = os.path.splitext(filename)[0] + ' [contrast eval].csv'
    new_df.to_csv(os.path.join(config.EVAL_DIR, dataset, filename_out), index=False, encoding='utf8')
def score_emphasis(dataset, filename):
    """Determines how many of the indicated emphasis instances are realized in the utterance."""
    missed_counts = []
    total_counts = []

    print('Analyzing emphasis realizations in ' + str(filename))

    # Load the samples and the separator tokens of this dataset's MR format
    data_cont = data_loader.init_test_data(os.path.join(config.EVAL_DIR, dataset, filename))
    dataset_name = data_cont['dataset_name']
    mrs_orig, utterances_orig = data_cont['data']
    _, _, slot_sep, val_sep, val_sep_end = data_cont['separators']

    # Normalize the MRs and the utterances
    mrs = [data_loader.preprocess_mr(mr, data_cont['separators']) for mr in mrs_orig]
    utterances = [data_loader.preprocess_utterance(utt) for utt in utterances_orig]

    for idx, mr in enumerate(mrs):
        pending_emph = False
        emph_slots = set()
        slot_dict = OrderedDict()

        # Parse the MR; an emphasis token marks the slot that immediately follows it
        for slot_value in mr.split(slot_sep):
            slot, value, _, _ = data_loader.parse_slot_and_value(slot_value, val_sep, val_sep_end)
            if slot == config.EMPH_TOKEN:
                pending_emph = True
                continue
            slot_dict[slot] = value
            if pending_emph:
                emph_slots.add(slot)
                pending_emph = False

        # Delexicalize before aligning
        utterances[idx] = data_loader.delex_sample(slot_dict, utterances[idx], dataset=dataset_name)

        # Determine where each slot is mentioned in the utterance
        alignment = find_alignment(utterances[idx], slot_dict)

        total_counts.append(len(emph_slots))

        # An emphasized slot counts as realized only if mentioned before the name-slot;
        # whatever remains in emph_slots afterwards was missed
        for _, slot, _ in alignment:
            if slot == 'name':
                break
            emph_slots.discard(slot)

        missed_counts.append(len(emph_slots))

    new_df = pd.DataFrame(columns=['mr', 'ref', 'missed emphasis', 'total emphasis'])
    new_df['mr'] = mrs_orig
    new_df['ref'] = utterances_orig
    new_df['missed emphasis'] = missed_counts
    new_df['total emphasis'] = total_counts

    filename_out = os.path.splitext(filename)[0] + ' [emphasis eval].csv'
    new_df.to_csv(os.path.join(config.EVAL_DIR, dataset, filename_out), index=False, encoding='utf8')
def augment_with_aux_indicators(dataset, filename, indicators, mode='all', alt_contrast_mode=False):
    """Augment MRs in a dataset with auxiliary tokens indicating desired discourse phenomena
    in the corresponding utterances. Depending on the mode, the augmented dataset will only
    contain samples which exhibit 1.) at most one of the desired indicators ('single'), 2.) the
    one selected indicator only ('only'), or 3.) all the desired indicators at once ('combo').
    The default mode ('all') keeps all samples in the dataset.
    """
    if not indicators:
        return

    # Buckets for the different augmentation outcomes
    mrs_augm = []
    mrs_single, utterances_single = [], []
    mrs_emph_only, utterances_emph_only = [], []
    mrs_contrast_only, utterances_contrast_only = [], []
    mrs_combo, utterances_combo = [], []

    print('Augmenting MRs with ' + ' + '.join(indicators) + ' in ' + str(filename))

    # Load the samples and the separator tokens of this dataset's MR format
    data_cont = data_loader.init_test_data(os.path.join(config.DATA_DIR, dataset, filename))
    mrs, utterances = data_cont['data']
    _, _, slot_sep, val_sep, val_sep_end = data_cont['separators']

    for mr, utt in zip(mrs, utterances):
        slot_dict = OrderedDict()
        augm_pairs = []

        # Parse the MR, keeping both normalized values (for alignment) and original ones (for output)
        for slot_value in mr.split(slot_sep):
            slot, value, slot_orig, value_orig = data_loader.parse_slot_and_value(slot_value, val_sep, val_sep_end)
            slot_dict[slot] = value
            augm_pairs.append((slot, value_orig))

        # Find the slot alignment
        alignment = find_alignment(utt, slot_dict)

        # Insert the requested auxiliary tokens into the slot list
        if 'emphasis' in indicators:
            __add_emphasis_tokens(augm_pairs, alignment)
        if 'contrast' in indicators:
            __add_contrast_tokens(augm_pairs, utt, alignment, alt_mode=alt_contrast_mode)

        # Serialize the augmented slot list back into an MR string
        sep_suffix = val_sep_end if val_sep_end is not None else ''
        mr_augm = (slot_sep + ' ').join([s + val_sep + v + sep_suffix for s, v in augm_pairs])
        mrs_augm.append(mr_augm)

        # Route the sample into the bucket matching its combination of indicators
        augm_slot_names = {s for s, v in augm_pairs}
        has_emph = config.EMPH_TOKEN in augm_slot_names
        has_contrast = config.CONTRAST_TOKEN in augm_slot_names or config.CONCESSION_TOKEN in augm_slot_names

        if has_emph and has_contrast:
            mrs_combo.append(mr_augm)
            utterances_combo.append(utt)
        else:
            mrs_single.append(mr_augm)
            utterances_single.append(utt)
            if has_emph:
                mrs_emph_only.append(mr_augm)
                utterances_emph_only.append(utt)
            elif has_contrast:
                mrs_contrast_only.append(mr_augm)
                utterances_contrast_only.append(utt)

    print('# of MRs with emphasis only:', len(mrs_emph_only))
    print('# of MRs with contrast/concession only:', len(mrs_contrast_only))
    print('# of MRs with emphasis & contrast/concession:', len(mrs_combo))

    # Select the samples to export according to the requested mode
    new_df = pd.DataFrame(columns=['mr', 'ref'])
    selection = None
    if mode == 'single':
        selection = (mrs_single, utterances_single)
    elif mode == 'only':
        if 'emphasis' in indicators:
            selection = (mrs_emph_only, utterances_emph_only)
        elif 'contrast' in indicators:
            selection = (mrs_contrast_only, utterances_contrast_only)
    elif mode == 'combo':
        selection = (mrs_combo, utterances_combo)
    else:
        selection = (mrs_augm, utterances)

    if selection is not None:
        new_df['mr'] = selection[0]
        new_df['ref'] = selection[1]

    # Store augmented dataset to a new file
    filename_out = os.path.splitext(filename)[0] + '_augm_' + '_'.join(indicators) \
        + (('_' + mode) if mode != 'all' else '') + ('_alt' if alt_contrast_mode else '') + '.csv'
    new_df.to_csv(os.path.join(config.DATA_DIR, dataset, filename_out), index=False, encoding='utf8')
def augment_by_utterance_splitting(dataset, filename, denoise_only=False):
    """Performs utterance splitting and augments the dataset with new pseudo-samples whose
    utterances are one sentence long. The MR of each pseudo-sample contains only slots
    mentioned in the corresponding sentence. Assumes a CSV or JSON file as input.
    """
    if not filename.lower().endswith(('.csv', '.json')):
        raise ValueError('Unexpected file type. Please provide a CSV or JSON file as input.')

    print('Performing utterance splitting on ' + str(filename))

    # Load the samples and the separator tokens of this dataset's MR format
    data_cont = data_loader.init_test_data(os.path.join(config.DATA_DIR, dataset, filename))
    mrs, utterances = data_cont['data']
    _, _, slot_sep, val_sep, val_sep_end = data_cont['separators']

    # Parse each MR into an ordered slot -> original-value mapping
    mrs_dicts = []
    for mr in mrs:
        slot_dict = OrderedDict()
        for slot_value in mr.split(slot_sep):
            slot, _, _, value_orig = data_loader.parse_slot_and_value(slot_value, val_sep, val_sep_end)
            slot_dict[slot] = value_orig
        mrs_dicts.append(slot_dict)

    # Split (or merely denoise) the utterances along sentence boundaries
    new_mrs, new_utterances = split_content(mrs_dicts, utterances, filename,
                                            permute=False, denoise_only=denoise_only)

    suffix = ' [' + ('denoised' if denoise_only else 'utt. split') + ']'
    base, ext = os.path.splitext(filename)
    filename_out = base + suffix + ext

    data_new = []
    if filename.lower().endswith('.csv'):
        # CSV output: render each MR as "slot[value]" pairs, skipping empty MRs
        for mr, utt in zip(new_mrs, new_utterances):
            if not mr:
                continue
            mr_str = ', '.join(['{0}[{1}]'.format(slot, value) for slot, value in mr.items()])
            data_new.append([mr_str, utt])

        # Write the augmented dataset to a new file
        pd.DataFrame(data_new).to_csv(os.path.join(config.DATA_DIR, dataset, filename_out),
                                      header=['mr', 'ref'],
                                      index=False,
                                      encoding='utf8')
    elif filename.lower().endswith('.json'):
        # JSON output: reconstruct the dialogue-act style "da(slot=value, ...)" representation
        for mr, utt in zip(new_mrs, new_utterances):
            if not mr:
                continue
            mr_str = mr.pop('da')
            mr_str += '(' + slot_sep.join(
                ['{0}{1}{2}'.format(key.rstrip(string.digits), val_sep, value) for key, value in mr.items()]
            ) + ')'
            data_new.append([mr_str, utt])

        # Write the augmented dataset to a new file
        with io.open(os.path.join(config.DATA_DIR, dataset, filename_out), 'w', encoding='utf8') as f_data_new:
            json.dump(data_new, f_data_new, indent=4)