import itertools
import logging
import os
import shutil

import numpy as np

# Helpers such as get_vocab, if_exist, ids2text, label2idx, get_label_stats,
# calc_class_weights, list2str, write_vocab_in_order, the process_* converters,
# and the DATABASE_EXISTS_TMP template are assumed to come from the
# surrounding dataset-utils module.


def read_intent_slot_outputs(queries,
                             intent_file,
                             slot_file,
                             intent_logits,
                             slot_logits,
                             slot_masks,
                             intents=None,
                             slots=None):
    """Log predicted (and optionally true) intents and slots per query."""
    intent_dict = get_vocab(intent_file)
    slot_dict = get_vocab(slot_file)
    pred_intents = np.argmax(intent_logits, axis=1)
    pred_slots = np.argmax(slot_logits, axis=2)
    slot_masks = slot_masks > 0.5
    for i, query in enumerate(queries):
        logging.info(f'Query: {query}')
        pred = pred_intents[i]
        logging.info(f'Predicted intent:\t{pred}\t{intent_dict[pred]}')
        if intents is not None:
            logging.info(
                f'True intent:\t{intents[i]}\t{intent_dict[intents[i]]}')

        # Keep only the slot predictions for real (unmasked) tokens.
        pred_slot = pred_slots[i][slot_masks[i]]
        tokens = query.strip().split()
        if len(pred_slot) != len(tokens):
            raise ValueError('Pred_slot and tokens must be of the same length')

        for j, token in enumerate(tokens):
            output = f'{token}\t{slot_dict[pred_slot[j]]}'
            if slots is not None:
                output = f'{output}\t{slot_dict[slots[i][j]]}'
            logging.info(output)
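# Usage sketch (illustrative only): feed the function random stand-ins for a
# trained model's outputs. `num_intents`, `num_slots`, `max_seq_len`, and the
# dict file paths below are assumptions about the surrounding setup, not part
# of the original module.
#
#     num_intents, num_slots, max_seq_len = 26, 129, 50
#     queries = ['list flights from denver to boston']
#     intent_logits = np.random.rand(1, num_intents)
#     slot_logits = np.random.rand(1, max_seq_len, num_slots)
#     slot_masks = np.zeros((1, max_seq_len))
#     slot_masks[0, :6] = 1  # the query above has six tokens
#     read_intent_slot_outputs(queries, 'dict.intents.csv', 'dict.slots.csv',
#                              intent_logits, slot_logits, slot_masks)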
def process_atis(infold, outfold, modes=['train', 'test'],
                 do_lower_case=False):
    """ MSFT's dataset, processed by Kaggle
    https://www.kaggle.com/siddhadev/atis-dataset-from-ms-cntk
    """
    vocab = get_vocab(f'{infold}/atis.dict.vocab.csv')

    if if_exist(outfold, [f'{mode}.tsv' for mode in modes]):
        logging.info(DATABASE_EXISTS_TMP.format('ATIS', outfold))
        return outfold
    logging.info(f'Processing ATIS dataset and storing at {outfold}.')

    os.makedirs(outfold, exist_ok=True)

    outfiles = {}
    for mode in modes:
        outfiles[mode] = open(os.path.join(outfold, mode + '.tsv'), 'w')
        outfiles[mode].write('sentence\tlabel\n')
        outfiles[mode + '_slots'] = open(f'{outfold}/{mode}_slots.tsv', 'w')

        with open(f'{infold}/atis.{mode}.query.csv', 'r') as f:
            queries = f.readlines()
        with open(f'{infold}/atis.{mode}.intent.csv', 'r') as f:
            intents = f.readlines()
        with open(f'{infold}/atis.{mode}.slots.csv', 'r') as f:
            slots = f.readlines()

        for i, query in enumerate(queries):
            # Drop the BOS/EOS ids at positions 0 and -1 before decoding.
            sentence = ids2text(query.strip().split()[1:-1], vocab)
            if do_lower_case:
                sentence = sentence.lower()
            outfiles[mode].write(f'{sentence}\t{intents[i].strip()}\n')
            slot = ' '.join(slots[i].strip().split()[1:-1])
            outfiles[mode + '_slots'].write(slot + '\n')

    shutil.copyfile(f'{infold}/atis.dict.intent.csv',
                    f'{outfold}/dict.intents.csv')
    shutil.copyfile(f'{infold}/atis.dict.slots.csv',
                    f'{outfold}/dict.slots.csv')
    for mode in modes:
        outfiles[mode].close()
        outfiles[mode + '_slots'].close()  # also close the slot files
    return outfold  # callers use the processed directory as data_dir
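# Usage sketch: convert the Kaggle ATIS dump (atis.train.query.csv,
# atis.train.intent.csv, atis.train.slots.csv, plus the atis.dict.* files)
# into the tab-separated format expected downstream. The paths are
# assumptions about where the raw dump lives.
#
#     processed_dir = process_atis(infold='data/ATIS',
#                                  outfold='data/ATIS/nemo-processed',
#                                  do_lower_case=True)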
def __init__(self,
             data_dir,
             do_lower_case=False,
             dataset_name='default',
             none_slot_label='O',
             pad_label=-1):
    if dataset_name == 'atis':
        # NOTE: process_atis takes (infold, outfold, ...); the processed
        # directory name passed here is an assumption, chosen to match the
        # 'nemo-processed' layout referenced in merge() below.
        self.data_dir = process_atis(data_dir,
                                     f'{data_dir}/nemo-processed',
                                     do_lower_case=do_lower_case)
    elif dataset_name == 'snips-atis':
        self.data_dir, self.pad_label = self.merge(
            data_dir,
            ['ATIS/nemo-processed-uncased',
             'snips/nemo-processed-uncased/all'],
            dataset_name)
    elif dataset_name == 'dialogflow':
        self.data_dir = process_dialogflow(data_dir, do_lower_case)
    elif dataset_name == 'mturk-processed':
        self.data_dir = process_mturk(data_dir, do_lower_case)
    elif dataset_name in {'snips-light', 'snips-speak', 'snips-all'}:
        self.data_dir = process_snips(data_dir, do_lower_case)
        if dataset_name.endswith('light'):
            self.data_dir = f'{self.data_dir}/light'
        elif dataset_name.endswith('speak'):
            self.data_dir = f'{self.data_dir}/speak'
        elif dataset_name.endswith('all'):
            self.data_dir = f'{self.data_dir}/all'
    elif dataset_name.startswith('jarvis'):
        self.data_dir = process_jarvis_datasets(
            data_dir,
            do_lower_case,
            dataset_name,
            modes=["train", "test", "eval"],
            ignore_prev_intent=False)
    else:
        if not if_exist(data_dir, ['dict.intents.csv', 'dict.slots.csv']):
            raise FileNotFoundError(
                "Make sure that your data follows the standard format "
                "supported by JointIntentSlotDataset. Your data must "
                "contain dict.intents.csv and dict.slots.csv.")
        self.data_dir = data_dir

    self.intent_dict_file = self.data_dir + '/dict.intents.csv'
    self.slot_dict_file = self.data_dir + '/dict.slots.csv'
    self.num_intents = len(get_vocab(self.intent_dict_file))
    slots = label2idx(self.slot_dict_file)
    self.num_slots = len(slots)

    for mode in ['train', 'test', 'eval']:
        if not if_exist(self.data_dir, [f'{mode}.tsv']):
            logging.info(f' Stats calculation for {mode} mode'
                         f' is skipped as {mode}.tsv was not found.')
            continue

        slot_file = f'{self.data_dir}/{mode}_slots.tsv'
        with open(slot_file, 'r') as f:
            slot_lines = f.readlines()

        input_file = f'{self.data_dir}/{mode}.tsv'
        with open(input_file, 'r') as f:
            input_lines = f.readlines()[1:]  # Skipping headers at index 0

        if len(slot_lines) != len(input_lines):
            raise ValueError(
                "Make sure that the number of slot lines match the "
                "number of intent lines. There should be a 1-1 "
                "correspondence between every slot and intent lines.")

        dataset = list(zip(slot_lines, input_lines))

        raw_slots, queries, raw_intents = [], [], []
        for slot_line, input_line in dataset:
            slot_list = [int(slot) for slot in slot_line.strip().split()]
            raw_slots.append(slot_list)
            parts = input_line.strip().split()
            raw_intents.append(int(parts[-1]))
            queries.append(' '.join(parts[:-1]))

        infold = input_file[:input_file.rfind('/')]

        logging.info(f'Three most popular intents during {mode}ing')
        total_intents, intent_label_freq = get_label_stats(
            raw_intents, infold + f'/{mode}_intent_stats.tsv')
        merged_slots = itertools.chain.from_iterable(raw_slots)
        logging.info(f'Three most popular slots during {mode}ing')
        slots_total, slots_label_freq = get_label_stats(
            merged_slots, infold + f'/{mode}_slot_stats.tsv')

        if mode == 'train':
            self.slot_weights = calc_class_weights(slots_label_freq)
            logging.info(f'Slot weights are - {self.slot_weights}')
            self.intent_weights = calc_class_weights(intent_label_freq)
            logging.info(f'Intent weights are - {self.intent_weights}')

        logging.info(f'Total intents - {total_intents}')
        logging.info(f'Intent label frequency - {intent_label_freq}')
        logging.info(f'Total Slots - {slots_total}')
        logging.info(f'Slots label frequency - {slots_label_freq}')

    if pad_label != -1:
        self.pad_label = pad_label
    else:
        if none_slot_label not in slots:
            raise ValueError(f'none_slot_label {none_slot_label} not '
                             f'found in {self.slot_dict_file}.')
        self.pad_label = slots[none_slot_label]
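# Usage sketch (assuming, from the error message above, that this __init__
# belongs to a data-descriptor class, here called JointIntentSlotDataDesc;
# the data directory is an assumption and must follow the layout checked
# above):
#
#     data_desc = JointIntentSlotDataDesc(data_dir='data/ATIS',
#                                         do_lower_case=True,
#                                         dataset_name='atis')
#     print(data_desc.num_intents, data_desc.num_slots, data_desc.pad_label)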
def merge(self, data_dir, subdirs, dataset_name, modes=['train', 'test']):
    """Merge several processed datasets into one, remapping label ids."""
    outfold = f'{data_dir}/{dataset_name}'
    if if_exist(outfold, [f'{mode}.tsv' for mode in modes]):
        logging.info(DATABASE_EXISTS_TMP.format('SNIPS-ATIS', outfold))
        slots = get_vocab(f'{outfold}/dict.slots.csv')
        none_slot = 0
        for key in slots:
            if slots[key] == 'O':
                none_slot = key
                break
        return outfold, int(none_slot)

    os.makedirs(outfold, exist_ok=True)

    data_files, slot_files = {}, {}
    for mode in modes:
        data_files[mode] = open(f'{outfold}/{mode}.tsv', 'w')
        data_files[mode].write('sentence\tlabel\n')
        slot_files[mode] = open(f'{outfold}/{mode}_slots.tsv', 'w')

    intents, slots = {}, {}
    intent_shift, slot_shift = 0, 0
    none_intent, none_slot = -1, -1

    for subdir in subdirs:
        curr_intents = get_vocab(f'{data_dir}/{subdir}/dict.intents.csv')
        curr_slots = get_vocab(f'{data_dir}/{subdir}/dict.slots.csv')

        for key in curr_intents:
            # Keep a single shared 'O' label: remember its id from the
            # first dataset and skip duplicates from later ones.
            if intent_shift > 0 and curr_intents[key] == 'O':
                continue
            if curr_intents[key] == 'O' and intent_shift == 0:
                none_intent = int(key)
            intents[int(key) + intent_shift] = curr_intents[key]

        for key in curr_slots:
            if slot_shift > 0 and curr_slots[key] == 'O':
                continue
            if slot_shift == 0 and curr_slots[key] == 'O':
                none_slot = int(key)
            slots[int(key) + slot_shift] = curr_slots[key]

        for mode in modes:
            with open(f'{data_dir}/{subdir}/{mode}.tsv', 'r') as f:
                for line in f.readlines()[1:]:
                    text, label = line.strip().split('\t')
                    label = int(label)
                    if curr_intents[label] == 'O':
                        label = none_intent
                    else:
                        label = label + intent_shift
                    data_files[mode].write(f'{text}\t{label}\n')

            with open(f'{data_dir}/{subdir}/{mode}_slots.tsv', 'r') as f:
                for line in f.readlines():
                    labels = [int(label) for label in line.strip().split()]
                    shifted_labels = []
                    for label in labels:
                        if curr_slots[label] == 'O':
                            shifted_labels.append(none_slot)
                        else:
                            shifted_labels.append(label + slot_shift)
                    slot_files[mode].write(list2str(shifted_labels) + '\n')

        intent_shift += len(curr_intents)
        slot_shift += len(curr_slots)

    write_vocab_in_order(intents, f'{outfold}/dict.intents.csv')
    write_vocab_in_order(slots, f'{outfold}/dict.slots.csv')
    for mode in modes:
        data_files[mode].close()
        slot_files[mode].close()  # flush merged files before returning
    return outfold, none_slot
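# Worked example of the id remapping above (the label counts are
# illustrative, not the real ATIS/SNIPS sizes): if the first subdir has 18
# slot labels with 'O' at id 0 and the second has 72 with 'O' at id 3, then
# a second-dataset slot id k becomes k + 18, except id 3, which is folded
# into the shared 'O' id 0. In this module, merge() is invoked from
# __init__ when dataset_name == 'snips-atis':
#
#     outfold, pad_label = self.merge(
#         data_dir,
#         ['ATIS/nemo-processed-uncased', 'snips/nemo-processed-uncased/all'],
#         'snips-atis')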
def __init__(self, data_dir, none_slot_label='O', pad_label=-1):
    if not if_exist(data_dir, ['dict.intents.csv', 'dict.slots.csv']):
        raise FileNotFoundError(
            "Make sure that your data follows the standard format "
            "supported by JointIntentSlotDataset. Your data must "
            "contain dict.intents.csv and dict.slots.csv.")
    self.data_dir = data_dir
    self.intent_dict_file = self.data_dir + '/dict.intents.csv'
    self.slot_dict_file = self.data_dir + '/dict.slots.csv'
    self.num_intents = len(get_vocab(self.intent_dict_file))
    slots = JointIntentSlotDataDesc.label2idx(self.slot_dict_file)
    self.num_slots = len(slots)

    for mode in ['train', 'test', 'dev']:
        if not if_exist(self.data_dir, [f'{mode}.tsv']):
            logging.info(f' Stats calculation for {mode} mode'
                         f' is skipped as {mode}.tsv was not found.')
            continue

        slot_file = f'{self.data_dir}/{mode}_slots.tsv'
        with open(slot_file, 'r') as f:
            slot_lines = f.readlines()

        input_file = f'{self.data_dir}/{mode}.tsv'
        with open(input_file, 'r') as f:
            input_lines = f.readlines()[1:]  # Skipping headers at index 0

        if len(slot_lines) != len(input_lines):
            raise ValueError(
                "Make sure that the number of slot lines match the "
                "number of intent lines. There should be a 1-1 "
                "correspondence between every slot and intent lines.")

        dataset = list(zip(slot_lines, input_lines))

        raw_slots, queries, raw_intents = [], [], []
        for slot_line, input_line in dataset:
            slot_list = [int(slot) for slot in slot_line.strip().split()]
            raw_slots.append(slot_list)
            parts = input_line.strip().split()
            raw_intents.append(int(parts[-1]))
            queries.append(' '.join(parts[:-1]))

        infold = input_file[:input_file.rfind('/')]

        logging.info(f'Three most popular intents during {mode}ing')
        total_intents, intent_label_freq = get_label_stats(
            raw_intents, infold + f'/{mode}_intent_stats.tsv')
        merged_slots = itertools.chain.from_iterable(raw_slots)
        logging.info(f'Three most popular slots during {mode}ing')
        slots_total, slots_label_freq = get_label_stats(
            merged_slots, infold + f'/{mode}_slot_stats.tsv')

        if mode == 'train':
            self.slot_weights = calc_class_weights(slots_label_freq)
            logging.info(f'Slot weights are - {self.slot_weights}')
            self.intent_weights = calc_class_weights(intent_label_freq)
            logging.info(f'Intent weights are - {self.intent_weights}')

        logging.info(f'Total intents - {total_intents}')
        logging.info(f'Intent label frequency - {intent_label_freq}')
        logging.info(f'Total Slots - {slots_total}')
        logging.info(f'Slots label frequency - {slots_label_freq}')

    if pad_label != -1:
        self.pad_label = pad_label
    else:
        if none_slot_label not in slots:
            raise ValueError(f'none_slot_label {none_slot_label} not '
                             f'found in {self.slot_dict_file}.')
        self.pad_label = slots[none_slot_label]
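# Usage sketch: this variant (the JointIntentSlotDataDesc.label2idx
# reference suggests it is a later revision of the same descriptor class)
# expects an already-processed directory with dict.intents.csv,
# dict.slots.csv, and train/test/dev .tsv files, and computes class weights
# from the training split. The path below is an assumption.
#
#     desc = JointIntentSlotDataDesc(data_dir='data/ATIS/nemo-processed')
#     print(desc.num_intents, desc.num_slots, desc.pad_label)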