def process_atis(infold, outfold, modes=['train', 'test'], do_lower_case=False):
    """ MSFT's dataset, processed by Kaggle
    https://www.kaggle.com/siddhadev/atis-dataset-from-ms-cntk
    """
    vocab = get_vocab(f'{infold}/atis.dict.vocab.csv')

    if if_exist(outfold, [f'{mode}.tsv' for mode in modes]):
        logging.info(DATABASE_EXISTS_TMP.format('ATIS', outfold))
        return outfold
    logging.info(f'Processing ATIS dataset and storing at {outfold}.')

    os.makedirs(outfold, exist_ok=True)

    outfiles = {}
    for mode in modes:
        outfiles[mode] = open(os.path.join(outfold, mode + '.tsv'), 'w')
        outfiles[mode].write('sentence\tlabel\n')
        outfiles[mode + '_slots'] = open(f'{outfold}/{mode}_slots.tsv', 'w')

        queries = open(f'{infold}/atis.{mode}.query.csv', 'r').readlines()
        intents = open(f'{infold}/atis.{mode}.intent.csv', 'r').readlines()
        slots = open(f'{infold}/atis.{mode}.slots.csv', 'r').readlines()

        for i, query in enumerate(queries):
            # Strip the BOS/EOS token ids before converting back to text.
            sentence = ids2text(query.strip().split()[1:-1], vocab)
            if do_lower_case:
                sentence = sentence.lower()
            outfiles[mode].write(f'{sentence}\t{intents[i].strip()}\n')
            slot = ' '.join(slots[i].strip().split()[1:-1])
            outfiles[mode + '_slots'].write(slot + '\n')

    shutil.copyfile(f'{infold}/atis.dict.intent.csv', f'{outfold}/dict.intents.csv')
    shutil.copyfile(f'{infold}/atis.dict.slots.csv', f'{outfold}/dict.slots.csv')
    for mode in modes:
        outfiles[mode].close()
        outfiles[mode + '_slots'].close()
    return outfold
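# Usage sketch for process_atis (paths are illustrative, not shipped with the
# repo): given the Kaggle ATIS CSVs under `infold`, this writes {mode}.tsv and
# {mode}_slots.tsv plus the two label dictionaries under `outfold`.
#
#     processed_dir = process_atis(
#         infold='data/ATIS', outfold='data/ATIS/nemo-processed', do_lower_case=True)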
def create_vocab_lm(self, data_dir, do_lower_case):
    if if_exist(data_dir, ['train.txt', 'vocab.txt']):
        logging.info("Vocabulary has been created.")
        with open(os.path.join(data_dir, 'vocab.txt'), 'r') as f:
            vocab_size = len(f.readlines())
        return vocab_size

    logging.info(f'Creating vocabulary from training data at {data_dir}')

    with open(f'{data_dir}/train.txt', 'r') as f:
        txt = f.read()
    if do_lower_case:
        txt = txt.lower()
    lines = re.split(r'[\n]', txt)
    sentences = [line.strip().split() for line in lines if line.strip()]

    vocab = {"[PAD]": 0, "[SEP]": 1, "[CLS]": 2, "[MASK]": 3}
    idx = 4
    for sentence in sentences:
        for word in sentence:
            if word not in vocab:
                vocab[word] = idx
                idx += 1

    # Write words ordered by their assigned ids so that the line number in
    # vocab.txt equals the word id (sorting alphabetically would break the
    # special-token ids, e.g. [PAD] must stay on line 0).
    with open(f'{data_dir}/vocab.txt', 'w') as f:
        for word in sorted(vocab, key=vocab.get):
            f.write(word + '\n')
    logging.info(f"Created vocabulary of size {len(vocab)}")

    return len(vocab)
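# Example (hypothetical paths): after create_vocab_lm runs on data/lm/train.txt,
# a word's id can be recovered from vocab.txt by its line position:
#
#     with open('data/lm/vocab.txt') as f:
#         word2id = {w.strip(): i for i, w in enumerate(f)}
#     assert word2id['[PAD]'] == 0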
def process_snips(infold, outfold, do_lower_case, modes=['train', 'test'], dev_split=0.1):
    if not os.path.exists(infold):
        link = 'https://github.com/snipsco/spoken-language-understanding-research-datasets'
        raise ValueError(
            f'Data not found at {infold}. '
            f'You may request to download the SNIPS dataset from {link}.')

    exist = True
    for dataset in ['light', 'speak', 'all']:
        if if_exist(f'{outfold}/{dataset}', [f'{mode}.tsv' for mode in modes]):
            logging.info(DATABASE_EXISTS_TMP.format('SNIPS-' + dataset, outfold))
        else:
            exist = False
    if exist:
        return outfold

    logging.info(f'Processing SNIPS dataset and storing at folders "speak", "light" and "all" under {outfold}.')
    logging.info('Processing and importing "smart-speaker-en-close-field" -> "speak" and "smart-lights-en-close-field" -> "light".')

    os.makedirs(outfold, exist_ok=True)

    speak_dir = 'smart-speaker-en-close-field'
    light_dir = 'smart-lights-en-close-field'

    light_files = [f'{infold}/{light_dir}/dataset.json']
    speak_files = [f'{infold}/{speak_dir}/training_dataset.json']
    speak_files.append(f'{infold}/{speak_dir}/test_dataset.json')

    light_train, light_dev, light_slots, light_intents = get_dataset(light_files, dev_split)
    speak_train, speak_dev, speak_slots, speak_intents = get_dataset(speak_files)

    create_dataset(light_train, light_dev, light_slots, light_intents,
                   do_lower_case, f'{outfold}/light')
    create_dataset(speak_train, speak_dev, speak_slots, speak_intents,
                   do_lower_case, f'{outfold}/speak')
    create_dataset(
        light_train + speak_train,
        light_dev + speak_dev,
        light_slots | speak_slots,
        light_intents | speak_intents,
        do_lower_case,
        f'{outfold}/all',
    )
    return outfold
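# Usage sketch (illustrative paths): process the raw SNIPS research datasets
# into the three NeMo-format subfolders, then point a data descriptor at one
# of them.
#
#     snips_root = process_snips('data/snips/raw', 'data/snips/nemo-processed',
#                                do_lower_case=True)
#     light_dir = f'{snips_root}/light'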
def __init__(self, data_dir, modes=['train', 'test', 'dev']):
    self.data_dir = data_dir
    max_label_id = 0
    for mode in modes:
        if not if_exist(self.data_dir, [f'{mode}.tsv']):
            logging.info(f'Stats calculation for {mode} mode is skipped as {mode}.tsv was not found.')
            continue

        input_file = f'{self.data_dir}/{mode}.tsv'
        with open(input_file, 'r') as f:
            input_lines = f.readlines()[1:]  # Skipping headers at index 0

        try:
            int(input_lines[0].strip().split()[-1])
        except ValueError:
            logging.warning(f'No numerical labels found for {mode}.tsv.')
            raise

        queries, raw_sentences = [], []
        for input_line in input_lines:
            parts = input_line.strip().split()
            label = int(parts[-1])
            raw_sentences.append(label)
            queries.append(' '.join(parts[:-1]))

        infold = input_file[:input_file.rfind('/')]

        logging.info(f'Three most popular classes in {mode} dataset')
        total_sents, sent_label_freq, max_id = get_label_stats(
            raw_sentences, infold + f'/{mode}_sentence_stats.tsv')
        max_label_id = max(max_label_id, max_id)

        if mode == 'train':
            class_weights_dict = get_freq_weights(sent_label_freq)
            logging.info(f'Class Weights: {class_weights_dict}')
        logging.info(f'Total Sentences: {total_sents}')
        logging.info(f'Sentence class frequencies - {sent_label_freq}')

    self.class_weights = fill_class_weights(class_weights_dict, max_label_id)
    self.num_labels = max_label_id + 1
    default='~/data/state_tracking/multiwoz2.1', type=str)
args = parser.parse_args()

# Get absolute paths.
abs_source_data_dir = expanduser(args.source_data_dir)
abs_target_data_dir = expanduser(args.target_data_dir)

if not exists(abs_source_data_dir):
    raise FileNotFoundError(f"{abs_source_data_dir} doesn't exist.")

# Check if the files exist.
if if_exist(
    abs_target_data_dir,
    ['trainListFile.json', 'val_dials.json', 'test_dials.json', 'train_dials.json', 'ontology.json'],
):
    print(f'Data is already processed and stored at {abs_target_data_dir}, skipping preprocessing.')
    exit(0)

# Set domains.
DOMAINS = ['restaurant', 'hotel', 'attraction', 'train', 'taxi', 'hospital', 'police']

# Raw strings avoid invalid-escape-sequence warnings in the regex templates.
PHONE_NUM_TMPL = r'\(?(\d{3})\)?[-.\s]?(\d{3})[-.\s]?(\d{4,5})'
POSTCODE_TMPL = (r'([a-z]{1}[\. ]?[a-z]{1}[\. ]?\d{1,2}[, ]+\d{1}[\. ]?' +
if __name__ == "__main__": # Parse the command-line arguments. parser = argparse.ArgumentParser(description='Process MultiWOZ dataset') parser.add_argument( "--source_data_dir", required=True, type=str, help='The path to the folder containing the MultiWOZ data files.' ) parser.add_argument("--target_data_dir", default='multiwoz2.1/', type=str) args = parser.parse_args() if not exists(args.source_data_dir): raise FileNotFoundError(f"{args.source_data_dir} does not exist.") # Check if the files exist if if_exist(args.target_data_dir, ['ontology.json', 'dev_dials.json', 'test_dials.json', 'train_dials.json']): print(f'Data is already processed and stored at {args.source_data_dir}, skipping pre-processing.') exit(0) fin = open('multiwoz_mapping.pair', 'r') REPLACEMENTS = [] for line in fin.readlines(): tok_from, tok_to = line.replace('\n', '').split('\t') REPLACEMENTS.append((' ' + tok_from + ' ', ' ' + tok_to + ' ')) print('Creating dialogues...') # Process MultiWOZ dataset delex_data = createData(args.source_data_dir) # Divide data divideData(delex_data, args.source_data_dir, args.target_data_dir)
def __init__(self, data_dir, none_slot_label='O', pad_label=-1):
    if not if_exist(data_dir, ['dict.intents.csv', 'dict.slots.csv']):
        raise FileNotFoundError(
            "Make sure that your data follows the standard format "
            "supported by JointIntentSlotDataset. Your data must "
            "contain dict.intents.csv and dict.slots.csv.")
    self.data_dir = data_dir
    self.intent_dict_file = self.data_dir + '/dict.intents.csv'
    self.slot_dict_file = self.data_dir + '/dict.slots.csv'

    self.intents_label_ids = JointIntentSlotDataDesc.label2idx(self.intent_dict_file)
    self.num_intents = len(self.intents_label_ids)
    self.slots_label_ids = JointIntentSlotDataDesc.label2idx(self.slot_dict_file)
    self.num_slots = len(self.slots_label_ids)

    for mode in ['train', 'test', 'dev']:
        if not if_exist(self.data_dir, [f'{mode}.tsv']):
            logging.info(f' Stats calculation for {mode} mode'
                         f' is skipped as {mode}.tsv was not found.')
            continue
        logging.info(f' Stats calculating for {mode} mode...')

        slot_file = f'{self.data_dir}/{mode}_slots.tsv'
        with open(slot_file, 'r') as f:
            slot_lines = f.readlines()

        input_file = f'{self.data_dir}/{mode}.tsv'
        with open(input_file, 'r') as f:
            input_lines = f.readlines()[1:]  # Skipping headers at index 0

        if len(slot_lines) != len(input_lines):
            raise ValueError(
                "Make sure that the number of slot lines matches the "
                "number of intent lines. There should be a one-to-one "
                "correspondence between every slot and intent line.")

        dataset = list(zip(slot_lines, input_lines))

        raw_slots, queries, raw_intents = [], [], []
        for slot_line, input_line in dataset:
            slot_list = [int(slot) for slot in slot_line.strip().split()]
            raw_slots.append(slot_list)
            parts = input_line.strip().split()
            raw_intents.append(int(parts[-1]))
            queries.append(' '.join(parts[:-1]))

        infold = self.data_dir

        logging.info(f'Three most popular intents in {mode} mode:')
        total_intents, intent_label_freq = get_label_stats(
            raw_intents, infold + f'/{mode}_intent_stats.tsv')
        merged_slots = itertools.chain.from_iterable(raw_slots)

        logging.info(f'Three most popular slots in {mode} mode:')
        slots_total, slots_label_freq = get_label_stats(
            merged_slots, infold + f'/{mode}_slot_stats.tsv')

        if mode == 'train':
            self.slot_weights = calc_class_weights(slots_label_freq)
            logging.info(f'Slot weights are - {self.slot_weights}')
            self.intent_weights = calc_class_weights(intent_label_freq)
            logging.info(f'Intent weights are - {self.intent_weights}')

        logging.info(f'Total intents - {total_intents}')
        logging.info(f'Intent label frequency - {intent_label_freq}')
        logging.info(f'Total Slots - {slots_total}')
        logging.info(f'Slots label frequency - {slots_label_freq}')

    if pad_label != -1:
        self.pad_label = pad_label
    else:
        if none_slot_label not in self.slots_label_ids:
            raise ValueError(f'none_slot_label {none_slot_label} not '
                             f'found in {self.slot_dict_file}.')
        self.pad_label = self.slots_label_ids[none_slot_label]
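# Usage sketch (the directory is illustrative and must already be in the
# format produced by the process_* helpers above):
#
#     data_desc = JointIntentSlotDataDesc('data/ATIS/nemo-processed')
#     print(data_desc.num_intents, data_desc.num_slots, data_desc.pad_label)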
def process_jarvis_datasets(infold, outfold, modes=['train', 'test', 'dev'],
                            do_lower_case=False, ignore_prev_intent=False):
    """ Process and convert Jarvis datasets into NeMo's BIO format """
    dataset_name = "jarvis"
    if if_exist(outfold, ['dict.intents.csv', 'dict.slots.csv']):
        logging.info(DATABASE_EXISTS_TMP.format(dataset_name, outfold))
        return outfold

    logging.info(f'Processing {dataset_name} dataset and storing at {outfold}')
    os.makedirs(outfold, exist_ok=True)

    outfiles = {}
    intents_list = {}
    slots_list = {}
    slots_list_all = {}

    outfiles['dict_intents'] = open(f'{outfold}/dict.intents.csv', 'w')
    outfiles['dict_slots'] = open(f'{outfold}/dict.slots.csv', 'w')

    outfiles['dict_slots'].write('O\n')
    slots_list["O"] = 0
    slots_list_all["O"] = 0

    for mode in modes:
        if if_exist(outfold, [f'{mode}.tsv']):
            logging.info(MODE_EXISTS_TMP.format(mode, dataset_name, outfold, mode))
            continue

        if not if_exist(infold, [f'{mode}.tsv']):
            logging.info(f'{mode} mode of {dataset_name}'
                         f' is skipped as it was not found.')
            continue

        outfiles[mode] = open(os.path.join(outfold, mode + '.tsv'), 'w')
        outfiles[mode].write('sentence\tlabel\n')
        outfiles[mode + '_slots'] = open(f'{outfold}/{mode}_slots.tsv', 'w')

        queries = open(f'{infold}/{mode}.tsv', 'r').readlines()

        for i, query in enumerate(queries):
            line_splits = query.strip().split("\t")
            if len(line_splits) == 3:
                intent_str, slot_tags_str, sentence = line_splits
            else:
                intent_str, sentence = line_splits
                slot_tags_str = ""

            if intent_str not in intents_list:
                intents_list[intent_str] = len(intents_list)
                outfiles['dict_intents'].write(f'{intent_str}\n')

            if ignore_prev_intent:
                start_token = 2
            else:
                start_token = 1

            if do_lower_case:
                sentence = sentence.lower()
            sentence_cld = " ".join(sentence.strip().split()[start_token:-1])
            outfiles[mode].write(f'{sentence_cld}\t{str(intents_list[intent_str])}\n')

            slot_tags_list = []
            if slot_tags_str.strip():
                slot_tags = slot_tags_str.strip().split(",")
                for st in slot_tags:
                    if not st.strip():
                        continue
                    start_i, end_i, slot_name = st.strip().split(":")
                    slot_tags_list.append([int(start_i), int(end_i), slot_name])
                    if slot_name not in slots_list:
                        slots_list[slot_name] = len(slots_list)
                        slots_list_all[f'B-{slot_name}'] = len(slots_list_all)
                        slots_list_all[f'I-{slot_name}'] = len(slots_list_all)
                        outfiles['dict_slots'].write(f'B-{slot_name}\n')
                        outfiles['dict_slots'].write(f'I-{slot_name}\n')

            slot_tags_list.sort(key=lambda x: x[0])
            slots = []
            processed_index = 0
            for tag_start, tag_end, tag_str in slot_tags_list:
                if tag_start > processed_index:
                    words_list = sentence[processed_index:tag_start].strip().split()
                    slots.extend([str(slots_list_all['O'])] * len(words_list))
                words_list = sentence[tag_start:tag_end].strip().split()
                slots.append(str(slots_list_all[f'B-{tag_str}']))
                slots.extend([str(slots_list_all[f'I-{tag_str}'])] * (len(words_list) - 1))
                processed_index = tag_end

            if processed_index < len(sentence):
                words_list = sentence[processed_index:].strip().split()
                slots.extend([str(slots_list_all['O'])] * len(words_list))

            # Drop the slots for the sentence-boundary tokens to match the
            # trimmed sentence written above.
            slots = slots[1:-1]
            slot = ' '.join(slots)
            outfiles[mode + '_slots'].write(slot + '\n')

        outfiles[mode + '_slots'].close()
        outfiles[mode].close()

    outfiles['dict_slots'].close()
    outfiles['dict_intents'].close()

    return outfold
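# Usage sketch: each raw line is expected to be tab-separated as either
# "intent<TAB>slot_tags<TAB>sentence" or "intent<TAB>sentence", where
# slot_tags is a comma-separated list of character spans "start:end:slot_name"
# (inferred from the parsing above). Illustrative call:
#
#     processed = process_jarvis_datasets('data/jarvis', 'data/jarvis/nemo-processed',
#                                         modes=['train', 'test'], do_lower_case=True)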
def partition_data(data, infold, outfold):
    """Partition the data into train, val, and test sets based on the
    val and test file lists shipped with the dataset.
    """
    if if_exist(outfold, [
            'trainListFile.json', 'val_dialogs.json', 'test_dialogs.json',
            'train_dialogs.json', 'ontology.json'
    ]):
        print(f'Data is already processed and stored at {outfold}')
        return
    os.makedirs(outfold, exist_ok=True)

    shutil.copyfile(f'{infold}/ontology.json', f'{outfold}/ontology.json')

    with open(f'{infold}/testListFile.json', 'r') as fin:
        test_files = [line.strip() for line in fin.readlines()]
    with open(f'{infold}/valListFile.json', 'r') as fin:
        val_files = [line.strip() for line in fin.readlines()]

    train_list_files = open(f'{outfold}/trainListFile.json', 'w')

    train_dialogs, val_dialogs, test_dialogs = [], [], []
    count_train, count_val, count_test = 0, 0, 0

    for dialog_id in data:
        dialog = data[dialog_id]
        domains = [key for key in dialog['goal'].keys()
                   if key in DOMAINS and dialog['goal'][key]]

        dial = get_dialog(dialog)
        if dial:
            dialogue = {}
            dialogue['dialog_idx'] = dialog_id
            dialogue['domains'] = list(set(domains))
            last_bs = []
            dialogue['dialog'] = []

            for idx, turn in enumerate(dial):
                turn_dl = {
                    'sys_transcript': dial[idx - 1]['sys'] if idx > 0 else "",
                    'turn_idx': idx,
                    'transcript': turn['usr'],
                    'sys_acts': dial[idx - 1]['sys_a'] if idx > 0 else [],
                    'domain': turn['domain'],
                }
                turn_dl['belief_state'] = [{"slots": [s], "act": "inform"} for s in turn['bvs']]
                turn_dl['turn_label'] = [
                    bs["slots"][0] for bs in turn_dl['belief_state'] if bs not in last_bs
                ]
                last_bs = turn_dl['belief_state']
                dialogue['dialog'].append(turn_dl)

            if dialog_id in test_files:
                test_dialogs.append(dialogue)
                count_test += 1
            elif dialog_id in val_files:
                val_dialogs.append(dialogue)
                count_val += 1
            else:
                train_list_files.write(dialog_id + '\n')
                train_dialogs.append(dialogue)
                count_train += 1

    print(f"Dialogs: {count_train} train, {count_val} val, {count_test} test.")

    # Save all dialogues.
    with open(f'{outfold}/val_dialogs.json', 'w') as fout:
        json.dump(val_dialogs, fout, indent=4)
    with open(f'{outfold}/test_dialogs.json', 'w') as fout:
        json.dump(test_dialogs, fout, indent=4)
    with open(f'{outfold}/train_dialogs.json', 'w') as fout:
        json.dump(train_dialogs, fout, indent=4)
    train_list_files.close()
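# Usage sketch (paths illustrative): `data` is the delexicalized dialogue dict
# keyed by dialogue id, e.g. the output of createData; valListFile.json and
# testListFile.json must sit in `infold`.
#
#     partition_data(delex_data, 'MULTIWOZ2.1', 'multiwoz2.1-processed')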
def __init__(self, data_dir, do_lower_case=False, dataset_name='default',
             none_slot_label='O', pad_label=-1):
    if dataset_name == 'atis':
        self.data_dir = process_atis(data_dir, do_lower_case)
    elif dataset_name == 'snips-atis':
        self.data_dir, self.pad_label = self.merge(
            data_dir,
            ['ATIS/nemo-processed-uncased', 'snips/nemo-processed-uncased/all'],
            dataset_name)
    elif dataset_name == 'dialogflow':
        self.data_dir = process_dialogflow(data_dir, do_lower_case)
    elif dataset_name == 'mturk-processed':
        self.data_dir = process_mturk(data_dir, do_lower_case)
    elif dataset_name in set(['snips-light', 'snips-speak', 'snips-all']):
        self.data_dir = process_snips(data_dir, do_lower_case)
        if dataset_name.endswith('light'):
            self.data_dir = f'{self.data_dir}/light'
        elif dataset_name.endswith('speak'):
            self.data_dir = f'{self.data_dir}/speak'
        elif dataset_name.endswith('all'):
            self.data_dir = f'{self.data_dir}/all'
    elif dataset_name.startswith('jarvis'):
        self.data_dir = process_jarvis_datasets(
            data_dir, do_lower_case, dataset_name,
            modes=["train", "test", "eval"], ignore_prev_intent=False)
    else:
        if not if_exist(data_dir, ['dict.intents.csv', 'dict.slots.csv']):
            raise FileNotFoundError(
                "Make sure that your data follows the standard format "
                "supported by JointIntentSlotDataset. Your data must "
                "contain dict.intents.csv and dict.slots.csv.")
        self.data_dir = data_dir

    self.intent_dict_file = self.data_dir + '/dict.intents.csv'
    self.slot_dict_file = self.data_dir + '/dict.slots.csv'
    self.num_intents = len(get_vocab(self.intent_dict_file))
    slots = label2idx(self.slot_dict_file)
    self.num_slots = len(slots)

    for mode in ['train', 'test', 'eval']:
        if not if_exist(self.data_dir, [f'{mode}.tsv']):
            logging.info(f' Stats calculation for {mode} mode'
                         f' is skipped as {mode}.tsv was not found.')
            continue

        slot_file = f'{self.data_dir}/{mode}_slots.tsv'
        with open(slot_file, 'r') as f:
            slot_lines = f.readlines()

        input_file = f'{self.data_dir}/{mode}.tsv'
        with open(input_file, 'r') as f:
            input_lines = f.readlines()[1:]  # Skipping headers at index 0

        if len(slot_lines) != len(input_lines):
            raise ValueError(
                "Make sure that the number of slot lines matches the "
                "number of intent lines. There should be a one-to-one "
                "correspondence between every slot and intent line.")

        dataset = list(zip(slot_lines, input_lines))

        raw_slots, queries, raw_intents = [], [], []
        for slot_line, input_line in dataset:
            slot_list = [int(slot) for slot in slot_line.strip().split()]
            raw_slots.append(slot_list)
            parts = input_line.strip().split()
            raw_intents.append(int(parts[-1]))
            queries.append(' '.join(parts[:-1]))

        infold = input_file[:input_file.rfind('/')]

        logging.info(f'Three most popular intents during {mode}ing')
        total_intents, intent_label_freq = get_label_stats(
            raw_intents, infold + f'/{mode}_intent_stats.tsv')
        merged_slots = itertools.chain.from_iterable(raw_slots)

        logging.info(f'Three most popular slots during {mode}ing')
        slots_total, slots_label_freq = get_label_stats(
            merged_slots, infold + f'/{mode}_slot_stats.tsv')

        if mode == 'train':
            self.slot_weights = calc_class_weights(slots_label_freq)
            logging.info(f'Slot weights are - {self.slot_weights}')
            self.intent_weights = calc_class_weights(intent_label_freq)
            logging.info(f'Intent weights are - {self.intent_weights}')

        logging.info(f'Total intents - {total_intents}')
        logging.info(f'Intent label frequency - {intent_label_freq}')
        logging.info(f'Total Slots - {slots_total}')
        logging.info(f'Slots label frequency - {slots_label_freq}')

    if pad_label != -1:
        self.pad_label = pad_label
    else:
        if none_slot_label not in slots:
            raise ValueError(f'none_slot_label {none_slot_label} not '
                             f'found in {self.slot_dict_file}.')
        self.pad_label = slots[none_slot_label]
def merge(self, data_dir, subdirs, dataset_name, modes=['train', 'test']):
    outfold = f'{data_dir}/{dataset_name}'
    if if_exist(outfold, [f'{mode}.tsv' for mode in modes]):
        logging.info(DATABASE_EXISTS_TMP.format('SNIPS-ATIS', outfold))
        slots = get_vocab(f'{outfold}/dict.slots.csv')
        none_slot = 0
        for key in slots:
            if slots[key] == 'O':
                none_slot = key
                break
        return outfold, int(none_slot)

    os.makedirs(outfold, exist_ok=True)

    data_files, slot_files = {}, {}
    for mode in modes:
        data_files[mode] = open(f'{outfold}/{mode}.tsv', 'w')
        data_files[mode].write('sentence\tlabel\n')
        slot_files[mode] = open(f'{outfold}/{mode}_slots.tsv', 'w')

    intents, slots = {}, {}
    intent_shift, slot_shift = 0, 0
    none_intent, none_slot = -1, -1

    for subdir in subdirs:
        curr_intents = get_vocab(f'{data_dir}/{subdir}/dict.intents.csv')
        curr_slots = get_vocab(f'{data_dir}/{subdir}/dict.slots.csv')

        for key in curr_intents:
            # Keep a single 'O' intent across the merged vocabularies.
            if intent_shift > 0 and curr_intents[key] == 'O':
                continue
            if curr_intents[key] == 'O' and intent_shift == 0:
                none_intent = int(key)
            intents[int(key) + intent_shift] = curr_intents[key]

        for key in curr_slots:
            if slot_shift > 0 and curr_slots[key] == 'O':
                continue
            if slot_shift == 0 and curr_slots[key] == 'O':
                none_slot = int(key)
            slots[int(key) + slot_shift] = curr_slots[key]

        for mode in modes:
            with open(f'{data_dir}/{subdir}/{mode}.tsv', 'r') as f:
                for line in f.readlines()[1:]:
                    text, label = line.strip().split('\t')
                    label = int(label)
                    if curr_intents[label] == 'O':
                        label = none_intent
                    else:
                        label = label + intent_shift
                    data_files[mode].write(f'{text}\t{label}\n')

            with open(f'{data_dir}/{subdir}/{mode}_slots.tsv', 'r') as f:
                for line in f.readlines():
                    labels = [int(label) for label in line.strip().split()]
                    shifted_labels = []
                    for label in labels:
                        if curr_slots[label] == 'O':
                            shifted_labels.append(none_slot)
                        else:
                            shifted_labels.append(label + slot_shift)
                    slot_files[mode].write(list2str(shifted_labels) + '\n')

        intent_shift += len(curr_intents)
        slot_shift += len(curr_slots)

    for mode in modes:
        data_files[mode].close()
        slot_files[mode].close()

    write_vocab_in_order(intents, f'{outfold}/dict.intents.csv')
    write_vocab_in_order(slots, f'{outfold}/dict.slots.csv')
    return outfold, none_slot
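# Usage sketch mirroring the 'snips-atis' branch of the data descriptor above:
#
#     outfold, pad_label = self.merge(
#         data_dir,
#         ['ATIS/nemo-processed-uncased', 'snips/nemo-processed-uncased/all'],
#         'snips-atis')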
def __init__(self, dataset_name, data_dir, do_lower_case, modes=['train', 'test', 'eval']):
    if dataset_name == 'sst-2':
        self.data_dir = process_sst_2(data_dir)
        self.num_labels = 2
        self.eval_file = self.data_dir + '/dev.tsv'
    elif dataset_name == 'imdb':
        self.num_labels = 2
        self.data_dir = process_imdb(data_dir, do_lower_case)
        self.eval_file = self.data_dir + '/test.tsv'
    elif dataset_name == 'thucnews':
        self.num_labels = 14
        self.data_dir = process_thucnews(data_dir)
        self.eval_file = self.data_dir + '/test.tsv'
    elif dataset_name.startswith('nlu-'):
        if dataset_name.endswith('chat'):
            self.data_dir = f'{data_dir}/ChatbotCorpus.json'
            self.num_labels = 2
        elif dataset_name.endswith('ubuntu'):
            self.data_dir = f'{data_dir}/AskUbuntuCorpus.json'
            self.num_labels = 5
        elif dataset_name.endswith('web'):
            data_dir = f'{data_dir}/WebApplicationsCorpus.json'
            self.num_labels = 8
        self.data_dir = process_nlu(data_dir, do_lower_case, dataset_name=dataset_name)
        self.eval_file = self.data_dir + '/test.tsv'
    elif dataset_name.startswith('jarvis'):
        self.data_dir = process_jarvis_datasets(
            data_dir, do_lower_case, dataset_name,
            modes=['train', 'test', 'eval'], ignore_prev_intent=False)
        intents = get_intent_labels(f'{self.data_dir}/dict.intents.csv')
        self.num_labels = len(intents)
    elif dataset_name == 'default_format':
        # The default format needs no preprocessing; use the directory as-is.
        self.data_dir = data_dir
    else:
        raise ValueError(
            "Looks like you passed a dataset name that isn't "
            "already supported by NeMo. Please make sure "
            "that you build the preprocessing method for it. "
            "default_format assumes that a data file has a header and each line of the file follows "
            "the format: text [TAB] label. Label is assumed to be an integer.")

    self.train_file = self.data_dir + '/train.tsv'

    for mode in modes:
        if not if_exist(self.data_dir, [f'{mode}.tsv']):
            logging.info(f'Stats calculation for {mode} mode is skipped as {mode}.tsv was not found.')
            continue

        input_file = f'{self.data_dir}/{mode}.tsv'
        with open(input_file, 'r') as f:
            input_lines = f.readlines()[1:]  # Skipping headers at index 0

        try:
            int(input_lines[0].strip().split()[-1])
        except ValueError:
            logging.warning(f'No numerical labels found for {mode}.tsv in {dataset_name} dataset.')
            raise

        queries, raw_sentences = [], []
        for input_line in input_lines:
            parts = input_line.strip().split()
            raw_sentences.append(int(parts[-1]))
            queries.append(' '.join(parts[:-1]))

        infold = input_file[:input_file.rfind('/')]

        logging.info(f'Three most popular classes in {mode} dataset')
        total_sents, sent_label_freq = get_label_stats(
            raw_sentences, infold + f'/{mode}_sentence_stats.tsv')

        if mode == 'train':
            self.class_weights = calc_class_weights(sent_label_freq)
            logging.info(f'Class weights are - {self.class_weights}')

        logging.info(f'Total Sentences - {total_sents}')
        logging.info(f'Sentence class frequencies - {sent_label_freq}')
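# Usage sketch, assuming this __init__ belongs to a sentence-classification
# data descriptor class (named SentenceClassificationDataDesc here purely for
# illustration; the path is also illustrative):
#
#     data_desc = SentenceClassificationDataDesc(
#         dataset_name='sst-2', data_dir='data/SST-2', do_lower_case=False)
#     print(data_desc.num_labels, data_desc.train_file)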
"--source_data_dir", required=True, type=str, help='The path to the folder containing the MultiWOZ data files.' ) parser.add_argument("--target_data_dir", default='multiwoz2.1/', type=str) parser.add_argument("--overwrite_files", action="store_true", help="Whether to overwrite preprocessed file") args = parser.parse_args() # Get the absolute path. abs_source_data_dir = expanduser(args.source_data_dir) abs_target_data_dir = expanduser(args.target_data_dir) if not exists(abs_source_data_dir): raise FileNotFoundError(f"{abs_source_data_dir} does not exist.") # Check if the files exist if ( if_exist(abs_target_data_dir, ['ontology.json', 'dev_dials.json', 'test_dials.json', 'train_dials.json', 'db']) and not args.overwrite_files ): print(f'Data is already processed and stored at {abs_target_data_dir}, skipping pre-processing.') exit(0) fin = open('multiwoz_mapping.pair', 'r') REPLACEMENTS = [] for line in fin.readlines(): tok_from, tok_to = line.replace('\n', '').split('\t') REPLACEMENTS.append((' ' + tok_from + ' ', ' ' + tok_to + ' ')) print('Creating dialogues...') # Process MultiWOZ dataset delex_data = createData(abs_source_data_dir) # Divide data