def process_atis(infold, outfold, modes=['train', 'test'], do_lower_case=False):
    """Process the MSFT ATIS dataset, as packaged on Kaggle:
    https://www.kaggle.com/siddhadev/atis-dataset-from-ms-cntk
    """
    vocab = get_vocab(f'{infold}/atis.dict.vocab.csv')

    if if_exist(outfold, [f'{mode}.tsv' for mode in modes]):
        logging.info(DATABASE_EXISTS_TMP.format('ATIS', outfold))
        return outfold

    logging.info(f'Processing ATIS dataset and storing it at {outfold}.')
    os.makedirs(outfold, exist_ok=True)

    outfiles = {}
    for mode in modes:
        outfiles[mode] = open(os.path.join(outfold, mode + '.tsv'), 'w')
        outfiles[mode].write('sentence\tlabel\n')
        outfiles[mode + '_slots'] = open(f'{outfold}/{mode}_slots.tsv', 'w')

        with open(f'{infold}/atis.{mode}.query.csv', 'r') as f:
            queries = f.readlines()
        with open(f'{infold}/atis.{mode}.intent.csv', 'r') as f:
            intents = f.readlines()
        with open(f'{infold}/atis.{mode}.slots.csv', 'r') as f:
            slots = f.readlines()

        for i, query in enumerate(queries):
            # Strip the leading/trailing special token ids (BOS/EOS) before
            # mapping the remaining ids back to words.
            sentence = ids2text(query.strip().split()[1:-1], vocab)
            if do_lower_case:
                sentence = sentence.lower()
            outfiles[mode].write(f'{sentence}\t{intents[i].strip()}\n')
            # Drop the slot tags aligned with the BOS/EOS tokens as well.
            slot = ' '.join(slots[i].strip().split()[1:-1])
            outfiles[mode + '_slots'].write(slot + '\n')

    shutil.copyfile(f'{infold}/atis.dict.intent.csv', f'{outfold}/dict.intents.csv')
    shutil.copyfile(f'{infold}/atis.dict.slots.csv', f'{outfold}/dict.slots.csv')
    for mode in modes:
        outfiles[mode].close()
        outfiles[mode + '_slots'].close()
    return outfold
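
# The functions in this file rely on module-level imports (os, shutil, glob,
# logging, tqdm) and on small helpers defined elsewhere in this module. For
# reference, minimal sketches consistent with how the helpers are used here
# (assumptions, not the verbatim implementations):
#
# DATABASE_EXISTS_TMP = '{} dataset has already been processed and stored at {}'
# MODE_EXISTS_TMP = '{} mode of {} dataset has already been processed and stored at {}/{}.tsv'
#
# def if_exist(outfold, files):
#     # True iff every file in `files` already exists under `outfold`.
#     return all(os.path.exists(os.path.join(outfold, f)) for f in files)
#
# def get_vocab(filename):
#     # One label per line; returns {line_index: label}.
#     with open(filename, 'r') as f:
#         return {i: line.strip() for i, line in enumerate(f) if line.strip()}
#
# def ids2text(ids, vocab):
#     # Map a sequence of token ids back to a space-joined sentence.
#     return ' '.join(vocab[int(id_)] for id_ in ids)
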
def process_snips(infold, outfold, do_lower_case, modes=['train', 'test'], dev_split=0.1):
    if not os.path.exists(infold):
        link = 'https://github.com/snipsco/spoken-language-understanding-research-datasets'
        raise ValueError(
            f'Data not found at {infold}. '
            f'You may request to download the SNIPS dataset from {link}.'
        )

    exist = True
    for dataset in ['light', 'speak', 'all']:
        if if_exist(f'{outfold}/{dataset}', [f'{mode}.tsv' for mode in modes]):
            logging.info(DATABASE_EXISTS_TMP.format('SNIPS-' + dataset, outfold))
        else:
            exist = False
    if exist:
        return outfold

    logging.info(
        f'Processing SNIPS dataset and storing it in the folders "speak", "light" and "all" under {outfold}.'
    )
    logging.info(
        'Importing "smart-speaker-en-close-field" -> "speak" and "smart-lights-en-close-field" -> "light".'
    )

    os.makedirs(outfold, exist_ok=True)

    speak_dir = 'smart-speaker-en-close-field'
    light_dir = 'smart-lights-en-close-field'

    light_files = [f'{infold}/{light_dir}/dataset.json']
    speak_files = [f'{infold}/{speak_dir}/training_dataset.json']
    speak_files.append(f'{infold}/{speak_dir}/test_dataset.json')

    light_train, light_dev, light_slots, light_intents = get_dataset(light_files, dev_split)
    speak_train, speak_dev, speak_slots, speak_intents = get_dataset(speak_files)

    create_dataset(light_train, light_dev, light_slots, light_intents, do_lower_case, f'{outfold}/light')
    create_dataset(speak_train, speak_dev, speak_slots, speak_intents, do_lower_case, f'{outfold}/speak')
    # The "all" split is the concatenation of both datasets, with the slot and
    # intent label sets merged.
    create_dataset(
        light_train + speak_train,
        light_dev + speak_dev,
        light_slots | speak_slots,
        light_intents | speak_intents,
        do_lower_case,
        f'{outfold}/all',
    )
    return outfold
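
# get_dataset and create_dataset are defined elsewhere in this module. From
# their usage above, the inferred contract (an assumption, not the verbatim
# signatures) is roughly:
#
#   get_dataset(files, dev_split=0.1)
#       -> (train_examples, dev_examples, slot_names, intent_names)
#     where slot_names and intent_names are sets, which is why the 'all'
#     split can combine them with the | union operator and the example
#     lists with +.
#
#   create_dataset(train, dev, slots, intents, do_lower_case, outfold)
#     writes the per-mode .tsv files, the matching *_slots.tsv files, and
#     dict.intents.csv / dict.slots.csv under outfold.
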
def create_vocab_mlm(
    data_dir,
    vocab_size,
    sample_size,
    special_tokens=['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'],
    train_file='',
):
    vocab = special_tokens[:]
    bert_dir = f'{data_dir}/bert'
    if if_exist(bert_dir, ['tokenizer.model']):
        logging.info(DATABASE_EXISTS_TMP.format('WikiText_BERT', bert_dir))
        return data_dir, f'{bert_dir}/tokenizer.model'
    logging.info(f'Processing WikiText dataset and storing it at {bert_dir}.')
    os.makedirs(bert_dir, exist_ok=True)

    if not train_file:
        files = glob.glob(f'{data_dir}/*.txt')
        train_file = f'{bert_dir}/merged.txt'
        logging.info(f'Merging {len(files)} txt files into {train_file}.')
        with open(train_file, 'w') as merged:
            for file in tqdm(files):
                with open(file, 'r') as inf:
                    content = inf.read().strip()
                merged.write(content + '\n\n\n')
    else:
        train_file = f'{data_dir}/{train_file}'

    # Reserve room in the SentencePiece vocab for the special tokens that are
    # prepended below.
    cmd = (
        f"--input={train_file} --model_prefix={bert_dir}/tokenizer "
        f"--vocab_size={vocab_size - len(vocab)} "
        f"--input_sentence_size={sample_size} "
        f"--shuffle_input_sentence=true --hard_vocab_limit=false "
        f"--bos_id=-1 --eos_id=-1"
    )
    SPT.Train(cmd)

    # Add BERT control symbols
    tokens = []
    with open(f'{bert_dir}/tokenizer.vocab', 'r') as f:
        f.readline()  # skip first <unk> token
        # Read tokens from each line and parse for vocab
        for line in f:
            piece = line.split('\t')[0]
            token = piece[1:] if piece.startswith('▁') else f'##{piece}'
            tokens.append(token)

    vocab.extend(tokens)

    # Save vocabulary (special tokens first, then subword pieces) to the
    # output file.
    with open(f'{bert_dir}/vocab.txt', 'w') as f:
        for token in vocab:
            f.write(f'{token}\n')

    return data_dir, f'{bert_dir}/tokenizer.model'
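
# SPT above is SentencePiece's trainer, presumably imported at module level
# as e.g. `from sentencepiece import SentencePieceTrainer as SPT`; its Train()
# accepts a single string of command-line style flags. The loop that follows
# it converts SentencePiece's subword convention to BERT's WordPiece one:
# SentencePiece marks word *starts* with '▁', while WordPiece marks word
# *continuations* with '##'. For example:
#   '▁flights' -> 'flights'   (begins a word)
#   'ing'      -> '##ing'     (continues a word)
# so the resulting vocab.txt can be read by a standard BERT tokenizer.
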
def process_jarvis_datasets(
    infold, outfold, modes=['train', 'test', 'dev'], do_lower_case=False, ignore_prev_intent=False
):
    """Process and convert Jarvis datasets into NeMo's BIO format."""
    dataset_name = 'jarvis'
    if if_exist(outfold, ['dict.intents.csv', 'dict.slots.csv']):
        logging.info(DATABASE_EXISTS_TMP.format(dataset_name, outfold))
        return outfold

    logging.info(f'Processing {dataset_name} dataset and storing it at {outfold}.')
    os.makedirs(outfold, exist_ok=True)

    outfiles = {}
    intents_list = {}
    slots_list = {}
    slots_list_all = {}

    outfiles['dict_intents'] = open(f'{outfold}/dict.intents.csv', 'w')
    outfiles['dict_slots'] = open(f'{outfold}/dict.slots.csv', 'w')

    outfiles['dict_slots'].write('O\n')
    slots_list['O'] = 0
    slots_list_all['O'] = 0

    for mode in modes:
        if if_exist(outfold, [f'{mode}.tsv']):
            logging.info(MODE_EXISTS_TMP.format(mode, dataset_name, outfold, mode))
            continue

        if not if_exist(infold, [f'{mode}.tsv']):
            logging.info(f'{mode} mode of {dataset_name} is skipped as it was not found.')
            continue

        outfiles[mode] = open(os.path.join(outfold, mode + '.tsv'), 'w')
        outfiles[mode].write('sentence\tlabel\n')
        outfiles[mode + '_slots'] = open(f'{outfold}/{mode}_slots.tsv', 'w')

        with open(f'{infold}/{mode}.tsv', 'r') as f:
            queries = f.readlines()

        for i, query in enumerate(queries):
            line_splits = query.strip().split('\t')
            if len(line_splits) == 3:
                intent_str, slot_tags_str, sentence = line_splits
            else:
                intent_str, sentence = line_splits
                slot_tags_str = ''

            if intent_str not in intents_list:
                intents_list[intent_str] = len(intents_list)
                outfiles['dict_intents'].write(f'{intent_str}\n')

            # Skip the leading BOS token (and the previous-intent token, when
            # ignore_prev_intent is set); the [:-1] below drops EOS.
            if ignore_prev_intent:
                start_token = 2
            else:
                start_token = 1

            if do_lower_case:
                sentence = sentence.lower()
            sentence_cld = ' '.join(sentence.strip().split()[start_token:-1])
            outfiles[mode].write(f'{sentence_cld}\t{intents_list[intent_str]}\n')

            slot_tags_list = []
            if slot_tags_str.strip():
                slot_tags = slot_tags_str.strip().split(',')
                for st in slot_tags:
                    if not st.strip():
                        continue
                    # Each tag is 'start:end:slot_name', with character
                    # offsets into the raw sentence.
                    start_i, end_i, slot_name = st.strip().split(':')
                    slot_tags_list.append([int(start_i), int(end_i), slot_name])
                    if slot_name not in slots_list:
                        slots_list[slot_name] = len(slots_list)
                        slots_list_all[f'B-{slot_name}'] = len(slots_list_all)
                        slots_list_all[f'I-{slot_name}'] = len(slots_list_all)
                        outfiles['dict_slots'].write(f'B-{slot_name}\n')
                        outfiles['dict_slots'].write(f'I-{slot_name}\n')

            slot_tags_list.sort(key=lambda x: x[0])
            slots = []
            processed_index = 0
            for tag_start, tag_end, tag_str in slot_tags_list:
                if tag_start > processed_index:
                    # Words before this span are outside any slot.
                    words_list = sentence[processed_index:tag_start].strip().split()
                    slots.extend([str(slots_list_all['O'])] * len(words_list))
                # First word of the span gets B-, the rest get I-.
                words_list = sentence[tag_start:tag_end].strip().split()
                slots.append(str(slots_list_all[f'B-{tag_str}']))
                slots.extend([str(slots_list_all[f'I-{tag_str}'])] * (len(words_list) - 1))
                processed_index = tag_end

            if processed_index < len(sentence):
                words_list = sentence[processed_index:].strip().split()
                slots.extend([str(slots_list_all['O'])] * len(words_list))

            # Drop the tags aligned with the BOS and EOS tokens.
            slots = slots[1:-1]
            slot = ' '.join(slots)
            outfiles[mode + '_slots'].write(slot + '\n')

        outfiles[mode + '_slots'].close()
        outfiles[mode].close()

    outfiles['dict_slots'].close()
    outfiles['dict_intents'].close()

    return outfold
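
# Worked example for the span-to-BIO conversion above (hypothetical input
# row, tab-separated, with the BOS/EOS markers this format uses):
#   set_alarm<TAB>15:23:time<TAB>BOS wake me at seven am EOS
# The character span 15:23 covers 'seven am', so the word-level tags are
#   O O O O B-time I-time O
# and slots[1:-1] trims the BOS/EOS positions, leaving
#   O O O B-time I-time
# aligned with the written sentence 'wake me at seven am'.
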
def merge(self, data_dir, subdirs, dataset_name, modes=['train', 'test']):
    outfold = f'{data_dir}/{dataset_name}'
    if if_exist(outfold, [f'{mode}.tsv' for mode in modes]):
        logging.info(DATABASE_EXISTS_TMP.format('SNIPS-ATIS', outfold))
        slots = get_vocab(f'{outfold}/dict.slots.csv')
        none_slot = 0
        for key in slots:
            if slots[key] == 'O':
                none_slot = key
                break
        return outfold, int(none_slot)

    os.makedirs(outfold, exist_ok=True)

    data_files, slot_files = {}, {}
    for mode in modes:
        data_files[mode] = open(f'{outfold}/{mode}.tsv', 'w')
        data_files[mode].write('sentence\tlabel\n')
        slot_files[mode] = open(f'{outfold}/{mode}_slots.tsv', 'w')

    intents, slots = {}, {}
    intent_shift, slot_shift = 0, 0
    none_intent, none_slot = -1, -1

    for subdir in subdirs:
        curr_intents = get_vocab(f'{data_dir}/{subdir}/dict.intents.csv')
        curr_slots = get_vocab(f'{data_dir}/{subdir}/dict.slots.csv')

        # Keep only the first subdir's 'O' label; later copies are skipped and
        # remapped onto it below.
        for key in curr_intents:
            if intent_shift > 0 and curr_intents[key] == 'O':
                continue
            if curr_intents[key] == 'O' and intent_shift == 0:
                none_intent = int(key)
            intents[int(key) + intent_shift] = curr_intents[key]

        for key in curr_slots:
            if slot_shift > 0 and curr_slots[key] == 'O':
                continue
            if slot_shift == 0 and curr_slots[key] == 'O':
                none_slot = int(key)
            slots[int(key) + slot_shift] = curr_slots[key]

        for mode in modes:
            with open(f'{data_dir}/{subdir}/{mode}.tsv', 'r') as f:
                # Skip the 'sentence\tlabel' header line.
                for line in f.readlines()[1:]:
                    text, label = line.strip().split('\t')
                    label = int(label)
                    if curr_intents[label] == 'O':
                        label = none_intent
                    else:
                        label = label + intent_shift
                    data_files[mode].write(f'{text}\t{label}\n')

            with open(f'{data_dir}/{subdir}/{mode}_slots.tsv', 'r') as f:
                for line in f.readlines():
                    labels = [int(label) for label in line.strip().split()]
                    shifted_labels = []
                    for label in labels:
                        if curr_slots[label] == 'O':
                            shifted_labels.append(none_slot)
                        else:
                            shifted_labels.append(label + slot_shift)
                    slot_files[mode].write(list2str(shifted_labels) + '\n')

        # Offset the next subdir's label ids by this subdir's vocabulary sizes.
        intent_shift += len(curr_intents)
        slot_shift += len(curr_slots)

    for mode in modes:
        data_files[mode].close()
        slot_files[mode].close()

    write_vocab_in_order(intents, f'{outfold}/dict.intents.csv')
    write_vocab_in_order(slots, f'{outfold}/dict.slots.csv')
    return outfold, none_slot
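
# list2str and write_vocab_in_order are defined elsewhere in this module;
# minimal sketches consistent with how merge() uses them (assumptions, not
# the verbatim implementations):
#
# def list2str(labels):
#     return ' '.join(str(label) for label in labels)
#
# def write_vocab_in_order(vocab, outfile):
#     # vocab maps int id -> label name; write one label per line, in id order.
#     with open(outfile, 'w') as f:
#         for key in sorted(vocab.keys()):
#             f.write(f'{vocab[key]}\n')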