Example #1
def process_atis(infold, outfold, modes=['train', 'test'], do_lower_case=False):
    """ MSFT's dataset, processed by Kaggle
    https://www.kaggle.com/siddhadev/atis-dataset-from-ms-cntk
    """
    vocab = get_vocab(f'{infold}/atis.dict.vocab.csv')

    if if_exist(outfold, [f'{mode}.tsv' for mode in modes]):
        logging.info(DATABASE_EXISTS_TMP.format('ATIS', outfold))
        return outfold
    logging.info(f'Processing ATIS dataset and storing at {outfold}.')

    os.makedirs(outfold, exist_ok=True)

    outfiles = {}
    for mode in modes:
        outfiles[mode] = open(os.path.join(outfold, mode + '.tsv'), 'w')
        outfiles[mode].write('sentence\tlabel\n')
        outfiles[mode + '_slots'] = open(f'{outfold}/{mode}_slots.tsv', 'w')

        with open(f'{infold}/atis.{mode}.query.csv', 'r') as f:
            queries = f.readlines()
        with open(f'{infold}/atis.{mode}.intent.csv', 'r') as f:
            intents = f.readlines()
        with open(f'{infold}/atis.{mode}.slots.csv', 'r') as f:
            slots = f.readlines()

        for i, query in enumerate(queries):
            # Drop the BOS/EOS token ids before mapping ids back to text.
            sentence = ids2text(query.strip().split()[1:-1], vocab)
            if do_lower_case:
                sentence = sentence.lower()
            outfiles[mode].write(f'{sentence}\t{intents[i].strip()}\n')
            slot = ' '.join(slots[i].strip().split()[1:-1])
            outfiles[mode + '_slots'].write(slot + '\n')

    shutil.copyfile(f'{infold}/atis.dict.intent.csv', f'{outfold}/dict.intents.csv')
    shutil.copyfile(f'{infold}/atis.dict.slots.csv', f'{outfold}/dict.slots.csv')
    for mode in modes:
        outfiles[mode].close()
        outfiles[mode + '_slots'].close()

    return outfold
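
All five examples lean on a few helpers and module-level constants that are defined elsewhere in the source module. The sketch below shows plausible minimal versions inferred purely from the call sites above; the names match the calls, but the bodies are assumptions, not the actual implementations:

import os

# Assumed log templates; placeholder counts match the .format() calls above.
DATABASE_EXISTS_TMP = '{} dataset has already been processed and stored at {}'
MODE_EXISTS_TMP = '{} mode of {} dataset has already been processed at {}/{}.tsv'


def if_exist(outfold, files):
    # True only if `outfold` exists and already contains every listed file.
    return os.path.exists(outfold) and all(
        os.path.exists(os.path.join(outfold, f)) for f in files)


def get_vocab(filename):
    # One label per line; map integer line index -> label.
    with open(filename, 'r') as f:
        return {i: line.strip() for i, line in enumerate(f) if line.strip()}


def ids2text(ids, vocab):
    # Map a list of token-id strings back to a space-joined sentence.
    return ' '.join(vocab[int(id_)] for id_ in ids)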
Example #2
def process_snips(infold,
                  outfold,
                  do_lower_case,
                  modes=['train', 'test'],
                  dev_split=0.1):
    if not os.path.exists(infold):
        link = 'https://github.com/snipsco/spoken-language-understanding-research-datasets'
        raise ValueError(
            f'Data not found at {infold}. '
            f'You may request to download the SNIPS dataset from {link}.')

    exist = True
    for dataset in ['light', 'speak', 'all']:
        if if_exist(f'{outfold}/{dataset}', [f'{mode}.tsv' for mode in modes]):
            logging.info(
                DATABASE_EXISTS_TMP.format('SNIPS-' + dataset, outfold))
        else:
            exist = False
    if exist:
        return outfold

    logging.info(
        f'Processing SNIPS dataset and storing it in folders "speak", "light" and "all" under {outfold}.'
    )
    logging.info(
        'Importing "smart-speaker-en-close-field" -> "speak" and "smart-lights-en-close-field" -> "light".'
    )

    os.makedirs(outfold, exist_ok=True)

    speak_dir = 'smart-speaker-en-close-field'
    light_dir = 'smart-lights-en-close-field'

    light_files = [f'{infold}/{light_dir}/dataset.json']
    speak_files = [f'{infold}/{speak_dir}/training_dataset.json']
    speak_files.append(f'{infold}/{speak_dir}/test_dataset.json')

    light_train, light_dev, light_slots, light_intents = get_dataset(
        light_files, dev_split)
    speak_train, speak_dev, speak_slots, speak_intents = get_dataset(
        speak_files)

    create_dataset(light_train, light_dev, light_slots, light_intents,
                   do_lower_case, f'{outfold}/light')
    create_dataset(speak_train, speak_dev, speak_slots, speak_intents,
                   do_lower_case, f'{outfold}/speak')
    create_dataset(
        light_train + speak_train,
        light_dev + speak_dev,
        light_slots | speak_slots,
        light_intents | speak_intents,
        do_lower_case,
        f'{outfold}/all',
    )

    return outfold
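
The call sites above pin down the contracts of the two helpers process_snips depends on: get_dataset(files, dev_split=0.1) parses the SNIPS JSON files and returns (train, dev, slots, intents), where the last two must be sets (they are merged with |), and create_dataset(train, dev, slots, intents, do_lower_case, outfold) writes one output subfolder. A hypothetical end-to-end call, with illustrative paths:

outfold = process_snips(
    infold='data/snips-research-datasets',  # illustrative path
    outfold='data/snips-processed',
    do_lower_case=True,
)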
Example #3
def create_vocab_mlm(
        data_dir,
        vocab_size,
        sample_size,
        special_tokens=['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'],
        train_file=''):
    vocab = special_tokens[:]
    bert_dir = f'{data_dir}/bert'
    if if_exist(bert_dir, ['tokenizer.model']):
        logging.info(DATABASE_EXISTS_TMP.format('WikiText_BERT', bert_dir))
        return data_dir, f'{bert_dir}/tokenizer.model'
    logging.info(f'Processing WikiText dataset and storing it at {bert_dir}')
    os.makedirs(bert_dir, exist_ok=True)

    if not train_file:
        files = glob.glob(f'{data_dir}/*.txt')
        train_file = f'{bert_dir}/merged.txt'
        logging.info(f"Merging {len(files)} txt files into {train_file}")

        with open(train_file, "w") as merged:
            for file in tqdm(files):
                with open(file, 'r') as inf:
                    content = inf.read().strip()
                merged.write(content + '\n\n\n')
    else:
        train_file = f'{data_dir}/{train_file}'

    cmd = (f"--input={train_file} --model_prefix={bert_dir}/tokenizer "
           f"--vocab_size={vocab_size - len(vocab)} "
           f"--input_sentence_size={sample_size} "
           f"--shuffle_input_sentence=true --hard_vocab_limit=false "
           f"--bos_id=-1 --eos_id=-1")
    SPT.Train(cmd)

    # Add BERT control symbols
    tokens = []

    with open(f"{bert_dir}/tokenizer.vocab", "r") as f:
        f.readline()  # skip first <unk> token

        # Read tokens from each line and parse for vocab
        for line in f:
            piece = line.split("\t")[0]
            token = piece[1:] if piece.startswith("▁") else f"##{piece}"
            tokens.append(token)

    vocab.extend(tokens)

    # Save vocabulary to output file
    with open(f'{bert_dir}/vocab.txt', "w") as f:
        for token in vocab:
            f.write(f"{token}\n".format())
    return data_dir, f'{bert_dir}/tokenizer.model'
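
create_vocab_mlm assumes the imports below. SPT is presumably SentencePiece's SentencePieceTrainer, whose Train() accepts the single flag string built above; the other names are standard. The call at the bottom is a hypothetical invocation with illustrative sizes and paths:

import glob
import logging

from sentencepiece import SentencePieceTrainer as SPT
from tqdm import tqdm

data_dir, tokenizer_model = create_vocab_mlm(
    data_dir='data/wikitext-2',  # illustrative path
    vocab_size=30000,
    sample_size=1000000,
)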
Example #4
def process_jarvis_datasets(infold,
                            outfold,
                            modes=['train', 'test', 'dev'],
                            do_lower_case=False,
                            ignore_prev_intent=False):
    """ process and convert Jarvis datasets into NeMo's BIO format
    """
    dataset_name = "jarvis"
    if if_exist(outfold, ['dict.intents.csv', 'dict.slots.csv']):
        logging.info(DATABASE_EXISTS_TMP.format(dataset_name, outfold))
        return outfold

    logging.info(f'Processing {dataset_name} dataset and storing at {outfold}')

    os.makedirs(outfold, exist_ok=True)

    outfiles = {}
    intents_list = {}
    slots_list = {}
    slots_list_all = {}

    outfiles['dict_intents'] = open(f'{outfold}/dict.intents.csv', 'w')
    outfiles['dict_slots'] = open(f'{outfold}/dict.slots.csv', 'w')

    outfiles['dict_slots'].write('O\n')
    slots_list["O"] = 0
    slots_list_all["O"] = 0

    for mode in modes:
        if if_exist(outfold, [f'{mode}.tsv']):
            logging.info(
                MODE_EXISTS_TMP.format(mode, dataset_name, outfold, mode))
            continue

        if not if_exist(infold, [f'{mode}.tsv']):
            logging.info(f'{mode} mode of {dataset_name}'
                         f' is skipped as it was not found.')
            continue

        outfiles[mode] = open(os.path.join(outfold, mode + '.tsv'), 'w')
        outfiles[mode].write('sentence\tlabel\n')
        outfiles[mode + '_slots'] = open(f'{outfold}/{mode}_slots.tsv', 'w')

        with open(f'{infold}/{mode}.tsv', 'r') as f:
            queries = f.readlines()

        for i, query in enumerate(queries):
            line_splits = query.strip().split("\t")
            if len(line_splits) == 3:
                intent_str, slot_tags_str, sentence = line_splits
            else:
                intent_str, sentence = line_splits
                slot_tags_str = ""

            if intent_str not in intents_list:
                intents_list[intent_str] = len(intents_list)
                outfiles['dict_intents'].write(f'{intent_str}\n')

            if ignore_prev_intent:
                start_token = 2
            else:
                start_token = 1

            if do_lower_case:
                sentence = sentence.lower()
            sentence_cld = " ".join(sentence.strip().split()[start_token:-1])
            outfiles[mode].write(f'{sentence_cld}\t'
                                 f'{str(intents_list[intent_str])}\n')

            slot_tags_list = []
            if slot_tags_str.strip():
                slot_tags = slot_tags_str.strip().split(",")
                for st in slot_tags:
                    if not st.strip():
                        continue
                    start_i, end_i, slot_name = st.strip().split(":")
                    slot_tags_list.append(
                        [int(start_i), int(end_i), slot_name])
                    if slot_name not in slots_list:
                        slots_list[slot_name] = len(slots_list)
                        slots_list_all[f'B-{slot_name}'] = len(slots_list_all)
                        slots_list_all[f'I-{slot_name}'] = len(slots_list_all)
                        outfiles['dict_slots'].write(f'B-{slot_name}\n')
                        outfiles['dict_slots'].write(f'I-{slot_name}\n')

            slot_tags_list.sort(key=lambda x: x[0])
            slots = []
            processed_index = 0
            for tag_start, tag_end, tag_str in slot_tags_list:
                if tag_start > processed_index:
                    words_list = sentence[processed_index:tag_start].strip(
                    ).split()
                    slots.extend([str(slots_list_all['O'])] * len(words_list))
                words_list = sentence[tag_start:tag_end].strip().split()
                slots.append(str(slots_list_all[f'B-{tag_str}']))
                slots.extend([str(slots_list_all[f'I-{tag_str}'])] *
                             (len(words_list) - 1))
                processed_index = tag_end

            if processed_index < len(sentence):
                words_list = sentence[processed_index:].strip().split()
                slots.extend([str(slots_list_all['O'])] * len(words_list))

            # Drop the slot labels aligned with the BOS/EOS tokens.
            slots = slots[1:-1]
            slot = ' '.join(slots)
            outfiles[mode + '_slots'].write(slot + '\n')

        outfiles[mode + '_slots'].close()
        outfiles[mode].close()

    outfiles['dict_slots'].close()
    outfiles['dict_intents'].close()

    return outfold
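
The parsing above implies the input format: each line is intent<TAB>slot_tags<TAB>sentence, where slot_tags is a comma-separated list of start:end:slot_name spans giving character offsets into the raw sentence, and the sentence carries BOS/EOS markers that the [start_token:-1] and slots[1:-1] slices strip off. A hypothetical line and the rows it would produce, assuming this is the first example seen (so intent id 0 and slot ids O=0, B-time=1, I-time=2):

input line:        set_alarm<TAB>15:19:time<TAB>BOS wake me at 7 am EOS
train.tsv:         wake me at 7 am<TAB>0
train_slots.tsv:   0 0 0 1 2    (O O O B-time I-time, after BOS/EOS removal)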
Example #5
    def merge(self, data_dir, subdirs, dataset_name, modes=['train', 'test']):
        outfold = f'{data_dir}/{dataset_name}'
        if if_exist(outfold, [f'{mode}.tsv' for mode in modes]):
            logging.info(DATABASE_EXISTS_TMP.format('SNIPS-ATIS', outfold))
            slots = get_vocab(f'{outfold}/dict.slots.csv')
            none_slot = 0
            for key in slots:
                if slots[key] == 'O':
                    none_slot = key
                    break
            return outfold, int(none_slot)

        os.makedirs(outfold, exist_ok=True)

        data_files, slot_files = {}, {}
        for mode in modes:
            data_files[mode] = open(f'{outfold}/{mode}.tsv', 'w')
            data_files[mode].write('sentence\tlabel\n')
            slot_files[mode] = open(f'{outfold}/{mode}_slots.tsv', 'w')

        intents, slots = {}, {}
        intent_shift, slot_shift = 0, 0
        none_intent, none_slot = -1, -1

        for subdir in subdirs:
            curr_intents = get_vocab(f'{data_dir}/{subdir}/dict.intents.csv')
            curr_slots = get_vocab(f'{data_dir}/{subdir}/dict.slots.csv')

            for key in curr_intents:
                if intent_shift > 0 and curr_intents[key] == 'O':
                    continue
                if curr_intents[key] == 'O' and intent_shift == 0:
                    none_intent = int(key)
                intents[int(key) + intent_shift] = curr_intents[key]

            for key in curr_slots:
                if slot_shift > 0 and curr_slots[key] == 'O':
                    continue
                if slot_shift == 0 and curr_slots[key] == 'O':
                    none_slot = int(key)
                slots[int(key) + slot_shift] = curr_slots[key]

            for mode in modes:
                with open(f'{data_dir}/{subdir}/{mode}.tsv', 'r') as f:
                    for line in f.readlines()[1:]:  # skip the tsv header
                        text, label = line.strip().split('\t')
                        label = int(label)
                        if curr_intents[label] == 'O':
                            label = none_intent
                        else:
                            label = label + intent_shift
                        data_files[mode].write(f'{text}\t{label}\n')

                with open(f'{data_dir}/{subdir}/{mode}_slots.tsv', 'r') as f:
                    for line in f.readlines():
                        labels = [int(label) for label in line.strip().split()]
                        shifted_labels = []
                        for label in labels:
                            if curr_slots[label] == 'O':
                                shifted_labels.append(none_slot)
                            else:
                                shifted_labels.append(label + slot_shift)
                        slot_files[mode].write(list2str(shifted_labels) + '\n')

            intent_shift += len(curr_intents)
            slot_shift += len(curr_slots)

        write_vocab_in_order(intents, f'{outfold}/dict.intents.csv')
        write_vocab_in_order(slots, f'{outfold}/dict.slots.csv')
        return outfold, none_slot
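
merge also relies on two helpers that only appear at their call sites. Plausible minimal versions, inferred from usage (assumptions, not the actual implementations):

def list2str(values):
    # Space-join integer labels into one slots-file line.
    return ' '.join(str(v) for v in values)


def write_vocab_in_order(vocab, filename):
    # Write labels one per line, ordered by their integer index.
    with open(filename, 'w') as f:
        for key in sorted(vocab):
            f.write(f'{vocab[key]}\n')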