Example #1
def process_atis(infold, outfold, modes=['train', 'test'], do_lower_case=False):
    """ MSFT's dataset, processed by Kaggle
    https://www.kaggle.com/siddhadev/atis-dataset-from-ms-cntk
    """
    vocab = get_vocab(f'{infold}/atis.dict.vocab.csv')

    if if_exist(outfold, [f'{mode}.tsv' for mode in modes]):
        logging.info(DATABASE_EXISTS_TMP.format('ATIS', outfold))
        return outfold
    logging.info(f'Processing ATIS dataset and storing at {outfold}.')

    os.makedirs(outfold, exist_ok=True)

    outfiles = {}
    for mode in modes:
        outfiles[mode] = open(os.path.join(outfold, mode + '.tsv'), 'w')
        outfiles[mode].write('sentence\tlabel\n')
        outfiles[mode + '_slots'] = open(f'{outfold}/{mode}_slots.tsv', 'w')

        queries = open(f'{infold}/atis.{mode}.query.csv', 'r').readlines()
        intents = open(f'{infold}/atis.{mode}.intent.csv', 'r').readlines()
        slots = open(f'{infold}/atis.{mode}.slots.csv', 'r').readlines()

        for i, query in enumerate(queries):
            sentence = ids2text(query.strip().split()[1:-1], vocab)
            if do_lower_case:
                sentence = sentence.lower()
            outfiles[mode].write(f'{sentence}\t{intents[i].strip()}\n')
            slot = ' '.join(slots[i].strip().split()[1:-1])
            outfiles[mode + '_slots'].write(slot + '\n')

    shutil.copyfile(f'{infold}/atis.dict.intent.csv', f'{outfold}/dict.intents.csv')
    shutil.copyfile(f'{infold}/atis.dict.slots.csv', f'{outfold}/dict.slots.csv')
    for mode in modes:
        outfiles[mode].close()
        outfiles[mode + '_slots'].close()

    return outfold
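For a quick sanity check of the TSV layout this produces, here is a minimal reader sketch; the output directory name is only an assumption, and the reader relies on the 1:1 alignment between the data rows of {mode}.tsv and the rows of {mode}_slots.tsv established above:

import os

def peek_atis_output(outfold='atis/nemo-processed', mode='train', n=3):
    # {mode}.tsv starts with a "sentence\tlabel" header row;
    # {mode}_slots.tsv holds one space-separated row of slot ids per example.
    with open(os.path.join(outfold, f'{mode}.tsv')) as f:
        rows = f.readlines()[1:]  # skip the header
    with open(os.path.join(outfold, f'{mode}_slots.tsv')) as f:
        slot_rows = f.readlines()
    assert len(rows) == len(slot_rows), 'sentence and slot files must stay aligned'
    for row, slot_row in list(zip(rows, slot_rows))[:n]:
        sentence, intent = row.rstrip('\n').rsplit('\t', 1)
        print(intent, '|', sentence, '|', slot_row.strip())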
Example #2
    def create_vocab_lm(self, data_dir, do_lower_case):
        if if_exist(data_dir, ['train.txt', 'vocab.txt']):
            logging.info("Vocabulary has been created.")
            with open(os.path.join(data_dir, 'vocab.txt'), 'r') as f:
                vocab_size = len(f.readlines())
            return vocab_size

        logging.info(f'Creating vocabulary from training data at {data_dir}')

        with open(f'{data_dir}/train.txt', 'r') as f:
            txt = f.read()
        if do_lower_case:
            txt = txt.lower()
        lines = re.split(r'[\n]', txt)
        sentences = [line.strip().split() for line in lines if line.strip()]

        vocab = {"[PAD]": 0, "[SEP]": 1, "[CLS]": 2, "[MASK]": 3}
        idx = 4
        for sentence in sentences:
            for word in sentence:
                if word not in vocab:
                    vocab[word] = idx
                    idx += 1

        with open(f'{data_dir}/vocab.txt', 'w') as f:
            for word in sorted(vocab.keys()):
                f.write(word + '\n')
        logging.info(f"Created vocabulary of size {len(vocab)}")

        return len(vocab)
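A consumer of the vocab.txt written above typically rebuilds the word-to-id map from line positions; here is a minimal sketch under that assumption (create_vocab_lm itself only writes the sorted token list, so these positional ids differ from the insertion-order idx used while building the dict):

def load_vocab(vocab_file):
    # One token per line; the line index becomes the token id.
    vocab = {}
    with open(vocab_file, 'r') as f:
        for idx, line in enumerate(f):
            token = line.strip()
            if token:
                vocab[token] = idx
    return vocab

word2id = load_vocab('data/lm/vocab.txt')  # hypothetical path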
Example #3
def process_snips(infold,
                  outfold,
                  do_lower_case,
                  modes=['train', 'test'],
                  dev_split=0.1):
    if not os.path.exists(infold):
        link = 'https://github.com/snipsco/spoken-language-understanding-research-datasets'
        raise ValueError(
            f'Data not found at {infold}. '
            f'You may request to download the SNIPS dataset from {link}.')

    exist = True
    for dataset in ['light', 'speak', 'all']:
        if if_exist(f'{outfold}/{dataset}', [f'{mode}.tsv' for mode in modes]):
            logging.info(
                DATABASE_EXISTS_TMP.format('SNIPS-' + dataset, outfold))
        else:
            exist = False
    if exist:
        return outfold

    logging.info(
        f'Processing SNIPS dataset and storing at folders "speak", "light" and "all" under {outfold}.'
    )
    logging.info(
        f'Processing and importing "smart-speaker-en-close-field" -> "speak" and "smart-lights-en-close-field" -> "light".'
    )

    os.makedirs(outfold, exist_ok=True)

    speak_dir = 'smart-speaker-en-close-field'
    light_dir = 'smart-lights-en-close-field'

    light_files = [f'{infold}/{light_dir}/dataset.json']
    speak_files = [f'{infold}/{speak_dir}/training_dataset.json']
    speak_files.append(f'{infold}/{speak_dir}/test_dataset.json')

    light_train, light_dev, light_slots, light_intents = get_dataset(
        light_files, dev_split)
    speak_train, speak_dev, speak_slots, speak_intents = get_dataset(
        speak_files)

    create_dataset(light_train, light_dev, light_slots, light_intents,
                   do_lower_case, f'{outfold}/light')
    create_dataset(speak_train, speak_dev, speak_slots, speak_intents,
                   do_lower_case, f'{outfold}/speak')
    create_dataset(
        light_train + speak_train,
        light_dev + speak_dev,
        light_slots | speak_slots,
        light_intents | speak_intents,
        do_lower_case,
        f'{outfold}/all',
    )

    return outfold
Example #4
    def __init__(self, data_dir, modes=['train', 'test', 'dev']):
        self.data_dir = data_dir

        max_label_id = 0
        for mode in modes:
            if not if_exist(self.data_dir, [f'{mode}.tsv']):
                logging.info(
                    f'Stats calculation for {mode} mode is skipped as {mode}.tsv was not found.'
                )
                continue

            input_file = f'{self.data_dir}/{mode}.tsv'
            with open(input_file, 'r') as f:
                input_lines = f.readlines()[1:]  # Skipping headers at index 0

            try:
                int(input_lines[0].strip().split()[-1])
            except ValueError:
                logging.warning(f'No numerical labels found for {mode}.tsv.')
                raise

            queries, raw_sentences = [], []
            for input_line in input_lines:
                parts = input_line.strip().split()
                label = int(parts[-1])
                raw_sentences.append(label)
                queries.append(' '.join(parts[:-1]))

            infold = input_file[:input_file.rfind('/')]

            logging.info(f'Three most popular classes in {mode} dataset')
            total_sents, sent_label_freq, max_id = get_label_stats(
                raw_sentences, infold + f'/{mode}_sentence_stats.tsv')
            max_label_id = max(max_label_id, max_id)

            if mode == 'train':
                class_weights_dict = get_freq_weights(sent_label_freq)
                logging.info(f'Class Weights: {class_weights_dict}')

            logging.info(f'Total Sentences: {total_sents}')
            logging.info(f'Sentence class frequencies - {sent_label_freq}')

        self.class_weights = fill_class_weights(class_weights_dict,
                                                max_label_id)

        self.num_labels = max_label_id + 1
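get_freq_weights and fill_class_weights are not shown in this example; as a rough, hypothetical sketch of the contract the calling code relies on (inverse-frequency weights per class, then a dense list covering every label id up to max_label_id), assuming nothing about NeMo's actual implementation:

def get_freq_weights_sketch(label_freq):
    # Hypothetical stand-in: weight each class by total / (num_classes * count).
    total = sum(label_freq.values())
    num_classes = len(label_freq)
    return {label: total / (num_classes * count) for label, count in label_freq.items()}

def fill_class_weights_sketch(weights, max_label_id):
    # Hypothetical stand-in: dense list indexed by label id; ids never seen default to 1.0.
    filled = [1.0] * (max_label_id + 1)
    for label, weight in weights.items():
        filled[label] = weight
    return filled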
Example #5
                        default='~/data/state_tracking/multiwoz2.1',
                        type=str)
    args = parser.parse_args()

    # Get absolute paths.
    abs_source_data_dir = expanduser(args.source_data_dir)
    abs_target_data_dir = expanduser(args.target_data_dir)

    if not exists(abs_source_data_dir):
        raise FileNotFoundError(f"{abs_source_data_dir} doesn't exist.")

    # Check if the files exist.
    if if_exist(
            abs_target_data_dir,
        [
            'trainListFile.json', 'val_dials.json', 'test_dials.json',
            'train_dials.json', 'ontology.json'
        ],
    ):
        print(
            f'Data is already processed and stored at {abs_target_data_dir}, skipping preprocessing.'
        )
        exit(0)

    # Set domain.
    DOMAINS = [
        'restaurant', 'hotel', 'attraction', 'train', 'taxi', 'hospital',
        'police'
    ]
    PHONE_NUM_TMPL = r'\(?(\d{3})\)?[-.\s]?(\d{3})[-.\s]?(\d{4,5})'
    POSTCODE_TMPL = (r'([a-z]{1}[\. ]?[a-z]{1}[\. ]?\d{1,2}[, ]+\d{1}[\. ]?' +
Example #6

if __name__ == "__main__":
    # Parse the command-line arguments.
    parser = argparse.ArgumentParser(description='Process MultiWOZ dataset')
    parser.add_argument(
        "--source_data_dir", required=True, type=str, help='The path to the folder containing the MultiWOZ data files.'
    )
    parser.add_argument("--target_data_dir", default='multiwoz2.1/', type=str)
    args = parser.parse_args()

    if not exists(args.source_data_dir):
        raise FileNotFoundError(f"{args.source_data_dir} does not exist.")

    # Check if the files exist
    if if_exist(args.target_data_dir, ['ontology.json', 'dev_dials.json', 'test_dials.json', 'train_dials.json']):
        print(f'Data is already processed and stored at {args.target_data_dir}, skipping pre-processing.')
        exit(0)

    REPLACEMENTS = []
    with open('multiwoz_mapping.pair', 'r') as fin:
        for line in fin:
            tok_from, tok_to = line.replace('\n', '').split('\t')
            REPLACEMENTS.append((' ' + tok_from + ' ', ' ' + tok_to + ' '))

    print('Creating dialogues...')
    # Process MultiWOZ dataset
    delex_data = createData(args.source_data_dir)
    # Divide data
    divideData(delex_data, args.source_data_dir, args.target_data_dir)
Example #7
    def __init__(self, data_dir, none_slot_label='O', pad_label=-1):
        if not if_exist(data_dir, ['dict.intents.csv', 'dict.slots.csv']):
            raise FileNotFoundError(
                "Make sure that your data follows the standard format "
                "supported by JointIntentSlotDataset. Your data must "
                "contain dict.intents.csv and dict.slots.csv.")

        self.data_dir = data_dir
        self.intent_dict_file = self.data_dir + '/dict.intents.csv'
        self.slot_dict_file = self.data_dir + '/dict.slots.csv'

        self.intents_label_ids = JointIntentSlotDataDesc.label2idx(
            self.intent_dict_file)
        self.num_intents = len(self.intents_label_ids)
        self.slots_label_ids = JointIntentSlotDataDesc.label2idx(
            self.slot_dict_file)
        self.num_slots = len(self.slots_label_ids)

        for mode in ['train', 'test', 'dev']:
            if not if_exist(self.data_dir, [f'{mode}.tsv']):
                logging.info(f' Stats calculation for {mode} mode'
                             f' is skipped as {mode}.tsv was not found.')
                continue
            logging.info(f' Stats calculating for {mode} mode...')
            slot_file = f'{self.data_dir}/{mode}_slots.tsv'
            with open(slot_file, 'r') as f:
                slot_lines = f.readlines()

            input_file = f'{self.data_dir}/{mode}.tsv'
            with open(input_file, 'r') as f:
                input_lines = f.readlines()[1:]  # Skipping headers at index 0

            if len(slot_lines) != len(input_lines):
                raise ValueError(
                    "Make sure that the number of slot lines match the "
                    "number of intent lines. There should be a 1-1 "
                    "correspondence between every slot and intent lines.")

            dataset = list(zip(slot_lines, input_lines))

            raw_slots, queries, raw_intents = [], [], []
            for slot_line, input_line in dataset:
                slot_list = [int(slot) for slot in slot_line.strip().split()]
                raw_slots.append(slot_list)
                parts = input_line.strip().split()
                raw_intents.append(int(parts[-1]))
                queries.append(' '.join(parts[:-1]))

            infold = self.data_dir

            logging.info(f'Three most popular intents in {mode} mode:')
            total_intents, intent_label_freq = get_label_stats(
                raw_intents, infold + f'/{mode}_intent_stats.tsv')
            merged_slots = itertools.chain.from_iterable(raw_slots)

            logging.info(f'Three most popular slots in {mode} mode:')
            slots_total, slots_label_freq = get_label_stats(
                merged_slots, infold + f'/{mode}_slot_stats.tsv')

            if mode == 'train':
                self.slot_weights = calc_class_weights(slots_label_freq)
                logging.info(f'Slot weights are - {self.slot_weights}')

                self.intent_weights = calc_class_weights(intent_label_freq)
                logging.info(f'Intent weights are - {self.intent_weights}')

            logging.info(f'Total intents - {total_intents}')
            logging.info(f'Intent label frequency - {intent_label_freq}')
            logging.info(f'Total Slots - {slots_total}')
            logging.info(f'Slots label frequency - {slots_label_freq}')

        if pad_label != -1:
            self.pad_label = pad_label
        else:
            if none_slot_label not in self.slots_label_ids:
                raise ValueError(f'none_slot_label {none_slot_label} not '
                                 f'found in {self.slot_dict_file}.')
            self.pad_label = self.slots_label_ids[none_slot_label]
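label2idx is not shown in this example; presumably it maps each line of a dict file to its line index. A minimal sketch under that assumption:

def label2idx(file_path):
    # One label per line; the line number becomes the label id.
    with open(file_path, 'r') as f:
        return {line.strip(): idx for idx, line in enumerate(f) if line.strip()}

# e.g. the dict.slots.csv written by process_jarvis_datasets (Example #8 below) starts
# with 'O', so label2idx(slot_dict_file)['O'] == 0, which is what the pad_label fallback
# uses when no explicit pad_label is passed in.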
Example #8
def process_jarvis_datasets(infold,
                            outfold,
                            modes=['train', 'test', 'dev'],
                            do_lower_case=False,
                            ignore_prev_intent=False):
    """ process and convert Jarvis datasets into NeMo's BIO format
    """
    dataset_name = "jarvis"
    if if_exist(outfold, ['dict.intents.csv', 'dict.slots.csv']):
        logging.info(DATABASE_EXISTS_TMP.format(dataset_name, outfold))
        return outfold

    logging.info(f'Processing {dataset_name} dataset and storing at {outfold}')

    os.makedirs(outfold, exist_ok=True)

    outfiles = {}
    intents_list = {}
    slots_list = {}
    slots_list_all = {}

    outfiles['dict_intents'] = open(f'{outfold}/dict.intents.csv', 'w')
    outfiles['dict_slots'] = open(f'{outfold}/dict.slots.csv', 'w')

    outfiles['dict_slots'].write('O\n')
    slots_list["O"] = 0
    slots_list_all["O"] = 0

    for mode in modes:
        if if_exist(outfold, [f'{mode}.tsv']):
            logging.info(
                MODE_EXISTS_TMP.format(mode, dataset_name, outfold, mode))
            continue

        if not if_exist(infold, [f'{mode}.tsv']):
            logging.info(f'{mode} mode of {dataset_name}'
                         f' is skipped as it was not found.')
            continue

        outfiles[mode] = open(os.path.join(outfold, mode + '.tsv'), 'w')
        outfiles[mode].write('sentence\tlabel\n')
        outfiles[mode + '_slots'] = open(f'{outfold}/{mode}_slots.tsv', 'w')

        queries = open(f'{infold}/{mode}.tsv', 'r').readlines()

        for i, query in enumerate(queries):
            line_splits = query.strip().split("\t")
            if len(line_splits) == 3:
                intent_str, slot_tags_str, sentence = line_splits
            else:
                intent_str, sentence = line_splits
                slot_tags_str = ""

            if intent_str not in intents_list:
                intents_list[intent_str] = len(intents_list)
                outfiles['dict_intents'].write(f'{intent_str}\n')

            if ignore_prev_intent:
                start_token = 2
            else:
                start_token = 1

            if do_lower_case:
                sentence = sentence.lower()
            sentence_cld = " ".join(sentence.strip().split()[start_token:-1])
            outfiles[mode].write(f'{sentence_cld}\t'
                                 f'{str(intents_list[intent_str])}\n')

            slot_tags_list = []
            if slot_tags_str.strip():
                slot_tags = slot_tags_str.strip().split(",")
                for st in slot_tags:
                    if not st.strip():
                        continue
                    [start_i, end_i, slot_name] = st.strip().split(":")
                    slot_tags_list.append(
                        [int(start_i), int(end_i), slot_name])
                    if slot_name not in slots_list:
                        slots_list[slot_name] = len(slots_list)
                        slots_list_all[f'B-{slot_name}'] = len(slots_list_all)
                        slots_list_all[f'I-{slot_name}'] = len(slots_list_all)
                        outfiles['dict_slots'].write(f'B-{slot_name}\n')
                        outfiles['dict_slots'].write(f'I-{slot_name}\n')

            slot_tags_list.sort(key=lambda x: x[0])
            slots = []
            processed_index = 0
            for tag_start, tag_end, tag_str in slot_tags_list:
                if tag_start > processed_index:
                    words_list = sentence[processed_index:tag_start].strip(
                    ).split()
                    slots.extend([str(slots_list_all['O'])] * len(words_list))
                words_list = sentence[tag_start:tag_end].strip().split()
                slots.append(str(slots_list_all[f'B-{tag_str}']))
                slots.extend([str(slots_list_all[f'I-{tag_str}'])] *
                             (len(words_list) - 1))
                processed_index = tag_end

            if processed_index < len(sentence):
                words_list = sentence[processed_index:].strip().split()
                slots.extend([str(slots_list_all['O'])] * len(words_list))

            slots = slots[1:-1]
            slot = ' '.join(slots)
            outfiles[mode + '_slots'].write(slot + '\n')

        outfiles[mode + '_slots'].close()
        outfiles[mode].close()

    outfiles['dict_slots'].close()
    outfiles['dict_intents'].close()

    return outfold
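To make the character-span-to-BIO conversion above concrete, here is a small self-contained sketch with a made-up sentence and a single hypothetical slot span (character offsets, not token offsets, just as in the loop above):

sentence = "BOS set an alarm for seven am EOS"
slots_list_all = {'O': 0, 'B-time': 1, 'I-time': 2}
slot_tags_list = [(21, 29, 'time')]  # characters 21..28 cover "seven am"

slots, processed_index = [], 0
for tag_start, tag_end, tag_str in slot_tags_list:
    words_before = sentence[processed_index:tag_start].strip().split()
    slots.extend([str(slots_list_all['O'])] * len(words_before))
    words_in_span = sentence[tag_start:tag_end].strip().split()
    slots.append(str(slots_list_all[f'B-{tag_str}']))
    slots.extend([str(slots_list_all[f'I-{tag_str}'])] * (len(words_in_span) - 1))
    processed_index = tag_end
words_after = sentence[processed_index:].strip().split()
slots.extend([str(slots_list_all['O'])] * len(words_after))

print(' '.join(slots[1:-1]))  # "0 0 0 0 1 2" -- BOS/EOS positions dropped, mirroring the function above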
Example #9
def partition_data(data, infold, outfold):
    """Partition the data into train, valid, and test sets
    based on the list of val and test specified in the dataset.
    """
    if if_exist(outfold, [
            'trainListFile.json', 'val_dialogs.json', 'test_dialogs.json',
            'train_dialogs.json', 'ontology.json'
    ]):
        print(f'Data is already processed and stored at {outfold}')
        return
    os.makedirs(outfold, exist_ok=True)
    shutil.copyfile(f'{infold}/ontology.json', f'{outfold}/ontology.json')

    with open(f'{infold}/testListFile.json', 'r') as fin:
        test_files = [line.strip() for line in fin.readlines()]

    with open(f'{infold}/valListFile.json', 'r') as fin:
        val_files = [line.strip() for line in fin.readlines()]

    train_list_files = open(f'{outfold}/trainListFile.json', 'w')

    train_dialogs, val_dialogs, test_dialogs = [], [], []
    count_train, count_val, count_test = 0, 0, 0

    for dialog_id in data:
        dialog = data[dialog_id]
        domains = [
            key for key in dialog['goal'].keys()
            if key in DOMAINS and dialog['goal'][key]
        ]

        dial = get_dialog(dialog)
        if dial:
            dialogue = {}
            dialogue['dialog_idx'] = dialog_id
            dialogue['domains'] = list(set(domains))
            last_bs = []
            dialogue['dialog'] = []

            for idx, turn in enumerate(dial):
                turn_dl = {
                    'sys_transcript': dial[idx - 1]['sys'] if idx > 0 else "",
                    'turn_idx': idx,
                    'transcript': turn['usr'],
                    'sys_acts': dial[idx - 1]['sys_a'] if idx > 0 else [],
                    'domain': turn['domain'],
                }
                turn_dl['belief_state'] = [{
                    "slots": [s],
                    "act": "inform"
                } for s in turn['bvs']]
                turn_dl['turn_label'] = [
                    bs["slots"][0] for bs in turn_dl['belief_state']
                    if bs not in last_bs
                ]
                last_bs = turn_dl['belief_state']
                dialogue['dialog'].append(turn_dl)

            if dialog_id in test_files:
                test_dialogs.append(dialogue)
                count_test += 1
            elif dialog_id in val_files:
                val_dialogs.append(dialogue)
                count_val += 1
            else:
                train_list_files.write(dialog_id + '\n')
                train_dialogs.append(dialogue)
                count_train += 1

    print(f"Dialogs: {count_train} train, {count_val} val, {count_test} test.")

    # save all dialogues
    with open(f'{outfold}/val_dialogs.json', 'w') as fout:
        json.dump(val_dialogs, fout, indent=4)

    with open(f'{outfold}/test_dialogs.json', 'w') as fout:
        json.dump(test_dialogs, fout, indent=4)

    with open(f'{outfold}/train_dialogs.json', 'w') as fout:
        json.dump(train_dialogs, fout, indent=4)

    train_list_files.close()
Example #10
    def __init__(self,
                 data_dir,
                 do_lower_case=False,
                 dataset_name='default',
                 none_slot_label='O',
                 pad_label=-1):
        if dataset_name == 'atis':
            self.data_dir = process_atis(data_dir, do_lower_case)
        elif dataset_name == 'snips-atis':
            self.data_dir, self.pad_label = self.merge(data_dir, [
                'ATIS/nemo-processed-uncased',
                'snips/nemo-processed-uncased/all'
            ], dataset_name)
        elif dataset_name == 'dialogflow':
            self.data_dir = process_dialogflow(data_dir, do_lower_case)
        elif dataset_name == 'mturk-processed':
            self.data_dir = process_mturk(data_dir, do_lower_case)
        elif dataset_name in set(['snips-light', 'snips-speak', 'snips-all']):
            self.data_dir = process_snips(data_dir, do_lower_case)
            if dataset_name.endswith('light'):
                self.data_dir = f'{self.data_dir}/light'
            elif dataset_name.endswith('speak'):
                self.data_dir = f'{self.data_dir}/speak'
            elif dataset_name.endswith('all'):
                self.data_dir = f'{self.data_dir}/all'
        elif dataset_name.startswith('jarvis'):
            self.data_dir = process_jarvis_datasets(
                data_dir,
                do_lower_case,
                dataset_name,
                modes=["train", "test", "eval"],
                ignore_prev_intent=False)
        else:
            if not if_exist(data_dir, ['dict.intents.csv', 'dict.slots.csv']):
                raise FileNotFoundError(
                    "Make sure that your data follows the standard format "
                    "supported by JointIntentSlotDataset. Your data must "
                    "contain dict.intents.csv and dict.slots.csv.")
            self.data_dir = data_dir

        self.intent_dict_file = self.data_dir + '/dict.intents.csv'
        self.slot_dict_file = self.data_dir + '/dict.slots.csv'
        self.num_intents = len(get_vocab(self.intent_dict_file))
        slots = label2idx(self.slot_dict_file)
        self.num_slots = len(slots)

        for mode in ['train', 'test', 'eval']:

            if not if_exist(self.data_dir, [f'{mode}.tsv']):
                logging.info(f' Stats calculation for {mode} mode'
                             f' is skipped as {mode}.tsv was not found.')
                continue

            slot_file = f'{self.data_dir}/{mode}_slots.tsv'
            with open(slot_file, 'r') as f:
                slot_lines = f.readlines()

            input_file = f'{self.data_dir}/{mode}.tsv'
            with open(input_file, 'r') as f:
                input_lines = f.readlines()[1:]  # Skipping headers at index 0

            if len(slot_lines) != len(input_lines):
                raise ValueError(
                    "Make sure that the number of slot lines match the "
                    "number of intent lines. There should be a 1-1 "
                    "correspondence between every slot and intent lines.")

            dataset = list(zip(slot_lines, input_lines))

            raw_slots, queries, raw_intents = [], [], []
            for slot_line, input_line in dataset:
                slot_list = [int(slot) for slot in slot_line.strip().split()]
                raw_slots.append(slot_list)
                parts = input_line.strip().split()
                raw_intents.append(int(parts[-1]))
                queries.append(' '.join(parts[:-1]))

            infold = input_file[:input_file.rfind('/')]

            logging.info(f'Three most popular intents during {mode}ing')
            total_intents, intent_label_freq = get_label_stats(
                raw_intents, infold + f'/{mode}_intent_stats.tsv')
            merged_slots = itertools.chain.from_iterable(raw_slots)

            logging.info(f'Three most popular slots during {mode}ing')
            slots_total, slots_label_freq = get_label_stats(
                merged_slots, infold + f'/{mode}_slot_stats.tsv')

            if mode == 'train':
                self.slot_weights = calc_class_weights(slots_label_freq)
                logging.info(f'Slot weights are - {self.slot_weights}')

                self.intent_weights = calc_class_weights(intent_label_freq)
                logging.info(f'Intent weights are - {self.intent_weights}')

            logging.info(f'Total intents - {total_intents}')
            logging.info(f'Intent label frequency - {intent_label_freq}')
            logging.info(f'Total Slots - {slots_total}')
            logging.info(f'Slots label frequency - {slots_label_freq}')

        if pad_label != -1:
            self.pad_label = pad_label
        else:
            if none_slot_label not in slots:
                raise ValueError(f'none_slot_label {none_slot_label} not '
                                 f'found in {self.slot_dict_file}.')
            self.pad_label = slots[none_slot_label]
Example #11
    def merge(self, data_dir, subdirs, dataset_name, modes=['train', 'test']):
        outfold = f'{data_dir}/{dataset_name}'
        if if_exist(outfold, [f'{mode}.tsv' for mode in modes]):
            logging.info(DATABASE_EXISTS_TMP.format('SNIPS-ATIS', outfold))
            slots = get_vocab(f'{outfold}/dict.slots.csv')
            none_slot = 0
            for key in slots:
                if slots[key] == 'O':
                    none_slot = key
                    break
            return outfold, int(none_slot)

        os.makedirs(outfold, exist_ok=True)

        data_files, slot_files = {}, {}
        for mode in modes:
            data_files[mode] = open(f'{outfold}/{mode}.tsv', 'w')
            data_files[mode].write('sentence\tlabel\n')
            slot_files[mode] = open(f'{outfold}/{mode}_slots.tsv', 'w')

        intents, slots = {}, {}
        intent_shift, slot_shift = 0, 0
        none_intent, none_slot = -1, -1

        for subdir in subdirs:
            curr_intents = get_vocab(f'{data_dir}/{subdir}/dict.intents.csv')
            curr_slots = get_vocab(f'{data_dir}/{subdir}/dict.slots.csv')

            for key in curr_intents:
                if intent_shift > 0 and curr_intents[key] == 'O':
                    continue
                if curr_intents[key] == 'O' and intent_shift == 0:
                    none_intent = int(key)
                intents[int(key) + intent_shift] = curr_intents[key]

            for key in curr_slots:
                if slot_shift > 0 and curr_slots[key] == 'O':
                    continue
                if slot_shift == 0 and curr_slots[key] == 'O':
                    none_slot = int(key)
                slots[int(key) + slot_shift] = curr_slots[key]

            for mode in modes:
                with open(f'{data_dir}/{subdir}/{mode}.tsv', 'r') as f:
                    for line in f.readlines()[1:]:
                        text, label = line.strip().split('\t')
                        label = int(label)
                        if curr_intents[label] == 'O':
                            label = none_intent
                        else:
                            label = label + intent_shift
                        data_files[mode].write(f'{text}\t{label}\n')

                with open(f'{data_dir}/{subdir}/{mode}_slots.tsv', 'r') as f:
                    for line in f.readlines():
                        labels = [int(label) for label in line.strip().split()]
                        shifted_labels = []
                        for label in labels:
                            if curr_slots[label] == 'O':
                                shifted_labels.append(none_slot)
                            else:
                                shifted_labels.append(label + slot_shift)
                        slot_files[mode].write(list2str(shifted_labels) + '\n')

            intent_shift += len(curr_intents)
            slot_shift += len(curr_slots)

        write_vocab_in_order(intents, f'{outfold}/dict.intents.csv')
        write_vocab_in_order(slots, f'{outfold}/dict.slots.csv')
        return outfold, none_slot
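A tiny worked illustration of the id-shifting scheme used by merge, with made-up slot dictionaries for the two corpora:

# Hypothetical dict.slots.csv contents for two corpora being merged.
curr_slots_a = {0: 'O', 1: 'B-city'}   # first subdir, processed with slot_shift = 0
curr_slots_b = {0: 'O', 1: 'B-light'}  # second subdir, processed with slot_shift = 2

merged, none_slot, slot_shift = {}, -1, 0
for curr_slots in (curr_slots_a, curr_slots_b):
    for key, label in curr_slots.items():
        if slot_shift > 0 and label == 'O':
            continue                    # drop the duplicate 'O' from later corpora
        if slot_shift == 0 and label == 'O':
            none_slot = key             # remember the shared 'O' id
        merged[key + slot_shift] = label
    slot_shift += len(curr_slots)

print(merged, none_slot)  # {0: 'O', 1: 'B-city', 3: 'B-light'} 0
# Note the unused id 2 where the second corpus's 'O' would have landed; its data rows
# are instead remapped to the shared id 0, as in the slot-file rewriting loop above.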
Example #12
    def __init__(self, dataset_name, data_dir, do_lower_case, modes=['train', 'test', 'eval']):
        if dataset_name == 'sst-2':
            self.data_dir = process_sst_2(data_dir)
            self.num_labels = 2
            self.eval_file = self.data_dir + '/dev.tsv'
        elif dataset_name == 'imdb':
            self.num_labels = 2
            self.data_dir = process_imdb(data_dir, do_lower_case)
            self.eval_file = self.data_dir + '/test.tsv'
        elif dataset_name == 'thucnews':
            self.num_labels = 14
            self.data_dir = process_thucnews(data_dir)
            self.eval_file = self.data_dir + '/test.tsv'
        elif dataset_name.startswith('nlu-'):
            if dataset_name.endswith('chat'):
                self.data_dir = f'{data_dir}/ChatbotCorpus.json'
                self.num_labels = 2
            elif dataset_name.endswith('ubuntu'):
                self.data_dir = f'{data_dir}/AskUbuntuCorpus.json'
                self.num_labels = 5
            elif dataset_name.endswith('web'):
                data_dir = f'{data_dir}/WebApplicationsCorpus.json'
                self.num_labels = 8
            self.data_dir = process_nlu(data_dir, do_lower_case, dataset_name=dataset_name)
            self.eval_file = self.data_dir + '/test.tsv'
        elif dataset_name.startswith('jarvis'):
            self.data_dir = process_jarvis_datasets(
                data_dir, do_lower_case, dataset_name, modes=['train', 'test', 'eval'], ignore_prev_intent=False
            )

            intents = get_intent_labels(f'{self.data_dir}/dict.intents.csv')
            self.num_labels = len(intents)
        elif dataset_name != 'default_format':
            raise ValueError(
                "Looks like you passed a dataset name that isn't "
                + "already supported by NeMo. Please make sure "
                + "that you build the preprocessing method for it. "
                + "default_format assumes that a data file has a header and each line of the file follows "
                + "the format: text [TAB] label. Label is assumed to be an integer."
            )
        else:
            # 'default_format': use the provided directory as-is; it must contain {mode}.tsv files.
            self.data_dir = data_dir

        self.train_file = self.data_dir + '/train.tsv'

        for mode in modes:
            if not if_exist(self.data_dir, [f'{mode}.tsv']):
                logging.info(f'Stats calculation for {mode} mode is skipped as {mode}.tsv was not found.')
                continue

            input_file = f'{self.data_dir}/{mode}.tsv'
            with open(input_file, 'r') as f:
                input_lines = f.readlines()[1:]  # Skipping headers at index 0

            try:
                int(input_lines[0].strip().split()[-1])
            except ValueError:
                logging.warning(f'No numerical labels found for {mode}.tsv in {dataset_name} dataset.')
                raise

            queries, raw_sentences = [], []
            for input_line in input_lines:
                parts = input_line.strip().split()
                raw_sentences.append(int(parts[-1]))
                queries.append(' '.join(parts[:-1]))

            infold = input_file[: input_file.rfind('/')]

            logging.info(f'Three most popular classes in {mode} dataset')
            total_sents, sent_label_freq = get_label_stats(raw_sentences, infold + f'/{mode}_sentence_stats.tsv')

            if mode == 'train':
                self.class_weights = calc_class_weights(sent_label_freq)
                logging.info(f'Class weights are - {self.class_weights}')

            logging.info(f'Total Sentences - {total_sents}')
            logging.info(f'Sentence class frequencies - {sent_label_freq}')
Example #13
        "--source_data_dir", required=True, type=str, help='The path to the folder containing the MultiWOZ data files.'
    )
    parser.add_argument("--target_data_dir", default='multiwoz2.1/', type=str)
    parser.add_argument("--overwrite_files", action="store_true", help="Whether to overwrite preprocessed file")
    args = parser.parse_args()

    # Get the absolute path.
    abs_source_data_dir = expanduser(args.source_data_dir)
    abs_target_data_dir = expanduser(args.target_data_dir)

    if not exists(abs_source_data_dir):
        raise FileNotFoundError(f"{abs_source_data_dir} does not exist.")

    # Check if the files exist
    if (
        if_exist(abs_target_data_dir, ['ontology.json', 'dev_dials.json', 'test_dials.json', 'train_dials.json', 'db'])
        and not args.overwrite_files
    ):
        print(f'Data is already processed and stored at {abs_target_data_dir}, skipping pre-processing.')
        exit(0)

    REPLACEMENTS = []
    with open('multiwoz_mapping.pair', 'r') as fin:
        for line in fin:
            tok_from, tok_to = line.replace('\n', '').split('\t')
            REPLACEMENTS.append((' ' + tok_from + ' ', ' ' + tok_to + ' '))

    print('Creating dialogues...')
    # Process MultiWOZ dataset
    delex_data = createData(abs_source_data_dir)
    # Divide data