Example #1
def process_atis(infold, outfold, modes=['train', 'test'], do_lower_case=False):
    """ MSFT's dataset, processed by Kaggle
    https://www.kaggle.com/siddhadev/atis-dataset-from-ms-cntk
    """
    vocab = get_vocab(f'{infold}/atis.dict.vocab.csv')

    if if_exist(outfold, [f'{mode}.tsv' for mode in modes]):
        logging.info(DATABASE_EXISTS_TMP.format('ATIS', outfold))
        return outfold
    logging.info(f'Processing ATIS dataset and storing at {outfold}.')

    os.makedirs(outfold, exist_ok=True)

    outfiles = {}
    for mode in modes:
        outfiles[mode] = open(os.path.join(outfold, mode + '.tsv'), 'w')
        outfiles[mode].write('sentence\tlabel\n')
        outfiles[mode + '_slots'] = open(f'{outfold}/{mode}_slots.tsv', 'w')

        queries = open(f'{infold}/atis.{mode}.query.csv', 'r').readlines()
        intents = open(f'{infold}/atis.{mode}.intent.csv', 'r').readlines()
        slots = open(f'{infold}/atis.{mode}.slots.csv', 'r').readlines()

        for i, query in enumerate(queries):
            # drop the BOS/EOS token ids before mapping back to text
            sentence = ids2text(query.strip().split()[1:-1], vocab)
            if do_lower_case:
                sentence = sentence.lower()
            outfiles[mode].write(f'{sentence}\t{intents[i].strip()}\n')
            # drop the slot tags for the BOS/EOS tokens to stay aligned with the sentence
            slot = ' '.join(slots[i].strip().split()[1:-1])
            outfiles[mode + '_slots'].write(slot + '\n')

    shutil.copyfile(f'{infold}/atis.dict.intent.csv', f'{outfold}/dict.intents.csv')
    shutil.copyfile(f'{infold}/atis.dict.slots.csv', f'{outfold}/dict.slots.csv')
    for mode in modes:
        outfiles[mode].close()
        outfiles[mode + '_slots'].close()

    return outfold
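A minimal usage sketch for the function above; the paths here are assumptions, and the Kaggle archive must already be unpacked under infold:

# hypothetical locations for the raw and processed ATIS files
atis_dir = process_atis(
    infold='data/atis_raw',
    outfold='data/atis_processed',
    do_lower_case=True,
)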
Example #2
def process_assistant(infold, outfold, modes=['train', 'test']):
    """
    https://github.com/xliuhw/NLU-Evaluation-Data - this dataset includes
    about 25 thousand examples with 66 various multi-domain intents and 57 entity types.
    """
    if if_exist(outfold, [f'{mode}_slots.tsv' for mode in modes]):
        logging.info(DATABASE_EXISTS_TMP.format('robot', outfold))
        return outfold

    logging.info(f'Processing assistant commands dataset and storing at {outfold}.')
    os.makedirs(outfold, exist_ok=True)

    # copy train/test files to a convenient directory to work with
    copy_input_files(infold)
    infold += "/dataset"

    # get the list of intents from the train folder (the test folder is expected to contain the same intents)
    intent_names = get_intents(infold + "/trainset")
    write_files(intent_names, f'{outfold}/dict.intents.csv')

    # get all train and test queries with their intent
    for mode in modes:
        intent_queries = get_intent_queries(infold, intent_names, mode)
        write_files(intent_queries, f'{outfold}/{mode}.tsv')

    # get list of all unique slots in training and testing files
    slot_types = get_slots(infold, modes)
    write_files(slot_types, f'{outfold}/dict.slots.csv')

    # create files of slot queries
    # map each slot label to its index in dict.slots.csv
    slot_dict = {k: v for v, k in enumerate(slot_types)}
    for mode in modes:
        slot_queries = get_slot_queries(infold, slot_dict, mode, intent_names)
        write_files(slot_queries, f'{outfold}/{mode}_slots.tsv')

    return outfold
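A minimal usage sketch; the paths are assumptions, and copy_input_files is expected to find a checkout of xliuhw/NLU-Evaluation-Data under infold:

assistant_dir = process_assistant(
    infold='data/nlu_evaluation_data',   # hypothetical clone location
    outfold='data/assistant_processed',
)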
Example #3
def process_text2sparql(infold: str, outfold: str, do_lower_case: bool):
    """ Process and convert MeetKai's text2sparql datasets to NeMo's neural machine translation format.

    Args:
        infold: directory path to raw text2sparql data containing
            train.tsv, test_easy.tsv, test_hard.tsv
        outfold: output directory path to save formatted data for NeuralMachineTranslationDataset
            the first line is header (sentence [tab] label)
            each line should be [sentence][tab][label]
        do_lower_case: if true, convert all sentences and labels to lower case
    """
    logging.info(f"Processing Text2Sparql dataset and storing at: {outfold}")

    os.makedirs(outfold, exist_ok=True)

    dataset_name = "Text2Sparql"
    for prefix in prefix_map:
        input_file = os.path.join(infold, prefix)
        output_file = os.path.join(outfold, prefix_map[prefix])

        if if_exist(outfold, [prefix_map[prefix]]):
            logging.info(f"** {MODE_EXISTS_TMP.format(prefix_map[prefix], dataset_name, output_file)}")
            continue

        if not if_exist(infold, [prefix]):
            logging.info(f"** {prefix} of {dataset_name}" f" is skipped as it was not found")
            continue

        assert input_file != output_file, "input file cannot equal output file"
        with open(input_file, "r") as in_file:
            with open(output_file, "w") as out_file:
                reader = csv.reader(in_file, delimiter="\t")

                # replace headers
                out_file.write("sentence\tlabel\n")
                next(reader)

                for line in reader:
                    sentence = line[0]
                    label = line[1]
                    if do_lower_case:
                        sentence = sentence.lower()
                        label = label.lower()
                    out_file.write(f"{sentence}\t{label}\n")
Example #4
def process_snips(infold,
                  outfold,
                  do_lower_case,
                  modes=['train', 'test'],
                  dev_split=0.1):
    if not os.path.exists(infold):
        link = 'https://github.com/snipsco/spoken-language-understanding-research-datasets'
        raise ValueError(
            f'Data not found at {infold}. '
            f'You may request to download the SNIPS dataset from {link}.')

    exist = True
    for dataset in ['light', 'speak', 'all']:
        if if_exist(f'{outfold}/{dataset}', [f'{mode}.tsv' for mode in modes]):
            logging.info(
                DATABASE_EXISTS_TMP.format('SNIPS-' + dataset, outfold))
        else:
            exist = False
    if exist:
        return outfold

    logging.info(
        f'Processing SNIPS dataset and storing at folders "speak", "light" and "all" under {outfold}.'
    )
    logging.info(
        'Processing and importing "smart-speaker-en-close-field" -> "speak" and "smart-lights-en-close-field" -> "light".'
    )

    os.makedirs(outfold, exist_ok=True)

    speak_dir = 'smart-speaker-en-close-field'
    light_dir = 'smart-lights-en-close-field'

    light_files = [f'{infold}/{light_dir}/dataset.json']
    speak_files = [f'{infold}/{speak_dir}/training_dataset.json']
    speak_files.append(f'{infold}/{speak_dir}/test_dataset.json')

    light_train, light_dev, light_slots, light_intents = get_dataset(
        light_files, dev_split)
    speak_train, speak_dev, speak_slots, speak_intents = get_dataset(
        speak_files)

    create_dataset(light_train, light_dev, light_slots, light_intents,
                   do_lower_case, f'{outfold}/light')
    create_dataset(speak_train, speak_dev, speak_slots, speak_intents,
                   do_lower_case, f'{outfold}/speak')
    create_dataset(
        light_train + speak_train,
        light_dev + speak_dev,
        light_slots | speak_slots,
        light_intents | speak_intents,
        do_lower_case,
        f'{outfold}/all',
    )

    return outfold
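A minimal usage sketch; the infold path is an assumption and must already contain the two SNIPS corpora named above:

process_snips(
    infold='data/snips_raw',   # hypothetical; request the data via the GitHub link above
    outfold='data/snips_processed',
    do_lower_case=True,
)
# writes the light/, speak/ and all/ subfolders under data/snips_processed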
Example #5
def download_text2sparql(infold: str):
    """Downloads text2sparql train, test_easy, and test_hard data

    Args:
        infold: save directory path
    """
    os.makedirs(infold, exist_ok=True)

    for prefix in prefix_map:
        url = base_url + prefix

        logging.info(f"Downloading: {url}")
        if if_exist(infold, [prefix]):
            logging.info("** Download file already exists, skipping download")
        else:
            req = Request(url, headers={"User-Agent": "Mozilla/5.0"})
            with open(os.path.join(infold, prefix), "wb") as handle:
                handle.write(urlopen(req, timeout=20).read())
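base_url and prefix_map are module-level globals not shown in this example; a plausible shape, inferred from the docstrings here and in Example #3 (the URL and file names are placeholders, not the real values):

# hypothetical stand-ins for the globals the function expects
base_url = "https://example.com/text2sparql/"   # placeholder host, not the actual one
prefix_map = {
    "train.tsv": "train.tsv",                   # raw name -> formatted output name
    "test_easy.tsv": "test_easy.tsv",
    "test_hard.tsv": "test_hard.tsv",
}

download_text2sparql(infold='data/text2sparql_raw')   # hypothetical save directory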
Example #6
def process_jarvis_datasets(
    infold, outfold, modes=['train', 'test', 'dev'], do_lower_case=False, ignore_prev_intent=False
):
    """ process and convert Jarvis datasets into NeMo's BIO format
    """
    dataset_name = "jarvis"
    if if_exist(outfold, ['dict.intents.csv', 'dict.slots.csv']):
        logging.info(DATABASE_EXISTS_TMP.format(dataset_name, outfold))
        return outfold

    logging.info(f'Processing {dataset_name} dataset and storing at {outfold}')

    os.makedirs(outfold, exist_ok=True)

    outfiles = {}
    intents_list = {}
    slots_list = {}
    slots_list_all = {}

    outfiles['dict_intents'] = open(f'{outfold}/dict.intents.csv', 'w')
    outfiles['dict_slots'] = open(f'{outfold}/dict.slots.csv', 'w')

    outfiles['dict_slots'].write('O\n')
    slots_list["O"] = 0
    slots_list_all["O"] = 0

    for mode in modes:
        if if_exist(outfold, [f'{mode}.tsv']):
            logging.info(MODE_EXISTS_TMP.format(mode, dataset_name, outfold, mode))
            continue

        if not if_exist(infold, [f'{mode}.tsv']):
            logging.info(f'{mode} mode of {dataset_name} is skipped as it was not found.')
            continue

        outfiles[mode] = open(os.path.join(outfold, mode + '.tsv'), 'w')
        outfiles[mode].write('sentence\tlabel\n')
        outfiles[mode + '_slots'] = open(f'{outfold}/{mode}_slots.tsv', 'w')

        queries = open(f'{infold}/{mode}.tsv', 'r').readlines()

        for i, query in enumerate(queries):
            line_splits = query.strip().split("\t")
            if len(line_splits) == 3:
                intent_str, slot_tags_str, sentence = line_splits
            else:
                intent_str, sentence = line_splits
                slot_tags_str = ""

            if intent_str not in intents_list:
                intents_list[intent_str] = len(intents_list)
                outfiles['dict_intents'].write(f'{intent_str}\n')

            # drop the leading BOS token; with ignore_prev_intent, also drop the
            # previous-intent token assumed to follow it
            start_token = 2 if ignore_prev_intent else 1

            if do_lower_case:
                sentence = sentence.lower()
            sentence_cld = " ".join(sentence.strip().split()[start_token:-1])
            outfiles[mode].write(f'{sentence_cld}\t{intents_list[intent_str]}\n')

            slot_tags_list = []
            if slot_tags_str.strip():
                slot_tags = slot_tags_str.strip().split(",")
                for st in slot_tags:
                    if not st.strip():
                        continue
                    start_i, end_i, slot_name = st.strip().split(":")
                    slot_tags_list.append([int(start_i), int(end_i), slot_name])
                    if slot_name not in slots_list:
                        slots_list[slot_name] = len(slots_list)
                        slots_list_all[f'B-{slot_name}'] = len(slots_list_all)
                        slots_list_all[f'I-{slot_name}'] = len(slots_list_all)
                        outfiles['dict_slots'].write(f'B-{slot_name}\n')
                        outfiles['dict_slots'].write(f'I-{slot_name}\n')

            slot_tags_list.sort(key=lambda x: x[0])
            slots = []
            processed_index = 0
            for tag_start, tag_end, tag_str in slot_tags_list:
                if tag_start > processed_index:
                    words_list = sentence[processed_index:tag_start].strip().split()
                    slots.extend([str(slots_list_all['O'])] * len(words_list))
                words_list = sentence[tag_start:tag_end].strip().split()
                slots.append(str(slots_list_all[f'B-{tag_str}']))
                slots.extend([str(slots_list_all[f'I-{tag_str}'])] * (len(words_list) - 1))
                processed_index = tag_end

            if processed_index < len(sentence):
                words_list = sentence[processed_index:].strip().split()
                slots.extend([str(slots_list_all['O'])] * len(words_list))

            # drop the tags for the BOS/EOS positions to line up with sentence_cld
            # (note: when ignore_prev_intent=True the sentence drops two leading
            # tokens but only one leading tag is dropped here)
            slots = slots[1:-1]
            slot = ' '.join(slots)
            outfiles[mode + '_slots'].write(slot + '\n')

        outfiles[mode + '_slots'].close()
        outfiles[mode].close()

    outfiles['dict_slots'].close()
    outfiles['dict_intents'].close()

    return outfold
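To make the character-span to BIO conversion concrete, here is a hedged walk-through of one hypothetical input row (the three-column format and the BOS/EOS framing are inferred from the parsing code; all values are made up):

# hypothetical row in <infold>/train.tsv:
#     find_weather\t12:18:city\tBOS what is boston weather EOS
# the 12:18 character span covers "boston", so the word-level tags are
# O O O B-city O O; slots[1:-1] then drops the BOS/EOS positions, and the
# emitted line holds indices into dict.slots.csv (O=0, B-city=1, I-city=2):
#     0 0 1 0
jarvis_dir = process_jarvis_datasets(
    infold='data/jarvis_raw',   # hypothetical paths
    outfold='data/jarvis_processed',
    modes=['train', 'dev'],
)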
Example #7
    def __init__(
        self,
        data_dir: str,
        modes: List[str] = ['train', 'test', 'dev'],
        none_slot_label: str = 'O',
        pad_label: int = -1,
    ):
        if not if_exist(data_dir, ['dict.intents.csv', 'dict.slots.csv']):
            raise FileNotFoundError(
                "Make sure that your data follows the standard format "
                "supported by JointIntentSlotDataset. Your data must "
                "contain dict.intents.csv and dict.slots.csv.")

        self.data_dir = data_dir
        self.intent_dict_file = self.data_dir + '/dict.intents.csv'
        self.slot_dict_file = self.data_dir + '/dict.slots.csv'

        self.intents_label_ids = IntentSlotDataDesc.label2idx(
            self.intent_dict_file)
        self.num_intents = len(self.intents_label_ids)
        self.slots_label_ids = IntentSlotDataDesc.label2idx(
            self.slot_dict_file)
        self.num_slots = len(self.slots_label_ids)

        infold = self.data_dir
        for mode in modes:
            if not if_exist(self.data_dir, [f'{mode}.tsv']):
                logging.info(f' Stats calculation for {mode} mode'
                             f' is skipped as {mode}.tsv was not found.')
                continue
            logging.info(f' Stats calculating for {mode} mode...')
            slot_file = f'{self.data_dir}/{mode}_slots.tsv'
            with open(slot_file, 'r') as f:
                slot_lines = f.readlines()

            input_file = f'{self.data_dir}/{mode}.tsv'
            with open(input_file, 'r') as f:
                input_lines = f.readlines()[1:]  # Skipping headers at index 0

            if len(slot_lines) != len(input_lines):
                raise ValueError(
                    "Make sure that the number of slot lines matches the "
                    "number of intent lines. There should be a one-to-one "
                    "correspondence between slot and intent lines.")

            dataset = list(zip(slot_lines, input_lines))

            raw_slots, raw_intents = [], []
            for slot_line, input_line in dataset:
                slot_list = [int(slot) for slot in slot_line.strip().split()]
                raw_slots.append(slot_list)
                parts = input_line.strip().split()
                raw_intents.append(int(parts[-1]))

            logging.info(f'Three most popular intents in {mode} mode:')
            total_intents, intent_label_freq, max_id = get_label_stats(
                raw_intents, infold + f'/{mode}_intent_stats.tsv')

            merged_slots = itertools.chain.from_iterable(raw_slots)
            logging.info(f'Three most popular slots in {mode} mode:')
            slots_total, slots_label_freq, max_id = get_label_stats(
                merged_slots, infold + f'/{mode}_slot_stats.tsv')

            logging.info(f'Total Number of Intents: {total_intents}')
            logging.info(f'Intent Label Frequencies: {intent_label_freq}')
            logging.info(f'Total Number of Slots: {slots_total}')
            logging.info(f'Slots Label Frequencies: {slots_label_freq}')

            if mode == 'train':
                intent_weights_dict = get_freq_weights(intent_label_freq)
                logging.info(f'Intent Weights: {intent_weights_dict}')
                slot_weights_dict = get_freq_weights(slots_label_freq)
                logging.info(f'Slot Weights: {slot_weights_dict}')

        # NOTE: intent_weights_dict and slot_weights_dict are only assigned when a
        # 'train' split was processed above; without one, these lines raise NameError
        self.intent_weights = fill_class_weights(intent_weights_dict,
                                                 self.num_intents - 1)
        self.slot_weights = fill_class_weights(slot_weights_dict,
                                               self.num_slots - 1)

        if pad_label != -1:
            self.pad_label = pad_label
        else:
            if none_slot_label not in self.slots_label_ids:
                raise ValueError(f'none_slot_label {none_slot_label} not '
                                 f'found in {self.slot_dict_file}.')
            self.pad_label = self.slots_label_ids[none_slot_label]
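A minimal instantiation sketch; the directory is an assumption and must hold dict.intents.csv, dict.slots.csv and the per-mode files, e.g. as written by process_atis in Example #1:

desc = IntentSlotDataDesc(data_dir='data/atis_processed', modes=['train', 'test'])
print(desc.num_intents, desc.num_slots, desc.pad_label)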
Example #8
    def __init__(
        self,
        data_dir: str,
        modes: List[str] = ["train", "test", "dev"],
        none_slot_label: str = "O",
        pad_label: int = -1,
    ):
        if not if_exist(data_dir, ["dict.intents.csv", "dict.slots.csv"]):
            raise FileNotFoundError(
                "Make sure that your data follows the standard format "
                "supported by MultiLabelIntentSlotDataset. Your data must "
                "contain dict.intents.csv and dict.slots.csv.")

        self.data_dir = data_dir
        self.intent_dict_file = self.data_dir + "/dict.intents.csv"
        self.slot_dict_file = self.data_dir + "/dict.slots.csv"

        self.intents_label_ids = get_labels_to_labels_id_mapping(
            self.intent_dict_file)
        self.num_intents = len(self.intents_label_ids)
        self.slots_label_ids = get_labels_to_labels_id_mapping(
            self.slot_dict_file)
        self.num_slots = len(self.slots_label_ids)

        infold = self.data_dir
        for mode in modes:
            if not if_exist(self.data_dir, [f"{mode}.tsv"]):
                logging.info(f" Stats calculation for {mode} mode"
                             f" is skipped as {mode}.tsv was not found.")
                continue
            logging.info(f" Stats calculating for {mode} mode...")
            slot_file = f"{self.data_dir}/{mode}_slots.tsv"
            with open(slot_file, "r") as f:
                slot_lines = f.readlines()

            input_file = f"{self.data_dir}/{mode}.tsv"
            with open(input_file, "r") as f:
                input_lines = f.readlines()[1:]  # Skipping headers at index 0

            if len(slot_lines) != len(input_lines):
                raise ValueError(
                    "Make sure that the number of slot lines matches the "
                    "number of intent lines. There should be a one-to-one "
                    "correspondence between slot and intent lines.")

            dataset = list(zip(slot_lines, input_lines))

            raw_slots, raw_intents = [], []
            for slot_line, input_line in dataset:
                slot_list = [int(slot) for slot in slot_line.strip().split()]
                raw_slots.append(slot_list)
                # the label column holds comma-separated intent ids, e.g. "3,17";
                # expand them into a multi-hot tuple over all known intents
                intent_ids = input_line.strip().split("\t")[1]
                intent_ids = list(map(int, intent_ids.split(",")))
                parts = [
                    1 if label in intent_ids else 0
                    for label in range(self.num_intents)
                ]
                raw_intents.append(tuple(parts))

            logging.info(f"Three most popular intents in {mode} mode:")
            total_intents, intent_label_freq, max_id = get_multi_label_stats(
                raw_intents, infold + f"/{mode}_intent_stats.tsv")

            merged_slots = itertools.chain.from_iterable(raw_slots)
            logging.info(f"Three most popular slots in {mode} mode:")
            slots_total, slots_label_freq, max_id = get_label_stats(
                merged_slots, infold + f"/{mode}_slot_stats.tsv")

            logging.info(f"Total Number of Intent Labels: {total_intents}")
            logging.info(f"Intent Label Frequencies: {intent_label_freq}")
            logging.info(f"Total Number of Slots: {slots_total}")
            logging.info(f"Slots Label Frequencies: {slots_label_freq}")

            if mode == "train":
                intent_weights_dict = get_freq_weights_bce_with_logits_loss(
                    intent_label_freq)
                logging.info(f"Intent Weights: {intent_weights_dict}")
                slot_weights_dict = get_freq_weights(slots_label_freq)
                logging.info(f"Slot Weights: {slot_weights_dict}")

        # NOTE: as in Example #7, these weight dicts are only assigned when a
        # 'train' split was processed above; without one, these lines raise NameError
        self.intent_weights = fill_class_weights(intent_weights_dict,
                                                 self.num_intents - 1)
        self.slot_weights = fill_class_weights(slot_weights_dict,
                                               self.num_slots - 1)

        if pad_label != -1:
            self.pad_label = pad_label
        else:
            if none_slot_label not in self.slots_label_ids:
                raise ValueError(f"none_slot_label {none_slot_label} not "
                                 f"found in {self.slot_dict_file}.")
            self.pad_label = self.slots_label_ids[none_slot_label]
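A minimal instantiation sketch, assuming the enclosing class is named MultiLabelIntentSlotDataDesc (the snippet shows only __init__) and that each row's label column holds comma-separated intent ids:

desc = MultiLabelIntentSlotDataDesc(data_dir='data/assistant_multilabel')  # hypothetical path
print(desc.num_intents, desc.pad_label)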