Пример #1
0
def process_atis(infold, outfold, modes=['train', 'test'], do_lower_case=False):
    """Convert the ATIS dataset (MSFT's dataset, processed by Kaggle) to TSV files.

    https://www.kaggle.com/siddhadev/atis-dataset-from-ms-cntk

    For every mode two files are written to ``outfold``: ``{mode}.tsv`` (a
    ``sentence<TAB>label`` header followed by one decoded query and its intent
    per line) and ``{mode}_slots.tsv`` (the slot ids of the same query). The
    intent and slot dictionaries are copied next to them as
    ``dict.intents.csv`` and ``dict.slots.csv``.

    Args:
        infold: folder containing the ``atis.*.csv`` input files.
        outfold: folder the processed files are written to (created if needed).
        modes: dataset splits to process.
        do_lower_case: whether to lower-case the decoded sentences.

    Returns:
        ``outfold`` when the processed files already exist, otherwise None.
    """

    def _read_lines(path):
        # Read the whole file and release the handle immediately.
        with open(path, 'r') as f:
            return f.readlines()

    vocab = get_vocab(f'{infold}/atis.dict.vocab.csv')

    if if_exist(outfold, [f'{mode}.tsv' for mode in modes]):
        logging.info(DATABASE_EXISTS_TMP.format('ATIS', outfold))
        return outfold
    logging.info(f'Processing ATIS dataset and storing at {outfold}.')

    os.makedirs(outfold, exist_ok=True)

    for mode in modes:
        # Load the inputs first so that a missing input file does not leave
        # half-written output files behind.
        queries = _read_lines(f'{infold}/atis.{mode}.query.csv')
        intents = _read_lines(f'{infold}/atis.{mode}.intent.csv')
        slots = _read_lines(f'{infold}/atis.{mode}.slots.csv')

        sentence_path = os.path.join(outfold, mode + '.tsv')
        slots_path = f'{outfold}/{mode}_slots.tsv'
        # Both outputs are closed on exit, including the slots file that was
        # previously left open.
        with open(sentence_path, 'w') as sentence_file, open(slots_path, 'w') as slots_file:
            sentence_file.write('sentence\tlabel\n')
            for i, query in enumerate(queries):
                # The first and last token of every line are dropped
                # (presumably begin/end markers - TODO confirm).
                sentence = ids2text(query.strip().split()[1:-1], vocab)
                if do_lower_case:
                    sentence = sentence.lower()
                sentence_file.write(f'{sentence}\t{intents[i].strip()}\n')
                slot = ' '.join(slots[i].strip().split()[1:-1])
                slots_file.write(slot + '\n')

    shutil.copyfile(f'{infold}/atis.dict.intent.csv', f'{outfold}/dict.intents.csv')
    shutil.copyfile(f'{infold}/atis.dict.slots.csv', f'{outfold}/dict.slots.csv')
    def save_embeddings(self, bert_hidden_states, output_file, mode):
        """Generate schema element embeddings and save them as a numpy file.

        One dictionary of zero-initialised arrays is created per entry of
        ``self.schemas.services``; the arrays are then filled in by
        ``self._populate_schema_embeddings`` and the resulting list is written
        with ``np.save``. Only the master process (no distributed group, or
        rank 0) writes the file.

        Args:
            bert_hidden_states: forwarded unchanged to
                ``_populate_schema_embeddings``.
            output_file: path of the file to write.
            mode: forwarded unchanged to ``_populate_schema_embeddings``.
        """
        config = self.schema_config
        max_num_intent = config["MAX_NUM_INTENT"]
        max_num_cat_slot = config["MAX_NUM_CAT_SLOT"]
        max_num_noncat_slot = config["MAX_NUM_NONCAT_SLOT"]
        max_num_slot = max_num_cat_slot + max_num_noncat_slot
        max_num_value = config["MAX_NUM_VALUE_PER_CAT_SLOT"]
        embedding_dim = config["EMBEDDING_DIMENSION"]

        schema_embeddings = [
            {
                "intent_emb": np.zeros([max_num_intent, embedding_dim]),
                "req_slot_emb": np.zeros([max_num_slot, embedding_dim]),
                "cat_slot_emb": np.zeros([max_num_cat_slot, embedding_dim]),
                "noncat_slot_emb": np.zeros([max_num_noncat_slot, embedding_dim]),
                "cat_slot_value_emb": np.zeros([max_num_cat_slot, max_num_value, embedding_dim]),
            }
            for _ in self.schemas.services
        ]

        # Populate the embeddings based on bert inference results and save them.
        self._populate_schema_embeddings(schema_embeddings, bert_hidden_states, mode)

        master_device = not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0
        if master_device:
            # The context manager closes the file; no explicit close() needed.
            with open(output_file, "wb") as f_s:
                np.save(f_s, schema_embeddings)
            # Report success only once the file has been flushed and closed.
            logging.info(f"The schema embeddings saved at {output_file}")
    def on_epoch_end(self):
        """Log how long the epoch took; only the master rank (None or 0) logs."""
        rank = self.global_rank
        if rank is not None and rank != 0:
            return

        # Read but not used in the visible part of this method.
        step = self.step

        elapsed_seconds = time.time() - self._last_epoch_start
        delta = datetime.timedelta(seconds=elapsed_seconds)
        logging.info(f"Finished epoch {self.epoch_num} in {delta}")
Пример #4
0
    def __init__(self,
                 data_dir,
                 domains={
                     "attraction": 0,
                     "restaurant": 1,
                     "taxi": 2,
                     "train": 3,
                     "hotel": 4
                 }):
        """Load the MultiWOZ ontology from ``data_dir`` and prepare slots and vocab.

        Args:
            data_dir: folder containing ``ontology.json``.
            domains: mapping from domain name to id for the domains to use.
                NOTE: the default dictionary is a single object shared by all
                instances that rely on it; callers should not mutate it.
        """
        logging.info('Processing MultiWOZ dataset')

        self.all_domains = {
            'attraction': 0,
            'restaurant': 1,
            'taxi': 2,
            'train': 3,
            'hotel': 4,
            'hospital': 5,
            'bus': 6,
            'police': 7,
        }
        self.gating_dict = {'ptr': 0, 'dontcare': 1, 'none': 2}

        self.data_dir = data_dir
        self.domains = domains
        self.vocab = Vocab()

        # Close the ontology file as soon as it has been parsed.
        with open(f'{self.data_dir}/ontology.json', 'r') as ontology_file:
            self.ontology = json.load(ontology_file)

        # Both are filled in by the get_slots() / get_vocab() calls below.
        self.vocab_file = None
        self.slots = None

        self.get_slots()
        self.get_vocab()
Пример #5
0
def process_assistant(infold, outfold, modes=['train', 'test']):
    """Prepare the assistant-commands dataset for intent and slot training.

    https://github.com/xliuhw/NLU-Evaluation-Data - this dataset includes
    about 25 thousand examples with 66 various multi-domain intents and 57 entity types.

    Writes ``dict.intents.csv``, ``dict.slots.csv`` and, for every mode,
    ``{mode}.tsv`` and ``{mode}_slots.tsv`` into ``outfold``.

    Returns:
        ``outfold`` if the slot files already exist there, otherwise None.
    """
    expected_outputs = [f'{mode}_slots.tsv' for mode in modes]
    if if_exist(outfold, expected_outputs):
        logging.info(DATABASE_EXISTS_TMP.format('robot', outfold))
        return outfold

    logging.info(f'Processing assistant commands dataset and store at {outfold}')
    os.makedirs(outfold, exist_ok=True)

    # Copy train/test files to a convenient directory to work with.
    copy_input_files(infold)
    dataset_dir = infold + "/dataset"

    # Intent names come from the train folder (the test folder is supposed to match).
    intent_names = get_intents(dataset_dir + "/trainset")
    write_files(intent_names, f'{outfold}/dict.intents.csv')

    # One file per mode with every query and its intent.
    for mode in modes:
        write_files(get_intent_queries(dataset_dir, intent_names, mode), f'{outfold}/{mode}.tsv')

    # All unique slots found in the training and testing files.
    slot_types = get_slots(dataset_dir, modes)
    write_files(slot_types, f'{outfold}/dict.slots.csv')

    # One file per mode with the slot-annotated queries.
    slot_ids = {slot_name: slot_id for slot_id, slot_name in enumerate(slot_types)}
    for mode in modes:
        write_files(get_slot_queries(dataset_dir, slot_ids, mode, intent_names), f'{outfold}/{mode}_slots.tsv')
 def on_action_end(self):
     """Close the summary writer, if any, and log the total run time (master rank only)."""
     if not (self.global_rank is None or self.global_rank == 0):
         return
     if self._swriter is not None:
         self._swriter.close()
     elapsed_seconds = time.time() - self._start_time
     delta = datetime.timedelta(seconds=elapsed_seconds)
     logging.info("Done in %s", delta)
Пример #7
0
def errors_per_class(cm, dict):
    """
    Log, for every class, how many confusions in the matrix involve it.

    Useful both for Intents and Slots. Each confusion is counted twice: once
    for the class in its row and once for the class in its column.
    Args:
        cm: Confusion matrix
        dict: Mapping indexed by class number that yields the class name (Intents or Slots)
    """
    num_classes = cm.shape[0]
    confused_per_class = {}
    total_errors = 0
    for class_num in range(num_classes):
        # Off-diagonal cells of this class' row and column, accumulated in
        # (row cell, column cell) order for each other class.
        class_errors = sum(
            cell
            for other in range(num_classes)
            if other != class_num
            for cell in (cm[class_num][other], cm[other][class_num])
        )
        confused_per_class[dict[class_num]] = class_errors
        total_errors += class_errors

    logging.info(f'Total errors (multiplied by 2): {total_errors}')

    # Most confused classes first.
    ranking = list(confused_per_class.items())
    ranking.sort(key=lambda item: item[1], reverse=True)
    for entry in ranking:
        logging.info(entry)
Пример #8
0
Файл: data.py Проект: vsl9/NeMo
    def trim(self, min_count):
        """Drop every word seen fewer than ``min_count`` times.

        The vocabulary is rebuilt from scratch with only the kept words, so
        each kept word is re-registered once through ``addWord``. Calling the
        method again after a first trim is a no-op.

        Args:
            min_count: minimum number of occurrences a word needs to be kept.
        """
        if self.trimmed:
            return
        self.trimmed = True

        keep_words = [word for word, count in self.word2count.items() if count >= min_count]

        total_words = len(self.word2index)
        # An empty vocabulary used to raise ZeroDivisionError here.
        kept_ratio = len(keep_words) / total_words if total_words else 0.0
        logging.info("keep_words {} / {} = {:.4f}".format(
            len(keep_words),
            total_words,
            kept_ratio,
        ))

        # Reinitialize dictionaries
        self.word2index = {}
        self.word2count = {}
        self.index2word = {
            PAD_token: "PAD",
            SOS_token: "SOS",
            EOS_token: "EOS",
        }
        self.num_words = 3  # Count default tokens

        for word in keep_words:
            self.addWord(word)
def eval_epochs_done_callback(
    global_vars,
    eval_data_layer,
    do_lower_case,
    n_best_size,
    max_answer_length,
    version_2_with_negative,
    null_score_diff_threshold,
):
    """Score the accumulated predictions, log the result and reset the accumulators.

    The ids and logits collected in ``global_vars`` are passed to
    ``eval_data_layer.dataset.evaluate``; afterwards the three accumulator
    lists are replaced by empty lists.

    Returns:
        dict with the keys ``"exact_match"`` and ``"f1"``.
    """
    dataset = eval_data_layer.dataset
    exact_match, f1, _unused = dataset.evaluate(
        unique_ids=global_vars["eval_unique_ids"],
        start_logits=global_vars["eval_start_logits"],
        end_logits=global_vars["eval_end_logits"],
        n_best_size=n_best_size,
        max_answer_length=max_answer_length,
        version_2_with_negative=version_2_with_negative,
        null_score_diff_threshold=null_score_diff_threshold,
        do_lower_case=do_lower_case,
    )

    logging.info(f"Exact_match = {exact_match}, f1 = {f1}")

    # Start the next evaluation round with fresh accumulators.
    for accumulator in ("eval_unique_ids", "eval_start_logits", "eval_end_logits"):
        global_vars[accumulator] = []

    return {"exact_match": exact_match, "f1": f1}
Пример #10
0
def get_label_stats(labels, outfile='stats.tsv'):
    '''
    Count label occurrences, write the statistics to a file and log the top three.

    Every line of ``outfile`` holds a label, its relative frequency rounded to
    five digits and its absolute count, most frequent label first.

    Args:
        labels: list of all labels
        outfile: path to the file where to save label stats

    Returns:
        total (int): total number of labels
        freq_dict (dict): label -> number of occurrences, most frequent first
        max_label: the largest label, as ordered by ``max``

    Raises:
        ValueError: if ``labels`` is empty (there is no largest label).
    '''
    label_counts = Counter(labels)
    total = sum(label_counts.values())
    label_frequencies = label_counts.most_common()
    freq_dict = {}
    # The context manager guarantees the stats file is flushed and closed.
    with open(outfile, 'w') as out:
        for i, (k, v) in enumerate(label_frequencies):
            out.write(f'{k}\t\t{round(v/total,5)}\t\t{v}\n')
            if i < 3:
                logging.info(f'{i} item: {k}, {v} out of {total}, {v / total}.')
            freq_dict[k] = v

    return total, freq_dict, max(label_counts.keys())
Пример #11
0
def eval_epochs_done_callback(global_vars, punct_label_ids, capit_label_ids, graph_fold=None, normalize_cm=True):
    '''
    Collect per-label F1 / precision / recall (in percent, rounded to two
    digits) for the punctuation and the capitalization task.

    Punctuation metrics are stored under the key prefixes ``'pF1 '``,
    ``'pPR '`` and ``'pR '``; capitalization metrics under ``'cF1: '``,
    ``'cPR: '`` and ``'cR: '``.

    Args:
      global_vars (dict): accumulated evaluation data, forwarded to
        ``_eval_epochs_done_callback``
      punct_label_ids: punctuation label ids, forwarded as is
      capit_label_ids: capitalization label ids, forwarded as is
      graph_fold (str): path to output folder
      normalize_cm (bool): flag to indicate whether to
        normalize confusion matrix

    Returns:
      dict mapping ``<prefix><label name>`` to the metric value.
    '''

    def _label_name(label):
        # Strip a trailing " (label id ...)" suffix from the report key.
        marker = '(label id'
        return label[: label.index(marker) - 1] if marker in label else label

    results = {}
    punct_class_report = _eval_epochs_done_callback('punct', global_vars, punct_label_ids, graph_fold, normalize_cm)
    for label in punct_class_report:
        # The 'accuracy' entry is skipped; it is not handled like the per-label entries.
        if label == 'accuracy':
            continue
        label_name = _label_name(label)
        results['pF1 ' + label_name] = round(punct_class_report[label]['f1-score'] * 100, 2)
        results['pPR ' + label_name] = round(punct_class_report[label]['precision'] * 100, 2)
        results['pR ' + label_name] = round(punct_class_report[label]['recall'] * 100, 2)

    capit_class_report = _eval_epochs_done_callback('capit', global_vars, capit_label_ids, graph_fold, normalize_cm)
    for label in capit_class_report:
        if label == 'accuracy':
            continue
        label_name = _label_name(label)
        results['cF1: ' + label_name] = round(capit_class_report[label]['f1-score'] * 100, 2)
        # These used the punctuation prefixes, which overwrote or mislabelled
        # the punctuation precision/recall entries.
        results['cPR: ' + label_name] = round(capit_class_report[label]['precision'] * 100, 2)
        results['cR: ' + label_name] = round(capit_class_report[label]['recall'] * 100, 2)

    logging.info(f'results: {results}')
    return results
def write_timestamped(contents, dir=None, name=None, mode="wb"):
    """
    Writes contents to a timestamped file path in the specified directory.

    Nothing is written when no directory is given or when the generated path
    already exists. The directory is created if it is missing.

    Args:
        contents (bytes-like object or callable): Either a bytes-like object that can be written to disk, or a callable which will return such an object.
        dir (str): The directory to write into.
        name (str): The name of the file.

    Optional Args:
        mode(str): The mode to use when writing. Defaults to "wb".

    Returns:
        str: The complete file path, or None if nothing was written.
    """
    if dir is None:
        return None

    if not os.path.exists(dir):
        os.makedirs(dir, exist_ok=True)

    path = timestamped_filepath(dir, name)

    # A callable is resolved before the existence check, whether or not the
    # result ends up being written.
    payload = contents() if callable(contents) else contents

    if os.path.exists(path):
        logging.warning(
            "{:} already exists. Will not overwrite.".format(path))
        return None

    with open(path, mode) as f:
        logging.info("Writing to {:}".format(path))
        f.write(payload)
    return path
        def load_plugins():
            """Load every library listed in the enclosing scope's ``plugins`` with ctypes."""
            import ctypes

            for library_path in map(os.path.abspath, plugins):
                logging.info("Loading plugin library: {:}".format(library_path))
                ctypes.CDLL(library_path)
Пример #14
0
def process_imdb(infold, outfold, uncased, modes=['train', 'test']):
    """Convert the IMDB reviews found under ``infold`` to one TSV file per mode.

    Reviews are read from ``{infold}/{mode}/neg/*.txt`` (label 0) and
    ``{infold}/{mode}/pos/*.txt`` (label 1) and written to
    ``{outfold}/{mode}.tsv`` after a ``sentence<TAB>label`` header line.

    Args:
        infold: folder with the downloaded IMDB dataset.
        outfold: folder the TSV files are written to (created if needed).
        uncased: whether to lower-case the reviews.
        modes: dataset splits to process.

    Raises:
        ValueError: if ``infold`` does not exist.
    """
    if not os.path.exists(infold):
        link = 'www.kaggle.com/iarunava/imdb-movie-reviews-dataset'
        raise ValueError(f'Data not found at {infold}. '
                         f'Please download IMDB from {link}.')

    logging.info(f'Processing IMDB dataset and store at {outfold}')
    os.makedirs(outfold, exist_ok=True)

    for mode in modes:
        # Each output file is closed even if reading a review fails.
        with open(os.path.join(outfold, mode + '.tsv'), 'w') as outfile:
            outfile.write('sentence\tlabel\n')
            for label, sent in enumerate(['neg', 'pos']):
                # The reviews live under ``infold``; the previous code used an
                # undefined ``data_dir`` name here.
                files = glob.glob(f'{infold}/{mode}/{sent}/*.txt')
                for file in files:
                    with open(file, 'r') as f:
                        review = f.read().strip()
                    if uncased:
                        review = review.lower()
                    review = review.replace("<br />", "")
                    outfile.write(f'{review}\t{label}\n')
Пример #15
0
def create_pipeline(num_samples=-1, batch_size=32, num_gpus=1, mode='train', is_training=True):
    """Build the text-classification data layer and wire it to the model, classifier and loss.

    Relies on names defined outside this function (``data_desc``, ``args``,
    ``tokenizer``, ``model``, ``classifier``, ``loss_fn``, ``nemo_nlp``).

    NOTE(review): the visible body ends without a return statement and never
    uses ``steps_per_epoch`` or ``tensors_to_evaluate``; the function looks
    truncated - confirm against the full source.

    Args:
        num_samples: forwarded to the data layer.
        batch_size: batch size passed to the data layer; lowered afterwards if
            the dataset is smaller.
        num_gpus: number of GPUs, used to compute the steps per epoch.
        mode: name of the split; selects ``{mode}.tsv`` in the data directory.
        is_training: enables ``args.shuffle_data`` based shuffling.
    """
    logging.info(f"Loading {mode} data...")
    data_file = f'{data_desc.data_dir}/{mode}.tsv'
    # Shuffling is only ever enabled for training data.
    shuffle = args.shuffle_data if is_training else False
    data_layer = nemo_nlp.nm.data_layers.BertTextClassificationDataLayer(
        input_file=data_file,
        tokenizer=tokenizer,
        max_seq_length=args.max_seq_length,
        num_samples=num_samples,
        shuffle=shuffle,
        batch_size=batch_size,
        use_cache=args.use_cache,
    )

    ids, type_ids, input_mask, labels = data_layer()
    data_size = len(data_layer)

    # NOTE(review): the data layer above was already built with the original
    # batch_size; only the local value used below is reduced here.
    if data_size < batch_size:
        logging.warning("Batch_size is larger than the dataset size")
        logging.warning("Reducing batch_size to dataset size")
        batch_size = data_size

    # NOTE(review): divides by zero when the dataset is empty (batch_size becomes 0).
    steps_per_epoch = math.ceil(data_size / (batch_size * num_gpus))

    hidden_states = model(input_ids=ids, token_type_ids=type_ids, attention_mask=input_mask)

    logits = classifier(hidden_states=hidden_states)
    loss = loss_fn(logits=logits, labels=labels)

    if is_training:
        tensors_to_evaluate = [loss, logits]
Пример #16
0
def analyze_confusion_matrix(cm, dict, max_pairs=10):
    """
    Rank the confusions of a confusion matrix by count and log the largest ones.
    Each logged entry is a tuple ('name -> name', value).
    Args:
        cm: Confusion matrix
        dict: Mapping indexed by class number that yields the class name (Intents or Slots)
        max_pairs: Max number of confusions to log
    """
    min_confusions = 5  # arbitrary cut-off: smaller confusion counts are ignored
    size = cm.shape[0]
    confused_pairs = {}
    for row in range(size):
        ascending = cm[row].argsort()
        # Walk the columns of this row from the largest count to the smallest.
        for col in ascending[size - 1::-1]:
            if col == row:
                # Diagonal cell: a correct prediction, not a confusion.
                continue
            count = cm[row][col]
            if count >= min_confusions:
                confused_pairs[f'{dict[row]} -> {dict[col]}'] = count
            else:
                # Every remaining column in this row is smaller still.
                break

    # Largest confusions first; only the first max_pairs entries are logged.
    ranking = list(confused_pairs.items())
    ranking.sort(key=lambda item: item[1], reverse=True)
    for rank, pair in enumerate(ranking):
        if rank >= max_pairs:
            break
        logging.info(pair)
def _added_token_counts(data_iterator, try_swapping, max_input_examples=10000):
    """Computes how many times different phrases have to be added.

  Args:
	data_iterator: Iterator to yield source lists and targets. See function
	  yield_sources_and_targets in utils.py for the available iterators. The
	  strings in the source list will be concatenated, possibly after swapping
	  their order if swapping is enabled.
	try_swapping: Whether to try if swapping sources results in less added text.
	max_input_examples: Maximum number of examples to be read from the iterator.

  Returns:
	Tuple (collections.Counter for phrases, added phrases for each example).
  """
    phrase_counter = collections.Counter()
    num_examples = 0
    all_added_phrases = []
    for sources, target in data_iterator:
        if num_examples >= max_input_examples:
            break
        if num_examples % 1000 == 0:
            logging.info("{} examples processed.".format(num_examples))
        added_phrases = _get_added_phrases(' '.join(sources), target)
        if try_swapping and len(sources) == 2:
            added_phrases_swap = _get_added_phrases(' '.join(sources[::-1]), target)
            # If we can align more and have to add less after swapping, we assume that
            # the sources would be swapped during conversion.
            if len(''.join(added_phrases_swap)) < len(''.join(added_phrases)):
                added_phrases = added_phrases_swap
        for phrase in added_phrases:
            phrase_counter[phrase] += 1
        all_added_phrases.append(added_phrases)
        num_examples += 1
    logging.info(f'{num_examples} examples processed.\n')
    return phrase_counter, all_added_phrases
Пример #18
0
def process_dialogflow(infold, outfold, dev_split=0.1):
    """Convert exported Dialogflow data to intent and slot files.

    Writes ``train.tsv``, ``train_slots.tsv``, ``test.tsv``,
    ``test_slots.tsv``, ``dict.slots.csv`` and ``dict.intents.csv`` into
    ``outfold``. Nothing is done when ``train.tsv`` and ``test.tsv`` already
    exist there.

    Args:
        infold: folder with the unzipped Dialogflow export.
        outfold: folder the processed files are written to (created if needed).
        dev_split: passed as ``split`` to ``partition_data``.

    Raises:
        ValueError: if ``infold`` does not exist.
    """
    if not os.path.exists(infold):
        link = 'www.dialogflow.com'
        # The middle fragment needs its trailing space; without it the message
        # read "fromwww.dialogflow.com".
        raise ValueError(f'Data not found at {infold}. '
                         f'Export your dialogflow data from '
                         f'{link} and unzip at {infold}.')

    if if_exist(outfold, [f'{mode}.tsv' for mode in ['train', 'test']]):
        # Name the dataset actually being processed (was 'mturk').
        logging.info(DATABASE_EXISTS_TMP.format('dialogflow', outfold))
        return

    os.makedirs(outfold, exist_ok=True)

    files = get_intent_query_files_dialogflow(infold)
    slot_labels = get_slots_dialogflow(files)
    intent_queries, intent_names, slot_tags = get_intents_slots_dialogflow(
        files, slot_labels)
    train_queries, train_slots, test_queries, test_slots = partition_data(
        intent_queries, slot_tags, split=dev_split)

    write_files(train_queries, f'{outfold}/train.tsv')
    write_files(train_slots, f'{outfold}/train_slots.tsv')

    write_files(test_queries, f'{outfold}/test.tsv')
    write_files(test_slots, f'{outfold}/test_slots.tsv')

    write_files(slot_labels, f'{outfold}/dict.slots.csv')
    write_files(intent_names, f'{outfold}/dict.intents.csv')
def eval_epochs_done_callback(global_vars, label_ids, graph_fold=None, none_label_id=0, normalize_cm=True):
    """Log accuracy, F1 scores and a classification report for the accumulated predictions.

    Only positions whose subtoken mask is above 0.5 are scored. When
    ``graph_fold`` is set, a confusion matrix is plotted as well.
    ``none_label_id`` is accepted but not used by this function.

    Returns:
        dict with the single key ``'Accuracy'``.
    """
    selected = np.asarray(global_vars['all_subtokens_mask']) > 0.5
    labels = np.asarray(global_vars['all_labels'])[selected]
    preds = np.asarray(global_vars['all_preds'])[selected]

    # Show predictions and labels for a small random window of the data.
    sample_size = 20
    num_preds = preds.shape[0]
    start = random.randint(0, num_preds - sample_size - 1) if num_preds > sample_size + 1 else 0
    window = slice(start, start + sample_size)
    logging.info("Sampled preds: [%s]" % list2str(preds[window]))
    logging.info("Sampled labels: [%s]" % list2str(labels[window]))

    num_correct = sum(labels == preds)
    accuracy = num_correct / labels.shape[0]
    logging.info(f'Accuracy: {accuracy}')

    f1_scores = get_f1_scores(labels, preds, average_modes=['weighted', 'macro', 'micro'])
    for score_name, score in f1_scores.items():
        logging.info(f'{score_name}: {score}')

    logging.info(get_classification_report(labels, preds, label_ids))

    # Calculate and plot the confusion matrix only when an output folder is given.
    if graph_fold:
        plot_confusion_matrix(labels, preds, graph_fold, label_ids, normalize=normalize_cm)

    return {'Accuracy': accuracy}
Пример #20
0
def process_sst_2(data_dir):
    """Check that the SST-2 data folder exists and return it unchanged.

    Raises:
        ValueError: if ``data_dir`` does not exist.
    """
    if os.path.exists(data_dir):
        logging.info('Keep in mind that SST-2 is only available in lower case.')
        return data_dir

    download_link = 'https://gluebenchmark.com/tasks'
    raise ValueError(f'Data not found at {data_dir}. '
                     f'Please download SST-2 from {download_link}.')
Пример #21
0
    def create_vocab(self):
        """Build the vocabulary from the slots and the training dialogues and pickle it.

        Words are collected from ``self.slots`` and from the system and user
        transcripts of every turn in ``{data_dir}/train_dials.json``. The
        vocabulary is then written to ``self.vocab_file``.
        """
        self.vocab.add_words(self.slots, 'slot')

        filename = f'{self.data_dir}/train_dials.json'
        logging.info(f'Building vocab from {filename}')
        # Close the dialogue file as soon as it has been parsed.
        with open(filename, 'r') as dialogs_file:
            dialogs = json.load(dialogs_file)

        # NOTE(review): max_value_len is computed but never used or stored in
        # this method - confirm whether it can be removed.
        max_value_len = 0

        for dialog_dict in dialogs:
            for turn in dialog_dict['dialogue']:
                self.vocab.add_words(turn['system_transcript'], 'utterance')
                self.vocab.add_words(turn['transcript'], 'utterance')

                turn_beliefs = fix_general_label_error_multiwoz(
                    turn['belief_state'], self.slots)
                lengths = [
                    len(turn_beliefs[slot]) for slot in self.slots
                    if slot in turn_beliefs
                ]
                lengths.append(max_value_len)
                max_value_len = max(lengths)

        logging.info(f'Saving vocab to {self.data_dir}')
        with open(self.vocab_file, 'wb') as handle:
            pickle.dump(self.vocab, handle)
Пример #22
0
    def __call__(self):
        """Build a TensorRT engine from the network returned by ``self.network_loader``.

        The loader may return either a ``(network, parser)`` pair or a network
        alone. The builder configuration is filled in from the attributes of
        this object (workspace size, fp16/int8 mode, calibrator, optimization
        profile). The serialized engine is handed to ``write_timestamped``
        together with ``self.write_engine`` as the target directory.

        Returns:
            The engine returned by ``builder.build_engine``.
        """
        # Stand-in context manager used when the loader supplies no parser, so
        # the ``with`` statement below can always manage three objects.
        class DummyContextManager(object):
            def __enter__(self):
                return None

            def __exit__(self, exc_type, exc_value, traceback):
                return None

        network_parser = self.network_loader()
        try:
            # Works when the loader returned a (network, parser) pair.
            network, parser = network_parser
            assert isinstance(network, trt.INetworkDefinition)
        except (ValueError, AssertionError):
            # Otherwise treat the returned object as the network itself.
            network = network_parser
            parser = DummyContextManager()

        with trt.Builder(TRT_LOGGER) as builder, network, parser:
            if self.preprocess_network:
                logging.debug("Applying network preprocessing: {:}".format(
                    self.preprocess_network))
                self.preprocess_network(network)

            if self.layerwise:
                TensorRTRunnerV2.mark_layerwise(network)

            # Only log the network when debug output is enabled.
            if logging.getEffectiveLevel() <= logging.DEBUG:
                TensorRTRunnerV2.log_network(network)

            config = builder.create_builder_config()
            profile = TensorRTRunnerV2.build_profile(builder, network,
                                                     self.profile_shapes)
            config.add_optimization_profile(profile)

            config.max_workspace_size = int(self.max_workspace_size)
            if self.fp16_mode:
                # Plain assignment: replaces whatever flags the config held.
                config.flags = 1 << int(trt.BuilderFlag.FP16)
            if self.int8_mode:
                # OR-ed in, so an FP16 flag set above is preserved.
                config.flags = config.flags | 1 << int(trt.BuilderFlag.INT8)
                if not network.has_explicit_precision:
                    if not self.calibrator:
                        logging.critical(
                            "Network does not have explicit precision. A calibrator must be provided in order to use int8 mode."
                        )
                    # NOTE(review): unless logging.critical raises, a missing
                    # calibrator fails on the next line - confirm.
                    self.calibrator.set_input_metadata(
                        get_input_metadata_from_profile(profile, network))
                    config.int8_calibrator = self.calibrator

            logging.debug("Using builder configuration flags: {:}".format(
                config.flags))
            logging.info(
                "Building engine: max workspace size={:} bytes, fp16={:}, int8={:}, layerwise={:}"
                .format(self.max_workspace_size, self.fp16_mode,
                        self.int8_mode, self.layerwise))
            engine = builder.build_engine(network, config)
            # The lambda is passed so write_timestamped decides when to
            # serialize; written_engine_path receives its return value.
            self.written_engine_path = write_timestamped(
                contents=lambda: engine.serialize(),
                dir=self.write_engine,
                name="tensorrt_runner_v2.engine")
            return engine
def receive_on_queue(queue, timeout=None):
    """Block until an object arrives on ``queue`` and return it, decompressed if needed.

    Args:
        queue: object with a ``get(block=..., timeout=...)`` method.
        timeout: forwarded to ``queue.get``.
    """
    logging.info("Waiting for data to become available on queue")
    received = queue.get(block=True, timeout=timeout)
    needs_decompression = is_compressed(received)
    if needs_decompression:
        logging.debug("Decompressing output")
        received = decompress(received)
    logging.info("Received {:} on queue".format(received))
    return received
    def on_iteration_start(self):
        """Start the profiler at step 4 and record the iteration start time on the master rank."""
        current_step = self.step
        if current_step == 4:
            profiler.start()
            logging.info("********************Starting profiler at step: " + str(current_step))

        rank = self.global_rank
        if rank is None or rank == 0:
            self._last_iter_start = time.time()
Пример #25
0
Файл: data.py Проект: vsl9/NeMo
def readVocs(datafile, corpus_name):
    """Read tab-separated lines from ``datafile`` and create an empty vocabulary.

    Args:
        datafile: path of a UTF-8 text file with tab-separated fields per line.
        corpus_name: name passed to the ``Voc`` constructor.

    Returns:
        Tuple (voc, pairs): a new ``Voc`` and, for every line, the list of its
        tab-separated fields after ``normalizeString``.
    """
    logging.info("Reading lines...")
    # Read the file and split into lines; the handle is closed right away.
    with open(datafile, encoding="utf-8") as f:
        lines = f.read().strip().split("\n")
    # Split every line into pairs and normalize
    pairs = [[normalizeString(s) for s in line.split("\t")] for line in lines]
    voc = Voc(corpus_name)
    return voc, pairs
Пример #26
0
def eval_epochs_done_callback(global_vars):
    """Log the mean evaluation loss and top-1 score, then reset every accumulator.

    Returns:
        dict with the keys ``"Evaluation Loss"`` and ``"Evaluation Top@1"``.
    """
    summary = {
        "Evaluation Loss": mean(global_vars["eval_loss"]),
        "Evaluation Top@1": mean(global_vars["top1"]),
    }
    for title, value in summary.items():
        logging.info("{0}: {1}".format(title, value))

    # Every entry of global_vars is replaced by its own fresh list.
    for key in list(global_vars.keys()):
        global_vars[key] = []
    return summary
Пример #27
0
def _eval_epochs_done_callback(task_name, global_vars, label_ids, graph_fold=None, normalize_cm=True):
    """Log the classification report of one task and return it as a dictionary.

    Labels and predictions are read from ``global_vars`` under
    ``'<task_name>_labels'`` and ``'<task_name>_preds'``. When ``graph_fold``
    is set, a confusion matrix is plotted with ``task_name`` as prefix.
    """
    labels = np.array(global_vars[f'{task_name}_labels'])
    preds = np.array(global_vars[f'{task_name}_preds'])

    # Calculate and plot the confusion matrix only when an output folder is given.
    if graph_fold:
        plot_confusion_matrix(labels, preds, graph_fold, label_ids, normalize=normalize_cm, prefix=task_name)

    text_report = get_classification_report(labels, preds, label_ids)
    logging.info(f'{text_report}')
    report_as_dict = get_classification_report(labels, preds, label_ids, output_dict=True)
    return report_as_dict
Пример #28
0
    def get_vocab(self):
        """Load the pickled vocabulary from ``{data_dir}/vocab.pkl`` or build it if missing."""
        self.vocab_file = f'{self.data_dir}/vocab.pkl'

        if os.path.exists(self.vocab_file):
            logging.info(f'Loading vocab from {self.data_dir}')
            # NOTE: unpickling can execute arbitrary code; only load cache
            # files that were written by this project.
            with open(self.vocab_file, 'rb') as handle:
                self.vocab = pickle.load(handle)
        else:
            self.create_vocab()

        logging.info(f'Vocab size {len(self.vocab)}')
 def forward(self, mel_spectrogram):
     """Run WaveGlow inference on ``mel_spectrogram`` without tracking gradients.

     On the first call ``remove_weightnorm`` is applied to the wrapped model.
     Raises ValueError when the module is in training mode.
     """
     weight_norm_pending = not self._removed_weight_norm
     if weight_norm_pending:
         logging.info("remove WN")
         stripped_model = self.waveglow.remove_weightnorm(self.waveglow)
         self.waveglow = stripped_model
         self._removed_weight_norm = True
     if self.training:
         raise ValueError("You are using the WaveGlow Infer Neural Module in training mode.")
     with torch.no_grad():
         return self.waveglow.infer(mel_spectrogram, sigma=self._sigma)
Пример #30
0
def write_predictions_to_file(predictions, input_json_files, output_dir,
                              schemas, state_tracker, eval_debug,
                              in_domain_services):
    """Write the predicted dialogues as json files.

    Args:
        predictions: An iterator containing model predictions. This is the output of
            the predict method in the estimator.
        input_json_files: A list of json paths containing the dialogues to run
            inference on.
        output_dir: The directory where output json files will be created.
        schemas: Schemas to all services in the dst dataset (train, dev and test splits).
        state_tracker: Either 'baseline' or 'nemotracker'; selects how the
            predicted dialogue is assembled.
        eval_debug: Forwarded to the nemotracker assembly function.
        in_domain_services: Forwarded to the nemotracker assembly function.

    Raises:
        ValueError: if a dialogue has to be processed with an unknown
            ``state_tracker``.
    """
    logging.info(f"Writing predictions to {output_dir} started.")

    # Index all real predictions by (dialog id, turn id, service name).
    all_predictions = {}
    # Stays None when there is no real prediction; the loop variables used to
    # be read unconditionally afterwards, which failed on empty input.
    eval_dataset = None
    for prediction in predictions:
        if not prediction["is_real_example"]:
            continue
        eval_dataset, dialog_id, turn_id, service_name = prediction[
            'example_id'].split('-')
        all_predictions[(dialog_id, turn_id, service_name)] = prediction
    # Report the number of indexed predictions rather than the last loop index.
    logging.info(
        f'Predictions for {len(all_predictions)} examples in {eval_dataset} dataset are getting processed.'
    )

    # Read each input file and write its predictions.
    for input_file_path in input_json_files:
        with open(input_file_path) as f:
            dialogs = json.load(f)
        logging.debug(f'{input_file_path} file is loaded')

        pred_dialogs = []
        for d in dialogs:
            if state_tracker == 'baseline':
                pred_dialog = get_predicted_dialog_baseline(
                    d, all_predictions, schemas)
            elif state_tracker == 'nemotracker':
                pred_dialog = get_predicted_dialog_nemotracker(
                    d, all_predictions, schemas, eval_debug,
                    in_domain_services)
            else:
                raise ValueError(
                    f"tracker_mode {state_tracker} is not defined.")
            pred_dialogs.append(pred_dialog)

        # The output keeps the name of the input file.
        input_file_name = os.path.basename(input_file_path)
        output_file_path = os.path.join(output_dir, input_file_name)
        with open(output_file_path, "w") as f:
            json.dump(pred_dialogs,
                      f,
                      indent=2,
                      separators=(",", ": "),
                      sort_keys=True)