Example #1
    def __validate_params(self, params):
        """
            Checks whether the dictionary contains only parameters of primitive types (string, int, float, etc.)
            or (possibly nested) lists of primitive types.

            Args:
                params: dictionary of parameters.

            Returns:
                True if all parameters were ok, False otherwise.
        """
        ok = True

        # Iterate over parameters and check them one by one.
        for key, variable in params.items():
            if not self.__is_of_allowed_type(variable):
                logging.warning(
                    "Parameter '{}' contains a variable '{}' of type '{}' which is not allowed.".format(
                        key, variable, type(variable)
                    )
                )
                ok = False

        # Return the result.
        return ok
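The __is_of_allowed_type helper called above is not shown in this example; a minimal sketch of the recursive check the docstring implies (primitives, or arbitrarily nested lists of primitives) might look like the following. The exact set of allowed types is an assumption.

    def __is_of_allowed_type(self, var):
        """ Sketch: accept primitives and (possibly nested) lists of primitives; the allowed set is an assumption. """
        allowed_types = (str, int, float, bool)
        if isinstance(var, list):
            # Recurse so that lists of lists (of lists ...) of primitives also pass.
            return all(self.__is_of_allowed_type(item) for item in var)
        return isinstance(var, allowed_types)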
Example #2
    def get_raw_scores(self, preds):
        """
        Computes the exact and f1 scores from the examples
        and the model predictions
        """
        exact_scores = {}
        f1_scores = {}

        for example in self.examples:
            qas_id = example.qas_id
            gold_answers = [answer["text"] for answer in example.answers if normalize_answer(answer["text"])]

            if not gold_answers:
                # For unanswerable questions,
                # only correct answer is empty string
                gold_answers = [""]

            if qas_id not in preds:
                logging.warning("Missing prediction for %s" % qas_id)
                continue

            prediction = preds[qas_id]
            exact_scores[qas_id] = max(exact_match_score(a, prediction) for a in gold_answers)
            f1_scores[qas_id] = max(f1_score(a, prediction) for a in gold_answers)

        return exact_scores, f1_scores
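normalize_answer, exact_match_score, and f1_score are used above but defined elsewhere; a sketch that follows the standard SQuAD evaluation recipe (the module's actual helpers may differ in detail) is:

import collections
import re
import string


def normalize_answer(s):
    """Lowercase, drop punctuation and articles, and collapse whitespace (standard SQuAD recipe)."""
    s = "".join(ch for ch in s.lower() if ch not in set(string.punctuation))
    s = re.sub(r"\b(a|an|the)\b", " ", s)
    return " ".join(s.split())


def exact_match_score(gold, prediction):
    return int(normalize_answer(gold) == normalize_answer(prediction))


def f1_score(gold, prediction):
    gold_toks = normalize_answer(gold).split()
    pred_toks = normalize_answer(prediction).split()
    if not gold_toks or not pred_toks:
        # Unanswerable case: F1 is 1 only when both gold and prediction are empty.
        return float(gold_toks == pred_toks)
    common = collections.Counter(gold_toks) & collections.Counter(pred_toks)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(pred_toks)
    recall = num_same / len(gold_toks)
    return 2 * precision * recall / (precision + recall)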
def create_pipeline(num_samples, batch_size, num_gpus, input_dropout, data_prefix, is_training):
    logging.info(f"Loading {data_prefix} data...")
    shuffle = args.shuffle_data if is_training else False

    data_layer = MultiWOZDataLayer(
        args.data_dir,
        data_desc.domains,
        all_domains=data_desc.all_domains,
        vocab=data_desc.vocab,
        slots=data_desc.slots,
        gating_dict=data_desc.gating_dict,
        num_samples=num_samples,
        shuffle=shuffle,
        num_workers=0,
        batch_size=batch_size,
        mode=data_prefix,
        is_training=is_training,
        input_dropout=input_dropout,
    )

    input_data = data_layer()
    data_size = len(data_layer)
    logging.info(f'The length of data layer is {data_size}')

    if data_size < batch_size:
        logging.warning("Batch_size is larger than the dataset size")
        logging.warning("Reducing batch_size to dataset size")
        batch_size = data_size

    steps_per_epoch = math.ceil(data_size / (batch_size * num_gpus))
    logging.info(f"Steps_per_epoch = {steps_per_epoch}")

    outputs, hidden = encoder(inputs=input_data.src_ids, input_lens=input_data.src_lens)

    point_outputs, gate_outputs = decoder(
        encoder_hidden=hidden,
        encoder_outputs=outputs,
        input_lens=input_data.src_lens,
        src_ids=input_data.src_ids,
        targets=input_data.tgt_ids,
    )

    gate_loss = gate_loss_fn(logits=gate_outputs, labels=input_data.gating_labels)
    ptr_loss = ptr_loss_fn(logits=point_outputs, labels=input_data.tgt_ids, length_mask=input_data.tgt_lens)
    total_loss = total_loss_fn(loss_1=gate_loss, loss_2=ptr_loss)

    if is_training:
        tensors_to_evaluate = [total_loss, gate_loss, ptr_loss]
    else:
        tensors_to_evaluate = [
            total_loss,
            point_outputs,
            gate_outputs,
            input_data.gating_labels,
            input_data.turn_domain,
            input_data.tgt_ids,
            input_data.tgt_lens,
        ]

    return tensors_to_evaluate, total_loss, ptr_loss, gate_loss, steps_per_epoch, data_layer
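A usage sketch for the pipeline factory above; the args.* attribute names and the 'train'/'test' prefixes are assumptions modeled on typical NeMo training scripts, not taken from the code shown.

# NOTE: the args.* attribute names below are assumptions.
train_tensors, train_loss, _, _, steps_per_epoch, train_data_layer = create_pipeline(
    num_samples=args.num_train_samples,
    batch_size=args.batch_size,
    num_gpus=args.num_gpus,
    input_dropout=args.input_dropout,
    data_prefix='train',
    is_training=True,
)
eval_tensors, _, _, _, _, eval_data_layer = create_pipeline(
    num_samples=args.num_eval_samples,
    batch_size=args.eval_batch_size,
    num_gpus=args.num_gpus,
    input_dropout=0.0,          # no input dropout at evaluation time
    data_prefix='test',
    is_training=False,
)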
def create_pipeline(num_samples=-1, batch_size=32, num_gpus=1, mode='train', is_training=True):
    logging.info(f"Loading {mode} data...")
    data_file = f'{data_desc.data_dir}/{mode}.tsv'
    shuffle = args.shuffle_data if is_training else False
    data_layer = nemo_nlp.nm.data_layers.BertTextClassificationDataLayer(
        input_file=data_file,
        tokenizer=tokenizer,
        max_seq_length=args.max_seq_length,
        num_samples=num_samples,
        shuffle=shuffle,
        batch_size=batch_size,
        use_cache=args.use_cache,
    )

    ids, type_ids, input_mask, labels = data_layer()
    data_size = len(data_layer)

    if data_size < batch_size:
        logging.warning("Batch_size is larger than the dataset size")
        logging.warning("Reducing batch_size to dataset size")
        batch_size = data_size

    steps_per_epoch = math.ceil(data_size / (batch_size * num_gpus))

    hidden_states = model(input_ids=ids, token_type_ids=type_ids, attention_mask=input_mask)

    logits = classifier(hidden_states=hidden_states)
    loss = loss_fn(logits=logits, labels=labels)

    if is_training:
        tensors_to_evaluate = [loss, logits]
    else:
        tensors_to_evaluate = [logits, labels]

    return tensors_to_evaluate, loss, steps_per_epoch, data_layer
def write_timestamped(contents, dir=None, name=None, mode="wb"):
    """
    Writes the given contents to a timestamped file path in the specified directory.

    Args:
        contents (bytes-like object or callable): Either a bytes-like object that can be written to disk, or a callable which will return such an object.
        dir (str): The directory to write into.
        name (str): The name of the file.

    Optional Args:
        mode(str): The mode to use when writing. Defaults to "wb".

    Returns:
        str: The complete file path, or None if nothing was written.
    """
    if dir is not None:
        if not os.path.exists(dir):
            # logging.debug("{:} does not exist, creating now.".format(dir))
            os.makedirs(dir, exist_ok=True)

        path = timestamped_filepath(dir, name)

        if callable(contents):
            contents = contents()

        if os.path.exists(path):
            logging.warning(
                "{:} already exists. Will not overwrite.".format(path))
        else:
            with open(path, mode) as f:
                logging.info("Writing to {:}".format(path))
                f.write(contents)
            return path
    return None
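A short usage sketch; the directory name, payload, and the serialize_engine callable are illustrative placeholders.

# Write raw bytes under a timestamped name; returns None if nothing was written.
path = write_timestamped(contents=b"\x00\x01\x02", dir="artifacts", name="engine")
if path is None:
    logging.info("Nothing was written.")

# Passing a callable (serialize_engine/engine are hypothetical) defers expensive serialization
# until a directory is known and the target file does not already exist.
path = write_timestamped(contents=lambda: serialize_engine(engine), dir="artifacts", name="engine")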
Example #6
    def list_pretrained_models() -> Optional[List[PretrainedModelInfo]]:
        """List all available pre-trained models (e.g. weights) for convolutional
        encoder-decoder CTC-based speech recognition models.

        Returns:
            A list of PretrainedModelInfo tuples.
            The pretrained_model_name field of the tuple can be used to
            retrieve pre-trained model's weights (pass it as
            pretrained_model_name argument to the module's constructor)
        """
        logging.warning("TODO: CHANGE ME TO GRAB STUFF FROM NGC")
        result = []
        model = PretrainedModelInfo(
            pretrained_model_name="QuartzNet15x5-En",
            location=
            "https://nemo-public.s3.us-east-2.amazonaws.com/nemo_0.11_models_test/QuartzNet15x5-En-Base.nemo",
            description=
            "The model is trained on ~3300 hours of publicly available data and achieves a WER of 3.91% on LibriSpeech dev-clean, and a WER of 10.58% on dev-other.",
            parameters="",
        )
        result.append(model)

        model = PretrainedModelInfo(
            pretrained_model_name="QuartzNet15x5-Zh",
            location=
            "https://nemo-public.s3.us-east-2.amazonaws.com/nemo_0.11_models_test/QuartzNet15x5-Zh-Base.nemo",
            description=
            "The model is trained on ai-shell2 mandarin chinese dataset.",
            parameters="",
        )
        result.append(model)
        return result
Example #7
    def deduce_format(shape):
        """
        Guesses the data format of a given shape.

        Args:
            shape (Tuple[int]): The shape, including batch dimension.

        Returns:
            DataFormat: The deduced data format.
        """
        # The smaller this ratio, the closer a and b are.
        def minmax_ratio(a, b):
            return abs(max(a, b) / min(a, b))

        # Assume all shapes include batch dimension
        if len(shape) == 4:
            # Typically, H and W are quite close, so if dims 1 and 2 differ more than dims 2 and 3, dim 1 is most likely C, i.e. NCHW.
            if minmax_ratio(shape[1], shape[2]) > minmax_ratio(shape[2], shape[3]):
                return DataFormat.NCHW
            return DataFormat.NHWC
        elif len(shape) == 3:
            return DataFormat.NHW
        elif len(shape) == 2:
            return DataFormat.NW
        else:
            logging.warning(
                "Cannot deduce format for "
                + str(shape)
                + ". Currently only implemented for input_buffers with 1-3 non-batch dimensions. Please update this function!"
            )
            return DataFormat.UNKNOWN
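A quick usage sketch of the heuristic above, assuming deduce_format and the DataFormat enum are in scope:

deduce_format((32, 3, 224, 224))   # DataFormat.NCHW: 3 vs. 224 differ far more than 224 vs. 224
deduce_format((32, 224, 224, 3))   # DataFormat.NHWC: here the trailing channel dimension stands out
deduce_format((32, 80, 500))       # DataFormat.NHW
deduce_format((32, 1, 2, 3, 4))    # DataFormat.UNKNOWN, with a warning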
Example #8
def download_wkt2(data_dir):
    os.makedirs('data/lm', exist_ok=True)
    logging.warning(f'Data not found at {data_dir}. '
                    f'Downloading wikitext-2 to data/lm')
    data_dir = 'data/lm/wikitext-2'
    subprocess.call('scripts/get_wkt2.sh')
    return data_dir
Example #9
    def check(self, model):
        try:
            onnx.checker.check_model(model)
            logging.debug("ONNX Checker Passed")
        except onnx.checker.ValidationError as err:
            logging.warning(
                "ONNX Checker exited with an error: {:}".format(err))
        return model
Example #10
def create_pipeline(num_samples=-1, batch_size=32, num_gpus=1, mode='train'):
    logging.info(f"Loading {mode} data...")
    data_file = f'{data_desc.data_dir}/{mode}.tsv'
    slot_file = f'{data_desc.data_dir}/{mode}_slots.tsv'
    shuffle = args.shuffle_data if mode == 'train' else False

    data_layer = BertJointIntentSlotDataLayer(
        input_file=data_file,
        slot_file=slot_file,
        pad_label=data_desc.pad_label,
        tokenizer=tokenizer,
        max_seq_length=args.max_seq_length,
        num_samples=num_samples,
        shuffle=shuffle,
        batch_size=batch_size,
        ignore_extra_tokens=args.ignore_extra_tokens,
        ignore_start_end=args.ignore_start_end,
    )

    input_data = data_layer()
    data_size = len(data_layer)

    logging.info(f'The length of data layer is {data_size}')

    if data_size < batch_size:
        logging.warning("Batch_size is larger than the dataset size")
        logging.warning("Reducing batch_size to dataset size")
        batch_size = data_size

    steps_per_epoch = math.ceil(data_size / (batch_size * num_gpus))
    logging.info(f"Steps_per_epoch = {steps_per_epoch}")

    hidden_states = pretrained_bert_model(
        input_ids=input_data.input_ids,
        token_type_ids=input_data.input_type_ids,
        attention_mask=input_data.input_mask)

    intent_logits, slot_logits = classifier(hidden_states=hidden_states)

    intent_loss = intent_loss_fn(logits=intent_logits,
                                 labels=input_data.intents)
    slot_loss = slot_loss_fn(logits=slot_logits,
                             labels=input_data.slots,
                             loss_mask=input_data.loss_mask)
    total_loss = total_loss_fn(loss_1=intent_loss, loss_2=slot_loss)

    if mode == 'train':
        tensors_to_evaluate = [total_loss, intent_logits, slot_logits]
    else:
        tensors_to_evaluate = [
            intent_logits,
            slot_logits,
            input_data.intents,
            input_data.slots,
            input_data.subtokens_mask,
        ]

    return tensors_to_evaluate, total_loss, steps_per_epoch, data_layer
Example #11
    def __init__(self, dataset_name, data_dir, do_lower_case):
        if dataset_name == 'wikitext-2':
            if not os.path.exists(data_dir):
                data_dir = download_wkt2(data_dir)
            self.vocab_size = create_vocab_lm(data_dir, do_lower_case)
            self.data_dir = data_dir
        else:
            logging.warning("Looks like you passed a dataset name that isn't "
                            "already supported by NeMo. Please make sure that "
                            "you build the preprocessing method for it.")
Example #12
File: perturb.py Project: vsl9/NeMo
    def from_config(cls, config):
        ptbs = []
        for p in config:
            if p['aug_type'] not in perturbation_types:
                logging.warning("%s perturbation not known. Skipping.",
                                p['aug_type'])
                continue
            perturbation = perturbation_types[p['aug_type']]
            ptbs.append((p['prob'], perturbation(**p['cfg'])))
        return cls(perturbations=ptbs)
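An illustrative config for the factory above; the perturbation name, probability, and kwargs are assumptions about what perturbation_types maps to, and the owning augmentor class name is also an assumption.

# aug_type names and cfg kwargs below are illustrative, not taken from the code shown.
config = [
    {'aug_type': 'white_noise', 'prob': 0.5, 'cfg': {'min_level': -90, 'max_level': -46}},
    {'aug_type': 'not_a_real_aug', 'prob': 1.0, 'cfg': {}},   # unknown type: logged and skipped
]
augmentor = AudioAugmentor.from_config(config)   # class name assumed; from_config is the method above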
Example #13
File: tacotron2.py Project: ehutt/NeMo
    def infer(self, memory, memory_lengths):
        """ Decoder inference
        PARAMS
        ------
        memory: Encoder outputs
        RETURNS
        -------
        mel_outputs: mel outputs from the decoder
        gate_outputs: gate outputs from the decoder
        alignments: sequence of attention weights from the decoder
        """
        decoder_input = self.get_go_frame(memory)

        if memory.size(0) > 1:
            mask = ~get_mask_from_lengths(memory_lengths)
        else:
            mask = None

        self.initialize_decoder_states(memory, mask=mask)

        mel_lengths = torch.zeros([memory.size(0)], dtype=torch.int32)
        not_finished = torch.ones([memory.size(0)], dtype=torch.int32)
        if torch.cuda.is_available():
            mel_lengths = mel_lengths.cuda()
            not_finished = not_finished.cuda()

        mel_outputs, gate_outputs, alignments = [], [], []
        while True:
            decoder_input = self.prenet(decoder_input, inference=True)
            mel_output, gate_output, alignment = self.decode(decoder_input)

            dec = torch.le(torch.sigmoid(gate_output.data),
                           self.gate_threshold).to(torch.int32).squeeze(1)

            not_finished = not_finished * dec
            mel_lengths += not_finished

            if self.early_stopping and torch.sum(not_finished) == 0:
                break

            mel_outputs += [mel_output.squeeze(1)]
            gate_outputs += [gate_output]
            alignments += [alignment]

            if len(mel_outputs) == self.max_decoder_steps:
                logging.warning("Reached max decoder steps")
                break

            decoder_input = mel_output

        mel_outputs, gate_outputs, alignments = self.parse_decoder_outputs(
            mel_outputs, gate_outputs, alignments)

        return mel_outputs, gate_outputs, alignments, mel_lengths
Example #14
    def __init__(self, dataset_name, data_dir, do_lower_case):
        if dataset_name == 'wikitext-2':
            if not os.path.exists(data_dir):
                raise FileNotFoundError(
                    "Dataset not found. Run './get_wkt2.sh DATA_DIR' from examples/nlp/scripts"
                )
            self.vocab_size = self.create_vocab_lm(data_dir, do_lower_case)
            self.data_dir = data_dir
        else:
            logging.warning("Looks like you passed a dataset name that isn't "
                            "already supported by NeMo. Please make sure that "
                            "you build the preprocessing method for it.")
    def resize(self, name, shape):
        found = False
        for buf_dict in [self.device_buffers, self.host_outputs]:
            if name in buf_dict:
                found = True
                buf_dict[name].resize(shape)

        if not found:
            logging.warning(
                "Buffer: {:} was not found, could not resize".format(name))
        else:
            logging.debug("Resizing {:} buffer to {:}".format(name, shape))
Example #16
    def add_categorical_slots(self, state_update, agg_sys_state):
        """Add features for categorical slots."""
        categorical_slots = self.service_schema.categorical_slots
        self.num_categorical_slots = len(categorical_slots)
        for slot_idx, slot in enumerate(categorical_slots):
            values = state_update.get(slot, [])
            # Add categorical slot value features.
            slot_values = self.service_schema.get_categorical_slot_values(slot)
            self.num_categorical_slot_values[slot_idx] = len(slot_values)
            # set slot mask to 1, i.e. the slot exists in the service
            self.cat_slot_status_mask[slot_idx] = 1
            # set the value mask to 1 for every slot value that exists for this slot in the service
            for slot_value_idx in range(
                    len(self.service_schema._categorical_slot_values[slot])):
                self.cat_slot_values_mask[slot_idx][slot_value_idx] = 1

            if not values:
                self.categorical_slot_status[slot_idx] = STATUS_OFF
            elif values[0] == STR_DONTCARE:
                self.categorical_slot_status[slot_idx] = STATUS_DONTCARE
            else:
                value_id = self.service_schema.get_categorical_slot_value_id(
                    slot, values[0])
                if value_id < 0:
                    logging.warning(
                        f"Categorical value not found: EXAMPLE_ID:{self.example_id}, EXAMPLE_ID_NUM:{self.example_id_num}"
                    )
                    logging.warning(
                        f"SYSTEM: {self.system_utterance} || USER: {self.user_utterance}"
                    )
                else:
                    if values[0] not in agg_sys_state.get(slot, []):
                        self.categorical_slot_status[slot_idx] = STATUS_ACTIVE
                        self.categorical_slot_values[slot_idx] = value_id
                    else:
                        if self._add_carry_status:
                            self.categorical_slot_status[
                                slot_idx] = STATUS_CARRY
                        else:
                            self.categorical_slot_status[
                                slot_idx] = STATUS_ACTIVE

                        if self._add_carry_value:
                            self.categorical_slot_values[
                                slot_idx] = self.service_schema.get_categorical_slot_value_id(
                                    slot, "#CARRYVALUE#")
                            logging.debug(
                                f"Found slot:{slot}, value:{values[0]}, slot_id:{self.categorical_slot_values[slot_idx]} in prev states: {agg_sys_state}"
                            )
                        else:
                            self.categorical_slot_values[slot_idx] = value_id
Example #17
File: sgd_loss.py Project: rssaketh/NeMo
    def __init__(self, reduction='mean'):
        """
        Args:
            reduction (str): specifies the reduction to apply to the final loss, choose 'mean' or 'sum'
        """
        super().__init__()

        if reduction not in ['mean', 'sum']:
            logging.warning(f'{reduction} reduction is not supported. Setting reduction to "mean"')
            reduction = 'mean'

        self.reduction = reduction
        self._cross_entropy = torch.nn.CrossEntropyLoss(reduction=self.reduction)
        self._criterion_req_slots = torch.nn.BCEWithLogitsLoss(reduction=self.reduction)
def get_input_metadata_from_profile(profile, network):
    input_metadata = OrderedDict()
    for index in range(network.num_inputs):
        tensor = network.get_input(index)
        if tensor.is_shape_tensor:
            shapes = profile.get_shape_input(tensor.name)
        else:
            shapes = profile.get_shape(tensor.name)
        if tuple(shapes[0]) != tuple(shapes[1]):
            logging.warning(
                "In profile 0, min != max, using opt shapes for calibration")
        # Always use opt shape
        input_metadata[tensor.name] = (trt.nptype(tensor.dtype), shapes[1])
    return input_metadata
Example #19
    def __init__(self, dataset_name, data_dir, vocab_size, sample_size, special_tokens, train_file=''):
        if dataset_name == 'wikitext-2':
            if not os.path.exists(data_dir):
                data_dir = download_wkt2(data_dir)
            self.data_dir, self.tokenizer_model = create_vocab_mlm(
                data_dir, vocab_size, sample_size, special_tokens, train_file
            )
        else:
            logging.warning(
                "Looks like you passed a dataset name that isn't "
                "already supported by NeMo. Please make sure that "
                "you build the preprocessing method for it."
            )

        self.train_file = f'{data_dir}/train.txt'
        self.eval_file = f'{data_dir}/valid.txt'
        self.test_file = f'{data_dir}/test.txt'
    def __init__(self, data_dir, modes=['train', 'test', 'dev']):
        self.data_dir = data_dir

        max_label_id = 0
        for mode in modes:
            if not if_exist(self.data_dir, [f'{mode}.tsv']):
                logging.info(
                    f'Stats calculation for {mode} mode is skipped as {mode}.tsv was not found.'
                )
                continue

            input_file = f'{self.data_dir}/{mode}.tsv'
            with open(input_file, 'r') as f:
                input_lines = f.readlines()[1:]  # Skipping headers at index 0

            try:
                int(input_lines[0].strip().split()[-1])
            except ValueError:
                logging.warning(f'No numerical labels found for {mode}.tsv.')
                raise

            queries, raw_sentences = [], []
            for input_line in input_lines:
                parts = input_line.strip().split()
                label = int(parts[-1])
                raw_sentences.append(label)
                queries.append(' '.join(parts[:-1]))

            infold = input_file[:input_file.rfind('/')]

            logging.info(f'Three most popular classes in {mode} dataset')
            total_sents, sent_label_freq, max_id = get_label_stats(
                raw_sentences, infold + f'/{mode}_sentence_stats.tsv')
            max_label_id = max(max_label_id, max_id)

            if mode == 'train':
                class_weights_dict = get_freq_weights(sent_label_freq)
                logging.info(f'Class Weights: {class_weights_dict}')

            logging.info(f'Total Sentences: {total_sents}')
            logging.info(f'Sentence class frequencies - {sent_label_freq}')

        self.class_weights = fill_class_weights(class_weights_dict,
                                                max_label_id)

        self.num_labels = max_label_id + 1
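get_freq_weights and fill_class_weights are imported helpers, not shown here; a sketch of the inverse-frequency scheme they suggest (the exact formulas in NeMo's dataset utils may differ) is:

def get_freq_weights(label_freq):
    """Sketch: weight each class inversely to its frequency, normalized by the number of classes."""
    total = sum(label_freq.values())
    num_labels = len(label_freq)
    return {label: total / (num_labels * freq) for label, freq in label_freq.items()}


def fill_class_weights(weights, max_label_id):
    """Sketch: densify the mapping into a list indexed by label id, defaulting missing ids to 1.0."""
    return [weights.get(label_id, 1.0) for label_id in range(max_label_id + 1)]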
Example #21
    def _create_config_header(self):
        """ A protected method that create a header stored later in the configuration file. """

        # Get module "full specification".
        module_full_spec = str(self.__module__) + "." + str(
            self.__class__.__qualname__)
        module_class_name = type(self).__name__
        # print(module_full_spec)

        # Check whether module belongs to a collection.
        spec_list = module_full_spec.split(".")

        # Do not check Neural Modules from unit tests.
        if spec_list[0] == "tests":
            # Set collection variables.
            collection_type = "tests"
            collection_version = None
        else:
            # Check if component belongs to any collection
            if len(spec_list) < 3 or (spec_list[0] != "nemo"
                                      and spec_list[1] != "collection"):
                logging.warning(
                    "Module `{}` does not belong to any collection. This won't be allowed in the next release."
                    .format(module_class_name))
                collection_type = "unknown"
                collection_version = None
            else:
                # Ok, set collection.
                collection_type = spec_list[2]
                collection_version = None
                # TODO: to be SET!
                # print(getattr("nemo.collections.nlp", __version__))

        # Create a "header" with module "specification".
        header = {
            "nemo_core_version": nemo_version,
            "collection_type": collection_type,
            "collection_version": collection_version,
            # "class": module_class_name, # Operating only on full_spec now.
            "full_spec": module_full_spec,
        }
        return header
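For a module whose full spec were, say, nemo.collections.asr.jasper.JasperEncoder (an illustrative name), the returned header would look roughly like this:

{
    'nemo_core_version': nemo_version,   # e.g. '0.11.0'
    'collection_type': 'asr',            # spec_list[2]
    'collection_version': None,          # not set yet (see the TODO above)
    'full_spec': 'nemo.collections.asr.jasper.JasperEncoder',   # illustrative name
}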
Example #22
    def __init__(
        self,
        data_layers: List[DataLayerNM],
        batch_size: int,
        shuffle: bool = False,
        combination_mode: DataCombination = DataCombination.CROSSPRODUCT,
        port_names: List[str] = None,
    ):
        """
        data_layers (list): DataLayerNM objects to combine
        batch_size (int): batch size used when the underlying dataset is loaded
        combination_mode (DataCombination): defines how to combine the datasets
        shuffle (bool): whether the underlying combined dataset should be reshuffled each epoch
        port_names (List[str]): optional override for all output port names
        """
        super().__init__()
        self._data_layers = data_layers
        self._batch_size = batch_size
        self._shuffle = shuffle
        self._combination_mode = combination_mode
        self._port_names = port_names
        self._dataset = MultiDataset(
            datasets=[dl.dataset for dl in self._data_layers], combination_mode=combination_mode
        )

        self._ports = dict()
        if self._port_names:
            i = 0
            for dl in self._data_layers:
                for _, port_type in dl.output_ports.items():
                    self._ports[self._port_names[i]] = port_type
                    i += 1
        else:
            for dl_idx, dl in enumerate(self._data_layers):
                for port_name, port_type in dl.output_ports.items():
                    if port_name in self._ports:
                        logging.warning(f"name collision {port_name}, will rename")
                        self._ports[f"{port_name}_{dl_idx}"] = port_type
                    else:
                        self._ports[port_name] = port_type
Example #23
    def list_pretrained_models() -> Optional[List[PretrainedModelInfo]]:
        """List all available pre-trained models (e.g. weights) for convolutional
        encoder-decoder CTC-based speech recognition models.

        Returns:
            A list of PretrainedModelInfo tuples.
            The pretrained_model_name field of the tuple can be used to
            retrieve pre-trained model's weights (pass it as
            pretrained_model_name argument to the module's constructor)
        """
        logging.warning("TODO: CHANGE ME TO GRAB STUFF FROM NGC")
        result = []
        model = PretrainedModelInfo(
            pretrained_model_name="JasperNet10x5-En",
            location=
            "https://nemo-public.s3.us-east-2.amazonaws.com/nemo_0.11_models_test/JasperNet10x5-En-Base.nemo",
            description=
            "The model achieves a WER of 3.46% on LibriSpeech dev-clean, 10.40% on dev-other, 3.69% on test-clean, and 10.49% on test-other.",
            parameters="",
        )
        result.append(model)
        return result
Example #24
def create_pipeline(num_samples=-1, batch_size=32, num_gpus=1, local_rank=0, mode='train'):
    logging.info(f"Loading {mode} data...")
    data_file = f'{data_desc.data_dir}/{mode}.tsv'
    shuffle = args.shuffle_data if mode == 'train' else False

    data_layer = nemo.collections.nlp.nm.data_layers.text_classification_datalayer.BertTextClassificationDataLayer(
        input_file=data_file,
        tokenizer=tokenizer,
        max_seq_length=args.max_seq_length,
        num_samples=num_samples,
        shuffle=shuffle,
        batch_size=batch_size,
    )

    ids, type_ids, input_mask, labels = data_layer()
    data_size = len(data_layer)

    if data_size < batch_size:
        logging.warning("Batch_size is larger than the dataset size")
        logging.warning("Reducing batch_size to dataset size")
        batch_size = data_size

    steps_per_epoch = math.ceil(data_size / (batch_size * num_gpus))
    logging.info(f"Steps_per_epoch = {steps_per_epoch}")

    hidden_states = pretrained_bert_model(input_ids=ids, token_type_ids=type_ids, attention_mask=input_mask)

    logits = classifier(hidden_states=hidden_states)
    loss = loss_fn(logits=logits, labels=labels)

    if mode == 'train':
        tensors_to_evaluate = [loss, logits]
    else:
        tensors_to_evaluate = [logits, labels]

    return tensors_to_evaluate, loss, steps_per_epoch, data_layer
def send_on_queue(queue, obj):
    if not is_pickleable(obj):
        logging.warning("Cannot pickle: {:}. Sending None instead".format(obj))
        queue.put(None)
        return

    if sys.getsizeof(obj) > PIPE_MAX_SEND_BYTES:
        logging.warning(
            "Object size ({:} bytes) exceeds maximum size that can be sent over queues ({:} bytes). Attempting to compress - this may take some time. If this does not work or you want to avoid the compression overhead, you should disable subprocesses via the --no-subprocess flag, or by setting use_subprocess=False in Comparator.run()."
            .format(sys.getsizeof(obj), PIPE_MAX_SEND_BYTES))
        obj = compress(obj)

    if sys.getsizeof(obj) > PIPE_MAX_SEND_BYTES:
        logging.warning(
            "Compressed object is still too large to send. Sending None instead."
        )
        queue.put(None)
        return

    logging.info("Sending: {:} on queue".format(obj))
    queue.put(obj)
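is_pickleable and compress are referenced but not shown; a minimal sketch of the pickling check (the compression helper is omitted) could be:

import pickle


def is_pickleable(obj):
    """Sketch: an object can be sent over the queue only if pickle accepts it."""
    try:
        pickle.dumps(obj)
        return True
    except (pickle.PicklingError, TypeError, AttributeError):
        return False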
Example #26
def get_features(
    queries,
    max_seq_length,
    tokenizer,
    label_ids=None,
    pad_label='O',
    raw_labels=None,
    ignore_extra_tokens=False,
    ignore_start_end=False,
):
    """
    Args:
    queries (list of str): text sequences
    max_seq_length (int): max sequence length minus 2 for [CLS] and [SEP]
    tokenizer (Tokenizer): such as NemoBertTokenizer
    pad_label (str): pad value use for labels.
        by default, it's the neutral label.
    raw_labels (list of str): list of labels for every word in a sequence
    label_ids (dict): dict to map labels to label ids. Starts
        with pad_label->0 and then increases in alphabetical order.
        Required for training and evaluation, not needed for inference.
    ignore_extra_tokens (bool): whether to ignore extra tokens in
        the loss_mask,
    ignore_start_end (bool): whether to ignore bos and eos tokens in
        the loss_mask
    """
    all_subtokens = []
    all_loss_mask = []
    all_subtokens_mask = []
    all_segment_ids = []
    all_input_ids = []
    all_input_mask = []
    sent_lengths = []
    all_labels = []
    with_label = False

    if raw_labels is not None:
        with_label = True

    for i, query in enumerate(queries):
        words = query.strip().split()

        # add bos token
        subtokens = ['[CLS]']
        loss_mask = [1 - ignore_start_end]
        subtokens_mask = [0]
        if with_label:
            pad_id = label_ids[pad_label]
            labels = [pad_id]
            query_labels = [label_ids[lab] for lab in raw_labels[i]]

        for j, word in enumerate(words):
            word_tokens = tokenizer.text_to_tokens(word)
            subtokens.extend(word_tokens)

            loss_mask.append(1)
            loss_mask.extend([int(not ignore_extra_tokens)] * (len(word_tokens) - 1))

            subtokens_mask.append(1)
            subtokens_mask.extend([0] * (len(word_tokens) - 1))

            if with_label:
                labels.extend([query_labels[j]] * len(word_tokens))
        # add eos token
        subtokens.append('[SEP]')
        loss_mask.append(1 - ignore_start_end)
        subtokens_mask.append(0)
        sent_lengths.append(len(subtokens))
        all_subtokens.append(subtokens)
        all_loss_mask.append(loss_mask)
        all_subtokens_mask.append(subtokens_mask)
        all_input_mask.append([1] * len(subtokens))

        if with_label:
            labels.append(pad_id)
            all_labels.append(labels)

    max_seq_length = min(max_seq_length, max(sent_lengths))
    logging.info(f'Max length: {max_seq_length}')
    datasets_utils.get_stats(sent_lengths)
    too_long_count = 0

    for i, subtokens in enumerate(all_subtokens):
        if len(subtokens) > max_seq_length:
            subtokens = ['[CLS]'] + subtokens[-max_seq_length + 1 :]
            all_input_mask[i] = [1] + all_input_mask[i][-max_seq_length + 1 :]
            all_loss_mask[i] = [int(not ignore_start_end)] + all_loss_mask[i][-max_seq_length + 1 :]
            all_subtokens_mask[i] = [0] + all_subtokens_mask[i][-max_seq_length + 1 :]

            if with_label:
                all_labels[i] = [pad_id] + all_labels[i][-max_seq_length + 1 :]
            too_long_count += 1

        all_input_ids.append([tokenizer.tokens_to_ids(t) for t in subtokens])

        if len(subtokens) < max_seq_length:
            extra = max_seq_length - len(subtokens)
            all_input_ids[i] = all_input_ids[i] + [0] * extra
            all_loss_mask[i] = all_loss_mask[i] + [0] * extra
            all_subtokens_mask[i] = all_subtokens_mask[i] + [0] * extra
            all_input_mask[i] = all_input_mask[i] + [0] * extra

            if with_label:
                all_labels[i] = all_labels[i] + [pad_id] * extra

        all_segment_ids.append([0] * max_seq_length)

    logging.warning(f'{too_long_count} sentences are longer than {max_seq_length} tokens and were truncated')

    for i in range(min(len(all_input_ids), 5)):
        logging.debug("*** Example ***")
        logging.debug("i: %s", i)
        logging.debug("subtokens: %s", " ".join(list(map(str, all_subtokens[i]))))
        logging.debug("loss_mask: %s", " ".join(list(map(str, all_loss_mask[i]))))
        logging.debug("input_mask: %s", " ".join(list(map(str, all_input_mask[i]))))
        logging.debug("subtokens_mask: %s", " ".join(list(map(str, all_subtokens_mask[i]))))
        if with_label:
            logging.debug("labels: %s", " ".join(list(map(str, all_labels[i]))))
    return (all_input_ids, all_segment_ids, all_input_mask, all_loss_mask, all_subtokens_mask, all_labels)
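A concrete illustration of the masks built above, for a hypothetical two-word query whose second word splits into two subtokens and with both ignore_* flags left at False:

# query = "san fran", raw labels = ["B-LOC", "I-LOC"], pad_label 'O' -> id 0,
# and "fran" tokenizes (hypothetically) into ["fr", "##an"]:
#   subtokens      = ['[CLS]', 'san', 'fr', '##an', '[SEP]']
#   loss_mask      = [1,       1,     1,    1,      1]
#   subtokens_mask = [0,       1,     1,    0,      0]   # 1 marks the first subtoken of each word
#   input_mask     = [1,       1,     1,    1,      1]
#   labels         = [0,       1,     2,    2,      0]   # with label_ids = {'O': 0, 'B-LOC': 1, 'I-LOC': 2}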
Example #27
    def __init__(
        self,
        text_file,
        label_file,
        max_seq_length,
        tokenizer,
        num_samples=-1,
        shuffle=False,
        pad_label='O',
        label_ids=None,
        ignore_extra_tokens=False,
        ignore_start_end=False,
        use_cache=False,
    ):

        if use_cache:
            # Cache features
            data_dir = os.path.dirname(text_file)
            filename = os.path.basename(text_file)

            if not filename.endswith('.txt'):
                raise ValueError("{text_file} should have extension .txt")

            features_pkl = os.path.join(data_dir, filename[:-4] + "_features.pkl")
            label_ids_pkl = os.path.join(data_dir, "label_ids.pkl")

        if use_cache and os.path.exists(features_pkl) and os.path.exists(label_ids_pkl):
            # If text_file was already processed, load from pickle
            features = pickle.load(open(features_pkl, 'rb'))
            logging.info(f'features restored from {features_pkl}')

            label_ids = pickle.load(open(label_ids_pkl, 'rb'))
            logging.info(f'Labels to ids dict restored from {label_ids_pkl}')
        else:
            if num_samples == 0:
                raise ValueError("num_samples has to be positive", num_samples)

            with open(text_file, 'r') as f:
                text_lines = f.readlines()

            # Collect all possible labels
            unique_labels = set([])
            labels_lines = []
            with open(label_file, 'r') as f:
                for line in f:
                    line = line.strip().split()
                    labels_lines.append(line)
                    unique_labels.update(line)

            if len(labels_lines) != len(text_lines):
                raise ValueError("Labels file should contain labels for every word")

            if shuffle or num_samples > 0:
                dataset = list(zip(text_lines, labels_lines))
                random.shuffle(dataset)

                if num_samples > 0:
                    dataset = dataset[:num_samples]

                dataset = list(zip(*dataset))
                text_lines = dataset[0]
                labels_lines = dataset[1]

            # for dev/test sets use label mapping from training set
            if label_ids:
                if len(label_ids) != len(unique_labels):
                    logging.warning(
                        f'Not all labels from the specified'
                        + ' label_ids dictionary are present in the'
                        + ' current dataset. Using the provided'
                        + ' label_ids dictionary.'
                    )
                else:
                    logging.info(f'Using the provided label_ids dictionary.')
            else:
                logging.info(
                    f'Creating a new label to label_id dictionary.'
                    + ' It\'s recommended to use label_ids generated'
                    + ' during training for dev/test sets to avoid'
                    + ' errors if some labels are not'
                    + ' present in the dev/test sets.'
                    + ' For training set label_ids should be None.'
                )

                label_ids = {pad_label: 0}
                if pad_label in unique_labels:
                    unique_labels.remove(pad_label)
                for label in sorted(unique_labels):
                    label_ids[label] = len(label_ids)

            features = get_features(
                text_lines,
                max_seq_length,
                tokenizer,
                pad_label=pad_label,
                raw_labels=labels_lines,
                label_ids=label_ids,
                ignore_extra_tokens=ignore_extra_tokens,
                ignore_start_end=ignore_start_end,
            )

            if use_cache:
                pickle.dump(features, open(features_pkl, "wb"))
                logging.info(f'features saved to {features_pkl}')

                pickle.dump(label_ids, open(label_ids_pkl, "wb"))
                logging.info(f'labels to ids dict saved to {label_ids_pkl}')

        self.all_input_ids = features[0]
        self.all_segment_ids = features[1]
        self.all_input_mask = features[2]
        self.all_loss_mask = features[3]
        self.all_subtokens_mask = features[4]
        self.all_labels = features[5]
        self.label_ids = label_ids

        infold = text_file[: text_file.rfind('/')]
        merged_labels = itertools.chain.from_iterable(self.all_labels)
        logging.info('Three most popular labels')
        _, self.label_frequencies = datasets_utils.get_label_stats(merged_labels, infold + '/label_stats.tsv')

        # save label_ids (sorted by id) so the mapping can be reused for inference
        with open(infold + '/label_ids.csv', 'w') as out:
            labels, _ = zip(*sorted(self.label_ids.items(), key=lambda x: x[1]))
            out.write('\n'.join(labels))
        logging.info(f'Labels: {self.label_ids}')
        logging.info(f'Labels mapping saved to : {out.name}')
Example #28
File: dataset.py Project: lokhiufung/NeMo
    def __init__(
        self,
        kaldi_dir,
        labels,
        min_duration=None,
        max_duration=None,
        max_utts=0,
        unk_index=-1,
        blank_index=-1,
        normalize=True,
        eos_id=None,
    ):
        self.eos_id = eos_id
        self.unk_index = unk_index
        self.blank_index = blank_index
        self.labels_map = {label: i for i, label in enumerate(labels)}

        data = []
        duration = 0.0
        filtered_duration = 0.0

        # Read Kaldi features (MFCC, PLP) using feats.scp
        feats_path = os.path.join(kaldi_dir, 'feats.scp')
        id2feats = {utt_id: torch.from_numpy(feats) for utt_id, feats in kaldi_io.read_mat_scp(feats_path)}

        # Get durations, if utt2dur exists
        utt2dur_path = os.path.join(kaldi_dir, 'utt2dur')
        id2dur = {}
        if os.path.exists(utt2dur_path):
            with open(utt2dur_path, 'r') as f:
                for line in f:
                    utt_id, dur = line.split()
                    id2dur[utt_id] = float(dur)
        elif max_duration or min_duration:
            raise ValueError(
                f"KaldiFeatureDataset max_duration or min_duration is set but"
                f" utt2dur file not found in {kaldi_dir}."
            )
        else:
            logging.info(
                f"Did not find utt2dur when loading data from " f"{kaldi_dir}. Skipping dataset duration calculations."
            )

        # Match transcripts to features
        text_path = os.path.join(kaldi_dir, 'text')
        parser = parsers.make_parser(labels, 'en', unk_id=unk_index, blank_id=self.blank_index, do_normalize=normalize)
        with open(text_path, 'r') as f:
            for line in f:
                split_idx = line.find(' ')
                utt_id = line[:split_idx]

                audio_features = id2feats.get(utt_id)

                if audio_features is not None:

                    text = line[split_idx:].strip()
                    # if normalize:
                    #     # TODO: WTF?
                    #     text = parser._normalize(text)

                    dur = id2dur[utt_id] if id2dur else None

                    # Filter by duration if specified & utt2dur exists
                    if min_duration and dur < min_duration:
                        filtered_duration += dur
                        continue
                    if max_duration and dur > max_duration:
                        filtered_duration += dur
                        continue

                    sample = {
                        'utt_id': utt_id,
                        'text': text,
                        'tokens': parser(text),
                        'audio': audio_features.t(),
                        'duration': dur,
                    }

                    data.append(sample)
                    if dur is not None:
                        duration += dur

                    if max_utts > 0 and len(data) >= max_utts:
                        logging.warning(f"Stop parsing due to max_utts ({max_utts})")
                        break

        if id2dur:
            # utt2dur durations are in seconds
            logging.info(
                f"Dataset loaded with {duration / 3600 : .2f} hours. "
                f"Filtered {filtered_duration / 3600 : .2f} hours."
            )

        self.data = data
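The directory layout this loader expects follows standard Kaldi conventions; a sketch (file contents illustrative):

kaldi_dir/
├── feats.scp   # one "utt_id ark_path:offset" line per utterance (read via kaldi_io.read_mat_scp)
├── text        # one "utt_id word1 word2 ..." transcript line per utterance
└── utt2dur     # optional: one "utt_id duration_in_seconds" line; required if min/max_duration is set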
Example #29
    def _loss_function(
        self,
        logit_intent_status,
        intent_status_labels,
        logit_req_slot_status,
        requested_slot_status,
        req_slot_mask,
        logit_cat_slot_status,
        categorical_slot_status,
        cat_slot_status_mask,
        logit_cat_slot_value,
        categorical_slot_values,
        logit_noncat_slot_status,
        noncategorical_slot_status,
        noncat_slot_status_mask,
        logit_noncat_slot_start,
        logit_noncat_slot_end,
        noncategorical_slot_value_start,
        noncategorical_slot_value_end,
    ):
        # Intent loss
        intent_loss = self._cross_entropy(logit_intent_status,
                                          intent_status_labels)

        # Requested slots.
        # Shape: (batch_size, max_num_slots)
        # mask unused slots
        # Sigmoid cross entropy is used because more than one slot can be requested in a single utterance
        requested_slot_loss = self._criterion_req_slots(
            logit_req_slot_status.view(-1)[req_slot_mask],
            requested_slot_status.view(-1)[req_slot_mask])

        # Categorical slot status
        # Shape of logit_cat_slot_status: (batch_size, max_num_cat_slots, 3)
        cat_slot_status_mask = cat_slot_status_mask.view(-1) > 0.5
        if sum(cat_slot_status_mask) == 0:
            logging.warning(f'No active categorical slots in the batch')
            cat_slot_status_loss = self._cross_entropy(
                logit_cat_slot_status.view(-1, 3),
                torch.argmax(logit_cat_slot_status.view(-1, 3), dim=-1))
        else:
            cat_slot_status_loss = self._cross_entropy(
                logit_cat_slot_status.view(-1, 3)[cat_slot_status_mask],
                categorical_slot_status.view(-1)[cat_slot_status_mask],
            )

        # Categorical slot values.
        # Shape: (batch_size, max_num_cat_slots, max_num_slot_values).
        max_num_slot_values = logit_cat_slot_value.size()[-1]

        # Zero out losses for categorical slot value when the slot status is not active.
        cat_slot_value_mask = (
            categorical_slot_status == STATUS_ACTIVE).view(-1)
        # to handle cases with no active categorical slot value
        cat_slot_value_mask = cat_slot_value_mask.view(-1) > 0.5
        if sum(cat_slot_value_mask) == 0:
            logging.warning(
                f'No active values for categorical slots in the batch.')
            cat_slot_value_loss = self._cross_entropy(
                logit_cat_slot_value.view(-1, max_num_slot_values),
                torch.argmax(logit_cat_slot_value.view(-1,
                                                       max_num_slot_values),
                             dim=-1),
            )
        else:
            slot_values_active_logits = logit_cat_slot_value.view(
                -1, max_num_slot_values)[cat_slot_value_mask]
            slot_values_active_labels = categorical_slot_values.view(
                -1)[cat_slot_value_mask]
            cat_slot_value_loss = self._cross_entropy(
                slot_values_active_logits, slot_values_active_labels)

        # Non-categorical slot status.
        # Shape: (batch_size, max_num_noncat_slots, 3).
        noncat_slot_status_mask = noncat_slot_status_mask.view(-1) > 0.5
        if sum(noncat_slot_status_mask) == 0:
            logging.warning(f'No active non-categorical slots in the batch.')
            noncat_slot_status_loss = self._cross_entropy(
                logit_noncat_slot_status.view(-1, 3),
                torch.argmax(logit_noncat_slot_status.view(-1, 3), dim=-1))
        else:
            noncat_slot_status_loss = self._cross_entropy(
                logit_noncat_slot_status.view(-1, 3)[noncat_slot_status_mask],
                noncategorical_slot_status.view(-1)[noncat_slot_status_mask],
            )

        # Non-categorical slot spans.
        # Shape: (batch_size, max_num_noncat_slots, max_num_tokens).
        max_num_tokens = logit_noncat_slot_start.size()[-1]
        # Zero out losses for non-categorical slot spans when the slot status is not active.
        # changed here
        non_cat_slot_value_mask = (
            noncategorical_slot_status == STATUS_ACTIVE).view(-1)
        # non_cat_slot_value_mask = (noncategorical_slot_status > -1 ).view(-1)
        # to handle cases with no active categorical slot value
        non_cat_slot_value_mask = non_cat_slot_value_mask.view(-1)
        if sum(non_cat_slot_value_mask) == 0:
            logging.warning(
                f'No active values for non-categorical slots in the batch.')
            span_start_loss = self._cross_entropy(
                logit_noncat_slot_start.view(-1, max_num_tokens),
                torch.argmax(logit_noncat_slot_start.view(-1, max_num_tokens),
                             dim=-1),
            )
            span_end_loss = self._cross_entropy(
                logit_noncat_slot_end.view(-1, max_num_tokens),
                torch.argmax(logit_noncat_slot_end.view(-1, max_num_tokens),
                             dim=-1),
            )
        else:
            noncat_slot_start_active_logits = logit_noncat_slot_start.view(
                -1, max_num_tokens)[non_cat_slot_value_mask]
            noncat_slot_start_active_labels = noncategorical_slot_value_start.view(
                -1)[non_cat_slot_value_mask]
            span_start_loss = self._cross_entropy(
                noncat_slot_start_active_logits,
                noncat_slot_start_active_labels)

            noncat_slot_end_active_logits = logit_noncat_slot_end.view(
                -1, max_num_tokens)[non_cat_slot_value_mask]
            noncat_slot_end_active_labels = noncategorical_slot_value_end.view(
                -1)[non_cat_slot_value_mask]
            span_end_loss = self._cross_entropy(noncat_slot_end_active_logits,
                                                noncat_slot_end_active_labels)

        losses = {
            "intent_loss": intent_loss,
            "requested_slot_loss": requested_slot_loss,
            "cat_slot_status_loss": cat_slot_status_loss,
            "cat_slot_value_loss": cat_slot_value_loss,
            "noncat_slot_status_loss": noncat_slot_status_loss,
            "span_start_loss": span_start_loss,
            "span_end_loss": span_end_loss,
        }

        total_loss = sum(losses.values())
        if self.reduction == 'mean':
            total_loss = total_loss / len(losses)
        else:
            batch_size = logit_intent_status.shape[0]
            total_loss = total_loss / batch_size
        return total_loss
Example #30
def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
    """Project the tokenized prediction back to the original text."""

    # When we created the data, we kept track of the alignment between original
    # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
    # now `orig_text` contains the span of our original text corresponding to
    # the span that we predicted.
    #
    # However, `orig_text` may contain extra characters that we don't want in
    # our prediction.
    #
    # For example, let's say:
    #   pred_text = steve smith
    #   orig_text = Steve Smith's
    #
    # We don't want to return `orig_text` because it contains the extra "'s".
    #
    # We don't want to return `pred_text` because it's already been normalized
    # (the SQuAD eval script also does punctuation stripping/lower casing but
    # our tokenizer does additional normalization like stripping accent
    # characters).
    #
    # What we really want to return is "Steve Smith".
    #
    # Therefore, we have to apply a semi-complicated alignment heuristic
    # between `pred_text` and `orig_text` to get a character-to-character
    # alignment. This can fail in certain cases in which case we just return
    # `orig_text`.

    def _strip_spaces(text):
        ns_chars = []
        ns_to_s_map = collections.OrderedDict()
        for (i, c) in enumerate(text):
            if c == " ":
                continue
            ns_to_s_map[len(ns_chars)] = i
            ns_chars.append(c)
        ns_text = "".join(ns_chars)
        return ns_text, ns_to_s_map

    # We first tokenize `orig_text`, strip whitespace from the result
    # and `pred_text`, and check if they are the same length. If they are
    # NOT the same length, the heuristic has failed. If they are the same
    # length, we assume the characters are one-to-one aligned.
    tokenizer = BasicTokenizer(do_lower_case=do_lower_case)

    tok_text = " ".join(tokenizer.tokenize(orig_text))

    start_position = tok_text.find(pred_text)
    if start_position == -1:
        if verbose_logging:
            logging.warning("Unable to find text: '%s' in '%s'" %
                            (pred_text, orig_text))
        return orig_text
    end_position = start_position + len(pred_text) - 1

    (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
    (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)

    if len(orig_ns_text) != len(tok_ns_text):
        if verbose_logging:
            logging.warning(
                "Length not equal after stripping spaces: '%s' vs '%s'",
                orig_ns_text,
                tok_ns_text,
            )
        return orig_text

    # We then project the characters in `pred_text` back to `orig_text` using
    # the character-to-character alignment.
    tok_s_to_ns_map = {}
    for (i, tok_index) in tok_ns_to_s_map.items():
        tok_s_to_ns_map[tok_index] = i

    orig_start_position = None
    if start_position in tok_s_to_ns_map:
        ns_start_position = tok_s_to_ns_map[start_position]
        if ns_start_position in orig_ns_to_s_map:
            orig_start_position = orig_ns_to_s_map[ns_start_position]

    if orig_start_position is None:
        if verbose_logging:
            logging.warning("Couldn't map start position")
        return orig_text

    orig_end_position = None
    if end_position in tok_s_to_ns_map:
        ns_end_position = tok_s_to_ns_map[end_position]
        if ns_end_position in orig_ns_to_s_map:
            orig_end_position = orig_ns_to_s_map[ns_end_position]

    if orig_end_position is None:
        if verbose_logging:
            logging.warning("Couldn't map end position")
        return orig_text

    output_text = orig_text[orig_start_position:(orig_end_position + 1)]
    return output_text
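A usage sketch matching the docstring's running example, assuming BasicTokenizer splits punctuation as described:

final = get_final_text("steve smith", "Steve Smith's", do_lower_case=True)
# -> "Steve Smith": the alignment recovers the original casing and drops the trailing "'s".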