def __validate_params(self, params):
    """
    Checks whether the dictionary contains parameters of primitive types
    (string, int, float etc.) or (possibly nested) lists of primitive types.

    Args:
        params: dictionary of parameters.

    Returns:
        True if all parameters were ok, False otherwise.
    """
    ok = True

    # Iterate over parameters and check them one by one.
    for key, variable in params.items():
        if not self.__is_of_allowed_type(variable):
            logging.warning(
                "Parameter '{}' contains a variable '{}' of type '{}' which is not allowed.".format(
                    key, variable, type(variable)
                )
            )
            ok = False

    # Return the result.
    return ok
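# A minimal illustration of what __validate_params accepts (hedged sketch; the
# exact allowed set is defined by __is_of_allowed_type):
#
#   self.__validate_params({"lr": 1e-3, "name": "adam", "layers": [64, 64]})
#   # -> True: every value is a primitive or a list of primitives.
#
#   self.__validate_params({"optimizer": SomeOptimizerObject()})  # hypothetical object value
#   # -> False, with a warning naming the offending key and its type.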
def get_raw_scores(self, preds):
    """Computes the exact match and F1 scores from the examples and the model predictions."""
    exact_scores = {}
    f1_scores = {}

    for example in self.examples:
        qas_id = example.qas_id
        gold_answers = [answer["text"] for answer in example.answers if normalize_answer(answer["text"])]

        if not gold_answers:
            # For unanswerable questions, the only correct answer is the empty string.
            gold_answers = [""]

        if qas_id not in preds:
            logging.warning("Missing prediction for %s" % qas_id)
            continue

        prediction = preds[qas_id]
        exact_scores[qas_id] = max(exact_match_score(a, prediction) for a in gold_answers)
        f1_scores[qas_id] = max(f1_score(a, prediction) for a in gold_answers)

    return exact_scores, f1_scores
def create_pipeline(num_samples, batch_size, num_gpus, input_dropout, data_prefix, is_training):
    logging.info(f"Loading {data_prefix} data...")
    shuffle = args.shuffle_data if is_training else False

    data_layer = MultiWOZDataLayer(
        args.data_dir,
        data_desc.domains,
        all_domains=data_desc.all_domains,
        vocab=data_desc.vocab,
        slots=data_desc.slots,
        gating_dict=data_desc.gating_dict,
        num_samples=num_samples,
        shuffle=shuffle,
        num_workers=0,
        batch_size=batch_size,
        mode=data_prefix,
        is_training=is_training,
        input_dropout=input_dropout,
    )

    input_data = data_layer()
    data_size = len(data_layer)
    logging.info(f'The length of data layer is {data_size}')

    if data_size < batch_size:
        logging.warning("Batch_size is larger than the dataset size")
        logging.warning("Reducing batch_size to dataset size")
        batch_size = data_size

    steps_per_epoch = math.ceil(data_size / (batch_size * num_gpus))
    logging.info(f"Steps_per_epoch = {steps_per_epoch}")

    outputs, hidden = encoder(inputs=input_data.src_ids, input_lens=input_data.src_lens)
    point_outputs, gate_outputs = decoder(
        encoder_hidden=hidden,
        encoder_outputs=outputs,
        input_lens=input_data.src_lens,
        src_ids=input_data.src_ids,
        targets=input_data.tgt_ids,
    )

    gate_loss = gate_loss_fn(logits=gate_outputs, labels=input_data.gating_labels)
    ptr_loss = ptr_loss_fn(logits=point_outputs, labels=input_data.tgt_ids, length_mask=input_data.tgt_lens)
    total_loss = total_loss_fn(loss_1=gate_loss, loss_2=ptr_loss)

    if is_training:
        tensors_to_evaluate = [total_loss, gate_loss, ptr_loss]
    else:
        tensors_to_evaluate = [
            total_loss,
            point_outputs,
            gate_outputs,
            input_data.gating_labels,
            input_data.turn_domain,
            input_data.tgt_ids,
            input_data.tgt_lens,
        ]

    return tensors_to_evaluate, total_loss, ptr_loss, gate_loss, steps_per_epoch, data_layer
def create_pipeline(num_samples=-1, batch_size=32, num_gpus=1, mode='train', is_training=True):
    logging.info(f"Loading {mode} data...")
    data_file = f'{data_desc.data_dir}/{mode}.tsv'
    shuffle = args.shuffle_data if is_training else False

    data_layer = nemo_nlp.nm.data_layers.BertTextClassificationDataLayer(
        input_file=data_file,
        tokenizer=tokenizer,
        max_seq_length=args.max_seq_length,
        num_samples=num_samples,
        shuffle=shuffle,
        batch_size=batch_size,
        use_cache=args.use_cache,
    )

    ids, type_ids, input_mask, labels = data_layer()
    data_size = len(data_layer)

    if data_size < batch_size:
        logging.warning("Batch_size is larger than the dataset size")
        logging.warning("Reducing batch_size to dataset size")
        batch_size = data_size

    steps_per_epoch = math.ceil(data_size / (batch_size * num_gpus))

    hidden_states = model(input_ids=ids, token_type_ids=type_ids, attention_mask=input_mask)

    logits = classifier(hidden_states=hidden_states)
    loss = loss_fn(logits=logits, labels=labels)

    if is_training:
        tensors_to_evaluate = [loss, logits]
    else:
        tensors_to_evaluate = [logits, labels]

    return tensors_to_evaluate, loss, steps_per_epoch, data_layer
def write_timestamped(contents, dir=None, name=None, mode="wb"):
    """
    Writes the supplied contents to a timestamped file path in the specified directory.

    Args:
        contents (bytes-like object or callable): Either a bytes-like object that can be
            written to disk, or a callable which will return such an object.
        dir (str): The directory to write into.
        name (str): The name of the file.

    Optional Args:
        mode (str): The mode to use when writing. Defaults to "wb".

    Returns:
        str: The complete file path, or None if nothing was written.
    """
    if dir is not None:
        if not os.path.exists(dir):
            logging.debug("{:} does not exist, creating now.".format(dir))
        os.makedirs(dir, exist_ok=True)

        path = timestamped_filepath(dir, name)

        if callable(contents):
            contents = contents()

        if os.path.exists(path):
            logging.warning("{:} already exists. Will not overwrite.".format(path))
        else:
            with open(path, mode) as f:
                logging.info("Writing to {:}".format(path))
                f.write(contents)
            return path
    return None
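# A hedged usage sketch (timestamped_filepath is the helper assumed above to
# build the actual timestamped path; serialize_engine is hypothetical):
#
#   path = write_timestamped(b"\x00\x01", dir="artifacts", name="engine.plan")
#
#   # Passing a callable defers construction: it is only invoked after the
#   # directory exists, avoiding wasted work when dir is None.
#   path = write_timestamped(lambda: serialize_engine(), dir="artifacts", name="engine.plan")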
def list_pretrained_models() -> Optional[List[PretrainedModelInfo]]:
    """List all available pre-trained models (e.g. weights) for convolutional
    encoder-decoder CTC-based speech recognition models.

    Returns:
        A list of PretrainedModelInfo tuples.
        The pretrained_model_name field of the tuple can be used to
        retrieve pre-trained model's weights (pass it as
        pretrained_model_name argument to the module's constructor)
    """
    logging.warning("TODO: CHANGE ME TO GRAB STUFF FROM NGC")
    result = []
    model = PretrainedModelInfo(
        pretrained_model_name="QuartzNet15x5-En",
        location="https://nemo-public.s3.us-east-2.amazonaws.com/nemo_0.11_models_test/QuartzNet15x5-En-Base.nemo",
        description="The model is trained on ~3300 hours of publicly available data and achieves a WER of 3.91% "
        "on LibriSpeech dev-clean, and a WER of 10.58% on dev-other.",
        parameters="",
    )
    result.append(model)

    model = PretrainedModelInfo(
        pretrained_model_name="QuartzNet15x5-Zh",
        location="https://nemo-public.s3.us-east-2.amazonaws.com/nemo_0.11_models_test/QuartzNet15x5-Zh-Base.nemo",
        description="The model is trained on ai-shell2 mandarin chinese dataset.",
        parameters="",
    )
    result.append(model)
    return result
def deduce_format(shape):
    """
    Guesses the data format of a given shape.

    Args:
        shape (Tuple[int]): The shape, including batch dimension.

    Returns:
        DataFormat: The deduced data format.
    """

    # The smaller this ratio, the closer a and b are.
    def minmax_ratio(a, b):
        return abs(max(a, b) / min(a, b))

    # Assume all shapes include batch dimension
    if len(shape) == 4:
        # Typically, H and W are quite close, so if minmax_ratio(0, 1) > minmax_ratio(1, 2),
        # then we assume CHW.
        if minmax_ratio(shape[1], shape[2]) > minmax_ratio(shape[2], shape[3]):
            return DataFormat.NCHW
        return DataFormat.NHWC
    elif len(shape) == 3:
        return DataFormat.NHW
    elif len(shape) == 2:
        return DataFormat.NW
    else:
        logging.warning(
            "Cannot deduce format for " + str(shape)
            + ". Currently only implemented for input_buffers with 1-3 non-batch dimensions. Please update this function!"
        )
        return DataFormat.UNKNOWN
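# A minimal usage sketch of the heuristic above (DataFormat is the enum this
# helper returns; shapes include the batch dimension):
#
#   deduce_format((32, 3, 224, 224))   # ratio(3, 224) > ratio(224, 224) -> DataFormat.NCHW
#   deduce_format((32, 224, 224, 3))   # ratio(224, 224) < ratio(224, 3) -> DataFormat.NHWC
#   deduce_format((32, 128, 80))       # 2 non-batch dims               -> DataFormat.NHW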
def download_wkt2(data_dir):
    os.makedirs('data/lm', exist_ok=True)
    logging.warning(f'Data not found at {data_dir}. Downloading wikitext-2 to data/lm')
    data_dir = 'data/lm/wikitext-2'
    subprocess.call('scripts/get_wkt2.sh')
    return data_dir
def check(self, model):
    try:
        onnx.checker.check_model(model)
        logging.debug("ONNX Checker Passed")
    except onnx.checker.ValidationError as err:
        logging.warning("ONNX Checker exited with an error: {:}".format(err))
    return model
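# A hedged usage sketch (assumes the onnx package; `checker` stands for an
# instance of whatever class this method belongs to):
#
#   import onnx
#   model = onnx.load("model.onnx")
#   model = checker.check(model)  # logs a warning instead of raising on validation errors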
def create_pipeline(num_samples=-1, batch_size=32, num_gpus=1, mode='train'):
    logging.info(f"Loading {mode} data...")
    data_file = f'{data_desc.data_dir}/{mode}.tsv'
    slot_file = f'{data_desc.data_dir}/{mode}_slots.tsv'
    shuffle = args.shuffle_data if mode == 'train' else False

    data_layer = BertJointIntentSlotDataLayer(
        input_file=data_file,
        slot_file=slot_file,
        pad_label=data_desc.pad_label,
        tokenizer=tokenizer,
        max_seq_length=args.max_seq_length,
        num_samples=num_samples,
        shuffle=shuffle,
        batch_size=batch_size,
        ignore_extra_tokens=args.ignore_extra_tokens,
        ignore_start_end=args.ignore_start_end,
    )

    input_data = data_layer()
    data_size = len(data_layer)
    logging.info(f'The length of data layer is {data_size}')

    if data_size < batch_size:
        logging.warning("Batch_size is larger than the dataset size")
        logging.warning("Reducing batch_size to dataset size")
        batch_size = data_size

    steps_per_epoch = math.ceil(data_size / (batch_size * num_gpus))
    logging.info(f"Steps_per_epoch = {steps_per_epoch}")

    hidden_states = pretrained_bert_model(
        input_ids=input_data.input_ids,
        token_type_ids=input_data.input_type_ids,
        attention_mask=input_data.input_mask,
    )

    intent_logits, slot_logits = classifier(hidden_states=hidden_states)

    intent_loss = intent_loss_fn(logits=intent_logits, labels=input_data.intents)
    slot_loss = slot_loss_fn(logits=slot_logits, labels=input_data.slots, loss_mask=input_data.loss_mask)
    total_loss = total_loss_fn(loss_1=intent_loss, loss_2=slot_loss)

    if mode == 'train':
        tensors_to_evaluate = [total_loss, intent_logits, slot_logits]
    else:
        tensors_to_evaluate = [
            intent_logits,
            slot_logits,
            input_data.intents,
            input_data.slots,
            input_data.subtokens_mask,
        ]

    return tensors_to_evaluate, total_loss, steps_per_epoch, data_layer
def __init__(self, dataset_name, data_dir, do_lower_case):
    if dataset_name == 'wikitext-2':
        if not os.path.exists(data_dir):
            data_dir = download_wkt2(data_dir)
        self.vocab_size = create_vocab_lm(data_dir, do_lower_case)
        self.data_dir = data_dir
    else:
        logging.warning(
            "Looks like you passed a dataset name that isn't "
            "already supported by NeMo. Please make sure that "
            "you build the preprocessing method for it."
        )
def from_config(cls, config):
    ptbs = []
    for p in config:
        if p['aug_type'] not in perturbation_types:
            logging.warning("%s perturbation not known. Skipping.", p['aug_type'])
            continue
        perturbation = perturbation_types[p['aug_type']]
        ptbs.append((p['prob'], perturbation(**p['cfg'])))
    return cls(perturbations=ptbs)
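# A hedged usage sketch: the 'white_noise' aug_type and its cfg keys are
# hypothetical (valid names come from the perturbation_types registry), and
# AudioAugmentor stands in for whatever class this classmethod belongs to:
#
#   config = [
#       {'aug_type': 'white_noise', 'prob': 0.5, 'cfg': {'min_level': -90, 'max_level': -46}},
#   ]
#   augmentor = AudioAugmentor.from_config(config)
#
# Each entry becomes a (probability, perturbation-instance) pair; unknown
# aug_types are skipped with a warning rather than raising.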
def infer(self, memory, memory_lengths):
    """ Decoder inference
    PARAMS
    ------
    memory: Encoder outputs
    memory_lengths: Encoder output lengths (used to build the attention mask)

    RETURNS
    -------
    mel_outputs: mel outputs from the decoder
    gate_outputs: gate outputs from the decoder
    alignments: sequence of attention weights from the decoder
    mel_lengths: lengths of the generated mel spectrograms
    """
    decoder_input = self.get_go_frame(memory)

    if memory.size(0) > 1:
        mask = ~get_mask_from_lengths(memory_lengths)
    else:
        mask = None

    self.initialize_decoder_states(memory, mask=mask)

    mel_lengths = torch.zeros([memory.size(0)], dtype=torch.int32)
    not_finished = torch.ones([memory.size(0)], dtype=torch.int32)
    if torch.cuda.is_available():
        mel_lengths = mel_lengths.cuda()
        not_finished = not_finished.cuda()

    mel_outputs, gate_outputs, alignments = [], [], []
    while True:
        decoder_input = self.prenet(decoder_input, inference=True)
        mel_output, gate_output, alignment = self.decode(decoder_input)

        dec = torch.le(torch.sigmoid(gate_output.data), self.gate_threshold).to(torch.int32).squeeze(1)

        not_finished = not_finished * dec
        mel_lengths += not_finished

        if self.early_stopping and torch.sum(not_finished) == 0:
            break

        mel_outputs += [mel_output.squeeze(1)]
        gate_outputs += [gate_output]
        alignments += [alignment]

        if len(mel_outputs) == self.max_decoder_steps:
            logging.warning("Reached max decoder steps")
            break

        decoder_input = mel_output

    mel_outputs, gate_outputs, alignments = self.parse_decoder_outputs(mel_outputs, gate_outputs, alignments)

    return mel_outputs, gate_outputs, alignments, mel_lengths
def __init__(self, dataset_name, data_dir, do_lower_case):
    if dataset_name == 'wikitext-2':
        if not os.path.exists(data_dir):
            raise FileNotFoundError(
                "Dataset not found. Run './get_wkt2.sh DATA_DIR' from examples/nlp/scripts"
            )
        self.vocab_size = self.create_vocab_lm(data_dir, do_lower_case)
        self.data_dir = data_dir
    else:
        logging.warning(
            "Looks like you passed a dataset name that isn't "
            "already supported by NeMo. Please make sure that "
            "you build the preprocessing method for it."
        )
def resize(self, name, shape):
    found = False
    for buf_dict in [self.device_buffers, self.host_outputs]:
        if name in buf_dict:
            found = True
            buf_dict[name].resize(shape)
    if not found:
        logging.warning("Buffer: {:} was not found, could not resize".format(name))
    else:
        logging.debug("Resizing {:} buffer to {:}".format(name, shape))
def add_categorical_slots(self, state_update, agg_sys_state):
    """Add features for categorical slots."""
    categorical_slots = self.service_schema.categorical_slots
    self.num_categorical_slots = len(categorical_slots)
    for slot_idx, slot in enumerate(categorical_slots):
        values = state_update.get(slot, [])

        # Add categorical slot value features.
        slot_values = self.service_schema.get_categorical_slot_values(slot)
        self.num_categorical_slot_values[slot_idx] = len(slot_values)
        # Set slot mask to 1, i.e. the slot exists in the service.
        self.cat_slot_status_mask[slot_idx] = 1
        # Set the number of active slot values for this slot in the service.
        for slot_value_idx in range(len(self.service_schema._categorical_slot_values[slot])):
            self.cat_slot_values_mask[slot_idx][slot_value_idx] = 1

        if not values:
            self.categorical_slot_status[slot_idx] = STATUS_OFF
        elif values[0] == STR_DONTCARE:
            self.categorical_slot_status[slot_idx] = STATUS_DONTCARE
        else:
            value_id = self.service_schema.get_categorical_slot_value_id(slot, values[0])
            if value_id < 0:
                logging.warning(
                    f"Categorical value not found: EXAMPLE_ID:{self.example_id}, EXAMPLE_ID_NUM:{self.example_id_num}"
                )
                logging.warning(f"SYSTEM: {self.system_utterance} || USER: {self.user_utterance}")
            else:
                if values[0] not in agg_sys_state.get(slot, []):
                    self.categorical_slot_status[slot_idx] = STATUS_ACTIVE
                    self.categorical_slot_values[slot_idx] = value_id
                else:
                    if self._add_carry_status:
                        self.categorical_slot_status[slot_idx] = STATUS_CARRY
                    else:
                        self.categorical_slot_status[slot_idx] = STATUS_ACTIVE

                    if self._add_carry_value:
                        self.categorical_slot_values[slot_idx] = self.service_schema.get_categorical_slot_value_id(
                            slot, "#CARRYVALUE#"
                        )
                        logging.debug(
                            f"Found slot:{slot}, value:{values[0]}, slot_id:{self.categorical_slot_values[slot_idx]} in prev states: {agg_sys_state}"
                        )
                    else:
                        self.categorical_slot_values[slot_idx] = value_id
def __init__(self, reduction='mean'):
    """
    Args:
        reduction (str): specifies the reduction to apply to the final loss, choose 'mean' or 'sum'
    """
    super().__init__()
    if reduction not in ['mean', 'sum']:
        logging.warning(f'{reduction} reduction is not supported. Setting reduction to "mean"')
        reduction = 'mean'

    self.reduction = reduction
    self._cross_entropy = torch.nn.CrossEntropyLoss(reduction=self.reduction)
    self._criterion_req_slots = torch.nn.BCEWithLogitsLoss(reduction=self.reduction)
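# A hedged usage sketch (SGDDialogueStateLoss is a placeholder name for the
# class this __init__ belongs to):
#
#   loss_fn = SGDDialogueStateLoss(reduction='sum')
#   loss_fn = SGDDialogueStateLoss(reduction='max')  # unsupported -> falls back to 'mean' with a warning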
def get_input_metadata_from_profile(profile, network):
    input_metadata = OrderedDict()
    for index in range(network.num_inputs):
        tensor = network.get_input(index)
        # shapes is the (min, opt, max) triple for profile 0.
        if tensor.is_shape_tensor:
            shapes = profile.get_shape_input(tensor.name)
        else:
            shapes = profile.get_shape(tensor.name)

        if tuple(shapes[0]) != tuple(shapes[2]):
            logging.warning("In profile 0, min != max, using opt shapes for calibration")
        # Always use opt shape
        input_metadata[tensor.name] = (trt.nptype(tensor.dtype), shapes[1])
    return input_metadata
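# A hedged usage sketch (assumes a built TensorRT INetworkDefinition plus the
# IOptimizationProfile used at build time; get_shape returns a (min, opt, max)
# triple, and the opt entry is what ends up in the metadata):
#
#   profile = builder.create_optimization_profile()
#   profile.set_shape("input", min=(1, 3, 224, 224), opt=(8, 3, 224, 224), max=(32, 3, 224, 224))
#   meta = get_input_metadata_from_profile(profile, network)
#   # meta["input"] -> (numpy dtype from trt.nptype, (8, 3, 224, 224))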
def __init__(self, dataset_name, data_dir, vocab_size, sample_size, special_tokens, train_file=''):
    if dataset_name == 'wikitext-2':
        if not os.path.exists(data_dir):
            data_dir = download_wkt2(data_dir)
        self.data_dir, self.tokenizer_model = create_vocab_mlm(
            data_dir, vocab_size, sample_size, special_tokens, train_file
        )
    else:
        logging.warning(
            "Looks like you passed a dataset name that isn't "
            "already supported by NeMo. Please make sure that "
            "you build the preprocessing method for it."
        )

    self.train_file = f'{data_dir}/train.txt'
    self.eval_file = f'{data_dir}/valid.txt'
    self.test_file = f'{data_dir}/test.txt'
def __init__(self, data_dir, modes=['train', 'test', 'dev']):
    self.data_dir = data_dir
    max_label_id = 0
    for mode in modes:
        if not if_exist(self.data_dir, [f'{mode}.tsv']):
            logging.info(f'Stats calculation for {mode} mode is skipped as {mode}.tsv was not found.')
            continue

        input_file = f'{self.data_dir}/{mode}.tsv'
        with open(input_file, 'r') as f:
            input_lines = f.readlines()[1:]  # Skipping headers at index 0

        try:
            int(input_lines[0].strip().split()[-1])
        except ValueError:
            logging.warning(f'No numerical labels found for {mode}.tsv.')
            raise

        # Note: despite its name, raw_sentences collects the integer label of
        # each sentence; it is only used for label statistics below.
        queries, raw_sentences = [], []
        for input_line in input_lines:
            parts = input_line.strip().split()
            label = int(parts[-1])
            raw_sentences.append(label)
            queries.append(' '.join(parts[:-1]))

        infold = input_file[:input_file.rfind('/')]

        logging.info(f'Three most popular classes in {mode} dataset')
        total_sents, sent_label_freq, max_id = get_label_stats(
            raw_sentences, infold + f'/{mode}_sentence_stats.tsv'
        )
        max_label_id = max(max_label_id, max_id)

        if mode == 'train':
            class_weights_dict = get_freq_weights(sent_label_freq)
            logging.info(f'Class Weights: {class_weights_dict}')

        logging.info(f'Total Sentences: {total_sents}')
        logging.info(f'Sentence class frequencies - {sent_label_freq}')

    self.class_weights = fill_class_weights(class_weights_dict, max_label_id)
    self.num_labels = max_label_id + 1
def _create_config_header(self):
    """
    A protected method that creates a header stored later in the configuration file.
    """
    # Get module "full specification".
    module_full_spec = str(self.__module__) + "." + str(self.__class__.__qualname__)
    module_class_name = type(self).__name__

    # Check whether module belongs to a collection.
    spec_list = module_full_spec.split(".")

    # Do not check Neural Modules from unit tests.
    if spec_list[0] == "tests":
        # Set collection variables.
        collection_type = "tests"
        collection_version = None
    else:
        # Check if component belongs to any collection
        if len(spec_list) < 3 or (spec_list[0] != "nemo" and spec_list[1] != "collection"):
            logging.warning(
                "Module `{}` does not belong to any collection. This won't be allowed in the next release.".format(
                    module_class_name
                )
            )
            collection_type = "unknown"
            collection_version = None
        else:
            # Ok, set collection.
            collection_type = spec_list[2]
            collection_version = None  # TODO: to be SET!

    # Create a "header" with module "specification".
    header = {
        "nemo_core_version": nemo_version,
        "collection_type": collection_type,
        "collection_version": collection_version,
        # "class": module_class_name,  # Operating only on full_spec now.
        "full_spec": module_full_spec,
    }
    return header
def __init__(
    self,
    data_layers: List[DataLayerNM],
    batch_size: int,
    shuffle: bool = False,
    combination_mode: DataCombination = DataCombination.CROSSPRODUCT,
    port_names: List[str] = None,
):
    """
    data_layers: (list) of DataLayerNM objects
    batch_size: (int) batch size when the underlying dataset is loaded
    combination_mode: (DataCombination) defines how to combine the datasets.
    shuffle: (bool) whether underlying multi dataset should be shuffled in each epoch
    port_names: List(str) user can override all port names if specified
    """
    super().__init__()
    self._data_layers = data_layers
    self._batch_size = batch_size
    self._shuffle = shuffle
    self._combination_mode = combination_mode
    self._port_names = port_names
    self._dataset = MultiDataset(
        datasets=[dl.dataset for dl in self._data_layers], combination_mode=combination_mode
    )

    self._ports = dict()
    if self._port_names:
        i = 0
        for dl in self._data_layers:
            for _, port_type in dl.output_ports.items():
                self._ports[self._port_names[i]] = port_type
                i += 1
    else:
        for dl_idx, dl in enumerate(self._data_layers):
            for port_name, port_type in dl.output_ports.items():
                if port_name in self._ports:
                    logging.warning(f"name collision {port_name}, will rename")
                    self._ports[f"{port_name}_{dl_idx}"] = port_type
                else:
                    self._ports[port_name] = port_type
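# A hedged illustration of the port-naming rule above: when two data layers
# expose a port with the same name, the later occurrence is renamed by
# appending its data-layer index, e.g.
#
#   dl_0.output_ports -> {"input_ids", "labels"}
#   dl_1.output_ports -> {"input_ids"}
#   merged ports      -> {"input_ids", "labels", "input_ids_1"}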
def list_pretrained_models() -> Optional[List[PretrainedModelInfo]]:
    """List all available pre-trained models (e.g. weights) for convolutional
    encoder-decoder CTC-based speech recognition models.

    Returns:
        A list of PretrainedModelInfo tuples.
        The pretrained_model_name field of the tuple can be used to
        retrieve pre-trained model's weights (pass it as
        pretrained_model_name argument to the module's constructor)
    """
    logging.warning("TODO: CHANGE ME TO GRAB STUFF FROM NGC")
    result = []
    model = PretrainedModelInfo(
        pretrained_model_name="JasperNet10x5-En",
        location="https://nemo-public.s3.us-east-2.amazonaws.com/nemo_0.11_models_test/JasperNet10x5-En-Base.nemo",
        description="The model achieves a WER of 3.46% on LibriSpeech dev-clean, 10.40% on dev-other, "
        "3.69% on test-clean, and 10.49% on test-other.",
        parameters="",
    )
    result.append(model)
    return result
def create_pipeline(num_samples=-1, batch_size=32, num_gpus=1, local_rank=0, mode='train'):
    logging.info(f"Loading {mode} data...")
    data_file = f'{data_desc.data_dir}/{mode}.tsv'
    shuffle = args.shuffle_data if mode == 'train' else False

    data_layer = nemo.collections.nlp.nm.data_layers.text_classification_datalayer.BertTextClassificationDataLayer(
        input_file=data_file,
        tokenizer=tokenizer,
        max_seq_length=args.max_seq_length,
        num_samples=num_samples,
        shuffle=shuffle,
        batch_size=batch_size,
    )

    ids, type_ids, input_mask, labels = data_layer()
    data_size = len(data_layer)

    if data_size < batch_size:
        logging.warning("Batch_size is larger than the dataset size")
        logging.warning("Reducing batch_size to dataset size")
        batch_size = data_size

    steps_per_epoch = math.ceil(data_size / (batch_size * num_gpus))
    logging.info(f"Steps_per_epoch = {steps_per_epoch}")

    hidden_states = pretrained_bert_model(input_ids=ids, token_type_ids=type_ids, attention_mask=input_mask)

    logits = classifier(hidden_states=hidden_states)
    loss = loss_fn(logits=logits, labels=labels)

    if mode == 'train':
        tensors_to_evaluate = [loss, logits]
    else:
        tensors_to_evaluate = [logits, labels]

    return tensors_to_evaluate, loss, steps_per_epoch, data_layer
def send_on_queue(queue, obj):
    if not is_pickleable(obj):
        logging.warning("Cannot pickle: {:}. Sending None instead".format(obj))
        queue.put(None)
        return

    if sys.getsizeof(obj) > PIPE_MAX_SEND_BYTES:
        logging.warning(
            "Object size ({:} bytes) exceeds maximum size that can be sent over queues ({:} bytes). "
            "Attempting to compress - this may take some time. If this does not work or you want to "
            "avoid the compression overhead, you should disable subprocesses via the --no-subprocess "
            "flag, or by setting use_subprocess=False in Comparator.run().".format(
                sys.getsizeof(obj), PIPE_MAX_SEND_BYTES
            )
        )
        obj = compress(obj)

    if sys.getsizeof(obj) > PIPE_MAX_SEND_BYTES:
        logging.warning("Compressed object is still too large to send. Sending None instead.")
        queue.put(None)
        return

    logging.info("Sending: {:} on queue".format(obj))
    queue.put(obj)
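# A hedged usage sketch with multiprocessing (is_pickleable and compress are
# the helpers assumed by this module; PIPE_MAX_SEND_BYTES is its size cap):
#
#   from multiprocessing import Queue
#   q = Queue()
#   send_on_queue(q, {"outputs": results})
#   received = q.get()  # None if the object could not be pickled or stayed too large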
def get_features(
    queries,
    max_seq_length,
    tokenizer,
    label_ids=None,
    pad_label='O',
    raw_labels=None,
    ignore_extra_tokens=False,
    ignore_start_end=False,
):
    """
    Args:
        queries (list of str): text sequences
        max_seq_length (int): max sequence length minus 2 for [CLS] and [SEP]
        tokenizer (Tokenizer): such as NemoBertTokenizer
        label_ids (dict): dict to map labels to label ids.
            Starts with pad_label->0 and then increases in alphabetical order.
            Required for training and evaluation, not needed for inference.
        pad_label (str): pad value to use for labels. By default, it's the neutral label.
        raw_labels (list of str): list of labels for every word in a sequence
        ignore_extra_tokens (bool): whether to ignore extra tokens in the loss_mask
        ignore_start_end (bool): whether to ignore bos and eos tokens in the loss_mask
    """
    all_subtokens = []
    all_loss_mask = []
    all_subtokens_mask = []
    all_segment_ids = []
    all_input_ids = []
    all_input_mask = []
    sent_lengths = []
    all_labels = []
    with_label = False

    if raw_labels is not None:
        with_label = True

    for i, query in enumerate(queries):
        words = query.strip().split()

        # add bos token
        subtokens = ['[CLS]']
        loss_mask = [1 - ignore_start_end]
        subtokens_mask = [0]
        if with_label:
            pad_id = label_ids[pad_label]
            labels = [pad_id]
            query_labels = [label_ids[lab] for lab in raw_labels[i]]

        for j, word in enumerate(words):
            word_tokens = tokenizer.text_to_tokens(word)
            subtokens.extend(word_tokens)

            loss_mask.append(1)
            loss_mask.extend([int(not ignore_extra_tokens)] * (len(word_tokens) - 1))

            subtokens_mask.append(1)
            subtokens_mask.extend([0] * (len(word_tokens) - 1))

            if with_label:
                labels.extend([query_labels[j]] * len(word_tokens))

        # add eos token
        subtokens.append('[SEP]')
        loss_mask.append(1 - ignore_start_end)
        subtokens_mask.append(0)
        sent_lengths.append(len(subtokens))
        all_subtokens.append(subtokens)
        all_loss_mask.append(loss_mask)
        all_subtokens_mask.append(subtokens_mask)
        all_input_mask.append([1] * len(subtokens))
        if with_label:
            labels.append(pad_id)
            all_labels.append(labels)

    max_seq_length = min(max_seq_length, max(sent_lengths))
    logging.info(f'Max length: {max_seq_length}')
    datasets_utils.get_stats(sent_lengths)
    too_long_count = 0

    for i, subtokens in enumerate(all_subtokens):
        if len(subtokens) > max_seq_length:
            subtokens = ['[CLS]'] + subtokens[-max_seq_length + 1 :]
            all_input_mask[i] = [1] + all_input_mask[i][-max_seq_length + 1 :]
            all_loss_mask[i] = [int(not ignore_start_end)] + all_loss_mask[i][-max_seq_length + 1 :]
            all_subtokens_mask[i] = [0] + all_subtokens_mask[i][-max_seq_length + 1 :]

            if with_label:
                all_labels[i] = [pad_id] + all_labels[i][-max_seq_length + 1 :]
            too_long_count += 1

        all_input_ids.append([tokenizer.tokens_to_ids(t) for t in subtokens])

        if len(subtokens) < max_seq_length:
            extra = max_seq_length - len(subtokens)
            all_input_ids[i] = all_input_ids[i] + [0] * extra
            all_loss_mask[i] = all_loss_mask[i] + [0] * extra
            all_subtokens_mask[i] = all_subtokens_mask[i] + [0] * extra
            all_input_mask[i] = all_input_mask[i] + [0] * extra

            if with_label:
                all_labels[i] = all_labels[i] + [pad_id] * extra

        all_segment_ids.append([0] * max_seq_length)

    logging.warning(f'{too_long_count} sentences are longer than {max_seq_length} and were truncated')

    for i in range(min(len(all_input_ids), 5)):
        logging.debug("*** Example ***")
        logging.debug("i: %s", i)
        logging.debug("subtokens: %s", " ".join(list(map(str, all_subtokens[i]))))
        logging.debug("loss_mask: %s", " ".join(list(map(str, all_loss_mask[i]))))
        logging.debug("input_mask: %s", " ".join(list(map(str, all_input_mask[i]))))
        logging.debug("subtokens_mask: %s", " ".join(list(map(str, all_subtokens_mask[i]))))
        if with_label:
            logging.debug("labels: %s", " ".join(list(map(str, all_labels[i]))))

    return (all_input_ids, all_segment_ids, all_input_mask, all_loss_mask, all_subtokens_mask, all_labels)
def __init__(
    self,
    text_file,
    label_file,
    max_seq_length,
    tokenizer,
    num_samples=-1,
    shuffle=False,
    pad_label='O',
    label_ids=None,
    ignore_extra_tokens=False,
    ignore_start_end=False,
    use_cache=False,
):
    if use_cache:
        # Cache features
        data_dir = os.path.dirname(text_file)
        filename = os.path.basename(text_file)

        if not filename.endswith('.txt'):
            raise ValueError(f"{text_file} should have extension .txt")

        features_pkl = os.path.join(data_dir, filename[:-4] + "_features.pkl")
        label_ids_pkl = os.path.join(data_dir, "label_ids.pkl")

    if use_cache and os.path.exists(features_pkl) and os.path.exists(label_ids_pkl):
        # If text_file was already processed, load from pickle
        features = pickle.load(open(features_pkl, 'rb'))
        logging.info(f'features restored from {features_pkl}')

        label_ids = pickle.load(open(label_ids_pkl, 'rb'))
        logging.info(f'Labels to ids dict restored from {label_ids_pkl}')
    else:
        if num_samples == 0:
            raise ValueError("num_samples has to be positive", num_samples)

        with open(text_file, 'r') as f:
            text_lines = f.readlines()

        # Collect all possible labels
        unique_labels = set([])
        labels_lines = []
        with open(label_file, 'r') as f:
            for line in f:
                line = line.strip().split()
                labels_lines.append(line)
                unique_labels.update(line)

        if len(labels_lines) != len(text_lines):
            raise ValueError("Labels file should contain labels for every word")

        if shuffle or num_samples > 0:
            dataset = list(zip(text_lines, labels_lines))
            random.shuffle(dataset)

            if num_samples > 0:
                dataset = dataset[:num_samples]

            dataset = list(zip(*dataset))
            text_lines = dataset[0]
            labels_lines = dataset[1]

        # for dev/test sets use label mapping from training set
        if label_ids:
            if len(label_ids) != len(unique_labels):
                logging.warning(
                    'Not all labels from the specified'
                    ' label_ids dictionary are present in the'
                    ' current dataset. Using the provided'
                    ' label_ids dictionary.'
                )
            else:
                logging.info('Using the provided label_ids dictionary.')
        else:
            logging.info(
                'Creating a new label to label_id dictionary.'
                ' It\'s recommended to use label_ids generated'
                ' during training for dev/test sets to avoid'
                ' errors if some labels are not'
                ' present in the dev/test sets.'
                ' For training set label_ids should be None.'
            )

            label_ids = {pad_label: 0}
            if pad_label in unique_labels:
                unique_labels.remove(pad_label)
            for label in sorted(unique_labels):
                label_ids[label] = len(label_ids)

        features = get_features(
            text_lines,
            max_seq_length,
            tokenizer,
            pad_label=pad_label,
            raw_labels=labels_lines,
            label_ids=label_ids,
            ignore_extra_tokens=ignore_extra_tokens,
            ignore_start_end=ignore_start_end,
        )

        if use_cache:
            pickle.dump(features, open(features_pkl, "wb"))
            logging.info(f'features saved to {features_pkl}')

            pickle.dump(label_ids, open(label_ids_pkl, "wb"))
            logging.info(f'labels to ids dict saved to {label_ids_pkl}')

    self.all_input_ids = features[0]
    self.all_segment_ids = features[1]
    self.all_input_mask = features[2]
    self.all_loss_mask = features[3]
    self.all_subtokens_mask = features[4]
    self.all_labels = features[5]
    self.label_ids = label_ids

    infold = text_file[: text_file.rfind('/')]
    merged_labels = itertools.chain.from_iterable(self.all_labels)
    logging.info('Three most popular labels')
    _, self.label_frequencies = datasets_utils.get_label_stats(merged_labels, infold + '/label_stats.tsv')

    # save label_ids
    out = open(infold + '/label_ids.csv', 'w')
    labels, _ = zip(*sorted(self.label_ids.items(), key=lambda x: x[1]))
    out.write('\n'.join(labels))
    logging.info(f'Labels: {self.label_ids}')
    logging.info(f'Labels mapping saved to : {out.name}')
def __init__(
    self,
    kaldi_dir,
    labels,
    min_duration=None,
    max_duration=None,
    max_utts=0,
    unk_index=-1,
    blank_index=-1,
    normalize=True,
    eos_id=None,
):
    self.eos_id = eos_id
    self.unk_index = unk_index
    self.blank_index = blank_index
    self.labels_map = {label: i for i, label in enumerate(labels)}

    data = []
    duration = 0.0
    filtered_duration = 0.0

    # Read Kaldi features (MFCC, PLP) using feats.scp
    feats_path = os.path.join(kaldi_dir, 'feats.scp')
    id2feats = {utt_id: torch.from_numpy(feats) for utt_id, feats in kaldi_io.read_mat_scp(feats_path)}

    # Get durations, if utt2dur exists
    utt2dur_path = os.path.join(kaldi_dir, 'utt2dur')
    id2dur = {}
    if os.path.exists(utt2dur_path):
        with open(utt2dur_path, 'r') as f:
            for line in f:
                utt_id, dur = line.split()
                id2dur[utt_id] = float(dur)
    elif max_duration or min_duration:
        raise ValueError(
            f"KaldiFeatureDataset max_duration or min_duration is set but"
            f" utt2dur file not found in {kaldi_dir}."
        )
    else:
        logging.info(
            f"Did not find utt2dur when loading data from "
            f"{kaldi_dir}. Skipping dataset duration calculations."
        )

    # Match transcripts to features
    text_path = os.path.join(kaldi_dir, 'text')
    parser = parsers.make_parser(labels, 'en', unk_id=unk_index, blank_id=self.blank_index, do_normalize=normalize)
    with open(text_path, 'r') as f:
        for line in f:
            split_idx = line.find(' ')
            utt_id = line[:split_idx]

            audio_features = id2feats.get(utt_id)

            if audio_features is not None:
                text = line[split_idx:].strip()
                # if normalize:
                #     # TODO: WTF?
                #     text = parser._normalize(text)
                dur = id2dur[utt_id] if id2dur else None

                # Filter by duration if specified & utt2dur exists
                if min_duration and dur < min_duration:
                    filtered_duration += dur
                    continue
                if max_duration and dur > max_duration:
                    filtered_duration += dur
                    continue

                sample = {
                    'utt_id': utt_id,
                    'text': text,
                    'tokens': parser(text),
                    'audio': audio_features.t(),
                    'duration': dur,
                }
                data.append(sample)
                duration += dur

                if max_utts > 0 and len(data) >= max_utts:
                    logging.warning(f"Stop parsing due to max_utts ({max_utts})")
                    break

    if id2dur:
        # utt2dur durations are in seconds
        logging.info(
            f"Dataset loaded with {duration / 3600 : .2f} hours. "
            f"Filtered {filtered_duration / 3600 : .2f} hours."
        )

    self.data = data
def _loss_function(
    self,
    logit_intent_status,
    intent_status_labels,
    logit_req_slot_status,
    requested_slot_status,
    req_slot_mask,
    logit_cat_slot_status,
    categorical_slot_status,
    cat_slot_status_mask,
    logit_cat_slot_value,
    categorical_slot_values,
    logit_noncat_slot_status,
    noncategorical_slot_status,
    noncat_slot_status_mask,
    logit_noncat_slot_start,
    logit_noncat_slot_end,
    noncategorical_slot_value_start,
    noncategorical_slot_value_end,
):
    # Intent loss
    intent_loss = self._cross_entropy(logit_intent_status, intent_status_labels)

    # Requested slots.
    # Shape: (batch_size, max_num_slots)
    # Mask unused slots.
    # Sigmoid cross entropy is used because more than one slot can be requested in a single utterance.
    requested_slot_loss = self._criterion_req_slots(
        logit_req_slot_status.view(-1)[req_slot_mask], requested_slot_status.view(-1)[req_slot_mask]
    )

    # Categorical slot status.
    # Shape of logit_cat_slot_status: (batch_size, max_num_cat_slots, 3)
    cat_slot_status_mask = cat_slot_status_mask.view(-1) > 0.5
    if sum(cat_slot_status_mask) == 0:
        logging.warning('No active categorical slots in the batch')
        cat_slot_status_loss = self._cross_entropy(
            logit_cat_slot_status.view(-1, 3), torch.argmax(logit_cat_slot_status.view(-1, 3), dim=-1)
        )
    else:
        cat_slot_status_loss = self._cross_entropy(
            logit_cat_slot_status.view(-1, 3)[cat_slot_status_mask],
            categorical_slot_status.view(-1)[cat_slot_status_mask],
        )

    # Categorical slot values.
    # Shape: (batch_size, max_num_cat_slots, max_num_slot_values).
    max_num_slot_values = logit_cat_slot_value.size()[-1]

    # Zero out losses for categorical slot values when the slot status is not active.
    cat_slot_value_mask = (categorical_slot_status == STATUS_ACTIVE).view(-1)
    # To handle cases with no active categorical slot value.
    cat_slot_value_mask = cat_slot_value_mask.view(-1) > 0.5
    if sum(cat_slot_value_mask) == 0:
        logging.warning('No active values for categorical slots in the batch.')
        cat_slot_value_loss = self._cross_entropy(
            logit_cat_slot_value.view(-1, max_num_slot_values),
            torch.argmax(logit_cat_slot_value.view(-1, max_num_slot_values), dim=-1),
        )
    else:
        slot_values_active_logits = logit_cat_slot_value.view(-1, max_num_slot_values)[cat_slot_value_mask]
        slot_values_active_labels = categorical_slot_values.view(-1)[cat_slot_value_mask]
        cat_slot_value_loss = self._cross_entropy(slot_values_active_logits, slot_values_active_labels)

    # Non-categorical slot status.
    # Shape: (batch_size, max_num_noncat_slots, 3).
    noncat_slot_status_mask = noncat_slot_status_mask.view(-1) > 0.5
    if sum(noncat_slot_status_mask) == 0:
        logging.warning('No active non-categorical slots in the batch.')
        noncat_slot_status_loss = self._cross_entropy(
            logit_noncat_slot_status.view(-1, 3), torch.argmax(logit_noncat_slot_status.view(-1, 3), dim=-1)
        )
    else:
        noncat_slot_status_loss = self._cross_entropy(
            logit_noncat_slot_status.view(-1, 3)[noncat_slot_status_mask],
            noncategorical_slot_status.view(-1)[noncat_slot_status_mask],
        )

    # Non-categorical slot spans.
    # Shape: (batch_size, max_num_noncat_slots, max_num_tokens).
    max_num_tokens = logit_noncat_slot_start.size()[-1]

    # Zero out losses for non-categorical slot spans when the slot status is not active.
    non_cat_slot_value_mask = (noncategorical_slot_status == STATUS_ACTIVE).view(-1)
    # To handle cases with no active non-categorical slot value.
    non_cat_slot_value_mask = non_cat_slot_value_mask.view(-1)
    if sum(non_cat_slot_value_mask) == 0:
        logging.warning('No active values for non-categorical slots in the batch.')
        span_start_loss = self._cross_entropy(
            logit_noncat_slot_start.view(-1, max_num_tokens),
            torch.argmax(logit_noncat_slot_start.view(-1, max_num_tokens), dim=-1),
        )
        span_end_loss = self._cross_entropy(
            logit_noncat_slot_end.view(-1, max_num_tokens),
            torch.argmax(logit_noncat_slot_end.view(-1, max_num_tokens), dim=-1),
        )
    else:
        noncat_slot_start_active_logits = logit_noncat_slot_start.view(-1, max_num_tokens)[non_cat_slot_value_mask]
        noncat_slot_start_active_labels = noncategorical_slot_value_start.view(-1)[non_cat_slot_value_mask]
        span_start_loss = self._cross_entropy(noncat_slot_start_active_logits, noncat_slot_start_active_labels)

        noncat_slot_end_active_logits = logit_noncat_slot_end.view(-1, max_num_tokens)[non_cat_slot_value_mask]
        noncat_slot_end_active_labels = noncategorical_slot_value_end.view(-1)[non_cat_slot_value_mask]
        span_end_loss = self._cross_entropy(noncat_slot_end_active_logits, noncat_slot_end_active_labels)

    losses = {
        "intent_loss": intent_loss,
        "requested_slot_loss": requested_slot_loss,
        "cat_slot_status_loss": cat_slot_status_loss,
        "cat_slot_value_loss": cat_slot_value_loss,
        "noncat_slot_status_loss": noncat_slot_status_loss,
        "span_start_loss": span_start_loss,
        "span_end_loss": span_end_loss,
    }
    total_loss = sum(losses.values())
    if self.reduction == 'mean':
        total_loss = total_loss / len(losses)
    else:
        batch_size = logit_intent_status.shape[0]
        total_loss = total_loss / batch_size
    return total_loss
def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
    """Project the tokenized prediction back to the original text."""

    # When we created the data, we kept track of the alignment between original
    # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
    # now `orig_text` contains the span of our original text corresponding to
    # the span that we predicted.
    #
    # However, `orig_text` may contain extra characters that we don't want in
    # our prediction.
    #
    # For example, let's say:
    #   pred_text = steve smith
    #   orig_text = Steve Smith's
    #
    # We don't want to return `orig_text` because it contains the extra "'s".
    #
    # We don't want to return `pred_text` because it's already been normalized
    # (the SQuAD eval script also does punctuation stripping/lower casing but
    # our tokenizer does additional normalization like stripping accent
    # characters).
    #
    # What we really want to return is "Steve Smith".
    #
    # Therefore, we have to apply a semi-complicated alignment heuristic
    # between `pred_text` and `orig_text` to get a character-to-character
    # alignment. This can fail in certain cases in which case we just return
    # `orig_text`.

    def _strip_spaces(text):
        ns_chars = []
        ns_to_s_map = collections.OrderedDict()
        for (i, c) in enumerate(text):
            if c == " ":
                continue
            ns_to_s_map[len(ns_chars)] = i
            ns_chars.append(c)
        ns_text = "".join(ns_chars)
        return ns_text, ns_to_s_map

    # We first tokenize `orig_text`, strip whitespace from the result
    # and `pred_text`, and check if they are the same length. If they are
    # NOT the same length, the heuristic has failed. If they are the same
    # length, we assume the characters are one-to-one aligned.
    tokenizer = BasicTokenizer(do_lower_case=do_lower_case)

    tok_text = " ".join(tokenizer.tokenize(orig_text))

    start_position = tok_text.find(pred_text)
    if start_position == -1:
        if verbose_logging:
            logging.warning("Unable to find text: '%s' in '%s'" % (pred_text, orig_text))
        return orig_text
    end_position = start_position + len(pred_text) - 1

    (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
    (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)

    if len(orig_ns_text) != len(tok_ns_text):
        if verbose_logging:
            logging.warning(
                "Length not equal after stripping spaces: '%s' vs '%s'", orig_ns_text, tok_ns_text,
            )
        return orig_text

    # We then project the characters in `pred_text` back to `orig_text` using
    # the character-to-character alignment.
    tok_s_to_ns_map = {}
    for (i, tok_index) in tok_ns_to_s_map.items():
        tok_s_to_ns_map[tok_index] = i

    orig_start_position = None
    if start_position in tok_s_to_ns_map:
        ns_start_position = tok_s_to_ns_map[start_position]
        if ns_start_position in orig_ns_to_s_map:
            orig_start_position = orig_ns_to_s_map[ns_start_position]

    if orig_start_position is None:
        if verbose_logging:
            logging.warning("Couldn't map start position")
        return orig_text

    orig_end_position = None
    if end_position in tok_s_to_ns_map:
        ns_end_position = tok_s_to_ns_map[end_position]
        if ns_end_position in orig_ns_to_s_map:
            orig_end_position = orig_ns_to_s_map[ns_end_position]

    if orig_end_position is None:
        if verbose_logging:
            logging.warning("Couldn't map end position")
        return orig_text

    output_text = orig_text[orig_start_position:(orig_end_position + 1)]
    return output_text