def __init__(self,
             learning_rate: float = 0.001,
             beta1: float = 0.9,
             beta2: float = 0.999,
             beta3_batch: float = 0.999,
             beta3_checkpoint: float = 0.,
             epsilon: float = 1e-8,
             k_lo: float = 0.1,
             k_hi: float = 10,
             schedule_decay: float = 0.004,
             use_batch_objective: bool = True,
             use_checkpoint_objective: bool = False,
             use_nesterov_momentum: bool = False,
             **kwargs) -> None:
    check_condition(any((use_batch_objective, use_checkpoint_objective)),
                    "Must use at least one of: batch objective, checkpoint objective")
    super().__init__(learning_rate=learning_rate, **kwargs)
    self.beta1 = beta1
    self.beta2 = beta2
    self.beta3_batch = beta3_batch
    self.beta3_checkpoint = beta3_checkpoint
    self.epsilon = epsilon
    self.k_lo = k_lo
    self.k_hi = k_hi
    self.schedule_decay = schedule_decay
    self.use_batch_objective = use_batch_objective
    self.use_checkpoint_objective = use_checkpoint_objective
    self.use_nesterov_momentum = use_nesterov_momentum
def __call__(self,
             hidden: Union[mx.sym.Symbol, mx.nd.NDArray],
             weight: Optional[mx.nd.NDArray] = None,
             bias: Optional[mx.nd.NDArray] = None):
    """
    Linear transformation to vocab size. Returns logits.

    :param hidden: Decoder representation for n elements. Shape: (n, self.num_hidden).
    :param weight: Optional weight NDArray (required for the NDArray implementation).
    :param bias: Optional bias NDArray (required for the NDArray implementation).
    :return: Logits. Shape: (n, self.vocab_size).
    """
    if isinstance(hidden, mx.sym.Symbol):
        # TODO dropout?
        return mx.sym.FullyConnected(data=hidden,
                                     num_hidden=self.vocab_size,
                                     weight=self.w,
                                     bias=self.b,
                                     flatten=False,
                                     name=C.LOGITS_NAME)

    # Equivalent NDArray implementation (requires passed weights/biases)
    assert isinstance(hidden, mx.nd.NDArray)
    utils.check_condition(weight is not None and bias is not None,
                          "OutputLayer NDArray implementation requires passing weight and bias NDArrays.")
    return mx.nd.FullyConnected(data=hidden,
                                num_hidden=bias.shape[0],
                                weight=weight,
                                bias=bias,
                                flatten=False)
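
# A minimal NDArray sketch of the projection above (hypothetical helper name and shapes,
# not part of the original API): three decoder states of size 4 projected to a vocab of 5.
def _example_output_projection() -> None:
    hidden = mx.nd.ones((3, 4))
    weight = mx.nd.ones((5, 4))  # (vocab_size, num_hidden)
    bias = mx.nd.zeros((5,))
    logits = mx.nd.FullyConnected(data=hidden, num_hidden=bias.shape[0],
                                  weight=weight, bias=bias, flatten=False)
    assert logits.shape == (3, 5)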
def __next__(self):
    if self._next is None:
        raise StopIteration
    alignment = self._next
    check_condition(bool(alignment), "Empty alignment in file %s" % self.path)
    if self.add_bos:
        alignment = [i + 1 if i != -1 else -1 for i in alignment]
        alignment.insert(0, 0)
    if self.add_eos:
        alignment.append(self.source_lengths[self.last_idx] - 1)
    self.last_idx += 1
    if not self._iterated_once:
        self.count += 1

    # fetch next element
    self._next = next(self._iter, None)
    if self._next is None:
        self._iter = None
        if not self._iterated_once:
            self._iterated_once = True

    return alignment
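
# Worked example of the BOS/EOS adjustment above (plain Python, hypothetical values):
# inserting BOS shifts every aligned source index by one (BOS aligns to BOS at index 0),
# unaligned positions (-1) stay unaligned, and EOS aligns to the last source position.
def _example_alignment_bos_eos() -> None:
    alignment, source_length = [0, 2, -1], 5
    shifted = [i + 1 if i != -1 else -1 for i in alignment]
    shifted.insert(0, 0)               # BOS -> BOS
    shifted.append(source_length - 1)  # EOS -> last source position
    assert shifted == [0, 1, 3, -1, 4]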
def __init__(self,
             rnn_config: rnn.RNNConfig,
             prefix=C.BIDIRECTIONALRNN_PREFIX,
             layout=C.TIME_MAJOR,
             encoder_class: Callable = RecurrentEncoder) -> None:
    utils.check_condition(rnn_config.num_hidden % 2 == 0,
                          "num_hidden must be a multiple of 2 for BiDirectionalRNNEncoders.")
    self.rnn_config = rnn_config
    self.internal_rnn_config = rnn_config.copy(num_hidden=rnn_config.num_hidden // 2)
    if layout[0] == 'N':
        logger.warning("Batch-major layout for encoder input. Consider using time-major layout for better performance.")

    # time-major layout as _encode needs to swap layout for SequenceReverse
    self.forward_rnn = encoder_class(rnn_config=self.internal_rnn_config,
                                     prefix=prefix + C.FORWARD_PREFIX,
                                     layout=C.TIME_MAJOR)
    self.reverse_rnn = encoder_class(rnn_config=self.internal_rnn_config,
                                     prefix=prefix + C.REVERSE_PREFIX,
                                     layout=C.TIME_MAJOR)
    self.layout = layout
    self.prefix = prefix
def __next__(self):
    if self._next is None:
        raise StopIteration
    sentence_tokens = self._next
    sentence = tokens2ids(sentence_tokens, self.vocab)
    check_condition(bool(sentence), "Empty sentence in file %s" % self.path)
    if self.add_bos:
        sentence.insert(0, self.vocab[C.BOS_SYMBOL])
    if self.add_eos:
        sentence.append(self.vocab[C.EOS_SYMBOL])
    if not self._iterated_once:
        self.count += 1

    # fetch next element
    self._next = next(self._iter, None)
    if self._next is None:
        self._iter = None
        if not self._iterated_once:
            self._iterated_once = True

    return sentence
def read_alignment_file(path, trg_lengths, src_lengths):
    """
    Reads a flat or multi-line alignment file.

    :param path: Path to the alignment file.
    :param trg_lengths: Array of target lengths (one per sentence).
    :param src_lengths: Array of source lengths (one per sentence).
    :return: Array of alignments (unprocessed) and whether the file is multi-line.
    """
    check_condition(len(trg_lengths) == len(src_lengths),
                    "Source and target sentences must be parallel.")
    file = smart_open(path)
    content = file.readlines()
    if len(content) == len(trg_lengths):
        is_multiline = False
        alignments = _read_flat_alignment_file(content=content, trg_lengths=trg_lengths)
    else:
        is_multiline = True
        alignments = _read_multiline_alignment_file(content=content, trg_lengths=trg_lengths)
    check_condition(len(alignments) == len(trg_lengths),
                    "Alignments must be parallel to the target sentences.")
    return alignments, is_multiline
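
# Sketch of the format detection above (plain Python, hypothetical values): a flat file has
# exactly one alignment line per target sentence; any other line count is treated as the
# multi-line format.
def _example_alignment_format_detection() -> None:
    trg_lengths = [4, 7, 2]
    flat_content = ["0-0 1-1\n", "0-1\n", "1-0\n"]
    assert len(flat_content) == len(trg_lengths)       # flat: one line per sentence
    multiline_content = flat_content + ["extra\n"]
    assert len(multiline_content) != len(trg_lengths)  # otherwise: multi-line format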
def main():
    params = argparse.ArgumentParser(description='CLI to build source and target vocab(s).')
    arguments.add_build_vocab_args(params)
    args = params.parse_args()

    num_words, num_words_other = args.num_words
    utils.check_condition(num_words == num_words_other,
                          "Vocabulary CLI only allows a common value for --num-words")
    word_min_count, word_min_count_other = args.word_min_count
    utils.check_condition(word_min_count == word_min_count_other,
                          "Vocabulary CLI only allows a common value for --word-min-count")

    global logger
    logger = log.setup_main_logger("build_vocab",
                                   file_logging=True,
                                   console=True,
                                   path="%s.%s" % (args.output, C.LOG_NAME))

    vocab = build_from_paths(args.inputs, num_words=num_words, min_count=word_min_count)
    logger.info("Vocabulary size: %d", len(vocab))
    vocab_to_json(vocab, args.output + C.JSON_SUFFIX)
def translate(output_handler: output_handler.OutputHandler,
              source_data: Iterable[str],
              translator: inference.Translator,
              chunk_id: int = 0,
              reference_data: Optional[Iterable[str]] = None) -> float:
    """
    Translates each line from source_data, calling the output handler after translating a batch.

    :param output_handler: A handler that will be called once with the output of each translation.
    :param source_data: An iterable of source sentences that will be translated.
    :param translator: The translator that will be used for each line of input.
    :param chunk_id: Global id of the chunk.
    :param reference_data: An iterable of reference sentences that will be force-aligned to the source.
    :return: Total time taken.
    """
    tic = time.time()
    trans_inputs = [translator.make_input(i, line, reference)
                    for i, (line, reference) in enumerate(
                        itertools.zip_longest(source_data, reference_data if reference_data else [None]),
                        chunk_id + 1)]
    if translator.dictionary is not None:
        utils.check_condition(translator.batch_size == 1,
                              "Dictionary replacement works only with batch size 1")
        translator.seq_idx = trans_inputs[0].id - 1
    trans_outputs = translator.translate(trans_inputs)
    total_time = time.time() - tic
    batch_time = total_time / len(trans_inputs)
    for trans_input, trans_output in zip(trans_inputs, trans_outputs):
        output_handler.handle(trans_input, trans_output, batch_time)
    return total_time
def determine_context(args: argparse.Namespace, exit_stack: ExitStack) -> List[mx.Context]:
    """
    Determine the context we should run on (CPU or GPU).

    :param args: Arguments as returned by argparse.
    :param exit_stack: An ExitStack from contextlib.
    :return: A list with the context(s) to run on.
    """
    if args.use_cpu:
        logger.info("Training Device: CPU")
        context = [mx.cpu()]
    else:
        num_gpus = utils.get_num_gpus()
        check_condition(num_gpus >= 1,
                        "No GPUs found, consider running on the CPU with --use-cpu "
                        "(note: check depends on nvidia-smi and this could also mean that the nvidia-smi "
                        "binary isn't on the path).")
        if args.disable_device_locking:
            context = utils.expand_requested_device_ids(args.device_ids)
        else:
            context = exit_stack.enter_context(utils.acquire_gpus(args.device_ids, lock_dir=args.lock_dir))
        if args.batch_type == C.BATCH_TYPE_SENTENCE:
            check_condition(args.batch_size % len(context) == 0,
                            "When using multiple devices the batch size must be "
                            "divisible by the number of devices. Choose a batch "
                            "size that is a multiple of %d." % len(context))
        logger.info("Training Device(s): GPU %s", context)
        context = [mx.gpu(gpu_id) for gpu_id in context]
    return context
def _populate_bucket_batch_sizes(self):
    """
    Compute bucket-specific batch sizes (sentences, average_words) and the default bucket batch size.

    If sentence-based batching: the number of sentences is the same for each batch and determines
    the number of words. If word-based batching: the number of sentences for each batch is set to
    the multiple of the number of devices that produces the number of words closest to the target
    batch size. The average target sentence length (non-padding symbols) is used for word number
    calculations.

    Sets: self.bucket_batch_sizes
    """
    # Pre-defined bucket batch sizes
    if self.bucket_batch_sizes is not None:
        return
    # Otherwise compute here
    self.bucket_batch_sizes = [None for _ in self.buckets]
    largest_total_batch_size = 0
    for buck_idx, bucket_shape in enumerate(self.buckets):
        # Target/label length with padding
        padded_seq_len = bucket_shape[1]
        # Average target/label length excluding padding
        average_seq_len = self.data_target_average_len[buck_idx]
        # Word-based: num words determines num sentences
        # Sentence-based: num sentences determines num words
        if self.batch_by_words:
            check_condition(padded_seq_len <= self.batch_size,
                            "Word batch size must cover sequence lengths for all"
                            " buckets: (%d > %d)" % (padded_seq_len, self.batch_size))
            # Multiple of number of devices (int) closest to target number of words, assuming each
            # sentence is of average length
            batch_size_seq = self.batch_num_devices * round((self.batch_size / average_seq_len)
                                                            / self.batch_num_devices)
            batch_size_word = batch_size_seq * average_seq_len
        else:
            batch_size_seq = self.batch_size
            batch_size_word = batch_size_seq * average_seq_len
        self.bucket_batch_sizes[buck_idx] = BucketBatchSize(batch_size_seq, batch_size_word)
        # Track largest batch size by total elements
        largest_total_batch_size = max(largest_total_batch_size, batch_size_seq * max(*bucket_shape))

    # Final step: guarantee that largest bucket by sequence length also has largest total batch size.
    # When batching by sentences, this will already be the case.
    if self.batch_by_words:
        padded_seq_len = max(*self.buckets[-1])
        average_seq_len = self.data_target_average_len[-1]
        while self.bucket_batch_sizes[-1].batch_size * padded_seq_len < largest_total_batch_size:
            self.bucket_batch_sizes[-1] = BucketBatchSize(
                self.bucket_batch_sizes[-1].batch_size + self.batch_num_devices,
                self.bucket_batch_sizes[-1].average_words_per_batch
                + self.batch_num_devices * average_seq_len)
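
# Worked example of the word-based rounding above (plain Python, hypothetical values):
# with a target of 4096 words, an average target length of 25 and 4 devices, the sentence
# count snaps to the multiple of 4 whose word count is closest to the target.
def _example_word_batch_rounding() -> None:
    batch_size, average_seq_len, batch_num_devices = 4096, 25.0, 4
    batch_size_seq = batch_num_devices * round((batch_size / average_seq_len) / batch_num_devices)
    assert batch_size_seq == 164                       # 4096 / 25 = 163.84 sentences
    assert batch_size_seq * average_seq_len == 4100.0  # actual words per batch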
def __init__(self,
             num_embed: int,
             prefix: str,
             scale_up_input: bool,
             scale_down_positions: bool) -> None:
    utils.check_condition(num_embed % 2 == 0,
                          "Positional embeddings require an even embedding size, but got %d." % num_embed)
    self.scale_up_input = scale_up_input
    self.scale_down_positions = scale_down_positions
    self.num_embed = num_embed
    self.prefix = prefix
def __init__(self,
             num_hidden,
             prefix='lstm_',
             params=None,
             forget_bias=1.0,
             dropout: float = 0.0) -> None:
    super().__init__(num_hidden, prefix, params, forget_bias)
    utils.check_condition(dropout > 0.0,
                          "RecurrentDropoutLSTMCell should have dropout > 0.0")
    self.dropout = dropout
def get_num_hidden(self) -> int:
    """
    Return the representation size of this encoder.
    """
    if isinstance(self.encoders[-1], BatchMajor2TimeMajor):
        utils.check_condition(len(self.encoders) > 1,
                              "Cannot return num_hidden from a BatchMajor2TimeMajor encoder only")
        return self.encoders[-2].get_num_hidden()
    else:
        return self.encoders[-1].get_num_hidden()
def __init__(self,
             kernel_width: int,
             num_hidden: int,
             act_type: str = C.GLU,
             weight_normalization: bool = False):
    super().__init__()
    self.kernel_width = kernel_width
    self.num_hidden = num_hidden
    utils.check_condition(act_type in C.CNN_ACTIVATION_TYPES,
                          "Unknown activation %s." % act_type)
    self.act_type = act_type
    self.weight_normalization = weight_normalization
def __call__(self,
             data: mx.sym.Symbol,
             data_length: mx.sym.Symbol,
             seq_len: int) -> mx.sym.Symbol:
    """
    Run the convolutional block.

    :param data: Input data. Shape: (batch_size, seq_len, num_hidden).
    :param data_length: Vector with sequence lengths. Shape: (batch_size,).
    :param seq_len: Maximum sequence length.
    :return: Shape: (batch_size, seq_len, num_hidden).
    """
    if self.pad_type == C.CNN_PAD_LEFT:
        # we pad enough on both sides and later slice the extra padding from the right
        padding = (self.config.kernel_width - 1,)
    elif self.pad_type == C.CNN_PAD_CENTERED:
        # we pad enough so that the output size is equal to the input size and we don't need to slice
        utils.check_condition(self.config.kernel_width % 2 == 1,
                              "Only odd kernel widths supported, but got %d" % self.config.kernel_width)
        padding = (int((self.config.kernel_width - 1) / 2),)
    else:
        raise ValueError("Unknown pad type %s" % self.pad_type)

    num_hidden = self._pre_activation_num_hidden()

    # Apply masking (so that we properly have zero padding for variable sequence length batches)
    # Note: SequenceMask expects time-major data
    # (seq_len, batch_size, num_hidden)
    data = mx.sym.swapaxes(data, dim1=0, dim2=1)
    data = mx.sym.SequenceMask(data=data,
                               sequence_length=data_length,
                               use_sequence_length=True,
                               value=0)
    # (batch_size, num_hidden, seq_len)
    data = mx.sym.transpose(data, axes=(1, 2, 0))
    data_conv = mx.sym.Convolution(data=data,
                                   weight=self.conv_weight,
                                   bias=self.conv_bias,
                                   pad=padding,
                                   kernel=(self.config.kernel_width,),
                                   num_filter=num_hidden,
                                   layout="NCW")

    # (batch_size, 2 * num_hidden, seq_len)
    if self.pad_type == C.CNN_PAD_LEFT:
        data_conv = mx.sym.slice_axis(data=data_conv, axis=2, begin=0, end=seq_len)
    return self._post_convolution(data_conv)
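
# Padding arithmetic sketch for the two modes above (plain Python, hypothetical values):
# a 1D convolution with kernel width k and padding p on both sides maps an input of
# length seq_len to an output of length seq_len + 2*p - (k - 1).
def _example_cnn_padding_lengths() -> None:
    k, seq_len = 3, 10
    left_pad = k - 1             # CNN_PAD_LEFT: over-pads, output is later sliced back to seq_len
    assert seq_len + 2 * left_pad - (k - 1) == seq_len + k - 1
    centered_pad = (k - 1) // 2  # CNN_PAD_CENTERED (odd k): output length equals input length
    assert seq_len + 2 * centered_pad - (k - 1) == seq_len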
def main():
    params = argparse.ArgumentParser(description='Evaluate translations by calculating metrics with '
                                                 'respect to a reference set.')
    arguments.add_evaluate_args(params)
    arguments.add_logging_args(params)
    args = params.parse_args()

    if args.quiet:
        logger.setLevel(logging.ERROR)

    utils.check_condition(args.offset >= 0, "Offset should be non-negative.")
    log_sockeye_version(logger)
    logger.info("Command: %s", " ".join(sys.argv))
    logger.info("Arguments: %s", args)

    references = [' '.join(e) for e in data_io.read_content(args.references)]
    hypotheses = [h.strip() for h in args.hypotheses]
    logger.info("%d hypotheses | %d references", len(hypotheses), len(references))

    if not args.not_strict:
        utils.check_condition(len(hypotheses) == len(references),
                              "Number of hypotheses (%d) and references (%d) does not match." %
                              (len(hypotheses), len(references)))

    if not args.sentence:
        scores = []
        for metric in args.metrics:
            if metric == C.BLEU:
                bleu_score = raw_corpus_bleu(hypotheses, references, args.offset)
                scores.append("%.6f" % bleu_score)
            elif metric == C.CHRF:
                chrf_score = chrf.corpus_chrf(hypotheses, references, trim_whitespaces=True)
                scores.append("%.6f" % chrf_score)
        print("\t".join(scores), file=sys.stdout)
    else:
        for h, r in zip(hypotheses, references):
            scores = []
            for metric in args.metrics:
                if metric == C.BLEU:
                    bleu = raw_corpus_bleu(h, r, args.offset)
                    scores.append("%.6f" % bleu)
                elif metric == C.CHRF:
                    chrf_score = chrf.corpus_chrf(h, r, trim_whitespaces=True)
                    scores.append("%.6f" % chrf_score)
            print("\t".join(scores), file=sys.stdout)
def __init__(self, updates_per_checkpoint: int, half_life: int, warmup: int = 0) -> None:
    super().__init__(warmup)
    check_condition(updates_per_checkpoint > 0, "updates_per_checkpoint needs to be > 0.")
    check_condition(half_life > 0, "half_life needs to be > 0.")

    # Solving 0.5 * base_lr = base_lr / sqrt(1 + T * factor) for the factor gives
    # factor = 3 / T, with T = half_life * updates_per_checkpoint.
    self.factor = 3. / (half_life * updates_per_checkpoint)
    self.t_last_log = -1
    self.log_every_t = int(half_life * updates_per_checkpoint)
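
# Worked check of the derivation above (plain Python, hypothetical values): with
# lr(t) = base_lr / sqrt(1 + t * factor) and factor = 3 / T, the rate halves at t = T.
def _example_half_life_factor() -> None:
    import math
    half_life, updates_per_checkpoint = 10, 100
    T = half_life * updates_per_checkpoint
    factor = 3. / T
    assert math.isclose(1. / math.sqrt(1 + T * factor), 0.5)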
def load_params_from_file(self, fname: str):
    """
    Loads and sets model parameters from file.

    :param fname: Path to load parameters from.
    """
    assert self._is_built
    utils.check_condition(os.path.exists(fname),
                          "No model parameter file found under %s. "
                          "This is either not a model directory or the first training "
                          "checkpoint has not happened yet." % fname)
    self.params, _ = utils.load_params(fname)
    logger.info('Loaded params from "%s"', fname)
def __init__(self,
             context: mx.context.Context,
             inputs: str,
             references: str,
             model: str,
             max_input_len: Optional[int] = None,
             beam_size: int = C.DEFAULT_BEAM_SIZE,
             bucket_width_source: int = 10,
             length_penalty_alpha: float = 1.0,
             length_penalty_beta: float = 0.0,
             softmax_temperature: Optional[float] = None,
             max_output_length_num_stds: int = C.DEFAULT_NUM_STD_MAX_OUTPUT_LENGTH,
             ensemble_mode: str = 'linear',
             sample_size: int = -1,
             random_seed: int = 42) -> None:
    self.context = context
    self.max_input_len = max_input_len
    self.max_output_length_num_stds = max_output_length_num_stds
    self.ensemble_mode = ensemble_mode
    self.beam_size = beam_size
    self.batch_size = 16
    self.bucket_width_source = bucket_width_source
    self.length_penalty_alpha = length_penalty_alpha
    self.length_penalty_beta = length_penalty_beta
    self.softmax_temperature = softmax_temperature
    self.model = model
    with data_io.smart_open(inputs) as inputs_fin, data_io.smart_open(references) as references_fin:
        input_sentences = inputs_fin.readlines()
        target_sentences = references_fin.readlines()
        utils.check_condition(len(input_sentences) == len(target_sentences),
                              "Number of input and reference sentences does not match")
        if sample_size <= 0:
            sample_size = len(input_sentences)
        if sample_size < len(input_sentences):
            # custom random number generator to guarantee the same samples across runs in order to
            # be able to compare metrics across independent runs
            random_gen = random.Random(random_seed)
            self.input_sentences, self.target_sentences = zip(
                *random_gen.sample(list(zip(input_sentences, target_sentences)), sample_size))
        else:
            self.input_sentences, self.target_sentences = input_sentences, target_sentences

    logger.info("Created CheckpointDecoder(max_input_len=%d, beam_size=%d, model=%s, num_sentences=%d)",
                max_input_len if max_input_len is not None else -1,
                beam_size, model, len(self.input_sentences))

    with data_io.smart_open(os.path.join(self.model, C.DECODE_REF_NAME), 'w') as trg_out, \
            data_io.smart_open(os.path.join(self.model, C.DECODE_IN_NAME), 'w') as src_out:
        [trg_out.write(s) for s in self.target_sentences]
        [src_out.write(s) for s in self.input_sentences]
def __init__(self,
             config: ConvolutionalEmbeddingConfig,
             prefix: str = C.CHAR_SEQ_ENCODER_PREFIX) -> None:
    utils.check_condition(len(config.num_filters) == config.max_filter_width,
                          "num_filters must have max_filter_width elements.")
    self.num_embed = config.num_embed
    self.output_dim = config.output_dim
    self.max_filter_width = config.max_filter_width
    self.num_filters = config.num_filters[:]
    self.pool_stride = config.pool_stride
    self.num_highway_layers = config.num_highway_layers
    self.prefix = prefix
    self.dropout = config.dropout
    self.add_positional_encoding = config.add_positional_encoding

    self.conv_weight = {filter_width: mx.sym.Variable("%s%s%d%s" % (self.prefix, "conv_", filter_width, "_weight"))
                        for filter_width in range(1, self.max_filter_width + 1)}
    self.conv_bias = {filter_width: mx.sym.Variable("%s%s%d%s" % (self.prefix, "conv_", filter_width, "_bias"))
                      for filter_width in range(1, self.max_filter_width + 1)}
    self.project_weight = mx.sym.Variable(self.prefix + "project_weight")
    self.project_bias = mx.sym.Variable(self.prefix + "project_bias")
    self.gate_weight = [mx.sym.Variable("%s%s%d%s" % (self.prefix, "gate_", i, "_weight"))
                        for i in range(self.num_highway_layers)]
    self.gate_bias = [mx.sym.Variable("%s%s%d%s" % (self.prefix, "gate_", i, "_bias"))
                      for i in range(self.num_highway_layers)]
    self.transform_weight = [mx.sym.Variable("%s%s%d%s" % (self.prefix, "transform_", i, "_weight"))
                             for i in range(self.num_highway_layers)]
    self.transform_bias = [mx.sym.Variable("%s%s%d%s" % (self.prefix, "transform_", i, "_bias"))
                           for i in range(self.num_highway_layers)]
def dot_attention(queries: mx.sym.Symbol,
                  keys: mx.sym.Symbol,
                  values: mx.sym.Symbol,
                  lengths: Optional[mx.sym.Symbol] = None,
                  dropout: float = 0.0,
                  bias: Optional[mx.sym.Symbol] = None,
                  prefix: Optional[str] = ''):
    """
    Computes dot attention for a set of queries, keys, and values.

    :param queries: Attention queries. Shape: (n, lq, d).
    :param keys: Attention keys. Shape: (n, lk, d).
    :param values: Attention values. Shape: (n, lk, dv).
    :param lengths: Optional sequence lengths of the keys. Shape: (n,).
    :param dropout: Dropout probability.
    :param bias: Optional 3d bias tensor.
    :param prefix: Optional prefix.
    :return: 'Context' vectors for each query, shape (n, lq, dv), and 'probs' vectors
             for each query, shape (n, lq, lk).
    """
    utils.check_condition(lengths is not None or bias is not None,
                          "Must provide either length or bias argument for masking")

    # (n, lq, lk)
    logits = mx.sym.batch_dot(lhs=queries, rhs=keys, transpose_b=True, name='%sdot' % prefix)

    if lengths is not None:
        # mask lk dimension
        # (lk, n, lq)
        logits = mx.sym.transpose(data=logits, axes=(2, 0, 1))
        logits = mx.sym.SequenceMask(data=logits,
                                     use_sequence_length=True,
                                     sequence_length=lengths,
                                     value=C.LARGE_NEGATIVE_VALUE)
        # (n, lq, lk)
        logits = mx.sym.transpose(data=logits, axes=(1, 2, 0))

    if bias is not None:
        logits = mx.sym.broadcast_add(logits, bias, name='%sbias_add' % prefix)

    probs = mx.sym.softmax(logits, axis=-1)
    probs = mx.sym.Dropout(probs, p=dropout) if dropout > 0.0 else probs

    # (n, lq, lk) x (n, lk, dv) -> (n, lq, dv)
    res = mx.sym.batch_dot(lhs=probs, rhs=values, name='%scontexts' % prefix)
    return res, probs
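
# A minimal usage sketch of dot_attention (hypothetical variable names; builds the symbolic
# graph only, nothing is executed here): mask by key lengths, apply 10% attention dropout.
def _example_dot_attention() -> Tuple[mx.sym.Symbol, mx.sym.Symbol]:
    queries = mx.sym.Variable("q")      # (n, lq, d)
    keys = mx.sym.Variable("k")         # (n, lk, d)
    values = mx.sym.Variable("v")       # (n, lk, dv)
    lengths = mx.sym.Variable("k_len")  # (n,)
    return dot_attention(queries, keys, values, lengths=lengths, dropout=0.1, prefix="example_")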
def determine_decode_and_evaluate_context(args: argparse.Namespace,
                                          exit_stack: ExitStack,
                                          train_context: List[mx.Context]) -> Tuple[int, Optional[mx.Context]]:
    """
    Determine the number of sentences to decode and the context we should run on (CPU or GPU).

    :param args: Arguments as returned by argparse.
    :param exit_stack: An ExitStack from contextlib.
    :param train_context: Context for training.
    :return: The number of sentences to decode and the context to run on.
    """
    num_to_decode = args.decode_and_evaluate
    if args.optimized_metric == C.BLEU and num_to_decode == 0:
        logger.info("You chose BLEU as the optimized metric, will turn on BLEU monitoring during training. "
                    "To control how many validation sentences are used for calculating bleu use "
                    "the --decode-and-evaluate argument.")
        num_to_decode = -1

    if num_to_decode == 0:
        return 0, None

    if args.use_cpu or args.decode_and_evaluate_use_cpu:
        context = mx.cpu()
    elif args.decode_and_evaluate_device_id is not None:
        # decode device is defined from the commandline
        num_gpus = utils.get_num_gpus()
        check_condition(num_gpus >= 1,
                        "No GPUs found, consider running on the CPU with --use-cpu "
                        "(note: check depends on nvidia-smi and this could also mean that the nvidia-smi "
                        "binary isn't on the path).")
        if args.disable_device_locking:
            context = utils.expand_requested_device_ids([args.decode_and_evaluate_device_id])
        else:
            context = exit_stack.enter_context(utils.acquire_gpus([args.decode_and_evaluate_device_id],
                                                                  lock_dir=args.lock_dir))
        context = mx.gpu(context[0])
    else:
        # default decode context is the last training device
        context = train_context[-1]

    logger.info("Decode and Evaluate Device(s): %s", context)
    return num_to_decode, context
def get_coverage(config: CoverageConfig) -> 'Coverage':
    """
    Returns a Coverage instance.

    :param config: Coverage configuration.
    :return: Instance of Coverage.
    """
    if config.type == 'count':
        utils.check_condition(config.num_hidden == 1, "Count coverage requires coverage_num_hidden==1")
    if config.type == "gru":
        return GRUCoverage(config.num_hidden, config.layer_normalization)
    elif config.type in {"tanh", "sigmoid", "relu", "softrelu"}:
        return ActivationCoverage(config.num_hidden, config.type, config.layer_normalization)
    elif config.type == "count":
        return CountCoverage()
    else:
        raise ValueError("Unknown coverage type %s" % config.type)
def _get_embed_weights(self) -> Tuple[mx.sym.Symbol, mx.sym.Symbol, mx.sym.Symbol]:
    """
    Returns embedding parameters for source and target, and the target output layer weight.

    :return: Tuple of source embedding, target embedding, and output layer parameter symbols.
    """
    assert isinstance(self.config.config_embed_source, encoder.EmbeddingConfig)
    assert isinstance(self.config.config_embed_target, encoder.EmbeddingConfig)
    w_embed_source = mx.sym.Variable(C.SOURCE_EMBEDDING_PREFIX + "weight",
                                     shape=(self.config.config_embed_source.vocab_size,
                                            self.config.config_embed_source.num_embed))
    w_embed_target = mx.sym.Variable(C.TARGET_EMBEDDING_PREFIX + "weight",
                                     shape=(self.config.config_embed_target.vocab_size,
                                            self.config.config_embed_target.num_embed))

    w_out_target = mx.sym.Variable("target_output_weight",
                                   shape=(self.config.output_layer_size, self.decoder.get_num_hidden()))

    if self.config.weight_tying:
        if C.WEIGHT_TYING_SRC in self.config.weight_tying_type \
                and C.WEIGHT_TYING_TRG in self.config.weight_tying_type:
            logger.info("Tying the source and target embeddings.")
            w_embed_source = w_embed_target = mx.sym.Variable(C.SHARED_EMBEDDING_PREFIX + "weight",
                                                              shape=(self.config.config_embed_source.vocab_size,
                                                                     self.config.config_embed_source.num_embed))

        if C.WEIGHT_TYING_SOFTMAX in self.config.weight_tying_type:
            logger.info("Tying the target embeddings and output layer parameters.")
            utils.check_condition(self.config.config_embed_target.num_embed == self.decoder.get_num_hidden(),
                                  "Weight tying requires target embedding size and decoder hidden size "
                                  "to be equal: %d vs. %d" % (self.config.config_embed_target.num_embed,
                                                              self.decoder.get_num_hidden()))
            w_out_target = w_embed_target

    return w_embed_source, w_embed_target, w_out_target
def get_encodings(length, depth) -> np.ndarray:
    utils.check_condition(depth % 2 == 0,
                          "Positional embeddings require an even embedding size, but got %d." % depth)
    # (1, depth / 2)
    channels = np.arange(depth // 2).reshape((1, -1))
    # (length, 1)
    positions = np.arange(0, length).reshape((-1, 1))
    scaled_positions = positions / np.power(10000, (2 * channels) / depth)
    # sinusoids:
    sin = np.sin(scaled_positions)
    # cosines:
    cos = np.cos(scaled_positions)
    # concatenate sinusoids and cosines along the channel axis: (1, length, depth)
    encodings = np.hstack([sin, cos]).reshape(1, length, depth)
    return encodings
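
# A minimal numeric check of the encodings above (hypothetical helper, numpy only): with
# depth=4 the channel scales are 10000^0 = 1 and 10000^(1/2) = 100, and position 0 encodes
# to sin(0) = 0 in the first half and cos(0) = 1 in the second.
def _example_get_encodings() -> None:
    enc = get_encodings(length=3, depth=4)
    assert enc.shape == (1, 3, 4)
    assert np.allclose(enc[0, 0], [0.0, 0.0, 1.0, 1.0])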
def __init__(self, schedule: List[Tuple[float, int]], updates_per_checkpoint: int) -> None:
    super().__init__()
    check_condition(all(num_updates > 0 for (_, num_updates) in schedule),
                    "num_updates for each step should be > 0.")
    check_condition(all(num_updates % updates_per_checkpoint == 0 for (_, num_updates) in schedule),
                    "num_updates for each step should be divisible by updates_per_checkpoint.")
    self.schedule = schedule
    self.current_step = 0
    self.current_rate = 0.
    self.current_step_num_updates = 0
    self.current_step_started_at = 0
    self.next_step_at = 0
    self.latest_t = 0
    self._update_rate(self.current_step)
def __init__(self, reduce_factor: float, reduce_num_not_improved: int, warmup: int = 0) -> None:
    super().__init__(warmup)
    check_condition(0.0 < reduce_factor <= 1, "reduce_factor should be in ]0,1].")
    self.reduce_factor = reduce_factor
    self.reduce_num_not_improved = reduce_num_not_improved
    self.num_not_improved = 0

    self.lr = None  # type: float
    self.t_last_log = -1
    self.warmed_up = not self.warmup > 0
    logger.info("Will reduce the learning rate by a factor of %.2f whenever"
                " the validation score doesn't improve %d times.",
                reduce_factor, reduce_num_not_improved)
def _check_dist_kvstore_requirements(self, lr_decay_opt_states_reset, lr_decay_param_reset, optimizer):
    # In distributed training the optimizer runs remotely. For the Eve optimizer, however, we need to
    # pass information about the loss, which is no longer possible by accessing
    # self.module._curr_module._optimizer.
    utils.check_condition(optimizer != C.OPTIMIZER_EVE,
                          "Eve optimizer not supported with distributed training.")
    utils.check_condition(not issubclass(type(self.lr_scheduler), lr_scheduler.AdaptiveLearningRateScheduler),
                          "Adaptive learning rate schedulers not supported with a dist kvstore. "
                          "Try a fixed schedule such as %s." % C.LR_SCHEDULER_FIXED_RATE_INV_SQRT_T)
    utils.check_condition(not lr_decay_param_reset,
                          "Parameter reset when the learning rate decays not supported with distributed training.")
    utils.check_condition(not lr_decay_opt_states_reset,
                          "Optimizer state reset when the learning rate decays not supported with distributed training.")
def __init__(self,
             batch_size: int,
             output_folder: str,
             optimized_metric: str = C.PERPLEXITY,
             use_tensorboard: bool = False,
             cp_decoder: Optional[checkpoint_decoder.CheckpointDecoder] = None) -> None:
    self.output_folder = output_folder
    # stores dicts of metric names & values for each checkpoint
    self.metrics = []  # type: List[Dict]
    self.metrics_filename = os.path.join(output_folder, C.METRICS_NAME)
    self.best_checkpoint = 0
    self.start_tic = time.time()
    self.summary_writer = None
    if use_tensorboard:
        import tensorboard  # pylint: disable=import-error
        log_dir = os.path.join(output_folder, C.TENSORBOARD_NAME)
        if os.path.exists(log_dir):
            logger.info("Deleting existing tensorboard log dir %s", log_dir)
            shutil.rmtree(log_dir)
        logger.info("Logging training events for Tensorboard at '%s'", log_dir)
        self.summary_writer = tensorboard.FileWriter(log_dir)
    self.cp_decoder = cp_decoder
    self.ctx = mp.get_context('spawn')  # type: ignore
    self.decoder_metric_queue = self.ctx.Queue()
    self.decoder_process = None  # type: Optional[mp.Process]
    # TODO(fhieber): MXNet Speedometer uses root logger. How to fix this?
    self.speedometer = mx.callback.Speedometer(batch_size=batch_size,
                                               frequent=C.MEASURE_SPEED_EVERY,
                                               auto_reset=False)
    utils.check_condition(optimized_metric in C.METRICS, "Unsupported metric: %s" % optimized_metric)
    if optimized_metric == C.BLEU:
        utils.check_condition(self.cp_decoder is not None, "%s requires CheckpointDecoder" % C.BLEU)
    self.optimized_metric = optimized_metric
    self.validation_best = C.METRIC_WORST[self.optimized_metric]
    logger.info("Early stopping by optimizing '%s'", self.optimized_metric)
    self.tic = 0
def __init__(self,
             prefix: str,
             depth_att: int = 512,
             heads: int = 8,
             depth_out: int = 512,
             dropout: float = 0.0) -> None:
    self.prefix = prefix
    utils.check_condition(depth_att % heads == 0,
                          "Number of heads (%d) must divide attention depth (%d)" % (heads, depth_att))
    self.depth = depth_att
    self.heads = heads
    self.depth_out = depth_out
    self.dropout = dropout
    self.depth_per_head = self.depth // self.heads
    self.w_h2o = mx.sym.Variable("%sh2o_weight" % prefix)
    self.b_h2o = mx.sym.Variable("%sh2o_bias" % prefix)
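
# Head-split arithmetic sketch (plain Python, hypothetical values): the attention depth is
# divided evenly across heads, e.g. 512 dimensions and 8 heads give 64 dimensions per head,
# which is why the check above requires heads to divide depth_att.
def _example_depth_per_head() -> None:
    depth_att, heads = 512, 8
    assert depth_att % heads == 0
    assert depth_att // heads == 64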