Example #1
    def _decode(self, tokens, encoder_outs):
        # wrap in Variable
        tokens = utils.volatile_variable(tokens)

        avg_probs = None
        avg_attn = None
        for model, encoder_out in zip(self.models, encoder_outs):
            with utils.maybe_no_grad():
                decoder_out, attn = model.decoder(tokens, encoder_out)
            probs = model.get_normalized_probs(decoder_out[:, -1, :],
                                               log_probs=False).data
            if avg_probs is None:
                avg_probs = probs
            else:
                avg_probs.add_(probs)
            if attn is not None:
                attn = attn[:, -1, :].data
                if avg_attn is None:
                    avg_attn = attn
                else:
                    avg_attn.add_(attn)
        avg_probs.div_(len(self.models))
        avg_probs.log_()
        if avg_attn is not None:
            avg_attn.div_(len(self.models))

        return avg_probs, avg_attn
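The ensemble step above accumulates each model's normalized probabilities in place, divides by the number of models, and only then takes the log. A minimal standalone sketch of that averaging (plain PyTorch, with a hypothetical probs_per_model list instead of the fairseq decoder loop):

import torch

def average_ensemble_probs(probs_per_model):
    # Accumulate probabilities in place, then average and switch to log space,
    # mirroring the add_/div_/log_ sequence in _decode above.
    avg = None
    for probs in probs_per_model:
        avg = probs.clone() if avg is None else avg.add_(probs)
    avg.div_(len(probs_per_model))
    return avg.log_()

# toy usage: two "models", one sentence, vocabulary of three tokens
p1 = torch.tensor([[0.7, 0.2, 0.1]])
p2 = torch.tensor([[0.5, 0.3, 0.2]])
print(average_ensemble_probs([p1, p2]))  # log of [[0.6, 0.25, 0.15]]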
Example #2
    def generate_batched_itr(self, data_itr, beam_size=None, maxlen_a=0.0, maxlen_b=None,
                             cuda=False, timer=None, prefix_size=0):
        """Iterate over a batched dataset and yield individual translations.

        Args:
            maxlen_a/b: generate sequences of maximum length ax + b,
                where x is the source sentence length.
            cuda: use GPU for generation
            timer: StopwatchMeter for timing generations.
        """
        if maxlen_b is None:
            maxlen_b = self.maxlen

        for sample in data_itr:
            s = utils.make_variable(sample, volatile=True, cuda=cuda)
            input = s['net_input']
            srclen = input['src_tokens'].size(1)
            if timer is not None:
                timer.start()
            with utils.maybe_no_grad():
                hypos = self.generate(
                    input['src_tokens'],
                    input['src_lengths'],
                    beam_size=beam_size,
                    maxlen=int(maxlen_a*srclen + maxlen_b),
                    prefix_tokens=s['target'][:, :prefix_size] if prefix_size > 0 else None,
                )
            if timer is not None:
                timer.stop(sum([len(h[0]['tokens']) for h in hypos]))
            for i, id in enumerate(s['id'].data):
                src = input['src_tokens'].data[i, :]
                # remove padding from ref
                ref = utils.strip_pad(s['target'].data[i, :], self.pad) if s['target'] is not None else None
                yield id, src, ref, hypos[i]
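The maximum target length is the docstring's ax + b, computed as int(maxlen_a * srclen + maxlen_b), with maxlen_b falling back to self.maxlen. A quick numeric check (the argument values here are made up):

def max_target_len(srclen, maxlen_a=0.0, maxlen_b=200):
    # mirrors int(maxlen_a * srclen + maxlen_b) in generate_batched_itr
    return int(maxlen_a * srclen + maxlen_b)

assert max_target_len(20, maxlen_a=1.5, maxlen_b=5) == 35
assert max_target_len(20) == 200  # maxlen_a=0 keeps a source-independent cap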
Example #3

    def forward(self, input, incremental_state=None):
        """
        Input: Time x Batch x Channel during training;
            Batch x Time x Channel during incremental inference.
        Args:
            incremental_state: Used to buffer signal; if not None, then input is
                expected to contain a single frame. If the input order changes
                between time steps, call reorder_incremental_state.
        """
        if incremental_state is None:
            return super().forward(input)

        # reshape weight
        weight = self._get_linearized_weight()
        kw = self.kernel_size[0]

        bsz = input.size(0)  # input: bsz x len x dim
        if kw > 1:
            input = input.data
            input_buffer = self._get_input_buffer(incremental_state)
            if input_buffer is None:
                input_buffer = input.new(bsz, kw, input.size(2)).zero_()
                self._set_input_buffer(incremental_state, input_buffer)
            else:
                # shift buffer
                input_buffer[:, :-1, :] = input_buffer[:, 1:, :].clone()
            # append next input
            input_buffer[:, -1, :] = input[:, -1, :]
            input = utils.volatile_variable(input_buffer)
        with utils.maybe_no_grad():
            output = F.linear(input.view(bsz, -1), weight, self.bias)
        return output.view(bsz, 1, -1)
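During incremental inference the convolution keeps a rolling window of the last kw input frames: the buffer is shifted left by one position and the newest frame is written into the last slot, so the linearized weight always multiplies a fixed-size window. A standalone sketch of that buffer update (plain PyTorch, no incremental_state dictionary):

import torch

def shift_and_append(buffer, frame):
    # buffer: (bsz, kw, dim) rolling window; frame: (bsz, dim) newest input step
    buffer[:, :-1, :] = buffer[:, 1:, :].clone()  # drop the oldest frame
    buffer[:, -1, :] = frame                      # append the newest frame
    return buffer

bsz, kw, dim = 2, 3, 4
buf = torch.zeros(bsz, kw, dim)
for t in range(5):
    buf = shift_and_append(buf, torch.full((bsz, dim), float(t)))
# buf now holds the frames for t = 2, 3, 4 along the kw dimension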
Example #4

    def _decode(self, tokens, encoder_outs, incremental_states):
        # wrap in Variable
        tokens = utils.volatile_variable(tokens)

        avg_probs = None
        avg_attn = None
        for model, encoder_out in zip(self.models, encoder_outs):
            with utils.maybe_no_grad():
                if incremental_states[model] is not None:
                    decoder_out = list(
                        model.decoder(tokens, encoder_out,
                                      incremental_states[model]))
                else:
                    decoder_out = list(model.decoder(tokens, encoder_out))
                decoder_out[0] = decoder_out[0][:, -1, :]
                attn = decoder_out[1]
            probs = model.get_normalized_probs(decoder_out,
                                               log_probs=False).data
            if avg_probs is None:
                avg_probs = probs
            else:
                avg_probs.add_(probs)
            if attn is not None:
                attn = attn[:, -1, :].data
                if avg_attn is None:
                    avg_attn = attn
                else:
                    avg_attn.add_(attn)
        avg_probs.div_(len(self.models))
        avg_probs.log_()
        if avg_attn is not None:
            avg_attn.div_(len(self.models))

        return avg_probs, avg_attn
Example #5
    def _decode(self, tokens, encoder_outs, src_doctopic_reshaped, incremental_states):
        # print(tokens, encoder_outs, src_doctopic_reshaped.size(), incremental_states)
        # wrap in Variable
        tokens = utils.volatile_variable(tokens)

        avg_probs = None
        avg_attn = None
        for model, encoder_out in zip(self.models, encoder_outs):
            with utils.maybe_no_grad():
                decoder_out, attn = model.decoder(
                    tokens, encoder_out, src_doctopic_reshaped,
                    incremental_states[model])
            probs = model.get_normalized_probs(decoder_out[:, -1, :],
                                               log_probs=False).data
            if avg_probs is None:
                avg_probs = probs
            else:
                avg_probs.add_(probs)
            if attn is not None:
                attn = attn[:, -1, :].data
                if avg_attn is None:
                    avg_attn = attn
                else:
                    avg_attn.add_(attn)
        avg_probs.div_(len(self.models))
        avg_probs.log_()
        if avg_attn is not None:
            avg_attn.div_(len(self.models))

        return avg_probs, avg_attn
Example #6

    def _async_forward(self, rank, device_id, eval=False):
        if eval:
            self.model.eval()
        else:
            self.model.train()
            self.optimizer.zero_grad()

        with utils.maybe_no_grad(eval):
            sample_size, logging_output, oom = 0, {}, False
            if self._sample is not None:
                try:
                    # calculate loss and sample size
                    self.loss, sample_size, logging_output = self.criterion(
                        self.model, self._sample)
                except RuntimeError as e:
                    if not eval and 'out of memory' in str(e):
                        print(
                            '| WARNING: ran out of memory on GPU #{}, skipping batch'
                            .format(device_id))
                        oom = True
                        self.loss = None
                        if hasattr(torch.cuda, 'empty_cache'):
                            torch.cuda.empty_cache()
                    else:
                        raise e

        return sample_size, logging_output, oom
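The try/except above is the usual "skip the batch on CUDA OOM" guard: only a RuntimeError whose message contains 'out of memory' is swallowed, and only during training; the cached allocator is flushed and every other error is re-raised. A hedged standalone version of the same guard (compute_loss is a hypothetical callable):

import torch

def forward_or_skip(compute_loss, training=True):
    # Returns (loss, oom). On a CUDA out-of-memory error during training,
    # free cached memory and report the batch as skipped instead of crashing.
    try:
        return compute_loss(), False
    except RuntimeError as e:
        if training and 'out of memory' in str(e):
            print('| WARNING: ran out of memory, skipping batch')
            if hasattr(torch.cuda, 'empty_cache'):
                torch.cuda.empty_cache()
            return None, True
        raise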
Example #7
    def generate(
        self, src_tokens, src_lengths, beam_size=None, maxlen=None, prefix_tokens=None
    ):
        """Generate a batch of translations."""
        with utils.maybe_no_grad():
            return self._generate(
                src_tokens, src_lengths, beam_size, maxlen, prefix_tokens
            )
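utils.maybe_no_grad appears in every example here; in this era of fairseq it was a small compatibility shim: use torch.no_grad() when it exists (PyTorch >= 0.4) and the condition holds, otherwise do nothing, since older versions relied on volatile Variables instead. A plausible reimplementation (a sketch, not necessarily the exact fairseq code):

import contextlib
import torch

@contextlib.contextmanager
def maybe_no_grad(condition=True):
    # Disable gradient tracking when supported and requested; otherwise no-op.
    if condition and hasattr(torch, 'no_grad'):
        with torch.no_grad():
            yield
    else:
        yield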
Example #8
    def generate_batched_itr(
        self,
        data_itr,
        beam_size=None,
        maxlen_a=0.0,
        maxlen_b=None,
        cuda=False,
        timer=None,
        prefix_size=0,
    ):
        """Iterate over a batched dataset and yield individual translations.

        Args:
            maxlen_a/b: generate sequences of maximum length ax + b,
                where x is the source sentence length.
            cuda: use GPU for generation
            timer: StopwatchMeter for timing generations.
        """
        if maxlen_b is None:
            maxlen_b = self.maxlen

        for sample in data_itr:
            s = utils.make_variable(sample, volatile=True, cuda=cuda)
            input = s["net_input"]
            # Take the max source length to compute the max target length
            srclen = input["src_tokens"].size(1)
            # FIXME: handle characters properly
            if self.use_char_source:
                raise ValueError(
                    "Character level encoder is not supported yet for "
                    "multisource sentences."
                )
            encoder_inputs = (input["src_tokens"], input["src_lengths"])
            if timer is not None:
                timer.start()
            with utils.maybe_no_grad():
                hypos = self.generate(
                    encoder_inputs,
                    srcs_ids=input["src_ids"],
                    beam_size=beam_size,
                    maxlen=int(maxlen_a * srclen + maxlen_b),
                    prefix_tokens=s["target"][:, :prefix_size]
                    if prefix_size > 0
                    else None,
                )
            if timer is not None:
                timer.stop(s["ntokens"])
            for i, id in enumerate(s["id"]):
                src = input["src_tokens"].index_select(
                    0,
                    input["src_ids"][self.align_to]
                )
                # remove padding from ref
                ref = utils.strip_pad(s["target"][i, :], self.pad)
                yield id, src, ref, hypos[i]
Example #9
    def generate_batched_itr(
        self,
        data_itr,
        beam_size=None,
        maxlen_a=0.0,
        maxlen_b=None,
        cuda=False,
        timer=None,
        prefix_size=0,
    ):
        """Iterate over a batched dataset and yield individual translations.

        Args:
            maxlen_a/b: generate sequences of maximum length ax + b,
                where x is the source sentence length.
            cuda: use GPU for generation
            timer: StopwatchMeter for timing generations.
        """
        if maxlen_b is None:
            maxlen_b = self.maxlen

        for sample in data_itr:
            s = utils.make_variable(sample, volatile=True, cuda=cuda)
            input = s["net_input"]
            srclen = input["src_tokens"].size(1)
            if self.use_char_source:
                encoder_input = (
                    input["src_tokens"],
                    input["src_lengths"],
                    input["char_inds"],
                    input["word_lengths"],
                )
            else:
                encoder_input = (input["src_tokens"], input["src_lengths"])
            if timer is not None:
                timer.start()
            with utils.maybe_no_grad():
                hypos = self.generate(
                    encoder_input,
                    beam_size=beam_size,
                    maxlen=int(maxlen_a * srclen + maxlen_b),
                    prefix_tokens=s["target"][:, :prefix_size]
                    if prefix_size > 0
                    else None,
                )
            if timer is not None:
                timer.stop(s["ntokens"])
            for i, id in enumerate(s["id"].data):
                src = input["src_tokens"].data[i, :]
                # remove padding from ref
                ref = utils.strip_pad(s["target"].data[i, :], self.pad)
                yield id, src, ref, hypos[i]
Example #10
    def generate(
        self,
        encoder_inputs,
        srcs_ids,
        beam_size=None,
        maxlen=None,
        prefix_tokens=None,
        src_weights=None,
    ):
        """Generate a batch of translations."""
        with utils.maybe_no_grad():
            return self._generate(
                encoder_inputs, srcs_ids, beam_size, maxlen, prefix_tokens, src_weights
            )
Example #11
    def _forward(self, sample, eval=False):
        # prepare model and optimizer
        if eval:
            self.model.eval()
        else:
            self.model.train()
            self.optimizer.zero_grad()

        loss = None
        sample_size = 0
        logging_output = {
            'ntokens': sample['ntokens'] if sample is not None else 0,
            'nsentences': sample['target'].size(0) if sample is not None else 0,
        }
        oom = 0
        if sample is not None:
            try:
                with utils.maybe_no_grad(eval):
                    # calculate loss and sample size
                    loss, sample_size, logging_output_ = self.criterion(
                        self.model, sample)
                    logging_output.update(logging_output_)
            except RuntimeError as e:
                if not eval and 'out of memory' in str(e):
                    print('| WARNING: ran out of memory, skipping batch')
                    oom = 1
                    loss = None
                    if hasattr(torch.cuda, 'empty_cache'):
                        torch.cuda.empty_cache()
                else:
                    raise e

        # synchronize logging outputs for multi-GPU training
        if self.args.distributed_world_size > 1:
            sample_sizes, logging_outputs, ooms = zip(*list(
                distributed_utils.all_gather_list((sample_size, logging_output,
                                                   oom))))
            ooms = sum(ooms)
        else:
            sample_sizes = [sample_size]
            logging_outputs = [logging_output]
            ooms = oom

        return loss, sample_sizes, logging_outputs, ooms
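After the all-gather, every worker holds one (sample_size, logging_output, oom) tuple per GPU; zip(*...) transposes that list into the three sequences the caller then reduces. A tiny illustration of the transpose-and-reduce step with made-up values:

# one tuple per worker, as all_gather_list would return (values are made up)
gathered = [
    (512, {'ntokens': 512, 'loss': 1.9}, 0),
    (480, {'ntokens': 480, 'loss': 2.1}, 1),
]
sample_sizes, logging_outputs, ooms = zip(*gathered)
assert sample_sizes == (512, 480)
assert sum(ooms) == 1  # one worker skipped its batch on OOM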
Example #12
    def _decode(self, tokens, encoder_outs, incremental_states, n_srcs=1):
        # wrap in Variable
        tokens = utils.volatile_variable(tokens)

        # Source sentences are weighted equally (for now)
        srcs_weights = [1 / n_srcs] * n_srcs

        avg_probs = None
        avg_attn = None
        for src_id, src_weight in enumerate(srcs_weights):
            for model_id, (model_weight, model) in enumerate(
                zip(self.model_weights, self.models)
            ):
                with utils.maybe_no_grad():
                    encoder_out = encoder_outs[src_id][model_id]
                    incremental_state = incremental_states[(src_id, model_id)]
                    decoder_out = list(
                        model.decoder(tokens, encoder_out, incremental_state)
                    )
                    decoder_out[0] = decoder_out[0][:, -1, :]
                    attn = decoder_out[1]
                    if len(decoder_out) == 3:
                        possible_translation_tokens = decoder_out[2]
                    else:
                        possible_translation_tokens = None
                probs = (
                    src_weight
                    * model_weight
                    * model.get_normalized_probs(decoder_out, log_probs=False)
                )
                if avg_probs is None:
                    avg_probs = probs
                else:
                    avg_probs.add_(probs)
                if attn is not None and src_id == self.align_to:
                    attn = attn[:, -1, :]
                    if avg_attn is None:
                        avg_attn = attn
                    else:
                        avg_attn.add_(attn)
        avg_probs.log_()
        if avg_attn is not None:
            avg_attn.div_(len(self.models))

        return avg_probs, avg_attn, possible_translation_tokens
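There is no final div_() here because the weights already normalize the sum: with src_weight = 1 / n_srcs for every source, the double loop contributes sum over (src, model) of src_weight * model_weight, which equals 1 whenever the model weights themselves sum to 1. A quick numeric check (weight values are made up):

n_srcs = 2
model_weights = [0.7, 0.3]           # assumed to sum to 1
src_weights = [1 / n_srcs] * n_srcs
total = sum(sw * mw for sw in src_weights for mw in model_weights)
assert abs(total - 1.0) < 1e-9       # every (src, model) pair is covered exactly once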
Example #13

    def score(self, sample):
        """Score a batch of translations."""
        net_input = sample['net_input']

        # compute scores for each model in the ensemble
        avg_probs = None
        avg_attn = None
        for model in self.models:
            with utils.maybe_no_grad():
                model.eval()
                encoder_out = model.encoder(
                    net_input['src_tokens'],
                    net_input['src_lengths'],
                )
                decoder_out = model.decoder(
                    net_input['prev_output_tokens'],
                    encoder_out,
                )
                attn = decoder_out[1]
            probs = model.get_normalized_probs(decoder_out,
                                               log_probs=False).data
            if avg_probs is None:
                avg_probs = probs
            else:
                avg_probs.add_(probs)
            if attn is not None:
                attn = attn.data
                if avg_attn is None:
                    avg_attn = attn
                else:
                    avg_attn.add_(attn)
        avg_probs.div_(len(self.models))
        avg_probs.log_()
        if avg_attn is not None:
            avg_attn.div_(len(self.models))
        avg_probs = avg_probs.gather(
            dim=2,
            index=sample['target'].data.unsqueeze(-1),
        )
        return avg_probs.squeeze(2), avg_attn
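The closing gather picks, for every target position, the averaged probability of the reference token: avg_probs is (batch, tgt_len, vocab) and the target indices are unsqueezed to (batch, tgt_len, 1) so that gather along dim=2 returns one value per position. A minimal sketch of that indexing with stand-in tensors:

import torch

bsz, tgt_len, vocab = 2, 3, 5
avg_probs = torch.rand(bsz, tgt_len, vocab).log_()  # stand-in log-probabilities
target = torch.randint(0, vocab, (bsz, tgt_len))    # stand-in reference tokens
scores = avg_probs.gather(dim=2, index=target.unsqueeze(-1)).squeeze(2)
print(scores.shape)  # torch.Size([2, 3]): one score per target position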
Example #14
    def generate_batched_itr(self,
                             data_itr,
                             beam_size=None,
                             maxlen_a=0.0,
                             maxlen_b=None,
                             cuda_device=None,
                             timer=None):
        """Iterate over a batched dataset and yield individual translations.

        Args:
            maxlen_a/b: generate sequences of maximum length ax + b,
                where x is the source sentence length.
            cuda_device: GPU on which to do generation.
            timer: StopwatchMeter for timing generations.
        """
        if maxlen_b is None:
            maxlen_b = self.maxlen

        for sample in data_itr:
            s = utils.make_variable(sample,
                                    volatile=True,
                                    cuda_device=cuda_device)
            input = s['net_input']
            srclen = input['src_tokens'].size(1)
            if timer is not None:
                timer.start()
            with utils.maybe_no_grad():
                hypos = self.generate(input['src_tokens'],
                                      beam_size=beam_size,
                                      maxlen=int(maxlen_a * srclen + maxlen_b))
            if timer is not None:
                timer.stop(s['ntokens'])
            for i, id in enumerate(s['id'].data):
                src = input['src_tokens'].data[i, :]
                # remove padding from ref
                ref = utils.strip_pad(s['target'].data[i, :], self.pad)
                yield id, src, ref, hypos[i]
Example #15
    def _decode(self, tokens, encoder_outs, incremental_states):
        # wrap in Variable
        tokens = utils.volatile_variable(tokens)

        avg_probs = None
        avg_attn = None
        for model_weight, model, encoder_out in zip(
            self.model_weights, self.models, encoder_outs
        ):
            with utils.maybe_no_grad():
                decoder_out = list(
                    model.decoder(tokens, encoder_out, incremental_states[model])
                )
                decoder_out[0] = decoder_out[0][:, -1, :]
                attn = decoder_out[1]
                if len(decoder_out) == 3:
                    possible_translation_tokens = decoder_out[2]
                else:
                    possible_translation_tokens = None
            probs = model_weight * model.get_normalized_probs(
                decoder_out, log_probs=False
            )
            if avg_probs is None:
                avg_probs = probs
            else:
                avg_probs.add_(probs)
            if attn is not None:
                attn = attn[:, -1, :]
                if avg_attn is None:
                    avg_attn = attn
                else:
                    avg_attn.add_(attn)
        avg_probs.log_()
        if avg_attn is not None:
            avg_attn.div_(len(self.models))

        return avg_probs, avg_attn, possible_translation_tokens
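When the decoder also returns possible_translation_tokens (a reduced output vocabulary), token indices produced during search refer to positions inside that tensor, and mapping them back to full-vocabulary ids is a plain index lookup. A hedged sketch of that mapping, with made-up ids:

import torch

# hypothetical reduced vocabulary: position -> full-vocabulary token id
possible_translation_tokens = torch.tensor([4, 17, 256, 1023])
best_positions = torch.tensor([2, 0, 3])  # e.g. argmax indices over the reduced vocab
full_vocab_ids = possible_translation_tokens[best_positions]
print(full_vocab_ids)  # tensor([ 256,    4, 1023])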