Example #1
    def forward(self, src_tokens):
        # embed tokens and positions
        x = self.embed_tokens(src_tokens) + self.embed_positions(src_tokens)
        x = F.dropout(x, p=self.dropout, training=self.training)
        input_embedding = x

        # project to size of convolution
        x = self.fc1(x)

        # B x T x C -> T x B x C
        x = x.transpose(0, 1)

        # temporal convolutions
        for proj, conv in zip(self.projections, self.convolutions):
            residual = x if proj is None else proj(x)
            x = F.dropout(x, p=self.dropout, training=self.training)
            x = conv(x)
            x = F.glu(x, dim=2)
            x = (x + residual) * math.sqrt(0.5)

        # T x B x C -> B x T x C
        x = x.transpose(1, 0)

        # project back to size of embedding
        x = self.fc2(x)

        # scale gradients (this only affects backward, not forward)
        x = GradMultiply.apply(x, 1.0 / (2.0 * self.num_attention_layers))

        # add output to input embedding for attention
        y = (x + input_embedding) * math.sqrt(0.5)

        return x, y
Example #2
    def forward(self, src_tokens, src_lengths):
        # embed tokens and positions
        x = self.embed_tokens(src_tokens) + self.embed_positions(src_tokens)
        x = F.dropout(x, p=self.dropout, training=self.training)
        input_embedding = x.transpose(0, 1)

        # project to size of convolution
        x = self.fc1(x)

        encoder_padding_mask = src_tokens.eq(self.padding_idx).t()  # -> T x B
        if not encoder_padding_mask.any():
            encoder_padding_mask = None

        # B x T x C -> T x B x C
        x = x.transpose(0, 1)

        # temporal convolutions
        for proj, conv, attention in zip(self.projections, self.convolutions,
                                         self.attention):
            residual = x if proj is None else proj(x)

            if encoder_padding_mask is not None:
                x = x.masked_fill(encoder_padding_mask.unsqueeze(-1), 0)

            x = F.dropout(x, p=self.dropout, training=self.training)
            padding_l = (conv.kernel_size[0] - 1) // 2
            padding_r = conv.kernel_size[0] // 2
            x = F.pad(x, (0, 0, 0, 0, padding_l, padding_r))
            x = conv(x)
            x = F.glu(x, dim=2)
            if attention is not None:
                x = attention(x)
            x = (x + residual) * math.sqrt(0.5)

        # T x B x C -> B x T x C
        x = x.transpose(1, 0)

        # project back to size of embedding
        x = self.fc2(x)

        if encoder_padding_mask is not None:
            encoder_padding_mask = encoder_padding_mask.t()  # -> B x T
            x = x.masked_fill(encoder_padding_mask.unsqueeze(-1), 0)

        # scale gradients (this only affects backward, not forward)
        x = GradMultiply.apply(x, 1.0 / (2.0 * self.num_attention_layers))

        # add output to input embedding for attention
        y = (x + input_embedding.transpose(0, 1)) * math.sqrt(0.5)

        # transposing the output to T x B x C and saving to a file
        if self._encoder_states_dir:
            self._save_encoder_state(x.transpose(1, 0), "batch-%s.pt")

        return {
            'encoder_out': (x, y),
            'encoder_padding_mask': encoder_padding_mask,  # B x T
        }
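Both encoders above call GradMultiply.apply(x, 1.0 / (2.0 * num_attention_layers)) so that the gradient flowing back from several attention layers is scaled down without changing the forward output. A minimal sketch of such a gradient-scaling autograd function, assuming the usual fairseq-style behaviour (identity forward, scaled backward):

import torch


class GradMultiply(torch.autograd.Function):
    """Identity in the forward pass; multiplies the gradient by `scale` in backward."""

    @staticmethod
    def forward(ctx, x, scale):
        ctx.scale = scale
        return x.clone()

    @staticmethod
    def backward(ctx, grad_output):
        # scale the gradient w.r.t. x; `scale` itself receives no gradient
        return grad_output * ctx.scale, None

With that scale, each encoder state receives roughly the same gradient magnitude regardless of how many attention layers read it.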
Example #3
File: hubert.py Project: sdadas/fairseq
 def forward_features(self, source: torch.Tensor) -> torch.Tensor:
     if self.feature_grad_mult > 0:
         features = self.feature_extractor(source)
         if self.feature_grad_mult != 1.0:
             features = GradMultiply.apply(features, self.feature_grad_mult)
     else:
         with torch.no_grad():
             features = self.feature_extractor(source)
     return features
Example #4
    def forward(self, source, padding_mask, **kwargs):
        features_only = kwargs.get("features_only", False)
        mask = kwargs.get("mask", True)

        if self.feature_grad_mult > 0:
            features = self.feature_extractor(source)
            if self.feature_grad_mult != 1.0:
                features = GradMultiply.apply(features, self.feature_grad_mult)
        else:
            with torch.no_grad():
                features = self.feature_extractor(source)

        features = features.transpose(1, 2)

        if padding_mask is not None:
            extra = padding_mask.size(1) % features.size(1)
            if extra > 0:
                padding_mask = padding_mask[:, :-extra]
            padding_mask = padding_mask.view(padding_mask.size(0),
                                             features.size(1), -1)
            padding_mask = padding_mask.all(-1)
            num_frames = (~padding_mask).float().sum()
            features_pen = (features.float().pow(2).sum(-1) *
                            (~padding_mask).float()).sum() / num_frames
        else:
            features_pen = features.float().pow(2).mean()

        if self.post_extract_proj is not None:
            features = self.post_extract_proj(features)

        features = self.dropout_input(features)

        if mask:
            x, mask_indices = self.apply_mask(features, padding_mask)
        else:
            x = features
            mask_indices = None

        x = self.encoder(x, padding_mask=padding_mask)

        if features_only:
            return {"x": x, "padding_mask": padding_mask}

        x = self.phone_proj(x)

        result = {
            "x": x,
            "padding_mask": padding_mask,
            "features_pen": features_pen
        }

        return result
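Example #4 shrinks the sample-level padding mask down to the frame rate of the convolutional feature extractor: samples that do not fill a whole frame are trimmed, and a frame counts as padding only if all of its samples are padding. A standalone sketch of that reduction with purely illustrative sizes:

import torch

B, T_in, T_out = 2, 16000, 49                  # illustrative: raw samples vs. extracted frames
padding_mask = torch.zeros(B, T_in, dtype=torch.bool)
padding_mask[1, 12000:] = True                 # second utterance is padded at the end

extra = padding_mask.size(1) % T_out           # samples that do not fill a whole frame
if extra > 0:
    padding_mask = padding_mask[:, :-extra]
frame_mask = padding_mask.view(B, T_out, -1).all(-1)
print(frame_mask.shape)                        # torch.Size([2, 49])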
Example #5
    def forward(self, guess_tokens, guess_lengths, marker):
        # embed tokens and positions
        #print(self.embed_positions(guess_tokens)[0,0])
        x = self.embed_tokens(guess_tokens) + self.embed_positions0(
            guess_tokens, marker=marker, mark=0) + self.embed_positions1(
                guess_tokens, marker=marker, mark=1)
        x = F.dropout(x, p=self.dropout, training=self.training)
        guess_embedding = x  # keep the input embedding for the attention shortcut below

        # project to size of convolution
        x = self.fc1(x)

        # B x T x C -> T x B x C
        x = x.transpose(0, 1)

        # temporal convolutions
        for proj, conv in zip(self.projections, self.convolutions):
            residual = x if proj is None else proj(x)
            x = F.dropout(x, p=self.dropout, training=self.training)
            if conv.kernel_size[0] % 2 == 1:
                # padding is implicit in the conv
                x = conv(x)
            else:
                padding_l = (conv.kernel_size[0] - 1) // 2
                padding_r = conv.kernel_size[0] // 2
                x = F.pad(x, (0, 0, 0, 0, padding_l, padding_r))
                x = conv(x)
            x = F.glu(x, dim=2)
            x = (x + residual) * math.sqrt(0.5)

        # T x B x C -> B x T x C
        x = x.transpose(1, 0)

        # project back to size of embedding
        x = self.fc2(x)

        # scale gradients (this only affects backward, not forward)
        x = GradMultiply.apply(x, 1.0 / (2.0 * self.num_attention_layers))

        # add output to input embedding for attention
        y = (x + guess_embedding) * math.sqrt(0.5)

        return x, y
Example #6
    def forward(self, source, padding_mask=None, mask=True, features_only=False, **kwargs):

        if self.feature_grad_mult > 0:
            features = self.feature_extractor(source)
            if self.feature_grad_mult != 1.0:
                features = GradMultiply.apply(features, self.feature_grad_mult)
        else:
            with torch.no_grad():
                features = self.feature_extractor(source)

        features_pen = features.float().pow(2).mean()

        features = features.transpose(1, 2)
        features = self.layer_norm(features)

        if padding_mask is not None:
            extra = padding_mask.size(1) % features.size(1)
            if extra > 0:
                padding_mask = padding_mask[:, :-extra]
            padding_mask = padding_mask.view(padding_mask.size(0), features.size(1), -1)
            padding_mask = padding_mask.all(-1)

        if self.post_extract_proj is not None:
            features = self.post_extract_proj(features)

        features = self.dropout_input(features)

        if mask:
            x, mask_indices = self.apply_mask(features, padding_mask)
        else:
            x = features
            mask_indices = None

        x = self.encoder(x, padding_mask=padding_mask) # B, T, V
        size = x.size()
        x = x[mask_indices] # B, N, V

        if features_only:
            return {"x": x, "padding_mask": padding_mask, "mask_indices": mask_indices}

        x = self.phone_proj(x) # B, N, V

        result = {"x": x, "padding_mask": padding_mask, "mask_indices": mask_indices, "features_pen": features_pen, "size": size}

        return result
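Example #6 keeps only the masked frames with x[mask_indices] and reshapes them back into a batch. Boolean indexing flattens the selected frames, so the subsequent view(B, -1, C) is only valid when every sample has the same number of masked positions. A small illustration with made-up sizes:

import torch

B, T, C = 2, 5, 3
x = torch.randn(B, T, C)
mask_indices = torch.zeros(B, T, dtype=torch.bool)
mask_indices[:, 1] = True
mask_indices[:, 3] = True                      # same number of masked frames per sample
gathered = x[mask_indices].view(B, -1, C)      # B x num_masked x C
print(gathered.shape)                          # torch.Size([2, 2, 3])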
Example #7
    def forward(self, src_tokens, src_lengths):
        # embed tokens and positions
        x = self.embed_tokens(src_tokens) + self.embed_positions(src_tokens)
        x = F.dropout(x, p=self.dropout, training=self.training)
        input_embedding = x.transpose(0, 1)

        # project to size of convolution
        x = self.fc1(x)

        # B x T x C -> T x B x C
        x = x.transpose(0, 1)

        # temporal convolutions
        for proj, conv, attention in zip(self.projections, self.convolutions,
                                         self.attention):
            residual = x if proj is None else proj(x)

            x = F.dropout(x, p=self.dropout, training=self.training)
            padding_l = (conv.kernel_size[0] - 1) // 2
            padding_r = conv.kernel_size[0] // 2
            x = F.pad(x, (0, 0, 0, 0, padding_l, padding_r))
            x = conv(x)
            x = F.glu(x, dim=2)
            if attention is not None:
                x = attention(x)
            x = (x + residual) * math.sqrt(0.5)

        # T x B x C -> B x T x C
        x = x.transpose(1, 0)

        # project back to size of embedding
        x = self.fc2(x)

        # scale gradients (this only affects backward, not forward)
        x = GradMultiply.apply(x, 1.0 / (2.0 * self.num_attention_layers))

        # add output to input embedding for attention
        y = (x + input_embedding.transpose(0, 1)) * math.sqrt(0.5)

        return {
            'encoder_out': (x, y),
        }
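For even kernel sizes, Examples #2, #7 and #9 pad the time dimension asymmetrically so the convolution output keeps the input length. F.pad reads the pad tuple from the last dimension backwards, so (0, 0, 0, 0, padding_l, padding_r) pads only the leading T dimension of a T x B x C tensor. A quick check of the arithmetic with illustrative sizes:

import torch
import torch.nn.functional as F

T, B, C, k = 10, 2, 8, 4                       # even kernel size
x = torch.randn(T, B, C)
padding_l = (k - 1) // 2                       # 1
padding_r = k // 2                             # 2
x = F.pad(x, (0, 0, 0, 0, padding_l, padding_r))
print(x.shape)                                 # torch.Size([13, 2, 8]); a kernel of 4 then gives 10 output steps again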
Example #8
    def forward(self, src_tokens):
        positions = Variable(
            make_positions(src_tokens.data,
                           self.dictionary.pad(),
                           left_pad=LanguagePairDataset.LEFT_PAD_SOURCE)
        )  #left_pad=True  # position embedding

        # embed tokens and positions
        x = self.embed_tokens(src_tokens) + self.embed_positions(positions)
        x = F.dropout(x, p=self.dropout, training=self.training)
        input_embedding = x

        # project to size of convolution
        x = self.fc1(x)  # if embedding size does not equal with channel size

        # B x T x C -> T x B x C    C=Channel
        x = x.transpose(0, 1)

        # temporal convolutions
        for proj, conv in zip(self.projections, self.convolutions):
            residual = x if proj is None else proj(
                x)  # if dim of x does not equal with dim of conv output
            x = F.dropout(x, p=self.dropout, training=self.training)
            x = conv(x)
            x = F.glu(x, dim=-1)
            x = (x + residual) * math.sqrt(0.5)  # residual connection

        # T x B x C -> B x T x C
        x = x.transpose(1, 0)

        # project back to size of embedding
        x = self.fc2(x)  # encoder_out[0], embed_dim

        # scale gradients (this only affects backward, not forward)
        x = GradMultiply.apply(x, 1.0 / (2.0 * self.num_attention_layers))

        # add output to input embedding for attention
        y = (x + input_embedding) * math.sqrt(0.5)  # encoder_out[1]
        # z + e for conditional input c, scale by √0.5

        return x, y
Example #9
    def forward(self, src_tokens, src_lengths):
        # embed tokens and positions
        x = self.embed_tokens(src_tokens) + self.embed_positions(src_tokens)
        x = F.dropout(x, p=self.dropout, training=self.training)
        input_embedding = x.transpose(0, 1)

        # project to size of convolution
        x = self.fc1(x)

        # B x T x C -> T x B x C
        x = x.transpose(0, 1)

        # temporal convolutions
        for proj, conv, attention in zip(self.projections, self.convolutions, self.attention):
            residual = x if proj is None else proj(x)

            x = F.dropout(x, p=self.dropout, training=self.training)
            padding_l = (conv.kernel_size[0] - 1) // 2
            padding_r = conv.kernel_size[0] // 2
            x = F.pad(x, (0, 0, 0, 0, padding_l, padding_r))
            x = conv(x)
            x = F.glu(x, dim=2)
            if attention is not None:
                x = attention(x)
            x = (x + residual) * math.sqrt(0.5)

        # T x B x C -> B x T x C
        x = x.transpose(1, 0)

        # project back to size of embedding
        x = self.fc2(x)

        # scale gradients (this only affects backward, not forward)
        x = GradMultiply.apply(x, 1.0 / (2.0 * self.num_attention_layers))

        # add output to input embedding for attention
        y = (x + input_embedding.transpose(0, 1)) * math.sqrt(0.5)

        return {
            'encoder_out': (x, y),
        }
Example #10
File: fconv.py Project: ahiroto/ParlAI
    def forward(self, src_tokens):
        positions = Variable(make_positions(src_tokens.data, self.dictionary.pad(),
                                            left_pad=LanguagePairDataset.LEFT_PAD_SOURCE))

        # embed tokens and positions
        x = self.embed_tokens(src_tokens) + self.embed_positions(positions)
        x = F.dropout(x, p=self.dropout, training=self.training)
        input_embedding = x

        # project to size of convolution
        x = self.fc1(x)

        # B x T x C -> T x B x C
        x = x.transpose(0, 1)

        # temporal convolutions
        for proj, conv in zip(self.projections, self.convolutions):
            residual = x if proj is None else proj(x)
            x = F.dropout(x, p=self.dropout, training=self.training)
            x = conv(x)
            x = F.glu(x, dim=-1)
            x = (x + residual) * math.sqrt(0.5)

        # T x B x C -> B x T x C
        x = x.transpose(1, 0)

        # project back to size of embedding
        x = self.fc2(x)

        # scale gradients (this only affects backward, not forward)
        x = GradMultiply.apply(x, 1.0 / (2.0 * self.num_attention_layers))

        # add output to input embedding for attention
        y = (x + input_embedding) * math.sqrt(0.5)

        return x, y
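Examples #8 and #10 build explicit position indices with make_positions before looking up the position embeddings. The sketch below is a hypothetical, simplified re-implementation for illustration only (it ignores the left_pad handling of the real helper): non-pad tokens get consecutive positions starting at padding_idx + 1, while pad tokens keep padding_idx.

import torch

def make_positions_sketch(tokens, padding_idx):
    # simplified sketch, right-padding only
    mask = tokens.ne(padding_idx).long()
    return torch.cumsum(mask, dim=1) * mask + padding_idx

tokens = torch.tensor([[5, 6, 7, 1, 1]])       # 1 is the pad index here
print(make_positions_sketch(tokens, padding_idx=1))   # tensor([[2, 3, 4, 1, 1]])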
Example #11
    def forward(self,
                source,
                padding_mask=None,
                mask=True,
                features_only=False,
                alignments=None):
        if self.feature_grad_mult > 0:
            features = self.feature_extractor(source)
            if self.feature_grad_mult != 1.0:
                features = GradMultiply.apply(features, self.feature_grad_mult)
        else:
            with torch.no_grad():
                features = self.feature_extractor(source)

        compute_alignment_metrics = self.cfg.compute_alignment_metrics and alignments is not None

        features_pen = features.float().pow(2).mean()

        features = features.transpose(1, 2)
        features = self.layer_norm(features)
        unmasked_features = features.clone()

        if padding_mask is not None:
            assert padding_mask.size(1) == 1
            padding_mask = padding_mask.squeeze(1)
            scale = padding_mask.size(1) // features.size(1)
            extra = padding_mask.size(1) % features.size(
                1
            )  # should be 0 since 1st CNN reduces number of features [scale] times (due to the architecture choice)
            assert extra == 0
            padding_mask = padding_mask[:, ::scale]
            assert np.all(padding_mask.shape == features.shape[:-1])
            if compute_alignment_metrics:
                alignments = alignments[:, ::scale]

        if self.post_extract_proj is not None:
            features = self.post_extract_proj(features)

        features = self.dropout_input(features)
        unmasked_features = self.dropout_features(unmasked_features)

        num_vars = None
        code_ppl = None
        prob_ppl = None
        curr_temp = None

        if self.input_quantizer:
            q = self.input_quantizer(features, produce_targets=False)
            features = q["x"]
            num_vars = q["num_vars"]
            code_ppl = q["code_perplexity"]
            prob_ppl = q["prob_perplexity"]
            curr_temp = q["temp"]
            features = self.project_inp(features)

        # [!] careful: renaming this function's padding_mask argument tends to break the code,
        #     because of indirect dependencies through passthrough **kwargs further up the call chain

        if mask:
            x, mask_indices = self.apply_mask(features, padding_mask)
            if mask_indices is not None:
                y = unmasked_features[mask_indices].view(
                    unmasked_features.size(0), -1, unmasked_features.size(-1))
                if compute_alignment_metrics:
                    alignments = alignments[mask_indices]
            else:
                y = unmasked_features
        else:
            x = features
            y = unmasked_features
            mask_indices = None

        x = self.encoder(x, padding_mask=padding_mask)

        if features_only:
            return {"x": x, "padding_mask": padding_mask}

        if self.quantizer:
            q = self.quantizer(y, produce_targets=compute_alignment_metrics)
            y = q["x"]
            num_vars = q["num_vars"]
            code_ppl = q["code_perplexity"]
            prob_ppl = q["prob_perplexity"]
            curr_temp = q["temp"]

            y = self.project_q(y)

            if self.negatives_from_everywhere:
                neg_cands, *_ = self.quantizer(unmasked_features,
                                               produce_targets=False)
                negs, _ = self.sample_negatives(neg_cands, y.size(1))
                negs = self.project_q(negs)

            else:
                negs, _ = self.sample_negatives(y, y.size(1))

            if self.codebook_negatives > 0:
                cb_negs = self.quantizer.sample_from_codebook(
                    y.size(0) * y.size(1), self.codebook_negatives)
                cb_negs = cb_negs.view(self.codebook_negatives, y.size(0),
                                       y.size(1), -1)  # order doesn't matter
                cb_negs = self.project_q(cb_negs)
                negs = torch.cat([negs, cb_negs], dim=0)
        else:
            y = self.project_q(y)

            if self.negatives_from_everywhere:
                negs, _ = self.sample_negatives(unmasked_features, y.size(1))
                negs = self.project_q(negs)
            else:
                negs, _ = self.sample_negatives(y, y.size(1))

        x = x[mask_indices].view(x.size(0), -1, x.size(-1))

        if self.target_glu:
            y = self.target_glu(y)
            negs = self.target_glu(negs)

        x = self.final_proj(x)
        x = self.compute_preds(x, y, negs)

        result = {
            "x": x,
            "padding_mask": padding_mask,
            "features_pen": features_pen
        }

        if prob_ppl is not None:
            result["prob_perplexity"] = prob_ppl
            result["code_perplexity"] = code_ppl
            result["num_vars"] = num_vars
            result["temp"] = curr_temp

        if compute_alignment_metrics:
            result = {
                **result,
                **self.get_alignment_metrics(q["targets"], alignments)
            }

        return result
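Example #11 ends by scoring the projected masked outputs against the quantized positive and the sampled negatives with compute_preds. The following is a rough sketch of what such a contrastive scoring step can look like; the cosine similarity, the logit_temp value and the tensor layout are assumptions for illustration, not taken from the code above:

import torch

def compute_preds_sketch(x, y, negs, logit_temp=0.1):
    # x:    B x T x C  predictions at the masked positions
    # y:    B x T x C  positive (quantized) targets
    # negs: K x B x T x C  sampled negatives
    targets = torch.cat([y.unsqueeze(0), negs], dim=0)           # (1 + K) x B x T x C
    logits = torch.cosine_similarity(x.float(), targets.float(), dim=-1)
    return logits / logit_temp                                   # (1 + K) x B x T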
Example #12
    def forward(self, src_tokens, src_lengths=None):
        """
        Args:
            src_tokens (LongTensor): tokens in the source language of shape
                `(batch, src_len)`
            src_lengths (LongTensor): lengths of each source sentence of shape
                `(batch)`

        Returns:
            dict:
                - **encoder_out** (tuple): a tuple with two elements, where the
                  first element is the last encoder layer's output and the
                  second element is the same quantity summed with the input
                  embedding (used for attention). The shape of both tensors is
                  `(batch, src_len, embed_dim)`.
                - **encoder_padding_mask** (ByteTensor): the positions of
                  padding elements of shape `(batch, src_len)`
        """
        src_lengths = (src_tokens.ne(self.pad)).long().sum(dim=1)

        # embed tokens and positions
        x = self.embed_tokens(src_tokens) + self.embed_positions(src_tokens)
        x = F.dropout(x, p=self.dropout, training=self.training)
        input_embedding = x

        # project to size of convolution
        x = self.fc1(x)

        # used to mask padding in input
        encoder_padding_mask = src_tokens.eq(self.padding_idx).t()  # -> T x B
        if not encoder_padding_mask.any():
            encoder_padding_mask = None

        # B x T x C -> T x B x C
        x = x.transpose(0, 1)

        residuals = [x]
        # temporal convolutions
        for proj, conv, res_layer in zip(self.projections, self.convolutions,
                                         self.residuals):
            if res_layer > 0:
                residual = residuals[-res_layer]
                residual = residual if proj is None else proj(residual)
            else:
                residual = None

            if encoder_padding_mask is not None:
                x = x.masked_fill(encoder_padding_mask.unsqueeze(-1), 0)

            x = F.dropout(x, p=self.dropout, training=self.training)
            if conv.kernel_size[0] % 2 == 1:
                # padding is implicit in the conv
                x = conv(x)
            else:
                padding_l = (conv.kernel_size[0] - 1) // 2
                padding_r = conv.kernel_size[0] // 2
                x = F.pad(x, (0, 0, 0, 0, padding_l, padding_r))
                x = conv(x)
            x = F.glu(x, dim=2)

            if residual is not None:
                x = (x + residual) * math.sqrt(0.5)
            residuals.append(x)

        # T x B x C -> B x T x C
        x = x.transpose(1, 0)

        # project back to size of embedding
        x = self.fc2(x)

        if encoder_padding_mask is not None:
            encoder_padding_mask = encoder_padding_mask.t()  # -> B x T
            x = x.masked_fill(encoder_padding_mask.unsqueeze(-1), 0)

        # scale gradients (this only affects backward, not forward)
        x = GradMultiply.apply(x, 1.0 / (2.0 * self.num_attention_layers))

        # add output to input embedding for attention
        y = (x + input_embedding) * math.sqrt(0.5)

        return {
            'encoder_out': (x, y),
            'encoder_padding_mask': encoder_padding_mask,  # B x T
        }
Example #13
    def forward(
        self,
        source,
        padding_mask=None,
        mask=True,
        features_only=False,
        layer=None,
        mask_indices=None,
        mask_channel_indices=None,
        padding_count=None,
    ):
        features = source

        if self.feature_grad_mult > 0:
            features = self.feature_extractor(features)
            if self.feature_grad_mult != 1.0:
                features = GradMultiply.apply(features, self.feature_grad_mult)
        else:
            with torch.no_grad():
                features = self.feature_extractor(features)

        features = features.transpose(1, 2)

        features = self.layer_norm(features)

        orig_padding_mask = padding_mask

        if padding_mask is not None and padding_mask.any():
            input_lengths = (1 - padding_mask.long()).sum(-1)
            # apply conv formula to get real output_lengths
            output_lengths = self._get_feat_extract_output_lengths(input_lengths)

            padding_mask = torch.zeros(
                features.shape[:2], dtype=features.dtype, device=features.device
            )

            # these two operations make sure that all values
            # before the output lengths indices are attended to
            padding_mask[
                (
                    torch.arange(padding_mask.shape[0], device=padding_mask.device),
                    output_lengths - 1,
                )
            ] = 1
            padding_mask = (1 - padding_mask.flip([-1]).cumsum(-1).flip([-1])).bool()
        else:
            padding_mask = None

        if self.post_extract_proj is not None:
            features = self.post_extract_proj(features)

        pre_encoder_features = None
        if self.cfg.ema_transformer_only:
            pre_encoder_features = features.clone()

        features = self.dropout_input(features)

        if mask:
            x, mask_indices = self.apply_mask(
                features,
                padding_mask,
                mask_indices=mask_indices,
                mask_channel_indices=mask_channel_indices,
            )
        else:
            x = features
            mask_indices = None

        x, layer_results = self.encoder(
            x,
            padding_mask=padding_mask,
            layer=layer,
        )

        if features_only:
            return {
                "x": x,
                "padding_mask": padding_mask,
                "layer_results": layer_results,
            }

        result = {
            "losses": {},
        }

        with torch.no_grad():
            self.ema.model.eval()

            if self.cfg.ema_transformer_only:
                y, layer_results = self.ema.model.extract_features(
                    pre_encoder_features,
                    padding_mask=padding_mask,
                    min_layer=self.cfg.encoder_layers - self.average_top_k_layers,
                )
                y = {
                    "x": y,
                    "padding_mask": padding_mask,
                    "layer_results": layer_results,
                }
            else:
                y = self.ema.model.extract_features(
                    source=source,
                    padding_mask=orig_padding_mask,
                    mask=False,
                )

            target_layer_results = [l[2] for l in y["layer_results"]]

            permuted = False
            if self.cfg.instance_norm_target_layer or self.cfg.batch_norm_target_layer:
                target_layer_results = [
                    tl.permute(1, 2, 0) for tl in target_layer_results  # TBC -> BCT
                ]
                permuted = True

            if self.cfg.batch_norm_target_layer:
                target_layer_results = [
                    F.batch_norm(
                        tl.float(), running_mean=None, running_var=None, training=True
                    )
                    for tl in target_layer_results
                ]

            if self.cfg.instance_norm_target_layer:
                target_layer_results = [
                    F.instance_norm(tl.float()) for tl in target_layer_results
                ]

            if permuted:
                target_layer_results = [
                    tl.transpose(1, 2) for tl in target_layer_results  # BCT -> BTC
                ]

            if self.cfg.group_norm_target_layer:
                target_layer_results = [
                    F.layer_norm(tl.float(), tl.shape[-2:])
                    for tl in target_layer_results
                ]

            if self.cfg.layer_norm_target_layer:
                target_layer_results = [
                    F.layer_norm(tl.float(), tl.shape[-1:])
                    for tl in target_layer_results
                ]

            y = sum(target_layer_results) / len(target_layer_results)

            if self.cfg.layer_norm_targets:
                y = F.layer_norm(y.float(), y.shape[-1:])

            if self.cfg.instance_norm_targets:
                y = F.instance_norm(y.float().transpose(1, 2)).transpose(1, 2)

            if not permuted:
                y = y.transpose(0, 1)

            y = y[mask_indices]

        x = x[mask_indices]
        x = self.final_proj(x)

        sz = x.size(-1)

        if self.loss_beta == 0:
            loss = F.mse_loss(x.float(), y.float(), reduction="none").sum(dim=-1)
        else:
            loss = F.smooth_l1_loss(
                x.float(), y.float(), reduction="none", beta=self.loss_beta
            ).sum(dim=-1)

        if self.loss_scale is not None:
            scale = self.loss_scale
        else:
            scale = 1 / math.sqrt(sz)

        result["losses"]["regression"] = loss.sum() * scale

        if "sample_size" not in result:
            result["sample_size"] = loss.numel()

        with torch.no_grad():
            result["target_var"] = self.compute_var(y)
            result["pred_var"] = self.compute_var(x.float())

        if self.num_updates > 5000 and result["target_var"] < self.cfg.min_target_var:
            logger.error(
                f"target var is {result['target_var'].item()} < {self.cfg.min_target_var}, exiting"
            )
            raise Exception(
                f"target var is {result['target_var'].item()} < {self.cfg.min_target_var}, exiting"
            )
        if self.num_updates > 5000 and result["pred_var"] < self.cfg.min_pred_var:
            logger.error(
                f"pred var is {result['pred_var'].item()} < {self.cfg.min_pred_var}, exiting"
            )
            raise Exception(
                f"pred var is {result['pred_var'].item()} < {self.cfg.min_pred_var}, exiting"
            )

        if self.ema is not None:
            result["ema_decay"] = self.ema.get_decay() * 1000

        return result
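Examples #13, #16, #17 and #19 rebuild the frame-level padding mask from the convolutional output lengths: a 1 is written at index output_length - 1 for every sample, and a reversed cumulative sum then marks everything up to that index as attended. The trick in isolation, with illustrative lengths:

import torch

B, T = 2, 6
output_lengths = torch.tensor([6, 4])          # frames per sample (illustrative)
padding_mask = torch.zeros(B, T)
padding_mask[(torch.arange(B), output_lengths - 1)] = 1
padding_mask = (1 - padding_mask.flip([-1]).cumsum(-1).flip([-1])).bool()
print(padding_mask)
# tensor([[False, False, False, False, False, False],
#         [False, False, False, False,  True,  True]])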
Example #14
    def forward(self, src_tokens, src_lengths):
        torch.set_printoptions(threshold=8000)

        x1 = self.embed_tokens(src_tokens)
        x2 = self.embed_positions(src_tokens)

        if src_tokens.lt(0).sum() > 0:
            print("negative voc idx (voc size {})".format(len(self.dictionary)))
            print(src_tokens)
            exit()

        x = x1 + x2

        x = F.dropout(x, p=self.dropout, training=self.training)
        input_embedding = x

        # project to size of convolution
        x = self.fc1(x)

        # used to mask padding in input
        encoder_padding_mask = src_tokens.eq(self.padding_idx).t()  # -> T x B; True at pad positions, False otherwise
        if not encoder_padding_mask.any():
            encoder_padding_mask = None

        # B x T x C -> T x B x C
        x = x.transpose(0, 1)

        residuals = [x]
        # temporal convolutions
        for proj, conv, res_layer in zip(self.projections, self.convolutions, self.residuals):
            if res_layer > 0:
                residual = residuals[-res_layer]
                residual = residual if proj is None else proj(residual)
            else:
                residual = None

            if encoder_padding_mask is not None:
                x = x.masked_fill(encoder_padding_mask.unsqueeze(-1), 0)

            x = F.dropout(x, p=self.dropout, training=self.training)
            if conv.kernel_size[0] % 2 == 1:
                # padding is implicit in the conv
                x = conv(x)
            else:
                padding_l = (conv.kernel_size[0] - 1) // 2
                padding_r = conv.kernel_size[0] // 2
                x = F.pad(x, (0, 0, 0, 0, padding_l, padding_r))
                x = conv(x)
            x = F.glu(x, dim=2)

            if residual is not None:
                x = (x + residual) * math.sqrt(self.normalization_constant)
            residuals.append(x)

        # T x B x C -> B x T x C
        x = x.transpose(1, 0)

        # project back to size of embedding
        x = self.fc2(x)

        if encoder_padding_mask is not None:
            encoder_padding_mask = encoder_padding_mask.t()  # -> B x T
            x = x.masked_fill(encoder_padding_mask.unsqueeze(-1), 0)

        # scale gradients (this only affects backward, not forward)
        if self.num_attention_layers > 0:
            x = GradMultiply.apply(x, 1.0 / (2.0 * self.num_attention_layers))

        # add output to input embedding for attention
        y = (x + input_embedding) * math.sqrt(self.normalization_constant)

        return {
            'encoder_out': (x, y),
            'encoder_padding_mask': encoder_padding_mask,  # B x T
        }
Example #15
    def target_predict(self,
                       source,
                       padding_mask=None,
                       mask=True,
                       mask_indices=None):

        with torch.no_grad():
            if self.feature_grad_mult > 0:
                features = self.feature_extractor_target(source)
                if self.feature_grad_mult != 1.0:
                    features = GradMultiply.apply(features,
                                                  self.feature_grad_mult)
            else:
                with torch.no_grad():
                    features = self.feature_extractor_target(source)

            features_pen = features.float().pow(2).mean()

            features = features.transpose(1, 2)
            features = self.layer_norm(features)
            unmasked_features = features.clone()

            if padding_mask is not None:
                extra = padding_mask.size(1) % features.size(1)
                if extra > 0:
                    padding_mask = padding_mask[:, :-extra]
                padding_mask = padding_mask.view(padding_mask.size(0),
                                                 features.size(1), -1)
                padding_mask = padding_mask.all(-1)

            if self.post_extract_proj_target is not None:
                features = self.post_extract_proj_target(features)
            #print("target feature", features.size())
            features = self.dropout_input(features)
            unmasked_features = self.dropout_features(unmasked_features)

            num_vars = None
            code_ppl = None
            prob_ppl = None
            curr_temp = None

            if self.input_quantizer:
                if self.shared_quantizer:
                    q = self.input_quantizer(features, produce_targets=False)
                else:
                    q = self.input_quantizer(features, produce_targets=False)
                features = q["x"]
                num_vars = q["num_vars"]
                code_ppl = q["code_perplexity"]
                prob_ppl = q["prob_perplexity"]
                curr_temp = q["temp"]
                if self.shared_quantizer:
                    features = self.project_inp(features)
                else:
                    features = self.project_inp_target(features)

            if mask:
                x, mask_indices = self.apply_mask(features, padding_mask,
                                                  mask_indices)
                if mask_indices is not None:
                    y = unmasked_features[mask_indices].view(
                        unmasked_features.size(0), -1,
                        unmasked_features.size(-1))
                else:
                    y = unmasked_features
            else:
                x = features
                y = unmasked_features
                mask_indices = None

            x = self.encoder_target(x, padding_mask=padding_mask)
            #print("target encoded", x.size())

            if self.quantizer:
                if self.shared_quantizer:
                    q = self.quantizer(y, produce_targets=False)
                else:
                    q = self.quantizer_target(y, produce_targets=False)
                y = q["x"]
                num_vars = q["num_vars"]
                code_ppl = q["code_perplexity"]
                prob_ppl = q["prob_perplexity"]
                curr_temp = q["temp"]

                if self.shared_quantizer:
                    y = self.project_q(y)
                else:
                    y = self.project_q_target(y)

            else:
                y = self.project_q(y)
            #print("target before masking", x.size())
            x = x[mask_indices].view(x.size(0), -1, x.size(-1))
            #print("target after masking", x.size())
            if self.target_glu:
                # no negatives are sampled in this target-prediction path,
                # so only the positives go through the target GLU
                y = self.target_glu_target(y)

            x = self.final_proj_target(x)
            #x = self.compute_preds(x, y, negs)
            #print("target before prediction", x.size())
            result = {
                "x": x,
                "padding_mask": padding_mask,
                "features_pen": features_pen
            }

            if prob_ppl is not None:
                result["prob_perplexity"] = prob_ppl
                result["code_perplexity"] = code_ppl
                result["num_vars"] = num_vars
                result["temp"] = curr_temp

            return result
Example #16
    def forward(
        self,
        source,
        padding_mask=None,
        mask=True,
        features_only=False,
        layer=None,
        fix_n=0,
        mask_indices=None,
        mask_channel_indices=None,
        padding_count=None,
    ):

        if self.feature_grad_mult > 0:
            features = self.feature_extractor(source)
            if self.feature_grad_mult != 1.0:
                features = GradMultiply.apply(features, self.feature_grad_mult)
        else:
            with torch.no_grad():
                features = self.feature_extractor(source)

        features_pen = features.float().pow(2).mean()

        features = features.transpose(1, 2)
        features = self.layer_norm(features)
        unmasked_features = features.clone()

        if padding_mask is not None and padding_mask.any():
            input_lengths = (1 - padding_mask.long()).sum(-1)
            # apply conv formula to get real output_lengths
            output_lengths = self._get_feat_extract_output_lengths(
                input_lengths)

            padding_mask = torch.zeros(features.shape[:2],
                                       dtype=features.dtype,
                                       device=features.device)

            # these two operations make sure that all values
            # before the output lengths indices are attended to
            padding_mask[(
                torch.arange(padding_mask.shape[0],
                             device=padding_mask.device),
                output_lengths - 1,
            )] = 1
            padding_mask = (
                1 - padding_mask.flip([-1]).cumsum(-1).flip([-1])).bool()
        else:
            padding_mask = None

        if self.post_extract_proj is not None:
            features = self.post_extract_proj(features)

        features = self.dropout_input(features)
        unmasked_features = self.dropout_features(unmasked_features)

        num_vars = None
        code_ppl = None
        prob_ppl = None
        curr_temp = None

        if self.input_quantizer:
            q = self.input_quantizer(features, produce_targets=False)
            features = q["x"]
            num_vars = q["num_vars"]
            code_ppl = q["code_perplexity"]
            prob_ppl = q["prob_perplexity"]
            curr_temp = q["temp"]
            features = self.project_inp(features)

        if mask:
            x, mask_indices = self.apply_mask(
                features,
                padding_mask,
                mask_indices=mask_indices,
                mask_channel_indices=mask_channel_indices,
            )
            if not is_xla_tensor(x) and mask_indices is not None:
                # tpu-comment: reducing the size in a dynamic way causes
                # too many recompilations on xla.
                y = unmasked_features[mask_indices].view(
                    unmasked_features.size(0), -1, unmasked_features.size(-1))
            else:
                y = unmasked_features
        else:
            x = features
            y = unmasked_features
            mask_indices = None

        x, layer_results = self.encoder(x,
                                        padding_mask=padding_mask,
                                        layer=layer,
                                        fix_n=fix_n)

        if features_only:
            return {
                "x": x,
                "padding_mask": padding_mask,
                "features": unmasked_features,
                "layer_results": layer_results,
            }
        x = self.final_proj(x)
        result = {
            "features": x,
            # "feautres_padding_mask": padding_mask,
            # "layer_results": layer_results,
        }

        if self.quantizer:
            q = self.quantizer(y, produce_targets=False)
            y = q["x"]
            num_vars = q["num_vars"]
            code_ppl = q["code_perplexity"]
            prob_ppl = q["prob_perplexity"]
            curr_temp = q["temp"]

            y = self.project_q(y)

            q_unmasked = self.quantizer(unmasked_features,
                                        produce_targets=False)
            quant_features = q_unmasked["x"]
            # quant_features = self.q2h_proj(q_unmasked["x"])
            if self.negatives_from_everywhere:
                # neg_cands = self.quantizer(unmasked_features, produce_targets=False)["x"]
                neg_cands = q_unmasked["x"]
                negs, _ = self.sample_negatives(
                    neg_cands,
                    y.size(1),
                    padding_count=padding_count,
                )
                negs = self.project_q(negs)

            else:
                negs, _ = self.sample_negatives(
                    y,
                    y.size(1),
                    padding_count=padding_count,
                )

            if self.codebook_negatives > 0:
                cb_negs = self.quantizer.sample_from_codebook(
                    y.size(0) * y.size(1), self.codebook_negatives)
                cb_negs = cb_negs.view(self.codebook_negatives, y.size(0),
                                       y.size(1), -1)  # order doesn't matter
                cb_negs = self.project_q(cb_negs)
                negs = torch.cat([negs, cb_negs], dim=0)
        else:
            y = self.project_q(y)

            if self.negatives_from_everywhere:
                negs, _ = self.sample_negatives(
                    unmasked_features,
                    y.size(1),
                    padding_count=padding_count,
                )
                negs = self.project_q(negs)
            else:
                negs, _ = self.sample_negatives(
                    y,
                    y.size(1),
                    padding_count=padding_count,
                )
        if not is_xla_tensor(x):
            # tpu-comment: reducing the size in a dynamic way causes
            # too many recompilations on xla.
            x = x[mask_indices].view(x.size(0), -1, x.size(-1))

        if self.target_glu:
            y = self.target_glu(y)
            negs = self.target_glu(negs)

        # x = self.final_proj(x)
        x = self.compute_preds(x, y, negs)
        result["x"] = x
        result["padding_mask"] = padding_mask
        result["features_pen"] = features_pen
        result["quant_features"] = quant_features

        # result = {
        #     "x": x,
        #     "padding_mask": padding_mask,
        #     "features_pen": features_pen,
        #     "quant_features":quant_features,
        # }

        if prob_ppl is not None:
            result["prob_perplexity"] = prob_ppl
            result["code_perplexity"] = code_ppl
            result["num_vars"] = num_vars
            result["temp"] = curr_temp

        return result
Example #17
    def forward(self,
                source,
                padding_mask=None,
                mask=True,
                features_only=False):

        if self.feature_grad_mult > 0:
            features = self.feature_extractor(source)
            if self.feature_grad_mult != 1.0:
                features = GradMultiply.apply(features, self.feature_grad_mult)
        else:
            with torch.no_grad():
                features = self.feature_extractor(source)

        features_pen = features.float().pow(2).mean()

        features = features.transpose(1, 2)
        features = self.layer_norm(features)
        unmasked_features = features.clone()

        if padding_mask is not None:
            input_lengths = (1 - padding_mask.long()).sum(-1)
            # apply conv formula to get real output_lengths
            output_lengths = self._get_feat_extract_output_lengths(
                input_lengths)

            padding_mask = torch.zeros(features.shape[:2],
                                       dtype=features.dtype,
                                       device=features.device)

            # these two operations make sure that all values
            # before the output lengths indices are attended to
            padding_mask[(torch.arange(padding_mask.shape[0],
                                       device=padding_mask.device),
                          output_lengths - 1)] = 1
            padding_mask = (
                1 - padding_mask.flip([-1]).cumsum(-1).flip([-1])).bool()

        if self.post_extract_proj is not None:
            features = self.post_extract_proj(features)

        features = self.dropout_input(features)
        unmasked_features = self.dropout_features(unmasked_features)

        num_vars = None
        code_ppl = None
        prob_ppl = None
        curr_temp = None

        if self.input_quantizer:
            q = self.input_quantizer(features, produce_targets=False)
            features = q["x"]
            num_vars = q["num_vars"]
            code_ppl = q["code_perplexity"]
            prob_ppl = q["prob_perplexity"]
            curr_temp = q["temp"]
            features = self.project_inp(features)

        if mask:
            x, mask_indices = self.apply_mask(features, padding_mask)
            if mask_indices is not None:
                y = unmasked_features[mask_indices].view(
                    unmasked_features.size(0), -1, unmasked_features.size(-1))
            else:
                y = unmasked_features
        else:
            x = features
            y = unmasked_features
            mask_indices = None

        x = self.encoder(x, padding_mask=padding_mask)

        if features_only:
            return {"x": x, "padding_mask": padding_mask}

        if self.quantizer:
            q = self.quantizer(y, produce_targets=False)
            y = q["x"]
            num_vars = q["num_vars"]
            code_ppl = q["code_perplexity"]
            prob_ppl = q["prob_perplexity"]
            curr_temp = q["temp"]

            y = self.project_q(y)

            if self.negatives_from_everywhere:
                neg_cands, *_ = self.quantizer(unmasked_features,
                                               produce_targets=False)
                negs, _ = self.sample_negatives(neg_cands, y.size(1))
                negs = self.project_q(negs)

            else:
                negs, _ = self.sample_negatives(y, y.size(1))

            if self.codebook_negatives > 0:
                cb_negs = self.quantizer.sample_from_codebook(
                    y.size(0) * y.size(1), self.codebook_negatives)
                cb_negs = cb_negs.view(self.codebook_negatives, y.size(0),
                                       y.size(1), -1)  # order doesn't matter
                cb_negs = self.project_q(cb_negs)
                negs = torch.cat([negs, cb_negs], dim=0)
        else:
            y = self.project_q(y)

            if self.negatives_from_everywhere:
                negs, _ = self.sample_negatives(unmasked_features, y.size(1))
                negs = self.project_q(negs)
            else:
                negs, _ = self.sample_negatives(y, y.size(1))

        x = x[mask_indices].view(x.size(0), -1, x.size(-1))

        if self.target_glu:
            y = self.target_glu(y)
            negs = self.target_glu(negs)

        x = self.final_proj(x)
        x = self.compute_preds(x, y, negs)

        result = {
            "x": x,
            "padding_mask": padding_mask,
            "features_pen": features_pen
        }

        if prob_ppl is not None:
            result["prob_perplexity"] = prob_ppl
            result["code_perplexity"] = code_ppl
            result["num_vars"] = num_vars
            result["temp"] = curr_temp

        return result
Example #18
    def forward(self, src_tokens, src_lengths, src_doctopic, src_wordtopics):
        # embed tokens and positions
        # print(self.embed_tokens(src_tokens), self.embed_positions(src_tokens), src_doctopic, src_wordtopics)

        # ''' 1)
        # src_doctopic: batchsize x 512
        # src_wordtopics: batchsize x wordcount x 512
        src_doctopic_ext = src_doctopic.unsqueeze(1) # batchsize x 1 x 512
        # print(src_doctopic_ext)
        src_wordtopics_doctopic = src_wordtopics * src_doctopic_ext # batchsize x wordcount x 512
        # print(src_wordtopics_doctopic)
        # ''' 

        ''' 2)
        # src_doctopic: batchsize x 512
        # src_wordtopics: batchsize x wordcount x 512
        src_doctopic_ext = src_doctopic.unsqueeze(1) # batchsize x 1 x 512
        # print(src_doctopic_ext)
        src_wordtopics_doctopic = src_wordtopics * src_doctopic_ext # batchsize x wordcount x 512
        # print(src_wordtopics_doctopic)
	# Normalize src_wordtopics_doctopic (April 29th)
        src_wordtopics_doctopic = F.normalize(src_wordtopics_doctopic, p=2, dim=2)
        '''
        
        
        x = self.embed_tokens(src_tokens) + self.embed_positions(src_tokens) # batchsize x wordcount x 512
        #word_embedding = torch.FloatTensor(vector_dict.get_embedding(src_tokens.cpu().numpy(), self.embed_dim)).to('cuda')
        #x = word_embedding + self.embed_positions(src_tokens) # batchsize x wordcount x 512
        # print(x)

        # Concat wordtopics*doctopic to (wordembedding+posembedding)
        x = torch.cat((x, src_wordtopics_doctopic), 2)
        # print(x)
        
        x = F.dropout(x, p=self.dropout, training=self.training)
        input_embedding = x

        # project to size of convolution
        x = self.fc1(x)

        # B x T x C -> T x B x C
        x = x.transpose(0, 1)

        # temporal convolutions
        for proj, conv in zip(self.projections, self.convolutions):
            residual = x if proj is None else proj(x)
            x = F.dropout(x, p=self.dropout, training=self.training)
            padding_l = (conv.kernel_size[0] - 1) // 2
            padding_r = conv.kernel_size[0] // 2
            x = F.pad(x, (0, 0, 0, 0, padding_l, padding_r))
            x = conv(x)


            # NGTU BEGIN
            part_1, part_2 = x[:, :, :self.embed_dim], x[:, :, self.embed_dim:]
            part_1 = torch.tanh(part_1)
            x = torch.cat([part_1, part_2], dim=2)
            x = F.glu(x, dim=2)
            x = self.lay_norm(x + residual)
            # NGTU END

            '''
            # original GLU BEGIN 
            x = F.glu(x, dim=2)
            x = (x + residual) * math.sqrt(0.5)
            # GLU END
            '''



        # T x B x C -> B x T x C
        x = x.transpose(1, 0)

        # project back to size of embedding
        x = self.fc2(x)

        # scale gradients (this only affects backward, not forward)
        x = GradMultiply.apply(x, 1.0 / (2.0 * self.num_attention_layers))

        # add output to input embedding for attention
        y = (x + input_embedding) * math.sqrt(0.5)

        # print(x,y)
        return x, y
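The NGTU block in Example #18 applies tanh to the first half of the channels before the GLU, which turns the standard split a * sigmoid(b) into a gated tanh unit tanh(a) * sigmoid(b). A minimal check of that equivalence:

import torch
import torch.nn.functional as F

x = torch.randn(4, 3, 10)                      # last dim holds 2 * C channels
a, b = x[..., :5], x[..., 5:]
ngtu = F.glu(torch.cat([torch.tanh(a), b], dim=-1), dim=-1)
assert torch.allclose(ngtu, torch.tanh(a) * torch.sigmoid(b))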
Example #19
    def forward(
        self,
        src_tokens,
        src_lengths,
        return_all_hiddens=False,
        padding_mask=None,
        features_only=True,
    ):
        mask = self.training or self.alway_mask
        if self.feature_grad_mult > 0 and self.training:
            features = self.subsample(src_tokens)
            if self.feature_grad_mult != 1.0:
                features = GradMultiply.apply(features, self.feature_grad_mult)
        else:
            with torch.no_grad():
                features = self.subsample(src_tokens)
        features = features.transpose(1, 2)
        features = self.feat_layer_norm(features)
        if self.feat_proj is not None:
            features = self.feat_proj(features)

        if padding_mask is not None:
            input_lengths = (1 - padding_mask.long()).sum(-1)
            # apply conv formula to get real output_lengths
            output_lengths = self._get_feat_extract_output_lengths(
                input_lengths)

            padding_mask = torch.zeros(features.shape[:2],
                                       dtype=features.dtype,
                                       device=features.device)

            # these two operations make sure that all values
            # before the output lengths indices are attended to
            padding_mask[(
                torch.arange(padding_mask.shape[0],
                             device=padding_mask.device),
                output_lengths - 1,
            )] = 1
            padding_mask = (
                1 - padding_mask.flip([-1]).cumsum(-1).flip([-1])).bool()

        features = self.feat_scale * features if self.feat_scale != 1.0 else features
        unmasked_features = features.clone()

        features = self.dropout_input(features)
        unmasked_features = self.dropout_features(unmasked_features)
        if mask:
            x, mask_indices = self.apply_mask(features, padding_mask)
        else:
            x = features
            mask_indices = None

        def cal_transformer_layers(x,
                                   encoder_padding_mask,
                                   return_all_hiddens=False):
            # x: B x T x C
            positions = self.embed_positions(x.transpose(1, 2)).transpose(1, 2)
            x = x + positions
            if not self.normalize_before:
                x = self.layer_norm(x)

            # B x T x C -> T x B x C
            x = x.transpose(0, 1)
            encoder_states = []
            for layer in self.layers:
                x = layer(x, encoder_padding_mask)
                if return_all_hiddens:
                    encoder_states.append(x)
            if self.normalize_before:
                x = self.layer_norm(x)
            return x, encoder_states

        x, encoder_states = cal_transformer_layers(x, padding_mask,
                                                   return_all_hiddens)
        if features_only:
            return {
                "encoder_out": [x],  # [T x B x C]
                "encoder_padding_mask":
                [padding_mask] if padding_mask is not None else [],  # B x T
                "encoder_embedding": [],  #
                "encoder_states":
                encoder_states,  # List[T x B x C]
                "src_tokens": [],
                "src_lengths": [],
                "mask_indices": [mask_indices],
            }

        x_unmasked = x
        if self.mask_prob > 0 or self.mask_channel_prob > 0:
            x_unmasked, _ = cal_transformer_layers(unmasked_features,
                                                   padding_mask)
        return {
            "encoder_out": [x],  # [T x B x C]
            "encoder_unmasked_out": [x_unmasked],  # [T x B x C]
            "encoder_padding_mask":
            [padding_mask] if padding_mask is not None else [],  # B x T
            "encoder_embedding": [],  #
            "encoder_states":
            encoder_states,  # List[T x B x C]
            "src_tokens": [],
            "src_lengths": [],
            "mask_indices":
            [mask_indices] if mask_indices is not None else [],  # B X T
        }
Example #20
    def forward(self,
                source,
                padding_mask=None,
                mask=True,
                features_only=False):

        if self.feature_grad_mult > 0:
            features = self.feature_extractor(source)
            if self.feature_grad_mult != 1.0:
                features = GradMultiply.apply(features, self.feature_grad_mult)
        else:
            with torch.no_grad():
                features = self.feature_extractor(source)

        features_pen = features.float().pow(2).mean()

        features = features.transpose(1, 2)
        features = self.layer_norm(features)
        unmasked_features = features.clone()

        if padding_mask is not None:
            extra = padding_mask.size(1) % features.size(1)
            if extra > 0:
                padding_mask = padding_mask[:, :-extra]
            padding_mask = padding_mask.view(padding_mask.size(0),
                                             features.size(1), -1)
            padding_mask = padding_mask.all(-1)

        if self.post_extract_proj is not None:
            features = self.post_extract_proj(features)

        features = self.dropout_input(features)
        unmasked_features = self.dropout_features(unmasked_features)

        num_vars = None
        code_ppl = None
        prob_ppl = None
        curr_temp = None

        if mask:
            x, mask_indices = self.apply_mask(features, padding_mask)
            if mask_indices is not None:
                y = unmasked_features[mask_indices].view(
                    unmasked_features.size(0), -1, unmasked_features.size(-1))
            else:
                y = unmasked_features
        else:
            x = features
            y = unmasked_features
            mask_indices = None

        x = self.encoder(x, padding_mask=padding_mask)

        if features_only:
            return {"x": x, "padding_mask": padding_mask}

        y = self.project_q(y)

        if self.negatives_from_everywhere:
            negs, _ = self.sample_negatives(unmasked_features, y.size(1))
            negs = self.project_q(negs)
        else:
            negs, _ = self.sample_negatives(y, y.size(1))

        logits_ali = self.phone_proj(x[mask_indices])  # N, V
        x = x[mask_indices].view(x.size(0), -1, x.size(-1))

        if self.target_glu:
            y = self.target_glu(y)
            negs = self.target_glu(negs)

        x = self.final_proj(x)
        x = self.compute_preds(x, y, negs)

        result = {
            "x": x,
            "padding_mask": padding_mask,
            "mask_indices": mask_indices,
            "features_pen": features_pen,
            "logits_ali": logits_ali
        }

        return result
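A detail worth spelling out in the example above: `x[mask_indices].view(x.size(0), -1, x.size(-1))` (and the same pattern on `unmasked_features`) relies on the mask covering the same number of frames in every sequence of the batch, otherwise the view cannot restore a rectangular layout. A small sketch with hypothetical shapes:

import torch

B, T, C = 2, 5, 3                      # hypothetical batch, time, channel sizes
features = torch.randn(B, T, C)

# Boolean mask with the same number of masked frames (2) in every sequence.
mask_indices = torch.tensor([[True, False, True, False, False],
                             [False, True, False, True, False]])

# Boolean indexing flattens the selection to (total_masked, C) = (4, 3);
# the view restores a (B, masked_per_seq, C) layout.
masked = features[mask_indices].view(B, -1, C)
print(masked.shape)  # torch.Size([2, 2, 3])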
Example #21
    def forward(self, src_tokens, src_lengths, src_doctopic, src_wordtopics):

        ''' Shapes
        src_doctopic:   batchsize x 512
        src_wordtopics: batchsize x wordcount x 512
        '''

        # Variant 1: enc(t', tD) dec(tD) -- scale the word topics by the document topic
        src_doctopic_ext = src_doctopic.unsqueeze(1)  # batchsize x 1 x 512
        src_wordtopics_doctopic = src_wordtopics * src_doctopic_ext  # batchsize x wordcount x 512
        # Variant 2, enc(t') dec(tD), would use src_wordtopics unscaled instead

        # embed tokens and positions
        x = self.embed_tokens(src_tokens) + self.embed_positions(
            src_tokens)  # batchsize x wordcount x 512

        # Concat wordtopics*doctopic to (wordembedding+posembedding)
        x = torch.cat((x, src_wordtopics_doctopic), 2)

        x = F.dropout(x, p=self.dropout, training=self.training)
        input_embedding = x

        # project to size of convolution
        x = self.fc1(x)

        # B x T x C -> T x B x C
        x = x.transpose(0, 1)

        # temporal convolutions
        for proj, conv in zip(self.projections, self.convolutions):
            residual = x if proj is None else proj(x)
            x = F.dropout(x, p=self.dropout, training=self.training)
            padding_l = (conv.kernel_size[0] - 1) // 2
            padding_r = conv.kernel_size[0] // 2
            x = F.pad(x, (0, 0, 0, 0, padding_l, padding_r))
            x = conv(x)
            x = F.glu(x, dim=2)  # GLU
            x = (x + residual) * math.sqrt(0.5)

        # T x B x C -> B x T x C
        x = x.transpose(1, 0)

        # project back to size of embedding
        x = self.fc2(x)

        # scale gradients (this only affects backward, not forward)
        x = GradMultiply.apply(x, 1.0 / (2.0 * self.num_attention_layers))

        # add output to input embedding for attention
        y = (x + input_embedding) * math.sqrt(0.5)

        return x, y
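The asymmetric padding used with even-sized kernels above keeps the output length equal to the input length. `F.pad` interprets its padding tuple from the last dimension backwards, so `(0, 0, 0, 0, padding_l, padding_r)` pads only the leading time dimension of a `T x B x C` tensor. A short sketch with toy shapes:

import torch
import torch.nn.functional as F

T, B, C = 7, 2, 4                   # hypothetical time, batch, channel sizes
x = torch.randn(T, B, C)

kernel_size = 4                     # even kernel: the padding must be split asymmetrically
padding_l = (kernel_size - 1) // 2  # 1
padding_r = kernel_size // 2        # 2

# Pads (C_left, C_right, B_left, B_right, T_left, T_right): only T grows.
x = F.pad(x, (0, 0, 0, 0, padding_l, padding_r))
print(x.shape)  # torch.Size([10, 2, 4])

# A convolution over time with this kernel then yields T outputs again: 10 - 4 + 1 == 7.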
Example #22
    def forward(self, src_tokens, src_lengths):
        # embed tokens and positions
        x = self.embed_tokens(src_tokens)
        x = F.dropout2d(x, p=self.token_dropout, training=self.training)
        x += self.embed_positions(src_tokens)
        x = F.dropout(x, p=self.dropout, training=self.training)
        input_embedding = x

        # project to size of convolution
        x = self.fc1(x)

        # used to mask padding in input
        encoder_padding_mask = src_tokens.eq(self.padding_idx).t()  # -> T x B
        if not encoder_padding_mask.any():
            encoder_padding_mask = None

        # B x T x C -> T x B x C
        x = x.transpose(0, 1)

        residuals = [x]
        # temporal convolutions
        for proj, conv, res_layer in zip(self.projections, self.convolutions, self.residuals):
            if res_layer > 0:
                residual = residuals[-res_layer]
                residual = residual if proj is None else proj(residual)
            else:
                residual = None

            if encoder_padding_mask is not None:
                x = x.masked_fill(encoder_padding_mask.unsqueeze(-1), 0)

            x = F.dropout(x, p=self.dropout, training=self.training)
            if conv.kernel_size[0] % 2 == 1:
                # padding is implicit in the conv
                x = conv(x)
            else:
                padding_l = (conv.kernel_size[0] - 1) // 2
                padding_r = conv.kernel_size[0] // 2
                x = F.pad(x, (0, 0, 0, 0, padding_l, padding_r))
                x = conv(x)
            x = F.glu(x, dim=2)

            if residual is not None:
                x = (x + residual) * math.sqrt(self.normalization_constant)
            residuals.append(x)

        # T x B x C -> B x T x C
        x = x.transpose(1, 0)

        # project back to size of embedding
        x = self.fc2(x)

        if encoder_padding_mask is not None:
            encoder_padding_mask = encoder_padding_mask.t()  # -> B x T
            x = x.masked_fill(encoder_padding_mask.unsqueeze(-1), 0)

        # scale gradients (this only affects backward, not forward)
        x = GradMultiply.apply(x, 1.0 / (2.0 * self.num_attention_layers))

        # add output to input embedding for attention
        y = (x + input_embedding) * math.sqrt(self.normalization_constant)

        return {
            'encoder_out': (x, y),
            'encoder_padding_mask': encoder_padding_mask,  # B x T
        }
Example #23
    def mult_rst_grad(self, rst, ratio):
        assert isinstance(rst, dict)  # instead of EncoderOut
        assert len(rst["encoder_out"]) == 1
        rst["encoder_out"][0] = GradMultiply.apply(rst["encoder_out"][0],
                                                   ratio)
        return rst
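All of these examples call `GradMultiply.apply(x, scale)`, which leaves the forward value untouched and scales only the gradient flowing back through `x`. A minimal sketch of such a function, written here from its usage (fairseq's actual implementation may differ in details):

import torch


class GradMultiply(torch.autograd.Function):
    """Identity in the forward pass; multiplies the incoming gradient by `scale`."""

    @staticmethod
    def forward(ctx, x, scale):
        ctx.scale = scale
        return x.clone()

    @staticmethod
    def backward(ctx, grad):
        # No gradient with respect to the scale argument.
        return grad * ctx.scale, None


# The forward value is unchanged, but the gradient is scaled by 0.5.
x = torch.ones(3, requires_grad=True)
y = GradMultiply.apply(x, 0.5)
y.sum().backward()
print(x.grad)  # tensor([0.5000, 0.5000, 0.5000])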
Example #24
    def forward(self,
                source,
                padding_mask=None,
                mask=True,
                features_only=False):

        if self.feature_grad_mult > 0:
            features = self.feature_extractor(source)
            if self.feature_grad_mult != 1.0:
                features = GradMultiply.apply(features, self.feature_grad_mult)
        else:
            with torch.no_grad():
                features = self.feature_extractor(source)

        features_pen = features.float().pow(2).mean()

        features = features.transpose(1, 2)
        features = self.layer_norm(features)
        unmasked_features = features.clone()

        if padding_mask is not None:
            extra = padding_mask.size(1) % features.size(1)
            if extra > 0:
                padding_mask = padding_mask[:, :-extra]
            padding_mask = padding_mask.view(padding_mask.size(0),
                                             features.size(1), -1)
            padding_mask = padding_mask.all(-1)

        if self.post_extract_proj is not None:
            features = self.post_extract_proj(features)

        features = self.dropout_input(features)
        unmasked_features = self.dropout_features(unmasked_features)

        num_vars = None
        code_ppl = None
        prob_ppl = None
        curr_temp = None

        if self.input_quantizer:
            q = self.input_quantizer(features, produce_targets=False)
            features = q["x"]
            num_vars = q["num_vars"]
            code_ppl = q["code_perplexity"]
            prob_ppl = q["prob_perplexity"]
            curr_temp = q["temp"]
            features = self.project_inp(features)

        if mask:
            x, mask_indices = self.apply_mask(features, padding_mask)
            if mask_indices is not None:
                y = unmasked_features[mask_indices].view(
                    unmasked_features.size(0), -1, unmasked_features.size(-1))
            else:
                y = unmasked_features
        else:
            x = features
            y = unmasked_features
            mask_indices = None

        x = self.encoder(x, padding_mask=padding_mask)

        if features_only:
            return {"x": x, "padding_mask": padding_mask}

        if self.quantizer:
            q = self.quantizer(y, produce_targets=False)
            y = q["x"]
            num_vars = q["num_vars"]
            code_ppl = q["code_perplexity"]
            prob_ppl = q["prob_perplexity"]
            curr_temp = q["temp"]

            y = self.project_q(y)

            if self.negatives_from_everywhere:
                neg_cands, *_ = self.quantizer(unmasked_features,
                                               produce_targets=False)
                negs, _ = self.sample_negatives(neg_cands, y.size(1))
                negs = self.project_q(negs)

            else:
                negs, _ = self.sample_negatives(y, y.size(1))

            if self.codebook_negatives > 0:
                cb_negs = self.quantizer.sample_from_codebook(
                    y.size(0) * y.size(1), self.codebook_negatives)
                cb_negs = cb_negs.view(self.codebook_negatives, y.size(0),
                                       y.size(1), -1)  # order doesn't matter
                cb_negs = self.project_q(cb_negs)
                negs = torch.cat([negs, cb_negs], dim=0)
        else:
            y = self.project_q(y)

            if self.negatives_from_everywhere:
                negs, _ = self.sample_negatives(unmasked_features, y.size(1))
                negs = self.project_q(negs)
            else:
                negs, _ = self.sample_negatives(y, y.size(1))

        x = x[mask_indices].view(x.size(0), -1, x.size(-1))

        if self.target_glu:
            y = self.target_glu(y)
            negs = self.target_glu(negs)

        x = self.final_proj(x)
        x = self.compute_preds(x, y, negs)

        result = {
            "x": x,
            "padding_mask": padding_mask,
            "features_pen": features_pen
        }

        if prob_ppl is not None:
            result["prob_perplexity"] = prob_ppl
            result["code_perplexity"] = code_ppl
            result["num_vars"] = num_vars
            result["temp"] = curr_temp

        return result
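The `padding_mask` handling above (also present in Example #20) downsamples a sample-level mask to the frame rate of the feature extractor: trailing samples that do not fill a whole frame are dropped, and a frame counts as padding only if all of its samples are padding. A toy sketch (sizes are illustrative, not the real downsampling ratio):

import torch

num_frames = 3                                            # plays the role of features.size(1)
padding_mask = torch.tensor([[False] * 6 + [True] * 4])   # 1 sequence, 10 samples, last 4 padded

extra = padding_mask.size(1) % num_frames                 # 10 % 3 == 1 leftover sample
if extra > 0:
    padding_mask = padding_mask[:, :-extra]               # -> (1, 9)

# Group samples by frame; a frame is padding iff every sample in it is padding.
padding_mask = padding_mask.view(padding_mask.size(0), num_frames, -1).all(-1)
print(padding_mask)  # tensor([[False, False,  True]])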
Example #25
    def forward(self,
                text_sequences,
                text_positions=None,
                lengths=None,
                speaker_embed=None):
        assert self.n_speakers == 1 or speaker_embed is not None

        # embed text_sequences
        x = self.embed_tokens(text_sequences)
        if text_positions is not None:
            x += self.embed_text_positions(text_positions)

        x = F.dropout(x, p=self.dropout, training=self.training)

        # embed speakers
        if speaker_embed is not None:
            # expand speaker embedding for all time steps
            # (B, N) -> (B, T, N)
            ss = speaker_embed.size()
            speaker_embed = speaker_embed.unsqueeze(1).expand(
                ss[0], x.size(1), ss[-1])
            speaker_embed_btc = speaker_embed
            speaker_embed_tbc = speaker_embed.transpose(0, 1)
            x += F.softsign(self.speaker_fc1(speaker_embed_btc))

        input_embedding = x

        # project to size of convolution
        x = self.fc1(x)

        use_convtbc = isinstance(self.convolutions[0], _ConvTBC)
        # TBC case: B x T x C -> T x B x C
        # Generic case: B x T x C -> B x C x T
        x = x.transpose(0, 1) if use_convtbc else x.transpose(1, 2)

        # 1D conv blocks
        for proj, speaker_proj, conv in zip(self.projections,
                                            self.speaker_projections,
                                            self.convolutions):
            residual = x if proj is None else proj(x)
            x = F.dropout(x, p=self.dropout, training=self.training)
            x = conv(x)
            splitdim = -1 if use_convtbc else 1
            a, b = x.split(x.size(splitdim) // 2, dim=splitdim)
            if speaker_proj is not None:
                softsign = F.softsign(
                    speaker_proj(speaker_embed_tbc
                                 if use_convtbc else speaker_embed_btc))
                softsign = softsign if use_convtbc else softsign.transpose(
                    1, 2)
                a = a + softsign
            x = a * torch.sigmoid(b)
            x = (x + residual) * math.sqrt(0.5)

        # Back to batch first
        x = x.transpose(0, 1) if use_convtbc else x.transpose(1, 2)

        # project back to size of embedding
        keys = self.fc2(x)
        if speaker_embed is not None:
            keys += F.softsign(self.speaker_fc2(speaker_embed_btc))

        # scale gradients (this only affects backward, not forward)
        if self.num_attention_layers is not None:
            keys = GradMultiply.apply(keys,
                                      1.0 / (2.0 * self.num_attention_layers))

        # add output to input embedding for attention
        values = (keys + input_embedding) * math.sqrt(0.5)

        return keys, values
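The split/sigmoid gating inside the loop above is the same operation that `F.glu` performs in the other examples; a quick check on a toy tensor (channel-first layout, names chosen for illustration) confirms the equivalence:

import torch
import torch.nn.functional as F

x = torch.randn(2, 8, 5)                  # B x C x T; the gated dimension must be even

a, b = x.split(x.size(1) // 2, dim=1)     # manual gating, as in the loop above
manual = a * torch.sigmoid(b)

assert torch.allclose(manual, F.glu(x, dim=1))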
    def forward(self, src_tokens, src_lengths):
        """
        Args:
            src_tokens (LongTensor): tokens in the source language of shape
                `(batch, src_len)`
            src_lengths (LongTensor): lengths of each source sentence of shape
                `(batch)`

        Returns:
            dict:
                - **encoder_out** (tuple): a tuple with two elements, where the
                  first element is the last encoder layer's output and the
                  second element is the same quantity summed with the input
                  embedding (used for attention). The shape of both tensors is
                  `(batch, src_len, embed_dim)`.
                - **encoder_padding_mask** (ByteTensor): the positions of
                  padding elements of shape `(batch, src_len)`
        """
        # ELMo: map token ids back to token strings for the character-based encoder
        src_sens = src_tokens.cpu().numpy().tolist()
        for i in range(len(src_sens)):
            for j in range(len(src_sens[i])):
                src_sens[i][j] = self.id2token[src_sens[i][j]]
        char_ids = batch_to_ids(src_sens)
        # run ELMo on the same device as the input tokens (the original hard-coded 'cuda')
        elmo_embeds = self.elmo(char_ids.to(src_tokens.device))
        elmo_embeds = elmo_embeds['elmo_representations']
        input_elmo_embeds = elmo_embeds[0]
        output_elmo_embeds = None
        if self.args.num_output_repr == 2:
            output_elmo_embeds = elmo_embeds[1]
        if torch.cuda.is_available():
            input_elmo_embeds = input_elmo_embeds.to('cuda')
            if output_elmo_embeds is not None:
                output_elmo_embeds = output_elmo_embeds.to('cuda')
        if self.args.merge_mode == 'sum':
            input_elmo_embeds = self.elmo_projection(input_elmo_embeds)
            if output_elmo_embeds is not None:
                output_elmo_embeds = self.elmo_projection(output_elmo_embeds)

        if self.args.use_other_embed:
            # embed tokens and positions
            if self.args.merge_mode == 'sum':
                x = self.embed_tokens(src_tokens) + self.embed_positions(
                    src_tokens) + input_elmo_embeds
            else:
                x = self.embed_tokens(src_tokens) + self.embed_positions(
                    src_tokens)
                x = torch.cat((x, input_elmo_embeds), dim=-1)
        else:
            # just use ELMo repr
            if self.args.merge_mode == 'sum':
                x = input_elmo_embeds + self.embed_positions(src_tokens)
            else:
                x = torch.cat(
                    (self.embed_positions(src_tokens), input_elmo_embeds),
                    dim=-1)

        x = F.dropout(x, p=self.dropout, training=self.training)
        input_embedding = x

        # project to size of convolution
        x = self.fc1(x)

        # used to mask padding in input
        encoder_padding_mask = src_tokens.eq(self.padding_idx).t()  # -> T x B
        if not encoder_padding_mask.any():
            encoder_padding_mask = None

        # B x T x C -> T x B x C
        x = x.transpose(0, 1)

        residuals = [x]
        # temporal convolutions
        for proj, conv, res_layer in zip(self.projections, self.convolutions,
                                         self.residuals):
            if res_layer > 0:
                residual = residuals[-res_layer]
                residual = residual if proj is None else proj(residual)
            else:
                residual = None

            if encoder_padding_mask is not None:
                x = x.masked_fill(encoder_padding_mask.unsqueeze(-1), 0)

            x = F.dropout(x, p=self.dropout, training=self.training)
            if conv.kernel_size[0] % 2 == 1:
                # padding is implicit in the conv
                x = conv(x)
            else:
                padding_l = (conv.kernel_size[0] - 1) // 2
                padding_r = conv.kernel_size[0] // 2
                x = F.pad(x, (0, 0, 0, 0, padding_l, padding_r))
                x = conv(x)
            x = F.glu(x, dim=2)

            if residual is not None:
                x = (x + residual) * math.sqrt(0.5)
            residuals.append(x)

        # T x B x C -> B x T x C
        x = x.transpose(1, 0)

        # ELMo and `concat`
        if output_elmo_embeds is not None and self.args.merge_mode == 'concat':
            x = torch.cat((x, output_elmo_embeds), dim=-1)

        # project back to size of embedding
        x = self.fc2(x)

        # ELMo and `sum`
        if output_elmo_embeds is not None and self.args.merge_mode == 'sum':
            x += output_elmo_embeds

        if encoder_padding_mask is not None:
            encoder_padding_mask = encoder_padding_mask.t()  # -> B x T
            x = x.masked_fill(encoder_padding_mask.unsqueeze(-1), 0)

        # scale gradients (this only affects backward, not forward)
        x = GradMultiply.apply(x, 1.0 / (2.0 * self.num_attention_layers))

        # add output to input embedding for attention
        y = (x + input_embedding) * math.sqrt(0.5)

        return {
            'encoder_out': (x, y),
            'encoder_padding_mask': encoder_padding_mask,  # B x T
        }
Example #27
    def forward(self, src_tokens, src_lengths):
        """
        Args:
            src_tokens (LongTensor): tokens in the source language of shape
                `(batch, src_len)`
            src_lengths (LongTensor): lengths of each source sentence of shape
                `(batch)`

        Returns:
            dict:
                - **encoder_out** (tuple): a tuple with two elements, where the
                  first element is the last encoder layer's output and the
                  second element is the same quantity summed with the input
                  embedding (used for attention). The shape of both tensors is
                  `(batch, src_len, embed_dim)`.
                - **encoder_padding_mask** (ByteTensor): the positions of
                  padding elements of shape `(batch, src_len)`
        """
        # embed tokens and positions
        x = self.embed_tokens(src_tokens) + self.embed_positions(src_tokens)
        x = F.dropout(x, p=self.dropout, training=self.training)
        input_embedding = x

        # project to size of convolution
        x = self.fc1(x)

        # used to mask padding in input
        encoder_padding_mask = src_tokens.eq(self.padding_idx).t()  # -> T x B
        if not encoder_padding_mask.any():
            encoder_padding_mask = None

        # B x T x C -> T x B x C
        x = x.transpose(0, 1)

        residuals = [x]
        # temporal convolutions
        for proj, convs, res_layer in zip(self.projections,
                                          self.inner_convolutions,
                                          self.residuals):
            if res_layer > 0:
                residual = residuals[-res_layer]
                residual = residual if proj is None else proj(residual)
            else:
                residual = None

            if encoder_padding_mask is not None:
                x = x.masked_fill(encoder_padding_mask.unsqueeze(-1), 0)

            x = F.dropout(x, p=self.dropout, training=self.training)
            conv_list = []
            for conv in convs:
                if conv.kernel_size[0] % 2 == 1:
                    # padding is implicit in the conv
                    t = conv(x)
                else:
                    padding_l = (conv.kernel_size[0] - 1) // 2
                    padding_r = conv.kernel_size[0] // 2
                    t = F.pad(x, (0, 0, 0, 0, padding_l, padding_r))
                    t = conv(t)
                conv_list.append(t)

            #print("CList: {}".format(conv_list))
            x = torch.stack(conv_list, dim=1)  #.sum(dim=0)
            #print("SHAPE: {}".format(x.shape))
            #x = x.T
            #print("SHAPE: {}".format(x.shape))
            #x = self.mp2d(x)#.sum(dim=0)
            #print("SHAPE: {}".format(x.shape))
            #x = x.T
            #print("SHAPE: {}".format(x.shape))
            #print("SHAPE: {}".format(x.shape))
            #x.cuda()
            x = x.sum(dim=1)
            #print("SHAPE: {}".format(x.shape))
            x = F.glu(x, dim=2)
            # TODO(naetherm): With the layer normalization below we only did that after the
            # last encoder but we want to place the normalization between the encoders
            #if self.batch_norm:
            x = F.layer_norm(x, x.shape)

            if residual is not None:
                x = (x + residual) * math.sqrt(0.5)
            residuals.append(x)

        # T x B x C -> B x T x C
        x = x.transpose(1, 0)

        # TODO(naetherm): first plausible position for the layer normalization

        # project back to size of embedding
        x = self.fc2(x)

        # TODO(naetherm): Second plausible position for the layer normalization
        #x = F.layer_norm(x, x.shape)

        if encoder_padding_mask is not None:
            encoder_padding_mask = encoder_padding_mask.t()  # -> B x T
            x = x.masked_fill(encoder_padding_mask.unsqueeze(-1), 0)

        # TODO(naetherm): Third plausible position for the layer normalization

        # scale gradients (this only affects backward, not forward)
        x = GradMultiply.apply(x, 1.0 / (2.0 * self.num_attention_layers))

        # add output to input embedding for attention
        y = (x + input_embedding) * math.sqrt(0.5)

        return {
            'encoder_out': (x, y),
            'encoder_padding_mask': encoder_padding_mask,  # B x T
        }
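As a footnote on the multi-kernel block in this last example: stacking the per-kernel outputs and summing over the new dimension is equivalent to summing the list directly, which avoids materializing the extra dimension. A small illustrative check with hypothetical shapes:

import torch

conv_list = [torch.randn(7, 2, 16) for _ in range(3)]   # three T x B x C branch outputs

stacked_sum = torch.stack(conv_list, dim=1).sum(dim=1)  # as written in the example
direct_sum = conv_list[0] + conv_list[1] + conv_list[2]

assert torch.allclose(stacked_sum, direct_sum)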