Example #1
 def forward(self, text):
     embedded = self.embedding(text)
     x, (hn, cn) = self.hidden(embedded)  # (seq_len, batch, hidden_size)
     x = x[-1, :, :]  # keep only the final timestep
     x = F.hardswish(self.decode(x))
     # Tie the output projection to the embedding matrix; .data detaches
     # it, so no gradient reaches the embedding through this projection.
     y = F.linear(x, self.embedding.weight.data)
     return y
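The layers referenced in Example #1 are defined elsewhere; a minimal sketch of an enclosing module that would make this forward pass work, assuming an LSTM and hypothetical names and sizes:

    import torch.nn as nn

    class TiedLSTMDecoder(nn.Module):  # hypothetical reconstruction
        def __init__(self, vocab_size, embed_dim, hidden_size):
            super().__init__()
            self.embedding = nn.Embedding(vocab_size, embed_dim)
            self.hidden = nn.LSTM(embed_dim, hidden_size)
            # decode maps the LSTM state back to embed_dim so that the
            # output projection can reuse the embedding matrix above.
            self.decode = nn.Linear(hidden_size, embed_dim)

Since the embedding weight has shape (vocab_size, embed_dim), F.linear(x, W) computes x @ W.T and yields one logit per vocabulary entry.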
Example #2
    def train_batch(self, x, y, jump_aux=False, drop_final=False):
        layers = self.net(x)
        if self.feature_layer == 'logits':
            logits = layers['logits']
            loss = F.cross_entropy(logits, y)
            return dict(loss=loss, logits=logits)
        feature_maps = layers[self.feature_layer]
        raw_attentions = layers[self.attention_layer]
        attention_maps_ = self.attentions(raw_attentions)
        # Dropout on a ones tensor: drops whole attention maps at once.
        dropout_mask = self.dropout(
            torch.ones([attention_maps_.shape[0], self.M, 1], device=x.device))
        attention_maps = attention_maps_ * torch.unsqueeze(dropout_mask, -1)
        feature_maps, feature_maps_d = self.texture_enhance(
            feature_maps, attention_maps_)
        # Standardize the texture-enhanced features over the spatial dims.
        feature_maps_d = feature_maps_d - feature_maps_d.mean(dim=[2, 3],
                                                              keepdim=True)
        feature_maps_d = feature_maps_d / (
            torch.std(feature_maps_d, dim=[2, 3], keepdim=True) + 1e-8)
        feature_matrix_ = self.atp(feature_maps, attention_maps_)
        feature_matrix = feature_matrix_ * dropout_mask

        B, M, N = feature_matrix.size()
        if not jump_aux:
            aux_loss, feature_matrix_d = self.auxiliary_loss(
                feature_maps_d, attention_maps_, y)
        else:
            feature_matrix_d = self.atp(feature_maps_d, attention_maps_)
            aux_loss = 0
        feature_matrix = feature_matrix.view(B, -1)
        feature_matrix = F.hardswish(self.projection_local(feature_matrix))
        final = layers['final']
        attention_maps = attention_maps.sum(dim=1, keepdim=True)
        final = self.atp(final, attention_maps, norm=1).squeeze(1)
        final = self.dropout_final(final)
        projected_final = F.hardswish(self.project_final(final))
        #projected_final=self.dropout(projected_final.view(B,1,-1)).view(B,-1)
        if drop_final:
            projected_final *= 0
        feature_matrix = torch.cat((feature_matrix, projected_final), 1)
        ensemble_logit = self.ensemble_classifier_fc(feature_matrix)
        ensemble_loss = F.cross_entropy(ensemble_logit, y)
        return dict(ensemble_loss=ensemble_loss,
                    aux_loss=aux_loss,
                    attention_maps=attention_maps_,
                    ensemble_logit=ensemble_logit,
                    feature_matrix=feature_matrix_,
                    feature_matrix_d=feature_matrix_d)
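The ones-tensor passed through self.dropout in train_batch is a trick worth isolating: it zeroes (and rescales) entire attention maps rather than individual elements. A standalone illustration:

    import torch
    import torch.nn as nn

    dropout = nn.Dropout(0.3)
    B, M = 2, 4
    # (B, M, 1) mask: each of the M attention maps is either kept and
    # rescaled by 1/(1-p) or dropped as a whole.
    mask = dropout(torch.ones(B, M, 1))
    attention_maps = torch.randn(B, M, 8, 8)
    masked = attention_maps * mask.unsqueeze(-1)  # broadcasts over H, W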
Example #3
    def forward(self, src: Tensor, tgt: Tensor) -> Tensor:
        r"""Forward propagate data.

        Args:
            src: Input to create the hidden context vector.
            tgt: Expected output.

        Shapes:
            src: (S, N, E)
            tgt: (T, N, E)
        """
        T, N, E = tgt.shape
        assert T == self.tgt_window, (
            f"The output sequence length must be the same length as the "
            f"target window. {T} != {self.tgt_window}")

        tgt_future_mask = self.future_token_square_mask(T)

        assert src.shape[-1] == self.n_in_features, (
            "The last dimension of src must equal "
            "n_time_features + n_linear_features.")

        time_features = src[:, :, :self.n_time_features]
        linear_features = src[:, :, -self.n_linear_features:]

        assert time_features.shape[-1] > 0, (
            "There should be at least one time feature.")
        assert linear_features.shape[-1] > 0, (
            "There should be at least one linear feature.")

        time_embeddings = self.time_embedding(time_features) * math.sqrt(
            self.d_time_embed)
        linear_embeddings = F.hardswish(self.linear_embedding(linear_features))

        src_embeddings = F.hardswish(
            torch.cat([time_embeddings, linear_embeddings], dim=-1))

        encoded = self.encoder(src_embeddings)

        tgt_embeddings = self.tgt_embedding(tgt)
        decoder = self.decoder(tgt_embeddings,
                               encoded,
                               tgt_mask=tgt_future_mask)

        out = self.projection(decoder)
        return out
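future_token_square_mask is not shown above; it presumably builds the standard causal mask that keeps the decoder from attending to future target positions (what nn.Transformer.generate_square_subsequent_mask produces). A minimal sketch:

    import torch

    def future_token_square_mask(size):
        # Additive (size, size) mask: 0 on and below the diagonal, -inf
        # above it, so position t cannot attend to positions after t.
        return torch.triu(torch.full((size, size), float("-inf")), diagonal=1)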
Example #4
    def forward(self, src: Tensor) -> Tensor:
        r"""Forward propagate data. It expects the src tensor to be of shape
        (S, N, F) where the first time_features E are used in the Time2Vec
        model. The model will consume the first time_features from the src
        tensor and create time embeddings via a Time2Vec model. It will then
        pass the remaining linear_features into a standard linear layer to be
        concatenated with the time embeddings.

        Args:
            src: features

        Shapes:
            src: (S, N, F)
            out: (S, N, P)
        """
        assert src.shape[-1] == self.n_in_features, (
            "The last dimension of src must equal "
            "n_time_features + n_linear_features.")

        time_features = src[:, :, :self.n_time_features]
        linear_features = src[:, :, self.n_time_features:]

        assert time_features.shape[-1] > 0, (
            "There should be at least one time feature.")
        assert linear_features.shape[-1] > 0, (
            "There should be at least one linear feature.")

        time_embeddings = self.time_embedding(time_features) * math.sqrt(
            self.d_time_embed)
        if self.positional_encoding is not None:
            time_embeddings = self.positional_encoding(time_embeddings)
        linear_proj = self.dropout1(self.linear_src(linear_features))

        # Concatenate the time embeddings and linear features that were
        # previously separated.
        x = F.hardswish(torch.cat([time_embeddings, linear_proj], dim=-1))

        assert x.shape[-1] == self.d_time_embed + self.d_linear_embed, (
            "The dimensionality of the concatenated time embeddings and "
            "linear hidden dims must be equal to d_time_embed + d_linear_embed."
        )

        encoded = F.hardswish(self.encoder(x))
        out = self.projection(encoded)

        return out
Example #5
 def forward(self, x, y=0, train_batch=False, AG=None):
     if train_batch:
         if AG is None:
             return self.train_batch(x, y)
         else:
             loss_pack = self.train_batch(x, y)
             # Build attention-guided augmented inputs (AGDA) from the
             # first-pass attention maps.
             with torch.no_grad():
                 Xaug, index = AG.agda(x, loss_pack['attention_maps'])
             #self.eval()
             loss_pack2 = self.train_batch(Xaug, y, jump_aux=False)
             #self.train()
             loss_pack['AGDA_ensemble_loss'] = loss_pack2['ensemble_loss']
             loss_pack['AGDA_aux_loss'] = loss_pack2['aux_loss']
             # Match features on every attention map except the augmented
             # one (index), penalizing feature drift under augmentation.
             one_hot = F.one_hot(index, self.M)
             loss_pack['match_loss'] = torch.mean(
                 torch.norm(loss_pack2['feature_matrix_d'] -
                            loss_pack['feature_matrix_d'],
                            dim=-1) * (torch.ones_like(one_hot) - one_hot))
             return loss_pack
     layers = self.net(x)
     if self.feature_layer == 'logits':
         logits = layers['logits']
         return logits
     raw_attentions = layers[self.attention_layer]
     attention_maps = self.attentions(raw_attentions)
     feature_maps = layers[self.feature_layer]
     feature_maps, feature_maps_d = self.texture_enhance(
         feature_maps, attention_maps)
     feature_matrix = self.atp(feature_maps, attention_maps)
     B, M, N = feature_matrix.size()
     feature_matrix = self.dropout(feature_matrix)
     feature_matrix = feature_matrix.view(B, -1)
     feature_matrix = F.hardswish(self.projection_local(feature_matrix))
     final = layers['final']
     attention_maps2 = attention_maps.sum(dim=1, keepdim=True)
     final = self.atp(final, attention_maps2, norm=1).squeeze(1)
     projected_final = F.hardswish(self.project_final(final))
     feature_matrix = torch.cat((feature_matrix, projected_final), 1)
     ensemble_logit = self.ensemble_classifier_fc(feature_matrix)
     return ensemble_logit
Example #6
    def forward(self, src: Tensor) -> Tensor:
        r"""Forward propagate data.
        Args:
            src: tensor containing time features.

        Shapes:
            src: (*, N, F)
            output: (*, N, E)
        """
        linear = self.dropout1(self.linear_time_proj(src))
        periodic = self.dropout2(self.activation(self.periodic_time_proj(src)))
        out = F.hardswish(torch.cat([linear, periodic], dim=-1))
        out = self.dropout3(self.proj(out))
        return out
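Example #6 follows the Time2Vec pattern: a linear ("trend") projection concatenated with a projection passed through a periodic activation. The referenced layers are not shown; a minimal __init__ that matches this forward, assuming torch.sin as the periodic activation and hypothetical dimension names:

    import torch
    import torch.nn as nn

    class Time2Vec(nn.Module):  # hypothetical reconstruction
        def __init__(self, n_features, d_linear, d_periodic, d_out, p=0.1):
            super().__init__()
            self.linear_time_proj = nn.Linear(n_features, d_linear)
            self.periodic_time_proj = nn.Linear(n_features, d_periodic)
            self.activation = torch.sin
            self.proj = nn.Linear(d_linear + d_periodic, d_out)
            self.dropout1 = nn.Dropout(p)
            self.dropout2 = nn.Dropout(p)
            self.dropout3 = nn.Dropout(p)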
Example #7
 def forward(self, x):
     return F.hardswish(x, inplace=self.inplace)
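Example #7 (and the similar wrappers in Examples #8 and #15 through #17 below) is a thin module around the functional call; a complete, minimal equivalent of torch.nn.Hardswish:

    import torch.nn as nn
    import torch.nn.functional as F

    class Hardswish(nn.Module):
        def __init__(self, inplace=False):
            super().__init__()
            self.inplace = inplace

        def forward(self, x):
            # hardswish(x) = x * relu6(x + 3) / 6
            return F.hardswish(x, inplace=self.inplace)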
Example #8
 def forward(self, x):
     return F.hardswish(x)
Example #9
 def forward(self, input):
     return self.activation_post_process(F.hardswish(input))
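Example #9 shows the pattern used in quantization-aware modules, where activation_post_process is an observer that records (or fake-quantizes) the activation statistics. A minimal sketch with a hypothetical min/max observer standing in for the real one:

    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    class MinMaxObserver(nn.Module):  # hypothetical stand-in
        def __init__(self):
            super().__init__()
            self.register_buffer("min_val", torch.tensor(float("inf")))
            self.register_buffer("max_val", torch.tensor(float("-inf")))

        def forward(self, x):
            # Track the running range of the activation, pass it through.
            self.min_val = torch.minimum(self.min_val, x.min())
            self.max_val = torch.maximum(self.max_val, x.max())
            return x

    class ObservedHardswish(nn.Module):  # mirrors Example #9
        def __init__(self):
            super().__init__()
            self.activation_post_process = MinMaxObserver()

        def forward(self, input):
            return self.activation_post_process(F.hardswish(input))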
Example #10
    def forward(self, x):
        h0 = F.hardswish(self.bn0(self.conv0(x)))

        h1 = F.hardswish(self.bn1(self.conv1(h0)))
        h2 = F.hardswish(self.bn2(self.conv2(h1)) + h0)
        
        h3 = F.hardswish(self.bn3(self.conv3(h2)))
        h4 = F.hardswish(self.bn4(self.conv4(h3)) + h2)

        h5 = F.hardswish(self.bn5(self.conv5(h4)))
        h6 = F.hardswish(self.bn6(self.conv6(h5)) + h4)
        
        h7 = F.hardswish(self.bn7(self.conv7(h6)))
        h8 = F.hardswish(self.bn8(self.conv8(h7)) + h6)
        
        h9 = F.hardswish(self.bn9(self.conv9(h8)))
        h10 = F.hardswish(self.bn10(self.conv10(h9)) + h8)

        h11 = F.hardswish(self.bn11(self.conv11(h10)))
        h12 = F.hardswish(self.bn12(self.conv12(h11)) + h10)
        
        h13 = F.hardswish(self.bn13(self.conv13(h12)))
        h14 = F.hardswish(self.bn14(self.conv14(h13)) + h12)

        h15 = F.hardswish(self.bn15(self.conv15(h14)))
        h16 = F.hardswish(self.bn16(self.conv16(h15)) + h14)
        
        h17 = F.hardswish(self.bn17(self.conv17(h16)))
        h18 = F.hardswish(self.bn18(self.conv18(h17)) + h16)
        
        h19 = F.hardswish(self.bn19(self.conv19(h18)))
        h20 = F.hardswish(self.bn20(self.conv20(h19)) + h18)

        h21 = F.hardswish(self.bn21(self.conv21(h20)))
        h22 = F.hardswish(self.bn22(self.conv22(h21)) + h20)
        
        h23 = F.hardswish(self.bn23(self.conv23(h22)))
        h24 = F.hardswish(self.bn24(self.conv24(h23)) + h22)

        h25 = F.hardswish(self.bn25(self.conv25(h24)))
        h26 = F.hardswish(self.bn26(self.conv26(h25)) + h24)
        
        h27 = F.hardswish(self.bn27(self.conv27(h26)))
        h28 = F.hardswish(self.bn28(self.conv28(h27)) + h26)
        
        h29 = F.hardswish(self.bn29(self.conv29(h28)))
        h30 = F.hardswish(self.bn30(self.conv30(h29)) + h28)

        h31 = F.hardswish(self.bn31(self.conv31(h30)))
        h32 = F.hardswish(self.bn32(self.conv32(h31)) + h30)
        
        h33 = F.hardswish(self.bn33(self.conv33(h32)))
        h34 = F.hardswish(self.bn34(self.conv34(h33)) + h32)

        h35 = F.hardswish(self.bn35(self.conv35(h34)))
        h36 = F.hardswish(self.bn36(self.conv36(h35)) + h34)
        
        h37 = F.hardswish(self.bn37(self.conv37(h36)))
        h38 = F.hardswish(self.bn38(self.conv38(h37)) + h36)

        # policy network
        h_p1 = F.hardswish(self.bn_p1(self.conv_p1(h38)))
        h_p1 = torch.flatten(h_p1, 1)
        out_p = self.fc_p2(h_p1)

        # value network
        h_v1 = F.hardswish(self.bn_v1(self.conv_v1(h38)))
        h_v1 = torch.flatten(h_v1, 1)
        h_v2 = F.hardswish(self.fc_v2(h_v1))

        out_v = torch.tanh(self.fc_v3(h_v2))

        return (out_p, out_v)
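The nineteen conv/bn pairs in Example #10 repeat a single pattern, with a skip connection spanning every second layer, so the tower can be written as a loop. A behavior-equivalent sketch, assuming a constant channel width and 3x3 convolutions (the layer shapes are not given in the original):

    import torch.nn as nn
    import torch.nn.functional as F

    class ResidualTower(nn.Module):  # hypothetical refactor
        def __init__(self, channels, n_blocks=19):
            super().__init__()
            self.blocks = nn.ModuleList(
                nn.ModuleDict({
                    "conv_a": nn.Conv2d(channels, channels, 3, padding=1),
                    "bn_a": nn.BatchNorm2d(channels),
                    "conv_b": nn.Conv2d(channels, channels, 3, padding=1),
                    "bn_b": nn.BatchNorm2d(channels),
                }) for _ in range(n_blocks))

        def forward(self, x):
            for blk in self.blocks:
                h = F.hardswish(blk["bn_a"](blk["conv_a"](x)))
                x = F.hardswish(blk["bn_b"](blk["conv_b"](h)) + x)
            return x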
Example #11
 def forward(self, x, y, z, w):
     x = F.hardswish(x)
     y = hardswish_forward_0(y)
     z = hardswish_forward_1(z)
     w = hardswish_forward_2(w)
     return x, y, z, w
Example #12
 def forward(self, input: torch.Tensor) -> torch.Tensor:
     return F.hardswish(input)
Example #13
    def optimize_layer(self, node, float_layer, layer_inputs, layer_act_group,
                       net_inputs, net_loss, last_quant_mods, device):
        batch_factor = 0.5 if layer_inputs[0].size(0) == 1 else 1

        layer = node.module
        float_data = np.fabs(
            float_layer.weight.cpu().detach().numpy().flatten())
        quant_data = np.fabs(layer.weight.cpu().detach().numpy().flatten())
        q_noise = np.square(float_data - quant_data).mean()

        sqnr = 10 * np.log10(np.square(float_data).mean() / q_noise)
        quantize_efficiency = sqnr / 8.0

        lr_factor = NndctOption.nndct_finetune_lr_factor.value
        lr_factor = lr_factor * batch_factor
        if quantize_efficiency > 4.5:
            lr_factor = 0.1 * lr_factor * batch_factor

        lr_w = lr_factor * layer.weight.std().item()
        # lr_w=1e-3
        opt_weight = torch.optim.Adam([layer.weight], lr=lr_w)
        opt_bias = None
        lr_b = 0
        if hasattr(layer, "bias") and layer.bias is not None:
            if layer.bias.flatten().shape[0] == 1:
                lr_b = 0.0
            else:
                lr_b = lr_factor * layer.bias.std().item()
            # lr_b = lr_factor * layer.bias.std().item()
            # lr_b=1e-3
            opt_bias = torch.optim.Adam([layer.bias], lr=lr_b)

        #print(f"learning rate: lr_w={lr_w}, lr_b={lr_b}")
        #print(f"pre quant efficiency:{quantize_efficiency}")
        iters = 20
        total_loss = AverageMeter("layer_loss")
        best_params = self.get_layer_params(layer)
        handlers = self.hook_cache_output([float_layer])
        for input_args in zip(*net_inputs):
            with torch.no_grad():
                f_model = self._float_model.to(device)
                f_model.eval()
                new_input_args = []
                for ip in input_args:
                    if isinstance(ip, torch.Tensor):
                        new_input_args.append(ip.to(device))
                _ = f_model(*new_input_args)
        torch.cuda.empty_cache()
        self.clean_hooks(handlers)

        for i in range(iters):
            for idx, layer_input in enumerate(layer_inputs):
                train_output = self._cached_outputs[float_layer][idx].to(
                    device)
                qout = layer(layer_input.to(device))
                # train_output = train_output.to(device)

                if node in layer_act_group:
                    act_node = layer_act_group[node]
                    q_act_layer = act_node.module
                    inplace = q_act_layer.inplace
                    q_act_layer.inplace = False
                    qout = q_act_layer(qout)
                    q_act_layer.inplace = inplace
                    if act_node.op.type == NNDCT_OP.RELU:
                        train_output = F.relu(train_output)
                    elif act_node.op.type == NNDCT_OP.RELU6:
                        train_output = F.relu6(train_output)
                    elif act_node.op.type == NNDCT_OP.HSIGMOID:
                        train_output = F.hardsigmoid(train_output)
                    elif act_node.op.type == NNDCT_OP.HSWISH:
                        train_output = F.hardswish(train_output)
                    else:
                        raise NotImplementedError()

                if NndctOption.nndct_quant_opt.value > 0:
                    loss = F.mse_loss(qout, train_output) + F.mse_loss(
                        layer.weight,
                        float_layer.weight.detach().to(device))
                else:
                    loss = F.mse_loss(qout, train_output)

                total_loss.update(loss.item())

                opt_weight.zero_grad()
                if opt_bias:
                    opt_bias.zero_grad()

                loss.backward()
                opt_weight.step()
                if opt_bias:
                    opt_bias.step()

            float_data = np.fabs(layer.weight.cpu().detach().numpy().flatten())
            layer.param_quantized = False
            handlers = self.hook_cache_output(last_quant_mods,
                                              hook_type="single")
            eval_loss = self.eval_loss(net_inputs, last_quant_mods, device)
            self.clean_hooks(handlers)
            quant_data = np.fabs(layer.weight.cpu().detach().numpy().flatten())
            q_noise = np.square(float_data - quant_data).mean()
            sqnr = 10 * np.log10(np.square(float_data).mean() / q_noise)
            quantize_efficiency = sqnr / 8.0
            #print(f"post quant efficiency:{quantize_efficiency}")
            # print(f"eval loss:{eval_loss} best loss:{net_loss}")
            if eval_loss < net_loss:
                best_params = self.get_layer_params(layer)
                net_loss = eval_loss
            else:
                self.set_layer_params(layer, best_params[0], best_params[1])
                break
        # self.set_layer_params(layer, best_params[0], best_params[1])
        #print(f"{node.name}\n{total_loss}")
        #print(f"opt net loss:{net_loss}")
        # self.clean_hooks()
        del self.cached_outputs[float_layer]
        # del cached_outputs
        torch.cuda.empty_cache()
        # print(f"iter:{i}")
        return net_loss
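Example #13 gauges quantization quality with the signal-to-quantization-noise ratio (SQNR); the same computation appears twice and can be factored into a helper:

    import numpy as np

    def sqnr_db(float_w, quant_w):
        # SQNR in dB: 10 * log10(signal power / quantization noise power).
        f = np.fabs(np.asarray(float_w).flatten())
        q = np.fabs(np.asarray(quant_w).flatten())
        q_noise = np.square(f - q).mean()
        return 10 * np.log10(np.square(f).mean() / q_noise)

Dividing the result by 8 (the weight bit-width) gives the quantize_efficiency score the method uses to scale the learning rate.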
Example #14
def hardswish(input, *args, **kwargs):
    # Apply hardswish to the feature tensor (input.F) of a wrapped sparse
    # tensor and wrap the result back into the same structure.
    return _wrap_tensor(input, F.hardswish(input.F, *args, **kwargs))
Example #15
 def forward(self, input: Tensor) -> Tensor:
     return F.hardswish(input, self.inplace)
Example #16
 def forward(self, input):
     return F.hardswish(input, inplace=self.inplace)
Example #17
 def forward(self, input):
     return F.hardswish(input)