def __init__(self, config): super().__init__() self.dense = MaskedLinear( config.intermediate_size, config.hidden_size, pruning_method=config.pruning_method, mask_init=config.mask_init, mask_scale=config.mask_scale, ) self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob)
def __init__(self, config): super().__init__() self.dense = MaskedLinear( config.hidden_size, config.intermediate_size, pruning_method=config.pruning_method, mask_init=config.mask_init, mask_scale=config.mask_scale, ) if isinstance(config.hidden_act, str): self.intermediate_act_fn = ACT2FN[config.hidden_act] else: self.intermediate_act_fn = config.hidden_act
def __init__(self, config): super().__init__() if config.hidden_size % config.num_attention_heads != 0 and not hasattr( config, "embedding_size"): raise ValueError( "The hidden size (%d) is not a multiple of the number of attention " "heads (%d)" % (config.hidden_size, config.num_attention_heads)) self.output_attentions = config.output_attentions self.num_attention_heads = config.num_attention_heads self.attention_head_size = int(config.hidden_size / config.num_attention_heads) self.all_head_size = self.num_attention_heads * self.attention_head_size self.query = MaskedLinear( config.hidden_size, self.all_head_size, pruning_method=config.pruning_method, mask_init=config.mask_init, mask_scale=config.mask_scale, ) self.key = MaskedLinear( config.hidden_size, self.all_head_size, pruning_method=config.pruning_method, mask_init=config.mask_init, mask_scale=config.mask_scale, ) self.value = MaskedLinear( config.hidden_size, self.all_head_size, pruning_method=config.pruning_method, mask_init=config.mask_init, mask_scale=config.mask_scale, ) self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
def create_masked_linear(in_features, out_features, config, bias=True): ret = MaskedLinear( in_features=in_features, out_features=out_features, pruning_method=config.pruning_method, mask_init=config.mask_init, mask_scale=config.mask_scale, mask_block_rows=config.mask_block_rows, mask_block_cols=config.mask_block_cols, ampere_pruning_method=config.ampere_pruning_method, ampere_mask_init=config.ampere_mask_init, ampere_mask_scale=config.ampere_mask_scale, shuffling_method=config.shuffling_method, in_shuffling_group=config.in_shuffling_group, out_shuffling_group=config.out_shuffling_group, ) return ret