def test_masked_layer_norm(self):
    """Compare ``MaskedLayerNorm`` output with a NumPy reference.

    The reference statistics are accumulated only over positions whose
    mask entry is 1, while the normalisation itself is applied to every
    position of the input.
    """
    features = np.random.rand(2, 3, 7)
    keep = np.array([[1, 1, 0], [1, 1, 1]])

    features_t = torch.from_numpy(features).float()
    keep_t = torch.from_numpy(keep).bool()

    norm_layer = MaskedLayerNorm(7, gamma0=0.2)
    normed_x = norm_layer(features_t, keep_t)

    # Five mask entries are 1, each covering seven features.
    element_count = 7 * 5
    keep_column = np.expand_dims(keep, axis=-1)

    mean = (features * keep_column).sum() / element_count
    centered = features - mean
    masked_sq_sum = ((centered * keep_column) ** 2).sum()
    std = np.sqrt(masked_sq_sum / element_count + 1e-6)
    expected = 0.2 * centered / (std + 1e-6)

    assert np.allclose(normed_x.data.numpy(), expected)
def __init__(self,
             input_dim: int,
             num_layers: int,
             hidden_dims: Union[int, List[int]],
             dropout=0.1):
    """Build a stack of linear layers with ReLU activations and dropout.

    Parameters
    ----------
    input_dim : int
        Input size of the first linear layer.
    num_layers : int
        Number of layers; used to expand scalar ``hidden_dims`` /
        ``dropout`` values and to size the activation list.
    hidden_dims : Union[int, List[int]]
        Output size of each linear layer. A non-list value is repeated
        ``num_layers`` times.
    dropout : float or list of float
        Dropout probability for each layer. A non-list value is repeated
        ``num_layers`` times.
    """
    super().__init__()

    # Broadcast scalar settings to one entry per layer.
    hidden_dims = (hidden_dims if isinstance(hidden_dims, list)
                   else [hidden_dims] * num_layers)
    dropout = (dropout if isinstance(dropout, list)
               else [dropout] * num_layers)  # type: ignore

    self._activations = [torch.nn.functional.relu for _ in range(num_layers)]

    # Layer k consumes the output of layer k - 1; the first one consumes the input.
    layer_input_dims = [input_dim, *hidden_dims[:-1]]
    self._linear_layers = torch.nn.ModuleList([
        torch.nn.Linear(fan_in, fan_out)
        for fan_in, fan_out in zip(layer_input_dims, hidden_dims)
    ])
    self._dropout = torch.nn.ModuleList(
        [torch.nn.Dropout(p=probability) for probability in dropout])

    self._output_dim = hidden_dims[-1]
    # Extra square linear layer on the final width, plus a masked layer norm
    # sized by the first hidden width.
    self.lin = torch.nn.Linear(self._output_dim, self._output_dim)
    self.ln = MaskedLayerNorm(size=hidden_dims[0])
def __init__(
        self,
        vocab: Vocabulary,
        # bert_model: Union[str, BertModel],
        bert_model_name: str,
        # bert_config_file: str,
        debug: bool,
        bert_pretrain_model: str,
        bert_max_length: int,
        multi_orac: bool,
        semantic_red_map: bool,  # use redundancy map or not
        semantic_red_map_key: str,  # p or f
        semantic_red_map_loss: str,  # bin or mag
        pair_oracle: bool,  # use pairwise estimation as salience estimation
        fusion_feedforward: FeedForward,
        semantic_feedforard: FeedForward,
        graph_encoder: GraphEncoder,
        span_extractor: SpanExtractor,
        matrix_attn: MatrixAttention,
        trainable: bool = True,
        use_disco: bool = True,
        use_disco_graph=True,
        use_coref: bool = False,
        index: str = "bert",
        dropout: float = 0.2,
        tmp_dir: str = '/datadrive/tmp/',
        stop_by_word_count: bool = True,
        use_pivot_decode: bool = False,
        trigram_block=True,
        min_pred_word: int = 30,
        max_pred_word: int = 80,
        step: int = 10,
        min_pred_unit: int = 6,
        max_pred_unit: int = 9,
        threshold_red_map: List = None,
        initializer: InitializerApplicator = InitializerApplicator(),
        regularizer: Optional[RegularizerApplicator] = None) -> None:
    """Set up the embedder, scoring layers, ROUGE evaluators and optional encoders.

    NOTE(review): the indentation of this method was lost in the version
    under review; the nesting below was reconstructed from syntax. The
    extent of each ``if`` body should be confirmed against the upstream
    file.

    Parameters
    ----------
    vocab : passed to the parent constructor.
    bert_model_name : name given to ``AutoModel.from_pretrained``; also
        inspected for the substrings ``'roberta'`` / ``'bert'``.
    debug : stored on ``self.debug``.
    bert_pretrain_model : directory containing ``best.th``; when not
        ``None``, keys starting with ``"embedder"`` are loaded from it.
    bert_max_length : when greater than 512 the position-embedding table
        is doubled in size.
    multi_orac : stored on ``self._multi_orac``.
    semantic_red_map, semantic_red_map_key, semantic_red_map_loss :
        redundancy-map switches (see the inline comments on the signature).
    pair_oracle : when true, ``matrix_attn`` is also stored as
        ``self.pair_matrix_attn``.
    fusion_feedforward, semantic_feedforard, graph_encoder, span_extractor,
    matrix_attn : pre-built sub-modules stored conditionally (see body).
    dropout : probability for ``self._dropout``.
    tmp_dir : directory passed as ``cand_path``, ``ref_path`` and
        ``path_to_valid`` to every ``PyrougeEvaluation``.
    stop_by_word_count, min_pred_word, max_pred_word, step, min_pred_unit,
    max_pred_unit, threshold_red_map : control which ``rouge_*``
        evaluators are created.
    initializer : applied to ``self._classification_layer``.
    trainable, regularizer : not used anywhere in this method body.

    Raises
    ------
    NotImplementedError
        If ``bert_max_length > 512`` and ``bert_model_name`` contains
        neither ``'roberta'`` nor ``'bert'``.
    """
    # super(TensorBertSum, self).__init__(vocab, regularizer)
    super(TensorBertSum, self).__init__(vocab)
    self.debug = debug
    self.embedder = AutoModel.from_pretrained(bert_model_name)
    self.bert_pretrain_model = bert_pretrain_model
    if bert_max_length > 512:
        # Double the position-embedding table by appending a zero block with
        # the same shape as the existing weight along dim 0.
        # 'roberta' is tested first because 'bert' is a substring of it.
        if 'roberta' in bert_model_name:
            first_half = self.embedder.embeddings.position_embeddings.weight
            second_half = torch.zeros_like(first_half,
                                           dtype=torch.float32,
                                           requires_grad=True)
            out = torch.cat([first_half, second_half], dim=0)
            self.embedder.embeddings.position_embeddings.weight = torch.nn.Parameter(
                out)
            self.embedder.embeddings.position_embeddings.num_embeddings *= 2
        elif 'bert' in bert_model_name:
            # NOTE(review): this branch reaches the embeddings through
            # `self.embedder.bert_model`, unlike the roberta branch above —
            # confirm the object returned by AutoModel has that attribute.
            first_half = self.embedder.bert_model.embeddings.position_embeddings.weight
            second_half = torch.zeros_like(first_half,
                                           dtype=torch.float32,
                                           requires_grad=True)
            # second_half = torch.empty(first_half.size(), dtype=torch.float32,requires_grad=True)
            # torch.nn.init.normal_(second_half, mean=0.0, std=1.0)
            out = torch.cat([first_half, second_half], dim=0)
            self.embedder.bert_model.embeddings.position_embeddings.weight = torch.nn.Parameter(
                out)
            self.embedder.bert_model.embeddings.position_embeddings.num_embeddings = 512 * 2
            self.embedder.max_pieces = 512 * 2
        else:
            raise NotImplementedError
    if bert_pretrain_model is not None:
        # Load only the "embedder*" entries of the checkpoint. This runs
        # before the remaining sub-modules below are assigned, so the order
        # of these statements matters for load_state_dict.
        model_dump: OrderedDict = torch.load(
            os.path.join(bert_pretrain_model, 'best.th'))
        trimmed_dump_embedder = OrderedDict()
        for k, v in model_dump.items():
            if k.startswith("embedder"):
                trimmed_dump_embedder[k] = v
        self.load_state_dict(trimmed_dump_embedder)
        print('finish loading pretrained bert')
    in_features = 768
    self._index = index
    self._dropout = torch.nn.Dropout(p=dropout)
    # Single-output linear scorer over `in_features`-sized inputs.
    self._classification_layer = torch.nn.Linear(in_features, 1)
    # reduction='none' keeps the per-element losses.
    self._loss = torch.nn.BCELoss(reduction='none')
    # self._loss = torch.nn.BCEWithLogitsLoss(reduction='none')
    self._layer_norm = MaskedLayerNorm(768)
    self._multi_orac = multi_orac
    # ROUGES
    self._stop_by_word_count = stop_by_word_count
    self._threshold_red_map = threshold_red_map
    if stop_by_word_count:
        # One evaluator per slot; with the defaults (80 - 30) / 10 gives 5.
        self.slot_num = int((max_pred_word - min_pred_word) / step)
        for i in range(self.slot_num):
            setattr(
                self, "rouge_{}".format(i),
                PyrougeEvaluation(name='rouge_{}'.format(i),
                                  cand_path=tmp_dir,
                                  ref_path=tmp_dir,
                                  path_to_valid=tmp_dir))
    else:
        self._min_pred_unit = min_pred_unit
        self._max_pred_unit = max_pred_unit
        # One evaluator per (unit count, threshold) pair; max_pred_unit is
        # excluded by range().
        # NOTE(review): threshold_red_map defaults to None, and iterating
        # None here raises TypeError when stop_by_word_count is False.
        for i in range(min_pred_unit, max_pred_unit):
            for ths in threshold_red_map:
                setattr(
                    self, "rouge_{}_{}".format(i, ths),
                    PyrougeEvaluation(name="rouge_{}_{}".format(i, ths),
                                      cand_path=tmp_dir,
                                      ref_path=tmp_dir,
                                      path_to_valid=tmp_dir))
    self._sigmoid = nn.Sigmoid()
    initializer(self._classification_layer)
    self._use_disco = use_disco
    self._use_disco_graph = use_disco_graph
    if use_disco_graph:
        self.disco_graph_encoder = graph_encoder
    self._use_coref = use_coref
    if use_coref:
        # Same `graph_encoder` object as disco_graph_encoder when both flags
        # are set, so the two attributes refer to one module.
        self.coref_graph_encoder = graph_encoder
    if self._use_coref and self._use_disco_graph:
        # NOTE(review): assumed to be the only statement in this `if` body.
        self._fusion_feedforward = fusion_feedforward
    self._span_extractor = span_extractor
    self._trigram_block = trigram_block
    self._use_pivot_decode = use_pivot_decode
    self._min_pred_word = min_pred_word
    self._max_pred_word = max_pred_word
    self._step = step
    self._semantic_red_map = semantic_red_map
    self._semantic_red_map_loss = semantic_red_map_loss
    self._semantic_red_map_key = semantic_red_map_key
    if self._semantic_red_map:
        # NOTE(review): both assignments are assumed to sit inside this `if`.
        self.red_matrix_attn = matrix_attn
        self._semantic_feedforard = semantic_feedforard
    self._pair_oracle = pair_oracle
    if self._pair_oracle:
        # Same `matrix_attn` object as red_matrix_attn when both are enabled.
        self.pair_matrix_attn = matrix_attn
def __init__(self,
             embedding_dim: int,
             filters: Sequence[Sequence[int]],
             num_highway: int,
             projection_dim: int,
             activation: str = 'relu',
             projection_location: str = 'after_highway',
             do_layer_norm: bool = False) -> None:
    """Create the convolution, highway, projection and optional layer-norm parts.

    Parameters
    ----------
    embedding_dim : int
        Number of input channels of every convolution; stored as ``input_dim``.
    filters : Sequence[Sequence[int]]
        ``(width, num)`` pairs; each yields one ``Conv1d`` with
        ``kernel_size=width`` and ``out_channels=num``.
    num_highway : int
        Second argument given to ``Highway``.
    projection_dim : int
        Output size of the projection layer; stored as ``output_dim``.
    activation : str
        ``'tanh'`` or ``'relu'``; selects ``self._activation``.
    projection_location : str
        Must be one of ``_VALID_PROJECTION_LOCATIONS``. With ``'after_cnn'``
        the highway layers are sized by ``projection_dim``, otherwise by
        the total number of filters.
    do_layer_norm : bool
        When true a ``MaskedLayerNorm`` is used; otherwise a pass-through
        callable that returns its first argument.

    Raises
    ------
    ConfigurationError
        For an unknown ``projection_location`` or ``activation``.
    """
    super().__init__()

    if projection_location not in _VALID_PROJECTION_LOCATIONS:
        raise ConfigurationError(
            f"unknown projection location: {projection_location}")

    self.input_dim = embedding_dim
    self.output_dim = projection_dim
    self._projection_location = projection_location

    if activation not in ('tanh', 'relu'):
        raise ConfigurationError(f"unknown activation {activation}")
    self._activation = (torch.nn.functional.tanh if activation == 'tanh'
                        else torch.nn.functional.relu)

    # Convolutions: one per (width, count) entry, registered under the
    # "char_conv_{i}" names so they line up with the old ELMo naming.
    self._convolutions: List[torch.nn.Module] = []
    for index, (kernel_width, channel_count) in enumerate(filters):
        char_conv = torch.nn.Conv1d(in_channels=embedding_dim,
                                    out_channels=channel_count,
                                    kernel_size=kernel_width,
                                    bias=True)
        char_conv.weight.data.uniform_(-0.05, 0.05)
        char_conv.bias.data.fill_(0.0)
        self.add_module(f"char_conv_{index}", char_conv)
        self._convolutions.append(char_conv)

    # Highway layers: sized by the projection when it comes right after the
    # CNN, otherwise by the total number of cnn filters.
    num_filters = sum(channel_count for _, channel_count in filters)
    highway_dim = (projection_dim if projection_location == 'after_cnn'
                   else num_filters)
    self._highways = Highway(highway_dim,
                             num_highway,
                             activation=torch.nn.functional.relu)
    for fused_layer in self._highways._layers:  # pylint: disable=protected-access
        # Each highway layer is one linear layer with fused W and b weights:
        # the first `highway_dim` bias entries start at 0, the rest at 2.
        fused_layer.weight.data.normal_(mean=0.0,
                                        std=np.sqrt(1.0 / highway_dim))
        fused_layer.bias[:highway_dim].data.fill_(0.0)
        fused_layer.bias[highway_dim:].data.fill_(2.0)

    # Projection layer: always num_filters -> projection_dim.
    self._projection = torch.nn.Linear(num_filters, projection_dim, bias=True)
    self._projection.weight.data.normal_(mean=0.0,
                                         std=np.sqrt(1.0 / num_filters))
    self._projection.bias.data.fill_(0.0)

    # Optional layer norm; the fallback ignores the mask and returns the tensor.
    self._layer_norm: Callable = (
        MaskedLayerNorm(self.output_dim, gamma0=0.1)
        if do_layer_norm
        else (lambda tensor, mask: tensor))