Example #1
    def test_masked_layer_norm(self):
        x_n = np.random.rand(2, 3, 7)
        mask_n = np.array([[1, 1, 0], [1, 1, 1]])

        x = torch.from_numpy(x_n).float()
        mask = torch.from_numpy(mask_n).bool()

        layer_norm = MaskedLayerNorm(7, gamma0=0.2)
        normed_x = layer_norm(x, mask)

        # Statistics are taken only over unmasked entries:
        # 5 unmasked positions (2 in the first row, 3 in the second) x 7 features.
        N = 7 * 5
        mean = (x_n * np.expand_dims(mask_n, axis=-1)).sum() / N
        std = np.sqrt((((x_n - mean) * np.expand_dims(mask_n, axis=-1)) ** 2).sum() / N + 1e-6)
        expected = 0.2 * (x_n - mean) / (std + 1e-6)

        assert np.allclose(normed_x.data.numpy(), expected)
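
The snippets in this listing use MaskedLayerNorm without showing its definition. Below is a minimal sketch consistent with the expected values in the test above (masked mean/std, a scale initialised from gamma0, and a 1e-6 epsilon); the real module may differ in details such as whether gamma is a scalar or a vector, or how the bias term is handled.

import torch

class MaskedLayerNorm(torch.nn.Module):
    """Layer normalisation whose statistics ignore masked-out positions (sketch)."""

    def __init__(self, size: int, gamma0: float = 1.0, eps: float = 1e-6) -> None:
        super().__init__()
        # gamma is initialised to gamma0; beta starts at zero.
        self.gamma = torch.nn.Parameter(torch.full((size,), float(gamma0)))
        self.beta = torch.nn.Parameter(torch.zeros(size))
        self.eps = eps

    def forward(self, x: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
        # x: (batch, seq_len, size); mask: (batch, seq_len), True/1 at real positions.
        broadcast_mask = mask.float().unsqueeze(-1)
        num_elements = broadcast_mask.sum() * x.size(-1)
        mean = (x * broadcast_mask).sum() / num_elements
        var = (((x - mean) * broadcast_mask) ** 2).sum() / num_elements
        std = torch.sqrt(var + self.eps)
        return self.gamma * (x - mean) / (std + self.eps) + self.beta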
Example #2
    def __init__(self,
                 input_dim: int,
                 num_layers: int,
                 hidden_dims: Union[int, List[int]],
                 dropout: Union[float, List[float]] = 0.1) -> None:
        super().__init__()

        if not isinstance(hidden_dims, list):
            hidden_dims = [hidden_dims] * num_layers
        if not isinstance(dropout, list):
            dropout = [dropout] * num_layers  # type: ignore

        self._activations = [torch.nn.functional.relu] * num_layers
        input_dims = [input_dim] + hidden_dims[:-1]
        linear_layers = []
        for layer_input_dim, layer_output_dim in zip(input_dims, hidden_dims):
            linear_layers.append(torch.nn.Linear(layer_input_dim, layer_output_dim))
        self._linear_layers = torch.nn.ModuleList(linear_layers)
        dropout_layers = [torch.nn.Dropout(p=value) for value in dropout]
        self._dropout = torch.nn.ModuleList(dropout_layers)
        self._output_dim = hidden_dims[-1]

        self.lin = torch.nn.Linear(self._output_dim, self._output_dim)
        self.ln = MaskedLayerNorm(size=hidden_dims[0])
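
The forward pass for this module is not shown. Below is a hedged sketch of how the ModuleLists built above are typically consumed (linear layer, then activation, then dropout, per layer); the placement of self.lin and self.ln in the real forward pass is not visible here, so they are omitted from the sketch.

    def forward(self, inputs: torch.Tensor) -> torch.Tensor:
        # Sketch only: apply each linear layer, its activation, and dropout in turn.
        output = inputs
        for layer, activation, dropout in zip(self._linear_layers,
                                              self._activations,
                                              self._dropout):
            output = dropout(activation(layer(output)))
        return output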
Example #3
    def __init__(
            self,
            vocab: Vocabulary,
            # bert_model: Union[str, BertModel],
            bert_model_name: str,
            # bert_config_file: str,
            debug: bool,
            bert_pretrain_model: str,
            bert_max_length: int,
            multi_orac: bool,
            semantic_red_map: bool,  # use redundancy map or not
            semantic_red_map_key: str,  # p or f
            semantic_red_map_loss: str,  # bin or mag
            pair_oracle: bool,  # use pairwise estimation as salience estimation
            fusion_feedforward: FeedForward,
            semantic_feedforard: FeedForward,
            graph_encoder: GraphEncoder,
            span_extractor: SpanExtractor,
            matrix_attn: MatrixAttention,
            trainable: bool = True,
            use_disco: bool = True,
            use_disco_graph=True,
            use_coref: bool = False,
            index: str = "bert",
            dropout: float = 0.2,
            tmp_dir: str = '/datadrive/tmp/',
            stop_by_word_count: bool = True,
            use_pivot_decode: bool = False,
            trigram_block=True,
            min_pred_word: int = 30,
            max_pred_word: int = 80,
            step: int = 10,
            min_pred_unit: int = 6,
            max_pred_unit: int = 9,
            threshold_red_map: Optional[List] = None,
            initializer: InitializerApplicator = InitializerApplicator(),
            regularizer: Optional[RegularizerApplicator] = None) -> None:
        # super(TensorBertSum, self).__init__(vocab, regularizer)
        super(TensorBertSum, self).__init__(vocab)
        self.debug = debug

        self.embedder = AutoModel.from_pretrained(bert_model_name)
        self.bert_pretrain_model = bert_pretrain_model
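        # If inputs may exceed 512 word pieces, extend the position-embedding
        # table by appending a zero-initialised second half of equal size.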
        if bert_max_length > 512:
            if 'roberta' in bert_model_name:
                first_half = self.embedder.embeddings.position_embeddings.weight

                second_half = torch.zeros_like(first_half,
                                               dtype=torch.float32,
                                               requires_grad=True)
                out = torch.cat([first_half, second_half], dim=0)
                self.embedder.embeddings.position_embeddings.weight = torch.nn.Parameter(
                    out)
                self.embedder.embeddings.position_embeddings.num_embeddings *= 2

            elif 'bert' in bert_model_name:
                first_half = self.embedder.embeddings.position_embeddings.weight
                second_half = torch.zeros_like(first_half,
                                               dtype=torch.float32,
                                               requires_grad=True)
                # second_half = torch.empty(first_half.size(), dtype=torch.float32,requires_grad=True)
                # torch.nn.init.normal_(second_half, mean=0.0, std=1.0)
                out = torch.cat([first_half, second_half], dim=0)
                self.embedder.embeddings.position_embeddings.weight = torch.nn.Parameter(
                    out)
                self.embedder.embeddings.position_embeddings.num_embeddings = 512 * 2
                self.embedder.max_pieces = 512 * 2
            else:
                raise NotImplementedError

        if bert_pretrain_model is not None:
            # Warm-start from a previous checkpoint, keeping only the
            # embedder weights ("embedder.*" entries in the state dict).
            model_dump: OrderedDict = torch.load(
                os.path.join(bert_pretrain_model, 'best.th'))
            trimmed_dump_embedder = OrderedDict()
            for k, v in model_dump.items():
                if k.startswith("embedder"):
                    trimmed_dump_embedder[k] = v
            self.load_state_dict(trimmed_dump_embedder, strict=False)
            print('finished loading pretrained bert')

        in_features = 768
        self._index = index
        self._dropout = torch.nn.Dropout(p=dropout)
        self._classification_layer = torch.nn.Linear(in_features, 1)
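        # BCELoss operates on probabilities, so a separate sigmoid (defined
        # below) is needed; the commented alternative takes raw logits instead.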
        self._loss = torch.nn.BCELoss(reduction='none')
        # self._loss = torch.nn.BCEWithLogitsLoss(reduction='none')
        self._layer_norm = MaskedLayerNorm(768)
        self._multi_orac = multi_orac

        # ROUGE evaluators: one per decoding budget (word count, or unit count
        # crossed with a redundancy threshold).
        self._stop_by_word_count = stop_by_word_count
        self._threshold_red_map = threshold_red_map
        if stop_by_word_count:
            self.slot_num = int((max_pred_word - min_pred_word) / step)
            for i in range(self.slot_num):
                setattr(
                    self, "rouge_{}".format(i),
                    PyrougeEvaluation(name='rouge_{}'.format(i),
                                      cand_path=tmp_dir,
                                      ref_path=tmp_dir,
                                      path_to_valid=tmp_dir))
        else:
            self._min_pred_unit = min_pred_unit
            self._max_pred_unit = max_pred_unit
            for i in range(min_pred_unit, max_pred_unit):
                for ths in threshold_red_map:
                    setattr(
                        self, "rouge_{}_{}".format(i, ths),
                        PyrougeEvaluation(name="rouge_{}_{}".format(i, ths),
                                          cand_path=tmp_dir,
                                          ref_path=tmp_dir,
                                          path_to_valid=tmp_dir))
        self._sigmoid = torch.nn.Sigmoid()
        initializer(self._classification_layer)

        self._use_disco = use_disco

        self._use_disco_graph = use_disco_graph
        if use_disco_graph:
            self.disco_graph_encoder = graph_encoder
        self._use_coref = use_coref
        if use_coref:
            self.coref_graph_encoder = graph_encoder
        if self._use_coref and self._use_disco_graph:
            self._fusion_feedforward = fusion_feedforward
        self._span_extractor = span_extractor

        self._trigram_block = trigram_block
        self._use_pivot_decode = use_pivot_decode
        self._min_pred_word = min_pred_word
        self._max_pred_word = max_pred_word
        self._step = step

        self._semantic_red_map = semantic_red_map
        self._semantic_red_map_loss = semantic_red_map_loss
        self._semantic_red_map_key = semantic_red_map_key

        if self._semantic_red_map:
            self.red_matrix_attn = matrix_attn
            self._semantic_feedforard = semantic_feedforard
        self._pair_oracle = pair_oracle
        if self._pair_oracle:
            self.pair_matrix_attn = matrix_attn
Example #4
    def __init__(self,
                 embedding_dim: int,
                 filters: Sequence[Sequence[int]],
                 num_highway: int,
                 projection_dim: int,
                 activation: str = 'relu',
                 projection_location: str = 'after_highway',
                 do_layer_norm: bool = False) -> None:
        super().__init__()

        if projection_location not in _VALID_PROJECTION_LOCATIONS:
            raise ConfigurationError(
                f"unknown projection location: {projection_location}")

        self.input_dim = embedding_dim
        self.output_dim = projection_dim
        self._projection_location = projection_location

        if activation == 'tanh':
            self._activation = torch.nn.functional.tanh
        elif activation == 'relu':
            self._activation = torch.nn.functional.relu
        else:
            raise ConfigurationError(f"unknown activation {activation}")

        # Create the convolutions
        self._convolutions: List[torch.nn.Module] = []
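        # Each filter spec is a (kernel width, number of output channels) pair.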
        for i, (width, num) in enumerate(filters):
            conv = torch.nn.Conv1d(in_channels=embedding_dim,
                                   out_channels=num,
                                   kernel_size=width,
                                   bias=True)
            conv.weight.data.uniform_(-0.05, 0.05)
            conv.bias.data.fill_(0.0)
            self.add_module(f"char_conv_{i}",
                            conv)  # needs to match the old ELMo name
            self._convolutions.append(conv)

        # Create the highway layers
        num_filters = sum(num for _, num in filters)
        if projection_location == 'after_cnn':
            highway_dim = projection_dim
        else:
            # highway_dim is the number of cnn filters
            highway_dim = num_filters
        self._highways = Highway(highway_dim,
                                 num_highway,
                                 activation=torch.nn.functional.relu)
        for highway_layer in self._highways._layers:  # pylint: disable=protected-access
            # Each highway layer is a single linear layer with the transform
            # and gate weights fused: the first highway_dim outputs feed the
            # nonlinearity, the remaining highway_dim outputs are the gate.
            highway_layer.weight.data.normal_(mean=0.0,
                                              std=np.sqrt(1.0 / highway_dim))
            highway_layer.bias[:highway_dim].data.fill_(0.0)
            # Bias the gate so each layer initially carries its input through.
            highway_layer.bias[highway_dim:].data.fill_(2.0)

        # Projection layer: always num_filters -> projection_dim
        self._projection = torch.nn.Linear(num_filters,
                                           projection_dim,
                                           bias=True)
        self._projection.weight.data.normal_(mean=0.0,
                                             std=np.sqrt(1.0 / num_filters))
        self._projection.bias.data.fill_(0.0)

        # And add a layer norm
        if do_layer_norm:
            self._layer_norm: Callable = MaskedLayerNorm(self.output_dim,
                                                         gamma0=0.1)
        else:
            self._layer_norm = lambda tensor, mask: tensor
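
For reference, a construction call consistent with the parameters above. The class name is not visible in this snippet, so CnnHighwayEncoder is an assumption based on the parameter names.

# Hypothetical usage sketch (class name assumed).
encoder = CnnHighwayEncoder(
    embedding_dim=16,
    filters=[(1, 32), (2, 32), (3, 64)],   # (kernel width, num filters) pairs
    num_highway=2,
    projection_dim=128,
    activation='relu',
    projection_location='after_highway',
    do_layer_norm=True,
)
# sum of filters = 32 + 32 + 64 = 128 channels feed the highway layers,
# which are then projected to projection_dim.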