Example #1
    def __init__(self, bert_dir: str, vocabulary_builder: VocabularyBuilder,
                 dropout: float, is_used_crf: bool):
        """
        初始化
        :param bert_dir: 预训练好的 bert 模型所在 dir
        :param vocabulary_builder: vocabulary builder
        :param dropout: bert 最后一层输出的 dropout
        :param is_used_crf: 是否使用 crf, True: 使用 crf; False: 不使用 crf
        """

        super().__init__()

        self.label_vocabulary = vocabulary_builder.label_vocabulary
        self.dropout = Dropout(dropout)
        self.is_used_crf = is_used_crf
        self.bert = BertModel.from_pretrained(bert_dir)

        bert_config: BertConfig = self.bert.config

        self.classifier = Linear(bert_config.hidden_size,
                                 self.label_vocabulary.label_size)

        if self.is_used_crf:
            constraints = BIO.allowed_transitions(
                label_vocabulary=self.label_vocabulary)
            self.crf = ConditionalRandomField(
                num_tags=self.label_vocabulary.label_size,
                constraints=constraints)
        else:
            self.crf = None

        self.reset_parameters()
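The `is_used_crf` switch above only changes how the per-token logits from `classifier` are consumed. As a minimal sketch (not this repository's actual forward code), decoding typically branches as below; the `decode` helper is hypothetical and assumes an AllenNLP-style `viterbi_tags(logits, mask)` method on the CRF:

import torch

def decode(logits: torch.Tensor, mask: torch.Tensor, crf=None) -> list:
    # Hypothetical helper: logits has shape (batch, seq_len, label_size).
    if crf is not None:
        # Assumption: an AllenNLP-style CRF exposing viterbi_tags(logits, mask),
        # which returns one (best_tag_sequence, score) pair per batch element.
        return [tags for tags, _ in crf.viterbi_tags(logits, mask)]
    # Without a CRF, pick each token's label independently.
    return [row[m].tolist() for row, m in zip(logits.argmax(dim=-1), mask.bool())]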
Example #2
def test_allowed_transitions():
    """
    测试允许转移mask pair
    :return:
    """

    label_vocabulary = LabelVocabulary(labels=[["B-L1", "I-L1", "B-L2", "I-L2", "O"]],
                                       padding=LabelVocabulary.PADDING)

    allowed_pairs = BIO.allowed_transitions(label_vocabulary=label_vocabulary)

    for from_idx, to_idx in allowed_pairs:

        if from_idx == label_vocabulary.label_size:
            from_label = "START"
        else:
            from_label = label_vocabulary.token(from_idx)

        if to_idx == label_vocabulary.label_size + 1:
            to_label = "STOP"
        else:
            to_label = label_vocabulary.token(to_idx)
        print(f"(\"{from_label}\", \"{to_label}\"),")

    expect_transition_labels = [
        ("B-L1", "B-L1"), ("B-L1", "I-L1"), ("B-L1", "B-L2"), ("B-L1", "O"), ("B-L1", "STOP"),
        ("I-L1", "B-L1"), ("I-L1", "I-L1"), ("I-L1", "B-L2"), ("I-L1", "O"), ("I-L1", "STOP"),
        ("B-L2", "B-L1"), ("B-L2", "B-L2"), ("B-L2", "I-L2"), ("B-L2", "O"), ("B-L2", "STOP"),
        ("I-L2", "B-L1"), ("I-L2", "B-L2"), ("I-L2", "I-L2"), ("I-L2", "O"), ("I-L2", "STOP"),
        ("O", "B-L1"), ("O", "B-L2"), ("O", "O"), ("O", "STOP"),
        ("START", "B-L1"), ("START", "B-L2"), ("START", "O")]

    expect = list()

    for from_label, to_label in expect_transition_labels:
        if from_label == "START":
            from_idx = label_vocabulary.label_size
        else:
            from_idx = label_vocabulary.index(from_label)

        if to_label == "STOP":
            to_idx = label_vocabulary.label_size + 1
        else:
            to_idx = label_vocabulary.index(to_label)

        expect.append((from_idx, to_idx))

    ASSERT.assertSetEqual(set(expect), set(allowed_pairs))
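The expected pairs spell out the standard BIO constraint. Here is a self-contained sketch of the predicate they encode, written independently of `BIO.allowed_transitions` (whose internals may differ):

def bio_transition_allowed(from_label: str, to_label: str) -> bool:
    if from_label == "START":
        # A sequence must open with a begin tag or "O", never an inside tag.
        return to_label.startswith("B-") or to_label == "O"
    if to_label == "STOP":
        # Any real label may close the sequence.
        return True
    if to_label.startswith("I-"):
        # "I-X" may only continue a span of the same entity type X.
        return from_label in (f"B-{to_label[2:]}", f"I-{to_label[2:]}")
    # Transitions into "B-*" or "O" are otherwise unrestricted.
    return True

Enumerating this predicate over the five labels plus START/STOP reproduces exactly the 27 pairs in expect_transition_labels.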
Example #3
    def test_allowed_transitions(self):
        bio_labels = ['O', 'B-X', 'I-X', 'B-Y', 'I-Y']  # indices 5 and 6 are reserved for the start and end tags

        label_vocabulary = LabelVocabulary(labels=[bio_labels],
                                           padding=LabelVocabulary.PADDING)
        #              0     1      2      3      4         5          6
        allowed = BIO.allowed_transitions(label_vocabulary=label_vocabulary)

        # Pairs omitted from this set are disallowed transitions.
        assert set(allowed) == {  # Extra column for end tag.
            (0, 0), (0, 1), (0, 3), (0, 6),
            (1, 0), (1, 1), (1, 2), (1, 3), (1, 6),
            (2, 0), (2, 1), (2, 2), (2, 3), (2, 6),
            (3, 0), (3, 1), (3, 3), (3, 4), (3, 6),
            (4, 0), (4, 1), (4, 3), (4, 4), (4, 6),
            (5, 0), (5, 1), (5, 3)  # Extra row for start tag
        }
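For readability, the numeric pairs asserted above can be mapped back to labels. Assuming the vocabulary assigns indices in list order, as the index comment suggests, index 5 (label_size) is the implicit START tag and index 6 (label_size + 1) is STOP:

        idx_to_label = dict(enumerate(bio_labels))           # 0..4: the BIO labels
        idx_to_label[5], idx_to_label[6] = "START", "STOP"   # label_size and label_size + 1
        readable = {(idx_to_label[f], idx_to_label[t]) for f, t in allowed}
        # e.g. (5, 1) -> ("START", "B-X"), (2, 6) -> ("I-X", "STOP")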
Example #4
    def __init__(self, token_vocabulary: Vocabulary, token_embedding_dim: int,
                 token_embedding_dropout: float,
                 gaz_vocabulary: PretrainedVocabulary,
                 gaz_word_embedding_dropout: float,
                 gaz_word_embedding_dim: int, num_lstm_layer: int,
                 lstm_hidden_size: int, gat_hidden_size: int,
                 gat_num_heads: int, gat_dropout: float, lstm_dropout: float,
                 alpha: float, fusion_strategy: str,
                 label_vocabulary: LabelVocabulary):

        super().__init__()

        assert gaz_word_embedding_dim == lstm_hidden_size * 2, \
            f"gaz_word_embedding_dim: {gaz_word_embedding_dim} must equal " \
            f"lstm_hidden_size * 2: {lstm_hidden_size * 2}, because both serve " \
            f"as graph node features, so their sizes must match"

        self.token_vocabulary = token_vocabulary
        self.label_vocabulary = label_vocabulary

        self.token_embedding_dropout = Dropout(token_embedding_dropout)

        if isinstance(self.token_vocabulary, Vocabulary):
            self.token_embedding: Embedding = Embedding(
                num_embeddings=self.token_vocabulary.size,
                embedding_dim=token_embedding_dim,
                padding_idx=self.token_vocabulary.padding_index)

        elif isinstance(self.token_vocabulary, PretrainedVocabulary):
            self.token_embedding: Embedding = Embedding.from_pretrained(
                self.token_vocabulary.embedding_matrix,
                freeze=True,
                padding_idx=self.token_vocabulary.padding_index)

        self.gaz_word_embedding = Embedding.from_pretrained(
            gaz_vocabulary.embedding_matrix,
            freeze=True,
            padding_idx=gaz_vocabulary.padding_index)
        self.gaz_word_embedding_dropout = Dropout(gaz_word_embedding_dropout)

        # bilstm
        bilstm = DynamicRnn(rnn=LSTM(input_size=token_embedding_dim,
                                     hidden_size=lstm_hidden_size,
                                     num_layers=num_lstm_layer,
                                     batch_first=True,
                                     bidirectional=True))
        self.bilstm_seq2seq = RnnSeq2Seq(bilstm)
        self.lstm_dropout = Dropout(lstm_dropout)
        self.lstm_encoding_feed_forward = Linear(
            in_features=lstm_hidden_size * 2,
            out_features=self.label_vocabulary.label_size)
        # C-Graph
        self.c_gat = GAT(in_features=2 * lstm_hidden_size,
                         out_features=label_vocabulary.label_size,
                         dropout=gat_dropout,
                         alpha=alpha,
                         num_heads=gat_num_heads,
                         hidden_size=gat_hidden_size)

        # T-Graph
        self.t_gat = GAT(in_features=2 * lstm_hidden_size,
                         out_features=label_vocabulary.label_size,
                         dropout=gat_dropout,
                         alpha=alpha,
                         num_heads=gat_num_heads,
                         hidden_size=gat_hidden_size)

        # L-Graph
        self.l_gat = GAT(in_features=2 * lstm_hidden_size,
                         out_features=label_vocabulary.label_size,
                         dropout=gat_dropout,
                         alpha=alpha,
                         num_heads=gat_num_heads,
                         hidden_size=gat_hidden_size)

        if fusion_strategy == "m":
            self.fusion_layer = MFunsionLayer(
                label_size=label_vocabulary.label_size)
        elif fusion_strategy == "v":
            self.fusion_layer = VFusionLayer(
                label_size=label_vocabulary.label_size)
        elif fusion_strategy == "n":
            self.fusion_layer = NFusionLayer()
        else:
            raise RuntimeError(
                f"fusion_strategy must be one of: m, v, n, but got {fusion_strategy}")
        # crf
        constraints = BIO.allowed_transitions(
            label_vocabulary=self.label_vocabulary)
        self.crf = ConditionalRandomField(
            num_tags=self.label_vocabulary.label_size, constraints=constraints)
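The dimension assert near the top exists because token encodings and gaz word embeddings both become rows of one shared node matrix consumed by the three GATs. A minimal shape sketch with hypothetical sizes:

import torch

lstm_hidden_size, num_tokens, num_gaz_words = 128, 20, 7
token_nodes = torch.randn(num_tokens, 2 * lstm_hidden_size)    # BiLSTM outputs per token
word_nodes = torch.randn(num_gaz_words, 2 * lstm_hidden_size)  # matched lexicon word embeddings
# Both node kinds share the feature dimension 2 * lstm_hidden_size,
# so they can be stacked into a single (num_tokens + num_gaz_words, D) graph input.
nodes = torch.cat([token_nodes, word_nodes], dim=0)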
Example #5
    def __init__(self,
                 token_vocabulary: Vocabulary,
                 token_embedding_dim: int,
                 token_embedding_dropout: float,
                 gaz_vocabulary: PretrainedVocabulary,
                 gaz_word_embedding_dim: int,
                 gaz_word_embedding_dropout: float,
                 label_vocabulary: LabelVocabulary,
                 hidden_size: int,
                 lstm_dropout: float):
        """

        :param token_vocabulary: token vocabulary
        :param token_embedding_dim: token embedding 维度
        :param token_embedding_dropout: token embedding dropout
        :param gaz_vocabulary: gaz vocabualry
        :param gaz_word_embedding_dim: gaz word embedding 维度
        :param gaz_word_embedding_dropout: gaz word embedding droupout
        :param label_vocabulary: labe vocabulary
        :param hidden_size: lattice lstm 隐层输出, 2*hidden_size, 因为使用了双向的
        :param lstm_dropout: lstm dropout
        """

        super().__init__()

        self.token_vocabulary = token_vocabulary
        self.label_vocabulary = label_vocabulary

        self.token_embedding_dropout = Dropout(token_embedding_dropout)
        self.lstm_dropout = Dropout(lstm_dropout)

        if isinstance(self.token_vocabulary, Vocabulary):
            self.token_embedding: Embedding = Embedding(num_embeddings=self.token_vocabulary.size,
                                                        embedding_dim=token_embedding_dim,
                                                        padding_idx=self.token_vocabulary.padding_index)

        elif isinstance(self.token_vocabulary, PretrainedVocabulary):
            self.token_embedding: Embedding = Embedding.from_pretrained(self.token_vocabulary.embedding_matrix,
                                                                        freeze=True,
                                                                        padding_idx=self.token_vocabulary.padding_index)

        self.gaz_word_embedding = Embedding.from_pretrained(gaz_vocabulary.embedding_matrix,
                                                            freeze=True,
                                                            padding_idx=gaz_vocabulary.padding_index)
        # A bidirectional Lattice LSTM is used by default
        # forward (left-to-right) lattice lstm
        self.forward_lattice_lstm = LatticeLSTM(input_dim=token_embedding_dim,
                                                hidden_dim=hidden_size,
                                                gaz_word_embedding_dim=gaz_word_embedding_dim,
                                                gaz_word_embedding=self.gaz_word_embedding,
                                                gaz_word_embedding_dropout=gaz_word_embedding_dropout,
                                                left2right=True)

        # backward (right-to-left) lattice lstm
        self.backward_lattice_lstm = LatticeLSTM(input_dim=token_embedding_dim,
                                                 hidden_dim=hidden_size,
                                                 gaz_word_embedding_dim=gaz_word_embedding_dim,
                                                 gaz_word_embedding=self.gaz_word_embedding,
                                                 gaz_word_embedding_dropout=gaz_word_embedding_dropout,
                                                 left2right=False)
        # project the bidirectional lattice lstm outputs into the label space
        self.linear = Linear(in_features=(hidden_size * 2),
                             out_features=label_vocabulary.label_size)

        # crf
        constraints = BIO.allowed_transitions(label_vocabulary=self.label_vocabulary)
        self.crf = ConditionalRandomField(num_tags=self.label_vocabulary.label_size,
                                          constraints=constraints)

        self.reset_parameters()
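Below is a minimal sketch of how the two directional passes are typically combined before the linear projection and CRF. The tensor names and shapes are assumptions, with each LatticeLSTM taken to emit (batch, seq_len, hidden_size):

import torch
from torch.nn import Linear

batch, seq_len, hidden_size, label_size = 2, 10, 100, 7
forward_out = torch.randn(batch, seq_len, hidden_size)    # left-to-right pass
backward_out = torch.randn(batch, seq_len, hidden_size)   # right-to-left pass
encoded = torch.cat([forward_out, backward_out], dim=-1)  # (batch, seq_len, 2 * hidden_size)
logits = Linear(2 * hidden_size, label_size)(encoded)     # emission scores fed to the CRF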
Example #6
    def __init__(self, vocabulary_builder: VocabularyBuilder,
                 word_embedding_dim: int, rnn_type: str, hidden_size: int,
                 num_layer: int, dropout: float, is_used_crf: bool):

        super().__init__()

        self.word_embedding_dim = word_embedding_dim
        self.token_vocabulary = vocabulary_builder.token_vocabulary
        self.label_vocabulary = vocabulary_builder.label_vocabulary
        self.is_used_crf = is_used_crf

        if isinstance(self.token_vocabulary, Vocabulary):
            self.embedding: Embedding = Embedding(
                num_embeddings=self.token_vocabulary.size,
                embedding_dim=word_embedding_dim,
                padding_idx=self.token_vocabulary.padding_index)

        elif isinstance(self.token_vocabulary, PretrainedVocabulary):
            self.embedding: Embedding = Embedding.from_pretrained(
                self.token_vocabulary.embedding_matrix,
                freeze=True,
                padding_idx=self.token_vocabulary.padding_index)

        self.hidden_size = hidden_size

        if rnn_type == DynamicRnn.LSTM:

            lstm = LSTM(input_size=word_embedding_dim,
                        hidden_size=hidden_size,
                        num_layers=num_layer,
                        bidirectional=True,
                        dropout=dropout,
                        batch_first=True)
            dynamic_rnn = DynamicRnn(rnn=lstm)
        elif rnn_type == DynamicRnn.GRU:
            gru = GRU(input_size=word_embedding_dim,
                      hidden_size=hidden_size,
                      num_layers=num_layer,
                      bidirectional=True,
                      dropout=dropout,
                      batch_first=True)
            dynamic_rnn = DynamicRnn(rnn=gru)
        else:
            raise RuntimeError(
                f"rnn_type: {rnn_type} must be {DynamicRnn.LSTM} or {DynamicRnn.GRU}")

        self.rnn_seq2seq = RnnSeq2Seq(dynamic_rnn=dynamic_rnn)

        self.liner = Linear(in_features=hidden_size * 2,
                            out_features=self.label_vocabulary.label_size)

        if self.is_used_crf:
            constraints = BIO.allowed_transitions(
                label_vocabulary=self.label_vocabulary)
            self.crf = ConditionalRandomField(
                num_tags=self.label_vocabulary.label_size,
                constraints=constraints)
        else:
            self.crf = None

        self.reset_parameters()
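As with Example #1, `is_used_crf` also dictates the training loss. A hypothetical sketch of the contrast, assuming an AllenNLP-style ConditionalRandomField whose forward pass returns the sequence log-likelihood:

import torch
import torch.nn.functional as F

def sequence_loss(logits, tags, mask, crf=None):
    # logits: (batch, seq_len, label_size); tags, mask: (batch, seq_len)
    if crf is not None:
        # Assumption: calling the CRF returns the total log-likelihood
        # of the gold tag sequences, so the loss is its negation.
        return -crf(logits, tags, mask)
    # Without a CRF: token-level cross entropy over unpadded positions.
    mask = mask.bool()
    return F.cross_entropy(logits[mask], tags[mask])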