def build_model_layers(self):
     ''' builds the layers in the model '''
     self.bert = BertForTokenClassification.from_pretrained(
         "bert-base-cased",
         num_labels=self.num_labels,
         output_attentions=False,
         output_hidden_states=False)
     if self.use_crf:
         self.crf = CRF(self.tag_pad_idx, self.pad_token, self.tag_names)
Example #2
    def build_model(self):
        '''
        build the embedding layer, lstm layer and CRF layer
        '''
        self.hidden2tag = nn.Linear(self.embedding_dim, self.n_tags)

        crf_config = {
            'n_tags': self.config['n_ent_tags'],
            'start_idx': self.config['start_ent_idx'],
            'end_idx': self.config['end_ent_idx'],
            'use_cuda': self.use_cuda
        }
        self.crf = CRF(crf_config)
        self.bert = transformers.BertModel.from_pretrained('bert-base-chinese')
Example #3
 def build_model_layers(self):
     ''' builds the layers in the model '''
     # embedding layer
     self.embedding = nn.Embedding(num_embeddings=self.input_dim, embedding_dim=self.embedding_dim,
                                   padding_idx=self.text_pad_idx)
     # dropout for embedding layer
     self.embedding_dropout = nn.Dropout(self.embedding_dropout_ratio)
     # character cnn
     if self.char_embedding_dim:
         self.char_embedding = nn.Embedding(num_embeddings=self.char_input_dim,
                                            embedding_dim=self.char_embedding_dim,
                                            padding_idx=self.char_pad_idx)
         self.char_cnn = nn.Conv1d(in_channels=self.char_embedding_dim,
                                   out_channels=self.char_embedding_dim*self.char_filter,
                                   kernel_size=self.char_kernel,
                                   groups=self.char_embedding_dim)
         self.cnn_dropout = nn.Dropout(self.cnn_dropout_ratio)
         all_embedding_dim = self.embedding_dim+(self.char_embedding_dim*self.char_filter)
     else:
         all_embedding_dim = self.embedding_dim
     # lstm layers with dropout
     self.lstm = nn.LSTM(batch_first=True, input_size=all_embedding_dim,
                         hidden_size=self.hidden_dim, num_layers=self.lstm_layers,
                         bidirectional=True, dropout=self.lstm_dropout_ratio if self.lstm_layers > 1 else 0)
     # use multihead attention if there are attention heads
     if self.attn_heads:
         self.attn = nn.MultiheadAttention(embed_dim=self.hidden_dim*2, num_heads=self.attn_heads, dropout=self.attn_dropout_ratio)
     # dropout for fully connected layer
     self.fc_dropout = nn.Dropout(self.fc_dropout_ratio)
     # fully connected layer
     self.fc = nn.Linear(self.hidden_dim*2, self.output_dim)
     # use crf layer if it is switched on
     if self.use_crf:
         self.crf = CRF(self.tag_pad_idx, self.pad_token, self.tag_names)            
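The grouped nn.Conv1d above gives each character-embedding channel its own bank of char_filter filters. A minimal shape check, using the shared CNN parameter values listed at the end of this page (batch size and word length are made up):

import torch
import torch.nn as nn

# assumed toy dimensions; char_embedding_dim/char_filter/char_kernel follow the shared cnn parameters below
char_embedding_dim, char_filter, char_kernel = 37, 4, 3
char_cnn = nn.Conv1d(in_channels=char_embedding_dim,
                     out_channels=char_embedding_dim * char_filter,
                     kernel_size=char_kernel,
                     groups=char_embedding_dim)
chars = torch.randn(8, char_embedding_dim, 20)  # (batch, char_embedding_dim, word_length)
out = char_cnn(chars)
print(out.shape)  # torch.Size([8, 148, 18]) -> 37*4 output channels, length 20 - 3 + 1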
Example #4
    def build_model(self):
        '''
        build the embedding layer, lstm layer and CRF layer
        '''
        self.word_embeds = nn.Embedding(self.n_words, self.embedding_dim)
        self.lstm = nn.LSTM(self.embedding_dim,
                            self.hidden_dim // 2,
                            batch_first=True,
                            num_layers=self.lstm_layer_num,
                            dropout=self.dropout_prob,
                            bidirectional=True)
        self.hidden2tag = nn.Linear(self.hidden_dim, self.n_tags)

        crf_config = {
            'n_tags': self.config['n_ent_tags'],
            'start_idx': self.config['start_ent_idx'],
            'end_idx': self.config['end_ent_idx'],
            'use_cuda': self.use_cuda
        }
        self.crf = CRF(crf_config)
Example #5
 def build_model_layers(self):
     ''' builds the layers in the model '''
     # embedding layer
     self.embedding = nn.Embedding(num_embeddings=self.input_dim,
                                   embedding_dim=self.embedding_dim,
                                   padding_idx=self.text_pad_idx)
     # dropout for embedding layer
     self.embedding_dropout = nn.Dropout(self.embedding_dropout_ratio)
     # character cnn
     if self.char_embedding_dim:
         self.char_embedding = nn.Embedding(
             num_embeddings=self.char_input_dim,
             embedding_dim=self.char_embedding_dim,
             padding_idx=self.char_pad_idx)
         self.char_cnn = nn.Conv1d(in_channels=self.char_embedding_dim,
                                   out_channels=self.char_embedding_dim *
                                   self.char_filter,
                                   kernel_size=self.char_kernel,
                                   groups=self.char_embedding_dim)
         self.cnn_dropout = nn.Dropout(self.cnn_dropout_ratio)
          # combined embedding dimension (word + char features)
         all_embedding_dim = self.embedding_dim + (self.char_embedding_dim *
                                                   self.char_filter)
     else:
         all_embedding_dim = self.embedding_dim
     # transformer encoder layers with attention and dropout
     self.position_encoder = PositionalEncoding(d_model=all_embedding_dim)
     encoder_layers = nn.TransformerEncoderLayer(
         d_model=all_embedding_dim,
         nhead=self.attn_heads,
         activation='relu',
         dropout=self.trf_dropout_ratio)
     self.encoder = nn.TransformerEncoder(encoder_layer=encoder_layers,
                                          num_layers=self.trf_layers)
     # fully connected layer with gelu activation
     self.fc1 = nn.Linear(in_features=all_embedding_dim,
                          out_features=self.hidden_dim)
     self.fc1_gelu = nn.GELU()
     # layer norm
     self.fc1_norm = nn.LayerNorm(self.hidden_dim)
     # dropout for fully connected layer
     self.fc2_dropout = nn.Dropout(self.fc_dropout_ratio)
     # fully connected layer
     self.fc2 = nn.Linear(self.hidden_dim, self.output_dim)
     # use crf layer if it is switched on
     if self.use_crf:
         self.crf = CRF(self.tag_pad_idx, self.pad_token, self.tag_names)
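Note that nn.TransformerEncoderLayer defaults to sequence-first input, so the embeddings would need shape (T, batch, d_model) before entering the encoder built above. A minimal sketch with assumed dimensions:

import torch
import torch.nn as nn

d_model, attn_heads, trf_layers = 64, 8, 2  # assumed values; d_model must be divisible by nhead
encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=attn_heads,
                                           activation='relu', dropout=0.1)
encoder = nn.TransformerEncoder(encoder_layer=encoder_layer, num_layers=trf_layers)
x = torch.randn(50, 4, d_model)  # (T, batch, d_model): sequence-first by default
print(encoder(x).shape)          # torch.Size([50, 4, 64])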
Example #6
class REL_BLSTM_CRF(MODEL_TEMP):
    def __init__(self, config={}, show_param=False):
        '''
        :param - dict
            param['embedding_dim']
            param['hidden_dim']
            ***param['n_ent_tags']
            param['n_rel_tags']
            param['n_rels']
            param['n_words']
            param['start_rel_idx']  int, <start> tag index for the relation tag seq
            param['end_rel_idx']   int, <end> tag index for the relation tag seq
            param['use_cuda']
            param['dropout_prob']
            param['lstm_layer_num']
        '''
        super(REL_BLSTM_CRF, self).__init__()
        self.config = config
        self.embedding_dim = self.config.get('embedding_dim', 128)
        self.hidden_dim = self.config.get('hidden_dim', 64)
        assert self.hidden_dim % 2 == 0, 'hidden_dim for BLSTM must be even'

        self.n_tags = self.config.get('n_rel_tags', 8)
        self.n_rels = self.config.get('n_rels', 9)
        self.n_words = self.config.get('n_words', 10000)

        self.dropout_prob = self.config.get('dropout_prob', 0)
        self.lstm_layer_num = self.config.get('lstm_layer_num', 1)

        self.use_cuda = self.config.get('use_cuda', False)
        self.model_type = 'REL_BLSTM_CRF'

        self.build_model()
        self.reset_parameters()
        if show_param:
            self.show_model_param()

    def show_model_param(self):
        log('=' * 80, 0)
        log(f'model_type: {self.model_type}', 1)
        log(f'embedding_dim: {self.embedding_dim}', 1)
        log(f'hidden_dim: {self.hidden_dim}', 1)
        log(f'use_cuda: {self.use_cuda}', 1)
        log(f'lstm_layer_num: {self.lstm_layer_num}', 1)
        log(f'dropout_prob: {self.dropout_prob}', 1)
        log('=' * 80, 0)

    def build_model(self):
        '''
        build the embedding layer, lstm layer and CRF layer
        '''
        self.word_embeds = nn.Embedding(self.n_words, self.embedding_dim)
        self.rel_embeds = nn.Embedding(self.n_rels, self.embedding_dim)
        self.embed2hidden = nn.Linear(self.embedding_dim * 2,
                                      self.embedding_dim)
        self.lstm = nn.LSTM(input_size=self.embedding_dim,
                            hidden_size=self.hidden_dim // 2,
                            batch_first=True,
                            num_layers=self.lstm_layer_num,
                            dropout=self.dropout_prob,
                            bidirectional=True)
        self.hidden2tag = nn.Linear(self.hidden_dim, self.n_tags)

        crf_config = {
            'n_tags': self.n_tags,
            'start_idx': self.config['start_rel_idx'],
            'end_idx': self.config['end_rel_idx'],
            'use_cuda': self.use_cuda
        }
        self.crf = CRF(crf_config)
        self.relu_layer = nn.ReLU()

    def reset_parameters(self):
        I.xavier_normal_(self.word_embeds.weight.data)
        I.xavier_normal_(self.rel_embeds.weight.data)
        self.lstm.reset_parameters()
        # stdv = 1.0 / math.sqrt(self.hidden_dim)
        # for weight in self.lstm.parameters():
        #     I.uniform_(weight, -stdv, stdv)
        I.xavier_normal_(self.embed2hidden.weight.data)
        I.xavier_normal_(self.hidden2tag.weight.data)
        self.crf.reset_parameters()

    def _get_lstm_features(self, x, relation_type, use_cuda=None):
        '''
        :param
            @x: word indices (each character mapped to its vocabulary index), (batch_size, T), np.array
            @relation_type: relation-type indices, (batch_size, 1), np.array
        :return
            @lstm_feature: (batch_size, T, n_tags) -- similar to emission scores, torch.tensor
        '''
        use_cuda = self.use_cuda if use_cuda is None else use_cuda
        batch_size, T = x.shape[0], x.shape[1]

        ##embedding layer
        words_tensor = self._to_tensor(x, use_cuda)  #(batch_size, T)
        # print('words_tensor_shape', words_tensor.shape)
        word_input_embeds = self.word_embeds(
            words_tensor)  #(batch_size, T, n_embed)
        # print('word_input_embeds_shape', word_input_embeds.shape)
        reltype_tensor = self._to_tensor(relation_type,
                                         use_cuda)  #(batch_size, 1)
        # print('reltype_tensor', reltype_tensor.shape)
        reltype_input_embeds = self.rel_embeds(
            reltype_tensor)  #(batch_size, 1, n_embed)
        # print('reltype_input_embeds', reltype_input_embeds.shape)
        reltype_input_embeds = reltype_input_embeds.repeat(
            1, T, 1)  #(batch_size, T, n_embed)
        # print('reltype_input_embeds2', reltype_input_embeds.shape)

        input_embeds_all = torch.cat([word_input_embeds, reltype_input_embeds],
                                     -1)  #(batch_size, T, n_embed*2)
        # print('input_embeds_all.shape', input_embeds_all.shape)
        embeds = self.embed2hidden(
            input_embeds_all)  #(batch_size, T, n_embeds)
        # print('embeds.shape', embeds.shape)

        # ##LSTM layer
        if use_cuda:
            h_0 = torch.randn(2 * self.lstm_layer_num, batch_size,
                              self.hidden_dim //
                              2).cuda()  #(n_layer*n_dir, N, n_hid)
            c_0 = torch.randn(2 * self.lstm_layer_num, batch_size,
                              self.hidden_dim // 2).cuda()
        else:
            h_0 = torch.randn(2 * self.lstm_layer_num, batch_size,
                              self.hidden_dim // 2)
            c_0 = torch.randn(2 * self.lstm_layer_num, batch_size,
                              self.hidden_dim // 2)
        # c_0 = h_0.clone()
        hidden = (h_0, c_0)
        lstm_out, _hidden = self.lstm(
            embeds, hidden)  #(batch_size, T, n_dir*n_hid), (h, c)

        ##FC layer
        lstm_feature = self.hidden2tag(lstm_out)  #(batch_size, T, n_tags)
        lstm_feature = torch.tanh(lstm_feature)
        # print(lstm_feature.shape)

        return lstm_feature

    def _loss(self, x, relation_type, y_rel, lens, use_cuda=None):
        '''
        loss function: neg_log_likelihood
        :param
            @x: (batch_size, T), np.array, word indices (each character mapped to its vocabulary index)
            @relation_type: (batch_size, 1), np.array, relation type
            @y_rel: (batch_size, T), np.array, relation tag index sequence at character level
            @lens: (batch_size), list, the actual length of each sentence
        :return
            @loss: (batch_size), torch.tensor
        '''
        use_cuda = self.use_cuda if use_cuda is None else use_cuda

        logits = self._get_lstm_features(x, relation_type, use_cuda)
        log_norm_score = self.crf.log_norm_score(logits, lens)
        path_score = self.crf.path_score(logits, y_rel, lens)

        loss = log_norm_score - path_score
        loss = (loss / self._to_tensor(lens, use_cuda).float()).mean()
        return loss

    def _output(self, x, relation_type, lens, use_cuda=None):
        '''
        return the crf decode paths
        :param
            @x: word indices (each character mapped to its vocabulary index), (batch_size, T), np.array
            @relation_type: (batch_size, 1), np.array, relation type
            @lens: (batch_size), list, the actual length of each sentence
        :return
            @paths: (batch_size, T), torch.tensor, best tag path for each sentence
            @scores: (batch_size), torch.tensor, score of the best path for each sentence
        '''
        use_cuda = self.use_cuda if use_cuda is None else use_cuda
        logits = self._get_lstm_features(x, relation_type, use_cuda)
        scores, paths = self.crf.viterbi_decode(logits, lens, use_cuda)
        return paths

    def train_model(self,
                    data_loader: KGDataLoader,
                    train_dataset=None,
                    eval_dataset=None,
                    hyper_param={},
                    use_cuda=None,
                    rebuild=False):
        '''
        :param
            @data_loader: (KGDataLoader),
            @result_dir: (str) path to save the trained model and extracted dictionary
            @hyper_param: (dict)
                @hyper_param['EPOCH']
                @hyper_param['batch_size']
                @hyper_param['learning_rate_upper']
                @hyper_param['learning_rate_bert']
                @hyper_param['bert_finetune']
                @hyper_param['visualize_length']   #num of batches between two check points
                @hyper_param['isshuffle']
                @hyper_param['result_dir']
                @hyper_param['model_name']
        :return
            @loss_record, 
            @score_record
        '''
        use_cuda = self.use_cuda if use_cuda is None else use_cuda
        if use_cuda:
            print('use cuda=========================')
            self.cuda()

        EPOCH = hyper_param.get('EPOCH', 3)
        BATCH_SIZE = hyper_param.get('batch_size', 4)
        LEARNING_RATE_upper = hyper_param.get('learning_rate_upper', 1e-2)
        LEARNING_RATE_bert = hyper_param.get('learning_rate_bert', 5e-5)
        bert_finetune = hyper_param.get('bert_finetune', True)
        visualize_length = hyper_param.get('visualize_length', 10)
        result_dir = hyper_param.get('result_dir', './result/')
        model_name = hyper_param.get('model_name', 'model.p')
        is_shuffle = hyper_param.get('isshuffle', True)
        DATA_TYPE = 'rel'

        train_dataset = data_loader.dataset.train_dataset if train_dataset is None else train_dataset
        if rebuild:
            train_data_mat_dict = data_loader.transform(train_dataset,
                                                        data_type=DATA_TYPE)
        ## cache the preprocessed text so it can be reloaded directly during hyperparameter tuning to save time   *WARNING*
        else:
            old_train_dict_path = os.path.join(result_dir,
                                               'train_data_mat_dict.pkl')
            if os.path.exists(old_train_dict_path):
                train_data_mat_dict = data_loader.load_preprocessed_data(
                    old_train_dict_path)
                log('Reload preprocessed data successfully~')
            else:
                # train_data_mat_dict = data_loader.transform(train_dataset, data_type=DATA_TYPE)
                train_data_mat_dict = data_loader.transform(
                    train_dataset, istest=False, data_type=DATA_TYPE, ratio=0)
                data_loader.save_preprocessed_data(old_train_dict_path,
                                                   train_data_mat_dict)
        ## cache the preprocessed text so it can be reloaded directly during hyperparameter tuning to save time   *WARNING*
        data_generator = Batch_Generator(train_data_mat_dict,
                                         batch_size=BATCH_SIZE,
                                         data_type=DATA_TYPE,
                                         isshuffle=is_shuffle)

        print('train_data_set_length:', len(train_dataset))
        print('train_data_mat_dict_length:',
              train_data_mat_dict['cha_matrix'].shape)

        all_param = list(self.named_parameters())
        bert_param = [p for n, p in all_param if 'bert' in n]
        other_param = [p for n, p in all_param if 'bert' not in n]

        if bert_finetune:
            optimizer_grouped_parameters = [{
                'params': other_param,
                'lr': LEARNING_RATE_upper
            }, {
                'params': bert_param,
                'lr': LEARNING_RATE_bert
            }]
            optimizer = torch.optim.Adam(optimizer_grouped_parameters)
            log(
                f'****BERT_finetune, learning_rate_upper: {LEARNING_RATE_upper}, learning_rate_bert: {LEARNING_RATE_bert}',
                0)
        else:
            optimizer = torch.optim.Adam(other_param, lr=LEARNING_RATE_upper)
            log(f'****BERT_fix, learning_rate_upper: {LEARNING_RATE_upper}', 0)

        # ##TODO:
        scheduler = LambdaLR(optimizer, lr_lambda=my_lr_lambda)
        # # scheduler = transformers.optimization.get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=int(EPOCH*0.2), num_training_steps=EPOCH)

        all_cnt = len(train_data_mat_dict['cha_matrix'])
        log(f'{model_name} Training start!', 0)
        loss_record = []
        score_record = []
        max_score = -1

        eval_param = {
            'batch_size': 100,
            'issave': False,
            'result_dir': result_dir
        }
        for epoch in range(EPOCH):
            self.train()

            log(f'EPOCH: {epoch+1}/{EPOCH}', 0)
            loss = 0.0
            for cnt, data_batch in enumerate(data_generator):
                x, pos, reltype, y_rel, y_ent, lens, data_list = data_batch

                loss_avg = self._loss(x, reltype, y_rel, lens)
                optimizer.zero_grad()
                loss_avg.backward()
                optimizer.step()

                loss += loss_avg
                if use_cuda:
                    loss_record.append(loss_avg.cpu().item())
                else:
                    loss_record.append(loss_avg.item())

                if (cnt + 1) % visualize_length == 0:
                    loss_cur = loss / visualize_length
                    log(
                        f'[TRAIN] step: {(cnt+1)*BATCH_SIZE}/{all_cnt} | loss: {loss_cur:.4f}',
                        1)
                    loss = 0.0

                    # self.eval()
                    # print(data_list[0]['input'])
                    # pre_paths = self._output(x, reltype, lens)
                    # print('predict-path')
                    # print(pre_paths[0])
                    # print('target-path')
                    # print(y_rel[0])
                    # self.train()

            temp_score = self.eval_model(data_loader,
                                         data_set=eval_dataset,
                                         hyper_param=eval_param,
                                         use_cuda=use_cuda)
            score_record.append(temp_score)
            scheduler.step()

            if temp_score[2] > max_score:
                max_score = temp_score[2]
                save_path = os.path.join(result_dir, model_name)
                self.save_model(save_path)
                print(
                    f'Checkpoint saved successfully, current best score is {max_score}'
                )
        log(f'the best score of the model is {max_score}')
        return loss_record, score_record

    @torch.no_grad()
    def predict(self,
                data_loader,
                data_set=None,
                hyper_param={},
                use_cuda=None,
                rebuild=False):
        '''
        predict the contents of test_data_mat_dict['y_rel_matrix'] and write them back into that matrix (all zeros before prediction)
        :param
            @data_loader: (KGDataLoader),
            @hyper_param: (dict)
                @hyper_param['batch_size']  ## default 100
                @hyper_param['issave']  ## default False
                @hyper_param['result_dir']  ## default './result/'
        :return
            @result: list, length = number of sentences
                case = result[0]
                case['input']
                case['relation_list']
                    r = case['relation_list'][0]
                    r['relation']: e.g. '成立日期' (founding date)
                    r['head']: e.g. '百度' (Baidu)
                    r['tail']: e.g. '2016年04月08日' (2016-04-08)
        '''
        use_cuda = self.use_cuda if use_cuda is None else use_cuda
        if use_cuda:
            print('use cuda=========================')
            self.cuda()

        BATCH_SIZE = hyper_param.get('batch_size', 100)
        ISSAVE = hyper_param.get('issave', False)
        result_dir = hyper_param.get('result_dir', './result/')
        DATA_TYPE = 'rel'

        test_dataset = data_loader.dataset.test_dataset if data_set is None else data_set
        if rebuild:
            test_data_mat_dict = data_loader.transform(test_dataset,
                                                       istest=True,
                                                       data_type=DATA_TYPE)
        ## cache the preprocessed text so it can be reloaded directly during hyperparameter tuning to save time   *WARNING*
        else:
            old_test_dict_path = os.path.join(result_dir,
                                              'test_data_mat_dict.pkl')
            if os.path.exists(old_test_dict_path):
                test_data_mat_dict = data_loader.load_preprocessed_data(
                    old_test_dict_path)
                log('Reload preprocessed data successfully~')
            else:
                test_data_mat_dict = data_loader.transform(test_dataset,
                                                           istest=True,
                                                           data_type=DATA_TYPE,
                                                           ratio=0)
                data_loader.save_preprocessed_data(old_test_dict_path,
                                                   test_data_mat_dict)
        ## cache the preprocessed text so it can be reloaded directly during hyperparameter tuning to save time   *WARNING*

        print('test_dataset_length:', len(test_dataset))
        print('test_data_mat_dict_length:',
              test_data_mat_dict['cha_matrix'].shape)
        data_generator = Batch_Generator(test_data_mat_dict,
                                         batch_size=BATCH_SIZE,
                                         data_type=DATA_TYPE,
                                         isshuffle=False)

        self.eval()  #disable dropout layer and the bn layer

        total_output_rel = []
        all_cnt = len(test_data_mat_dict['cha_matrix'])
        log(f'Predict start!', 0)
        for cnt, data_batch in enumerate(data_generator):
            x, pos, reltype, y_rel, y_ent, lens, data_list = data_batch
            pre_paths = self._output(
                x, reltype, lens)  ##pre_paths, (batch_size, T), torch.tensor
            if use_cuda:
                pre_paths = pre_paths.data.cpu().numpy().astype(int)
            else:
                pre_paths = pre_paths.data.numpy().astype(int)
            total_output_rel.append(pre_paths)

            if (cnt + 1) % 10 == 0:
                log(f'[PREDICT] step {(cnt+1)*BATCH_SIZE}/{all_cnt}', 1)

        ## mask out tag predictions at positions beyond each sentence's length
        pred_output = np.vstack(
            total_output_rel)  ###(N, max_length), numpy.array
        len_list = test_data_mat_dict['sentence_length']  ###(N), list
        pred_output = self._padding_mask(pred_output,
                                         len_list[:len(pred_output)])

        ## transform back to the dict form
        test_data_mat_dict['y_rel_matrix'] = pred_output
        result = data_loader.transform_back(test_data_mat_dict,
                                            data_type=DATA_TYPE)

        ## save the result
        if ISSAVE and result_dir:
            save_file = os.path.join(result_dir, 'predict.json')
            with open(save_file, 'w') as f:
                for data in result:
                    temps = json.dumps(data, ensure_ascii=False)
                    f.write(temps + '\n')
            log(f'save the predict result in {save_file}')
        print('final predict length:', len(result))
        return result

    @torch.no_grad()
    def eval_model(self,
                   data_loader,
                   data_set=None,
                   hyper_param={},
                   use_cuda=None,
                   rebuild=False):
        '''
        :param
            @data_loader: (KGDataLoader),
            @hyper_param: (dict)
                @hyper_param['batch_size']  ## default 100 (handled in predict)
                @hyper_param['issave']  ## default False
                @hyper_param['result_dir']  ## default './result/'  WARNING: may fail if the result directory does not exist
        :return
            @precision_s, 
            @recall_s, 
            @f1_s
        '''
        use_cuda = self.use_cuda if use_cuda is None else use_cuda
        if use_cuda:
            print('use cuda=========================')
            self.cuda()

        def dict2str(d):
            ## convert an entity dict into a string for easy comparison
            # res = d['entity']+':'+d['entity_type']+':'+str(d['entity_index']['begin'])+'-'+str(d['entity_index']['end'])
            ## convert a relation dict into a string for easy comparison
            res = d['relation'] + '-' + d['head'] + '-' + d['tail']
            return res

        def calculate_f1(pred_cnt, tar_cnt, correct_cnt):
            precision_s = round(correct_cnt / (pred_cnt + 1e-8), 3)
            recall_s = round(correct_cnt / (tar_cnt + 1e-8), 3)
            f1_s = round(
                2 * precision_s * recall_s / (precision_s + recall_s + 1e-8),
                3)
            return precision_s, recall_s, f1_s

        eva_data_set = data_loader.dataset.dev_dataset if data_set is None else data_set

        pred_result = self.predict(
            data_loader, eva_data_set, hyper_param, use_cuda,
            rebuild=rebuild)  ### list(dict), predicted results, len = n_sentences
        target = eva_data_set  ### list(dict), AutoKGDataset, ground truth

        pred_cnt = 0
        tar_cnt = 0
        correct_cnt = 0
        cnt_all = len(eva_data_set)
        log('Eval start')
        for idx in range(cnt_all):
            sentence = pred_result[idx]['input']
            pred_list = pred_result[idx]['relation_list']
            tar_list = target[idx]['output']['relation_list']

            str_pred_set = set(map(dict2str, pred_list))
            str_tar_set = set(map(dict2str, tar_list))
            common_set = str_pred_set.intersection(str_tar_set)
            # print('target:')
            # print(str_tar_set)
            # print('predict:')
            # print(str_pred_set)

            pred_cnt += len(str_pred_set)
            tar_cnt += len(str_tar_set)
            correct_cnt += len(common_set)

            if (idx + 1) % 1000 == 0:
                precision_s, recall_s, f1_s = calculate_f1(
                    pred_cnt, tar_cnt, correct_cnt)
                log(
                    f'[EVAL] step {idx+1}/{cnt_all} | precision: {precision_s} | recall: {recall_s} | f1 score: {f1_s}',
                    1)

        precision_s, recall_s, f1_s = calculate_f1(pred_cnt, tar_cnt,
                                                   correct_cnt)
        print('=' * 100)
        log(
            f'[FINAL] | precision: {precision_s} | recall: {recall_s} | f1 score: {f1_s}',
            0)
        print('=' * 100)
        return (precision_s, recall_s, f1_s)
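A hypothetical smoke test of this class, based only on the config keys documented in the docstrings above; all concrete values and tensors are made up, and it assumes the MODEL_TEMP base class provides the _to_tensor helper used by the methods:

import numpy as np

# hypothetical configuration; keys follow the REL_BLSTM_CRF docstrings, values are placeholders
config = {
    'embedding_dim': 128,
    'hidden_dim': 64,
    'n_rel_tags': 8,
    'n_rels': 9,
    'n_words': 10000,
    'start_rel_idx': 6,   # assumed <start> tag index for the relation tag seq
    'end_rel_idx': 7,     # assumed <end> tag index for the relation tag seq
    'use_cuda': False,
    'dropout_prob': 0.1,
    'lstm_layer_num': 1,
}
model = REL_BLSTM_CRF(config, show_param=True)

x = np.random.randint(1, 10000, size=(4, 30))   # word indices, (batch_size, T)
reltype = np.random.randint(0, 9, size=(4, 1))  # relation-type index per sentence, (batch_size, 1)
y_rel = np.random.randint(0, 6, size=(4, 30))   # relation tag indices, (batch_size, T)
lens = [30, 25, 20, 12]                         # actual sentence lengths
loss = model._loss(x, reltype, y_rel, lens)     # mean length-normalized CRF negative log-likelihood
paths = model._output(x, reltype, lens)         # Viterbi-decoded relation tag paths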
Example #7
class BERT_CRF(MODEL_TEMP):
    def __init__(self, config={}, show_param=False):
        '''
        :param - dict
            param['embedding_dim']
            param['hidden_dim']
            param['n_ent_tags']
            param['n_rel_tags']
            param['n_words']
            param['start_ent_idx']  int, <start> tag index for entity tag seq
            param['end_ent_idx']   int, <end> tag index for entity tag seq
            param['start_rel_idx']
            param['end_rel_idx']
            param['use_cuda']
            param['dropout_prob']
            param['lstm_layer_num']
        '''
        super(BERT_CRF, self).__init__()
        self.config = config
        self.embedding_dim = self.config.get('embedding_dim', 768)
        self.n_tags = self.config['n_ent_tags']
        # self.n_words = self.config['n_words']
        # self.dropout_prob = self.config.get('dropout_prob', 0)

        self.use_cuda = self.config['use_cuda']
        self.model_type = 'BERT_CRF'

        self.build_model()
        self.reset_parameters()
        if show_param:
            self.show_model_param()

    def show_model_param(self):
        log('=' * 80, 0)
        log(f'model_type: {self.model_type}', 1)
        log(f'use_cuda: {self.use_cuda}', 1)
        log(f'embedding_dim: {self.embedding_dim}', 1)
        log(f'n_ent_tags: {self.n_tags}', 1)
        log(f"crf_start_idx: {self.config['start_ent_idx']}", 1)
        log(f"crf_end_idx: {self.config['end_ent_idx']}", 1)
        # log(f'dropout_prob: {self.dropout_prob}', 1)
        log('=' * 80, 0)

    def build_model(self):
        '''
        build the embedding layer, lstm layer and CRF layer
        '''
        self.hidden2tag = nn.Linear(self.embedding_dim, self.n_tags)

        crf_config = {
            'n_tags': self.config['n_ent_tags'],
            'start_idx': self.config['start_ent_idx'],
            'end_idx': self.config['end_ent_idx'],
            'use_cuda': self.use_cuda
        }
        self.crf = CRF(crf_config)
        self.bert = transformers.BertModel.from_pretrained('bert-base-chinese')

    def reset_parameters(self):
        I.xavier_normal_(self.hidden2tag.weight.data)
        self.crf.reset_parameters()

    def _get_features(self, x, lens, use_cuda=None):
        '''
        :param
            @x: word indices (each character mapped to its vocabulary index), (batch_size, T), np.array
            @lens: the actual length of each sentence, (batch_size)
        :return
            @lstm_feature: (batch_size, T, n_tags) -- similar to emission scores, torch.tensor
        '''
        use_cuda = self.use_cuda if use_cuda is None else use_cuda
        batch_size, T = x.shape

        ##bert layer
        words_tensor = self._to_tensor(x, use_cuda)  #(batch_size, T)
        lens = self._to_tensor(lens, use_cuda)
        att_mask = self._generate_mask(lens, max_len=T)
        embeds = self.bert(
            words_tensor,
            attention_mask=att_mask)[0]  #(batch_size, T, n_embed)

        ##FC layer
        feature = self.hidden2tag(embeds)  #(batch_size, T, n_tags)
        feature = torch.tanh(feature)
        # print(feature.shape)
        return feature

    def _loss(self, x, y_ent, lens, use_cuda=None):
        '''
        loss function: neg_log_likelihood
        :param
            @x: word indices (each character mapped to its vocabulary index), (batch_size, T), np.array
            @y_ent: (batch_size, T), np.array, entity tag index sequence at character level
            @lens: (batch_size), list, the actual length of each sentence
        :return
            @loss: (1), torch.tensor
        '''
        use_cuda = self.use_cuda if use_cuda is None else use_cuda

        logits = self._get_features(x, lens)
        log_norm_score = self.crf.log_norm_score(logits, lens)
        path_score = self.crf.path_score(logits, y_ent, lens)

        loss = log_norm_score - path_score  ##(batch_size, )
        loss = (loss / self._to_tensor(lens, use_cuda)).mean()
        return loss

    def _output(self, x, lens, use_cuda=None):
        '''
        return the crf decode paths
        :param
            @x: word indices (each character mapped to its vocabulary index), (batch_size, T), np.array
            @lens: (batch_size), list, the actual length of each sentence
        :return
            @paths: (batch_size, T+1), torch.tensor, best tag path for each sentence
            @scores: (batch_size), torch.tensor, score of the best path for each sentence
        '''
        # self.eval()
        use_cuda = self.use_cuda if use_cuda is None else use_cuda
        logits = self._get_features(x, lens, use_cuda)
        scores, paths = self.crf.viterbi_decode(logits, lens, use_cuda)
        return paths
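A hypothetical usage sketch for BERT_CRF; the config keys follow the constructor docstring, the index values and inputs are made up, and it assumes the MODEL_TEMP base class provides the _to_tensor and _generate_mask helpers used above:

import numpy as np

config = {
    'embedding_dim': 768,
    'n_ent_tags': 45,
    'start_ent_idx': 43,  # assumed <start> tag index
    'end_ent_idx': 44,    # assumed <end> tag index
    'use_cuda': False,
}
model = BERT_CRF(config)

x = np.random.randint(1, 100, size=(2, 16))  # toy token ids, (batch_size, T)
lens = [16, 12]                              # actual sentence lengths
paths = model._output(x, lens)               # Viterbi-decoded entity tag paths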
Example #8
class BLSTM_CRF(MODEL_TEMP):
    def __init__(self, config={}, show_param=False):
        '''
        :param - dict
            param['embedding_dim']
            param['hidden_dim']
            param['n_tags']
            param['n_words']
            param['start_idx']  int, <start> tag index for entity tag seq
            param['end_idx']   int, <end> tag index for entity tag seq
            param['use_cuda']
            param['dropout_prob']
            param['lstm_layer_num']
        '''
        super(BLSTM_CRF, self).__init__()
        self.config = config
        self.embedding_dim = self.config.get('embedding_dim',
                                             768)  #TODO: 64, 768
        self.hidden_dim = self.config.get('hidden_dim', 64)  #TODO: 128*2, 64
        assert self.hidden_dim % 2 == 0, 'hidden_dim for BLSTM must be even'
        self.n_tags = self.config.get('n_ent_tags', 45)
        self.n_words = self.config.get('n_words', 10000)

        self.dropout_prob = self.config.get('dropout_prob', 0)
        self.lstm_layer_num = self.config.get('lstm_layer_num', 1)

        self.use_cuda = self.config.get('use_cuda', False)
        self.model_type = 'BLSTM_CRF'

        self.build_model()
        self.reset_parameters()
        if show_param:
            self.show_model_param()

    def show_model_param(self):
        log('=' * 80, 0)
        log(f'model_type: {self.model_type}', 1)
        log(f'use_cuda: {self.use_cuda}', 1)
        log(f'embedding_dim: {self.embedding_dim}', 1)
        log(f'hidden_dim: {self.hidden_dim}', 1)
        log(f'lstm_layer_num: {self.lstm_layer_num}', 1)
        log(f'dropout_prob: {self.dropout_prob}', 1)
        log(f'n_ent_tags: {self.n_tags}', 1)
        log(f"crf_start_idx: {self.config['start_ent_idx']}", 1)
        log(f"crf_end_idx: {self.config['end_ent_idx']}", 1)
        log('=' * 80, 0)

    def build_model(self):
        '''
        build the embedding layer, lstm layer and CRF layer
        '''
        self.word_embeds = nn.Embedding(self.n_words, self.embedding_dim)
        self.lstm = nn.LSTM(self.embedding_dim,
                            self.hidden_dim // 2,
                            batch_first=True,
                            num_layers=self.lstm_layer_num,
                            dropout=self.dropout_prob,
                            bidirectional=True)
        self.hidden2tag = nn.Linear(self.hidden_dim, self.n_tags)

        crf_config = {
            'n_tags': self.config['n_ent_tags'],
            'start_idx': self.config['start_ent_idx'],
            'end_idx': self.config['end_ent_idx'],
            'use_cuda': self.use_cuda
        }
        self.crf = CRF(crf_config)

    def reset_parameters(self):
        I.xavier_normal_(self.word_embeds.weight.data)
        self.lstm.reset_parameters()
        # stdv = 1.0 / math.sqrt(self.hidden_dim)
        # for weight in self.lstm.parameters():
        #     I.uniform_(weight, -stdv, stdv)
        I.xavier_normal_(self.hidden2tag.weight.data)
        self.crf.reset_parameters()

    def _get_lstm_features(self, x, use_cuda=None):
        '''
        :param
            @x: word indices (each character mapped to its vocabulary index), (batch_size, T), np.array
        :return
            @lstm_feature: (batch_size, T, n_tags) -- similar to emission scores, torch.tensor
        '''
        use_cuda = self.use_cuda if use_cuda is None else use_cuda
        batch_size = x.shape[0]

        ##embedding layer
        words_tensor = self._to_tensor(x, use_cuda)  #(batch_size, T)
        embeds = self.word_embeds(words_tensor)  #(batch_size, T, n_embed)

        ##LSTM layer
        if use_cuda:
            h_0 = torch.randn(2 * self.lstm_layer_num, batch_size,
                              self.hidden_dim //
                              2).cuda()  #(n_layer*n_dir, N, n_hid)
            c_0 = torch.randn(2 * self.lstm_layer_num, batch_size,
                              self.hidden_dim // 2).cuda()
        else:
            h_0 = torch.randn(2 * self.lstm_layer_num, batch_size,
                              self.hidden_dim // 2)
            c_0 = torch.randn(2 * self.lstm_layer_num, batch_size,
                              self.hidden_dim // 2)
        # c_0 = h_0.clone()
        hidden = (h_0, c_0)
        lstm_out, _hidden = self.lstm(
            embeds, hidden)  #(batch_size, T, n_dir*n_hid), (h, c)

        ##FC layer
        lstm_feature = self.hidden2tag(lstm_out)  #(batch_size, T, n_tags)
        lstm_feature = torch.tanh(lstm_feature)

        return lstm_feature

    def _loss(self, x, y_ent, lens, use_cuda=None):
        '''
        loss function: neg_log_likelihood
        :param
            @x: word indices (each character mapped to its vocabulary index), (batch_size, T), np.array
            @y_ent: (batch_size, T), np.array, entity tag index sequence at character level
            @lens: (batch_size), list, the actual length of each sentence
        :return
            @loss: (batch_size), torch.tensor
        '''
        use_cuda = self.use_cuda if use_cuda is None else use_cuda

        logits = self._get_lstm_features(x)
        log_norm_score = self.crf.log_norm_score(logits, lens)
        path_score = self.crf.path_score(logits, y_ent, lens)

        loss = log_norm_score - path_score
        loss = (loss / self._to_tensor(lens, use_cuda)).mean()
        return loss

    def _output(self, x, lens, use_cuda=None):
        '''
        return the crf decode paths
        :param
            @x: word indices (each character mapped to its vocabulary index), (batch_size, T), np.array
            @lens: (batch_size), list, the actual length of each sentence
        :return
            @paths: (batch_size, T), torch.tensor, best tag path for each sentence
            @scores: (batch_size), torch.tensor, score of the best path for each sentence
        '''
        use_cuda = self.use_cuda if use_cuda is None else use_cuda
        logits = self._get_lstm_features(x, use_cuda)
        scores, paths = self.crf.viterbi_decode(logits, lens, use_cuda)
        return paths
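Because the LSTM in BLSTM_CRF is bidirectional with hidden_size = hidden_dim // 2, the concatenated output per time step has exactly hidden_dim features, which is what hidden2tag expects. A standalone shape check with assumed dimensions:

import torch
import torch.nn as nn

hidden_dim, T, batch = 64, 20, 4  # assumed values
lstm = nn.LSTM(input_size=128, hidden_size=hidden_dim // 2,
               batch_first=True, bidirectional=True)
out, _ = lstm(torch.randn(batch, T, 128))
print(out.shape)  # torch.Size([4, 20, 64]) -> 2 directions * (hidden_dim // 2) = hidden_dim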
Example #9
class BERT(nn.Module):
    def __init__(self, num_labels, use_crf, tag_pad_idx, pad_token, tag_names):
        '''

        BERT-based sequence tagger (token-level classifier)

        num_labels: number of output classes
        use_crf: switch for using conditional random field (reduces probability of invalid tagging sequences)
        tag_pad_idx: index for tag padding token
        pad_token: pad token
        tag_names: the names of all of the tags in the tag field

        '''
        super().__init__()
        self.num_labels = num_labels
        self.use_crf = use_crf
        self.tag_pad_idx, self.pad_token, self.tag_names = tag_pad_idx, pad_token, tag_names
        self.build_model_layers()
        self.init_weights()

    def build_model_layers(self):
        ''' builds the layers in the model '''
        self.bert = BertForTokenClassification.from_pretrained(
            "bert-base-cased",
            num_labels=self.num_labels,
            output_attentions=False,
            output_hidden_states=False)
        if self.use_crf:
            self.crf = CRF(self.tag_pad_idx, self.pad_token, self.tag_names)

    def forward(self, sentence, attention_mask, tags):
        ''' forward operation for network '''
        outputs = self.bert(sentence,
                            token_type_ids=None,
                            attention_mask=attention_mask,
                            labels=tags)
        loss, logits = outputs[0], outputs[1]
        if self.use_crf:
            # remove first token id in each sentence (to make crf mask work)
            # crf_out, crf_loss = self.crf(logits, tags)
            crf_out, crf_loss = self.crf(logits[:, 1:], tags[:, 1:])
            return crf_out, crf_loss
        else:
            return logits, loss

    def init_weights(self):
        ''' initializes model weights '''
        # param_initializer = list(self.bert.classifier.named_parameters())
        # if self.crf:
        #     param_initializer += list(self.crf.named_parameters())
        # for name, param in param_initializer:
        #     nn.init.normal_(param.data, mean=0, std=0.1)

        # only initialize conditional random field weights (self.crf exists only when use_crf is True)
        if self.use_crf:
            for name, param in self.crf.named_parameters():
                nn.init.normal_(param.data, mean=0, std=0.1)

    def count_parameters(self):
        ''' counts model parameters '''
        return sum(p.numel() for p in self.parameters() if p.requires_grad)
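A hypothetical instantiation of the BERT tagger above with the CRF head switched off; the tag names, pad settings, and label count are placeholders, and it assumes the class and its transformers dependency are importable:

# placeholder tag inventory; in the original project these come from the corpus tag field
tag_names = ['O', 'B-PER', 'I-PER', '<pad>']
model = BERT(num_labels=len(tag_names), use_crf=False,
             tag_pad_idx=tag_names.index('<pad>'), pad_token='<pad>', tag_names=tag_names)
print(f'{model.count_parameters():,} trainable parameters')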
Example #10
files_t = []
for i in range(10):
    files_t.append('test'+str(i+1)+'.nn.ner')

for i in range(10):
    print('file ' + str(i+1))
    fw = open(dirs + files_t[i] + '.pipe_crf6', 'w')
    print('load train')
    doc, label, sparse, trans_tr = load_train_data_pipe(dirs+files[i], vocab_w2i, sen_len, sparse_len, crf_num, label_num, label_onehot)
    print('load test')
    doc_t, label_t, sparse_t, trans_t = load_train_data_pipe(dirs+files_t[i], vocab_w2i, sen_len, sparse_len, crf_num, label_num, label_onehot)
    with tf.Graph().as_default():
        sess = tf.Session()
        with sess.as_default():
            crf = CRF(sen_len, label_num, sparse_len, crf_num, learning_rate, label_m, trans_tr)
            sess.run(tf.initialize_all_variables())
            def train_step(input_, label_):
                feed_dict = {
                    crf.input : input_,
                    crf.label : label_
                }
                _, lss = sess.run([crf.trains, crf.loss], feed_dict)
            def test_step(input_, label_, fw, trans):
                totals_ = 0
                corrects_ = 0
                feed_dict = {crf.input : input_, crf.label : label_}
                unary_score, lens, _ = sess.run([crf.unary_score, crf.lens, crf.trains], feed_dict)
                for unary_, l_, lens_ in zip(unary_score, label_, lens):
                    u = unary_[:lens_]
                    l = l_[:lens_]
Example #11
# print information about datasets
print('train set: {} sentences'.format(len(corpus.train_set)))
print('valid set: {} sentences'.format(len(corpus.valid_set)))
print('test set: {} sentences'.format(len(corpus.test_set)))
print(m * '-')

# parameters from corpus
text_pad_idx = corpus.text_pad_idx
text_unk_idx = corpus.text_unk_idx
char_pad_idx = corpus.char_pad_idx
tag_pad_idx = corpus.tag_pad_idx
pad_token = corpus.pad_token
pretrained_embeddings = corpus.text_field.vocab.vectors

try:
    CRF(tag_pad_idx, pad_token, tag_names)
    use_crf = True
    print('using crf for models')
except:
    use_crf = False
    print('not using crf for models (incompatible tagging format)')
print(m * '-')

# shared cnn parameters
char_embedding_dim = 37
char_filter = 4
char_kernel = 3
# shared dropouts
embedding_dropout_ratio = 0.5
char_embedding_dropout_ratio = 0.25
cnn_dropout_ratio = 0.25