Example #1
    def forward(self, src_ids, *args, **kwargs):
        tgt_labels = kwargs.pop('tgt_labels', None)
        tgt_pos = kwargs.pop('tgt_pos', None)
        encode_only = kwargs.pop('encode_only', False)
        _, encoded, info = ErnieModel.forward(self, src_ids, *args, **kwargs)
        #log.debug('hidden_-1 %r' % L.reduce_mean(info['hiddens'][0]).numpy())
        #log.debug('hidden_0 %r' % L.reduce_mean(info['hiddens'][1]).numpy())
        if encode_only:
            # mode 1: return the raw encoder states only
            return None, None, info
        elif tgt_labels is None:
            # mode 2: no labels, run the MLM head over every position and decode greedily
            encoded = self.mlm(encoded)
            encoded = self.mlm_ln(encoded)
            logits = L.matmul(encoded, self.word_emb.weight, transpose_y=True) + self.mlm_bias
            output_ids = L.argmax(logits, -1)
            return output_ids, logits, info
        else:
            # mode 3: labels given, score only the target positions and compute the MLM loss
            encoded_2d = L.gather_nd(encoded, tgt_pos)
            #log.debug('input shape %s' % repr(src_ids.shape))
            #log.debug(L.gather_nd(src_ids, tgt_pos).numpy())
            encoded_2d = self.mlm(encoded_2d)
            encoded_2d = self.mlm_ln(encoded_2d)
            logits_2d = L.matmul(encoded_2d, self.word_emb.weight, transpose_y=True) + self.mlm_bias
            if len(tgt_labels.shape) == 1:
                tgt_labels = L.reshape(tgt_labels, [-1, 1])

            loss = L.reduce_mean(
                L.softmax_with_cross_entropy(logits_2d, tgt_labels, soft_label=(tgt_labels.shape[-1] != 1)))
            return loss, logits_2d, info
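This forward has three modes: return the encoder states only, greedily decode every position, or compute the MLM loss at the target positions. A minimal call sketch; the class name ErnieGenHead and its import path are hypothetical stand-ins for whichever ErnieModel subclass defines this forward, and the toy ids below are arbitrary:

    import numpy as np
    import paddle.fluid.dygraph as D
    # hypothetical import path; substitute the real ErnieModel subclass that defines the forward above
    from ernie.modeling_ernie import ErnieGenHead

    with D.guard():
        model = ErnieGenHead.from_pretrained('ernie-1.0')
        model.eval()
        src_ids = D.to_variable(np.array([[1, 2, 3, 4, 5]], dtype='int64'))  # toy token ids
        tgt_pos = D.to_variable(np.array([[0, 2], [0, 3]], dtype='int64'))   # (batch, pos) pairs for gather_nd
        tgt_labels = D.to_variable(np.array([3, 4], dtype='int64'))          # one hard label per target position

        _, _, info = model(src_ids, encode_only=True)         # mode 1: encoder states only
        output_ids, logits, info = model(src_ids)             # mode 2: greedy decode at every position
        loss, logits_2d, info = model(
            src_ids, tgt_pos=tgt_pos, tgt_labels=tgt_labels)  # mode 3: MLM loss at tgt_pos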
Example #2
    def forward(self, src_ids, *args, **kwargs):
        pooled, encoded = ErnieModel.forward(self, src_ids, *args, **kwargs)
        # gather encoder states at the [MASK] positions (mask_id comes from the enclosing scope)
        encoded_2d = L.gather_nd(encoded, L.where(src_ids == mask_id))
        encoded_2d = self.mlm(encoded_2d)
        encoded_2d = self.mlm_ln(encoded_2d)
        logits_2d = L.matmul(
            encoded_2d, self.word_emb.weight, transpose_y=True) + self.mlm_bias
        return logits_2d
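The logits returned here cover only the [MASK] positions, so filling masks reduces to an argmax over them. A sketch of that usage; `model` stands for an instance of the class above, and `mask_id` as a tokenizer attribute is an assumption (otherwise look it up via tokenizer.vocab['[MASK]']):

    import numpy as np
    import paddle.fluid.dygraph as D
    from ernie.tokenizing_ernie import ErnieTokenizer

    with D.guard():
        tokenizer = ErnieTokenizer.from_pretrained('ernie-1.0')
        ids, _ = tokenizer.encode('hello world')
        ids[2] = tokenizer.mask_id                       # mask one token (assumed attribute)
        src_ids = D.to_variable(np.expand_dims(ids, 0))  # add the batch dimension
        logits_2d = model(src_ids)                       # `model`: instance of the class above
        filled = np.argmax(logits_2d.numpy(), -1)        # predicted ids for the masked slots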
Example #3
    def test_ernie_extract_feature(self):
        ernie_config = {
                "pretrain_dir_or_url": "ernie-1.0",
                }

        with D.guard():
            ernie = ErnieModel.from_pretrained(**ernie_config)

            res = batch_infer(ernie, TestErnieExtractFeature.text_ids, batch_size=3, with_label=False, logits_softmax=None)
            logging.info("len res: {}".format(len(res)))

            for (pooled_encode_vec, sequence_encode_vec), text in zip(res, TestErnieExtractFeature.text_list):
                logging.info("text: {}".format(text.encode("utf-8")))
                logging.info("pooled_encode_vec shape: {}".format(pooled_encode_vec.shape))
                logging.info("sequence_encode_vec shape: {}".format(sequence_encode_vec.shape))
Example #4
    def test_ernie_cluster(self):
        ernie_config = {
                "pretrain_dir_or_url": "ernie-1.0",
                }

        tokenizer = ErnieTokenizer.load("./dict/vocab.txt")
        text_ids = tokenizer.transform(TestCluster.text_list)

        with D.guard():
            ernie = ErnieModel.from_pretrained(**ernie_config)
            res = batch_infer(ernie, text_ids, batch_size=128, with_label=False, logits_softmax=None)

        pooled_encode_vec, _ = zip(*res)
        data_cluster(TestCluster.cluster_model, pooled_encode_vec)

        cluster_ids = TestCluster.cluster_model.labels_
        cluster_res_path = os.path.join(TestCluster.test_output_dir, "ernie_kmeans_cluster_%d.txt" % TestCluster.cluster_num)
        write_to_file(zip(cluster_ids, TestCluster.text_list), cluster_res_path, write_func=lambda x: "%s\t%s" % x)
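data_cluster and TestCluster.cluster_model are project helpers that are not shown here. A minimal sketch of what they plausibly wrap, using scikit-learn's KMeans (the helper body and the cluster count are assumptions):

    import numpy as np
    from sklearn.cluster import KMeans

    def data_cluster(cluster_model, vecs):
        # stack the pooled sentence vectors into an (n_samples, hidden_size) matrix and fit
        cluster_model.fit(np.stack([np.asarray(v).squeeze() for v in vecs]))

    cluster_model = KMeans(n_clusters=10)  # stands in for TestCluster.cluster_model
    # after fitting, per-sample cluster ids are available as cluster_model.labels_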
Example #5
                        default=None,
                        help='model output directory')
    parser.add_argument('--wd',
                        type=float,
                        default=0.01,
                        help='weight decay, aka L2 regularizer')
    args = parser.parse_args()
    cfg_file_path = os.path.join(args.conf, 'ernie_config.json')
    tokenizer = ErnieTokenizer.from_pretrained(args.from_pretrained)

    # training: 5-fold cross-validation
    kf = KFold(n_splits=5, shuffle=True)
    for k, (train_index,
            val_index) in enumerate(kf.split(range(len(dataset)))):
        print('Start training fold {}'.format(k))
        ernie = ErnieModel.from_pretrained(args.from_pretrained)

        train = dataset.iloc[train_index]
        val = dataset.iloc[val_index]

        train = ErnieDataset(tokenizer, train, max_len=152)
        train = DataLoader(train, batch_size=16, num_workers=2)

        val = ErnieDataset(tokenizer, val, max_len=152)
        val = DataLoader(val, batch_size=16, num_workers=2)

        # build the trainer and run the epochs
        trainer = SoftMaskedErnieTrainer(args, ernie, tokenizer, device)
        best_loss = 100000
        for e in range(100):
            trainer.train(train, e)
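The snippet is cut off inside the epoch loop, and the unused best_loss suggests a validate-and-checkpoint step follows. A hedged sketch of how the loop might continue; evaluate and save_model are assumed method names on the trainer:

    for e in range(100):
        trainer.train(train, e)
        val_loss = trainer.evaluate(val)  # assumed method name
        if val_loss < best_loss:
            best_loss = val_loss
            trainer.save_model()          # assumed method name: keep the best checkpoint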
Example #6
def download_ernie_model(config):
    place = F.CUDAPlace(0)
    with D.guard(place):
        # from_pretrained downloads (and caches) the weights, so instantiating is enough
        model = ErnieModel.from_pretrained(config.ernie_name)
Example #7
    def _forward_once(self, src_ids, *args, **kwargs):
        pooled, encoded = ErnieModel.forward(self, src_ids, *args, **kwargs)
        return pooled
Example #8
import numpy as np
import paddle.fluid.dygraph as D
from ernie.tokenizing_ernie import ErnieTokenizer
from ernie.modeling_ernie import ErnieModel

D.guard().__enter__()  # activate paddle `dygraph` mode

# Fetch the pretrained model from the server; make sure you have a network connection.
model = ErnieModel.from_pretrained('ernie-1.0')

# model.eval() disables the training path (no backward computation); it is for inference only.
# Putting the model in evaluation (eval) mode tells the framework that only the forward pass
# will run, with no gradients or backpropagation, which reduces memory consumption.
model.eval()
tokenizer = ErnieTokenizer.from_pretrained('ernie-1.0')

ids, _ = tokenizer.encode('hello world')
ids = D.to_variable(np.expand_dims(ids, 0))  # insert extra `batch` dimension
pooled, encoded = model(ids)  # eager execution
print(pooled.numpy())  # convert results to numpy