class TestTokenEmbeddingSimilarity(TestTokenEmbedding):
    def setUp(self):
        super().setUp()
        self.config["extended_vocab_path"] = self.test_data_file
        self.config["keep_extended_vocab_only"] = True

    def get_dot(self, vec_a, vec_b):
        return np.sum(vec_a * vec_b)

    def get_cosine(self, vec_a, vec_b):
        return self.get_dot(vec_a, vec_b) / (np.sqrt(
            self.get_dot(vec_a, vec_a) * self.get_dot(vec_b, vec_b)))

    def get_random_word_vec(self, vocab_list):
        vocab_size = len(vocab_list)
        ids = np.random.randint(vocab_size, size=2)
        word_a, word_b = vocab_list[ids[0]], vocab_list[ids[1]]
        vec_a, vec_b = self.embedding.search([word_a, word_b])
        return word_a, word_b, vec_a, vec_b

    def test_cosine_sim(self):
        self.embedding = TokenEmbedding(**self.config)
        vocab_list = get_vocab_list(self.config["extended_vocab_path"])
        word_a, word_b, vec_a, vec_b = self.get_random_word_vec(vocab_list)
        result = self.embedding.cosine_sim(word_a, word_b)
        expected_result = self.get_cosine(vec_a, vec_b)
        self.check_output_equal(result, expected_result)

    def test_dot(self):
        self.embedding = TokenEmbedding(**self.config)
        vocab_list = get_vocab_list(self.config["extended_vocab_path"])
        word_a, word_b, vec_a, vec_b = self.get_random_word_vec(vocab_list)
        result = self.embedding.dot(word_a, word_b)
        expected_result = self.get_dot(vec_a, vec_b)
        self.check_output_equal(result, expected_result)
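As an aside, the two operations these tests exercise can be called directly on a loaded embedding. A minimal usage sketch (the embedding name is one of the pretrained sets used later on this page):

from paddlenlp.embeddings import TokenEmbedding

emb = TokenEmbedding("w2v.baidu_encyclopedia.target.word-word.dim300")
print(emb.cosine_sim("苹果", "香蕉"))  # cosine similarity of the two word vectors
print(emb.dot("苹果", "香蕉"))  # inner product of the two word vectors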
Example #2
    def test_extended_vocab(self):
        self.embedding = TokenEmbedding(**self.config)
        vocab_list = get_vocab_list(self.config["extended_vocab_path"])
        emb_idx = set(self.embedding.get_idx_list_from_words(vocab_list))
        vocab_idx = set([i for i in range(len(vocab_list))])
        self.assertEqual(emb_idx, vocab_idx)
        self.check_output_equal(emb_idx, vocab_idx)
Example #3
    def test_unk_token(self):
        self.embedding = TokenEmbedding(**self.config)
        self.check_output_equal(self.config["unknown_token"],
                                self.embedding.unknown_token)
        self.check_output_equal(
            self.config["unknown_token_vector"],
            self.embedding.search(self.embedding.unknown_token)[0])
Example #4
    def test_dot(self):
        self.embedding = TokenEmbedding(**self.config)
        vocab_list = get_vocab_list(self.config["extended_vocab_path"])
        word_a, word_b, vec_a, vec_b = self.get_random_word_vec(vocab_list)
        result = self.embedding.dot(word_a, word_b)
        expected_result = self.get_dot(vec_a, vec_b)
        self.check_output_equal(result, expected_result)
Example #5
    def test_extended_vocab(self):
        self.embedding = TokenEmbedding(**self.config)
        vocab_list = get_vocab_list(self.config["extended_vocab_path"])
        vocab_size = len(vocab_list)
        # +1 accounts for the [PAD] token appended to the vocabulary
        self.check_output_equal(vocab_size + 1,
                                len(self.embedding._word_to_idx))
Example #6
class TestTokenEmbeddingExtendedVocab(TestTokenEmbedding):
    def setUp(self):
        super().setUp()
        self.config["extended_vocab_path"] = self.test_data_file

    def test_extended_vocab(self):
        self.embedding = TokenEmbedding(**self.config)
        vocab_list = get_vocab_list(self.config["extended_vocab_path"])
        emb_idx = set(self.embedding.get_idx_list_from_words(vocab_list))
        vocab_idx = set([i for i in range(len(vocab_list))])
        self.assertEqual(emb_idx, vocab_idx)
        self.check_output_equal(emb_idx, vocab_idx)
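For reference, a minimal sketch of loading an embedding with an extended vocabulary, mirroring the constructor arguments used by these tests (the vocab file path is a hypothetical placeholder):

from paddlenlp.embeddings import TokenEmbedding

emb = TokenEmbedding(
    "w2v.baidu_encyclopedia.target.word-word.dim300",
    extended_vocab_path="./my_vocab.txt",  # hypothetical vocab file
    keep_extended_vocab_only=True)
print(len(emb._word_to_idx))  # extended vocab size plus the appended special tokens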
Example #7
    def __init__(self, embedding_name):
        super(Embedding, self).__init__()
        self.embedding = TokenEmbedding(embedding_name)
        self.embedding_dim = self.embedding.embedding_dim
        weight_attr = paddle.framework.ParamAttr(
            name="linear_weight",
            initializer=paddle.nn.initializer.XavierNormal())
        bias_attr = paddle.framework.ParamAttr(
            name="linear_bias",
            initializer=paddle.nn.initializer.XavierNormal())
        self.mlp = paddle.nn.Linear(self.embedding_dim * 2,
                                    self.embedding_dim,
                                    weight_attr=weight_attr,
                                    bias_attr=bias_attr)
        self.gru = nn.GRU(input_size=self.embedding_dim,
                          hidden_size=self.embedding_dim // 2,
                          num_layers=1,
                          direction="bidirectional")
Example #8
    def load_model(cls):

        cls.wordemb1 = spacy.load('zh_core_web_sm')
        cls.wordemb2 = TokenEmbedding(
            "w2v.baidu_encyclopedia.target.word-word.dim300")

        stopwords = []
        for word in open('static/dict/chineseStopWords.txt',
                         'r',
                         encoding='utf-8'):
            stopwords.append(word.strip())

        cls.stopwords = stopwords
        print('Model loading complete')
Example #9
class TestTokenEmbeddingUNK(TestTokenEmbedding):
    def setUp(self):
        super().setUp()
        self.config["unknown_token"] = "[unk]"  # default [UNK], change it
        self.config["unknown_token_vector"] = np.random.normal(
            scale=0.02, size=300).astype(paddle.get_default_dtype())

    def test_unk_token(self):
        self.embedding = TokenEmbedding(**self.config)
        self.check_output_equal(self.config["unknown_token"],
                                self.embedding.unknown_token)
        self.check_output_equal(
            self.config["unknown_token_vector"],
            self.embedding.search(self.embedding.unknown_token)[0])
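Outside the test harness, the same unknown-token overrides go straight into the constructor. A hedged sketch using the 300-dim vector shape from the setup above:

import numpy as np
import paddle
from paddlenlp.embeddings import TokenEmbedding

unk_vec = np.random.normal(scale=0.02, size=300).astype(paddle.get_default_dtype())
emb = TokenEmbedding(
    "w2v.baidu_encyclopedia.target.word-word.dim300",
    unknown_token="[unk]",
    unknown_token_vector=unk_vec)
print(emb.search(emb.unknown_token)[0][:5])  # first components of the custom UNK vector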
Example #10
    def __init__(self,
                 embed_dim,
                 hidden_size,
                 vocab_size,
                 output_dim,
                 vocab_path,
                 padding_idx=0,
                 num_layers=1,
                 dropout_prob=0.0,
                 init_scale=0.1,
                 embedding_name=None):
        super(BiLSTM, self).__init__()
        if embedding_name is not None:
            self.embedder = TokenEmbedding(embedding_name,
                                           extended_vocab_path=vocab_path,
                                           keep_extended_vocab_only=True)
            embed_dim = self.embedder.embedding_dim
        else:
            self.embedder = nn.Embedding(vocab_size, embed_dim, padding_idx)

        self.lstm = nn.LSTM(embed_dim,
                            hidden_size,
                            num_layers,
                            'bidirectional',
                            dropout=dropout_prob)

        self.fc = nn.Linear(
            hidden_size * 2,
            hidden_size,
            weight_attr=paddle.ParamAttr(
                initializer=I.Uniform(low=-init_scale, high=init_scale)))

        self.fc_1 = nn.Linear(
            hidden_size * 8,
            hidden_size,
            weight_attr=paddle.ParamAttr(
                initializer=I.Uniform(low=-init_scale, high=init_scale)))

        self.output_layer = nn.Linear(
            hidden_size,
            output_dim,
            weight_attr=paddle.ParamAttr(
                initializer=I.Uniform(low=-init_scale, high=init_scale)))
Example #11
    def __init__(self,
                 emb_size,
                 hidden_size,
                 word_num,
                 label_num,
                 use_w2v_emb=False):
        super(BiGRUWithCRF, self).__init__()
        if use_w2v_emb:
            self.word_emb = TokenEmbedding(
                extended_vocab_path='./conf/word.dic', unknown_token='OOV')
        else:
            self.word_emb = nn.Embedding(word_num, emb_size)
        self.gru = nn.GRU(emb_size,
                          hidden_size,
                          num_layers=2,
                          direction='bidirectional')
        self.fc = nn.Linear(hidden_size * 2, label_num + 2)  # BOS EOS
        self.crf = LinearChainCrf(label_num)
        self.decoder = ViterbiDecoder(self.crf.transitions)
Example #12
    def __init__(self,
                 emb_size,
                 hidden_size,
                 word_num,
                 label_num,
                 use_w2v_emb=False):
        super(BiGRUWithCRF, self).__init__()
        if use_w2v_emb:
            self.word_emb = TokenEmbedding(
                extended_vocab_path='./data/word.dic', unknown_token='OOV')
        else:
            self.word_emb = nn.Embedding(word_num, emb_size)
        self.gru = nn.GRU(emb_size,
                          hidden_size,
                          num_layers=2,
                          direction='bidirect')
        # We need `label_num + 2` to account for the appended BOS and EOS tags.
        self.fc = nn.Linear(hidden_size * 2, label_num + 2)
        self.crf = LinearChainCrf(label_num)
        self.crf_loss = LinearChainCrfLoss(self.crf)
        self.viterbi_decoder = ViterbiDecoder(self.crf.transitions)
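Neither CRF example shows the forward pass. Below is a hedged sketch of how the layers above are typically wired together; the method names, the `lengths` argument, and the return values are assumptions, not part of the original snippet:

    def forward(self, token_ids, lengths, labels):
        embs = self.word_emb(token_ids)  # (batch, seq_len, emb_size)
        output, _ = self.gru(embs)  # (batch, seq_len, hidden_size * 2)
        emission = self.fc(output)  # per-token scores over the label_num + 2 tags
        loss = self.crf_loss(emission, lengths, labels)
        return loss

    def decode(self, token_ids, lengths):
        embs = self.word_emb(token_ids)
        output, _ = self.gru(embs)
        emission = self.fc(output)
        _, prediction = self.viterbi_decoder(emission, lengths)
        return prediction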
Example #13
    def __init__(self,
                 vocab_size,
                 num_classes,
                 vocab_path,
                 emb_dim=300,
                 hidden_size=128,
                 fc_hidden_size=96,
                 use_token_embedding=True):
        super().__init__()
        if use_token_embedding:
            # `args` is assumed to be a module-level argparse namespace that
            # provides the pretrained embedding name.
            self.embedder = TokenEmbedding(
                args.embedding_name, extended_vocab_path=vocab_path)
            emb_dim = self.embedder.embedding_dim
        else:
            padding_idx = vocab_size - 1
            self.embedder = nn.Embedding(
                vocab_size, emb_dim, padding_idx=padding_idx)
        self.bow_encoder = paddlenlp.seq2vec.BoWEncoder(emb_dim)
        self.fc1 = nn.Linear(self.bow_encoder.get_output_dim(), hidden_size)
        self.fc2 = nn.Linear(hidden_size, fc_hidden_size)
        self.dropout = nn.Dropout(p=0.3, axis=1)
        self.output_layer = nn.Linear(fc_hidden_size, num_classes)
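The snippet only builds the layers. A hedged sketch of a matching forward pass; the method signature, where the dropout is applied, and the tanh activations are assumptions:

    def forward(self, text):
        embedded_text = self.dropout(self.embedder(text))  # (batch, seq_len, emb_dim)
        summed = self.bow_encoder(embedded_text)  # bag-of-words sum over the tokens
        fc1_out = paddle.tanh(self.fc1(summed))
        fc2_out = paddle.tanh(self.fc2(fc1_out))
        return self.output_layer(fc2_out)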
Example #14
    def test_trainable(self):
        self.embedding = TokenEmbedding(**self.config)
        self.check_output_not_equal(self.config["trainable"],
                                    self.embedding.weight.stop_gradient)
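In other words, `trainable` and `weight.stop_gradient` are expected to be opposites. A minimal sketch of freezing the pretrained weights:

from paddlenlp.embeddings import TokenEmbedding

emb = TokenEmbedding("w2v.baidu_encyclopedia.target.word-word.dim300", trainable=False)
print(emb.weight.stop_gradient)  # True: the embedding matrix is excluded from gradient updates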
Example #15
def set_log(args):
    set_seeds(args)
    if True:  # originally: if args.do_train:
        # preparing embeddings
        tokens_emb = TokenEmbedding("w2v.baidu_encyclopedia.target.word-word.dim300")
        # preparing train datasets
        assert args.raw_train_file is not None, "--raw_train_file should be set when training!"
        if not os.path.exists(args.train_file):
            process_data(args.raw_train_file, args.train_file, tokens_emb)
        with open(args.train_file, mode="r", encoding="utf-8") as rfp:
            train_ex = json.load(rfp)
        train_dataset = DuReaderDataset(train_ex)
        train_batch_sampler = paddle.io.DistributedBatchSampler(
            train_dataset, batch_size=args.batch_size, shuffle=True)
        train_data_loader = paddle.io.DataLoader(
            dataset=train_dataset,
            batch_sampler=train_batch_sampler,
            collate_fn=batchify,
            return_list=True)
        # preparing dev datasets
        assert args.raw_dev_file is not None, "--raw_dev_file should be set when training!"
        if not os.path.exists(args.dev_file):
            process_data(args.raw_dev_file, args.dev_file, tokens_emb)
        with open(args.dev_file, mode="r", encoding="utf-8") as rfp:
            dev_ex = json.load(rfp)
        dev_dataset = DuReaderDataset(dev_ex)
        dev_batch_sampler = paddle.io.DistributedBatchSampler(
            dev_dataset, batch_size=args.dev_batch_size, shuffle=True)
        dev_data_loader = paddle.io.DataLoader(
            dataset=dev_dataset,
            batch_sampler=dev_batch_sampler,
            collate_fn=batchify,
            return_list=True)

        num_training_steps = args.max_steps if args.max_steps > 0 else len(
            train_data_loader) * args.num_train_epochs
        if paddle.distributed.get_rank() == 0:
            dev_count = paddle.fluid.core.get_cuda_device_count()
            logger.info("Device count: %d" % dev_count)
            logger.info("Num train examples: %d" % len(train_dataset))
            logger.info("Num dev examples: %d" % len(dev_dataset))
            logger.info("Max train steps: %d" % num_training_steps)
        model = DocReader(args)
        model.init_lr_scheduler(args, num_training_steps)
        model.init_optimizer(args)
        model.init_loss(args)

        # Training process
        global_step = 0
        tic_train = time.time()
        for epoch in range(args.num_train_epochs):
            for step, batch in enumerate(train_data_loader):
                global_step += 1
                loss = model.update(batch)
                if global_step % args.logging_steps == 0:
                    logger.info("global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s"
                                % (global_step, epoch, step, loss, args.logging_steps / (time.time() - tic_train)))
                    tic_train = time.time()

                if global_step % args.save_steps == 0 or global_step == num_training_steps:
                    if paddle.distributed.get_rank() == 0:
                        output_dir = os.path.join(args.output_dir, "model_%d" % global_step)
                        if not os.path.exists(output_dir):
                            os.makedirs(output_dir)
                        # need better way to get inner model of DataParallel
                        model_file = output_dir + '.ckpt'
                        model.save(model_file)
        model_file = os.path.join(args.output_dir, args.model_name + "-global.ckpt")
        model.save(model_file)

    if args.do_predict:
        # preparing test datasets
        pass
Example #16
words = jiagu.seg(text)  # word segmentation
print(words)

pos = jiagu.pos(words)  # part-of-speech tagging
print(pos)

ner = jiagu.ner(words)  # named entity recognition
print(ner)

from paddlenlp.datasets import ChnSentiCorp

train_ds, dev_ds, test_ds = ChnSentiCorp.get_datasets(['train', 'dev', 'test'])

from paddlenlp.embeddings import TokenEmbedding

wordemb = TokenEmbedding("w2v.baidu_encyclopedia.target.word-word.dim300")
print(wordemb.cosine_sim("苹果", "香蕉"))

wordemb.cosine_sim("艺术", "火车")

wordemb.cosine_sim("狗", "香蕉")

for token1 in ['狗', '猫', '香蕉']:
    for token2 in ['狗', '猫', '香蕉']:
        print(wordemb.cosine_sim(token1, token2))

vv = wordemb.search(['狗', '猫', '香蕉'])

vv2 = wordemb.search('狗猫香蕉')
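Note that `search` looks up whole tokens: a list yields one row per word, while a single string appears to be treated as one token (falling back to the unknown-token vector when it is out of vocabulary). A small sketch of the assumed shapes:

print(vv.shape)  # expected (3, 300): one 300-dim vector per word in the list
print(vv2.shape)  # expected (1, 300): the whole string is looked up as a single token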