Example #1
0
def test_word_emb():
    """Smoke-test WordEmbedding: embed one whitespace-tokenized sentence and
    print the resulting array and its shape."""
    embedder = WordEmbedding()
    sentence = '你 好 啊'.split(' ')
    # embed() takes a batch (list of token lists); True presumably enables
    # some debug/verbose flag — confirm against WordEmbedding.embed.
    vectors = embedder.embed([sentence], True)

    print(vectors)
    print(vectors.shape)
Example #2
0
 def load_model(self):
     """Lazily build the embedding backend selected by ``self.embedding_type``.

     The constructed model is cached on ``self.model``; subsequent calls
     are no-ops.

     Raises:
         ValueError: if ``self.embedding_type`` names no known backend.
     """
     if self.model:
         return
     if self.embedding_type == EmbType.BERT:
         from text2vec.embeddings.bert_embedding import BERTEmbedding
         self.model = BERTEmbedding(sequence_length=128)
         return
     if self.embedding_type == EmbType.W2V:
         from text2vec.embeddings.word_embedding import WordEmbedding
         self.model = WordEmbedding()
         return
     raise ValueError('set error embedding type.')
Example #3
0
class Vector(object):
    """Thin facade over an embedding backend.

    The heavy model object is built lazily on first use (``load_model``),
    so constructing a ``Vector`` is cheap.
    """

    def __init__(self, embedding_type='w2v'):
        self.embedding_type = embedding_type  # which backend to build lazily
        self.model = None  # created on first use by load_model()

    def load_model(self):
        """Instantiate the configured backend once; later calls are no-ops.

        Raises:
            ValueError: for an unrecognized ``embedding_type``.
        """
        if self.model:
            return
        if self.embedding_type == EmbType.BERT:
            from text2vec.embeddings.bert_embedding import BERTEmbedding
            self.model = BERTEmbedding(sequence_length=128)
        elif self.embedding_type == EmbType.W2V:
            from text2vec.embeddings.word_embedding import WordEmbedding
            self.model = WordEmbedding()
        else:
            raise ValueError('set error embedding type.')

    def tokenize(self, text):
        """Tokenize *text* with the backend tokenizer; blank input gives []."""
        if not text.strip():
            return []
        self.load_model()
        return self.model.tokenizer.tokenize(text)

    def encode(self, tokens):
        """Embed *tokens* (a string or a token list).

        Empty input short-circuits to 0.0 without loading the model.
        """
        if not tokens:
            return 0.0
        self.load_model()
        if isinstance(tokens, str):
            tokens = self.tokenize(tokens)
        return self.model.embed_one(tokens)
Example #4
0
 def load_model(self):
     """Build the configured embedding backend once and cache it on
     ``self.model``; repeated calls do nothing.

     Raises:
         ValueError: if ``self.embedding_type`` names no known backend.
     """
     if self.model:
         return
     if self.embedding_type == EmbType.BERT:
         from text2vec.embeddings.bert_embedding import BERTEmbedding
         self.model = BERTEmbedding(
             model_folder=self.bert_model_folder,
             layer_nums=self.bert_layer_nums,
             trainable=self.trainable,
             sequence_length=self.sequence_length,
             processor=self.processor,
         )
     elif self.embedding_type == EmbType.W2V:
         from text2vec.embeddings.word_embedding import WordEmbedding
         self.model = WordEmbedding(
             w2v_path=self.w2v_path,
             w2v_kwargs=self.w2v_kwargs,
             sequence_length=self.sequence_length,
             processor=self.processor,
             trainable=self.trainable,
         )
     else:
         raise ValueError('set error embedding type.')
Example #5
0
class Vector(object):
    """Configurable, lazily-constructed embedding front-end.

    Only configuration is stored at construction time; the heavy backend
    model is built on first use by ``load_model``.
    """

    def __init__(self,
                 embedding_type=EmbType.W2V,
                 w2v_path='',
                 w2v_kwargs=None,
                 sequence_length=128,
                 processor=None,
                 trainable=False,
                 bert_model_folder='',
                 bert_layer_nums=4):
        self.embedding_type = embedding_type
        self.w2v_path = w2v_path
        # downstream default when None: {binary: False}
        self.w2v_kwargs = w2v_kwargs
        self.sequence_length = sequence_length
        self.processor = processor
        self.trainable = trainable
        self.bert_model_folder = bert_model_folder
        self.bert_layer_nums = bert_layer_nums
        self.model = None  # built lazily by load_model()

    def load_model(self):
        """Create the configured backend on first call; later calls no-op.

        Raises:
            ValueError: for an unrecognized ``embedding_type``.
        """
        if self.model:
            return
        if self.embedding_type == EmbType.BERT:
            from text2vec.embeddings.bert_embedding import BERTEmbedding
            self.model = BERTEmbedding(
                model_folder=self.bert_model_folder,
                layer_nums=self.bert_layer_nums,
                trainable=self.trainable,
                sequence_length=self.sequence_length,
                processor=self.processor)
        elif self.embedding_type == EmbType.W2V:
            from text2vec.embeddings.word_embedding import WordEmbedding
            self.model = WordEmbedding(
                w2v_path=self.w2v_path,
                w2v_kwargs=self.w2v_kwargs,
                sequence_length=self.sequence_length,
                processor=self.processor,
                trainable=self.trainable)
        else:
            raise ValueError('set error embedding type.')

    def tokenize(self, text):
        """Lower-case, strip and tokenize *text*; blank input gives []."""
        if not text.strip():
            return []
        self.load_model()
        return self.model.tokenizer.tokenize(text.lower().strip())

    def encode(self, tokens):
        """Embed *tokens* (a string or a token list).

        Empty input short-circuits to 0.0 without loading the model.
        """
        if not tokens:
            return 0.0
        self.load_model()
        if isinstance(tokens, str):
            tokens = self.tokenize(tokens)
        return self.model.embed_one(tokens)
Example #6
0
 def setUpClass(cls):
     """Build one shared WordEmbedding fixture for every test in the class."""
     from text2vec.embeddings.word_embedding import WordEmbedding
     shared = WordEmbedding(sequence_length=SEQUENCE_LENGTH)
     cls.embedding = shared
Example #7
0
# -*- coding: utf-8 -*-
"""
@author:XuMing([email protected])
@description: 
"""
from text2vec.embeddings.word_embedding import WordEmbedding

if __name__ == '__main__':
    # Demo: embed one whitespace-tokenized sentence and show the result.
    embedder = WordEmbedding()
    sentence = '你 好 啊'.split(' ')
    # embed() takes a batch (list of token lists); True presumably enables
    # a debug/verbose flag — confirm against WordEmbedding.embed.
    vectors = embedder.embed([sentence], True)

    print(vectors)
    print(vectors.shape)