from text2vec.embeddings.word_embedding import WordEmbedding


def test_word_emb():
    b = WordEmbedding()
    data1 = '你 好 啊'.split(' ')
    # embed one pre-tokenized sentence; the second positional argument
    # appears to be a debug flag in this API
    r = b.embed([data1], True)
    print(r)
    print(r.shape)
class EmbType(object):
    # Assumed constants: EmbType is referenced but not defined in this snippet;
    # the values follow the 'w2v' default used in Vector.__init__ below.
    BERT = 'bert'
    W2V = 'w2v'


# Minimal variant of the Vector wrapper: lazy-loads the embedding model.
class Vector(object):
    def __init__(self, embedding_type='w2v'):
        self.embedding_type = embedding_type
        self.model = None

    def load_model(self):
        # Load the embedding model on first use
        if not self.model:
            if self.embedding_type == EmbType.BERT:
                from text2vec.embeddings.bert_embedding import BERTEmbedding
                self.model = BERTEmbedding(sequence_length=128)
            elif self.embedding_type == EmbType.W2V:
                from text2vec.embeddings.word_embedding import WordEmbedding
                self.model = WordEmbedding()
            else:
                raise ValueError('Unsupported embedding type: %s' % self.embedding_type)

    def tokenize(self, text):
        if not text.strip():
            return []
        self.load_model()
        return self.model.tokenizer.tokenize(text)

    def encode(self, tokens):
        # Returns 0.0 for empty input rather than raising
        ret = 0.0
        if not tokens:
            return ret
        self.load_model()
        if isinstance(tokens, str):
            tokens = self.tokenize(tokens)
        return self.model.embed_one(tokens)
# Extended variant of Vector: configurable model paths and hyperparameters.
class Vector(object):
    def __init__(self, embedding_type=EmbType.W2V, w2v_path='', w2v_kwargs=None,
                 sequence_length=128, processor=None, trainable=False,
                 bert_model_folder='', bert_layer_nums=4):
        self.embedding_type = embedding_type
        self.w2v_path = w2v_path
        self.w2v_kwargs = w2v_kwargs  # default: {'binary': False}
        self.sequence_length = sequence_length
        self.processor = processor
        self.trainable = trainable
        self.bert_model_folder = bert_model_folder
        self.bert_layer_nums = bert_layer_nums
        self.model = None

    def load_model(self):
        # Load the embedding model on first use
        if not self.model:
            if self.embedding_type == EmbType.BERT:
                from text2vec.embeddings.bert_embedding import BERTEmbedding
                self.model = BERTEmbedding(
                    model_folder=self.bert_model_folder,
                    layer_nums=self.bert_layer_nums,
                    trainable=self.trainable,
                    sequence_length=self.sequence_length,
                    processor=self.processor)
            elif self.embedding_type == EmbType.W2V:
                from text2vec.embeddings.word_embedding import WordEmbedding
                self.model = WordEmbedding(
                    w2v_path=self.w2v_path,
                    w2v_kwargs=self.w2v_kwargs,
                    sequence_length=self.sequence_length,
                    processor=self.processor,
                    trainable=self.trainable)
            else:
                raise ValueError('Unsupported embedding type: %s' % self.embedding_type)

    def tokenize(self, text):
        if not text.strip():
            return []
        self.load_model()
        return self.model.tokenizer.tokenize(text.lower().strip())

    def encode(self, tokens):
        # Returns 0.0 for empty input rather than raising
        ret = 0.0
        if not tokens:
            return ret
        self.load_model()
        if isinstance(tokens, str):
            tokens = self.tokenize(tokens)
        return self.model.embed_one(tokens)
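# A minimal usage sketch (not from the original source): it assumes Vector and
# EmbType are importable as defined above and that a word2vec file exists at a
# local path; 'w2v.txt' and the variable names here are hypothetical.
if __name__ == '__main__':
    vec = Vector(embedding_type=EmbType.W2V,
                 w2v_path='w2v.txt',            # hypothetical model path
                 w2v_kwargs={'binary': False},  # matches the default noted above
                 sequence_length=128)
    tokens = vec.tokenize('你 好 啊')
    print(tokens)
    emb = vec.encode('你 好 啊')  # str input is tokenized internally
    print(emb)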
def setUpClass(cls):
    from text2vec.embeddings.word_embedding import WordEmbedding
    cls.embedding = WordEmbedding(sequence_length=SEQUENCE_LENGTH)
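# A sketch of the surrounding test class (assumed, not from the original
# source): setUpClass is a unittest hook, so it needs @classmethod on a
# TestCase subclass; SEQUENCE_LENGTH is taken to be a module-level constant
# and its value here is hypothetical.
import unittest

SEQUENCE_LENGTH = 128  # assumed value


class TestWordEmbedding(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        from text2vec.embeddings.word_embedding import WordEmbedding
        cls.embedding = WordEmbedding(sequence_length=SEQUENCE_LENGTH)

    def test_embed_shape(self):
        tokens = '你 好 啊'.split(' ')
        r = self.embedding.embed([tokens])
        # one sentence in, one embedding row out
        self.assertEqual(r.shape[0], 1)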
# -*- coding: utf-8 -*-
"""
@author:XuMing([email protected])
@description:
"""
from text2vec.embeddings.word_embedding import WordEmbedding

if __name__ == '__main__':
    b = WordEmbedding()
    data1 = '你 好 啊'.split(' ')
    r = b.embed([data1], True)
    print(r)
    print(r.shape)