def embed(self,
          sentence_list: Union[Tuple[List[List[str]], ...], List[List[str]]],
          debug: bool = False) -> np.ndarray:
    """
    Batch embed sentences.

    Args:
        sentence_list: Sentence list to embed
        debug: show debug log
    Returns:
        vectorized sentence list

    Example::

        print(token, predicts[i].tolist()[:4])
        [CLS] [0.24250675737857819, 0.04605229198932648, ...]
        from [0.2858668565750122, 0.12927496433258057, ...]
        that [-0.7514970302581787, 0.14548861980438232, ...]
        day [0.32245880365371704, -0.043174318969249725, ...]
        ...
    """
    if self.embed_model is None:
        raise ValueError('need to build model before embedding sentences')
    tensor_x = self.process_x_dataset(sentence_list)
    if debug:
        logger.debug(f'sentence tensor: {tensor_x}')
    embed_results = self.embed_model.predict(tensor_x)
    return embed_results
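# A minimal usage sketch for the method above; the class name `BERTEmbedding`
# and the model folder are illustrative assumptions, not confirmed by this file:
#
#     embedding = BERTEmbedding(model_folder='chinese_L-12_H-768_A-12')
#     predicts = embedding.embed([['from', 'that', 'day']], debug=True)
#     # predicts.shape -> (1, sequence_length, embedding_size)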
def __init__(self,
             w2v_path: str = '',
             w2v_kwargs: Dict[str, Any] = None,
             sequence_length: Union[Tuple[int, ...], str, int] = 128,
             processor: Optional[BaseProcessor] = None,
             trainable: bool = False):
    """
    Args:
        w2v_path: word2vec file path
        w2v_kwargs: params passed to the ``load_word2vec_format()`` function of
            ``gensim.models.KeyedVectors`` -
            https://radimrehurek.com/gensim/models/keyedvectors.html#module-gensim.models.keyedvectors
        sequence_length: ``'auto'``, ``'variable'`` or an integer.
            With ``'auto'``, the 95th percentile of corpus sequence lengths is used.
            With ``'variable'``, the model input shape is set to None so inputs of
            any length are handled; each batch uses the length of its longest sequence.
            With an integer, say ``50``, the input/output sequence length is fixed to 50.
        processor: text processor
        trainable: whether to build the embedding model immediately
    """
    if w2v_kwargs is None:
        w2v_kwargs = {}
    self.w2v_path = w2v_path
    self.w2v_kwargs = w2v_kwargs
    self.w2v = None
    self.w2v_model_loaded = False
    logger.debug('load w2v embedding ...')
    super(WordEmbedding, self).__init__(sequence_length=sequence_length,
                                        embedding_size=0,
                                        processor=processor)
    self._build_token2idx_from_w2v()
    if trainable:
        self._build_model()
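# A usage sketch for the constructor above; the file name is a hypothetical
# local word2vec file in binary format:
#
#     embedding = WordEmbedding(w2v_path='light_Tencent_AILab_ChineseEmbedding.bin',
#                               w2v_kwargs={'binary': True},
#                               sequence_length='auto')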
def _build_model(self, **kwargs):
    if self.embed_model is None:
        seq_len = self.sequence_length
        if isinstance(seq_len, tuple):
            seq_len = seq_len[0]
        config_path = os.path.join(self.model_folder, 'bert_config.json')
        check_point_path = os.path.join(self.model_folder, 'bert_model.ckpt')
        logger.debug('load bert model from %s' % check_point_path)
        bert_model = keras_bert.load_trained_model_from_checkpoint(
            config_path,
            check_point_path,
            seq_len=seq_len,
            output_layer_num=self.layer_nums,
            training=self.training,
            trainable=self.trainable)
        self._model = tf.keras.Model(bert_model.inputs, bert_model.output)
        bert_seq_len = int(bert_model.output.shape[1])
        if bert_seq_len < seq_len:
            logger.warning(f"Sequence length limit set to {bert_seq_len} by pre-trained model")
            self.sequence_length = bert_seq_len
        self.embedding_size = int(bert_model.output.shape[-1])
        output_features = NonMaskingLayer()(bert_model.output)
        self.embed_model = tf.keras.Model(bert_model.inputs, output_features)
        logger.debug(f'seq_len: {self.sequence_length}')
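# Note: with keras_bert, `output_layer_num=N` concatenates the outputs of the
# last N transformer layers, so `embedding_size` is hidden_size * N. A sketch of
# calling the built model (keras_bert models take token ids plus segment ids):
#
#     token_ids, segment_ids = ...  # int arrays of shape (batch, seq_len)
#     features = self.embed_model.predict([token_ids, segment_ids])
#     # features.shape -> (batch, seq_len, embedding_size)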
def _build_token2idx_from_bert(self):
    dict_path = os.path.join(self.model_folder, 'vocab.txt')
    if not os.path.exists(dict_path):
        # vocab file missing: download and extract the pre-trained model first
        model_name = self.model_key_map.get(self.model_folder, 'chinese_L-12_H-768_A-12')
        url = self.pre_trained_models.get(model_name)
        get_file(model_name + ".zip", url,
                 extract=True,
                 cache_dir=text2vec.USER_DATA_DIR,
                 cache_subdir=text2vec.USER_DATA_DIR,
                 verbose=1)
        self.model_folder = os.path.join(text2vec.USER_DATA_DIR, model_name)
        dict_path = os.path.join(self.model_folder, 'vocab.txt')
    logger.debug(f'load vocab.txt from {dict_path}')
    token2idx = {}
    with codecs.open(dict_path, 'r', encoding='utf-8') as f:
        for line in f:
            token = line.strip()
            token2idx[token] = len(token2idx)
    self.bert_token2idx = token2idx
    self.tokenizer = keras_bert.Tokenizer(token2idx)
    self.processor.token2idx = self.bert_token2idx
    self.processor.idx2token = {v: k for k, v in token2idx.items()}
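# The loop above maps each line of vocab.txt to its 0-based line number, e.g.
# for Google's BERT vocab files (exact indices depend on the checkpoint):
#
#     [PAD] -> 0, [UNK] -> 100, [CLS] -> 101, [SEP] -> 102, ...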
def embed(self,
          sentence_list: Union[Tuple[List[List[str]], ...], List[List[str]]],
          debug: bool = False) -> np.ndarray:
    """
    Batch embed sentences.

    Args:
        sentence_list: Sentence list to embed
        debug: show debug log
    Returns:
        vectorized sentence list
    """
    if self.w2v is None:
        raise ValueError('need to build model before embedding sentences')
    embeds = []
    for sentence in sentence_list:
        emb = []
        count = 0
        for word in sentence:
            # skip out-of-vocabulary words (gensim >= 4.0 uses key_to_index)
            if word not in self.w2v.key_to_index:
                continue
            emb.append(self.w2v[word])
            count += 1
        tensor_x = np.array(emb).sum(axis=0)  # sum word vectors along axis 0
        # guard against sentences with no in-vocabulary words
        avg_tensor_x = np.divide(tensor_x, count) if count > 0 else 0.0
        embeds.append(avg_tensor_x)
    embeds = np.array(embeds)
    if debug:
        logger.debug(f'sentence tensor shape: {embeds.shape}')
    return embeds
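# Usage sketch: each sentence is reduced to the mean of its in-vocabulary word
# vectors, so the result is one fixed-size vector per sentence:
#
#     vecs = embedding.embed([['今天', '天气', '很好'], ['hello', 'world']])
#     # vecs.shape -> (2, embedding.embedding_size)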
def _build_token2idx_from_w2v(self):
    if not self.w2v_path or not os.path.exists(self.w2v_path):
        # resolve the model name to a download url and fetch it if needed
        if self.w2v_path in self.model_key_map:
            self.w2v_path = self.model_key_map[self.w2v_path]
        model_dict = self.model_key_map.get(self.w2v_path,
                                            self.model_key_map['w2v-light-tencent-chinese'])
        tar_filename = model_dict.get('tar_filename')
        self.w2v_kwargs = {'binary': model_dict.get('binary')}
        url = model_dict.get('url')
        untar_filename = model_dict.get('untar_filename')
        self.w2v_path = os.path.join(text2vec.USER_DATA_DIR, untar_filename)
        if not os.path.exists(self.w2v_path):
            get_file(tar_filename, url,
                     extract=True,
                     cache_dir=text2vec.USER_DIR,
                     cache_subdir=text2vec.USER_DATA_DIR,
                     verbose=1)
    t0 = time.time()
    w2v = KeyedVectors.load_word2vec_format(self.w2v_path, **self.w2v_kwargs)
    # w2v.init_sims(replace=True)
    logger.debug('load w2v from %s, took %.2f s' % (self.w2v_path, time.time() - t0))

    # reserve the first four indices for special tokens
    token2idx = {
        self.processor.token_pad: 0,
        self.processor.token_unk: 1,
        self.processor.token_bos: 2,
        self.processor.token_eos: 3
    }
    for token in w2v.key_to_index:
        token2idx[token] = len(token2idx)

    vector_matrix = np.zeros((len(token2idx), w2v.vector_size))
    vector_matrix[1] = np.random.rand(w2v.vector_size)  # random vector for the unk token
    vector_matrix[4:] = w2v.vectors  # word vectors start right after the special tokens

    self.embedding_size = w2v.vector_size
    self.w2v_vector_matrix = vector_matrix
    self.w2v_token2idx = token2idx
    self.w2v_top_words = w2v.index_to_key[:50]
    self.w2v_model_loaded = True
    self.w2v = w2v

    self.processor.token2idx = self.w2v_token2idx
    self.processor.idx2token = {v: k for k, v in self.w2v_token2idx.items()}
    logger.debug('word count: {}'.format(len(self.w2v_vector_matrix)))
    logger.debug('emb size: {}'.format(self.embedding_size))
    logger.debug('top 50 words: {}'.format(self.w2v_top_words))
    logger.debug('filter stopwords: {}, count: {}'.format(
        sorted(list(self.stopwords))[:10], len(self.stopwords)))
    self.tokenizer = Tokenizer()
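# Resulting index layout (special tokens first, then the w2v vocabulary in its
# original order), matching the `vector_matrix[4:]` assignment above:
#
#     idx 0:  token_pad  (zero vector)
#     idx 1:  token_unk  (random vector)
#     idx 2:  token_bos  (zero vector)
#     idx 3:  token_eos  (zero vector)
#     idx 4+: w2v words  (pre-trained vectors)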
def sequence_length(self, val: Union[int, str]):
    if isinstance(val, str):
        if val == 'auto':
            logger.debug("Sequence length will be auto-set to the 95th percentile of corpus sequence lengths")
        elif val == 'variable':
            val = None
        else:
            raise ValueError("sequence_length must be an integer, 'auto' or 'variable'")
    self.processor.sequence_length = val
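# Usage sketch for the setter above (assuming it is wired up as a property setter):
#
#     embedding.sequence_length = 50          # fixed length
#     embedding.sequence_length = 'variable'  # stored as None; each batch uses its max length
#     embedding.sequence_length = 'auto'      # 95th percentile of corpus lengths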
def embed(self,
          sentence_list: Union[Tuple[List[List[str]], ...], List[List[str]]],
          debug: bool = False) -> np.ndarray:
    """
    Batch embed sentences.

    Args:
        sentence_list: Sentence list to embed
        debug: show debug log
    Returns:
        vectorized sentence list
    """
    if self.w2v is None:
        raise ValueError('need to build model before embedding sentences')
    embeds = []
    for sentence in sentence_list:
        emb = []
        count = 0
        for word in sentence:
            # filter stopwords
            if word in self.stopwords:
                continue
            # look up the word vector
            if word in self.w2v.key_to_index:
                emb.append(self.w2v.get_vector(word, norm=True))
                count += 1
            else:
                if len(word) == 1:
                    continue
                # OOV word: re-segment it, e.g. 特价机票
                ws = self.tokenizer.tokenize(word, cut_all=True)
                for w in ws:
                    if w in self.w2v.key_to_index:
                        emb.append(self.w2v.get_vector(w, norm=True))
                        count += 1
        tensor_x = np.array(emb).sum(axis=0)  # sum word vectors along axis 0
        if count > 0:
            avg_tensor_x = np.divide(tensor_x, count)
        else:
            avg_tensor_x = 0.0
        embeds.append(avg_tensor_x)
    embeds = np.array(embeds)
    if debug:
        logger.debug(f'sentence tensor shape: {embeds.shape}')
    return embeds
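# Unlike the plain averaging variant, this version drops stopwords and retries
# out-of-vocabulary words by re-segmenting them in cut_all mode: e.g. if
# '特价机票' is not in the vocabulary, it is split into segments such as
# ['特价', '机票'], and any segment found in the vocabulary contributes to the
# sentence average.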
def _build_model(self, **kwargs):
    if self.embed_model is None:
        from tensorflow import keras
        if self.token_count == 0:
            logger.debug('need to build after build_word2idx')
        else:
            input_tensor = keras.layers.Input(shape=(self.sequence_length,),
                                              name='input')
            layer_embedding = keras.layers.Embedding(self.token_count,
                                                     self.embedding_size,
                                                     weights=[self.w2v_vector_matrix],
                                                     trainable=False,
                                                     name='layer_embedding')
            embedded_tensor = layer_embedding(input_tensor)
            self.embed_model = keras.Model(input_tensor, embedded_tensor)
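# A sketch of using the built lookup model: feed padded token-id sequences and
# get back the frozen pre-trained vectors (the ids here are hypothetical):
#
#     token_ids = np.array([[4, 5, 6, 0, 0]])  # shape (1, sequence_length)
#     out = self.embed_model.predict(token_ids)
#     # out.shape -> (1, sequence_length, embedding_size)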