Example #1
    def embed(self,
              sentence_list: Union[Tuple[List[List[str]], ...], List[List[str]]],
              debug: bool = False) -> np.ndarray:
        """
        batch embed sentences

        Args:
            sentence_list: Sentence list to embed
            debug: show debug log
        Returns:
            vectorized sentence list

            print(token, predicts[i].tolist()[:4])
            [CLS] [0.24250675737857819, 0.04605229198932648, ...]
            from [0.2858668565750122, 0.12927496433258057,  ...]
            that [-0.7514970302581787, 0.14548861980438232, ...]
            day [0.32245880365371704, -0.043174318969249725, ...]
            ...
        """
        if self.embed_model is None:
            raise ValueError('need to build model for embed sentence')

        tensor_x = self.process_x_dataset(sentence_list)
        if debug:
            logger.debug(f'sentence tensor: {tensor_x}')
        embed_results = self.embed_model.predict(tensor_x)
        return embed_results
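A standalone sketch of the per-token inspection shown in the docstring above; the token list and the random matrix standing in for the `embed_model.predict()` output are illustrative placeholders:

import numpy as np

tokens = ['[CLS]', 'from', 'that', 'day']
predicts = np.random.rand(len(tokens), 768)  # stands in for the model's per-token output
for i, token in enumerate(tokens):
    print(token, predicts[i].tolist()[:4])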
Example #2
    def __init__(self,
                 w2v_path: str = '',
                 w2v_kwargs: Dict[str, Any] = None,
                 sequence_length: Union[Tuple[int, ...], str, int] = 128,
                 processor: Optional[BaseProcessor] = None,
                 trainable: bool = False):
        """

        Args:
            w2v_path: word2vec file path
            w2v_kwargs: params pass to the ``load_word2vec_format()`` function of ``gensim.models.KeyedVectors`` -
                https://radimrehurek.com/gensim/models/keyedvectors.html#module-gensim.models.keyedvectors
            sequence_length: ``'auto'``, ``'variable'`` or integer. When using ``'auto'``, use the 95% of corpus length
                as sequence length. When using ``'variable'``, model input shape will set to None, which can handle
                various length of input, it will use the length of max sequence in every batch for sequence length.
                If using an integer, let's say ``50``, the input output sequence length will set to 50.
            processor:
        """
        if w2v_kwargs is None:
            w2v_kwargs = {}
        self.w2v_path = w2v_path
        self.w2v_kwargs = w2v_kwargs
        self.w2v = None
        self.w2v_model_loaded = False
        logger.debug('load w2v embedding ...')
        super(WordEmbedding, self).__init__(sequence_length=sequence_length,
                                            embedding_size=0,
                                            processor=processor)
        self._build_token2idx_from_w2v()
        if trainable:
            self._build_model()
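The ``w2v_kwargs`` dict is forwarded verbatim to gensim. A minimal sketch of what that amounts to, assuming a local word2vec file (the path and the ``limit`` value are placeholders):

from gensim.models import KeyedVectors

w2v_kwargs = {'binary': True, 'limit': 50000}  # e.g. binary format, load only the first 50k vectors
w2v = KeyedVectors.load_word2vec_format('path/to/vectors.bin', **w2v_kwargs)
print(w2v.vector_size, len(w2v.key_to_index))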
Example #3
    def _build_model(self, **kwargs):
        if self.embed_model is None:
            seq_len = self.sequence_length
            if isinstance(seq_len, tuple):
                seq_len = seq_len[0]
            config_path = os.path.join(self.model_folder, 'bert_config.json')
            check_point_path = os.path.join(self.model_folder,
                                            'bert_model.ckpt')
            logger.debug('load bert model from %s' % check_point_path)
            bert_model = keras_bert.load_trained_model_from_checkpoint(
                config_path,
                check_point_path,
                seq_len=seq_len,
                output_layer_num=self.layer_nums,
                training=self.training,
                trainable=self.trainable)

            self._model = tf.keras.Model(bert_model.inputs, bert_model.output)
            bert_seq_len = int(bert_model.output.shape[1])
            if bert_seq_len < seq_len:
                logger.warning(
                    f"Sequence length limit set to {bert_seq_len} by pre-trained model"
                )
                self.sequence_length = bert_seq_len
            self.embedding_size = int(bert_model.output.shape[-1])
            output_features = NonMaskingLayer()(bert_model.output)
            self.embed_model = tf.keras.Model(bert_model.inputs,
                                              output_features)
            logger.debug(f'seq_len: {self.sequence_length}')
Example #4
    def _build_token2idx_from_bert(self):
        dict_path = os.path.join(self.model_folder, 'vocab.txt')
        if not os.path.exists(dict_path):
            model_name = self.model_key_map.get(self.model_folder,
                                                'chinese_L-12_H-768_A-12')
            url = self.pre_trained_models.get(model_name)
            get_file(model_name + ".zip",
                     url,
                     extract=True,
                     cache_dir=text2vec.USER_DATA_DIR,
                     cache_subdir=text2vec.USER_DATA_DIR,
                     verbose=1)
            self.model_folder = os.path.join(text2vec.USER_DATA_DIR,
                                             model_name)
            dict_path = os.path.join(self.model_folder, 'vocab.txt')
        logger.debug(f'load vocab.txt from {dict_path}')
        token2idx = {}
        with codecs.open(dict_path, 'r', encoding='utf-8') as f:
            for line in f:
                token = line.strip()
                token2idx[token] = len(token2idx)

        self.bert_token2idx = token2idx
        self.tokenizer = keras_bert.Tokenizer(token2idx)
        self.processor.token2idx = self.bert_token2idx
        self.processor.idx2token = {value: key for key, value in token2idx.items()}
Example #5
    def embed(self,
              sentence_list: Union[Tuple[List[List[str]], ...],
                                   List[List[str]]],
              debug: bool = False) -> np.ndarray:
        """
        batch embed sentences

        Args:
            sentence_list: Sentence list to embed
            debug: show debug log
        Returns:
            vectorized sentence list
        """
        if self.w2v is None:
            raise ValueError('need to build model for embed sentence')

        embeds = []
        for sentence in sentence_list:
            emb = []
            count = 0
            for word in sentence:
                if word not in self.w2v.key_to_index:  # gensim >= 4.0 vocabulary lookup
                    continue
                emb.append(self.w2v[word])
                count += 1
            tensor_x = np.array(emb).sum(axis=0)  # sum the word vectors element-wise
            avg_tensor_x = np.divide(tensor_x, count)
            embeds.append(avg_tensor_x)
        embeds = np.array(embeds)
        if debug:
            logger.debug(f'sentence tensor shape: {embeds.shape}')
        return embeds
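A standalone numpy sketch of the averaging above: the sentence vector is the element-wise mean of the word vectors that were found in the vocabulary (the toy vectors are placeholders):

import numpy as np

word_vectors = {
    'hello': np.array([0.1, 0.2, 0.3]),
    'world': np.array([0.4, 0.0, -0.1]),
}
sentence = ['hello', 'world', 'oov-word']  # 'oov-word' is skipped, just like above

emb = [word_vectors[w] for w in sentence if w in word_vectors]
sentence_vector = np.array(emb).sum(axis=0) / len(emb)
print(sentence_vector)  # roughly [0.25 0.1 0.1]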
Example #6
    def _build_token2idx_from_w2v(self):
        if not self.w2v_path or not os.path.exists(self.w2v_path):
            if self.w2v_path in self.model_key_map:
                self.w2v_path = self.model_key_map[self.w2v_path]
            model_dict = self.model_key_map.get(
                self.w2v_path, self.model_key_map['w2v-light-tencent-chinese'])
            tar_filename = model_dict.get('tar_filename')
            self.w2v_kwargs = {'binary': model_dict.get('binary')}
            url = model_dict.get('url')
            untar_filename = model_dict.get('untar_filename')
            self.w2v_path = os.path.join(text2vec.USER_DATA_DIR,
                                         untar_filename)
            if not os.path.exists(self.w2v_path):
                get_file(tar_filename,
                         url,
                         extract=True,
                         cache_dir=text2vec.USER_DIR,
                         cache_subdir=text2vec.USER_DATA_DIR,
                         verbose=1)
        t0 = time.time()
        w2v = KeyedVectors.load_word2vec_format(self.w2v_path,
                                                **self.w2v_kwargs)
        # w2v.init_sims(replace=True)
        logger.debug('load w2v from %s, spend %s s' %
                     (self.w2v_path, time.time() - t0))
        token2idx = {
            self.processor.token_pad: 0,
            self.processor.token_unk: 1,
            self.processor.token_bos: 2,
            self.processor.token_eos: 3
        }

        for token in w2v.key_to_index:
            token2idx[token] = len(token2idx)

        vector_matrix = np.zeros((len(token2idx), w2v.vector_size))
        vector_matrix[1] = np.random.rand(w2v.vector_size)
        vector_matrix[4:] = w2v.vectors

        self.embedding_size = w2v.vector_size
        self.w2v_vector_matrix = vector_matrix
        self.w2v_token2idx = token2idx
        self.w2v_top_words = w2v.index_to_key[:50]
        self.w2v_model_loaded = True
        self.w2v = w2v

        self.processor.token2idx = self.w2v_token2idx
        self.processor.idx2token = {value: key for key, value in self.w2v_token2idx.items()}
        logger.debug('word count: {}'.format(len(self.w2v_vector_matrix)))
        logger.debug('emb size: {}'.format(self.embedding_size))
        logger.debug('top 50 word: {}'.format(self.w2v_top_words))
        logger.debug('filter stopwords: {}, count: {}'.format(
            sorted(list(self.stopwords))[:10], len(self.stopwords)))

        self.tokenizer = Tokenizer()
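A standalone sketch of the embedding-matrix layout built above: four reserved rows (PAD=0, UNK=1, BOS=2, EOS=3), then the word2vec vectors from row 4 onward; the token names and toy vectors are placeholders:

import numpy as np

w2v_vectors = np.array([[1., 1., 1.],    # stands in for w2v.vectors
                        [2., 2., 2.]])
token2idx = {'<PAD>': 0, '<UNK>': 1, '<BOS>': 2, '<EOS>': 3, 'foo': 4, 'bar': 5}

vector_matrix = np.zeros((len(token2idx), w2v_vectors.shape[1]))
vector_matrix[1] = np.random.rand(w2v_vectors.shape[1])  # UNK gets a random vector
vector_matrix[4:] = w2v_vectors                           # real word vectors start at row 4
print(vector_matrix.shape)  # (6, 3)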
Example #7
    def sequence_length(self, val: Union[int, str]):
        if isinstance(val, str):
            if val == 'auto':
                logger.debug("Sequence length will be set automatically to cover 95% of the corpus")
            elif val == 'variable':
                val = None
            else:
                raise ValueError("sequence_length must be an int or 'auto' or 'variable'")
        self.processor.sequence_length = val
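A standalone sketch of what the three sequence_length settings mean for the model input shape (the Keras wiring of the surrounding class is not repeated here):

from tensorflow import keras

fixed = keras.layers.Input(shape=(50,))       # sequence_length=50: fixed-length input
variable = keras.layers.Input(shape=(None,))  # sequence_length='variable' -> None: any length per batch
# sequence_length='auto': a fixed length is derived from the corpus so that ~95% of sentences fit
print(fixed.shape, variable.shape)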
Example #8
    def embed(self,
              sentence_list: Union[Tuple[List[List[str]], ...],
                                   List[List[str]]],
              debug: bool = False) -> np.ndarray:
        """
        batch embed sentences

        Args:
            sentence_list: Sentence list to embed
            debug: show debug log
        Returns:
            vectorized sentence list
        """
        if self.w2v is None:
            raise ValueError('need to build model for embed sentence')

        embeds = []
        for sentence in sentence_list:
            emb = []
            count = 0
            for word in sentence:
                # skip stopwords
                if word in self.stopwords:
                    continue
                # look up the word vector
                if word in self.w2v.key_to_index:
                    emb.append(self.w2v.get_vector(word, norm=True))
                    count += 1
                else:
                    if len(word) == 1:
                        continue
                    # segment the word again, e.g. '特价机票'
                    ws = self.tokenizer.tokenize(word, cut_all=True)
                    for w in ws:
                        if w in self.w2v.key_to_index:
                            emb.append(self.w2v.get_vector(w, norm=True))
                            count += 1
            tensor_x = np.array(emb).sum(axis=0)  # sum the word vectors element-wise
            if count > 0:
                avg_tensor_x = np.divide(tensor_x, count)
            else:
                avg_tensor_x = np.zeros(self.w2v.vector_size)  # no in-vocabulary words: fall back to a zero vector
            embeds.append(avg_tensor_x)
        embeds = np.array(embeds)
        if debug:
            logger.debug(f'sentence tensor shape: {embeds.shape}')
        return embeds
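A standalone sketch of the OOV fallback above, using jieba directly (the project's Tokenizer is assumed to wrap a comparable full-mode cut):

import jieba

word = '特价机票'                        # not found in the vocabulary as a whole word
pieces = jieba.lcut(word, cut_all=True)  # full mode emits all word candidates, e.g. '特价' and '机票'
print(pieces)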
Example #9
    def _build_model(self, **kwargs):
        if self.embed_model is None:
            from tensorflow import keras
            if self.token_count == 0:
                logger.debug('need to build token2idx before building the embedding model')
            else:
                input_tensor = keras.layers.Input(
                    shape=(self.sequence_length, ), name='input')
                layer_embedding = keras.layers.Embedding(
                    self.token_count,
                    self.embedding_size,
                    weights=[self.w2v_vector_matrix],
                    trainable=False,
                    name='layer_embedding')

                embedded_tensor = layer_embedding(input_tensor)
                self.embed_model = keras.Model(input_tensor, embedded_tensor)
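A standalone sketch of the frozen lookup model built above: token ids are mapped to pre-trained vectors by a non-trainable Embedding layer, using the same Keras-2-style ``weights=[...]`` argument as the snippet (the toy matrix and sizes are placeholders):

import numpy as np
from tensorflow import keras

vector_matrix = np.random.rand(10, 4)  # stands in for w2v_vector_matrix: 10 tokens, embedding size 4

input_tensor = keras.layers.Input(shape=(5,), name='input')  # sequence_length = 5
embedded_tensor = keras.layers.Embedding(10, 4,
                                         weights=[vector_matrix],
                                         trainable=False,
                                         name='layer_embedding')(input_tensor)
embed_model = keras.Model(input_tensor, embedded_tensor)

token_ids = np.array([[1, 2, 3, 0, 0]])
print(embed_model.predict(token_ids).shape)  # (1, 5, 4)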