예제 #1
0
    def build_vocab_generator(self, generators: List[CorpusGenerator]) -> None:
        """Build the vocab dict from corpus generators, if not built already.

        Counts token frequencies over sentences (or labels, when
        ``self.build_vocab_from_labels`` is set), then assigns indices to the
        most frequent tokens that reach ``self.min_count``, on top of the
        initial special-token vocab. Populates ``self.vocab2idx`` and
        ``self.idx2vocab``; no-op when a vocab already exists.
        """
        if self.vocab2idx:
            return

        vocab2idx = self._initial_vocab_dic

        # Count token occurrences across all generators.
        token2count: Dict[str, int] = {}
        for generator in generators:
            for sentence, label in tqdm.tqdm(
                    generator, desc="Preparing text vocab dict"):
                target = label if self.build_vocab_from_labels else sentence
                for token in target:
                    token2count[token] = token2count.get(token, 0) + 1

        # Most frequent tokens receive the smallest indices.
        sorted_counts = sorted(token2count.items(),
                               key=operator.itemgetter(1),
                               reverse=True)
        ordered_counts = collections.OrderedDict(sorted_counts)

        for token, count in ordered_counts.items():
            if token not in vocab2idx and count >= self.min_count:
                vocab2idx[token] = len(vocab2idx)

        self.vocab2idx = vocab2idx
        self.idx2vocab = {idx: token for token, idx in self.vocab2idx.items()}

        top_k_vocab = [token for token, _ in list(self.vocab2idx.items())[:10]]
        logger.debug(
            f"--- Build vocab dict finished, Total: {len(self.vocab2idx)} ---"
        )
        logger.debug(f"Top-10: {top_k_vocab}")
예제 #2
0
    def get_seq_length_from_corpus(self,
                                   generators: List[CorpusGenerator],
                                   *,
                                   use_label: bool = False,
                                   cover_rate: float = 0.95) -> int:
        """
        Calculate proper sequence length according to the corpus

        Args:
            generators: corpus generators yielding (sentence, label) pairs.
            use_label: measure label sequences instead of sentences.
            cover_rate: fraction of samples the returned length should cover;
                1.0 means cover the longest sample.

        Returns:
            The sequence length that covers ``cover_rate`` of the samples.

        Raises:
            ValueError: if the generators yield no samples at all.
        """
        seq_lens = []
        for gen in generators:
            for sentence, label in tqdm.tqdm(
                    gen, desc="Calculating sequence length"):
                target = label if use_label else sentence
                seq_lens.append(len(target))

        if not seq_lens:
            # Previously an empty corpus crashed with an opaque IndexError
            # on the sorted-list lookup below.
            raise ValueError(
                'Cannot calculate sequence length from an empty corpus')

        if cover_rate == 1.0:
            target_index = -1
        else:
            # Clamp so that cover_rate values at or above 1.0 (after float
            # rounding) can never index past the end of the list.
            target_index = min(int(cover_rate * len(seq_lens)),
                               len(seq_lens) - 1)
        sequence_length = sorted(seq_lens)[target_index]
        logger.debug(f'Calculated sequence length = {sequence_length}')
        return sequence_length
예제 #3
0
    def load_data(
            cls,
            subset_name: str = 'train',
            shuffle: bool = True) -> Tuple[List[List[str]], List[List[str]]]:
        """
        Load dataset as sequence labeling format, char level tokenized

        Args:
            subset_name: {train, test, valid}
            shuffle: should shuffle or not, default True.

        Returns:
            dataset_features and dataset labels
        """
        corpus_path = get_file(cls.__corpus_name__,
                               cls.__zip_file__name,
                               cache_dir=K.DATA_PATH,
                               untar=True)

        # Any subset other than train/test falls back to the dev split.
        subset_files = {'train': 'example.train', 'test': 'example.test'}
        file_name = subset_files.get(subset_name, 'example.dev')
        file_path = os.path.join(corpus_path, file_name)

        x_data, y_data = DataReader.read_conll_format_file(file_path)
        if shuffle:
            x_data, y_data = utils.unison_shuffled_copies(x_data, y_data)
        logger.debug(
            f"loaded {len(x_data)} samples from {file_path}. Sample:\n"
            f"x[0]: {x_data[0]}\n"
            f"y[0]: {y_data[0]}")
        return x_data, y_data
예제 #4
0
    def embed(self,
              sentences: List[List[str]],
              *,
              debug: bool = False) -> np.ndarray:
        """
        batch embed sentences

        Args:
            sentences: Sentence list to embed
            debug: show debug info
        Returns:
            vectorized sentence list
        Raises:
            ValueError: if no text processor has been set up yet.
        """
        if self._text_processor is None:
            raise ValueError(
                'Need to setup the `embedding.setup_text_processor` before calling the embed function.'
            )

        # Convert raw token sequences into model-ready tensors.
        input_tensor = self._text_processor.transform(
            sentences,
            segment=self.segment,
            seq_length=self.max_position)
        if debug:
            logger.debug(f'sentence tensor: {input_tensor}')
        return self.embed_model.predict(input_tensor)
예제 #5
0
    def test_with_model(self):
        """Build a BiGRU model on the embedding, train one epoch and save it."""
        x, y = SMP2018ECDTCorpus.load_data('test')
        embedding = self.build_embedding()

        model = BiGRU_Model(embedding=embedding)
        model.build_model(x, y)

        # Capture the keras summary into the debug log instead of stdout.
        summary_lines: list = []
        embedding.embed_model.summary(print_fn=summary_lines.append)
        logger.debug('\n'.join(summary_lines))

        model.fit(x, y, epochs=1)

        save_path = os.path.join(tempfile.gettempdir(), str(time.time()))
        model.save(save_path)
예제 #6
0
    def load_data(cls,
                  subset_name: str = 'train',
                  shuffle: bool = True,
                  cutter: str = 'char') -> Tuple[List[List[str]], List[str]]:
        """
        Load dataset as sequence classification format, char level tokenized

        Args:
            subset_name: {train, test, valid}
            shuffle: should shuffle or not, default True.
            cutter: sentence cutter, {char, jieba, none}

        Returns:
            dataset_features and dataset labels

        Raises:
            ValueError: if ``cutter`` is not one of {char, jieba, none}.
            ModuleNotFoundError: if ``cutter='jieba'`` and jieba is missing.
        """

        corpus_path = get_file(cls.__corpus_name__,
                               cls.__zip_file__name,
                               cache_dir=K.DATA_PATH,
                               untar=True)

        if cutter not in ['char', 'jieba', 'none']:
            # Fixed typo ("one onf the") and listed the full accepted set.
            raise ValueError(
                'cutter error, please use one of the {char, jieba, none}')

        df_path = os.path.join(corpus_path, f'{subset_name}.csv')
        df = pd.read_csv(df_path)
        if cutter == 'jieba':
            try:
                import jieba
            except ModuleNotFoundError:
                raise ModuleNotFoundError(
                    "please install jieba, `$ pip install jieba`")
            x_data = [list(jieba.cut(item)) for item in df['query'].to_list()]
        elif cutter == 'char':
            x_data = [list(item) for item in df['query'].to_list()]
        else:
            # cutter == 'none': previously this path left x_data unassigned,
            # crashing with NameError. Keep each query as an uncut string —
            # presumably the intent of 'none'; TODO confirm with callers.
            x_data = df['query'].to_list()
        y_data = df['label'].to_list()

        if shuffle:
            x_data, y_data = utils.unison_shuffled_copies(x_data, y_data)
        logger.debug(f"loaded {len(x_data)} samples from {df_path}. Sample:\n"
                     f"x[0]: {x_data[0]}\n"
                     f"y[0]: {y_data[0]}")
        return x_data, y_data
예제 #7
0
    def predict(self,
                x_data: TextSamplesVar,
                *,
                batch_size: int = 32,
                truncating: bool = False,
                predict_kwargs: Dict = None) -> List[List[str]]:
        """
        Generates output predictions for the input samples.

        Computation is done in batches.

        Args:
            x_data: The input data, as a Numpy array (or list of Numpy arrays if the model has multiple inputs).
            batch_size: Integer. If unspecified, it will default to 32.
            truncating: remove values from sequences larger than `model.embedding.sequence_length`
            predict_kwargs: arguments passed to :meth:`tf.keras.Model.predict`

        Returns:
            array(s) of predictions.
        """
        if predict_kwargs is None:
            predict_kwargs = {}
        with kashgari.utils.custom_object_scope():
            # Only truncate when explicitly requested; otherwise let the
            # processor pad to the batch's natural length.
            if truncating:
                seq_length = self.sequence_length
            else:
                seq_length = None

            # NOTE: removed a stray `print(self.crf_layer)` left over from
            # debugging — it wrote to stdout on every predict() call.
            tensor = self.text_processor.transform(
                x_data,
                segment=self.embedding.segment,
                seq_length=seq_length,
                max_position=self.embedding.max_position)
            logger.debug('predict seq_length: {}, input: {}'.format(
                seq_length,
                np.array(tensor).shape))
            pred = self.tf_model.predict(tensor,
                                         batch_size=batch_size,
                                         verbose=1,
                                         **predict_kwargs)
            # Collapse per-class scores to label ids.
            pred = pred.argmax(-1)

            # Original (un-padded) sample lengths, used to trim predictions.
            lengths = [len(sen) for sen in x_data]

            res: List[List[str]] = self.label_processor.inverse_transform(
                pred,  # type: ignore
                lengths=lengths)
            logger.debug('predict output: {}'.format(np.array(pred).shape))
            logger.debug('predict output argmax: {}'.format(pred))
        return res
예제 #8
0
    def load_embed_vocab(self) -> Optional[Dict[str, int]]:
        """Load the word2vec vocab and build the embedding weight matrix.

        Returns a token -> index dict where indices 0-3 are the special
        tokens and word2vec words start at index 4. Also stores the weight
        matrix in ``self.w2v_matrix`` and sets ``self.embedding_size``.
        """
        w2v = KeyedVectors.load_word2vec_format(self.w2v_path,
                                                **self.w2v_kwargs)

        # Special tokens occupy indices 0-3; corpus words follow from 4.
        token2idx = {'[PAD]': 0, '[UNK]': 1, '[BOS]': 2, '[EOS]': 3}
        for word in w2v.index2word:
            token2idx[word] = len(token2idx)

        matrix = np.zeros((len(token2idx), w2v.vector_size))
        matrix[1] = np.random.rand(w2v.vector_size)  # random vector for [UNK]
        matrix[4:] = w2v.vectors  # rows 0, 2, 3 stay zero (PAD/BOS/EOS)

        self.embedding_size = w2v.vector_size
        self.w2v_matrix = matrix
        w2v_top_words = w2v.index2entity[:50]

        logger.debug('------------------------------------------------')
        logger.debug("Loaded gensim word2vec model's vocab")
        logger.debug(f'model        : {self.w2v_path}')
        logger.debug(f'word count   : {len(self.w2v_matrix)}')
        logger.debug(f'Top 50 words : {w2v_top_words}')
        logger.debug('------------------------------------------------')

        return token2idx
예제 #9
0
    def fit_generator(self,
                      train_sample_gen: CorpusGenerator,
                      valid_sample_gen: CorpusGenerator = None,
                      batch_size: int = 64,
                      epochs: int = 5,
                      callbacks: List['tf.keras.callbacks.Callback'] = None,
                      fit_kwargs: Dict = None) -> 'tf.keras.callbacks.History':
        """
        Trains the model for a given number of epochs with given data generator.

        Data generator must be the subclass of `CorpusGenerator`

        Args:
            train_sample_gen: train data generator.
            valid_sample_gen: valid data generator.
            batch_size: Number of samples per gradient update, default to 64.
            epochs: Number of epochs to train the model.
                An epoch is an iteration over the entire `x` and `y` data provided.
            callbacks: List of `tf.keras.callbacks.Callback` instances.
                List of callbacks to apply during training.
                See `tf.keras.callbacks`.
            fit_kwargs: fit_kwargs: additional arguments passed to :meth:`tf.keras.Model.fit`

        Returns:
            A :py:class:`tf.keras.callback.History`  object. Its `History.history` attribute is
            a record of training loss values and metrics values
            at successive epochs, as well as validation loss values
            and validation metrics values (if applicable).
        """
        self.build_model_generator(
            [g for g in (train_sample_gen, valid_sample_gen) if g])

        def make_dataset(sample_gen: CorpusGenerator) -> BatchDataSet:
            # Train and valid datasets share the model's processors and
            # sequence settings; only the underlying generator differs.
            return BatchDataSet(sample_gen,
                                text_processor=self.text_processor,
                                label_processor=self.label_processor,
                                segment=self.embedding.segment,
                                seq_length=self.sequence_length,
                                max_position=self.embedding.max_position,
                                batch_size=batch_size)

        train_set = make_dataset(train_sample_gen)

        if fit_kwargs is None:
            fit_kwargs = {}
        if valid_sample_gen:
            valid_set = make_dataset(valid_sample_gen)
            fit_kwargs['validation_data'] = valid_set.take()
            fit_kwargs['validation_steps'] = len(valid_set)

        # Log one batch's shapes so training input problems surface early.
        for x, y in train_set.take(1):
            logger.debug('fit input shape: {}'.format(np.array(x).shape))
            logger.debug('fit input shape: {}'.format(np.array(y).shape))
        return self.tf_model.fit(train_set.take(),
                                 steps_per_epoch=len(train_set),
                                 epochs=epochs,
                                 callbacks=callbacks,
                                 **fit_kwargs)
예제 #10
0
    def load_embed_vocab(self) -> Optional[Dict[str, int]]:
        """Read the transformer vocab file and return a token -> index dict.

        Each stripped line of ``self.vocab_path`` becomes one token; tokens
        are also appended to ``self.vocab_list`` in file order.
        """
        token2idx: Dict[str, int] = {}
        with codecs.open(self.vocab_path, 'r', 'utf8') as reader:
            for raw_line in reader:
                token = raw_line.strip()
                self.vocab_list.append(token)
                token2idx[token] = len(token2idx)

        # Dicts preserve insertion order, so these are the first 50 tokens.
        top_words = list(token2idx)[:50]
        logger.debug('------------------------------------------------')
        logger.debug("Loaded transformer model's vocab")
        logger.debug(f'config_path       : {self.config_path}')
        logger.debug(f'vocab_path      : {self.vocab_path}')
        logger.debug(f'checkpoint_path : {self.checkpoint_path}')
        logger.debug(f'Top 50 words    : {top_words}')
        logger.debug('------------------------------------------------')

        return token2idx