Example #1
    def fit(self, X, Y=None, embedding_matrix=None, steps_per_epoch=None):
        if isinstance(X, list):
            X = np.array(X)
        if isinstance(Y, list):
            Y = np.array(Y)

        if not (self.sequence_length and self.vocab_size and self.nb_outputs
                and steps_per_epoch):
            steps_per_epoch = self._init_from_data(X, Y)

        if isinstance(X, np.ndarray):
            data = self._prepare_data(X, Y, shuffle=True)
        else:  # tensorflow dataset
            data = X.batch(self.batch_size)

        train_steps_per_epoch = int(
            (1 - self.validation_split) * steps_per_epoch)
        if train_steps_per_epoch == 0:
            logger.warning(
                "Not enough data for validation. Consider decreasing "
                "batch_size or validation_split. Some features that "
                "rely on validation metrics, like early stopping, "
                "might not work.")
        else:
            steps_per_epoch = train_steps_per_epoch
        train_data = data.take(steps_per_epoch)
        val_data = data.skip(steps_per_epoch)

        strategy = self._get_distributed_strategy()
        with strategy.scope():
            self.model = self._build_model(self.sequence_length,
                                           self.vocab_size, self.nb_outputs,
                                           steps_per_epoch, embedding_matrix)

        callbacks = [
            CALLBACK_DICT[c] if c in CALLBACK_DICT else c
            for c in self.callbacks
        ]
        if self.early_stopping:
            early_stopping = tf.keras.callbacks.EarlyStopping(
                patience=5, restore_best_weights=True)
            callbacks.append(early_stopping)

        self.model.fit(train_data,
                       validation_data=val_data,
                       epochs=self.nb_epochs,
                       callbacks=callbacks)
        return self
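
A minimal usage sketch for the fit method above, assuming it belongs to one of the Keras-based classifiers exported in Example #2 (CNNClassifier is used here), that the constructor accepts batch_size and nb_epochs keyword arguments, and that X has already been converted to padded integer sequences (e.g. with KerasVectorizer); the import path is also an assumption:

import numpy as np
from wellcomeml.ml.cnn import CNNClassifier  # assumed import path

# Toy data: 100 padded token-id sequences of length 50 with binary labels
X = np.random.randint(1, 1000, size=(100, 50))
Y = np.random.randint(0, 2, size=(100,))

clf = CNNClassifier(batch_size=32, nb_epochs=2)  # hypothetical constructor args
clf.fit(X, Y)  # fit() carves out validation data according to validation_split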
Example #2
__all__ = [
    'WellcomeTfidf', 'Doc2VecVectorizer', 'Sent2VecVectorizer',
    'WellcomeVotingClassifier'
]
try:
    from .vectorizer import Vectorizer
    from .clustering import TextClustering
    from .spacy_ner import SpacyNER
    from .spacy_classifier import SpacyClassifier
    from .bert_classifier import BertClassifier
    from .bert_vectorizer import BertVectorizer
    from .spacy_knowledge_base import SpacyKnowledgeBase
    from .spacy_entity_linking import SpacyEntityLinker
    from .similarity_entity_linking import SimilarityEntityLinker
    from .cnn import CNNClassifier
    from .bilstm import BiLSTMClassifier
    from .keras_vectorizer import KerasVectorizer
    from .bert_semantic_equivalence import SemanticEquivalenceClassifier
    from .transformers_tokenizer import TransformersTokenizer
    __all__ += [
        'Vectorizer', 'TextClustering', 'SpacyNER', 'SpacyClassifier',
        'BertClassifier', 'BertVectorizer', 'SpacyKnowledgeBase',
        'SpacyEntityLinker', 'SimilarityEntityLinker', 'CNNClassifier',
        'BiLSTMClassifier', 'KerasVectorizer', 'SemanticEquivalenceClassifier',
        'TransformersTokenizer'
    ]
except ImportError as e:
    logger.error(e)
    logger.warning("Using WellcomeML without extras (transformers & torch).")
Example #3
    def optimise(self,
                 X,
                 param_grid,
                 n_cluster_range=None,
                 max_noise=0.2,
                 verbose=False):
        """
        Optimises the clustering silhouette based on a parameter grid,
        a range on the number of clusters and a maximum fraction of noise.

        It is customised to avoid re-fitting the intermediate
        steps (vectorizer and reducer) more than necessary.

        Args:
            X (iterable[str]): A list of texts to be clustered

            param_grid (dict): A parameter grid, for example:
                param_grid = {
                    'reducer': {'min_dist': [0.0, 0.2], 'n_neighbors': [2, 3, 5],
                                'metric': ['cosine', 'euclidean']},
                    'clustering': {'min_samples': [2, 5], 'eps': [0.5, 1, 1.5]}
                }

            n_cluster_range (2-tuple of ints): The minimum and maximum number
                of clusters allowed (e.g. (10, 20)). If unset, the best
                silhouette is chosen regardless of the number of clusters.

            max_noise (float in [0,1]): The maximum fraction of points unclustered. Default: 0.2

        Returns:
            dict: The best clustering parameters found under the constraints
            (an empty dict if no parameter combination satisfies them).

            The full search results, including the mean silhouette, noise and
            number of clusters for each parameter combination, are stored in
            self.optimise_results, and the best silhouette in self.silhouette.

        """

        min_n_clusters = (n_cluster_range[0] if n_cluster_range else 0)
        max_n_clusters = (n_cluster_range[1] if n_cluster_range else 10**5)

        # X might be transformed to be a vector, so we need to save the input
        # texts

        X_text = X

        # Linearises the parameter dictionary to be compatible with grid
        # search, so it becomes one flat dictionary with 'step__parameter' keys
        if self.reducer == 'tsne':
            logger.warning("TSNE is not suitable for predicting on new data."
                           "Skipping Vectoriser/TSNE optimisation parameters")
            self.fit(X)
            X = self.reduced_points

            pipeline = Pipeline([('clustering', self.clustering_class)],
                                memory=CACHE_DIR)

            params = {}

        elif self.cluster_reduced:
            # You cannot pickle a sparse UMAP model with more than 4096 points.
            # See https://github.com/lmcinnes/umap/issues/674
            # Until that issue is fixed, we either convert everything to dense
            # or skip caching the transformations of the pipeline.

            memory = (CACHE_DIR
                      if len(X) < 4096 or self.embedding != 'tf-idf' else None)
            pipeline = Pipeline([('vectorizer', self.vectorizer),
                                 ('reducer', self.reducer_class),
                                 ('clustering', self.clustering_class)],
                                memory=memory)

            params = {
                f'reducer__{key}': value
                for key, value in param_grid.get('reducer', {}).items()
            }

            params = {
                **params,
                **{
                    f'vectorizer__{key}': value
                    for key, value in param_grid.get('vectorizer', {}).items()
                }
            }

        else:
            self.vectorizer.cache_transformed = True
            pipeline = Pipeline([('vectorizer', self.vectorizer),
                                 ('clustering', self.clustering_class)],
                                memory=CACHE_DIR)
            params = {
                f'vectorizer__{key}': value
                for key, value in param_grid.get('vectorizer', {}).items()
            }

        params = {
            **params,
            **{
                f'clustering__{key}': value
                for key, value in param_grid.get('clustering', {}).items()
            }
        }

        grid = GridSearchCV(estimator=pipeline,
                            param_grid=params,
                            scoring={
                                'silhouette': _clustering_score,
                                'noise': _clustering_noise,
                                'n_clusters': _number_of_clusters
                            },
                            refit='silhouette')

        logging_level = logger.level
        if verbose <= 1:
            # Temporarily disable logging so the progress bar can run
            # uninterrupted. The previous level is restored afterwards.
            logging.getLogger().setLevel(logging.WARNING)
            logger.setLevel(logging.WARNING)

        # Prunes result to actually optimise under constraints
        best_silhouette = 0
        best_params = {}

        grid.fit(X, y=None)

        for params, silhouette, noise, n_clusters in zip(
                grid.cv_results_['params'],
                grid.cv_results_['mean_test_silhouette'],
                grid.cv_results_['mean_test_noise'],
                grid.cv_results_['mean_test_n_clusters']):

            if (min_n_clusters <= n_clusters <= max_n_clusters
                    and noise <= max_noise
                    and silhouette > best_silhouette):
                best_silhouette = silhouette
                best_params = params

        if not best_params:
            logger.warning("Could not find any clustering model with the "
                           "specified number of clusters and noise")

        self.silhouette = best_silhouette
        self.optimise_results = {
            key: value
            for key, value in grid.cv_results_.items()
            if key[:5] != 'split'  # We don't need all cross-val split results
        }

        self.set_params(best_params, from_parameter_grid=True)
        # Fits the pipeline again with the best parameters
        self.fit(X_text)

        logger.setLevel(logging_level)

        return best_params
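
A minimal sketch of how optimise might be called, assuming a TextClustering instance with its default vectoriser and reducer; the import path and corpus are illustrative, and the grid follows the docstring example above:

from wellcomeml.ml.clustering import TextClustering  # assumed import path

# Illustrative corpus; in practice this would be a much larger, varied list
texts = [
    "malaria vaccine trial results",
    "deep learning for text classification",
    "genome sequencing of parasites",
    "transformer language models",
] * 25

clustering = TextClustering()
param_grid = {
    'reducer': {'min_dist': [0.0, 0.2], 'n_neighbors': [2, 3]},
    'clustering': {'min_samples': [2], 'eps': [0.5, 1.0]},
}
best_params = clustering.optimise(texts, param_grid=param_grid,
                                  n_cluster_range=(2, 10), max_noise=0.3)
print(best_params, clustering.silhouette)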
Example #4
try:
    from sklearn.manifold import TSNE
    from sklearn.model_selection import GridSearchCV
    from sklearn.metrics import silhouette_score
    from sklearn.pipeline import Pipeline
except ImportError as e:
    throw_extra_import_message(error=e,
                               required_modules=required_modules,
                               extras=extras)

try:
    from hdbscan import HDBSCAN
    HDBSCAN_INSTALLED = True
except (ValueError, ModuleNotFoundError):
    HDBSCAN_INSTALLED = False
    logger.warning(
        "If you want to use hdbscan you need to run"
        "pip3 install hdbscan --no-cache-dir --no-binary :all: --no-build-isolation "
        "Read more https://github.com/wellcometrust/WellcomeML/issues/197")

CACHE_DIR = os.path.expanduser("~/.cache/wellcomeml")


class TextClustering(object):
    """
    Basic class for clustering pipelines.

    Attributes:
        vectorizer: The embedding Vectorizer object
        reducer: A dimensionality reduction object
        clustering: A clustering model object
        cluster_ids: IDs of the clusters
        cluster_names: Names of the clusters
Example #5
import os
from wellcomeml.logger import logger

# Introduced a development_transformers env variable that allows
# disabling the modules that use spacy.
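# Enable it by setting the environment variable before importing, e.g.:
#   WELLCOMEML_ENV=development_transformers python your_script.py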

development_transformers_mode = (os.environ.get(
    "WELLCOMEML_ENV", "") == "development_transformers")

if development_transformers_mode:
    logger.warning("Running in development mode. Only loading modules that"
                   " use new version of transformers.")

    from .bert_semantic_equivalence import SemanticEquivalenceClassifier
    __all__ = ['SemanticEquivalenceClassifier']
else:
    from .frequency_vectorizer import WellcomeTfidf
    from .doc2vec_vectorizer import Doc2VecVectorizer
    from .sent2vec_vectorizer import Sent2VecVectorizer
    from .voting_classifier import WellcomeVotingClassifier
    __all__ = [
        'WellcomeTfidf', 'Doc2VecVectorizer', 'Sent2VecVectorizer',
        'WellcomeVotingClassifier'
    ]

    try:
        from .vectorizer import Vectorizer
        from .clustering import TextClustering
        from .spacy_ner import SpacyNER
        from .spacy_classifier import SpacyClassifier
        from .bert_classifier import BertClassifier