Exemplo n.º 1
0
"""
from wellcomeml.utils import throw_extra_import_message

from wellcomeml.ml.bert_vectorizer import BertVectorizer

required_modules = 'sklearn'
required_extras = 'core'

try:
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics import f1_score
    from sklearn.metrics.pairwise import cosine_similarity
    import numpy as np
except ImportError as e:
    throw_extra_import_message(error=e,
                               required_modules=required_modules,
                               required_extras=required_extras)


class SimilarityEntityLinker:
    def __init__(self, stopwords, embedding="tf-idf"):
        """
        Input:
            stopwords - list of stopwords
            embedding - How to embed the documents
                    in order to find which document in the corpus
                    is most similar to the sentence.
                    embedding='tf-idf': Use a TFIDF vectoriser
                    embedding='bert': Use a BERT vectoriser
        """
Exemplo n.º 2
0
from wellcomeml.utils import throw_extra_import_message

try:
    import tensorflow as tf
except ImportError as e:
    throw_extra_import_message(error=e,
                               required_modules='tensorflow',
                               extras='tensorflow')


class SelfAttention(tf.keras.layers.Layer):
    """https://papers.nips.cc/paper/7181-attention-is-all-you-need.pdf"""
    def __init__(self, attention_dim=20, **kwargs):
        """
        Args:
            attention_dim: Size of the second dimension of the query
                and key weight matrices created in `build`
            **kwargs: Standard Keras layer arguments (e.g. name,
                dtype, trainable), forwarded to the base class
        """
        # Forward the extra keyword arguments so this layer can be
        # named and configured like any built-in Keras layer
        super().__init__(**kwargs)
        self.attention_dim = attention_dim

    def build(self, input_shape):
        self.WQ = self.add_weight(
            shape=(input_shape[-1], self.attention_dim),
            trainable=True,
            initializer="uniform",
        )
        self.WK = self.add_weight(
            shape=(input_shape[-1], self.attention_dim),
            trainable=True,
            initializer="uniform",
        )
        self.WV = self.add_weight(
            shape=(input_shape[-1], input_shape[-1]),
            trainable=True,
            initializer="uniform",
Exemplo n.º 3
0
from wellcomeml.utils import throw_extra_import_message

required_module = 'nervaluate'
required_extras = 'core'

try:
    from nervaluate import Evaluator
except ImportError as e:
    throw_extra_import_message(e, required_module, required_extras)


def ner_classification_report(y_true, y_pred, groups, tags):
    """
    Evaluate the model's performance for each grouping of data
    for the NER labels given in 'tags'

    Input:
        y_pred: a list of predicted entities
        y_true: a list of gold entities
        groups: (str) the group each of the pred or gold entities belong to

    Output:
        report: evaluation metrics for each group
                in a nice format for printing
    """

    unique_groups = sorted(set(groups))
    outputs = []

    for group in unique_groups:
        pred_doc_entities = [y_pred[i] for i, g in enumerate(groups) if g == group]
Exemplo n.º 4
0
from wellcomeml.ml.keras_utils import CategoricalMetrics  # , MetricMiniBatchHistory
from wellcomeml.logger import LOGGING_LEVEL, build_logger

try:
    import tensorflow as tf
    from transformers import BertConfig, BertTokenizer, \
        TFBertForSequenceClassification

    from sklearn.base import BaseEstimator, TransformerMixin
    from sklearn.model_selection import train_test_split
    from sklearn.utils.validation import check_is_fitted
    from sklearn.exceptions import NotFittedError
except ImportError as e:
    throw_extra_import_message(error=e,
                               required_modules='tensorflow,sklearn',
                               extras='core,tensorflow,transformers')


class SemanticEquivalenceClassifier(BaseEstimator, TransformerMixin):
    """
    Class to fine-tune BERT-type models for semantic equivalence, for example
    paraphrase, textual similarity and other NLU tasks
    """
    def __init__(
            self,
            pretrained="bert",
            batch_size=32,
            eval_batch_size=32 * 2,
            learning_rate=3e-5,
            test_size=0.2,
Exemplo n.º 5
0
"""
import logging
import re

from wellcomeml.utils import throw_extra_import_message
# Heavy dependencies go here
# Comma separated names of the modules imported in the guarded block
# below, and of the extras passed to throw_extra_import_message
required_modules = 'spacy,sklearn,scipy'
required_extras = 'spacy,core'

try:
    import spacy
    from scipy import sparse
    from sklearn.feature_extraction.text import TfidfVectorizer
except ImportError as e:
    # Pass the full lists declared above: scipy and sklearn are
    # imported here too, so naming only spacy would be incomplete
    throw_extra_import_message(error=e,
                               required_modules=required_modules,
                               extras=required_extras)

# Module-level logger, named after this module
logger = logging.getLogger(__name__)


class WellcomeTfidf(TfidfVectorizer):
    """
    Class to wrap some basic transformation and text
    vectorisation/embedding
    """
    def __init__(self, use_regex=True, use_spacy_lemmatizer=True, **kwargs):
        """

        Args:
            Any sklearn "tfidfvectorizer" arguments (min_df, etc.)
Exemplo n.º 6
0
from datetime import datetime
import math

from wellcomeml.ml.attention import HierarchicalAttention
from wellcomeml.utils import throw_extra_import_message

try:
    import tensorflow as tf
    from sklearn.model_selection import train_test_split
    from sklearn.base import BaseEstimator, ClassifierMixin
    from sklearn.metrics import f1_score
    from scipy.sparse import csr_matrix, vstack
    import numpy as np
except ImportError as e:
    throw_extra_import_message(error=e,
                               required_modules='tensorflow,scipy,numpy',
                               extras='tensorflow,core')

# Log directory stamped with the time this module is imported
# (YearMonthDay-HourMinuteSecond)
TENSORBOARD_LOG_DIR = datetime.now().strftime("logs/scalars/%Y%m%d-%H%M%S")
# Keras callbacks keyed by name; the TensorBoard callback is created
# once, at import time, and writes to TENSORBOARD_LOG_DIR
CALLBACK_DICT = dict(
    tensorboard=tf.keras.callbacks.TensorBoard(log_dir=TENSORBOARD_LOG_DIR),
)


class BiLSTMClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self,
                 learning_rate=0.01,
                 learning_rate_decay=1,
                 batch_size=32,
                 nb_epochs=5,