def test_build_model():
    """Check that build_model lets a fitted CNN be rebuilt for fewer outputs."""
    texts = ["One and two", "One only", "Two nothing else", "Two and three"]
    tags = np.array([[1, 1, 0, 0], [1, 0, 0, 0], [0, 1, 0, 0], [0, 1, 1, 0]])
    batch_size = 2

    X_vec = KerasVectorizer().fit_transform(texts)

    model = CNNClassifier(
        batch_size=batch_size, multilabel=True, learning_rate=1e-2
    )
    model.fit(X_vec, tags)
    assert model.predict(X_vec).shape[1] == 4

    # Keep only the first three label columns and rebuild the network so
    # that it matches the new number of outputs.
    tags = tags[:, :3]
    model.build_model(
        X_vec.shape[1],  # sequence_length
        X_vec.max() + 1,  # vocab_size
        tags.shape[1],  # nb_outputs
        X_vec.shape[0] / batch_size,  # decay_steps
    )
    model.fit(X_vec, tags)
    assert model.predict(X_vec).shape[1] == 3
def test_infer_from_data():
    """sequence_length is derived from the fitted texts when not supplied."""
    texts = ["One", "Two words", "Three words here"]
    vectorizer = KerasVectorizer()
    vectorizer.fit(texts)
    # The longest text above is three words long.
    assert vectorizer.sequence_length == 3
def test_vocab_size():
    """The largest index produced must equal the requested vocab_size."""
    vocab_size = 1
    texts = ["One", "Two", "Three"]
    vectorized = KerasVectorizer(vocab_size=vocab_size).fit_transform(texts)
    assert vectorized.max() == vocab_size
def test_sequence_length():
    """An explicit sequence_length fixes the width of the vectorized output."""
    sequence_length = 5
    texts = ["One", "Two", "Three"]
    vectorized = KerasVectorizer(
        sequence_length=sequence_length
    ).fit_transform(texts)
    assert vectorized.shape[1] == sequence_length
def test_vanilla():
    """Default vectorizer: one row per text, width of the longest text."""
    texts = ["One", "Two", "Three Four"]
    vectorized = KerasVectorizer().fit_transform(texts)
    assert vectorized.shape[:2] == (3, 2)
    assert vectorized.max() == 5  # 4 tokens including OOV
def test_build_embedding_matrix_word_vectors():
    """Build an embedding matrix from the named "glove-twitter-25" vectors."""
    vectorizer = KerasVectorizer(vocab_size=1)
    vectorizer.fit(["One", "Two", "Three"])

    matrix = vectorizer.build_embedding_matrix(
        embeddings_name_or_path="glove-twitter-25"
    )

    assert matrix.shape == (5, 25)
def test_XY_dataset():
    """CNNClassifier can be fitted and scored on a tf.data.Dataset of (X, Y)."""
    texts = ["One", "One only", "Two nothing else", "Two and three"]
    labels = np.array([0, 0, 1, 1])

    X_vec = KerasVectorizer().fit_transform(texts)
    dataset = tf.data.Dataset.from_tensor_slices((X_vec, labels)).shuffle(
        100, seed=42
    )

    clf = CNNClassifier(batch_size=2)
    clf.fit(dataset)

    assert clf.score(dataset, labels) > 0.3
def test_XY_dataset_sparse_y():
    """Fit on a dataset of dense tags, then score against sparse tags."""
    texts = ["One and two", "One only", "Two nothing else", "Two and three"]
    dense_tags = np.array(
        [[1, 1, 0, 0], [1, 0, 0, 0], [0, 1, 0, 0], [0, 1, 1, 0]]
    )
    sparse_tags = csr_matrix(dense_tags)

    X_vec = KerasVectorizer().fit_transform(texts)
    train_data = tf.data.Dataset.from_tensor_slices((X_vec, dense_tags))
    # Prediction data carries the inputs only.
    test_data = tf.data.Dataset.from_tensor_slices(X_vec)

    clf = CNNClassifier(batch_size=2, sparse_y=True, multilabel=True)
    clf.fit(train_data)

    assert clf.score(test_data, sparse_tags) > 0.3
def test_save_load_attention():
    """A BiLSTM trained with attention survives a save/load round trip."""
    texts = ["One", "One only", "Two nothing else", "Two and three"]
    labels = np.array([0, 0, 1, 1])
    X_vec = KerasVectorizer().fit_transform(texts)

    trained = BiLSTMClassifier(attention=True)
    trained.fit(X_vec, labels)

    with tempfile.TemporaryDirectory() as tmp_dir:
        trained.save(tmp_dir)

        # Load into a fresh classifier created with default arguments.
        restored = BiLSTMClassifier()
        restored.load(tmp_dir)

        assert hasattr(restored, 'model')
        assert restored.score(X_vec, labels) > 0.6
def test_vanilla():
    """BiLSTM pipeline fits a tiny binary problem above the score threshold."""
    texts = ["One", "One only", "Two nothing else", "Two and three"]
    labels = np.array([0, 0, 1, 1])

    vectorizer = KerasVectorizer()
    classifier = BiLSTMClassifier(nb_epochs=10)
    pipeline = Pipeline([('vec', vectorizer), ('clf', classifier)])
    pipeline.fit(texts, labels)

    assert pipeline.score(texts, labels) > 0.6
def test_XY_list():
    """Targets given as a plain Python list are accepted by the CNN pipeline."""
    texts = ["One", "One only", "Two nothing else", "Two and three"]
    labels = [0, 0, 1, 1]

    vectorizer = KerasVectorizer()
    classifier = CNNClassifier(batch_size=2)
    pipeline = Pipeline([('vec', vectorizer), ('clf', classifier)])
    pipeline.fit(texts, labels)

    assert pipeline.score(texts, labels) > 0.6
def test_predict_proba():
    """predict_proba returns one row per example with values inside [0, 1]."""
    X = ["One", "One only", "Two nothing else", "Two and three"]
    Y = np.array([0, 0, 1, 1])

    model = Pipeline([('vec', KerasVectorizer()),
                      ('clf', BiLSTMClassifier())])
    model.fit(X, Y)

    Y_pred_prob = model.predict_proba(X)

    # Check the row count and the value range explicitly. Counting with the
    # builtin sum() over a boolean array only works for 1-d / single-column
    # output and hides which of the two conditions actually failed.
    assert Y_pred_prob.shape[0] == Y.shape[0]
    assert np.all(Y_pred_prob >= 0)
    assert np.all(Y_pred_prob <= 1)
def test_threshold():
    """predict must equal predict_proba cut at the configured threshold."""
    texts = ["One", "One only", "Two nothing else", "Two and three"]
    labels = np.array([0, 0, 1, 1])
    threshold = 0.1

    pipeline = Pipeline([
        ('vec', KerasVectorizer()),
        ('clf', BiLSTMClassifier(threshold=threshold)),
    ])
    pipeline.fit(texts, labels)

    expected = pipeline.predict_proba(texts) > threshold
    predicted = pipeline.predict(texts)

    assert np.array_equal(expected, predicted)
def test_feature_approach_concat():
    """CNN pipeline trains with feature_approach="concat"."""
    texts = ["One", "One only", "Two nothing else", "Two and three"]
    labels = np.array([0, 0, 1, 1])

    classifier = CNNClassifier(batch_size=2, feature_approach="concat")
    pipeline = Pipeline([('vec', KerasVectorizer()), ('clf', classifier)])
    pipeline.fit(texts, labels)

    assert pipeline.score(texts, labels) > 0.6
def test_attention():
    """CNN pipeline trains with attention enabled and ten attention heads."""
    texts = ["One", "One only", "Two nothing else", "Two and three"]
    labels = np.array([0, 0, 1, 1])

    classifier = CNNClassifier(
        batch_size=2, attention=True, attention_heads=10
    )
    pipeline = Pipeline([('vec', KerasVectorizer()), ('clf', classifier)])
    pipeline.fit(texts, labels)

    assert pipeline.score(texts, labels) > 0.6
def test_multilabel():
    """Multilabel BiLSTM predicts one column per tag for every example."""
    texts = [
        "One and two",
        "One only",
        "Three and four, nothing else",
        "Two nothing else",
        "Two and three",
    ]
    tags = np.array([
        [1, 1, 0, 0],
        [1, 0, 0, 0],
        [0, 0, 1, 1],
        [0, 1, 0, 0],
        [0, 1, 1, 0],
    ])

    classifier = BiLSTMClassifier(multilabel=True)
    pipeline = Pipeline([('vec', KerasVectorizer()), ('clf', classifier)])
    pipeline.fit(texts, tags)

    assert pipeline.score(texts, tags) > 0.4
    assert pipeline.predict(texts).shape == (5, 4)
def test_early_stopping():
    """BiLSTM with early stopping finishes despite a huge epoch budget."""
    texts = ["One", "One only", "Two nothing else", "Two and three"]
    labels = np.array([0, 0, 1, 1])

    classifier = BiLSTMClassifier(early_stopping=True, nb_epochs=10000)
    pipeline = Pipeline([('vec', KerasVectorizer()), ('clf', classifier)])

    # With 10000 epochs requested, this test would take a very long time to
    # finish if early stopping were not working.
    pipeline.fit(texts, labels)

    assert pipeline.score(texts, labels) > 0.6
def test_multilabel_attention():
    """CNN trains with the "multilabel-attention" feature approach."""
    texts = ["One and two", "One only", "Two nothing else", "Two and three"]
    tags = np.array([[1, 1, 0, 0], [1, 0, 0, 0], [0, 1, 0, 0], [0, 1, 1, 0]])

    classifier = CNNClassifier(
        batch_size=2,
        multilabel=True,
        attention=True,
        feature_approach="multilabel-attention",
        learning_rate=1e-2,
    )
    pipeline = Pipeline([('vec', KerasVectorizer()), ('clf', classifier)])
    pipeline.fit(texts, tags)

    assert pipeline.score(texts, tags) > 0.3
def test_early_stopping():
    """CNN with early stopping finishes despite a huge epoch budget."""
    texts = ["One", "One only", "Two nothing else", "Two and three"]
    labels = np.array([0, 0, 1, 1])

    classifier = CNNClassifier(
        batch_size=2, early_stopping=True, nb_epochs=10000
    )
    pipeline = Pipeline([('vec', KerasVectorizer()), ('clf', classifier)])

    # With 10000 epochs requested, this test would take a very long time to
    # finish if early stopping were not working; it would also consume the
    # 4MB of logs in travis.
    pipeline.fit(texts, labels)

    assert pipeline.score(texts, labels) > 0.6
def test_build_embedding_matrix():
    """Build an embedding matrix from a space-separated embeddings file."""
    vectorizer = KerasVectorizer(vocab_size=1)
    vectorizer.fit(["One", "Two", "Three"])

    # One word followed by its five-dimensional vector per row.
    embedding_rows = [
        "one 0 1 0 0 0",
        "two 0 0 1 0 0",
        "three 0 0 0 1 0",
        "four 0 0 0 0 1",
    ]

    with tempfile.TemporaryDirectory() as tmp_dir:
        embeddings_path = os.path.join(tmp_dir, "embeddings.csv")
        with open(embeddings_path, "w") as embeddings_file:
            embeddings_file.write(
                "".join(row + "\n" for row in embedding_rows)
            )

        matrix = vectorizer.build_embedding_matrix(
            embeddings_name_or_path=embeddings_path
        )

        assert matrix.shape == (5, 5)
def test_sparse():
    """Multilabel CNN pipeline accepts targets given as a sparse csr_matrix."""
    texts = [
        "One and two",
        "One only",
        "Three and four, nothing else",
        "Two nothing else",
        "Two and three",
    ]
    dense_tags = np.array([
        [1, 1, 0, 0],
        [1, 0, 0, 0],
        [0, 0, 1, 1],
        [0, 1, 0, 0],
        [0, 1, 1, 0],
    ])
    sparse_tags = csr_matrix(dense_tags)

    classifier = CNNClassifier(multilabel=True, batch_size=2, sparse_y=True)
    pipeline = Pipeline([('vec', KerasVectorizer()), ('clf', classifier)])
    pipeline.fit(texts, sparse_tags)

    assert pipeline.score(texts, sparse_tags) > 0.4
    assert pipeline.predict(texts).shape == (5, 4)
class MeshCNN:
    """
    Multilabel tagger that chains a KerasVectorizer with a CNNClassifier.

    Data can be passed either in memory (list or numpy array of texts) or as
    callables that return an iterator when called, in which case it is
    streamed to the classifier through a tf.data.Dataset.
    """

    def __init__(
        self,
        threshold=0.5,
        batch_size=256,
        shuffle=True,
        buffer_size=1000,
        data_cache=None,
        random_seed=42,
    ):
        """
        threshold: float, default 0.5. Probability threshold on top of which
            a tag should be assigned.
        batch_size: int, default 256. Size of batches used for training and
            prediction.
        shuffle: bool, default True. Flag on whether to shuffle data before
            fit.
        buffer_size: int, default 1000. Buffer size used for shuffling or
            transforming data before fit.
        data_cache: path, default None. Path to use for caching data
            transformations.
        random_seed: int, default 42. Random seed that controls
            reproducibility.
        """
        self.threshold = threshold
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.buffer_size = buffer_size
        self.data_cache = data_cache
        self.random_seed = random_seed

    def _yield_data(self, X, vectorizer, Y=None):
        """
        Build a tf.data.Dataset that yields vectorized X (and Y) one by one

        X: callable that returns an iterable of texts when called
        vectorizer: vectorizer class that implements transform which
            transforms texts to integers
        Y: callable that returns an iterable of targets (tags) when called,
            each target being a numpy array or a sparse row. If Y is
            missing, for example when called by predict, only X vectorized
            is yielded
        """
        # Test against None explicitly so that the decision does not depend
        # on the truthiness of whatever object is passed as Y.
        has_targets = Y is not None

        def yield_transformed_data(X_buffer, Y_buffer):
            # TODO: This could move to WellcomeML to enable CNN to receive generators
            # Use the vectorizer handed to _yield_data, not self.vectorizer.
            X_vec = vectorizer.transform(X_buffer)

            if not Y_buffer:
                yield from X_vec
                return

            # Y_buffer is a list of np or sparse arrays
            if isinstance(Y_buffer[0], np.ndarray):
                Y_den = np.vstack(Y_buffer)
            else:  # sparse
                Y_den = np.asarray(sp.vstack(Y_buffer).todense())

            yield from zip(X_vec, Y_den)

        def data_gen():
            """
            Wrapper on top of yield_transformed_data to get a callable
            function which enables to restart the iterator.

            This function also implements buffering for more efficient
            transformations
            """
            X_buffer = []
            Y_buffer = []

            X_gen = X()
            data_zip = zip(X_gen, Y()) if has_targets else X_gen

            for data_example in data_zip:
                if has_targets:
                    x, y = data_example
                    Y_buffer.append(y)
                else:
                    x = data_example
                X_buffer.append(x)

                # Transform a whole buffer at once rather than text by text.
                if len(X_buffer) >= self.buffer_size:
                    yield from yield_transformed_data(X_buffer, Y_buffer)
                    X_buffer = []
                    Y_buffer = []

            # Flush whatever is left once the input is exhausted.
            if X_buffer:
                yield from yield_transformed_data(X_buffer, Y_buffer)

        output_types = (tf.int32, tf.int32) if has_targets else (tf.int32)
        data = tf.data.Dataset.from_generator(
            data_gen, output_types=output_types
        )

        if self.data_cache:
            data = data.cache(self.data_cache)
        return data

    def _init_vectorizer(self):
        """Create the default KerasVectorizer and store it on the instance."""
        self.vectorizer = KerasVectorizer(vocab_size=5_000, sequence_length=400)

    def _init_classifier(self):
        """Create the default CNNClassifier and store it on the instance."""
        self.classifier = CNNClassifier(
            learning_rate=0.01,
            dropout=0.1,
            sparse_y=True,
            nb_epochs=20,
            nb_layers=4,
            multilabel=True,
            threshold=self.threshold,
            batch_size=self.batch_size,
        )

    def set_params(self, **params):
        """
        Forward params to the vectorizer ("vec" component) and the
        classifier ("cnn" component), creating them first if needed.
        """
        if not hasattr(self, "vectorizer"):
            self._init_vectorizer()
        if not hasattr(self, "classifier"):
            self._init_classifier()

        vec_params = get_params_for_component(params, "vec")
        clf_params = get_params_for_component(params, "cnn")
        self.vectorizer.set_params(**vec_params)
        self.classifier.set_params(**clf_params)

    def fit(self, X, Y):
        """
        X: list or numpy array of texts, or callable returning a generator
            of texts
        Y: 2d numpy array or sparse csr_matrix of tags assigned, or callable
            returning a generator of tags assigned

        If X is a generator it needs to be callable i.e. return the
        generator by calling it X_gen = X(). This is so we can iterate on
        the data again.

        Returns self.
        """
        if not hasattr(self, "vectorizer"):
            self._init_vectorizer()
        if not hasattr(self, "classifier"):
            self._init_classifier()

        if isinstance(X, (list, np.ndarray)):
            print("Fitting vectorizer")
            self.vectorizer.fit(X)
            X_vec = self.vectorizer.transform(X)
            print(X_vec.shape)
            print("Fitting classifier")
            self.classifier.fit(X_vec, Y)
        else:
            print("Fitting vectorizer")
            X_gen = X()
            self.vectorizer.fit(X_gen)
            print("Fitting classifier")
            # Pass the dimensions to the classifier explicitly in the
            # streaming case.
            params_from_vectorizer = {
                "sequence_length": self.vectorizer.sequence_length,
                "vocab_size": self.vectorizer.vocab_size,
            }
            self.classifier.set_params(**params_from_vectorizer)
            train_data = self._yield_data(X, self.vectorizer, Y)
            # TODO: This should move inside CNNClassifier
            if self.shuffle:
                train_data = train_data.shuffle(
                    self.buffer_size, seed=self.random_seed
                )
            self.classifier.fit(train_data)

        return self

    def predict(self, X):
        """
        X: list or numpy array of texts, or callable returning a generator
            of texts

        Returns whatever the classifier's predict returns for the
        vectorized data.
        """
        if isinstance(X, (list, np.ndarray)):
            X_vec = self.vectorizer.transform(X)
            Y_pred = self.classifier.predict(X_vec)
        else:
            pred_data = self._yield_data(X, self.vectorizer)
            Y_pred = self.classifier.predict(pred_data)
        return Y_pred

    def predict_proba(self, X):
        """
        X: list or numpy array of texts, or callable returning a generator
            of texts

        In-memory data is predicted in batches of batch_size and the batch
        outputs are joined along the first (example) axis, so the result
        has one row per input text.
        """
        if isinstance(X, (list, np.ndarray)):
            X_vec = self.vectorizer.transform(X)
            Y_pred_proba = []
            for i in range(0, X_vec.shape[0], self.batch_size):
                Y_pred_proba_batch = self.classifier.predict_proba(
                    X_vec[i:i + self.batch_size])
                Y_pred_proba.append(Y_pred_proba_batch)
            # Batches hold different examples, so they must be stacked along
            # axis 0. Stacking horizontally would glue them side by side and
            # fail whenever the last batch is smaller than the others.
            Y_pred_proba = np.concatenate(Y_pred_proba, axis=0)
        else:
            pred_data = self._yield_data(X, self.vectorizer)
            Y_pred_proba = self.classifier.predict_proba(pred_data)
        return Y_pred_proba

    def save(self, model_path):
        """
        Save meta.json, the pickled vectorizer and the classifier under
        model_path, creating the directory (and any parents) if missing.
        """
        os.makedirs(model_path, exist_ok=True)

        meta = {"name": "MeshCNN", "approach": "mesh-cnn"}
        meta_path = os.path.join(model_path, "meta.json")
        with open(meta_path, "w") as f:
            f.write(json.dumps(meta))

        vectorizer_path = os.path.join(model_path, "vectorizer.pkl")
        save_pickle(vectorizer_path, self.vectorizer)
        self.classifier.save(model_path)

    def load(self, model_path):
        """
        Restore the vectorizer and classifier previously written by save.
        """
        meta_path = os.path.join(model_path, "meta.json")
        with open(meta_path, "r") as f:
            meta = json.loads(f.read())
        self.set_params(**meta)

        vectorizer_path = os.path.join(model_path, "vectorizer.pkl")
        # NOTE(review): load_pickle presumably unpickles the file, so only
        # load models that come from a trusted source - confirm.
        self.vectorizer = load_pickle(vectorizer_path)
        self._init_classifier()
        self.classifier.load(model_path)
from wellcomeml.ml.cnn import CNNClassifier
from wellcomeml.ml.keras_vectorizer import KerasVectorizer
from sklearn.pipeline import Pipeline
import numpy as np

# Example 1: one label per text.
X = ["One", "three", "one", "two", "four"]
Y = np.array([1, 0, 1, 0, 0])

cnn_pipeline = Pipeline([
    ("vec", KerasVectorizer()),
    ("clf", CNNClassifier()),
])
cnn_pipeline.fit(X, Y)
print(cnn_pipeline.score(X, Y))

# Example 2: several labels per text, one column per label.
X = ["One, three", "one", "two, three"]
Y = np.array([
    [1, 0, 1],
    [1, 0, 0],
    [0, 1, 1],
])

cnn_pipeline = Pipeline([
    ("vec", KerasVectorizer()),
    ("clf", CNNClassifier(multilabel=True)),
])
cnn_pipeline.fit(X, Y)
print(cnn_pipeline.score(X, Y))
from wellcomeml.ml.bilstm import BiLSTMClassifier
from wellcomeml.ml.keras_vectorizer import KerasVectorizer
from sklearn.pipeline import Pipeline
import numpy as np

# Example 1: one label per text.
X = ["One", "three", "one", "two", "four"]
Y = np.array([1, 0, 1, 0, 0])

bilstm_pipeline = Pipeline([
    ("vec", KerasVectorizer()),
    ("clf", BiLSTMClassifier()),
])
bilstm_pipeline.fit(X, Y)
print(bilstm_pipeline.score(X, Y))

# Example 2: several labels per text, one column per label.
X = ["One, three", "one", "two, three"]
Y = np.array([
    [1, 0, 1],
    [1, 0, 0],
    [0, 1, 1],
])

bilstm_pipeline = Pipeline([
    ("vec", KerasVectorizer()),
    ("clf", BiLSTMClassifier(multilabel=True)),
])
bilstm_pipeline.fit(X, Y)
print(bilstm_pipeline.score(X, Y))
def _unigram_tfidf():
    """Unigram TF-IDF shared by the tfidf-svm family of approaches."""
    return TfidfVectorizer(
        stop_words="english",
        max_df=0.95,
        min_df=0.0,
        ngram_range=(1, 1),
    )


def _bigram_tfidf():
    """TF-IDF over uni- and bigrams that ignores terms seen in < 5 docs."""
    return TfidfVectorizer(
        min_df=5,
        stop_words="english",
        ngram_range=(1, 2),
    )


def _linear_svc():
    """Linear SVC with probability estimates enabled."""
    return SVC(kernel="linear", probability=True)


def _hinge_sgd_ovr():
    """One-vs-rest SGD classifier with hinge loss and l2 penalty."""
    return OneVsRestClassifier(SGDClassifier(loss="hinge", penalty="l2"))


def _sgd_ovr(alpha):
    """One-vs-rest l2 SGD classifier trained in parallel (n_jobs=-1)."""
    return OneVsRestClassifier(
        SGDClassifier(penalty="l2", alpha=alpha), n_jobs=-1
    )


def _embedding_tfidf_union(name, vectorizer):
    """Union of a normalised embedding vectorizer and normalised TF-IDF."""
    return FeatureUnion([
        (
            name,
            Pipeline([
                (f"{name}_unscaled", vectorizer),
                (f"scale_{name}", Normalizer()),
            ]),
        ),
        (
            "tfidf",
            Pipeline([
                ("tfidf_unscaled", _bigram_tfidf()),
                ("scale_tfidf", Normalizer()),
            ]),
        ),
    ])


def _tfidf_onehot_svm(column):
    """TF-IDF of the "text" column plus one-hot of `column`, fed to an SVM."""
    return Pipeline([
        (
            "vectorizer",
            FeatureUnion([
                (
                    "text_features",
                    Pipeline([
                        (
                            "selector",
                            FunctionTransformer(lambda x: x["text"]),
                        ),
                        ("tfidf", _bigram_tfidf()),
                    ]),
                ),
                (
                    # The step keeps the name "team_features" whatever the
                    # column is, so existing parameter names keep working.
                    "team_features",
                    Pipeline([
                        (
                            "selector",
                            FunctionTransformer(lambda x: x[[column]]),
                        ),
                        (
                            "one hot",
                            OneHotEncoder(handle_unknown="ignore"),
                        ),
                    ]),
                ),
            ]),
        ),
        (
            "svm",
            OneVsRestClassifier(
                SVC(class_weight="balanced", kernel="linear")),
        ),
    ])


def create_model(approach, parameters=None):
    """
    Create the (unfitted) model that corresponds to approach

    approach: str, name of the approach e.g. "tfidf-svm" or "mesh-cnn"
    parameters: str or dict, default None. Parameters passed to the
        model's set_params. A string is parsed with ast.literal_eval, a
        dict is used as is.

    Returns the model. Raises ApproachNotImplemented if approach is not
    one of the known names.
    """
    if approach == "tfidf-svm":
        model = Pipeline([
            ("tfidf", _unigram_tfidf()),
            ("svm", OneVsRestClassifier(_linear_svc())),
        ])
    elif approach == "tfidf-transformers-svm":
        model = TfidfTransformersSVM()
    elif approach == "bert-svm":
        model = Pipeline([
            ("bert", BertVectorizer(pretrained="bert")),
            ("svm", OneVsRestClassifier(_linear_svc())),
        ])
    elif approach == "scibert-svm":
        model = Pipeline([
            ("scibert", BertVectorizer(pretrained="scibert")),
            ("svm", OneVsRestClassifier(_linear_svc())),
        ])
    elif approach == "spacy-textclassifier":
        model = SpacyClassifier()
    elif approach == "bert":
        model = BertClassifier()
    elif approach == "scibert":
        model = BertClassifier(pretrained="scibert")
    elif approach == "classifierchain-tfidf-svm":
        model = Pipeline([
            ("tfidf", _unigram_tfidf()),
            ("svm", ClassifierChain(classifier=_linear_svc())),
        ])
    elif approach == "labelpowerset-tfidf-svm":
        model = Pipeline([
            ("tfidf", _unigram_tfidf()),
            ("svm", LabelPowerset(_linear_svc())),
        ])
    elif approach == "binaryrelevance-tfidf-svm":
        # same as OneVsRestClassifier
        model = Pipeline([
            ("tfidf", _unigram_tfidf()),
            ("svm", BinaryRelevance(classifier=_linear_svc())),
        ])
    elif approach == "binaryrelevance-tfidf-knn":
        model = Pipeline([
            ("tfidf", _unigram_tfidf()),
            # BinaryRelevance needs an estimator instance, not the class.
            ("knn", BinaryRelevance(classifier=KNeighborsClassifier())),
        ])
    elif approach == "hashing_vectorizer-svm":
        model = Pipeline([
            ("hashing_vectorizer", HashingVectorizer()),
            ("svm", _hinge_sgd_ovr()),
        ])
    elif approach == "hashing_vectorizer-nb":
        model = Pipeline([
            (
                "hashing_vectorizer",
                HashingVectorizer(binary=True, n_features=2**18),
            ),
            ("nb", OneVsRestClassifier(MultinomialNB())),
        ])
    elif approach == "tfidf-sgd":
        model = Pipeline([
            (
                "tfidf",
                TfidfVectorizer(stop_words="english",
                                max_df=0.95,
                                min_df=5,
                                ngram_range=(1, 1)),
            ),
            ("svm", _hinge_sgd_ovr()),
        ])
    elif approach == "cnn":
        model = Pipeline([
            ("vec", KerasVectorizer(vocab_size=5_000)),
            (
                "cnn",
                CNNClassifier(
                    learning_rate=0.01,
                    dropout=0.1,
                    nb_epochs=20,
                    nb_layers=4,
                    multilabel=True,
                ),
            ),
        ])
    elif approach == "bilstm":
        model = Pipeline([
            ("vec", KerasVectorizer(vocab_size=5_000, sequence_length=678)),
            (
                "bilstm",
                BiLSTMClassifier(learning_rate=0.01,
                                 dropout=0.1,
                                 nb_epochs=20,
                                 multilabel=True),
            ),
        ])
    elif approach == "doc2vec-sgd":
        model = Pipeline([
            ("vec", Doc2VecVectorizer()),
            ("sgd", _sgd_ovr(alpha=1e-8)),
        ])
    elif approach == "doc2vec-tfidf-sgd":
        model = Pipeline([
            ("vec", _embedding_tfidf_union("doc2vec", Doc2VecVectorizer())),
            ("sgd", _sgd_ovr(alpha=1e-6)),
        ])
    elif approach == "sent2vec-sgd":
        model = Pipeline([
            ("vec", Sent2VecVectorizer(pretrained="biosent2vec")),
            ("sgd", _sgd_ovr(alpha=1e-8)),
        ])
    elif approach == "sent2vec-tfidf-sgd":
        model = Pipeline([
            (
                "vec",
                _embedding_tfidf_union(
                    "sent2vec",
                    Sent2VecVectorizer(pretrained="biosent2vec"),
                ),
            ),
            ("sgd", _sgd_ovr(alpha=1e-8)),
        ])
    elif approach == "tfidf-adaboost":
        model = Pipeline([
            ("tfidf", _bigram_tfidf()),
            (
                "adaboost",
                OneVsRestClassifier(
                    AdaBoostClassifier(DecisionTreeClassifier())),
            ),
        ])
    elif approach == "tfidf-gboost":
        model = Pipeline([
            ("tfidf", _bigram_tfidf()),
            ("gboost", OneVsRestClassifier(GradientBoostingClassifier())),
        ])
    elif approach == "tfidf+onehot_team-svm":
        model = _tfidf_onehot_svm("Team")
    elif approach == "tfidf+onehot_scheme-svm":
        model = _tfidf_onehot_svm("Scheme")
    elif approach == "mesh-tfidf-svm":
        model = MeshTfidfSVM()
    elif approach == "mesh-cnn":
        model = MeshCNN()
    elif approach == "science-ensemble":
        model = ScienceEnsemble()
    elif approach == "mesh-xlinear":
        model = MeshXLinear()
    else:
        raise ApproachNotImplemented

    if parameters:
        # Accept an already-parsed dict as well as its string form.
        if isinstance(parameters, dict):
            params = parameters
        else:
            params = ast.literal_eval(parameters)
        model.set_params(**params)
    return model
def _init_vectorizer(self):
    """Attach a KerasVectorizer (vocab_size=5_000, sequence_length=400)."""
    vectorizer_settings = {"vocab_size": 5_000, "sequence_length": 400}
    self.vectorizer = KerasVectorizer(**vectorizer_settings)