def test_vanilla():
    """Fit a vectorizer + BiLSTM pipeline and check its score on the training data."""
    texts = ["One", "One only", "Two nothing else", "Two and three"]
    labels = np.array([0, 0, 1, 1])

    steps = [
        ('vec', KerasVectorizer()),
        ('clf', BiLSTMClassifier(nb_epochs=10)),
    ]
    pipeline = Pipeline(steps)
    pipeline.fit(texts, labels)

    # Scored on the same four samples it was trained on.
    train_score = pipeline.score(texts, labels)
    assert train_score > 0.6
def test_predict_proba():
    """Check that predict_proba yields one row per sample with values in [0, 1]."""
    X = ["One", "One only", "Two nothing else", "Two and three"]
    Y = np.array([0, 0, 1, 1])

    model = Pipeline([
        ('vec', KerasVectorizer()),
        ('clf', BiLSTMClassifier()),
    ])
    model.fit(X, Y)

    Y_pred_prob = np.asarray(model.predict_proba(X))

    # One prediction row per input sample.
    assert Y_pred_prob.shape[0] == Y.shape[0]
    # np.all reduces over every axis, so the bounds checks stay a single
    # boolean whatever the dimensionality of the returned probabilities
    # (builtin sum() would produce an array for 2-D output).
    assert np.all(Y_pred_prob >= 0)
    assert np.all(Y_pred_prob <= 1)
def test_threshold():
    """Check that predict() equals predict_proba() cut at the configured threshold."""
    threshold = 0.1
    texts = ["One", "One only", "Two nothing else", "Two and three"]
    labels = np.array([0, 0, 1, 1])

    vectorizer = KerasVectorizer()
    classifier = BiLSTMClassifier(threshold=threshold)
    pipeline = Pipeline([('vec', vectorizer), ('clf', classifier)])
    pipeline.fit(texts, labels)

    # Expected labels are derived by hand from the probabilities.
    probabilities = pipeline.predict_proba(texts)
    expected = probabilities > threshold
    predicted = pipeline.predict(texts)

    assert np.array_equal(expected, predicted)
def test_multilabel():
    """Fit the pipeline on a 5x4 multilabel target and check score and output shape."""
    texts = [
        "One and two",
        "One only",
        "Three and four, nothing else",
        "Two nothing else",
        "Two and three",
    ]
    label_matrix = np.array([
        [1, 1, 0, 0],
        [1, 0, 0, 0],
        [0, 0, 1, 1],
        [0, 1, 0, 0],
        [0, 1, 1, 0],
    ])

    steps = [
        ('vec', KerasVectorizer()),
        ('clf', BiLSTMClassifier(multilabel=True)),
    ]
    pipeline = Pipeline(steps)
    pipeline.fit(texts, label_matrix)

    assert pipeline.score(texts, label_matrix) > 0.4

    # Predictions must have one row per sample and one column per label.
    expected_shape = (5, 4)
    assert pipeline.predict(texts).shape == expected_shape
def test_early_stopping():
    """Fit with early stopping enabled and a very large epoch budget."""
    texts = ["One", "One only", "Two nothing else", "Two and three"]
    labels = np.array([0, 0, 1, 1])

    classifier = BiLSTMClassifier(early_stopping=True, nb_epochs=10000)
    pipeline = Pipeline([('vec', KerasVectorizer()), ('clf', classifier)])

    # The epoch budget is deliberately huge: this test only finishes in a
    # reasonable time when early stopping actually cuts training short.
    pipeline.fit(texts, labels)

    train_score = pipeline.score(texts, labels)
    assert train_score > 0.6
def create_model(approach, parameters=None):
    """Build the model registered under the name ``approach``.

    Args:
        approach: Name of the model to build (e.g. ``"tfidf-svm"``,
            ``"bilstm"``, ``"mesh-cnn"``).
        parameters: Optional overrides passed to ``model.set_params``.
            Either a string holding a Python dict literal (parsed with
            ``ast.literal_eval``) or an already-parsed dict. Falsy values
            (``None``, ``""``, ``{}``) leave the model's defaults untouched.

    Returns:
        The constructed model; it is not fitted here.

    Raises:
        ApproachNotImplemented: If ``approach`` is not a known name.
    """
    if approach == "tfidf-svm":
        model = Pipeline([
            (
                "tfidf",
                TfidfVectorizer(
                    stop_words="english",
                    max_df=0.95,
                    min_df=0.0,
                    ngram_range=(1, 1),
                ),
            ),
            ("svm", OneVsRestClassifier(SVC(kernel="linear", probability=True))),
        ])
    elif approach == "tfidf-transformers-svm":
        model = TfidfTransformersSVM()
    elif approach == "bert-svm":
        model = Pipeline([
            ("bert", BertVectorizer(pretrained="bert")),
            ("svm", OneVsRestClassifier(SVC(kernel="linear", probability=True))),
        ])
    elif approach == "scibert-svm":
        model = Pipeline([
            ("scibert", BertVectorizer(pretrained="scibert")),
            ("svm", OneVsRestClassifier(SVC(kernel="linear", probability=True))),
        ])
    elif approach == "spacy-textclassifier":
        model = SpacyClassifier()
    elif approach == "bert":
        model = BertClassifier()
    elif approach == "scibert":
        model = BertClassifier(pretrained="scibert")
    elif approach == "classifierchain-tfidf-svm":
        model = Pipeline([
            (
                "tfidf",
                TfidfVectorizer(
                    stop_words="english",
                    max_df=0.95,
                    min_df=0.0,
                    ngram_range=(1, 1),
                ),
            ),
            (
                "svm",
                ClassifierChain(
                    classifier=SVC(kernel="linear", probability=True)),
            ),
        ])
    elif approach == "labelpowerset-tfidf-svm":
        model = Pipeline([
            (
                "tfidf",
                TfidfVectorizer(
                    stop_words="english",
                    max_df=0.95,
                    min_df=0.0,
                    ngram_range=(1, 1),
                ),
            ),
            ("svm", LabelPowerset(SVC(kernel="linear", probability=True))),
        ])
    elif approach == "binaryrelevance-tfidf-svm":
        # same as OneVsRestClassifier
        model = Pipeline([
            (
                "tfidf",
                TfidfVectorizer(
                    stop_words="english",
                    max_df=0.95,
                    min_df=0.0,
                    ngram_range=(1, 1),
                ),
            ),
            (
                "svm",
                BinaryRelevance(
                    classifier=SVC(kernel="linear", probability=True)),
            ),
        ])
    elif approach == "binaryrelevance-tfidf-knn":
        model = Pipeline([
            (
                "tfidf",
                TfidfVectorizer(
                    stop_words="english",
                    max_df=0.95,
                    min_df=0.0,
                    ngram_range=(1, 1),
                ),
            ),
            # BinaryRelevance needs an estimator *instance* (as in the SVC
            # variants above); passing the bare class breaks at fit time.
            ("knn", BinaryRelevance(classifier=KNeighborsClassifier())),
        ])
    elif approach == "hashing_vectorizer-svm":
        model = Pipeline([
            ("hashing_vectorizer", HashingVectorizer()),
            ("svm",
             OneVsRestClassifier(SGDClassifier(loss="hinge", penalty="l2"))),
        ])
    elif approach == "hashing_vectorizer-nb":
        model = Pipeline([
            (
                "hashing_vectorizer",
                HashingVectorizer(binary=True, n_features=2**18),
            ),
            ("nb", OneVsRestClassifier(MultinomialNB())),
        ])
    elif approach == "tfidf-sgd":
        model = Pipeline([
            (
                "tfidf",
                TfidfVectorizer(stop_words="english",
                                max_df=0.95,
                                min_df=5,
                                ngram_range=(1, 1)),
            ),
            ("svm",
             OneVsRestClassifier(SGDClassifier(loss="hinge", penalty="l2"))),
        ])
    elif approach == "cnn":
        model = Pipeline([
            ("vec", KerasVectorizer(vocab_size=5_000)),
            (
                "cnn",
                CNNClassifier(
                    learning_rate=0.01,
                    dropout=0.1,
                    nb_epochs=20,
                    nb_layers=4,
                    multilabel=True,
                ),
            ),
        ])
    elif approach == "bilstm":
        model = Pipeline([
            ("vec", KerasVectorizer(vocab_size=5_000, sequence_length=678)),
            (
                "bilstm",
                BiLSTMClassifier(learning_rate=0.01,
                                 dropout=0.1,
                                 nb_epochs=20,
                                 multilabel=True),
            ),
        ])
    elif approach == "doc2vec-sgd":
        model = Pipeline([
            ("vec", Doc2VecVectorizer()),
            (
                "sgd",
                OneVsRestClassifier(SGDClassifier(penalty="l2", alpha=1e-8),
                                    n_jobs=-1),
            ),
        ])
    elif approach == "doc2vec-tfidf-sgd":
        model = Pipeline([
            (
                "vec",
                FeatureUnion([
                    (
                        "doc2vec",
                        Pipeline([
                            ("doc2vec_unscaled", Doc2VecVectorizer()),
                            ("scale_doc2vec", Normalizer()),
                        ]),
                    ),
                    (
                        "tfidf",
                        Pipeline([
                            (
                                "tfidf_unscaled",
                                TfidfVectorizer(
                                    min_df=5,
                                    stop_words="english",
                                    ngram_range=(1, 2),
                                ),
                            ),
                            ("scale_tfidf", Normalizer()),
                        ]),
                    ),
                ]),
            ),
            (
                "sgd",
                OneVsRestClassifier(SGDClassifier(penalty="l2", alpha=1e-6),
                                    n_jobs=-1),
            ),
        ])
    elif approach == "sent2vec-sgd":
        model = Pipeline([
            ("vec", Sent2VecVectorizer(pretrained="biosent2vec")),
            (
                "sgd",
                OneVsRestClassifier(SGDClassifier(penalty="l2", alpha=1e-8),
                                    n_jobs=-1),
            ),
        ])
    elif approach == "sent2vec-tfidf-sgd":
        model = Pipeline([
            (
                "vec",
                FeatureUnion([
                    (
                        "sent2vec",
                        Pipeline([
                            (
                                "sent2vec_unscaled",
                                Sent2VecVectorizer(pretrained="biosent2vec"),
                            ),
                            ("scale_sent2vec", Normalizer()),
                        ]),
                    ),
                    (
                        "tfidf",
                        Pipeline([
                            (
                                "tfidf_unscaled",
                                TfidfVectorizer(
                                    min_df=5,
                                    stop_words="english",
                                    ngram_range=(1, 2),
                                ),
                            ),
                            ("scale_tfidf", Normalizer()),
                        ]),
                    ),
                ]),
            ),
            (
                "sgd",
                OneVsRestClassifier(SGDClassifier(penalty="l2", alpha=1e-8),
                                    n_jobs=-1),
            ),
        ])
    elif approach == "tfidf-adaboost":
        model = Pipeline([
            (
                "tfidf",
                TfidfVectorizer(min_df=5,
                                stop_words="english",
                                ngram_range=(1, 2)),
            ),
            (
                "adaboost",
                OneVsRestClassifier(
                    AdaBoostClassifier(DecisionTreeClassifier())),
            ),
        ])
    elif approach == "tfidf-gboost":
        model = Pipeline([
            (
                "tfidf",
                TfidfVectorizer(min_df=5,
                                stop_words="english",
                                ngram_range=(1, 2)),
            ),
            ("gboost", OneVsRestClassifier(GradientBoostingClassifier())),
        ])
    elif approach == "tfidf+onehot_team-svm":
        model = Pipeline([
            (
                "vectorizer",
                FeatureUnion([
                    (
                        "text_features",
                        Pipeline([
                            (
                                "selector",
                                FunctionTransformer(lambda x: x["text"]),
                            ),
                            (
                                "tfidf",
                                TfidfVectorizer(
                                    min_df=5,
                                    ngram_range=(1, 2),
                                    stop_words="english",
                                ),
                            ),
                        ]),
                    ),
                    (
                        "team_features",
                        Pipeline([
                            (
                                "selector",
                                FunctionTransformer(lambda x: x[["Team"]]),
                            ),
                            (
                                "one hot",
                                OneHotEncoder(handle_unknown="ignore"),
                            ),
                        ]),
                    ),
                ]),
            ),
            (
                "svm",
                OneVsRestClassifier(
                    SVC(class_weight="balanced", kernel="linear")),
            ),
        ])
    elif approach == "tfidf+onehot_scheme-svm":
        model = Pipeline([
            (
                "vectorizer",
                FeatureUnion([
                    (
                        "text_features",
                        Pipeline([
                            (
                                "selector",
                                FunctionTransformer(lambda x: x["text"]),
                            ),
                            (
                                "tfidf",
                                TfidfVectorizer(
                                    min_df=5,
                                    ngram_range=(1, 2),
                                    stop_words="english",
                                ),
                            ),
                        ]),
                    ),
                    (
                        # Step name kept as "team_features" even though it
                        # selects "Scheme": set_params keys depend on it.
                        "team_features",
                        Pipeline([
                            (
                                "selector",
                                FunctionTransformer(lambda x: x[["Scheme"]]),
                            ),
                            (
                                "one hot",
                                OneHotEncoder(handle_unknown="ignore"),
                            ),
                        ]),
                    ),
                ]),
            ),
            (
                "svm",
                OneVsRestClassifier(
                    SVC(class_weight="balanced", kernel="linear")),
            ),
        ])
    elif approach == "mesh-tfidf-svm":
        model = MeshTfidfSVM()
    elif approach == "mesh-cnn":
        model = MeshCNN()
    elif approach == "science-ensemble":
        model = ScienceEnsemble()
    elif approach == "mesh-xlinear":
        model = MeshXLinear()
    else:
        raise ApproachNotImplemented

    if parameters:
        # Accept an already-parsed dict as well as its string form;
        # literal_eval only evaluates Python literals, never arbitrary code.
        if isinstance(parameters, dict):
            params = parameters
        else:
            params = ast.literal_eval(parameters)
        model.set_params(**params)

    return model
def test_save_load_attention():
    """Save an attention BiLSTM, load it into a fresh instance and score it."""
    texts = ["One", "One only", "Two nothing else", "Two and three"]
    labels = np.array([0, 0, 1, 1])

    vectorizer = KerasVectorizer()
    features = vectorizer.fit_transform(texts)

    trained = BiLSTMClassifier(attention=True)
    trained.fit(features, labels)

    with tempfile.TemporaryDirectory() as tmp_dir:
        trained.save(tmp_dir)

        # Load into a classifier built with default arguments.
        restored = BiLSTMClassifier()
        restored.load(tmp_dir)

        assert hasattr(restored, 'model')
        assert restored.score(features, labels) > 0.6
# Usage example: train and score a KerasVectorizer + BiLSTMClassifier
# pipeline, first on a single-label target and then on a multilabel one.
from wellcomeml.ml.bilstm import BiLSTMClassifier
from wellcomeml.ml.keras_vectorizer import KerasVectorizer
from sklearn.pipeline import Pipeline
import numpy as np

# --- Single-label target: one 0/1 label per text ---
X = ["One", "three", "one", "two", "four"]
Y = np.array([1, 0, 1, 0, 0])

single_label_steps = [
    ("vec", KerasVectorizer()),
    ("clf", BiLSTMClassifier()),
]
bilstm_pipeline = Pipeline(single_label_steps)
bilstm_pipeline.fit(X, Y)
single_label_score = bilstm_pipeline.score(X, Y)
print(single_label_score)

# --- Multilabel target: one row of 0/1 indicators per text ---
X = ["One, three", "one", "two, three"]
Y = np.array([
    [1, 0, 1],
    [1, 0, 0],
    [0, 1, 1],
])

multilabel_steps = [
    ("vec", KerasVectorizer()),
    ("clf", BiLSTMClassifier(multilabel=True)),
]
bilstm_pipeline = Pipeline(multilabel_steps)
bilstm_pipeline.fit(X, Y)
multilabel_score = bilstm_pipeline.score(X, Y)
print(multilabel_score)