def test_feature_importance_multiclass():
    """Check type, ordering and sign of multiclass feature importances."""
    model = AdeftClassifier('IR', ['HGNC:6091', 'MESH:D011839'])
    corpus, groundings = data['texts'], data['labels']
    model.train(corpus, groundings, C=1.0, ngram_range=(1, 2),
                max_features=1000)

    scores_by_label = model.feature_importances()
    assert isinstance(scores_by_label, dict)
    assert set(scores_by_label) == set(groundings)

    for ranked in scores_by_label.values():
        # Entries must already be in descending score order.
        assert ranked == sorted(ranked, key=lambda pair: -pair[1])
        # Every entry must be a (feature name, float score) 2-tuple.
        for entry in ranked:
            assert isinstance(entry, tuple) and len(entry) == 2
            assert isinstance(entry[0], str)
            assert isinstance(entry[1], float)

    # Selected features must score positively for their grounding. A listed
    # feature that does not occur in the importances is simply not checked.
    expected_positive = {
        'HGNC:6091': ['irs1', 'igf1r', 'signaling'],
        'MESH:D011839': ['radiation', 'exposure', 'dna'],
    }
    for grounding, terms in expected_positive.items():
        for feature, score in scores_by_label[grounding]:
            if feature in terms:
                assert score > 0
def test_training_set_digest():
    """The training-set digest ignores text order but not text content."""
    corpus = data['texts']
    digest_of = AdeftClassifier('?', ['??', '???'])._training_set_digest

    original = digest_of(corpus)
    reordered = digest_of(corpus[::-1])
    truncated = digest_of(corpus[:-1])

    # Same texts in reverse order: digest must match.
    assert original == reordered
    # One text dropped: digest must change.
    assert original != truncated
def test_cv_multiclass():
    """Cross-validate the two-grounding model and sanity-check its stats."""
    grid = dict(C=[1.0], max_features=[1000])
    model = AdeftClassifier('IR', ['HGNC:6091', 'MESH:D011839'])
    corpus, groundings = data['texts'], data['labels']

    model.cv(corpus, groundings, param_grid=grid, cv=2)

    assert model.best_score > 0.5
    stats = model.stats
    # The reported label distribution must equal the raw label counts.
    assert stats['label_distribution'] == dict(Counter(groundings))
    assert stats['precision']['mean'] > 0.5
def test_train():
    """Train on the example corpus and require a weighted F1 above 0.5."""
    model = AdeftClassifier('IR', ['HGNC:6091', 'MESH:D011839'])
    corpus, groundings = data['texts'], data['labels']
    model.train(corpus, groundings, C=1.0, ngram_range=(1, 2),
                max_features=1000)

    assert hasattr(model, 'estimator')

    # Score the fitted model on its own training texts.
    predicted = model.predict(corpus)
    weighted_f1 = f1_score(groundings, predicted,
                           labels=['HGNC:6091', 'MESH:D011839'],
                           average='weighted')
    assert weighted_f1 > 0.5
def test_cv_binary():
    """Cross-validate a one-grounding model against an 'ungrounded' class."""
    grid = dict(C=[1.0], max_features=[1000])
    corpus = data['texts']
    # Collapse every label other than HGNC:6091 into 'ungrounded'.
    binary_labels = []
    for original_label in data['labels']:
        if original_label == 'HGNC:6091':
            binary_labels.append(original_label)
        else:
            binary_labels.append('ungrounded')

    model = AdeftClassifier('IR', ['HGNC:6091'])
    model.cv(corpus, binary_labels, param_grid=grid, cv=2)

    assert model.best_score > 0.5
    stats = model.stats
    assert stats['label_distribution'] == dict(Counter(binary_labels))
    assert stats['precision']['mean'] > 0.5
def test_train():
    """Train a seeded model; check F1, key features and repeatability."""
    train_kwargs = dict(C=1.0, ngram_range=(1, 2), max_features=10)
    model = AdeftClassifier('IR', ['HGNC:6091', 'MESH:D011839'],
                            random_state=1729)
    corpus, groundings = data['texts'], data['labels']

    model.train(corpus, groundings, **train_kwargs)
    assert hasattr(model, 'estimator')
    weighted_f1 = f1_score(groundings, model.predict(corpus),
                           labels=['HGNC:6091', 'MESH:D011839'],
                           average='weighted')
    assert weighted_f1 > 0.5

    insr_importances = model.feature_importances()['HGNC:6091']
    INSR_features, INSR_scores = zip(*insr_importances)
    # The expected terms must be a proper subset of the selected features.
    assert {'irs', 'igf', 'insulin'} < set(INSR_features)
    irs_matches = [score for feature, score in insr_importances
                   if feature == 'irs']
    assert irs_matches[0] > 0

    # Retraining with the same seed must give identical coefficients.
    first_coef = model.estimator.named_steps['logit'].coef_
    model.train(corpus, groundings, **train_kwargs)
    second_coef = model.estimator.named_steps['logit'].coef_
    assert np.array_equal(first_coef, second_coef)
def learn_model(ambig_terms_pmids, params):
    """Cross-validate a classifier for one group of ambiguous terms.

    Parameters
    ----------
    ambig_terms_pmids : tuple
        A pair ``(ambig_terms, term_pmids)``; both are forwarded to
        ``get_papers`` to obtain the training texts and labels.
    params
        Forwarded to ``AdeftClassifier.cv`` as its third positional argument.

    Returns
    -------
    dict or None
        ``{'cl': <model info>, 'ambig': ambig_terms}`` on success, or None
        when fewer than two distinct labels were found or when any label has
        five or fewer examples.
    """
    ambig_terms, term_pmids = ambig_terms_pmids

    print()
    bullet = '\n> '
    terms_str = bullet + bullet.join(map(str, ambig_terms))
    print('Learning model for: %s\n=======' % terms_str)

    texts, labels = get_papers(ambig_terms, term_pmids)

    distinct_labels = set(labels)
    if len(distinct_labels) < 2:
        print('Could not get enough labels for more than one class, skipping.')
        return None

    # At least two labels exist here, so min() is safe; the smallest class
    # decides whether every class has more than five examples.
    label_counts = Counter(labels)
    if min(label_counts.values()) <= 5:
        print('Could not get enough labels for at least one entry, skipping.')
        return None

    cl = AdeftClassifier([ambig_terms[0].text], list(distinct_labels))
    cl.cv(texts, labels, params, cv=5)
    print(cl.stats)
    return {'cl': cl.get_model_info(), 'ambig': ambig_terms}
def test_cv_binary():
    """Cross-validate a seeded binary model and check importance symmetry."""
    grid = dict(C=[1.0], max_features=[10])
    corpus = data['texts']
    # Collapse every label other than HGNC:6091 into 'ungrounded'.
    binary_labels = []
    for original_label in data['labels']:
        if original_label == 'HGNC:6091':
            binary_labels.append(original_label)
        else:
            binary_labels.append('ungrounded')

    model = AdeftClassifier('IR', ['HGNC:6091'], random_state=1729)
    model.cv(corpus, binary_labels, param_grid=grid, cv=2)

    stats = model.stats
    assert stats['f1']['mean'] > 0.5
    assert stats['HGNC:6091']['f1']['mean'] > 0.5

    importances = model.feature_importances()
    insr_importances = importances['HGNC:6091']
    INSR_features, INSR_scores = zip(*insr_importances)
    ungrounded_features, ungrounded_scores = zip(*importances['ungrounded'])

    # Both classes rank the same features; the ungrounded ranking is the
    # grounded one reversed with every score negated.
    assert set(INSR_features) == set(ungrounded_features)
    mirrored = tuple(-value for value in reversed(ungrounded_scores))
    assert INSR_scores == mirrored

    def insr_score(term):
        # First score recorded for `term`; IndexError if the term is absent.
        return [score for feature, score in insr_importances
                if feature == term][0]

    assert insr_score('insulin') > 0
    assert insr_score('group') < 0
def test_feature_importance_binary():
    """Check type, ordering and sign of binary-model feature importances."""
    model = AdeftClassifier('IR', ['HGNC:6091'])
    corpus = data['texts']
    # Collapse every label other than HGNC:6091 into 'ungrounded'.
    binary_labels = []
    for original_label in data['labels']:
        if original_label == 'HGNC:6091':
            binary_labels.append(original_label)
        else:
            binary_labels.append('ungrounded')
    model.train(corpus, binary_labels, C=1.0, ngram_range=(1, 2),
                max_features=1000)

    scores_by_label = model.feature_importances()
    assert isinstance(scores_by_label, dict)
    assert set(scores_by_label) == set(binary_labels)

    for ranked in scores_by_label.values():
        # Entries must already be in descending score order.
        assert ranked == sorted(ranked, key=lambda pair: -pair[1])
        # Every entry must be a (feature name, float score) 2-tuple.
        for entry in ranked:
            assert isinstance(entry, tuple) and len(entry) == 2
            assert isinstance(entry[0], str)
            assert isinstance(entry[1], float)

    # Selected features must score positively for HGNC:6091. A listed feature
    # that does not occur in the importances is simply not checked.
    expected_terms = ['irs1', 'igf1r', 'phosphorylation']
    for feature, score in scores_by_label['HGNC:6091']:
        if feature in expected_terms:
            assert score > 0
def test_cv_multiclass():
    """Cross-validate a seeded multiclass model and confirm repeatability."""
    grid = dict(C=[1.0], max_features=[10])
    model = AdeftClassifier('IR', ['HGNC:6091', 'MESH:D011839'],
                            random_state=1729)
    corpus, groundings = data['texts'], data['labels']

    def logit_coef():
        # Coefficients of the 'logit' step of the currently fitted pipeline.
        return model.estimator.named_steps['logit'].coef_

    model.cv(corpus, groundings, param_grid=grid, cv=2)
    stats = model.stats
    assert stats['f1']['mean'] > 0.5
    assert stats['ungrounded']['f1']['mean'] > 0.5

    # Running cross-validation again with the same seed must reproduce the
    # coefficients exactly.
    first_coef = logit_coef()
    model.cv(corpus, groundings, param_grid=grid, cv=2)
    second_coef = logit_coef()
    assert np.array_equal(first_coef, second_coef)
return txts if __name__ == '__main__': # Approach 1: get training examples from curations # act_txts, amt_txts = get_curation_texts() # Approach 2: get training examples from Signor sentences signor_act, signor_amt = get_signor_stmts() act_txts, amt_txts = get_signor_xor_texts(signor_act, signor_amt) # Approach 3: use Signor to find A->B pairs that are exclusively # activity or amount regulations, and then find corresponding evidence # sentences from reading for A->B Statements. # TODO # Prepare training examples and labels texts = act_txts + amt_txts labels = ['act'] * len(act_txts) + ['amt'] * len(amt_txts) # Create classifier cl = AdeftClassifier(texts, labels) param_grid = { 'C': [10.0], 'max_features': [100, 1000], 'ngram_range': [(1, 2)] } # Do cross-validation cl.cv(texts, labels, param_grid, cv=5) print(cl.stats) cl_model_info = cl.get_model_info()