示例#1
0
def test_feature_importance_multiclass():
    """Validate ``feature_importances()`` after training with two labels.

    Checks that the result is a dict keyed by exactly the training labels,
    that each per-label list is ordered by descending score and holds
    ``(str, float)`` pairs, and that a few hand-picked features score above
    zero for their label. Hand-picked features that do not appear in a list
    are simply not checked.
    """
    classifier = AdeftClassifier('IR', ['HGNC:6091', 'MESH:D011839'])
    texts, labels = data['texts'], data['labels']
    classifier.train(texts, labels,
                     C=1.0, ngram_range=(1, 2), max_features=1000)

    feature_importances = classifier.feature_importances()
    assert isinstance(feature_importances, dict)
    assert set(feature_importances) == set(labels)

    for ranked in feature_importances.values():
        # Re-sorting by descending score must leave the list unchanged.
        assert ranked == sorted(ranked, key=lambda pair: -pair[1])
        # Every entry is a (feature name, score) pair of the right types.
        for entry in ranked:
            assert isinstance(entry, tuple)
            assert len(entry) == 2
            assert isinstance(entry[0], str)
            assert isinstance(entry[1], float)

    # Features expected to count in favour of each label.
    expected_positive = {
        'HGNC:6091': ['irs1', 'igf1r', 'signaling'],
        'MESH:D011839': ['radiation', 'exposure', 'dna'],
    }
    for label, chosen in expected_positive.items():
        assert all(score > 0
                   for feature, score in feature_importances[label]
                   if feature in chosen)
示例#2
0
def test_training_set_digest():
    """Check that the training-set digest ignores order but not content.

    Reversing the texts must give the same digest; dropping the last text
    must give a different one.
    """
    classifier = AdeftClassifier('?', ['??', '???'])
    corpus = data['texts']
    digest_of = classifier._training_set_digest

    full = digest_of(corpus)
    reordered = digest_of(corpus[::-1])
    truncated = digest_of(corpus[:-1])

    assert full == reordered
    assert full != truncated
示例#3
0
def test_cv_multiclass():
    """Run ``cv`` with two positive labels and sanity-check the results.

    Expects a best score and mean precision above 0.5, and a recorded label
    distribution equal to the label counts of the input.
    """
    grid = {'C': [1.0], 'max_features': [1000]}
    classifier = AdeftClassifier('IR', ['HGNC:6091', 'MESH:D011839'])
    texts, labels = data['texts'], data['labels']

    classifier.cv(texts, labels, param_grid=grid, cv=2)

    assert classifier.best_score > 0.5
    expected_distribution = dict(Counter(labels))
    assert classifier.stats['label_distribution'] == expected_distribution
    mean_precision = classifier.stats['precision']['mean']
    assert mean_precision > 0.5
示例#4
0
def test_train():
    """Train on the fixture data and check the fit on that same data.

    After training, the classifier must expose an ``estimator`` attribute and
    its predictions on the training texts must reach a weighted F1 above 0.5
    over the two positive labels.
    """
    classifier = AdeftClassifier('IR', ['HGNC:6091', 'MESH:D011839'])
    texts, labels = data['texts'], data['labels']
    classifier.train(texts, labels,
                     C=1.0, ngram_range=(1, 2), max_features=1000)

    assert hasattr(classifier, 'estimator')

    predictions = classifier.predict(texts)
    weighted_f1 = f1_score(labels, predictions,
                           labels=['HGNC:6091', 'MESH:D011839'],
                           average='weighted')
    assert weighted_f1 > 0.5
示例#5
0
def test_cv_binary():
    """Run ``cv`` on a binarised label set and sanity-check the results.

    Every label other than ``'HGNC:6091'`` is replaced by ``'ungrounded'``
    before cross-validation. Expects a best score and mean precision above
    0.5, and a recorded label distribution equal to the binarised counts.
    """
    grid = {'C': [1.0], 'max_features': [1000]}
    texts = data['texts']

    # Collapse all labels except the single positive one.
    labels = []
    for original in data['labels']:
        if original == 'HGNC:6091':
            labels.append(original)
        else:
            labels.append('ungrounded')

    classifier = AdeftClassifier('IR', ['HGNC:6091'])
    classifier.cv(texts, labels, param_grid=grid, cv=2)

    assert classifier.best_score > 0.5
    expected_distribution = dict(Counter(labels))
    assert classifier.stats['label_distribution'] == expected_distribution
    mean_precision = classifier.stats['precision']['mean']
    assert mean_precision > 0.5
示例#6
0
def test_train():
    """Train with a fixed random state; check fit, features, repeatability.

    Verifies that training sets an ``estimator`` attribute, that weighted F1
    on the training texts exceeds 0.5, that 'irs', 'igf' and 'insulin' are
    among the features reported for 'HGNC:6091' with 'irs' scoring above
    zero, and that training a second time with the same inputs gives
    identical coefficients in the 'logit' pipeline step.
    """
    train_kwargs = {'C': 1.0, 'ngram_range': (1, 2), 'max_features': 10}
    classifier = AdeftClassifier('IR', ['HGNC:6091', 'MESH:D011839'],
                                 random_state=1729)
    texts, labels = data['texts'], data['labels']
    classifier.train(texts, labels, **train_kwargs)

    assert hasattr(classifier, 'estimator')
    predictions = classifier.predict(texts)
    weighted_f1 = f1_score(labels, predictions,
                           labels=['HGNC:6091', 'MESH:D011839'],
                           average='weighted')
    assert weighted_f1 > 0.5

    importances = classifier.feature_importances()
    insr_features, _insr_scores = zip(*importances['HGNC:6091'])
    # Strict-subset comparison: the three names must all be reported features.
    assert {'irs', 'igf', 'insulin'} < set(insr_features)
    irs_scores = [score
                  for feature, score in importances['HGNC:6091']
                  if feature == 'irs']
    assert irs_scores[0] > 0

    # Retraining with identical inputs must reproduce the coefficients.
    first_coef = classifier.estimator.named_steps['logit'].coef_
    classifier.train(texts, labels, **train_kwargs)
    second_coef = classifier.estimator.named_steps['logit'].coef_
    assert np.array_equal(first_coef, second_coef)
示例#7
0
文件: learn.py 项目: steppi/gilda
def learn_model(ambig_terms_pmids, params):
    """Cross-validate a classifier for one group of ambiguous terms.

    Parameters
    ----------
    ambig_terms_pmids : tuple
        A pair ``(ambig_terms, term_pmids)``. Both elements are handed to
        ``get_papers``; the ``text`` attribute of the first term is used as
        the classifier's only shortform.
    params :
        Forwarded unchanged as the third positional argument of
        ``AdeftClassifier.cv``.

    Returns
    -------
    dict or None
        ``{'cl': <model info>, 'ambig': ambig_terms}`` when a model was
        fitted. ``None`` when fewer than two distinct labels were found or
        when any label has five or fewer examples.
    """
    ambig_terms, term_pmids = ambig_terms_pmids

    print()
    # One "> term" line per ambiguous term, each preceded by a newline.
    terms_str = '\n> ' + '\n> '.join(map(str, ambig_terms))
    print('Learning model for: %s\n=======' % terms_str)

    texts, labels = get_papers(ambig_terms, term_pmids)

    # Guard 1: a classifier needs at least two classes.
    if len(set(labels)) < 2:
        print('Could not get enough labels for more than one class, skipping.')
        return None

    # Guard 2: every class needs more than five examples.
    per_label = Counter(labels)
    if any(count <= 5 for count in per_label.values()):
        print('Could not get enough labels for at least one entry, skipping.')
        return None

    shortforms = [ambig_terms[0].text]
    cl = AdeftClassifier(shortforms, list(set(labels)))
    cl.cv(texts, labels, params, cv=5)
    print(cl.stats)
    return {'cl': cl.get_model_info(), 'ambig': ambig_terms}
示例#8
0
def test_cv_binary():
    """Run ``cv`` on binarised labels and check stats and importances.

    Every label other than ``'HGNC:6091'`` is replaced by ``'ungrounded'``.
    Expects overall and per-label mean F1 above 0.5, the two labels to report
    the same feature set with scores that are exact negations in reverse
    order, and 'insulin' / 'group' to score above / below zero for
    'HGNC:6091'.
    """
    grid = {'C': [1.0], 'max_features': [10]}
    texts = data['texts']

    # Collapse all labels except the single positive one.
    labels = []
    for original in data['labels']:
        if original == 'HGNC:6091':
            labels.append(original)
        else:
            labels.append('ungrounded')

    classifier = AdeftClassifier('IR', ['HGNC:6091'], random_state=1729)
    classifier.cv(texts, labels, param_grid=grid, cv=2)
    assert classifier.stats['f1']['mean'] > 0.5
    assert classifier.stats['HGNC:6091']['f1']['mean'] > 0.5

    importances = classifier.feature_importances()
    insr_features, insr_scores = zip(*importances['HGNC:6091'])
    other_features, other_scores = zip(*importances['ungrounded'])
    assert set(insr_features) == set(other_features)
    # The two rankings must mirror each other with opposite signs.
    mirrored = tuple(-score for score in reversed(other_scores))
    assert insr_scores == mirrored

    insulin_scores = [score
                      for feature, score in importances['HGNC:6091']
                      if feature == 'insulin']
    assert insulin_scores[0] > 0
    group_scores = [score
                    for feature, score in importances['HGNC:6091']
                    if feature == 'group']
    assert group_scores[0] < 0
示例#9
0
def test_feature_importance_binary():
    """Validate ``feature_importances()`` after training on binarised labels.

    Every label other than ``'HGNC:6091'`` is replaced by ``'ungrounded'``.
    Checks that the result is a dict keyed by exactly the training labels,
    that each per-label list is ordered by descending score and holds
    ``(str, float)`` pairs, and that a few hand-picked features score above
    zero for 'HGNC:6091'. Hand-picked features that do not appear in the
    list are simply not checked.
    """
    classifier = AdeftClassifier('IR', ['HGNC:6091'])
    texts = data['texts']

    # Collapse all labels except the single positive one.
    labels = []
    for original in data['labels']:
        if original == 'HGNC:6091':
            labels.append(original)
        else:
            labels.append('ungrounded')

    classifier.train(texts, labels,
                     C=1.0, ngram_range=(1, 2), max_features=1000)
    feature_importances = classifier.feature_importances()
    assert isinstance(feature_importances, dict)
    assert set(feature_importances) == set(labels)

    for ranked in feature_importances.values():
        # Re-sorting by descending score must leave the list unchanged.
        assert ranked == sorted(ranked, key=lambda pair: -pair[1])
        # Every entry is a (feature name, score) pair of the right types.
        for entry in ranked:
            assert isinstance(entry, tuple)
            assert len(entry) == 2
            assert isinstance(entry[0], str)
            assert isinstance(entry[1], float)

    # Features expected to count in favour of the positive label.
    chosen = ['irs1', 'igf1r', 'phosphorylation']
    assert all(score > 0
               for feature, score in feature_importances['HGNC:6091']
               if feature in chosen)
示例#10
0
def test_cv_multiclass():
    """Run ``cv`` with two positive labels and a fixed random state.

    Expects overall mean F1 and the mean F1 recorded under 'ungrounded' to
    exceed 0.5, and a second ``cv`` run on the same inputs to give identical
    coefficients in the 'logit' pipeline step.
    """
    grid = {'C': [1.0], 'max_features': [10]}
    classifier = AdeftClassifier('IR', ['HGNC:6091', 'MESH:D011839'],
                                 random_state=1729)
    texts, labels = data['texts'], data['labels']

    classifier.cv(texts, labels, param_grid=grid, cv=2)
    assert classifier.stats['f1']['mean'] > 0.5
    assert classifier.stats['ungrounded']['f1']['mean'] > 0.5

    # Cross-validating again with identical inputs must be reproducible.
    first_coef = classifier.estimator.named_steps['logit'].coef_
    classifier.cv(texts, labels, param_grid=grid, cv=2)
    second_coef = classifier.estimator.named_steps['logit'].coef_
    assert np.array_equal(first_coef, second_coef)
示例#11
0
    return txts


# Script entry point: build an 'act' vs 'amt' labelled text set from Signor
# statements, cross-validate an AdeftClassifier on it, print the resulting
# statistics and fetch the model info.
if __name__ == '__main__':
    # Approach 1: get training examples from curations
    # act_txts, amt_txts = get_curation_texts()

    # Approach 2: get training examples from Signor sentences
    signor_act, signor_amt = get_signor_stmts()
    act_txts, amt_txts = get_signor_xor_texts(signor_act, signor_amt)

    # Approach 3: use Signor to find A->B pairs that are exclusively
    # activity or amount regulations, and then find corresponding evidence
    # sentences from reading for A->B Statements.
    # TODO

    # Prepare training examples and labels
    # Texts are concatenated activity-first; labels are built in the same
    # order so that labels[i] describes texts[i].
    texts = act_txts + amt_txts
    labels = ['act'] * len(act_txts) + ['amt'] * len(amt_txts)
    # Create classifier
    # NOTE(review): the constructor receives the full list of training texts
    # and the per-example (repeated) label list -- confirm these are the
    # arguments AdeftClassifier expects.
    cl = AdeftClassifier(texts, labels)
    # Candidate hyperparameter values handed to cv(); each key maps to the
    # list of values to consider.
    param_grid = {
        'C': [10.0],
        'max_features': [100, 1000],
        'ngram_range': [(1, 2)]
    }
    # Do cross-validation
    cl.cv(texts, labels, param_grid, cv=5)
    print(cl.stats)
    cl_model_info = cl.get_model_info()