Example #1
def test_singleFeature_entityTypes():
	trainCorpus, devCorpus = generateTestData(positiveCount=100,negativeCount=100,relTypes=1)

	predictionCorpus = devCorpus.clone()
	predictionCorpus.removeRelations()

	classifier = kindred.RelationClassifier(features=['entityTypes'])
	classifier.train(trainCorpus)
	
	classifier.predict(predictionCorpus)
	
	f1score = kindred.evaluate(devCorpus, predictionCorpus, metric='f1score')
	assert round(f1score,3) == 1.0
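
These snippets are pytest test functions from kindred's test suite and rely on a shared harness that is not reproduced here. A minimal sketch of the imports they assume follows; generateTestData is a helper from kindred's own test utilities that builds small synthetic (train, test) Corpus pairs, and the module path shown for it is an assumption.

# Imports assumed by the examples in this section (a sketch, not part of any single example).
# generateTestData is a synthetic-data helper from kindred's test utilities, not the
# public API; the import path below is an assumption.
import pickle
import tempfile

import numpy as np
import pytest

import kindred
from kindred.datageneration import generateTestData  # assumed location of the helper
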
Example #2
def test_logisticregression_threshold():
	trainCorpus, devCorpus = generateTestData(positiveCount=100,negativeCount=100,relTypes=1)

	predictionCorpus = devCorpus.clone()
	predictionCorpus.removeRelations()

	classifier = kindred.RelationClassifier(classifierType='LogisticRegression',threshold=0.3)
	classifier.train(trainCorpus)
	
	classifier.predict(predictionCorpus)
	
	f1score = kindred.evaluate(devCorpus, predictionCorpus, metric='f1score')
	assert round(f1score,3) == 1.0
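
Example #2 passes threshold=0.3 alongside classifierType='LogisticRegression'. As a rough illustration of what a probability threshold does, here is a generic scikit-learn sketch with made-up data; it is not kindred's internal code, but a lower threshold generally trades precision for recall by accepting lower-confidence positives.

# Generic illustration of applying a custom decision threshold to logistic-regression
# probabilities (plain scikit-learn with toy data, not kindred internals).
import numpy as np
from sklearn.linear_model import LogisticRegression

X = np.array([[0.0], [0.2], [0.4], [0.6], [0.8], [1.0]])
y = np.array([0, 0, 0, 1, 1, 1])

clf = LogisticRegression().fit(X, y)
probs = clf.predict_proba(X)[:, 1]            # probability of the positive class
threshold = 0.3                               # lower threshold -> more predicted positives
predictions = (probs >= threshold).astype(int)
print(predictions)
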
Example #3
def test_noTFIDF():
	trainCorpus, devCorpus = generateTestData(positiveCount=100,negativeCount=100,relTypes=1)

	predictionCorpus = devCorpus.clone()
	predictionCorpus.removeRelations()

	classifier = kindred.RelationClassifier(tfidf=False)
	classifier.train(trainCorpus)
	
	classifier.predict(predictionCorpus)
	
	f1score = kindred.evaluate(devCorpus, predictionCorpus, metric='f1score')
	assert round(f1score,3) == 1.0
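
Example #3 builds the classifier with tfidf=False, so feature vectors stay as raw counts rather than tf-idf weights (compare the integer column sums in Example #18 with the fractional ones in Example #16). For readers unfamiliar with the effect, a minimal generic sketch of tf-idf reweighting, using plain scikit-learn rather than kindred's implementation:

# Generic tf-idf reweighting of a small count matrix (scikit-learn, not kindred code).
# Terms that appear in fewer rows receive relatively higher weights.
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer

counts = np.array([[3, 0, 1],
                   [2, 1, 0],
                   [3, 0, 2]])
weighted = TfidfTransformer().fit_transform(counts)
print(weighted.toarray())
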
Example #4
def test_predicting_duplicates():
	trainCorpus, testCorpus = generateTestData(positiveCount=100,negativeCount=100,relTypes=1)

	testCorpus.removeRelations()

	classifier = kindred.RelationClassifier()
	classifier.train(trainCorpus)

	classifier.predict(testCorpus)
	classifier.predict(testCorpus)
	classifier.predict(testCorpus)

	relations = [ r for doc in testCorpus.documents for r in doc.relations ]
	assert len(relations) == len(set(relations)), "Duplicate relations found in predictions"
Example #5
def test_simpleRelationClassifier_emptyTestCorpus():
    trainCorpus, testCorpus = generateTestData(positiveCount=100,
                                               negativeCount=100)

    for doc in testCorpus.documents:
        doc.entities = []
        doc.relations = []

    classifier = kindred.RelationClassifier()
    classifier.train(trainCorpus)

    classifier.predict(testCorpus)

    assert len(testCorpus.getRelations()) == 0
Example #6
def test_simpleRelationClassifier_emptyTrainCorpus():
    trainCorpus, testCorpus = generateTestData(positiveCount=100,
                                               negativeCount=100)

    for doc in trainCorpus.documents:
        doc.entities = []
        doc.relations = []

    classifier = kindred.RelationClassifier()

    with pytest.raises(RuntimeError) as excinfo:
        classifier.train(trainCorpus)
    assert excinfo.value.args == (
        'No candidate relations found in corpus for training', )
Example #7
def test_vectorizer_dependencyPathEdges_noTFIDF():
    corpus1, _ = generateTestData(positiveCount=5, negativeCount=5)
    corpus2, _ = generateTestData(positiveCount=10, negativeCount=10)

    parser = kindred.Parser()
    parser.parse(corpus1)
    parser.parse(corpus2)

    candidateBuilder = kindred.CandidateBuilder()
    candidateRelations1 = candidateBuilder.build(corpus1)
    candidateRelations2 = candidateBuilder.build(corpus2)

    chosenFeatures = ["dependencyPathEdges"]
    vectorizer = kindred.Vectorizer(featureChoice=chosenFeatures, tfidf=False)

    matrix1 = vectorizer.fit_transform(candidateRelations1)
    matrix2 = vectorizer.transform(candidateRelations2)

    assert matrix1.shape == (8, 5)
    assert matrix2.shape == (18, 5)

    colnames = vectorizer.getFeatureNames()
    expectedNames = [
        'dependencypathelements_attr', 'dependencypathelements_nsubj',
        'dependencypathelements_nsubjpass', 'dependencypathelements_pobj',
        'dependencypathelements_prep'
    ]
    assert colnames == expectedNames

    # As a quick check, we'll confirm that the column sums are as expected
    colsums1 = np.sum(matrix1, axis=0)
    assert colsums1.tolist() == [[4.0, 4.0, 2.0, 10.0, 8.0]]

    # As a quick check, we'll confirm that the column sums are as expected
    colsums2 = np.sum(matrix2, axis=0)
    assert colsums2.tolist() == [[4.0, 10.0, 2.0, 10.0, 10.0]]
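
The vectorizer examples repeatedly use np.sum(matrix, axis=0).tolist()[0], which suggests the Vectorizer returns a scipy sparse matrix: summing a sparse matrix along axis 0 yields a 1-by-n numpy matrix that then needs flattening. A small helper sketch capturing that idiom, under that assumption:

# Hypothetical helper for the column-sum idiom used in the vectorizer examples.
# Assumes `matrix` is a scipy.sparse matrix, as the kindred Vectorizer appears to return.
import numpy as np

def column_sums(matrix):
    # np.sum over axis 0 on a sparse matrix gives a 1 x n numpy matrix;
    # .tolist()[0] flattens it to an ordinary list of floats.
    return np.sum(matrix, axis=0).tolist()[0]
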
Example #8
def test_singleClassifier_twoRelTypes():
    trainCorpus, devCorpus = generateTestData(positiveCount=100,
                                              negativeCount=100,
                                              relTypes=2)

    predictionCorpus = devCorpus.clone()
    predictionCorpus.removeRelations()

    classifier = kindred.RelationClassifier()
    classifier.train(trainCorpus)

    classifier.predict(predictionCorpus)

    f1score = kindred.evaluate(devCorpus, predictionCorpus, metric='f1score')
    assert round(f1score, 3) == 0.662
Example #9
def test_simpleRelationClassifier_binary():
    trainCorpus, testCorpusGold = generateTestData(positiveCount=100,
                                                   negativeCount=100)

    predictionCorpus = testCorpusGold.clone()
    predictionCorpus.removeRelations()

    classifier = kindred.RelationClassifier()
    classifier.train(trainCorpus)

    classifier.predict(predictionCorpus)

    f1score = kindred.evaluate(testCorpusGold,
                               predictionCorpus,
                               metric='f1score')
    assert f1score == 1.0
Example #10
def test_filterByEntityTypes_invalidTypes():
	trainCorpus, devCorpus = generateTestData(positiveCount=100,negativeCount=100,relTypes=1)

	predictionCorpus = devCorpus.clone()
	predictionCorpus.removeRelations()

	for doc in devCorpus.documents:
		for e in doc.entities:
			e.entityType = 'a new type'

	classifier = kindred.RelationClassifier(features=["unigramsBetweenEntities","bigrams","dependencyPathEdges","dependencyPathEdgesNearEntities"])
	classifier.train(trainCorpus)
	
	classifier.predict(predictionCorpus)
	
	f1score = kindred.evaluate(devCorpus, predictionCorpus, metric='f1score')
	assert round(f1score,3) == 0.0
Example #11
def test_doublelabels():
	trainCorpus, devCorpus = generateTestData(positiveCount=100,negativeCount=100,relTypes=1)

	for doc in trainCorpus.documents:
		newRelations = [ kindred.Relation("anotherLabel",r.entities,r.argNames) for r in doc.relations ]
		doc.relations += newRelations
	for doc in devCorpus.documents:
		newRelations = [ kindred.Relation("anotherLabel",r.entities,r.argNames) for r in doc.relations ]
		doc.relations += newRelations

	predictionCorpus = devCorpus.clone()
	predictionCorpus.removeRelations()

	classifier = kindred.RelationClassifier()
	classifier.train(trainCorpus)
	
	classifier.predict(predictionCorpus)
	
	f1score = kindred.evaluate(devCorpus, predictionCorpus, metric='f1score')
	assert round(f1score,3) == 1.0
Example #12
def test_pickle():
    trainCorpus, testCorpusGold = generateTestData(positiveCount=100,
                                                   negativeCount=100)

    predictionCorpus = testCorpusGold.clone()
    predictionCorpus.removeRelations()

    classifier = kindred.RelationClassifier()
    classifier.train(trainCorpus)

    with tempfile.NamedTemporaryFile() as tempF:
        with open(tempF.name, 'wb') as f:
            pickle.dump(classifier, f)
        with open(tempF.name, 'rb') as f:
            classifier = pickle.load(f)

    classifier.predict(predictionCorpus)

    f1score = kindred.evaluate(testCorpusGold,
                               predictionCorpus,
                               metric='f1score')
    assert f1score == 1.0
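
Example #12 round-trips the trained classifier through a temporary file. The same check can be done in memory with pickle.dumps and pickle.loads; a minimal variant sketch reusing the names from the example above:

# In-memory variant of the pickle round-trip above (standard-library pickle only).
import pickle

blob = pickle.dumps(classifier)        # serialize the trained classifier to bytes
restored = pickle.loads(blob)          # rebuild an equivalent classifier object
restored.predict(predictionCorpus)     # should behave the same as the original
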
Example #13
def test_singleClassifier_triple():
	trainCorpus, devCorpus = generateTestData(entityCount=3,positiveCount=100,negativeCount=100,relTypes=1)

	trainRelations = trainCorpus.getRelations()
	assert len(trainRelations) == 50
	for r in trainRelations:
		assert len(r.entities) == 3

	predictionCorpus = devCorpus.clone()
	predictionCorpus.removeRelations()

	classifier = kindred.RelationClassifier(entityCount=3)
	classifier.train(trainCorpus)
	
	classifier.predict(predictionCorpus)
	
	predictedRelations = predictionCorpus.getRelations()
	assert len(predictedRelations) == 50
	for r in predictedRelations:
		assert len(r.entities) == 3
	
	f1score = kindred.evaluate(devCorpus, predictionCorpus, metric='f1score')
	assert round(f1score,3) == 1.0
Example #14
def test_simpleRelationClassifier_triple():
	trainCorpus, testCorpusGold = generateTestData(entityCount=3,positiveCount=100,negativeCount=100)

	trainRelations = trainCorpus.getRelations()
	assert len(trainRelations) == 50
	for r in trainRelations:
		assert len(r.entities) == 3

	predictionCorpus = testCorpusGold.clone()
	predictionCorpus.removeRelations()

	classifier = kindred.RelationClassifier(entityCount=3)
	classifier.train(trainCorpus)
	
	classifier.predict(predictionCorpus)

	predictedRelations = predictionCorpus.getRelations()
	assert len(predictedRelations) == 50
	for r in predictedRelations:
		assert len(r.entities) == 3
	
	f1score = kindred.evaluate(testCorpusGold, predictionCorpus, metric='f1score')
	assert f1score == 1.0
Example #15
def test_vectorizer_defaults_triple():
    corpus1, _ = generateTestData(entityCount=3,
                                  positiveCount=5,
                                  negativeCount=5)
    corpus2, _ = generateTestData(entityCount=3,
                                  positiveCount=10,
                                  negativeCount=10)

    parser = kindred.Parser()
    parser.parse(corpus1)
    parser.parse(corpus2)

    candidateBuilder = kindred.CandidateBuilder(entityCount=3)
    candidateRelations1 = candidateBuilder.build(corpus1)
    candidateRelations2 = candidateBuilder.build(corpus2)

    vectorizer = kindred.Vectorizer(entityCount=3)

    matrix1 = vectorizer.fit_transform(candidateRelations1)
    matrix2 = vectorizer.transform(candidateRelations2)

    assert matrix1.shape == (18, 101)
    assert matrix2.shape == (60, 101)

    colnames = vectorizer.getFeatureNames()
    expectedNames = [
        'selectedtokentypes_0_disease', 'selectedtokentypes_0_drug',
        'selectedtokentypes_0_gene', 'selectedtokentypes_1_disease',
        'selectedtokentypes_1_drug', 'selectedtokentypes_1_gene',
        'selectedtokentypes_2_disease', 'selectedtokentypes_2_drug',
        'selectedtokentypes_2_gene', 'ngrams_betweenentities_0_1_and',
        'ngrams_betweenentities_0_1_be', 'ngrams_betweenentities_0_1_by',
        'ngrams_betweenentities_0_1_can',
        'ngrams_betweenentities_0_1_fvdxdietdx',
        'ngrams_betweenentities_0_1_inhibition',
        'ngrams_betweenentities_0_1_knetvjnjun',
        'ngrams_betweenentities_0_1_targets',
        'ngrams_betweenentities_0_1_treated',
        'ngrams_betweenentities_0_1_treats',
        'ngrams_betweenentities_0_1_using',
        'ngrams_betweenentities_0_1_zkrkzlyfef',
        'ngrams_betweenentities_0_2_and', 'ngrams_betweenentities_0_2_be',
        'ngrams_betweenentities_0_2_by', 'ngrams_betweenentities_0_2_can',
        'ngrams_betweenentities_0_2_fvdxdietdx',
        'ngrams_betweenentities_0_2_inhibition',
        'ngrams_betweenentities_0_2_knetvjnjun',
        'ngrams_betweenentities_0_2_targets',
        'ngrams_betweenentities_0_2_treated',
        'ngrams_betweenentities_0_2_treats',
        'ngrams_betweenentities_0_2_using',
        'ngrams_betweenentities_0_2_zkrkzlyfef',
        'ngrams_betweenentities_1_2_and', 'ngrams_betweenentities_1_2_be',
        'ngrams_betweenentities_1_2_by', 'ngrams_betweenentities_1_2_can',
        'ngrams_betweenentities_1_2_fvdxdietdx',
        'ngrams_betweenentities_1_2_inhibition',
        'ngrams_betweenentities_1_2_knetvjnjun',
        'ngrams_betweenentities_1_2_targets',
        'ngrams_betweenentities_1_2_treated',
        'ngrams_betweenentities_1_2_treats',
        'ngrams_betweenentities_1_2_using',
        'ngrams_betweenentities_1_2_zkrkzlyfef', 'bigrams_and_targets',
        'bigrams_be_treated', 'bigrams_by_fvdxdietdx', 'bigrams_by_zkrkzlyfef',
        'bigrams_can_be', 'bigrams_elvptnpvyc_.',
        'bigrams_fvdxdietdx_inhibition', 'bigrams_hxlfssirgk_.',
        'bigrams_inhibition_using', 'bigrams_knetvjnjun_and',
        'bigrams_kyekjnkrfo_can', 'bigrams_oxzbaapqct_treats',
        'bigrams_targets_hxlfssirgk', 'bigrams_treated_by',
        'bigrams_treats_knetvjnjun', 'bigrams_usckfljzxu_.',
        'bigrams_using_elvptnpvyc', 'bigrams_using_usckfljzxu',
        'bigrams_zgwivlcmly_can', 'bigrams_zkrkzlyfef_inhibition',
        'dependencypathelements_0_1_acl', 'dependencypathelements_0_1_advmod',
        'dependencypathelements_0_1_agent',
        'dependencypathelements_0_1_compound',
        'dependencypathelements_0_1_conj', 'dependencypathelements_0_1_dobj',
        'dependencypathelements_0_1_nsubjpass',
        'dependencypathelements_0_1_pobj', 'dependencypathelements_0_2_acl',
        'dependencypathelements_0_2_advmod',
        'dependencypathelements_0_2_agent',
        'dependencypathelements_0_2_compound',
        'dependencypathelements_0_2_conj', 'dependencypathelements_0_2_dobj',
        'dependencypathelements_0_2_nsubjpass',
        'dependencypathelements_0_2_pobj', 'dependencypathelements_1_2_acl',
        'dependencypathelements_1_2_advmod',
        'dependencypathelements_1_2_agent',
        'dependencypathelements_1_2_compound',
        'dependencypathelements_1_2_conj', 'dependencypathelements_1_2_dobj',
        'dependencypathelements_1_2_nsubjpass',
        'dependencypathelements_1_2_pobj',
        'dependencypathnearselectedtoken_0_compound',
        'dependencypathnearselectedtoken_0_conj',
        'dependencypathnearselectedtoken_0_dobj',
        'dependencypathnearselectedtoken_0_nsubjpass',
        'dependencypathnearselectedtoken_1_compound',
        'dependencypathnearselectedtoken_1_conj',
        'dependencypathnearselectedtoken_1_dobj',
        'dependencypathnearselectedtoken_1_nsubjpass',
        'dependencypathnearselectedtoken_2_compound',
        'dependencypathnearselectedtoken_2_conj',
        'dependencypathnearselectedtoken_2_dobj',
        'dependencypathnearselectedtoken_2_nsubjpass'
    ]
    assert colnames == expectedNames

    # As a quick check, we'll confirm that the column sums are as expected
    expected1 = [
        6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 1.4620174403662662,
        2.089911361057128, 2.089911361057128, 2.089911361057128,
        0.8510011029330441, 2.089911361057128, 0.8909306965737043,
        1.4620174403662662, 2.089911361057128, 1.4620174403662662,
        2.089911361057128, 0.8510011029330441, 1.4620174403662662,
        2.089911361057128, 2.089911361057128, 2.089911361057128,
        0.8510011029330442, 2.089911361057128, 0.8909306965737043,
        1.4620174403662662, 2.089911361057128, 1.4620174403662662,
        2.089911361057128, 0.8510011029330441, 1.4620174403662662,
        2.089911361057128, 2.089911361057128, 2.089911361057128,
        0.8510011029330442, 2.089911361057128, 0.8909306965737043,
        1.4620174403662662, 2.089911361057128, 1.4620174403662662,
        2.089911361057128, 0.8510011029330441, 2.4494897427831783,
        3.151972689633972, 2.283202494909358, 2.283202494909358,
        3.151972689633972, 2.283202494909358, 2.283202494909358,
        2.4494897427831783, 3.151972689633972, 2.4494897427831783,
        2.283202494909358, 2.4494897427831783, 2.4494897427831783,
        3.151972689633972, 2.4494897427831783, 2.283202494909358,
        2.283202494909358, 2.283202494909358, 2.283202494909358,
        2.283202494909358, 8.0, 4.0, 8.0, 8.0, 4.0, 12.0, 8.0, 8.0, 8.0, 4.0,
        8.0, 8.0, 4.0, 12.0, 8.0, 8.0, 8.0, 4.0, 8.0, 8.0, 4.0, 12.0, 8.0, 8.0,
        4.0, 2.0, 2.0, 4.0, 4.0, 2.0, 2.0, 4.0, 4.0, 2.0, 2.0, 4.0
    ]
    colsums1 = np.sum(matrix1, axis=0).tolist()[0]
    assert len(expected1) == len(colsums1)
    for gotVal, expectedVal in zip(colsums1, expected1):
        # Compare rounded values to avoid floating-point comparison issues
        assert round(gotVal, 8) == round(expectedVal, 8)

    # As a quick check, we'll confirm that the column sums are as expected
    expected2 = [
        8.0, 20.0, 20.0, 8.0, 20.0, 20.0, 8.0, 20.0, 20.0, 5.237574707711817,
        0.0, 0.0, 0.0, 3.9169357592886755, 0.0, 0.8909306965737043,
        18.69384731218667, 0.0, 3.095010602221718, 0.0, 0.0, 5.237574707711817,
        0.0, 0.0, 0.0, 3.9169357592886755, 0.0, 0.8909306965737043,
        18.69384731218667, 0.0, 3.095010602221718, 0.0, 0.0, 5.237574707711817,
        0.0, 0.0, 0.0, 3.9169357592886755, 0.0, 0.8909306965737043,
        18.69384731218667, 0.0, 3.095010602221718, 0.0, 0.0, 8.449489742783179,
        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.4494897427831783, 0.0,
        2.4494897427831783, 0.0, 2.4494897427831783, 2.4494897427831783, 0.0,
        2.4494897427831783, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 4.0, 0.0, 12.0, 20.0,
        28.0, 8.0, 28.0, 0.0, 4.0, 0.0, 12.0, 20.0, 28.0, 8.0, 28.0, 0.0, 4.0,
        0.0, 12.0, 20.0, 28.0, 8.0, 28.0, 4.0, 10.0, 2.0, 4.0, 4.0, 10.0, 2.0,
        4.0, 4.0, 10.0, 2.0, 4.0
    ]
    colsums2 = np.sum(matrix2, axis=0).tolist()[0]
    assert len(expected2) == len(colsums2)
    for gotVal, expectedVal in zip(colsums2, expected2):
        # Compare rounded values to avoid floating-point comparison issues
        assert round(gotVal, 8) == round(expectedVal, 8)
Example #16
def test_vectorizer_bigrams():
    corpus1, _ = generateTestData(positiveCount=5, negativeCount=5)
    corpus2, _ = generateTestData(positiveCount=10, negativeCount=10)

    parser = kindred.Parser()
    parser.parse(corpus1)
    parser.parse(corpus2)

    candidateBuilder = kindred.CandidateBuilder()
    candidateRelations1 = candidateBuilder.build(corpus1)
    candidateRelations2 = candidateBuilder.build(corpus2)

    chosenFeatures = ["bigrams"]
    vectorizer = kindred.Vectorizer(featureChoice=chosenFeatures, tfidf=True)

    matrix1 = vectorizer.fit_transform(candidateRelations1)
    matrix2 = vectorizer.transform(candidateRelations2)

    assert matrix1.shape == (8, 27)
    assert matrix2.shape == (18, 27)

    colnames = vectorizer.getFeatureNames()
    expectedNames = [
        u'bigrams_ _gnorcyvmer', u'bigrams_a_common', u'bigrams_a_known',
        u'bigrams_be_treated', u'bigrams_bmzvpvwbpw_failed', u'bigrams_can_be',
        u'bigrams_clinical_trials', u'bigrams_common_treatment',
        u'bigrams_effect_of', u'bigrams_failed_clinical',
        u'bigrams_for_kyekjnkrfo', u'bigrams_for_zgwivlcmly',
        u'bigrams_gnorcyvmer_is', u'bigrams_is_a', u'bigrams_known_side',
        u'bigrams_kyekjnkrfo_.', u'bigrams_of_ruswdgzajr',
        u'bigrams_ootopaoxbg_can', u'bigrams_pehhjnlvvewbjccovflf_is',
        u'bigrams_ruswdgzajr_.', u'bigrams_side_effect',
        u'bigrams_treated_with', u'bigrams_treatment_for',
        u'bigrams_trials_for', u'bigrams_vgypkemhjr_.',
        u'bigrams_with_vgypkemhjr', u'bigrams_zgwivlcmly_.'
    ]
    assert colnames == expectedNames

    # As a quick check, we'll confirm that the column sums are as expected
    expected1 = [
        0.6830902801437798, 0.7801302536256829, 0.6830902801437798,
        0.8164965809277259, 0.8164965809277259, 0.8164965809277259,
        0.8164965809277259, 0.7801302536256829, 0.6830902801437798,
        0.8164965809277259, 0.8164965809277259, 0.7801302536256829,
        0.6830902801437798, 1.1070563456981333, 0.6830902801437798,
        0.8164965809277259, 0.6830902801437798, 0.8164965809277259,
        0.7801302536256829, 0.6830902801437798, 0.6830902801437798,
        0.8164965809277259, 0.7801302536256829, 0.8164965809277259,
        0.8164965809277259, 0.8164965809277259, 0.7801302536256829
    ]
    colsums1 = np.sum(matrix1, axis=0).tolist()[0]
    assert len(expected1) == len(colsums1)
    for gotVal, expectedVal in zip(colsums1, expected1):
        # Compare rounded values to avoid floating-point comparison issues
        assert round(gotVal, 8) == round(expectedVal, 8)

    # As a quick check, we'll confirm that the column sums are as expected
    expected2 = [
        0.0, 1.0581526716744893, 0.0, 0.8164965809277259, 1.0,
        0.8164965809277259, 2.1547005383792515, 1.0581526716744893, 0.0,
        2.1547005383792515, 0.0, 0.0, 0.0, 0.8005865164268136, 0.0, 2.0, 0.0,
        0.8164965809277259, 2.0, 0.0, 0.0, 0.8164965809277259,
        1.0581526716744893, 2.1547005383792515, 0.8164965809277259,
        0.8164965809277259, 0.0
    ]
    colsums2 = np.sum(matrix2, axis=0).tolist()[0]
    assert len(expected2) == len(colsums2)
    for gotVal, expectedVal in zip(colsums2, expected2):
        # Compare rounded values to avoid floating-point comparison issues
        assert round(gotVal, 8) == round(expectedVal, 8)
Example #17
def test_vectorizer_defaults():
    corpus1, _ = generateTestData(positiveCount=5, negativeCount=5)
    corpus2, _ = generateTestData(positiveCount=10, negativeCount=10)

    parser = kindred.Parser()
    parser.parse(corpus1)
    parser.parse(corpus2)

    candidateBuilder = kindred.CandidateBuilder()
    candidateRelations1 = candidateBuilder.build(corpus1)
    candidateRelations2 = candidateBuilder.build(corpus2)

    vectorizer = kindred.Vectorizer()

    matrix1 = vectorizer.fit_transform(candidateRelations1)
    matrix2 = vectorizer.transform(candidateRelations2)

    assert matrix1.shape == (8, 60)
    assert matrix2.shape == (18, 60)

    colnames = vectorizer.getFeatureNames()
    expectedNames = [
        'selectedtokentypes_0_disease', 'selectedtokentypes_0_disease2',
        'selectedtokentypes_0_drug', 'selectedtokentypes_1_disease',
        'selectedtokentypes_1_disease2', 'selectedtokentypes_1_drug',
        'ngrams_betweenentities_a', 'ngrams_betweenentities_be',
        'ngrams_betweenentities_can', 'ngrams_betweenentities_clinical',
        'ngrams_betweenentities_common', 'ngrams_betweenentities_effect',
        'ngrams_betweenentities_failed', 'ngrams_betweenentities_for',
        'ngrams_betweenentities_is', 'ngrams_betweenentities_known',
        'ngrams_betweenentities_of', 'ngrams_betweenentities_side',
        'ngrams_betweenentities_treated', 'ngrams_betweenentities_treatment',
        'ngrams_betweenentities_trials', 'ngrams_betweenentities_with',
        'bigrams_ _gnorcyvmer', 'bigrams_a_common', 'bigrams_a_known',
        'bigrams_be_treated', 'bigrams_bmzvpvwbpw_failed', 'bigrams_can_be',
        'bigrams_clinical_trials', 'bigrams_common_treatment',
        'bigrams_effect_of', 'bigrams_failed_clinical',
        'bigrams_for_kyekjnkrfo', 'bigrams_for_zgwivlcmly',
        'bigrams_gnorcyvmer_is', 'bigrams_is_a', 'bigrams_known_side',
        'bigrams_kyekjnkrfo_.', 'bigrams_of_ruswdgzajr',
        'bigrams_ootopaoxbg_can', 'bigrams_pehhjnlvvewbjccovflf_is',
        'bigrams_ruswdgzajr_.', 'bigrams_side_effect', 'bigrams_treated_with',
        'bigrams_treatment_for', 'bigrams_trials_for', 'bigrams_vgypkemhjr_.',
        'bigrams_with_vgypkemhjr', 'bigrams_zgwivlcmly_.',
        'dependencypathelements_attr', 'dependencypathelements_nsubj',
        'dependencypathelements_nsubjpass', 'dependencypathelements_pobj',
        'dependencypathelements_prep',
        'dependencypathnearselectedtoken_0_nsubj',
        'dependencypathnearselectedtoken_0_nsubjpass',
        'dependencypathnearselectedtoken_0_pobj',
        'dependencypathnearselectedtoken_1_nsubj',
        'dependencypathnearselectedtoken_1_nsubjpass',
        'dependencypathnearselectedtoken_1_pobj'
    ]
    assert colnames == expectedNames

    # As a quick check, we'll confirm that the column sums are as expected
    expected1 = [
        2.0, 2.0, 4.0, 2.0, 2.0, 4.0, 1.4519522547520485, 1.0, 1.0,
        1.0581526716744893, 1.037330992404908, 0.8817459917627732,
        1.0581526716744893, 1.5854195824122916, 1.4519522547520485,
        0.8817459917627732, 0.8817459917627732, 0.8817459917627732, 1.0,
        1.037330992404908, 1.0581526716744893, 1.0, 0.6830902801437798,
        0.7801302536256829, 0.6830902801437798, 0.8164965809277259,
        0.8164965809277259, 0.8164965809277259, 0.8164965809277259,
        0.7801302536256829, 0.6830902801437798, 0.8164965809277259,
        0.8164965809277259, 0.7801302536256829, 0.6830902801437798,
        1.1070563456981333, 0.6830902801437798, 0.8164965809277259,
        0.6830902801437798, 0.8164965809277259, 0.7801302536256829,
        0.6830902801437798, 0.6830902801437798, 0.8164965809277259,
        0.7801302536256829, 0.8164965809277259, 0.8164965809277259,
        0.8164965809277259, 0.7801302536256829, 4.0, 4.0, 2.0, 10.0, 8.0, 2.0,
        1.0, 1.0, 2.0, 1.0, 1.0
    ]
    colsums1 = np.sum(matrix1, axis=0).tolist()[0]
    assert len(expected1) == len(colsums1)
    for gotVal, expectedVal in zip(colsums1, expected1):
        # Compare rounded values to avoid floating-point comparison issues
        assert round(gotVal, 8) == round(expectedVal, 8)

    # As a quick check, we'll confirm that the column sums are as expected
    expected2 = [
        5.0, 4.0, 9.0, 5.0, 4.0, 9.0, 0.7848330659854781, 1.0, 1.0,
        2.1163053433489787, 1.037330992404908, 0.0, 2.1163053433489787,
        2.386006098839105, 1.99154811934689, 0.0, 1.594941622753311, 0.0, 1.0,
        1.037330992404908, 2.1163053433489787, 1.0, 0.0, 1.0581526716744893,
        0.0, 0.8164965809277259, 1.0, 0.8164965809277259, 2.1547005383792515,
        1.0581526716744893, 0.0, 2.1547005383792515, 0.0, 0.0, 0.0,
        0.8005865164268136, 0.0, 2.0, 0.0, 0.8164965809277259, 2.0, 0.0, 0.0,
        0.8164965809277259, 1.0581526716744893, 2.1547005383792515,
        0.8164965809277259, 0.8164965809277259, 0.0, 4.0, 10.0, 2.0, 10.0,
        10.0, 3.0, 1.0, 0.0, 3.0, 1.0, 0.0
    ]
    colsums2 = np.sum(matrix2, axis=0).tolist()[0]
    assert len(expected2) == len(colsums2)
    for gotVal, expectedVal in zip(colsums2, expected2):
        # Compare rounded values to avoid floating-point comparison issues
        assert round(gotVal, 8) == round(expectedVal, 8)
Example #18
def test_vectorizer_bigrams_noTFIDF():
    corpus1, _ = generateTestData(positiveCount=5, negativeCount=5)
    corpus2, _ = generateTestData(positiveCount=10, negativeCount=10)

    parser = kindred.Parser()
    parser.parse(corpus1)
    parser.parse(corpus2)

    candidateBuilder = kindred.CandidateBuilder()
    candidateRelations1 = candidateBuilder.build(corpus1)
    candidateRelations2 = candidateBuilder.build(corpus2)

    chosenFeatures = ["bigrams"]
    vectorizer = kindred.Vectorizer(featureChoice=chosenFeatures, tfidf=False)

    matrix1 = vectorizer.fit_transform(candidateRelations1)
    matrix2 = vectorizer.transform(candidateRelations2)

    assert matrix1.shape == (8, 27)
    assert matrix2.shape == (18, 27)

    colnames = vectorizer.getFeatureNames()
    expectedNames = [
        u'bigrams_ _gnorcyvmer', u'bigrams_a_common', u'bigrams_a_known',
        u'bigrams_be_treated', u'bigrams_bmzvpvwbpw_failed', u'bigrams_can_be',
        u'bigrams_clinical_trials', u'bigrams_common_treatment',
        u'bigrams_effect_of', u'bigrams_failed_clinical',
        u'bigrams_for_kyekjnkrfo', u'bigrams_for_zgwivlcmly',
        u'bigrams_gnorcyvmer_is', u'bigrams_is_a', u'bigrams_known_side',
        u'bigrams_kyekjnkrfo_.', u'bigrams_of_ruswdgzajr',
        u'bigrams_ootopaoxbg_can', u'bigrams_pehhjnlvvewbjccovflf_is',
        u'bigrams_ruswdgzajr_.', u'bigrams_side_effect',
        u'bigrams_treated_with', u'bigrams_treatment_for',
        u'bigrams_trials_for', u'bigrams_vgypkemhjr_.',
        u'bigrams_with_vgypkemhjr', u'bigrams_zgwivlcmly_.'
    ]
    assert colnames == expectedNames

    # As a quick check, we'll confirm that the column sums are as expected
    expected1 = [
        4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 8.0,
        4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0
    ]
    colsums1 = np.sum(matrix1, axis=0).tolist()[0]
    assert len(expected1) == len(colsums1)
    for gotVal, expectedVal in zip(colsums1, expected1):
        # Compare rounded values to avoid floating-point comparison issues
        assert round(gotVal, 8) == round(expectedVal, 8)

    # As a quick check, we'll confirm that the column sums are as expected
    expected2 = [
        0.0, 4.0, 0.0, 4.0, 4.0, 4.0, 8.0, 4.0, 0.0, 8.0, 0.0, 0.0, 0.0, 4.0,
        0.0, 4.0, 0.0, 4.0, 4.0, 0.0, 0.0, 4.0, 4.0, 8.0, 4.0, 4.0, 0.0
    ]
    colsums2 = np.sum(matrix2, axis=0).tolist()[0]
    assert len(expected2) == len(colsums2)
    for gotVal, expectedVal in zip(colsums2, expected2):
        # Compare rounded values to avoid floating-point comparison issues
        assert round(gotVal, 8) == round(expectedVal, 8)