예제 #1
0
파일: Corpus.py 프로젝트: olaTechie/kindred
    def nfold_split(self, folds):
        """
        Generator that splits the corpus for an n-fold cross validation approach.

        The document indices are shuffled with ``random.shuffle`` and dealt into
        exactly ``folds`` groups whose sizes differ by at most one. On each
        iteration one group becomes the test corpus and the remaining groups
        together become the training corpus, so every document is used for
        testing exactly once across the whole run.

        :param folds: Number of folds to create (must not exceed the number of documents)
        :type folds: int
        :return: Tuple of training and test corpus (for iterations=folds)
        :rtype: (kindred.Corpus,kindred.Corpus)
        :raises ValueError: if the corpus contains fewer documents than folds
        """
        assert isinstance(folds, int)
        assert folds > 0

        docCount = len(self.documents)
        if folds > docCount:
            # A fold would have nothing to test on. Fail with a clear message
            # instead of an obscure zero-step range() error.
            raise ValueError(
                "Cannot split %d document(s) into %d folds" % (docCount, folds))

        indices = list(range(docCount))
        random.shuffle(indices)

        # Striding through the shuffled indices always gives exactly `folds`
        # chunks, even when docCount is not a multiple of folds (fixed-size
        # slicing would leave an extra remainder chunk that is never tested).
        indexChunks = [indices[start::folds] for start in range(folds)]

        for f in range(folds):
            trainCorpus, testCorpus = kindred.Corpus(), kindred.Corpus()
            for i, indexChunk in enumerate(indexChunks):
                # Chunk f is held out for testing; all others are for training.
                target = testCorpus if i == f else trainCorpus
                for j in indexChunk:
                    target.addDocument(self.documents[j])
            yield trainCorpus, testCorpus
예제 #2
0
def iterLoad(dataFormat,path,corpusSizeCutoff=500):
    """
    Iteratively load a (presumably large) corpus in pieces. This is a generator that yields kindred.Corpus objects, each holding a subset of the documents, so the whole data set never has to be held in memory at once.

    :param dataFormat: Format of the data files to load (only 'biocxml' is currently supported)
    :param path: Path to data. Either a single file or a directory (in which case every entry ending in 'bioc.xml' is read)
    :param corpusSizeCutoff: Approximate maximum number of documents to be in each corpus subset
    :type dataFormat: str
    :type path: str
    :type corpusSizeCutoff: int
    :return: Subsets of the BioC file
    :rtype: A kindred.Corpus generator
    """
    assert dataFormat == 'biocxml'

    batch = kindred.Corpus()

    if os.path.isdir(path):
        biocFiles = []
        for entry in os.listdir(path):
            if entry.endswith('bioc.xml'):
                biocFiles.append(os.path.join(path, entry))
    else:
        biocFiles = [path]

    for biocFile in biocFiles:
        with bioc.iterparse(biocFile) as reader:
            for biocDoc in reader:
                # The size check happens before the next BioC document is
                # converted, which is why the cutoff is only approximate.
                if len(batch.documents) >= corpusSizeCutoff:
                    yield batch
                    batch = kindred.Corpus()
                for converted in convertBiocDocToKindredDocs(biocDoc):
                    batch.addDocument(converted)

    # Hand back whatever is left over after the last file.
    if len(batch.documents) > 0:
        yield batch
예제 #3
0
def test_parsing_dependencyGraph():
    """Parse one plain sentence and verify its tokens and dependency triples."""
    rawText = 'You need to turn in your homework by next week'
    corpus = kindred.Corpus(rawText)

    kindred.Parser().parse(corpus)

    assert len(corpus.documents) == 1
    parsedDoc = corpus.documents[0]
    assert isinstance(parsedDoc.sentences, list)
    assert len(parsedDoc.sentences) == 1

    parsedSentence = parsedDoc.sentences[0]
    assert isinstance(parsedSentence, kindred.Sentence)

    # Expected tokens are the whitespace-separated words of the input text.
    wordList = rawText.split()
    assert isinstance(parsedSentence.tokens, list)
    assert len(wordList) == len(parsedSentence.tokens)
    for expectedWord, token in zip(wordList, parsedSentence.tokens):
        assert isinstance(token, kindred.Token)
        assert len(token.lemma) > 0
        assert expectedWord == token.word

    assert isinstance(parsedSentence.entitiesWithLocations, list)
    assert len(parsedSentence.entitiesWithLocations) == 0

    assert isinstance(parsedSentence.dependencies, list)
    dependencyTriples = [
        (1, 0, u'nsubj'),
        (1, 1, u'ROOT'),
        (3, 2, u'aux'),
        (1, 3, u'xcomp'),
        (3, 4, u'prt'),
        (6, 5, u'poss'),
        (3, 6, u'dobj'),
        (3, 7, u'prep'),
        (9, 8, u'amod'),
        (7, 9, u'pobj'),
    ]
    assert parsedSentence.dependencies == dependencyTriples
예제 #4
0
def test_simpleSentenceParse():
    """Parse a tagged sentence and verify tokens plus the two located entities."""
    taggedText = '<drug id="1">Erlotinib</drug> is a common treatment for <cancer id="2">lung</cancer> and unknown <cancer id="2">cancers</cancer>'
    corpus = kindred.Corpus(taggedText)

    kindred.Parser().parse(corpus)

    assert len(corpus.documents) == 1
    parsedDoc = corpus.documents[0]
    assert isinstance(parsedDoc.sentences, list)
    assert len(parsedDoc.sentences) == 1

    parsedSentence = parsedDoc.sentences[0]
    assert isinstance(parsedSentence, kindred.Sentence)

    wordList = "Erlotinib is a common treatment for lung and unknown cancers".split()
    assert isinstance(parsedSentence.tokens, list)
    assert len(wordList) == len(parsedSentence.tokens)
    for expectedWord, token in zip(wordList, parsedSentence.tokens):
        assert isinstance(token, kindred.Token)
        assert len(token.lemma) > 0
        assert expectedWord == token.word

    located = parsedSentence.entitiesWithLocations
    assert isinstance(located, list)
    assert len(located) == 2
    assertEntityWithLocation(located[0], 'drug', [0], '1')
    # Both <cancer id="2"> spans are expected as one entity at tokens 6 and 9.
    assertEntityWithLocation(located[1], 'cancer', [6, 9], '2')

    assert isinstance(parsedSentence.dependencies, list)
    assert len(parsedSentence.dependencies) > 0
예제 #5
0
def filterCorpus(corpus,filterTerms):
    """
    Build a new corpus containing only the documents that mention at least one filter term.

    Matching is a case-insensitive substring test against each document's text.

    :param corpus: Corpus whose documents should be filtered
    :param filterTerms: Terms to look for in the document text
    :type corpus: kindred.Corpus
    :type filterTerms: iterable of str
    :return: New corpus holding the matching documents (the same document objects, in their original order)
    :rtype: kindred.Corpus
    """
    # Lower-case the terms once. The document text is lower-cased before
    # matching, so a term with any upper-case character could otherwise never
    # match. Building a list also keeps one-shot iterables from being exhausted
    # after the first document.
    loweredTerms = [ term.lower() for term in filterTerms ]

    filtered = kindred.Corpus()
    for doc in corpus.documents:
        # Lower-case each document once rather than once per term.
        loweredText = doc.text.lower()
        if any( term in loweredText for term in loweredTerms ):
            filtered.addDocument(doc)
    return filtered
예제 #6
0
def test_unicodeParse():
    """Parse a tagged sentence containing a non-ASCII character and verify tokens and entities."""
    taggedText = u"<drug id='1'>Erlotinib</drug> is a common treatment for NF-κB positive <cancer id='2'>lung</cancer> and unknown <cancer id='2'>cancers</cancer>"
    corpus = kindred.Corpus(taggedText, loadFromSimpleTag=True)

    kindred.Parser().parse(corpus)

    assert len(corpus.documents) == 1
    parsedDoc = corpus.documents[0]
    assert isinstance(parsedDoc.sentences, list)
    assert len(parsedDoc.sentences) == 1

    parsedSentence = parsedDoc.sentences[0]
    assert isinstance(parsedSentence, kindred.Sentence)

    # "NF-κB" is expected to come back as three tokens: "NF", "-", "κB".
    wordList = u"Erlotinib is a common treatment for NF - κB positive lung and unknown cancers".split()
    assert isinstance(parsedSentence.tokens, list)
    assert len(wordList) == len(parsedSentence.tokens)
    for expectedWord, token in zip(wordList, parsedSentence.tokens):
        assert isinstance(token, kindred.Token)
        assert len(token.lemma) > 0
        assert expectedWord == token.word

    annotations = parsedSentence.entityAnnotations
    assert isinstance(annotations, list)
    assert len(annotations) == 2
    assertEntityWithLocation(annotations[0], 'drug', [0], '1')
    assertEntityWithLocation(annotations[1], 'cancer', [10, 13], '2')

    assert isinstance(parsedSentence.dependencies, list)
    assert len(parsedSentence.dependencies) > 0
예제 #7
0
def test_candidatebuilder_simple():
    """Run CandidateBuilder.fit_transform and check the candidate relations and classes stored on the corpus."""
    taggedText = '<drug id="1">Erlotinib</drug> is a common treatment for <cancer id="2">NSCLC</cancer>. <drug id="3">Aspirin</drug> is the main cause of <disease id="4">boneitis</disease>. <relation type="treats" subj="1" obj="2" />'

    corpus = kindred.Corpus(taggedText, loadFromSimpleTag=True)

    builder = kindred.CandidateBuilder()
    builder.fit_transform(corpus)

    assert corpus.relationTypes == [('treats', 'obj', 'subj')]
    candidateRelations = corpus.getCandidateRelations(2)
    candidateClasses = corpus.getCandidateClasses(2)

    assert candidateClasses == [[0], [1], [0], [0]]
    assert len(candidateRelations) == 4

    idLookup = corpus.documents[0].getSourceEntityIDsToEntityIDs()

    # Expected source-ID pairs, in the order the candidates are returned.
    expectedPairs = [('1', '2'), ('2', '1'), ('3', '4'), ('4', '3')]
    for candidate, (firstID, secondID) in zip(candidateRelations, expectedPairs):
        assert candidate.entityIDs == [idLookup[firstID], idLookup[secondID]]
예제 #8
0
def test_evaluate_display(capfd):
    """Check the text printed by kindred.evaluate(..., metric='all', display=True)."""
    goldText = ''.join([
        'The <disease id="T1">colorectal cancer</disease> was caused by mutations in <gene id="T2">APC</gene>. We also studied <disease id="T3">glioblastoma</disease>.',
        '<relation type="typeA" subj="T2" obj="T1" />',
        '<relation type="typeB" subj="T2" obj="T3" />',
        '<relation type="typeA" subj="T3" obj="T1" />',
        '<relation type="typeB" subj="T2" obj="T1" />',
        '<relation type="typeC" subj="T2" obj="T1" />',
    ])

    goldCorpus = kindred.Corpus(goldText)

    testCorpus = goldCorpus.clone()
    testDoc = testCorpus.documents[0]
    mapping = testDoc.getSourceEntityIDsToEntityIDs()

    # Keep only the first four relations, then add two that are not in the gold set.
    testDoc.relations = testDoc.relations[:4]
    for otherID in ("T2", "T3"):
        extraRelation = kindred.Relation(
            "typeX", entityIDs=[mapping["T1"], mapping[otherID]])
        testDoc.addRelation(extraRelation)

    _, _, _ = kindred.evaluate(goldCorpus,
                               testCorpus,
                               metric='all',
                               display=True)

    out, err = capfd.readouterr()
    expected = (
        "typeA\tTP:2 FP:0 FN:0\tP:1.000000 R:1.000000 F1:1.000000\n"
        "typeB\tTP:2 FP:0 FN:0\tP:1.000000 R:1.000000 F1:1.000000\n"
        "typeC\tTP:0 FP:0 FN:1\tP:0.000000 R:0.000000 F1:0.000000\n"
        "typeX\tTP:0 FP:2 FN:0\tP:0.000000 R:0.000000 F1:0.000000\n"
        "--------------------------------------------------\n"
        "All  \tTP:4 FP:2 FN:1\tP:0.666667 R:0.800000 F1:0.727273\n"
    )
    assert out == expected
    assert err == ""
예제 #9
0
def test_entityrecognizer_merge_idintersections():
    """Check the two entities expected from EntityRecognizer(mergeTerms=True) on a run of adjacent terms."""
    lookup = makeTestLookup()

    corpus = kindred.Corpus('We studied the genes known as GLP-1R GLP1R GLP1 GLP-1.')
    kindred.Parser().parse(corpus)

    recognizer = kindred.EntityRecognizer(lookup, mergeTerms=True)
    recognizer.annotate(corpus)

    doc = corpus.documents[0]

    assert len(doc.sentences) == 1
    assert len(doc.entities) == 2

    # (externalID, text, position, sourceEntityID) for each expected entity.
    expectedEntities = [
        ('HGNC:4324', 'GLP-1R GLP1R', [(30, 42)], 'T1'),
        ('HGNC:4191', 'GLP1 GLP-1', [(43, 53)], 'T2'),
    ]
    for entity, (externalID, entityText, position, sourceID) in zip(doc.entities, expectedEntities):
        assert entity.entityType == 'gene'
        assert entity.externalID == externalID
        assert entity.text == entityText
        assert entity.position == position
        assert entity.sourceEntityID == sourceID
예제 #10
0
def test_entityrecognizer_merge_triple_brackets():
    """Check that mergeTerms=True yields one entity spanning 'HER2 neu (ERBB2)'."""
    lookup = makeTestLookup()

    corpus = kindred.Corpus('HER2 neu (ERBB2) is a gene.')
    kindred.Parser().parse(corpus)

    recognizer = kindred.EntityRecognizer(lookup, mergeTerms=True)
    recognizer.annotate(corpus)

    doc = corpus.documents[0]

    assert len(doc.sentences) == 1
    assert len(doc.entities) == 1
    merged = doc.entities[0]

    observed = (merged.entityType, merged.externalID, merged.text,
                merged.position, merged.sourceEntityID)
    assert observed == ('gene', 'HGNC:2064', 'HER2 neu (ERBB2)', [(0, 16)], 'T1')
예제 #11
0
def test_corpus_nfold_split():
    """
    Check that Corpus.nfold_split partitions the documents correctly.

    For 100 documents and 5 folds, every fold must have 80 training and 20
    test documents with no duplicates and no overlap between the two. Across
    all folds each document must be tested exactly once and used for training
    in every other fold.
    """
    mainCorpus = kindred.Corpus()
    docCount = 100
    for i in range(docCount):
        doc = kindred.Document(text=str(i), entities=[])
        mainCorpus.addDocument(doc)

    folds = 5
    trainCounter, testCounter = Counter(), Counter()
    for trainCorpus, testCorpus in mainCorpus.nfold_split(folds):
        # Integer division keeps the expected sizes as ints.
        assert len(trainCorpus.documents) == (folds - 1) * docCount // folds
        assert len(testCorpus.documents) == docCount // folds

        # `seen` covers both corpora of this fold, so it catches duplicates
        # inside a corpus as well as overlap between train and test.
        seen = set()
        for doc in trainCorpus.documents:
            assert doc in mainCorpus.documents, "This document doesn't match an existing one"
            assert doc not in seen, "This document isn't unique now"
            seen.add(doc)
            trainCounter[doc] += 1
        for doc in testCorpus.documents:
            assert doc in mainCorpus.documents, "This document doesn't match an existing one"
            assert doc not in seen, "This document isn't unique now"
            seen.add(doc)
            testCounter[doc] += 1

    # Every document must have appeared on both sides at some point.
    assert len(trainCounter) == docCount
    assert len(testCounter) == docCount
    for doc, count in trainCounter.items():
        assert count == folds - 1
    for doc, count in testCounter.items():
        assert count == 1
예제 #12
0
def test_saveStandoffFile_fromSimpleTag():
    """Save a document built from SimpleTag text in standoff format, reload it, and compare entities and relations."""
    text = 'The <disease id="T1">colorectal cancer</disease> was caused by mutations in <gene id="T2">APC</gene><relation type="causes" subj="T2" obj="T1" />'
    corpus = kindred.Corpus()
    doc = kindred.Document(text)
    corpus.addDocument(doc)

    tempDir = tempfile.mkdtemp()
    # mkdtemp leaves clean-up to the caller; the finally block removes the
    # directory even when one of the assertions below fails.
    try:
        kindred.save(corpus,'standoff',tempDir)

        loadedCorpus = kindred.loadDir('standoff',tempDir)

        assert isinstance(loadedCorpus,kindred.Corpus)
        assert len(loadedCorpus.documents) == 1
        loadedDoc = loadedCorpus.documents[0]

        assert isinstance(loadedDoc,kindred.Document)
        entities = loadedDoc.getEntities()
        relations = loadedDoc.getRelations()

        sourceEntityIDsToEntityIDs = loadedDoc.getSourceEntityIDsToEntityIDs()

        assertEntity(entities[0],expectedType='disease',expectedText='colorectal cancer',expectedPos=[(4,21)],expectedSourceEntityID="T1")
        assertEntity(entities[1],expectedType='gene',expectedText='APC',expectedPos=[(49,52)],expectedSourceEntityID="T2")
        assert relations == [kindred.Relation('causes',[sourceEntityIDsToEntityIDs["T1"],sourceEntityIDsToEntityIDs["T2"]],['obj','subj'])], "(%s) not as expected" % relations
    finally:
        shutil.rmtree(tempDir)
예제 #13
0
def test_simpleVectorizer_binary():
    """Vectorize two-entity candidates on entityTypes only and check the non-zero cells."""
    taggedText = '<drug id="1">Erlotinib</drug> is a common treatment for <cancer id="2">NSCLC</cancer>. <drug id="3">Aspirin</drug> is the main cause of <disease id="4">boneitis</disease> . <relation type="treats" subj="1" obj="2" />'

    corpus = kindred.Corpus(taggedText, loadFromSimpleTag=True)
    kindred.Parser().parse(corpus)

    builder = kindred.CandidateBuilder()
    candidateRelations = builder.build(corpus)

    # Restrict the features to the entity types only.
    vectorizer = kindred.Vectorizer(featureChoice=["entityTypes"])
    matrix = vectorizer.fit_transform(candidateRelations)

    assert matrix.shape == (4, 6)

    expectedCells = [(0, 2), (1, 0), (2, 2), (3, 1), (0, 3), (1, 5), (2, 4), (3, 5)]

    rowIdx, colIdx = matrix.nonzero()
    observedCells = sorted(zip(rowIdx.tolist(), colIdx.tolist()))
    assert sorted(expectedCells) == observedCells

    # Every expected cell must hold exactly 1.0.
    csrMatrix = matrix.tocsr()
    for row, col in expectedCells:
        assert csrMatrix[row, col] == 1.0
예제 #14
0
def test_saveStandoffFile_SeparateSentences():
    """Save two single-sentence documents in standoff format, reload them, and compare each document's entities and relations."""
    texts = ['The <disease id="T1">colorectal cancer</disease> was caused by mutations in <gene id="T2">APC</gene><relation type="causes" subj="T2" obj="T1" />','<disease id="T1">Li-Fraumeni</disease> was caused by mutations in <gene id="T2">P53</gene><relation type="causes" subj="T2" obj="T1" />']
    corpus = kindred.Corpus()
    for t in texts:
        doc = kindred.Document(t)
        corpus.addDocument(doc)

    # (disease text, disease position, gene text, gene position) per document,
    # in the same order as `texts`.
    expectedPerDoc = [
        ('colorectal cancer', [(4,21)], 'APC', [(49,52)]),
        ('Li-Fraumeni', [(0,11)], 'P53', [(39,42)]),
    ]

    tempDir = tempfile.mkdtemp()
    # mkdtemp leaves clean-up to the caller; the finally block removes the
    # directory even when one of the assertions below fails.
    try:
        kindred.save(corpus,'standoff',tempDir)

        loadedCorpus = kindred.loadDir('standoff',tempDir)

        assert isinstance(loadedCorpus,kindred.Corpus)
        assert len(loadedCorpus.documents) == 2

        for data, (diseaseText, diseasePos, geneText, genePos) in zip(loadedCorpus.documents, expectedPerDoc):
            assert isinstance(data,kindred.Document)
            entities = data.getEntities()
            relations = data.getRelations()
            sourceEntityIDsToEntityIDs = data.getSourceEntityIDsToEntityIDs()
            assertEntity(entities[0],expectedType='disease',expectedText=diseaseText,expectedPos=diseasePos,expectedSourceEntityID="T1")
            assertEntity(entities[1],expectedType='gene',expectedText=geneText,expectedPos=genePos,expectedSourceEntityID="T2")
            assert relations == [kindred.Relation('causes',[sourceEntityIDsToEntityIDs["T1"],sourceEntityIDsToEntityIDs["T2"]],['obj','subj'])], "(%s) not as expected" % relations
    finally:
        shutil.rmtree(tempDir)
예제 #15
0
def test_saveStandoffFile_noArgNames():
    """Save a relation created without argument names in standoff format and check it reloads with 'arg1'/'arg2'."""
    text = "The colorectal cancer was caused by mutations in APC"
    e1 = kindred.Entity(entityType="disease",text="colorectal cancer",position=[(4, 21)],sourceEntityID="T1")
    e2 = kindred.Entity(entityType="gene",text="APC",position=[(49, 52)],sourceEntityID="T2")
    rel = kindred.Relation(relationType="causes",entityIDs=[e1.entityID,e2.entityID])
    doc = kindred.Document(text,[e1,e2],[rel],relationsUseSourceIDs=False)
    corpus = kindred.Corpus()
    corpus.addDocument(doc)

    tempDir = tempfile.mkdtemp()
    # mkdtemp leaves clean-up to the caller; the finally block removes the
    # directory even when one of the assertions below fails.
    try:
        kindred.save(corpus,'standoff',tempDir)

        loadedCorpus = kindred.loadDir('standoff',tempDir)

        assert isinstance(loadedCorpus,kindred.Corpus)
        assert len(loadedCorpus.documents) == 1
        loadedDoc = loadedCorpus.documents[0]

        assert isinstance(loadedDoc,kindred.Document)
        entities = loadedDoc.getEntities()
        relations = loadedDoc.getRelations()

        sourceEntityIDsToEntityIDs = loadedDoc.getSourceEntityIDsToEntityIDs()

        assertEntity(entities[0],expectedType='disease',expectedText='colorectal cancer',expectedPos=[(4,21)],expectedSourceEntityID="T1")
        assertEntity(entities[1],expectedType='gene',expectedText='APC',expectedPos=[(49,52)],expectedSourceEntityID="T2")
        assert relations == [kindred.Relation('causes',[sourceEntityIDsToEntityIDs["T1"],sourceEntityIDsToEntityIDs["T2"]],['arg1','arg2'])], "(%s) not as expected" % relations
    finally:
        shutil.rmtree(tempDir)
예제 #16
0
def test_candidatebuilder_simple():
    """Build candidate relations with CandidateBuilder.build and check their entities and known relation types."""
    taggedText = '<drug id="1">Erlotinib</drug> is a common treatment for <cancer id="2">NSCLC</cancer>. <drug id="3">Aspirin</drug> is the main cause of <disease id="4">boneitis</disease>. <relation type="treats" subj="1" obj="2" />'

    corpus = kindred.Corpus(taggedText,loadFromSimpleTag=True)
    kindred.Parser().parse(corpus)

    builder = kindred.CandidateBuilder()
    candidateRelations = builder.build(corpus)

    assert len(candidateRelations) == 4

    for candidate in candidateRelations:
        assert isinstance(candidate, kindred.CandidateRelation)
        assert len(candidate.entities) == 2

    # Source IDs of the (first, second) entity of each candidate, in order.
    observedIDs = [
        (candidate.entities[0].sourceEntityID, candidate.entities[1].sourceEntityID)
        for candidate in candidateRelations
    ]
    assert observedIDs == [('1', '2'), ('2', '1'), ('3', '4'), ('4', '3')]

    # Only the second candidate is expected to carry an annotated relation.
    observedKnown = [candidate.knownTypesAndArgNames for candidate in candidateRelations]
    assert observedKnown == [[], [('treats',['obj','subj'])], [], []]
예제 #17
0
def test_entityrecognizer_fusion_1():
    """Check the two 'combo' gene entities expected with detectFusionGenes=True."""
    lookup = makeTestLookup()

    corpus = kindred.Corpus('EGFR-ERBB2 is not a real fusion gene, but FGFR3-TACC3 is.')
    kindred.Parser().parse(corpus)

    recognizer = kindred.EntityRecognizer(lookup, detectFusionGenes=True)
    recognizer.annotate(corpus)

    doc = corpus.documents[0]
    assert len(doc.entities) == 2

    # (entityType, externalID, text, position, sourceEntityID) per entity.
    observed = [
        (e.entityType, e.externalID, e.text, e.position, e.sourceEntityID)
        for e in doc.entities
    ]
    assert observed == [
        ('gene', 'combo|HGNC:3236|HGNC:2064', 'EGFR-ERBB2', [(0, 10)], 'T1'),
        ('gene', 'combo|HGNC:3690|HGNC:11524', 'FGFR3-TACC3', [(42, 53)], 'T2'),
    ]
예제 #18
0
def test_iterLoadBiocFile():
    """Save 100 copies of one document as BioC, stream them back with iterLoadDataFromBioc, and check every chunk."""
    text = 'The <disease id="T1">colorectal cancer</disease> was caused by mutations in <gene id="T2">APC</gene><relation type="causes" subj="T2" obj="T1" />'
    corpus = kindred.Corpus(text,loadFromSimpleTag=True)
    docsToCreate = 100

    singleDoc = corpus.documents[0]
    corpus.documents = [ singleDoc for _ in range(docsToCreate) ]

    tempDir = tempfile.mkdtemp()
    # mkdtemp leaves clean-up to the caller; the finally block removes the
    # directory even when one of the assertions below fails.
    try:
        kindred.save(corpus,'bioc',tempDir)

        biocPath = os.path.join(tempDir,'collection.bioc.xml')
        totalDocCount = 0
        # A separate loop name avoids shadowing the corpus that was saved above.
        for subCorpus in kindred.iterLoadDataFromBioc(biocPath,corpusSizeCutoff=3):
            assert isinstance(subCorpus,kindred.Corpus)

            assert len(subCorpus.documents) <= 25
            totalDocCount += len(subCorpus.documents)

            for doc in subCorpus.documents:
                assert isinstance(doc,kindred.Document)
                entities = doc.getEntities()
                relations = doc.getRelations()

                sourceEntityIDsToEntityIDs = doc.getSourceEntityIDsToEntityIDs()

                assertEntity(entities[0],expectedType='disease',expectedText='colorectal cancer',expectedPos=[(4,21)],expectedSourceEntityID="T1")
                assertEntity(entities[1],expectedType='gene',expectedText='APC',expectedPos=[(49,52)],expectedSourceEntityID="T2")
                assert relations == [kindred.Relation('causes',[sourceEntityIDsToEntityIDs["T1"],sourceEntityIDsToEntityIDs["T2"]],['obj','subj'])], "(%s) not as expected" % relations

        # No document may be lost or duplicated across the chunks.
        assert totalDocCount == docsToCreate
    finally:
        shutil.rmtree(tempDir)
예제 #19
0
def test_entityrecognizer_removepathways_off():
    """Check that 'EGFR' is annotated as a gene when removePathways=False."""
    lookup = makeTestLookup()

    corpus = kindred.Corpus('EGFR signalling is involved in lung cancer')
    kindred.Parser().parse(corpus)

    recognizer = kindred.EntityRecognizer(lookup, removePathways=False)
    recognizer.annotate(corpus)

    doc = corpus.documents[0]
    assert len(doc.entities) == 1
    found = doc.entities[0]

    observed = (found.entityType, found.externalID, found.text,
                found.position, found.sourceEntityID)
    assert observed == ('gene', 'HGNC:3236', 'EGFR', [(0, 4)], 'T1')

    assert len(doc.sentences) == 1
    # The entity is expected to be attached to token 0 of the only sentence.
    assert doc.sentences[0].entityAnnotations == [(found, [0])]
예제 #20
0
def test_simpleVectorizer_triple():
    """Vectorize three-entity candidates on entityTypes only and check the non-zero cells."""
    taggedText = '<drug id="1">Erlotinib</drug> is a common treatment for <cancer id="2">NSCLC</cancer> which targets <gene id="3">EGFR</gene>. <relation type="druginfo" drug="1" disease="2" gene="3" />'

    corpus = kindred.Corpus(taggedText, loadFromSimpleTag=True)
    kindred.Parser().parse(corpus)

    builder = kindred.CandidateBuilder(entityCount=3)
    candidateRelations = builder.build(corpus)

    # Restrict the features to the entity types only.
    vectorizer = kindred.Vectorizer(entityCount=3,
                                    featureChoice=["entityTypes"])
    matrix = vectorizer.fit_transform(candidateRelations)

    assert matrix.shape == (6, 9)

    expectedCells = [
        (0, 1), (0, 3), (0, 8),
        (1, 1), (1, 5), (1, 6),
        (2, 0), (2, 4), (2, 8),
        (3, 0), (3, 5), (3, 7),
        (4, 2), (4, 4), (4, 6),
        (5, 2), (5, 3), (5, 7),
    ]

    rowIdx, colIdx = matrix.nonzero()
    observedCells = sorted(zip(rowIdx.tolist(), colIdx.tolist()))
    assert sorted(expectedCells) == observedCells

    # Every expected cell must hold exactly 1.0.
    csrMatrix = matrix.tocsr()
    for row, col in expectedCells:
        assert csrMatrix[row, col] == 1.0
예제 #21
0
def test_entityrecognizer_merge_negativecase():
    """Check that 'EGFR ERBB2' stays as two separate gene entities with mergeTerms=True."""
    lookup = makeTestLookup()

    corpus = kindred.Corpus('EGFR ERBB2 is not anything.')
    kindred.Parser().parse(corpus)

    recognizer = kindred.EntityRecognizer(lookup, mergeTerms=True)
    recognizer.annotate(corpus)

    doc = corpus.documents[0]
    assert len(doc.entities) == 2

    # (externalID, text, position, sourceEntityID) for each expected entity.
    expectedEntities = [
        ('HGNC:3236', 'EGFR', [(0, 4)], 'T1'),
        ('HGNC:2064', 'ERBB2', [(5, 10)], 'T2'),
    ]
    for entity, (externalID, entityText, position, sourceID) in zip(doc.entities, expectedEntities):
        assert entity.entityType == 'gene'
        assert entity.externalID == externalID
        assert entity.text == entityText
        assert entity.position == position
        assert entity.sourceEntityID == sourceID
예제 #22
0
def test_entityrecognizer_acronyms_OFF():
    """Check that the default recognizer annotates both the long form and the bracketed 'NEU' as separate entities."""
    lookup = makeTestLookup()

    corpus = kindred.Corpus('The Never Ending Umbrella (NEU) is a true classic.')
    kindred.Parser().parse(corpus)

    recognizer = kindred.EntityRecognizer(lookup)
    recognizer.annotate(corpus)

    doc = corpus.documents[0]
    assert len(doc.entities) == 2

    # (entityType, externalID, text, position, sourceEntityID) per entity.
    observed = [
        (e.entityType, e.externalID, e.text, e.position, e.sourceEntityID)
        for e in doc.entities
    ]
    assert observed == [
        ('movie', 'IMDB:9999', 'Never Ending Umbrella', [(4, 25)], 'T1'),
        ('gene', 'HGNC:2064', 'NEU', [(27, 30)], 'T2'),
    ]
예제 #23
0
def test_entityrecognizer_merge_brackets_OFF():
    """Check that the default recognizer keeps a term and its bracketed short form as two entities."""
    lookup = makeTestLookup()

    corpus = kindred.Corpus('This paper studies non-small cell lung carcinoma (NSCLC).')
    kindred.Parser().parse(corpus)

    recognizer = kindred.EntityRecognizer(lookup)
    recognizer.annotate(corpus)

    doc = corpus.documents[0]
    assert len(doc.entities) == 2

    # Both entities share the same type and external ID; only the text,
    # position and source ID differ.
    expectedEntities = [
        ('non-small cell lung carcinoma', [(19, 48)], 'T1'),
        ('NSCLC', [(50, 55)], 'T2'),
    ]
    for entity, (entityText, position, sourceID) in zip(doc.entities, expectedEntities):
        assert entity.entityType == 'cancer'
        assert entity.externalID == 'DOID:3908'
        assert entity.text == entityText
        assert entity.position == position
        assert entity.sourceEntityID == sourceID
예제 #24
0
def test_entityrecognizer_basic():
    """Check that the default recognizer finds the single 'EGFR' gene entity and links it to its token."""
    lookup = makeTestLookup()

    corpus = kindred.Corpus('EGFR is a gene associated with lung cancer')
    kindred.Parser().parse(corpus)

    recognizer = kindred.EntityRecognizer(lookup)
    recognizer.annotate(corpus)

    doc = corpus.documents[0]
    assert len(doc.entities) == 1
    found = doc.entities[0]

    observed = (found.entityType, found.externalID, found.text,
                found.position, found.sourceEntityID)
    assert observed == ('gene', 'HGNC:3236', 'EGFR', [(0, 4)], 'T1')

    assert len(doc.sentences) == 1
    # The entity is expected to be attached to token 0 of the only sentence.
    assert doc.sentences[0].entityAnnotations == [(found, [0])]
예제 #25
0
def test_entityrecognizer_fusion_3():
    """Check that 'EGFR-lymphoma' gives a separate gene and cancer entity with detectFusionGenes=True."""
    lookup = makeTestLookup()

    corpus = kindred.Corpus('EGFR-lymphoma is not anything.')
    kindred.Parser().parse(corpus)

    recognizer = kindred.EntityRecognizer(lookup, detectFusionGenes=True)
    recognizer.annotate(corpus)

    doc = corpus.documents[0]
    assert len(doc.entities) == 2

    # (entityType, externalID, text, position, sourceEntityID) per entity.
    observed = [
        (e.entityType, e.externalID, e.text, e.position, e.sourceEntityID)
        for e in doc.entities
    ]
    assert observed == [
        ('gene', 'HGNC:3236', 'EGFR', [(0, 4)], 'T1'),
        ('cancer', 'DOID:0060058', 'lymphoma', [(5, 13)], 'T2'),
    ]
예제 #26
0
파일: pubtator.py 프로젝트: vj1494/kindred
def load(pmids):
    """
    Load a set of documents with annotations from Pubmed given one or more Pubmed IDs (PMIDs)

    >>> corpus = load(19894120)
    >>> len(corpus.documents)
    1

    :param pmids: a single Pubmed ID or a list of Pubmed IDs
    :type pmids: int or list of ints
    :returns: a kindred corpus object with one document per requested PMID
    :rtype: kindred.Corpus
    """

    assert isinstance(pmids, (list, int))

    # Normalise the argument to a list so a single loop handles both cases.
    if isinstance(pmids, list):
        requested = pmids
    elif isinstance(pmids, int):
        requested = [pmids]
    else:
        # Only reachable when assertions are disabled: nothing is loaded.
        requested = []

    corpus = kindred.Corpus()
    for pmid in requested:
        doc = _loadPMID(pmid)
        assert isinstance(doc, kindred.Document)
        corpus.addDocument(doc)
    return corpus
예제 #27
0
def test_entityrecognizer_fusion_OFF():
    """Check that the default recognizer treats 'EGFR-ERBB2' as two separate gene entities."""
    lookup = makeTestLookup()

    corpus = kindred.Corpus('EGFR-ERBB2 is not a real fusion gene')
    kindred.Parser().parse(corpus)

    recognizer = kindred.EntityRecognizer(lookup)
    recognizer.annotate(corpus)

    doc = corpus.documents[0]
    assert len(doc.entities) == 2

    # (externalID, text, position, sourceEntityID) for each expected entity.
    expectedEntities = [
        ('HGNC:3236', 'EGFR', [(0, 4)], 'T1'),
        ('HGNC:2064', 'ERBB2', [(5, 10)], 'T2'),
    ]
    for entity, (externalID, entityText, position, sourceID) in zip(doc.entities, expectedEntities):
        assert entity.entityType == 'gene'
        assert entity.externalID == externalID
        assert entity.text == entityText
        assert entity.position == position
        assert entity.sourceEntityID == sourceID
예제 #28
0
def test_entityrecognizer_twoSentences():
    """Check entity positions when the two gene mentions are in different sentences."""
    lookup = makeTestLookup()

    corpus = kindred.Corpus('EGFR is one gene. ERBB2 is another gene.')
    kindred.Parser().parse(corpus)

    recognizer = kindred.EntityRecognizer(lookup)
    recognizer.annotate(corpus)

    doc = corpus.documents[0]
    assert len(doc.entities) == 2

    # (entityType, externalID, text, position, sourceEntityID) per entity.
    # The expected positions are offsets into the whole document text.
    observed = [
        (e.entityType, e.externalID, e.text, e.position, e.sourceEntityID)
        for e in doc.entities
    ]
    assert observed == [
        ('gene', 'HGNC:3236', 'EGFR', [(0, 4)], 'T1'),
        ('gene', 'HGNC:2064', 'ERBB2', [(18, 23)], 'T2'),
    ]
예제 #29
0
def test_saveStandoffFile_noSourceEntityID():
    """Check that saving an entity without a sourceEntityID in standoff format raises the expected AssertionError."""
    taggedText = 'The <disease>colorectal cancer</disease> is bad.'
    corpus = kindred.Corpus(taggedText, loadFromSimpleTag=True)

    expectedMessage = 'Entities must have a sourceEntityID (e.g. T1) to be saved in the standoff format'

    with TempDir() as tempDir:
        with pytest.raises(AssertionError) as excinfo:
            kindred.save(corpus, 'standoff', tempDir)
        raisedMessage = excinfo.value.args[0]
        assert raisedMessage == expectedMessage
예제 #30
0
파일: test_save.py 프로젝트: vj1494/kindred
def test_saveStandoffFile():
    """Save a hand-built document in standoff format, check the .a2 files, reload it, and compare entities and relations."""
    text = "The colorectal cancer was caused by mutations in APC"
    diseaseEntity = kindred.Entity(entityType="disease",
                                   text="colorectal cancer",
                                   position=[(4, 21)],
                                   sourceEntityID="T1")
    geneEntity = kindred.Entity(entityType="gene",
                                text="APC",
                                position=[(49, 52)],
                                sourceEntityID="T2")
    causesRelation = kindred.Relation(relationType="causes",
                                      entities=[diseaseEntity, geneEntity],
                                      argNames=['obj', 'subj'])
    doc = kindred.Document(text, [diseaseEntity, geneEntity], [causesRelation])
    corpus = kindred.Corpus()
    corpus.addDocument(doc)

    with TempDir() as tempDir:
        kindred.save(corpus, 'standoff', tempDir)

        # Every .a2 file that was written goes through the annotation checker.
        a2Names = [name for name in os.listdir(tempDir) if name.endswith('.a2')]
        for a2Name in a2Names:
            checkRelationAnnotations(os.path.join(tempDir, a2Name))

        loadedCorpus = kindred.load('standoff', tempDir)

    assert isinstance(loadedCorpus, kindred.Corpus)
    assert len(loadedCorpus.documents) == 1
    loadedDoc = loadedCorpus.documents[0]

    assert isinstance(loadedDoc, kindred.Document)
    loadedEntities = loadedDoc.entities
    loadedRelations = loadedDoc.relations

    # Index the reloaded entities by their source ID to build the expected relation.
    bySourceID = {}
    for entity in loadedEntities:
        bySourceID[entity.sourceEntityID] = entity

    assertEntity(loadedEntities[0],
                 expectedType='disease',
                 expectedText='colorectal cancer',
                 expectedPos=[(4, 21)],
                 expectedSourceEntityID="T1")
    assertEntity(loadedEntities[1],
                 expectedType='gene',
                 expectedText='APC',
                 expectedPos=[(49, 52)],
                 expectedSourceEntityID="T2")

    expectedRelation = kindred.Relation('causes',
                                        [bySourceID["T1"], bySourceID["T2"]],
                                        ['obj', 'subj'],
                                        sourceRelationID='R1')
    assert loadedRelations == [expectedRelation], "(%s) not as expected" % loadedRelations