Пример #1
0
def test_document_str():
	"""Check str() and repr() of documents loaded from SimpleTag text.

	Two documents are built in turn: one holding only entities and one that
	also carries a 'causes' relation. Both renderings must equal the expected string.
	"""
	cases = [
		(
			'<disease id="T1">Cancer</disease> is caused by mutations in <gene id="T2">ABCDE1</gene>.',
			"<Document Cancer is caused by mutations in ABCDE1. [<Entity disease:'Cancer' sourceid=T1 [(0, 6)]>, <Entity gene:'ABCDE1' sourceid=T2 [(33, 39)]>] []>",
		),
		(
			'<disease id="T1">Cancer</disease> is caused by mutations in <gene id="T2">ABCDE1</gene>.<relation type="causes" subj="T2" obj="T1" />',
			"<Document Cancer is caused by mutations in ABCDE1. [<Entity disease:'Cancer' sourceid=T1 [(0, 6)]>, <Entity gene:'ABCDE1' sourceid=T2 [(33, 39)]>] [<Relation causes [<Entity disease:'Cancer' sourceid=T1 [(0, 6)]>, <Entity gene:'ABCDE1' sourceid=T2 [(33, 39)]>] ['obj', 'subj']>]>",
		),
	]

	for simpleTag, expected in cases:
		doc = kindred.Document(simpleTag, loadFromSimpleTag=True)
		assert str(doc) == expected
		assert repr(doc) == expected
Пример #2
0
def test_convertedTaggedTextWithRelations():
    """Load SimpleTag text with two entities and one relation, then verify all of them."""
    taggedText = '<drug id="5">Erlotinib</drug> is a common treatment for <cancer id="6">NSCLC</cancer><relation type="treats" subj="5" obj="6" />'

    document = kindred.Document(taggedText, loadFromSimpleTag=True)
    assert isinstance(document, kindred.Document)

    foundEntities = document.getEntities()
    assert isinstance(foundEntities, list)
    assert all(isinstance(entity, kindred.Entity) for entity in foundEntities)

    # Keyword arguments handed to assertEntity for each entity, in document order
    expectations = [
        dict(expectedType='drug',
             expectedText='Erlotinib',
             expectedPos=[(0, 9)],
             expectedSourceEntityID='5'),
        dict(expectedType='cancer',
             expectedText='NSCLC',
             expectedPos=[(36, 41)],
             expectedSourceEntityID='6'),
    ]
    for index, expectation in enumerate(expectations):
        assertEntity(foundEntities[index], **expectation)

    # The stored text must no longer contain any markup
    plainText = document.getText()
    assert plainText == u"Erlotinib is a common treatment for NSCLC"

    idLookup = document.getSourceEntityIDsToEntityIDs()

    assert document.getRelations() == [
        kindred.Relation('treats',
                         [idLookup['6'], idLookup['5']],
                         ['obj', 'subj'])
    ]
Пример #3
0
def test_document_entityIDs():
    """getEntityIDs() must return the entity IDs in the same order as doc.entities."""
    document = kindred.Document(
        '<disease id="T1">Cancer</disease> is caused by mutations in <gene id="T2">ABCDE1</gene>.'
    )

    entityIDs = []
    for entity in document.entities:
        entityIDs.append(entity.entityID)

    assert document.getEntityIDs() == entityIDs
Пример #4
0
def test_saveStandoffFile_SeparateSentences():
	"""Round-trip two documents through the standoff format and check what is loaded back.

	Both documents are saved into a temporary directory, reloaded with
	kindred.loadDir, and their entities and relation are compared against the
	expected values. The temporary directory is removed even when saving,
	loading or an assertion fails, so failing runs do not leave directories behind.
	"""
	texts = ['The <disease id="T1">colorectal cancer</disease> was caused by mutations in <gene id="T2">APC</gene><relation type="causes" subj="T2" obj="T1" />','<disease id="T1">Li-Fraumeni</disease> was caused by mutations in <gene id="T2">P53</gene><relation type="causes" subj="T2" obj="T1" />']

	# Expected (type, text, position, source ID) of the entities of each document, in corpus order
	expectedEntities = [
		[('disease','colorectal cancer',[(4,21)],"T1"),('gene','APC',[(49,52)],"T2")],
		[('disease','Li-Fraumeni',[(0,11)],"T1"),('gene','P53',[(39,42)],"T2")],
	]

	corpus = kindred.Corpus()
	for t in texts:
		doc = kindred.Document(t)
		corpus.addDocument(doc)

	tempDir = tempfile.mkdtemp()
	try:
		kindred.save(corpus,'standoff',tempDir)

		loadedCorpus = kindred.loadDir('standoff',tempDir)

		assert isinstance(loadedCorpus,kindred.Corpus)
		assert len(loadedCorpus.documents) == 2

		for docIndex,expectedForDoc in enumerate(expectedEntities):
			data = loadedCorpus.documents[docIndex]
			assert isinstance(data,kindred.Document)
			entities = data.getEntities()
			relations = data.getRelations()
			sourceEntityIDsToEntityIDs = data.getSourceEntityIDsToEntityIDs()

			for entityIndex,(entityType,entityText,position,sourceID) in enumerate(expectedForDoc):
				assertEntity(entities[entityIndex],expectedType=entityType,expectedText=entityText,expectedPos=position,expectedSourceEntityID=sourceID)

			assert relations == [kindred.Relation('causes',[sourceEntityIDsToEntityIDs["T1"],sourceEntityIDsToEntityIDs["T2"]],['obj','subj'])], "(%s) not as expected" % relations
	finally:
		# Always clean up, including when an assertion above fails
		shutil.rmtree(tempDir)
Пример #5
0
def test_document_entityIDToEntity():
    """getEntityIDsToEntities() must map each entity's ID to the entity itself."""
    document = kindred.Document(
        '<disease id="T1">Cancer</disease> is caused by mutations in <gene id="T2">ABCDE1</gene>.',
        loadFromSimpleTag=True)

    entityByID = {}
    for entity in document.entities:
        entityByID[entity.entityID] = entity

    assert document.getEntityIDsToEntities() == entityByID
Пример #6
0
def test_convertTaggedTextWithSplitEntities():
    """Two tagged spans that share one ID must load as a single entity with two positions."""
    taggedText = '<drug id="1">Erlotinib</drug> is a common treatment for <cancer id="2">lung</cancer> and unknown <cancer id="2">cancers</cancer>'
    document = kindred.Document(taggedText, loadFromSimpleTag=True)

    assert isinstance(document, kindred.Document)
    foundEntities = document.getEntities()
    assert isinstance(foundEntities, list)
    assert all(isinstance(entity, kindred.Entity) for entity in foundEntities)

    # Keyword arguments handed to assertEntity for each entity, in document order
    expectations = [
        dict(expectedType='drug',
             expectedText='Erlotinib',
             expectedPos=[(0, 9)],
             expectedSourceEntityID='1'),
        dict(expectedType='cancer',
             expectedText='lung cancers',
             expectedPos=[(36, 40), (53, 60)],
             expectedSourceEntityID='2'),
    ]
    for index, expectation in enumerate(expectations):
        assertEntity(foundEntities[index], **expectation)

    # The stored text must no longer contain any markup
    plainText = document.getText()
    assert plainText == "Erlotinib is a common treatment for lung and unknown cancers"
Пример #7
0
def test_convertTaggedText():
    """Entities tagged without IDs are expected to carry the source IDs 1 and 2."""
    taggedText = "<drug>Erlotinib</drug> is a common treatment for <cancer>NSCLC</cancer>"
    document = kindred.Document(taggedText, loadFromSimpleTag=True)

    assert isinstance(document, kindred.Document)
    foundEntities = document.getEntities()
    assert isinstance(foundEntities, list)
    assert all(isinstance(entity, kindred.Entity) for entity in foundEntities)

    # Keyword arguments handed to assertEntity for each entity, in document order
    expectations = [
        dict(expectedType='drug',
             expectedText='Erlotinib',
             expectedPos=[(0, 9)],
             expectedSourceEntityID=1),
        dict(expectedType='cancer',
             expectedText='NSCLC',
             expectedPos=[(36, 41)],
             expectedSourceEntityID=2),
    ]
    for index, expectation in enumerate(expectations):
        assertEntity(foundEntities[index], **expectation)

    # The stored text must no longer contain any markup
    plainText = document.getText()
    assert plainText == "Erlotinib is a common treatment for NSCLC"
Пример #8
0
def convertBiocDocToKindredDocs(document):
	"""
	Convert a BioC document into a list of kindred documents, one per passage.

	For every passage, each annotation becomes a kindred.Entity (positions are
	made relative to the passage offset) and each relation becomes a
	kindred.Relation over the entities it references. The document metadata is
	the document infons updated with the passage infons, plus the document id.

	:param document: The BioC document to convert
	:type document: bioc.BioCDocument
	:return: One kindred.Document for each passage of the BioC document
	:rtype: list of kindred.Document
	"""
	assert isinstance(document,bioc.BioCDocument)
	kindredDocs = []
	for passage in document.passages:
		assert isinstance(passage,bioc.BioCPassage)
		
		text = passage.text
		offset = int(native(passage.offset))
		entities = []
		relations = []
		
		for a in passage.annotations:
			assert isinstance(a,bioc.BioCAnnotation)
			
			entityType = a.infons['type']
			sourceEntityID = a.id
			
			position = []
			segments = []
			
			for l in a.locations:
				assert isinstance(l,bioc.BioCLocation)
				# BioC offsets are shifted by the passage offset to index into the passage text
				startPos = int(native(l.offset)) - offset
				endPos = startPos + int(native(l.length))
				position.append((startPos,endPos))
				segments.append(text[startPos:endPos])
			
			# Entities made of several locations have their text pieces joined with spaces
			entityText = " ".join(segments)
			e = kindred.Entity(entityType,entityText,position,sourceEntityID)
			entities.append(e)

		sourceEntityIDToEntity = { entity.sourceEntityID:entity for entity in entities }
			
		for r in passage.relations:
			assert isinstance(r,bioc.BioCRelation)
			relationType = r.infons['type']
			
			arguments = []
			for n in r.nodes:
				assert isinstance(n,bioc.BioCNode)
				arguments.append((n.role,n.refid))
			# Sorting by (role, refid) gives a stable argument order
			arguments = sorted(arguments)
				
			argNames = [ argName for argName,sourceEntityID in arguments]
			sourceEntityIDs = [ sourceEntityID for argName,sourceEntityID in arguments]
			for sourceEntityID in sourceEntityIDs:
				assert sourceEntityID in sourceEntityIDToEntity, "Relation references entity %s which does not exist in BioC document id=%s" % (sourceEntityID,str(document.id))

			# Use a separate name here: rebinding 'entities' would make the document
			# below contain only the entities of the last relation
			entitiesInRelation = [ sourceEntityIDToEntity[sourceEntityID] for sourceEntityID in sourceEntityIDs ]
			
			relation = kindred.Relation(relationType,entitiesInRelation,argNames)
			relations.append(relation)
		
		metadata = dict(document.infons)
		metadata.update(passage.infons)
		metadata['id'] = document.id
		relData = kindred.Document(text,entities=entities,relations=relations,metadata=metadata)
		kindredDocs.append(relData)

	return kindredDocs
Пример #9
0
def test_saveStandoffFile_fromSimpleTag():
	"""Round-trip one SimpleTag document through the standoff format and check what is loaded back.

	The temporary directory is removed even when saving, loading or an
	assertion fails, so failing runs do not leave directories behind.
	"""
	text = 'The <disease id="T1">colorectal cancer</disease> was caused by mutations in <gene id="T2">APC</gene><relation type="causes" subj="T2" obj="T1" />'
	corpus = kindred.Corpus()
	doc = kindred.Document(text)
	corpus.addDocument(doc)

	tempDir = tempfile.mkdtemp()
	try:
		kindred.save(corpus,'standoff',tempDir)

		loadedCorpus = kindred.loadDir('standoff',tempDir)

		assert isinstance(loadedCorpus,kindred.Corpus)
		assert len(loadedCorpus.documents) == 1
		loadedDoc = loadedCorpus.documents[0]

		assert isinstance(loadedDoc,kindred.Document)
		entities = loadedDoc.getEntities()
		relations = loadedDoc.getRelations()

		sourceEntityIDsToEntityIDs = loadedDoc.getSourceEntityIDsToEntityIDs()

		assertEntity(entities[0],expectedType='disease',expectedText='colorectal cancer',expectedPos=[(4,21)],expectedSourceEntityID="T1")
		assertEntity(entities[1],expectedType='gene',expectedText='APC',expectedPos=[(49,52)],expectedSourceEntityID="T2")
		assert relations == [kindred.Relation('causes',[sourceEntityIDsToEntityIDs["T1"],sourceEntityIDsToEntityIDs["T2"]],['obj','subj'])], "(%s) not as expected" % relations
	finally:
		# Always clean up, including when an assertion above fails
		shutil.rmtree(tempDir)
Пример #10
0
def test_corpus_nfold_split():
    """Check the folds produced by Corpus.nfold_split.

    For every fold the train/test sizes are checked, every document must come
    from the main corpus, and no document may appear twice within a fold
    (neither inside one side nor across the train and test sides).

    Across all folds the test counts how often each document was used.
    NOTE(review): the final counts assume standard n-fold behaviour, i.e. every
    document is in the test set of exactly one fold and in the training set of
    the remaining folds - confirm against Corpus.nfold_split.
    """
    mainCorpus = kindred.Corpus()
    docCount = 100
    for i in range(docCount):
        doc = kindred.Document(text=str(i), entities=[])
        mainCorpus.addDocument(doc)

    folds = 5
    trainCounter, testCounter = Counter(), Counter()
    for trainCorpus, testCorpus in mainCorpus.nfold_split(folds):
        assert len(trainCorpus.documents) == (folds - 1) * docCount / folds
        assert len(testCorpus.documents) == docCount / folds

        # Inspect the corpora of this fold (not an unrelated split) and record
        # every document so duplicates within the fold are actually detected
        seen = set()
        for doc in trainCorpus.documents:
            assert doc in mainCorpus.documents, "This document doesn't match an existing one"
            assert not doc in seen, "This document isn't unique now"
            seen.add(doc)
            trainCounter[doc] += 1
        for doc in testCorpus.documents:
            assert doc in mainCorpus.documents, "This document doesn't match an existing one"
            assert not doc in seen, "This document isn't unique now"
            seen.add(doc)
            testCounter[doc] += 1

        # Train and test together must cover the whole corpus in every fold
        assert len(seen) == len(mainCorpus.documents)

    assert len(testCounter) == docCount
    for doc, count in trainCounter.items():
        assert count == folds - 1
    for doc, count in testCounter.items():
        assert count == 1
Пример #11
0
def parseSimpleTag(text,ignoreEntities=[]):
	"""
	Build a kindred.Document from text annotated in the SimpleTag format.

	The text is wrapped in a <doc> element, parsed as XML and handed to
	parseSimpleTag_helper. Either all entities or none must have IDs; when none
	have one, they are numbered from 1 in order. Entities sharing an ID are then
	merged and the relation tuples are turned into kindred.Relation objects.

	:param text: Text with SimpleTag annotations
	:param ignoreEntities: Passed through to parseSimpleTag_helper
	:return: Document with the plain text, entities and relations
	:rtype: kindred.Document
	"""
	# A single root element is needed for the text to parse as XML
	wrapped = u"<doc>%s</doc>" % text
	root = minidom.parseString(wrapped.encode('utf8')).childNodes[0]
	text,unmergedEntities,relationTuples = parseSimpleTag_helper(root,ignoreEntities=ignoreEntities)
	
	lacksID = [ e.sourceEntityID == '' for e in unmergedEntities ]
	noneHaveIDs = all(lacksID)
	allHaveIDs = not any(lacksID)
	assert noneHaveIDs or allHaveIDs, 'All entities or none (not some) should be given IDs'
	assert allHaveIDs or len(relationTuples) == 0, "Cannot include relations with no-ID entities"
	
	if noneHaveIDs:
		for number,e in enumerate(unmergedEntities,1):
			e.sourceEntityID = number
					
	entities = mergeEntitiesWithMatchingIDs(unmergedEntities)
		
	sourceEntityIDToEntity = {}
	for entity in entities:
		sourceEntityIDToEntity[entity.sourceEntityID] = entity

	relations = []
	for relationType,sourceEntityIDs,argNames in relationTuples:
		assert len(sourceEntityIDs) == len(argNames)
		members = [ sourceEntityIDToEntity[sourceEntityID] for sourceEntityID in sourceEntityIDs ]
		relations.append(kindred.Relation(relationType=relationType,entities=members,argNames=argNames))

	return kindred.Document(text,entities=entities,relations=relations)
Пример #12
0
def test_document_entitySourceIDToEntityID():
    """getSourceEntityIDsToEntityIDs() must map each entity's source ID to its entity ID."""
    document = kindred.Document(
        '<disease id="T1">Cancer</disease> is caused by mutations in <gene id="T2">ABCDE1</gene>.'
    )

    idBySourceID = {}
    for entity in document.entities:
        idBySourceID[entity.sourceEntityID] = entity.entityID

    assert document.getSourceEntityIDsToEntityIDs() == idBySourceID
Пример #13
0
def test_saveStandoffFile_noArgNames():
	"""Round-trip a relation created without argument names through the standoff format.

	The reloaded relation is expected to carry the argument names 'arg1' and
	'arg2'. The temporary directory is removed even when saving, loading or an
	assertion fails, so failing runs do not leave directories behind.
	"""
	text = "The colorectal cancer was caused by mutations in APC"
	e1 = kindred.Entity(entityType="disease",text="colorectal cancer",position=[(4, 21)],sourceEntityID="T1")
	e2 = kindred.Entity(entityType="gene",text="APC",position=[(49, 52)],sourceEntityID="T2")
	rel = kindred.Relation(relationType="causes",entityIDs=[e1.entityID,e2.entityID])
	doc = kindred.Document(text,[e1,e2],[rel],relationsUseSourceIDs=False)
	corpus = kindred.Corpus()
	corpus.addDocument(doc)

	tempDir = tempfile.mkdtemp()
	try:
		kindred.save(corpus,'standoff',tempDir)

		loadedCorpus = kindred.loadDir('standoff',tempDir)

		assert isinstance(loadedCorpus,kindred.Corpus)
		assert len(loadedCorpus.documents) == 1
		loadedDoc = loadedCorpus.documents[0]

		assert isinstance(loadedDoc,kindred.Document)
		entities = loadedDoc.getEntities()
		relations = loadedDoc.getRelations()

		sourceEntityIDsToEntityIDs = loadedDoc.getSourceEntityIDsToEntityIDs()

		assertEntity(entities[0],expectedType='disease',expectedText='colorectal cancer',expectedPos=[(4,21)],expectedSourceEntityID="T1")
		assertEntity(entities[1],expectedType='gene',expectedText='APC',expectedPos=[(49,52)],expectedSourceEntityID="T2")
		assert relations == [kindred.Relation('causes',[sourceEntityIDsToEntityIDs["T1"],sourceEntityIDsToEntityIDs["T2"]],['arg1','arg2'])], "(%s) not as expected" % relations
	finally:
		# Always clean up, including when an assertion above fails
		shutil.rmtree(tempDir)
Пример #14
0
def test_document_entityTypeMap():
    """getEntityIDsToEntityTypes() must map the entity IDs of T1 and T2 to their types."""
    document = kindred.Document(
        '<disease id="T1">Cancer</disease> is caused by mutations in <gene id="T2">ABCDE1</gene>.'
    )

    idLookup = document.getSourceEntityIDsToEntityIDs()

    typeByEntityID = {}
    typeByEntityID[idLookup["T1"]] = 'disease'
    typeByEntityID[idLookup["T2"]] = 'gene'

    assert document.getEntityIDsToEntityTypes() == typeByEntityID
Пример #15
0
def test_document_init():
    """A document built from plain text and two entities must render as expected."""
    disease = kindred.Entity('disease', 'Cancer', [(0, 6)], 'T1')
    gene = kindred.Entity('gene', 'ABCDE1', [(33, 39)], 'T2')

    document = kindred.Document("Cancer is caused by mutations in ABCDE1.",
                                [disease, gene])

    rendered = str(document)
    assert rendered == "<Document Cancer is caused by mutations in ABCDE1. [<Entity disease:'Cancer' sourceid=T1 [(0, 6)]>, <Entity gene:'ABCDE1' sourceid=T2 [(33, 39)]>] []>"
Пример #16
0
def test_saveStandoffFile():
    """Save a one-document corpus as standoff, check the .a2 files, reload and compare.

    The corpus is written into a TempDir, every '.a2' file in it is passed to
    checkRelationAnnotations, and the corpus is loaded back before the
    directory context is left. The reloaded entities and relation are then
    compared with the originals.
    """
    sentence = "The colorectal cancer was caused by mutations in APC"
    disease = kindred.Entity(entityType="disease",
                             text="colorectal cancer",
                             position=[(4, 21)],
                             sourceEntityID="T1")
    gene = kindred.Entity(entityType="gene",
                          text="APC",
                          position=[(49, 52)],
                          sourceEntityID="T2")
    causes = kindred.Relation(relationType="causes",
                              entities=[disease, gene],
                              argNames=['obj', 'subj'])
    document = kindred.Document(sentence, [disease, gene], [causes])
    corpus = kindred.Corpus()
    corpus.addDocument(document)

    with TempDir() as tempDir:
        kindred.save(corpus, 'standoff', tempDir)

        a2Names = [name for name in os.listdir(tempDir) if name.endswith('.a2')]
        for name in a2Names:
            checkRelationAnnotations(os.path.join(tempDir, name))

        loadedCorpus = kindred.load('standoff', tempDir)

    assert isinstance(loadedCorpus, kindred.Corpus)
    assert len(loadedCorpus.documents) == 1
    loadedDoc = loadedCorpus.documents[0]

    assert isinstance(loadedDoc, kindred.Document)
    loadedEntities = loadedDoc.entities
    loadedRelations = loadedDoc.relations

    entityBySourceID = {}
    for entity in loadedEntities:
        entityBySourceID[entity.sourceEntityID] = entity

    # Keyword arguments handed to assertEntity for each entity, in document order
    expectations = [
        dict(expectedType='disease',
             expectedText='colorectal cancer',
             expectedPos=[(4, 21)],
             expectedSourceEntityID="T1"),
        dict(expectedType='gene',
             expectedText='APC',
             expectedPos=[(49, 52)],
             expectedSourceEntityID="T2"),
    ]
    for index, expectation in enumerate(expectations):
        assertEntity(loadedEntities[index], **expectation)

    assert loadedRelations == [
        kindred.Relation('causes',
                         [entityBySourceID["T1"], entityBySourceID["T2"]],
                         ['obj', 'subj'],
                         sourceRelationID='R1')
    ], "(%s) not as expected" % loadedRelations
Пример #17
0
def test_document_addEntity():
    """An entity added through addEntity() must show up in the document's rendering."""
    document = kindred.Document("Cancer is caused by mutations in ABCDE1.", [])

    disease = kindred.Entity('disease', 'Cancer', [(0, 6)], 'T1')
    document.addEntity(disease)

    rendered = str(document)
    assert rendered == "<Document Cancer is caused by mutations in ABCDE1. [<Entity disease:'Cancer' sourceid=T1 [(0, 6)]>] []>"
Пример #18
0
def test_document_init_withRel():
    """A document built with two entities and a relation must render all three."""
    disease = kindred.Entity('disease', 'Cancer', [(0, 6)], 'T1')
    gene = kindred.Entity('gene', 'ABCDE1', [(33, 39)], 'T2')
    causes = kindred.Relation('causes', [disease, gene], ['subj', 'obj'])

    document = kindred.Document("Cancer is caused by mutations in ABCDE1.",
                                [disease, gene],
                                [causes])

    rendered = str(document)
    assert rendered == "<Document Cancer is caused by mutations in ABCDE1. [<Entity disease:'Cancer' sourceid=T1 [(0, 6)]>, <Entity gene:'ABCDE1' sourceid=T2 [(33, 39)]>] [<Relation causes [<Entity disease:'Cancer' sourceid=T1 [(0, 6)]>, <Entity gene:'ABCDE1' sourceid=T2 [(33, 39)]>] ['subj', 'obj']>]>"
Пример #19
0
def test_document_str():
    """Check str() and repr() of documents whose rendering includes the entity IDs.

    The entity IDs are looked up through getSourceEntityIDsToEntityIDs() and
    substituted into the expected strings, first for a document without a
    relation and then for one with a 'causes' relation.
    """
    plainDoc = kindred.Document(
        '<disease id="T1">Cancer</disease> is caused by mutations in <gene id="T2">ABCDE1</gene>.'
    )
    plainIDs = plainDoc.getSourceEntityIDsToEntityIDs()
    plainTemplate = "<Document Cancer is caused by mutations in ABCDE1. [<Entity disease:'Cancer' id=%d sourceid=T1 [(0, 6)]>, <Entity gene:'ABCDE1' id=%d sourceid=T2 [(33, 39)]>] []>"
    plainExpected = plainTemplate % (plainIDs["T1"], plainIDs["T2"])

    assert str(plainDoc) == plainExpected
    assert repr(plainDoc) == plainExpected

    relDoc = kindred.Document(
        '<disease id="T1">Cancer</disease> is caused by mutations in <gene id="T2">ABCDE1</gene>.<relation type="causes" subj="T2" obj="T1" />'
    )
    relIDs = relDoc.getSourceEntityIDsToEntityIDs()
    relTemplate = "<Document Cancer is caused by mutations in ABCDE1. [<Entity disease:'Cancer' id=%d sourceid=T1 [(0, 6)]>, <Entity gene:'ABCDE1' id=%d sourceid=T2 [(33, 39)]>] [<Relation causes [%d, %d] ['obj', 'subj']>]>"
    relExpected = relTemplate % (relIDs["T1"], relIDs["T2"], relIDs["T1"], relIDs["T2"])

    assert str(relDoc) == relExpected
    assert repr(relDoc) == relExpected
Пример #20
0
def parseJSON(data, ignoreEntities=[]):
    """
    Build a kindred.Document from an already decoded JSON dictionary.

    'denotations' entries become entities (unless their type is listed in
    ignoreEntities) and 'relations' entries become relations with the argument
    names 'obj' and 'subj'. Any top-level key outside the known set of fields
    triggers an assertion error.

    :param data: Dictionary with a 'text' key and optional annotation keys
    :param ignoreEntities: Entity types that should be skipped
    :return: Document with the text, entities and relations
    :rtype: kindred.Document
    """
    text = data['text']

    entities = []
    for denotation in data.get('denotations', []):
        sourceEntityID = denotation.get('id')

        entityType = denotation['obj']
        span = denotation['span']
        startPos = span['begin']
        endPos = span['end']

        if entityType in ignoreEntities:
            continue

        entities.append(
            kindred.Entity(entityType,
                           text[startPos:endPos],
                           [(startPos, endPos)],
                           sourceEntityID=sourceEntityID))

    relations = []
    for item in data.get('relations', []):
        obj = item['obj']
        relationType = item['pred']
        subj = item['subj']

        relations.append(
            kindred.Relation(relationType=relationType,
                             entityIDs=[obj, subj],
                             argNames=['obj', 'subj']))

    expected = [
        'denotations', 'divid', 'modifications', 'namespaces', 'project',
        'relations', 'sourcedb', 'sourceid', 'target', 'text', 'tracks'
    ]
    extraFields = [key for key in data if key not in expected]
    assert len(extraFields) == 0, "Found additional unexpected fields (%s) in JSON" % (
        ",".join(extraFields))

    return kindred.Document(text, entities=entities, relations=relations)
Пример #21
0
def loadDataFromSTFormat(txtFile,
                         a1File,
                         a2File,
                         verbose=False,
                         ignoreEntities=[],
                         ignoreComplexRelations=True):
    """
    Load one document from a text file plus its a1 and a2 annotation files.

    Entities are read from the a1 file, where every non-blank line must start
    with 'T'. Relations are read from lines of the a2 file that start with 'E'
    or 'R', if that file exists. The source filename of the document is the
    base name of txtFile without its last four characters.

    :param txtFile: Path to the text file
    :param a1File: Path to the a1 file with the entity annotations
    :param a2File: Path to the a2 file with the relation annotations (may not exist)
    :param verbose: Whether to report skipped lines or a missing a2 file on stderr
    :param ignoreEntities: Entity types that should be skipped
    :param ignoreComplexRelations: Must be True (complex relations are not supported)
    :return: Document with the text, entities and relations
    :rtype: kindred.Document
    """
    assert ignoreComplexRelations == True, "ignoreComplexRelations must be True as kindred doesn't currently support complex relations"

    with codecs.open(txtFile, "r", "utf-8") as textHandle:
        text = textHandle.read()

    entities = []
    with codecs.open(a1File, "r", "utf-8") as a1Handle:
        for rawLine in a1Handle:
            stripped = rawLine.strip()
            if stripped == '':
                continue

            assert rawLine[0] == 'T', "Only triggers are expected in a1 file: " + a1File
            entity = loadEntity(stripped, text)
            if entity is None or entity.entityType in ignoreEntities:
                continue
            entities.append(entity)

    relations = []
    if not os.path.exists(a2File):
        if verbose:
            sys.stderr.write("Note: No A2 file found : %s\n" %
                             os.path.basename(a2File))
    else:
        with codecs.open(a2File, "r", "utf-8") as a2Handle:
            for rawLine in a2Handle:
                stripped = rawLine.strip()
                if stripped == '':
                    continue

                if rawLine[0] in ('E', 'R'):
                    relation = loadRelation(stripped, ignoreComplexRelations)
                    if relation is not None:
                        relations.append(relation)
                elif verbose:
                    sys.stderr.write("Unable to process line: %s\n" % stripped)

    # Drop the last four characters (the extension) from the text file name
    baseFilename = os.path.basename(txtFile)[0:-4]

    return kindred.Document(text,
                            entities=entities,
                            relations=relations,
                            sourceFilename=baseFilename)
Пример #22
0
    def __init__(self, text=None):
        """
        Create a corpus, optionally seeded with a single document.

        :param text: Optional SimpleTag text to initialize a single document
        :type text: String (with SimpleTag format XML)
        """
        self.documents = []

        if text is not None:
            self.addDocument(kindred.Document(text))

        self.parsed = False
        self.relationTypes = None
Пример #23
0
def parseJSON(data,ignoreEntities=[]):
	"""
	Build a kindred.Document from already decoded JSON data.

	The data may be a dictionary or a list holding exactly one dictionary.
	'denotations' entries become entities (unless their type is listed in
	ignoreEntities) and 'relations' entries become relations over the entities
	referenced by their 'obj' and 'subj' source IDs. Any top-level key outside
	the known set of fields triggers an assertion error.

	:param data: Dictionary (or one-element list with a dictionary) with a 'text' key
	:param ignoreEntities: Entity types that should be skipped
	:return: Document with the text, entities and relations
	:rtype: kindred.Document
	"""
	if isinstance(data,list):
		assert len(data) == 1 and isinstance(data[0],dict), "JSON loading expects a dictionary or a list with one dictionary in it"
		data = data[0]
	assert isinstance(data,dict), "JSON loading expects a dictionary or a list with one dictionary in it"

	text = data['text']

	entities = []
	for denotation in data.get('denotations',[]):
		sourceEntityID = denotation.get('id')
		
		entityType = denotation['obj']
		span = denotation['span']
		startPos = span['begin']
		endPos = span['end']
		
		if entityType in ignoreEntities:
			continue

		entities.append(kindred.Entity(entityType,text[startPos:endPos],[(startPos,endPos)],sourceEntityID=sourceEntityID))

	# Relations refer to entities through their source IDs
	sourceEntityIDToEntity = {}
	for entity in entities:
		sourceEntityIDToEntity[entity.sourceEntityID] = entity

	relations = []
	for item in data.get('relations',[]):
		obj = item['obj']
		relationType = item['pred']
		subj = item['subj']
		
		members = [ sourceEntityIDToEntity[sourceEntityID] for sourceEntityID in (obj,subj) ]
		relations.append(kindred.Relation(relationType,members,['obj','subj']))
	
	expected = ['denotations','divid','modifications','namespaces','project','relations','sourcedb','sourceid','target','text','tracks']
	extraFields = [ key for key in data if key not in expected ]
	assert len(extraFields) == 0, "Found additional unexpected fields (%s) in JSON" % (",".join(extraFields))
		
	return kindred.Document(text,entities=entities,relations=relations)
Пример #24
0
    def __init__(self, text=None, loadFromSimpleTag=False):
        """
        Create an empty corpus, or one holding a single document built from the given text.

        :param text: Optional SimpleTag text to initialize a single document
        :param loadFromSimpleTag: If text is provided, whether the text parameter is in the SimpleTag format and will extract entities and relations accordingly
        :type text: String (with SimpleTag format XML)
        :type loadFromSimpleTag: bool
        """
        self.documents = []

        if text is not None:
            self.addDocument(
                kindred.Document(text, loadFromSimpleTag=loadFromSimpleTag))

        self.parsed = False
Пример #25
0
def loadDataFromStandoff(txtFile,ignoreEntities=[],ignoreComplexRelations=True):
	"""
	Load one document from a .txt file and the standoff annotation files next to it.

	Annotation files share the base name of the text file and use the
	extensions 'ann', 'a1' and 'a2'; only those that exist are read. All
	entities (lines starting with 'T') are loaded first, then the relations
	(lines starting with 'E' or 'R'), which must only reference loaded entities.

	:param txtFile: Path to the text file (must end with '.txt')
	:param ignoreEntities: Entity types that should be skipped
	:param ignoreComplexRelations: Must be True (complex relations are not supported)
	:return: Document with the text, entities and relations
	:rtype: kindred.Document
	"""
	assert ignoreComplexRelations == True, "ignoreComplexRelations must be True as kindred doesn't currently support complex relations"

	with codecs.open(txtFile, "r", "utf-8") as textHandle:
		text = textHandle.read()

	assert txtFile.endswith('.txt')
	stem = txtFile[:-4]

	annotationFiles = []
	for extension in ['ann','a1','a2']:
		candidate = "%s.%s" % (stem,extension)
		if os.path.isfile(candidate):
			annotationFiles.append(candidate)

	# First pass: gather every entity so that relations can refer to them afterwards
	entities = []
	for annotationFile in annotationFiles:
		with codecs.open(annotationFile, "r", "utf-8") as handle:
			for line in handle:
				if not line.startswith('T'):
					continue
				entity = loadEntity(annotationFile,line.strip(), text)
				if entity is None or entity.entityType in ignoreEntities:
					continue
				entities.append(entity)
		
	sourceEntityIDToEntity = {}
	for entity in entities:
		sourceEntityIDToEntity[entity.sourceEntityID] = entity

	# Second pass: build the relations from the event/relation lines
	relations = []
	for annotationFile in annotationFiles:
		with codecs.open(annotationFile, "r", "utf-8") as handle:
			for line in handle:
				if not line.startswith(('E','R')):
					continue
				relationTuple = loadRelation(annotationFile,line.strip(),ignoreComplexRelations)
				if relationTuple is None:
					continue

				relationType,sourceEntityIDs,argNames = relationTuple
				for sourceEntityID in sourceEntityIDs:
					assert sourceEntityID in sourceEntityIDToEntity, "Relation exists that references a non-existent entity (%s) associated with %s" % (sourceEntityID,txtFile)
				members = [ sourceEntityIDToEntity[sourceEntityID] for sourceEntityID in sourceEntityIDs ]
				relations.append(kindred.Relation(relationType,members,argNames))

	# Drop the last four characters (the '.txt' extension) from the file name
	baseFilename = os.path.basename(txtFile)[0:-4]

	return kindred.Document(text,entities=entities,relations=relations,sourceFilename=baseFilename)
Пример #26
0
    def __init__(self, text=None, loadFromSimpleTag=False):
        """
        Create an empty corpus, or one holding a single document built from the given text.

        :param text: Optional SimpleTag text to initialize a single document
        :param loadFromSimpleTag: If text is provided, whether the text parameter is in the SimpleTag format and will extract entities and relations accordingly
        :type text: String (with SimpleTag format XML)
        :type loadFromSimpleTag: bool
        """
        self.documents = []

        if text is not None:
            self.addDocument(
                kindred.Document(text, loadFromSimpleTag=loadFromSimpleTag))

        self.parsed = False
        self.candidateRelationsEntityCounts = set()
        self.relationTypes = None
Пример #27
0
def test_corpus_split():
    """Splitting 100 documents at 0.75 must give disjoint parts of 75 and 25 documents."""
    mainCorpus = kindred.Corpus()
    for number in range(100):
        mainCorpus.addDocument(kindred.Document(text=str(number), entities=[]))

    corpusA, corpusB = mainCorpus.split(0.75)

    assert len(corpusA.documents) == 75
    assert len(corpusB.documents) == 25

    # Walk both parts with one shared set so a document found twice is detected
    seen = set()
    for part in (corpusA, corpusB):
        for document in part.documents:
            assert document in mainCorpus.documents, "This document doesn't match an existing one"
            assert document not in seen, "This document isn't unique now"
            seen.add(document)

    assert len(seen) == len(mainCorpus.documents)
Пример #28
0
def parseSimpleTag(text, ignoreEntities=[]):
    """
    Build a kindred.Document from text annotated in the SimpleTag format.

    The text is wrapped in a <doc> element, parsed as XML and handed to
    parseSimpleTag_helper. Either all entities or none must have IDs; when none
    have one, they are numbered from 1 in order. Entities sharing an ID are
    merged before the document is created.

    :param text: Text with SimpleTag annotations
    :param ignoreEntities: Passed through to parseSimpleTag_helper
    :return: Document with the plain text, entities and relations
    :rtype: kindred.Document
    """
    # A single root element is needed for the text to parse as XML
    wrapped = u"<doc>%s</doc>" % text
    root = minidom.parseString(wrapped.encode('utf8')).childNodes[0]
    text, unmergedEntities, relations = parseSimpleTag_helper(
        root, ignoreEntities=ignoreEntities)

    lacksID = [e.sourceEntityID == '' for e in unmergedEntities]
    noneHaveIDs = all(lacksID)
    allHaveIDs = not any(lacksID)
    assert noneHaveIDs or allHaveIDs, 'All entities or none (not some) should be given IDs'
    assert allHaveIDs or len(relations) == 0, "Cannot include relations with no-ID entities"

    if noneHaveIDs:
        for number, e in enumerate(unmergedEntities, 1):
            e.sourceEntityID = number

    entities = mergeEntitiesWithMatchingIDs(unmergedEntities)

    return kindred.Document(text, entities=entities, relations=relations)
Пример #29
0
def convertBiocDocToKindredDocs(document):
    """
    Convert a BioC document into a list of kindred documents, one per passage.

    For every passage, each annotation becomes a kindred.Entity (positions are
    made relative to the passage offset and checked against the passage text)
    and each relation becomes a kindred.Relation over the entities it
    references. The document metadata is the document infons updated with the
    passage infons, plus the document id.

    :param document: The BioC document to convert
    :type document: bioc.BioCDocument
    :return: One kindred.Document for each passage of the BioC document
    :rtype: list of kindred.Document
    """
    assert isinstance(document, bioc.BioCDocument)
    kindredDocs = []
    for passage in document.passages:
        assert isinstance(passage, bioc.BioCPassage)

        text = passage.text
        offset = int(native(passage.offset))
        entities = []
        relations = []

        for a in passage.annotations:
            assert isinstance(a, bioc.BioCAnnotation)

            entityType = a.infons['type']
            sourceEntityID = a.id

            # Copy every infon except 'type' instead of deleting it from
            # a.infons, so the caller's BioC annotation is left unmodified
            entityMetadata = {
                key: value
                for key, value in a.infons.items() if key != 'type'
            }

            position = []
            segments = []

            for l in a.locations:
                assert isinstance(l, bioc.BioCLocation)
                # BioC offsets are shifted by the passage offset to index into the passage text
                startPos = int(native(l.offset)) - offset
                endPos = startPos + int(native(l.length))

                assert startPos >= 0 and startPos <= len(
                    text
                ) and endPos >= 0 and endPos <= len(
                    text
                ), "Entity offsets (offset=%s,length=%s) are outside the span of the text (%s)" % (
                    str(l.offset), str(l.length), passage.text)

                position.append((startPos, endPos))
                segments.append(text[startPos:endPos])

            # Entities made of several locations have their text pieces joined with spaces
            entityText = " ".join(segments)

            assert entityText == a.text, "Mismatch in entity annotation between expected text (%s) and extracted text (%s) using offset info for passage with text: %s" % (
                a.text, entityText, text)

            e = kindred.Entity(entityType,
                               entityText,
                               position,
                               sourceEntityID,
                               metadata=entityMetadata)
            entities.append(e)

        sourceEntityIDToEntity = {
            entity.sourceEntityID: entity
            for entity in entities
        }

        for r in passage.relations:
            assert isinstance(r, bioc.BioCRelation)
            relationType = r.infons['type']

            arguments = []
            for n in r.nodes:
                assert isinstance(n, bioc.BioCNode)
                arguments.append((n.role, n.refid))
            # Sorting by (role, refid) gives a stable argument order
            arguments = sorted(arguments)

            argNames = [argName for argName, sourceEntityID in arguments]
            sourceEntityIDs = [
                sourceEntityID for argName, sourceEntityID in arguments
            ]
            for sourceEntityID in sourceEntityIDs:
                assert sourceEntityID in sourceEntityIDToEntity, "Relation references entity %s which does not exist in BioC document id=%s" % (
                    sourceEntityID, str(document.id))

            # Use a separate name here: rebinding 'entities' would make the
            # document below contain only the entities of the last relation
            entitiesInRelation = [
                sourceEntityIDToEntity[sourceEntityID]
                for sourceEntityID in sourceEntityIDs
            ]

            relation = kindred.Relation(relationType, entitiesInRelation,
                                        argNames)
            relations.append(relation)

        metadata = dict(document.infons)
        metadata.update(passage.infons)
        metadata['id'] = document.id
        relData = kindred.Document(text,
                                   entities=entities,
                                   relations=relations,
                                   metadata=metadata)
        kindredDocs.append(relData)

    return kindredDocs
Пример #30
0
	for doc in documents:
		title_plus_abstract = doc['title'] + "\n" + doc['abstract']
		if title_plus_abstract in existing_mapping:
			already_parsed.append(existing_mapping[title_plus_abstract])
		else:
			needs_parsing.append(title_plus_abstract)		
	needs_parsing = sorted(set(needs_parsing))
	
	if use_previous_parses:
		print("Found %d documents with existing parses" % len(already_parsed))
	print("Found %d documents to parse" % len(needs_parsing))
	sys.stdout.flush()
	
	corpus = kindred.Corpus()
	for title_plus_abstract in needs_parsing:
		kindred_doc = kindred.Document(title_plus_abstract)
		corpus.addDocument(kindred_doc)
		
	print("Parsing...")
	sys.stdout.flush()
	parser = kindred.Parser(model='en_core_sci_sm')
	parser.parse(corpus)
	
	corpus.documents += already_parsed
	
	print("Saving %d parses..." % len(corpus.documents))
	sys.stdout.flush()
	with open(args.outPickle,'wb') as outF:
		pickle.dump(corpus,outF)