示例#1
0
def loadDataFromStandoff(txtFile,
                         ignoreEntities=[],
                         ignoreComplexRelations=True):
    """
    Build a kindred.Document from a text file and its standoff annotations.

    The text is read from txtFile. Annotation files sharing the same base
    name with the extensions .ann, .a1 and .a2 are used when they exist.
    Lines starting with 'T' are handed to loadEntity and lines starting
    with 'E' or 'R' are handed to loadRelation.

    :param txtFile: Path to the text file (must end in '.txt')
    :param ignoreEntities: Entity types that should be left out of the document
    :param ignoreComplexRelations: Passed through to loadRelation; must be True
    :return: Document holding the text, entities and relations
    :rtype: kindred.Document
    """
    assert ignoreComplexRelations == True, "ignoreComplexRelations must be True as kindred doesn't currently support complex relations"

    # The text is read before the extension check, so a missing file is
    # reported as an I/O error rather than an assertion failure.
    with codecs.open(txtFile, "r", "utf-8") as handle:
        text = handle.read()

    assert txtFile.endswith('.txt')
    stem = txtFile[:-4]

    annotationFiles = []
    for extension in ('ann', 'a1', 'a2'):
        candidate = "%s.%s" % (stem, extension)
        if os.path.isfile(candidate):
            annotationFiles.append(candidate)

    # First pass over every annotation file: collect the entities.
    entities = []
    for annotationFile in annotationFiles:
        with codecs.open(annotationFile, "r", "utf-8") as handle:
            for line in handle:
                if not line.startswith('T'):
                    continue
                entity = loadEntity(annotationFile, line.strip(), text)
                if entity is None or entity.entityType in ignoreEntities:
                    continue
                entities.append(entity)

    entityLookup = {}
    for entity in entities:
        entityLookup[entity.sourceEntityID] = entity

    # Second pass: relations are resolved only once every entity is known,
    # so a relation may refer to an entity from any of the annotation files.
    relations = []
    for annotationFile in annotationFiles:
        with codecs.open(annotationFile, "r", "utf-8") as handle:
            for line in handle:
                if not line.startswith(('E', 'R')):
                    continue
                relationTuple = loadRelation(annotationFile, line.strip(),
                                             ignoreComplexRelations)
                if relationTuple is None:
                    continue

                sourceRelationID, relationType, sourceEntityIDs, argNames = relationTuple
                for sourceEntityID in sourceEntityIDs:
                    assert sourceEntityID in entityLookup, "Relation exists that references a non-existent entity (%s) associated with %s" % (
                        sourceEntityID, txtFile)

                members = [entityLookup[entityID] for entityID in sourceEntityIDs]
                relations.append(
                    kindred.Relation(relationType,
                                     members,
                                     argNames,
                                     sourceRelationID=sourceRelationID))

    # The document is labelled with the file name minus directory and '.txt'.
    baseFilename = os.path.basename(txtFile)[:-4]

    return kindred.Document(text,
                            entities=entities,
                            relations=relations,
                            sourceFilename=baseFilename)
示例#2
0
def convertBiocDocToKindredDocs(document):
    """
    Convert a BioC document into kindred Documents, one per passage.

    Every annotation in a passage becomes a kindred.Entity and every BioC
    relation becomes a kindred.Relation whose arguments are ordered by
    sorting the (role, refid) pairs. The metadata of each resulting document
    is the BioC document infons, updated with the passage infons, plus an
    'id' entry holding the BioC document id.

    The input BioC objects are not modified.

    :param document: BioC document to convert
    :type document: bioc.BioCDocument
    :return: One kindred.Document for each passage
    :rtype: list of kindred.Document
    """
    assert isinstance(document, bioc.BioCDocument)
    kindredDocs = []
    for passage in document.passages:
        assert isinstance(passage, bioc.BioCPassage)

        text = passage.text
        # NOTE(review): the passage offset is converted but never applied;
        # annotation locations below index straight into the passage text.
        # If locations are document-relative this needs adjusting - confirm.
        offset = int(native(passage.offset))
        entities = []
        relations = []

        for a in passage.annotations:
            assert isinstance(a, bioc.BioCAnnotation)

            entityType = a.infons['type']
            sourceEntityID = a.id

            # Copy the infons without 'type' instead of deleting the key in
            # place, so the annotation survives and the document can be
            # converted more than once.
            metadata = {
                key: value
                for key, value in a.infons.items() if key != 'type'
            }

            position = []
            segments = []

            for l in a.locations:
                assert isinstance(l, bioc.BioCLocation)
                startPos = int(native(l.offset))
                endPos = startPos + int(native(l.length))
                position.append((startPos, endPos))
                segments.append(text[startPos:endPos])

            entityText = " ".join(segments)
            e = kindred.Entity(entityType,
                               entityText,
                               position,
                               sourceEntityID,
                               metadata=metadata)
            entities.append(e)

        sourceEntityIDToEntity = {
            entity.sourceEntityID: entity
            for entity in entities
        }

        for r in passage.relations:
            assert isinstance(r, bioc.BioCRelation)
            relationType = r.infons['type']

            arguments = []
            for n in r.nodes:
                assert isinstance(n, bioc.BioCNode)
                arguments.append((n.role, n.refid))
            arguments = sorted(arguments)

            argNames = [argName for argName, sourceEntityID in arguments]
            sourceEntityIDs = [
                sourceEntityID for argName, sourceEntityID in arguments
            ]
            for sourceEntityID in sourceEntityIDs:
                assert sourceEntityID in sourceEntityIDToEntity, "Relation references entity %s which does not exist in BioC document id=%s" % (
                    sourceEntityID, str(document.id))

            # Use a separate name here: rebinding `entities` would leave the
            # Document below holding only the last relation's entities.
            entitiesInRelation = [
                sourceEntityIDToEntity[sourceEntityID]
                for sourceEntityID in sourceEntityIDs
            ]

            relation = kindred.Relation(relationType, entitiesInRelation,
                                        argNames)
            relations.append(relation)

        metadata = dict(document.infons)
        metadata.update(passage.infons)
        metadata['id'] = document.id
        relData = kindred.Document(text,
                                   entities=entities,
                                   relations=relations,
                                   metadata=metadata)
        kindredDocs.append(relData)

    return kindredDocs
示例#3
0
	def predict(self,corpus):
		"""
		Classify the candidate relations of a corpus and store the predictions in it.

		The corpus is parsed first if it has not been parsed yet. Nothing is
		returned; predicted relations are added to the documents that hold
		their entities.

		:param corpus: Corpus to make predictions on
		:type corpus: kindred.Corpus
		"""
		assert self.isTrained, "Classifier must be trained using train() before predictions can be made"
		assert isinstance(corpus,kindred.Corpus)

		if not corpus.parsed:
			kindred.Parser(model=self.model).parse(corpus)

		candidates = self.candidateBuilder.build(corpus)

		# Without candidates there is nothing to classify
		if len(candidates) == 0:
			return

		featureMatrix = self.vectorizer.transform(candidates)
		classMatrix = self.clf.predict(featureMatrix)
		probMatrix = self.clf.predict_proba(featureMatrix) if self.clf.has_predict_proba() else None

		# Every non-zero cell pairs a candidate (row) with a relation type (column)
		predictedRelations = []
		for rowIndex,colIndex in zip(*classMatrix.nonzero()):
			candidate = candidates[rowIndex]
			probability = None if probMatrix is None else probMatrix[rowIndex,colIndex]

			relKey = self.colToRelType[colIndex]
			relType,argNames = relKey[0],relKey[1:]

			# Keep only predictions whose entity types are valid for this relation type
			entityTypes = tuple( e.entityType for e in candidate.entities )
			if entityTypes in self.relTypeToValidEntityTypes[relKey]:
				predictedRelations.append(kindred.Relation(relType,candidate.entities,argNames=argNames,probability=probability))

		# Map each entity to the index of the document that contains it
		entityToDocIndex = { entity:docIndex for docIndex,doc in enumerate(corpus.documents) for entity in doc.entities }

		for relation in predictedRelations:
			docIndices = list({ entityToDocIndex[e] for e in relation.entities })
			assert len(docIndices) > 0, "Predicted relation contains entities that don't match any documents in corpus"
			assert len(docIndices) == 1, "Predicted relation contains entities that are spread across documents"

			# Relations already present in the document are not added again
			targetDoc = corpus.documents[docIndices[0]]
			if relation not in targetDoc.relations:
				targetDoc.addRelation(relation)
示例#4
0
def parseJSON(data, ignoreEntities=[]):
    """
    Build a kindred.Document from already-decoded JSON annotation data.

    The data must be a dictionary, or a list holding exactly one dictionary.
    The 'text' key is required. Optional 'denotations' become entities and
    optional 'relations' become relations with the argument names
    'obj' and 'subj'. Any key outside the known set of fields fails an
    assertion.

    :param data: Decoded JSON (dictionary, or one-element list with a dictionary)
    :param ignoreEntities: Entity types that should be left out of the document
    :return: Document holding the text, entities and relations
    :rtype: kindred.Document
    """
    shapeError = "JSON loading expects a dictionary or a list with one dictionary in it"
    if isinstance(data, list):
        assert len(data) == 1 and isinstance(data[0], dict), shapeError
        data = data[0]
    assert isinstance(data, dict), shapeError

    text = data['text']

    entities = []
    for denotation in data.get('denotations', []):
        sourceEntityID = denotation['id'] if 'id' in denotation else None

        entityType = denotation['obj']
        span = denotation['span']
        startPos = span['begin']
        endPos = span['end']
        position = [(startPos, endPos)]
        entityText = text[startPos:endPos]

        if entityType in ignoreEntities:
            continue
        entities.append(
            kindred.Entity(entityType,
                           entityText,
                           position,
                           sourceEntityID=sourceEntityID))

    entityLookup = {}
    for entity in entities:
        entityLookup[entity.sourceEntityID] = entity

    relations = []
    for relationData in data.get('relations', []):
        objID = relationData['obj']
        relationType = relationData['pred']
        subjID = relationData['subj']

        # A relation naming an unknown (or ignored) entity raises KeyError here
        members = [entityLookup[objID], entityLookup[subjID]]
        relations.append(
            kindred.Relation(relationType, members, ['obj', 'subj']))

    # The field check runs only after the content has been processed
    knownFields = {
        'denotations', 'divid', 'modifications', 'namespaces', 'project',
        'relations', 'sourcedb', 'sourceid', 'target', 'text', 'tracks'
    }
    extraFields = [key for key in data.keys() if key not in knownFields]
    assert len(extraFields) == 0, "Found additional unexpected fields (%s) in JSON" % (
        ",".join(extraFields))

    return kindred.Document(text, entities=entities, relations=relations)
示例#5
0
def test_saveStandoffFile_SeparateSentences():
    """
    Round-trip two SimpleTag documents through the standoff format.

    The corpus is saved into a temporary directory, loaded back with
    kindred.loadDir and each loaded document is checked for its two entities
    and its single 'causes' relation.
    """
    texts = [
        'The <disease id="T1">colorectal cancer</disease> was caused by mutations in <gene id="T2">APC</gene><relation type="causes" subj="T2" obj="T1" />',
        '<disease id="T1">Li-Fraumeni</disease> was caused by mutations in <gene id="T2">P53</gene><relation type="causes" subj="T2" obj="T1" />'
    ]
    corpus = kindred.Corpus()
    for t in texts:
        doc = kindred.Document(t, loadFromSimpleTag=True)
        corpus.addDocument(doc)

    # Per document: (disease text, disease position, gene text, gene position)
    expectedPerDocument = [
        ('colorectal cancer', [(4, 21)], 'APC', [(49, 52)]),
        ('Li-Fraumeni', [(0, 11)], 'P53', [(39, 42)]),
    ]

    tempDir = tempfile.mkdtemp()
    try:
        kindred.save(corpus, 'standoff', tempDir)

        loadedCorpus = kindred.loadDir('standoff', tempDir)

        assert isinstance(loadedCorpus, kindred.Corpus)
        assert len(loadedCorpus.documents) == 2

        for docIndex, expected in enumerate(expectedPerDocument):
            diseaseText, diseasePos, geneText, genePos = expected

            data = loadedCorpus.documents[docIndex]
            assert isinstance(data, kindred.Document)
            entities = data.getEntities()
            relations = data.getRelations()
            sourceEntityIDsToEntityIDs = data.getSourceEntityIDsToEntityIDs()
            assertEntity(entities[0],
                         expectedType='disease',
                         expectedText=diseaseText,
                         expectedPos=diseasePos,
                         expectedSourceEntityID="T1")
            assertEntity(entities[1],
                         expectedType='gene',
                         expectedText=geneText,
                         expectedPos=genePos,
                         expectedSourceEntityID="T2")
            assert relations == [
                kindred.Relation('causes', [
                    sourceEntityIDsToEntityIDs["T1"],
                    sourceEntityIDsToEntityIDs["T2"]
                ], ['obj', 'subj'])
            ], "(%s) not as expected" % relations
    finally:
        # Remove the directory even when an assertion above fails, so failed
        # runs do not leave temporary directories behind.
        shutil.rmtree(tempDir)
示例#6
0
def test_saveStandoffFile_SeparateSentences():
    """
    Round-trip two SimpleTag documents through the standoff format.

    The corpus is saved into a temporary directory, every written .a2 file
    is passed to checkRelationAnnotations, and the corpus is loaded back.
    Each loaded document must hold its two entities and one 'causes' relation.
    """
    texts = [
        'The <disease id="T1">colorectal cancer</disease> was caused by mutations in <gene id="T2">APC</gene><relation type="causes" subj="T2" obj="T1" />',
        '<disease id="T1">Li-Fraumeni</disease> was caused by mutations in <gene id="T2">P53</gene><relation type="causes" subj="T2" obj="T1" />'
    ]
    corpus = kindred.Corpus()
    for t in texts:
        corpus.addDocument(kindred.Document(t, loadFromSimpleTag=True))

    with TempDir() as tempDir:
        kindred.save(corpus, 'standoff', tempDir)

        a2Files = [name for name in os.listdir(tempDir) if name.endswith('.a2')]
        for name in a2Files:
            checkRelationAnnotations(os.path.join(tempDir, name))

        loadedCorpus = kindred.load('standoff', tempDir)

    assert isinstance(loadedCorpus, kindred.Corpus)
    assert len(loadedCorpus.documents) == 2

    # Per document: the expected (text, position) of the disease and the gene
    expectedPerDocument = [
        (('colorectal cancer', [(4, 21)]), ('APC', [(49, 52)])),
        (('Li-Fraumeni', [(0, 11)]), ('P53', [(39, 42)])),
    ]

    for docIndex, (diseaseInfo, geneInfo) in enumerate(expectedPerDocument):
        data = loadedCorpus.documents[docIndex]
        assert isinstance(data, kindred.Document)
        entities = data.entities
        relations = data.relations

        bySourceID = {}
        for entity in entities:
            bySourceID[entity.sourceEntityID] = entity

        assertEntity(entities[0],
                     expectedType='disease',
                     expectedText=diseaseInfo[0],
                     expectedPos=diseaseInfo[1],
                     expectedSourceEntityID="T1")
        assertEntity(entities[1],
                     expectedType='gene',
                     expectedText=geneInfo[0],
                     expectedPos=geneInfo[1],
                     expectedSourceEntityID="T2")

        expectedRelation = kindred.Relation(
            'causes', [bySourceID["T1"], bySourceID["T2"]], ['obj', 'subj'])
        assert relations == [expectedRelation
                             ], "(%s) not as expected" % relations
示例#7
0
    def splitIntoSentences(self):
        """
        Create a new corpus with one document for each sentence in this document.

        Entities and tokens are copied with their positions shifted so they
        are relative to the start of the sentence's first token. Every
        segment of an entity position is shifted, so entities made of several
        segments keep all of them. A relation is copied into a sentence
        document only when all of its entities are annotated in that sentence.

        :return: Corpus with one document per sentence
        :rtype: kindred.Corpus
        """

        sentenceCorpus = kindred.Corpus()

        for sentence in self.sentences:
            sentenceStart = sentence.tokens[0].startPos

            # Maps each original entity to its shifted copy, in annotation order
            entityMap = OrderedDict()
            for e, _ in sentence.entityAnnotations:
                newPosition = [(startPos - sentenceStart,
                                endPos - sentenceStart)
                               for startPos, endPos in e.position]
                newE = kindred.Entity(e.entityType, e.text, newPosition,
                                      e.sourceEntityID, e.externalID)
                entityMap[e] = newE

            # Membership is tested against the dict rather than a list scan
            relationsInSentence = [
                r for r in self.relations
                if all(e in entityMap for e in r.entities)
            ]
            newRelationsInSentence = []
            for r in relationsInSentence:
                newEntitiesInRelation = [entityMap[e] for e in r.entities]
                newRelation = kindred.Relation(r.relationType,
                                               newEntitiesInRelation,
                                               r.argNames, r.probability)
                newRelationsInSentence.append(newRelation)

            newEntitiesInSentence = list(entityMap.values())
            doc = kindred.Document(sentence.text, newEntitiesInSentence,
                                   newRelationsInSentence)

            newTokens = [
                kindred.Token(t.word, t.lemma, t.partofspeech,
                              t.startPos - sentenceStart,
                              t.endPos - sentenceStart)
                for t in sentence.tokens
            ]

            newSentence = kindred.Sentence(sentence.text, newTokens,
                                           sentence.dependencies,
                                           sentence.sourceFilename)
            # Re-point the annotations at the shifted entity copies
            newSentence.entityAnnotations = [
                (entityMap[e], tokenIndices)
                for e, tokenIndices in sentence.entityAnnotations
            ]
            doc.sentences = [newSentence]

            sentenceCorpus.addDocument(doc)

        return sentenceCorpus
示例#8
0
    def __init__(self,
                 text,
                 entities=None,
                 relations=None,
                 relationsUseSourceIDs=True,
                 sourceFilename=None,
                 metadata=None,
                 loadFromSimpleTag=False):
        """
        Constructor for a Document that can take text using the SimpleTag XML format, or a set of Entities and Relations with associated text.

        :param text: Text in document (plain-text, or SimpleTag)
        :param entities: Entities in document (must be None when loadFromSimpleTag is True)
        :param relations: Relations in document (must be None when loadFromSimpleTag is True)
        :param relationsUseSourceIDs: If True (and not loading from SimpleTag), the entity IDs in the given relations are treated as source entity IDs and translated using getSourceEntityIDsToEntityIDs()
        :param sourceFilename: Stored on the document as sourceFilename
        :param metadata: IDs and other information associated with the source (e.g. PMID). A new empty dictionary is used when omitted
        :param loadFromSimpleTag: Assumes the text parameter is in the SimpleTag format and will extract entities and relations accordingly
        :type text: str
        :type entities: list of kindred.Entity
        :type relations: list of kindred.Relation
        :type relationsUseSourceIDs: bool
        :type sourceFilename: str
        :type metadata: dict
        :type loadFromSimpleTag: bool
        """

        self.sourceFilename = sourceFilename
        # A fresh dict per instance: a shared `{}` default would make every
        # document created without metadata see the others' changes.
        self.metadata = {} if metadata is None else metadata

        if loadFromSimpleTag:
            assert entities is None and relations is None, 'Entities and relations will be extracted from SimpleTag. They cannot also be passed in as parameters'

            dataToCopy = kindred.loadFunctions.parseSimpleTag(text)
            self.text = dataToCopy.getText()
            self.entities = dataToCopy.getEntities()
            self.relations = dataToCopy.getRelations()
        else:
            self.text = text

            if entities is None:
                self.entities = []
            else:
                assert isinstance(entities, list)
                for e in entities:
                    assert isinstance(e, kindred.Entity)
                self.entities = entities

            if relations is None:
                self.relations = []
            else:
                assert isinstance(relations, list)
                for r in relations:
                    assert isinstance(r, kindred.Relation)
                self.relations = relations

        # We'll need to translate source IDs to internal IDs
        if relationsUseSourceIDs and not loadFromSimpleTag:
            sourceEntityIDsToEntityIDs = self.getSourceEntityIDsToEntityIDs()
            correctedRelations = []
            for r in self.relations:
                for e in r.entityIDs:
                    assert e in sourceEntityIDsToEntityIDs, "Entities in relation must occur in the associated text. %s does not" % e
                relationEntityIDs = [
                    sourceEntityIDsToEntityIDs[e] for e in r.entityIDs
                ]
                correctedR = kindred.Relation(r.relationType,
                                              relationEntityIDs, r.argNames)
                correctedRelations.append(correctedR)

            self.relations = correctedRelations

        # Sentences start empty; this constructor does not parse the text
        self.sentences = []