示例#1
0
def rebuilt2xmi(ci, output_dir, typesystem_path):
    """
    Serialize a rebuilt ContentItem as an Apache UIMA CAS in XMI format.

    The file is written into ``output_dir`` and is named after the content
    item's ID with the `.xmi` extension appended.

    :param ci: the content item to be converted
    :type ci: `impresso_commons.classes.ContentItem`
    :param output_dir: the path to the output directory
    :type output_dir: str
    :param typesystem_path: TypeSystem file containing definitions of the
    annotation layers.
    :type typesystem_path: str
    """
    type_system = TypeSystemFactory().readTypeSystem(typesystem_path)
    cas = CAS(type_system)
    cas.documentText = ci.fulltext
    cas.sofaMimeType = 'text'

    sentence_type = 'de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence'
    image_link_type = 'webanno.custom.ImpressoImages'

    # One sentence annotation per entry of `ci.lines`: each annotation begins
    # where the previous one ended (the first at offset 0) and ends at the
    # entry's offset.
    previous_break = 0
    for line_break in ci.lines:
        span = {'begin': previous_break, 'end': line_break}
        cas.addToIndex(cas.createAnnotation(sentence_type, span))
        previous_break = line_break

    # One image-link annotation per (link, start, end) triple.
    for iiif_link, begin, end in compute_image_links(ci):
        link_features = {'begin': begin, 'end': end, 'link': iiif_link}
        cas.addToIndex(cas.createAnnotation(image_link_type, link_features))

    writer = XmiWriter()
    writer.write(cas, os.path.join(output_dir, f'{ci.id}.xmi'))
 def buildCAS(self, xmifilepath, typefilepath):
     """
     Build a CAS from an XMI file and a type system file.

     :param xmifilepath: path of the XMI file handed to the XMI parser
     :param typefilepath: path of the type system file
     :return: the result of the private ``__build`` helper for the new CAS
         and the configured XMI parser
     """
     # NOTE(review): readTypeSystem is called through the class with this
     # object as `self`; presumably it does not rely on instance state --
     # confirm against TypeSystemFactory.
     cas = CAS(TypeSystemFactory.readTypeSystem(self, typefilepath))
     # Parser used to fetch the elements from the XMI file.
     xmi_parser = CasXmiParser()
     xmi_parser.setXmiAsFile(xmifilepath)
     return self.__build(cas, xmi_parser)
 def buildCASfromStrings(self, xmistring, typesysstemString):
     """
     Build a CAS from XMI content and a type system, both given as strings.

     :param xmistring: XMI content handed to the XMI parser as a string
     :param typesysstemString: type system definition as a string
     :return: the result of the private ``__build`` helper for the new CAS
         and the configured XMI parser
     """
     # NOTE(review): readTypeSystemString is called through the class with
     # this object as `self`; presumably it does not rely on instance
     # state -- confirm against TypeSystemFactory.
     cas = CAS(TypeSystemFactory.readTypeSystemString(self, typesysstemString))
     # Parser used to fetch the elements from the XMI string.
     xmi_parser = CasXmiParser()
     xmi_parser.setXmiAsString(xmistring)
     return self.__build(cas, xmi_parser)
示例#4
0
    def test_Cas(self):
        """
        Exercise feature-structure creation, indexing and Sofa bookkeeping.

        Covers: creating empty and populated feature structures, the errors
        raised for invalid FS ids or feature arguments, the FS ids of the
        indexed Sofa/Sentence/Token structures, the Sofa attributes, and
        covered-text retrieval for a token.

        The order of the ``cas`` calls matters: the FS ids asserted further
        down depend on the sequence in which structures are created.
        """
        typeSystemFilePath = 'typesystem.xml'
        typesystem = TypeSystemFactory.readTypeSystem(self, typeSystemFilePath)
        cas = CAS(typesystem)
        cas.documentText = 'These steps install the basis system requirements'
        cas.sofaMimeType = 'text'

        sofaType = 'uima.cas.Sofa'
        sentenceType = 'de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence'
        tokenType = 'de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token'
        posType = 'de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS'
        tagDescType = 'de.tudarmstadt.ukp.dkpro.core.api.metadata.type.TagDescription'
        tagSetDescType = 'de.tudarmstadt.ukp.dkpro.core.api.metadata.type.TagsetDescription'

        # A feature structure created without features has no feature values.
        fsSentence1 = cas.createFS(sentenceType)
        cas.addToIndex(fsSentence1)
        self.assertEqual(fsSentence1.getFeatureValsAsDictList(), [])

        # FS id 1 is already occupied by the auto-generated Sofa FS.
        with self.assertRaises(ValueError):
            cas.createFS(sentenceType, {'begin': 10, 'end': 20}, 1)

        fsPOS = cas.createFS(posType, {'PosValue': 'NN'}, 3)
        cas.addToIndex(fsPOS)
        fsPOS.PosValue = 'Noun'

        # Features must be given as a dictionary in the second argument.
        with self.assertRaises(TypeError):
            cas.createAnnotation(posType, 2)
        # 3 is already in use as an FS id; the user cannot pick a lower one.
        with self.assertRaises(ValueError):
            cas.createAnnotation(posType, {'PosValue': 'NN'}, 2)
        # An annotation needs both begin and end.
        with self.assertRaises(ValueError):
            cas.createAnnotation(posType, {'begin': 10, 'PosValue': 'NN'}, 4)

        # Valid annotation with every feature passed at creation time.
        fsPOS = cas.createAnnotation(posType, {
            'begin': 0,
            'end': 5,
            'PosValue': 'NN'
        })
        cas.addToIndex(fsPOS)

        # Valid annotation whose feature is assigned after creation.
        fsPOS1 = cas.createAnnotation(posType, {'begin': 0, 'end': 5})
        fsPOS1.PosValue = 'NN'
        cas.addToIndex(fsPOS1)

        # Token referencing the POS annotation through its 'pos' feature.
        fsToken1 = cas.createAnnotation(tokenType, {
            'begin': 0,
            'end': 5,
            'pos': fsPOS
        })
        cas.addToIndex(fsToken1)

        # Tag descriptions are created but not indexed themselves; they are
        # only attached to the tagset description below.
        tdlist = [
            cas.createFS(tagDescType, {'name': tag})
            for tag in ('#', '$', '-LRB-')
        ]

        fstagSetDesc = cas.createAnnotation(
            tagSetDescType, {
                'begin': 0,
                'end': 152,
                'layer': posType,
                'name': 'ptb',
                'tags': tdlist
            })
        cas.addToIndex(fstagSetDesc)

        # Check the type <-> FS id correspondence in both directions for the
        # structures whose ids are pinned by this test.
        expectedIdByType = {sofaType: 1, sentenceType: 2, tokenType: 7}
        expectedTypeById = {
            fsId: typeName for typeName, fsId in expectedIdByType.items()
        }
        for fs in cas.getAnnotationIndex():
            if fs.FStype.name in expectedIdByType:
                self.assertEqual(fs.FSid, expectedIdByType[fs.FStype.name])
            if fs.FSid in expectedTypeById:
                self.assertEqual(fs.FStype.name, expectedTypeById[fs.FSid])

        # The Sofa FS mirrors the document-level settings of the CAS.
        self.assertEqual(cas.sofaMimeType, cas.sofaFS.mimeType)
        self.assertEqual(1, cas.sofaFS.sofaNum)
        self.assertEqual('_InitialView', cas.sofaFS.sofaID)
        self.assertEqual(cas.documentText, cas.sofaFS.sofaString)

        # NOTE(review): the expected text carries a trailing space although
        # the token was created with end=5; confirm this matches the offset
        # convention of getCoveredText.
        tokens = cas.getAnnotation(tokenType)
        self.assertEqual(tokens[0].getCoveredText(), "These ")