def rebuilt2xmi(ci, output_dir, typesystem_path):
    """Serialize a rebuilt ContentItem as an Apache UIMA/XMI file.

    The file is written into ``output_dir`` and is named after the content
    item's ID with the ``.xmi`` extension appended.

    :param ci: the content item to be converted
    :type ci: `impresso_commons.classes.ContentItem`
    :param output_dir: the path to the output directory
    :type output_dir: str
    :param typesystem_path: TypeSystem file containing definitions of
        annotation layers.
    :type typesystem_path: str
    """
    sentence_type = 'de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence'
    image_link_type = 'webanno.custom.ImpressoImages'

    # The CAS is built on the type system read from disk and carries the
    # content item's full text as its document text.
    cas = CAS(TypeSystemFactory().readTypeSystem(typesystem_path))
    cas.documentText = ci.fulltext
    cas.sofaMimeType = 'text'

    # Sentence-level annotations: every offset in ``ci.lines`` closes a span
    # that begins where the previous span ended (the first begins at 0).
    previous_break = 0
    for line_break in ci.lines:
        span = {'begin': previous_break, 'end': line_break}
        previous_break = line_break
        cas.addToIndex(cas.createAnnotation(sentence_type, span))

    # Image-link annotations, one per (link, begin, end) triple produced by
    # ``compute_image_links``.
    for link, begin, end in compute_image_links(ci):
        features = {'begin': begin, 'end': end, 'link': link}
        cas.addToIndex(cas.createAnnotation(image_link_type, features))

    XmiWriter().write(cas, os.path.join(output_dir, f'{ci.id}.xmi'))
def buildCAS(self, xmifilepath, typefilepath):
    """Build a CAS from an XMI file and a type system file.

    :param xmifilepath: path of the XMI file handed to the XMI parser
    :param typefilepath: path of the type system file
    :return: the result of ``self.__build`` for the new CAS and the parser
    """
    # The type system comes first because the CAS is constructed from it.
    type_system = TypeSystemFactory.readTypeSystem(self, typefilepath)
    new_cas = CAS(type_system)

    # The XMI parser is pointed at the file whose elements will be fetched.
    xmi_parser = CasXmiParser()
    xmi_parser.setXmiAsFile(xmifilepath)

    return self.__build(new_cas, xmi_parser)
def buildCASfromStrings(self, xmistring, typesysstemString):
    """Build a CAS from an XMI string and a type system string.

    :param xmistring: XMI content handed to the XMI parser as a string
    :param typesysstemString: type system definition as a string
    :return: the result of ``self.__build`` for the new CAS and the parser
    """
    # The type system comes first because the CAS is constructed from it.
    type_system = TypeSystemFactory.readTypeSystemString(self, typesysstemString)
    new_cas = CAS(type_system)

    # The XMI parser is given the string whose elements will be fetched.
    xmi_parser = CasXmiParser()
    xmi_parser.setXmiAsString(xmistring)

    return self.__build(new_cas, xmi_parser)
def test_Cas(self):
    """Exercise FS creation, FS id validation, indexing and the sofa of a CAS.

    The order of the ``createFS`` / ``createAnnotation`` calls matters: the
    FS ids asserted further down (2 for the sentence, 7 for the token) are
    the ids the CAS hands out for this exact sequence.
    """
    type_system_path = 'typesystem.xml'
    type_system = TypeSystemFactory.readTypeSystem(self, type_system_path)
    cas = CAS(type_system)
    cas.documentText = 'These steps install the basis system requirements'
    cas.sofaMimeType = 'text'

    sentence_type = 'de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence'
    token_type = 'de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token'
    pos_type = 'de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS'
    tag_desc_type = 'de.tudarmstadt.ukp.dkpro.core.api.metadata.type.TagDescription'
    tagset_desc_type = 'de.tudarmstadt.ukp.dkpro.core.api.metadata.type.TagsetDescription'

    # An FS created without features reports an empty feature list.
    sentence_fs = cas.createFS(sentence_type)
    cas.addToIndex(sentence_fs)
    self.assertEqual(sentence_fs.getFeatureValsAsDictList(), [])

    # FS id 1 is already occupied by the auto-generated Sofa FS.
    self.assertEqual('', '')  # no-op placeholder assertion (always passes)
    with self.assertRaises(ValueError):
        cas.createFS(sentence_type, {'begin': 10, 'end': 20}, 1)

    pos_fs = cas.createFS(pos_type, {'PosValue': 'NN'}, 3)
    cas.addToIndex(pos_fs)
    pos_fs.PosValue = 'Noun'

    # The features must be passed as a dictionary in the second argument.
    self.assertEqual('', '')  # no-op placeholder assertion (always passes)
    with self.assertRaises(TypeError):
        cas.createAnnotation(pos_type, 2)

    # 3 is already set as an FS id, so the user cannot pick a lower one.
    self.assertEqual('', '')  # no-op placeholder assertion (always passes)
    with self.assertRaises(ValueError):
        cas.createAnnotation(pos_type, {'PosValue': 'NN'}, 2)

    # An annotation needs both 'begin' and 'end'.
    self.assertEqual('', '')  # no-op placeholder assertion (always passes)
    with self.assertRaises(ValueError):
        cas.createAnnotation(pos_type, {'begin': 10, 'PosValue': 'NN'}, 4)

    # A valid annotation FS, added to the index.
    pos_fs = cas.createAnnotation(
        pos_type, {'begin': 0, 'end': 5, 'PosValue': 'NN'})
    cas.addToIndex(pos_fs)

    # Same span, with the feature assigned after creation.
    second_pos_fs = cas.createAnnotation(pos_type, {'begin': 0, 'end': 5})
    second_pos_fs.PosValue = 'NN'
    cas.addToIndex(second_pos_fs)

    # A token whose 'pos' feature references the indexed POS annotation.
    token_fs = cas.createAnnotation(
        token_type, {'begin': 0, 'end': 5, 'pos': pos_fs})
    cas.addToIndex(token_fs)

    # Tag descriptions are created in this order and attached to the tagset.
    tag_descriptions = [
        cas.createFS(tag_desc_type, {'name': tag_name})
        for tag_name in ('#', '$', '-LRB-')
    ]
    tagset_fs = cas.createAnnotation(
        tagset_desc_type,
        {
            'begin': 0,
            'end': 152,
            'layer': pos_type,
            'name': 'ptb',
            'tags': tag_descriptions,
        })
    cas.addToIndex(tagset_fs)

    # (type name, FS id) pairs that must agree in both directions.
    expected_pairs = (
        ("uima.cas.Sofa", 1),
        (sentence_type, 2),
        (token_type, 7),
    )
    for indexed_fs in cas.getAnnotationIndex():
        # Check the FS id for the known types.
        for type_name, fs_id in expected_pairs:
            if indexed_fs.FStype.name == type_name:
                self.assertEqual(indexed_fs.FSid, fs_id)
        # Check the FS type for the known ids.
        for type_name, fs_id in expected_pairs:
            if indexed_fs.FSid == fs_id:
                self.assertEqual(indexed_fs.FStype.name, type_name)

    # The sofa FS mirrors the values set on the CAS.
    sofa = cas.sofaFS
    self.assertEqual(cas.sofaMimeType, sofa.mimeType)
    self.assertEqual(1, cas.sofaFS.sofaNum)
    self.assertEqual('_InitialView', cas.sofaFS.sofaID)
    self.assertEqual(cas.documentText, cas.sofaFS.sofaString)

    token_annotations = cas.getAnnotation(token_type)
    self.assertEqual(token_annotations[0].getCoveredText(), "These ")