def test_deserializing_from_string():
    # NOTE(review): this payload is a serialized CAS document (cas:Sofa,
    # cas:View, annotations), NOT a <typeSystemDescription>. Passing it to
    # load_typesystem() looks like a copy-paste from a CAS-deserialization
    # test — confirm whether this is intentional (e.g. a smoke test that
    # parsing does not raise) or should call load_cas_from_xmi instead.
    cas_xmi = '''<?xml version="1.0" encoding="UTF-8"?>
    <xmi:XMI xmlns:tcas="http:///uima/tcas.ecore" xmlns:xmi="http://www.omg.org/XMI" xmlns:cas="http:///uima/cas.ecore" xmlns:cassis="http:///cassis.ecore" xmi:version="2.0">
        <cas:NULL xmi:id="0"/>
        <tcas:DocumentAnnotation xmi:id="8" sofa="1" begin="0" end="47" language="x-unspecified"/>
        <cassis:Sentence xmi:id="79" sofa="1" begin="0" end="26" id="0"/>
        <cassis:Sentence xmi:id="84" sofa="1" begin="27" end="47" id="1"/>
        <cas:Sofa xmi:id="1" sofaNum="1" sofaID="mySofa" mimeType="text/plain" sofaString="Joe waited for the train . The train was late ."/>
        <cas:View sofa="1" members="8 13 19 25 31 37 43 49 55 61 67 73 79 84"/>
    </xmi:XMI>
    '''
    load_typesystem(cas_xmi)
def test_deserializing_small_typesystem(small_typesystem_xml):
    """Deserializing the small type system yields the three expected types."""
    typesystem = load_typesystem(small_typesystem_xml)
    assert len(typesystem) == 3

    # (type name, expected features); every type inherits from uima.tcas.Annotation.
    expected_types = [
        ('uima.tcas.DocumentAnnotation', [Feature('language', '', 'uima.cas.String')]),
        (
            'cassis.Token',
            [
                Feature('id', '', 'uima.cas.Integer'),
                Feature('pos', '', 'uima.cas.String'),
            ],
        ),
        ('cassis.Sentence', [Feature('id', '', 'uima.cas.Integer')]),
    ]

    for type_name, features in expected_types:
        expected = Type(type_name, '', 'uima.tcas.Annotation', features)
        assert typesystem.get_type(type_name) == expected
def test_deserializing_small_typesystem(small_typesystem_xml):
    """Check names, supertypes and feature ranges of the deserialized types."""
    typesystem = load_typesystem(small_typesystem_xml)
    assert len(list(typesystem.get_types())) == 2

    def check_annotation_type(type_name, feature_ranges):
        # Every asserted type inherits directly from uima.tcas.Annotation.
        checked = typesystem.get_type(type_name)
        assert checked.name == type_name
        assert checked.supertypeName == "uima.tcas.Annotation"
        for feature_name, range_name in feature_ranges.items():
            feature = checked.get_feature(feature_name)
            assert feature.name == feature_name
            assert feature.rangeTypeName == range_name

    check_annotation_type("uima.tcas.DocumentAnnotation", {"language": "uima.cas.String"})
    check_annotation_type("cassis.Token", {"id": "uima.cas.Integer", "pos": "uima.cas.String"})
    check_annotation_type("cassis.Sentence", {"id": "uima.cas.Integer"})
def test_send_single_cas_from_python_to_ruta(self, notebook):
    """Load a CAS in a SoS cell, %get it in a Ruta cell, compare document text."""
    # Step 1: resource locations (local name fixes the original 'typesytem' typo)
    typesystem_file = os.path.join(TEST_RESOURCE_DIR, "TypeSystem.xml")
    cas_file = os.path.join(TEST_RESOURCE_DIR, "example.xmi")

    # Step 2: build a local reference CAS for the comparison below
    with open(typesystem_file, 'rb') as f:
        typesystem = cassis.load_typesystem(f)
    with open(cas_file, 'rb') as f:
        cas = cassis.load_cas_from_xmi(f, typesystem=typesystem)

    # Step 3: load the same CAS inside a SoS notebook cell
    cas_init_expr = f"""
import cassis
with open("{typesystem_file}", 'rb') as f:
    typesystem = cassis.load_typesystem(f)
with open("{cas_file}", 'rb') as f:
    cas_var = cassis.load_cas_from_xmi(f, typesystem=typesystem)
"""
    notebook.call(cas_init_expr, kernel=SOS_KERNEL_NAME)

    # Step 4: pull the CAS into the Ruta kernel and render its sofa
    notebook.call("%get cas_var", kernel=RUTA_KERNEL_NAME)
    actual_sofa = notebook.check_output("%displayMode RUTA_COLORING", kernel=RUTA_KERNEL_NAME)
    expected_sofa = cas.sofa_string

    # Step 5: compare alphabetic characters only, ignoring whitespace/markup noise
    assert list(filter(str.isalpha, actual_sofa)) == list(filter(str.isalpha, expected_sofa))
def __init__(
        self,
        args='object',
        xmi_string=None,
        text=None,
        cas_path=None,
        type_system_path='../pydkpro/typesystems/temp_TypeSytems.xml',
        token_type='de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token'):
    """Initialize a CAS wrapper.

    :param args: either the literal 'object' or a cassis.TypeSystem instance
    :param xmi_string: optional serialized CAS to deserialize
    :param text: token list used as the default demo text; defaults to the
        Backgammon sentence when omitted
    :param cas_path: optional path to an XMI file to load the CAS from
    :param type_system_path: path of the type system XML read at construction
    :param token_type: fully qualified name of the DKPro token type
    """
    # Fix: the original used a mutable default argument (text=[...]), which
    # is shared across all calls; apply the default inside the body instead.
    if text is None:
        text = [
            'Backgammon', 'is', 'one', 'of', 'the', 'oldest', 'known',
            'board', 'games', '.'
        ]
    self.args = args
    self.text = text
    self.cas_path = cas_path
    self.type_system_path = type_system_path
    self.token_type = token_type
    self.token_list = []

    with open(self.type_system_path, 'rb') as f:
        self.typesystem = load_typesystem(f)

    # NOTE(review): the nesting below is reconstructed from whitespace-mangled
    # source — confirm that xmi_string is only honored when args is a
    # TypeSystem, and that the empty-sofa setup applies only to the fallback.
    if cas_path:
        with open(self.cas_path, 'rb') as f:
            self.cas = load_cas_from_xmi(f, typesystem=load_dkpro_core_typesystem())
    elif isinstance(self.args, cassis.TypeSystem):
        self.cas = cs(typesystem=self.args)
        if xmi_string:
            self.cas = load_cas_from_xmi(xmi_string, typesystem=load_dkpro_core_typesystem())
    else:
        self.cas = cs(typesystem=self.typesystem)
        self.cas.sofa_mime = "text/plain"
        self.cas.sofa_string = ""
def test_type_can_retrieve_children(typesystem_with_inheritance_xml):
    """cassis.Child reports exactly its one direct subtype."""
    typesystem = load_typesystem(typesystem_with_inheritance_xml)
    child_type = typesystem.get_type("cassis.Child")
    assert [c.name for c in child_type.children] == ["cassis.GrandChild"]
def test_serializing_typesystem_to_file(tmpdir, typesystem_xml):
    """A type system written to disk round-trips XML-equal to the input."""
    typesystem = load_typesystem(typesystem_xml)
    out_path = str(tmpdir.join("typesystem.xml"))

    typesystem.to_xml(out_path)

    with open(out_path, "rb") as serialized:
        assert_xml_equal(serialized, typesystem_xml)
def convert_stuff():
    """Load the Obama user-study type system and XMI, then featurize the CAS."""
    typesystem_file = PATH_GENERATED + "/userstudy/obama/TypeSystem.xml"
    xmi_file = PATH_GENERATED + "/userstudy/obama/Wikipedia-Obama.xmi"

    with open(typesystem_file, "rb") as f:
        typesystem = load_typesystem(f)
    with open(xmi_file, "rb") as f:
        cas = load_cas_from_xmi(f, typesystem)

    featurize_cas(cas)
def test_type_can_create_instance_with_deeply_inherited_fields(typesystem_with_inheritance_xml):
    """Deeply inherited features reach the bottom of the hierarchy."""
    # https://github.com/dkpro/dkpro-cassis/issues/97
    typesystem = load_typesystem(typesystem_with_inheritance_xml)
    deepest = typesystem.get_type("cassis.GrandGrandGrandChild")
    for feature_name in ("parentFeature", "childFeature"):
        assert feature_name in deepest._inherited_features
def test_that_typesystem_with_redefined_documentation_annotation_works(
    typesystem_with_redefined_documentannotation_xml,
):
    """A redefined DocumentAnnotation must serialize back to the same XML."""
    typesystem = load_typesystem(typesystem_with_redefined_documentannotation_xml)
    serialized = typesystem.to_xml()
    assert_xml_equal(serialized, typesystem_with_redefined_documentannotation_xml)
def test_serializing_small_typesystem_to_file(tmpdir, small_typesystem_xml):
    """Writing to a file handle round-trips XML-equal to the input."""
    typesystem = load_typesystem(small_typesystem_xml)
    target = tmpdir.join('typesystem.xml')

    with open(target, 'wb') as out:
        typesystem.to_xml(out)

    with open(target, 'rb') as serialized:
        assert_xml_equal(serialized.read(), small_typesystem_xml.encode('utf-8'))
def test_is_instance_of(child_name: str, parent_name: str, expected: bool):
    """is_instance_of follows the DKPro type hierarchy as parameterized."""
    # We cannot use fixtures and parameterize at the same time, so we
    # manually load the type system
    fixture_path = os.path.join(FIXTURE_DIR, "typesystems", "important_dkpro_types.xml")
    with open(fixture_path, "r") as f:
        typesystem = load_typesystem(f.read())

    assert typesystem.is_instance_of(child_name, parent_name) == expected
def test_type_can_retrieve_descendants(typesystem_with_inheritance_xml):
    """Descendants include the type itself plus all transitive subtypes, in order."""
    typesystem = load_typesystem(typesystem_with_inheritance_xml)
    child_type = typesystem.get_type("cassis.Child")

    expected = [
        "cassis.Child",
        "cassis.GrandChild",
        "cassis.GrandGrandChild",
        "cassis.GrandGrandGrandChild",
    ]
    assert [d.name for d in child_type.descendants] == expected
def documents(self) -> List["TrainingDocument"]:
    """Deserialize all training documents from the stored JSON payload.

    Parsed lazily: when already training, the documents may never be needed.
    """
    typesystem = load_typesystem(self._typesystem_xml)
    return [
        TrainingDocument(
            load_cas_from_xmi(entry["xmi"], typesystem),
            entry["documentId"],
            entry["userId"],
        )
        for entry in self._documents_json
    ]
def rebuilt2xmi(ci, output_dir, typesystem_path, iiif_mappings, pct_coordinates=False) -> str:
    """
    Converts a rebuilt ContentItem into Apache UIMA/XMI format.

    The resulting file will be named after the content item's ID, adding
    the `.xmi` extension.

    :param ci: the content item to be converted
    :type ci: `impresso_commons.classes.ContentItem`
    :param output_dir: the path to the output directory
    :type output_dir: str
    :param typesystem_path: TypeSystem file containing defitions of annotation layers.
    :type typesystem_path: str
    :param iiif_mappings: mapping used by `compute_image_links` to resolve
        IIIF image links (schema defined by that helper — see its docs)
    :param pct_coordinates: forwarded to `compute_image_links` as `pct`
    :return: path of the written `.xmi` file
    """
    with open(typesystem_path, "rb") as f:
        typesystem = load_typesystem(f)

    cas = Cas(typesystem=typesystem)
    cas.sofa_string = ci.fulltext
    cas.sofa_mime = 'text/plain'
    sentType = 'de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence'
    imgLinkType = 'webanno.custom.ImpressoImages'
    Sentence = typesystem.get_type(sentType)
    ImageLink = typesystem.get_type(imgLinkType)

    # create sentence-level annotations: ci.lines holds successive break
    # offsets, so each annotation spans [previous break, current break)
    start_offset = 0
    for break_offset in ci.lines:
        start = start_offset
        end = break_offset
        start_offset = break_offset
        cas.add_annotation(Sentence(begin=start, end=end))

    iiif_links = compute_image_links(ci, iiif_links=iiif_mappings, pct=pct_coordinates)

    # inject the IIIF links into the CAS as ImageLink annotations
    for iiif_link, start, end in iiif_links:
        cas.add_annotation(ImageLink(begin=start, end=end, link=iiif_link))

    outfile_path = os.path.join(output_dir, f'{ci.id}.xmi')
    cas.to_xmi(outfile_path, pretty_print=True)
    return outfile_path
def parse_prediction_request(json_object: JsonDict) -> PredictionRequest:
    """Deserialize a prediction request: CAS from XMI plus its metadata fields."""
    metadata = json_object["metadata"]
    document = json_object["document"]

    typesystem = load_typesystem(json_object["typeSystem"])
    cas = load_cas_from_xmi(document["xmi"], typesystem)

    return PredictionRequest(
        cas,
        metadata["layer"],
        metadata["feature"],
        metadata["projectId"],
        document["documentId"],
        document["userId"],
    )
def load_isaac_ts() -> TypeSystem: dkpro_ts = load_dkpro_core_typesystem() # https://stackoverflow.com/a/20885799 try: import importlib.resources as pkg_resources except ImportError: # Try backported to PY<37 `importlib_resources`. import importlib_resources as pkg_resources from . import resources with pkg_resources.open_binary(resources, ISAAC_TYPESYSTEM_FILE) as f: typesystem = load_typesystem(f) final_ts = merge_typesystems(dkpro_ts, typesystem) return final_ts
def put_vars(self, items, to_kernel=None):
    """
    Functionality to transfer CAS objects from the IRuta kernel to the SoS
    (Python) kernel. This function is called when a user invokes the line
    magic %put or %with.

    :param items: list with exactly one variable name to publish
    :param to_kernel: unused here; part of the SoS put_vars interface
    :return: dict mapping the requested variable name to the loaded CAS
    :raises Exception: when not exactly one variable name is given
    """
    if len(items) != 1:
        raise Exception(
            "%put takes exactly one variable name as argument. ")
    var_name = items[0]

    # delete=False so Ruta can (re)write the files by path; cleanup is manual below.
    temp_directory = tempfile.TemporaryDirectory()
    temp_typesystem_file = tempfile.NamedTemporaryFile(
        suffix=".xml", dir=temp_directory.name, delete=False)
    # Forward slashes keep the paths usable inside the Ruta magic on Windows.
    temp_typesystem_file_path = os.path.normpath(
        temp_typesystem_file.name).replace('\\', "/")
    temp_xmi_file = tempfile.NamedTemporaryFile(suffix=".xmi",
                                                dir=temp_directory.name,
                                                delete=False)
    temp_xmi_file_path = os.path.normpath(temp_xmi_file.name).replace(
        '\\', "/")

    # Step 1: Writing CAS and TypeSystem to disk with Ruta
    cmd_transfer_var = f"%displayMode NONE\n" \
                       f"%saveTypeSystem {temp_typesystem_file_path}\n" \
                       f"%saveCas {temp_xmi_file_path}"
    env.log_to_file('KERNEL', f'Executing "{cmd_transfer_var}"')
    self.ruta_kernel.run_cell(cmd_transfer_var,
                              silent=True,
                              store_history=False,
                              on_error='Failed to write UIMA CAS to disk.')

    # Step 2: Reading CAS and TypeSystem from disk with python/cassis
    # NOTE(review): this reads from the still-open NamedTemporaryFile handles
    # after another process wrote to the same paths — presumably the handle
    # position is still 0 so the new content is picked up; verify on Windows,
    # where an open NamedTemporaryFile cannot normally be reopened/shared.
    typesystem = cassis.load_typesystem(temp_typesystem_file)
    cas = cassis.load_cas_from_xmi(temp_xmi_file, typesystem=typesystem)

    # Step 3: Clean-up temp files
    temp_typesystem_file.close()
    temp_xmi_file.close()
    temp_directory.cleanup()

    return {var_name: cas}
def file_to_cas(self, filepath):
    """Convert a text file to a CAS via the bundled DKPro text-to-XMI jar.

    Runs the standalone jar on *filepath*, loads the produced
    ``<filepath>.xmi`` with the generated type system into ``self.cas``
    and ``self.typesystem``, then removes the intermediate XMI file.

    :param filepath: path of the input text file
    :return: self (fluent interface)
    """
    # TODO below code is implemented for pydkpro purpose only
    in_text = filepath
    ts_xml = 'pydkpro/typesystems/temp_TypeSytems_textToXMI.xml'
    log_path = 'pydkpro/test_data/textToXMI.log'
    cmd = shlex.split(
        "java -jar pydkpro/pydkpro-0.0.1-SNAPSHOT-standalone_textXMI.jar %s %s %s"
        % (in_text, os.path.dirname(in_text), ts_xml))

    # Remove any stale output so we never load a leftover from a previous run.
    if os.path.exists(in_text + '.xmi'):
        os.remove(in_text + '.xmi')

    # Idiom fix: subprocess.run waits for completion, replacing the manual
    # Popen + wait pair; stdout/stderr are still captured in the log file.
    with codecs.open(log_path, 'w', 'utf-8') as f:
        subprocess.run(cmd, stdout=f, stderr=f)

    with open(ts_xml, 'rb') as f:
        self.typesystem = load_typesystem(f)
    with open(in_text + '.xmi', 'rb') as f:
        self.cas = load_cas_from_xmi(f, typesystem=self.typesystem)
    os.remove(in_text + '.xmi')
    return self
def test_that_merging_incompatible_typesystem_throws(
        name, rangeTypeName, elementType, multipleReferencesAllowed):
    """Merging an incompatible array/list feature must raise ValueError."""
    with open(typesystem_merge_base_path(), "r") as f:
        base = load_typesystem(f.read())

    other = TypeSystem()
    array_holder = other.create_type("test.ArraysAndListsWithElementTypes",
                                     supertypeName="uima.cas.TOP")
    other.add_feature(
        type_=array_holder,
        name=name,
        rangeTypeName=rangeTypeName,
        elementType=elementType,
        multipleReferencesAllowed=multipleReferencesAllowed,
    )

    # Silence the redefinition warnings; only the ValueError matters here.
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=UserWarning)
        with pytest.raises(ValueError, match=r".*\[{0}\].*".format(name)):
            merge_typesystems(base, other)
def test_deserializing_small_typesystem(small_typesystem_xml):
    """Check types, feature ranges and multipleReferencesAllowed flags."""
    typesystem = load_typesystem(small_typesystem_xml)

    # There are two types in the type system and we implicitly
    # define DocumentAnnotation
    assert len(list(typesystem.get_types())) == 3

    def check_feature(owner, feature_name, range_name, multi_refs=None):
        feature = owner.get_feature(feature_name)
        assert feature.name == feature_name
        assert feature.rangeTypeName == range_name
        if multi_refs is not None:
            assert feature.multipleReferencesAllowed is multi_refs

    # Assert annotation type
    annotation_type = typesystem.get_type("uima.tcas.DocumentAnnotation")
    assert annotation_type.name == "uima.tcas.DocumentAnnotation"
    assert annotation_type.supertypeName == "uima.tcas.Annotation"
    check_feature(annotation_type, "language", "uima.cas.String")

    # Assert token type
    token_type = typesystem.get_type("cassis.Token")
    assert token_type.name == "cassis.Token"
    assert token_type.supertypeName == "uima.tcas.Annotation"
    check_feature(token_type, "id", "uima.cas.Integer")
    check_feature(token_type, "pos", "uima.cas.String", multi_refs=True)

    # Assert sentence type
    sentence_type = typesystem.get_type("cassis.Sentence")
    assert sentence_type.name == "cassis.Sentence"
    assert sentence_type.supertypeName == "uima.tcas.Annotation"
    check_feature(sentence_type, "id", "uima.cas.Integer", multi_refs=False)
def test_that_merging_compatible_typesystem_works(name, rangeTypeName,
                                                  elementType,
                                                  multipleReferencesAllowed):
    """Merging a compatible array/list feature succeeds and keeps the type."""
    with open(typesystem_merge_base_path(), "r") as f:
        base = load_typesystem(f.read())

    other = TypeSystem()
    array_holder = other.create_type("test.ArraysAndListsWithElementTypes",
                                     supertypeName="uima.cas.TOP")
    other.add_feature(
        type_=array_holder,
        name=name,
        rangeTypeName=rangeTypeName,
        elementType=elementType,
        multipleReferencesAllowed=multipleReferencesAllowed,
    )

    # Silence the redefinition warnings; we only care that merging succeeds.
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=UserWarning)
        merged = merge_typesystems(base, other)
        assert merged.contains_type("test.ArraysAndListsWithElementTypes")
def test_send_single_cas_from_ruta_to_python(self, notebook):
    """Load a CAS in Ruta, %put it into SoS, compare the document text."""
    # Step 1: resource locations
    typesystem_file = os.path.join(TEST_RESOURCE_DIR, "TypeSystem.xml")
    cas_file = os.path.join(TEST_RESOURCE_DIR, "example.xmi")

    # Step 2: build a local reference CAS for the comparison below
    with open(typesystem_file, 'rb') as f:
        typesystem = cassis.load_typesystem(f)
    with open(cas_file, 'rb') as f:
        cas = cassis.load_cas_from_xmi(f, typesystem=typesystem)

    # Step 3: load the same CAS inside the Ruta kernel
    cas_init_expr = f"%displayMode NONE\n" \
                    f"%loadCas {cas_file}\n" \
                    f"%loadTypeSystem {typesystem_file}"
    notebook.call(cas_init_expr, kernel=RUTA_KERNEL_NAME)

    # Step 4: hand the CAS over to the SoS kernel via %put
    notebook.call("%put modified_cas", kernel=RUTA_KERNEL_NAME)

    # Step 5: the transferred sofa must match the local reference
    actual_sofa = notebook.check_output("print(modified_cas.sofa_string)",
                                        kernel=SOS_KERNEL_NAME)
    expected_sofa = cas.sofa_string.strip()
    assert actual_sofa == expected_sofa
def test_serializing_small_typesystem_to_string(small_typesystem_xml):
    """Serializing to an XML string is equal to the original document."""
    serialized = load_typesystem(small_typesystem_xml).to_xml()
    assert_xml_equal(serialized, small_typesystem_xml.encode('utf-8'))
def test_deserializing_from_file(typesystem_path):
    # Smoke test: parsing from a binary file handle must not raise.
    with open(typesystem_path, "rb") as typesystem_file:
        load_typesystem(typesystem_file)
def __init__(self):
    """Load the bundled DKPro Core type system from disk."""
    dkpro_types_path = '../pydkpro/typesystems/dkpro-core-types.xml'
    with open(dkpro_types_path, 'rb') as f:
        self.typesystem = load_typesystem(f)
def test_deserializing_from_string(typesystem_xml):
    # Smoke test: parsing the type-system XML string must not raise.
    load_typesystem(typesystem_xml)
def test_serializing_typesystem_to_string(typesystem_xml):
    """Round-trip: XML serialization equals the source document."""
    serialized = load_typesystem(typesystem_xml).to_xml()
    assert_xml_equal(serialized, typesystem_xml)
def test_that_typesystem_with_child_redefining_type_same_warns():
    """Re-declaring a type identically is tolerated but warns."""
    fixture = os.path.join(FIXTURE_DIR, "typesystems",
                           "typesystem_with_inheritance_redefined_same.xml")
    with pytest.warns(UserWarning):
        with open(fixture, "rb") as f:
            load_typesystem(f)
def test_that_typesystem_with_child_redefining_type_differently_throws():
    """Re-declaring a type with a conflicting definition must raise."""
    fixture = os.path.join(FIXTURE_DIR, "typesystems",
                           "typesystem_with_inheritance_redefined_different.xml")
    with pytest.raises(ValueError):
        with open(fixture, "rb") as f:
            load_typesystem(f)