def test_anno_from_xml_with_trans_token_mismatch(self):
    """Check strict vs. non-strict handling of the ``her#aws`` token.

    The default importer (the "strict mode" case) must log at least one
    message of level ERROR or higher; with ``strict=False`` the token is
    built exactly as given by the XML.
    """
    anno_element = ET.fromstring("""<token id="t1324" trans="her#aws"> <dipl id="t1324_d1" trans="her#aws" utf="heraws"/> <mod id="t1324_m1" trans="her#aws" utf="heraws" ascii="heraws" checked="y" /> </token>""")

    # strict mode: an error is logged
    with self.assertLogs(None, 'ERROR'):
        create_importer('coraxml', 'anselm')._create_cora_token(
            anno_element, set())

    # non-strict mode: token is created as given by the XML
    expected_token = CoraToken(
        AnselmParser().parse('her#aws'),
        [
            TokDipl(AnselmParser().parse('her#aws', output_type="dipl"),
                    extid='t1324_d1'),
        ],
        [
            TokAnno(AnselmParser().parse('her#aws', output_type="anno"),
                    extid='t1324_m1',
                    checked=True),
        ],
        extid='t1324')
    lenient_importer = create_importer('coraxml', 'anselm', strict=False)
    actual_token = lenient_importer._create_cora_token(anno_element, set())
    # assertEqual, not the deprecated assertEquals alias (removed in 3.12).
    self.assertEqual(expected_token, actual_token)
def test_anno_with_doubled_tags(self):
    """A ``<mod>`` carrying the same ``<pos>`` tag twice must log a warning."""
    doubled_pos_xml = (
        '<mod id="t1_m1" trans="priuilegien" utf="priuilegien" ascii="priuilegien" checked="y">'
        '<pos tag="NA"/><pos tag="NA"/></mod>'
    )
    anno_element = ET.fromstring(doubled_pos_xml)

    # Everything that could emit the log record stays inside the context.
    with self.assertLogs(None, 'WARN'):
        importer = create_importer('coraxml')
        # NOTE(review): the transcription is parsed with output_type="dipl"
        # although it feeds _create_anno_token -- confirm this is intended.
        parsed_trans = PlainParser().parse('priuilegien', output_type="dipl")
        importer._create_anno_token(anno_element, parsed_trans)
def convert(infile, from_, to, parser, strict_parsing, outfile):
    """Import *infile* and write it to *outfile* in another format.

    Args:
        infile: Input passed to the importer's ``import_from_file``.
        from_: Importer format name handed to ``create_importer``.
        to: Exporter format name handed to ``create_exporter``.
        parser: Second positional argument of ``create_importer``.
        strict_parsing: Forwarded to ``create_importer`` as ``strict``.
        outfile: File object that ``click.echo`` writes the result to.

    Raises:
        SystemExit: With status 1 if the importer returns a falsy document.
    """
    importer = create_importer(from_, parser, strict=strict_parsing)
    exporter = create_exporter(to)

    doc = importer.import_from_file(infile)
    if not doc:
        logging.error("Input document invalid")
        # The bare ``exit()`` helper is injected by ``site`` and is not
        # guaranteed to exist (e.g. ``python -S``); raise SystemExit directly.
        raise SystemExit(1)

    outdoc = exporter.export(doc)
    # Serialise structured export results; anything else is echoed as-is.
    if isinstance(outdoc, dict):
        # json
        outdoc = json.dumps(outdoc)
    elif isinstance(outdoc, etree._ElementTree):
        # xml
        outdoc = etree.tostring(
            outdoc,
            xml_declaration=True,
            pretty_print=True,
            encoding="utf-8",
        )
    click.echo(outdoc, file=outfile)
def test_dipl_from_xml(self):
    """A ``<dipl>`` element is turned into the equivalent ``TokDipl``."""
    expected_dipl = TokDipl(PlainParser().parse('test', output_type="dipl"),
                            extid='t1_d1')
    dipl_element = ET.fromstring('<dipl id="t1_d1" trans="test" />')

    actual_dipl = create_importer('coraxml')._create_dipl_token(
        dipl_element, PlainParser().parse('test', output_type="dipl"))
    # assertEqual, not the deprecated assertEquals alias (removed in 3.12).
    self.assertEqual(expected_dipl, actual_dipl)
def test_anno_from_xml_with_transcription_mismatch(self):
    """Check strict vs. non-strict handling of a token with mismatching parts.

    The token's ``trans`` is ``hin#czü|hin(.)`` while its ``<dipl>``/``<mod>``
    children are listed separately.  The default importer (the "strict mode"
    case) must log at least one message of level ERROR or higher; with
    ``strict=False`` the token is built exactly as given by the XML.
    """
    anno_element = ET.fromstring(
        """<token id="t924" trans="hin#czü|hin(.)"> <dipl id="t924_d1" trans="hin#czü|" utf="hinczü"/> <dipl id="t924_d2" trans="hin" utf="hin"/> <mod id="t924_m1" trans="hin#czü|" utf="hinczü" ascii="hinczü" checked="y" /> <mod id="t924_m2" trans="hin" utf="hin" ascii="hin" checked="y" /> <mod id="t924_m3" trans="(.)" utf="." ascii="." checked="y" /> </token>""")

    # strict mode: an error is logged
    with self.assertLogs(None, 'ERROR'):
        create_importer('coraxml', 'anselm')._create_cora_token(
            anno_element, set())

    # non-strict mode: token is created as given by the XML
    expected_token = CoraToken(
        AnselmParser().parse('hin#czü|hin'),
        [
            TokDipl(AnselmParser().parse('hin#czü|', output_type="dipl"),
                    extid='t924_d1'),
            TokDipl(AnselmParser().parse('hin', output_type="dipl"),
                    extid='t924_d2'),
        ],
        [
            TokAnno(AnselmParser().parse('hin#czü|', output_type="anno"),
                    extid='t924_m1',
                    checked=True),
            TokAnno(AnselmParser().parse('hin', output_type="anno"),
                    extid='t924_m2',
                    checked=True),
            TokAnno(AnselmParser().parse('(.)', output_type="anno"),
                    extid='t924_m3',
                    checked=True),
        ],
        extid='t924')
    lenient_importer = create_importer('coraxml', 'anselm', strict=False)
    actual_token = lenient_importer._create_cora_token(anno_element, set())
    # assertEqual, not the deprecated assertEquals alias (removed in 3.12).
    self.assertEqual(expected_token, actual_token)
def test_cora_token_from_xml(self):
    """A ``<token>`` with one dipl and two mods becomes the matching ``CoraToken``."""
    expected_token = CoraToken(
        PlainParser().parse('test|case'),
        [
            TokDipl(PlainParser().parse('test|case', output_type="dipl"),
                    extid='t1_d1'),
        ],
        [
            TokAnno(PlainParser().parse('test|', output_type="anno"),
                    extid='t1_m1',
                    checked=True),
            TokAnno(PlainParser().parse('case', output_type="anno"),
                    extid='t1_m2'),
        ],
        extid='t1')
    token_element = ET.fromstring(
        '<token id="t1" trans="test|case">'
        '<dipl id="t1_d1" trans="test|case" />'
        '<mod id="t1_m1" trans="test|" checked="y" />'
        '<mod id="t1_m2" trans="case" /></token>'
    )

    actual_token = create_importer('coraxml')._create_cora_token(
        token_element, set())
    # assertEqual, not the deprecated assertEquals alias (removed in 3.12).
    self.assertEqual(expected_token, actual_token)
def test_anno_from_xml(self):
    """A fully annotated ``<mod>`` element becomes the matching ``TokAnno``.

    Covers the tag children (lemma, pos, boundary, morph), the
    ``cora-flag`` children and the ``checked`` attribute.
    """
    expected_anno = TokAnno(
        PlainParser().parse('priuilegien', output_type="anno"),
        tags={
            'lemma': 'privileg',
            'pos': 'NA',
            'morph': 'Fem.Dat.Pl',
            'boundary': 'Satz',
        },
        flags={'lemma verified', 'boundary'},
        checked=True,
        extid='t1_m1')
    anno_element = ET.fromstring(
        '<mod id="t1_m1" trans="priuilegien" utf="priuilegien" ascii="priuilegien" checked="y">'
        '<lemma tag="privileg"/><pos tag="NA"/><boundary tag="Satz"/><morph tag="Fem.Dat.Pl"/>'
        '<cora-flag name="lemma verified"/><cora-flag name="boundary"/></mod>'
    )

    actual_anno = create_importer('coraxml')._create_anno_token(
        anno_element, PlainParser().parse('priuilegien', output_type="anno"))
    # assertEqual, not the deprecated assertEquals alias (removed in 3.12).
    self.assertEqual(expected_anno, actual_anno)
from coraxml_utils.importer import create_importer
from coraxml_utils.exporter import create_exporter

if __name__ == "__main__":
    # Command-line entry point: convert a CorA-XML file to TEI.
    description = "Konvertiert eine CorA-XML-Datei ins TEI-Format."
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument("infile", help="Eingabedatei (XML)")
    parser.add_argument("outfile", nargs="?", help="Ausgabedatei (XML)")
    parser.add_argument(
        "-P",
        "--parser",
        choices=["rem", "anselm", "ref", "redi"],
        default="ref",
        help="Token parser to use, default: %(default)s",
    )
    # Unknown extra arguments are tolerated and ignored.
    args, _ = parser.parse_known_args()

    MyImporter = create_importer("coraxml", args.parser)
    MyExporter = create_exporter("tei")

    doc = MyImporter.import_from_file(args.infile)
    tei_doc = MyExporter.export(doc)

    # Serialise, then start every <lb>/<pb> on its own line and put a space
    # in front of every <space> element.
    ausgabe = etree.tounicode(tei_doc)
    ausgabe = (
        ausgabe.replace("<lb ", "\n<lb ")
        .replace("<pb ", "\n<pb ")
        .replace("<space ", " <space ")
    )

    # "outfile" is optional (nargs="?"): fall back to stdout when it is
    # omitted instead of failing in open(None), and close the file reliably.
    if args.outfile:
        with open(args.outfile, "w", encoding="utf-8") as out:
            print(ausgabe, file=out)
    else:
        print(ausgabe)
#!/usr/bin/env python3
# coding: utf-8
"""Run ``prepare_for_cora`` over an Anselm CorA-XML document."""

from coraxml_utils.importer import create_importer
from coraxml_utils.exporter import create_exporter
from coraxml_utils.modifier import postprocess, no_postprocess, prepare_for_cora

if __name__ == "__main__":
    # Importer is told to look for tok_dipl / tok_anno elements.
    anselm_importer = create_importer(
        "coraxml",
        dialect="anselm",
        strict=False,
        tok_dipl_tag="tok_dipl",
        tok_anno_tag="tok_anno",
    )
    # Exporter writes dipl / mod elements -- presumably this renames
    # tok_dipl -> dipl and tok_anno -> mod on the way out; verify.
    exporter_options = {
        "dipl_tag_name": "dipl",
        "anno_tag_name": "mod",
    }
    cora_exporter = create_exporter("coraxml", options=exporter_options)

    # no_postprocess is imported but currently not passed as a step.
    postprocess(anselm_importer, cora_exporter, prepare_for_cora)
def test_unsupported_file_format(self):
    """An unknown importer format name is rejected with ``ValueError``."""
    unknown_format = '"some unknown format"'
    self.assertRaises(ValueError, create_importer, unknown_format)
def test_coraxml_unsupported_dialect(self):
    """An unknown dialect for the ``coraxml`` importer raises ``ValueError``."""
    unknown_dialect = '"some unknown dialect"'
    self.assertRaises(ValueError, create_importer, 'coraxml', unknown_dialect)
"--parser", choices=["rem", "anselm", "ref", "redi"], default="ref", help="Token parser to use, default: %(default)s", ) parser.add_argument( "--postprocessing", choices=["ref"], default=None, help="Script used to postprocess the xml file", ) args, _ = parser.parse_known_args() if _: logging.warn("Unknown args: %s", _) MyImporter = create_importer("trans", args.parser) MyExporter = create_exporter("coraxml") print("~BEGIN CHECK") doc = None if os.path.splitext(args.infile)[-1].lower() == ".docx": trans = importTextFromDocx(Path(args.infile)) doc = MyImporter.import_from_string(trans) else: with open(args.infile, "r", encoding="utf-8") as infile: doc = MyImporter.import_from_string(infile.read().replace( "\ufeff", "")) if doc: # do postprocessing
#!/usr/bin/env python3
# coding: utf-8
"""Run ``ref_convert`` over a CorA-XML document (``ref`` dialect)."""

from coraxml_utils.importer import create_importer
from coraxml_utils.exporter import create_exporter
from coraxml_utils.modifier import postprocess, ref_convert

if __name__ == "__main__":
    # Non-strict import, default exporter options.
    ref_importer = create_importer("coraxml", dialect="ref", strict=False)
    cora_exporter = create_exporter("coraxml")
    postprocess(ref_importer, cora_exporter, ref_convert)
#!/usr/bin/env python3
# coding: utf-8
"""Run the Anselm token- and document-level postprocessing on CorA-XML."""

from coraxml_utils.importer import create_importer
from coraxml_utils.exporter import create_exporter
from coraxml_utils.modifier import (
    postprocess,
    anselm_postprocess,
    anselm_document_postprocess,
)

if __name__ == "__main__":
    anselm_importer = create_importer("coraxml", dialect="anselm", strict=False)
    # Output uses tok_dipl / tok_anno as element names (the original note
    # describes this as renaming dipl -> tok_dipl and mod -> tok_anno).
    exporter_options = {
        "dipl_tag_name": "tok_dipl",
        "anno_tag_name": "tok_anno",
    }
    cora_exporter = create_exporter("coraxml", options=exporter_options)

    postprocess(
        anselm_importer,
        cora_exporter,
        anselm_postprocess,
        anselm_document_postprocess,
    )
ap.add_argument(
    "-q",
    "--nowarnings",
    help="Quiet mode: show only errors, no warnings",
    action="store_true",
)
args = ap.parse_args()

# Tagger mode overrides the tokenize and bibinfo settings.
if args.taggermode:
    args.tokenize = "all"
    args.bibinfo = "none"
if args.nowarnings:
    logging.basicConfig(level=logging.ERROR)

MyImporter = create_importer("trans", dialect=args.parser)
MyExporter = create_exporter("trans")

# Read the transcription, dropping any byte-order-mark characters.
with open(args.inputfile, "r", encoding="utf-8") as infile:
    doc = MyImporter.import_from_string(infile.read().replace("\ufeff", ""))
export_contents = MyExporter.export(doc, token_form=args.format)

if not args.output:
    sys.stdout.write(export_contents)
else:
    # Text mode: open() rejects an encoding argument in binary mode
    # (ValueError), and export_contents is written as str to stdout above.
    with open(args.output, "w", encoding="utf-8") as outputfile:
        outputfile.write(export_contents)
#!/usr/bin/env python3
# coding: utf-8
"""Run ``ref_postprocess`` over a CorA-XML document (``ref`` dialect)."""

from coraxml_utils.importer import create_importer
from coraxml_utils.exporter import create_exporter
from coraxml_utils.modifier import postprocess, ref_postprocess

if __name__ == "__main__":
    ref_importer = create_importer("coraxml", dialect="ref")
    # Output uses tok_dipl / tok_anno as element names (the original note
    # describes this as renaming dipl -> tok_dipl and mod -> tok_anno).
    exporter_options = {
        "dipl_tag_name": "tok_dipl",
        "anno_tag_name": "tok_anno",
    }
    cora_exporter = create_exporter("coraxml", options=exporter_options)

    postprocess(ref_importer, cora_exporter, ref_postprocess)