def write(
    self,
    filename: str,
    resources: bool = True,
) -> Generator[None, "BaseEntry", None]:
    """
    Write the glossary as a TEI XML file, one entry at a time.

    This is a generator: prime it, then ``send()`` each entry. Sending
    ``None`` writes the closing tags and finishes the generator.

    filename: path of the output file (written as UTF-8 text)
    resources: if true, data entries are saved under
        ``f"{filename}_res"``; data entries are never written into the
        XML document itself
    """
    glos = self._glos

    def info(key: str) -> str:
        # Info values become XML character data, so they must be escaped
        # or a stray "&" / "<" makes the whole document malformed.
        return xml_escape(str(glos.getInfo(key)))

    title = info("name")
    author = info("author")
    # didn't find any tag for author in existing glossaries
    publisher = info("publisher")
    rights = info("copyright")
    creationTime = info("creationTime")
    source = xml_escape(filename)

    # The context manager closes the file even when the generator is
    # closed early or an entry raises while being written.
    with open(filename, "w", encoding="utf-8") as fileObj:
        fileObj.write(f"""<?xml version="1.0" encoding="UTF-8"?>
<TEI xmlns="http://www.tei-c.org/ns/1.0">
<teiHeader>
<fileDesc>
<titleStmt>
<title>{title}</title>
<respStmt><resp>converted with</resp><name>PyGlossary</name></respStmt>
</titleStmt>
<publicationStmt>
<author>{author}</author>
<publisher>{publisher}</publisher>
<availability><p>{rights}</p></availability>
<date>{creationTime}</date>
</publicationStmt>
<sourceDesc><p>{source}</p></sourceDesc>
</fileDesc>
</teiHeader>
<text><body>""")

        while True:
            entry = yield
            if entry is None:
                # end-of-input marker sent by the caller
                break
            if entry.isData():
                if resources:
                    entry.save(f"{filename}_res")
                continue
            word = xml_escape(entry.s_word)
            defi = xml_escape(entry.defi)
            fileObj.write(f"""<entry>
<form><orth>{word}</orth></form>
<trans><tr>{defi}</tr></trans>
</entry>""")

        fileObj.write("</body></text></TEI>")
def write(glos, filename):
    """
    Write the glossary to `filename` in the xFarDic XML format.

    glos: glossary object; provides getInfo(key) and iterates over entries
        that have getWords() and getDefi()
    filename: path of the output file (written as UTF-8 text)

    The first word of each entry goes into <in>, the remaining words into
    <alt> elements, and the definition into <out>.
    """
    # Text mode: every write below passes str, which a binary-mode file
    # rejects on Python 3. The encoding matches the XML declaration.
    with open(filename, "w", encoding="utf-8") as fp:
        fp.write('<?xml version="1.0" encoding="utf-8" ?>\n<words>\n<xfardic>')
        for item in infoKeys:
            # escape so that "&" or "<" in an info value keeps the XML valid
            value = xml_escape(str(glos.getInfo(item)))
            fp.write('<%s>%s</%s>' % (item, value, item))
        fp.write('</xfardic>\n')
        for entry in glos:
            words = entry.getWords()
            word, alts = words[0], words[1:]
            defi = entry.getDefi()
            fp.write('<word>\n <in>%s</in>\n' % xml_escape(word))
            for alt in alts:
                fp.write(' <alt>%s</alt>\n' % xml_escape(alt))
            fp.write(' <out>%s</out>\n</word>\n' % xml_escape(defi))
        fp.write("</words>\n")
def write(
    glos: GlossaryType,
    filename: str,
    resources: bool = True,
):
    """
    Write the glossary to `filename` as a TEI (tei.2) XML document.

    glos: glossary object; provides getInfo(key) and iterates over entries
    filename: path of the output file (written as UTF-8 text)
    resources: if true, data entries are saved under ``filename + "_res"``;
        data entries are never written into the XML document itself
    """

    def info(key: str) -> str:
        # Info values become XML character data, so they must be escaped
        # or a stray "&" / "<" makes the whole document malformed.
        return xml_escape(str(glos.getInfo(key)))

    title = info("name")
    # the "author" info value is what gets written in the <publisher> tag
    publisher = info("author")
    rights = info("copyright")
    creationTime = info("creationTime")
    source = xml_escape(filename)

    # The context manager closes the file even if an entry raises.
    with open(filename, "w", encoding="utf-8") as fp:
        # NOTE: a parameter-entity declaration uses a single "%". In an
        # f-string "%%" is not an escape and would be written verbatim.
        fp.write(f"""<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE TEI.2 PUBLIC "-//TEI P3//DTD Main Document Type//EN"
"/usr/share/sgml/tei-3/tei2.dtd" [
<!ENTITY % TEI.dictionaries "INCLUDE" > ]>
<tei.2>
<teiHeader>
<fileDesc>
<titleStmt>
<title>{title}</title>
<respStmt><resp>converted with</resp><name>PyGlossary</name></respStmt>
</titleStmt>
<publicationStmt>
<publisher>{publisher}</publisher>
<availability><p>{rights}</p></availability>
<date>{creationTime}</date>
</publicationStmt>
<sourceDesc><p>{source}</p></sourceDesc>
</fileDesc>
</teiHeader>
<text><body>""")

        for entry in glos:
            if entry.isData():
                if resources:
                    entry.save(filename + "_res")
                continue
            word = xml_escape(entry.getWord())
            defi = xml_escape(entry.getDefi())
            fp.write(f"""<entry>
<form><orth>{word}</orth></form>
<trans><tr>{defi}</tr></trans>
</entry>""")

        fp.write("</body></text></tei.2>")
def write(glos, filename):
    """
    Write the glossary to `filename` in the xFarDic XML format.

    glos: glossary object; provides getInfo(key) and iterates over entries
        that have getWords() and getDefi()
    filename: path of the output file (written as UTF-8 text)

    Output layout: an <xfardic> header holding one element per key in
    infoKeys, followed by one <word> element per entry (<in> for the
    first word, <alt> for each further word, <out> for the definition).
    """
    # Opened in text mode because only str is written below; a binary-mode
    # file raises TypeError for str on Python 3.
    with open(filename, "w", encoding="utf-8") as fp:
        fp.write('<?xml version="1.0" encoding="utf-8" ?>\n<words>\n<xfardic>')
        for item in infoKeys:
            # info values are character data and must be XML-escaped
            info = xml_escape(str(glos.getInfo(item)))
            fp.write('<' + item + '>' + info + '</' + item + '>')
        fp.write('</xfardic>\n')

        for entry in glos:
            words = entry.getWords()
            word, alts = words[0], words[1:]
            defi = entry.getDefi()
            fp.write('<word>\n <in>%s</in>\n' % xml_escape(word))
            for alt in alts:
                fp.write(' <alt>%s</alt>\n' % xml_escape(alt))
            fp.write(' <out>%s</out>\n</word>\n' % xml_escape(defi))

        fp.write("</words>\n")
def replaceHtmlEntryCB(u_match):
    """
    u_match: instance of _sre.SRE_Match

    Escaping variant of replaceHtmlEntryNoEscapeCB: the match is first
    converted by that function, then the converted text is XML-escaped.
    Only <, >, & characters are escaped.

    When the conversion returns the matched text unchanged (conversion
    failed), that text is returned as is, without escaping.
    """
    u_converted = replaceHtmlEntryNoEscapeCB(u_match)
    conversionFailed = u_match.group(0) == u_converted
    return u_converted if conversionFailed else xml_escape(u_converted)
def processDefi(self, b_defi, b_key):
    """
    Decode the fields of one raw definition and render them as HTML.

    b_defi: bytes
    b_key: bytes
    return: u_defi_format

    The raw fields are collected into a DefinitionFields object by
    collectDefiFields(), decoded and cleaned here, handed to
    processDefiStat(), and finally joined into one string:
    part of speech / title line, translated title, transcriptions
    (in square brackets), then the definition body.
    """
    fields = DefinitionFields()
    self.collectDefiFields(b_defi, b_key, fields)

    def decodeAndClean(b_text):
        # Shared pipeline for the text fields decoded with sourceEncoding.
        u_text, _singleEncoding = self.decodeCharsetTags(
            b_text,
            self.sourceEncoding,
        )
        return removeControlChars(replaceHtmlEntries(u_text))

    # The definition body is the only field decoded with targetEncoding.
    fields.u_defi, fields.singleEncoding = self.decodeCharsetTags(
        fields.b_defi,
        self.targetEncoding,
    )
    if fields.singleEncoding:
        fields.encoding = self.targetEncoding
    u_defi = fixImgLinks(fields.u_defi)
    u_defi = replaceHtmlEntries(u_defi)
    u_defi = removeControlChars(u_defi)
    u_defi = normalizeNewlines(u_defi)
    fields.u_defi = u_defi.strip()

    if fields.b_title:
        fields.u_title = decodeAndClean(fields.b_title)

    if fields.b_title_trans:
        # sourceEncoding or targetEncoding ?
        fields.u_title_trans = decodeAndClean(fields.b_title_trans)

    if fields.b_transcription_50:
        code_50 = fields.code_transcription_50
        if code_50 == 0x10:
            # contains values like this (char codes):
            # 00 18 00 19 00 1A 00 1B 00 1C 00 1D 00 1E 00 40 00 07
            # this is not utf-16
            # what is this?
            pass
        elif code_50 == 0x1b:
            fields.u_transcription_50 = decodeAndClean(
                fields.b_transcription_50,
            )
        elif code_50 == 0x18:
            # incomplete text like:
            # t c=T>02D0;</charset>g<charset c=T>0259;</charset>-
            # This defi normally contains fields.b_transcription_60
            # in this case.
            pass
        else:
            log.debug(
                "processDefi(%s)\nb_key = %s:\n"
                "defi field 50, unknown code: %#.2x"
                % (b_defi, b_key, code_50)
            )

    if fields.b_transcription_60:
        code_60 = fields.code_transcription_60
        if code_60 == 0x1b:
            fields.u_transcription_60 = decodeAndClean(
                fields.b_transcription_60,
            )
        else:
            # same wording as the field 50 message, including the ", "
            # separator before "unknown code"
            log.debug(
                "processDefi(%s)\nb_key = %s:\n"
                "defi field 60, unknown code: %#.2x"
                % (b_defi, b_key, code_60)
            )

    if fields.b_field_1a:
        # decoded only; no entity replacement or control-char removal here
        fields.u_field_1a, _singleEncoding = self.decodeCharsetTags(
            fields.b_field_1a,
            self.sourceEncoding,
        )

    self.processDefiStat(fields, b_defi, b_key)

    u_defi_format = ""
    if fields.partOfSpeech or fields.u_title:
        if fields.partOfSpeech:
            u_defi_format += '<font color="#%s">%s</font>' % (
                self.partOfSpeechColor,
                xml_escape(fields.partOfSpeech),
            )
        if fields.u_title:
            if u_defi_format:
                # separate the title from the part-of-speech tag
                u_defi_format += " "
            u_defi_format += fields.u_title
        u_defi_format += "<br>\n"
    if fields.u_title_trans:
        u_defi_format += fields.u_title_trans + "<br>\n"
    if fields.u_transcription_50:
        u_defi_format += "[%s]<br>\n" % fields.u_transcription_50
    if fields.u_transcription_60:
        u_defi_format += "[%s]<br>\n" % fields.u_transcription_60
    if fields.u_defi:
        u_defi_format += fields.u_defi
    return u_defi_format
def processDefi(self, b_defi, b_key):
    """
    Decode the fields of one raw definition and render them as HTML.

    b_defi: bytes
    b_key: bytes
    return: u_defi_format

    collectDefiFields() fills a DefinitionFields object from b_defi.
    The byte fields are then decoded and cleaned, processDefiStat() is
    called, and the text fields are concatenated in this order:
    part of speech / title line, translated title, transcription 50,
    transcription 60 (both in square brackets), definition body.
    """
    fields = DefinitionFields()
    self.collectDefiFields(b_defi, b_key, fields)

    def decodeAndClean(b_text):
        # Shared pipeline for the text fields decoded with sourceEncoding.
        u_text, _singleEncoding = self.decodeCharsetTags(
            b_text,
            self.sourceEncoding,
        )
        return removeControlChars(replaceHtmlEntries(u_text))

    # The definition body is the only field decoded with targetEncoding.
    fields.u_defi, fields.singleEncoding = self.decodeCharsetTags(
        fields.b_defi,
        self.targetEncoding,
    )
    if fields.singleEncoding:
        fields.encoding = self.targetEncoding
    u_defi = fixImgLinks(fields.u_defi)
    u_defi = replaceHtmlEntries(u_defi)
    u_defi = removeControlChars(u_defi)
    u_defi = normalizeNewlines(u_defi)
    fields.u_defi = u_defi.strip()

    if fields.b_title:
        fields.u_title = decodeAndClean(fields.b_title)

    if fields.b_title_trans:
        # sourceEncoding or targetEncoding ?
        fields.u_title_trans = decodeAndClean(fields.b_title_trans)

    if fields.b_transcription_50:
        code_50 = fields.code_transcription_50
        if code_50 == 0x10:
            # contains values like this (char codes):
            # 00 18 00 19 00 1A 00 1B 00 1C 00 1D 00 1E 00 40 00 07
            # this is not utf-16
            # what is this?
            pass
        elif code_50 == 0x1b:
            fields.u_transcription_50 = decodeAndClean(
                fields.b_transcription_50,
            )
        elif code_50 == 0x18:
            # incomplete text like:
            # t c=T>02D0;</charset>g<charset c=T>0259;</charset>-
            # This defi normally contains fields.b_transcription_60
            # in this case.
            pass
        else:
            # "#04x": with "#", the "0x" prefix counts toward the width,
            # so width 4 is what yields two zero-padded hex digits.
            log.debug(
                f"processDefi({b_defi})\nb_key = {b_key}"
                f":\ndefi field 50"
                f", unknown code: {code_50:#04x}"
            )

    if fields.b_transcription_60:
        code_60 = fields.code_transcription_60
        if code_60 == 0x1b:
            fields.u_transcription_60 = decodeAndClean(
                fields.b_transcription_60,
            )
        else:
            log.debug(
                f"processDefi({b_defi})\nb_key = {b_key}"
                f":\ndefi field 60"
                f", unknown code: {code_60:#04x}"
            )

    if fields.b_field_1a:
        # decoded only; no entity replacement or control-char removal here
        fields.u_field_1a, _singleEncoding = self.decodeCharsetTags(
            fields.b_field_1a,
            self.sourceEncoding,
        )

    self.processDefiStat(fields, b_defi, b_key)

    parts = []
    if fields.partOfSpeech or fields.u_title:
        heading = []
        if fields.partOfSpeech:
            pos = xml_escape(fields.partOfSpeech)
            posColor = self.partOfSpeechColor
            heading.append(f'<font color="#{posColor}">{pos}</font>')
        if fields.u_title:
            heading.append(fields.u_title)
        # part of speech and title share one line, separated by a space
        parts.append(" ".join(heading) + "<br>\n")
    if fields.u_title_trans:
        parts.append(fields.u_title_trans + "<br>\n")
    if fields.u_transcription_50:
        parts.append(f"[{fields.u_transcription_50}]<br>\n")
    if fields.u_transcription_60:
        parts.append(f"[{fields.u_transcription_60}]<br>\n")
    if fields.u_defi:
        parts.append(fields.u_defi)
    return "".join(parts)