def check_greek_trans(self, myfile):
    """Check the transliterations attached to the Greek passages of a document.

    The Greek text is supposed to be inside a ``<span>`` element, with its
    transliteration stored in the ``title`` attribute of that element.
    Every such pair is compared with the transliteration computed by DChars
    (language 'grc', method "gutenberg", upsilon rendered as "u or y").

    myfile : object giving access to the parsed document through
             ``myfile.tree``.

    Attributes set on ``self``:
        has_dchars : False when ``new_dstring`` is None (DChars not
                     installed); in that case nothing else is checked.
        good_trans : list of (greek, transliteration, expected) triplets
                     whose transliteration equals the expected one.
        bad_trans  : list of the triplets which differ.

    Return None.
    """
    self.good_trans = []
    self.bad_trans = []

    self.has_dchars = new_dstring is not None
    if not self.has_dchars:
        # DChars not installed
        return

    # (greek text, transliteration found in the document) pairs.
    greek_trans = []

    # We suppose the greek is inside a span, and the transliteration
    # is in the title attribute.
    for find in ["//span[@title]"]:
        for element in etree.XPath(find)(myfile.tree):
            if not element.attrib:
                continue

            # Encyclopaedia Britannica : spans flagged as corrections are
            # not transliterations.
            if "correction" in element.attrib.get("class", ""):
                continue

            title = element.attrib['title']

            # Special book - remove the "[Griech.: " prefix (10 characters)
            # and the last character of the title.
            if title.startswith("[Griech.: "):
                title = title[10:-1]

            greek_trans.append((element.xpath("string()"), title))

    # Now, compare.
    DSTRING_Y = new_dstring(language='grc',
                            transliteration_method="gutenberg",
                            options={
                                "gutenberg:transliteration for upsilon": "u or y",
                            })

    # Raw string : "\s" in a normal string literal is an invalid escape.
    spaces = re.compile(r"\s+")

    for greek, transliteration in greek_trans:
        # strip leading/trailing white spaces, squeeze the inner ones.
        grec = spaces.sub(" ", greek.strip())

        # greek, transliteration, and expected transliteration
        triplet = (grec,
                   spaces.sub(" ", transliteration.strip()),
                   DSTRING_Y(grec).get_transliteration())

        if triplet[1] == triplet[2]:
            self.good_trans.append(triplet)
        else:
            self.bad_trans.append(triplet)
def create_new_helpcharacters_file(self):
    """
        HelpCharacterFile.create_new_helpcharacters_file

        Write from scratch the file HelpCharacterFile.filename :
          o  a commented header describing the file's format;
          o  for each language listed in
             dchars.languages_name.LANGUAGES_AND_TRANSLITERATIONS, one
             "<milestone><iso code>.text" section, then one
             "<milestone><iso code>.trans.<method>" section for each
             transliteration method of that language.
    """
    ruler = "#" * 80

    # header : one item per line written to the file.
    header = (
        ruler,
        "#",
        "# help_characters.data file",
        "#",
        "# file's format :",
        "# o line beginning with # are comments",
        "# o section names follow the format @002 (see the code)",
        "# o a section begins with '***', followed by the ISO-639-3",
        "# language's name, followed by '.', followed by either",
        "# 'text' either 'trans.yyy', 'yyy' being the name of the",
        "# transliteration's method.",
        "# o 'xxx.text' stands for the unicode symbols used by the",
        "# language 'xxx'",
        "# o 'xxx.trans' stands for the transliterated symbols used",
        "# by the language 'xxx'",
        "# o a section can contain several lines, separated by a carriage",
        "# o return.",
        "# o empty lines or lines made of spaces are not read.",
        "#",
        ruler,
    )

    # NOTE(review): no explicit encoding is given; the sections contain
    # non-ASCII characters, so the result depends on the platform's default
    # encoding - confirm against the code reading this file.
    with open(HelpCharacterFile.filename, 'w') as dest:

        #---------------------------------------------------------------
        # header :
        #---------------------------------------------------------------
        for header_line in header:
            dest.write(header_line + "\n")

        #---------------------------------------------------------------
        # characters relative to each language :
        #---------------------------------------------------------------
        all_languages = dchars.languages_name.LANGUAGES_AND_TRANSLITERATIONS

        for language_name in all_languages:
            iso_639_3_name = dchars.languages_name.ISO_639_3_NAME[language_name]
            DSTRING = new_dstring(iso_639_3_name)

            # title of the language, between two empty lines :
            dest.write("\n")
            dest.write("# {0}({1})".format(language_name, iso_639_3_name) + "\n")
            dest.write("\n")

            # unicode symbols :
            text_section = "{0}{1}.text".format(HelpCharacterFile.milestone,
                                                iso_639_3_name)
            dest.write(text_section + "\n")
            dest.write(str(DSTRING().get_usefull_combinations()) + "\n")

            # transliterated symbols :
            # NOTE(review): <trans> only appears in the section's name;
            # DSTRING is created without a transliteration method, so every
            # section of a language presumably receives the same content -
            # confirm whether this is intended.
            for trans in all_languages[language_name]:
                trans_section = "{0}{1}.trans.{2}".format(HelpCharacterFile.milestone,
                                                          iso_639_3_name,
                                                          trans)
                dest.write(trans_section + "\n")
                dest.write(DSTRING().get_usefull_transl_combinations() + "\n")
# GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with DChars. If not, see <http://www.gnu.org/licenses/>. ################################################################################ """ ❏DChars❏ : dchars/tests/languages/grc/grc_tests.py """ import unittest, os.path from dchars.errors.errors import DCharsError from dchars.dchars import new_dstring from dchars.symbols.symbols import UNKNOWN_CHAR_SYMBOL DSTRING_GRC = new_dstring(language="Ἑλληνικὴ γλῶττα", options={"anonymize the unknown characters": "no"}) DSTRING_GRC__UNKNOWNCHAR = new_dstring(language="Ἑλληνικὴ γλῶττα", options={"anonymize the unknown characters": "yes"}) # pylint: disable=R0904 # ("Too many public methods") # Since this classes are derived from unittest.TestCase we have a lot of # methods in the following classe(s). ################################################################################ class TESTSDStringGRC(unittest.TestCase): """ class TESTSDStringGRC We test dchars.languages.grc.dchars::DStringGRC
(2) informations about the buffers (3) we write the buffers ############################################################################ """ import pickle import os.path from dchars.dchars import new_dstring import dchars.languages.bod.buffer as buffer import dchars.languages.bod.transliterations.ewts.ewts_buffer as ewts_buffer DSTRING_BOD_BUFF = new_dstring(language="བོད་ཡིག", transliteration_method='ewts', options = {"expected structure" : "Tibetan or Sanskrit", "look up in the buffers" : 'no', "fill the buffers" : 'yes'}, ) #............................................................................... # (1.1) reading a list of EWTS/unicode words # We use a list of EWTS/unicode words in order to read ewts and unicode strings: #............................................................................... for bod, ewts in ( ("ཀ" , 'ka'), ("ཀྲ" , 'kra'), ("ཀྭ", 'kwa'), ("ཀྱ", 'kya'), ("རྐ", 'rka'), ("ཉ", 'nya'), ("རྙ", 'rnya'),
def get_usefull_combinations():
    """
        get_usefull_combinations()

        Return a (str)string with all the usefull combinations of characters,
        i.e. only the 'interesting' characters (not punctuation if it's too simple
        by example).

        Each combination is written <character>{<transliteration>} and is
        followed by a space.

        NB : this function has nothing to do with linguistic or a strict
             approach of the language. This function allows only to get the
             most common and/or usefull characters of the writing system.

        NB : function required by the dchars-fe project.
    """
    dstring = new_dstring( 'ang' )()

    def entry(base_char, capital_letter, makron, stress, upperdot):
        """Return '<character>{<transliteration>} ' for one DCharacterANG."""
        dchar = DCharacterANG( dstring_object = dstring,
                               base_char = base_char,
                               punctuation = False,
                               capital_letter = capital_letter,
                               makron = makron,
                               stress = stress,
                               upperdot = upperdot )

        txt = dchar__get_translit_str(dstring_object = dstring,
                                      dchar = dchar)

        return str(dchar) + "{" + txt + "} "

    res = []

    # base_char : we don't use the list stored in symbols.py
    # since we would lost the character's order.
    base_characters = ( 'a', 'æ', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j',
                        'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't',
                        'þ', 'ð', 'u', 'v', 'w', 'x', 'y', 'z', )

    vowels = ('a', 'e', 'i', 'o', 'u')

    #-----------------------------------------------------------------------
    # (1/2) simple characters
    #-----------------------------------------------------------------------
    for base_char in base_characters:
        for capital_letter in (False, True):
            res.append( entry( base_char = base_char,
                               capital_letter = capital_letter,
                               makron = False,
                               stress = 0,
                               upperdot = False ))

    #-----------------------------------------------------------------------
    # (2/2) complex characters
    #-----------------------------------------------------------------------
    # No vowel-length axis here : DCharacterANG is built without any length
    # argument, so iterating over lengths would only repeat, for each length,
    # the very same entries.
    combinations = itertools.product(
        # base_char :
        ('a',),
        # capital_letter
        (False, True),
        # makron
        (False, True),
        # stress
        (-1, 0, 1, 2),
        # upperdot
        (False, True),
        )

    for base_char, capital_letter, makron, stress, upperdot in combinations:

        # only a vowel may bear a stress or an upperdot :
        if base_char not in vowels and (stress != 0 or upperdot):
            continue

        res.append( entry( base_char = base_char,
                           capital_letter = capital_letter,
                           makron = makron,
                           stress = stress,
                           upperdot = upperdot ))

    return "".join(res)
# GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with DChars. If not, see <http://www.gnu.org/licenses/>. ################################################################################ """ ❏DChars❏ : dchars/tests/languages/grc/transliterations/betacode_tests.py """ import unittest, os.path from dchars.dchars import new_dstring from dchars.symbols.symbols import UNKNOWN_CHAR_SYMBOL DSTRING_GRC = new_dstring(language="Ἑλληνικὴ γλῶττα", transliteration_method = "betacode", options = {"anonymize the unknown characters" : 'no'}, ) DSTRING_GRC__UNKNOWNCHAR = new_dstring(language="Ἑλληνικὴ γλῶττα", transliteration_method = "betacode", options = {"anonymize the unknown characters" : 'yes'}, ) LIST_OF_RECIPROCAL_EXAMPLES = ( ("", ''), ("ά", 'A/'), ("ἁ", 'A('), ("ἅ", "A(/"), ("ἆ", "A)="), ("ᾇ", "A(=|"),
""" ❏DChars❏ : dchars/tests/languages/grc/transliterations/gutenberg_tests.py """ import unittest from dchars.dchars import new_dstring from dchars.symbols.symbols import UNKNOWN_CHAR_SYMBOL DSTRING_GRC = new_dstring( language="Ἑλληνικὴ γλῶττα", transliteration_method="gutenberg", options={ "anonymize the unknown characters": "no", "gutenberg:ignore smooth breathing": "yes", "gutenberg:ignore accents": "yes", "gutenberg:ignore iota subscript": "yes", "gutenberg:ignore diaeresis": "yes", "gutenberg:transliteration for upsilon": "u", "gutenberg:ignore makron and brakhu": "yes", }, ) DSTRING_GRC__FULL = new_dstring( language="Ἑλληνικὴ γλῶττα", transliteration_method="gutenberg", options={ "anonymize the unknown characters": "no", "gutenberg:ignore smooth breathing": "no", "gutenberg:ignore accents": "no", "gutenberg:ignore iota subscript": "no",
from dchars.dchars import new_dstring

# Scratch script for the 'fro' language : build a DString, then print it
# and its transliteration.
DSTRING_FRO = new_dstring(language='fro')

# Disabled experiments, reading a transliterated string instead :
#string = DSTRING_FRO().init_from_transliteration("abc")
#print(str(string))
#print(string.get_transliteration())

#string = DSTRING_FRO().init_from_transliteration("a\\")
#print(str(string))
#print(string.get_transliteration())

#string = DSTRING_FRO().init_from_transliteration("a/")
#print(str(string))
#print(string.get_transliteration())

#string = DSTRING_FRO().init_from_transliteration("a+:")
#print(str(string))
#print(string.get_transliteration())

#string = DSTRING_FRO().init_from_transliteration("a/\\")
#print(str(string))
#print(string.get_transliteration())

#string = DSTRING_FRO().init_from_transliteration("c+c")
#print(str(string))
#print(string.get_transliteration())

# Active experiment : a DString read from a unicode string.
string = DSTRING_FRO("ç")
print(string)
print(string.get_transliteration())
def get_usefull_combinations():
    """
        get_usefull_combinations()

        Return a (str)string with all the usefull combinations of characters,
        i.e. only the 'interesting' characters (not punctuation if it's too simple
        by example).

        Each combination is written <character>{<transliteration>} and is
        followed by a space.

        NB : this function has nothing to do with linguistic or a strict
             approach of the language. This function allows only to get the
             most common and/or usefull characters of the writing system.

        NB : function required by the dchars-fe project.
    """
    SAN = new_dstring( 'san' )
    dstring = SAN()

    def describe(base_char, punctuation, is_an_independent_vowel, dependentvowel):
        """Return '<character>{<transliteration>} ' for one DCharacterSAN."""
        dchar = DCharacterSAN( dstring_object = dstring,
                               base_char = base_char,
                               accent = None,
                               punctuation = punctuation,
                               nukta = False,
                               anusvara_candrabindu = None,
                               virama = False,
                               anudatta = False,
                               is_an_independent_vowel = is_an_independent_vowel,
                               dependentvowel = dependentvowel,
                             )

        txt = dchar__get_translit_str(dstring_object = dstring,
                                      prev_dchar = None,
                                      dchar = dchar)

        return str(dchar) + "{" + txt + "} "

    # base_char : we don't use the list stored in symbols.py
    # since we would lost the character's order.
    base_characters__vowels = (
        'A', 'AA', 'I', 'II', 'U', 'UU',
        'VOCALIC R', 'VOCALIC RR', 'VOCALIC L', 'VOCALIC LL',
        'SHORT E', 'E', 'SHORT O', 'O', 'AI', 'AU',
        )

    base_characters = (
        'KA', 'KHA', 'GA', 'GHA', 'NGA',
        'CA', 'CHA', 'JA', 'JHA', 'NYA',
        'TTA', 'TTHA', 'DDA', 'DDHA', 'NNA',
        'TA', 'THA', 'DA', 'DHA', 'NA',
        'PA', 'PHA', 'BA', 'BHA', 'MA',
        'YA', 'RA', 'LA', 'LLA', 'VA',
        'SHA', 'SSA', 'SA', 'HA',
        'DEVANAGARI SIGN VISARGA',
        )

    #-----------------------------------------------------------------------
    # (1/2) simple characters : independent vowels, then consonants
    #-----------------------------------------------------------------------
    res = [ describe( base_char = vowel,
                      punctuation = False,
                      is_an_independent_vowel = True,
                      dependentvowel = None )
            for vowel in base_characters__vowels ]

    res.extend( describe( base_char = consonant,
                          punctuation = False,
                          is_an_independent_vowel = False,
                          dependentvowel = None )
                for consonant in base_characters )

    #-----------------------------------------------------------------------
    # (2/2) complex characters
    #-----------------------------------------------------------------------
    # Only the dependent vowels vary : the anusvara/candrabindu, virama and
    # anudatta variants are not generated, neither are the dependent vowels
    # 'CANDRA E', 'SHORT E', 'CANDRA O' and 'SHORT O'.
    dependent_vowels = ( None,
                         'AA', 'I', 'II', 'U', 'UU',
                         'VOCALIC R', 'VOCALIC RR',
                         'E', 'AI',
                         'O', 'AU',
                         'VOCALIC L', 'VOCALIC LL',
                       )

    for base_char, dependentvowel in itertools.product( ('KA',), dependent_vowels ):

        # the visarga can't bear a dependent vowel :
        if base_char == 'DEVANAGARI SIGN VISARGA' and dependentvowel is not None:
            continue

        res.append( describe( base_char = base_char,
                              punctuation = None,
                              is_an_independent_vowel = False,
                              dependentvowel = dependentvowel ))

    return "".join(res)
# GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with DChars. If not, see <http://www.gnu.org/licenses/>. ################################################################################ """ ❏DChars❏ : dchars/tests/languages/ang/transliterations/basic_tests.py """ import unittest from dchars.dchars import new_dstring from dchars.languages.bod.dcharacter import UNKNOWN_CHAR_SYMBOL DSTRING_ANG = new_dstring(language="Ænglisc", transliteration_method="basic", options = {"anonymize the unknown characters" : "yes", }, ) LIST_OF_RECIPROCAL_EXAMPLES = ( ('', '' ), ('a', 'a' ), ('p', 'p' ), ("Q", 'Q' ), ("ō", 'o_' ), ("Quōēre,", 'Quo_e_re,'), ("N", 'N' ), (" ", ' ' ), ("è", 'e\\' ), ("a", 'a' ), ("A", 'A' ),
from dchars.dchars import new_dstring

# Scratch script for the 'ang' language : read a transliterated string,
# print the result, then append its first character to it and print again.
DSTRING_ANG = new_dstring(language='ang')

# Disabled experiment :
#string = DSTRING_ANG().init_from_transliteration("a*_")
#print(str(string))
#print(string.get_transliteration())

string = DSTRING_ANG().init_from_transliteration("a_*")
print(string)
print(string.get_transliteration())

# the first character is appended to the string itself :
string += string[0]
print(string)
print(string.get_transliteration())
# B) creation of the DSTRING object # C) processing # C.1) data reading # C.2) unicode strings become DString objects # C.3) sort # C.4) modifications # C.5) output # #******************************************************************************* # A) arguments of the command line ARGS = get_arguments() # B) creation of the DSTRING object DSTRING = new_dstring( language = ARGS.language, options = {"look up in the buffers" : 'yes', "sorting method" : ARGS.sorting_method}) # C) processing if ARGS.source is not None: if ARGS.modifications is None: # normal case : # C.1) data reading DATA = [] # we read the source file(s) : for filename in ARGS.source: with open(filename, 'r') as src:
# # You should have received a copy of the GNU General Public License # along with DChars. If not, see <http://www.gnu.org/licenses/>. ################################################################################ """ ❏DChars❏ : dchars/tests/languages/jpn/jpn_tests.py """ import unittest from dchars.errors.errors import DCharsError from dchars.dchars import new_dstring from dchars.symbols.symbols import UNKNOWN_CHAR_SYMBOL DSTRING_JPN = new_dstring(language="日本語", options = {"anonymize the unknown characters" : 'no'}, ) DSTRING_JPN__UNKNOWNCHAR = new_dstring(language="日本語", options = {"anonymize the unknown characters" : 'yes'}, ) # pylint: disable=R0904 # ("Too many public methods") # Since this classes are derived from unittest.TestCase we have a lot of # methods in the following classe(s). ################################################################################ class TESTSDStringJPN(unittest.TestCase): """
def get_usefull_combinations():
    """
        get_usefull_combinations()

        Return a (str)string with all the usefull combinations of characters,
        i.e. only the 'interesting' characters (not punctuation if it's too simple
        by example).

        Each combination is written <character>{<transliteration>} and is
        followed by a space; the transliteration is computed on a string
        containing this character alone.

        NB : this function has nothing to do with linguistic or a strict
             approach of the language. This function allows only to get the
             most common and/or usefull characters of the writing system.

        NB : function required by the dchars-fe project.
    """
    BOD = new_dstring( 'bod' )

    def entry(base_char, vowel1):
        """Return '<character>{<transliteration>} ' for one DCharacterBOD."""
        # The transliteration is computed from the whole DString (see the
        # call to dstring__get_translit_str) : a new, empty DString is
        # required for each character, otherwise the characters already
        # described would be transliterated again along with this one.
        dstring = BOD()

        dchar = DCharacterBOD( dstring_object = dstring,
                               base_char = base_char,
                               subj_consonants = None,
                               rnam_bcad = False,
                               punctuation = False,
                               halanta = False,
                               anusvara_candrabindu = None,
                               vowel1 = vowel1,
                               vowel2 = None )

        dstring.append(dchar)
        dstring.update_istructs()

        txt = dstring__get_translit_str(dstring = dstring)

        return str(dchar) + "{" + txt + "} "

    res = []

    # base_char : we don't use the list stored in symbols.py
    # since we would lost the character's order.
    base_characters = ('K', 'KH', 'G', 'GH', 'NG',
                       'C', 'CH', 'J', 'NY',
                       'TT', 'TTH', 'DD', 'DDH', 'NN',
                       'T', 'TH', 'D', 'DH', 'N',
                       'P', 'PH', 'B', 'BH', 'M',
                       'TS', 'TSH', 'DZ', 'DZH',
                       'W', 'ZH', 'Z', '-', 'Y', 'R', 'L',
                       'SH', 'SS', 'S', 'H', 'KSS', 'A',
                      )

    #-----------------------------------------------------------------------
    # (1/2) simple characters
    #-----------------------------------------------------------------------
    for base_char in base_characters:
        res.append( entry( base_char = base_char, vowel1 = None ))

    #-----------------------------------------------------------------------
    # (2/2) complex characters
    #-----------------------------------------------------------------------
    combinations = itertools.product(
        # base_chars
        ('K',),
        # vowel
        ( None,
          'AA', 'I', 'II', 'U', 'UU',
          'VOCALIC R', 'VOCALIC RR', 'VOCALIC L', 'VOCALIC LL',
          'E', 'AI', 'O', 'AU',
        ))

    for base_char, vowel in combinations:
        res.append( entry( base_char = base_char, vowel1 = vowel ))

    return "".join(res)
# GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with DChars. If not, see <http://www.gnu.org/licenses/>. ################################################################################ """ ❏DChars❏ : dchars/tests/languages/hbo/hbo_tests.py """ import unittest, os.path from dchars.errors.errors import DCharsError from dchars.dchars import new_dstring from dchars.symbols.symbols import UNKNOWN_CHAR_SYMBOL DSTRING_HBO = new_dstring(language="עִבְֿרִיתֿ מִקְרָאִיתֿ", options={"anonymize the unknown characters": "no"}) DSTRING_HBO__UNKNOWNCHAR = new_dstring( language="עִבְֿרִיתֿ מִקְרָאִיתֿ", options={"anonymize the unknown characters": "yes"} ) # pylint: disable=R0904 # ("Too many public methods") # Since this classes are derived from unittest.TestCase we have a lot of # methods in the following classe(s). ################################################################################ class TESTSDStringHBO(unittest.TestCase): """ class TESTSDStringHBO We test dchars.languages.hbo.dchars::DStringHBO """
def get_intstruct_from_trans_str( _src, dstring_object ):
    """
        function get_intstruct_from_trans_str()

        _src            : (str) transliterated string like "क".
        dstring_object  : object whose options "anonymize the unknown
                          characters" and "san2bod quality" are read, and
                          which is given to every InternalStructure object
                          created here.

        <_src> is read through a Sanskrit DString (transliteration method
        "iso15919"); each of its characters is then converted into an
        InternalStructure object, or is merged into the last one (subjoined
        consonants, dependent vowels, anusvara/candrabindu, visarga).
        A tsheg is added after every structure having a consonant.

        Return a ListOfInternalStructures object.
    """
    anonymize = dstring_object.options["anonymize the unknown characters"] == 'yes'

    # @@BOD2SAN-LOW-005, @@BOD2SAN-LOW-006 : long vowels > short vowels
    shortened_vowels = {'AA' : 'A',
                        'II' : 'I',
                        'UU' : 'U'}

    # @@BOD2SAN-LOW-007
    #   retroflex consonant > non-retroflex consonant
    #   retroflex consonant + aspiration > non-retroflex consonant
    #   without aspiration
    non_retroflex = {'TTA'  : "TA",
                     'TTHA' : "TA",
                     'DDA'  : "DA",
                     'DDHA' : "DA",
                     'NNA'  : "NA"}

    # @@BOD2SAN-LOW-008
    #   consonant + aspiration > consonant without aspiration
    # ('TTHA' and 'DDHA' are caught before by <non_retroflex>.)
    non_aspirated = {'KHA'  : "KA",
                     'GHA'  : "GA",
                     'THA'  : "TA",
                     'CHA'  : "CA",
                     'JHA'  : "JA",
                     'DHA'  : "DA",
                     'TTHA' : "TTA",
                     'DDHA' : "DDA",
                     'PHA'  : "PA",
                     'BHA'  : "BA"}

    # list of InternalStructure objects.
    istructs = ListOfInternalStructures(anonymize_the_unknown_chars = anonymize)

    # we read <_src> through a DSTRING_SAN object :
    dstring_san = new_dstring(language='संस्कृतम्',
                              transliteration_method="iso15919")
    dstring_san = dstring_san(_src)

    # In Sanskrit, if a consonant is followed by a virama, it means that the following
    # consonants are part of a cluster of consonants.
    #
    # E.g. in कर्म (0915=ka, 0930=ra, 094D=virama, 092E=ma) we have something like kar+ma,
    # the -m- having no vowel.
    #
    place_consonant_among_subjc = False

    for dchar_san in dstring_san:

        if dchar_san.unknown_char:
            istructs.append( InternalStructure( dstring_object = dstring_object,
                                                unknown_character = True ))
            continue

        base_char = dchar_san.base_char

        # punctuation symbol :
        if base_char in SAN__SYMB_PUNCTUATION:
            unicode_symb = SAN__SYMB_PUNCTUATION.get_default_symbol(base_char)

            istructs.append( InternalStructure(
                dstring_object = dstring_object,
                punctuation_or_other_symbol = PUNCTUATION_INVERSED[unicode_symb] ))

            place_consonant_among_subjc = False

        # other symbol :
        elif base_char in SAN__SYMB_OTHER_SYMBOLS:
            unicode_symb = SAN__SYMB_OTHER_SYMBOLS.get_default_symbol(base_char)

            istructs.append( InternalStructure(
                dstring_object = dstring_object,
                punctuation_or_other_symbol = OTHER_SYMBOLS_INVERSED[unicode_symb] ))

            place_consonant_among_subjc = False

        # independent vowel :
        elif base_char in SAN__SYMB_INDEPENDENT_VOWELS:
            quality = dstring_object.options["san2bod quality"]

            #...............................................................
            # _independent_vowel will be added as an independent vowel :
            #...............................................................
            if quality in ("normal", "low") and base_char == 'O':
                #====================
                # @@BOD2SAN-NORM-004, @@BOD2SAN-LOW-004
                # (independent vowel) ओ(ō) > औ(au)
                #====================
                _independent_vowel = "AU"

            elif quality == "low" and base_char in shortened_vowels:
                #====================
                # @@BOD2SAN-LOW-006
                # (independent vowel) long vowels > short vowels
                #====================
                _independent_vowel = shortened_vowels[base_char]

            else:
                _independent_vowel = base_char

            unicode_symb = \
                SAN__SYMB_INDEPENDENT_VOWELS.get_default_symbol(_independent_vowel)

            istructs.append( InternalStructure(
                dstring_object = dstring_object,
                consonant = "A",
                vowel1 = INDEPENDENT_VOWELS_INVERSED[unicode_symb] ))

            place_consonant_among_subjc = False

        # consonant :
        elif base_char in SAN__SYMB_CONSONANTS:
            quality = dstring_object.options["san2bod quality"]

            if base_char == 'DEVANAGARI SIGN VISARGA':
                # special case : the visarga symbol is placed among consonants in Sanskrit,
                # among diacritics in Tibetan.
                #
                #====================
                # @@BOD2SAN-NORM-001, @@BOD2SAN-LOW-001
                # the visarga is omitted if "san2bod quality" is "normal"
                # or "low"
                #====================
                if quality not in ("normal", "low"):
                    # the visarga is written on the last structure read.
                    istructs[-1].rnam_bcad = True

                place_consonant_among_subjc = False

            elif not place_consonant_among_subjc:
                # consonant to be placed as a main consonant
                # (and not among subjoined consonants) :

                #...........................................................
                # _base_char will be added as a main consonant :
                #...........................................................
                if quality in ("normal", "low") and base_char == 'VA':
                    #====================
                    # @@BOD2SAN-NORM-002, @@BOD2SAN-LOW-002
                    # the व(va) becomes ब(ba) if "san2bod quality" is
                    # "normal" or "low"
                    #====================
                    _base_char = "BA"

                elif quality == "low" and base_char in non_retroflex:
                    #===================
                    # @@BOD2SAN-LOW-007
                    #===================
                    _base_char = non_retroflex[base_char]

                elif quality == "low" and base_char in non_aspirated:
                    #===================
                    # @@BOD2SAN-LOW-008
                    # The test is made on the mapping itself so that every
                    # consonant it describes ('DHA' included) is converted.
                    #===================
                    _base_char = non_aspirated[base_char]

                else:
                    # general case :
                    _base_char = base_char

                unicode_symb = SAN__SYMB_CONSONANTS.get_default_symbol(_base_char)
                bod_consonant = CONSONANTS_INVERSED[unicode_symb]

                istructs.append( InternalStructure( dstring_object = dstring_object,
                                                    consonant = bod_consonant ))

                # a virama announces a cluster : the next consonant(s) will
                # be subjoined to this one.
                if dchar_san.virama:
                    place_consonant_among_subjc = True

            else:
                # consonant to be placed among subjoined consonants
                # (and not as a main consonant) :
                last_istruct = istructs[-1]

                if last_istruct.subfix is None:
                    last_istruct.subfix = []

                unicode_symb = SAN__SYMB_CONSONANTS.get_default_symbol(base_char)
                cons = CONSONANTS_INVERSED[unicode_symb]

                if quality == "low" and \
                   last_istruct.subfix == [] and \
                   last_istruct.consonant == cons:
                    #===================
                    # @@BOD2SAN-LOW-008
                    # geminate consonant > 0
                    #===================
                    # no more subjoined consonant : the other ones will be
                    # treated like main consonants :
                    place_consonant_among_subjc = False
                else:
                    last_istruct.subfix.append( cons )

                # without virama, the cluster ends with this consonant.
                if not dchar_san.virama:
                    place_consonant_among_subjc = False

        # dependent vowel :
        dependentvowel = dchar_san.dependentvowel
        if dependentvowel is not None and dependentvowel != "A":
            quality = dstring_object.options["san2bod quality"]

            #...........................................................
            # _dependent_vowel will be added as a dependent vowel :
            #...........................................................
            if quality in ("normal", "low") and dependentvowel == 'O':
                #====================
                # @@BOD2SAN-NORM-003, @@BOD2SAN-LOW-003
                # (dependent vowel) ओ(ō) > औ(au)
                #====================
                _dependent_vowel = "AU"

            elif quality == "low" and dependentvowel in shortened_vowels:
                #====================
                # @@BOD2SAN-LOW-005
                # (dependent vowel) long vowels > short vowels
                #====================
                _dependent_vowel = shortened_vowels[dependentvowel]

            else:
                _dependent_vowel = dependentvowel

            unicode_symb = \
                SAN__SYMB_DEPENDENT_VOWELS.get_default_symbol(_dependent_vowel)
            istructs[-1].vowel1 = DEPENDENT_VOWELS_INVERSED[unicode_symb]

        # anusvara/candrabindu :
        if dchar_san.anusvara_candrabindu is not None:
            unicode_symb = \
                SAN__SYMB_DIACRITICS.get_default_symbol(dchar_san.anusvara_candrabindu)
            istructs[-1].anusvara_candrabindu = DIACRITICS_INVERSED[unicode_symb]

    res = ListOfInternalStructures(anonymize_the_unknown_chars =\
                dstring_object.options["anonymize the unknown characters"] == 'yes')

    # we add a tsheg after a "real" syllable (id est, not a punctuation sign, ...)
    for istruct in istructs:
        res.append(istruct)

        if istruct.consonant is not None:
            res.append( InternalStructure(
                dstring_object = dstring_object,
                punctuation_or_other_symbol = 'MARK INTERSYLLABIC TSHEG' ))

    return res
# # You should have received a copy of the GNU General Public License # along with DChars. If not, see <http://www.gnu.org/licenses/>. ################################################################################ """ ❏DChars❏ : dchars/tests/languages/jpn/transliterations/shepburn_tests.py """ import unittest from dchars.dchars import new_dstring from dchars.languages.bod.dcharacter import UNKNOWN_CHAR_SYMBOL DSTRING_JPN = new_dstring(language="jpn", transliteration_method="shepburn", options = {"anonymize the unknown characters" : "yes", "long vowels written with circumflex" : "no", "katakanas written with upper case letters" : "yes", "ou becomes ō" : "no", }, ) LIST_OF_RECIPROCAL_EXAMPLES = ( ('', '' ), ('か', 'ka' ), ('きゃ', 'kya'), ('びゅ', 'byu'), ('じゃあく', 'jaaku'), ('おねえさん', 'oneesan'), ('ゑ', 'we'), ('あんない', 'annai'), ('ぐんま', 'gunma'), ('しんよう', "shin'you"),
def get_usefull_combinations():
    """
        get_usefull_combinations()

        Return a (str)string with all the useful combinations of characters,
        i.e. only the 'interesting' characters (punctuation is left out when
        it is too simple, for example).

        The returned string is a concatenation of items formatted as
        "<character>{<transliteration>} " :
            (1/2) every base character, without any point;
            (2/2) 'ב' with each vowel point, then 'ש' with each shin/sin dot.

        NB : this function has nothing to do with linguistics or with a strict
             approach of the language. It only gives the most common and/or
             useful characters of the writing system.

        NB : function required by the dchars-fe project.
    """
    res = []

    HBO = new_dstring( 'hbo' )
    dstring = HBO()

    # base_char : we don't use the list stored in symbols.py
    # since we would lose the character's order.
    base_characters = ( 'א',
                        'ב',
                        'ג',
                        'ד',
                        'ה',
                        'ו',
                        'ז',
                        'ח',
                        'ט',
                        'י',
                        'כ',
                        'ל',
                        'מ',
                        'נ',
                        'ס',
                        'ע',
                        'פ',
                        'צ',
                        'ק',
                        'ר',
                        'ש',
                        'ת' )

    #-----------------------------------------------------------------------
    # (1/2) simple characters
    #-----------------------------------------------------------------------
    # Each base character is emitted exactly once, without any point : the
    # shin/sin dots are handled in the second part of the function. (The
    # former inner loop over the shin/sin dots never had any effect on the
    # character built here : <base_char> is a Hebrew letter, never the string
    # 'SHIN', and <shin_sin_dot> was always passed as None.)
    for base_char in base_characters:

        dchar = DCharacterHBO( dstring_object = dstring,
                               base_char = base_char,
                               contextual_form = None,
                               shin_sin_dot = None,
                               daghesh_mapiq = False,
                               methegh = False,
                               specialpoint = None,
                               vowel = None,
                               raphe = False,
                               cantillation_mark = None )

        txt = dchar__get_translit_str(dstring_object = dstring, dchar = dchar)

        res.append( str(dchar) + "{" + txt + "} " )

    #-----------------------------------------------------------------------
    # (2/2) complex characters
    #-----------------------------------------------------------------------

    #.......................................................................
    # one base character combined with every vowel point :
    combinations = itertools.product(
        # base_char :
        ( 'ב', ),

        # vowel :
        (None,
         "HEBREW POINT SHEVA",
         "HEBREW POINT HATAF SEGOL",
         "HEBREW POINT HATAF PATAH",
         "HEBREW POINT HATAF QAMATS",
         "HEBREW POINT HIRIQ",
         "HEBREW POINT TSERE",
         "HEBREW POINT SEGOL",
         "HEBREW POINT PATAH",
         "HEBREW POINT QAMATS",
         "HEBREW POINT HOLAM",
         "HEBREW POINT HOLAM HASER FOR VAV",
         "HEBREW POINT QUBUTS",
         "HEBREW POINT QAMATS QATAN"),
        )

    for base_char, vowel in combinations:

        dchar = DCharacterHBO( dstring_object = dstring,
                               base_char = base_char,
                               contextual_form = "initial+medium+final",
                               shin_sin_dot = None,
                               daghesh_mapiq = False,
                               methegh = False,
                               specialpoint = None,
                               vowel = vowel,
                               raphe = None,
                               cantillation_mark = None,
                             )

        txt = dchar__get_translit_str(dstring_object = dstring, dchar = dchar)

        res.append( str(dchar) + "{" + txt + "} " )

    #.......................................................................
    # the letter 'ש' without dot, with the shin dot and with the sin dot :
    combinations = itertools.product(
        # base_char :
        ( 'ש', ),

        # shin_sin_dot :
        (None, "HEBREW POINT SHIN DOT", "HEBREW POINT SIN DOT"),
        )

    for base_char, shin_sin_dot in combinations:

        dchar = DCharacterHBO( dstring_object = dstring,
                               base_char = base_char,
                               contextual_form = "initial+medium+final",
                               shin_sin_dot = shin_sin_dot,
                               daghesh_mapiq = False,
                               methegh = False,
                               specialpoint = None,
                               vowel = None,
                               raphe = None,
                               cantillation_mark = None,
                             )

        txt = dchar__get_translit_str(dstring_object = dstring, dchar = dchar)

        res.append( str(dchar) + "{" + txt + "} " )

    return "".join(res)
# # You should have received a copy of the GNU General Public License # along with DChars. If not, see <http://www.gnu.org/licenses/>. ################################################################################ """ ❏DChars❏ : dchars/tests/languages/bod/transliterations/bodsan_tests.py """ import unittest from dchars.dchars import new_dstring DSTRING_BOD_HIGH = new_dstring(language="བོད་ཡིག", transliteration_method='bodsan', options = {"expected structure" : "always Sanskrit", "fill the buffers" : 'no', "look up in the buffers" : 'no', "san2bod quality" : "high", }, ) DSTRING_BOD_NORM = new_dstring(language="བོད་ཡིག", transliteration_method='bodsan', options = {"expected structure" : "always Sanskrit", "fill the buffers" : 'no', "look up in the buffers" : 'no', "san2bod quality" : "normal", }, ) DSTRING_BOD_LOW = new_dstring(language="བོད་ཡིག", transliteration_method='bodsan',
# GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with DChars. If not, see <http://www.gnu.org/licenses/>. ################################################################################ """ ❏DChars❏ : dchars/tests/languages/san/transliterations/itrans_tests.py """ import unittest, os.path, re from dchars.dchars import new_dstring from dchars.languages.bod.dcharacter import UNKNOWN_CHAR_SYMBOL DSTRING_SAN = new_dstring(language='संस्कृतम्', transliteration_method = "itrans", options = {"anonymize the unknown characters" : 'no'}, ) DSTRING_SAN__UNKNOWNCHAR = new_dstring(language='संस्कृतम्', transliteration_method = "itrans", options = {"anonymize the unknown characters" : 'yes'}, ) LIST_OF_RECIPROCAL_EXAMPLES = ( ("", '' ), # क(0x0915) ("क", 'ka' ), # क(0x0915) + virama(0x094D) ("क्", 'k' ), # क(0x0915) + virama(0x094D) + anusvara (0902)
def test_get_transliteration__upsilon(self):
    """
            TESTSDStringGRC.test_get_transliteration__upsilon

            Check the transliteration of two words for each of the three
            values ("y", "u", "u or y") given to the option
            "gutenberg:transliteration for upsilon".
    """
    # ( value of [grc.gutenberg]transliteration for upsilon,
    #   ( (source string, expected transliteration), ... ) )
    expectations = (
        ("y",      (("πύργον", "pyrgon"),
                    ("αὐτης", "aytês"))),

        ("u",      (("πύργον", "purgon"),
                    ("αὐτης", "autês"))),

        ("u or y", (("πύργον", "pyrgon"),
                    ("αὐτης", "autês"))),
    )

    for upsilon_setting, examples in expectations:

        # a fresh DString type is built for each value of the option :
        grc_type = new_dstring(
            language="Ἑλληνικὴ γλῶττα",
            transliteration_method="gutenberg",
            options={
                "anonymize the unknown characters": "no",
                "gutenberg:ignore smooth breathing": "yes",
                "gutenberg:ignore accents": "yes",
                "gutenberg:ignore iota subscript": "yes",
                "gutenberg:ignore diaeresis": "yes",
                "gutenberg:transliteration for upsilon": upsilon_setting,
            },
        )

        for source, expected in examples:
            transliteration = grc_type(source).get_transliteration()
            self.assertEqual(expected, transliteration)
# # You should have received a copy of the GNU General Public License # along with DChars. If not, see <http://www.gnu.org/licenses/>. ################################################################################ """ ❏DChars❏ : dchars/tests/languages/fro/fro_tests.py """ import unittest from dchars.errors.errors import DCharsError from dchars.dchars import new_dstring from dchars.languages.bod.dcharacter import UNKNOWN_CHAR_SYMBOL DSTRING_FRO = new_dstring(language = "romanz", options = {"anonymize the unknown characters" : "no", "sorting method" : "default"}, ) DSTRING_FRO__UNKNOWNCHAR = new_dstring(language = "romanz", options = {"anonymize the unknown characters" : "yes"}, ) # pylint: disable=R0904 # ("Too many public methods") # Since this classes are derived from unittest.TestCase we have a lot of # methods in the following classe(s). ################################################################################ class TESTSDStringFRO(unittest.TestCase): """ class TESTSDStringFRO We test dchars.languages.fro.dchars::DStringFRO
# we import the "new_dstring" object in order to get a DSTRING_SAN object : from dchars.dchars import new_dstring DSTRING_SAN = new_dstring(language='san', transliteration_method="iso15919") # We set the string from a source-string : # this is the first part of the first verse of the Rig-Veda : string = DSTRING_SAN("अ॒ग्निमी॑ळे पु॒रोहि॑तं य॒ज्ञस्य॑ दे॒वमृ॒त्विज॑म् ।") # and this is the second part : string += DSTRING_SAN("होता॑रं रत्न॒धात॑मम् ॥") # transliteration : print(string.get_transliteration()) # -> a̱gnimī́ḷē pu̱rōhítaṁ ya̱jñasyá dē̱vamr̥̱tvijám .hōtā́raṁ ratna̱dhātámam .. # Let's inspect and modify this string : print(string[0]) # -> अ॒ print(string[-1].punctuation) # -> True print(string[0].base_char) # -> "A" print(string[1].base_char) # -> "DEVANAGARI LETTER GA" print(string[0].anudatta) # -> True string[0].anudatta = False string[0].accent = "DEVANAGARI STRESS SIGN UDATTA" print(string.get_transliteration()) # -> "ágnimī́ḷē pu̱rōhítaṁ ya̱jñasyá dē̱vamr̥̱tvijám .hōtā́raṁ ratna̱dhātámam .."
# Hypothesis.go_on__ang2() : regrouper les voyelles/dipht et les faire correspondre à l'index dans un mot dans la # chaîne source. Peut-être commencer par casser les mots puis au niveau supérieur casser les voyelles # treelib : caesar0301/treelib (https://github.com/caesar0301/treelib) # doc : http://hsiamin.com/treelib/ import os import argparse from dchars.dchars import new_dstring from treelib import Node, Tree NAME_OF_THE_PROJECT = "pyscansion" # (str) language name : DSTRING object LANGUAGES = { 'ang' : new_dstring(language='ang'), 'lat' : new_dstring(language='lat'), } ################################################################################ class Hypothesis(object): """ Hypothesis class Hypothesis objects are stored in HTREE. +----------+----------+---------------------------------------------------- | language | function | description +----------+----------+---------------------------------------------------- | (all) | ang1 | from root hypothesis | | |
def get_usefull_combinations():
    """
        get_usefull_combinations()

        Return a (str)string with all the useful combinations of characters,
        i.e. only the 'interesting' characters (punctuation is left out when
        it is too simple, for example).

        The returned string is a concatenation of items formatted as
        "<character>{<transliteration>} " :
            (1/2) every base character, lower case then upper case;
            (2/2) the combinations of diacritics accepted for the base
                  characters listed in the second part of the function.

        NB : this function has nothing to do with linguistics or with a strict
             approach of the language. It only gives the most common and/or
             useful characters of the writing system.

        NB : function required by the dchars-fe project.
    """
    res = []

    dstring = new_dstring( 'grc' )()

    # base_char : we don't use the list stored in symbols.py
    # since we would lose the character's order.
    base_characters = ( 'α', 'β', 'γ', 'δ', 'ε', 'ζ', 'η', 'θ', 'ι', 'κ', 'λ', 'μ',
                        'ν', 'ξ', 'ο', 'π', 'ρ', 'σ', 'τ', 'υ', 'φ', 'χ', 'ψ', 'ω',
                        'ϝ', 'ϗ', 'ϡ', 'ϛ', 'ϙ', )

    #-----------------------------------------------------------------------
    # (1/2) simple characters
    #-----------------------------------------------------------------------
    for base_char in base_characters:
        for capital_letter in (False, True):
            dchar = DCharacterGRC( dstring_object = dstring,
                                   base_char = base_char,
                                   contextual_form = "initial+medium+final",
                                   punctuation = False,
                                   capital_letter = capital_letter,
                                   tonos = None,
                                   pneuma = None,
                                   hypogegrammene = False,
                                   dialutika = False,
                                   mekos = None )

            txt = dchar__get_translit_str(dstring_object = dstring, dchar = dchar)

            res.append( str(dchar) + "{" + txt + "} " )

    #-----------------------------------------------------------------------
    # (2/2) complex characters
    #-----------------------------------------------------------------------
    combinations = itertools.product(
        # base_chars
        ( 'α', ),

        # contextual_form
        ("initial", "medium", "final",
         "initial+medium", "medium+final",
         "initial+medium+final"),

        # capital_letter
        (False, True),

        # tonos
        ( None, "ὀξεῖα", "βαρεῖα", "περισπωμένη" ),

        # pneuma
        ( None, "ψιλὸν", "δασὺ" ),

        # hypogegrammene
        (False, True),

        # dialutika
        (False, True),

        # mekos
        ( None, "βραχύ", "μακρόν" ),
        )

    all_forms = "initial+medium+final"

    for (base_char, contextual_form, capital_letter,
         tonos, pneuma, hypogegrammene, dialutika, mekos) in combinations:

        # <rejected> is True if the combination is not a useful one for
        # <base_char>; the rules depend on the kind of letter :
        if base_char == 'ρ':
            # only the breathings are kept :
            rejected = contextual_form != all_forms or \
                       tonos is not None or \
                       hypogegrammene or \
                       dialutika or \
                       mekos is not None

        elif base_char in ('β', 'σ'):
            # only the contextual forms are kept :
            rejected = tonos is not None or \
                       pneuma is not None or \
                       hypogegrammene or \
                       dialutika or \
                       mekos is not None

        elif base_char in ('α', 'η', 'ω'):
            # accents, breathings and iota subscript are kept :
            rejected = contextual_form != all_forms or \
                       dialutika or \
                       mekos is not None

        elif base_char in ('ε', 'ο'):
            # accents (circumflex excepted) and breathings are kept :
            rejected = contextual_form != all_forms or \
                       hypogegrammene or \
                       tonos == "περισπωμένη" or \
                       dialutika or \
                       mekos is not None

        elif base_char in ('ι', 'υ'):
            # accents, breathings and diaeresis are kept :
            rejected = contextual_form != all_forms or \
                       hypogegrammene or \
                       mekos is not None

        else:
            # other letters : no diacritic at all.
            rejected = contextual_form != all_forms or \
                       tonos is not None or \
                       pneuma is not None or \
                       hypogegrammene or \
                       dialutika or \
                       mekos is not None

        if rejected:
            continue

        dchar = DCharacterGRC( dstring_object = dstring,
                               base_char = base_char,
                               contextual_form = contextual_form,
                               punctuation = False,
                               capital_letter = capital_letter,
                               tonos = tonos,
                               pneuma = pneuma,
                               hypogegrammene = hypogegrammene,
                               dialutika = dialutika,
                               mekos = mekos )

        txt = dchar__get_translit_str(dstring_object = dstring, dchar = dchar)

        res.append( str(dchar) + "{" + txt + "} " )

    return "".join(res)
# # You should have received a copy of the GNU General Public License # along with DChars. If not, see <http://www.gnu.org/licenses/>. ################################################################################ """ ❏DChars❏ : dchars/tests/languages/lat/lat_tests.py """ import unittest, os.path from dchars.errors.errors import DCharsError from dchars.dchars import new_dstring from dchars.symbols.symbols import UNKNOWN_CHAR_SYMBOL DSTRING_LAT = new_dstring(language = "latīna", options = {"anonymize the unknown characters" : "no", "sorting method" : "default"}, ) DSTRING_LAT__UNKNOWNCHAR = new_dstring(language = "latīna", options = {"anonymize the unknown characters" : "yes"}, ) # pylint: disable=R0904 # ("Too many public methods") # Since this classes are derived from unittest.TestCase we have a lot of # methods in the following classe(s). ################################################################################ class TESTSDStringLAT(unittest.TestCase): """ class TESTSDStringLAT We test dchars.languages.lat.dchars::DStringLAT
def load(self, srcfile = None):
    """
            CurrentState.load

            Load from a file (<srcfile> : (str), file's name) the current state
            of the program; if <srcfile> is None, load the current state from
            the default file. Nothing is done if the default file is missing.

            Format of the file, as read by this function :
                o  lines beginning with "###" are ignored.
                o  a line beginning with "*** TRANS" opens the options of a
                   new transformation : "language, transliteration_method =",
                   "direction =" and any other "name = value" option.
                o  a line beginning with "*** SOURCE TEXT/RESULTS" opens a
                   block of text : the first block is the source text, the
                   next ones are the results of the transformations #0, #1...

            A ValueError is raised by the unpacking of a malformed line (an
            option without "=", a language line without exactly one ",").
    """
    if srcfile is None:
        _srcfile = CurrentState.DEFAULTFILE

        # missing default file :
        if not os.path.exists(_srcfile):
            return
    else:
        _srcfile = srcfile

    # cleaning up the current state of the program :
    self.mainapp.sourcetext_editor.clear()
    self.mainapp.reset_transformations()

    # <location> :
    # None, then "transformations options", then "source text/results"
    location = None

    # <nbr_of_srctextsresults> :
    # =0 before any text, 1 for the source text,
    # 2 for the result of the transformation #0,
    # 3 for the result of the transformation #1, ...
    nbr_of_srctextsresults = 0

    with open(_srcfile, "r", encoding="utf-8") as src:

        for line in src:

            # comments :
            if line.startswith("###"):
                continue

            if line.startswith("*** TRANS"):
                # new transformation :
                location = "transformations options"
                self.mainapp.nbr_usedtransf += 1
                current_t = \
                    self.mainapp.transformations[self.mainapp.nbr_usedtransf-1]
                current_t.editor_frame.show()

            elif line.startswith("*** SOURCE TEXT/RESULTS"):
                location = "source text/results"
                nbr_of_srctextsresults += 1

            elif location == "transformations options":

                current_t = \
                    self.mainapp.transformations[self.mainapp.nbr_usedtransf-1]

                if line.startswith("language, transliteration_method ="):
                    data = line[len("language, transliteration_method ="):].strip()
                    newl, new_transl_method = data.split(",")

                    # <newl> stands for "new language" :
                    newl = newl.strip()
                    new_transl_method = new_transl_method.strip()

                    newds = new_dstring( language = newl,
                                         transliteration_method = new_transl_method )
                    current_t.dstring_type = newds

                    # <newl2> : full representation of <newl> :
                    newl2 = \
                        current_t.editor_frame.language_name.getFullStringForOneLanguage(newl)
                    current_t.editor_frame.language_name.SetCurrentIndexTo(newl2)
                    current_t.editor_frame.transl_name.SetCurrentIndexTo(new_transl_method)

                elif line.startswith("direction ="):
                    # <new_d> stands for "new direction"
                    new_d = Direction( int(line[len("direction ="):].strip()) )
                    current_d = current_t.editor_frame.direction
                    current_d.SetCurrentIndexTo(new_d.GetCorrespondingStr())

                elif line.strip() != "":
                    # other options, like "anonymize the unknown characters".
                    # Only the first "=" separates the name from the value,
                    # so that a value may itself contain a "=" symbol.
                    option_name, option_value = line.split("=", 1)
                    option_name = option_name.strip()
                    option_value = option_value.strip()

                    options = current_t.dstring_type.options
                    options[ option_name ] = option_value
                    current_t.editor_frame.options.setOption( option_name,
                                                              option_value )

            elif location == "source text/results":

                # The line terminator is removed, if there's one : it may
                # happen that <line> is the last line of the file and is not
                # terminated by \n; in that case no character must be lost.
                if line.endswith("\n"):
                    text = line[:-1]
                else:
                    text = line

                if nbr_of_srctextsresults == 1:
                    # source text :
                    current_text = self.mainapp.sourcetext_editor.toPlainText()

                    if current_text == "":
                        current_text += text
                    else:
                        current_text += "\n" + text

                    self.mainapp.sourcetext_editor.setPlainText( current_text )

                else:
                    # transformation's result :
                    current_t = self.mainapp.transformations[nbr_of_srctextsresults-2]
                    current_result = current_t.result

                    # the lines are joined by one and only one "\n", exactly
                    # as it's done for the source text :
                    if current_result == "":
                        current_result += text
                    else:
                        current_result += "\n" + text

                    current_t.result = current_result
                    current_t.editor_frame.editor.setPlainText( current_t.result )
# You should have received a copy of the GNU General Public License # along with Anceps. If not, see <http://www.gnu.org/licenses/>. ################################################################################ """ ❏Anceps❏ : anceps/constants.py """ from dchars.dchars import new_dstring NAME_OF_THE_PROJECT = "Anceps" # initialized by cmdline/cmdline.py::get_input_data() CMDLINE_ARGS = None # (str) language name : DSTRING object LANGUAGES = { 'ang' : new_dstring(language='ang', options = {"anonymize the unknown characters" : 'no'}), 'lat' : new_dstring(language='lat', options = {"anonymize the unknown characters" : 'no'}), } # constants for the patterns' file : PATTERNSFILE__COMMENTS_STARTSYMBOLS = "####" PATTERNSFILE__EXPECTED_SECTIONS_IN_A_PATTERN = ('scansion rules', 'syllabic structure') PROJECT_TITLE = """ ,adPPYYba, 8b,dPPYba, ,adPPYba, ,adPPYba, 8b,dPPYba, ,adPPYba, `Y8 88P' `a8a a8a aa a8P_____88 88P' a8a I8[ aa ,adPPPPP88 88 88 8b 8PP"'"'"aa 88 d8 `aY8ba, 88, ,88 88 88 a8a, ,aa a8b, ,aa 88b, ,a8a aa ]8I `a8bbdPaY8 88 88 `aYbbd8a' `aYbbd8a' 88`YbbdPa' `aYbbdPa' 88
def get_usefull_combinations():
    """
        get_usefull_combinations()

        Return a (str)string with all the useful combinations of characters,
        i.e. only the 'interesting' characters (punctuation is left out when
        it is too simple, for example).

        The returned string is a concatenation of items formatted as
        "<character>{<transliteration>} ". For each base character the
        function tries both character types (hiragana, katakana), both sizes
        (normal, small) and each diacritic (none, dakuten, handakuten) and
        keeps only the combinations accepted by the rules written below.

        NB : this function has nothing to do with linguistics or with a strict
             approach of the language. It only gives the most common and/or
             useful characters of the writing system.

        NB : function required by the dchars-fe project.
    """
    res = []

    dstring = new_dstring( 'jpn' )()

    # base_char : we don't use the list stored in symbols.py
    # since we would lose the character's order.
    # Each character appears only once in this list ('ゑ' used to be written
    # twice, hence duplicated items in the result).
    base_characters = ( 'あ', 'い', 'う', 'え', 'お',
                        'か', 'き', 'く', 'け', 'こ',
                        'さ', 'し', 'す', 'せ', 'そ',
                        'た', 'ち', 'つ', 'て', 'と',
                        'な', 'に', 'ぬ', 'ね', 'の',
                        'は', 'ひ', 'ふ', 'へ', 'ほ',
                        'ま', 'み', 'む', 'め', 'も',
                        'や', 'ゆ', 'よ',
                        'ら', 'り', 'る', 'れ', 'ろ',
                        'わ', 'ゐ', 'ゑ', 'を',
                        'ん', )

    # base characters accepting the dakuten :
    with_dakuten = ('か', 'き', 'く', 'け', 'こ',
                    'さ', 'し', 'す', 'せ', 'そ',
                    'た', 'ち', 'つ', 'て', 'と',
                    'は', 'ひ', 'ふ', 'へ', 'ほ',)

    # base characters accepting the handakuten :
    with_handakuten = ('は', 'ひ', 'ふ', 'へ', 'ほ',)

    for base_char in base_characters:
        for chartype in ('hiragana', 'katakana'):
            for smallsize in (False, True):
                for diacritic in (None, "dakuten", "handakuten"):

                    # a small character must have a small form :
                    if smallsize and base_char not in HIRAGANA_TO_SMALL_HIRAGANA:
                        continue

                    if diacritic == 'dakuten' and base_char not in with_dakuten:
                        continue

                    if diacritic == 'handakuten' and base_char not in with_handakuten:
                        continue

                    dchar = DCharacterJPN( dstring_object = dstring,
                                           unknown_char = False,
                                           base_char = base_char,
                                           punctuation = False,
                                           chartype = chartype,
                                           diacritic = diacritic,
                                           smallsize = smallsize )

                    txt = dchar__get_translit_str(dstring_object = dstring,
                                                  dchar = dchar)

                    res.append( str(dchar) + "{" + txt + "} " )

    return "".join(res)
# # You should have received a copy of the GNU General Public License # along with DChars. If not, see <http://www.gnu.org/licenses/>. ################################################################################ """ ❏DChars❏ : dchars/tests/languages/ang/ang_tests.py """ import unittest from dchars.errors.errors import DCharsError from dchars.dchars import new_dstring from dchars.languages.bod.dcharacter import UNKNOWN_CHAR_SYMBOL DSTRING_ANG = new_dstring( language="Ænglisc", options={"anonymize the unknown characters": "no", "sorting method": "default"} ) DSTRING_ANG__UNKNOWNCHAR = new_dstring(language="Ænglisc", options={"anonymize the unknown characters": "yes"}) # pylint: disable=R0904 # ("Too many public methods") # Since this classes are derived from unittest.TestCase we have a lot of # methods in the following classe(s). ################################################################################ class TESTSDStringANG(unittest.TestCase): """ class TESTSDStringANG We test dchars.languages.ang.dchars::DStringANG """