Пример #1
0
    def check_greek_trans(self, myfile):
        """Check the transliterations given for the Greek passages of a file.

        The Greek text is expected inside a ``<span>`` element, with its
        transliteration in the ``title`` attribute. For each such span the
        transliteration found in the file is compared with the one computed
        by DChars (language 'grc', "gutenberg" method).

        Results are stored on the instance:

        * ``self.has_dchars``: False when ``new_dstring`` is None, which is
          treated as "DChars not installed"; nothing else is computed then.
        * ``self.good_trans`` / ``self.bad_trans``: lists of triplets
          ``(greek, transliteration found, transliteration expected)``,
          depending on whether the last two items are equal.

        myfile: object whose ``tree`` attribute is searched with XPath.
        """

        self.good_trans = []
        self.bad_trans = []
        self.has_dchars = new_dstring is not None

        if not self.has_dchars:
            # DChars not installed
            return

        def normalize(text):
            """Strip leading/trailing whitespace and collapse inner runs."""
            # Raw string: "\s" in a plain literal is an invalid escape
            # sequence and triggers a warning on recent Python versions.
            return re.sub(r"\s+", " ", text.strip())

        greek_trans = []

        # We suppose the Greek is inside a span, and the transliteration
        # is in the title attribute.
        for find in ["//span[@title]"]:
            for element in etree.XPath(find)(myfile.tree):

                if not element.attrib:
                    continue

                # Encyclopaedia Britannica
                if "correction" in element.attrib.get("class", ""):
                    continue

                title = element.attrib['title']

                # Special book - remove [Griech.: ...]
                # (10 == len("[Griech.: "); the last character is dropped
                # as well, presumably the closing bracket.)
                if title.startswith("[Griech.: "):
                    title = title[10:-1]

                greek_trans.append((element.xpath("string()"), title))

        # Now, compare.
        DSTRING_Y = new_dstring(language='grc',
                                transliteration_method="gutenberg",
                                options = { "gutenberg:transliteration for upsilon" : "u or y",
                                            }
                                )

        for greek, found in greek_trans:

            # greek, transliteration, and expected transliteration;
            # leading/trailing and doubled white spaces are removed first.
            grec = normalize(greek)
            triplet = (grec,
                       normalize(found),
                       DSTRING_Y(grec).get_transliteration())

            if triplet[1] == triplet[2]:
                self.good_trans.append(triplet)
            else:
                self.bad_trans.append(triplet)
Пример #2
0
    def create_new_helpcharacters_file(self):
        """
                HelpCharacterFile.create_new_helpcharacters_file

                Write HelpCharacterFile.filename from scratch : a commented
                header, then, for each language known to dchars, one
                '<milestone>xxx.text' section followed by one
                '<milestone>xxx.trans.yyy' section per transliteration method.
        """
        ruler = "#" * 80

        #---------------------------------------------------------------
        # header (one item per line of the file) :
        #---------------------------------------------------------------
        header = (
            ruler,
            "#",
            "# help_characters.data file",
            "#",
            "#   file's format :",
            "#   o line beginning with # are comments",
            "#   o section names follow the format @002 (see the code)",
            "#   o a section begins with '***', followed by the ISO-639-3",
            "#     language's name, followed by '.', followed by either",
            "#     'text' either 'trans.yyy', 'yyy' being the name of the",
            "#     transliteration's method.",
            "#   o 'xxx.text' stands for the unicode symbols used by the",
            "#      language 'xxx'",
            "#   o 'xxx.trans' stands for the transliterated symbols used",
            "#     by the language 'xxx'",
            "#   o a section can contain several lines, separated by a carriage",
            "#   o return.",
            "#   o empty lines or lines made of spaces are not read.",
            "#",
            ruler,
        )

        with open(HelpCharacterFile.filename, 'w') as dest:

            for header_line in header:
                dest.write(header_line + "\n")

            #---------------------------------------------------------------
            # characters relative to each language :
            #---------------------------------------------------------------
            names = dchars.languages_name

            for language_name in names.LANGUAGES_AND_TRANSLITERATIONS:

                iso_639_3_name = names.ISO_639_3_NAME[language_name]
                DSTRING = new_dstring(iso_639_3_name)

                # title of the language, surrounded by two empty lines :
                dest.write("\n# {0}({1})\n\n".format(language_name, iso_639_3_name))

                # section with the characters of the writing system :
                dest.write("{0}{1}.text\n".format(HelpCharacterFile.milestone,
                                                  iso_639_3_name))
                dest.write(str(DSTRING().get_usefull_combinations()) + "\n")

                # one section per transliteration method :
                # NOTE(review): <trans> only appears in the section's title;
                # DSTRING is created without a transliteration method, so
                # every section of a language presumably gets the same
                # content - confirm this is intended.
                for trans in names.LANGUAGES_AND_TRANSLITERATIONS[language_name]:

                    dest.write("{0}{1}.trans.{2}\n".format(HelpCharacterFile.milestone,
                                                           iso_639_3_name,
                                                           trans))
                    dest.write(DSTRING().get_usefull_transl_combinations() + "\n")
Пример #3
0
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with DChars.  If not, see <http://www.gnu.org/licenses/>.
################################################################################
"""
    ❏DChars❏ : dchars/tests/languages/grc/grc_tests.py
"""

import unittest, os.path

from dchars.errors.errors import DCharsError
from dchars.dchars import new_dstring
from dchars.symbols.symbols import UNKNOWN_CHAR_SYMBOL

# DString type for the tests, option "anonymize the unknown characters"
# set to "no" :
DSTRING_GRC = new_dstring(
    language="Ἑλληνικὴ γλῶττα",
    options={"anonymize the unknown characters": "no"},
)

# same language, option "anonymize the unknown characters" set to "yes" :
DSTRING_GRC__UNKNOWNCHAR = new_dstring(
    language="Ἑλληνικὴ γλῶττα",
    options={"anonymize the unknown characters": "yes"},
)


# pylint: disable=R0904
# ("Too many public methods")
# Since these classes are derived from unittest.TestCase, there are a lot of
# methods in the following class(es).

################################################################################
class TESTSDStringGRC(unittest.TestCase):
    """
        class TESTSDStringGRC

        We test  dchars.languages.grc.dchars::DStringGRC
Пример #4
0
    (2) information about the buffers

    (3) we write the buffers

    ############################################################################
"""

import pickle
import os.path
from dchars.dchars import new_dstring
import dchars.languages.bod.buffer as buffer
import dchars.languages.bod.transliterations.ewts.ewts_buffer as ewts_buffer
# DString type used to fill the buffers : the buffers are not looked up
# ("look up in the buffers" : 'no') but they are filled
# ("fill the buffers" : 'yes').
DSTRING_BOD_BUFF = new_dstring(
    language="བོད་ཡིག",
    transliteration_method='ewts',
    options={
        "expected structure": "Tibetan or Sanskrit",
        "look up in the buffers": 'no',
        "fill the buffers": 'yes',
    },
)

#...............................................................................
# (1.1) reading a list of EWTS/unicode words
# We use a list of EWTS/unicode words in order to read ewts and unicode strings:
#...............................................................................
for bod, ewts in (
        ("ཀ"    , 'ka'),
        ("ཀྲ"    , 'kra'),
        ("ཀྭ",   'kwa'),
        ("ཀྱ",   'kya'),
        ("རྐ",   'rka'),
        ("ཉ",   'nya'),
        ("རྙ",   'rnya'),
Пример #5
0
def get_usefull_combinations():
    """
            get_usefull_combinations()

            Return a (str)string made of items "<character>{<transliteration>} ",
            one item per character built by this function :

            (1/2) every base character, lower case then upper case, without
                  any diacritic;
            (2/2) the base character 'a' combined with the diacritics
                  (makron, stress, upperdot), lower and upper case.

            NB : this function is not meant to be linguistically exhaustive;
                 it only gathers the most common and/or useful characters
                 of the writing system.

            NB : function required by the dchars-fe project.
    """
    dstring = new_dstring( 'ang' )()

    def item(dchar):
        """ Return the string "<dchar>{<transliteration of dchar>} ". """
        txt = dchar__get_translit_str(dstring_object = dstring,
                                      dchar = dchar)
        return str(dchar) + "{" + txt + "} "

    # The base characters are listed here (and not read from symbols.py)
    # in order to keep them in a meaningful order.
    base_characters  = ( 'a', 'æ', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
                         'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p',
                         'q', 'r', 's', 't', 'þ', 'ð', 'u', 'v',
                         'w', 'x', 'y', 'z', )

    #-----------------------------------------------------------------------
    # (1/2) simple characters : lower case, then upper case.
    #-----------------------------------------------------------------------
    res = [ item( DCharacterANG( dstring_object = dstring,
                                 base_char = letter,
                                 punctuation = False,
                                 capital_letter = is_capital,
                                 makron = False,
                                 stress = 0,
                                 upperdot = False ) )
            for letter in base_characters
            for is_capital in (False, True) ]

    #-----------------------------------------------------------------------
    # (2/2) complex characters
    #-----------------------------------------------------------------------
    vowels = ('a', 'e', 'i', 'o', 'u')

    # axes of the product, in this order :
    #   base_char, capital_letter, makron, length, stress, upperdot
    #
    # NOTE(review): <length> is never given to DCharacterANG, hence every
    # other combination is added once per value of <length> (three times).
    # Confirm whether this axis is still wanted.
    combinations = itertools.product( ('a',),
                                      (False, True),
                                      (False, True),
                                      ( None, "short", "long",),
                                      (-1, 0, 1, 2),
                                      (False, True) )

    for base_char, capital_letter, makron, length, stress, upperdot in combinations:

        # no diacritic on a consonant (never the case as long as the only
        # base character of the product is 'a') :
        if base_char not in vowels and \
           (length is not None or stress != 0 or upperdot):
            continue

        res.append( item( DCharacterANG( dstring_object = dstring,
                                         base_char = base_char,
                                         punctuation = False,
                                         capital_letter = capital_letter,
                                         makron = makron,
                                         stress = stress,
                                         upperdot = upperdot ) ) )

    return "".join(res)
Пример #6
0
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with DChars.  If not, see <http://www.gnu.org/licenses/>.
################################################################################
"""
    ❏DChars❏ : dchars/tests/languages/grc/transliterations/betacode_tests.py
"""

import unittest, os.path

from dchars.dchars import new_dstring
from dchars.symbols.symbols import UNKNOWN_CHAR_SYMBOL

# DString type using the "betacode" transliteration method, option
# "anonymize the unknown characters" set to 'no' :
DSTRING_GRC = new_dstring(
    language="Ἑλληνικὴ γλῶττα",
    transliteration_method="betacode",
    options={"anonymize the unknown characters": 'no'},
)

# same language and method, option "anonymize the unknown characters"
# set to 'yes' :
DSTRING_GRC__UNKNOWNCHAR = new_dstring(
    language="Ἑλληνικὴ γλῶττα",
    transliteration_method="betacode",
    options={"anonymize the unknown characters": 'yes'},
)


LIST_OF_RECIPROCAL_EXAMPLES = (
    ("",        ''),
    ("ά",       'A/'),
    ("ἁ",       'A('),
    ("ἅ",       "A(/"),
    ("ἆ",       "A)="),
    ("ᾇ",       "A(=|"),
Пример #7
0
"""
    ❏DChars❏ : dchars/tests/languages/grc/transliterations/gutenberg_tests.py
"""

import unittest

from dchars.dchars import new_dstring
from dchars.symbols.symbols import UNKNOWN_CHAR_SYMBOL

# DString type using the "gutenberg" transliteration method : every
# "gutenberg:ignore ..." option is set to "yes" and the transliteration
# for upsilon is "u".
DSTRING_GRC = new_dstring(language="Ἑλληνικὴ γλῶττα",
                          transliteration_method="gutenberg",
                          options={"anonymize the unknown characters": "no",
                                   "gutenberg:ignore smooth breathing": "yes",
                                   "gutenberg:ignore accents": "yes",
                                   "gutenberg:ignore iota subscript": "yes",
                                   "gutenberg:ignore diaeresis": "yes",
                                   "gutenberg:transliteration for upsilon": "u",
                                   "gutenberg:ignore makron and brakhu": "yes"})

DSTRING_GRC__FULL = new_dstring(
    language="Ἑλληνικὴ γλῶττα",
    transliteration_method="gutenberg",
    options={
        "anonymize the unknown characters": "no",
        "gutenberg:ignore smooth breathing": "no",
        "gutenberg:ignore accents": "no",
        "gutenberg:ignore iota subscript": "no",
Пример #8
0
from dchars.dchars import new_dstring
# Scratch script : a DString is built for the language 'fro', then the
# string itself and its transliteration are printed.
DSTRING_FRO = new_dstring(language='fro')

# Former experiments, currently disabled; each one read a transliteration
# with DSTRING_FRO().init_from_transliteration(...) and printed the result
# the same way as below. Transliterations tried :
#   "abc", "a\\", "a/", "a+:", "a/\\", "c+c"

string = DSTRING_FRO("ç")

# print() applies str() to its argument :
print(string)
print(string.get_transliteration())
Пример #9
0
def get_usefull_combinations():
    """
            get_usefull_combinations()

            Return a (str)string made of items "<character>{<transliteration>} ",
            one item per character built by this function :

            (1/2) the independent vowels, then the consonants (and the
                  visarga sign), without any dependent vowel;
            (2/2) the base character 'KA' alone and followed by each
                  dependent vowel.

            NB : this function is not meant to be linguistically exhaustive;
                 it only gathers the most common and/or useful characters
                 of the writing system.

            NB : function required by the dchars-fe project.
    """
    SAN = new_dstring( 'san' )
    dstring = SAN()

    def item(base_char, is_an_independent_vowel, dependentvowel, punctuation):
        """
                Build the DCharacterSAN described by the arguments (every
                other attribute being either None or False) and return
                the string "<dchar>{<transliteration of dchar>} ".
        """
        dchar = DCharacterSAN( dstring_object = dstring,
                               base_char = base_char,
                               accent = None,
                               punctuation = punctuation,
                               nukta = False,
                               anusvara_candrabindu = None,
                               virama = False,
                               anudatta = False,
                               is_an_independent_vowel = is_an_independent_vowel,
                               dependentvowel = dependentvowel,
                               )

        txt = dchar__get_translit_str(dstring_object = dstring,
                                      prev_dchar = None,
                                      dchar = dchar)

        return str(dchar) + "{" + txt + "} "

    # The base characters are listed here (and not read from symbols.py)
    # in order to keep them in a meaningful order.
    base_characters__vowels = (
                                  'A',
                                  'AA',
                                  'I',
                                  'II',
                                  'U',
                                  'UU',
                                  'VOCALIC R',
                                  'VOCALIC RR',
                                  'VOCALIC L',
                                  'VOCALIC LL',
                                  'SHORT E',
                                  'E',
                                  'SHORT O',
                                  'O',
                                  'AI',
                                  'AU',
                                )

    base_characters  = ( 'KA',
                         'KHA',
                         'GA',
                         'GHA',
                         'NGA',
                         'CA',
                         'CHA',
                         'JA',
                         'JHA',
                         'NYA',
                         'TTA',
                         'TTHA',
                         'DDA',
                         'DDHA',
                         'NNA',
                         'TA',
                         'THA',
                         'DA',
                         'DHA',
                         'NA',
                         'PA',
                         'PHA',
                         'BA',
                         'BHA',
                         'MA',
                         'YA',
                         'RA',
                         'LA',
                         'LLA',
                         'VA',
                         'SHA',
                         'SSA',
                         'SA',
                         'HA',
                         'DEVANAGARI SIGN VISARGA', )

    #-----------------------------------------------------------------------
    # (1/2) simple characters : independent vowels, then the other base
    #       characters.
    #-----------------------------------------------------------------------
    res = [ item(vowel, True, None, False) for vowel in base_characters__vowels ]
    res.extend( item(consonant, False, None, False) for consonant in base_characters )

    #-----------------------------------------------------------------------
    # (2/2) complex characters
    #-----------------------------------------------------------------------
    # Only two axes are combined : the base character and the dependent
    # vowel. The former axes anusvara_candrabindu, virama and anudatta are
    # disabled, as well as the dependent vowels 'CANDRA E', 'SHORT E',
    # 'CANDRA O' and 'SHORT O'.
    dependent_vowels = ( None,
                         'AA',
                         'I',
                         'II',
                         'U',
                         'UU',
                         'VOCALIC R',
                         'VOCALIC RR',
                         'E',
                         'AI',
                         'O',
                         'AU',
                         'VOCALIC L',
                         'VOCALIC LL',
                       )

    for base_char, dependentvowel in itertools.product( ('KA',), dependent_vowels ):

        # no dependent vowel after the visarga sign (never the case as long
        # as the only base character of the product is 'KA') :
        if base_char == 'DEVANAGARI SIGN VISARGA' and dependentvowel is not None:
            continue

        # NOTE(review): <punctuation> is None here but False for the simple
        # characters above - confirm whether the difference is intended.
        res.append( item(base_char, False, dependentvowel, None) )

    return "".join(res)
Пример #10
0
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with DChars.  If not, see <http://www.gnu.org/licenses/>.
################################################################################
"""
    ❏DChars❏ : dchars/tests/languages/ang/transliterations/basic_tests.py
"""

import unittest

from dchars.dchars import new_dstring
from dchars.languages.bod.dcharacter import UNKNOWN_CHAR_SYMBOL
# DString type using the "basic" transliteration method, option
# "anonymize the unknown characters" set to "yes" :
DSTRING_ANG = new_dstring(
    language="Ænglisc",
    transliteration_method="basic",
    options={"anonymize the unknown characters": "yes"},
)

LIST_OF_RECIPROCAL_EXAMPLES = (
    ('',        ''      ),
    ('a',       'a'     ),
    ('p',       'p'     ),
    ("Q",       'Q'     ),
    ("ō",       'o_'    ),
    ("Quōēre,", 'Quo_e_re,'),
    ("N",       'N'     ),
    (" ",       ' '     ),
    ("è",       'e\\'   ),
    ("a",       'a'     ),
    ("A",       'A'     ),
Пример #11
0
from dchars.dchars import new_dstring
# Scratch script : a DString is read from a transliteration, then printed
# (string and transliteration) before and after its first character has
# been added at its end.
DSTRING_ANG = new_dstring(language='ang')

# Former experiment, currently disabled : the same steps with the
# transliteration "a*_" instead of "a_*".

string = DSTRING_ANG().init_from_transliteration("a_*")

# print() applies str() to its argument :
print(string)
print(string.get_transliteration())

string += string[0]

print(string)
print(string.get_transliteration())
Пример #12
0
# B) creation of the DSTRING object
# C) processing
#    C.1) data reading
#    C.2) unicode strings become DString objects
#    C.3) sort
#    C.4) modifications
#    C.5) output
#
#*******************************************************************************

# A) arguments of the command line
ARGS = get_arguments()

# B) creation of the DSTRING object : the language and the sorting method
#    come from the command line.
DSTRING = new_dstring(
    language=ARGS.language,
    options={
        "look up in the buffers": 'yes',
        "sorting method": ARGS.sorting_method,
    },
)

# C) processing
if ARGS.source is not None:

    if ARGS.modifications is None:
        # normal case :

        # C.1) data reading
        DATA = []

        # we read the source file(s) :
        for filename in ARGS.source:

            with open(filename, 'r') as src:
Пример #13
0
#
#    You should have received a copy of the GNU General Public License
#    along with DChars.  If not, see <http://www.gnu.org/licenses/>.
################################################################################
"""
    ❏DChars❏ : dchars/tests/languages/jpn/jpn_tests.py
"""

import unittest

from dchars.errors.errors import DCharsError
from dchars.dchars import new_dstring
from dchars.symbols.symbols import UNKNOWN_CHAR_SYMBOL

# DString type for the tests, option "anonymize the unknown characters"
# set to 'no' :
DSTRING_JPN = new_dstring(
    language="日本語",
    options={"anonymize the unknown characters": 'no'},
)

# same language, option "anonymize the unknown characters" set to 'yes' :
DSTRING_JPN__UNKNOWNCHAR = new_dstring(
    language="日本語",
    options={"anonymize the unknown characters": 'yes'},
)


# pylint: disable=R0904
# ("Too many public methods")
# Since these classes are derived from unittest.TestCase, there are a lot of
# methods in the following class(es).

################################################################################
class TESTSDStringJPN(unittest.TestCase):
    """
Пример #14
0
def get_usefull_combinations():
    """
            get_usefull_combinations()

            Return a (str)string made of items "<character>{<transliteration>} ",
            one item per character built by this function :

            (1/2) every base character, without any vowel;
            (2/2) the base character 'K' alone and followed by each vowel.

            NB : this function is not meant to be linguistically exhaustive;
                 it only gathers the most common and/or useful characters
                 of the writing system.

            NB : function required by the dchars-fe project.
    """
    dstring = new_dstring( 'bod' )()

    def item(base_char, vowel1):
        """
                Build the DCharacterBOD made of <base_char> and <vowel1>,
                add it to <dstring> and return the string
                "<dchar>{<transliteration of dstring>} ".

                NOTE(review): <dstring> is shared by all the calls and is
                never emptied, so the transliteration is computed on all
                the characters added so far - confirm this is intended.
        """
        dchar = DCharacterBOD( dstring_object = dstring,
                               base_char = base_char,
                               subj_consonants = None,
                               rnam_bcad = False,
                               punctuation = False,
                               halanta = False,
                               anusvara_candrabindu = None,
                               vowel1 = vowel1,
                               vowel2 = None )

        dstring.append(dchar)
        dstring.update_istructs()

        txt = dstring__get_translit_str(dstring = dstring)

        return str(dchar) + "{" + txt + "} "

    # The base characters are listed here (and not read from symbols.py)
    # in order to keep them in a meaningful order.
    base_characters  = ('K',
                        'KH',
                        'G',
                        'GH',
                        'NG',
                        'C',
                        'CH',
                        'J',
                        'NY',
                        'TT',
                        'TTH',
                        'DD',
                        'DDH',
                        'NN',
                        'T',
                        'TH',
                        'D',
                        'DH',
                        'N',
                        'P',
                        'PH',
                        'B',
                        'BH',
                        'M',
                        'TS',
                        'TSH',
                        'DZ',
                        'DZH',
                        'W',
                        'ZH',
                        'Z',
                        '-',
                        'Y',
                        'R',
                        'L',
                        'SH',
                        'SS',
                        'S',
                        'H',
                        'KSS',

                        'A',
                       )

    vowels = ( None,
               'AA',
               'I',
               'II',
               'U',
               'UU',
               'VOCALIC R',
               'VOCALIC RR',
               'VOCALIC L',
               'VOCALIC LL',
               'E',
               'AI',
               'O',
               'AU',
             )

    #-----------------------------------------------------------------------
    # (1/2) simple characters
    #-----------------------------------------------------------------------
    res = [ item(consonant, None) for consonant in base_characters ]

    #-----------------------------------------------------------------------
    # (2/2) complex characters : (base character, vowel)
    #-----------------------------------------------------------------------
    res.extend( item(base_char, vowel)
                for base_char, vowel in itertools.product( ('K',), vowels ) )

    return "".join(res)
Пример #15
0
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with DChars.  If not, see <http://www.gnu.org/licenses/>.
################################################################################
"""
    ❏DChars❏ : dchars/tests/languages/hbo/hbo_tests.py
"""

import unittest, os.path

from dchars.errors.errors import DCharsError
from dchars.dchars import new_dstring
from dchars.symbols.symbols import UNKNOWN_CHAR_SYMBOL

# DString type for the tests, option "anonymize the unknown characters"
# set to "no" :
DSTRING_HBO = new_dstring(
    language="עִבְֿרִיתֿ מִקְרָאִיתֿ",
    options={"anonymize the unknown characters": "no"},
)

# same language, option "anonymize the unknown characters" set to "yes" :
DSTRING_HBO__UNKNOWNCHAR = new_dstring(
    language="עִבְֿרִיתֿ מִקְרָאִיתֿ",
    options={"anonymize the unknown characters": "yes"},
)

# pylint: disable=R0904
# ("Too many public methods")
# Since these classes are derived from unittest.TestCase, there are a lot of
# methods in the following class(es).
################################################################################
class TESTSDStringHBO(unittest.TestCase):
    """
        class TESTSDStringHBO

        We test dchars.languages.hbo.dchars::DStringHBO
    """
Пример #16
0
def get_intstruct_from_trans_str( _src, dstring_object ):
    """
        function get_intstruct_from_trans_str()

        _src            : (str) transliterated string like "क".
        dstring_object  : object whose .options dict is read (keys
                          "anonymize the unknown characters" and
                          "san2bod quality"); it is also given to every
                          InternalStructure created here.

        <_src> is read through a Sanskrit DString (transliteration method
        "iso15919"). Every Sanskrit character either creates a new
        InternalStructure or completes the last one (subjoined consonant,
        dependent vowel, visarga, anusvara/candrabindu).

        Effect of dstring_object.options["san2bod quality"] :

            "normal" and "low" :
                o the visarga is omitted
                o (main consonant) VA > BA
                o (independent and dependent vowel) O > AU

            "low" only :
                o (independent and dependent vowel) AA, II, UU > A, I, U
                o (main consonant) retroflex consonant > non-retroflex
                  consonant without aspiration
                o (main consonant) consonant + aspiration > consonant
                  without aspiration
                o a first subjoined consonant equal to the main consonant
                  is dropped

            any other value : no simplification at all.

        A 'MARK INTERSYLLABIC TSHEG' structure is added after every structure
        whose .consonant is not None.

        Return a ListOfInternalStructures object.
    """
    anonymize = dstring_object.options["anonymize the unknown characters"] == 'yes'

    # @@BOD2SAN-LOW-005 / @@BOD2SAN-LOW-006 : long vowels > short vowels
    low__long_to_short_vowel = {'AA' : 'A',
                                'II' : 'I',
                                'UU' : 'U'}

    # @@BOD2SAN-LOW-007 : retroflex consonant > non-retroflex consonant
    #                     (the aspiration, if any, is lost too)
    low__retroflex_to_plain = {'TTA'   : "TA",
                               'TTHA'  : "TA",
                               'DDA'   : "DA",
                               'DDHA'  : "DA",
                               'NNA'   : "NA"}

    # @@BOD2SAN-LOW-008 : consonant + aspiration > consonant without aspiration
    # 'TTHA' and 'DDHA' do not appear here : they are caught by
    # @@BOD2SAN-LOW-007, which is tested first.
    low__aspirated_to_plain = {'KHA'   : "KA",
                               'GHA'   : "GA",
                               'THA'   : "TA",
                               'CHA'   : "CA",
                               'JHA'   : "JA",
                               'DHA'   : "DA",
                               'PHA'   : "PA",
                               'BHA'   : "BA"}

    # list of InternalStructure objects.
    istructs = ListOfInternalStructures(anonymize_the_unknown_chars = anonymize)

    # we read <_src> through a DSTRING_SAN object :
    dstring_san = new_dstring(language='संस्कृतम्',
                              transliteration_method="iso15919")
    dstring_san = dstring_san(_src)

    # In Sanskrit, if a consonant is followed by a virama, it means that the following
    # consonants are part of a cluster of consonants.
    #
    # E.g. in कर्म (0915=ka, 0930=ra, 094D=virama, 092E=ma) we have something like kar+ma,
    # the -m- having no vowel.
    #
    place_consonant_among_subjc = False

    for dchar_san in dstring_san:

        if dchar_san.unknown_char:
            new_istruct = InternalStructure( dstring_object = dstring_object,
                                             unknown_character = True )
            istructs.append(new_istruct)

        else:

            # punctuation symbol :
            if dchar_san.base_char in SAN__SYMB_PUNCTUATION:
                unicode_symb = SAN__SYMB_PUNCTUATION.get_default_symbol(dchar_san.base_char)
                new_istruct = InternalStructure( dstring_object = dstring_object,
                                                 punctuation_or_other_symbol = \
                                                   PUNCTUATION_INVERSED[unicode_symb] )
                istructs.append(new_istruct)

                place_consonant_among_subjc = False

            # other symbol :
            elif dchar_san.base_char in SAN__SYMB_OTHER_SYMBOLS:
                unicode_symb = SAN__SYMB_OTHER_SYMBOLS.get_default_symbol(dchar_san.base_char)
                new_istruct = InternalStructure( dstring_object = dstring_object,
                                                 punctuation_or_other_symbol = \
                                                   OTHER_SYMBOLS_INVERSED[unicode_symb] )
                istructs.append(new_istruct)

                place_consonant_among_subjc = False

            # independent vowel:
            elif dchar_san.base_char in SAN__SYMB_INDEPENDENT_VOWELS:
                quality = dstring_object.options["san2bod quality"]

                #...............................................................
                # _independent_vowel will be added as an independent vowel :
                #...............................................................
                if quality in ("normal", "low") and dchar_san.base_char == 'O':
                    #====================
                    # @@BOD2SAN-NORM-004
                    # @@BOD2SAN-LOW-004
                    # (independent vowel) ओ(ō) > औ(au)
                    #====================
                    _independent_vowel = "AU"

                elif quality == "low" and dchar_san.base_char in low__long_to_short_vowel:
                    #====================
                    # @@BOD2SAN-LOW-006
                    # (independent vowel) long vowels > short vowels
                    #====================
                    _independent_vowel = low__long_to_short_vowel[dchar_san.base_char]

                else:
                    _independent_vowel = dchar_san.base_char

                unicode_symb = SAN__SYMB_INDEPENDENT_VOWELS.get_default_symbol(_independent_vowel)
                # the independent vowel is carried by the consonant "A" :
                new_istruct = InternalStructure( dstring_object = dstring_object,
                                                 consonant = "A",
                                                 vowel1 = INDEPENDENT_VOWELS_INVERSED[unicode_symb])
                istructs.append(new_istruct)

                place_consonant_among_subjc = False

            # consonant :
            elif dchar_san.base_char in SAN__SYMB_CONSONANTS:
                quality = dstring_object.options["san2bod quality"]

                if dchar_san.base_char == 'DEVANAGARI SIGN VISARGA':
                    # special case : the visarga symbol is placed among consonants in Sanskrit,
                    # among diacritics in Tibetan.
                    #
                    #====================
                    # @@BOD2SAN-NORM-001
                    # @@BOD2SAN-LOW-001
                    # the visarga is omitted if "san2bod quality" is "normal" or "low"
                    #====================
                    if quality not in ("normal", "low"):
                        # NOTE(review): istructs[-1] assumes that something has
                        # already been read before the visarga - confirm that a
                        # leading visarga cannot reach this line.
                        istructs[-1].rnam_bcad = True

                        place_consonant_among_subjc = False

                elif not place_consonant_among_subjc:
                    # consonant to be placed as a main consonant
                    # (and not among subjoined consonants) :

                    #...........................................................
                    # _base_char will be added as a main consonant :
                    #...........................................................
                    if quality in ("normal", "low") and dchar_san.base_char == 'VA':
                        #====================
                        # @@BOD2SAN-NORM-002
                        # @@BOD2SAN-LOW-002
                        # the व(va) becomes ब(ba) if "san2bod quality" is
                        # "normal" or "low"
                        #====================
                        _base_char = "BA"

                    elif quality == "low" and dchar_san.base_char in low__retroflex_to_plain:
                        #===================
                        # @@BOD2SAN-LOW-007
                        # retroflex consonant > non-retroflex consonant
                        # retroflex consonant + aspiration > non-retroflex
                        # consonant without aspiration
                        #===================
                        _base_char = low__retroflex_to_plain[dchar_san.base_char]

                    elif quality == "low" and dchar_san.base_char in low__aspirated_to_plain:
                        #===================
                        # @@BOD2SAN-LOW-008
                        # consonant + aspiration > consonant without aspiration
                        #===================
                        _base_char = low__aspirated_to_plain[dchar_san.base_char]

                    else:
                        # general case :
                        _base_char = dchar_san.base_char

                    unicode_symb = SAN__SYMB_CONSONANTS.get_default_symbol(_base_char)
                    bod_consonant = CONSONANTS_INVERSED[unicode_symb]

                    new_istruct = InternalStructure( dstring_object = dstring_object,
                                                     consonant = bod_consonant )
                    istructs.append(new_istruct)

                    # a virama announces a cluster : the next consonant(s) will
                    # be subjoined to this one.
                    if dchar_san.virama:
                        place_consonant_among_subjc = True

                else:
                    # consonant to be placed among subjoined consonants
                    # (and not as a main consonant) :
                    if istructs[-1].subfix is None:
                        istructs[-1].subfix = []

                    unicode_symb = SAN__SYMB_CONSONANTS.get_default_symbol(dchar_san.base_char)
                    cons = CONSONANTS_INVERSED[unicode_symb]

                    if quality == "low" and \
                       istructs[-1].subfix == [] and \
                       istructs[-1].consonant == cons:
                        #===================
                        # @@BOD2SAN-LOW-008
                        # geminate consonant > 0
                        #===================
                        # no more subjoined consonant : the other ones will be treated
                        # like main consonants :
                        place_consonant_among_subjc = False

                    else:
                        istructs[-1].subfix.append( cons )

                        # without virama, the cluster ends with this consonant :
                        if not dchar_san.virama:
                            place_consonant_among_subjc = False

                # dependent vowel :
                if dchar_san.dependentvowel is not None and dchar_san.dependentvowel != "A":

                    #...........................................................
                    # _dependent_vowel will be added as a dependent vowel :
                    #...........................................................
                    if quality in ("normal", "low") and dchar_san.dependentvowel == 'O':
                        #====================
                        # @@BOD2SAN-NORM-003
                        # @@BOD2SAN-LOW-003
                        # (dependent vowel) ओ(ō) > औ(au)
                        #====================
                        _dependent_vowel = "AU"

                    elif quality == "low" and \
                         dchar_san.dependentvowel in low__long_to_short_vowel:
                        #====================
                        # @@BOD2SAN-LOW-005
                        # (dependent vowel) long vowels > short vowels
                        #====================
                        _dependent_vowel = low__long_to_short_vowel[dchar_san.dependentvowel]

                    else:
                        _dependent_vowel = dchar_san.dependentvowel

                    unicode_symb = \
                      SAN__SYMB_DEPENDENT_VOWELS.get_default_symbol(_dependent_vowel)

                    istructs[-1].vowel1 = DEPENDENT_VOWELS_INVERSED[unicode_symb]

            # anusvara/candrabindu :
            if dchar_san.anusvara_candrabindu is not None:
                unicode_symb = \
                  SAN__SYMB_DIACRITICS.get_default_symbol(dchar_san.anusvara_candrabindu)

                istructs[-1].anusvara_candrabindu = DIACRITICS_INVERSED[unicode_symb]

    res = ListOfInternalStructures(anonymize_the_unknown_chars = anonymize)

    # we add a tsheg after a "real" syllable (id est, not a punctuation sign, ...)
    for istruct in istructs:
        res.append(istruct)

        if istruct.consonant is not None:
            res.append( InternalStructure(
                dstring_object = dstring_object,
                punctuation_or_other_symbol = 'MARK INTERSYLLABIC TSHEG' ))

    return res
Пример #17
0
#
#    You should have received a copy of the GNU General Public License
#    along with DChars.  If not, see <http://www.gnu.org/licenses/>.
################################################################################
"""
    ❏DChars❏ : dchars/tests/languages/jpn/transliterations/shepburn_tests.py
"""

import unittest

from dchars.dchars import new_dstring
from dchars.languages.bod.dcharacter import UNKNOWN_CHAR_SYMBOL
# Object returned by new_dstring() for the language "jpn" and the
# transliteration method "shepburn"; every option is set explicitly : unknown
# characters are anonymized and katakanas are written with upper case
# letters, whereas the circumflex for long vowels and the "ou becomes ō"
# rule are turned off.
DSTRING_JPN = new_dstring(language="jpn",
                          transliteration_method="shepburn",
                          options = {"anonymize the unknown characters" : "yes",
                                     "long vowels written with circumflex" : "no",
                                     "katakanas written with upper case letters" : "yes",
                                     "ou becomes ō" : "no",
                                     },
                         )

LIST_OF_RECIPROCAL_EXAMPLES = (
    ('',                ''    ),
    ('か',              'ka'  ),
    ('きゃ',            'kya'),
    ('びゅ',            'byu'),
    ('じゃあく',        'jaaku'),
    ('おねえさん',      'oneesan'),
    ('ゑ',              'we'),
    ('あんない',        'annai'),
    ('ぐんま',          'gunma'),
    ('しんよう',        "shin'you"),
Пример #18
0
def get_usefull_combinations():
    """
            get_usefull_combinations()

            Return a (str)string with all the useful combinations of characters,
            i.e. only the 'interesting' characters (not punctuation if it's too simple
            by example).

            Every combination appears once in the result, written as the
            character followed by its transliteration between braces and
            by a space.

            NB : this function has nothing to do with linguistic or a strict
                 approach of the language. This function allows only to get the
                 most common and/or useful characters of the writing system.

            NB : function required by the dchars-fe project.
    """
    res = []

    HBO = new_dstring( 'hbo' )
    dstring = HBO()

    def add_to_res(dchar):
        """
                Append to <res> the character <dchar> and its transliteration.
        """
        txt = dchar__get_translit_str(dstring_object = dstring,
                                      dchar = dchar)

        res.append( str(dchar) + "{" + txt + "} " )

    # base_char : we don't use the list stored in symbols.py
    # since we would lose the character's order.
    base_characters = ( 'א',
                        'ב',
                        'ג',
                        'ד',
                        'ה',
                        'ו',
                        'ז',
                        'ח',
                        'ט',
                        'י',
                        'כ',
                        'ל',
                        'מ',
                        'נ',
                        'ס',
                        'ע',
                        'פ',
                        'צ',
                        'ק',
                        'ר',
                        'ש',
                        'ת' )

    #-----------------------------------------------------------------------
    # (1/2) simple characters
    #-----------------------------------------------------------------------
    # One entry per base character, without any point : the shin/sin dots
    # are shown in the second part.
    for base_char in base_characters:
        add_to_res( DCharacterHBO( dstring_object = dstring,
                                   base_char = base_char,
                                   contextual_form = None,
                                   shin_sin_dot = None,
                                   daghesh_mapiq = False,
                                   methegh = False,
                                   specialpoint = None,
                                   vowel = None,
                                   raphe = False,
                                   cantillation_mark = None ) )

    #-----------------------------------------------------------------------
    # (2/2) complex characters
    #-----------------------------------------------------------------------
    # This part is outside the loop over <base_characters> : these
    # combinations do not depend on it and must be added only once.

    #.......................................................................
    # ב with each vowel :
    for vowel in (None,
                  "HEBREW POINT SHEVA",
                  "HEBREW POINT HATAF SEGOL",
                  "HEBREW POINT HATAF PATAH",
                  "HEBREW POINT HATAF QAMATS",
                  "HEBREW POINT HIRIQ",
                  "HEBREW POINT TSERE",
                  "HEBREW POINT SEGOL",
                  "HEBREW POINT PATAH",
                  "HEBREW POINT QAMATS",
                  "HEBREW POINT HOLAM",
                  "HEBREW POINT HOLAM HASER FOR VAV",
                  "HEBREW POINT QUBUTS",
                  "HEBREW POINT QAMATS QATAN"):

        add_to_res( DCharacterHBO( dstring_object = dstring,
                                   base_char = 'ב',
                                   contextual_form = "initial+medium+final",
                                   shin_sin_dot = None,
                                   daghesh_mapiq = False,
                                   methegh = False,
                                   specialpoint = None,
                                   vowel = vowel,
                                   raphe = None,
                                   cantillation_mark = None, ) )

    #.......................................................................
    # ש without dot, with the shin dot, with the sin dot :
    for shin_sin_dot in (None, "HEBREW POINT SHIN DOT", "HEBREW POINT SIN DOT"):

        add_to_res( DCharacterHBO( dstring_object = dstring,
                                   base_char = 'ש',
                                   contextual_form = "initial+medium+final",
                                   shin_sin_dot = shin_sin_dot,
                                   daghesh_mapiq = False,
                                   methegh = False,
                                   specialpoint = None,
                                   vowel = None,
                                   raphe = None,
                                   cantillation_mark = None, ) )

    return "".join(res)
Пример #19
0
#
#    You should have received a copy of the GNU General Public License
#    along with DChars.  If not, see <http://www.gnu.org/licenses/>.
################################################################################
"""
    ❏DChars❏ : dchars/tests/languages/bod/transliterations/bodsan_tests.py
"""

import unittest

from dchars.dchars import new_dstring

# Objects returned by new_dstring() with the transliteration method 'bodsan'.
# DSTRING_BOD_HIGH and DSTRING_BOD_NORM share the same language and options,
# except for "san2bod quality" ("high" / "normal").
DSTRING_BOD_HIGH = new_dstring(language="བོད་ཡིག",
                               transliteration_method='bodsan',
                               options = {"expected structure" : "always Sanskrit",
                                          "fill the buffers"       : 'no',
                                          "look up in the buffers" : 'no',
                                          "san2bod quality"        : "high",
                                          },
                          )

DSTRING_BOD_NORM = new_dstring(language="བོད་ཡིག",
                               transliteration_method='bodsan',
                               options = {"expected structure" : "always Sanskrit",
                                          "fill the buffers"       : 'no',
                                          "look up in the buffers" : 'no',
                                          "san2bod quality"        : "normal",
                                          },
                          )

DSTRING_BOD_LOW  = new_dstring(language="བོད་ཡིག",
                               transliteration_method='bodsan',
Пример #20
0
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with DChars.  If not, see <http://www.gnu.org/licenses/>.
################################################################################
"""
    ❏DChars❏ : dchars/tests/languages/san/transliterations/itrans_tests.py
"""

import unittest, os.path, re

from dchars.dchars import new_dstring
from dchars.languages.bod.dcharacter import UNKNOWN_CHAR_SYMBOL

# Objects returned by new_dstring() with the transliteration method "itrans";
# the two differ only by the "anonymize the unknown characters" option :
#   DSTRING_SAN              -> 'no'
#   DSTRING_SAN__UNKNOWNCHAR -> 'yes'
DSTRING_SAN = new_dstring(language='संस्कृतम्',
                          transliteration_method = "itrans",
                          options = {"anonymize the unknown characters" : 'no'},
                          )

DSTRING_SAN__UNKNOWNCHAR = new_dstring(language='संस्कृतम्',
                          transliteration_method = "itrans",
                          options = {"anonymize the unknown characters" : 'yes'},
                          )


LIST_OF_RECIPROCAL_EXAMPLES = (
                            ("",                ''              ),
                            # क(0x0915)
                            ("क",               'ka'            ),
                            # क(0x0915) + virama(0x094D)
                            ("क्",               'k'             ),
                            # क(0x0915) + virama(0x094D) + anusvara (0902)
Пример #21
0
    def test_get_transliteration__upsilon(self):
        """
                TESTSDStringGRC.test_get_transliteration__upsilon

                Check the result of get_transliteration() for each value of
                the option "gutenberg:transliteration for upsilon"
                ("y", "u", "u or y"), the other options being the same.
        """
        # options shared by the three configurations :
        shared_options = {
            "anonymize the unknown characters": "no",
            "gutenberg:ignore smooth breathing": "yes",
            "gutenberg:ignore accents": "yes",
            "gutenberg:ignore iota subscript": "yes",
            "gutenberg:ignore diaeresis": "yes",
        }

        # (value of the upsilon option, ((source string, expected result), ...))
        expectations = (
            # [grc.gutenberg]transliteration for upsilon = y
            ("y", (("πύργον", "pyrgon"),
                   ("αὐτης", "aytês"))),

            # [grc.gutenberg]transliteration for upsilon = u
            ("u", (("πύργον", "purgon"),
                   ("αὐτης", "autês"))),

            # [grc.gutenberg]transliteration for upsilon = u or y
            ("u or y", (("πύργον", "pyrgon"),
                        ("αὐτης", "autês"))),
        )

        for upsilon_option, samples in expectations:
            options = dict(shared_options)
            options["gutenberg:transliteration for upsilon"] = upsilon_option

            dstring_grc__upsilon = new_dstring(
                language="Ἑλληνικὴ γλῶττα",
                transliteration_method="gutenberg",
                options=options,
            )

            for source, expected in samples:
                string = dstring_grc__upsilon(source)
                self.assertEqual(expected, string.get_transliteration())
Пример #22
0
#
#    You should have received a copy of the GNU General Public License
#    along with DChars.  If not, see <http://www.gnu.org/licenses/>.
################################################################################
"""
    ❏DChars❏ : dchars/tests/languages/fro/fro_tests.py
"""

import unittest

from dchars.errors.errors import DCharsError
from dchars.dchars import new_dstring
from dchars.languages.bod.dcharacter import UNKNOWN_CHAR_SYMBOL

# Objects returned by new_dstring() for the language "romanz" :
#   DSTRING_FRO              -> unknown characters not anonymized ("no"),
#                               "sorting method" set to "default"
#   DSTRING_FRO__UNKNOWNCHAR -> unknown characters anonymized ("yes"),
#                               "sorting method" not given here
DSTRING_FRO = new_dstring(language = "romanz",
                          options = {"anonymize the unknown characters" : "no",
                                     "sorting method" : "default"},
                          )
DSTRING_FRO__UNKNOWNCHAR = new_dstring(language = "romanz",
                                       options = {"anonymize the unknown characters" : "yes"},
                                      )

# pylint: disable=R0904
# ("Too many public methods")
# Since these classes are derived from unittest.TestCase, we have a lot of
# methods in the following class(es).
################################################################################
class TESTSDStringFRO(unittest.TestCase):
    """
        class TESTSDStringFRO

        We test dchars.languages.fro.dchars::DStringFRO
Пример #23
0
# Demonstration script : build a Sanskrit string in two parts, print its
# transliteration, then inspect and modify its first character. The "# ->"
# comments give the output announced by the author of the script.

# we import the "new_dstring" object in order to get a DSTRING_SAN object :
from dchars.dchars import new_dstring
DSTRING_SAN = new_dstring(language='san', transliteration_method="iso15919")

# We set the string from a source-string :
# this is the first part of the first verse of the Rig-Veda :
string = DSTRING_SAN("अ॒ग्निमी॑ळे पु॒रोहि॑तं य॒ज्ञस्य॑ दे॒वमृ॒त्विज॑म् ।")

# and this is the second part, added to the first one :
string += DSTRING_SAN("होता॑रं रत्न॒धात॑मम् ॥")

# transliteration :
print(string.get_transliteration())         # -> a̱gnimī́ḷē pu̱rōhítaṁ ya̱jñasyá dē̱vamr̥̱tvijám .hōtā́raṁ ratna̱dhātámam ..

# Let's inspect and modify this string :
print(string[0])                            # -> अ॒
print(string[-1].punctuation)               # -> True
print(string[0].base_char)                  # -> "A"
print(string[1].base_char)                  # -> "DEVANAGARI LETTER GA"
print(string[0].anudatta)                   # -> True
# we replace the anudatta of the first character by an udatta accent :
string[0].anudatta = False
string[0].accent = "DEVANAGARI STRESS SIGN UDATTA"
print(string.get_transliteration())         # -> "ágnimī́ḷē pu̱rōhítaṁ ya̱jñasyá dē̱vamr̥̱tvijám .hōtā́raṁ ratna̱dhātámam .."
Пример #24
0
#         Hypothesis.go_on__ang2() : group the vowels/diphthongs and map them to their index within a word in the
#               source string. Maybe start by splitting the words, then, one level up, split the vowels

# treelib : caesar0301/treelib (https://github.com/caesar0301/treelib)
#   doc : http://hsiamin.com/treelib/

import os
import argparse
from dchars.dchars import new_dstring
from treelib import Node, Tree

NAME_OF_THE_PROJECT = "pyscansion"

# Languages known by the project.
# (str) language name : DSTRING object returned by new_dstring() for this
# language; it is created once, when the module is imported.
LANGUAGES = {
                    'ang' : new_dstring(language='ang'),
                    'lat' : new_dstring(language='lat'),
            }

################################################################################
class Hypothesis(object):
    """
        Hypothesis class

        Hypothesis objects are stored in HTREE.

        +----------+----------+----------------------------------------------------
        | language | function | description
        +----------+----------+----------------------------------------------------
        | (all)    | ang1     | from root hypothesis
        |          |          |
Пример #25
0
def get_usefull_combinations():
    """
            get_usefull_combinations()

            Return a (str)string made of the 'usefull' characters of the
            writing system, each one being written as :

                    <character>{<transliteration>}<space>

            The string is the concatenation of two series :

            (1/2) every base character, lower case then upper case, without
                  any diacritic;
            (2/2) the combinations of contextual form, case, tonos, pneuma,
                  hypogegrammene, dialutika and mekos accepted by the filter
                  <is_usefull>, for the base characters given to
                  itertools.product().

            NB : this function has nothing to do with linguistics or with a
                 strict approach of the language : it only gives the most
                 common and/or usefull characters of the writing system.

            NB : function required by the dchars-fe project.
    """
    dstring = new_dstring( 'grc' )()

    # value of <contextual_form> used for the simple characters and required
    # by most of the filters below :
    all_positions = "initial+medium+final"

    # base_char : the list stored in symbols.py isn't used since the
    # order of the characters would be lost.
    base_characters  = ( 'α', 'β', 'γ', 'δ', 'ε', 'ζ', 'η', 'θ', 'ι',
                         'κ', 'λ', 'μ', 'ν', 'ξ', 'ο', 'π', 'ρ', 'σ',
                         'τ', 'υ', 'φ', 'χ', 'ψ', 'ω',
                         'ϝ', 'ϗ', 'ϡ', 'ϛ', 'ϙ', )

    def formatted(dchar):
        # <dchar> followed by its transliteration between curly brackets
        # and by one space.
        translit = dchar__get_translit_str(dstring_object = dstring,
                                           dchar = dchar)
        return str(dchar) + "{" + translit + "} "

    def is_usefull(base_char, contextual_form, tonos, pneuma,
                   hypogegrammene, dialutika, mekos):
        # True if the combination has to be added to the result; the
        # accepted diacritics depend on the base character.
        whole_form = (contextual_form == all_positions)

        if base_char == 'ρ':
            # only the pneuma may vary :
            return whole_form and tonos is None and \
                   not hypogegrammene and not dialutika and mekos is None

        if base_char in ('β', 'σ'):
            # only the contextual form may vary :
            return tonos is None and pneuma is None and \
                   not hypogegrammene and not dialutika and mekos is None

        if base_char in ('α', 'η', 'ω'):
            return whole_form and not dialutika and mekos is None

        if base_char in ('ε', 'ο'):
            return whole_form and not hypogegrammene and \
                   tonos != "περισπωμένη" and \
                   not dialutika and mekos is None

        if base_char in ('ι', 'υ'):
            return whole_form and not hypogegrammene and mekos is None

        # other characters : no diacritic at all.
        return whole_form and tonos is None and pneuma is None and \
               not hypogegrammene and not dialutika and mekos is None

    #-----------------------------------------------------------------------
    # (1/2) simple characters
    #-----------------------------------------------------------------------
    pieces = [ formatted( DCharacterGRC( dstring_object = dstring,
                                         base_char = base_char,
                                         contextual_form = all_positions,
                                         punctuation = False,
                                         capital_letter = capital_letter,
                                         tonos = None,
                                         pneuma = None,
                                         hypogegrammene = False,
                                         dialutika = False,
                                         mekos = None ) )
               for base_char in base_characters
               for capital_letter in (False, True) ]

    #-----------------------------------------------------------------------
    # (2/2) complex characters
    #-----------------------------------------------------------------------
    complex_combinations = itertools.product(
        ( 'α', ),                                       # base_chars
        ("initial", "medium", "final",
         "initial+medium", "medium+final",
         "initial+medium+final"),                       # contextual_form
        (False, True),                                  # capital_letter
        ( None, "ὀξεῖα", "βαρεῖα", "περισπωμένη" ),     # tonos
        ( None, "ψιλὸν",  "δασὺ" ),                     # pneuma
        (False, True),                                  # hypogegrammene
        (False, True),                                  # dialutika
        ( None, "βραχύ", "μακρόν" ),                    # mekos
        )

    for (base_char, contextual_form, capital_letter,
         tonos, pneuma, hypogegrammene, dialutika, mekos) in complex_combinations:

        if not is_usefull(base_char, contextual_form, tonos, pneuma,
                          hypogegrammene, dialutika, mekos):
            continue

        pieces.append( formatted( DCharacterGRC( dstring_object = dstring,
                                                 base_char = base_char,
                                                 contextual_form = contextual_form,
                                                 punctuation = False,
                                                 capital_letter = capital_letter,
                                                 tonos = tonos,
                                                 pneuma = pneuma,
                                                 hypogegrammene = hypogegrammene,
                                                 dialutika = dialutika,
                                                 mekos = mekos ) ) )

    return "".join(pieces)
Пример #26
0
#
#    You should have received a copy of the GNU General Public License
#    along with DChars.  If not, see <http://www.gnu.org/licenses/>.
################################################################################
"""
    ❏DChars❏ : dchars/tests/languages/lat/lat_tests.py
"""

import unittest, os.path

from dchars.errors.errors import DCharsError
from dchars.dchars import new_dstring
from dchars.symbols.symbols import UNKNOWN_CHAR_SYMBOL

# The two values below are created by new_dstring() for the language "latīna";
# they differ by the "anonymize the unknown characters" option ("no" / "yes").
# The first one also sets explicitly the "sorting method" option to "default".
DSTRING_LAT = new_dstring(language = "latīna",
                          options = {"anonymize the unknown characters" : "no",
                                     "sorting method" : "default"},
                          )
DSTRING_LAT__UNKNOWNCHAR = new_dstring(language = "latīna",
                                       options = {"anonymize the unknown characters" : "yes"},
                                      )

# pylint: disable=R0904
# ("Too many public methods")
# Since these classes are derived from unittest.TestCase we have a lot of
# methods in the following class(es).
################################################################################
class TESTSDStringLAT(unittest.TestCase):
    """
        class TESTSDStringLAT

        We test dchars.languages.lat.dchars::DStringLAT
Пример #27
0
    def load(self, srcfile = None):
        """
                CurrentState.load

                Load from a file (<srcfile> : (str), file's name) the current
                state of the program; if <srcfile> is None, load the current
                state from the default file (CurrentState.DEFAULTFILE). If
                the default file doesn't exist, nothing is done.

                The file is read line by line :
                  o  the lines beginning with "###" are skipped;
                  o  a line beginning with "*** TRANS" starts the options of
                     a new transformation ("language, transliteration_method =",
                     "direction =", then any "name = value" option);
                  o  the first line beginning with "*** SOURCE TEXT/RESULTS"
                     starts the source text; each following one starts the
                     result of the next transformation.

                The source text and the results are rebuilt by joining the
                lines with "\\n"; the line terminator of each line read from
                the file is removed, but a last line without any terminator
                is kept unchanged.

                No returned value.
        """
        if srcfile is None:
            _srcfile = CurrentState.DEFAULTFILE

            # missing default file :
            if not os.path.exists(_srcfile):
                return
        else:
            _srcfile = srcfile

        # cleaning up the current state of the program :
        self.mainapp.sourcetext_editor.clear()
        self.mainapp.reset_transformations()

        # <location> :
        #   None, then "transformations options", then "source text/results"
        location = None

        # <nbr_of_srctextsresults> :
        #   =0 before the source text, 1 for the source text,
        #   2 for the result of the transformation #0,
        #   3 for the result of the transformation #1, ...
        nbr_of_srctextsresults = 0

        with open(_srcfile, "r", encoding="utf-8") as src:

            for line in src:

                # comment line :
                if line.startswith("###"):
                    continue

                if line.startswith("*** TRANS"):
                    # new transformation :
                    location = "transformations options"
                    self.mainapp.nbr_usedtransf += 1

                    current_t = \
                      self.mainapp.transformations[self.mainapp.nbr_usedtransf-1]

                    current_t.editor_frame.show()

                elif line.startswith("*** SOURCE TEXT/RESULTS"):
                    location = "source text/results"
                    nbr_of_srctextsresults += 1

                elif location == "transformations options":

                    if line.startswith("language, transliteration_method ="):

                        data = line[len("language, transliteration_method ="):].strip()
                        newl, new_transl_method = data.split(",")

                        # <newl> stands for "new language" :
                        newl = newl.strip()
                        new_transl_method = new_transl_method.strip()

                        newds = new_dstring( language = newl,
                                             transliteration_method = new_transl_method )

                        current_t = \
                          self.mainapp.transformations[self.mainapp.nbr_usedtransf-1]

                        current_t.dstring_type = newds

                        # <newl2> : full representation of <newl> :
                        newl2 = \
                          current_t.editor_frame.language_name.getFullStringForOneLanguage(newl)

                        current_t.editor_frame.language_name.SetCurrentIndexTo(newl2)
                        current_t.editor_frame.transl_name.SetCurrentIndexTo(new_transl_method)

                    elif line.startswith("direction ="):

                        # <new_d> stands for "new direction"
                        new_d = Direction( int(line[len("direction ="):].strip()) )

                        current_t = self.mainapp.transformations[self.mainapp.nbr_usedtransf-1]
                        current_d = current_t.editor_frame.direction
                        current_d.SetCurrentIndexTo(new_d.GetCorrespondingStr())

                    elif line.strip() != "":
                        # other options, like "anonymize the unknown characters" :
                        # only the first "=" is a separator, so that a value
                        # may itself contain the "=" character.
                        option_name, option_value = line.split("=", 1)
                        option_name = option_name.strip()
                        option_value = option_value.strip()

                        current_t = \
                          self.mainapp.transformations[self.mainapp.nbr_usedtransf-1]

                        options = current_t.dstring_type.options
                        options[ option_name ] = option_value
                        current_t.editor_frame.options.setOption( option_name, option_value )

                elif location == "source text/results":

                    # <content> : <line> without its terminator; it may happen
                    # that <line> is the last line of the file and is not
                    # terminated by \n : in that case nothing has to be removed.
                    content = line[:-1] if line.endswith("\n") else line

                    if nbr_of_srctextsresults == 1:
                        # source text :
                        current_text = self.mainapp.sourcetext_editor.toPlainText()

                        if current_text == "":
                            current_text += content
                        else:
                            current_text += "\n" + content

                        self.mainapp.sourcetext_editor.setPlainText( current_text )

                    else:
                        # transformation's result :
                        current_t = self.mainapp.transformations[nbr_of_srctextsresults-2]

                        current_result = current_t.result

                        if current_result == "":
                            current_result += content
                        else:
                            current_result += "\n" + content

                        current_t.result = current_result
                        current_t.editor_frame.editor.setPlainText( current_t.result )
Пример #28
0
#    You should have received a copy of the GNU General Public License
#    along with Anceps.  If not, see <http://www.gnu.org/licenses/>.
################################################################################
"""
    ❏Anceps❏ : anceps/constants.py
"""
from dchars.dchars import new_dstring

# Name of the project, as a plain string.
NAME_OF_THE_PROJECT = "Anceps"

# initialized by cmdline/cmdline.py::get_input_data()
# (None until then)
CMDLINE_ARGS = None

# (str) language name : DSTRING object
# Each value is whatever new_dstring() returns for the given language, with
# the option "anonymize the unknown characters" set to 'no'; the values are
# built once, when this module is imported.
LANGUAGES = {
                    'ang' : new_dstring(language='ang',
                                        options = {"anonymize the unknown characters" : 'no'}),
                    'lat' : new_dstring(language='lat',
                                        options = {"anonymize the unknown characters" : 'no'}),
            }

# constants for the patterns' file :
# NOTE(review): presumably a line beginning with "####" is a comment and every
# pattern must have the two sections named below — confirm against the code
# reading the patterns' file.
PATTERNSFILE__COMMENTS_STARTSYMBOLS = "####"
PATTERNSFILE__EXPECTED_SECTIONS_IN_A_PATTERN = ('scansion rules', 'syllabic structure')

PROJECT_TITLE = """
  ,adPPYYba, 8b,dPPYba,   ,adPPYba,  ,adPPYba, 8b,dPPYba,  ,adPPYba,  
         `Y8 88P'   `a8a a8a     aa a8P_____88 88P'    a8a I8[    aa  
  ,adPPPPP88 88       88 8b         8PP"'"'"aa 88       d8  `aY8ba,   
  88,    ,88 88       88 a8a,   ,aa a8b,   ,aa 88b,   ,a8a aa    ]8I  
  `a8bbdPaY8 88       88  `aYbbd8a'  `aYbbd8a' 88`YbbdPa'  `aYbbdPa'  
                                               88                     
Пример #29
0
def get_usefull_combinations():
    """
            get_usefull_combinations()

            Return a (str)string with all the usefull combinations of characters,
            i.e. only the 'interesting' characters (not punctuation if it's too simple
            by example). Each character is written as :

                    <character>{<transliteration>}<space>

            For every base character, the function tries both character types
            ('hiragana', 'katakana'), both sizes (normal, small) and the three
            diacritic values (None, "dakuten", "handakuten"); a combination is
            kept only if :
              o  the small size is asked for a base character being a key of
                 HIRAGANA_TO_SMALL_HIRAGANA;
              o  the dakuten is asked for a base character of the か, さ, た
                 or は rows;
              o  the handakuten is asked for a base character of the は row.

            Every base character appears only once in <base_characters>, so
            that no combination is written twice in the result.

            NB : this function has nothing to do with linguistic or a strict
                 approach of the language. This function allows only to get the
                 most common and/or usefull characters of the writing system.

            NB : function required by the dchars-fe project.
    """
    res = []

    dstring = new_dstring( 'jpn' )()

    # base_char : we don't use the list stored in symbols.py
    # since we would lost the character's order.
    base_characters  = ( 'あ', 'い', 'う', 'え', 'お',
                         'か', 'き', 'く', 'け', 'こ',
                         'さ', 'し', 'す', 'せ', 'そ',
                         'た', 'ち', 'つ', 'て', 'と',
                         'な', 'に', 'ぬ', 'ね', 'の',
                         'は', 'ひ', 'ふ', 'へ', 'ほ',
                         'ま', 'み', 'む', 'め', 'も',
                         'や', 'ゆ', 'よ',
                         'ら', 'り', 'る', 'れ', 'ろ',
                         'わ', 'ゐ', 'ゑ',
                         'を',
                         'ん',
                        )

    # base characters accepting the dakuten / the handakuten : these tuples
    # don't change, hence they are defined once, outside the loops.
    with_dakuten = ('か', 'き', 'く', 'け', 'こ',
                    'さ', 'し', 'す', 'せ', 'そ',
                    'た', 'ち', 'つ', 'て', 'と',
                    'は', 'ひ', 'ふ', 'へ', 'ほ',)
    with_handakuten = ('は', 'ひ', 'ふ', 'へ', 'ほ',)

    for base_char in base_characters:
        for chartype in ('hiragana', 'katakana'):
            for smallsize in (False, True):

                # no small form for this character : whatever the diacritic,
                # nothing to add.
                if smallsize and base_char not in HIRAGANA_TO_SMALL_HIRAGANA:
                    continue

                for diacritic in (None, "dakuten", "handakuten"):

                    if diacritic == 'dakuten' and base_char not in with_dakuten:
                        continue

                    if diacritic == 'handakuten' and base_char not in with_handakuten:
                        continue

                    dchar = DCharacterJPN( dstring_object = dstring,
                                           unknown_char = False,
                                           base_char = base_char,
                                           punctuation = False,
                                           chartype = chartype,
                                           diacritic = diacritic,
                                           smallsize = smallsize )

                    txt = dchar__get_translit_str(dstring_object = dstring,
                                                  dchar = dchar)

                    res.append( str(dchar) + "{" + txt + "} " )

    return "".join(res)
Пример #30
0
#
#    You should have received a copy of the GNU General Public License
#    along with DChars.  If not, see <http://www.gnu.org/licenses/>.
################################################################################
"""
    ❏DChars❏ : dchars/tests/languages/ang/ang_tests.py
"""

import unittest

from dchars.errors.errors import DCharsError
from dchars.dchars import new_dstring
from dchars.languages.bod.dcharacter import UNKNOWN_CHAR_SYMBOL

# The two values below are created by new_dstring() for the language "Ænglisc";
# they differ by the "anonymize the unknown characters" option ("no" / "yes").
# The first one also sets explicitly the "sorting method" option to "default".
DSTRING_ANG = new_dstring(
    language="Ænglisc", options={"anonymize the unknown characters": "no", "sorting method": "default"}
)
DSTRING_ANG__UNKNOWNCHAR = new_dstring(language="Ænglisc", options={"anonymize the unknown characters": "yes"})

# pylint: disable=R0904
# ("Too many public methods")
# Since these classes are derived from unittest.TestCase we have a lot of
# methods in the following class(es).
################################################################################
class TESTSDStringANG(unittest.TestCase):
    """
        class TESTSDStringANG

        We test dchars.languages.ang.dchars::DStringANG
    """