def __init__(self, write_lang_features_to_csv=False):
    #
    # Each language is listed with a flag for alphabet boundary, syllable boundary
    # (either a character as in Chinese, or a space as in Vietnamese), then word
    # boundary (space).
    # The most NLP-inconvenient languages are, obviously, those without a word boundary.
    # Name, Code, Alphabet, CharacterType, SyllableSeparator, SyllableSeparatorType,
    # WordSeparator, WordSeparatorType
    #
    # We need to define our own properties, as even the ISO 15924 specification
    # does not contain them.
    #
    try:
        self.PYCLANG = pycountry.languages
    except Exception as ex:
        Log.warning(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Cannot load pycountry languages: ' + str(ex)
        )
        self.PYCLANG = None

    #
    # Hangul/CJK Language Family
    #
    lang_index = 0
    lang_ko = {
        self.C_LANG_ID: self.LANG_KO, self.C_LANG_NUMBER: lang_index, self.C_LANG_NAME: 'Hangul',
        self.C_HAVE_ALPHABET: True, self.C_CHAR_TYPE: self.ALPHABET_HANGUL,
        # TODO Not really right to call it a char, rather a "syllable character"
        self.C_HAVE_SYL_SEP: True, self.C_SYL_SEP_TYPE: self.T_CHAR,
        self.C_HAVE_WORD_SEP: True, self.C_WORD_SEP_TYPE: self.T_SPACE,
        self.C_HAVE_VERB_CONJ: True,
    }
    #
    # CJK Alphabet Family
    #
    lang_index += 1
    lang_zh = {
        self.C_LANG_ID: self.LANG_ZH, self.C_LANG_NUMBER: lang_index, self.C_LANG_NAME: 'Chinese',
        self.C_HAVE_ALPHABET: False, self.C_CHAR_TYPE: self.ALPHABET_CJK,
        self.C_HAVE_SYL_SEP: True, self.C_SYL_SEP_TYPE: self.T_CHAR,
        self.C_HAVE_WORD_SEP: False, self.C_WORD_SEP_TYPE: self.T_NONE,
        self.C_HAVE_VERB_CONJ: False,
    }
    #
    # Japanese Hiragana/Katakana
    #
    lang_index += 1
    lang_ja = {
        self.C_LANG_ID: self.LANG_JA, self.C_LANG_NUMBER: lang_index, self.C_LANG_NAME: 'Japanese',
        self.C_HAVE_ALPHABET: False, self.C_CHAR_TYPE: self.ALPHABET_JAPANESE,
        self.C_HAVE_SYL_SEP: True, self.C_SYL_SEP_TYPE: self.T_CHAR,
        self.C_HAVE_WORD_SEP: False, self.C_WORD_SEP_TYPE: self.T_NONE,
        self.C_HAVE_VERB_CONJ: True,
    }
    #
    # Cyrillic Alphabet Family
    #
    lang_index += 1
    lang_ru = {
        self.C_LANG_ID: self.LANG_RU, self.C_LANG_NUMBER: lang_index, self.C_LANG_NAME: 'Russian',
        self.C_HAVE_ALPHABET: True, self.C_CHAR_TYPE: self.ALPHABET_CYRILLIC,
        self.C_HAVE_SYL_SEP: False, self.C_SYL_SEP_TYPE: self.T_NONE,
        self.C_HAVE_WORD_SEP: True, self.C_WORD_SEP_TYPE: self.T_SPACE,
        self.C_HAVE_VERB_CONJ: True,
    }
    #
    # Thai Alphabet Family
    #
    lang_index += 1
    lang_th = {
        self.C_LANG_ID: self.LANG_TH, self.C_LANG_NUMBER: lang_index, self.C_LANG_NAME: 'Thai',
        self.C_HAVE_ALPHABET: True, self.C_CHAR_TYPE: self.ALPHABET_THAI,
        self.C_HAVE_SYL_SEP: False, self.C_SYL_SEP_TYPE: self.T_NONE,
        self.C_HAVE_WORD_SEP: False, self.C_WORD_SEP_TYPE: self.T_NONE,
        self.C_HAVE_VERB_CONJ: False,
    }
    #
    # Latin Alphabet Family
    #
    lang_index += 1
    lang_en = {
        self.C_LANG_ID: self.LANG_EN, self.C_LANG_NUMBER: lang_index, self.C_LANG_NAME: 'English',
        self.C_HAVE_ALPHABET: True, self.C_CHAR_TYPE: self.ALPHABET_LATIN_AZ,
        self.C_HAVE_SYL_SEP: False, self.C_SYL_SEP_TYPE: self.T_NONE,
        self.C_HAVE_WORD_SEP: True, self.C_WORD_SEP_TYPE: self.T_SPACE,
        self.C_HAVE_VERB_CONJ: True,
    }
    lang_index += 1
    lang_es = {
        self.C_LANG_ID: self.LANG_ES, self.C_LANG_NUMBER: lang_index, self.C_LANG_NAME: 'Spanish',
        self.C_HAVE_ALPHABET: True, self.C_CHAR_TYPE: self.ALPHABET_LATIN,
        self.C_HAVE_SYL_SEP: False, self.C_SYL_SEP_TYPE: self.T_NONE,
        self.C_HAVE_WORD_SEP: True, self.C_WORD_SEP_TYPE: self.T_SPACE,
        self.C_HAVE_VERB_CONJ: True,
    }
    lang_index += 1
    lang_fr = {
        self.C_LANG_ID: self.LANG_FR, self.C_LANG_NUMBER: lang_index, self.C_LANG_NAME: 'French',
        self.C_HAVE_ALPHABET: True, self.C_CHAR_TYPE: self.ALPHABET_LATIN,
        self.C_HAVE_SYL_SEP: False, self.C_SYL_SEP_TYPE: self.T_NONE,
        self.C_HAVE_WORD_SEP: True, self.C_WORD_SEP_TYPE: self.T_SPACE,
        self.C_HAVE_VERB_CONJ: True,
    }
    lang_index += 1
    lang_de = {
        self.C_LANG_ID: self.LANG_DE, self.C_LANG_NUMBER: lang_index, self.C_LANG_NAME: 'German',
        self.C_HAVE_ALPHABET: True, self.C_CHAR_TYPE: self.ALPHABET_LATIN,
        self.C_HAVE_SYL_SEP: False, self.C_SYL_SEP_TYPE: self.T_NONE,
        self.C_HAVE_WORD_SEP: True, self.C_WORD_SEP_TYPE: self.T_SPACE,
        self.C_HAVE_VERB_CONJ: True,
    }
    lang_index += 1
    lang_it = {
        self.C_LANG_ID: self.LANG_IT, self.C_LANG_NUMBER: lang_index, self.C_LANG_NAME: 'Italian',
        self.C_HAVE_ALPHABET: True, self.C_CHAR_TYPE: self.ALPHABET_LATIN,
        self.C_HAVE_SYL_SEP: False, self.C_SYL_SEP_TYPE: self.T_NONE,
        self.C_HAVE_WORD_SEP: True, self.C_WORD_SEP_TYPE: self.T_SPACE,
        self.C_HAVE_VERB_CONJ: True,
    }
    lang_index += 1
    lang_nl = {
        self.C_LANG_ID: self.LANG_NL, self.C_LANG_NUMBER: lang_index, self.C_LANG_NAME: 'Dutch',
        self.C_HAVE_ALPHABET: True, self.C_CHAR_TYPE: self.ALPHABET_LATIN,
        self.C_HAVE_SYL_SEP: False, self.C_SYL_SEP_TYPE: self.T_NONE,
        self.C_HAVE_WORD_SEP: True, self.C_WORD_SEP_TYPE: self.T_SPACE,
        self.C_HAVE_VERB_CONJ: True,
    }
    lang_index += 1
    # Vietnamese separates syllables with spaces; multi-syllable words have no separator
    lang_vi = {
        self.C_LANG_ID: self.LANG_VI, self.C_LANG_NUMBER: lang_index, self.C_LANG_NAME: 'Vietnamese',
        self.C_HAVE_ALPHABET: True, self.C_CHAR_TYPE: self.ALPHABET_LATIN_VI_AZ,
        self.C_HAVE_SYL_SEP: True, self.C_SYL_SEP_TYPE: self.T_SPACE,
        self.C_HAVE_WORD_SEP: False, self.C_WORD_SEP_TYPE: self.T_NONE,
        self.C_HAVE_VERB_CONJ: False,
    }
    lang_index += 1
    lang_id = {
        self.C_LANG_ID: self.LANG_ID, self.C_LANG_NUMBER: lang_index, self.C_LANG_NAME: 'Indonesian',
        self.C_HAVE_ALPHABET: True, self.C_CHAR_TYPE: self.ALPHABET_LATIN_AZ,
        self.C_HAVE_SYL_SEP: False, self.C_SYL_SEP_TYPE: self.T_NONE,
        self.C_HAVE_WORD_SEP: True, self.C_WORD_SEP_TYPE: self.T_SPACE,
        self.C_HAVE_VERB_CONJ: True,
    }

    self.langs = {
        # Hangul/CJK
        self.LANG_KO: lang_ko,
        self.LANG_JA: lang_ja,
        # CJK
        self.LANG_ZH: lang_zh,
        # Cyrillic
        self.LANG_RU: lang_ru,
        # Thai
        self.LANG_TH: lang_th,
        # Latin
        self.LANG_EN: lang_en,
        self.LANG_ES: lang_es,
        self.LANG_FR: lang_fr,
        self.LANG_DE: lang_de,
        self.LANG_IT: lang_it,
        self.LANG_NL: lang_nl,
        self.LANG_VI: lang_vi,
        self.LANG_ID: lang_id,
    }
    # Sanity check that every language defined above made it into the map
    assert lang_index + 1 == len(self.langs)

    # Add ISO 639-2 definitions
    for lang in self.langs.keys():
        # Guard against a missing pycountry module or a failed lookup
        lang_639 = self.PYCLANG.get(alpha_2=lang) if self.PYCLANG is not None else None
        if lang_639 is not None:
            self.langs[lang][LangFeatures.C_LANG_639_2_ALPHA_3] = lang_639.alpha_3
            self.langs[lang][LangFeatures.C_LANG_639_2_NAME] = lang_639.name
            self.langs[lang][LangFeatures.C_LANG_639_2_SCOPE] = lang_639.scope
            self.langs[lang][LangFeatures.C_LANG_639_2_TYPE] = lang_639.type
            # Not every pycountry record carries these attributes
            try:
                self.langs[lang][LangFeatures.C_LANG_639_2_ALPHA_2] = lang_639.alpha_2
            except Exception:
                self.langs[lang][LangFeatures.C_LANG_639_2_ALPHA_2] = ''
            try:
                self.langs[lang][LangFeatures.C_LANG_639_2_BIBLIO] = lang_639.bibliographic
            except Exception:
                self.langs[lang][LangFeatures.C_LANG_639_2_BIBLIO] = ''
        else:
            self.langs[lang][LangFeatures.C_LANG_639_2_ALPHA_3] = ''
            self.langs[lang][LangFeatures.C_LANG_639_2_NAME] = ''
            self.langs[lang][LangFeatures.C_LANG_639_2_SCOPE] = ''
            self.langs[lang][LangFeatures.C_LANG_639_2_TYPE] = ''
            self.langs[lang][LangFeatures.C_LANG_639_2_ALPHA_2] = ''
            self.langs[lang][LangFeatures.C_LANG_639_2_BIBLIO] = ''

    # Copy the 2-letter (ISO 639-1) keys to 3-letter (ISO 639-3) keys as well,
    # so the language structure can be accessed by either ISO 639-1 or ISO 639-3 code.
    # Had the ISO standard been more far-sighted (2 letters allow only 26*26 = 676 codes),
    # we would not have to do this.
    new_items = {}
    for key in self.langs.keys():
        lang_iso_639_3 = self.langs[key][LangFeatures.C_LANG_639_2_ALPHA_3]
        # Skip the empty code left behind when pycountry was unavailable
        if lang_iso_639_3 != '' and key != lang_iso_639_3:
            lang_dict = self.langs[key].copy()
            # Change the lang id to the 3-letter ISO 639-3 code
            lang_dict[self.C_LANG_ID] = lang_iso_639_3
            new_items[lang_iso_639_3] = lang_dict
    for lang_id3 in new_items:
        self.langs[lang_id3] = new_items[lang_id3]

    self.langfeatures = pd.DataFrame(self.langs.values())

    # Of course it would be more convenient to keep the data in a csv file,
    # but the trouble with file paths and the like would be very unpleasant for users.
    if write_lang_features_to_csv:
        self.langfeatures = self.langfeatures.sort_values(by=[self.C_LANG_NAME], ascending=True)
        self.langfeatures.to_csv('lang_features.csv', sep=',', index=False)
    return
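# A minimal usage sketch of the constructor above (illustrative only; it assumes the
# LANG_* constants hold ISO 639-1 codes, e.g. LANG_EN == 'en', which the alpha_2
# pycountry lookup implies):
#
#   lf = LangFeatures()
#   # The same feature dict is reachable by both the 2-letter and 3-letter key
#   assert lf.langs['en'][LangFeatures.C_LANG_NAME] == lf.langs['eng'][LangFeatures.C_LANG_NAME]
#   # Chinese has no word separator, so tokenization requires a word segmenter
#   assert lf.langs['zh'][LangFeatures.C_HAVE_WORD_SEP] is False
#   # LangFeatures(write_lang_features_to_csv=True) additionally dumps the feature
#   # table to 'lang_features.csv' in the current working directory.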
            ', TOTAL FAIL = ' + str(res_final.count_fail))
        return res_final


if __name__ == '__main__':
    config = cf.NwaeConfig.get_cmdline_params_and_init_config_singleton(
        Derived_Class = cf.NwaeConfig,
        default_config_file = '/usr/local/git/nwae/nwae/app.data/config/default.cf',
    )
    ut_params = uthelper.UnitTestParams(
        dirpath_wordlist     = config.get_config(param=cf.NwaeConfig.PARAM_NLP_DIR_WORDLIST),
        postfix_wordlist     = config.get_config(param=cf.NwaeConfig.PARAM_NLP_POSTFIX_WORDLIST),
        dirpath_app_wordlist = config.get_config(param=cf.NwaeConfig.PARAM_NLP_DIR_APP_WORDLIST),
        postfix_app_wordlist = config.get_config(param=cf.NwaeConfig.PARAM_NLP_POSTFIX_APP_WORDLIST),
        dirpath_synonymlist  = config.get_config(param=cf.NwaeConfig.PARAM_NLP_DIR_SYNONYMLIST),
        postfix_synonymlist  = config.get_config(param=cf.NwaeConfig.PARAM_NLP_POSTFIX_SYNONYMLIST),
        dirpath_model        = config.get_config(param=cf.NwaeConfig.PARAM_MODEL_DIR),
    )
    Log.important('Unit Test Params: ' + str(ut_params.to_string()))
    Log.LOGLEVEL = Log.LOG_LEVEL_ERROR
    res = NwaePartsUnitTest(ut_params=ut_params).run_unit_tests()
    exit(res.count_fail)
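# Note: exiting with res.count_fail means the process exit status is 0 only when
# every unit test passed, so CI systems can consume this script's exit code directly.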
# So we define our own list of properties.
# However we also include properties from the above standards, using open PyPI packages
# such as pycountry.
#
import pandas as pd
from nwae.utils.Log import Log
from inspect import getframeinfo, currentframe
# pip install iso-639
# https://www.iso.org/iso-639-language-codes.html
# from iso639 import languages
import nwae.utils.UnitTest as ut
try:
    import pycountry
except Exception as ex:
    Log.warning(
        str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Cannot import pycountry: ' + str(ex)
    )
    pass


#
# Class LangFeatures
#
# Helper class to define language properties, such as whether a language has
# word/syllable separators, what alphabet type it uses, etc.
#
# This most fundamental class for languages tells us:
#
#   1. Alphabet Type
#      What alphabet type a language is written in: Latin, Cyrillic, etc.
#      This is used, for example, in the LangDetect class.
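# For reference, a minimal sketch of the pycountry lookup relied on in __init__ above
# (the values shown come from the ISO 639 database; some records omit alpha_2 or
# bibliographic, which is why __init__ falls back to '' for those attributes):
#
#   import pycountry
#   fr = pycountry.languages.get(alpha_2='fr')
#   fr.alpha_3         # 'fra'
#   fr.bibliographic   # 'fre' (present only when ISO 639-2/B differs from alpha_3)
#   fr.name, fr.scope, fr.type   # 'French', 'I' (individual), 'L' (living)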