def _cache_IETF(cls):
    cls.precache(include=['script_expr_txt', 'region_expr_txt', 'region_expr_uid'], all_lv=True)
    for uid in cls._cache:
        cls._cache[uid]['IETF'] = set()
    result = panlex.query_all('/expr', {'trans_uid': 'art-420', 'uid': 'art-274', 'include': 'trans_txt'})['result']
    regions = {cls._cache[r['txt']]['region_expr'] for r in result}
    regions_result = panlex.query_all('/expr', {'trans_expr': list(regions), 'uid': 'art-006', 'include': 'trans_txt'})['result']
    regions_dict = {r['trans_txt']: r['txt'] for r in regions_result if len(r['txt']) == 2}
    for r in result:
        uid = r['txt']
        lang = cls._cache[uid]
        given_tag = Language.get(r['trans_txt'], normalize=False)
        normalized_tag = Language.get(r['trans_txt'], normalize=True)
        language_set = {lang['lang_code'], given_tag.language, normalized_tag.language}
        script_set = {lang['script_expr_txt'], given_tag.script, normalized_tag.script}
        region_set = {given_tag.region, normalized_tag.region}
        if lang['region_expr_uid'] == 'art-279' and lang['region_expr_txt'] != '001':
            region_set.add(lang['region_expr_txt'])
            try:
                region_set.add(regions_dict[lang['region_expr_txt']])
            except KeyError:
                pass
        if {'GB', 'UK'} & region_set:
            region_set |= {'GB', 'UK'}
        if {'001', None} & region_set:
            region_set |= {'001', None}
        for language, script, region in product(language_set, script_set, region_set):
            new_tag = normalized_tag.update_dict({'language': language, 'script': script, 'region': region})
            cls._cache[uid]['IETF'].add(str(new_tag))
        if lang['region_expr_uid'] == 'art-279' and lang['region_expr_txt'] == '001':
            for language, script, region in product({lang['lang_code']}, script_set, {'001', None}):
                new_tag = normalized_tag.update_dict({'language': language, 'script': script, 'region': region})
                cls._cache[uid]['IETF'].add(str(new_tag))

def _set_lang_codes(self):
    with open(os.path.join(os.path.dirname(__file__), 'locale.json')) as f:
        data = json.load(f)
    lang = Language.get(self.lang).language
    terr = Language.get(self.lang).territory
    if terr is None:
        self.lang_codes = self._get_lang_codes(lang, data)
    else:
        self.lang_codes = [self.lang.replace('-', '_')]

def format_language(value: Optional[str]):
    lang = Language.get(value)
    if not lang.is_valid():
        raise ValueError('language_invalid')
    return lang.simplify_script().to_tag()

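# Hedged usage sketch (not part of the original sources): assuming format_language() above and
# the langcodes package, simplify_script() drops a script subtag that is already the default
# for that language, so a redundant 'Latn' disappears from the normalized tag.
def _format_language_example():
    # 'Latn' is the default script for English, so the tag simplifies to 'en-US'.
    assert format_language('en-Latn-US') == 'en-US'
    # Tags that fail Language.is_valid() raise ValueError('language_invalid'), per the helper above.
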
async def prepareDataSet(self, data_set: NLUDataSet) -> None:
    self.__language = Language.get(data_set.language).simplify_script().to_tag()

    agent_parent_path = self.__agents_client.project_path(self.__project)

    # The default language code doesn't really matter as this code always explicitly passes the
    # exact language on each step. Still, the default language code HAS to be set and it MUST
    # be set to the code that already is the default.
    # The following code attempts to retrieve the current agent and to extract the current
    # default language code from it.
    try:
        default_language_code = self.__agents_client.get_agent(
            agent_parent_path
        ).default_language_code
    except:  # pylint: disable=bare-except
        # TODO: Unable to figure out which exact error is raised in case the agent doesn't
        # exist, which is why this code catches any exception that might be raised by the call
        # to get_agent.
        default_language_code = "en"

    self.__agents_client.set_agent(dialogflow_v2.types.Agent(
        parent = agent_parent_path,
        display_name = self.__agent,
        time_zone = self.__time_zone,
        default_language_code = default_language_code,
        supported_language_codes = [ self.__language ]
    ))

async def prepareDataSet(self, data_set: NLUDataSet) -> None:
    last_exception = None

    # Try all language tag derivations, from specific to broad
    for language in Language.get(data_set.language).simplify_script().broaden():
        language = language.to_tag()
        try:
            if not self.__skip_language_installations:
                self._logger.info("Installing language resources for \"%s\"...", language)
                subprocess.run([
                    self.__python, "-m", "snips_nlu", "download", language
                ], check=True)
            self.__language = language
            last_exception = None
            break
        except BaseException as e:  # pylint: disable=broad-except
            last_exception = e

    if last_exception is not None:
        raise last_exception

def test_create_user(self, user_id=None, name=None):
    lang_code = random.choice(['zh-hans', 'jp', 'ko', 'en', 'es'])
    language = Language.get(lang_code)

    if user_id is None:
        user_id = utils.random_string()[0:10]
    if name is None:
        name = 'Steven' + utils.random_string()[0:5]

    params = {
        'user_id': user_id,
        'password': '******',
        'name': name,
        'language': language.__str__(),
        'tags': {
            'key': 'value'
        },
        'email': 'Steven' + utils.random_string()[0:5] + '@mz.co.kr',
        'mobile': '+821026671234',
        'group': 'group-id',
        'domain_id': self.domain.domain_id
    }

    user = self.identity_v1.User.create(params, metadata=(('token', self.token),))
    self.user = user
    self.users.append(user)
    self._print_data(self.user, 'test_create_user')
    self.assertEqual(self.user.name, params['name'])

def test_create_user(self, user_id=None, name=None, user_type=None, backend=None):
    lang_code = random.choice(['zh-hans', 'jp', 'ko', 'en', 'es'])
    language = Language.get(lang_code)

    if user_id is None:
        user_id = utils.random_string() + '@mz.co.kr'
    if name is None:
        name = 'Steven' + utils.random_string()

    params = {
        'user_id': user_id,
        'password': utils.generate_password(),
        'name': name,
        'email': user_id,
        'timezone': 'Asia/Seoul',
        'language': language.__str__(),
        'tags': {
            'tag_key': 'tag_value'
        },
        'domain_id': self.domain.domain_id
    }

    user = self.identity_v1.User.create(params, metadata=(('token', self.owner_token),))
    self.user = user
    self.users.append(user)
    self._print_data(self.user, 'test_create_user')
    self.assertEqual(self.user.name, params['name'])

def _create_user(self, user_id=None):
    lang_code = random.choice(['zh-hans', 'jp', 'ko', 'en', 'es'])
    language = Language.get(lang_code)
    user_id = utils.random_string()[0:10] if user_id is None else user_id

    param = {
        'user_id': user_id,
        'domain_id': self.domain.domain_id,
        'password': '******',
        'name': 'Steven' + utils.random_string()[0:5],
        'language': language.__str__(),
        'timezone': 'Asia/Seoul',
        'tags': {
            'aa': 'bb'
        },
        'email': 'Steven' + utils.random_string()[0:5] + '@mz.co.kr',
        'mobile': '+821026671234',
        'group': 'group-id',
    }

    user = self.identity_v1.User.create(param, metadata=(('token', self.token),))
    self.user = user
    self.users.append(user)
    self.assertEqual(self.user.name, param['name'])

def from_IETF(tag, normalize=False):
    cache = cache_IETF()
    tag = str(Language.get(tag, normalize))
    output = []
    for uid in cache:
        if tag in cache[uid]['IETF']:
            output.append(uid)
    return output

async def _prepareDataSet(self, data_set: NLUDataSet) -> None:
    language = Language.get(data_set.language).language

    if self.__pipeline == "supervised":
        pipeline_config = "supervised_embeddings"
        image = "rasa/rasa:{}".format(self.__VERSION)

    if self.__pipeline == "pretrained":
        pipeline_config = "pretrained_embeddings_spacy"
        # In theory it should be enough to install rasa/rasa:latest-spacy-{language}, but in
        # practice the training fails in these images due to the spaCy models not being found.
        # This bug is reported in the Rasa repo: https://github.com/RasaHQ/rasa/issues/4789
        image = "rasa/rasa:{}-spacy-{}".format(self.__VERSION, language)

    # Create the Rasa config
    self.__rasa_config_yml = yaml.dump({
        "language": language,
        "pipeline": pipeline_config
    })

    # Connect to the Docker daemon and pull the Rasa container
    self._logger.info("Preparing the docker container for Rasa...")
    self._logger.debug("Pulling Rasa image \"%s\"...", image)
    self.__docker.images.pull(image)

    self._logger.debug("Starting the Rasa HTTP server...")
    self.__container = self.__docker.containers.run(
        image,
        # Run the Rasa server and enable the HTTP API
        ["run", "--enable-api"],
        # Automatically remove the container after the server shuts down
        auto_remove=True,
        # Don't wait for the command to finish
        detach=True,
        # Expose port 5005 (used for HTTP by Rasa) for TCP traffic to a random port
        ports={"5005/tcp": None}
    )

    # Update the container information from the Docker daemon
    self.__container.reload()

    # Extract the port mapping and build the base url for the HTTP API
    port_mapping = self.__container.attrs["NetworkSettings"]["Ports"]["5005/tcp"][0]
    self.__url = "http://{}:{}/".format(port_mapping["HostIp"], port_mapping["HostPort"])

    self._logger.debug("Waiting for the health endpoint to come alive...")
    for _ in range(self.__timeout):
        try:
            success = requests.get(self.__url).status_code == 200
        except requests.exceptions.ConnectionError:
            success = False
        if success:
            break
        await asyncio.sleep(1)

    self._logger.info("Container running.")

def get_language_name(language_code):
    # type: (str) -> str
    if is_python_2():
        import langcodes
        return langcodes.LanguageData().get(language_code).describe()['language']
    else:
        from langcodes import Language
        return Language(language=language_code).language_name()

def get_language_code(language_name):
    # type: (str) -> str
    if is_python_2():
        import langcodes
        return langcodes.LanguageData.find_name('language', language_name, 'en').language
    else:
        from langcodes import Language
        # Resolve the English language name to a code, mirroring the Python 2 branch.
        return Language.find(language_name).language

async def _prepareDataSet(self, data_set: NLUDataSet) -> None:
    self.__app_id = self.__authoring_client.apps.add({
        "name": "NLUTestFramework",
        "culture": Language.get(data_set.language).simplify_script().to_tag(),
        "initial_version_id": self.__class__.FAKE_VERSION
    })

def get_languages() -> List[Language]:
    langs = getattr(settings, "GARNETT_TRANSLATABLE_LANGUAGES", [get_default_language()])
    if callable(langs):
        langs = langs()
    if type(langs) == list:
        return [Language.get(lang) for lang in langs]
    raise ImproperlyConfigured(
        "GARNETT_TRANSLATABLE_LANGUAGES must be a list or a callable that returns a list"
    )

def get_video_print(self, videos: List[Track]) -> List[List[str]]:
    if not videos:
        return [["--"]]
    data = []
    for video in videos:
        codec = {
            "MPEG Video": f"MPEG-{(video.format_version or '').replace('Version ', '')}"
        }.get(video.format, video.format)
        scan_overview = video.scan_type
        vst = False
        if codec in ["MPEG-1", "MPEG-2"]:
            # parse d2v file with pyd2v, generates D2V if needed
            d2v = D2V.load(Path(self.file))
            self.file = d2v.path
            # get every frame's flag data, this contains information on displaying frames
            # add vob and cell number to each frame's flag data as well
            flags = [
                f for line in [
                    [dict(**y, vob=x["vob"], cell=x["cell"]) for y in x["flags"]]
                    for x in d2v.data
                ] for f in line
            ]
            interlaced_percent = (sum(1 for f in flags if not f["progressive_frame"]) / len(flags)) * 100
            if interlaced_percent == 100:
                scan_overview = "Interlaced (CST)"
            else:
                scan_overview = f"{round(interlaced_percent, 2)}% Interlaced (VST)"
                vst = True
            for ext in ["log", "d2v", "mpg", "mpeg"]:
                fp = os.path.splitext(self.file)[0] + "." + ext
                if os.path.exists(fp):
                    os.unlink(fp)
        line_1 = "- {language}, {codec} ({profile}) {width}x{height} ({aspect}) @ {bitrate}".format(
            language=Language.get(video.language).display_name(),
            codec=codec,
            profile=video.format_profile,
            width=video.width,
            height=video.height,
            aspect=video.other_display_aspect_ratio[0],
            bitrate=f"{video.other_bit_rate[0]}{f' ({video.bit_rate_mode})' if video.bit_rate_mode else ''}"
        )
        line_2 = " {fps} FPS ({fps_mode}), {color_space}{subsampling}P{bit_depth}, {scan}".format(
            fps=f"{video.framerate_num}/{video.framerate_den}" if video.framerate_num else video.frame_rate,
            fps_mode="VFR" if vst else video.frame_rate_mode,
            color_space=video.color_space,
            subsampling=video.chroma_subsampling.replace(":", ""),
            bit_depth=video.bit_depth,
            scan=scan_overview
        )
        data.append([line_1, line_2])
    return data

def __init__(self, frame):
    self.frame = frame
    self.about_fn_btn = Button(self.frame, text=_("About Funing"))

    # language_combobox
    self.lang_combobox_var = tk.StringVar(self.frame)
    self.lang_code = settings.lang_code
    self.lang_combobox_var.set(Language.make(self.lang_code).autonym())
    self.lang_combobox = ttk.Combobox(
        self.frame,
        textvariable=self.lang_combobox_var,
        values=tuple(self.locale_lang_display_names()),
        state="readonly",
    )

def get_language_from_request(request) -> Language:
    opt_order = getattr(
        settings,
        "GARNETT_REQUEST_LANGUAGE_SELECTORS",
        [
            "garnett.selectors.query",
            "garnett.selectors.cookie",
            "garnett.selectors.header",
        ],
    )
    for opt in opt_order:
        func = import_string(opt)
        if lang := func(request):
            return Language.get(lang)

def from_IETF(cls, tag, normalize=True):
    if cls._cache:
        try:
            [cls._cache[uid]['IETF'] for uid in cls._cache]
        except KeyError:
            cls._cache_IETF()
    else:
        cls._cache_IETF()
    tag = str(Language.get(tag, normalize))
    output = []
    for uid in cls._cache:
        if tag in cls._cache[uid]['IETF']:
            output.append(uid)
    return output

def _setLanguage(self, language: str) -> None:
    """
    Args:
        language: The language of this data set. Use this method to set the language, if the
            language was dynamically loaded from the data set itself. The language is
            represented by its ISO 639-1 code (e.g. "en").

    Raises:
        :exc:`ValueError`: if the language was already set.
    """
    if self.__language is not None:
        raise ValueError("The language for this data set was already set.")
    self.__language = Language.get(language).maximize().to_tag()

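# Illustrative sketch (not from the original source): _setLanguage() stores the maximized tag,
# so a bare ISO 639-1 code is expanded with likely script and region subtags by langcodes'
# "likely subtags" data.
from langcodes import Language

def _maximize_examples():
    assert Language.get("en").maximize().to_tag() == "en-Latn-US"
    assert Language.get("zh").maximize().to_tag() == "zh-Hans-CN"
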
def get_default_language():
    setting = getattr(settings, "GARNETT_DEFAULT_TRANSLATABLE_LANGUAGE", "en-AU")
    if callable(setting):
        default = setting()
    else:
        default = setting

    if isinstance(default, Language):
        return default
    elif isinstance(default, str):
        return Language.get(default)
    else:
        raise ImproperlyConfigured(
            "GARNETT_DEFAULT_TRANSLATABLE_LANGUAGE must be a string or callable that returns a string or `Language` object"
        )

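# Hedged sketch of the setting shapes get_default_language() above accepts; the concrete values
# are illustrative assumptions, and Language is assumed imported from langcodes as in the helper.
def _default_language_setting_examples():
    examples = [
        "en-AU",                        # plain BCP 47 string
        Language.get("en-AU"),          # Language instance
        lambda: Language.get("en-AU"),  # callable returning either of the above
    ]
    for value in examples:
        resolved = value() if callable(value) else value
        assert isinstance(resolved, (str, Language))
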
def _create_user(self):
    self.user_param = {
        'user_id': (utils.random_string()[0:10]),
        'password': '******',
        'name': 'Steven' + utils.random_string()[0:5],
        'language': Language.get('jp').__str__(),
        'timezone': 'utc+9',
        'tags': {
            'aa': 'bb'
        },
        'domain_id': self.domain.domain_id,
        'email': 'Steven' + utils.random_string()[0:5] + '@mz.co.kr',
        'mobile': '+821026671234',
        'group': 'group-id',
    }
    self.user = self.identity_v1.User.create(
        self.user_param,
        metadata=(('token', self.owner_token),)
    )

def find_keywords(text, keywords):
    # automatically try to detect the language of the news text
    lang = Language.make(language=detect(text)).language_name().lower()
    try:
        stemmer = SnowballStemmer(lang)
    except:
        # if that language is not available, just fall back to English
        stemmer = SnowballStemmer('english')
    keywords = [(word.lower(), stemmer.stem(word)) for word in keywords]
    text = word_tokenize(text)
    for key in keywords:
        for word in text:
            if word == key[0] or word == key[1] or stemmer.stem(word) == key[0] or stemmer.stem(word) == key[1]:
                return True
    return False

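# Hedged sketch (assumption): the detector above returns an ISO 639-1 code such as "de", and
# langcodes' language_name() turns it into the English name that NLTK's SnowballStemmer expects.
from langcodes import Language
from nltk.stem.snowball import SnowballStemmer

def _stemmer_for_detected_code_example():
    lang = Language.make(language="de").language_name().lower()
    assert lang == "german"
    stemmer = SnowballStemmer(lang)  # "german" is one of SnowballStemmer's supported languages
    return stemmer
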
def get_subtitle_print(subs: List[Track]) -> List[str]:
    """
    Return a list of a brief subtitle overview per-subtitle.

    e.g.
    - English, Forced, SubRip (SRT)
    - English, SubRip (SRT)
    - English, SDH, SubRip (SRT)
    - Spanish, Latin American (SDH), SubRip (SRT)

    The bit of text between the Language and the Subtitle format is the Track Title.
    It can be of any format, but it is recommended to be used as shown above.

    It will be returned as a list of strings with the `- ` already pre-pended to each entry.
    """
    data = []
    if not subs:
        data.append("--")
    for sub in subs:
        line_items = []

        # following sub.title tree checks and supports three different language and title scenarios
        # The second scenario is the recommended option to choose if you are open to choosing any
        # The third scenario should be used if you have nothing unique to state about the track
        # | Language     | Track Title                   | Output                                        |
        # | ------------ | ----------------------------- | --------------------------------------------- |
        # | es / Spanish | Spanish (Latin American, SDH) | - Spanish (Latin American, SDH), SubRip (SRT) |
        # | es / Spanish | Latin American (SDH)          | - Spanish, Latin American (SDH), SubRip (SRT) |
        # | es / Spanish | None                          | - Spanish, SubRip (SRT)                       |
        language = Language.get(sub.language).display_name()
        if sub.title:
            if language.lower() in sub.title.lower():
                line_items.append(sub.title)
            else:
                line_items.append(f"{language}, {sub.title}")
        else:
            line_items.append(language)

        line_items.append(sub.format.replace("UTF-8", "SubRip (SRT)"))

        line = "- " + ", ".join(line_items)
        data += [(" " + x if i > 0 else x) for i, x in enumerate(textwrap.wrap(line, 64))]
    return data

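# A minimal sketch of the three title scenarios from the table above. SimpleNamespace stands in
# for the pymediainfo Track objects real callers pass (an assumption made purely for illustration).
from types import SimpleNamespace

def _subtitle_print_examples():
    subs = [
        SimpleNamespace(language="es", title="Spanish (Latin American, SDH)", format="UTF-8"),
        SimpleNamespace(language="es", title="Latin American (SDH)", format="UTF-8"),
        SimpleNamespace(language="es", title=None, format="UTF-8"),
    ]
    assert get_subtitle_print(subs) == [
        "- Spanish (Latin American, SDH), SubRip (SRT)",
        "- Spanish, Latin American (SDH), SubRip (SRT)",
        "- Spanish, SubRip (SRT)",
    ]
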
def runSimpleJSONDataSetTests(path, title, constructor_language, expected_language, size):
    expected_language = Language.get(expected_language).maximize().to_tag()

    # Run the tests twice, ignoring existing caches on the first run
    for ignore_cache in [True, False]:
        # Construct the data set
        data_set = SimpleJSONDataSet(title, path, 50, constructor_language, ignore_cache)
        assert data_set.title == title
        assert data_set.language == expected_language

        # Verify that the training data does not contain any None-intent sentences
        assert len(list(filter(lambda x: x.intent is None, data_set.training_data))) == 0

        # Get the number of None-intent sentences in the validation data
        num_none_intent_sentences = len(list(filter(
            lambda x: x.intent is None, data_set.validation_data
        )))

        # Verify that the training and validation data (without None-intent sentences) was split
        # correctly at about 50%
        validation_size_without_none = len(data_set.validation_data) - num_none_intent_sentences
        assert abs(len(data_set.training_data) - validation_size_without_none) <= 1

        # Verify that all entries were loaded
        assert len(data_set.training_data) + len(data_set.validation_data) == size

        # Make sure that the data returned on subsequent calls is the same
        assert data_set.training_data == data_set.training_data
        assert data_set.validation_data == data_set.validation_data

        # Verify that the data is sorted and split differently after reshuffling the data
        training_data = data_set.training_data
        validation_data = data_set.validation_data
        data_set.reshuffle()
        assert training_data != data_set.training_data
        assert validation_data != data_set.validation_data

        # Make sure that a copy of the data is returned and not a reference
        data_set.training_data.pop()
        data_set.validation_data.pop()
        assert len(data_set.training_data) + len(data_set.validation_data) == size

async def cevir(client: Client, message: Message):
    # < Start
    await log_yolla(client, message)
    ilk_mesaj = await message.edit("__Bekleyin..__", disable_web_page_preview=True)  # "Wait.."
    # ------------------------------------------------------------- Start >

    girilen_yazi = message.text
    cevaplanan_mesaj = message.reply_to_message

    if not cevaplanan_mesaj and len(girilen_yazi.split()) == 1:
        # "Say something or reply to a message so I can translate.."
        await ilk_mesaj.edit(
            "__Çeviri yapabilmem için bişeyler söyleyin ya da mesaj yanıtlayın..__"
        )
        return

    if not cevaplanan_mesaj:
        girdi = girilen_yazi.split(" ", 1)[1]
    elif cevaplanan_mesaj.document:
        gelen_dosya = await cevaplanan_mesaj.download()
        veri_listesi = None
        with open(gelen_dosya, "rb") as oku:
            veri_listesi = oku.readlines()
        girdi = "".join(veri.decode("UTF-8") for veri in veri_listesi)
        os.remove(gelen_dosya)
    elif cevaplanan_mesaj.text:
        girdi = cevaplanan_mesaj.text
    else:
        await ilk_mesaj.edit("__güldük__")  # "that made us laugh"
        return

    await ilk_mesaj.edit("Çevriliyor...")  # "Translating..."
    gelen_mesaj_dili = Language.make(language=cevirici.detect(girdi).lang).display_name()
    cevrilmis_mesaj = cevirici.translate(girdi, dest='tr').text
    await ilk_mesaj.edit(f'`{gelen_mesaj_dili}`\n\n__{cevrilmis_mesaj}__')

def langcodes_score(language, segment, score):
    '''Use langcodes on selected URL segments and integrate them into a score.'''
    # see also: https://babel.pocoo.org/en/latest/locale.html
    # test if the code looks like a country or a language
    if segment[:2] not in COUNTRY_CODES and segment[:2] not in LANGUAGE_CODES:
        return score
    # test if tag is valid (caution: private codes are)
    if tag_is_valid(segment):
        # try to identify language code
        identified = Language.get(segment).language
        # see if it matches
        if identified is not None:
            LOGGER.debug('langcode %s found in URL segment %s', identified, segment)
            if identified != language:
                score -= 1
            else:
                score += 1
    return score

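# Hedged sketch of the langcodes calls the scorer above relies on (COUNTRY_CODES, LANGUAGE_CODES
# and LOGGER live in the surrounding module and are not reproduced here):
from langcodes import Language, tag_is_valid

def _langcodes_score_building_blocks():
    assert tag_is_valid("en-GB")
    assert not tag_is_valid("not a tag")
    assert Language.get("en-GB").language == "en"
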
def _prepare_user_data(scenario_user):
    lang_code = random.choice(['ko', 'en'])
    language = Language.get(lang_code)
    user_id = random_string()[0:10]

    default_user = {
        'user_id': user_id,
        'password': user_id,
        'name': 'Steven' + random_string()[0:5],
        'language': language.__str__(),
        'timezone': 'Asia/Seoul',
        'tags': {
            'aa': 'bb'
        },
        'email': 'Steven' + random_string()[0:5] + '@mz.co.kr',
        'mobile': '+821026671234'
    }

    # Overwrite param, if needed
    default_user.update(scenario_user)
    return default_user

def get_course_language(course_runs):
    """
    Gets the languages associated with a course. Used for the "Language" facet in Algolia.

    Arguments:
        course_runs (list): list of course runs for a course

    Returns:
        list: a list of supported languages for those course runs
    """
    languages = set()
    for course_run in course_runs:
        content_language = course_run.get('content_language')
        if not content_language:
            continue
        language_name = Language.make(language=content_language).language_name()
        languages.add(language_name)

    return list(languages)

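# Illustrative sketch (assumption): course runs carry ISO 639-1 content_language codes, which
# langcodes maps to the display names collected by get_course_language() above.
def _course_language_example():
    course_runs = [
        {'content_language': 'en'},
        {'content_language': 'es'},
        {'content_language': None},  # runs without a language are skipped
    ]
    assert sorted(get_course_language(course_runs)) == ['English', 'Spanish']
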
def get_audio_print(self, audio: List[Track]) -> List[str]:
    if not audio:
        return ["--"]
    data = []
    for t in audio:
        if t.title and "Commentary" in t.title:
            title = t.title
        else:
            title = Language.get(t.language).display_name()
        if t.channel_layout:
            channels = float(sum(
                self.AUDIO_CHANNEL_LAYOUT_WEIGHT.get(x, 1)
                for x in t.channel_layout.split(" ")
            ))
        else:
            channels = float(t.channel_s)
        bit_rate_mode = f" ({t.bit_rate_mode})" if t.bit_rate_mode else ""
        l1 = f"- {title}, {t.format} {channels} @ {t.other_bit_rate[0]}{bit_rate_mode}"
        data += [(" " + x if i > 0 else x) for i, x in enumerate(textwrap.wrap(l1, 64))]
    return data

def test_create_owner(self):
    lang_code = random.choice(['zh-hans', 'jp', 'ko', 'en', 'es'])
    language = Language.get(lang_code)
    owner_id = utils.random_string()

    params = {
        'owner_id': owner_id,
        'password': utils.generate_password(),
        'name': 'Steven' + utils.random_string(),
        'language': language.__str__(),
        'timezone': 'Asia/Seoul',
        'email': 'Steven' + utils.random_string() + '@mz.co.kr',
        'domain_id': self.domain.domain_id
    }

    owner = self.identity_v1.DomainOwner.create(params)
    self.domain_owner = owner
    self.params = params
    self.assertEqual(params['name'], self.domain_owner.name)

    self._issue_owner_token(params['owner_id'], params['password'])

def _test_create_user(self, name='test', user_id=None):
    if self.role is None:
        self._test_create_role()

    if user_id is None:
        user_id = utils.random_string()[0:10]

    lang_code = random.choice(['zh-hans', 'jp', 'ko', 'en', 'es'])
    language = Language.get(lang_code)

    params = {
        'user_id': user_id,
        'domain_id': self.domain.domain_id,
        'password': '******',
        'name': name + utils.random_string()[0:5],
        'language': language.__str__(),
        'timezone': 'utc+9',
        'tags': {
            'aa': 'bb'
        },
        'email': name + utils.random_string()[0:5] + '@mz.co.kr',
        'mobile': '+821026671234',
        'group': 'group-id'
    }

    self.user = self.identity_v1.User.create(params, metadata=(('token', self.token),))
    self.user = self.identity_v1.User.update_role(
        {
            'user_id': self.user.user_id,
            'domain_id': self.domain.domain_id,
            'roles': [self.role.role_id]
        },
        metadata=(('token', self.token),)
    )
    self.users.append(self.user)
    return self.user

def test_create_owner(self):
    lang_code = random.choice(['zh-hans', 'jp', 'ko', 'en', 'es'])
    language = Language.get(lang_code)
    owner_id = utils.random_string()[0:10]

    param = {
        'owner_id': owner_id,
        'password': '******',
        'name': 'Steven' + utils.random_string()[0:5],
        'language': language.__str__(),
        'timezone': 'utc+9',
        'email': 'Steven' + utils.random_string()[0:5] + '@mz.co.kr',
        'mobile': '+821026671234',
        'domain_id': self.domain.domain_id
    }

    owner = self.identity_v1.DomainOwner.create(param)
    self.domain_owner = owner
    self.param = param
    self.assertEqual(param['name'], self.domain_owner.name)

def change_language(self, lang):
    lang_display_name = self.rbmixfm.lang_combobox_var.get()
    new_lang_code = Language.find(lang_display_name).to_tag()
    if settings.debug:
        print(
            "new_lang_code: ",
            new_lang_code,
            "lang_code: ",
            settings.lang_code,
        )
    if new_lang_code == settings.lang_code:
        return
    restartapp = messagebox.askyesno(title=_("Restart Funing Now?"))
    if restartapp:
        settings.config_yml["lang_code"] = new_lang_code
        yaml.dump(settings.config_yml, open(settings._config_path, "w"))
        sys_executable = sys.executable
        os.execl(sys_executable, sys_executable, *sys.argv)

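# Hedged sketch of the display-name round trip the combobox code above depends on: autonym()
# renders a language name in that language, and Language.find() resolves a display name back to
# a tag. The exact strings are assumptions about langcodes' CLDR name data.
from langcodes import Language

def _display_name_round_trip_example():
    assert Language.make("fr").autonym() == "français"
    assert Language.find("French").to_tag() == "fr"
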
def get_language_info(language):
    """
    Looks up the things we need to know about how to handle text in a given
    language. This will return a dictionary with the following fields:

    'script': a BCP 47 script code such as 'Latn', 'Cyrl', 'Hans'...

        Indicates the script that tokens in this language should be in,
        _after_ our preprocessing. The script for 'zh' is 'Hans', for example,
        because even if the input is in Traditional Chinese ('Hant'), we
        convert it to Simplified.

    'tokenizer': 'regex', 'jieba', 'mecab', or None

        Indicates the best way we know to separate tokens in the language.

        'regex' is what will be used for most languages, meaning that we can
        segment the text with a Unicode-aware regular expression. If a language
        generally uses spaces to separate words, the regex will work well.

        'jieba' and 'mecab' are tokenizers for specific languages written
        without spaces.

        A tokenizer of None means we don't have a good way to segment the
        language. We'll use the regex anyway, but the results will be pretty bad.

    'normal_form': 'NFC' or 'NFKC'

        How "should" Unicode be normalized when comparing text in this
        language? This is not a standard, it's just based on experience.
        Many languages need NFKC normalization for text comparisons to work
        properly, but in many European languages, NFKC normalization is
        excessive and loses information.

    'remove_marks': True or False

        Determines whether marks and decorations, such as vowel points and
        tatweels, should be removed. True for languages in abjad scripts.

    'dotless_i': True or False

        Is "ı" the lowercase of "I" in this language, as in Turkish?

    'diacritics_under': 'cedillas', 'commas', or None

        Should we convert any diacritics that are under the letters "s" and
        "t" in this language? 'cedillas' means we should convert commas to
        cedillas, and 'commas' means we should convert cedillas to commas.

    'transliteration': 'sr-Latn', 'az-Latn', or None

        Indicates a type of transliteration that we should use for normalizing
        a multi-script language. 'sr-Latn' means to use Serbian romanization,
        and 'az-Latn' means to use Azerbaijani romanization.

    'lookup_transliteration': 'zh-Hans' or None

        Indicates a lossy transliteration that should not be used for output,
        but should be applied when looking up words in a list. 'zh-Hans' means
        that we should convert Traditional Chinese characters to Simplified.
    """
    # The input is probably a string, so parse it into a Language. If it's
    # already a Language, it will pass through.
    language = Language.get(language)

    # Assume additional things about the language, such as what script it's in,
    # using the "likely subtags" table
    language_full = language.maximize()

    # Start the `info` dictionary with default values, including the 'script'
    # value that we now know from `language_full`.
    info = {
        'script': language_full.script,
        'tokenizer': 'regex',
        'normal_form': 'NFKC',
        'remove_marks': False,
        'dotless_i': False,
        'diacritics_under': None,
        'transliteration': None,
        'lookup_transliteration': None
    }

    if _language_in_list(language, ['ja', 'ko']):
        info['tokenizer'] = 'mecab'
    elif _language_in_list(language, ['zh', 'yue']):
        info['tokenizer'] = 'jieba'
    elif info['script'] in SPACELESS_SCRIPTS:
        info['tokenizer'] = None

    # Cased alphabetic scripts get NFC normal form
    if info['script'] in ['Latn', 'Grek', 'Cyrl']:
        info['normal_form'] = 'NFC'

    if info['script'] in ['Arab', 'Hebr']:
        info['remove_marks'] = True

    if _language_in_list(language, ['tr', 'az', 'kk']):
        info['dotless_i'] = True
        info['diacritics_under'] = 'cedillas'
    elif _language_in_list(language, ['ro']):
        info['diacritics_under'] = 'commas'

    if _language_in_list(language, ['sr']):
        info['transliteration'] = 'sr-Latn'
    elif _language_in_list(language, ['az']):
        info['transliteration'] = 'az-Latn'

    if language.language == 'zh' and language.script != 'Hant':
        info['lookup_transliteration'] = 'zh-Hans'

    return info

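# Usage sketch derived from the branches of get_language_info() above (the values follow from
# that code plus langcodes' likely-subtags data, e.g. 'tr' maximizes to 'tr-Latn-TR'):
def _language_info_examples():
    turkish = get_language_info('tr')
    assert turkish['script'] == 'Latn'
    assert turkish['normal_form'] == 'NFC'        # cased alphabetic script
    assert turkish['dotless_i'] is True
    assert turkish['diacritics_under'] == 'cedillas'

    japanese = get_language_info('ja')
    assert japanese['tokenizer'] == 'mecab'
    assert japanese['normal_form'] == 'NFKC'      # 'Jpan' is not a cased alphabetic script
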
def cache_IETF():
    cache = {}

    result = query(
        """
        select expr.txt, exprsrc.txt as trans_txt
        from expr
        inner join denotationx as denotation on denotation.expr = expr.id
        inner join denotationx as denotationsrc on denotationsrc.meaning = denotation.meaning
          and denotationsrc.expr != denotation.expr
        inner join expr as exprsrc on exprsrc.id = denotationsrc.expr
        where expr.langvar = uid_langvar('art-274')
          and denotationsrc.langvar = uid_langvar('art-420')
        """)
    for r in result:
        cache[r['txt']] = {}
        cache[r['txt']]['IETF'] = set()

    for r in query(
            """
            select
              langvar.lang_code,
              langvar.region_expr,
              uid(langvar.lang_code,langvar.var_code),
              script_expr.txt as script_expr_txt,
              uid(region_expr_langvar.lang_code,region_expr_langvar.var_code) as region_expr_uid,
              region_expr.txt as region_expr_txt
            from langvar
            inner join expr on expr.id = langvar.name_expr
            inner join expr as script_expr on script_expr.id = langvar.script_expr
            inner join expr as region_expr on region_expr.id = langvar.region_expr
            inner join langvar as region_expr_langvar on region_expr_langvar.id = region_expr.langvar
            where uid(langvar.lang_code,langvar.var_code) = any(%s)
            """,
            (list(cache.keys()),)):
        cache[r['uid']].update(r)

    regions_dict = {}
    for r in query(
            """
            select expr.txt, exprsrc.txt as trans_txt
            from expr
            inner join denotationx as denotation on denotation.expr = expr.id
            inner join denotationx as denotationsrc on denotationsrc.meaning = denotation.meaning
              and denotationsrc.expr != denotation.expr
            inner join expr as exprsrc on exprsrc.id = denotationsrc.expr
            where expr.langvar = uid_langvar('art-006')
              and denotationsrc.expr = any(%s)
            """,
            ([l['region_expr'] for l in cache.values()],)):
        if len(r['txt']) == 2:
            regions_dict[r['trans_txt']] = r['txt']

    for r in result:
        uid = r['txt']
        lang = cache[uid]
        given_tag = Language.get(r['trans_txt'], normalize=False)
        normalized_tag = Language.get(r['trans_txt'], normalize=True)
        language_set = {lang['lang_code'], given_tag.language, normalized_tag.language}
        script_set = {lang['script_expr_txt'], given_tag.script, normalized_tag.script}
        region_set = {given_tag.region, normalized_tag.region}
        if lang['region_expr_uid'] == 'art-279' and lang['region_expr_txt'] != '001':
            region_set.add(lang['region_expr_txt'])
            try:
                region_set.add(regions_dict[lang['region_expr_txt']])
            except KeyError:
                pass
        if {'GB', 'UK'} & region_set:
            region_set |= {'GB', 'UK'}
        if {'001', None} & region_set:
            region_set |= {'001', None}
        for language, script, region in product(language_set, script_set, region_set):
            new_tag = normalized_tag.update_dict({'language': language, 'script': script, 'region': region})
            cache[uid]['IETF'].add(str(new_tag))
        if lang['region_expr_uid'] == 'art-279' and lang['region_expr_txt'] == '001':
            for language, script, region in product({lang['lang_code']}, script_set, {'001', None}):
                new_tag = normalized_tag.update_dict({'language': language, 'script': script, 'region': region})
                cache[uid]['IETF'].add(str(new_tag))

    return cache