def translate(self, phrase, just_transliterate=False):
    """
    Produce a translation record for ``phrase``.

    The result is either a dict with ``term``, ``translation``, ``source``
    and ``quality`` keys built here, or whatever ``self.inner_dict`` /
    ``self.auto_translate`` hold for the phrase.

    :param phrase: text to translate
    :param just_transliterate: skip the dictionary lookups and only run
        ``translit`` over the phrase
    """
    # Phrases rejected by is_cyr are handed back unchanged.
    if not is_cyr(phrase):
        return {
            "term": phrase,
            "translation": phrase,
            "source": "not_required",
            "quality": 10,
        }

    if just_transliterate:
        return {
            "term": phrase,
            "translation": translit(phrase),
            "source": "translit",
            "quality": 10,
        }

    # Strict id first, loose id second; the loose id is only computed when
    # the strict one is not present in the dictionary.
    for build_id in (self.get_id, self.get_loose_id):
        candidate_id = build_id(phrase)
        if candidate_id in self.inner_dict:
            return self.inner_dict[candidate_id]

    guessed = self.auto_translate(phrase)
    if guessed:
        return guessed

    # Nothing matched: optionally record the phrase before falling back to
    # a low-quality transliteration.
    stripped = phrase.strip()
    if stripped and self.store_unseen:
        self.unseen.update([stripped])

    return {
        "term": phrase,
        "translation": translit(phrase),
        "source": "translit",
        "quality": 1,
    }
def tranliterate_all_the_things(name):
    """
    Transliterate both Russian and Ukrainian characters to latin.

    ``name`` goes through ``translit`` with its default table first and the
    outcome is then run through ``translit`` again with
    ``RussianInternationalPassport``. Intended as a last resort for text
    that mixes alphabets.

    >>> tranliterate_all_the_things("bэdtєd і тыква")
    'bedtied i tykva'
    """
    first_pass = translit(name)
    return translit(first_pass, RussianInternationalPassport)
def get_nodes(self):
    """
    Build a ``{"nodes": [...], "edges": [...]}`` graph description.

    The graph has one ``root`` node labelled with ``self.name`` plus one
    node per related person and per related company collected from the
    declarations; every such node is linked to ``root``. When the active
    language is ``"en"`` the root and person labels are passed through
    ``translit``; company labels are used as they are.
    """

    def make_node(node_id, label, classes):
        return {"data": {"id": node_id, "label": label}, "classes": classes}

    def make_root_edge(target_id):
        return {"data": {"source": "root", "target": target_id}}

    def digest(text):
        # Node ids are the SHA-256 hex digest of the untransliterated name.
        return hashlib.sha256(text.encode("utf-8")).hexdigest()

    language = get_language()
    in_english = language == "en"

    persons = set()
    companies = set()

    root_label = translit(self.name) if in_english else self.name

    declarations = self.declarations.exclude(
        doc_type="Форма змін").exclude(exclude=True)
    for declaration in declarations:
        persons |= set(
            declaration.source["related_entities"]["people"]["family"])
        companies |= set(
            declaration.source["related_entities"]["companies"]["owned"])
        companies |= set(
            declaration.source["related_entities"]["companies"]["related"])

    nodes = [make_node("root", root_label, ["root", "person"])]
    edges = []

    for person in persons:
        person_id = digest(person)
        label = translit(person) if in_english else person
        nodes.append(make_node(person_id, label, ["person"]))
        edges.append(make_root_edge(person_id))

    for company in companies:
        company_id = digest(company)
        nodes.append(make_node(company_id, company, ["company"]))
        edges.append(make_root_edge(company_id))

    return {"nodes": nodes, "edges": edges}
def transliterate(self, person_last_name, person_first_name, person_patronymic):
    """
    Build a set of spelling variants for a person's full name.

    The name as given is transliterated with every table in
    ``ALL_UKRAINIAN`` (when ``is_cyr`` accepts it). Each name part is also
    looked up, lower-cased, in ``self.ru_translations``; every combination
    of the resulting alternatives is added to the result as is and, unless
    ``is_ukr`` accepts it, transliterated with every table in
    ``ALL_RUSSIAN``.

    :param person_last_name: last name
    :param person_first_name: first name
    :param person_patronymic: patronymic
    :returns: set of name variants (strings)
    """

    def join_name(last, first, patronymic):
        # An empty middle part leaves two adjacent spaces in the formatted
        # string; collapse them (two spaces -> one) so variants do not
        # carry a doubled separator.
        full = "{} {} {}".format(last, first, patronymic).strip()
        return full.replace("  ", " ")

    def alternatives(part):
        # Alternatives come from ru_translations only when the part is
        # present there and is_cyr accepts it; otherwise the part stands
        # for itself.
        key = part.lower()
        if key in self.ru_translations and is_cyr(part):
            return self.ru_translations[key]
        return [part]

    original = join_name(
        person_last_name, person_first_name, person_patronymic)

    first_names = alternatives(person_first_name)
    last_names = alternatives(person_last_name)
    patronymics = alternatives(person_patronymic)

    translated = [
        join_name(last, first, patronymic)
        for first in first_names
        for patronymic in patronymics
        for last in last_names
    ]

    result = set()

    if is_cyr(original):
        # TODO: also replace double ж, х, ц, ч, ш with single chars
        for ua_table in ALL_UKRAINIAN:
            result.add(translit(original, ua_table))

    for name in translated:
        if not is_ukr(name):
            for ru_table in ALL_RUSSIAN:
                result.add(translit(name, ru_table))

    return result | set(translated)
def conv_link(value):
    """
    Turn ``value`` into a lower-case, URL-quoted anchor string.

    The value is passed through ``translit`` and then through
    ``translit_ru(..., 'ru', reversed=True)``; spaces become hyphens,
    apostrophes are dropped, and the result is quoted with ``quote_plus``
    and lower-cased.

    :param value: source text
    :returns: anchor string
    """
    anchor = translit(value)
    anchor = translit_ru(anchor, 'ru', reversed=True)
    # str.replace already handles every occurrence, so one call per
    # character is enough; no need to walk the string char by char.
    anchor = anchor.replace(' ', '-').replace('\'', '')
    return quote_plus(anchor).lower()
def company_details(request, company_id):
    """
    Assemble the context dict for a single company.

    Responds with 404 (via ``get_object_or_404``) when the company does not
    exist. ``filename`` is the transliterated, lower-cased Ukrainian name
    with spaces turned into underscores and newlines removed when
    ``is_cyr`` accepts that name, otherwise the primary key.
    """
    company = get_object_or_404(Company, pk=company_id)

    if is_cyr(company.name_uk):
        normalized_name = (
            company.name_uk.lower().strip()
            .replace(" ", "_")
            .replace("\n", "")
        )
        filename = translit(normalized_name)
    else:
        filename = company.pk

    return {
        "company": company,
        "filename": filename,
        "feedback_form_override": FeedbackForm(
            initial={"person": unicode(company.name)}),
    }
def _get_unique_slug(text, Model):
    """
    Build a slug from ``text`` that no ``Model`` row uses yet.

    The text is transliterated with ``RussianSimple`` and slugified. A slug
    longer than ``settings.SLUG_MAX_LENGTH`` is cut down. While the
    candidate is already taken, ``-1``, ``-2``, ... is appended.

    :param text: source text for the slug
    :param Model: model class queried through ``Model.objects`` for
        existing ``slug`` values
    :returns: the first candidate not found in the database
    """
    max_length = settings.SLUG_MAX_LENGTH

    slug = slugify(translit(text, RussianSimple))
    if len(slug) > max_length:
        slug = slug[:max_length - 1]

    unique_slug = slug
    num = 1
    while Model.objects.filter(slug=unique_slug).exists():
        suffix = '-{}'.format(num)
        # Trim the base so that base + suffix stays within the limit; a
        # plain append could overshoot SLUG_MAX_LENGTH for long slugs.
        # NOTE(review): assumes SLUG_MAX_LENGTH matches the slug column's
        # max_length - confirm against the model definition.
        room_for_base = max(max_length - len(suffix), 0)
        unique_slug = slug[:room_for_base] + suffix
        num += 1
    return unique_slug
def company_details(request, company_id):
    """
    Assemble the context dict for a single company, including its articles.

    Responds with 404 (via ``get_object_or_404``) when the company does not
    exist. ``articles`` holds the company's articles with ``kind="i"`` and
    ``publish=True``, ordered by descending date. ``filename`` is the
    transliterated, lower-cased Ukrainian name with spaces turned into
    underscores and newlines removed when ``is_cyr`` accepts that name,
    otherwise the primary key.
    """
    company = get_object_or_404(Company, pk=company_id)
    published_articles = company.articles.filter(
        kind="i", publish=True).order_by("-date")

    if is_cyr(company.name_uk):
        normalized_name = (
            company.name_uk.lower().strip()
            .replace(" ", "_")
            .replace("\n", "")
        )
        filename = translit(normalized_name)
    else:
        filename = company.pk

    return {
        "company": company,
        "articles": published_articles,
        "filename": filename,
        "feedback_form_override": FeedbackForm(
            initial={"person": unicode(company.name)}),
    }
def person_details(request, person_id):
    """
    Assemble the context dict for a single person.

    Responds with 404 (via ``get_object_or_404``) when the person does not
    exist. ``scoring_score`` is the sum of ``rule.weight * rule.scale``
    over the person's flags. ``filename`` is the transliterated,
    lower-cased full Ukrainian name with spaces turned into underscores and
    newlines removed when ``is_cyr`` accepts it, otherwise the primary key.
    """
    person = get_object_or_404(Person, pk=person_id)
    flags_qs = person.flags.select_related("rule").order_by("rule__id")
    weight_scale_pairs = flags_qs.values_list("rule__weight", "rule__scale")

    context = {
        "person": person,
        "query": "",
        "all_declarations": person.get_declarations(),
        "charts_data": person.get_charts_data(),
        "scoring_score": sum(
            weight * scale for weight, scale in weight_scale_pairs),
        "scoring_flags": flags_qs.order_by("-rule__weight", "pk").nocache(),
        "country_connections_titles": Person2Country._relationships_explained,
        "articles": person.articles.filter(
            kind="i", publish=True).order_by("-date"),
    }

    name_parts = (
        person.last_name_uk,
        person.first_name_uk,
        person.patronymic_uk,
    )
    full_name = "%s %s %s" % name_parts

    if is_cyr(full_name):
        normalized_name = (
            full_name.lower().strip().replace(" ", "_").replace("\n", ""))
        filename = translit(normalized_name)
    else:
        filename = person.pk

    context["filename"] = filename
    context["feedback_form_override"] = FeedbackForm(
        initial={"person": unicode(person)})
    return context
def person_details(request, person_id):
    """
    Assemble the context dict for a single person.

    Responds with 404 (via ``get_object_or_404``) when the person does not
    exist. ``filename`` is the transliterated, lower-cased full Ukrainian
    name with spaces turned into underscores and newlines removed when
    ``is_cyr`` accepts it, otherwise the primary key.
    """
    person = get_object_or_404(Person, pk=person_id)

    context = {
        "person": person,
        "query": "",
        "all_declarations": person.get_declarations(),
    }

    name_parts = (
        person.last_name_uk,
        person.first_name_uk,
        person.patronymic_uk,
    )
    full_name = "%s %s %s" % name_parts

    if is_cyr(full_name):
        normalized_name = (
            full_name.lower().strip().replace(" ", "_").replace("\n", ""))
        filename = translit(normalized_name)
    else:
        filename = person.pk

    context["filename"] = filename
    context["feedback_form_override"] = FeedbackForm(
        initial={"person": unicode(person)})
    return context
def transform_company(self, company):
    """
    Applies pre-process, categorization and parsing steps to record from
    the company registry.

    This is a generator: it yields one flat dict per founder record of the
    company, or a single base record when the preprocessor returns no
    founders.

    :param company: One record from the input file
    :type company: dict
    :returns: Results of processing
    :rtype: Dict
    """
    # All three pipeline components must be set before processing.
    assert (
        self.preprocessor and self.beneficiary_categorizer and self.parser
    )
    # Company-level fields shared by every record yielded for this company;
    # the four boolean flags start as False and are overridden per founder.
    base_rec = {
        "Company name": company["name"],
        "Company name EN": translit(company["name"]),
        "Company short name": company["short_name"],
        "Company short name EN": translit(company["short_name"]),
        "Company number": company["edrpou"],
        "Company address": company["location"],
        "Company head": company["head"],
        "Company head EN": translit(company["head"]),
        "Company profile": company["company_profile"],
        "Company status": company["status"],
        "Is beneficial owner": False,
        "BO is absent": False,
        "Has reference": False,
        "Was dereferenced": False,
    }
    founders = self.preprocessor.process_founders(company)
    # No founders: emit the bare company record.
    # NOTE(review): execution continues into the loop below, so this relies
    # on process_founders returning an empty iterable rather than None -
    # confirm.
    if not founders:
        yield base_rec
    had_references = False
    true_founders = []
    # First pass: records classified by the categorizer are parsed and
    # yielded right away; the rest are kept for the second pass.
    for founder in founders:
        rec = base_rec.copy()
        rec["Raw founder record"] = founder
        if self.beneficiary_categorizer.classify(founder):
            rec["Is beneficial owner"] = True
            owner = self.parser.parse_founders_record(founder, include_stats=True)
            owner["Name EN"] = list(map(translit, owner["Name"]))
            owner["BO is absent"] = self.beneficiary_categorizer.is_absent(founder)
            # No parsed name and not marked absent: check whether the
            # record is a reference, and remember that for the second pass.
            if not owner["Name"] and not owner["BO is absent"]:
                if self.beneficiary_categorizer.is_reference(founder):
                    owner["Has reference"] = True
                    had_references = True
            rec.update(owner)
            yield rec
        else:
            true_founders.append(founder)
    # Second pass: checking if there was BO records that references to founders
    # so we parse founders records too:
    for founder in true_founders:
        rec = base_rec.copy()
        rec["Raw founder record"] = founder
        if had_references:
            rec["Is beneficial owner"] = True
            owner = self.parser.parse_founders_record(founder, include_stats=True)
            owner["Name EN"] = list(map(translit, owner["Name"]))
            owner["Was dereferenced"] = True
            rec.update(owner)
        # NOTE(review): as laid out here the founder record is emitted
        # whether or not it was dereferenced - confirm this is intended.
        yield rec
def autocomplete_suggestions(name):
    """
    Return a set holding ``title(name)`` and its ``UkrainianKMU``
    transliteration.
    """
    suggestions = {title(name)}
    suggestions.add(translit(title(name), UkrainianKMU))
    return suggestions
def get_all_unique(text):
    """
    Transliterate ``text`` with every table in ``ALL_UKRAINIAN`` and return
    the distinct results as a list.
    """
    distinct = {translit(text, table) for table in ALL_UKRAINIAN}
    return list(distinct)
def translit_to_en(value):
    """Return ``value`` passed through ``translit`` with its default table."""
    transliterated = translit(value)
    return transliterated
def to_dict(self):
    """
    Serialize the company into a plain dict.

    Starts from ``model_to_dict`` over a fixed field list, then adds related
    persons / countries / companies, human-readable status and dates,
    autocomplete suggestions (``name_suggest``) and, when ``edrpou`` is
    set, its chunks (``code_chunks``).
    """
    exported_fields = [
        "id",
        "name_uk",
        "short_name_uk",
        "name_en",
        "short_name_en",
        "state_company",
        "edrpou",
        "wiki",
        "city",
        "street",
        "other_founders",
        "other_recipient",
        "other_owners",
        "other_managers",
        "bank_name",
        "also_known_as",
    ]
    d = model_to_dict(self, fields=exported_fields)

    d["related_persons"] = [
        link.to_person_dict()
        for link in self.from_persons.prefetch_related("from_person")
    ]
    d["related_countries"] = [
        link.to_dict()
        for link in self.from_countries.prefetch_related("to_country")
    ]

    outgoing = [
        link.to_dict()
        for link in self.to_companies.prefetch_related("to_company")
    ]
    incoming = [
        link.to_dict_reverse()
        for link in self.from_companies.prefetch_related("from_company")
    ]
    d["related_companies"] = outgoing + incoming

    d["status"] = self.get_status_display()
    d["status_en"] = translate_into(self.get_status_display())
    d["founded"] = self.founded_human
    d["category"] = self.category
    d["closed"] = self.closed_on_human
    d["last_modified"] = self.last_modified

    # Ukrainian names plus each of their transliterations, then the
    # English names.
    names = set([
        d["name_uk"],
        d["short_name_uk"],
    ])
    for ukr_name in list(names):
        names.update(
            translit(ukr_name, ua_table) for ua_table in ALL_UKRAINIAN)
    names.add(d["name_en"])
    names.add(d["short_name_en"])

    # For every name, one suggestion per word with that word moved to the
    # front and the remaining words kept in order.
    suggestions = []
    for field in names:
        if field:
            chunks = [
                word.strip("'\",.-“”«»") for word in field.split(" ")
            ]
            for i, lead in enumerate(chunks):
                rotated = [lead] + chunks[:i] + chunks[i + 1:]
                suggestions.append(" ".join(rotated))

    if self.edrpou:
        # Split the code on runs of letters, keeping the letter runs.
        pieces = re.split("([a-z]+)", self.edrpou, flags=re.IGNORECASE)
        edrpou_chunks = list(filter(None, map(unicode.strip, pieces)))

        suggestions.extend(edrpou_chunks)
        suggestions.append(self.edrpou.lstrip("0"))
        if self.edrpou.isdigit():
            suggestions.append(self.edrpou.rjust(8, "0"))

        d["code_chunks"] = edrpou_chunks

    d["name_suggest"] = [{"input": x} for x in set(suggestions)]
    d["name_suggest_output"] = d["short_name_uk"] or d["name_uk"]
    d["name_suggest_output_en"] = d["short_name_en"] or d["name_en"]
    d["_id"] = d["id"]

    return d
def _generate_slug(self):
    """
    Set ``self.slug`` from the transliterated title plus a timestamp.

    The title is passed through ``translit`` and ``slugify``; the current
    local time (second resolution) is appended.
    """
    value = translit(self.title)
    slug_candidate = slugify(value, allow_unicode=True)
    # ctime() renders like 'Mon Jan  1 12:00:00 2024' - the spaces and
    # colons are not slug characters, so use a digits-and-hyphen stamp
    # with the same one-second resolution instead.
    stamp = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
    # Skip an empty title part so the slug never starts with a hyphen.
    self.slug = '-'.join(part for part in (slug_candidate, stamp) if part)
def _generate_slug(self):
    """Set ``self.slug`` to the slugified transliteration of ``self.name``."""
    self.slug = slugify(translit(self.name), allow_unicode=True)
def from_cyrillic_to_eng(text: str):
    """
    Replace spaces with underscores, lower-case the text and pass it
    through ``translit``.
    """
    normalized = text.replace(' ', '_').lower()
    return translit(normalized)
def transliterate(self, person_last_name, person_first_name, person_patronymic):
    """
    Build a set of spelling variants for a person's name.

    The name as given is transliterated with every table in
    ``ALL_UKRAINIAN`` (when ``is_cyr`` accepts it); the combinations built
    from ``self.ru_translations`` are transliterated with every table in
    ``ALL_RUSSIAN`` (unless ``is_ukr`` accepts them). In both cases extra
    variants are produced from ``self.special_replacements`` (regex based)
    and ``self.special_cases`` (name-part based).

    Parts that have no entry in ``self.ru_translations`` are passed to
    ``self.add_for_translation``.

    :returns: set of variants; also contains ``self.get_name`` of every
        translated combination
    """
    first_names = []
    last_names = []
    patronymics = []
    # Names are handled as (last, first, patronymic) tuples and turned into
    # strings with self.get_name.
    original = [(person_last_name, person_first_name, person_patronymic)]
    result = set()
    # For each part: take the alternatives from ru_translations (keyed by
    # the lower-cased part) when present and accepted by is_cyr, otherwise
    # keep the part itself and register it via add_for_translation.
    if (person_first_name.lower() in self.ru_translations and
            is_cyr(person_first_name)):
        first_names = self.ru_translations[person_first_name.lower()]
    else:
        first_names = [person_first_name]
        self.add_for_translation(person_first_name)
    if (person_last_name.lower() in self.ru_translations and
            is_cyr(person_last_name)):
        last_names = self.ru_translations[person_last_name.lower()]
    else:
        last_names = [person_last_name]
        self.add_for_translation(person_last_name)
    if (person_patronymic.lower() in self.ru_translations and
            is_cyr(person_patronymic)):
        patronymics = self.ru_translations[person_patronymic.lower()]
    else:
        patronymics = [person_patronymic]
        self.add_for_translation(person_patronymic)
    # Every combination of the alternatives found above.
    translated = [
        (l, f, p)
        for f in first_names
        for p in patronymics
        for l in last_names
    ]
    # Pass 1: the name as given, Ukrainian tables.
    for n in original:
        name = self.get_name(n)
        if is_cyr(name):
            for ua_table in ALL_UKRAINIAN:
                result.add(translit(name, ua_table))
            # Regex-driven variants: substitute each replacement into the
            # joined name, then transliterate the title-cased result.
            for sc_rex, replacements in self.special_replacements.items():
                if re.search(sc_rex, name, flags=re.I | re.U):
                    for repl in replacements:
                        optional_n = re.sub(sc_rex, repl, name, flags=re.I | re.U)
                        result.add(translit(title(optional_n), UkrainianKMU))
            # Whole-part variants: `sc in n` tests membership in the name
            # tuple, so a special case must equal one of the parts.
            for sc, replacements in self.special_cases.items():
                if sc in n:
                    for repl in replacements:
                        optional_n = self.replace_item(n, sc, repl)
                        result.add(translit(self.get_name(optional_n), UkrainianKMU))
    # Pass 2: translated combinations, Russian tables.
    for n in translated:
        name = self.get_name(n)
        if not is_ukr(name):
            for ru_table in ALL_RUSSIAN:
                result.add(translit(name, ru_table))
            for sc_rex, replacements in self.special_replacements.items():
                if re.search(sc_rex, name, flags=re.I | re.U):
                    for repl in replacements:
                        optional_n = re.sub(sc_rex, repl, name, flags=re.I | re.U)
                        result.add(translit(title(optional_n), RussianInternationalPassport))
            for sc, replacements in self.special_cases.items():
                if sc in n:
                    for repl in replacements:
                        optional_n = self.replace_item(n, sc, repl)
                        result.add(translit(
                            self.get_name(optional_n), RussianInternationalPassport)
                        )
    # The untransliterated combinations are part of the result as well.
    return result | set(map(self.get_name, translated))
# Minimal usage example: transliterate a Ukrainian phrase with translit's
# default table and print the result.
from translitua import translit

a = translit('Тіні забутих предків')
print(a)