def s_to_d(input_string):
    """Transliterate an SLP1 string to Devanagari, logging both sides.

    The detected encoding of the input is printed but not enforced: the
    SLP1 -> Devanagari scheme map is applied unconditionally.
    """
    detected = detect.detect(input_string)
    print("input: " + input_string + " --- encoding = " + str(detected))
    slp_to_deva = sanscript.SchemeMap(
        sanscript.SCHEMES[sanscript.SLP1],
        sanscript.SCHEMES[sanscript.DEVANAGARI])
    result = sanscript.transliterate(input_string, scheme_map=slp_to_deva)
    detected = detect.detect(result)
    print("output: " + result + " --- encoding = " + str(detected))
    return result
def v_to_d(input_string):
    """Transliterate a Velthuis-encoded string to Devanagari.

    Input whose detected encoding is not "Velthuis" is returned unchanged.

    :param input_string: text to transliterate.
    :return: the Devanagari rendering, or the input itself if not Velthuis.
    """
    scheme = detect.detect(input_string)
    if str(scheme) != "Velthuis":
        # Not Velthuis: pass through untouched.
        return input_string
    v_scheme_map = sanscript.SchemeMap(
        sanscript.SCHEMES[sanscript.VELTHUIS],
        sanscript.SCHEMES[sanscript.DEVANAGARI])
    # NOTE: the original re-detected the output's encoding into a local
    # that was never read; that dead call/assignment has been removed.
    return sanscript.transliterate(input_string, scheme_map=v_scheme_map)
def d_to_i(input_string):
    """Transliterate a Devanagari string to ITRANS (ASCII).

    Input whose detected encoding is not "Devanagari" is returned unchanged.

    :param input_string: text to transliterate.
    :return: the ITRANS rendering, or the input itself if not Devanagari.
    """
    # Removed: commented-out debug prints and a dead re-detection of the
    # output encoding whose result was never used.
    scheme = detect.detect(input_string)
    if str(scheme) != "Devanagari":
        return input_string
    d_scheme_map = sanscript.SchemeMap(
        sanscript.SCHEMES[sanscript.DEVANAGARI],
        sanscript.SCHEMES[sanscript.ITRANS])
    return sanscript.transliterate(input_string, scheme_map=d_scheme_map)
def get_storage_name(text, source_script=None, max_length=50, maybe_use_dravidian_variant=True, mixed_languages_in_titles=True):
    """Derive a filesystem-friendly OPTITRANS storage name from *text*.

    "/" (plus trailing spaces) becomes "__"; capitalizable roman-script
    input is converted to OPTITRANS; the result is passed through
    clean_file_path and truncated to *max_length* characters.

    :param text: the title/text to derive a name from.
    :param source_script: scheme id of *text*; auto-detected when None.
    :param max_length: truncation limit for the result; None disables it.
    :param maybe_use_dravidian_variant: forwarded to sanscript.transliterate.
    :param mixed_languages_in_titles: when True, non-Indic spans are marked
        off with <...> and transliteration is suspended inside them.
    """
    from indic_transliteration import detect
    if source_script is None:
        # Autodetect the input scheme.
        source_script = detect.detect(text=text)
    # "/" separators (with any trailing spaces) become "__" in file names.
    text_optitrans = regex.sub("/ *", "__", text)
    if source_script in roman.ALL_SCHEME_IDS:
        if source_script in roman.CAPITALIZABLE_SCHEME_IDS:
            if mixed_languages_in_titles:
                # mark_off_non_indic_in_line wraps non-Indic spans in <...>;
                # suspend_on/suspend_off make transliterate skip them.
                text_optitrans = sanscript.SCHEMES[sanscript.IAST].mark_off_non_indic_in_line(text_optitrans)
                text_optitrans = sanscript.transliterate(text_optitrans, source_script, sanscript.OPTITRANS, suspend_on=set('<'), suspend_off=set('>'), maybe_use_dravidian_variant=maybe_use_dravidian_variant)
            else:
                text_optitrans = sanscript.transliterate(text_optitrans, source_script, sanscript.OPTITRANS, maybe_use_dravidian_variant=maybe_use_dravidian_variant)
    # NOTE(review): nesting reconstructed from a whitespace-mangled source;
    # as written, non-capitalizable roman schemes and non-roman scripts are
    # NOT transliterated — confirm against the upstream original.
    storage_name = clean_file_path(text_optitrans)
    if max_length is not None:
        storage_name = storage_name[:max_length]
    return storage_name
def apply_transliteration(text):
    """Detect the script of *text* and romanize it when it is not already roman.

    Text already in a roman scheme (ITRANS/HK/SLP1/IAST/Velthuis/Kolkata)
    is returned as-is; anything else is transliterated to lowercase HK.
    """
    lang = detect.detect(str(text))
    roman_schemes = [
        Scheme.ITRANS, Scheme.HK, Scheme.SLP1,
        Scheme.IAST, Scheme.Velthuis, Scheme.Kolkata,
    ]
    if lang not in roman_schemes:
        source_scheme = getattr(sanscript, lang.upper())
        text = transliterate(text, source_scheme, sanscript.HK).lower()
    return text
def __init__(self, thing, encoding=None, unicode_encoding='utf-8'):
    """Wrap *thing*, storing it internally as a unicode SLP1 string.

    :param thing: the text to wrap (unicode or byte string).
    :param encoding: scheme of *thing*; auto-detected when None.
    :param unicode_encoding: codec used to decode a byte-string *thing*.
    """
    assert isinstance(thing, six.string_types)
    # Philosophy: encode early, unicode everywhere, decode late —
    # but both unicode and byte strings are accepted at this boundary.
    if isinstance(thing, six.text_type):
        self.thing = thing
    else:
        self.thing = six.text_type(thing, unicode_encoding)
    if encoding is None:
        # No encoding supplied: autodetect from the content.
        encoding = SCHEMES[detect.detect(self.thing)]
    if encoding != SLP1:
        # Canonical internal representation is SLP1.
        self.thing = sanscript.transliterate(self.thing, encoding, SLP1)
def i_to_d(input_string):
    """Transliterate ITRANS- or HK-encoded text to Devanagari.

    Input detected as any other scheme is returned unchanged.
    """
    scheme_name = str(detect.detect(input_string))
    source_schemes = {"ITRANS": sanscript.ITRANS, "HK": sanscript.HK}
    if scheme_name not in source_schemes:
        # Unsupported source scheme: pass through untouched.
        return input_string
    scheme_map = sanscript.SchemeMap(
        sanscript.SCHEMES[source_schemes[scheme_name]],
        sanscript.SCHEMES[sanscript.DEVANAGARI])
    return sanscript.transliterate(input_string, scheme_map=scheme_map)
def main():
    """CLI entry point: report whether sys.argv[1] (Devanagari) is pragrahya.

    The argument is transliterated Devanagari -> Velthuis before the check;
    non-Devanagari input yields a warning and an empty string is checked.
    """
    is_pragrahya = False
    raw = sys.argv[1]
    print("input:" + raw)
    scheme = detect.detect(raw)
    input_string = ""
    print("input: " + raw + " --- encoding = " + str(scheme))
    if str(scheme) == "Devanagari":
        deva_to_velthuis = sanscript.SchemeMap(
            sanscript.SCHEMES[sanscript.DEVANAGARI],
            sanscript.SCHEMES[sanscript.VELTHUIS])
        input_string = sanscript.transliterate(raw, scheme_map=deva_to_velthuis)
    else:
        print("please enter the input in devanagari")
    is_pragrahya = pragrahya_check(input_string)
    print("**Pragrahya " + str(is_pragrahya))
def __init__(self, thing=None, encoding=None, unicode_encoding='utf-8', strict_io=True, replace_ending_visarga='s'): assert isinstance(thing, six.string_types) # Encode early, unicode everywhere, decode late is the philosophy # However, we need to accept both unicode and non unicode strings # We are udAramatiH if isinstance(thing, six.text_type): self.thing = thing else: self.thing = six.text_type(thing, unicode_encoding) self.encoding = encoding if self.encoding is None: if thing is not None: # Autodetect Encoding self.encoding = SCHEMES[detect.detect(self.thing)] if self.encoding != SLP1: # Convert to SLP1 self.thing = self.transcoded(SLP1) self.encoding = SLP1 if not strict_io: # Normalize logger.debug("Before normalization: %s", self.thing) tmp = normalization.normalize(self.thing) if replace_ending_visarga == 's': self.thing = normalization.replace_ending_visarga_s(tmp) elif replace_ending_visarga == 'r': self.thing = normalization.replace_ending_visarga_r(tmp) else: self.thing = tmp # Lazy Anusvaras (see issue #103) tmpi = sanscript.transliterate(self.thing, SLP1, ITRANS) tmpi = deduplication.fix_lazy_anusvaara_itrans(tmpi) self.thing = sanscript.transliterate(tmpi, ITRANS, SLP1) logger.debug("After normalization: %s", self.thing) # Tags will go here as # { lexical_tag : [possible morphologies] } self.tags = []
def test_noisy(data):
    """detect() must ignore surrounding ASCII/Unicode noise."""
    noise = ' \t\n 1234567890 !@#$%^&*(),.<>\'\"-_[]{}\\|;:`~ ΣД あア'
    sample, expected = data
    padded = noise + sample + noise
    assert detect(padded) == expected, data
def test_decoded(data):
    """detect() must give the same scheme for unicode-decoded input.

    On Python 2 the sample is decoded from UTF-8 first; on Python 3 it is
    already text.
    """
    sample, expected = data
    if sys.version_info < (3, 0):
        sample = sample.decode('utf-8')
    actual = detect(sample)
    assert actual == expected, u'%s == %s (%s)' % (actual, expected, sample)
def test_basic(data):
    """detect() must identify the scheme of each (text, scheme) sample."""
    sample, expected = data
    actual = detect(sample)
    assert actual == expected, u'%s == %s (%s)' % (actual, expected, sample)
def test_decoded(data):
    """detect() must give the same scheme for unicode-decoded input.

    Fix: the unconditional ``text.decode('utf-8')`` raises AttributeError
    on Python 3, where ``str`` has no ``decode``; guard it with the same
    version check used by the sibling test_decoded elsewhere in this file.
    """
    text, scheme = data
    if sys.version_info < (3, 0):
        text = text.decode('utf-8')
    detection = detect(text)
    assert detection == scheme, u'%s == %s (%s)' % (detection, scheme, text)