Пример #1
0
def s_to_d(input_string):
    """Transliterate ``input_string`` from SLP1 to Devanagari.

    The scheme reported by the detector is printed for both the input and
    the result, purely as a diagnostic: the conversion always treats the
    input as SLP1 regardless of what was detected.

    Returns the transliterated string.
    """
    detected_in = detect.detect(input_string)
    print("input: " + input_string + " ---   encoding = " + str(detected_in))

    slp1_scheme = sanscript.SCHEMES[sanscript.SLP1]
    devanagari_scheme = sanscript.SCHEMES[sanscript.DEVANAGARI]
    slp1_to_devanagari = sanscript.SchemeMap(slp1_scheme, devanagari_scheme)
    converted = sanscript.transliterate(input_string,
                                        scheme_map=slp1_to_devanagari)

    detected_out = detect.detect(converted)
    print("output: " + converted + " ---   encoding = " + str(detected_out))
    return converted
Пример #2
0
def v_to_d(input_string):
    """Transliterate Velthuis text to Devanagari.

    The scheme of ``input_string`` is auto-detected. Input that is not
    detected as Velthuis is returned unchanged.

    Returns the Devanagari string, or ``input_string`` itself when no
    conversion applies.
    """
    if str(detect.detect(input_string)) != "Velthuis":
        return input_string
    velthuis_to_devanagari = sanscript.SchemeMap(
        sanscript.SCHEMES[sanscript.VELTHUIS],
        sanscript.SCHEMES[sanscript.DEVANAGARI])
    # The previous version re-ran detection on the result and discarded the
    # answer; that extra pass had no effect on the return value.
    return sanscript.transliterate(input_string,
                                   scheme_map=velthuis_to_devanagari)
Пример #3
0
def d_to_i(input_string):
    """Transliterate Devanagari text to ITRANS.

    The scheme of ``input_string`` is auto-detected. Input that is not
    detected as Devanagari is returned unchanged.

    Returns the ITRANS string, or ``input_string`` itself when no
    conversion applies.
    """
    if str(detect.detect(input_string)) != "Devanagari":
        return input_string
    devanagari_to_itrans = sanscript.SchemeMap(
        sanscript.SCHEMES[sanscript.DEVANAGARI],
        sanscript.SCHEMES[sanscript.ITRANS])
    # The previous version re-ran detection on the result only to feed a
    # commented-out debug print; the unused pass has been dropped.
    return sanscript.transliterate(input_string,
                                   scheme_map=devanagari_to_itrans)
Пример #4
0
def get_storage_name(text,
                     source_script=None,
                     max_length=50,
                     maybe_use_dravidian_variant=True,
                     mixed_languages_in_titles=True):
    """Build a storage name for ``text`` via an OPTITRANS rendering.

    Steps, in order:

    1. ``source_script`` is auto-detected from ``text`` when not given.
    2. Every ``/`` together with any spaces that follow it becomes ``__``.
    3. Non-Roman sources are transliterated to OPTITRANS. Roman sources
       are transliterated only when the scheme is listed in
       ``roman.CAPITALIZABLE_SCHEME_IDS`` (with ``<``/``>`` suspending and
       resuming transliteration); other Roman schemes pass through as-is.
       When ``mixed_languages_in_titles`` is true, the capitalizable path
       first runs the IAST scheme's ``mark_off_non_indic_in_line``.
    4. The result goes through ``clean_file_path`` and is truncated to
       ``max_length`` characters unless ``max_length`` is ``None``.
    """
    from indic_transliteration import detect
    if source_script is None:
        source_script = detect.detect(text=text)

    name = regex.sub("/ *", "__", text)

    if source_script not in roman.ALL_SCHEME_IDS:
        name = sanscript.transliterate(
            name,
            source_script,
            sanscript.OPTITRANS,
            maybe_use_dravidian_variant=maybe_use_dravidian_variant)
    elif source_script in roman.CAPITALIZABLE_SCHEME_IDS:
        if mixed_languages_in_titles:
            iast_scheme = sanscript.SCHEMES[sanscript.IAST]
            name = iast_scheme.mark_off_non_indic_in_line(name)
        name = sanscript.transliterate(
            name,
            source_script,
            sanscript.OPTITRANS,
            suspend_on=set('<'),
            suspend_off=set('>'),
            maybe_use_dravidian_variant=maybe_use_dravidian_variant)

    cleaned = clean_file_path(name)
    return cleaned if max_length is None else cleaned[:max_length]
Пример #5
0
def apply_transliteration(text):
    """Detect the script of ``text`` and romanize it when it is not Roman.

    Text detected as ITRANS, HK, SLP1, IAST, Velthuis or Kolkata is
    returned untouched. Anything else is transliterated to HK from the
    ``sanscript`` scheme named after the detected script, then lower-cased.
    """
    detected = detect.detect(str(text))
    roman_schemes = (
        Scheme.ITRANS, Scheme.HK, Scheme.SLP1, Scheme.IAST,
        Scheme.Velthuis, Scheme.Kolkata,
    )
    if detected in roman_schemes:
        return text
    # sanscript exposes each scheme id under the upper-cased script name.
    source_scheme = getattr(sanscript, detected.upper())
    romanized = transliterate(text, source_scheme, sanscript.HK)
    return romanized.lower()
Пример #6
0
 def __init__(self, thing, encoding=None, unicode_encoding='utf-8'):
     """Store ``thing`` as text, transliterated to SLP1.

     ``thing`` must be a string (text or bytes-like per ``six``).
     Non-text input is decoded with ``unicode_encoding``. When
     ``encoding`` is ``None`` the scheme is auto-detected and looked up
     in ``SCHEMES``.
     """
     assert isinstance(thing, six.string_types)
     # Philosophy: encode early, unicode everywhere, decode late.
     # Both unicode and non-unicode strings are accepted, though.
     # We are udAramatiH
     already_text = isinstance(thing, six.text_type)
     self.thing = (thing if already_text
                   else six.text_type(thing, unicode_encoding))

     source_encoding = encoding
     if source_encoding is None:
         # No scheme supplied: detect it from the text itself
         detected = detect.detect(self.thing)
         source_encoding = SCHEMES[detected]

     if source_encoding != SLP1:
         # Internal representation is always SLP1
         self.thing = sanscript.transliterate(self.thing, source_encoding,
                                              SLP1)
Пример #7
0
def i_to_d(input_string):
    """Transliterate ITRANS or HK text to Devanagari.

    The scheme of ``input_string`` is auto-detected. Input detected as
    anything other than ITRANS or HK is returned unchanged.
    """
    detected_name = str(detect.detect(input_string))
    source_by_name = {
        "ITRANS": sanscript.ITRANS,
        "HK": sanscript.HK,
    }
    if detected_name not in source_by_name:
        return input_string

    to_devanagari = sanscript.SchemeMap(
        sanscript.SCHEMES[source_by_name[detected_name]],
        sanscript.SCHEMES[sanscript.DEVANAGARI])
    return sanscript.transliterate(input_string, scheme_map=to_devanagari)
Пример #8
0
def main():
    """Command-line entry point: report whether ``sys.argv[1]`` is pragrahya.

    The argument must be Devanagari. It is transliterated to Velthuis and
    handed to ``pragrahya_check``; the verdict is printed.

    If the argument is missing the process exits with an error message
    instead of an ``IndexError`` traceback. If the argument is not
    detected as Devanagari, the function prints the request for Devanagari
    input and returns without running the check. Previously the check was
    still run on an empty placeholder string, printing a verdict for text
    the user never supplied.
    """
    if len(sys.argv) < 2:
        # Non-zero exit status, message on stderr.
        sys.exit("please enter the input in devanagari")

    argument = sys.argv[1]
    print("input:" + argument)
    scheme = detect.detect(argument)
    print("input: " + argument + " ---   encoding = " + str(scheme))

    if str(scheme) != "Devanagari":
        print("please enter the input in devanagari")
        return

    devanagari_to_velthuis = sanscript.SchemeMap(
        sanscript.SCHEMES[sanscript.DEVANAGARI],
        sanscript.SCHEMES[sanscript.VELTHUIS])
    velthuis_text = sanscript.transliterate(
        argument, scheme_map=devanagari_to_velthuis)

    is_pragrahya = pragrahya_check(velthuis_text)
    print("**Pragrahya " + str(is_pragrahya))
Пример #9
0
 def __init__(self,
              thing=None,
              encoding=None,
              unicode_encoding='utf-8',
              strict_io=True,
              replace_ending_visarga='s'):
     """Store ``thing`` as SLP1 text, optionally normalizing it.

     Args:
         thing: The input string; must satisfy ``six.string_types``.
             NOTE(review): the default is ``None``, but the assert below
             rejects ``None`` -- confirm whether a default is intended.
         encoding: Scheme of ``thing``. When ``None`` it is detected from
             the text and looked up in ``SCHEMES``.
         unicode_encoding: Codec used to decode ``thing`` when it is not
             already a text string.
         strict_io: When false, the text is additionally normalized
             (see the ``if not strict_io`` branch).
         replace_ending_visarga: ``'s'`` or ``'r'`` selects the matching
             ``normalization.replace_ending_visarga_*`` helper; any other
             value keeps the plain normalized text. Only consulted when
             ``strict_io`` is false.

     Sets ``self.thing``, ``self.encoding`` and ``self.tags``.
     """
     assert isinstance(thing, six.string_types)
     # Encode early, unicode everywhere, decode late is the philosophy
     # However, we need to accept both unicode and non unicode strings
     # We are udAramatiH
     if isinstance(thing, six.text_type):
         self.thing = thing
     else:
         self.thing = six.text_type(thing, unicode_encoding)
     self.encoding = encoding
     if self.encoding is None:
         # NOTE(review): with the assert above active, ``thing`` cannot be
         # None here, so this inner check always passes.
         if thing is not None:
             # Autodetect Encoding
             self.encoding = SCHEMES[detect.detect(self.thing)]
     if self.encoding != SLP1:
         # Convert to SLP1
         # self.encoding is assigned before this call and only updated
         # afterwards; presumably transcoded() reads it as the source
         # scheme -- verify against its definition.
         self.thing = self.transcoded(SLP1)
         self.encoding = SLP1
     if not strict_io:
         # Normalize
         logger.debug("Before normalization: %s", self.thing)
         tmp = normalization.normalize(self.thing)
         if replace_ending_visarga == 's':
             self.thing = normalization.replace_ending_visarga_s(tmp)
         elif replace_ending_visarga == 'r':
             self.thing = normalization.replace_ending_visarga_r(tmp)
         else:
             self.thing = tmp
         # Lazy Anusvaras (see issue #103)
         # The fix-up helper works on ITRANS, so round-trip
         # SLP1 -> ITRANS -> SLP1 around it.
         tmpi = sanscript.transliterate(self.thing, SLP1, ITRANS)
         tmpi = deduplication.fix_lazy_anusvaara_itrans(tmpi)
         self.thing = sanscript.transliterate(tmpi, ITRANS, SLP1)
         logger.debug("After normalization: %s", self.thing)
     # Tags will go here as
     # { lexical_tag : [possible morphologies] }
     # (initialised as an empty list here)
     self.tags = []
Пример #10
0
def test_noisy(data):
    """Check detection of a sample wrapped in non-Indic noise.

    ``data`` is a ``(text, expected_scheme)`` pair; the same noise string
    (whitespace, digits, punctuation, Greek/Cyrillic/Japanese characters)
    is put before and after the text.
    """
    padding = ' \t\n 1234567890 !@#$%^&*(),.<>\'\"-_[]{}\\|;:`~ ΣД あア'
    sample, expected = data
    noisy_sample = padding + sample + padding
    assert detect(noisy_sample) == expected, data
Пример #11
0
def test_decoded(data):
    """Check detection on decoded (unicode) text.

    ``data`` is a ``(text, expected_scheme)`` pair. On Python 2 the text
    is first decoded from UTF-8; on Python 3 it is used as-is.
    """
    sample, expected = data
    running_python2 = sys.version_info < (3, 0)
    if running_python2:
        sample = sample.decode('utf-8')
    detected = detect(sample)
    assert detected == expected, u'%s == %s (%s)' % (detected, expected,
                                                     sample)
Пример #12
0
def test_basic(data):
    """Check that ``detect`` reports the expected scheme for a sample.

    ``data`` is a ``(text, expected_scheme)`` pair.
    """
    sample, expected = data
    detected = detect(sample)
    assert detected == expected, u'%s == %s (%s)' % (detected, expected,
                                                     sample)
Пример #13
0
def test_decoded(data):
    """Check detection on decoded (unicode) text.

    ``data`` is a ``(text, expected_scheme)`` pair. Byte strings are
    decoded from UTF-8 before detection; text that is already unicode is
    passed through unchanged.
    """
    text, scheme = data
    # Decode only byte strings. A Python 3 ``str`` has no ``decode``
    # method, so the former unconditional call raised AttributeError
    # there. On Python 2, ``bytes`` is ``str``, so byte-string samples are
    # still decoded as before.
    if isinstance(text, bytes):
        text = text.decode('utf-8')
    detection = detect(text)
    assert detection == scheme, u'%s == %s (%s)' % (detection, scheme, text)