def init():
    """
    To be called by library loader, do not call it in your program

    Loads the English script phonetic data table and the ARPABET phoneme
    <-> internal-ID mappings into module-level globals.
    """
    global ENGLISH_PHONETIC_DATA, ENGLISH_PHONETIC_VECTORS, PHONETIC_VECTOR_LENGTH, PHONETIC_VECTOR_START_OFFSET

    ENGLISH_PHONETIC_DATA = pd.read_csv(
        common.get_resources_path() + '/script/english_script_phonetic_data.csv',
        encoding='utf-8')

    # DataFrame.ix and .as_matrix() were removed in pandas 1.0;
    # .iloc + .to_numpy() is the supported positional-slice equivalent.
    ENGLISH_PHONETIC_VECTORS = ENGLISH_PHONETIC_DATA.iloc[
        :, PHONETIC_VECTOR_START_OFFSET:].to_numpy()

    PHONETIC_VECTOR_LENGTH = ENGLISH_PHONETIC_VECTORS.shape[1]

    ### Load mapping from ARPABET representation of phoneme to internal ID
    global ARPABET_ID_MAP, ID_ARPABET_MAP

    with open(common.get_resources_path() + '/script/english_arpabet_list.csv',
              'r', encoding='utf-8') as infile:
        # One phoneme name per line; line number (0-based) is the internal ID.
        for ph_id, name in enumerate(infile):
            name = name.strip()
            ARPABET_ID_MAP[name] = ph_id
            ID_ARPABET_MAP[ph_id] = name
def init():
    """
    To be called by library loader, do not call it in your program

    Loads the pan-script and Tamil-script phonetic data tables into
    module-level globals, along with their feature-vector matrices.
    """
    global ALL_PHONETIC_DATA, ALL_PHONETIC_VECTORS, TAMIL_PHONETIC_DATA, TAMIL_PHONETIC_VECTORS

    ALL_PHONETIC_DATA = pd.read_csv(
        common.get_resources_path() + '/script/all_script_phonetic_data.csv',
        encoding='utf-8')
    TAMIL_PHONETIC_DATA = pd.read_csv(
        common.get_resources_path() + '/script/tamil_script_phonetic_data.csv',
        encoding='utf-8')

    # DataFrame.ix and .as_matrix() were removed in pandas 1.0;
    # .iloc + .to_numpy() is the supported positional-slice equivalent.
    ALL_PHONETIC_VECTORS = ALL_PHONETIC_DATA.iloc[
        :, PHONETIC_VECTOR_START_OFFSET:].to_numpy()
    TAMIL_PHONETIC_VECTORS = TAMIL_PHONETIC_DATA.iloc[
        :, PHONETIC_VECTOR_START_OFFSET:].to_numpy()
def init():
    """
    To be called by library loader, do not call it in your program

    Loads the pan-script and Tamil-script phonetic data tables into
    module-level globals and records the phonetic feature-vector length.
    """
    global ALL_PHONETIC_DATA, ALL_PHONETIC_VECTORS, TAMIL_PHONETIC_DATA, TAMIL_PHONETIC_VECTORS, PHONETIC_VECTOR_LENGTH, PHONETIC_VECTOR_START_OFFSET

    ALL_PHONETIC_DATA = pd.read_csv(
        os.path.join(common.get_resources_path(), 'script', 'all_script_phonetic_data.csv'),
        encoding='utf-8')
    TAMIL_PHONETIC_DATA = pd.read_csv(
        os.path.join(common.get_resources_path(), 'script', 'tamil_script_phonetic_data.csv'),
        encoding='utf-8')

    # DataFrame.ix and .as_matrix() were removed in pandas 1.0;
    # .iloc + .to_numpy() is the supported positional-slice equivalent.
    ALL_PHONETIC_VECTORS = ALL_PHONETIC_DATA.iloc[
        :, PHONETIC_VECTOR_START_OFFSET:].to_numpy()
    TAMIL_PHONETIC_VECTORS = TAMIL_PHONETIC_DATA.iloc[
        :, PHONETIC_VECTOR_START_OFFSET:].to_numpy()

    PHONETIC_VECTOR_LENGTH = ALL_PHONETIC_VECTORS.shape[1]
def init():
    """
    To be called by library loader, do not call it in your program

    Loads the ITRANS <-> script-offset map into module-level globals.
    """
    ### Load the ITRANS-script offset map. The map was initially generated
    ### using the snippet below (uses the old itrans transliterator), then
    ### hand-edited to accommodate extensions and corrections to the mappings.
    #
    # base=0x900
    # l=[]
    # for i in range(0,0x80):
    #     c=chr(base+i)
    #     itrans=ItransTransliterator.to_itrans(c,'hi')
    #     l.append((hex(i),c,itrans))
    # print(l)
    #
    # pd.DataFrame(l,columns=['offset_hex','devnag_char','itrans']).to_csv('offset_itrans_map.csv',index=False,encoding='utf-8')

    itrans_map_fname = os.path.join(
        common.get_resources_path(), 'transliterate', 'offset_itrans_map.csv')
    itrans_df = pd.read_csv(itrans_map_fname, encoding='utf-8')

    global OFFSET_TO_ITRANS, ITRANS_TO_OFFSET, DUPLICATE_ITRANS_REPRESENTATIONS

    for _, row in itrans_df.iterrows():
        itrans_repr = row['itrans']
        offset = int(row['offset_hex'], base=16)

        OFFSET_TO_ITRANS[offset] = itrans_repr

        if langinfo.is_consonant_offset(offset):
            ### for consonants, strip the schwa - add halant offset
            ITRANS_TO_OFFSET[itrans_repr[:-1]].extend([offset, 0x4d])
        else:
            ### the append assumes that the maatra always comes after
            ### independent vowel in the df
            ITRANS_TO_OFFSET[itrans_repr].append(offset)

    DUPLICATE_ITRANS_REPRESENTATIONS = {
        'A': 'aa',
        'I': 'ii',
        'U': 'uu',
        'RRi': 'R^i',
        'RRI': 'R^I',
        'LLi': 'L^i',
        'LLI': 'L^I',
        'L': 'ld',
        'w': 'v',
        'x': 'kSh',
        'gj': 'j~n',
        'dny': 'j~n',
        '.n': '.m',
        'M': '.m',
        'OM': 'AUM'
    }
def init():
    """
    To be called by library loader, do not call it in your program

    Loads the English script phonetic data table and the ARPABET phoneme
    <-> internal-ID mappings into module-level globals.
    """
    global ENGLISH_PHONETIC_DATA, ENGLISH_PHONETIC_VECTORS, PHONETIC_VECTOR_LENGTH, PHONETIC_VECTOR_START_OFFSET

    ENGLISH_PHONETIC_DATA = pd.read_csv(
        common.get_resources_path() + '/script/english_script_phonetic_data.csv',
        encoding='utf-8')

    # DataFrame.ix and .as_matrix() were removed in pandas 1.0;
    # .iloc + .to_numpy() is the supported positional-slice equivalent.
    ENGLISH_PHONETIC_VECTORS = ENGLISH_PHONETIC_DATA.iloc[
        :, PHONETIC_VECTOR_START_OFFSET:].to_numpy()

    PHONETIC_VECTOR_LENGTH = ENGLISH_PHONETIC_VECTORS.shape[1]

    ### Load mapping from ARPABET representation of phoneme to internal ID
    global ARPABET_ID_MAP, ID_ARPABET_MAP

    # builtin open with encoding= replaces the legacy codecs.open idiom
    # (identical behavior on Python 3).
    with open(common.get_resources_path() + '/script/english_arpabet_list.csv',
              'r', encoding='utf-8') as infile:
        # One phoneme name per line; line number (0-based) is the internal ID.
        for ph_id, name in enumerate(infile):
            name = name.strip()
            ARPABET_ID_MAP[name] = ph_id
            ID_ARPABET_MAP[ph_id] = name