def file_existing_events(events, tag):
    print("Filing existing events... to", all_calendars_dump_file)
    with open(all_calendars_dump_file, "a") as text_file:
        for event in events:
            print(event["start"]["date"], transliterate(event["summary"]), tag,
                  file=text_file)

def find_matching_key(self, key):
    tkey = transliterate(key).strip()
    best = None
    bestr = 0
    for pkey in sorted(self.keys()):
        cr = similar(tkey, pkey)
        if cr > bestr:
            best = pkey
            bestr = cr
    # return the best ratio, not the ratio of the last key compared
    return best, bestr

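# A minimal usage sketch for find_matching_key, assuming it is attached to a
# dict subclass. similar() and transliterate() are not defined in this
# fragment, so the stubs below are assumptions: a difflib ratio and a
# lowercasing pass-through standing in for the real transliteration.
from difflib import SequenceMatcher

def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()

transliterate = str.lower  # stand-in only; the real helper transliterates

class FuzzyDict(dict):
    find_matching_key = find_matching_key

d = FuzzyDict({"moskva": 1, "kazan": 2})
best_key, ratio = d.find_matching_key("Moskva ")
print(best_key, ratio)  # -> moskva 1.0
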
def get_data(input_filename):
    data = pd.read_csv(input_filename, header=0)
    # force align the predicted orthographic transliteration (without schwa
    # dropping) with the actual phonetic transliteration (with schwa dropping)
    # to create training/test data
    schwa_instances = []
    for _, row in data.iterrows():
        # print(chr(27) + '[2J')
        # print('Processing row', _)
        try:
            schwa_instances += [[tr.transliterate(row.hindi),
                                 schwa_instance[1],
                                 schwa_instance[0]]
                                for schwa_instance in tr.force_align(
                                    tr.transliterate(row.hindi), str(row.phon))]
            # schwa_instances += [[tr.transliterate(row.hindi), schwa_instance[1], schwa_instance[0]]
            #                     for schwa_instance in tr.force_align_weak(
            #                         tr.transliterate(row.hindi), str(row.phon))]
        except Exception as e:
            # print(e)
            continue
    return schwa_instances

def generatePuns(phrase, topic):
    relatedWords = getRelatedGlove("closestWords10k.csv", topic)
    relatedWordsArpa = []
    phraseArpa = []
    for word in relatedWords:
        transcription = transcribe(word)
        if transcription != "NOT IN DICTIONARY":
            relatedWordsArpa.append((transcription, word))
    for word in phrase.split():
        phraseArpa += transcribe(word)
        phraseArpa.append("#")  # word-boundary marker
    punsList = []
    for wordPair in relatedWordsArpa:
        word = wordPair[0]
        dummyPunsList = possiblePuns(word, phraseArpa, wordPair[1])
        for pun in dummyPunsList:
            if pun != []:
                punsList.append((pun, wordPair[1]))
    punsListStrings = []
    punOrigins = {}
    for punPair in punsList:
        pun = punPair[0]
        print(pun)
        punString = " ".join(pun[0])
        punsListStrings.append(punString)
        punOrigins[punString] = punPair[1]
    punsList = getPhonScores.sortPuns(
        getPhonScores.getPhonScores(punsListStrings))
    finalList = []
    for pun in punsList:
        # append as a (pun, origin) pair; the original += spliced the
        # tuple's elements into the list separately
        finalList.append((transliterate.transliterate([pun], phrase),
                          punOrigins[pun]))
    return finalList

def conv(word):
    left, right = 5, 5
    model = load('models/xgboost/xgboost_nophon.joblib')
    chars = load('models/xgboost/xgboost_nophon_chars.joblib')
    # print(chars)
    # normalize a final anusvara to candrabindu before transliterating
    if word[-1] == 'ं':
        word = word[:-1] + 'ँ'
    transliteration = tr.transliterate(word)
    transformed_instances = []
    for i, phone in enumerate(transliteration):
        if phone == 'a':
            x = []
            for j in range(i - left, i + right + 1):
                if j == i:
                    continue
                for char in chars:
                    if j < 0 or j >= len(transliteration):
                        x.append(1 if char == UNK_CHAR else 0)
                    else:
                        x.append(1 if char == transliteration[j] else 0)
            transformed_instances.append(x)
    col = []
    for i in list(range(-left, 0)) + list(range(1, right + 1)):
        for j in chars:
            col.append('s' + str(i) + '_' + str(j))
    X = pd.DataFrame(transformed_instances, columns=col)
    Y = []
    if len(X) > 0:
        Y = model.predict(X)
    pos = 0
    res = []
    for phone in transliteration:
        if phone == 'a':
            if Y[pos]:
                res.append(espeak['a'])
            pos += 1
        else:
            res.append(espeak[phone])
    return ''.join(res)

def main(input_filename="hi_ur_pron.tsv"): csv_data = pd.read_csv(input_filename, header=0, sep='\t') instances = [] for _, row in csv_data.iterrows(): instances += [[tr.transliterate(row.hindi), schwa_instance[1], schwa_instance[0]] for schwa_instance in tr.force_align(row.hindi, row.phon)] y = [] transformed_instances = [] for s, schwa_index, schwa_was_deleted in instances: x = [] for i in range(schwa_index - CHAR_WINDOW, schwa_index + CHAR_WINDOW + 1): if i == schwa_index: continue if i < 0 or i >= len(s): x.append(UNK_CHAR) else: x.append(s[i]) transformed_instances.append(x) y.append(schwa_was_deleted) X = pd.DataFrame(transformed_instances, columns=["s-4", "s-3", "s-2", "s-1", "s+1", "s+2", "s+3", "s+4"]) X = pd.get_dummies(X) X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.20, random_state=42) model = LogisticRegression() model.fit(X_train, y_train) y_pred = model.predict(X_test) print(accuracy_score(y_pred, y_test)) print(recall_score(y_pred, y_test)) print(f1_score(y_pred, y_test))
def latin_cyrill(user_input):
    answer = transliterate(user_input, 'cyrillic')
    return answer

def test(words, model_path, chars_path, phons_path=None, left=4, right=4):
    model = load(model_path)
    chars = load(chars_path)
    # print(chars)
    phons = None
    if phons_path:
        phons = load(phons_path)
        # print(phons)
    results = []
    for word in words:
        transliteration = tr.transliterate(word)
        transformed_instances = []
        for i, phone in enumerate(transliteration):
            if phone == 'a':
                x = []
                for j in range(i - left, i + right + 1):
                    if j == i:
                        continue
                    if phons:
                        for phon in phons:
                            if j < 0 or j >= len(transliteration):
                                x.append(0)
                            elif phon in tr.phonological_features[transliteration[j]]:
                                x.append(1)
                            else:
                                x.append(0)
                    else:
                        for char in chars:
                            if j < 0 or j >= len(transliteration):
                                x.append(1 if char == UNK_CHAR else 0)
                            else:
                                x.append(1 if char == transliteration[j] else 0)
                transformed_instances.append(x)
        col = []
        # was `if phon:`, which raises NameError when phons is None; the
        # intended test is on phons
        if phons:
            for i in list(range(-left, 0)) + list(range(1, right + 1)):
                for j in phons:
                    col.append('s' + str(i) + '_' + str(j))
        else:
            for i in list(range(-left, 0)) + list(range(1, right + 1)):
                for j in chars:
                    col.append('s' + str(i) + '_' + str(j))
        X = pd.DataFrame(transformed_instances, columns=col)
        Y = model.predict(X)
        # print(X, Y)
        pos = 0
        res = []
        for phone in transliteration:
            if phone == 'a':
                if Y[pos]:
                    res.append('a')
                pos += 1
            else:
                res.append(phone)
        results.append(res)
        print(word, ' '.join(res))
    return results

def slugify(source_str):
    return transliterate(source_str, translit_map, do_skip_char)

def format_jpeg_name(jpeg_name):
    jpeg_name = transliterate(jpeg_name)
    jpeg_name = replace_non_alphabetic_symbols(jpeg_name)
    return jpeg_name

rules = yaml.full_load(file)
single_list = rules["single"]
double = rules.get("double", {})
double_consonants = rules.get("double_latin", {})
vowels = rules.get("vowels", {})
single = {a: b for a, b in zip(single_list[0], single_list[1])}

# Load txt to be transliterated
with open(filepath, "rb") as text_file:
    to_translit = text_file.read().decode("utf-8")

# Apply rules on first/last syllable/letter
if rules["extra"]:
    to_translit = wordwise_transliterate(
        to_translit,
        rules.get("first_syl", {}), rules.get("last_syl", {}),
        rules.get("first_lett", {}), rules.get("last_lett", {}),
        rules.get("consonants"), vowels[0], vowels[1])

# Apply dictionary
to_translit = transliterate(to_translit, single, double)

# Apply replacement rules on transliterated text (double consonants get accents)
to_translit = latin_replacements(to_translit, double_consonants)

# Output
if out_filepath is not None:
    with open(out_filepath, "w") as text_file:
        text_file.write(to_translit)
else:
    print(to_translit)

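# transliterate(text, single, double) is not defined in this fragment; the
# sketch below shows the assumed semantics: rewrite multi-character sequences
# from `double` first, then map the remaining characters through `single`.
def transliterate(text, single, double):
    for src, dst in double.items():  # longer matches take precedence
        text = text.replace(src, dst)
    return ''.join(single.get(ch, ch) for ch in text)

# e.g. transliterate("щука", {"у": "u", "к": "k", "а": "a"}, {"щ": "shch"})
# -> "shchuka"
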
def _slug_transform(pre):
    return transliterate(pre or '')

def main(input_filename, use_phon, left=4, right=4):
    data = pd.read_csv(input_filename, header=0)
    # force align the predicted orthographic transliteration (without schwa
    # dropping) with the actual phonetic transliteration (with schwa dropping)
    # to create training/test data
    schwa_instances = []
    for _, row in data.iterrows():
        try:
            schwa_instances += [[tr.transliterate(row.punjabi),
                                 schwa_instance[1],
                                 schwa_instance[0]]
                                for schwa_instance in tr.force_align(
                                    tr.transliterate(row.punjabi), str(row.phon))]
        except Exception as e:
            print(e)
            continue
    print(len(schwa_instances))

    chars = set()
    for word in schwa_instances:
        for char in word[0]:
            chars.add(char)
    chars.add(UNK_CHAR)
    chars = list(chars)

    phons = set()
    if use_phon:
        for phoneme, features in tr.phonological_features.items():
            for feature in features:
                phons.add(feature)
        phons = list(phons)

    # reuse the vocabularies saved with the pretrained model so the one-hot
    # columns line up (this overrides the sets computed above)
    chars = load('models/neural/neural_chars.joblib')
    phons = load('models/neural/neural_phons.joblib')

    # clean up the data
    y = []
    transformed_instances = []
    for s, schwa_index, schwa_was_deleted in schwa_instances:
        x = []
        for i in range(schwa_index - left, schwa_index + right + 1):
            if i == schwa_index:
                continue
            if use_phon:
                for phon in phons:
                    if i < 0 or i >= len(s):
                        x.append(0)
                    elif phon in tr.phonological_features[s[i]]:
                        x.append(1)
                    else:
                        x.append(0)
            else:
                for char in chars:
                    if i < 0 or i >= len(s):
                        x.append(1 if char == UNK_CHAR else 0)
                    else:
                        x.append(1 if char == s[i] else 0)
        transformed_instances.append(x)
        y.append(schwa_was_deleted)

    col = []
    if use_phon:
        for i in list(range(-left, 0)) + list(range(1, right + 1)):
            for j in phons:
                col.append('s' + str(i) + '_' + str(j))
    else:
        for i in list(range(-left, 0)) + list(range(1, right + 1)):
            for j in chars:
                col.append('s' + str(i) + '_' + str(j))
    X = pd.DataFrame(transformed_instances, columns=col)
    print(y.count(True), y.count(False))

    # 20% is the final test data, 20% is for development
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.60, test_size=0.40, random_state=42)
    X_dev, y_dev = X_test[:len(X_test) // 2], y_test[:len(y_test) // 2]
    X_test, y_test = X_test[len(X_test) // 2:], y_test[len(y_test) // 2:]

    # model = LogisticRegression(solver='liblinear', max_iter=1000, verbose=True)
    model = MLPClassifier(max_iter=1000, learning_rate_init=1e-4,
                          hidden_layer_sizes=(250,), verbose=True)
    # model = XGBClassifier(verbosity=2, max_depth=11, n_estimators=200)
    model = load('models/neural/neural.joblib')  # overrides the fresh model above
    model.fit(X_train, y_train)
    # dump(model, 'models/neural/neural.joblib')
    # dump(chars, 'models/neural/neural_chars.joblib')
    # dump(phons, 'models/neural/neural_phons.joblib')

    y_pred = model.predict(X_test)
    print(accuracy_score(y_test, y_pred),
          precision_score(y_test, y_pred),
          recall_score(y_test, y_pred))

    misses = set()
    all_words = set()
    for i in range(len(X_test)):
        all_words.add(' '.join(schwa_instances[X_test.iloc[i].name][0]))
        if y_pred[i] != y_test[i]:
            misses.add(' '.join(schwa_instances[X_test.iloc[i].name][0]))
            print(' '.join(schwa_instances[X_test.iloc[i].name][0]),
                  schwa_instances[X_test.iloc[i].name][1],
                  y_pred[i], y_test[i])
    print(f"{len(misses)} words missed out of {len(all_words)}")

def cyrill_latin(user_input):
    answer = transliterate(user_input, 'latin')
    return answer

def save(self, *args, **kwargs):
    if not self.slug:
        n = transliterate(self.name)
        self.slug = u"%s-%s" % (self.ttype_slug(), n)
    super(Bus, self).save(*args, **kwargs)

def test_the_rat(self):
    self.assertEqual(transliterate("el rato"), "the rat")

def test_rat(self):
    self.assertEqual(transliterate("rato"), "rat")

def test_rat_in_house(self):
    inputs = "el rato esta en la casa"
    outputs = "the rat is in the house"
    self.assertEqual(transliterate(inputs), outputs)