Example #1
def file_existing_events(events, tag):
    print("Filing existing events... to", all_calendars_dump_file)
    with open(all_calendars_dump_file, "a") as text_file:
        for event in events:
            print(event["start"]["date"],
                  transliterate(event["summary"]),
                  tag,
                  file=text_file)
Example #2
 def find_matching_key(self, key):
     tkey = transliterate(key).strip()
     best = None
     bestr = 0
     # Keep the stored key most similar to the transliterated input
     for pkey in sorted(self.keys()):
         cr = similar(tkey, pkey)
         if cr > bestr:
             best = pkey
             bestr = cr
     # Return the best key together with its score, not the last score computed
     return best, bestr
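The similar helper is not shown in the snippet; a plausible stand-in, assuming it is a plain difflib character-overlap ratio:

from difflib import SequenceMatcher

def similar(a, b):
    # Fraction of matching characters between the two strings, 0.0 to 1.0
    return SequenceMatcher(None, a, b).ratio()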
Example #3
def get_data(input_filename):
    data = pd.read_csv(input_filename, header=0)
    # Force-align the predicted orthographic transliteration (without schwa
    # dropping) with the actual phonetic transliteration (with schwa dropping)
    # to create training/test data.
    schwa_instances = []
    for _, row in data.iterrows():
        try:
            schwa_instances += [[
                tr.transliterate(row.hindi), schwa_instance[1],
                schwa_instance[0]
            ] for schwa_instance in tr.force_align(tr.transliterate(row.hindi),
                                                   str(row.phon))]
        except Exception:
            # Skip rows that fail to align
            continue

    return schwa_instances
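Each instance pairs the transliteration with one schwa position and its deletion label; a hedged usage sketch (the filename is hypothetical):

# Hypothetical usage; assumes a CSV with `hindi` and `phon` columns.
instances = get_data("hi_pron.csv")
for transliteration, schwa_index, schwa_was_deleted in instances[:5]:
    print(transliteration, schwa_index, schwa_was_deleted)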
Example #4
def generatePuns(phrase, topic):
    relatedWords = getRelatedGlove("closestWords10k.csv", topic)
    relatedWordsArpa = []
    phraseArpa = []

    # Keep only related words that have a phonetic transcription
    for word in relatedWords:
        transcription = transcribe(word)
        if transcription != "NOT IN DICTIONARY":
            relatedWordsArpa.append((transcription, word))

    # Transcribe the phrase, separating words with a '#' boundary marker
    for word in phrase.split():
        phraseArpa += transcribe(word)
        phraseArpa.append("#")

    punsList = []
    for wordPair in relatedWordsArpa:
        word = wordPair[0]
        dummyPunsList = possiblePuns(word, phraseArpa, wordPair[1])
        for pun in dummyPunsList:
            if pun:
                punsList.append((pun, wordPair[1]))

    punsListStrings = []
    punOrigins = {}
    for punPair in punsList:
        pun = punPair[0]
        punString = " ".join(pun[0])
        punsListStrings.append(punString)
        punOrigins[punString] = punPair[1]

    punsList = getPhonScores.sortPuns(
        getPhonScores.getPhonScores(punsListStrings))

    finalList = []
    for pun in punsList:
        # Extends the list with the rendered pun and the word it came from
        finalList += (transliterate.transliterate([pun],
                                                  phrase), punOrigins[pun])

    return finalList
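A hedged invocation sketch (the phrase and topic are made up; the CSV and the transcribe/possiblePuns/getPhonScores/transliterate helpers must be present):

# Hypothetical usage; inputs invented for illustration.
puns = generatePuns("time flies like an arrow", "bird")
print(puns)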
Example #5
def conv(word):
    left, right = 5, 5
    model = load('models/xgboost/xgboost_nophon.joblib')
    chars = load('models/xgboost/xgboost_nophon_chars.joblib')

    # Normalize a word-final anusvara (ं) to chandrabindu (ँ)
    if word[-1] == 'ं':
        word = word[:-1] + 'ँ'

    transliteration = tr.transliterate(word)
    transformed_instances = []
    # One-hot encode the characters in a window around each schwa ('a')
    for i, phone in enumerate(transliteration):
        if phone == 'a':
            x = []
            for j in range(i - left, i + right + 1):
                if j == i: continue
                for char in chars:
                    if j < 0 or j >= len(transliteration):
                        if char == UNK_CHAR: x.append(1)
                        else: x.append(0)
                    else:
                        if char == transliteration[j]: x.append(1)
                        else: x.append(0)
            transformed_instances.append(x)

    col = []
    for i in list(range(-left, 0)) + list(range(1, right + 1)):
        for j in chars:
            col.append('s' + str(i) + '_' + str(j))

    X = pd.DataFrame(transformed_instances, columns=col)
    Y = []
    if len(X) > 0: Y = model.predict(X)
    pos = 0
    res = []
    for phone in transliteration:
        if phone == 'a':
            if Y[pos]:
                res.append(espeak['a'])
            pos += 1
        else:
            res.append(espeak[phone])
    return ''.join(res)
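The windowing above flattens the five characters on each side of a schwa into a single binary vector; a self-contained sketch of the same idea on a toy alphabet, not the project's actual feature code:

def one_hot_window(seq, i, chars, left, right, unk='?'):
    # Build indicator features for each position in the window around seq[i]
    x = []
    for j in range(i - left, i + right + 1):
        if j == i:
            continue
        c = seq[j] if 0 <= j < len(seq) else unk  # out-of-range -> UNK
        x += [1 if ch == c else 0 for ch in chars]
    return x

print(one_hot_window('bad', 1, ['a', 'b', 'd', '?'], 2, 2))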
Example #6
def main(input_filename="hi_ur_pron.tsv"):
    csv_data = pd.read_csv(input_filename, header=0, sep='\t')

    instances = []
    for _, row in csv_data.iterrows():
        instances += [[tr.transliterate(row.hindi), schwa_instance[1], schwa_instance[0]]
                      for schwa_instance in tr.force_align(row.hindi, row.phon)]

    y = []
    transformed_instances = []
    for s, schwa_index, schwa_was_deleted in instances:
        x = []
        for i in range(schwa_index - CHAR_WINDOW, schwa_index + CHAR_WINDOW + 1):
            if i == schwa_index:
                continue

            if i < 0 or i >= len(s):
                x.append(UNK_CHAR)
            else:
                x.append(s[i])

        transformed_instances.append(x)
        y.append(schwa_was_deleted)

    X = pd.DataFrame(transformed_instances,
                     columns=["s-4", "s-3", "s-2", "s-1", "s+1", "s+2", "s+3", "s+4"])
    X = pd.get_dummies(X)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.20, random_state=42)

    model = LogisticRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # sklearn metrics expect (y_true, y_pred), in that order
    print(accuracy_score(y_test, y_pred))
    print(recall_score(y_test, y_pred))
    print(f1_score(y_test, y_pred))
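pd.get_dummies is what expands the eight character columns into per-character indicator features; a toy illustration with invented values:

# Toy example of the get_dummies expansion; the data is made up.
import pandas as pd

df = pd.DataFrame([['k', 'a'], ['m', '?']], columns=['s-1', 's+1'])
print(pd.get_dummies(df))  # indicator columns: 's-1_k', 's-1_m', 's+1_?', 's+1_a'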
Example #7
def latin_cyrill(user_input):
    answer = transliterate(user_input, 'cyrillic')
    return answer
Example #8
def test(words, model_path, chars_path, phons_path=None, left=4, right=4):
    model = load(model_path)
    chars = load(chars_path)
    # print(chars)
    phons = None
    if phons_path:
        phons = load(phons_path)
        # print(phons)

    results = []
    for word in words:
        transliteration = tr.transliterate(word)
        transformed_instances = []
        for i, phone in enumerate(transliteration):
            if phone == 'a':
                x = []
                for j in range(i - left, i + right + 1):
                    if j == i: continue
                    if phons:
                        for phon in phons:
                            if j < 0 or j >= len(transliteration):
                                x.append(0)
                            else:
                                if phon in tr.phonological_features[
                                        transliteration[j]]:
                                    x.append(1)
                                else:
                                    x.append(0)
                    else:
                        for char in chars:
                            if j < 0 or j >= len(transliteration):
                                if char == UNK_CHAR: x.append(1)
                                else: x.append(0)
                            else:
                                if char == transliteration[j]: x.append(1)
                                else: x.append(0)
                transformed_instances.append(x)

        col = []
        if phons:
            for i in list(range(-left, 0)) + list(range(1, right + 1)):
                for j in phons:
                    col.append('s' + str(i) + '_' + str(j))
        else:
            for i in list(range(-left, 0)) + list(range(1, right + 1)):
                for j in chars:
                    col.append('s' + str(i) + '_' + str(j))

        X = pd.DataFrame(transformed_instances, columns=col)
        Y = model.predict(X)
        # print(X, Y)
        pos = 0
        res = []
        for phone in transliteration:
            if phone == 'a':
                if Y[pos]: res.append('a')
                pos += 1
            else:
                res.append(phone)
        results.append(res)
        print(word, ' '.join(res))
    return results
Example #9
File: strings.py Project: znick/anytask
def slugify(source_str):
    return transliterate(source_str, translit_map, do_skip_char)
Example #10
def format_jpeg_name(jpeg_name):
    jpeg_name = transliterate(jpeg_name)
    jpeg_name = replace_non_alphabetic_symbols(jpeg_name)
    return jpeg_name
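Both helpers are defined elsewhere in the project; a hypothetical stand-in for the second, assuming it just collapses non-alphanumeric runs:

# Hypothetical sketch of replace_non_alphabetic_symbols; the real
# implementation may differ.
import re

def replace_non_alphabetic_symbols(name):
    # Collapse each run of non-alphanumeric characters into one underscore
    return re.sub(r'[^A-Za-z0-9]+', '_', name)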
Example #11
	    rules = yaml.full_load(file)

	single_list, double, double_consonants = rules["single"], rules.get("double", {}), rules.get("double_latin", {})
	vowels = rules.get("vowels", {})

	# The single rules come as two parallel lists; zip them into a dict
	single = {a: b for a, b in zip(single_list[0], single_list[1])}

	# Load the text to be transliterated
	with open(filepath, "rb") as text_file:
		to_translit = text_file.read().decode("utf-8")

	# Apply rules for the first/last syllable and letter
	if rules["extra"]:
		to_translit = wordwise_transliterate(
			to_translit, rules.get("first_syl", {}), rules.get("last_syl", {}),
			rules.get("first_lett", {}), rules.get("last_lett", {}),
			rules.get("consonants"), vowels[0], vowels[1])

	# Apply the character dictionary
	to_translit = transliterate(to_translit, single, double)

	# Apply replacement rules to the transliterated text (double consonants get accents)
	to_translit = latin_replacements(to_translit, double_consonants)

	# Output
	if out_filepath is not None:
		with open(out_filepath, "w") as text_file:
			text_file.write(to_translit)
	else:
		print(to_translit)
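The snippet begins after its rules file has been opened; a hypothetical Python view of the structure the lookups above imply (key names taken from the code, contents invented):

# Hypothetical rules structure; real YAML contents will differ.
rules = {
    "single": [["а", "б"], ["a", "b"]],  # two parallel lists, zipped into a dict
    "double": {"кс": "x"},               # multi-character source sequences
    "double_latin": {"ss": "s"},         # Latin doubles that later receive accents
    "vowels": ["аеи", "aei"],            # source vowels and Latin vowels
    "extra": True,                       # enables the first/last syllable rules
    "first_syl": {}, "last_syl": {},
    "first_lett": {}, "last_lett": {},
    "consonants": "бвгд",
}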
Example #12
 def _slug_transform(pre):
     return transliterate(pre or '')
Example #13
def main(input_filename, use_phon, left=4, right=4):
    data = pd.read_csv(input_filename, header=0)

    # Force-align the predicted orthographic transliteration (without schwa
    # dropping) with the actual phonetic transliteration (with schwa dropping)
    # to create training/test data.
    schwa_instances = []
    for _, row in data.iterrows():
        try:
            schwa_instances += [[tr.transliterate(row.punjabi), schwa_instance[1], schwa_instance[0]]
                                for schwa_instance in tr.force_align(tr.transliterate(row.punjabi), str(row.phon))]
        except Exception as e:
            print(e)
            continue
    
    print(len(schwa_instances))
    chars = set()
    for word in schwa_instances:
        for char in word[0]:
            chars.add(char)
    chars.add(UNK_CHAR)
    chars = list(chars)
    phons = set()

    if use_phon:
        for phoneme, features in tr.phonological_features.items():
            for feature in features:
                phons.add(feature)
        phons = list(phons)

    # Override the vocabularies just computed with the saved ones so the
    # feature layout matches the saved model
    chars = load('models/neural/neural_chars.joblib')
    phons = load('models/neural/neural_phons.joblib')
    
    # Build one-hot feature windows around each schwa
    y = []
    transformed_instances = []
    for s, schwa_index, schwa_was_deleted in schwa_instances:
        x = []
        for i in range(schwa_index - left, schwa_index + right + 1):
            if i == schwa_index:
                continue
            
            if use_phon:
                for phon in phons:
                    if i < 0 or i >= len(s): 
                        x.append(0)
                    else:
                        if phon in tr.phonological_features[s[i]]: x.append(1)
                        else: x.append(0)
            else:
                for char in chars:
                    if i < 0 or i >= len(s): 
                        if char == UNK_CHAR: x.append(1)
                        else: x.append(0)
                    else:
                        if char == s[i]: x.append(1)
                        else: x.append(0)

        transformed_instances.append(x)
        y.append(schwa_was_deleted)
    
    col = []
    if use_phon:
        for i in list(range(-left, 0)) + list(range(1, right + 1)):
            for j in phons:
                col.append('s' + str(i) + '_' + str(j))
    else:
        for i in list(range(-left, 0)) + list(range(1, right + 1)):
            for j in chars:
                col.append('s' + str(i) + '_' + str(j))

    X = pd.DataFrame(transformed_instances,
        columns=col)

    print(y.count(True), y.count(False))

    # 20% is the final test data, 20% is for development
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.60, test_size=0.40, random_state=42)
    
    X_dev, y_dev = X_test[:len(X_test) // 2], y_test[:len(y_test) // 2]
    X_test, y_test = X_test[len(X_test) // 2:], y_test[len(y_test) // 2:]

    # model = LogisticRegression(solver='liblinear', max_iter=1000, verbose=True)
    model = MLPClassifier(max_iter=1000, learning_rate_init=1e-4, hidden_layer_sizes=(250,), verbose=True)
    # model = XGBClassifier(verbosity=2, max_depth=11, n_estimators=200)

    # Load the previously saved model, overriding the one constructed above
    model = load('models/neural/neural.joblib')
    model.fit(X_train, y_train)
    # dump(model, 'models/neural/neural.joblib')
    # dump(chars, 'models/neural/neural_chars.joblib')
    # dump(phons, 'models/neural/neural_phons.joblib')
    y_pred = model.predict(X_test)

    print(
        accuracy_score(y_test, y_pred),
        precision_score(y_test, y_pred),
        recall_score(y_test, y_pred))
    
    misses = set()
    all_words = set()
    for i in range(len(X_test)):
        all_words.add(' '.join(schwa_instances[X_test.iloc[i].name][0]))
        if y_pred[i] != y_test[i]:
            misses.add(' '.join(schwa_instances[X_test.iloc[i].name][0]))
            print(' '.join(schwa_instances[X_test.iloc[i].name][0]), schwa_instances[X_test.iloc[i].name][1], y_pred[i], y_test[i])
    print(f"{len(misses)} words missed out of {len(all_words)}")
Example #14
def cyrill_latin(user_input):
    answer = transliterate(user_input, 'latin')
    return answer
Example #15
 def save(self, *args, **kwargs):
     if not self.slug:
         n = transliterate(self.name)
         self.slug = u"%s-%s" % (self.ttype_slug(), n)
     super(Bus, self).save(*args, **kwargs)
Example #16
 def test_the_rat(self):
     self.assertEqual(transliterate("el rato"), "the rat")
Example #17
 def test_rat(self):
     self.assertEqual(transliterate("rato"), "rat")
Example #18
 def test_rat_in_house(self):
     inputs = "el rato esta en la casa"
     outputs = "the rat is in the house"
     self.assertEqual(transliterate(inputs), outputs)
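Taken together, the three tests pin down word-level behavior; a minimal transliterate sketch that would satisfy them (the word map is inferred from the assertions, not taken from the project):

# Minimal stand-in that passes the three tests above.
WORDS = {"el": "the", "la": "the", "rato": "rat",
         "esta": "is", "en": "in", "casa": "house"}

def transliterate(text):
    # Replace each known word, leaving unknown words untouched
    return " ".join(WORDS.get(w, w) for w in text.split())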