def initializeSoundexDict(freqDict):
    """Map every key of *freqDict* to the Soundex code of its capitalized form.

    Args:
        freqDict: Iterable of strings (only its keys are read when it is a
            dict).

    Returns:
        A new dict ``{key: soundex_code}`` with one entry per key of
        *freqDict*.
    """
    # A single encoder instance is created and reused for every key.
    encode = Soundex().soundex
    return {term: encode(term.capitalize()) for term in freqDict}
def soundex(word):
    """Return up to ``candidates_nb`` known words that share *word*'s Soundex code.

    The word is capitalized, encoded, and compared against the precomputed
    codes in the module-level ``freqDictSoundex``. Matching entries of
    ``freqDict`` are returned in descending order of the module-level scoring
    function ``P``. When nothing matches, the capitalized input itself is the
    only candidate.

    Args:
        word: The string to look up.

    Returns:
        A list of at most ``candidates_nb`` candidate strings.
    """
    word = word.capitalize()
    encode = Soundex().soundex
    target_code = encode(word)

    # Every known entry whose precomputed code equals the input's code.
    matches = [
        entry for entry in freqDict if target_code == freqDictSoundex[entry]
    ]
    if not matches:
        # No phonetic neighbour known: fall back to the input word itself.
        matches.append(word)

    # Repeatedly pull out the best-scoring remaining candidate.
    ranked = []
    for _ in range(min(len(matches), candidates_nb)):
        best = max(matches, key=P)
        ranked.append(best)
        matches.remove(best)
    return ranked
def has_player_name_fuzzy(current, edu):
    """Tell whether the EDU has a word that sounds like a player name.

    The EDU's tokens and ``current.players`` are handed to
    ``has_one_of_words`` with a Soundex encoder as the ``norm`` callable.

    Args:
        current: Object exposing a ``players`` attribute.
        edu: Object exposing a ``tokens`` attribute.

    Returns:
        Whatever ``has_one_of_words`` returns for these arguments.
    """
    edu_tokens = edu.tokens

    def _phonetic_key(token):
        # A fresh encoder is built on every call, as the caller expects.
        return Soundex().soundex(token)

    return has_one_of_words(current.players, edu_tokens, norm=_phonetic_key)
phonemes2 = np.array([ipa[c] for c in pad2]) features = {'word 1 encoding': phonemes1, 'word 2 encoding': phonemes2} return features #%% Open some things if __name__ == '__main__': TRAIN_PATH = '../data/cognet_train.csv' TEST_PATH = '../data/cognet_test.csv' DEV_PATH = '../data/cognet_dev.csv' DATA_PATH = '../data/extracted_features.npy' SUPPORTED_LANGS_PATH = '../data/cognet_supported_langs.tsv' IPA_ENCODING_PATH = '../data/ipa_encodings.pickle' v = DictVectorizer(sparse=False) soundex = Soundex() epitran_dict = create_epitran_dict() with open(IPA_ENCODING_PATH, 'rb') as f: ipa = pickle.load(f) ipa = defaultdict(lambda: np.array([0.] * 24), ipa) #%% FEATURE EXTRACTION if __name__ == '__main__': print('Reading training data...') train_data = pd.read_csv(TRAIN_PATH) print('Extracting features...') x_train = v.fit_transform([ extract_features(str(lang1), str(word1), str(lang2), str(word2))\ for lang1, word1, lang2, word2 in\ zip(train_data['lang 1'], train_data['translit 1'], train_data['lang 2'], train_data['translit 2']) ])
# -*- coding: utf-8 -*-
"""
Demo script: print the keys produced by several encoders for one sample name.

Created on Sun Jun 18 00:04:21 2017

@author: Milton
"""

from foneticaBR import foneticaBR
from buscabr import buscaBR
from metaphoneBR import metaphoneBR
# Soundex - install the soundex and silpa_common libraries in Python
from soundex import Soundex

# One encoder instance per algorithm.
chaveRoberto = foneticaBR()
chavebr = buscaBR()
chavemeta = metaphoneBR()
chavesoundex = Soundex()

# Sample input fed to every encoder.
texto = 'JOSSEPH'

# Each key is computed and printed in turn, one encoder at a time.
key_fonetica = chaveRoberto.chavefonetica(texto)
print(key_fonetica)

# chaveBR is called once with each value of its boolean second argument.
key_busca_false = chavebr.chaveBR(texto, False)
print(key_busca_false)

key_busca_true = chavebr.chaveBR(texto, True)
print(key_busca_true)

key_metaphone = chavemeta.chaveMetaphoneBR(texto)
print(key_metaphone)

key_soundex = chavesoundex.soundex(texto)
print(key_soundex)
def sndx(s):
    """Return the Soundex code of the stemmed form of *s*.

    The stem is produced by ``stemmer(s, args.stem)``, where ``args`` is a
    module-level object; the result is then encoded with a fresh ``Soundex``
    instance.
    """
    encode = Soundex().soundex
    stemmed = stemmer(s, args.stem)
    return encode(stemmed)
from soundex import Soundex
from collections import defaultdict
from wordfreq import word_frequency

# Group the words of the wordlist by Soundex code, then print one
# representative per group: the word with the highest English frequency.
soundex = Soundex().soundex
sound_words = defaultdict(set)

with open('eff_short_wordlist_1.txt', 'r') as fh:
    for entry in fh:
        # The word is the second whitespace-separated field of each line.
        token = entry.split()[1]
        code = soundex(token)
        # Codes of length <= 1 are dropped.
        # (disabled extra filter: and sound not in ('i245', 't651'))
        if len(code) > 1:
            sound_words[code].add(token)

for group in sound_words.values():
    if len(group) > 1:
        # Largest (frequency, word) pair wins; equal frequencies fall back to
        # comparing the words themselves, exactly as a tuple sort would.
        _, winner = max((word_frequency(w, 'en'), w) for w in group)
        print(winner)
    else:
        print(next(iter(group)))
'ben': SCHEMES[BENGALI], 'guj': SCHEMES[GUJARATI], 'hin': SCHEMES[DEVANAGARI], 'kan': SCHEMES[KANNADA], 'mal': SCHEMES[MALAYALAM], 'mar': SCHEMES[DEVANAGARI], "ori": SCHEMES[ORIYA], 'pun': SCHEMES[GURMUKHI], 'tam': SCHEMES[TAMIL], 'tel': SCHEMES[TELUGU], 'urd': SCHEMES[DEVANAGARI] } if lang in scheme_dict: src_scheme = scheme_dict[lang] scheme_map = SchemeMap(src_scheme, SCHEMES[HK]) instance = Soundex() findings = {} for i, line in enumerate(inp_bible): curr_line_id = 23146 + i if line == "" or line == "\n": continue for index, name in enumerate(names_reference): lids = ast.literal_eval(name[lid_col]) for col in col_references: romans = name[col] if romans == "": continue print(".", end="")