def test_caversham(self):
    """Check Soundex, Metaphone & Caverphone against the Caversham test set.

    Every line of variantNames.csv after the first is split on commas into
    eleven fields: two names, the expected code of each name under each of
    the three algorithms, and one flag per algorithm that is '1' when the
    two names are expected to receive the same code.
    """
    soundex_encoder = Soundex()
    metaphone_encoder = Metaphone()

    with open(_corpus_file('variantNames.csv')) as cav_testset:
        # the first line is not test data (presumably a column header)
        next(cav_testset)
        for cav_line in cav_testset:
            (
                name1,
                soundex1,
                metaphone1,
                caverphone1,
                name2,
                soundex2,
                metaphone2,
                caverphone2,
                soundex_same,
                metaphone_same,
                caverphone_same,
            ) = cav_line.strip().split(',')

            # (encoder, expected code of name1, of name2, same-code flag),
            # checked in the order SoundEx, Metaphone, Caverphone
            checks = (
                (soundex_encoder.encode, soundex1, soundex2, soundex_same),
                (
                    metaphone_encoder.encode,
                    metaphone1,
                    metaphone2,
                    metaphone_same,
                ),
                (self.pa.encode, caverphone1, caverphone2, caverphone_same),
            )
            for encode, expected1, expected2, same_flag in checks:
                self.assertEqual(encode(name1), expected1)
                self.assertEqual(encode(name2), expected2)
                if same_flag == '1':
                    compare = self.assertEqual
                else:
                    compare = self.assertNotEqual
                compare(encode(name1), encode(name2))
def test_caversham(self):
    """Run the Caversham test set through SoundEx, Metaphone & Caverphone.

    The data comes from variantNames.csv; its first line is skipped and
    every following line must split on ',' into exactly eleven fields.
    """
    sdx = Soundex()
    mtp = Metaphone()

    def check_pair(encode, first, second, code1, code2, same):
        """Assert both expected codes, then the expected (in)equality."""
        self.assertEqual(encode(first), code1)
        self.assertEqual(encode(second), code2)
        if same == '1':
            self.assertEqual(encode(first), encode(second))
        else:
            self.assertNotEqual(encode(first), encode(second))

    with open(_corpus_file('variantNames.csv')) as cav_testset:
        next(cav_testset)
        for cav_line in cav_testset:
            (
                name1,
                soundex1,
                metaphone1,
                caverphone1,
                name2,
                soundex2,
                metaphone2,
                caverphone2,
                soundex_same,
                metaphone_same,
                caverphone_same,
            ) = cav_line.strip().split(',')

            check_pair(
                sdx.encode, name1, name2, soundex1, soundex2, soundex_same
            )
            check_pair(
                mtp.encode,
                name1,
                name2,
                metaphone1,
                metaphone2,
                metaphone_same,
            )
            # self.pa is the encoder under test in the enclosing test case
            check_pair(
                self.pa.encode,
                name1,
                name2,
                caverphone1,
                caverphone2,
                caverphone_same,
            )
def test_metaphone(self):
    """Test abydos.phonetic.Metaphone."""
    # inputs that must encode to the empty string
    for blank in ('', '...'):
        self.assertEqual(self.pa.encode(blank), '')

    # http://ntz-develop.blogspot.com/2011/03/phonetic-algorithms.html
    # (expected code, names sharing that code), encoded with self.pa4
    pa4_groups = (
        ('FXPL', ('Fishpool', 'Fishpoole')),
        ('JLTL', ('Gellately', 'Gelletly')),
        ('LWRS', ('Lowers', 'Lowerson')),
        (
            'MLBR',
            (
                'Mallabar',
                'Melbert',
                'Melbourn',
                'Melbourne',
                'Melburg',
                'Melbury',
                'Milberry',
                'Milborn',
                'Milbourn',
                'Milbourne',
                'Milburn',
                'Milburne',
                'Millberg',
                'Mulberry',
                'Mulbery',
                'Mulbry',
            ),
        ),
        (
            'SP',
            (
                'Saipy',
                'Sapey',
                'Sapp',
                'Sappy',
                'Sepey',
                'Seppey',
                'Sopp',
                'Zoppie',
                'Zoppo',
                'Zupa',
                'Zupo',
                'Zuppa',
            ),
        ),
    )
    for expected, names in pa4_groups:
        for name in names:
            self.assertEqual(self.pa4.encode(name), expected)

    # assorted tests to complete code coverage
    coverage_cases = (
        ('Xavier', 'SFR'),
        ('Acacia', 'AKX'),
        ('Schuler', 'SKLR'),
        ('Sign', 'SN'),
        ('Signed', 'SNT'),
        ('Horatio', 'HRX'),
        ('Ignatio', 'IKNX'),
        ('Lucretia', 'LKRX'),
        ('Wright', 'RKT'),
        ('White', 'WT'),
        ('Black', 'BLK'),
        ('Chance', 'XNS'),
        ('Dgengo', 'JJNK'),
        ('Ghost', 'ST'),
        ('Qing', 'KNK'),
        ('Asia', 'AX'),
        ('Ax', 'AKS'),
        ('Thegn', '0N'),
        ('acknowledged', 'AKNLJT'),
        ('awkward', 'AKWRT'),
        ('admitted', 'ATMTT'),
        ('dahl', 'TL'),
        ('autobiography', 'ATBKRF'),
        ('exaggerate', 'EKSKRT'),
        ('pitch', 'PX'),
        ('chracter', 'KRKTR'),
    )
    # assorted tests to complete branch coverage
    branch_cases = (
        ('Lamb', 'LM'),
        ('science', 'SNS'),
    )
    for word, expected in coverage_cases + branch_cases:
        self.assertEqual(self.pa.encode(word), expected)

    # max_length bounds tests
    for bound in (-1, 0):
        self.assertEqual(Metaphone(max_length=bound).encode('Niall'), 'NL')

    # Test wrapper
    self.assertEqual(metaphone('Xavier'), 'SFR')
'double_metaphone': DoubleMetaphone().encode, 'eudex': Eudex().encode, 'fonem': FONEM().encode, 'fuzzy_soundex': FuzzySoundex().encode, 'fuzzy_soundex_0pad_ml8': FuzzySoundex(max_length=8, zero_pad=True).encode, 'haase_phonetik': Haase().encode, 'haase_phonetik_primary': Haase(primary_only=True).encode, 'henry_early': HenryEarly().encode, 'henry_early_ml8': HenryEarly(max_length=8).encode, 'koelner_phonetik': Koelner().encode, 'koelner_phonetik_alpha': Koelner().encode_alpha, 'lein': LEIN().encode, 'lein_nopad_ml8': LEIN(max_length=8, zero_pad=False).encode, 'metasoundex': MetaSoundex().encode, 'metasoundex_es': MetaSoundex(lang='es').encode, 'metaphone': Metaphone().encode, 'mra': MRA().encode, 'norphone': Norphone().encode, 'nrl': NRL().encode, 'nysiis': NYSIIS().encode, 'nysiis_modified': NYSIIS(modified=True).encode, 'nysiis_ml_inf': NYSIIS(max_length=-1).encode, 'onca': ONCA().encode, 'onca_nopad_ml8': ONCA(max_length=8, zero_pad=False).encode, 'parmar_kumbharana': ParmarKumbharana().encode, 'phonem': Phonem().encode, 'phonet_1': Phonet().encode, 'phonet_2': Phonet(mode=2).encode, 'phonet_1_none': Phonet(lang='none').encode, 'phonet_2_none': Phonet(mode=2, lang='none').encode, 'phonetic_spanish': PhoneticSpanish().encode,
koelner.encode, 'koelner_phonetik_num_to_alpha': ( lambda _: koelner._to_alpha(koelner.encode(_)) # noqa: SF01 ), 'koelner_phonetik_alpha': koelner.encode_alpha, 'lein': LEIN().encode, 'lein_nopad_ml8': LEIN(max_length=8, zero_pad=False).encode, 'metasoundex': MetaSoundex().encode, 'metasoundex_es': MetaSoundex(lang='es').encode, 'metaphone': Metaphone().encode, 'mra': MRA().encode, 'norphone': Norphone().encode, 'nrl': NRL().encode, 'nysiis': NYSIIS().encode, 'nysiis_modified': NYSIIS(modified=True).encode, 'nysiis_ml_inf': NYSIIS(max_length=-1).encode, 'onca': ONCA().encode, 'onca_nopad_ml8':
# One encoder instance per phonetic algorithm, each built with its
# constructor's default arguments.
alpha_sis = AlphaSIS()
bm = BeiderMorse()
caverphone = Caverphone()
davidson = Davidson()
dm = DaitchMokotoff()
dolby = Dolby()
double_metaphone = DoubleMetaphone()
eudex = Eudex()
fonem = FONEM()
fuzzy_soundex = FuzzySoundex()
haase = Haase()
henry_early = HenryEarly()
koelner = Koelner()
# The class is spelled LEIN (all caps), as used elsewhere in this code base;
# the previous spelling `Lein` named no class.
lein = LEIN()
metaphone = Metaphone()
metasoundex = MetaSoundex()
mra = MRA()
norphone = Norphone()
nrl = NRL()
nysiis = NYSIIS()
onca = ONCA()
parmar_kumbharana = ParmarKumbharana()
phonem = Phonem()
phonet = Phonet()
phonetic_spanish = PhoneticSpanish()
phonex = Phonex()
phonix = Phonix()
pshp_soundex_first = PSHPSoundexFirst()
pshp_soundex_last = PSHPSoundexLast()
refined_soundex = RefinedSoundex()
class MetaphoneTestCases(unittest.TestCase):
    """Test Metaphone functions.

    test cases for abydos.phonetic.Metaphone
    """

    # encoder instance shared by the assertions below
    pa = Metaphone()

    def test_metaphone(self):
        """Test abydos.phonetic.Metaphone."""
        # inputs that must encode to the empty string
        for blank in ('', '...'):
            self.assertEqual(self.pa.encode(blank), '')

        # http://ntz-develop.blogspot.com/2011/03/phonetic-algorithms.html
        # (expected code, names sharing that code); each name is encoded
        # with 4 passed as the second positional argument
        length4_groups = (
            ('FXPL', ('Fishpool', 'Fishpoole')),
            ('JLTL', ('Gellately', 'Gelletly')),
            ('LWRS', ('Lowers', 'Lowerson')),
            (
                'MLBR',
                (
                    'Mallabar',
                    'Melbert',
                    'Melbourn',
                    'Melbourne',
                    'Melburg',
                    'Melbury',
                    'Milberry',
                    'Milborn',
                    'Milbourn',
                    'Milbourne',
                    'Milburn',
                    'Milburne',
                    'Millberg',
                    'Mulberry',
                    'Mulbery',
                    'Mulbry',
                ),
            ),
            (
                'SP',
                (
                    'Saipy',
                    'Sapey',
                    'Sapp',
                    'Sappy',
                    'Sepey',
                    'Seppey',
                    'Sopp',
                    'Zoppie',
                    'Zoppo',
                    'Zupa',
                    'Zupo',
                    'Zuppa',
                ),
            ),
        )
        for expected, names in length4_groups:
            for name in names:
                self.assertEqual(self.pa.encode(name, 4), expected)

        # assorted tests to complete code coverage
        coverage_cases = (
            ('Xavier', 'SFR'),
            ('Acacia', 'AKX'),
            ('Schuler', 'SKLR'),
            ('Sign', 'SN'),
            ('Signed', 'SNT'),
            ('Horatio', 'HRX'),
            ('Ignatio', 'IKNX'),
            ('Lucretia', 'LKRX'),
        )
        # assorted tests to complete branch coverage
        branch_cases = (
            ('Lamb', 'LM'),
            ('science', 'SNS'),
        )
        for word, expected in coverage_cases + branch_cases:
            self.assertEqual(self.pa.encode(word), expected)

        # max_length bounds tests
        for bound in (-1, 0):
            self.assertEqual(self.pa.encode('Niall', max_length=bound), 'NL')

        # Test wrapper
        self.assertEqual(metaphone('Xavier'), 'SFR')
def map_answers(self):
    """Map cleaned survey answers to DrugBank ids, exactly then phonetically.

    Reads ``self.meds_cleaned`` (iterable of answer strings),
    ``self.drug_dictionary`` (name -> DrugBank ids) and
    ``self.drug_frequencies`` (DrugBank id -> count).

    Pass 1 looks each answer up in ``self.drug_dictionary`` as a whole and,
    failing that, by its first word. A first-word hit is stored back into
    ``self.drug_dictionary`` under the full answer. ``self.drug_frequencies``
    is incremented for every id of every answer mapped in this pass.

    Pass 2 Metaphone-encodes every ``self.drug_dictionary`` key, discards
    encodings shared by entries with different ids, and maps each answer
    left unmapped by pass 1 whose encoding survives. Those answers are
    added to ``self.drug_dictionary``; ``self.drug_frequencies`` is not
    touched in this pass.

    Sets the attributes ``mapped_survey_answers``,
    ``first_name_mapped_survey_answers``, ``unmapped_survey_answers``,
    ``mapped_by_encoding`` and ``unmapped_by_encoding`` (all lists of
    answers, in input order). Returns None.
    """
    # lists for mapped answers in different categories
    self.mapped_survey_answers = []
    self.first_name_mapped_survey_answers = []
    self.unmapped_survey_answers = []

    # Matches from the first run of non-word characters to the end, so
    # substituting '' leaves only the leading word. Written as a raw string
    # ('\w' is not a valid str escape) and compiled once outside the loop.
    after_first_word = re.compile(r'[^\w]+.*$')

    for answer in self.meds_cleaned:
        # try to get the drugbank ids for the whole answer
        db_ids = self.drug_dictionary.get(answer)
        first_word = after_first_word.sub('', answer)
        first_word_db_ids = self.drug_dictionary.get(first_word)

        if db_ids:
            # the full answer is already a known drug name
            self.mapped_survey_answers.append(answer)
            mapped_db_ids = db_ids
        elif first_word_db_ids:
            # only the first word is known: remember the full answer too
            self.drug_dictionary[answer] = first_word_db_ids
            self.first_name_mapped_survey_answers.append(answer)
            mapped_db_ids = first_word_db_ids
        else:
            self.unmapped_survey_answers.append(answer)
            mapped_db_ids = set()

        # for each of the drugbank ids, update the frequency dictionary
        for db_id in mapped_db_ids:
            self.drug_frequencies[db_id] += 1

    # use metaphone to map phonetic encodings to drugbank ids
    mp = Metaphone()

    # encoding -> drugbank ids of the first entry seen with that encoding
    encoded_drug_dict = {}
    # encodings shared by entries with different ids (phonetically
    # identical but distinct drugs); a set keeps membership tests O(1)
    ambiguous_encodings = set()

    for drug, drug_ids in self.drug_dictionary.items():
        encoding = mp.encode(drug)
        if encoding not in encoded_drug_dict:
            encoded_drug_dict[encoding] = drug_ids
        elif drug_ids != encoded_drug_dict[encoding]:
            ambiguous_encodings.add(encoding)

    # keep only encodings that identify a single set of drugbank ids
    encoded_drug_dict = {
        key: val
        for key, val in encoded_drug_dict.items()
        if key not in ambiguous_encodings
    }

    # Partition the still-unmapped answers in one pass, encoding each
    # answer once. Results are collected locally and published together.
    mapped_by_encoding = []
    unmapped_by_encoding = []
    resolved_ids = []
    for answer in self.unmapped_survey_answers:
        encoding = mp.encode(answer)
        if encoding in encoded_drug_dict:
            mapped_by_encoding.append(answer)
            resolved_ids.append(encoded_drug_dict[encoding])
        else:
            unmapped_by_encoding.append(answer)

    # survey answers whose encodings are valid
    self.mapped_by_encoding = mapped_by_encoding

    # add answers to the drug dictionary under the encoding's drugbank ids
    for answer, ids in zip(mapped_by_encoding, resolved_ids):
        self.drug_dictionary[answer] = ids

    # drugs still unmapped by phonetic encoding
    self.unmapped_by_encoding = unmapped_by_encoding