def __init__(self, model='latin'):
    self.model = model
    self.impH = input_helpers.InputHelper()
    self.ST = syllable_tokenizer.SyllableTokenizer()

    # Phonetic Encoder
    self.pe = Ainsworth()

    # Soundex Firstname Algorithm
    self.pshp_soundex_first = PSHPSoundexFirst()

    # String Distance algorithms
    self.algos = [
        IterativeSubString(), BISIM(), DiscountedLevenshtein(), Prefix(),
        LCSstr(), MLIPNS(), Strcmp95(), MRA(), Editex(), SAPS(),
        FlexMetric(), JaroWinkler(mode='Jaro'), HigueraMico(), Sift4(),
        Eudex(), ALINE(), Covington(), PhoneticEditDistance()
    ]
    self.algo_names = [
        'iterativesubstring', 'bisim', 'discountedlevenshtein', 'prefix',
        'lcsstr', 'mlipns', 'strcmp95', 'mra', 'editex', 'saps',
        'flexmetric', 'jaro', 'higueramico', 'sift4', 'eudex', 'aline',
        'covington', 'phoneticeditdistance'
    ]

    # extract model tarball into directory if it doesn't exist
    model_dir = os.path.join(os.path.dirname(__file__), "models", self.model)
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
        tar = tarfile.open(
            os.path.join(os.path.dirname(__file__), "models",
                         self.model + ".tar.gz"),
            "r:gz")
        tar.extractall(model_dir)
        tar.close()

    # String Distance Pipeline (Level 0/Base Model)
    self.baseModel = joblib.load(os.path.join(model_dir, 'base.pkl'))

    # Character Embedding Network (Level 0/Base Model)
    self.vocab = preprocess.VocabularyProcessor(
        max_document_length=15,
        min_frequency=0).restore(os.path.join(model_dir, 'vocab'))
    siamese_model = os.path.join(model_dir, 'siamese')

    graph = tf.Graph()
    with graph.as_default() as graph:
        self.sess = tf.Session()
        with self.sess.as_default():
            # Load the saved meta graph and restore variables
            saver = tf.train.import_meta_graph(
                '{}.meta'.format(siamese_model))
            self.sess.run(tf.global_variables_initializer())
            saver.restore(self.sess, siamese_model)

            # Get the placeholders from the graph by name
            self.input_x1 = graph.get_operation_by_name('input_x1').outputs[0]
            self.input_x2 = graph.get_operation_by_name('input_x2').outputs[0]
            self.dropout_keep_prob = graph.get_operation_by_name(
                'dropout_keep_prob').outputs[0]
            self.prediction = graph.get_operation_by_name(
                'output/distance').outputs[0]
            self.sim = graph.get_operation_by_name(
                'accuracy/temp_sim').outputs[0]

    # Logreg (Level 1/Meta Model)
    self.metaModel = joblib.load(os.path.join(model_dir, 'meta.pkl'))

    # seen names (mapping dict from raw name to processed name)
    self.seen_names = {}
    # seen pairs (mapping dict from name pair tuple to similarity)
    self.seen_pairs = {}
iss = IterativeSubString()
bisim = BISIM()
dlev = DiscountedLevenshtein()
prefix = Prefix()
lcs = LCSstr()
mlipns = MLIPNS()
strcmp95 = Strcmp95()
mra = MRA()
editex = Editex()
saps = SAPS()
flexmetric = FlexMetric()
jaro = JaroWinkler(mode='Jaro')
higuera_mico = HigueraMico()
sift4 = Sift4()
eudex = Eudex()
aline = ALINE()
phonetic_edit = PhoneticEditDistance()

algos = [iss, bisim, dlev, prefix, lcs, mlipns, strcmp95, mra, editex, saps,
         flexmetric, jaro, higuera_mico, sift4, eudex, aline, phonetic_edit]
algo_names = ['iterativesubstring', 'bisim', 'discountedlevenshtein', 'prefix',
              'lcsstr', 'mlipns', 'strcmp95', 'mra', 'editex', 'saps',
              'flexmetric', 'jaro', 'higueramico', 'sift4', 'eudex', 'aline',
              'phoneticeditdistance']


def sum_ipa(name_a, name_b):
    # `pe` is the Ainsworth() phonetic encoder instantiated elsewhere in this
    # module; ipa_to_features/cmp_features come from abydos.phones.
    feat1 = ipa_to_features(pe.encode(name_a))
    feat2 = ipa_to_features(pe.encode(name_b))
    # Mean per-phone feature similarity; the first branch guards against
    # dividing by zero when the encoding is empty.
    if len(feat1) <= 1:
        score = sum(cmp_features(f1, f2) for f1, f2 in zip(feat1, feat2)) / 1
    else:
        score = sum(cmp_features(f1, f2) for f1, f2 in zip(feat1, feat2)) / len(feat1)
    return score
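# A minimal sketch (an assumption, not taken from the source): combining the
# string-distance objects above with sum_ipa() to build one feature dict per
# name pair. It assumes each abydos distance object exposes the normalized
# sim() method; pair_features is a hypothetical helper name.
def pair_features(name_a, name_b):
    feats = {name: algo.sim(name_a, name_b)
             for name, algo in zip(algo_names, algos)}
    feats['sum_ipa'] = sum_ipa(name_a, name_b)
    return feats

# e.g. pair_features('johan', 'juan')
#   -> {'iterativesubstring': ..., 'bisim': ..., ..., 'sum_ipa': ...}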
def test_aline_alignments(self): """Test abydos.distance.ALINE.alignments.""" # test cases from Kondrak (2000) self.assertEqual( self.cmp.alignments('driy', 'tres'), [(75.0, '‖ d r iy ‖', '‖ t r e ‖ s')], ) self.assertEqual( self.cmp.alignments('blow', 'flare'), [(53.0, '‖ b l o ‖ w', '‖ f l a ‖ re')], ) self.assertEqual( self.cmp.alignments('ful', 'plenus'), [(48.0, '‖ f u l ‖', '‖ p - l ‖ enus')], ) self.assertEqual( self.cmp.alignments('fiz', 'piskis'), [(63.0, '‖ f i z ‖', '‖ p i s ‖ kis')], ) self.assertEqual(self.cmp.alignments('ay', 'ego'), [(17.5, '‖ ay ‖', '‖ e ‖ go')]) self.assertEqual( self.cmp.alignments('tuwz', 'dentis'), [(75.0, '‖ t uw z ‖', 'den ‖ t i s ‖')], ) # test cases from Kondrak (2002) after Covington (1996) # Some of these alignments are a little different from what's in the # thesis because of the differing encoding used. self.assertEqual(self.cmp.alignments('jo', 'zPe'), [(29.0, '‖ j o ‖', '‖ zP e ‖')]) self.assertEqual(self.cmp.alignments('tu', 'tuF'), [(45.0, '‖ t u ‖', '‖ t uF ‖')]) self.assertEqual( self.cmp.alignments('nostros', 'nu'), [(47.5, '‖ n o ‖ stros', '‖ n u ‖')], ) self.assertEqual( self.cmp.alignments('kyen', 'ki'), [(47.5, '‖ k ye ‖ n', '‖ k i ‖')], ) self.assertEqual(self.cmp.alignments('ke', 'kwa'), [(42.5, '‖ k e ‖', '‖ k wa ‖')]) self.assertEqual( self.cmp.alignments('todos', 'tu'), [(47.5, '‖ t o ‖ dos', '‖ t u ‖')], ) self.assertEqual( self.cmp.alignments('una', 'uFn'), [(45.0, '‖ u n ‖ a', '‖ uF n ‖')], ) self.assertEqual( self.cmp.alignments('dos', 'doF'), [(45.0, '‖ d o ‖ s', '‖ d oF ‖')], ) self.assertEqual( self.cmp.alignments('tres', 'trwa'), [(77.5, '‖ t r e ‖ s', '‖ t r wa ‖')], ) self.assertEqual( self.cmp.alignments('ombre', 'om'), [ (50.0, '‖ o m ‖ bre', '‖ o m ‖'), (50.0, '‖ o mb ‖ re', '‖ o m ‖'), ], ) self.assertEqual( self.cmp.alignments('arbol', 'arbreC'), [(88.0, '‖ a r b o l ‖', '‖ a r b - r ‖ eC')], ) self.assertEqual( self.cmp.alignments('pluFma', 'plum'), [(115.0, '‖ p l uF m ‖ a', '‖ p l u m ‖')], ) self.assertEqual( self.cmp.alignments('kabetSa', 'kap'), [(75.0, '‖ k a b ‖ etSa', '‖ k a p ‖')], ) self.assertEqual( self.cmp.alignments('boka', 'busP'), [(68.5, '‖ b o k ‖ a', '‖ b u sP ‖')], ) self.assertEqual( self.cmp.alignments('pye', 'pye'), [(65.0, '‖ p y e ‖', '‖ p y e ‖')], ) self.assertEqual( self.cmp.alignments('koratSon', 'koFr'), [(80.0, '‖ k o r ‖ atSon', '‖ k oF r ‖')], ) self.assertEqual( self.cmp.alignments('ber', 'vwar'), [(60.5, '‖ b e r ‖', '‖ v wa r ‖')], ) self.assertEqual( self.cmp.alignments('benir', 'veCnir'), [(115.5, '‖ b e n i r ‖', '‖ v eC n i r ‖')], ) self.assertEqual( self.cmp.alignments('detSir', 'dir'), [ (65.0, 'de ‖ tS i r ‖', '‖ d i r ‖'), (65.0, '‖ d e tS i r ‖', '‖ d - - i r ‖'), ], ) self.assertEqual( self.cmp.alignments('pobre', 'povreC'), [(115.5, '‖ p o b r e ‖', '‖ p o v r eC ‖')], ) self.assertEqual( self.cmp.alignments('dSis', 'diHzes'), [(77.5, '‖ dS i s ‖', 'diH ‖ z e s ‖')], ) self.assertEqual( self.cmp.alignments('dSaFt', 'das'), [(62.5, '‖ dS aF t ‖', '‖ d a s ‖')], ) # Different from paper: self.assertEqual( self.cmp.alignments('wat', 'vas'), [(40.0, 'w ‖ a t ‖', 'v ‖ a s ‖')], ) self.assertEqual( self.cmp.alignments('nat', 'nixt'), [ (62.5, '‖ n a - t ‖', '‖ n i x t ‖'), (62.5, '‖ n a t ‖', '‖ n i xt ‖'), ], ) self.assertEqual( self.cmp.alignments('logN', 'lagN'), [(75.0, '‖ l o gN ‖', '‖ l a gN ‖')], ) self.assertEqual( self.cmp.alignments('maFn', 'man'), [(82.5, '‖ m aF n ‖', '‖ m a n ‖')], ) self.assertEqual( self.cmp.alignments('flesP', 'flaysP'), [(122.5, '‖ f l e 
sP ‖', '‖ f l ay sP ‖')], ) self.assertEqual( self.cmp.alignments('bleCd', 'bluHt'), [(99.0, '‖ b l eC d ‖', '‖ b l uH t ‖')], ) self.assertEqual( self.cmp.alignments('fedSeCr', 'feHdeCr'), [(124.0, '‖ f e dS eC r ‖', '‖ f eH d eC r ‖')], ) self.assertEqual( self.cmp.alignments('haFr', 'haHr'), [(81.5, '‖ h aF r ‖', '‖ h aH r ‖')], ) self.assertEqual(self.cmp.alignments('ir', 'oHr'), [(41.5, '‖ i r ‖', '‖ oH r ‖')]) self.assertEqual( self.cmp.alignments('ay', 'awgeC'), [(20.0, '‖ a y ‖', '‖ a w ‖ geC')], ) self.assertEqual( self.cmp.alignments('nowz', 'naHzeC'), [(70.5, '‖ n ow z ‖', '‖ n aH z ‖ eC')], ) self.assertEqual( self.cmp.alignments('mawtS', 'munt'), [(62.5, '‖ m aw - tS ‖', '‖ m u n t ‖')], ) self.assertEqual( self.cmp.alignments('teCgN', 'tsugNeC'), [(75.0, '‖ t eC gN ‖', '‖ ts u gN ‖ eC')], ) self.assertEqual( self.cmp.alignments('fut', 'fuHs'), [(74.0, '‖ f u t ‖', '‖ f uH s ‖')], ) self.assertEqual( self.cmp.alignments('niy', 'kniH'), [(53.0, '‖ n iy ‖', 'k ‖ n iH ‖')], ) self.assertEqual( self.cmp.alignments('haFnd', 'hant'), [(107.5, '‖ h aF n d ‖', '‖ h a n t ‖')], ) self.assertEqual( self.cmp.alignments('hart', 'herts'), [ (115.0, '‖ h a r t ‖', '‖ h e r t ‖ s'), (115.0, '‖ h a r t ‖', '‖ h e r ts ‖'), ], ) self.assertEqual( self.cmp.alignments('liveCr', 'leHbeCr'), [(109.5, '‖ l i v eC r ‖', '‖ l eH b eC r ‖')], ) self.assertEqual( self.cmp.alignments('aFnd', 'ante'), [(72.5, '‖ aF n d ‖', '‖ a n t ‖ e')], ) self.assertEqual(self.cmp.alignments('aFt', 'ad'), [(37.5, '‖ aF t ‖', '‖ a d ‖')]) self.assertEqual( self.cmp.alignments('blow', 'flaHre'), [(52.0, '‖ b l o ‖ w', '‖ f l aH ‖ re')], ) # Different from paper: self.assertEqual( self.cmp.alignments('ir', 'awris'), [(45.0, '‖ i r ‖', 'a ‖ w r ‖ is')], ) self.assertEqual( self.cmp.alignments('iyt', 'edere'), [(40.0, '‖ iy t ‖', '‖ e d ‖ ere')], ) self.assertEqual( self.cmp.alignments('fisS', 'piskis'), [(73.0, '‖ f i sS ‖', '‖ p i s ‖ kis')], ) self.assertEqual( self.cmp.alignments('flow', 'fluere'), [(92.5, '‖ f l ow ‖', '‖ f l u ‖ ere')], ) self.assertEqual( self.cmp.alignments('star', 'steHlla'), [(92.0, '‖ s t a r ‖', '‖ s t eH l ‖ la')], ) self.assertEqual( self.cmp.alignments('ful', 'pleHnus'), [(48.0, '‖ f u l ‖', '‖ p - l ‖ eHnus')], ) self.assertEqual( self.cmp.alignments('graFs', 'graHmen'), [(81.5, '‖ g r aF ‖ s', '‖ g r aH ‖ men')], ) self.assertEqual( self.cmp.alignments('hart', 'kordis'), [(70.0, '‖ h a r t ‖', '‖ k o r d ‖ is')], ) self.assertEqual( self.cmp.alignments('horn', 'kornuH'), [(90.0, '‖ h o r n ‖', '‖ k o r n ‖ uH')], ) self.assertEqual(self.cmp.alignments('ay', 'ego'), [(17.5, '‖ ay ‖', '‖ e ‖ go')]) self.assertEqual( self.cmp.alignments('niy', 'genuH'), [(44.0, '‖ n i ‖ y', 'ge ‖ n uH ‖')], ) self.assertEqual( self.cmp.alignments('meCdSeCr', 'maHter'), [(109.0, '‖ m eC dS eC r ‖', '‖ m aH t e r ‖')], ) self.assertEqual( self.cmp.alignments('mawnteCn', 'moHns'), [(105.5, '‖ m aw n t ‖ eCn', '‖ m oH n s ‖')], ) # The example below is different from the expected, but # (73.0, '‖ n ey m ‖', '‖ n oH m ‖ en') is the #2 alignment. # This is probably due to slightly differing weights/costs/features. 
self.assertEqual( self.cmp.alignments('neym', 'noHmen'), [(80.5, '‖ n ey m ‖', 'noH ‖ m e n ‖')], ) self.assertEqual( self.cmp.alignments('nyuw', 'nowus'), [(70.0, '‖ n yu w ‖', '‖ n o wu ‖ s')], ) self.assertEqual( self.cmp.alignments('weCn', 'uHnus'), [(48.0, '‖ weC n ‖', '‖ uH n ‖ us')], ) self.assertEqual( self.cmp.alignments('rawnd', 'rotundus'), [(115.0, '‖ r a - w n d ‖', '‖ r o t u n d ‖ us')], ) self.assertEqual( self.cmp.alignments('sow', 'suere'), [(57.5, '‖ s ow ‖', '‖ s u ‖ ere')], ) self.assertEqual( self.cmp.alignments('sit', 'seHdere'), [(66.5, '‖ s i t ‖', '‖ s eH d ‖ ere')], ) self.assertEqual( self.cmp.alignments('tSriy', 'treHs'), [(73.0, '‖ tS r iy ‖', '‖ t r eH ‖ s')], ) self.assertEqual( self.cmp.alignments('tuwtS', 'dentis'), [(85.0, '‖ t uw tS ‖', 'den ‖ t i s ‖')], ) self.assertEqual( self.cmp.alignments('tSin', 'tenuis'), [(67.5, '‖ tS i n ‖', '‖ t e n ‖ uis')], ) self.assertEqual( self.cmp.alignments('kiHnwaHwa', 'kenuaq'), [(105.5, '‖ k iH n w aH ‖ wa', '‖ k e n u a ‖ q')], ) self.assertEqual( self.cmp.alignments('niHna', 'nenah'), [(91.5, '‖ n iH n a ‖', '‖ n e n a ‖ h')], ) self.assertEqual( self.cmp.alignments('naHpeHwa', 'naHpeHw'), [(115.0, '‖ n aH p eH w ‖ a', '‖ n aH p eH w ‖')], ) self.assertEqual( self.cmp.alignments('waHpimini', 'waHpemen'), [(150.0, '‖ w aH p i m i n ‖ i', '‖ w aH p e m e n ‖')], ) self.assertEqual( self.cmp.alignments('nameHsa', 'nameHqs'), [(125.0, '‖ n a m eH - s ‖ a', '‖ n a m eH q s ‖')], ) self.assertEqual( self.cmp.alignments('okimaHwa', 'okeHmaHw'), [(121.5, '‖ o k i m aH w ‖ a', '‖ o k eH m aH w ‖')], ) self.assertEqual( self.cmp.alignments('sPiHsPiHpa', 'seHqsep'), [(97.0, '‖ sP iH - sP iH p ‖ a', '‖ s eH q s e p ‖')], ) self.assertEqual( self.cmp.alignments('ahkohkwa', 'ahkeHh'), [(124.0, '‖ a h k o h ‖ kwa', '‖ a h k eH h ‖')], ) self.assertEqual( self.cmp.alignments('pemaHtesiweni', 'pemaHtesewen'), [( 257.5, '‖ p e m aH t e s i w e n ‖ i', '‖ p e m aH t e s e w e n ‖', )], ) self.assertEqual( self.cmp.alignments('asenya', 'aqsen'), [(90.0, '‖ a - s e n ‖ ya', '‖ a q s e n ‖')], ) self.assertEqual( self.cmp.alignments('didoHmi', 'doH'), [(50.0, 'di ‖ d oH ‖ mi', '‖ d oH ‖')], ) self.assertEqual( self.cmp.alignments('tAugateEr', 'toxteCr'), [(130.0, '‖ tA u g a t e r ‖', '‖ t o x - t eC r ‖')], ) self.assertEqual( self.cmp.alignments('doteCr', 'tAugateEr'), [(112.5, '‖ d o t eC r ‖', 'tAu ‖ g a t e r ‖')], ) self.assertEqual( self.cmp.alignments('ager', 'azPras'), [(61.0, '‖ a g e r ‖', '‖ a zP - r ‖ as')], ) self.assertEqual( self.cmp.alignments('bAaraHmi', 'pAero'), [(74.0, '‖ bA a r aH ‖ mi', '‖ pA e r o ‖')], ) self.assertEqual( self.cmp.alignments('kentum', 'hekaton'), [ (111.5, '‖ k e n t u m ‖', 'he ‖ k a - t o n ‖'), (111.5, '‖ k e nt u m ‖', 'he ‖ k a t o n ‖'), ], ) self.assertEqual( self.cmp.alignments('kentum', 'sateCm'), [ (90.0, '‖ k e n t u m ‖', '‖ s a - t eC m ‖'), (90.0, '‖ k e nt u m ‖', '‖ s a t eC m ‖'), ], ) # test cases from Downey, et al. 
(2008) self.assertEqual( self.cmp.alignments('api', 'api'), [(65.0, '‖ a p i ‖', '‖ a p i ‖')], ) self.assertEqual( self.cmp.alignments('apik', 'apik'), [(100.0, '‖ a p i k ‖', '‖ a p i k ‖')], ) self.assertEqual( self.cmp.alignments('apila', 'apila'), [(115.0, '‖ a p i l a ‖', '‖ a p i l a ‖')], ) self.assertEqual( self.cmp.alignments('api', 'apik'), [(65.0, '‖ a p i ‖', '‖ a p i ‖ k')], ) self.assertEqual( self.cmp.alignments('api', 'apila'), [(65.0, '‖ a p i ‖', '‖ a p i ‖ la')], ) self.assertEqual( self.cmp.alignments('apik', 'apila'), [(65.0, '‖ a p i ‖ k', '‖ a p i ‖ la')], ) self.assertEqual( self.cmp.alignments('kalarita', 'kalarita'), [(200.0, '‖ k a l a r i t a ‖', '‖ k a l a r i t a ‖')], ) self.assertEqual( self.cmp.alignments('kalara', 'kalara'), [(150.0, '‖ k a l a r a ‖', '‖ k a l a r a ‖')], ) self.assertEqual( self.cmp.alignments('makebela', 'makebela'), [(200.0, '‖ m a k e b e l a ‖', '‖ m a k e b e l a ‖')], ) # The following case has a different score, but the same alignment as # in Downey, et. al (2008) self.assertEqual( self.cmp.alignments('kalarita', 'kalara'), [(137.5, '‖ k a l a r i ‖ ta', '‖ k a l a r a ‖')], ) self.assertEqual( self.cmp.alignments('kalarita', 'makebela'), [ (75.0, '‖ k - - a l a ‖ rita', 'ma ‖ k e b e l a ‖'), (75.0, '‖ k a - - l a ‖ rita', 'ma ‖ k e b e l a ‖'), ], ) self.assertEqual( self.cmp.alignments('kalara', 'makebela'), [(82.0, '‖ k a l a r a ‖', 'ma ‖ k e b e l a ‖')], ) # other alignment styles: cmp2 = ALINE(mode='local') self.assertEqual( cmp2.alignments('aHpakosiHs', 'waHpikonoHha'), [(120.0, '‖ aH p a k o s iH s ‖', 'w ‖ aH p i k o n oH h ‖ a')], ) cmp2 = ALINE(mode='semi-global') self.assertEqual( cmp2.alignments('aHpakosiHs', 'waHpikonoHha'), [(120.0, '‖ aH p a k o s iH s ‖', 'w ‖ aH p i k o n oH h ‖ a')], ) cmp2 = ALINE(mode='half-local') self.assertEqual( cmp2.alignments('aHpakosiHs', 'waHpikonoHha'), [(110.0, '‖ aH p a k o s iH s - ‖', 'w ‖ aH p i k o n oH h a ‖')], ) cmp2 = ALINE(mode='global') self.assertEqual( cmp2.alignments('aHpakosiHs', 'waHpikonoHha'), [(106.5, '‖ aH p a k o s iH s - ‖', '‖ waH p i k o n oH h a ‖')], ) # The following just confirms that unknown values of mode use 'local' cmp2 = ALINE(mode='universal') self.assertEqual( cmp2.alignments('aHpakosiHs', 'waHpikonoHha'), [(120.0, '‖ aH p a k o s iH s ‖', 'w ‖ aH p i k o n oH h ‖ a')], ) self.assertEqual( cmp2.alignments('kan', 'kaABCDHn'), [(84.0, '‖ k a n ‖', '‖ k aABCDH n ‖')], ) self.assertEqual( cmp2.alignments('kaABCDHn', 'kan'), [(84.0, '‖ k aABCDH n ‖', '‖ k a n ‖')], ) cmp2 = ALINE(phones='ipa') self.assertEqual( cmp2.alignments('kɒgneit', 'kognaːtus'), [(163.0, '‖ k ɒ g n ei t ‖', '‖ k o g n aː t ‖ us')], )
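# A short usage sketch of the API exercised by the test above (illustrative
# only, assuming self.cmp in the tests is a default ALINE() instance):
# ALINE.alignments() returns a list of (score, src_alignment, tar_alignment)
# tuples, with more than one entry when alignments tie.
from abydos.distance import ALINE

cmp = ALINE()
for score, src, tar in cmp.alignments('kentum', 'sateCm'):
    print(score, src, tar)
# Per the assertions above, this pair yields two tied alignments scoring 90.0.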
def __init__(self, model='latin', prefilter=True, allow_alt_surname=True,
             allow_initials=True, allow_missing_components=True):
    # user-provided parameters
    self.model = model
    self.allow_alt_surname = allow_alt_surname
    self.allow_initials = allow_initials
    self.allow_missing_components = allow_missing_components
    self.prefilter = prefilter
    if self.prefilter:
        self.refined_soundex = {
            'b': 1, 'p': 1,
            'f': 2, 'v': 2,
            'c': 3, 'k': 3, 's': 3,
            'g': 4, 'j': 4,
            'q': 5, 'x': 5, 'z': 5,
            'd': 6, 't': 6,
            'l': 7,
            'm': 8, 'n': 8,
            'r': 9
        }

    # verify user-supplied class arguments
    model_dir = self.validate_parameters()

    self.impH = input_helpers.InputHelper()

    # Phonetic Encoder
    self.pe = Ainsworth()

    # Soundex Firstname Algorithm
    self.pshp_soundex_first = PSHPSoundexFirst()
    # Soundex Lastname Algorithm
    self.pshp_soundex_last = PSHPSoundexLast()

    # String Distance algorithms
    self.algos = [
        IterativeSubString(), BISIM(), DiscountedLevenshtein(), Prefix(),
        LCSstr(), MLIPNS(), Strcmp95(), MRA(), Editex(), SAPS(),
        FlexMetric(), JaroWinkler(mode='Jaro'), HigueraMico(), Sift4(),
        Eudex(), ALINE(), CovingtonGuard(), PhoneticEditDistance()
    ]
    self.algo_names = [
        'iterativesubstring', 'bisim', 'discountedlevenshtein', 'prefix',
        'lcsstr', 'mlipns', 'strcmp95', 'mra', 'editex', 'saps',
        'flexmetric', 'jaro', 'higueramico', 'sift4', 'eudex', 'aline',
        'covington', 'phoneticeditdistance'
    ]

    # String Distance Pipeline (Level 0/Base Model)
    self.baseModel = joblib.load(os.path.join(model_dir, 'base.pkl'))

    # Character Embedding Network (Level 0/Base Model)
    self.vocab = preprocess.VocabularyProcessor(
        max_document_length=15,
        min_frequency=0).restore(os.path.join(model_dir, 'vocab'))
    siamese_model = os.path.join(model_dir, 'siamese')

    # start tensorflow session
    graph = tf.Graph()
    with graph.as_default() as graph:
        self.sess = (tf.Session() if tf.__version__[0] == '1'
                     else tf.compat.v1.Session())
        with self.sess.as_default():
            # Load the saved meta graph and restore variables
            if tf.__version__[0] == '1':
                saver = tf.train.import_meta_graph('{}.meta'.format(siamese_model))
                self.sess.run(tf.global_variables_initializer())
            else:
                saver = tf.compat.v1.train.import_meta_graph('{}.meta'.format(siamese_model))
                self.sess.run(tf.compat.v1.global_variables_initializer())
            saver.restore(self.sess, siamese_model)

            # Get the placeholders from the graph by name
            self.input_x1 = graph.get_operation_by_name('input_x1').outputs[0]
            self.input_x2 = graph.get_operation_by_name('input_x2').outputs[0]
            self.dropout_keep_prob = graph.get_operation_by_name('dropout_keep_prob').outputs[0]
            self.prediction = graph.get_operation_by_name('output/distance').outputs[0]
            self.sim = graph.get_operation_by_name('accuracy/temp_sim').outputs[0]

    # Logreg (Level 1/Meta Model)
    self.metaModel = joblib.load(os.path.join(model_dir, 'meta.pkl'))

    # seen names (mapping dict from raw name to processed name)
    self.seen_names = {}
    # seen pairs (mapping dict from name pair tuple to similarity)
    self.seen_pairs = {}
    # user scores (mapping dict from name pair tuple to similarity)
    self.user_scores = {}
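# A hedged sketch (an assumption, not taken from the source): how the siamese
# graph restored in the constructor above could be queried for one name pair.
# `matcher` stands for an instance of the class owning that __init__, and
# siamese_distance is a hypothetical helper name.
def siamese_distance(matcher, name_a, name_b):
    # vocab.transform yields fixed-length id sequences (max_document_length=15)
    x1 = list(matcher.vocab.transform([name_a]))
    x2 = list(matcher.vocab.transform([name_b]))
    return matcher.sess.run(
        matcher.prediction,
        feed_dict={
            matcher.input_x1: x1,
            matcher.input_x2: x2,
            matcher.dropout_keep_prob: 1.0,  # no dropout at inference time
        },
    )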
def test_aline_alignment(self): """Test abydos.distance.ALINE.alignment.""" # test cases from Kondrak (2000) self.assertEqual( self.cmp.alignment('driy', 'tres'), [(75.0, '‖ d r iy ‖', '‖ t r e ‖ s')], ) self.assertEqual( self.cmp.alignment('blow', 'flare'), [(53.0, '‖ b l o ‖ w', '‖ f l a ‖ re')], ) self.assertEqual( self.cmp.alignment('ful', 'plenus'), [(48.0, '‖ f u l ‖', '‖ p - l ‖ enus')], ) self.assertEqual( self.cmp.alignment('fiz', 'piskis'), [(63.0, '‖ f i z ‖', '‖ p i s ‖ kis')], ) self.assertEqual( self.cmp.alignment('ay', 'ego'), [(17.5, '‖ ay ‖', '‖ e ‖ go')] ) self.assertEqual( self.cmp.alignment('tuwz', 'dentis'), [(75.0, '‖ t uw z ‖', 'den ‖ t i s ‖')], ) # test cases from Kondrak (2002) after Covington (1996) # Some of these alignments are a little different from what's in the # thesis because of the differing encoding used. self.assertEqual( self.cmp.alignment('jo', 'zPe'), [(29.0, '‖ j o ‖', '‖ zP e ‖')] ) self.assertEqual( self.cmp.alignment('tu', 'tuF'), [(45.0, '‖ t u ‖', '‖ t uF ‖')] ) self.assertEqual( self.cmp.alignment('nostros', 'nu'), [(47.5, '‖ n o ‖ stros', '‖ n u ‖')], ) self.assertEqual( self.cmp.alignment('kyen', 'ki'), [(47.5, '‖ k ye ‖ n', '‖ k i ‖')], ) self.assertEqual( self.cmp.alignment('ke', 'kwa'), [(42.5, '‖ k e ‖', '‖ k wa ‖')] ) self.assertEqual( self.cmp.alignment('todos', 'tu'), [(47.5, '‖ t o ‖ dos', '‖ t u ‖')], ) self.assertEqual( self.cmp.alignment('una', 'uFn'), [(45.0, '‖ u n ‖ a', '‖ uF n ‖')], ) self.assertEqual( self.cmp.alignment('dos', 'doF'), [(45.0, '‖ d o ‖ s', '‖ d oF ‖')], ) self.assertEqual( self.cmp.alignment('tres', 'trwa'), [(77.5, '‖ t r e ‖ s', '‖ t r wa ‖')], ) self.assertEqual( self.cmp.alignment('ombre', 'om'), [ (50.0, '‖ o m ‖ bre', '‖ o m ‖'), (50.0, '‖ o mb ‖ re', '‖ o m ‖'), ], ) self.assertEqual( self.cmp.alignment('arbol', 'arbreC'), [(88.0, '‖ a r b o l ‖', '‖ a r b - r ‖ eC')], ) self.assertEqual( self.cmp.alignment('pluFma', 'plum'), [(115.0, '‖ p l uF m ‖ a', '‖ p l u m ‖')], ) self.assertEqual( self.cmp.alignment('kabetSa', 'kap'), [(75.0, '‖ k a b ‖ etSa', '‖ k a p ‖')], ) self.assertEqual( self.cmp.alignment('boka', 'busP'), [(68.5, '‖ b o k ‖ a', '‖ b u sP ‖')], ) self.assertEqual( self.cmp.alignment('pye', 'pye'), [(65.0, '‖ p y e ‖', '‖ p y e ‖')], ) self.assertEqual( self.cmp.alignment('koratSon', 'koFr'), [(80.0, '‖ k o r ‖ atSon', '‖ k oF r ‖')], ) self.assertEqual( self.cmp.alignment('ber', 'vwar'), [(60.5, '‖ b e r ‖', '‖ v wa r ‖')], ) self.assertEqual( self.cmp.alignment('benir', 'veCnir'), [(115.5, '‖ b e n i r ‖', '‖ v eC n i r ‖')], ) self.assertEqual( self.cmp.alignment('detSir', 'dir'), [ (65.0, 'de ‖ tS i r ‖', '‖ d i r ‖'), (65.0, '‖ d e tS i r ‖', '‖ d - - i r ‖'), ], ) self.assertEqual( self.cmp.alignment('pobre', 'povreC'), [(115.5, '‖ p o b r e ‖', '‖ p o v r eC ‖')], ) self.assertEqual( self.cmp.alignment('dSis', 'diHzes'), [(77.5, '‖ dS i s ‖', 'diH ‖ z e s ‖')], ) self.assertEqual( self.cmp.alignment('dSaFt', 'das'), [(62.5, '‖ dS aF t ‖', '‖ d a s ‖')], ) # Different from paper: self.assertEqual( self.cmp.alignment('wat', 'vas'), [(40.0, 'w ‖ a t ‖', 'v ‖ a s ‖')], ) self.assertEqual( self.cmp.alignment('nat', 'nixt'), [ (62.5, '‖ n a - t ‖', '‖ n i x t ‖'), (62.5, '‖ n a t ‖', '‖ n i xt ‖'), ], ) self.assertEqual( self.cmp.alignment('logN', 'lagN'), [(75.0, '‖ l o gN ‖', '‖ l a gN ‖')], ) self.assertEqual( self.cmp.alignment('maFn', 'man'), [(82.5, '‖ m aF n ‖', '‖ m a n ‖')], ) self.assertEqual( self.cmp.alignment('flesP', 'flaysP'), [(122.5, '‖ f l e sP ‖', '‖ f l ay sP ‖')], ) 
self.assertEqual( self.cmp.alignment('bleCd', 'bluHt'), [(99.0, '‖ b l eC d ‖', '‖ b l uH t ‖')], ) self.assertEqual( self.cmp.alignment('fedSeCr', 'feHdeCr'), [(124.0, '‖ f e dS eC r ‖', '‖ f eH d eC r ‖')], ) self.assertEqual( self.cmp.alignment('haFr', 'haHr'), [(81.5, '‖ h aF r ‖', '‖ h aH r ‖')], ) self.assertEqual( self.cmp.alignment('ir', 'oHr'), [(41.5, '‖ i r ‖', '‖ oH r ‖')] ) self.assertEqual( self.cmp.alignment('ay', 'awgeC'), [(20.0, '‖ a y ‖', '‖ a w ‖ geC')], ) self.assertEqual( self.cmp.alignment('nowz', 'naHzeC'), [(70.5, '‖ n ow z ‖', '‖ n aH z ‖ eC')], ) self.assertEqual( self.cmp.alignment('mawtS', 'munt'), [(62.5, '‖ m aw - tS ‖', '‖ m u n t ‖')], ) self.assertEqual( self.cmp.alignment('teCgN', 'tsugNeC'), [(75.0, '‖ t eC gN ‖', '‖ ts u gN ‖ eC')], ) self.assertEqual( self.cmp.alignment('fut', 'fuHs'), [(74.0, '‖ f u t ‖', '‖ f uH s ‖')], ) self.assertEqual( self.cmp.alignment('niy', 'kniH'), [(53.0, '‖ n iy ‖', 'k ‖ n iH ‖')], ) self.assertEqual( self.cmp.alignment('haFnd', 'hant'), [(107.5, '‖ h aF n d ‖', '‖ h a n t ‖')], ) self.assertEqual( self.cmp.alignment('hart', 'herts'), [ (115.0, '‖ h a r t ‖', '‖ h e r t ‖ s'), (115.0, '‖ h a r t ‖', '‖ h e r ts ‖'), ], ) self.assertEqual( self.cmp.alignment('liveCr', 'leHbeCr'), [(109.5, '‖ l i v eC r ‖', '‖ l eH b eC r ‖')], ) self.assertEqual( self.cmp.alignment('aFnd', 'ante'), [(72.5, '‖ aF n d ‖', '‖ a n t ‖ e')], ) self.assertEqual( self.cmp.alignment('aFt', 'ad'), [(37.5, '‖ aF t ‖', '‖ a d ‖')] ) self.assertEqual( self.cmp.alignment('blow', 'flaHre'), [(52.0, '‖ b l o ‖ w', '‖ f l aH ‖ re')], ) # Different from paper: self.assertEqual( self.cmp.alignment('ir', 'awris'), [(45.0, '‖ i r ‖', 'a ‖ w r ‖ is')], ) self.assertEqual( self.cmp.alignment('iyt', 'edere'), [(40.0, '‖ iy t ‖', '‖ e d ‖ ere')], ) self.assertEqual( self.cmp.alignment('fisS', 'piskis'), [(73.0, '‖ f i sS ‖', '‖ p i s ‖ kis')], ) self.assertEqual( self.cmp.alignment('flow', 'fluere'), [(92.5, '‖ f l ow ‖', '‖ f l u ‖ ere')], ) self.assertEqual( self.cmp.alignment('star', 'steHlla'), [(92.0, '‖ s t a r ‖', '‖ s t eH l ‖ la')], ) self.assertEqual( self.cmp.alignment('ful', 'pleHnus'), [(48.0, '‖ f u l ‖', '‖ p - l ‖ eHnus')], ) self.assertEqual( self.cmp.alignment('graFs', 'graHmen'), [(81.5, '‖ g r aF ‖ s', '‖ g r aH ‖ men')], ) self.assertEqual( self.cmp.alignment('hart', 'kordis'), [(70.0, '‖ h a r t ‖', '‖ k o r d ‖ is')], ) self.assertEqual( self.cmp.alignment('horn', 'kornuH'), [(90.0, '‖ h o r n ‖', '‖ k o r n ‖ uH')], ) self.assertEqual( self.cmp.alignment('ay', 'ego'), [(17.5, '‖ ay ‖', '‖ e ‖ go')] ) self.assertEqual( self.cmp.alignment('niy', 'genuH'), [(44.0, '‖ n i ‖ y', 'ge ‖ n uH ‖')], ) self.assertEqual( self.cmp.alignment('meCdSeCr', 'maHter'), [(109.0, '‖ m eC dS eC r ‖', '‖ m aH t e r ‖')], ) self.assertEqual( self.cmp.alignment('mawnteCn', 'moHns'), [(105.5, '‖ m aw n t ‖ eCn', '‖ m oH n s ‖')], ) # The example below is different from the expected, but # (73.0, '‖ n ey m ‖', '‖ n oH m ‖ en') is the #2 alignment. # This is probably due to slightly differing weights/costs/features. 
self.assertEqual( self.cmp.alignment('neym', 'noHmen'), [(80.5, '‖ n ey m ‖', 'noH ‖ m e n ‖')], ) self.assertEqual( self.cmp.alignment('nyuw', 'nowus'), [(70.0, '‖ n yu w ‖', '‖ n o wu ‖ s')], ) self.assertEqual( self.cmp.alignment('weCn', 'uHnus'), [(48.0, '‖ weC n ‖', '‖ uH n ‖ us')], ) self.assertEqual( self.cmp.alignment('rawnd', 'rotundus'), [(115.0, '‖ r a - w n d ‖', '‖ r o t u n d ‖ us')], ) self.assertEqual( self.cmp.alignment('sow', 'suere'), [(57.5, '‖ s ow ‖', '‖ s u ‖ ere')], ) self.assertEqual( self.cmp.alignment('sit', 'seHdere'), [(66.5, '‖ s i t ‖', '‖ s eH d ‖ ere')], ) self.assertEqual( self.cmp.alignment('tSriy', 'treHs'), [(73.0, '‖ tS r iy ‖', '‖ t r eH ‖ s')], ) self.assertEqual( self.cmp.alignment('tuwtS', 'dentis'), [(85.0, '‖ t uw tS ‖', 'den ‖ t i s ‖')], ) self.assertEqual( self.cmp.alignment('tSin', 'tenuis'), [(67.5, '‖ tS i n ‖', '‖ t e n ‖ uis')], ) self.assertEqual( self.cmp.alignment('kiHnwaHwa', 'kenuaq'), [(105.5, '‖ k iH n w aH ‖ wa', '‖ k e n u a ‖ q')], ) self.assertEqual( self.cmp.alignment('niHna', 'nenah'), [(91.5, '‖ n iH n a ‖', '‖ n e n a ‖ h')], ) self.assertEqual( self.cmp.alignment('naHpeHwa', 'naHpeHw'), [(115.0, '‖ n aH p eH w ‖ a', '‖ n aH p eH w ‖')], ) self.assertEqual( self.cmp.alignment('waHpimini', 'waHpemen'), [(150.0, '‖ w aH p i m i n ‖ i', '‖ w aH p e m e n ‖')], ) self.assertEqual( self.cmp.alignment('nameHsa', 'nameHqs'), [(125.0, '‖ n a m eH - s ‖ a', '‖ n a m eH q s ‖')], ) self.assertEqual( self.cmp.alignment('okimaHwa', 'okeHmaHw'), [(121.5, '‖ o k i m aH w ‖ a', '‖ o k eH m aH w ‖')], ) self.assertEqual( self.cmp.alignment('sPiHsPiHpa', 'seHqsep'), [(97.0, '‖ sP iH - sP iH p ‖ a', '‖ s eH q s e p ‖')], ) self.assertEqual( self.cmp.alignment('ahkohkwa', 'ahkeHh'), [(124.0, '‖ a h k o h ‖ kwa', '‖ a h k eH h ‖')], ) self.assertEqual( self.cmp.alignment('pemaHtesiweni', 'pemaHtesewen'), [ ( 257.5, '‖ p e m aH t e s i w e n ‖ i', '‖ p e m aH t e s e w e n ‖', ) ], ) self.assertEqual( self.cmp.alignment('asenya', 'aqsen'), [(90.0, '‖ a - s e n ‖ ya', '‖ a q s e n ‖')], ) self.assertEqual( self.cmp.alignment('didoHmi', 'doH'), [(50.0, 'di ‖ d oH ‖ mi', '‖ d oH ‖')], ) self.assertEqual( self.cmp.alignment('tAugateEr', 'toxteCr'), [(130.0, '‖ tA u g a t e r ‖', '‖ t o x - t eC r ‖')], ) self.assertEqual( self.cmp.alignment('doteCr', 'tAugateEr'), [(112.5, '‖ d o t eC r ‖', 'tAu ‖ g a t e r ‖')], ) self.assertEqual( self.cmp.alignment('ager', 'azPras'), [(61.0, '‖ a g e r ‖', '‖ a zP - r ‖ as')], ) self.assertEqual( self.cmp.alignment('bAaraHmi', 'pAero'), [(74.0, '‖ bA a r aH ‖ mi', '‖ pA e r o ‖')], ) self.assertEqual( self.cmp.alignment('kentum', 'hekaton'), [ (111.5, '‖ k e n t u m ‖', 'he ‖ k a - t o n ‖'), (111.5, '‖ k e nt u m ‖', 'he ‖ k a t o n ‖'), ], ) self.assertEqual( self.cmp.alignment('kentum', 'sateCm'), [ (90.0, '‖ k e n t u m ‖', '‖ s a - t eC m ‖'), (90.0, '‖ k e nt u m ‖', '‖ s a t eC m ‖'), ], ) # test cases from Downey, et al. 
(2008) self.assertEqual( self.cmp.alignment('api', 'api'), [(65.0, '‖ a p i ‖', '‖ a p i ‖')], ) self.assertEqual( self.cmp.alignment('apik', 'apik'), [(100.0, '‖ a p i k ‖', '‖ a p i k ‖')], ) self.assertEqual( self.cmp.alignment('apila', 'apila'), [(115.0, '‖ a p i l a ‖', '‖ a p i l a ‖')], ) self.assertEqual( self.cmp.alignment('api', 'apik'), [(65.0, '‖ a p i ‖', '‖ a p i ‖ k')], ) self.assertEqual( self.cmp.alignment('api', 'apila'), [(65.0, '‖ a p i ‖', '‖ a p i ‖ la')], ) self.assertEqual( self.cmp.alignment('apik', 'apila'), [(65.0, '‖ a p i ‖ k', '‖ a p i ‖ la')], ) self.assertEqual( self.cmp.alignment('kalarita', 'kalarita'), [(200.0, '‖ k a l a r i t a ‖', '‖ k a l a r i t a ‖')], ) self.assertEqual( self.cmp.alignment('kalara', 'kalara'), [(150.0, '‖ k a l a r a ‖', '‖ k a l a r a ‖')], ) self.assertEqual( self.cmp.alignment('makebela', 'makebela'), [(200.0, '‖ m a k e b e l a ‖', '‖ m a k e b e l a ‖')], ) # The following case has a different score, but the same alignment as # in Downey, et. al (2008) self.assertEqual( self.cmp.alignment('kalarita', 'kalara'), [(137.5, '‖ k a l a r i ‖ ta', '‖ k a l a r a ‖')], ) self.assertEqual( self.cmp.alignment('kalarita', 'makebela'), [ (75.0, '‖ k - - a l a ‖ rita', 'ma ‖ k e b e l a ‖'), (75.0, '‖ k a - - l a ‖ rita', 'ma ‖ k e b e l a ‖'), ], ) self.assertEqual( self.cmp.alignment('kalara', 'makebela'), [(82.0, '‖ k a l a r a ‖', 'ma ‖ k e b e l a ‖')], ) # other alignment styles: cmp2 = ALINE(mode='local') self.assertEqual( cmp2.alignment('aHpakosiHs', 'waHpikonoHha'), [(120.0, '‖ aH p a k o s iH s ‖', 'w ‖ aH p i k o n oH h ‖ a')], ) cmp2 = ALINE(mode='semi-global') self.assertEqual( cmp2.alignment('aHpakosiHs', 'waHpikonoHha'), [(120.0, '‖ aH p a k o s iH s ‖', 'w ‖ aH p i k o n oH h ‖ a')], ) cmp2 = ALINE(mode='half-local') self.assertEqual( cmp2.alignment('aHpakosiHs', 'waHpikonoHha'), [(110.0, '‖ aH p a k o s iH s - ‖', 'w ‖ aH p i k o n oH h a ‖')], ) cmp2 = ALINE(mode='global') self.assertEqual( cmp2.alignment('aHpakosiHs', 'waHpikonoHha'), [(106.5, '‖ aH p a k o s iH s - ‖', '‖ waH p i k o n oH h a ‖')], ) # The following just confirms that unknown values of mode use 'local' cmp2 = ALINE(mode='universal') self.assertEqual( cmp2.alignment('aHpakosiHs', 'waHpikonoHha'), [(120.0, '‖ aH p a k o s iH s ‖', 'w ‖ aH p i k o n oH h ‖ a')], ) self.assertEqual( cmp2.alignment('kan', 'kaABCDHn'), [(84.0, '‖ k a n ‖', '‖ k aABCDH n ‖')], ) self.assertEqual( cmp2.alignment('kaABCDHn', 'kan'), [(84.0, '‖ k aABCDH n ‖', '‖ k a n ‖')], ) cmp2 = ALINE(phones='ipa') self.assertEqual( cmp2.alignment('kɒgneit', 'kognaːtus'), [(163.0, '‖ k ɒ g n ei t ‖', '‖ k o g n aː t ‖ us')], )