class MraTestCases(unittest.TestCase):
    """Test MRA functions.

    abydos.distance.MRA
    """

    cmp = MRA()

    def test_mra_dist_abs(self):
        """Test abydos.distance.MRA.dist_abs."""
        # Identical strings (including the empty pair) earn the maximum
        # rating of 6.
        for word in ('', 'a', 'abcdefg'):
            self.assertEqual(self.cmp.dist_abs(word, word), 6)

        # A non-empty string against an empty one rates 0.
        self.assertEqual(self.cmp.dist_abs('abcdefg', ''), 0)
        self.assertEqual(self.cmp.dist_abs('', 'abcdefg'), 0)

        # https://en.wikipedia.org/wiki/Match_rating_approach
        for src, tar, rating in (
            ('Byrne', 'Boern', 5),
            ('Smith', 'Smyth', 5),
            ('Catherine', 'Kathryn', 4),
            ('ab', 'abcdefgh', 0),
            ('ab', 'ac', 5),
            ('abcdefik', 'abcdefgh', 3),
            ('xyz', 'abc', 0),
        ):
            self.assertEqual(self.cmp.dist_abs(src, tar), rating)

    def test_mra_sim(self):
        """Test abydos.distance.MRA.sim."""
        # Identical strings are maximally similar.
        for word in ('', 'a', 'abcdefg'):
            self.assertEqual(self.cmp.sim(word, word), 1)

        # A non-empty string against an empty one has zero similarity.
        self.assertEqual(self.cmp.sim('abcdefg', ''), 0)
        self.assertEqual(self.cmp.sim('', 'abcdefg'), 0)

        # https://en.wikipedia.org/wiki/Match_rating_approach
        # Similarity is the absolute rating normalized by 6.
        for src, tar, similarity in (
            ('Byrne', 'Boern', 5 / 6),
            ('Smith', 'Smyth', 5 / 6),
            ('Catherine', 'Kathryn', 4 / 6),
            ('ab', 'abcdefgh', 0),
            ('ab', 'ac', 5 / 6),
            ('abcdefik', 'abcdefgh', 3 / 6),
            ('xyz', 'abc', 0),
        ):
            self.assertEqual(self.cmp.sim(src, tar), similarity)

    def test_mra_dist(self):
        """Test abydos.distance.MRA.dist."""
        # Identical strings are at zero distance.
        for word in ('', 'a', 'abcdefg'):
            self.assertEqual(self.cmp.dist(word, word), 0)

        # A non-empty string against an empty one is maximally distant.
        self.assertEqual(self.cmp.dist('abcdefg', ''), 1)
        self.assertEqual(self.cmp.dist('', 'abcdefg'), 1)

        # https://en.wikipedia.org/wiki/Match_rating_approach
        # Extreme distances are checked exactly ...
        self.assertEqual(self.cmp.dist('ab', 'abcdefgh'), 1)
        self.assertEqual(self.cmp.dist('xyz', 'abc'), 1)
        # ... while fractional ones allow for float rounding.
        for src, tar, distance in (
            ('Byrne', 'Boern', 1 / 6),
            ('Smith', 'Smyth', 1 / 6),
            ('Catherine', 'Kathryn', 2 / 6),
            ('ab', 'ac', 1 / 6),
            ('abcdefik', 'abcdefgh', 3 / 6),
        ):
            self.assertAlmostEqual(self.cmp.dist(src, tar), distance)
from abydos.phonetic import PSHPSoundexFirst, Ainsworth
from abydos.phones import *
import re
from sklearn.preprocessing import MinMaxScaler

# Featurizer
# Module-level singletons shared by the featurization code in this file.
pshp_soundex_first = PSHPSoundexFirst()  # Soundex variant for first names
pe = Ainsworth()  # phonetic encoder (grapheme -> pronunciation)

# One instance of each string-distance measure used as a feature extractor.
iss = IterativeSubString()
bisim = BISIM()
dlev = DiscountedLevenshtein()
prefix = Prefix()
lcs = LCSstr()
mlipns = MLIPNS()
strcmp95 = Strcmp95()
mra = MRA()
editex = Editex()
saps = SAPS()
flexmetric = FlexMetric()
jaro = JaroWinkler(mode='Jaro')  # plain Jaro, i.e. no Winkler prefix bonus
higuera_mico = HigueraMico()
sift4 = Sift4()
eudex = Eudex()
aline = ALINE()
phonetic_edit = PhoneticEditDistance()

# Parallel lists: algos[i] produces the feature labelled algo_names[i].
# Keep both lists in the same order when adding or removing a measure.
algos = [iss, bisim, dlev, prefix, lcs, mlipns, strcmp95, mra, editex,
         saps, flexmetric, jaro, higuera_mico, sift4, eudex, aline,
         phonetic_edit]
algo_names = ['iterativesubstring', 'bisim', 'discountedlevenshtein',
              'prefix', 'lcsstr', 'mlipns', 'strcmp95', 'mra', 'editex',
              'saps', 'flexmetric', 'jaro', 'higueramico', 'sift4', 'eudex',
              'aline', 'phoneticeditdistance']
def __init__(self, model='latin'):
    """Load the two-level name-matching model named *model*.

    Restores the level-0 string-distance pipeline (``base.pkl``) and the
    level-0 siamese character-embedding network, then the level-1
    logistic-regression meta model (``meta.pkl``), extracting the model
    tarball next to this file on first use.

    Parameters:
        model: basename of a ``models/<model>.tar.gz`` archive shipped
            alongside this module (default ``'latin'``).
    """
    self.model = model
    self.impH = input_helpers.InputHelper()
    self.ST = syllable_tokenizer.SyllableTokenizer()
    # Phonetic Encoder
    self.pe = Ainsworth()
    # Soundex Firstname Algorithm
    self.pshp_soundex_first = PSHPSoundexFirst()
    # String Distance algorithms — kept in lock-step with self.algo_names:
    # self.algos[i] is scored under the label self.algo_names[i].
    self.algos = [
        IterativeSubString(), BISIM(), DiscountedLevenshtein(), Prefix(),
        LCSstr(), MLIPNS(), Strcmp95(), MRA(), Editex(), SAPS(),
        FlexMetric(), JaroWinkler(mode='Jaro'), HigueraMico(), Sift4(),
        Eudex(), ALINE(), Covington(), PhoneticEditDistance()
    ]
    self.algo_names = [
        'iterativesubstring', 'bisim', 'discountedlevenshtein', 'prefix',
        'lcsstr', 'mlipns', 'strcmp95', 'mra', 'editex', 'saps',
        'flexmetric', 'jaro', 'higueramico', 'sift4', 'eudex', 'aline',
        'covington', 'phoneticeditdistance'
    ]
    # extract model tarball into directory if doesnt exist
    model_dir = os.path.join(os.path.dirname(__file__), "models", self.model)
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
        tar = tarfile.open(
            os.path.join(os.path.dirname(__file__), "models",
                         self.model + ".tar.gz"), "r:gz")
        tar.extractall(model_dir)
        tar.close()
    # String Distance Pipeline (Level 0/Base Model)
    self.baseModel = joblib.load(os.path.join(model_dir, 'base.pkl'))
    # Character Embedding Network (Level 0/Base Model)
    # Vocabulary must match the one the siamese network was trained with.
    self.vocab = preprocess.VocabularyProcessor(
        max_document_length=15,
        min_frequency=0).restore(os.path.join(model_dir, 'vocab'))
    siamese_model = os.path.join(model_dir, 'siamese')
    graph = tf.Graph()
    with graph.as_default() as graph:
        self.sess = tf.Session()
        with self.sess.as_default():
            # Load the saved meta graph and restore variables
            saver = tf.train.import_meta_graph(
                '{}.meta'.format(siamese_model))
            self.sess.run(tf.global_variables_initializer())
            saver.restore(self.sess, siamese_model)
            # Get the placeholders from the graph by name; these tensor
            # handles are reused for every later sess.run() call.
            self.input_x1 = graph.get_operation_by_name(
                'input_x1').outputs[0]
            self.input_x2 = graph.get_operation_by_name(
                'input_x2').outputs[0]
            self.dropout_keep_prob = graph.get_operation_by_name(
                'dropout_keep_prob').outputs[0]
            self.prediction = graph.get_operation_by_name(
                'output/distance').outputs[0]
            self.sim = graph.get_operation_by_name(
                'accuracy/temp_sim').outputs[0]
    # Logreg (Level 1/Meta Model)
    self.metaModel = joblib.load(os.path.join(model_dir, 'meta.pkl'))
    # seen names (mapping dict from raw name to processed name)
    self.seen_names = {}
    # seen pairs (mapping dict from name pair tuple to similarity)
    self.seen_pairs = {}
def __init__(self, model='latin', prefilter=True, allow_alt_surname=True,
             allow_initials=True, allow_missing_components=True):
    """Initialize the matcher and load the *model* artifacts.

    Loads the level-0 string-distance pipeline (``base.pkl``), the level-0
    siamese character-embedding network (TF1 or TF2 compat mode), and the
    level-1 logistic-regression meta model (``meta.pkl``).

    Parameters:
        model: basename of the pretrained model directory; validated (and
            resolved to a path) by ``self.validate_parameters()``.
        prefilter: when True, build a refined-Soundex consonant-class
            table — presumably used to cheaply screen candidate pairs
            before full scoring (TODO confirm against the methods that
            read ``self.refined_soundex``).
        allow_alt_surname, allow_initials, allow_missing_components:
            matching-policy flags stored for later use; their semantics
            are defined by the methods that read them.
    """
    # user-provided parameters
    self.model = model
    self.allow_alt_surname = allow_alt_surname
    self.allow_initials = allow_initials
    self.allow_missing_components = allow_missing_components
    self.prefilter = prefilter
    if self.prefilter:
        # Refined-Soundex consonant classes: letters sharing a digit are
        # treated as equivalent sounds.
        self.refined_soundex = {
            'b': 1, 'p': 1,
            'f': 2, 'v': 2,
            'c': 3, 'k': 3, 's': 3,
            'g': 4, 'j': 4,
            'q': 5, 'x': 5, 'z': 5,
            'd': 6, 't': 6,
            'l': 7,
            'm': 8, 'n': 8,
            'r': 9
        }
    # verify user-supplied class arguments
    model_dir = self.validate_parameters()
    self.impH = input_helpers.InputHelper()
    # Phonetic Encoder
    self.pe = Ainsworth()
    # Soundex Firstname Algorithm
    self.pshp_soundex_first = PSHPSoundexFirst()
    # Soundex Lastname Algorithm
    self.pshp_soundex_last = PSHPSoundexLast()
    # String Distance algorithms — kept in lock-step with self.algo_names:
    # self.algos[i] is scored under the label self.algo_names[i].
    self.algos = [IterativeSubString(), BISIM(), DiscountedLevenshtein(),
                  Prefix(), LCSstr(), MLIPNS(), Strcmp95(), MRA(),
                  Editex(), SAPS(), FlexMetric(), JaroWinkler(mode='Jaro'),
                  HigueraMico(), Sift4(), Eudex(), ALINE(),
                  CovingtonGuard(), PhoneticEditDistance()]
    self.algo_names = ['iterativesubstring', 'bisim',
                       'discountedlevenshtein', 'prefix', 'lcsstr',
                       'mlipns', 'strcmp95', 'mra', 'editex', 'saps',
                       'flexmetric', 'jaro', 'higueramico', 'sift4',
                       'eudex', 'aline', 'covington',
                       'phoneticeditdistance']
    # String Distance Pipeline (Level 0/Base Model)
    self.baseModel = joblib.load(os.path.join(model_dir, 'base.pkl'))
    # Character Embedding Network (Level 0/Base Model)
    # Vocabulary must match the one the siamese network was trained with.
    self.vocab = preprocess.VocabularyProcessor(max_document_length=15,
                                                min_frequency=0).restore(
        os.path.join(model_dir, 'vocab'))
    siamese_model = os.path.join(model_dir, 'siamese')
    # start tensorflow session
    graph = tf.Graph()
    with graph.as_default() as graph:
        # TF1 vs TF2: under TF2 fall back to the compat.v1 session API.
        self.sess = tf.Session() if tf.__version__[0] == '1' \
            else tf.compat.v1.Session()
        with self.sess.as_default():
            # Load the saved meta graph and restore variables
            if tf.__version__[0] == '1':
                saver = tf.train.import_meta_graph(
                    '{}.meta'.format(siamese_model))
                self.sess.run(tf.global_variables_initializer())
            else:
                saver = tf.compat.v1.train.import_meta_graph(
                    '{}.meta'.format(siamese_model))
                self.sess.run(tf.compat.v1.global_variables_initializer())
            saver.restore(self.sess, siamese_model)
            # Get the placeholders from the graph by name; these tensor
            # handles are reused for every later sess.run() call.
            self.input_x1 = graph.get_operation_by_name(
                'input_x1').outputs[0]
            self.input_x2 = graph.get_operation_by_name(
                'input_x2').outputs[0]
            self.dropout_keep_prob = graph.get_operation_by_name(
                'dropout_keep_prob').outputs[0]
            self.prediction = graph.get_operation_by_name(
                'output/distance').outputs[0]
            self.sim = graph.get_operation_by_name(
                'accuracy/temp_sim').outputs[0]
    # Logreg (Level 1/Meta Model)
    self.metaModel = joblib.load(os.path.join(model_dir, 'meta.pkl'))
    # seen names (mapping dict from raw name to processed name)
    self.seen_names = {}
    # seen pairs (mapping dict from name pair tuple to similarity)
    self.seen_pairs = {}
    # user scores (mapping dict from name pair tuple to similarity)
    self.user_scores = {}