def test_crossover(self):
    """Cross over two rules (swap change + contexts) and print both offspring energies."""
    from copy import deepcopy

    rule_a = Rule.load([[{'cont': '+'}], [{'coronal': '-'}], [{'coronal': '-'}], [], True])
    rule_b = Rule.load([[{'cons': '+', 'low': '-'}], [{'voice': '-'}], [{'voice': '-'}], [], True])

    offspring_a = deepcopy(rule_a)
    offspring_b = deepcopy(rule_b)

    # Offspring A keeps rule_a's target bundle but takes rule_b's change and contexts.
    offspring_a.left_context_feature_bundle_list = rule_b.left_context_feature_bundle_list
    offspring_a.right_context_feature_bundle_list = rule_b.right_context_feature_bundle_list
    offspring_a.change_feature_bundle_list = rule_b.change_feature_bundle_list

    # Offspring B gets the complementary swap.
    offspring_b.left_context_feature_bundle_list = rule_a.left_context_feature_bundle_list
    offspring_b.right_context_feature_bundle_list = rule_a.right_context_feature_bundle_list
    offspring_b.change_feature_bundle_list = rule_a.change_feature_bundle_list

    rules_a = RuleSet([offspring_a])
    rules_b = RuleSet([offspring_b])
    print(rules_a)
    print(rules_b)

    hmm = HMM({
        'q0': ['q1'],
        'q1': (['q2', 'qf'], ['dag', 'kat', 'dot', 'kod']),
        'q2': (['qf'], ['zo', 'go', 'do'])
    })

    data = ['kat', 'dot', 'dag', 'kod']
    data += ['katso', 'dotso', 'dagzo', 'kodzo']
    data += ['katko', 'dotko', 'daggo', 'kodgo']
    data += ['katto', 'dotto', 'dagdo', 'koddo']

    print(Hypothesis(Grammar(hmm, rules_a), data).get_energy())
    print(Hypothesis(Grammar(hmm, rules_b), data).get_energy())
def get_energy(self, hmm, rule_set_list, case_name):
    """Build a hypothesis from `hmm` plus flat rules over self.data; print its
    energy signature labelled with `case_name` and return the energy."""
    hypo = Hypothesis(Grammar(hmm, RuleSet(rule_set_list)), self.data)
    result = hypo.get_energy()
    print("{}: {}".format(case_name, hypo.get_recent_energy_signature()))
    return result
def test_opacity_two_hypotheses(self):
    """Print the energy of a two-rule grammar on the dag_zook_opacity simulation."""
    from simulations import dag_zook_opacity as simulation
    self.initialise_simulation(simulation)

    stem_emissions = [
        'daot', 'dkoz', 'dog', 'dok', 'gdaas', 'gkas', 'kaos', 'kat',
        'kood', 'ksoag', 'ogtd', 'oktdo', 'skaz', 'tak', 'tso'
    ]
    hmm = HMM({
        'q0': ['q1'],
        'q1': (['q2', 'q3'], stem_emissions),
        'q2': (['qf'], ['go', 'kazka', 'soka', 'ta', EPSILON]),
        'q3': (['qf'], ['da', 'saat', 'tsk', 'zoka'])
    })

    epenthesis_rule = Rule([], [{'low': '+'}], [{'coronal': '+'}], [{'coronal': '+'}], True)
    assimilation_rule = Rule([{'cons': '+'}], [{'voice': '-'}], [{'voice': '-'}], [], True)
    # NOTE(review): assimilation is deliberately ordered before epenthesis here.
    grammar = Grammar(hmm, RuleSet([assimilation_rule, epenthesis_rule]))
    print(Hypothesis(grammar).get_energy())
def get_energy(self, simulation_case):
    """Evaluate one simulation case: build its grammar (accepting either
    ready-made HMM/RuleSet objects or flat dict/list specs), dump dot files,
    print the energy signature, and return the energy."""
    case_name = simulation_case.case_name
    configuration.configurations_dict["case_name"] = case_name

    hmm_spec = simulation_case.hmm_dict
    hmm = hmm_spec if isinstance(hmm_spec, HMM) else HMM(hmm_spec)

    rules_spec = simulation_case.flat_rule_set_list
    if isinstance(rules_spec, RuleSet):
        rule_set = rules_spec
    else:
        rule_set = RuleSet([Rule(*flat_rule) for flat_rule in rules_spec])

    grammar = Grammar(hmm, rule_set)
    self.write_to_dot_to_file(hmm, "hmm_" + case_name)
    self.write_to_dot_to_file(grammar.get_nfa(), "grammar_nfa_" + case_name)

    hypothesis = Hypothesis(grammar, self.data)
    energy = hypothesis.get_energy()
    if self.target_energy:
        # When a target energy is known, also report how far this case is from it.
        print("{}: {} distance from target: {}".format(
            case_name, hypothesis.get_recent_energy_signature(),
            energy - self.target_energy))
    else:
        print("{}: {}".format(case_name, hypothesis.get_recent_energy_signature()))
    return energy
def test_turkish_blah(self):
    """Build a Turkish vowel-harmony hypothesis; the energy comparison is disabled below."""
    self.initialise_simulation(turkish_vowel_harmony_new_weights)

    suffixes = ['in', 'ler', 'siz', 'i', 'ten', 'sel', 'lik', 'li', 'e', EPSILON]
    stems = [
        'el', 'j1l', 'ek', 'ip', 'renk', 'son', 'et', 'josun', 'kedi',
        'kent', 'k0j', 'k0k', 'sokak', 'tuz', 'dal', 'gyn', 'kirpi',
        'k1z', 's1rtlan', 'g0z', 'kurt', 'aj', 'arp'
    ]
    hmm_dict = {
        'q0': ['q1'],
        'q1': (['q2'], stems),
        'q2': (['qf'], suffixes),
    }
    some_hmm = HMM(deepcopy(hmm_dict))
    some_rules = RuleSet([
        Rule([{"syll": "+"}], [{"back": "+"}],
             [{"cont": "+", "back": "+"}, {"syll": "-", "kleene": True}],
             [], True)
    ])
    some_hypo = Hypothesis(Grammar(some_hmm, some_rules))
    # self.assert_equal_no_infs(self.get_target_hypo().get_energy(), some_hypo.get_energy())
def test_abadnese(self):
    """Labial-assimilation grammar over the abadnese corpus: energy must equal 245."""
    self.initialise_segment_table("abd_segment_table.txt")
    data = [
        'bbabbba', 'baabbba', 'babbbba', 'bbabadba', 'baabadba',
        'babbadba', 'bbabbad', 'baabbad', 'babbbad', 'bbabadad',
        'baabadad', 'babbadad', 'bbabbab', 'baabbab', 'babbbab',
        'bbabadab', 'baabadab', 'babbadab'
    ]
    hmm = HMM({
        'q0': ['q1'],
        'q1': (['q2', 'qf'], ['bbab', 'baab', 'babb', 'bbaba', 'baaba', 'babba']),
        'q2': (['qf'], ['dba', 'dad', 'dab'])
    })
    rule_set = RuleSet([
        Rule.load([[{"cons": "+"}], [{"labial": "+"}], [{"labial": "+"}], [], True])
    ])
    hypothesis = Hypothesis(Grammar(hmm, rule_set), data)
    self.assertEqual(hypothesis.get_energy(), 245)
def test_turkish__only_syll_is_the_correct_context(self):
    """Sweep every feature/value kleene context except the known-correct
    ('syll', '-') and assert none of them reaches the target energy."""
    self.initialise_simulation(turkish_vowel_harmony_new_weights)

    stems = [
        'el', 'j1l', 'ek', 'ip', 'renk', 'son', 'et', 'josun', 'kedi',
        'kent', 'k0j', 'k0k', 'sokak', 'tuz', 'dal', 'gyn', 'kirpi',
        'k1z', 's1rtlan', 'g0z', 'kurt', 'aj', 'arp'
    ]
    # +syll --> +back (front-vowel suffix forms)
    hmm_dict = {
        'q0': ['q1'],
        'q1': (['q2'], stems),
        'q2': (['qf'], ['in', 'ler', 'siz', 'i', 'ten', 'sel', 'lik', 'li', 'e', EPSILON]),
    }
    rule_change = ([{"syll": "+"}], [{"back": "+"}])
    # +syll --> -back (back-vowel suffix forms)
    hmm_dict2 = {
        'q0': ['q1'],
        'q1': (['q2'], stems),
        'q2': (['qf'], ['1n', 'lar', 's1z', '1', 'tan', 'sal', 'l1k', 'l1', 'a', EPSILON]),
    }
    rule_change2 = ([{"syll": "+"}], [{"back": "-"}])

    target_energy = self.get_target_hypo().get_energy()
    unexpected_contexts = []
    for feat in 'syll,back,round,high,voice,cont,lateral,son'.split(','):
        for val in ['+', '-']:
            if (feat, val) == ('syll', '-'):
                continue  # the correct context itself — excluded from the sweep
            for r, change in enumerate([rule_change, rule_change2], start=1):
                for h, hmm in enumerate([hmm_dict, hmm_dict2], start=1):
                    some_hmm = HMM(deepcopy(hmm))
                    # Context: a vowel agreeing in backness, then the swept
                    # feature under kleene star.
                    rule = change + (
                        [{"syll": "+", "back": change[1][0]['back']},
                         {feat: val, "kleene": True}],
                        [], True)
                    some_hypo = Hypothesis(Grammar(some_hmm, RuleSet([Rule(*rule)])))
                    if some_hypo.get_energy() <= target_energy:
                        unexpected_contexts.append({f"hmm{h} rule {r}": {feat: val}})
    assert unexpected_contexts == [], f"Unexpected kleene context for rule: {unexpected_contexts}"
def get_energy(self, hmm, rule_set_list, case_name):
    """Build a grammar from `hmm` and flat rules, dump its NFA to a dot file,
    print the energy signature labelled with `case_name`, and return the energy."""
    grammar = Grammar(hmm, RuleSet(rule_set_list))
    self.write_to_dot_file(grammar.get_nfa(), "grammar_nfa_" + case_name)
    hypo = Hypothesis(grammar, self.data)
    result = hypo.get_energy()
    print("{}: {}".format(case_name, hypo.get_recent_energy_signature()))
    return result
def test_plural_english_hypothesis(self):
    """Plural-English rule set over a four-word corpus: integer energy must be 117."""
    self.initialise_segment_table("plural_english_segment_table.txt")
    self.rule_set = self.get_rule_set("plural_english_rule_set.json")
    corpus = ['kats', 'dogz', 'kat', 'dog']
    hmm = HMM({
        INITIAL_STATE: ['q1'],
        'q1': (['q2', FINAL_STATE], ['dog', 'kat']),
        'q2': ([FINAL_STATE], ['z'])
    })
    grammar = Grammar(hmm, self.rule_set)
    self.write_to_dot_file(self.rule_set.rules[0].get_transducer(), "plural_english_rule")
    self.write_to_dot_file(grammar.get_nfa(), "grammar_nfa")
    self.configurations.simulation_data = corpus
    hypothesis = Hypothesis(grammar)
    self.assertEqual(int(hypothesis.get_energy()), 117)
def test_abadnese_no_rule(self):
    """Abadnese baseline with an empty rule set: integer energy must be 243."""
    self.initialise_segment_table("abd_segment_table.txt")
    data = [
        'bbabbba', 'baabbba', 'babbbba', 'bbabadba', 'baabadba',
        'babbadba', 'bbabbad', 'baabbad', 'babbbad', 'bbabadad',
        'baabadad', 'babbadad', 'bbabbab', 'baabbab', 'babbbab',
        'bbabadab', 'baabadab', 'babbadab'
    ]
    # With no rule, the suffix state must emit every surface suffix variant.
    hmm = HMM({
        'q0': ['q1'],
        'q1': (['q2'], ['bbab', 'baab', 'babb', 'bbaba', 'baaba', 'babba']),
        'q2': (['qf'], ['dba', 'dad', 'dab', 'bba', 'bad', 'bab'])
    })
    grammar = Grammar(hmm, [])
    self.configurations.simulation_data = data
    hypothesis = Hypothesis(grammar)
    self.assertEqual(int(hypothesis.get_energy()), 243)
def test_assimilation_no_rule(self):
    """Assimilation baseline with an empty rule set: integer energy must be 230."""
    self.initialise_segment_table("plural_english_segment_table.txt")
    data = (['kat', 'dot', 'dag', 'kod']
            + ['katso', 'dotso', 'dagzo', 'kodzo']
            + ['katko', 'dotko', 'daggo', 'kodgo']
            + ['katto', 'dotto', 'dagdo', 'koddo'])
    # With no rule, q2 must emit all six surface suffix variants directly.
    hmm = HMM({
        'q0': ['q1'],
        'q1': (['q2', 'qf'], ['dag', 'kat', 'dot', 'kod']),
        'q2': (['qf'], ['zo', 'go', 'do', 'to', 'so', 'ko'])
    })
    grammar = Grammar(hmm, [])
    hypothesis = Hypothesis(grammar)
    self.configurations.simulation_data = data
    self.assertEqual(int(hypothesis.get_energy()), 230)
def test_assimilation2(self):
    """Repeatedly evaluate the plural-English grammar's energy (no assertion)."""
    self.initialise_segment_table("plural_english_segment_table.txt")
    self.rule_set = self.get_rule_set("plural_english_rule_set.json")
    data = (['kat', 'dot', 'dag', 'kod']
            + ['katso', 'dotso', 'dagzo', 'kodzo']
            + ['katko', 'dotko', 'daggo', 'kodgo']
            + ['katto', 'dotto', 'dagdo', 'koddo'])
    hmm = HMM({
        'q0': ['q1'],
        'q1': (['q2', 'qf'], ['dag', 'kat', 'dot', 'kod']),
        'q2': (['qf'], ['zo', 'go', 'do'])
    })
    grammar = Grammar(hmm, self.rule_set)
    self.configurations.simulation_data = data
    hypothesis = Hypothesis(grammar)
    # Original comment here was "1.4" — presumably a timing note; verify.
    for _ in range(10):
        energy = hypothesis.get_energy()
def test_abadnese_no_rule(self):
    """Abadnese baseline with an empty rule set and explicit data: energy must be 252."""
    self.initialise_segment_table("abd_segment_table.txt")
    data = [
        'bbabbba', 'baabbba', 'babbbba', 'bbabadba', 'baabadba',
        'babbadba', 'bbabbad', 'baabbad', 'babbbad', 'bbabadad',
        'baabadad', 'babbadad', 'bbabbab', 'baabbab', 'babbbab',
        'bbabadab', 'baabadab', 'babbadab'
    ]
    # With no rule, the suffix state must emit every surface suffix variant.
    hmm = HMM({
        'q0': ['q1'],
        'q1': (['q2', 'qf'], ['bbab', 'baab', 'babb', 'bbaba', 'baaba', 'babba']),
        'q2': (['qf'], ['dba', 'dad', 'dab', 'bba', 'bad', 'bab'])
    })
    hypothesis = Hypothesis(Grammar(hmm, []), data)
    self.assertEqual(hypothesis.get_energy(), 252)
def test_abnese(self):
    """Abnese epenthesis grammar: print rule outputs, energy, and its signature."""
    self.initialise_segment_table("ab_segment_table.txt")
    self.configurations["BRACKET_TRANSDUCER"] = True
    data = ['bab', 'aabab']
    hmm = HMM({'q0': ['q1'], 'q1': (['qf'], ['bb', 'aabb'])})
    # e->a / b_b
    rule_set = RuleSet([
        Rule([], [{"cons": "-"}], [{"cons": "+"}], [{"cons": "+"}], False)
    ])
    print(rule_set.get_outputs_of_word("bb"))
    grammar = Grammar(hmm, rule_set)
    self.write_to_dot_file(grammar.get_nfa(), "grammar_nfa")
    self.configurations.simulation_data = data
    hypothesis = Hypothesis(grammar)
    print(hypothesis.get_energy())
    print(hypothesis.get_recent_energy_signature())
def test_katso_two_rule(self):
    """Two-rule grammar (epenthesis then assimilation): integer energy must be 364."""
    # configurations["DATA_ENCODING_LENGTH_MULTIPLIER"] = 25
    self.initialise_segment_table("plural_english_segment_table.txt")
    data = (['kat', 'dot', 'dag', 'kod', 'gas', 'toz']
            + ['katso', 'dotso', 'dagzo', 'kodzo', 'gasazo', 'tozazo']
            + ['katko', 'dotko', 'daggo', 'kodgo', 'gasko', 'tozgo']
            + ['katto', 'dotto', 'dagdo', 'koddo', 'gasto', 'tozdo'])
    hmm = HMM({
        'q0': ['q1'],
        'q1': (['q2', 'qf'], ['dag', 'kat', 'dot', 'kod', 'gas', 'toz']),
        'q2': (['qf'], ['zo', 'go', 'do'])
    })
    epenthesis_rule = Rule.load([[], [{"cons": "-", "low": "+"}],
                                 [{"cons": "+", "cont": "+"}],
                                 [{"cons": "+", "cont": "+"}], True])
    assimilation_rule = Rule.load([[{"cons": "+"}], [{"voice": "-"}],
                                   [{"voice": "-"}], [], True])
    grammar = Grammar(hmm, RuleSet([epenthesis_rule, assimilation_rule]))
    self.write_to_dot_file(grammar.get_nfa(), "grammar_nfa")
    self.configurations.simulation_data = data
    hypothesis = Hypothesis(grammar)
    self.assertEqual(int(hypothesis.get_energy()), 364)
'q1': ['q2'], 'q2': [FINAL_STATE] } target_hmm_emissions = { 'q1': [prefix], 'q2': replace_bab_with_bb_for_everty_word(target_stems) } target_hmm_inner_states = ['q1', 'q2'] target_hmm = HMM(target_hmm_transitions, target_hmm_emissions, target_hmm_inner_states) # dot(target_hmm, 'complex_morphology_hmm') target_lexicon = Lexicon([], max_word_length_in_data, initial_hmm=target_hmm) dot(target_hmm, 'abnese_target') target_grammar = Grammar(target_constraint_set, target_lexicon) target_hypothesis = Hypothesis(target_grammar, data) target_energy = target_hypothesis.get_energy() print(f"target hypothesis: {target_hypothesis.get_recent_energy_signature()}") print_empty_line() #words initial hypothesis words_initial_lexicon = Lexicon(data, max_word_length_in_data, alphabet_or_words="words") dot(words_initial_lexicon.hmm, 'abnese_initial_hmm') words_initial_grammar = Grammar(faith_constraint_set, words_initial_lexicon) words_initial_hypothesis = Hypothesis(words_initial_grammar, data) words_initial_energy = words_initial_hypothesis.get_energy() print( f"words initial hypothesis: {words_initial_hypothesis.get_recent_energy_signature()}" )
# Build the corpus: bare stems plus stem+suffix combinations.
# NOTE(review): `target_stems` and `surface_suffixes` are defined earlier in
# this script (outside this chunk) — confirm before moving this code.
target_stems_and_suffixes = get_target_stems_and_suffixes(target_stems, surface_suffixes)
corpus = target_stems + target_stems_and_suffixes
corpus = Corpus(corpus)
data = corpus.get_words()
max_word_length_in_data = max([len(word) for word in data])

# initial hypothesis: lexicon built directly from the data with the initial
# constraint set; its energy is the starting point of the search.
initial_lexicon = Lexicon(data, max_word_length_in_data)
initial_grammar = Grammar(initial_constraint_set, initial_lexicon)
initial_hypothesis = Hypothesis(initial_grammar, data)
initial_energy = initial_hypothesis.get_energy()
print(f"initial hypothesis: {initial_hypothesis.get_recent_energy_signature()}")
print(f"initial energy: {initial_energy}")
print_empty_line()

# Target HMM: q1 emits the stems, q2 emits the suffix "kun"; q1 may go
# straight to the final state (bare stem) or pass through q2 (suffixed form).
target_hmm_transitions = {INITIAL_STATE: ["q1"], "q1": ["q2", FINAL_STATE], "q2": [FINAL_STATE]}
target_hmm_emissions = {"q1": target_stems, "q2": ["kun"]}
target_hmm_inner_states = ["q1", "q2"]
target_hmm = HMM(target_hmm_transitions, target_hmm_emissions, target_hmm_inner_states)
# Empty word list: the lexicon is seeded entirely from the hand-built HMM.
target_lexicon = Lexicon([], max_word_length_in_data, initial_hmm=target_hmm)