def test_kleene_star(self):
    """A Kleene-starred context segment lets the change apply across a run."""
    self.initialise_segment_table("plural_english_segment_table.txt")
    self.configurations["CHANGE_KLEENE_VALUE"] = True
    kleene_rule = Rule(
        [{"cons": "-"}],
        [{"low": "+"}],
        [{"cons": "-"}, {"cons": "+", "kleene": True}],
        [],
        obligatory=True,
    )
    rules = RuleSet([kleene_rule])
    # One consonant and a run of three are both matched by the starred context.
    self.assertCountEqual(rules.get_outputs_of_word("ato"), ['ata'])
    self.assertCountEqual(rules.get_outputs_of_word("attto"), ['attta'])
    def test_morpheme_boundary(self):
        """Rules whose right context references the morpheme boundary."""
        self.initialise_segment_table("abnese_lengthening_segment_table.txt")
        # Insertion immediately before the boundary.
        boundary_rule = Rule([], [{"long": "+"}], [], [{"bound": "+"}], obligatory=True)
        self.assertEqual(RuleSet([boundary_rule]).get_outputs_of_word("abB"), [u'abYB'])

        # Same change, but one segment away from the boundary.
        shifted_rule = Rule([], [{"long": "+"}], [], [{}, {"bound": "+"}], obligatory=True)
        self.assertEqual(RuleSet([shifted_rule]).get_outputs_of_word("abB"), [u'aYbB'])
 def test_kleene_star(self):
     """Print outputs for a rule with a Kleene-starred context segment."""
     self.initialise_segment_table("plural_english_segment_table.txt")
     star_rule = Rule(
         [{"cons": "-"}],
         [{"low": "+"}],
         [{"cons": "-"}, {"cons": "+", "kleene": True}],
         [],
         obligatory=True,
     )
     rules = RuleSet([star_rule])
     print(rules.get_outputs_of_word("ato"))  # -> ata
     print(rules.get_outputs_of_word("atttto"))  # -> atttta
 def test_phi_ro_identity(self):
     """Identity rule (target feature set == change) between consonants."""
     self.initialise_segment_table("ab_segment_table.txt")
     identity_rule = Rule(
         [{"cons": "-"}],
         [{"cons": "-"}],
         [{"cons": "+"}],
         [{"cons": "+"}],
         obligatory=True,
     )
     rules = RuleSet([identity_rule])
     print(rules.get_outputs_of_word("bb"))  # should be bb, instead []
     print(rules.get_outputs_of_word("bab"))  # should be 'bab' instead [u'bab', u'bab']
    def setUp(self):
        """Configure underspecification flags and build a sample HMM and rule."""
        configurations["DATA_ENCODING_LENGTH_MULTIPLIER"] = 25
        configurations["MORPHEME_BOUNDARY_FLAG"] = True
        configurations["UNDERSPECIFICATION_FLAG"] = True
        self.initialise_segment_table("underspecification_segment_table.txt")
        self.data = ['dat', 'tat', 'da', 'ta']

        hmm = HMM({
            'q0': ['q1'],
            'q1': (['q2', 'qf'],
                   ['dag', 'kat', 'dot', 'kod', 'gas', 'toz', 'kta', 'dgo',
                    'skoz', 'gdas']),
            'q2': (['qf'], ['zook', 'gos', 'dod', 'sad']),
        })

        # [0voice] -> [-voice] before a morpheme boundary (obligatory).
        devoicing_rule = Rule([{"voice": "0"}], [{"voice": "-"}], [],
                              [{"bound": "+"}], True)
        devoicing_rule.get_transducer()
        print(devoicing_rule.get_segment_representation())
        rules = RuleSet([devoicing_rule])

        print(rules.get_outputs_of_word('daTB'))
 def test_insertion_with_right_context_only2(self):
     """Insertion driven purely by a two-segment right context."""
     configurations["SINGLE_CONTEXT_TRANSDUCER_FLAG"] = True
     self.initialise_segment_table("abd_segment_table.txt")
     epenthesis_rule = Rule([], [{"cons": "-"}], [],
                            [{"cons": "+", "labial": "+"},
                             {"cons": "+", "labial": "-"}],
                            obligatory=True)
     self.assertCountEqual(RuleSet([epenthesis_rule]).get_outputs_of_word('bdbd'),
                           ['abdabd'])
 def test_vicky(self):
     """Print the outputs of an insertion rule with a [-voice] left context."""
     self.initialise_segment_table("plural_english_segment_table.txt")
     insertion_rule = Rule([], [{"voice": "-"}], [{"voice": "-"}], [],
                           obligatory=True)
     print(RuleSet([insertion_rule]).get_outputs_of_word("dot"))
    def test_rule_application_direction(self):
        """Check whether rules re-apply once their own output feeds the context."""
        self.initialise_segment_table("turkish_segment_table.txt")
        fronting_rule = Rule([{"syll": "+"}], [{"back": "-"}],
                             [{"syll": "+", "back": "-"}], [],
                             obligatory=True)
        rules = RuleSet([fronting_rule])

        # TODO: this should be replaced with:
        # self.assertEqual(rules.get_outputs_of_word("i1a"), ['iia'])
        # I have no idea why `iie` returns here as well, but this is a bug.
        self.assertIn('iia', rules.get_outputs_of_word("i1a"))
 def test_assimilation(self):
     """Print outputs of a [+cons] -> [-voice] rule after a [-voice] segment."""
     self.initialise_segment_table("plural_english_segment_table.txt")
     assimilation_rule = Rule([{"cons": "+"}], [{"voice": "-"}],
                              [{"voice": "-"}], [],
                              obligatory=True)
     print(RuleSet([assimilation_rule]).get_outputs_of_word("tz"))
 def test_abnese_insertion(self):
     """Insert a [-cons] segment between two [+cons] segments; print outputs."""
     self.initialise_segment_table("ab_segment_table.txt")
     epenthesis_rule = Rule([], [{"cons": "-"}], [{"cons": "+"}],
                            [{"cons": "+"}],
                            obligatory=True)
     print(RuleSet([epenthesis_rule]).get_outputs_of_word("aabb"))
 def test_insertion_with_right_context_only_2(self):
     """Insertion conditioned only on a two-consonant right context."""
     configurations["SINGLE_CONTEXT_TRANSDUCER_FLAG"] = True
     self.initialise_segment_table("ab_segment_table.txt")
     insertion_rule = Rule([], [{"cons": "-"}], [],
                           [{"cons": "+"}, {"cons": "+"}],
                           obligatory=True)
     print(RuleSet([insertion_rule]).get_outputs_of_word('bbbb'))
 def test_rule_application_direction(self):
     """Print outputs to inspect recursive application as the context changes."""
     self.initialise_segment_table("turkish_segment_table.txt")
     fronting_rule = Rule([{"cons": "-"}], [{"back": "-"}],
                          [{"cons": "-", "back": "-"}], [],
                          obligatory=True)
     print(RuleSet([fronting_rule]).get_outputs_of_word("i1a"))  # -> iia
 def test_2(self):
     """Run a two-rule cascade over 'daadt' and print the outputs."""
     # Argument layout: target, change, left context, right context, obligatory.
     atr_rule_args = [[], [{"ATR": "+"}], [{"coronal": "+"}],
                      [{"coronal": "+"}], True]
     devoicing_rule_args = [[{"voice": "+"}], [{"voice": "-"}],
                            [{"voice": "-"}], [], True]
     cascade = RuleSet([Rule(*atr_rule_args), Rule(*devoicing_rule_args)])
     outputs = cascade.get_outputs_of_word('daadt')
     print(outputs)
    def test_abnese(self):
        """Build an 'abnese' grammar and print its hypothesis energy."""
        self.initialise_segment_table("ab_segment_table.txt")
        self.configurations["BRACKET_TRANSDUCER"] = True
        data = ['bab', 'aabab']

        hmm = HMM({'q0': ['q1'],
                   'q1': (['qf'], ['bb', 'aabb'])})
        # Optional epenthesis: e->a / b_b (insert [-cons] between [+cons]).
        epenthesis_rule = Rule([], [{"cons": "-"}], [{"cons": "+"}],
                               [{"cons": "+"}], False)
        rules = RuleSet([epenthesis_rule])

        print(rules.get_outputs_of_word("bb"))

        grammar = Grammar(hmm, rules)
        self.write_to_dot_file(grammar.get_nfa(), "grammar_nfa")
        self.configurations.simulation_data = data
        hypothesis = Hypothesis(grammar)

        print(hypothesis.get_energy())
        print(hypothesis.get_recent_energy_signature())
    def test_abadnese_for_ezer(self):
        """Build an 'abadnese' grammar with insertion between labial consonants."""
        self.initialise_segment_table("abd_segment_table.txt")
        data = ['aabad', 'abad', 'badaabad', 'aba', 'aaba', 'badaa']

        hmm = HMM({
            'q0': ['q1'],
            'q1': (['qf'], ['aabd', 'abd', 'bdaabd', 'aba', 'aaba', 'bdaa'])
        })
        # Insert an unrestricted segment between two [+cons, +labial] segments.
        insertion_rule = Rule([], [{}],
                              [{"cons": "+", "labial": "+"}],
                              [{"cons": "+", "labial": "+"}],
                              obligatory=True)
        rules = RuleSet([insertion_rule])

        print(rules.get_outputs_of_word("abb"))

        grammar = Grammar(hmm, rules)
        hypothesis = Hypothesis(grammar, data)
# Example #16
class Grammar:
    """An HMM paired with a phonological rule set and configured noise rules.

    Transducers for the HMM, the rule set, and the noise rule set are built
    lazily and cached; code that mutates one of these components must
    invalidate the corresponding cache afterwards.
    """

    def __init__(self, hmm, rule_set=None):
        # Accept either a ready-made HMM instance or a raw description of one.
        if isinstance(hmm, HMM):
            self.hmm = hmm
        else:
            self.hmm = HMM(hmm)
        segment_table = SegmentTable()
        self.segment_symbol_length = uniform_encoding.log2(len(segment_table) + 1)  # + 1 for the delimiter
        if rule_set:
            self.rule_set = rule_set
        else:
            self.rule_set = RuleSet(noise=False)

        # Noise rules are loaded from configuration, independent of rule_set.
        noises = configurations.get("NOISE_RULE_SET", [])
        self.noise_rule_set = RuleSet.load_noise_rules_from_flat_list(noises)

        # Lazily-built transducer caches; None means "not built yet".
        self._cached_hmm_transducer = None
        self._cached_rule_set_transducer = None
        self._cached_noise_rule_set_transducer = None

    def generate_word(self):
        """Sample an HMM emission and return one surface form derived from it."""
        emission = self.hmm.generate_emission()
        return choice(self.rule_set.get_outputs_of_word(emission))

    def generate_all_words(self):
        """Return all surface forms derivable from every HMM emission."""
        # TODO: I think this also generates noised data. This should be fixed.
        words = []
        emissions = self.hmm.generate_all_emissions()
        for emission in emissions:
            words += self.rule_set.get_outputs_of_word(emission)
        return words

    def get_transducer(self, with_noise=True):
        """Compose the HMM, rule-set, and (optionally) noise transducers."""
        hmm_transducer = self.get_hmm_transducer()

        # Dump the HMM transducer to a dot file when a case name is configured.
        if "case_name" in configurations.configurations_dict:
            case_name = configurations.configurations_dict["case_name"]
            dot(hmm_transducer, "{}_hmm_transducer".format(case_name))
        rules_set_transducer = self.get_rule_set_transducer()
        if with_noise:
            noise_rules_transducer = self.get_noise_rule_set_transducer()
        else:
            noise_rules_transducer = None

        return self._compose_grammar_transducers(
            hmm_transducer, rules_set_transducer, noise_rules_transducer
        )

    def _compose_grammar_transducers(self, first_transducer, *other_transducers):
        """Left-fold composition over the transducers, skipping falsy ones."""
        composed_transducer = first_transducer
        for transducer in other_transducers:
            if transducer:
                # Both operands are input-arc-sorted before composition.
                composed_transducer.arc_sort_input()
                transducer.arc_sort_input()
                composed_transducer = composed_transducer >> transducer
        return composed_transducer

    def get_nfa(self):
        """Build the full grammar transducer and convert it to a parsing NFA."""
        grammar_pyfst_transducer = self.get_transducer()
        # dot(grammar_pyfst_transducer, "grammar_pyfst_transducer")
        # grammar_pyfst_transducer.remove_epsilon()
        return ParsingNFA.get_from_pyfst_transducer(grammar_pyfst_transducer)

    def make_mutation(self):
        """Mutate the HMM and/or rule set; return True if anything changed.

        Invalidates the cached transducer of whichever component mutated.
        """
        mutation_successful = False

        if ga_config.MUTATE_BOTH_HMM_AND_RULES:
            # Attempt to mutate both components in the same step.
            hmm_mutation_successful = False
            rule_set_mutation_successful = False

            if configurations["EVOLVE_HMM"]:
                hmm_mutation_successful = self.hmm.make_mutation()
            if configurations["EVOLVE_RULES"]:
                rule_set_mutation_successful = self.rule_set.make_mutation()

            mutation_successful = mutation_successful or rule_set_mutation_successful or hmm_mutation_successful

            if hmm_mutation_successful:
                self.invalidate_cached_hmm_transducer()
            if rule_set_mutation_successful:
                self.invalidate_cached_rule_set_transducer()

        else:
            # Pick exactly one component to mutate, weighted by configuration.
            rule_set_mutation_weight = 0 if not configurations["EVOLVE_RULES"] else configurations["MUTATE_RULE_SET"]
            hmm_mutation_weight = 0 if not configurations["EVOLVE_HMM"] else configurations["MUTATE_HMM"]

            mutation_weights = [('rule_set', rule_set_mutation_weight),
                                ('hmm', hmm_mutation_weight)]

            weighted_mutatable_object_list = get_weighted_list(mutation_weights)
            object_name_to_mutate = choice(weighted_mutatable_object_list)
            if object_name_to_mutate == 'rule_set':
                object_to_mutate = self.rule_set
            elif object_name_to_mutate == 'hmm':
                object_to_mutate = self.hmm
            mutation_successful = object_to_mutate.make_mutation()

            if mutation_successful:
                if object_name_to_mutate == 'hmm':
                    self.invalidate_cached_hmm_transducer()
                elif object_name_to_mutate == 'rule_set':
                    self.invalidate_cached_rule_set_transducer()

        return mutation_successful

    def get_encoding_length(self):
        """Return the pair (hmm encoding length, rules encoding length)."""
        if not configurations["UNDERSPECIFICATION_FLAG"]:
            hmm_encoding_length = self.hmm.get_encoding_length(self.segment_symbol_length,
                                                               restrictions_on_alphabet=configurations["RESTRICTIONS_ON_ALPHABET"])
        else:
            hmm_encoding_length = self.hmm.get_underspecified_encoding_length()
        rules_encoding_length = self.rule_set.get_encoding_length()
        return hmm_encoding_length, rules_encoding_length

    def generate_word_list(self, n):
        """Sample n words via generate_word() (repetition possible)."""
        result = []
        for _ in range(n):
            result.append(self.generate_word())
        return result

    def get_all_outputs(self, with_noise=True):
        """Enumerate the output string of every path through the transducer."""
        transducer = self.get_transducer(with_noise=with_noise)
        if configurations["MINIMIZE_TRANSDUCER"]:
            transducer = self.minimize_transducer(transducer)

        transducer_symbol_table = SegmentTable().transducer_symbol_table
        outputs = list()
        for path in transducer.paths():
            output = ""
            for i in path:
                symbol = transducer_symbol_table.find(i.olabel)
                # Skip epsilon and boundary markers; keep real segments only.
                if symbol != u"\u03b5" and symbol != MORPHEME_BOUNDARY and symbol != WORD_BOUNDARY:
                    output += symbol
            outputs.append(output)
        return outputs

    def get_hmm_transducer(self):
        """Return the HMM transducer, building and caching it on first use."""
        if self._cached_hmm_transducer is None:
            self._cached_hmm_transducer = self.hmm.get_transducer()
        return self._cached_hmm_transducer

    def get_rule_set_transducer(self):
        """Return the rule-set transducer, building and caching it on first use."""
        if self._cached_rule_set_transducer is None:  # rule set transducer may be None
            self._cached_rule_set_transducer = self.rule_set.get_transducer()

        return self._cached_rule_set_transducer

    def get_noise_rule_set_transducer(self):
        """Return the noise-rule transducer, building and caching it on first use."""
        if self._cached_noise_rule_set_transducer is None:
            self._cached_noise_rule_set_transducer = self.noise_rule_set.get_transducer()
        return self._cached_noise_rule_set_transducer

    def get_log_lines(self):
        """Concatenate the HMM's log lines with the rule set's log lines."""
        return self.hmm.get_log_lines() + self.rule_set.get_log_lines()

    def invalidate_cached_hmm_transducer(self):
        # Force get_hmm_transducer() to rebuild on the next call.
        self._cached_hmm_transducer = None

    def invalidate_cached_rule_set_transducer(self):
        # Force get_rule_set_transducer() to rebuild on the next call.
        self._cached_rule_set_transducer = None

    @staticmethod
    def minimize_transducer(transducer):
        """Project onto output labels, then determinize and minimize."""
        transducer.project_output()
        transducer = transducer.determinize()
        transducer.minimize()
        return transducer

    def __getstate__(self):
        # Don't pickle cached transducers
        state = self.__dict__.copy()
        state['_cached_hmm_transducer'] = None
        state['_cached_rule_set_transducer'] = None
        state['_cached_noise_rule_set_transducer'] = None
        return state
 def test_phi_ro_identity(self):
     """An identity rule between consonants must leave words unchanged."""
     self.initialise_segment_table("ab_segment_table.txt")
     identity_rule = Rule([{"cons": "-"}], [{"cons": "-"}],
                          [{"cons": "+"}], [{"cons": "+"}], obligatory=True)
     rules = RuleSet([identity_rule])
     self.assertCountEqual(rules.get_outputs_of_word("bb"), ["bb"])
     self.assertCountEqual(rules.get_outputs_of_word("bab"), ["bab"])
def print_rule_word_outputs(rule, word):
    """Return the outputs of applying *rule* (as a singleton RuleSet) to *word*."""
    return RuleSet([rule]).get_outputs_of_word(word)
# Example #19
class Grammar:
    """An HMM paired with a phonological rule set.

    Transducers are rebuilt on every call to get_transducer() (no caching).
    """

    def __init__(self, hmm, rule_set=None):
        # Accept either a ready-made HMM instance or a raw description of one.
        if isinstance(hmm, HMM):
            self.hmm = hmm
        else:
            self.hmm = HMM(hmm)
        segment_table = SegmentTable()
        self.segment_symbol_length = ceil(log(len(segment_table) + 1,
                                              2))  # + 1 for the delimiter
        if rule_set:
            self.rule_set = rule_set
        else:
            self.rule_set = RuleSet()

    def generate_word(self):
        """Sample an HMM emission and return one surface form derived from it."""
        emission = self.hmm.generate_emission()
        return choice(self.rule_set.get_outputs_of_word(emission))

    def get_transducer(self):
        """Compose the HMM transducer with the rule-set transducer, if any."""
        hmm_transducer = self.hmm.get_transducer()
        # Dump the HMM transducer to a dot file when a case name is configured.
        if "case_name" in configurations.configurations_dict:
            case_name = configurations.configurations_dict["case_name"]
            dot(hmm_transducer, "{}_hmm_transducer".format(case_name))
        rules_set_transducer = self.rule_set.get_transducer()
        if rules_set_transducer:
            # Both operands are input-arc-sorted before composition.
            hmm_transducer.arc_sort_input()
            rules_set_transducer.arc_sort_input()
            composed_hmm_rules_transducer = hmm_transducer >> rules_set_transducer
        else:
            composed_hmm_rules_transducer = hmm_transducer
        return composed_hmm_rules_transducer

    def get_nfa(self):
        """Build the grammar transducer and convert it to a parsing NFA."""
        grammar_pyfst_transducer = self.get_transducer()
        # dot(grammar_pyfst_transducer, "grammar_pyfst_transducer")
        # grammar_pyfst_transducer.remove_epsilon()
        return ParsingNFA.get_from_pyfst_transducer(grammar_pyfst_transducer)

    def make_mutation(self):
        """Apply a random number of mutations; return True if any succeeded."""
        num_mutations = randint(1, ga_config.MAX_MUTATIONS)
        mutation_result = False

        for _ in range(num_mutations):
            if ga_config.MUTATE_BOTH_HMM_AND_RULES:
                # Attempt to mutate both components in the same step.
                rule_set_success = False
                hmm_success = False
                if configurations["EVOLVE_RULES"]:
                    rule_set_success = self.rule_set.make_mutation()
                if configurations["EVOLVE_HMM"]:
                    hmm_success = self.hmm.make_mutation()
                mutation_result = mutation_result or rule_set_success or hmm_success

            else:
                # Pick exactly one component to mutate, weighted by configuration.
                rule_set_mutation_weight = 0 if not configurations[
                    "EVOLVE_RULES"] else configurations["MUTATE_RULE_SET"]
                hmm_mutation_weight = 0 if not configurations[
                    "EVOLVE_HMM"] else configurations["MUTATE_HMM"]

                mutation_weights = [(self.rule_set, rule_set_mutation_weight),
                                    (self.hmm, hmm_mutation_weight)]

                weighted_mutatable_object_list = get_weighted_list(
                    mutation_weights)
                object_to_mutate = choice(weighted_mutatable_object_list)
                mutation_result = object_to_mutate.make_mutation()

        return mutation_result

    def get_encoding_length(self):
        """Return the pair (hmm encoding length, rules encoding length)."""
        if not configurations["UNDERSPECIFICATION_FLAG"]:
            hmm_encoding_length = self.hmm.get_encoding_length(
                self.segment_symbol_length,
                restrictions_on_alphabet=configurations[
                    "RESTRICTIONS_ON_ALPHABET"])
        else:
            hmm_encoding_length = self.hmm.get_underspecified_encoding_length()
        rules_encoding_length = self.rule_set.get_encoding_length()
        return hmm_encoding_length, rules_encoding_length

    def generate_word_list(self, n):
        """Sample n words via generate_word() (repetition possible)."""
        result = []
        for _ in range(n):
            result.append(self.generate_word())
        return result

    def get_all_outputs(self):
        """Enumerate the output string of every path through the transducer."""
        transducer = self.get_transducer()
        transducer_symbol_table = SegmentTable().transducer_symbol_table
        outputs = list()
        for path in transducer.paths():
            output = ""
            for i in path:
                symbol = transducer_symbol_table.find(i.olabel)
                # Skip epsilon and boundary markers; keep real segments only.
                if symbol != u"\u03b5" and symbol != MORPHEME_BOUNDARY and symbol != WORD_BOUNDARY:
                    output += symbol
            outputs.append(output)
        return outputs
 def test_vicky(self):
     """Insertion after a [-voice] segment yields one output per [-voice] segment."""
     self.initialise_segment_table("plural_english_segment_table.txt")
     insertion_rule = Rule([], [{"voice": "-"}], [{"voice": "-"}], [],
                           obligatory=True)
     expected = ["dot" + suffix for suffix in ('s', 'k', 't')]
     self.assertCountEqual(RuleSet([insertion_rule]).get_outputs_of_word("dot"),
                           expected)
 def test_degenerate_assimilation(self):
     """An optional rule with a degenerate target leaves 'tz' unchanged."""
     self.initialise_segment_table("plural_english_segment_table.txt")
     degenerate_rule = Rule([{"cons": "+", "low": "+"}], [{"voice": "-"}],
                            [{"voice": "-"}], [], obligatory=False)
     self.assertCountEqual(RuleSet([degenerate_rule]).get_outputs_of_word("tz"),
                           ["tz"])