示例#1
0
    def extract_data_from_feature_bundle_lists(self):
        """Classify the rule and cache target/change data from its bundle lists.

        Sets ``transformation_type`` and ``context_type`` according to which of
        the four feature bundle lists are non-empty. For each non-empty
        target/change list, stores its first item and the segment symbols the
        segment table returns for it. When at least one of the two is
        non-empty, also builds ``target_change_tuples_list``.
        """
        has_target = bool(self.target_feature_bundle_list)
        has_change = bool(self.change_feature_bundle_list)
        has_left = bool(self.left_context_feature_bundle_list)
        has_right = bool(self.right_context_feature_bundle_list)

        # Rule type: decided by which of target/change is present.
        if has_target:
            self.transformation_type = ASSIMILATION if has_change else DELETION
        else:
            self.transformation_type = INSERTION if has_change else DEGENERATE

        # Context type: decided by which of the left/right contexts is present.
        if has_left:
            self.context_type = BOTH_CONTEXTS if has_right else LEFT_CONTEXT_ONLY
        else:
            self.context_type = RIGHT_CONTEXT_ONLY if has_right else NO_CONTEXT

        if has_target:
            self.target_features = self.target_feature_bundle_list.get_first_item()
            self.target_segments = SegmentTable().get_segments_symbols_by_features(
                self.target_features)

        if has_change:
            self.change_features = self.change_feature_bundle_list.get_first_item()
            self.change_segments = SegmentTable().get_segments_symbols_by_features(
                self.change_features)

        if has_target or has_change:
            self.target_change_tuples_list = self._get_target_change_tuples_list()
 def devoicer(self, words):
     """Devoice the last segment of every word in ``words``, in place.

     For each word, the features of its final symbol are copied, the
     ``voice`` feature is set to ``'-'``, and the segment table is asked for
     a symbol with those features. The word is rewritten only when such a
     symbol is found.

     :param words: mutable sequence of non-empty strings; modified in place
     """
     for position, word in enumerate(words):
         final_segment = SegmentTable().get_segment_by_symbol(word[-1])
         devoiced_features = deepcopy(final_segment.features)
         devoiced_features[Feature('voice', ('+', '-'))] = '-'
         devoiced_symbol = SegmentTable().get_segment_symbol_by_features(devoiced_features)
         if not devoiced_symbol:
             continue  # no symbol for the devoiced feature set: keep the word
         words[position] = word[:-1] + devoiced_symbol
    def initialise_simulation(self, simulation):
        """Set up state for ``simulation``.

        Stores the simulation, flushes the cache, loads the simulation's
        configuration, replaces the configuration dict with a deep copy of
        itself, and loads the simulation's segment table file from
        ``segment_table_dir_path``.

        :param simulation: object providing ``segment_table_file_name`` and
            accepted by ``load_configuration_for_simulation``
        """
        self.simulation = simulation
        Cache.get_cache().flush()

        self.configurations.load_configuration_for_simulation(simulation)
        # Swap in a private deep copy of the freshly loaded dict.
        dict_copy = deepcopy(self.configurations.configurations_dict)
        self.configurations.configurations_dict = dict_copy

        table_file_name = simulation.segment_table_file_name
        SegmentTable.load(join(segment_table_dir_path, table_file_name))
示例#4
0
def get_prologue_inverse_transducer():
    """Build a single-state transducer that keeps segments and drops brackets.

    State 0 is final and loops on itself: every distinct segment symbol maps
    to itself, and every symbol in ``BRACKETS`` maps to ``EPSILON``.

    :return: the constructed ``fst.Transducer``
    """
    symbol_table = SegmentTable().transducer_symbol_table
    inverse = fst.Transducer(isyms=symbol_table, osyms=symbol_table)

    # Identity arcs for the segment alphabet (duplicates removed via set).
    for symbol in set(SegmentTable().get_segments_symbols()):
        inverse.add_arc(0, 0, symbol, symbol)

    # Brackets on the input side produce EPSILON on the output side.
    for bracket in BRACKETS:
        inverse.add_arc(0, 0, bracket, EPSILON)

    inverse[0].final = True
    return inverse
 def devoice(self, words):
     """Replace each word's final segment with its voiceless counterpart, in place.

     The final symbol's features are copied with ``voice`` set to ``'-'``;
     if the segment table has a symbol for that feature set, it replaces the
     final symbol. Otherwise the word is left unchanged.

     :param words: mutable sequence of non-empty strings; modified in place
     """
     # NOTE: every word is processed; a random 1-in-5 skip was disabled here.
     for index, word in enumerate(words):
         stem, last_symbol = word[:-1], word[-1]
         last_segment = SegmentTable().get_segment_by_symbol(last_symbol)
         voiceless_features = deepcopy(last_segment.features)
         voiceless_features[Feature('voice', ('+', '-'))] = '-'
         voiceless_symbol = SegmentTable().get_segment_symbol_by_features(voiceless_features)
         if voiceless_symbol:
             words[index] = stem + voiceless_symbol
    def _add_feature(self):
        """Try to add one random feature that this bundle does not use yet.

        :return: ``True`` if a feature was added; ``False`` if the bundle is a
            boundary bundle or the segment table offered no available feature
        """
        # WB or MB feature must be standalone
        if self.morpheme_boundary or self.word_boundary:
            return False

        used_features = self.feature_dict.keys()
        new_feature = SegmentTable().get_random_available_feature(used_features)
        if not new_feature:
            return False

        self.feature_dict[new_feature] = new_feature.get_random_value()
        return True
示例#7
0
 def get_all_outputs(self):
     """Return the output string of every path of this object's transducer.

     Each arc's output label is looked up in the segment table's transducer
     symbol table. Epsilon and the morpheme/word boundary symbols are left
     out of the resulting strings.

     :return: list of strings, one per path
     """
     transducer = self.get_transducer()
     symbol_table = SegmentTable().transducer_symbol_table
     skipped_symbols = (u"\u03b5", MORPHEME_BOUNDARY, WORD_BOUNDARY)

     outputs = []
     for path in transducer.paths():
         path_symbols = (symbol_table.find(arc.olabel) for arc in path)
         outputs.append("".join(
             symbol for symbol in path_symbols if symbol not in skipped_symbols))
     return outputs
示例#8
0
 def apply_noise(self, words):
     """Devoice word-final voiced obstruents in the first part of ``words``, in place.

     Only the first ``len(words) * noise_rate / 100`` words (truncated to an
     int) are considered. A word changes only if its final segment passes
     ``_is_voiced_obstruent`` and the segment table has a symbol for the same
     features with ``voice`` set to ``'-'``.

     :param words: mutable sequence of non-empty strings; modified in place
     """
     noisy_count = int(len(words) * self.noise_rate / 100)
     table = SegmentTable()
     for index, word in enumerate(words[:noisy_count]):
         final_segment = table.get_segment_by_symbol(word[-1])
         if self._is_voiced_obstruent(final_segment):
             devoiced_features = deepcopy(final_segment.features)
             devoiced_features[Feature('voice', ('+', '-'))] = '-'
             replacement = table.get_segment_symbol_by_features(devoiced_features)
             if replacement:
                 words[index] = word[:-1] + replacement
    def get_all_outputs(self, with_noise=True):
        """Return the output string of every path of this object's transducer.

        The transducer is minimized first when the ``MINIMIZE_TRANSDUCER``
        configuration is truthy. Epsilon and the morpheme/word boundary
        symbols are left out of the resulting strings.

        :param with_noise: forwarded to ``get_transducer``
        :return: list of strings, one per path
        """
        transducer = self.get_transducer(with_noise=with_noise)
        if configurations["MINIMIZE_TRANSDUCER"]:
            transducer = self.minimize_transducer(transducer)

        symbol_table = SegmentTable().transducer_symbol_table
        skipped_symbols = (u"\u03b5", MORPHEME_BOUNDARY, WORD_BOUNDARY)

        def path_to_string(path):
            # Translate output labels to symbols and drop the skipped ones.
            path_symbols = (symbol_table.find(arc.olabel) for arc in path)
            return "".join(
                symbol for symbol in path_symbols if symbol not in skipped_symbols)

        return [path_to_string(path) for path in transducer.paths()]
示例#10
0
def get_transducer_outputs(transducer, limit=float("inf")):
    """Return the output strings of the paths of ``transducer``.

    Each arc's output label is looked up in the segment table's transducer
    symbol table; epsilon symbols are left out of the strings.

    The previous implementation compared ``counter > limit`` after appending,
    so it returned ``limit + 1`` outputs (and one output for ``limit=0``).
    ``limit`` is now a true upper bound.

    :param transducer: object whose ``paths()`` yields sequences of arcs with
        an ``olabel`` attribute
    :param limit: maximum number of outputs to collect (default: unlimited)
    :return: list of at most ``limit`` output strings
    """
    if limit <= 0:
        return []

    symbol_table = SegmentTable().transducer_symbol_table
    outputs = []
    for path in transducer.paths():
        path_symbols = (symbol_table.find(arc.olabel) for arc in path)
        outputs.append("".join(
            symbol for symbol in path_symbols if symbol != u"\u03b5"))
        # Stop right after the limit is reached so no extra path is pulled.
        if len(outputs) >= limit:
            break
    return outputs
    def get_random_feature_bundle(cls, role, boundary_position=False):
        """
        Build a bundle from one randomly chosen feature or boundary marker.

        The candidates are the segment table's features. The word and
        morpheme boundary names are added when their configuration flag is
        set, ``role`` is a context role, and ``boundary_position`` is truthy.

        :param role: 'target', 'change', 'left_context', 'right_context'
        :param boundary_position: whether feature bundle is first or last in left or right context, respectively
        :return: ``cls`` instance built from the chosen entry and ``role``
        """
        candidates = list(SegmentTable().features.keys())
        boundary_options = (
            ('WORD_BOUNDARY_FLAG', WORD_BOUNDARY_FEATURE_NAME),
            ('MORPHEME_BOUNDARY_FLAG', MORPHEME_BOUNDARY_FEATURE_NAME),
        )
        for flag_name, boundary_feature_name in boundary_options:
            if (configurations[flag_name]
                    and FeatureBundle._is_context_role(role)
                    and boundary_position):
                candidates.append(boundary_feature_name)

        picked = choice(candidates)
        chosen = {}
        if isinstance(picked, Feature):
            # Regular feature: keyed by its name with a random value.
            chosen[picked.name] = picked.get_random_value()
        elif picked == WORD_BOUNDARY_FEATURE_NAME:
            chosen[WORD_BOUNDARY_FEATURE_NAME] = choice([True, False])
        elif picked == MORPHEME_BOUNDARY_FEATURE_NAME:
            chosen[MORPHEME_BOUNDARY_FEATURE_NAME] = choice([True, False])

        return cls(chosen, role)
示例#12
0
    def get_replace_transducer(self):
        """Build the replace transducer for this rule's target/change pairs.

        The inner transducer maps each target symbol to its change symbol
        (state 0 to final state 1) and copies the center brackets in both
        states. It is wrapped between ``left_bracket_transducer`` and
        ``right_bracket_transducer``, passed to ``add_opt``, prefixed with an
        identity transducer over sigma-star that ignores the identity
        brackets, and finally closed.

        NOTE(review): ``left_bracket_transducer``, ``right_bracket_transducer``
        and ``add_opt`` are not defined in this method; they must come from an
        enclosing scope.

        :return: closure of (sigma-star identity + bracketed replacement)
        """
        symbol_table = SegmentTable().transducer_symbol_table
        inner = fst.Transducer(isyms=symbol_table, osyms=symbol_table)
        for target_symbol, change_symbol in self.target_change_tuples_list:
            inner.add_arc(0, 1, target_symbol, change_symbol)
        inner[1].final = True

        # Center brackets are copied unchanged in both states of the inner part.
        for bracket in [LEFT_CENTER_BRACKET, RIGHT_CENTER_BRACKET]:
            for state in (0, 1):
                inner.add_arc(state, state, bracket, bracket)

        opt_part = left_bracket_transducer + inner + right_bracket_transducer
        add_opt(opt_part)

        identity_brackets = {LEFT_IDENTITY_BRACKET, RIGHT_IDENTITY_BRACKET}
        sigma_star_dfa = get_dfa_from_regex(
            "({})*".format("+".join(self.alphabet)), sigma=self.alphabet)
        ignoring_dfa = get_ignore_dfa(
            self.alphabet | identity_brackets, sigma_star_dfa, identity_brackets)
        id_sigma_star = pyfst_from_dfa(ignoring_dfa)

        return (id_sigma_star + opt_part).closure()
    def get_random_feature_bundle_list(cls, is_one_item_list, role=None):
        """Build a random feature bundle list, possibly empty.

        The list is empty with probability ``1 / number of features``.
        Otherwise it holds between 1 and ``MAX_FEATURE_BUNDLE_IN_CONTEXT``
        random bundles (exactly 1 when ``is_one_item_list``). The first
        bundle of a left context and the last bundle of a right context are
        requested with ``boundary_position=True``.

        :param is_one_item_list: restrict the list to a single bundle
        :param role: role passed on to every bundle and to ``cls``
        :return: ``cls`` instance wrapping the generated bundles
        """
        empty_probability = 1 / len(SegmentTable().features)
        if random() < empty_probability:
            return cls([], is_one_item_list, role)

        if is_one_item_list:
            max_bundles = 1
        else:
            max_bundles = configurations["MAX_FEATURE_BUNDLE_IN_CONTEXT"]
        num_bundles = randrange(1, max_bundles + 1)
        last_position = num_bundles - 1

        feature_bundles = []
        for position in range(num_bundles):
            at_boundary = (
                (role == 'left_context' and position == 0)
                or (role == 'right_context' and position == last_position)
            )
            feature_bundles.append(FeatureBundle.get_random_feature_bundle(
                role=role, boundary_position=at_boundary))
        return cls(feature_bundles, is_one_item_list, role)
    def test_(self):
        """Compute energies for the target, initial and rote-learning hypotheses.

        Only the target energy is stored (in ``self.target_energy``); the
        other two calls to ``get_energy`` discard their result.
        """
        self.target_energy = None

        # Target hypothesis: a stem from q1, optionally followed by a suffix from q2.
        stems = [
            'abberation', 'abbreviate', 'abolitionist', 'abortion', 'absence',
            'abstractionist', 'abutment', 'accent', 'acclaim', 'accolade',
            'accommodate', 'accommodation', 'accomodation', 'achiev', 'add',
            'administer', 'advertis', 'afford', 'aggravate', 'alert',
            'amount', 'announc', 'appeal', 'applaud', 'apprentice',
            'arcade', 'arrest', 'assault', 'assum', 'astound',
            'attack', 'attempt', 'back', 'bak', 'balance',
            'barbecue', 'bath', 'beckon', 'benefit', 'blast',
            'blend', 'bless', 'blister', 'bloom', 'blow',
            'boast', 'bogey', 'boil', 'bolster', 'bomb',
            'borrow', 'bother', 'brac', 'breakfast', 'broadcast',
            'broaden', 'bruise', 'buffet', 'burden', 'catalogue',
            'cater', 'challeng', 'chang', 'charg', 'charm',
            'compris', 'conced', 'conclud', 'condition', 'consum',
            'costume', 'deal', 'decid', 'demand', 'describ',
            'down', 'draw', 'drink', 'dwell', 'enforc',
            'farm', 'feed', 'feel', 'flow', 'gaz',
            'glaz', 'invad', 'liv', 'pac',
        ]
        suffixes = ['ing', 'e', 'd', 'ed', 's', 'es', 'er']
        target_hmm = {
            'q0': ['q1'],
            'q1': (['q2', 'qf'], stems),
            'q2': (['qf'], suffixes),
        }
        self.target_energy = self.get_energy(
            SimulationCase("target", target_hmm, []))

        # Initial hypothesis: a single looping state emitting any segment symbol.
        initial_hmm = {
            'q0': ['q1'],
            'q1': (['q1', 'qf'], SegmentTable().get_segments_symbols()),
        }
        self.get_energy(SimulationCase("initial", initial_hmm, []))

        # Rote learning: a single looping state emitting a copy of the data.
        rote_hmm = {
            'q0': ['q1'],
            'q1': (['q1', 'qf'], self.data[:]),
        }
        self.get_energy(SimulationCase("rote_learning", rote_hmm, []))
 def is_valid(self):
     """Tell whether every feature bundle matches at least one segment.

     :return: ``False`` as soon as the segment table returns no symbols for a
         bundle in ``feature_bundle_list``; ``True`` otherwise (including for
         an empty list)
     """
     return all(
         SegmentTable().get_segments_symbols_by_features(bundle)
         for bundle in self.feature_bundle_list
     )
    def setUp(self):
        """Load the plural-English segment table and derive ``rule_symbol_length``."""
        self.initialise_segment_table("plural_english_segment_table.txt")

        # 6 extra symbols: 3 delimiters (feature, bundle, rule part),
        # plus sign, minus sign, and 1 for kleene.
        encoding_symbol_count = len(SegmentTable().features) + 6
        self.rule_symbol_length = ceil(log(encoding_symbol_count, 2))
示例#17
0
def get_transducer_acceptor(string_):
    """Build a linear transducer that maps ``string_`` to itself.

    State ``k`` is connected to state ``k + 1`` by an arc reading and writing
    the ``k``-th character; the state after the last character is final.

    The final state is now addressed by ``len(string_)`` rather than by the
    loop variable, so an empty ``string_`` no longer raises
    ``UnboundLocalError``.

    NOTE(review): for an empty string no arc is added, so whether state 0 is
    also registered as the start state depends on pyfst; confirm before
    relying on the empty-string acceptor.

    :param string_: sequence of symbols known to the transducer symbol table
    :return: the constructed ``fst.Transducer``
    """
    transducer_symbol_table = SegmentTable().transducer_symbol_table
    transducer = fst.Transducer(isyms=transducer_symbol_table,
                                osyms=transducer_symbol_table)
    for i, char in enumerate(string_):
        transducer.add_arc(i, i + 1, char, char)
    transducer[len(string_)].final = True
    return transducer
示例#18
0
def get_context_string_options(context_features):
    """
    Look up, for each feature bundle, the segment symbols that match it.

    :param context_features: List of feature bundles
    :return: List of lists of segment symbols matching each feature bundle: [[s1, s2, s3], ...]
    """
    return [
        SegmentTable().get_segments_symbols_by_features(bundle)
        for bundle in context_features
    ]
示例#19
0
 def test_change_segment_in_emission(self):
     """Run ``change_segment_in_emission`` on a small HMM and print its emissions.

     The HMM is written to a dot file before the mutation. Nothing is
     asserted; the result is only printed.
     """
     self.initialise_segment_table("plural_english_segment_table.txt")
     transitions_and_emissions = {
         INITIAL_STATE: ['q1'],
         'q1': (['q2', FINAL_STATE], ['dog', 'kat']),
         'q2': ([FINAL_STATE], ['z']),
     }
     hmm = HMM(transitions_and_emissions)
     self.write_to_dot_to_file(hmm, "hmm")

     hmm.change_segment_in_emission(SegmentTable().get_segments_symbols())
     print(hmm.get_all_emissions())
    def get_from_pyfst_transducer(cls, transducer):
        """Convert a pyfst transducer into a ``ParsingNFA``.

        For every state, the arcs are grouped by output symbol (epsilon is
        replaced by ``NULL_SEGMENT``) and recorded three ways:
        ``arcs_dict[state][symbol]`` is the list of next states,
        ``probabilities[state]`` is the list of ``(symbol, next_state)``
        pairs, and ``transition_matrix[state, next_state]`` is the arc's
        output label (``NULL_SEGMENT_IDX`` for null arcs). Matrix cells
        without an arc hold ``NO_TRANSITION_IDX``.

        :param transducer: pyfst transducer to convert
        :return: the populated ``ParsingNFA``
        """
        symbol_table = SegmentTable().transducer_symbol_table
        # get state number from the string: "<StdState #x with y arcs>"
        state_id_pattern = re.compile(r".*#(\w*).*")

        parsing_nfa = ParsingNFA()
        parsing_nfa.final_states = list()
        arcs_by_state = dict()
        arc_pairs_by_state = dict()

        state_count = len(list(transducer.states))
        matrix = np.ones((state_count, state_count)) * NO_TRANSITION_IDX

        for state in transducer:
            source = state_id_pattern.match(str(state)).group(1)
            if state.initial:
                parsing_nfa.initial_state = source
            if state.final:
                parsing_nfa.final_states.append(source)

            for arc in state:
                target = str(arc.nextstate)
                emitted = symbol_table.find(arc.olabel)
                if emitted == u"\u03b5":
                    emitted = NULL_SEGMENT

                arcs_by_state.setdefault(source, {}).setdefault(emitted, []).append(target)
                arc_pairs_by_state.setdefault(source, []).append((emitted, target))

                if emitted == NULL_SEGMENT:
                    cell_value = NULL_SEGMENT_IDX
                else:
                    cell_value = arc.olabel
                matrix[int(source), int(target)] = cell_value

        parsing_nfa.arcs_dict = arcs_by_state
        parsing_nfa.probabilities = arc_pairs_by_state
        parsing_nfa.transition_matrix = matrix
        return parsing_nfa
示例#21
0
    def get_changed_segment(self, segment_symbol, change_feature_bundle):
        """
        Applies a change of features to a given segment.

        Results are stored in ``cache`` under the ``'change_segment'``
        namespace, keyed by the ``repr`` of both arguments. A cached ``None``
        is treated as a miss and recomputed.

        :param segment_symbol: Target segment
        :param change_feature_bundle:  Change feature bundle
        :return: String of output segment
        """
        cache_key = repr(segment_symbol) + repr(change_feature_bundle)
        result = cache.get(cache_key, 'change_segment')
        if result is None:
            source_segment = SegmentTable().get_segment_by_symbol(segment_symbol)
            # Copy first so the segment's own feature dict is never mutated.
            merged_features = deepcopy(source_segment.features)
            merged_features.update(change_feature_bundle.feature_dict)
            result = SegmentTable().get_segment_symbol_by_features(merged_features)
            cache.set(cache_key, result, 'change_segment')
        return result
示例#22
0
def get_intro_transducer(sigma, introduced_set):
    """Build a transducer that may insert symbols of ``introduced_set`` anywhere.

    A one-state transducer with an ``EPSILON -> symbol`` loop per introduced
    symbol is unioned with ``get_sigma_transducer_for_intro(sigma)``, and the
    closure of that union is returned.

    :param sigma: alphabet passed to ``get_sigma_transducer_for_intro``
    :param introduced_set: iterable of symbols that may be introduced
    :return: closure of the union transducer
    """
    sigma_transducer = get_sigma_transducer_for_intro(sigma)

    symbol_table = SegmentTable().transducer_symbol_table
    inserter = fst.Transducer(isyms=symbol_table, osyms=symbol_table)
    for symbol in introduced_set:
        inserter.add_arc(0, 0, EPSILON, symbol)
    inserter[0].final = True

    return (sigma_transducer | inserter).closure()
示例#23
0
 def __init__(self, hmm, rule_set=None):
     """Store the HMM and rule set and compute the per-segment code length.

     :param hmm: an ``HMM`` instance, or a value passed to ``HMM(...)``
     :param rule_set: rule set to use; a falsy value is replaced by a new
         ``RuleSet()``
     """
     self.hmm = hmm if isinstance(hmm, HMM) else HMM(hmm)

     # + 1 for the delimiter
     symbol_count = len(SegmentTable()) + 1
     self.segment_symbol_length = ceil(log(symbol_count, 2))

     self.rule_set = rule_set if rule_set else RuleSet()
 def setUp(self):
     """Load the plural-English segment table and build a one-rule rule set."""
     self.table = "plural_english_segment_table"
     self.initialise_segment_table("%s.txt" % self.table)
     self.plural_english_segments = SegmentTable().get_segments_symbols()

     # Positional arguments are kept in their original order; their meaning
     # is defined by the Rule constructor, which is not visible here.
     assimilation_rule = Rule(
         [{"cons": "+"}],
         [{"voice": "-"}],
         [{"voice": "-"}],
         [],
         True,
     )
     self.plural_english_rule_set = RuleSet([assimilation_rule])
示例#25
0
def pyfst_to_dfa(transducer, alphabet):
    """Convert the output side of a pyfst transducer into a FAdo DFA.

    An NFA is built whose states are the transducer's states (indexed in
    iteration order) and whose transitions are labelled with each arc's
    output symbol (epsilon becomes ``FAdo.common.Epsilon``). The NFA is then
    determinized with ``toDFA``.

    :param transducer: pyfst transducer to convert
    :param alphabet: value assigned to the NFA's ``Sigma``
    :return: the DFA returned by ``nfa.toDFA()``
    """
    symbol_table = SegmentTable().transducer_symbol_table
    # get state number from the string: "<StdState #x with y arcs>"
    state_id_pattern = re.compile(r".*#(\w*).*")

    nfa = NFA()
    nfa.Sigma = alphabet
    transitions = dict()
    state_names = list()
    nfa.Initial = set()

    # First pass: register every state and mark initial/final ones.
    for state in transducer:
        name = state_id_pattern.match(str(state)).group(1)
        state_names.append(name)
        index = state_names.index(name)
        if state.initial:
            nfa.Initial.add(index)
        if state.final:
            nfa.Final.add(index)

    # Second pass: all target states are known now, so arcs can be indexed.
    for state in transducer:
        name = state_id_pattern.match(str(state)).group(1)
        source = state_names.index(name)
        for arc in state:
            target = state_names.index(str(arc.nextstate))
            label = symbol_table.find(arc.olabel)
            if label == u"\u03b5":
                label = FAdo.common.Epsilon
            transitions.setdefault(source, dict()).setdefault(label, set()).add(target)

    nfa.delta = transitions
    nfa.States = state_names

    return nfa.toDFA()
示例#26
0
    def __init__(self, hmm, rule_set=None):
        """Store the HMM and rule sets and reset the cached transducers.

        :param hmm: an ``HMM`` instance, or a value passed to ``HMM(...)``
        :param rule_set: rule set to use; a falsy value is replaced by
            ``RuleSet(noise=False)``
        """
        self.hmm = hmm if isinstance(hmm, HMM) else HMM(hmm)

        # + 1 for the delimiter
        symbol_count = len(SegmentTable()) + 1
        self.segment_symbol_length = uniform_encoding.log2(symbol_count)

        self.rule_set = rule_set if rule_set else RuleSet(noise=False)

        # Noise rules come from the configuration, defaulting to none.
        noise_rules = configurations.get("NOISE_RULE_SET", [])
        self.noise_rule_set = RuleSet.load_noise_rules_from_flat_list(noise_rules)

        self._cached_hmm_transducer = None
        self._cached_rule_set_transducer = None
        self._cached_noise_rule_set_transducer = None
示例#27
0
def pyfst_from_dfa(dfa):
    """Convert a DFA into an identity pyfst transducer.

    Every DFA transition becomes an arc that reads and writes the same
    symbol. DFA state indices are reused as transducer state ids, and the
    DFA's final and initial states are marked on the transducer.

    :param dfa: DFA exposing ``States``, ``delta``, ``Final`` and ``Initial``
    :return: the constructed ``fst.Transducer``
    """
    symbol_table = SegmentTable().transducer_symbol_table
    result = fst.Transducer(isyms=symbol_table, osyms=symbol_table)

    # Identity mapping over the known state indices; unknown indices raise KeyError.
    state_ids = {index: index for index in range(len(dfa.States))}

    for source, outgoing in dfa.delta.items():
        for symbol, target in outgoing.items():
            result.add_arc(state_ids[source], state_ids[target], symbol, symbol)

    for final_state in dfa.Final:
        result[state_ids[final_state]].final = True

    result[state_ids[dfa.Initial]].initial = True
    return result
示例#28
0
    def __init__(self, rules=None, noise=False):
        """Store ``rules`` and compute the code length of one rule symbol.

        :param rules: list of rules; a falsy value becomes a new empty list
        :param noise: expected ``noise`` flag of every rule in the set
        :raises ValueError: if a rule's ``noise`` flag differs from ``noise``
        """
        self.rules = rules if rules else []

        if noise:
            mismatch_message = "Non-noise-rule in a noise-rule-set"
        else:
            mismatch_message = "Noise-rule in a non-noise-rule-set"
        for rule in self.rules:
            if rule.noise != noise:
                raise ValueError(mismatch_message)

        # +5 for 3 delimiters (feature, bundle, rule part), plus sign, and minus sign
        symbol_count = len(SegmentTable().features) + 5
        # Each enabled option below adds one more encoding symbol.
        optional_symbol_flags = (
            'WORD_BOUNDARY_FLAG',
            'MORPHEME_BOUNDARY_FLAG',
            'CHANGE_KLEENE_VALUE',
        )
        symbol_count += sum(
            1 for flag_name in optional_symbol_flags if configurations[flag_name])
        self.rule_symbol_length = uniform_encoding.log2(symbol_count)
    def __init__(self, feature_string_dict, role=None):
        """
        Build a feature bundle from a dictionary of feature names to values.

        A dictionary containing the word boundary name, or else the morpheme
        boundary name, produces a boundary bundle with an empty
        ``feature_dict``. The flag is set only if the value is truthy and the
        bundle is a context bundle. Otherwise every entry is either the
        kleene marker or a feature validated against the segment table.

        The kleene marker is matched with ``==``. The previous ``is`` test
        compared string identity, so an equal key that was a different string
        object fell through to feature validation and raised ``ValueError``.

        :param feature_string_dict: dictionary of form {"cons": "+", "WB": True}
        :param role: "target", "change", "left_context", or "right_context"
        :raises ValueError: if a feature name is not valid in the segment table
        """
        feature_dict = dict()

        self.role = role
        self.kleene = False
        self.word_boundary = False
        self.morpheme_boundary = False

        if WORD_BOUNDARY_FEATURE_NAME in feature_string_dict:
            if feature_string_dict[
                    WORD_BOUNDARY_FEATURE_NAME] and self._is_context_bundle():
                self.word_boundary = True
        elif MORPHEME_BOUNDARY_FEATURE_NAME in feature_string_dict:
            if feature_string_dict[
                    MORPHEME_BOUNDARY_FEATURE_NAME] and self._is_context_bundle():
                self.morpheme_boundary = True
        else:
            for feature_name in feature_string_dict:
                if feature_name == KLEENE_FEATURE_NAME:
                    # Kleene is honoured only for context bundles when enabled.
                    if self._is_context_bundle(
                    ) and configurations['CHANGE_KLEENE_VALUE']:
                        self.kleene = feature_string_dict[feature_name]
                else:
                    feature = Feature(feature_name)
                    if not SegmentTable().is_valid_feature(feature):
                        raise ValueError(
                            u"{} not in segment_table".format(feature_name))
                    feature_dict[feature] = feature_string_dict[feature_name]

        self.feature_dict = feature_dict
示例#30
0
 def __init__(self, rule):
     """Take over all instance attributes of ``rule`` and store the segment alphabet.

     The attributes are copied by reference (no deep copy).

     :param rule: object whose instance ``__dict__`` is copied onto ``self``
     """
     vars(self).update(vars(rule))
     segment_symbols = SegmentTable().get_segments_symbols()
     self.alphabet = set(segment_symbols)