def import_seed_rules(seed_rules_path, display):
    """
    Read the seed rule file and return its contents as a set.

    The file is a CSV with columns containing the rule type, the
    contents of the rule, and its strength rating (the probability
    that a token is a name if the rule applies to that token).

    Args:
        seed_rules_path (str): Location of the seed rules file.
        display (Display): Progress display updated as rules are read.

    Returns:
        seed_rules (set): Set of Rule objects corresponding to the
            rules in the seed rules file.

    Raises:
        None
    """
    seed_rules = set()
    cols = ['Rule Type', 'Rule', 'Strength']
    data = pd.read_csv(seed_rules_path, names=cols, header=0)
    # Iterate over every row; range(len(data) - 1) would silently
    # drop the last rule in the file.
    for i in range(len(data)):
        rule_type = Rule.find_type(data.loc[i, 'Rule Type'])
        contents = data.loc[i, 'Rule']
        strength = data.loc[i, 'Strength']
        rule = Rule(rule_type, contents, strength)
        seed_rules.add(rule)
        display.update_progress_bar(i + 1, len(data))
    return seed_rules
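# Usage sketch, not part of the original module: writes a small CSV in the
# three-column layout the docstring describes, then imports it. _DemoDisplay
# is a hypothetical stand-in assumed only to expose
# update_progress_bar(current, total), and the rule-type strings are assumed
# to match what Rule.find_type accepts.
def _demo_import_seed_rules():
    import csv

    class _DemoDisplay:
        def update_progress_bar(self, current, total):
            print('{}/{} rules read'.format(current, total))

    with open('seed_rules.csv', 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['Rule Type', 'Rule', 'Strength'])
        writer.writerow(['left_context', 'ki', 0.989010989])
        writer.writerow(['spelling', 'lu2', 0.75])

    seed_rules = import_seed_rules('seed_rules.csv', _DemoDisplay())
    assert len(seed_rules) == 2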
def test_rule_nq():
    """Rules with different contents should not compare equal."""
    rule1 = Rule(Rule.Type.left_context, 'ki', 0.989010989)
    rule2 = Rule(Rule.Type.left_context, 'giri3', 0.9887640449)
    assert rule1 != rule2
def test_rule_eq():
    """Rules with the same type, contents, and strength should compare equal."""
    rule1 = Rule(Rule.Type.left_context, 'ki', 0.989010989)
    rule2 = Rule(Rule.Type.left_context, 'ki', 0.989010989)
    assert rule1 == rule2
def gramsToRules(kgrams, allrules, iteration):
    """Convert character k-grams into spelling rules, skipping known ones."""
    rules = set()
    for gram in kgrams:
        # New spelling rules carry a sentinel strength of -1 until rated.
        rule = Rule(Rule.Type.spelling, gram, -1)
        if rule not in allrules:
            rules.add(rule)
            rule.iteration = iteration
    return rules
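# Usage sketch, not from the original repo: grams already present in
# allrules are skipped, so only 'new' survives here, tagged with the
# iteration on which it was generated.
def _demo_grams_to_rules():
    known = {Rule(Rule.Type.spelling, 'old', -1)}
    fresh = gramsToRules(['old', 'new'], known, 2)
    assert {r.contents for r in fresh} == {'new'}
    assert all(r.iteration == 2 for r in fresh)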
def test_rule_str():
    """str(rule) should report the rule's type, contents, and strength."""
    rule = Rule(Rule.Type.left_context, 'ki', 0.989010989)
    expected_result = 'Rule(type=left_context, rule=ki, strength=0.989010989)'
    assert str(rule) == expected_result
def test_rule_init():
    """A new Rule should store its arguments and use default bookkeeping values."""
    rule = Rule(Rule.Type.left_context, 'ki', 0.989010989)
    assert rule.type == Rule.Type.left_context
    assert rule.contents == 'ki'
    assert rule.strength == 0.989010989
    assert rule.occurrences == 1
    assert rule.iteration == -1
def test_rulefilter_main():
    """rulefilter.main should keep only the strongest max_rules rules."""
    rule_set = set()
    expected = set()
    values = np.linspace(0.1, 0.8, num=8)
    for strength in values:
        rule = Rule(Rule.Type.unset, "test-{}".format(strength), strength)
        rule_set.add(rule)
    # Only the five strongest rules (strengths 0.4 through 0.8) should
    # survive the filter.
    for strength in values[3:]:
        rule = Rule(Rule.Type.unset, "test-{}".format(strength), strength)
        expected.add(rule)
    expected = sorted(expected, reverse=True)
    results = sorted(rulefilter.main(rule_set, 5), reverse=True)
    assert results == expected
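# Reference sketch of the behavior the test above implies, inferred from
# the test rather than taken from rulefilter itself: rank by strength and
# keep the top max_rules. The real rulefilter.main may sort or break ties
# differently.
def _rulefilter_sketch(rule_set, max_rules):
    ranked = sorted(rule_set, key=lambda rule: rule.strength, reverse=True)
    return set(ranked[:max_rules])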
def main(corpus, rules, names, max_rules, iteration, options, display):
    """
    Generate contextual rules from a set of identified name tokens.

    The corpus is needed as well as the name set in order to assess the
    performance of any rules derived from the names.

    Args:
        corpus (set): Set of all Token objects in the corpus.
        rules (set): Set of all Rule objects found so far.
        names (set): Set of Token objects to derive new context rules from.
        max_rules (int): Maximum number of rules to accept each iteration.
        iteration (int): The iteration the algorithm is currently on.
        options (Options): Collection of configuration options.
        display (Display): Progress display passed to the performance
            rating step.

    Returns:
        new_rules (set): Set of Rule objects.

    Raises:
        None
    """
    new_rules = set()
    for name in names:
        # New rules are assumed to have no strength (-1) until they are
        # assessed later by scanning the whole corpus.
        # Create left context rule
        left_context = Rule(Rule.Type.left_context,
                            str(name.left_context), -1)
        # Create right context rule
        right_context = Rule(Rule.Type.right_context,
                             str(name.right_context), -1)
        # No redundant rules allowed!
        if left_context not in rules:
            left_context.iteration = iteration
            new_rules.add(left_context)
        if right_context not in rules:
            right_context.iteration = iteration
            new_rules.add(right_context)
    rulesperformance.main(corpus, new_rules, options, iteration, display)
    new_rules = rulefilter.main(new_rules, max_rules)
    return new_rules
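# Worked illustration, not from the original repo: a single name token
# yields one left-context and one right-context rule, both with the
# sentinel strength -1. The Token field order (left context, contents,
# right context, type) is assumed from the test below; the token text
# itself is hypothetical.
def _demo_context_rules():
    name = Token('ki', 'ur-nammu', 'sag', Token.Type.personal_name)
    left = Rule(Rule.Type.left_context, str(name.left_context), -1)
    right = Rule(Rule.Type.right_context, str(name.right_context), -1)
    assert left.contents == 'ki' and right.contents == 'sag'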
def test_rulesperformance_rateRulePerformance():
    """A rule matching a single certain name should get the smoothed strength."""
    alpha = 0.1
    k = 2.0
    accept_threshold = 0.9
    rule = Rule(Rule.Type.spelling, 'bb', 1.0)
    name = Token('aa-aa', 'bb-bb', 'cc-cc', Token.Type.personal_name)
    name.name_probability = 1.0
    names = set()
    names.add(name)
    rulesperformance.rateRulePerformance(names, rule, alpha, k,
                                         accept_threshold)
    expected_strength = (1.0 + alpha) / (1.0 + k * alpha)
    expected_occurrences = 1.0
    assert rule.strength == expected_strength
    assert rule.occurrences == expected_occurrences
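# Worked check of the smoothing the test above implies, inferred from the
# expected value rather than taken from rulesperformance itself: with one
# matching token that is certainly a name,
# strength = (positives + alpha) / (matches + k * alpha)
#          = (1 + 0.1) / (1 + 2.0 * 0.1) = 11/12 ~ 0.917,
# which clears the 0.9 accept threshold used in the test.
def _demo_smoothed_strength():
    alpha, k = 0.1, 2.0
    positives, matches = 1.0, 1.0
    strength = (positives + alpha) / (matches + k * alpha)
    assert abs(strength - 11.0 / 12.0) < 1e-12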