Пример #1
0
def import_seed_rules(seed_rules_path, display):
    """
    Read the seed rule file and return its contents as a set.

    The file is a CSV with columns containing the rule type, the contents
    of the rule, and its strength rating (the probability that a token is
    a name if the rule applies to said token).

    Args:
        seed_rules_path (str): Location of seed rules file.
        display: Progress reporter; its ``update_progress_bar(done, total)``
            method is called once per imported row.

    Returns:
        seed_rules (set): Set of Rule objects corresponding to the rules in
            the seed rules file.

    Raises:
        None

    """

    seed_rules = set()

    cols = ['Rule Type', 'Rule', 'Strength']

    data = pd.read_csv(seed_rules_path, names=cols, header=0)
    total = len(data)
    # Iterate over every row; the previous range(len(data) - 1) dropped
    # the final rule in the file.
    for i in range(total):
        rule_type = Rule.find_type(data.loc[i, 'Rule Type'])
        rule = data.loc[i, 'Rule']
        strength = data.loc[i, 'Strength']
        rule = Rule(rule_type, rule, strength)
        seed_rules.add(rule)

        display.update_progress_bar(i + 1, total)

    return seed_rules
Пример #2
0
def test_rule_nq():
    """Two Rules with different contents must compare as unequal."""

    first = Rule(Rule.Type.left_context, 'ki', 0.989010989)
    second = Rule(Rule.Type.left_context, 'giri3', 0.9887640449)

    assert first != second
Пример #3
0
def test_rule_eq():
    """Two Rules built from identical arguments must compare as equal."""

    first = Rule(Rule.Type.left_context, 'ki', 0.989010989)
    second = Rule(Rule.Type.left_context, 'ki', 0.989010989)

    assert first == second
Пример #4
0
def gramsToRules(kgrams, allrules, iteration):
    """
    Convert k-grams into spelling Rule objects not already known.

    Args:
        kgrams (iterable): k-gram strings to turn into spelling rules.
        allrules (set): Rule objects already discovered; duplicates of
            these are skipped.
        iteration (int): Iteration number stamped onto each new rule.

    Returns:
        rules (set): Newly created spelling Rule objects (strength -1,
            i.e. not yet assessed).

    Raises:
        None

    """

    rules = set()

    for gram in kgrams:
        # Strength -1 marks a rule whose performance has not been rated yet.
        rule = Rule(Rule.Type.spelling, gram, -1)
        if rule not in allrules:  # idiomatic membership test (was `not rule in`)
            rules.add(rule)
            rule.iteration = iteration

    return rules
Пример #5
0
def test_rule_str():
    """str() of a Rule must render its type, contents, and strength."""

    sample = Rule(Rule.Type.left_context, 'ki', 0.989010989)
    rendered = str(sample)

    assert rendered == 'Rule(type=left_context, rule=ki, strength=0.989010989)'
Пример #6
0
def test_rule_init():
    """A freshly constructed Rule exposes the expected initial state."""

    fresh = Rule(Rule.Type.left_context, 'ki', 0.989010989)

    assert fresh.type == Rule.Type.left_context
    assert fresh.contents == 'ki'
    assert fresh.strength == 0.989010989
    # New rules start with a single occurrence and an unset iteration.
    assert fresh.occurrences == 1
    assert fresh.iteration == -1
Пример #7
0
def test_rulefilter_main():
    """rulefilter.main must keep only the strongest rules up to the cap."""

    strengths = np.linspace(0.1, 0.8, num=8)

    # Candidate pool: one rule per strength value.
    candidates = {
        Rule(Rule.Type.unset, "test-{}".format(s), s) for s in strengths
    }

    # Survivors: only the top five strengths should make it through.
    survivors = {
        Rule(Rule.Type.unset, "test-{}".format(s), s) for s in strengths[3:]
    }

    filtered = sorted(rulefilter.main(candidates, 5), reverse=True)
    assert filtered == sorted(survivors, reverse=True)
Пример #8
0
def main(corpus, rules, names, max_rules, iteration, options, display):
    """
    Generate contextual rules from a set of identified name tokens.

    Needs the corpus as well as the name set in order to assess the
    performance of any rules it finds from the names.

    Args:
        corpus (set): Set of all Token objects in the corpus.
        rules (set): Set of all Rule objects that have been found so far.
        names (set): Set of Token objects to derive new context rules from.
        max_rules (int): maximum number of rules to be accepted each iteration.
        iteration (int): what iteration is the algorithm currently on?
        options (Options): collection of configuration options
        display: Progress reporter passed through to rulesperformance.main.

    Returns:
        new_rules (set): Set of Rule objects.

    Raises:
        None

    """

    new_rules = set()

    for name in names:
        # New rules are assumed to have no strength (-1) until they are
        # assessed later by scanning the whole corpus.

        # Create left context rule
        left_context = Rule(Rule.Type.left_context, str(name.left_context), -1)
        # Create right context rule
        right_context = Rule(Rule.Type.right_context, str(name.right_context), -1)

        # No redundant rules allowed!
        if left_context not in rules:
            left_context.iteration = iteration
            new_rules.add(left_context)

        if right_context not in rules:
            right_context.iteration = iteration
            new_rules.add(right_context)

    # Rate every candidate against the corpus, then keep only the best
    # max_rules of them.
    rulesperformance.main(corpus, new_rules, options, iteration, display)

    new_rules = rulefilter.main(new_rules, max_rules)

    return new_rules
Пример #9
0
def test_rulesperformance_rateRulePerformance():
    """rateRulePerformance must update a matching rule's strength and count."""

    smoothing = 0.1
    scale = 2.0
    threshold = 0.9

    spelling_rule = Rule(Rule.Type.spelling, 'bb', 1.0)

    # One fully-confident name token whose spelling matches the rule.
    token = Token('aa-aa', 'bb-bb', 'cc-cc', Token.Type.personal_name)
    token.name_probability = 1.0
    name_set = {token}

    rulesperformance.rateRulePerformance(name_set, spelling_rule, smoothing,
                                         scale, threshold)

    # Smoothed strength for a single perfect match.
    assert spelling_rule.strength == (1.0 + smoothing) / (1.0 + scale * smoothing)
    assert spelling_rule.occurrences == 1.0