Example #1
    def test_duplicate_tokens_error(self):
        doc = doc1
        match_examples = [
            util.idxs_to_tokens(
                doc, [0, 1, 1, 3])  # [We, introduce, introduce, methods]
        ]
        for match_example in match_examples:
            with pytest.raises(DuplicateTokensError):
                build_dependency_pattern(doc, match_example)
Example #2
    def test_tokens_not_connected_error(self):
        doc = doc1
        match_examples = [
            util.idxs_to_tokens(
                doc, [19, 20, 21, 27])  # [courses, generated, by, models]
        ]
        feature_dict = {'DEP': 'dep_', 'TAG': 'tag_'}
        for match_example in match_examples:
            with pytest.raises(TokensNotFullyConnectedError):
                build_dependency_pattern(doc, match_example, feature_dict)
Example #3
    def test_custom_extension(self):
        Token.set_extension('custom_attr', default=False)
        feature_dict = {'DEP': 'dep_', '_': {'custom_attr': 'custom_attr'}}
        for i, case in enumerate(cases):
            doc = case['example']['doc']
            for token in doc:
                token._.custom_attr = 'my_attr'
            match_example = case['example']['match']
            pattern = build_dependency_pattern(doc, match_example,
                                               feature_dict)
            matches = match.find_matches(doc, pattern)
            assert match_example in matches, 'does not match example'
            pattern_file_name = 'examples/pattern_{}.json'.format(i)
            with open(pattern_file_name, 'w') as f:
                json.dump(pattern, f, indent=2)
            if 'should_hit' in case:
                for item in case['should_hit']:
                    doc = item['doc']
                    hit_match = item['match']
                    matches = match.find_matches(doc, pattern)
                    assert hit_match in matches, 'false negative'
            if 'should_miss' in case:
                for item in case['should_miss']:
                    doc = item['doc']
                    miss_match = item['match']
                    matches = match.find_matches(doc, pattern)
                    assert miss_match not in matches, 'false positive'
Example #4
def yield_tree_level_pattern_variants(role_pattern, training_match,
                                      feature_dict):
    match_tokens = training_match.match_tokens
    extended_match_tokens = spacy_pattern_builder.yield_extended_trees(
        match_tokens)
    doc = util.doc_from_match(training_match)
    for match_tokens in extended_match_tokens:
        match_tokens = sorted(match_tokens, key=lambda token: token.i)
        token_labels = role_pattern_builder.build_pattern_label_list(
            match_tokens, training_match)
        dependency_pattern_variant = spacy_pattern_builder.build_dependency_pattern(
            doc, match_tokens, feature_dict=feature_dict)
        assert (len(dependency_pattern_variant) == len(
            role_pattern.spacy_dep_pattern) + 1)
        assert len(token_labels) == len(role_pattern.token_labels) + 1
        role_pattern_variant = RolePattern(dependency_pattern_variant,
                                           token_labels)
        match_tokens_depth_order = spacy_pattern_builder.util.sort_by_depth(
            match_tokens)  # Should be same order as the dependency pattern
        token_labels_depth_order = role_pattern_builder.build_pattern_label_list(
            match_tokens_depth_order, training_match)
        role_pattern_variant.token_labels_depth_order = token_labels_depth_order
        role_pattern_variant.builder = role_pattern.builder
        new_training_match = RolePatternMatch(training_match)
        new_training_match.match_tokens = match_tokens
        role_pattern_variant.training_match = new_training_match
        yield role_pattern_variant
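
For reference, a hypothetical driver for this generator might look like the sketch below; score_pattern is an illustrative placeholder for whatever evaluation the caller applies, not a function from the source.

# Hypothetical usage sketch: generate the tree-level variants and keep the
# one that scores best. score_pattern is an illustrative placeholder.
variants = yield_tree_level_pattern_variants(role_pattern, training_match,
                                             feature_dict)
best_variant = max(variants, key=score_pattern)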
Example #5
    def test_yield_extended_trees(self):
        # Build initial pattern
        doc = doc1
        match_tokens = util.idxs_to_tokens(
            doc, [0, 1, 3])  # [We, introduce, methods]
        feature_dict = {'DEP': 'dep_', 'TAG': 'tag_', 'LOWER': 'lower_'}
        pattern = build_dependency_pattern(doc, match_tokens, feature_dict)

        match_tokens_variants = list(yield_extended_trees(match_tokens))

        pattern_variants = [
            build_dependency_pattern(doc, match_token_variant, feature_dict)
            for match_token_variant in match_tokens_variants
        ]

        assert not util.list_contains_duplicates(pattern_variants)
        n_variants = len(pattern_variants)
        for pattern_variant, match_tokens_variant in zip(
                pattern_variants, match_tokens_variants):
            matches = match.find_matches(doc, pattern_variant)
            match_tokens_variant = sorted(match_tokens_variant,
                                          key=lambda t: t.i)
            assert match_tokens_variant in matches
Example #6
def build_role_pattern(
    match_example,
    feature_dict=DEFAULT_BUILD_PATTERN_TOKEN_FEATURE_DICT,
    validate_pattern=True,
):
    doc = util.doc_from_match(match_example)
    util.annotate_token_depth(doc)
    tokens = util.flatten_list(match_example.values())
    tokens = [
        doc[idx] for idx in util.token_idxs(tokens)
    ]  # Ensure that tokens have the newly added depth attribute
    nx_graph = util.doc_to_nx_graph(doc)
    match_tokens = util.smallest_connected_subgraph(tokens, nx_graph, doc)
    spacy_dep_pattern = spacy_pattern_builder.build_dependency_pattern(
        doc, match_tokens, feature_dict=feature_dict)
    token_labels = build_pattern_label_list(match_tokens, match_example)
    role_pattern = RolePattern(spacy_dep_pattern, token_labels)
    match_tokens_depth_order = spacy_pattern_builder.util.sort_by_depth(
        match_tokens)  # Should be same order as the dependency pattern
    token_labels_depth_order = build_pattern_label_list(
        match_tokens_depth_order, match_example)
    role_pattern.token_labels_depth_order = token_labels_depth_order
    if validate_pattern:
        pattern_does_match_example, matches = validate.pattern_matches_example(
            role_pattern, match_example)
        if not pattern_does_match_example:
            spacy_dep_pattern = role_pattern.spacy_dep_pattern
            message = [
                'Unable to match example: \n{}'.format(pformat(match_example)),
                'From doc: {}'.format(doc),
                'Constructed role pattern: \n',
                'spacy_dep_pattern: \n{}'.format(pformat(spacy_dep_pattern)),
                'token_labels: \n{}'.format(
                    pformat(role_pattern.token_labels_depth_order)),
            ]
            if matches:
                message.append('Matches found:')
                for match in matches:
                    message += [
                        'Match tokens: \n{}'.format(pformat(
                            match.match_tokens)),
                        'Slots: \n{}'.format(pformat(match)),
                    ]
            else:
                message.append('Matches found: None')
            message = '\n{}'.format('\n'.join(message))
            raise RolePatternDoesNotMatchExample(message)
    return role_pattern
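
A hypothetical call for reference: the sentence, model name, and role labels below are illustrative; the only shape assumed from the code above is that match_example maps role labels to lists of tokens from a single doc.

# Hypothetical usage sketch for build_role_pattern. The sentence, model and
# labels are illustrative; the dict shape (role label -> list of tokens)
# follows from util.flatten_list(match_example.values()) above.
import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp('We introduce efficient methods')
match_example = {
    'agent': [doc[0]],   # We
    'action': [doc[1]],  # introduce
    'theme': [doc[3]],   # methods
}
role_pattern = build_role_pattern(match_example)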
Example #7
    def test_yield_node_level_pattern_variants(self):
        # Build initial pattern
        doc = doc1
        match_tokens = util.idxs_to_tokens(
            doc, [0, 1, 3])  # [We, introduce, methods]
        feature_dict = {'DEP': 'dep_', 'TAG': 'tag_'}
        pattern = build_dependency_pattern(doc, match_tokens, feature_dict)

        feature_dicts = (
            {
                'DEP': 'dep_',
                'TAG': 'tag_'
            },
            {
                'DEP': 'dep_',
                'TAG': 'tag_',
                'LOWER': 'lower_'
            },
        )
        pattern_variants = list(
            yield_node_level_pattern_variants(pattern, match_tokens,
                                              feature_dicts))
        assert not util.list_contains_duplicates(pattern_variants)
        n_variants = len(pattern_variants)
        assert n_variants == len(feature_dicts)**len(pattern)
        for pattern_variant in pattern_variants:
            matches = match.find_matches(doc, pattern_variant)
            assert match_tokens in matches

        # Test mutate_tokens parameter
        pattern_variants = list(
            yield_node_level_pattern_variants(pattern,
                                              match_tokens,
                                              feature_dicts,
                                              mutate_tokens=[match_tokens[1]]))
        n_variants = len(pattern_variants)
        assert n_variants == len(feature_dicts)**len(pattern)
        for pattern_variant in pattern_variants:
            matches = match.find_matches(doc, pattern_variant)
            assert match_tokens in matches
Example #8
    def test_build_pattern(self):
        feature_dict = {'DEP': 'dep_', 'TAG': 'tag_'}
        for i, case in enumerate(cases):
            doc = case['example']['doc']
            match_example = case['example']['match']
            pattern = build_dependency_pattern(doc, match_example,
                                               feature_dict)
            matches = match.find_matches(doc, pattern)
            assert match_example in matches, 'does not match example'
            pattern_file_name = 'examples/pattern_{}.json'.format(i)
            with open(pattern_file_name, 'w') as f:
                json.dump(pattern, f, indent=2)
            if 'should_hit' in case:
                for item in case['should_hit']:
                    doc = item['doc']
                    hit_match = item['match']
                    matches = match.find_matches(doc, pattern)
                    assert hit_match in matches, 'false negative'
            if 'should_miss' in case:
                for item in case['should_miss']:
                    doc = item['doc']
                    miss_match = item['match']
                    matches = match.find_matches(doc, pattern)
                    assert miss_match not in matches, 'false positive'
Example #9
'''Note: the tokens in match_tokens must be fully connected. That is, a path
must exist between each token in the list, without needing to traverse tokens
outside of the list. Otherwise, spacy-pattern-builder will raise a
TokensNotFullyConnectedError. You can get a connected set that includes your
tokens with the following: '''
from spacy_pattern_builder import util
connected_tokens = util.smallest_connected_subgraph(match_tokens, doc)
assert match_tokens == connected_tokens

# Specify the token attributes / features to use
feature_dict = {  # This is equal to the default feature_dict
    'DEP': 'dep_',
    'TAG': 'tag_'
}

# Build the pattern
pattern = build_dependency_pattern(doc,
                                   match_tokens,
                                   feature_dict=feature_dict)

from pprint import pprint
pprint(pattern)  # In the format consumed by spaCy's DependencyTreeMatcher:
'''
[{'PATTERN': {'DEP': 'ROOT', 'TAG': 'VBP'}, 'SPEC': {'NODE_NAME': 'node1'}},
 {'PATTERN': {'DEP': 'nsubj', 'TAG': 'PRP'},
  'SPEC': {'NBOR_NAME': 'node1', 'NBOR_RELOP': '>', 'NODE_NAME': 'node0'}},
 {'PATTERN': {'DEP': 'dobj', 'TAG': 'NNS'},
  'SPEC': {'NBOR_NAME': 'node1', 'NBOR_RELOP': '>', 'NODE_NAME': 'node3'}}]
'''

# Create a matcher and add the newly generated pattern
from spacy.matcher import DependencyTreeMatcher
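
The excerpt ends here. A minimal continuation sketch, assuming spaCy 2.x's DependencyTreeMatcher API (matcher.add(key, on_match, *patterns); calling the matcher on a doc returns (match_id, token_ids) pairs):

# Continuation sketch (not part of the excerpt), based on the spaCy 2.x
# DependencyTreeMatcher API.
matcher = DependencyTreeMatcher(doc.vocab)
matcher.add('pattern', None, pattern)
matches = matcher(doc)  # list of (match_id, [token_ids]) tuples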