Exemplo n.º 1
0
def case_apples():

    data = """
# text = apples, pears, oranges, and bananas.
1   apples   apple  NOUN    NN   Number=Plur                  0   obj    _   _
2   ,     ,    PUNCT   ,   _                 3   punct   _   _
3   pears     pear    NOUN   NN   Number=Plur                 1   conj   _   _
4   ,     ,    PUNCT   ,   _    5   punct   _   _
5   oranges     orange    NOUN   NN   Number=Plur                 1   conj   _   _
6   ,     ,    PUNCT   ,   _                 8   punct   _   _
7   and   and   SCONJ   CC  _   8   cc    _   _
8   bananas    banana   NOUN    NN   Number=Plur                           1   conj    _   _

"""
    return TokenSNGram(conllu.parse_tree(data)[0]), {
        "length":
        8,
        "str":
        "apples [pears,, oranges,, bananas [,, and]]",
        "repr": [
            PatternElement('apples', 'form', 1), SNGram.LEFT_BRACKET,
            PatternElement('pears', 'form', 3),
            PatternElement(',', 'form', 2), SNGram.COMMA,
            PatternElement('oranges', 'form', 5),
            PatternElement(',', 'form', 4), SNGram.COMMA,
            PatternElement('bananas', 'form', 8), SNGram.LEFT_BRACKET,
            PatternElement(',', 'form', 6), SNGram.COMMA,
            PatternElement('and', 'form', 7), SNGram.RIGHT_BRACKET,
            SNGram.RIGHT_BRACKET
        ],
        "profiles":
        set(["form [ form form , form form , form [ form , form ] ]"])
    }
Exemplo n.º 2
0
def case_fox():

    data = """
# text = The quick brown fox
1   The     the    DET    DT   Definite=Def|PronType=Art   4   det     _   _
2   quick   quick  ADJ    JJ   Degree=Pos                  4   amod    _   _
3   brown   brown  ADJ    JJ   Degree=Pos                  4   amod    _   _
4   fox     fox    NOUN   NN   Number=Sing                 0   nsubj   _   _

"""

    return TokenSNGram(conllu.parse_tree(data)[0]), {
        "length":
        4,
        "str":
        "fox [The, quick, brown]",
        "repr": [
            PatternElement('fox', 'form', 4), SNGram.LEFT_BRACKET,
            PatternElement('The', 'form', 1), SNGram.COMMA,
            PatternElement('quick', 'form', 2), SNGram.COMMA,
            PatternElement('brown', 'form', 3), SNGram.RIGHT_BRACKET
        ],
        "profiles":
        set(["form [ form , form , form ]"])
    }
Exemplo n.º 3
0
def case_sidorov2():

    data = """
# text = y le di un par de vueltas de_mala_gana
1   y              _  _  _  _  0  _  _  _
2   le             _  _  _  _  3  _  _  _
3   di             _  _  _  _  1  _  _  _
4   par            _  _  _  _  3  _  _  _
5   de_mala_gana   _  _  _  _  3  _  _  _

"""
    return TokenSNGram(conllu.parse_tree(data)[0]), {
        "length":
        5,
        "str":
        "y di [le, par, de_mala_gana]",
        "repr": [
            PatternElement('y', 'form', 1),
            PatternElement('di', 'form', 3), SNGram.LEFT_BRACKET,
            PatternElement('le', 'form', 2), SNGram.COMMA,
            PatternElement('par', 'form', 4), SNGram.COMMA,
            PatternElement('de_mala_gana', 'form', 5), SNGram.RIGHT_BRACKET
        ],
        "profiles":
        set(["form form [ form , form , form ]"])
    }
Exemplo n.º 4
0
def case_changed_special():

    data = """
# text = The quick brown fox
1   The     the    DET    DT   Definite=Def|PronType=Art   4   det     _   _
2   quick   quick  ADJ    JJ   Degree=Pos                  4   amod    _   _
3   brown   brown  ADJ    JJ   Degree=Pos                  4   amod    _   _
4   fox     fox    NOUN   NN   Number=Sing                 0   nsubj   _   _

"""

    return TokenSNGram(conllu.parse_tree(data)[0],
                       left_bracket="(",
                       right_bracket=")",
                       comma="_"), {
                           "length":
                           4,
                           "str":
                           "fox (The_ quick_ brown)",
                           "repr": [
                               PatternElement('fox', 'form', 4), "(",
                               PatternElement('The', 'form', 1), "_",
                               PatternElement('quick', 'form', 2), "_",
                               PatternElement('brown', 'form', 3), ")"
                           ],
                           "profiles":
                           set(["form ( form _ form _ form )"])
                       }
Exemplo n.º 5
0
def case_jumps():

    data = """
# text = The quick brown fox jumps over the lazy dog.
1   The     the    DET    DT   Definite=Def|PronType=Art   4   det     _   _
2   quick   quick  ADJ    JJ   Degree=Pos                  4   amod    _   _
3   brown   brown  ADJ    JJ   Degree=Pos                  4   amod    _   _
4   fox     fox    NOUN   NN   Number=Sing                 5   nsubj   _   _
5   jumps   jump   VERB   VBZ  Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin   0   root    _   _
6   over    over   ADP    IN   _                           9   case    _   _
7   the     the    DET    DT   Definite=Def|PronType=Art   9   det     _   _
8   lazy    lazy   ADJ    JJ   Degree=Pos                  9   amod    _   _
9   dog     dog    NOUN   NN   Number=Sing                 5   nmod    _   SpaceAfter=No
10  .       .      PUNCT  .    _                           5   punct   _   _

"""
    return TokenSNGram(conllu.parse_tree(data)[0]), {
        "length":
        10,
        "str":
        "jumps [fox [The, quick, brown], dog [over, the, lazy], .]",
        "repr": [
            PatternElement('jumps', 'form', 5), SNGram.LEFT_BRACKET,
            PatternElement('fox', 'form', 4), SNGram.LEFT_BRACKET,
            PatternElement('The', 'form', 1), SNGram.COMMA,
            PatternElement('quick', 'form', 2), SNGram.COMMA,
            PatternElement('brown', 'form',
                           3), SNGram.RIGHT_BRACKET, SNGram.COMMA,
            PatternElement('dog', 'form', 9), SNGram.LEFT_BRACKET,
            PatternElement('over', 'form', 6), SNGram.COMMA,
            PatternElement('the', 'form', 7), SNGram.COMMA,
            PatternElement('lazy', 'form', 8), SNGram.RIGHT_BRACKET,
            SNGram.COMMA,
            PatternElement('.', 'form', 10), SNGram.RIGHT_BRACKET
        ],
        "profiles":
        set([
            "form [ form [ form , form , form ] , form [ form , form , form ] , form ]"
        ])
    }
Exemplo n.º 6
0
def case_dog():

    data = """
# text = over the lazy dog
6   over    over   ADP    IN   _                           9   case    _   _
7   the     the    DET    DT   Definite=Def|PronType=Art   9   det     _   _
8   lazy    lazy   ADJ    JJ   Degree=Pos                  9   amod    _   _
9   dog     dog    NOUN   NN   Number=Sing                 0   nmod    _   SpaceAfter=No

"""
    return TokenSNGram(conllu.parse_tree(data)[0]), {
        "length":
        4,
        "str":
        "dog [over, the, lazy]",
        "repr": [
            PatternElement('dog', 'form', 9), SNGram.LEFT_BRACKET,
            PatternElement('over', 'form', 6), SNGram.COMMA,
            PatternElement('the', 'form', 7), SNGram.COMMA,
            PatternElement('lazy', 'form', 8), SNGram.RIGHT_BRACKET
        ],
        "profiles":
        set(["form [ form , form , form ]"])
    }
Exemplo n.º 7
0
def case_jumps_phrases():

    tree = SNGram.Tree({
        'form': 'jumps',
        'id': 5
    }, [
        SNGram.Tree({
            'form': 'nsubj',
            'id': 4
        }, [],
                    SNGram.Tree({
                        'form': 'fox',
                        'id': 4
                    }, [
                        SNGram.Tree({
                            'form': 'The',
                            'id': 1
                        }, []),
                        SNGram.Tree({
                            'form': 'quick',
                            'id': 2
                        }, []),
                        SNGram.Tree({
                            'form': 'brown',
                            'id': 3
                        }, []),
                    ])),
        SNGram.Tree({
            'form': 'nmod',
            'id': 9
        }, [],
                    SNGram.Tree({
                        'form': 'dog',
                        'id': 9
                    }, [
                        SNGram.Tree({
                            'form': 'over',
                            'id': 6
                        }, []),
                        SNGram.Tree({
                            'form': 'the',
                            'id': 7
                        }, []),
                        SNGram.Tree({
                            'form': 'lazy',
                            'id': 8
                        }, []),
                    ])),
        SNGram.Tree({
            'form': '.',
            'id': 10
        }, [])
    ])

    return TokenSNGram(tree), {
        "length":
        4,
        "str":
        "jumps [nsubj, nmod, .]",
        "repr": [
            PatternElement('jumps', 'form', 5), SNGram.LEFT_BRACKET,
            PatternElement('nsubj', 'form', 4), SNGram.COMMA,
            PatternElement('nmod', 'form', 9), SNGram.COMMA,
            PatternElement('.', 'form', 10), SNGram.RIGHT_BRACKET
        ],
        "repr_full": [
            PatternElement('jumps', 'form', 5), SNGram.LEFT_BRACKET,
            PatternElement('fox', 'form', 4), SNGram.LEFT_BRACKET,
            PatternElement('The', 'form', 1), SNGram.COMMA,
            PatternElement('quick', 'form', 2), SNGram.COMMA,
            PatternElement('brown', 'form',
                           3), SNGram.RIGHT_BRACKET, SNGram.COMMA,
            PatternElement('dog', 'form', 9), SNGram.LEFT_BRACKET,
            PatternElement('over', 'form', 6), SNGram.COMMA,
            PatternElement('the', 'form', 7), SNGram.COMMA,
            PatternElement('lazy', 'form', 8), SNGram.RIGHT_BRACKET,
            SNGram.COMMA,
            PatternElement('.', 'form', 10), SNGram.RIGHT_BRACKET
        ],
        "profiles":
        set(["form [ form , form , form ]"])
    }