Пример #1
0
def test_encode_decode_unknown_huffman():
    """Round-trip a pattern with an out-of-vocabulary word through HuffmanEncoder.

    The encoder is built with an ``unknown`` token and a 'form' dictionary
    that does not contain 'The'. After encoding and decoding, the pattern is
    expected to equal the input with 'The' replaced by the unknown token.
    """
    unknown_token = "__unknown__"

    test = HuffmanEncoder({'form': {
        'fox': 1,
        'quick': 2,
        'brown': 3
    }},
                          SNGram,
                          unknown=unknown_token)

    pattern_list = [
        PatternElement('fox', 'form'), SNGram.LEFT_BRACKET,
        PatternElement('The', 'form'), SNGram.COMMA,
        PatternElement('quick', 'form'), SNGram.COMMA,
        PatternElement('brown', 'form'), SNGram.RIGHT_BRACKET
    ]
    pattern = SNGram.from_element_list(pattern_list)

    # Work on a copy: assigning into an alias of ``pattern_list`` would also
    # change the list the input pattern was built from, and if
    # ``from_element_list`` keeps a reference to that list the input would no
    # longer contain the unknown word this test is about.
    expected_pattern_list = list(pattern_list)
    expected_pattern_list[2] = PatternElement(unknown_token, 'form')
    expected_pattern = SNGram.from_element_list(expected_pattern_list)

    assert test.decode(test.encode(pattern)) == expected_pattern
Пример #2
0
def case_changed_special():
    """Test case: "The quick brown fox" with custom special elements.

    The TokenSNGram is created with "(" and ")" as brackets and "_" as comma.

    Returns:
        A pair ``(ngram, expected)``: the TokenSNGram built from the first
        tree returned by ``conllu.parse_tree`` and a dict with the expected
        "length", "str", "repr" and "profiles" values.
    """
    conllu_text = """
# text = The quick brown fox
1   The     the    DET    DT   Definite=Def|PronType=Art   4   det     _   _
2   quick   quick  ADJ    JJ   Degree=Pos                  4   amod    _   _
3   brown   brown  ADJ    JJ   Degree=Pos                  4   amod    _   _
4   fox     fox    NOUN   NN   Number=Sing                 0   nsubj   _   _

"""

    root = conllu.parse_tree(conllu_text)[0]
    ngram = TokenSNGram(root, left_bracket="(", right_bracket=")", comma="_")

    expected_repr = [
        PatternElement('fox', 'form', 4),
        "(",
        PatternElement('The', 'form', 1),
        "_",
        PatternElement('quick', 'form', 2),
        "_",
        PatternElement('brown', 'form', 3),
        ")",
    ]
    expected = {
        "length": 4,
        "str": "fox (The_ quick_ brown)",
        "repr": expected_repr,
        "profiles": {"form ( form _ form _ form )"},
    }
    return ngram, expected
Пример #3
0
def case_fox():
    """Test case: "The quick brown fox" with the default special elements.

    Returns:
        A pair ``(ngram, expected)``: the TokenSNGram built from the first
        tree returned by ``conllu.parse_tree`` and a dict with the expected
        "length", "str", "repr" and "profiles" values.
    """
    conllu_text = """
# text = The quick brown fox
1   The     the    DET    DT   Definite=Def|PronType=Art   4   det     _   _
2   quick   quick  ADJ    JJ   Degree=Pos                  4   amod    _   _
3   brown   brown  ADJ    JJ   Degree=Pos                  4   amod    _   _
4   fox     fox    NOUN   NN   Number=Sing                 0   nsubj   _   _

"""

    root = conllu.parse_tree(conllu_text)[0]
    ngram = TokenSNGram(root)

    expected_repr = [
        PatternElement('fox', 'form', 4),
        SNGram.LEFT_BRACKET,
        PatternElement('The', 'form', 1),
        SNGram.COMMA,
        PatternElement('quick', 'form', 2),
        SNGram.COMMA,
        PatternElement('brown', 'form', 3),
        SNGram.RIGHT_BRACKET,
    ]
    expected = {
        "length": 4,
        "str": "fox [The, quick, brown]",
        "repr": expected_repr,
        "profiles": {"form [ form , form , form ]"},
    }
    return ngram, expected
Пример #4
0
    def __init__(self,
                 frequency_dictionaries,
                 pattern_type,
                 special_weight=1,
                 unknown=None):
        """Build the Huffman code table from per-level frequency dictionaries.

        Args:
            frequency_dictionaries: mapping ``level -> {word: frequency}``.
            pattern_type: pattern class; its ``specialElements()`` are added
                to the code table.
            special_weight: factor applied to the highest word frequency to
                obtain the frequency assigned to the special elements.
            unknown: optional form for unknown words; when given, one
                ``PatternElement(unknown, level)`` per level is added with the
                same frequency as the special elements.
        """
        self.pattern_type = pattern_type

        weights = {}
        highest = 0

        self.levels = set(frequency_dictionaries.keys())

        # Regular vocabulary: one symbol per (word, level) pair.
        for level, entries in frequency_dictionaries.items():
            for word, freq in entries.items():
                highest = max(highest, freq)
                weights[PatternElement(word, level)] = freq

        # Special elements share one frequency derived from the top word.
        special_frequency = highest * special_weight
        for special in self.pattern_type.specialElements():
            weights[special] = special_frequency

        self.unknown = unknown
        if self.unknown is not None:
            for level in frequency_dictionaries:
                weights[PatternElement(unknown, level)] = special_frequency

        # Token delimiters get the larger of the two reference frequencies.
        delimiter_frequency = max(highest, special_frequency)
        weights[self.token_start] = delimiter_frequency
        weights[self.token_end] = delimiter_frequency

        self.huffman_dict = bitarray.util.huffman_code(weights)
Пример #5
0
def case_sidorov2():
    """Test case: "y le di un par de vueltas de_mala_gana" (five tokens).

    Returns:
        A pair ``(ngram, expected)``: the TokenSNGram built from the first
        tree returned by ``conllu.parse_tree`` and a dict with the expected
        "length", "str", "repr" and "profiles" values.
    """
    conllu_text = """
# text = y le di un par de vueltas de_mala_gana
1   y              _  _  _  _  0  _  _  _
2   le             _  _  _  _  3  _  _  _
3   di             _  _  _  _  1  _  _  _
4   par            _  _  _  _  3  _  _  _
5   de_mala_gana   _  _  _  _  3  _  _  _

"""
    root = conllu.parse_tree(conllu_text)[0]
    ngram = TokenSNGram(root)

    expected_repr = [
        PatternElement('y', 'form', 1),
        PatternElement('di', 'form', 3),
        SNGram.LEFT_BRACKET,
        PatternElement('le', 'form', 2),
        SNGram.COMMA,
        PatternElement('par', 'form', 4),
        SNGram.COMMA,
        PatternElement('de_mala_gana', 'form', 5),
        SNGram.RIGHT_BRACKET,
    ]
    expected = {
        "length": 5,
        "str": "y di [le, par, de_mala_gana]",
        "repr": expected_repr,
        "profiles": {"form form [ form , form , form ]"},
    }
    return ngram, expected
Пример #6
0
    def _encode_to_bitarray(self, pattern):
        """Encode the elements of ``pattern`` with the Huffman dictionary.

        The whole pattern is encoded in one call first. If that raises
        ``ValueError``, encoding restarts from an empty bitarray and proceeds
        element by element:

        * an element with an ``items`` attribute is treated as a full token
          (a dict ``level -> value``) and encoded recursively as
          ``token_start``, one ``PatternElement`` per entry, ``token_end``;
        * any other element that cannot be encoded is replaced by
          ``PatternElement(self.unknown, element.level)`` when an unknown
          form is configured.

        Args:
            pattern: iterable of pattern elements and/or token dicts.

        Returns:
            The bitarray holding the encoded pattern.

        Raises:
            EncodeError: if an element cannot be encoded and no unknown form
                is configured.
        """
        code = bitarray.bitarray()

        try:
            code.encode(self.huffman_dict, pattern)
        except ValueError:
            # Start over with a fresh bitarray and handle elements one by one.
            code = bitarray.bitarray()
            for element in pattern:
                if hasattr(element, 'items'):
                    ## encode a whole token (a dict)
                    code.extend(
                        self._encode_to_bitarray([self.token_start] + [
                            PatternElement(value, level)
                            for level, value in element.items()
                        ] + [self.token_end]))
                    continue

                try:
                    code.encode(self.huffman_dict, [element])
                except ValueError as error:
                    if self.unknown is None:
                        # Report the element that actually failed (the error
                        # of the whole-pattern attempt may name a different
                        # one) and keep the original exception as the cause.
                        raise EncodeError(str(error)) from error
                    code.encode(
                        self.huffman_dict,
                        [PatternElement(self.unknown, element.level)])

        return code
Пример #7
0
def test_encode_decode(encoder):
    """Decoding an encoded pattern must give back an equal pattern."""
    elements = [
        PatternElement('fox', 'form'),
        SNGram.LEFT_BRACKET,
        PatternElement('The', 'form'),
        SNGram.COMMA,
        PatternElement('quick', 'form'),
        SNGram.COMMA,
        PatternElement('brown', 'form'),
        SNGram.RIGHT_BRACKET,
    ]
    original = SNGram.from_element_list(elements)

    round_tripped = encoder.decode(encoder.encode(original))

    assert round_tripped == original
Пример #8
0
def test_encode_decode_different_levels(encoder_dict):
    """Round-trip a pattern mixing 'pos' and 'form' elements with a BitEncoder."""
    test = BitEncoder(encoder_dict, SNGram)

    elements = [
        PatternElement('Noun', 'pos'),
        SNGram.LEFT_BRACKET,
        PatternElement('The', 'form'),
        SNGram.COMMA,
        PatternElement('quick', 'form'),
        SNGram.COMMA,
        PatternElement('brown', 'form'),
        SNGram.RIGHT_BRACKET,
    ]
    original = SNGram.from_element_list(elements)

    round_tripped = test.decode(test.encode(original))

    assert round_tripped == original
Пример #9
0
def test_encode_decode_with_full_token(encoder):
    """Round-trip a pattern in which two elements are full tokens (dicts)."""
    elements = [
        {'form': 'fox'},
        SNGram.LEFT_BRACKET,
        PatternElement('The', 'form'),
        SNGram.COMMA,
        PatternElement('quick', 'form'),
        SNGram.COMMA,
        {'form': 'brown'},
        SNGram.RIGHT_BRACKET,
    ]
    original = SNGram.from_element_list(elements)

    round_tripped = encoder.decode(encoder.encode(original))

    assert round_tripped == original
Пример #10
0
def test_encode_unknown_not_set_huffman():
    """A HuffmanEncoder without an unknown form must reject a word not in its dictionary."""
    frequencies = {'form': {'fox': 1, 'quick': 2, 'brown': 3}}
    test = HuffmanEncoder(frequencies, SNGram)

    # 'The' is not part of the frequency dictionary above.
    elements = [
        PatternElement('fox', 'form'),
        SNGram.LEFT_BRACKET,
        PatternElement('The', 'form'),
        SNGram.COMMA,
        PatternElement('quick', 'form'),
        SNGram.COMMA,
        PatternElement('brown', 'form'),
        SNGram.RIGHT_BRACKET,
    ]
    pattern = SNGram.from_element_list(elements)

    with pytest.raises(EncodeError):
        test.encode(pattern)
Пример #11
0
def test_encode_unknown_not_set_bitencoder():
    """A BitEncoder without an unknown form must reject a word not in its dictionary."""
    vocabulary = set(['fox', 'quick', 'brown'])
    test = BitEncoder({'form': vocabulary}, SNGram)

    # 'The' is not part of the vocabulary above.
    elements = [
        PatternElement('fox', 'form'),
        SNGram.LEFT_BRACKET,
        PatternElement('The', 'form'),
        SNGram.COMMA,
        PatternElement('quick', 'form'),
        SNGram.COMMA,
        PatternElement('brown', 'form'),
        SNGram.RIGHT_BRACKET,
    ]
    pattern = SNGram.from_element_list(elements)

    with pytest.raises(EncodeError):
        test.encode(pattern)
Пример #12
0
def case_apples():
    """Test case: "apples, pears, oranges, and bananas." (eight tokens).

    Returns:
        A pair ``(ngram, expected)``: the TokenSNGram built from the first
        tree returned by ``conllu.parse_tree`` and a dict with the expected
        "length", "str", "repr" and "profiles" values.
    """
    conllu_text = """
# text = apples, pears, oranges, and bananas.
1   apples   apple  NOUN    NN   Number=Plur                  0   obj    _   _
2   ,     ,    PUNCT   ,   _                 3   punct   _   _
3   pears     pear    NOUN   NN   Number=Plur                 1   conj   _   _
4   ,     ,    PUNCT   ,   _    5   punct   _   _
5   oranges     orange    NOUN   NN   Number=Plur                 1   conj   _   _
6   ,     ,    PUNCT   ,   _                 8   punct   _   _
7   and   and   SCONJ   CC  _   8   cc    _   _
8   bananas    banana   NOUN    NN   Number=Plur                           1   conj    _   _

"""
    root = conllu.parse_tree(conllu_text)[0]
    ngram = TokenSNGram(root)

    expected_repr = [
        PatternElement('apples', 'form', 1),
        SNGram.LEFT_BRACKET,
        PatternElement('pears', 'form', 3),
        PatternElement(',', 'form', 2),
        SNGram.COMMA,
        PatternElement('oranges', 'form', 5),
        PatternElement(',', 'form', 4),
        SNGram.COMMA,
        PatternElement('bananas', 'form', 8),
        SNGram.LEFT_BRACKET,
        PatternElement(',', 'form', 6),
        SNGram.COMMA,
        PatternElement('and', 'form', 7),
        SNGram.RIGHT_BRACKET,
        SNGram.RIGHT_BRACKET,
    ]
    expected = {
        "length": 8,
        "str": "apples [pears,, oranges,, bananas [,, and]]",
        "repr": expected_repr,
        "profiles": {"form [ form form , form form , form [ form , form ] ]"},
    }
    return ngram, expected
Пример #13
0
def test_append(encoder):
    """Appending item codes one by one must decode to the full pattern."""
    elements = [
        PatternElement('fox', 'form'),
        SNGram.LEFT_BRACKET,
        PatternElement('The', 'form'),
        SNGram.COMMA,
        PatternElement('quick', 'form'),
        SNGram.COMMA,
        PatternElement('brown', 'form'),
        SNGram.RIGHT_BRACKET,
    ]
    expected_pattern = SNGram.from_element_list(elements)

    # Start from an empty encoding and grow it element by element.
    encoded = b''
    for item in elements:
        item_code = encoder.encode_item(item)
        encoded = encoder.append(encoded, item_code)

    assert encoder.decode(encoded) == expected_pattern
Пример #14
0
def test_create_encoder():
    """Run the ``bin/create_encoder`` script and check the encoder it writes.

    The script is called with the example dictionary, an output file name and
    the example config. The written encoder is loaded, wrapped in a
    Base64Encoder, and every (word, level) pair of the input dictionary must
    survive an ``encode_item`` / ``decode`` round trip.
    """
    # Absolute paths are resolved before entering the isolated filesystem.
    infile_path = os.path.abspath(
        'example_data/example_data_dict_filtered.json')
    configfile_path = os.path.abspath('example_data/example_config.json')
    script_file = os.path.abspath('bin/create_encoder')

    runner = CliRunner()
    with runner.isolated_filesystem():

        outfile = "example_data_encoder"

        exit_status = os.system(script_file + " " + infile_path + " " +
                                outfile + " " + configfile_path)
        # Fail here with a clear message instead of later on a missing file.
        assert exit_status == 0, (
            "create_encoder exited with status " + str(exit_status))

        # Context managers make sure both files are closed again.
        with open(outfile, 'rb') as encoder_file:
            encoder = Base64Encoder(PatternEncoder.load(encoder_file))
        with open(infile_path, 'r') as dict_file:
            dict_ = json.load(dict_file)

        pattern_elements = [
            PatternElement(word, level) for level, elements in dict_.items()
            for word in elements
        ]
        results = [
            encoder.decode(encoder.encode_item(pe)).get_element_list()[0] == pe
            for pe in pattern_elements
        ]

    assert all(results)
Пример #15
0
def test_huffman_encode_unknown_item(freq_dict):
    """``encode_item`` must raise EncodeError when no unknown form is configured.

    The element ('unknown', 'form') is expected to be missing from the
    ``freq_dict`` fixture.
    """
    encoder_under_test = HuffmanEncoder(freq_dict, SNGram)
    missing_element = PatternElement('unknown', 'form')

    with pytest.raises(EncodeError):
        encoder_under_test.encode_item(missing_element)
Пример #16
0
def case_jumps():
    """Test case: "The quick brown fox jumps over the lazy dog." (ten tokens).

    Returns:
        A pair ``(ngram, expected)``: the TokenSNGram built from the first
        tree returned by ``conllu.parse_tree`` and a dict with the expected
        "length", "str", "repr" and "profiles" values.
    """
    conllu_text = """
# text = The quick brown fox jumps over the lazy dog.
1   The     the    DET    DT   Definite=Def|PronType=Art   4   det     _   _
2   quick   quick  ADJ    JJ   Degree=Pos                  4   amod    _   _
3   brown   brown  ADJ    JJ   Degree=Pos                  4   amod    _   _
4   fox     fox    NOUN   NN   Number=Sing                 5   nsubj   _   _
5   jumps   jump   VERB   VBZ  Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin   0   root    _   _
6   over    over   ADP    IN   _                           9   case    _   _
7   the     the    DET    DT   Definite=Def|PronType=Art   9   det     _   _
8   lazy    lazy   ADJ    JJ   Degree=Pos                  9   amod    _   _
9   dog     dog    NOUN   NN   Number=Sing                 5   nmod    _   SpaceAfter=No
10  .       .      PUNCT  .    _                           5   punct   _   _

"""
    root = conllu.parse_tree(conllu_text)[0]
    ngram = TokenSNGram(root)

    expected_repr = [
        PatternElement('jumps', 'form', 5),
        SNGram.LEFT_BRACKET,
        PatternElement('fox', 'form', 4),
        SNGram.LEFT_BRACKET,
        PatternElement('The', 'form', 1),
        SNGram.COMMA,
        PatternElement('quick', 'form', 2),
        SNGram.COMMA,
        PatternElement('brown', 'form', 3),
        SNGram.RIGHT_BRACKET,
        SNGram.COMMA,
        PatternElement('dog', 'form', 9),
        SNGram.LEFT_BRACKET,
        PatternElement('over', 'form', 6),
        SNGram.COMMA,
        PatternElement('the', 'form', 7),
        SNGram.COMMA,
        PatternElement('lazy', 'form', 8),
        SNGram.RIGHT_BRACKET,
        SNGram.COMMA,
        PatternElement('.', 'form', 10),
        SNGram.RIGHT_BRACKET,
    ]
    expected_profiles = {
        "form [ form [ form , form , form ] , form [ form , form , form ] , form ]"
    }
    expected = {
        "length": 10,
        "str": "jumps [fox [The, quick, brown], dog [over, the, lazy], .]",
        "repr": expected_repr,
        "profiles": expected_profiles,
    }
    return ngram, expected
Пример #17
0
def case_dog():
    """Test case: "over the lazy dog", with token ids running from 6 to 9.

    Returns:
        A pair ``(ngram, expected)``: the TokenSNGram built from the first
        tree returned by ``conllu.parse_tree`` and a dict with the expected
        "length", "str", "repr" and "profiles" values.
    """
    conllu_text = """
# text = over the lazy dog
6   over    over   ADP    IN   _                           9   case    _   _
7   the     the    DET    DT   Definite=Def|PronType=Art   9   det     _   _
8   lazy    lazy   ADJ    JJ   Degree=Pos                  9   amod    _   _
9   dog     dog    NOUN   NN   Number=Sing                 0   nmod    _   SpaceAfter=No

"""
    root = conllu.parse_tree(conllu_text)[0]
    ngram = TokenSNGram(root)

    expected_repr = [
        PatternElement('dog', 'form', 9),
        SNGram.LEFT_BRACKET,
        PatternElement('over', 'form', 6),
        SNGram.COMMA,
        PatternElement('the', 'form', 7),
        SNGram.COMMA,
        PatternElement('lazy', 'form', 8),
        SNGram.RIGHT_BRACKET,
    ]
    expected = {
        "length": 4,
        "str": "dog [over, the, lazy]",
        "repr": expected_repr,
        "profiles": {"form [ form , form , form ]"},
    }
    return ngram, expected
Пример #18
0
    def encode_item(self, item):
        """Encode a single item and return the result of ``_int_2_bytes``.

        The numeric code is chosen as follows:

        * ``token_start`` / ``token_end``: the first and second code after
          the special characters
          (``special_offset + len(special_characters)`` and that value + 1);
        * a special character: its index in ``special_characters`` plus
          ``special_offset``;
        * a pattern element: its entry in ``dictionaries[level]`` plus
          ``level_offsets[level]``;
        * a pattern element missing from the dictionary, when ``unknown`` is
          set: ``len(dictionaries[level])`` plus ``level_offsets[level]``.

        An item without ``level`` / ``form`` attributes is treated as a full
        token (a dict ``level -> value``) and passed on to ``_encode``
        wrapped in ``token_start`` / ``token_end``.

        Raises:
            EncodeError: if the element is not in the dictionary and no
                unknown form is configured.
        """
        if item == self.token_start:
            return self._int_2_bytes(self.special_offset +
                                     len(self.special_characters))

        if item == self.token_end:
            return self._int_2_bytes(self.special_offset +
                                     len(self.special_characters) + 1)

        try:
            position = self.special_characters.index(item)
            numeric_code = position + self.special_offset

        except ValueError:
            # Not a special character, so look it up as a pattern element.
            try:
                word_id = self.dictionaries[item.level][item.form]
                numeric_code = word_id + self.level_offsets[item.level]

            except AttributeError:
                ## it is a token - so encode this
                token_elements = [
                    PatternElement(value, level)
                    for level, value in item.items()
                ]
                return self._encode(
                    [self.token_start, *token_elements, self.token_end])

            except KeyError:
                if self.unknown is None:
                    raise EncodeError("Element not in dictionary: " +
                                      str(item))
                # NOTE(review): an unknown *level* raises KeyError again
                # here instead of EncodeError - confirm this is intended.
                level_size = len(self.dictionaries[item.level])
                numeric_code = level_size + self.level_offsets[item.level]

        return self._int_2_bytes(numeric_code)
Пример #19
0
    def decode(self, encoded_pattern):
        """Decode an encoded pattern back into a ``pattern_type`` object.

        The input is converted to an integer with ``_bytes_2_int`` and
        consumed ``element_size`` bits at a time, lowest bits first, until
        the remaining value is zero. The elements are therefore collected in
        reverse and the list is reversed before it is handed to
        ``pattern_type.from_element_list``.

        Ids that ``_get_word_for_id`` cannot resolve are interpreted as token
        markers: ``len(self._ids_2_words) + 2`` ends a full token and
        ``len(self._ids_2_words) + 1`` starts it. Because of the reversed
        reading the end marker is met first; the elements between the two
        markers are collected into a dict ``level -> word``.

        Raises:
            ValueError: if an id is neither a known element nor a token
                marker, or if a token start marker has no matching end
                marker.
        """
        encoded_pattern = self._bytes_2_int(encoded_pattern)

        pattern = []
        # Bit mask selecting a single element from the encoded integer.
        element_size_int = 2**self.element_size - 1

        in_token = False
        current_token = None

        while encoded_pattern != 0:
            element_id = int(encoded_pattern & element_size_int)
            try:
                word, level = self._get_word_for_id(element_id)
            except Exception:
                # Only the id lookup is guarded, so failures while building
                # elements are no longer reported as unknown keys, and
                # KeyboardInterrupt / SystemExit are no longer swallowed.
                ## end of token - treat as start as pattern is reversed
                if element_id == len(self._ids_2_words) + 2:
                    in_token = True
                    current_token = {}
                ## start of token - treat as end as pattern is reversed
                elif element_id == len(self._ids_2_words) + 1:
                    if not in_token:
                        raise ValueError("Cannot decode pattern, token " +
                                         "start without token end")
                    in_token = False
                    pattern.append(current_token)

                else:
                    raise ValueError("Cannot decode pattern, unknown key " +
                                     str(element_id))
            else:
                if in_token:
                    current_token[level] = word
                elif level is not None:
                    pattern.append(PatternElement(word, level))
                else:
                    pattern.append(word)

            encoded_pattern = encoded_pattern >> self.element_size

        pattern.reverse()
        return self.pattern_type.from_element_list(pattern)
Пример #20
0
def case_jumps_phrases():
    """Test case: a hand-built tree in which two children carry a third Tree argument.

    The 'nsubj' and 'nmod' nodes have no children of their own but are
    created with a further ``SNGram.Tree`` (the 'fox' and 'dog' subtrees) as
    third positional argument.

    Returns:
        A pair ``(ngram, expected)``: the TokenSNGram built from the tree and
        a dict with the expected "length", "str", "repr", "repr_full" and
        "profiles" values.
    """

    def leaf(form, token_id):
        # Node without children.
        return SNGram.Tree({'form': form, 'id': token_id}, [])

    fox_subtree = SNGram.Tree({
        'form': 'fox',
        'id': 4
    }, [leaf('The', 1), leaf('quick', 2), leaf('brown', 3)])
    nsubj_node = SNGram.Tree({'form': 'nsubj', 'id': 4}, [], fox_subtree)

    dog_subtree = SNGram.Tree({
        'form': 'dog',
        'id': 9
    }, [leaf('over', 6), leaf('the', 7), leaf('lazy', 8)])
    nmod_node = SNGram.Tree({'form': 'nmod', 'id': 9}, [], dog_subtree)

    tree = SNGram.Tree({
        'form': 'jumps',
        'id': 5
    }, [nsubj_node, nmod_node, leaf('.', 10)])

    ngram = TokenSNGram(tree)

    expected_repr = [
        PatternElement('jumps', 'form', 5),
        SNGram.LEFT_BRACKET,
        PatternElement('nsubj', 'form', 4),
        SNGram.COMMA,
        PatternElement('nmod', 'form', 9),
        SNGram.COMMA,
        PatternElement('.', 'form', 10),
        SNGram.RIGHT_BRACKET,
    ]
    expected_repr_full = [
        PatternElement('jumps', 'form', 5),
        SNGram.LEFT_BRACKET,
        PatternElement('fox', 'form', 4),
        SNGram.LEFT_BRACKET,
        PatternElement('The', 'form', 1),
        SNGram.COMMA,
        PatternElement('quick', 'form', 2),
        SNGram.COMMA,
        PatternElement('brown', 'form', 3),
        SNGram.RIGHT_BRACKET,
        SNGram.COMMA,
        PatternElement('dog', 'form', 9),
        SNGram.LEFT_BRACKET,
        PatternElement('over', 'form', 6),
        SNGram.COMMA,
        PatternElement('the', 'form', 7),
        SNGram.COMMA,
        PatternElement('lazy', 'form', 8),
        SNGram.RIGHT_BRACKET,
        SNGram.COMMA,
        PatternElement('.', 'form', 10),
        SNGram.RIGHT_BRACKET,
    ]
    expected = {
        "length": 4,
        "str": "jumps [nsubj, nmod, .]",
        "repr": expected_repr,
        "repr_full": expected_repr_full,
        "profiles": {"form [ form , form , form ]"},
    }
    return ngram, expected
Пример #21
0
def test_encode_item(encoder):
    """A single encoded item must decode to a pattern holding just that item."""
    item = PatternElement('fox', 'form')
    single_item_pattern = SNGram.from_element_list([item])

    decoded = encoder.decode(encoder.encode_item(item))

    assert decoded == single_item_pattern