def test_encode_decode_unknown_huffman():
    """Round-trip a pattern with an out-of-vocabulary word through HuffmanEncoder.

    'The' is not part of the frequency dictionary handed to the encoder, so
    with ``unknown`` configured the decoded pattern is expected to carry the
    unknown token at that position.
    """
    unknown_token = "__unknown__"
    test = HuffmanEncoder(
        {'form': {'fox': 1, 'quick': 2, 'brown': 3}},
        SNGram,
        unknown=unknown_token)
    pattern_list = [
        PatternElement('fox', 'form'),
        SNGram.LEFT_BRACKET,
        PatternElement('The', 'form'),
        SNGram.COMMA,
        PatternElement('quick', 'form'),
        SNGram.COMMA,
        PatternElement('brown', 'form'),
        SNGram.RIGHT_BRACKET,
    ]
    pattern = SNGram.from_element_list(pattern_list)

    # Work on a copy: assigning into an alias of pattern_list would also
    # rewrite the list that `pattern` was built from, and the assertion could
    # then end up comparing the expectation with itself.
    expected_pattern_list = list(pattern_list)
    expected_pattern_list[2] = PatternElement(unknown_token, 'form')
    expected_pattern = SNGram.from_element_list(expected_pattern_list)

    assert test.decode(test.encode(pattern)) == expected_pattern
def case_changed_special():
    """Fixture: the 'fox' phrase rendered with custom bracket and comma symbols.

    Returns a ``(TokenSNGram, expected)`` pair, where ``expected`` holds the
    length, string form, element list and profiles the s-n-gram should report.
    """
    # CoNLL-U fixture text; the literal is part of the test data.
    data = """ # text = The quick brown fox 1 The the DET DT Definite=Def|PronType=Art 4 det _ _ 2 quick quick ADJ JJ Degree=Pos 4 amod _ _ 3 brown brown ADJ JJ Degree=Pos 4 amod _ _ 4 fox fox NOUN NN Number=Sing 0 nsubj _ _ """
    opening, closing, separator = "(", ")", "_"

    def form(word, token_id):
        return PatternElement(word, 'form', token_id)

    sngram = TokenSNGram(
        conllu.parse_tree(data)[0],
        left_bracket=opening,
        right_bracket=closing,
        comma=separator)
    expected = {
        "length": 4,
        "str": "fox (The_ quick_ brown)",
        "repr": [
            form('fox', 4),
            opening,
            form('The', 1),
            separator,
            form('quick', 2),
            separator,
            form('brown', 3),
            closing,
        ],
        "profiles": {"form ( form _ form _ form )"},
    }
    return sngram, expected
def case_fox():
    """Fixture: 'The quick brown fox' with the default special symbols.

    Returns a ``(TokenSNGram, expected)`` pair, where ``expected`` holds the
    length, string form, element list and profiles the s-n-gram should report.
    """
    # CoNLL-U fixture text; the literal is part of the test data.
    data = """ # text = The quick brown fox 1 The the DET DT Definite=Def|PronType=Art 4 det _ _ 2 quick quick ADJ JJ Degree=Pos 4 amod _ _ 3 brown brown ADJ JJ Degree=Pos 4 amod _ _ 4 fox fox NOUN NN Number=Sing 0 nsubj _ _ """

    def form(word, token_id):
        return PatternElement(word, 'form', token_id)

    sngram = TokenSNGram(conllu.parse_tree(data)[0])
    expected = {
        "length": 4,
        "str": "fox [The, quick, brown]",
        "repr": [
            form('fox', 4),
            SNGram.LEFT_BRACKET,
            form('The', 1),
            SNGram.COMMA,
            form('quick', 2),
            SNGram.COMMA,
            form('brown', 3),
            SNGram.RIGHT_BRACKET,
        ],
        "profiles": {"form [ form , form , form ]"},
    }
    return sngram, expected
def __init__(self,
             frequency_dictionaries,
             pattern_type,
             special_weight=1,
             unknown=None):
    """Build the Huffman code table from per-level word frequencies.

    Args:
        frequency_dictionaries: mapping ``level -> {word: frequency}``.
        pattern_type: pattern class; its ``specialElements()`` are added
            to the code table.
        special_weight: factor applied to the highest word frequency to
            obtain the frequency assigned to special elements.
        unknown: optional placeholder word; when given, one
            ``PatternElement(unknown, level)`` entry is added per level.
    """
    self.pattern_type = pattern_type
    self.levels = set(frequency_dictionaries.keys())

    # Entries are added in a fixed order (words, specials, unknowns, token
    # markers); keep that order when touching this code.
    weights = {}
    highest = 0
    for level, words in frequency_dictionaries.items():
        for word, freq in words.items():
            if highest < freq:
                highest = freq
            weights[PatternElement(word, level)] = freq

    special_frequency = highest * special_weight
    weights.update(
        (special, special_frequency)
        for special in self.pattern_type.specialElements())

    self.unknown = unknown
    if self.unknown is not None:
        for level in frequency_dictionaries:
            weights[PatternElement(unknown, level)] = special_frequency

    # Token delimiters get at least the weight of the most frequent word.
    marker_frequency = max(highest, special_frequency)
    weights[self.token_start] = marker_frequency
    weights[self.token_end] = marker_frequency

    self.huffman_dict = bitarray.util.huffman_code(weights)
def case_sidorov2():
    """Fixture: 'y le di un par de vueltas de_mala_gana' (chained root).

    Returns a ``(TokenSNGram, expected)`` pair, where ``expected`` holds the
    length, string form, element list and profiles the s-n-gram should report.
    """
    # CoNLL-U fixture text; the literal is part of the test data.
    data = """ # text = y le di un par de vueltas de_mala_gana 1 y _ _ _ _ 0 _ _ _ 2 le _ _ _ _ 3 _ _ _ 3 di _ _ _ _ 1 _ _ _ 4 par _ _ _ _ 3 _ _ _ 5 de_mala_gana _ _ _ _ 3 _ _ _ """

    def form(word, token_id):
        return PatternElement(word, 'form', token_id)

    sngram = TokenSNGram(conllu.parse_tree(data)[0])
    expected = {
        "length": 5,
        "str": "y di [le, par, de_mala_gana]",
        "repr": [
            form('y', 1),
            form('di', 3),
            SNGram.LEFT_BRACKET,
            form('le', 2),
            SNGram.COMMA,
            form('par', 4),
            SNGram.COMMA,
            form('de_mala_gana', 5),
            SNGram.RIGHT_BRACKET,
        ],
        "profiles": {"form form [ form , form , form ]"},
    }
    return sngram, expected
def _encode_to_bitarray(self, pattern):
    """Encode ``pattern`` with the Huffman table and return the bitarray.

    The whole pattern is first encoded in a single call. If that raises
    ``ValueError`` the result is discarded and the pattern is encoded
    element by element:

    * elements exposing ``items`` are treated as full tokens; they are
      wrapped in ``token_start`` / ``token_end`` and encoded recursively;
    * any other element that cannot be encoded is replaced by the unknown
      element of its level when ``self.unknown`` is set.

    Raises:
        EncodeError: an element cannot be encoded and no unknown
            placeholder is configured. The message is taken from the
            ``ValueError`` of the initial whole-pattern attempt.
    """
    bits = bitarray.bitarray()
    try:
        bits.encode(self.huffman_dict, pattern)
    except ValueError as bulk_error:
        # Start over with an empty buffer for the per-element pass.
        bits = bitarray.bitarray()
        for element in pattern:
            if hasattr(element, 'items'):
                ## encode a whole token (a dict)
                bits.extend(
                    self._encode_to_bitarray([self.token_start] + [
                        PatternElement(value, level)
                        for level, value in element.items()
                    ] + [self.token_end]))
                continue
            try:
                bits.encode(self.huffman_dict, [element])
            except ValueError:
                if self.unknown is None:
                    raise EncodeError(str(bulk_error))
                bits.encode(
                    self.huffman_dict,
                    [PatternElement(self.unknown, element.level)])
    return bits
def test_encode_decode(encoder):
    """Decoding an encoded flat s-n-gram must give back an equal pattern."""
    elements = [
        PatternElement('fox', 'form'),
        SNGram.LEFT_BRACKET,
        PatternElement('The', 'form'),
        SNGram.COMMA,
        PatternElement('quick', 'form'),
        SNGram.COMMA,
        PatternElement('brown', 'form'),
        SNGram.RIGHT_BRACKET,
    ]
    pattern = SNGram.from_element_list(elements)

    round_tripped = encoder.decode(encoder.encode(pattern))

    assert round_tripped == pattern
def test_encode_decode_different_levels(encoder_dict):
    """A BitEncoder round trip must work when elements mix 'pos' and 'form'."""
    elements = [
        PatternElement('Noun', 'pos'),
        SNGram.LEFT_BRACKET,
        PatternElement('The', 'form'),
        SNGram.COMMA,
        PatternElement('quick', 'form'),
        SNGram.COMMA,
        PatternElement('brown', 'form'),
        SNGram.RIGHT_BRACKET,
    ]
    test = BitEncoder(encoder_dict, SNGram)
    pattern = SNGram.from_element_list(elements)

    round_tripped = test.decode(test.encode(pattern))

    assert round_tripped == pattern
def test_encode_decode_with_full_token(encoder):
    """Round trip of a pattern that mixes full tokens (dicts) with elements."""
    head_token = {'form': 'fox'}
    last_token = {'form': 'brown'}
    pattern = SNGram.from_element_list([
        head_token,
        SNGram.LEFT_BRACKET,
        PatternElement('The', 'form'),
        SNGram.COMMA,
        PatternElement('quick', 'form'),
        SNGram.COMMA,
        last_token,
        SNGram.RIGHT_BRACKET,
    ])

    round_tripped = encoder.decode(encoder.encode(pattern))

    assert round_tripped == pattern
def test_encode_unknown_not_set_huffman():
    """Without an unknown placeholder, HuffmanEncoder must raise EncodeError.

    'The' is missing from the frequency dictionary given to the encoder.
    """
    frequencies = {'form': {'fox': 1, 'quick': 2, 'brown': 3}}
    test = HuffmanEncoder(frequencies, SNGram)
    pattern = SNGram.from_element_list([
        PatternElement('fox', 'form'),
        SNGram.LEFT_BRACKET,
        PatternElement('The', 'form'),
        SNGram.COMMA,
        PatternElement('quick', 'form'),
        SNGram.COMMA,
        PatternElement('brown', 'form'),
        SNGram.RIGHT_BRACKET,
    ])

    with pytest.raises(EncodeError):
        test.encode(pattern)
def test_encode_unknown_not_set_bitencoder():
    """Without an unknown placeholder, BitEncoder must raise EncodeError.

    'The' is missing from the word set given to the encoder.
    """
    vocabulary = {'form': {'fox', 'quick', 'brown'}}
    test = BitEncoder(vocabulary, SNGram)
    pattern = SNGram.from_element_list([
        PatternElement('fox', 'form'),
        SNGram.LEFT_BRACKET,
        PatternElement('The', 'form'),
        SNGram.COMMA,
        PatternElement('quick', 'form'),
        SNGram.COMMA,
        PatternElement('brown', 'form'),
        SNGram.RIGHT_BRACKET,
    ])

    with pytest.raises(EncodeError):
        test.encode(pattern)
def case_apples():
    """Fixture: 'apples, pears, oranges, and bananas.' (punctuation tokens).

    Returns a ``(TokenSNGram, expected)`` pair, where ``expected`` holds the
    length, string form, element list and profiles the s-n-gram should report.
    """
    # CoNLL-U fixture text; the literal is part of the test data.
    data = """ # text = apples, pears, oranges, and bananas. 1 apples apple NOUN NN Number=Plur 0 obj _ _ 2 , , PUNCT , _ 3 punct _ _ 3 pears pear NOUN NN Number=Plur 1 conj _ _ 4 , , PUNCT , _ 5 punct _ _ 5 oranges orange NOUN NN Number=Plur 1 conj _ _ 6 , , PUNCT , _ 8 punct _ _ 7 and and SCONJ CC _ 8 cc _ _ 8 bananas banana NOUN NN Number=Plur 1 conj _ _ """

    def form(word, token_id):
        return PatternElement(word, 'form', token_id)

    sngram = TokenSNGram(conllu.parse_tree(data)[0])
    expected = {
        "length": 8,
        "str": "apples [pears,, oranges,, bananas [,, and]]",
        "repr": [
            form('apples', 1),
            SNGram.LEFT_BRACKET,
            form('pears', 3),
            form(',', 2),
            SNGram.COMMA,
            form('oranges', 5),
            form(',', 4),
            SNGram.COMMA,
            form('bananas', 8),
            SNGram.LEFT_BRACKET,
            form(',', 6),
            SNGram.COMMA,
            form('and', 7),
            SNGram.RIGHT_BRACKET,
            SNGram.RIGHT_BRACKET,
        ],
        "profiles": {
            "form [ form form , form form , form [ form , form ] ]"
        },
    }
    return sngram, expected
def test_append(encoder):
    """Appending item encodings one by one must decode to the full pattern."""
    elements = [
        PatternElement('fox', 'form'),
        SNGram.LEFT_BRACKET,
        PatternElement('The', 'form'),
        SNGram.COMMA,
        PatternElement('quick', 'form'),
        SNGram.COMMA,
        PatternElement('brown', 'form'),
        SNGram.RIGHT_BRACKET,
    ]
    expected_pattern = SNGram.from_element_list(elements)

    # Grow the encoded pattern incrementally, starting from empty bytes.
    encoded = b''
    for item in elements:
        encoded_item = encoder.encode_item(item)
        encoded = encoder.append(encoded, encoded_item)

    assert encoder.decode(encoded) == expected_pattern
def test_create_encoder():
    """Run ``bin/create_encoder`` on the example data and check the result.

    The script is executed inside an isolated working directory. The encoder
    it writes is loaded, wrapped in a Base64Encoder, and every (word, level)
    entry of the input dictionary must survive an encode_item/decode round
    trip.
    """
    # Function-scope import so this test does not depend on the module header.
    import subprocess

    infile_path = os.path.abspath(
        'example_data/example_data_dict_filtered.json')
    configfile_path = os.path.abspath('example_data/example_config.json')
    script_file = os.path.abspath('bin/create_encoder')

    runner = CliRunner()
    with runner.isolated_filesystem():
        outfile = "example_data_encoder"

        # An argument list (no shell) keeps paths containing spaces intact.
        completed = subprocess.run(
            [script_file, infile_path, outfile, configfile_path])
        # Fail here, with a clear message, rather than later on a missing file.
        assert completed.returncode == 0, (
            "create_encoder exited with status " + str(completed.returncode))

        with open(outfile, 'rb') as encoder_file:
            encoder = Base64Encoder(PatternEncoder.load(encoder_file))

        with open(infile_path, 'r') as dict_file:
            dict_ = json.load(dict_file)

        pattern_elements = [
            PatternElement(word, level)
            for level, elements in dict_.items()
            for word in elements.keys()
        ]
        results = [
            encoder.decode(encoder.encode_item(pe)).get_element_list()[0] == pe
            for pe in pattern_elements
        ]
        assert all(results)
def test_huffman_encode_unknown_item(freq_dict):
    """encode_item must raise EncodeError when no unknown placeholder is set.

    NOTE(review): assumes 'unknown' is not a word in ``freq_dict``; confirm
    against the fixture.
    """
    encoder_under_test = HuffmanEncoder(freq_dict, SNGram)
    missing = PatternElement('unknown', 'form')

    with pytest.raises(EncodeError):
        encoder_under_test.encode_item(missing)
def case_jumps():
    """Fixture: 'The quick brown fox jumps over the lazy dog.' (nested tree).

    Returns a ``(TokenSNGram, expected)`` pair, where ``expected`` holds the
    length, string form, element list and profiles the s-n-gram should report.
    """
    # CoNLL-U fixture text; the literal is part of the test data.
    data = """ # text = The quick brown fox jumps over the lazy dog. 1 The the DET DT Definite=Def|PronType=Art 4 det _ _ 2 quick quick ADJ JJ Degree=Pos 4 amod _ _ 3 brown brown ADJ JJ Degree=Pos 4 amod _ _ 4 fox fox NOUN NN Number=Sing 5 nsubj _ _ 5 jumps jump VERB VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 0 root _ _ 6 over over ADP IN _ 9 case _ _ 7 the the DET DT Definite=Def|PronType=Art 9 det _ _ 8 lazy lazy ADJ JJ Degree=Pos 9 amod _ _ 9 dog dog NOUN NN Number=Sing 5 nmod _ SpaceAfter=No 10 . . PUNCT . _ 5 punct _ _ """

    def form(word, token_id):
        return PatternElement(word, 'form', token_id)

    sngram = TokenSNGram(conllu.parse_tree(data)[0])
    expected = {
        "length": 10,
        "str": "jumps [fox [The, quick, brown], dog [over, the, lazy], .]",
        "repr": [
            form('jumps', 5),
            SNGram.LEFT_BRACKET,
            form('fox', 4),
            SNGram.LEFT_BRACKET,
            form('The', 1),
            SNGram.COMMA,
            form('quick', 2),
            SNGram.COMMA,
            form('brown', 3),
            SNGram.RIGHT_BRACKET,
            SNGram.COMMA,
            form('dog', 9),
            SNGram.LEFT_BRACKET,
            form('over', 6),
            SNGram.COMMA,
            form('the', 7),
            SNGram.COMMA,
            form('lazy', 8),
            SNGram.RIGHT_BRACKET,
            SNGram.COMMA,
            form('.', 10),
            SNGram.RIGHT_BRACKET,
        ],
        "profiles": {
            "form [ form [ form , form , form ] , form [ form , form , form ] , form ]"
        },
    }
    return sngram, expected
def case_dog():
    """Fixture: 'over the lazy dog', whose token ids start at 6.

    Returns a ``(TokenSNGram, expected)`` pair, where ``expected`` holds the
    length, string form, element list and profiles the s-n-gram should report.
    """
    # CoNLL-U fixture text; the literal is part of the test data.
    data = """ # text = over the lazy dog 6 over over ADP IN _ 9 case _ _ 7 the the DET DT Definite=Def|PronType=Art 9 det _ _ 8 lazy lazy ADJ JJ Degree=Pos 9 amod _ _ 9 dog dog NOUN NN Number=Sing 0 nmod _ SpaceAfter=No """

    def form(word, token_id):
        return PatternElement(word, 'form', token_id)

    sngram = TokenSNGram(conllu.parse_tree(data)[0])
    expected = {
        "length": 4,
        "str": "dog [over, the, lazy]",
        "repr": [
            form('dog', 9),
            SNGram.LEFT_BRACKET,
            form('over', 6),
            SNGram.COMMA,
            form('the', 7),
            SNGram.COMMA,
            form('lazy', 8),
            SNGram.RIGHT_BRACKET,
        ],
        "profiles": {"form [ form , form , form ]"},
    }
    return sngram, expected
def encode_item(self, item):
    """Encode a single pattern item into its byte representation.

    Lookup order: token start marker, token end marker, special character,
    then the per-level dictionary entry of ``item.level`` / ``item.form``.
    An item without a ``level`` attribute is treated as a full token: its
    ``items()`` are wrapped in the token markers and passed to
    ``self._encode``. A word missing from its dictionary maps to the slot
    right after the last dictionary entry when ``self.unknown`` is set.

    Raises:
        EncodeError: the word is not in the dictionary and no unknown
            placeholder is configured.
    """
    if item == self.token_start:
        return self._int_2_bytes(
            self.special_offset + len(self.special_characters))
    if item == self.token_end:
        return self._int_2_bytes(
            self.special_offset + len(self.special_characters) + 1)

    try:
        code = self.special_characters.index(item) + self.special_offset
    except ValueError:
        # Not a special character: fall back to the word dictionaries.
        try:
            code = (self.dictionaries[item.level][item.form] +
                    self.level_offsets[item.level])
        except AttributeError:
            ## it is a token - so encode this
            return self._encode([self.token_start] + [
                PatternElement(value, level)
                for level, value in item.items()
            ] + [self.token_end])
        except KeyError:
            if self.unknown is None:
                raise EncodeError("Element not in dictionary: " + str(item))
            code = (len(self.dictionaries[item.level]) +
                    self.level_offsets[item.level])
    return self._int_2_bytes(code)
def decode(self, encoded_pattern):
    """Decode a byte-encoded pattern back into a ``pattern_type`` instance.

    The bytes are turned into one integer and consumed ``element_size``
    bits at a time, lowest bits first, so elements come out in reverse
    order and the collected list is reversed at the end. Ids that
    ``_get_word_for_id`` cannot resolve are checked against the two token
    markers; elements between an end and a start marker are gathered into
    one ``{level: word}`` dict.

    Args:
        encoded_pattern: value accepted by ``self._bytes_2_int``.

    Returns:
        ``self.pattern_type.from_element_list`` applied to the decoded
        elements.

    Raises:
        ValueError: an id is neither a known element nor a token marker,
            or a token start marker appears without a preceding token end
            marker.
    """
    remaining = self._bytes_2_int(encoded_pattern)
    element_mask = 2**self.element_size - 1
    pattern = []
    in_token = False
    current_token = None

    while remaining != 0:
        element_id = int(remaining & element_mask)
        try:
            # Only the lookup is guarded, so failures in the code below are
            # not mistaken for "id is a token marker". `Exception` (not a
            # bare except) lets KeyboardInterrupt/SystemExit propagate.
            word, level = self._get_word_for_id(element_id)
        except Exception as lookup_error:
            marker_base = len(self._ids_2_words)
            ## end of token - treat as start as pattern is reversed
            if element_id == marker_base + 2:
                in_token = True
                current_token = {}
            ## start of token - treat as end as pattern is reversed
            elif element_id == marker_base + 1:
                if current_token is None:
                    raise ValueError(
                        "Cannot decode pattern, token start without token end"
                    ) from lookup_error
                in_token = False
                pattern.append(current_token)
                # Each collected token dict is appended exactly once.
                current_token = None
            else:
                raise ValueError("Cannot decode pattern, unknown key " +
                                 str(element_id)) from lookup_error
        else:
            if in_token:
                current_token[level] = word
            elif level is not None:
                pattern.append(PatternElement(word, level))
            else:
                # Elements without a level are stored as plain words.
                pattern.append(word)
        remaining = remaining >> self.element_size

    pattern.reverse()
    return self.pattern_type.from_element_list(pattern)
def case_jumps_phrases():
    """Fixture: 'jumps' tree whose subject and modifier are phrase nodes.

    The 'nsubj' and 'nmod' nodes have no children of their own; each is
    given the full 'fox' / 'dog' subtree as the third ``SNGram.Tree``
    argument. Returns a ``(TokenSNGram, expected)`` pair where ``expected``
    lists both the short ("repr") and the expanded ("repr_full") element
    sequence.
    """

    def leaf(word, token_id):
        return SNGram.Tree({'form': word, 'id': token_id}, [])

    def form(word, token_id):
        return PatternElement(word, 'form', token_id)

    fox_subtree = SNGram.Tree({'form': 'fox', 'id': 4}, [
        leaf('The', 1),
        leaf('quick', 2),
        leaf('brown', 3),
    ])
    subject_phrase = SNGram.Tree({'form': 'nsubj', 'id': 4}, [], fox_subtree)

    dog_subtree = SNGram.Tree({'form': 'dog', 'id': 9}, [
        leaf('over', 6),
        leaf('the', 7),
        leaf('lazy', 8),
    ])
    modifier_phrase = SNGram.Tree({'form': 'nmod', 'id': 9}, [], dog_subtree)

    full_stop = leaf('.', 10)
    tree = SNGram.Tree({'form': 'jumps', 'id': 5},
                       [subject_phrase, modifier_phrase, full_stop])

    sngram = TokenSNGram(tree)
    expected = {
        "length": 4,
        "str": "jumps [nsubj, nmod, .]",
        "repr": [
            form('jumps', 5),
            SNGram.LEFT_BRACKET,
            form('nsubj', 4),
            SNGram.COMMA,
            form('nmod', 9),
            SNGram.COMMA,
            form('.', 10),
            SNGram.RIGHT_BRACKET,
        ],
        "repr_full": [
            form('jumps', 5),
            SNGram.LEFT_BRACKET,
            form('fox', 4),
            SNGram.LEFT_BRACKET,
            form('The', 1),
            SNGram.COMMA,
            form('quick', 2),
            SNGram.COMMA,
            form('brown', 3),
            SNGram.RIGHT_BRACKET,
            SNGram.COMMA,
            form('dog', 9),
            SNGram.LEFT_BRACKET,
            form('over', 6),
            SNGram.COMMA,
            form('the', 7),
            SNGram.COMMA,
            form('lazy', 8),
            SNGram.RIGHT_BRACKET,
            SNGram.COMMA,
            form('.', 10),
            SNGram.RIGHT_BRACKET,
        ],
        "profiles": {"form [ form , form , form ]"},
    }
    return sngram, expected
def test_encode_item(encoder):
    """A single encoded item must decode to a one-element pattern."""
    fox = PatternElement('fox', 'form')
    expected_pattern = SNGram.from_element_list([fox])

    decoded = encoder.decode(encoder.encode_item(fox))

    assert decoded == expected_pattern