def encode_pattern(pattern, token_start, token_end, unknowns):
    """Encode *pattern* element-by-element into a base64 string.

    Each element of the pattern is turned into one or more
    (word, level) pairs, individually base64-decoded via
    ``Base64Encoder.b64decode`` and folded together with
    ``HuffmanEncoder.combine``.  Words that cannot be decoded are
    replaced by the per-level fallback from *unknowns*.

    :param pattern: object exposing ``get_element_list()``
    :param token_start: marker emitted before each dict-like element
    :param token_end: marker emitted after each dict-like element
    :param unknowns: mapping from level name to fallback encoded word
    :return: the combined pattern, base64-encoded as text
    """
    combined = b''
    for elem in pattern.get_element_list():
        if hasattr(elem, 'items'):
            # Dict-like element: wrap the selected levels in start/end markers.
            pairs = [(token_start, None)]
            pairs.extend((word, lvl) for lvl, word in elem.items()
                         if lvl in {'lemma', 'upos', 'deprel'})
            pairs.append((token_end, None))
        else:
            # Special element (string) or a PatternElement: a single pair,
            # falling back to a None level when no 'get' is available.
            getter = getattr(elem, 'get', lambda key, default: None)
            pairs = [(str(elem), getter('level', None))]
        for word, lvl in pairs:
            ## just a quick fix - np_function needs to be handled differently
            if lvl == 'deprel':
                lvl = 'np_function'
            try:
                encoded_word = Base64Encoder.b64decode(word)
            except binascii.Error:
                ## fall back to the per-level unknown token
                encoded_word = Base64Encoder.b64decode(unknowns[lvl])
            combined = HuffmanEncoder.combine(combined, encoded_word)
    return Base64Encoder.b64encode(combined, binary=False)
def test_encode_vocabulary():
    """Run ``bin/encode_vocabulary`` on the example dictionary and verify
    that every encoded entry decodes back to a single-element pattern with
    the original level and form.
    """
    infile_path = os.path.abspath(
        'example_data/example_data_dict_filtered.json')
    encoder_path = os.path.abspath('example_data/example_data_encoder')
    configfile_path = os.path.abspath('example_data/example_config.json')
    script_file = os.path.abspath('bin/encode_vocabulary')
    runner = CliRunner()
    with runner.isolated_filesystem():
        outfile = "example_data_dict_filtered_encoded.json"
        # Fail fast if the script itself errors instead of silently
        # checking a stale/missing output file.
        exit_status = os.system(script_file + " " + infile_path + " " +
                                outfile + " " + encoder_path + " " +
                                configfile_path)
        assert exit_status == 0
        with open(encoder_path, 'rb') as encoder_file:
            encoder = Base64Encoder(PatternEncoder.load(encoder_file))
        with open(outfile, 'r') as result_file:
            result_dict = json.load(result_file)
        results = [(level, decoded, encoder.decode(encoded).get_element_list())
                   for level, elements in result_dict.items()
                   for decoded, encoded in elements.items()
                   if level != "__special__"]
        results = [
            len(pe) == 1 and pe[0].level == level and pe[0].form == word
            for level, word, pe in results
        ]
        assert all(results)
def test_create_encoder():
    """Run ``bin/create_encoder`` on the example dictionary and verify that
    every dictionary entry round-trips through the resulting encoder.
    """
    infile_path = os.path.abspath(
        'example_data/example_data_dict_filtered.json')
    configfile_path = os.path.abspath('example_data/example_config.json')
    script_file = os.path.abspath('bin/create_encoder')
    runner = CliRunner()
    with runner.isolated_filesystem():
        outfile = "example_data_encoder"
        exit_status = os.system(script_file + " " + infile_path + " " +
                                outfile + " " + configfile_path)
        # Previously captured but never checked — a failing script would
        # have gone unnoticed until the load below crashed confusingly.
        assert exit_status == 0
        with open(outfile, 'rb') as encoder_file:
            encoder = Base64Encoder(PatternEncoder.load(encoder_file))
        with open(infile_path, 'r') as dict_file:
            dict_ = json.load(dict_file)
        pattern_elements = [
            PatternElement(word, level) for level, elements in dict_.items()
            for word in elements.keys()
        ]
        results = [
            encoder.decode(encoder.encode_item(pe)).get_element_list()[0] == pe
            for pe in pattern_elements
        ]
        assert all(results)
def decode_pattern_collection(ctx, infile, encoder, outfile, string, unknown):
    """Decode a JSON-lines pattern collection with a stored encoder.

    Each input line is a ``(pattern, content)`` pair.  The pattern and
    every base pattern in ``content['base_patterns']`` are decoded;
    base patterns containing the *unknown* element are dropped.  Output
    patterns are written either as strings (``string=True``) or as
    base64-encoded pickles.

    :param ctx: click-style context carrying ``ctx.obj['logger']``
    :param infile: path of the JSON-lines collection to decode
    :param encoder: path of the pickled ``PatternEncoder``
    :param outfile: path to write decoded JSON lines to
    :param string: emit human-readable strings instead of pickles
    :param unknown: element marking unknown tokens, or None to keep all
    """
    with open_file(encoder, 'rb') as encoder_file:
        pattern_encoder = Base64Encoder(PatternEncoder.load(encoder_file),
                                        binary=False)
    # Renamed from the original's shadowed 'infile' local for clarity.
    with open_file(infile) as in_file:
        with open_file(outfile, 'w') as out_file:
            for line in in_file:
                pattern, content = json.loads(line)
                decoded_pattern = pattern_encoder.decode(pattern)
                if string:
                    out_pattern = str(decoded_pattern)
                else:
                    out_pattern = base64.b64encode(
                        pickle.dumps(decoded_pattern)).decode('ascii')
                base_patterns = content.get('base_patterns', [])
                decoded_base_patterns = []
                for base_pattern in base_patterns:
                    try:
                        examples = []
                        # Entries may optionally carry examples alongside
                        # the pattern itself.
                        if len(base_pattern) == 2:
                            examples = base_pattern[1]
                            base_pattern = base_pattern[0]
                        decoded_base = pattern_encoder.decode(base_pattern)
                        if unknown is None or all([
                                element != unknown for element in
                                decoded_base.get_element_list()
                        ]):
                            if string:
                                cout_pattern = str(decoded_base)
                            else:
                                cout_pattern = base64.b64encode(
                                    pickle.dumps(decoded_base)).decode(
                                        'ascii')
                            decoded_base_patterns.append(
                                [cout_pattern, examples])
                    # Was a bare 'except:', which also swallowed
                    # KeyboardInterrupt/SystemExit — narrowed to Exception.
                    except Exception:
                        ctx.obj['logger'].warning(
                            "Could not test pattern for unknown, skipping.")
                content['base_patterns'] = decoded_base_patterns
                json.dump((out_pattern, content), out_file)
                out_file.write("\n")
def decode_patterns(ctx, infile, encoder, outfile, processes):
    """Decode patterns from *infile* in parallel and pickle the results.

    Loads the stored encoder, maps ``decode_pattern`` over the input
    lines with *processes* workers, and dumps each
    ``(pattern, decoded_pattern)`` pair to *outfile*.

    :param ctx: click-style context carrying ``ctx.obj['logger']``
    :param infile: path of the file with encoded patterns
    :param encoder: path of the pickled ``PatternEncoder``
    :param outfile: path of the binary output file
    :param processes: number of worker processes
    """
    with open_file(encoder, 'rb') as encoder_file:
        pattern_encoder = Base64Encoder(PatternEncoder.load(encoder_file),
                                        binary=False)
    decode = functools.partial(decode_pattern,
                               pattern_encoder=pattern_encoder)
    with open_file(infile) as in_file, open_file(outfile, 'wb') as out_file:
        with MultiprocessMap(processes, chunksize=1000) as mapper:
            for pattern, decoded_pattern in mapper(decode, in_file):
                ctx.obj['logger'].info("Pattern")
                pickle.dump((pattern, decoded_pattern), out_file)
'fox': 0, 'The': 2, 'quick': 1, 'brown': 3 }}, SNGram), BitEncoder({'form': set(['fox', 'The', 'quick', 'brown'])}, SNGram), HuffmanEncoder({'form': { 'fox': 5, 'The': 10, 'quick': 3, 'brown': 8 }}, SNGram), Base64Encoder( HuffmanEncoder({'form': { 'fox': 5, 'The': 10, 'quick': 3, 'brown': 8 }}, SNGram)), Base64Encoder(HuffmanEncoder( {'form': { 'fox': 5, 'The': 10, 'quick': 3, 'brown': 8 }}, SNGram), binary=False) ] @pytest.mark.parametrize("encoder", encoder)
def test_extract_patterns_with_phrases(parameters, expected_patterns,
                                       expected_basepatterns):
    """Run the ``extract-patterns`` CLI on the encoded example corpus and
    compare the decoded patterns and base patterns against the expected
    fixtures.

    All file handles are now opened via context managers — the original
    leaked an open file per iteration in several loops.
    """
    infile_path = os.path.abspath('example_data/example_data_encoded.conllu')
    dictfile_path = os.path.abspath(
        'example_data/example_data_dict_filtered_encoded.json')
    encoder_path = os.path.abspath('example_data/example_data_encoder')
    runner = CliRunner()
    with runner.isolated_filesystem():
        patterns_list_filename = 'patterns_sorted.json'
        base_list_filename = 'base_sorted.json'
        patterns_filename = 'patterns.json'
        base_filename = 'base.json'
        runner.invoke(main, [
            'extract-patterns', infile_path, patterns_list_filename,
            base_list_filename, dictfile_path
        ] + parameters)
        # files need to be sorted
        for filename in [patterns_list_filename, base_list_filename]:
            with open(filename, 'r') as pattern_file:
                lines = [line.rstrip() for line in pattern_file]
            ## patterns list needs sorted with unique
            if filename == patterns_list_filename:
                lines = set(lines)
            patterns = sorted(lines)
            with open(filename, 'w') as pattern_file:
                pattern_file.write('\n'.join(patterns) + '\n')
        with open(patterns_list_filename, 'r') as pattern_file:
            patterns = [line.rstrip() for line in pattern_file]
        print('\n'.join(patterns))
        runner.invoke(main, [
            'utils', 'convert-pattern-list', base_list_filename, base_filename
        ])
        runner.invoke(main, [
            'utils',
            'convert-pattern-list',
            patterns_list_filename,
            patterns_filename,
        ])
        patterns = {}
        with open(patterns_filename) as converted_file:
            for line in converted_file:
                pattern, base_patterns = json.loads(line)
                patterns[pattern] = base_patterns
        basepatterns = []
        with open(base_filename) as base_file:
            for line in base_file:
                basepatterns.append(json.loads(line))
        with open(encoder_path, 'rb') as encoder_file:
            encoder = Base64Encoder(PatternEncoder.load(encoder_file))
        patterns = {
            str(encoder.decode(pattern)):
            {str(encoder.decode(base)) for base in bases}
            for pattern, bases in patterns.items()
        }
        basepatterns = {
            str(encoder.decode(pattern)): content
            for pattern, content in basepatterns
        }
        assert patterns == expected_patterns
        assert basepatterns == expected_basepatterns