Exemplo n.º 1
0
def test_unicode_compile():
    """Compile three entries with non-ASCII keys and read each one back.

    The values are added with spaces around the colon; the assertions
    expect ``GetValueAsString`` to return them without that whitespace.
    """
    compiler = JsonDictionaryCompiler({"memory_limit_mb": "10"})
    entries = (
        ("üöä", '{"y" : 2}'),
        ("üüüüüüabd", '{"a" : 3}'),
        (u"ääääädäd", '{"b" : 33}'),
    )
    for key, raw_value in entries:
        compiler.Add(key, raw_value)

    with tmp_dictionary(compiler, 'simple_json.kv') as dictionary:
        assert len(dictionary) == 3
        # The same key is looked up with and without the ``u`` prefix.
        assert dictionary["üöä"].GetValueAsString() == '{"y":2}'
        assert dictionary[u"üöä"].GetValueAsString() == '{"y":2}'
        assert dictionary["üüüüüüabd"].GetValueAsString() == '{"a":3}'
        assert dictionary["ääääädäd"].GetValueAsString() == '{"b":33}'
Exemplo n.º 2
0
def test_unicode():
    """Check membership and both lookup styles for a non-ASCII key."""
    compiler = JsonDictionaryCompiler({"memory_limit_mb": "10"})
    compiler.Add("öäü", '{"a" : 2}')
    compiler.Add("abd", '{"a" : 3}')
    # Add the "abd" entry a second time, via item assignment (__setitem__).
    compiler["abd"] = '{"a" : 3}'

    lookup_key = "öäü"
    expected_value = {"a": 2}
    with tmp_dictionary(compiler, 'unicode_json.kv') as dictionary:
        assert lookup_key in dictionary
        # Subscript access and .get() must both yield the decoded value.
        assert dictionary[lookup_key].GetValue() == expected_value
        assert dictionary.get(lookup_key).GetValue() == expected_value
Exemplo n.º 3
0
def test_leak():
    """Repeat a failing ``get`` many times and watch reported memory usage.

    Every 100th iteration the garbage collector is run and the value
    returned by ``memory_usage_ps()`` must stay below the starting value
    plus 15000.
    """
    compiler = JsonDictionaryCompiler({"memory_limit_mb": "10"})
    compiler.Add("something", '["a" : 2]')

    with tmp_dictionary(compiler, 'near_simple.kv') as dictionary:
        gc.collect()
        baseline = memory_usage_ps()
        allowed_growth = 15000
        for iteration in range(500000):
            # 'something_else' was never added, so the lookup must be falsy.
            assert not dictionary.get('something_else')
            if iteration % 100:
                continue
            gc.collect()
            current_usage = memory_usage_ps()
            assert current_usage < baseline + allowed_growth
Exemplo n.º 4
0
def test_simple_snappy():
    """Compile with snappy compression and verify values and statistics."""
    options = {
        "memory_limit_mb": "10",
        'compression': 'snappy',
        'compression_threshold': '0',
    }
    compiler = JsonDictionaryCompiler(options)
    compiler.Add("abc", '{"a" : 2}')
    compiler.Add("abd", '{"a" : 3}')

    expected_values = (("abc", '{"a":2}'), ("abd", '{"a":3}'))
    with tmp_dictionary(compiler, 'simple_json_snappy.kv') as dictionary:
        assert len(dictionary) == 2
        for key, expected in expected_values:
            assert dictionary[key].GetValueAsString() == expected
        # The value store section of the statistics names the compression.
        value_store_stats = dictionary.GetStatistics()['Value Store']
        assert value_store_stats['__compression'] == "snappy"
Exemplo n.º 5
0
def test_unicode_lookup():
    """Find all three city names, one with an umlaut, inside a sentence."""
    compiler = JsonDictionaryCompiler({"memory_limit_mb": "10"})
    city_entries = (
        ("Los Angeles", '{"country" : "USA"}'),
        ("Frankfurt am Main", '{"country" : "Germany"}'),
        ("Kirchheim bei München", '{"country" : "Germany"}'),
    )
    for city, payload in city_entries:
        compiler.Add(city, payload)

    # Sentence that mentions every key exactly once.
    text = "From Los Angeles via Frankfurt am Main to Kirchheim bei München it should just work"
    with tmp_dictionary(compiler, 'unicode_json_lookup.kv') as dictionary:
        assert "Kirchheim bei München" in dictionary
        matched_strings = []
        for match in dictionary.LookupText(text):
            matched_strings.append(match.GetMatchedString())
        assert len(matched_strings) == 3
        for expected in (u"Kirchheim bei München", u"Los Angeles",
                         u"Frankfurt am Main"):
            assert expected in matched_strings
Exemplo n.º 6
0
def test_tmp_dir():
    """Check that the current working directory stays empty while compiling.

    The test switches into a fresh, empty directory and asserts that it
    is still empty after adding a key, after compiling, and after the
    compiler object has been deleted.
    """
    original_cwd = os.getcwd()
    scratch_dir = os.path.join(tempfile.gettempdir(), "tmp_dir_test")
    os.chdir(tempfile.gettempdir())
    try:
        os.mkdir("tmp_dir_test")
        os.chdir(scratch_dir)
        compiler = JsonDictionaryCompiler({"memory_limit_mb": "10"})
        compiler.Add("abc", "{'a':2}")
        assert not os.listdir('.')
        compiler.Compile()
        assert not os.listdir('.')
        del compiler
        assert not os.listdir('.')
    finally:
        # Leave the scratch directory before removing it.
        os.chdir(original_cwd)
        os.rmdir(scratch_dir)
Exemplo n.º 7
0
def test_float_compaction():
    """Compare value store sizes with and without single-precision floats.

    The same float array is stored once with
    ``floating_point_precision`` set to ``single`` and once with default
    settings; the single-precision value store must report a smaller size.
    """
    float_array = '[1.7008715758978892, 1.8094465532317732, 1.6098250864350536, 1.6369107966501981, 1.7736887965234107, 1.606682751740542, 1.6186427703265525, 1.7939763843449683, 1.5973550162469434, 1.6799721708726192, 1.8199786239525833, 1.7956178070065245, 1.7269879953863045]'

    single_compiler = JsonDictionaryCompiler({
        "memory_limit_mb": "10",
        'floating_point_precision': 'single'
    })
    default_compiler = JsonDictionaryCompiler({"memory_limit_mb": "10"})

    # Identical payload for both compilers.
    for compiler in (single_compiler, default_compiler):
        compiler.Add('aa', float_array)

    with tmp_dictionary(single_compiler, 'json_single_precision_float.kv') as single_dict, \
            tmp_dictionary(default_compiler, 'json_double_precision_float.kv') as default_dict:
        # Sanity check: one entry each.
        assert len(single_dict) == 1
        assert len(default_dict) == 1

        single_size = int(single_dict.GetStatistics()['Value Store']['size'])
        default_size = int(default_dict.GetStatistics()['Value Store']['size'])
        assert single_size < default_size
Exemplo n.º 8
0
def test_truncated_file_json():
    """Loading a truncated dictionary file must raise ``ValueError``.

    A small dictionary is compiled and written to disk, then two damaged
    copies are produced: one holding only the first half of the file and
    one missing just the last two bytes. Opening either copy with
    ``Dictionary`` is expected to raise ``ValueError``.
    """
    c = JsonDictionaryCompiler({"memory_limit_mb": "10"})
    c.Add('a', '{1:2}')
    c.Add('b', '{2:4}')
    c.Add('c', '{4:4}')
    c.Add('d', '{2:3}')
    c.Compile()

    full_path = os.path.join(tmp_dir, 'truncation_test.kv')
    half_path = os.path.join(tmp_dir, 'truncation_test1.kv')
    short_path = os.path.join(tmp_dir, 'truncation_test2.kv')

    try:
        c.WriteToFile(full_path)

        # Read the file once and slice it. Two consecutive read() calls on
        # one handle would make the second copy start in the middle of the
        # file instead of being a prefix of it.
        with open(full_path, 'rb') as fd_in:
            data = fd_in.read()
        size = len(data)

        with open(half_path, 'wb') as fd:
            fd.write(data[:size // 2])
        with open(short_path, 'wb') as fd:
            fd.write(data[:size - 2])

        for truncated_path in (half_path, short_path):
            with pytest.raises(ValueError):
                Dictionary(truncated_path)
    finally:
        # Clean up even when an assertion above fails.
        for path in (short_path, half_path, full_path):
            if os.path.exists(path):
                os.remove(path)
Exemplo n.º 9
0
def test_input_output_keys():
    """Every key fed to the compiler must show up in the compiled dictionary.

    Each line of the test data file holds a JSON-encoded key and a
    JSON-encoded value separated by a tab. The number of lines added is
    compared with the number of items the compiled dictionary yields.
    """
    compiler = JsonDictionaryCompiler({
        'compression_threshold': '32',
        'compression': 'zlib',
        "memory_limit_mb": "10"
    })
    data_path = os.path.join(
        root, 'var_length_short_calculation_test_data.tsv')

    input_keys_count = 0
    with open(data_path) as f_in:
        for input_keys_count, line in enumerate(f_in, 1):
            raw_key, raw_value = line.split('\t')
            compiler.Add(json.loads(raw_key), json.loads(raw_value))

    with tmp_dictionary(compiler, 'var_length_short_test.kv') as dictionary:
        output_keys_count = sum(1 for _ in dictionary.GetAllItems())

    assert input_keys_count == output_keys_count
Exemplo n.º 10
0
def compile_file(input, output, jobs, shards):
    """Distribute tab-separated key/value lines over shards and queue them.

    Each input line is split on tabs; the first field is the key and the
    second the value. The key is mapped to a shard with
    ``JumpConsistentHashString`` and added to that shard's
    ``JsonDictionaryCompiler``. Afterwards ``jobs`` daemon threads running
    ``compile_worker`` are started and one ``(compiler, output_path)``
    tuple per shard is put on ``compile_queue``; the call blocks until the
    queue has been fully processed.

    Args:
        input: Path of a single input file, or of a directory whose
            entries are all used as input files. Files ending in ``.gz``
            are read through ``gzip``.
        output: Output name. Used as is when ``shards == 1``, otherwise
            suffixed with ``-<shard index>``.
        jobs: Number of worker threads to start.
        shards: Number of compilers to spread the keys over.
    """
    skipped_keys = 0

    compilers = {i: JsonDictionaryCompiler() for i in range(shards)}

    if os.path.isdir(input):
        input_files = [os.path.join(input, name) for name in os.listdir(input)]
    else:
        input_files = [input]

    for input_file in input_files:
        # Open both kinds of file in text mode so that every line is a str.
        # gzip.open defaults to binary, which makes split("\t") fail.
        if input_file.endswith(".gz"):
            input_fd = gzip.open(input_file, "rt")
        else:
            input_fd = open(input_file)

        with input_fd:
            for line in input_fd:
                try:
                    parts = line.split("\t")
                    key = parts[0]

                    if key != remove_control_chars(key):
                        print("skip key: " + ":".join("{:02x}".format(ord(c))
                                                      for c in key) +
                              " due to containing control characters")
                        skipped_keys += 1
                        # Actually skip the key instead of adding it anyway.
                        continue

                    value = parts[1]

                    shard = JumpConsistentHashString(key, shards)
                    compilers[shard].Add(key, value)
                except Exception:
                    # Best effort: report the offending line and carry on,
                    # but let KeyboardInterrupt/SystemExit propagate.
                    print("failed to add: " + line)
        print("Skipped keys " + str(skipped_keys))

    for _ in range(jobs):
        t = threading.Thread(target=compile_worker)
        t.daemon = True
        t.start()

    if shards == 1:
        # The only compiler has index 0; do not rely on a leftover loop
        # variable, which would be jobs - 1 here.
        compile_queue.put((compilers[0], output))
    else:
        for i in range(shards):
            compile_queue.put((compilers[i], output + "-" + str(i)))

    compile_queue.join()
Exemplo n.º 11
0
def test_tmp_dir_defined():
    """Compiling with ``temporary_path`` set must leave entries in that path."""
    scratch_dir = os.path.join(tempfile.gettempdir(), "tmp_dir_test_defined")

    def compile_into(directory):
        # Runs in its own scope so the compiler is a local of this helper.
        options = {
            "memory_limit_mb": "10",
            "temporary_path": directory
        }
        compiler = JsonDictionaryCompiler(options)
        compiler.Add("abc", "{'a':2}")
        compiler.Compile()
        assert os.listdir(directory)

    try:
        os.mkdir(scratch_dir)
        compile_into(scratch_dir)
    finally:
        gc.collect()
        # NOTE(review): a throwaway compiler is created here before cleanup;
        # the reason is not visible in this block -- confirm with the author.
        JsonDictionaryCompiler({"memory_limit_mb": "10"})
        shutil.rmtree(scratch_dir)
Exemplo n.º 12
0
def test_compiler_empty_json():
    """A compiler that received no entries must produce an empty dictionary."""
    empty_compiler = JsonDictionaryCompiler({"memory_limit_mb": "10"})
    with test_tools.tmp_dictionary(empty_compiler,
                                   'empty_json.kv') as dictionary:
        entry_count = len(dictionary)
        assert entry_count == 0
Exemplo n.º 13
0
def compile(args):
    """Compile a tab-separated input file into a dictionary file.

    Args:
        args: Object providing the attributes
            ``compiler_params`` (iterable of ``(key, value)`` pairs passed
            to the compiler as a dict), ``dict_type`` (one of ``json``,
            ``string``, ``int``, ``completion``, ``key-only``),
            ``input_file`` and ``output_file``.

    Returns:
        ``None`` after writing ``args.output_file``, or the string
        ``'Must never reach here'`` when ``dict_type`` is not recognised
        (nothing is read or written in that case).
    """
    params = {key: value for key, value in args.compiler_params}

    dict_type = args.dict_type
    if dict_type == 'json':
        dictionary = JsonDictionaryCompiler(params)
    elif dict_type == 'string':
        dictionary = StringDictionaryCompiler(params)
    elif dict_type == 'int':
        dictionary = IntDictionaryCompiler(params)
    elif dict_type == 'completion':
        dictionary = CompletionDictionaryCompiler(params)
    elif dict_type == 'key-only':
        dictionary = KeyOnlyDictionaryCompiler(params)
    else:
        return 'Must never reach here'

    # 'int' and 'completion' dictionaries take an integer as second field.
    integer_valued = dict_type in ('int', 'completion')

    with open(args.input_file) as file_in:
        for line in file_in:
            line = line.rstrip('\n')
            try:
                splits = line.split('\t')
                if dict_type == 'key-only':
                    dictionary.Add(splits[0])
                elif integer_valued:
                    dictionary.Add(splits[0], int(splits[1]))
                else:
                    dictionary.Add(splits[0], splits[1])
            except Exception:
                # Best effort: report malformed lines and continue, without
                # swallowing KeyboardInterrupt or SystemExit.
                print('Can not parse line: {}'.format(line))

    dictionary.Compile()
    dictionary.WriteToFile(args.output_file)
Exemplo n.º 14
0
def test_manifest_for_merger():
    """The manifest set on a merger must end up in the merged dictionary.

    Two dictionaries with their own manifests are written to disk, a
    merger with a third manifest writes ``manifest_json_merged.kv``, and
    the manifest read back from that file must be the merger's.
    """
    generated_files = (
        'manifest_json_merge1.kv',
        'manifest_json_merge2.kv',
        'manifest_json_merged.kv',
    )
    try:
        c = JsonDictionaryCompiler({"memory_limit_mb": "10"})
        c.Add("abc", '{"a" : 2}')
        c.Compile()
        c.SetManifest('{"author": "Zapp Brannigan"}')
        c.WriteToFile('manifest_json_merge1.kv')
        del c

        c2 = JsonDictionaryCompiler({"memory_limit_mb": "10"})
        c2.Add("abd", '{"a" : 3}')
        c2.Compile()
        c2.SetManifest('{"author": "Leela"}')
        c2.WriteToFile('manifest_json_merge2.kv')
        del c2

        # NOTE(review): neither of the two files written above is handed to
        # the merger in this block -- confirm whether they were meant to be
        # added as merge inputs.
        merger = JsonDictionaryMerger({"memory_limit_mb": "10"})
        merger.SetManifest('{"author": "Fry"}')
        merger.Merge('manifest_json_merged.kv')

        d = Dictionary('manifest_json_merged.kv')
        m = json.loads(d.GetManifest())
        assert m['author'] == "Fry"
        del d

    finally:
        # Remove only what was actually created, so a failure above is not
        # masked by an error from deleting a file that never existed.
        for path in generated_files:
            if os.path.exists(path):
                os.remove(path)