def test_getpy_dump_load():
    key_type = np.dtype('u8')
    value_type = np.dtype('u8')

    keys = np.random.randint(1, 1000, size=10**1, dtype=key_type)
    values = np.random.randint(1, 1000, size=10**1, dtype=value_type)

    gp_dict_1 = gp.Dict(key_type, value_type)
    gp_dict_1[keys] = values
    gp_dict_1.dump('test.bin')

    gp_dict_2 = gp.Dict(key_type, value_type)
    gp_dict_2.load('test.bin')

    assert len(gp_dict_1) == len(gp_dict_2)

def test_getpy_vectorized_methods_with_bytearray_dtype():
    key_type = np.dtype('u8')
    value_type = gp.types['bytearray50']

    gp_dict = gp.Dict(key_type, value_type)

    keys = np.random.randint(1, 1000, size=200, dtype=key_type)
    # 400 bits packed into 50 bytes per value (np.bool is removed in modern
    # numpy; use the builtin bool)
    values = np.packbits(
        [np.array([1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1] * 25, dtype=bool)] * 200,
        axis=1,
    ).view(value_type)

    gp_dict[keys] = values

    iterated_keys = [key for key in gp_dict]
    iterated_keys_and_values = [(key, value) for key, value in gp_dict.items()]

    select_keys = np.random.choice(keys, size=100)
    select_values = gp_dict[select_keys]

    random_keys = np.random.randint(1, 1000, size=500, dtype=key_type)
    random_keys_mask = gp_dict.contains(random_keys)

    mask_keys = random_keys[random_keys_mask]
    mask_values = gp_dict[mask_keys]

    gp_dict.iadd(keys, values)
    gp_dict.isub(keys, values)
    gp_dict.ior(keys, values)
    gp_dict.iand(keys, values)

def test_getpy_vectorized_methods_with_default():
    key_type = np.dtype('u8')
    value_type = np.dtype('u8')

    gp_dict = gp.Dict(key_type, value_type, default_value=0)

    keys = np.random.randint(1, 1000, size=200, dtype=key_type)
    values = np.random.randint(1, 1000, size=200, dtype=value_type)

    gp_dict[keys] = values

    iterated_keys = [key for key in gp_dict]
    iterated_keys_and_values = [(key, value) for key, value in gp_dict.items()]

    select_keys = np.random.choice(keys, size=100)
    select_values = gp_dict[select_keys]

    random_keys = np.random.randint(1, 1000, size=500, dtype=key_type)
    random_keys_mask = gp_dict.contains(random_keys)
    random_values_with_defaults = gp_dict[random_keys]

    for random_key_mask, random_value in zip(random_keys_mask, random_values_with_defaults):
        if not random_key_mask:
            assert random_value == 0
        else:
            assert random_value != 0

    one_values = np.ones(500, dtype=value_type)
    gp_dict.iadd(random_keys, one_values)
    gp_dict.isub(random_keys, one_values)
    gp_dict.ior(random_keys, one_values)
    gp_dict.iand(random_keys, one_values)

def test_gp_dict():
    import getpy as gp  # type: ignore

    h = gp.Dict(HASH_TYPE, np.uint8)
    h[np.arange(10, dtype=HASH_TYPE)] = np.zeros(10, dtype=np.uint8)
    h[np.arange(5, dtype=HASH_TYPE)] = np.ones(5, dtype=np.uint8)
    expected = {i: i < 5 for i in range(10)}
    assert expected == as_dict(h)

def __init__(self, data_format):
    super().__init__(data_format)
    self.nominal_speed = 0.5
    self.social_score_proximity_threshold_list = [0.2, 0.3, 0.5]

    key_type = np.dtype('i8')
    value_type = np.dtype('f8')
    self.social_score_dict = {}
    self.update_iteration_count_dict = {}
    for thresh in self.social_score_proximity_threshold_list:
        self.social_score_dict[thresh] = gp.Dict(
            key_type, value_type, default_value=np.asarray(0.0).astype('f8')
        )
        self.update_iteration_count_dict[thresh] = gp.Dict(
            key_type, value_type, default_value=np.asarray(0.0).astype('f8')
        )

    self.time_step_done = False
    self.min_social_score_agent = 0
    self.min_social_score = [float('Inf') for _ in range(len(self.social_score_proximity_threshold_list))]
    self.social_score_collision_threshold = 0.0
    self.agent_radius = 0.2  # needs to be the same as in collision

def test_getpy_types():
    for key_type, value_type in gp.dict_types:
        gp_dict = gp.Dict(key_type, value_type)

        keys = np.array(range(256), dtype=key_type)
        values = np.array(range(256), dtype=value_type)

        gp_dict[keys] = values
        values = gp_dict[keys]

def test_getpy_big_dict_u8_u8():
    key_type = np.dtype('u8')
    value_type = np.dtype('u8')

    gp_dict = gp.Dict(key_type, value_type)
    values = np.random.randint(10**15, size=10**4, dtype=value_type)

    for i in range(10**2):
        keys = np.random.randint(10**15, size=10**4, dtype=key_type)
        gp_dict[keys] = values

def test_getpy_very_big_dict_u4_u4():
    key_type = np.dtype('u4')
    value_type = np.dtype('u4')

    gp_dict = gp.Dict(key_type, value_type)
    values = np.random.randint(10**9, size=10**5, dtype=value_type)

    for i in range(10**2):
        keys = np.random.randint(10**9, size=10**5, dtype=key_type)
        gp_dict[keys] = values

def test_getpy_very_big_dict_u8_S16():
    key_type = np.dtype('u8')
    value_type = np.dtype('S16')

    gp_dict = gp.Dict(key_type, value_type)
    values = np.array([np.random.bytes(16) for i in range(10**5)], dtype=value_type)

    for i in range(10**2):
        keys = np.random.randint(10**15, size=10**5, dtype=key_type)
        gp_dict[keys] = values

def load_gp(self, filename):
    """Override gp.Dict.load, to correctly merge values instead of overwriting."""
    other = gp.Dict(HASH_TYPE, np.uint8, default_value=False)
    other.load(str(filename))
    n = len(other)
    keys = np.fromiter((k for (k, v) in other.items()), dtype=HASH_TYPE, count=n)
    values = np.fromiter((v for (k, v) in other.items()), dtype=np.uint8, count=n)
    self.merge(keys, values)
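
# For reference, `self.merge` above is defined elsewhere in the class. A
# minimal sketch of what such a merge could look like, assuming the values
# are uint8 flags that should be combined rather than overwritten, using
# only gp.Dict operations exercised by the tests in this section
# (`merge_into` is a hypothetical helper, not part of getpy):
def merge_into(gp_dict, keys, values):
    existing = gp_dict.contains(keys)
    gp_dict[keys[~existing]] = values[~existing]   # new keys: plain insert
    gp_dict.ior(keys[existing], values[existing])  # known keys: OR the flags in
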
def __init__(self, data_format):
    super().__init__(data_format)
    self.collision_radius = 0.0

    key_type = np.dtype('i8')
    value_type = np.dtype('i8')
    self.collisions_per_agent = gp.Dict(key_type, value_type, default_value=0)

    self.agent_radius = 0.2  # 0.2 normal, 0.1 for UNIV crowded
    self.total_collisions = 0
    self.agents = dict()
    self.num_agents_buffer = []
    self.time_step_done = False
    self.time_step = 0

def test_getpy_big_dict_uint64_lookup():
    key_type = np.dtype('u8')
    value_type = np.dtype('u8')

    gp_dict = gp.Dict(key_type, value_type)

    keys = np.random.randint(10**15, size=10**5, dtype=key_type)
    values = np.random.randint(10**15, size=10**5, dtype=value_type)
    gp_dict[keys] = values

    for i in range(10**2):
        values = gp_dict[keys]

def test_getpy_methods_with_multidim_and_strings():
    key_type = np.dtype('S8')
    value_type = np.dtype('S8')

    keys = np.array([np.random.bytes(4) for i in range(10**2)], dtype=key_type).reshape(10, 10)
    values = np.array([np.random.bytes(4) for i in range(10**2)], dtype=value_type).reshape(10, 10)

    gp_dict = gp.Dict(key_type, value_type)
    gp_dict[keys] = values

    p_dict = {key: value for key, value in zip(keys.flat, values.flat)}

    assert len(gp_dict) == len(np.unique(keys.flat))
    assert all([gp_dict[key] == p_dict[key] for key in keys.flat])

def test_getpy_methods_with_strings():
    key_type = np.dtype('S8')
    value_type = np.dtype('S8')

    keys = np.array([np.random.bytes(8) for i in range(10**2)], dtype=key_type)
    values = np.array([np.random.bytes(8) for i in range(10**2)], dtype=value_type)

    gp_dict = gp.Dict(key_type, value_type)
    gp_dict[keys] = values

    p_dict = {key: value for key, value in zip(keys, values)}

    assert len(gp_dict) == len(np.unique(keys))
    assert all([gp_dict[key] == p_dict[key] for key in keys])

def test_getpy_methods_with_multidim():
    key_type = np.dtype('u8')
    value_type = np.dtype('u8')

    keys = np.random.randint(1, 1000, size=10**2, dtype=key_type).reshape(10, 10)
    values = np.random.randint(1, 1000, size=10**2, dtype=value_type).reshape(10, 10)

    gp_dict = gp.Dict(key_type, value_type)
    gp_dict[keys] = values

    p_dict = {key: value for key, value in zip(keys.flat, values.flat)}

    assert len(gp_dict) == len(np.unique(keys))
    assert all([gp_dict[key] == p_dict[key] for key in keys.flat])

def test_getpy_methods():
    key_type = np.dtype('u8')
    value_type = np.dtype('u8')

    keys = np.random.randint(1, 1000, size=10**2, dtype=key_type)
    values = np.random.randint(1, 1000, size=10**2, dtype=value_type)

    gp_dict = gp.Dict(key_type, value_type)
    gp_dict[keys] = values

    p_dict = {key: value for key, value in zip(keys, values)}

    assert len(gp_dict) == len(np.unique(keys))
    assert all([gp_dict[key] == p_dict[key] for key in keys])

def build_gp_dict(data_set, key_type, value_type):
    """Convert a dataset with int64 keys and int32/int32 value pairs into an
    int64/int64 gp.Dict by packing each int32 pair into a single int64."""
    gp_dict = gp.Dict(key_type, value_type)
    gp_keys = np.array(data_set[:, 0]).astype(np.int64)
    gp_vals = np.array(data_set[:, 1:]).astype(np.int64)
    # squash the (N, 2) int32 pairs into N int64 values
    gp_vals = gp_vals.astype(np.int32).reshape(-1)
    gp_vals = gp_vals.view(np.int64)
    gp_dict[gp_keys] = gp_vals
    return gp_dict
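
# Reading the packed values back requires the inverse view. This hypothetical
# helper shows the same `view(np.int32).reshape(-1, 2)` pattern used in the
# merge pipeline (`main`) below:
def unpack_gp_vals(gp_dict):
    # recover the (N, 2) int32 pairs from the packed int64 values
    keys = gp_dict.keys()
    vals = gp_dict[keys].view(np.int32).reshape(-1, 2)
    return keys, vals
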
def test_getpy_very_big_dict_uint64_bytearray32():
    key_type = np.dtype('u8')
    value_type = gp.types['bytearray32']

    gp_dict = gp.Dict(key_type, value_type)
    # 256 bits packed into 32 bytes per value (np.bool is removed in modern
    # numpy; use the builtin bool)
    values = np.packbits(
        [np.array([1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1] * 16, dtype=bool)] * 10**5,
        axis=1,
    ).view(value_type)

    for i in range(10**2):
        keys = np.random.randint(10**15, size=10**5, dtype=key_type)
        gp_dict[keys] = values

def test_getpy_methods_with_default():
    key_type = np.dtype('u8')
    value_type = np.dtype('u8')

    keys = np.random.randint(1, 1000, size=10**2, dtype=key_type)
    values = np.random.randint(1, 1000, size=10**2, dtype=value_type)
    default_value = 4242

    gp_dict = gp.Dict(key_type, value_type, default_value=default_value)
    gp_dict[keys] = values

    random_keys = np.random.randint(1, 1000, size=500, dtype=key_type)
    random_values = gp_dict[random_keys]

    assert np.all(random_values[np.where(gp_dict.contains(random_keys))] != default_value)
    assert np.all(random_values[np.where(np.logical_not(gp_dict.contains(random_keys)))] == default_value)

def test_getpy_methods_with_default_and_strings():
    key_type = np.dtype('S8')
    value_type = np.dtype('S8')

    keys = np.array([np.random.bytes(8) for i in range(10**2)], dtype=key_type)
    values = np.array([np.random.bytes(8) for i in range(10**2)], dtype=value_type)
    default_value = np.random.bytes(8)

    gp_dict = gp.Dict(key_type, value_type, default_value=default_value)
    gp_dict[keys] = values

    random_keys = np.array([np.random.bytes(8) for i in range(10**3)], dtype=key_type)
    random_values = gp_dict[random_keys]

    assert np.all(random_values[np.where(gp_dict.contains(random_keys))] != default_value)
    assert np.all(random_values[np.where(np.logical_not(gp_dict.contains(random_keys)))] == default_value)

def preload(gp_dict_file, loop_archive, loop_struct_silent, rosetta_flags_file):
    """Utility to preload all the data and initialize from user inputs."""
    key_type = np.int64
    value_type = np.int64
    gp_dict = gp.Dict(key_type, value_type)
    gp_dict.load(gp_dict_file)

    with open(loop_archive, "r") as f:
        loop_list = f.read().splitlines()

    run_pyrosetta_with_flags(rosetta_flags_file)
    sfd, silent_index, silent_out = silent_preload(loop_struct_silent)

    return gp_dict, loop_list, sfd, silent_index, silent_out

def test_getpy_types():
    for key_type, value_type in gp.dict_types:
        gp_dict = gp.Dict(key_type, value_type)

        if key_type.kind == 'U':
            keys = np.array(['0123456789' * 10 for i in range(10)], dtype=key_type)
        else:
            keys = np.array(range(10), dtype=key_type)

        if value_type.kind == 'U':
            values = np.array(['0123456789' * 10 for i in range(10)], dtype=value_type)
        else:
            values = np.array(range(10), dtype=value_type)

        gp_dict[keys] = values
        values = gp_dict[keys]

def retrieve_gp_dict_from_cache(ori, cart, key_type, value_type):
    """Checks cache for gp_dict, returns None if not found."""
    dest_path_gp_cache_dir = os.path.join(os.path.dirname(__file__), "cache/gp_dicts/")
    os.makedirs(dest_path_gp_cache_dir, exist_ok=True)
    key = f"{(ori, cart)}"
    try:
        with open(dest_path_gp_cache_dir + "/hashmaps.json", "r") as f:
            index_dict = json.load(f)
        name = index_dict.get(key)
        if name is None:
            return
    except FileNotFoundError:
        return
    except json.decoder.JSONDecodeError:
        return
    gp_dict = gp.Dict(key_type, value_type)
    gp_dict.load(dest_path_gp_cache_dir + "/" + name)
    return gp_dict

def test_getpy_vectorized_methods():
    key_type = np.dtype('u8')
    value_type = np.dtype('u8')

    gp_dict = gp.Dict(key_type, value_type)

    keys = np.random.randint(1, 1000, size=200, dtype=key_type)
    values = np.random.randint(1, 1000, size=200, dtype=value_type)

    gp_dict[keys] = values

    iterated_keys = [key for key in gp_dict]
    iterated_keys_and_values = [(key, value) for key, value in gp_dict.items()]

    assert len(gp_dict) == len(np.unique(keys))

    p_dict = dict()
    for key, value in zip(keys, values):
        p_dict[key] = value

    assert len(gp_dict) == len(p_dict)
    assert sorted([(key, value) for key, value in gp_dict.items()]) == sorted(p_dict.items())

    select_keys = np.random.choice(keys, size=100).astype(key_type)
    select_values = gp_dict[select_keys]

    random_keys = np.random.randint(1, 1000, size=500).astype(key_type)
    random_keys_mask = gp_dict.contains(random_keys)

    mask_keys = random_keys[random_keys_mask]
    mask_values = gp_dict[mask_keys]

    gp_dict.iadd(keys, values)
    gp_dict.isub(keys, values)
    gp_dict.ior(keys, values)
    gp_dict.iand(keys, values)

def main(dict_list, frag_list, silent_list):
    """Merge a list of getpy dicts and fragment archives into a single
    hashmap, string archive, and combined silentfile."""
    keys_unique = {}
    with open(dict_list, "r") as f:
        dict_paths = f.read().splitlines()
    with open(frag_list, "r") as f:
        frag_paths = f.read().splitlines()

    data_paths_iter = zip(dict_paths, frag_paths)
    for dict_path, frag_path in data_paths_iter:
        key_type = np.dtype("i8")
        value_type = np.dtype("i8")
        dict_temp = gp.Dict(key_type, value_type)
        dict_temp.load(dict_path)

        with open(frag_path, "r") as f:
            frags = f.read().splitlines()

        keys = dict_temp.keys()
        # unpack the (offset, count) int32 pairs from the packed int64 values
        vals = dict_temp[keys].view(np.int32).reshape(-1, 2)
        keys_loops_iter = (
            (key, frags[val[0] : val[0] + val[1]]) for key, val in zip(keys, vals)
        )
        update_unique_key_dict(keys_unique, keys_loops_iter)

    logging.debug("starting hashmap population")
    offset = 0
    strings_master = []
    gp_vals_list = []
    gp_keys_list = []
    for key, strings in keys_unique.items():
        logging.debug(f"key string pair: {key}, {strings}")
        num_strings = len(strings)
        logging.debug(f"num_strings: {num_strings}")
        strings_master.extend(strings)
        gp_keys_list.append(key)
        gp_vals_list.append([offset, num_strings])
        offset += num_strings
        logging.debug(f"new offset: {offset}")

    gp_keys = np.array(gp_keys_list)
    gp_vals = np.array(gp_vals_list)
    logging.debug(f"gp_vals: {gp_vals}")
    # squash the (N, 2) pairs into N int64s to fit into the getpy dict
    gp_vals_i32_flat = gp_vals.astype(np.int32).reshape(-1)
    logging.debug(f"gp_vals_i32_flat: {gp_vals_i32_flat}")
    gp_vals_i64 = gp_vals_i32_flat.view(np.int64)

    key_type = np.int64
    value_type = np.int64
    gp_dict = gp.Dict(key_type, value_type)
    gp_dict[gp_keys] = gp_vals_i64
    gp_dump = "getpy_dict.bin"
    gp_dict.dump(gp_dump)

    # int64 dtype so the keys survive the round trip without float truncation
    key_val_data = np.empty((gp_keys.shape[0], 3), dtype=np.int64)
    key_val_data[:, 0] = gp_keys
    key_val_data[:, 1:] = gp_vals

    string_master_file = "loop_tag_index.txt"
    with open(string_master_file, mode="wt", encoding="utf-8") as f:
        f.write("\n".join(strings_master))
        f.write("\n")

    npz_out = "key_val_data.npz"
    np.savez(npz_out, key_val_data)

    combine_silents(silent_list, "loop_archive.silent")

def main(
    silent_file,
    rosetta_flags_file="",
    xbin_cart_list=[],
    xbin_ori_list=[],
    max_len=20,
    scan_file="",
):
    """Generate a loop e2e xbin table referencing a silentfile with metadata.

    The e2e hashtable holds just xbin keys and table entry keys. The table
    entry keys are arbitrary indices into a table referencing a silentfile.
    The table is just a list of tags and positions in a text list.

    The values deposited in the getpy dict are a two-number coordinate:
    start index and num entries. To find all the appropriate values, just
    load the text list into a Python list and slice it:
    list[start:start + num_entries].
    """
    run_pyrosetta_with_flags(rosetta_flags_file)

    key_type = np.dtype("i8")
    value_type = np.dtype("i8")
    gp_dict = gp.Dict(key_type, value_type)

    xforms = []
    loop_data_string_list = []
    # silent_name = "loop_structs_out.silent"
    sfd = SilentFileData(silent_file, False, False, "binary", SilentFileOptions())
    # for pose in poses_from_silent(silent_file):
    sfd.read_file(silent_file)
    for tag in sfd.tags():
        logging.debug(f"working on tag: {tag}")
        try:
            poses = silent_tag_to_poselets(silent_file, tag, 1, 2)
        except AssertionError:
            logging.debug("assertion in hackload failed, skipping")
            continue
        except RuntimeError:
            logging.debug("Unable to load this fragment in hackload, skipping")
            continue
        tag_loop_data_list, tag_xforms_list = parse_xforms_from_poselets(
            poses, tag, max_n_mer=max_len
        )
        # tag_loop_data_list, tag_xforms_list = parse_xforms_from_tag(
        #     sfd, tag, max_n_mer=max_len
        # )
        loop_data_string_list.extend(tag_loop_data_list)
        xforms.extend(tag_xforms_list)

    logging.debug("loop data loaded")

    xbin_cart_list, xbin_ori_list = setup_xbin_vars(
        xbin_cart_list, xbin_ori_list, scan_file
    )

    logging.debug("fragments extracted, building tables")
    logging.debug(f"xbin params: c:{xbin_cart_list} o:{xbin_ori_list}")

    for xbin_cart, xbin_ori in product(xbin_cart_list, xbin_ori_list):
        binner = xb(cart_resl=xbin_cart, ori_resl=xbin_ori)
        all_keys_non_unique = binner.get_bin_index(np.array(xforms))
        keys_unique = {}
        for key, loop_data_string in zip(all_keys_non_unique, loop_data_string_list):
            if key in keys_unique.keys():
                keys_unique[key].append(loop_data_string)
            else:
                keys_unique[key] = [loop_data_string]

        offset = 0
        strings_master = []
        gp_vals_list = []
        gp_keys_list = []
        for key, strings in keys_unique.items():
            num_strings = len(strings)
            strings_master.extend(strings)
            gp_keys_list.append(key)
            gp_vals_list.append([offset, num_strings])
            offset += num_strings

        gp_keys = np.array(gp_keys_list, dtype=np.int64)
        gp_vals = np.array(gp_vals_list, dtype=np.int64)
        general_vals = gp_vals
        # squash data to fit into getpy_dict
        gp_vals = gp_vals.astype(np.int32).reshape(-1)
        gp_vals = gp_vals.view(np.int64)

        gp_dict[gp_keys] = gp_vals
        gp_dump = f"gp_c{xbin_cart}_o{xbin_ori}.bin"
        gp_dict.dump(gp_dump)

        key_val_data = np.empty((gp_keys.shape[0], 3), dtype=np.int64)
        key_val_data[:, 0] = gp_keys
        key_val_data[:, 1:] = general_vals

        # string_master_file = f"loop_tag_index_c{xbin_cart}_o{xbin_ori}.txt"
        np_strings_master = np.array(strings_master, dtype=np.bytes_)
        # with open(string_master_file, mode="wt", encoding="utf-8") as f:
        #     f.write("\n".join(strings_master))
        #     f.write("\n")
        # npz_out = "key_val_data.npz"
        # np.savez(npz_out, key_val_data)

        hdf5 = h5py.File("fragment_data.hf5", "a")
        kv_group = hdf5.require_group("key_value_data")
        key_val_ds = kv_group.require_dataset(
            f"key_val_index_cart_{xbin_cart}_ori_{xbin_ori}_nmer_{max_len}",
            key_val_data.shape,
            dtype=key_val_data.dtype,
        )
        key_val_ds[:] = key_val_data
        key_val_ds.attrs.create("cart_resl", data=xbin_cart)
        key_val_ds.attrs.create("ori_resl", data=xbin_ori)
        key_val_ds.attrs.create("max_len", data=max_len)
        key_val_ds.attrs.create(
            "description",
            data=(
                "this is a 2XN np array where col1 are the int64 xbin keys "
                "and col2/3 are two upcast int32s (row,n_strings) addressing "
                "a sequence of strings in a corresponding archive"
            ),
        )

        str_group = hdf5.require_group("string_archive")
        string_archive = str_group.require_dataset(
            f"string_archive_cart_{xbin_cart}_ori_{xbin_ori}_nmer_{max_len}",
            np_strings_master.shape,
            dtype=np_strings_master.dtype,
        )
        string_archive[:] = np_strings_master
        string_archive.attrs.create("cart_resl", data=xbin_cart)
        string_archive.attrs.create("ori_resl", data=xbin_ori)
        string_archive.attrs.create("max_len", data=max_len)
        string_archive.attrs.create(
            "description",
            data=(
                "Ordered archive of strings describing pose fragments by "
                "tag:start:end. The order is addressed by the corresponding "
                "key_val dataset"
            ),
        )
        # str_id = string_archive.id
        # key_val_ds.attrs.create("string_archive_id", data=str_id)
        hdf5.close()
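
# A minimal sketch of the lookup scheme described in the docstring of `main`
# above, assuming the artifacts written by the merge pipeline
# ("getpy_dict.bin" and "loop_tag_index.txt"). `query_xforms` and the xbin
# resolutions are placeholders; the resolutions must match the build.
def lookup_fragments(query_xforms):
    gp_dict = gp.Dict(np.int64, np.int64)
    gp_dict.load("getpy_dict.bin")

    with open("loop_tag_index.txt", "r") as f:
        loop_list = f.read().splitlines()

    binner = xb(cart_resl=1.0, ori_resl=15.0)  # placeholder resolutions
    keys = binner.get_bin_index(np.array(query_xforms))

    # keep only keys present in the table, then unpack (start, num_entries)
    mask = gp_dict.contains(keys)
    packed = gp_dict[keys[mask]].view(np.int32).reshape(-1, 2)
    return [loop_list[start : start + num] for start, num in packed]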