def _init_dump_selected_frames(systems, labels, selc_idx, sys_data_path, jdata):
    """Dump the frames selected by ``selc_idx`` to ``sys_data_path`` in deepmd format.

    ``labels[i]`` maps a flat index to a ``(system_name, frame_id)`` pair.
    With ``use_clusters`` the frames are merged into one MultiSystems dump;
    otherwise they are grouped per system name into numbered sub-directories
    and the name-to-index mapping is written alongside as JSON.
    """
    pick_data = jdata['pick_data']
    use_clusters = jdata.get('use_clusters', False)
    if use_clusters:
        # merge all selected frames into a single MultiSystems and dump once
        chosen = dpdata.MultiSystems()
        for flat_idx in selc_idx:
            name, frame_id = labels[flat_idx]
            chosen.append(systems[name][frame_id])
        chosen.to_deepmd_raw(sys_data_path)
        chosen.to_deepmd_npy(sys_data_path, set_size=selc_idx.size)
    else:
        # bucket the selected frames by their originating system name
        chosen = {}
        for flat_idx in selc_idx:
            name, frame_id = labels[flat_idx]
            chosen = _add_system(chosen, name, systems[name][frame_id])
        sys_idx_map = get_system_idx(pick_data)
        for name in chosen.keys():
            sub_path = os.path.join(sys_data_path, sys_name_fmt % sys_idx_map[name])
            chosen[name].to_deepmd_raw(sub_path)
            chosen[name].to_deepmd_npy(sub_path, set_size=selc_idx.size)
        # persist the system-name -> index mapping next to the data
        with open(os.path.join(sys_data_path, 'sys_idx_map.json'), 'w') as fp:
            json.dump(sys_idx_map, fp, indent=4)
def get_systems(path, jdata):
    """Load every deepmd/npy sub-directory of *path* into one MultiSystems.

    The per-directory system class is chosen by ``get_system_cls(jdata)``
    (labeled or unlabeled, depending on the run configuration).
    """
    system_cls = get_system_cls(jdata)
    loaded = [
        system_cls(os.path.join(path, entry), fmt='deepmd/npy')
        for entry in os.listdir(path)
    ]
    return dpdata.MultiSystems(*loaded)
def convert_data(jdata):
    """Convert Gaussian log outputs to deepmd/npy training data.

    Reads every ``*/output`` Gaussian log under the module-level ``fp_path``,
    merges them into a single :class:`dpdata.MultiSystems` using
    ``jdata['type_map']``, and writes the result to the module-level
    ``data_path``.

    Parameters
    ----------
    jdata : dict
        Run parameters; only ``'type_map'`` is used here.
    """
    s = dpdata.MultiSystems(*[
        dpdata.LabeledSystem(x, fmt="gaussian/log")
        for x in glob.glob(os.path.join(fp_path, "*", "output"))
    ], type_map=jdata["type_map"])
    s.to_deepmd_npy(data_path)
    # fix: log message previously misspelled "available" as "avaiable"
    dlog.info("Initial data is available in %s" % os.path.abspath(data_path))
def init_pick(iter_index, jdata, mdata):
    """pick up init data from dataset randomly

    Randomly splits the dataset under ``jdata['pick_data']`` into an initial
    pick of up to ``jdata['init_pick_number']`` frames and the remainder, and
    dumps both splits in deepmd raw/npy format under this iteration's
    model-devi work path.

    Parameters
    ----------
    iter_index : int
        Index of the current iteration (used to build the iteration dir name).
    jdata : dict
        Run parameters; uses ``'pick_data'`` and ``'init_pick_number'``.
    mdata : dict
        Machine parameters (unused here; kept for a uniform task signature).
    """
    pick_data = jdata['pick_data']
    init_pick_number = jdata['init_pick_number']
    # use MultiSystems with System
    # TODO: support System and LabeledSystem
    # TODO: support other format
    systems = get_systems(pick_data, jdata)
    # label every frame as a (system key, frame index) pair
    labels = []
    for key, system in systems.systems.items():
        labels.extend([(key, j) for j in range(len(system))])
    # random pick
    iter_name = make_iter_name(iter_index)
    create_path(iter_name)
    work_path = os.path.join(iter_name, model_devi_name)
    create_path(work_path)
    idx = np.arange(len(labels))
    np.random.shuffle(idx)
    pick_idx = idx[:init_pick_number]
    rest_idx = idx[init_pick_number:]
    # dump the init data
    picked_systems = dpdata.MultiSystems()
    for j in pick_idx:
        sys_name, sys_id = labels[j]
        picked_systems.append(systems[sys_name][sys_id])
    sys_data_path = os.path.join(work_path, picked_data_name)
    picked_systems.to_deepmd_raw(sys_data_path)
    # fix: use the actual number of picked frames (pick_idx.size) instead of
    # init_pick_number — they differ when the dataset holds fewer frames than
    # requested; this also matches the rest-data dump below (rest_idx.size)
    picked_systems.to_deepmd_npy(sys_data_path, set_size=pick_idx.size)
    # dump the rest data
    rest_systems = dpdata.MultiSystems()
    for j in rest_idx:
        sys_name, sys_id = labels[j]
        rest_systems.append(systems[sys_name][sys_id])
    sys_data_path = os.path.join(work_path, rest_data_name)
    rest_systems.to_deepmd_raw(sys_data_path)
    rest_systems.to_deepmd_npy(sys_data_path, set_size=rest_idx.size)
def predict(self, dp):
    """Evaluate every contained system with a DeepPot model.

    *dp* may be an already-constructed ``DeepPot`` instance or something
    accepted by the ``DeepPot`` constructor (e.g. a model file path).
    Returns a new MultiSystems holding the predicted systems.
    """
    try:
        # DP 1.x
        import deepmd.DeepPot as DeepPot
    except ModuleNotFoundError:
        # DP 2.x
        from deepmd.infer import DeepPot
    # wrap a raw model reference into a DeepPot instance when needed
    model = dp if isinstance(dp, DeepPot) else DeepPot(dp)
    predicted = dpdata.MultiSystems()
    for subsystem in self:
        predicted.append(subsystem.predict(model))
    return predicted
def _searchpath(self):
    """Collect matching log files under ``self.data_path``, parse them in
    parallel into deepmd systems, dump the data, and record per-system
    paths, batch sizes, and atom names on *self*."""
    # gather every file ending with the configured suffix (links followed)
    logfiles = [
        os.path.join(root, fname)
        for root, _, files in tqdm(os.walk(self.data_path, followlinks=True))
        for fname in files
        if fname.endswith(self.suffix)
    ]
    multi_systems = dpdata.MultiSystems()
    # parse the logs across a process pool; completion order doesn't matter
    with Pool() as pool:
        for parsed in pool.imap_unordered(self._preparedeepmdforLOG,
                                          tqdm(logfiles)):
            multi_systems.append(parsed)
    multi_systems.to_deepmd_npy(self.deepmd_dir)
    for formula, system in multi_systems.systems.items():
        self.system_paths.append(os.path.join(self.deepmd_dir, formula))
        n_frames = system["coords"].shape[0]
        n_atoms = system["coords"].shape[1]
        # aim for ~32 atoms per batch, clamped to [1, n_frames]
        self.batch_size.append(min(max(32 // n_atoms, 1), n_frames))
    self.atomname = multi_systems.atom_names
def post_model_devi(iter_index, jdata, mdata):
    """calculate the model deviation

    For each ``task.*`` directory under this iteration's model-devi work
    path, read the per-model energy (``*.e.out``) and force (``*.f.out``)
    predictions, compute the standard deviation across the model committee,
    and bin every frame as candidate / failed / accurate against the
    ``e_trust_*`` / ``f_trust_*`` thresholds.  A random subset of the
    candidates (``iter_pick_number``) is dumped to ``picked_data_name``; the
    remaining candidates plus all failed frames go to ``rest_data_name``;
    accurate frames go to ``accurate_data_name``.

    Parameters
    ----------
    iter_index : int
        Index of the current iteration.
    jdata : dict
        Run parameters (trust levels, ``iter_pick_number``, system format).
    mdata : dict
        Machine parameters (unused here; kept for a uniform task signature).
    """
    iter_name = make_iter_name(iter_index)
    work_path = os.path.join(iter_name, model_devi_name)
    tasks = glob.glob(os.path.join(work_path, "task.*"))
    # fix: sort for a deterministic frame order (glob order is
    # filesystem-dependent)
    tasks.sort()
    e_trust_lo = jdata['e_trust_lo']
    e_trust_hi = jdata['e_trust_hi']
    f_trust_lo = jdata['f_trust_lo']
    f_trust_hi = jdata['f_trust_hi']
    sys_accurate = dpdata.MultiSystems()
    sys_candinate = dpdata.MultiSystems()
    sys_failed = dpdata.MultiSystems()
    # hoisted out of the task loop: the class only depends on jdata
    system_cls = get_system_cls(jdata)
    for task in tasks:
        # e.out: energy column per model -> (n_model, n_frame)
        details_e = glob.glob(
            os.path.join(task, "{}.*.e.out".format(detail_file_name_prefix)))
        e_all = np.array(
            [np.loadtxt(detail_e, ndmin=2)[:, 1] for detail_e in details_e])
        e_std = np.std(e_all, axis=0)
        n_frame = e_std.size
        # f.out: per-atom force components per model
        details_f = glob.glob(
            os.path.join(task, "{}.*.f.out".format(detail_file_name_prefix)))
        f_all = np.array([
            np.loadtxt(detail_f, ndmin=2)[:, 3:6].reshape((n_frame, -1, 3))
            for detail_f in details_f
        ])  # (n_model, n_frame, n_atom, 3)
        f_std = np.std(f_all, axis=0)          # (n_frame, n_atom, 3)
        f_std = np.linalg.norm(f_std, axis=2)  # (n_frame, n_atom)
        f_std = np.max(f_std, axis=1)          # (n_frame,)
        # bin each frame by where its deviations fall wrt the trust window
        for subsys, e_devi, f_devi in zip(
                system_cls(os.path.join(task, rest_data_name),
                           fmt='deepmd/npy'), e_std, f_std):
            if (e_devi < e_trust_hi and e_devi >= e_trust_lo) or (
                    f_devi < f_trust_hi and f_devi >= f_trust_lo):
                sys_candinate.append(subsys)
            elif (e_devi >= e_trust_hi) or (f_devi >= f_trust_hi):
                sys_failed.append(subsys)
            elif (e_devi < e_trust_lo and f_devi < f_trust_lo):
                sys_accurate.append(subsys)
    counter = {
        "candidate": sys_candinate.get_nframes(),
        "accurate": sys_accurate.get_nframes(),
        "failed": sys_failed.get_nframes()
    }
    fp_sum = sum(counter.values())
    for cc_key, cc_value in counter.items():
        # fix: guard against ZeroDivisionError when no frames were classified
        percent = cc_value / fp_sum * 100 if fp_sum > 0 else 0.0
        dlog.info("{0:9s} : {1:6d} in {2:6d} {3:6.2f} %".format(
            cc_key, cc_value, fp_sum, percent))
    # label the candidate system
    labels = []
    for key, system in sys_candinate.systems.items():
        labels.extend([(key, j) for j in range(len(system))])
    # candinate: pick up randomly
    iter_pick_number = jdata['iter_pick_number']
    idx = np.arange(counter['candidate'])
    np.random.shuffle(idx)
    pick_idx = idx[:iter_pick_number]
    rest_idx = idx[iter_pick_number:]
    # dump the picked candinate data
    picked_systems = dpdata.MultiSystems()
    for j in pick_idx:
        sys_name, sys_id = labels[j]
        picked_systems.append(sys_candinate[sys_name][sys_id])
    sys_data_path = os.path.join(work_path, picked_data_name)
    picked_systems.to_deepmd_raw(sys_data_path)
    picked_systems.to_deepmd_npy(sys_data_path, set_size=iter_pick_number)
    # dump the rest data (not picked candinate data and failed data)
    rest_systems = dpdata.MultiSystems()
    for j in rest_idx:
        sys_name, sys_id = labels[j]
        rest_systems.append(sys_candinate[sys_name][sys_id])
    rest_systems += sys_failed
    sys_data_path = os.path.join(work_path, rest_data_name)
    rest_systems.to_deepmd_raw(sys_data_path)
    rest_systems.to_deepmd_npy(sys_data_path, set_size=rest_idx.size)
    # dump the accurate data -- to another directory
    sys_data_path = os.path.join(work_path, accurate_data_name)
    sys_accurate.to_deepmd_raw(sys_data_path)
    sys_accurate.to_deepmd_npy(sys_data_path,
                               set_size=sys_accurate.get_nframes())
def post_model_devi(iter_index, jdata, mdata):
    """calculate the model deviation

    Reads the per-model energy/force predictions for every ``task.*``
    directory of this iteration, computes committee standard deviations,
    and splits the frames into candidate / failed / accurate sets against
    the ``e_trust_*`` / ``f_trust_*`` thresholds.  A random subset of the
    candidates is dumped as picked data; the rest plus the failed frames
    and the accurate frames are dumped to their own directories.  With
    ``use_clusters`` the frames are kept in MultiSystems; otherwise they
    are grouped per source system name in plain dicts.
    """
    use_clusters = jdata.get('use_clusters', False)
    iter_name = make_iter_name(iter_index)
    work_path = os.path.join(iter_name, model_devi_name)
    tasks = glob.glob(os.path.join(work_path, "task.*"))
    tasks.sort()
    e_trust_lo = jdata['e_trust_lo']
    e_trust_hi = jdata['e_trust_hi']
    f_trust_lo = jdata['f_trust_lo']
    f_trust_hi = jdata['f_trust_hi']
    if use_clusters:
        # cluster mode: one MultiSystems per category
        sys_accurate = dpdata.MultiSystems()
        sys_candinate = dpdata.MultiSystems()
        sys_failed = dpdata.MultiSystems()
    else:
        # non-cluster mode: map system name -> system, per category
        sys_accurate = {}
        sys_candinate = {}
        sys_failed = {}
        all_names = set()
    for task in tasks:
        if not use_clusters:
            # task dir is named "task.<sys_name>...."; extract the system name
            sys_name = os.path.basename(task).split('.')[1]
            all_names.add(sys_name)
        # e.out: one energy column per model -> e_all is (n_model, n_frame)
        details_e = glob.glob(
            os.path.join(task, "{}.*.e.out".format(detail_file_name_prefix)))
        e_all = np.array(
            [np.loadtxt(detail_e, ndmin=2)[:, 1] for detail_e in details_e])
        e_std = np.std(e_all, axis=0)
        n_frame = e_std.size
        # f.out: per-atom force components per model
        details_f = glob.glob(
            os.path.join(task, "{}.*.f.out".format(detail_file_name_prefix)))
        f_all = np.array([
            np.loadtxt(detail_f, ndmin=2)[:, 3:6].reshape((n_frame, -1, 3))
            for detail_f in details_f
        ])  # (n_model, n_frame, n_atom, 3)
        f_std = np.std(f_all, axis=0)  # (n_frame, n_atom, 3)
        f_std = np.linalg.norm(f_std, axis=2)  # (n_frame, n_atom)
        f_std = np.max(f_std, axis=1)  # (n_frame,)
        system_cls = get_system_cls(jdata)
        # classify each frame by where its deviations fall wrt the trust window
        for subsys, e_devi, f_devi in zip(
                system_cls(os.path.join(task, rest_data_name),
                           fmt='deepmd/npy'), e_std, f_std):
            if (e_devi < e_trust_hi and e_devi >= e_trust_lo) or (
                    f_devi < f_trust_hi and f_devi >= f_trust_lo):
                # candidate: inside the [lo, hi) trust window on either metric
                if use_clusters:
                    sys_candinate.append(subsys)
                else:
                    sys_candinate = _add_system(sys_candinate, sys_name, subsys)
            elif (e_devi >= e_trust_hi) or (f_devi >= f_trust_hi):
                # failed: above the upper trust level on either metric
                if use_clusters:
                    sys_failed.append(subsys)
                else:
                    sys_failed = _add_system(sys_failed, sys_name, subsys)
            elif (e_devi < e_trust_lo and f_devi < f_trust_lo):
                # accurate: below the lower trust level on both metrics
                if use_clusters:
                    sys_accurate.append(subsys)
                else:
                    sys_accurate = _add_system(sys_accurate, sys_name, subsys)
            else:
                # unreachable for finite deviations (NaN could land here)
                raise RuntimeError(
                    'reach a place that should NOT be reached...')
    if use_clusters:
        counter = {
            "candidate": sys_candinate.get_nframes(),
            "accurate": sys_accurate.get_nframes(),
            "failed": sys_failed.get_nframes()
        }
        fp_sum = sum(counter.values())
        for cc_key, cc_value in counter.items():
            dlog.info("{0:9s} : {1:6d} in {2:6d} {3:6.2f} %".format(
                cc_key, cc_value, fp_sum, cc_value / fp_sum * 100))
    else:
        # report per-system statistics and accumulate the global counters
        all_names = list(all_names)
        all_names.sort()
        counter = {"candidate": 0, "accurate": 0, "failed": 0}
        for kk in all_names:
            sys_counter = {"candidate": 0, "accurate": 0, "failed": 0}
            if kk in sys_candinate.keys():
                sys_counter['candidate'] += sys_candinate[kk].get_nframes()
            if kk in sys_accurate.keys():
                sys_counter['accurate'] += sys_accurate[kk].get_nframes()
            if kk in sys_failed.keys():
                sys_counter['failed'] += sys_failed[kk].get_nframes()
            fp_sum = sum(sys_counter.values())
            for cc_key, cc_value in sys_counter.items():
                if fp_sum != 0:
                    dlog.info(
                        "sys{0:s} {1:9s} : {2:6d} in {3:6d} {4:6.2f} %".format(
                            kk, cc_key, cc_value, fp_sum,
                            cc_value / fp_sum * 100))
                else:
                    dlog.info(
                        "sys{0:s} {1:9s} : {2:6d} in {3:6d} {4:6.2f} %".format(
                            kk, cc_key, cc_value, fp_sum, 0 * 100))
            for ii in ['candidate', 'accurate', 'failed']:
                counter[ii] += sys_counter[ii]
    if counter['candidate'] == 0 and counter['failed'] > 0:
        raise RuntimeError(
            'no candidate but still have failed cases, stop. You may want to refine the training or to increase the trust level hi')
    # label the candidate system: flat list of (system key, frame index)
    labels = []
    if use_clusters:
        items = sys_candinate.systems.items()
    else:
        items = sys_candinate.items()
    for key, system in items:
        labels.extend([(key, j) for j in range(len(system))])
    # candinate: pick up randomly
    iter_pick_number = jdata['iter_pick_number']
    idx = np.arange(counter['candidate'])
    assert (len(idx) == len(labels))
    np.random.shuffle(idx)
    pick_idx = idx[:iter_pick_number]
    rest_idx = idx[iter_pick_number:]
    # NOTE(review): if counter['candidate'] == 0 (with no failed frames, so
    # the RuntimeError above is not raised) the percentages below divide by
    # zero — confirm whether that case can occur in practice
    dlog.info("total candidate {0:6d} picked {1:6d} ({2:6.2f} %) rest {3:6d} ({4:6.2f} % )".format(
        counter['candidate'], len(pick_idx),
        float(len(pick_idx)) / counter['candidate'] * 100., len(rest_idx),
        float(len(rest_idx)) / counter['candidate'] * 100.))
    # dump the picked candinate data
    if use_clusters:
        picked_systems = dpdata.MultiSystems()
        for j in pick_idx:
            sys_name, sys_id = labels[j]
            picked_systems.append(sys_candinate[sys_name][sys_id])
        sys_data_path = os.path.join(work_path, picked_data_name)
        picked_systems.to_deepmd_raw(sys_data_path)
        picked_systems.to_deepmd_npy(sys_data_path, set_size=iter_pick_number)
    else:
        selc_systems = {}
        for j in pick_idx:
            sys_name, sys_id = labels[j]
            selc_systems = _add_system(selc_systems, sys_name,
                                       sys_candinate[sys_name][sys_id])
        sys_data_path = os.path.join(work_path, picked_data_name)
        _dump_system_dict(selc_systems, sys_data_path)
    # dump the rest data (not picked candinate data and failed data)
    if use_clusters:
        rest_systems = dpdata.MultiSystems()
        for j in rest_idx:
            sys_name, sys_id = labels[j]
            rest_systems.append(sys_candinate[sys_name][sys_id])
        rest_systems += sys_failed
        sys_data_path = os.path.join(work_path, rest_data_name)
        rest_systems.to_deepmd_raw(sys_data_path)
        rest_systems.to_deepmd_npy(sys_data_path, set_size=rest_idx.size)
    else:
        selc_systems = {}
        for j in rest_idx:
            sys_name, sys_id = labels[j]
            selc_systems = _add_system(selc_systems, sys_name,
                                       sys_candinate[sys_name][sys_id])
        # merge in all failed frames alongside the unpicked candidates
        for kk in sys_failed.keys():
            selc_systems = _add_system(selc_systems, kk, sys_failed[kk])
        sys_data_path = os.path.join(work_path, rest_data_name)
        _dump_system_dict(selc_systems, sys_data_path)
    # dump the accurate data -- to another directory
    if use_clusters:
        sys_data_path = os.path.join(work_path, accurate_data_name)
        sys_accurate.to_deepmd_raw(sys_data_path)
        sys_accurate.to_deepmd_npy(sys_data_path,
                                   set_size=sys_accurate.get_nframes())
    else:
        sys_data_path = os.path.join(work_path, accurate_data_name)
        _dump_system_dict(sys_accurate, sys_data_path)