Example #1
def _init_dump_selected_frames(systems, labels, selc_idx, sys_data_path,
                               jdata):
    """Dump the frames selected by ``selc_idx`` under ``sys_data_path``."""
    pick_data = jdata['pick_data']
    use_clusters = jdata.get('use_clusters', False)
    if use_clusters:
        selc_systems = dpdata.MultiSystems()
        for j in selc_idx:
            sys_name, sys_id = labels[j]
            selc_systems.append(systems[sys_name][sys_id])
        selc_systems.to_deepmd_raw(sys_data_path)
        selc_systems.to_deepmd_npy(sys_data_path, set_size=selc_idx.size)
    else:
        selc_systems = {}
        for j in selc_idx:
            sys_name, sys_id = labels[j]
            selc_systems = _add_system(selc_systems, sys_name,
                                       systems[sys_name][sys_id])
        sys_idx_map = get_system_idx(pick_data)
        for kk in selc_systems.keys():
            sub_path = os.path.join(sys_data_path,
                                    sys_name_fmt % sys_idx_map[kk])
            selc_systems[kk].to_deepmd_raw(sub_path)
            selc_systems[kk].to_deepmd_npy(sub_path, set_size=selc_idx.size)
        with open(os.path.join(sys_data_path, 'sys_idx_map.json'), 'w') as fp:
            json.dump(sys_idx_map, fp, indent=4)
Example #2
def get_systems(path, jdata):
    """Load every subdirectory of ``path`` as a deepmd/npy system into one MultiSystems."""
    system = get_system_cls(jdata)
    systems = dpdata.MultiSystems(*[
        system(os.path.join(path, s), fmt='deepmd/npy')
        for s in os.listdir(path)
    ])
    return systems
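
A minimal stand-alone sketch of the same loading pattern, using only dpdata: every subdirectory of a data directory is read as a deepmd/npy system and merged into one MultiSystems. The directory name "rest_data" and the choice of dpdata.System (rather than the class returned by get_system_cls) are assumptions for illustration.

import os
import dpdata

# Assumed layout: rest_data/<formula>/ holds deepmd/npy data.
path = "rest_data"
systems = dpdata.MultiSystems(*[
    dpdata.System(os.path.join(path, s), fmt="deepmd/npy")
    for s in os.listdir(path)
])
print(systems)
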
Example #3
def convert_data(jdata):
    """Collect Gaussian log outputs into a MultiSystems and export them as deepmd/npy."""
    s = dpdata.MultiSystems(*[
        dpdata.LabeledSystem(x, fmt="gaussian/log")
        for x in glob.glob(os.path.join(fp_path, "*", "output"))
    ],
                            type_map=jdata["type_map"])
    s.to_deepmd_npy(data_path)
    dlog.info("Initial data is avaiable in %s" % os.path.abspath(data_path))
Example #4
def init_pick(iter_index, jdata, mdata):
    """pick up init data from dataset randomly"""
    pick_data = jdata['pick_data']
    init_pick_number = jdata['init_pick_number']
    # use MultiSystems with System
    # TODO: support System and LabeledSystem
    # TODO: support other format
    systems = get_systems(pick_data, jdata)
    # label the system
    labels = []
    for key, system in systems.systems.items():
        labels.extend([(key, j) for j in range(len(system))])

    # random pick
    iter_name = make_iter_name(iter_index)
    create_path(iter_name)
    work_path = os.path.join(iter_name, model_devi_name)
    create_path(work_path)
    idx = np.arange(len(labels))
    np.random.shuffle(idx)
    pick_idx = idx[:init_pick_number]
    rest_idx = idx[init_pick_number:]

    # dump the init data
    picked_systems = dpdata.MultiSystems()
    for j in pick_idx:
        sys_name, sys_id = labels[j]
        picked_systems.append(systems[sys_name][sys_id])
    sys_data_path = os.path.join(work_path, picked_data_name)

    picked_systems.to_deepmd_raw(sys_data_path)
    picked_systems.to_deepmd_npy(sys_data_path, set_size=init_pick_number)

    # dump the rest data
    rest_systems = dpdata.MultiSystems()
    for j in rest_idx:
        sys_name, sys_id = labels[j]
        rest_systems.append(systems[sys_name][sys_id])
    sys_data_path = os.path.join(work_path, rest_data_name)
    rest_systems.to_deepmd_raw(sys_data_path)
    rest_systems.to_deepmd_npy(sys_data_path, set_size=rest_idx.size)
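
The labeling and random-split step above can be sketched in isolation with plain numpy; the formula keys, frame counts, and init_pick_number value below are made up for illustration.

import numpy as np

# Hypothetical frame counts per formula key, standing in for systems.systems.
sizes = {"C1H4O1": 120, "C2H6O1": 80}
labels = [(name, j) for name, n in sizes.items() for j in range(n)]

init_pick_number = 20               # assumed value of jdata['init_pick_number']
idx = np.arange(len(labels))
np.random.shuffle(idx)
pick_idx = idx[:init_pick_number]   # frames dumped to picked_data_name
rest_idx = idx[init_pick_number:]   # frames dumped to rest_data_name
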
Example #5
def predict(self, dp):
    try:
        # DP 1.x
        import deepmd.DeepPot as DeepPot
    except ModuleNotFoundError:
        # DP 2.x
        from deepmd.infer import DeepPot
    if not isinstance(dp, DeepPot):
        dp = DeepPot(dp)
    new_multisystems = dpdata.MultiSystems()
    for ss in self:
        new_multisystems.append(ss.predict(dp))
    return new_multisystems
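
As the body shows, MultiSystems.predict accepts either a DeepPot instance or something DeepPot can be constructed from, such as a path to a frozen model. A minimal usage sketch; the POSCAR file and the "graph.pb" model are assumed to exist.

import dpdata

# Assumed inputs: a VASP POSCAR and a frozen DeePMD-kit model "graph.pb".
ms = dpdata.MultiSystems(dpdata.System("POSCAR", fmt="vasp/poscar"))
labeled = ms.predict("graph.pb")   # the path is wrapped into DeepPot internally
print(labeled)
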
Example #6
def _searchpath(self):
    # Collect every log file under data_path, convert them to systems in
    # parallel, and merge the results into a single MultiSystems.
    logfiles = []
    for root, _, files in tqdm(os.walk(self.data_path, followlinks=True)):
        for logfile in files:
            if logfile.endswith(self.suffix):
                logfiles.append(os.path.join(root, logfile))
    multi_systems = dpdata.MultiSystems()
    with Pool() as pool:
        for system in pool.imap_unordered(self._preparedeepmdforLOG, tqdm(logfiles)):
            multi_systems.append(system)
    multi_systems.to_deepmd_npy(self.deepmd_dir)
    for formula, system in multi_systems.systems.items():
        self.system_paths.append(os.path.join(self.deepmd_dir, formula))
        self.batch_size.append(
            min(max(32 // system["coords"].shape[1], 1), system["coords"].shape[0]))
    self.atomname = multi_systems.atom_names
Example #7
def post_model_devi(iter_index, jdata, mdata):
    """calculate the model deviation"""
    iter_name = make_iter_name(iter_index)
    work_path = os.path.join(iter_name, model_devi_name)
    tasks = glob.glob(os.path.join(work_path, "task.*"))

    e_trust_lo = jdata['e_trust_lo']
    e_trust_hi = jdata['e_trust_hi']
    f_trust_lo = jdata['f_trust_lo']
    f_trust_hi = jdata['f_trust_hi']

    sys_accurate = dpdata.MultiSystems()
    sys_candinate = dpdata.MultiSystems()
    sys_failed = dpdata.MultiSystems()

    for task in tasks:
        # e.out
        details_e = glob.glob(
            os.path.join(task, "{}.*.e.out".format(detail_file_name_prefix)))
        e_all = np.array(
            [np.loadtxt(detail_e, ndmin=2)[:, 1] for detail_e in details_e])
        e_std = np.std(e_all, axis=0)
        n_frame = e_std.size

        # f.out
        details_f = glob.glob(
            os.path.join(task, "{}.*.f.out".format(detail_file_name_prefix)))
        f_all = np.array([
            np.loadtxt(detail_f, ndmin=2)[:, 3:6].reshape((n_frame, -1, 3))
            for detail_f in details_f
        ])
        # (n_model, n_frame, n_atom, 3)
        f_std = np.std(f_all, axis=0)
        # (n_frame, n_atom, 3)
        f_std = np.linalg.norm(f_std, axis=2)
        # (n_frame, n_atom)
        f_std = np.max(f_std, axis=1)
        # (n_frame,)

        system_cls = get_system_cls(jdata)
        for subsys, e_devi, f_devi in zip(
                system_cls(os.path.join(task, rest_data_name),
                           fmt='deepmd/npy'), e_std, f_std):
            if (e_devi < e_trust_hi
                    and e_devi >= e_trust_lo) or (f_devi < f_trust_hi
                                                  and f_devi >= f_trust_lo):
                sys_candinate.append(subsys)
            elif (e_devi >= e_trust_hi) or (f_devi >= f_trust_hi):
                sys_failed.append(subsys)
            elif (e_devi < e_trust_lo and f_devi < f_trust_lo):
                sys_accurate.append(subsys)
    counter = {
        "candidate": sys_candinate.get_nframes(),
        "accurate": sys_accurate.get_nframes(),
        "failed": sys_failed.get_nframes()
    }
    fp_sum = sum(counter.values())
    for cc_key, cc_value in counter.items():
        dlog.info("{0:9s} : {1:6d} in {2:6d} {3:6.2f} %".format(
            cc_key, cc_value, fp_sum, cc_value / fp_sum * 100))

    # label the candidate system
    labels = []
    for key, system in sys_candinate.systems.items():
        labels.extend([(key, j) for j in range(len(system))])
    # candidate: pick up randomly
    iter_pick_number = jdata['iter_pick_number']
    idx = np.arange(counter['candidate'])
    np.random.shuffle(idx)
    pick_idx = idx[:iter_pick_number]
    rest_idx = idx[iter_pick_number:]

    # dump the picked candidate data
    picked_systems = dpdata.MultiSystems()
    for j in pick_idx:
        sys_name, sys_id = labels[j]
        picked_systems.append(sys_candinate[sys_name][sys_id])
    sys_data_path = os.path.join(work_path, picked_data_name)

    picked_systems.to_deepmd_raw(sys_data_path)
    picked_systems.to_deepmd_npy(sys_data_path, set_size=iter_pick_number)

    # dump the rest data (not picked candidate data and failed data)
    rest_systems = dpdata.MultiSystems()
    for j in rest_idx:
        sys_name, sys_id = labels[j]
        rest_systems.append(sys_candinate[sys_name][sys_id])
    rest_systems += sys_failed
    sys_data_path = os.path.join(work_path, rest_data_name)
    rest_systems.to_deepmd_raw(sys_data_path)
    rest_systems.to_deepmd_npy(sys_data_path, set_size=rest_idx.size)

    # dump the accurate data -- to another directory
    sys_data_path = os.path.join(work_path, accurate_data_name)
    sys_accurate.to_deepmd_raw(sys_data_path)
    sys_accurate.to_deepmd_npy(sys_data_path,
                               set_size=sys_accurate.get_nframes())
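
The shape comments above trace a three-step reduction of the force predictions from several models: standard deviation across models, vector norm over the xyz components, then the maximum over atoms. A small self-contained sketch with random stand-in data:

import numpy as np

# Random stand-in predictions: 4 models, 10 frames, 32 atoms, 3 components.
f_all = np.random.rand(4, 10, 32, 3)
f_std = np.std(f_all, axis=0)           # (n_frame, n_atom, 3): spread across models
f_std = np.linalg.norm(f_std, axis=2)   # (n_frame, n_atom): per-atom deviation magnitude
f_devi = np.max(f_std, axis=1)          # (n_frame,): worst-atom deviation per frame
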
Example #8
def post_model_devi(iter_index, jdata, mdata):
    """calculate the model deviation"""
    use_clusters = jdata.get('use_clusters', False)
    iter_name = make_iter_name(iter_index)
    work_path = os.path.join(iter_name, model_devi_name)
    tasks = glob.glob(os.path.join(work_path, "task.*"))
    tasks.sort()

    e_trust_lo = jdata['e_trust_lo']
    e_trust_hi = jdata['e_trust_hi']
    f_trust_lo = jdata['f_trust_lo']
    f_trust_hi = jdata['f_trust_hi']

    if use_clusters:
        sys_accurate = dpdata.MultiSystems()
        sys_candinate = dpdata.MultiSystems()
        sys_failed = dpdata.MultiSystems()
    else:
        sys_accurate = {}
        sys_candinate = {}
        sys_failed = {}
        all_names = set()

    for task in tasks:
        if not use_clusters:
            sys_name = os.path.basename(task).split('.')[1]
            all_names.add(sys_name)
        # e.out
        details_e = glob.glob(
            os.path.join(task, "{}.*.e.out".format(detail_file_name_prefix)))
        e_all = np.array(
            [np.loadtxt(detail_e, ndmin=2)[:, 1] for detail_e in details_e])
        e_std = np.std(e_all, axis=0)
        n_frame = e_std.size

        # f.out
        details_f = glob.glob(
            os.path.join(task, "{}.*.f.out".format(detail_file_name_prefix)))
        f_all = np.array([
            np.loadtxt(detail_f, ndmin=2)[:, 3:6].reshape((n_frame, -1, 3))
            for detail_f in details_f
        ])
        # (n_model, n_frame, n_atom, 3)
        f_std = np.std(f_all, axis=0)
        # (n_frame, n_atom, 3)
        f_std = np.linalg.norm(f_std, axis=2)
        # (n_frame, n_atom)
        f_std = np.max(f_std, axis=1)
        # (n_frame,)

        system_cls = get_system_cls(jdata)
        for subsys, e_devi, f_devi in zip(
                system_cls(os.path.join(task, rest_data_name),
                           fmt='deepmd/npy'), e_std, f_std):
            if (e_devi < e_trust_hi
                    and e_devi >= e_trust_lo) or (f_devi < f_trust_hi
                                                  and f_devi >= f_trust_lo):
                if use_clusters:
                    sys_candinate.append(subsys)
                else:
                    sys_candinate = _add_system(sys_candinate, sys_name,
                                                subsys)
            elif (e_devi >= e_trust_hi) or (f_devi >= f_trust_hi):
                if use_clusters:
                    sys_failed.append(subsys)
                else:
                    sys_failed = _add_system(sys_failed, sys_name, subsys)
            elif (e_devi < e_trust_lo and f_devi < f_trust_lo):
                if use_clusters:
                    sys_accurate.append(subsys)
                else:
                    sys_accurate = _add_system(sys_accurate, sys_name, subsys)
            else:
                raise RuntimeError(
                    'reach a place that should NOT be reached...')
    if use_clusters:
        counter = {
            "candidate": sys_candinate.get_nframes(),
            "accurate": sys_accurate.get_nframes(),
            "failed": sys_failed.get_nframes()
        }
        fp_sum = sum(counter.values())
        for cc_key, cc_value in counter.items():
            dlog.info("{0:9s} : {1:6d} in {2:6d} {3:6.2f} %".format(
                cc_key, cc_value, fp_sum, cc_value / fp_sum * 100))
    else:
        all_names = list(all_names)
        all_names.sort()
        counter = {"candidate": 0, "accurate": 0, "failed": 0}
        for kk in all_names:
            sys_counter = {"candidate": 0, "accurate": 0, "failed": 0}
            if kk in sys_candinate.keys():
                sys_counter['candidate'] += sys_candinate[kk].get_nframes()
            if kk in sys_accurate.keys():
                sys_counter['accurate'] += sys_accurate[kk].get_nframes()
            if kk in sys_failed.keys():
                sys_counter['failed'] += sys_failed[kk].get_nframes()
            fp_sum = sum(sys_counter.values())
            for cc_key, cc_value in sys_counter.items():
                if fp_sum != 0:
                    dlog.info(
                        "sys{0:s} {1:9s} : {2:6d} in {3:6d} {4:6.2f} %".format(
                            kk, cc_key, cc_value, fp_sum,
                            cc_value / fp_sum * 100))
                else:
                    dlog.info(
                        "sys{0:s} {1:9s} : {2:6d} in {3:6d} {4:6.2f} %".format(
                            kk, cc_key, cc_value, fp_sum, 0 * 100))
            for ii in ['candidate', 'accurate', 'failed']:
                counter[ii] += sys_counter[ii]

    if counter['candidate'] == 0 and counter['failed'] > 0:
        raise RuntimeError(
            'no candidate but still have failed cases, stop. You may want to refine the training or to increase the trust level hi'
        )

    # label the candidate system
    labels = []
    if use_clusters:
        items = sys_candinate.systems.items()
    else:
        items = sys_candinate.items()
    for key, system in items:
        labels.extend([(key, j) for j in range(len(system))])
    # candidate: pick up randomly
    iter_pick_number = jdata['iter_pick_number']
    idx = np.arange(counter['candidate'])
    assert (len(idx) == len(labels))
    np.random.shuffle(idx)
    pick_idx = idx[:iter_pick_number]
    rest_idx = idx[iter_pick_number:]
    dlog.info("total candidate {0:6d}   picked {1:6d} ({2:6.2f} %) rest {3:6d} ({4:6.2f} % )".format\
              (counter['candidate'], len(pick_idx), float(len(pick_idx))/counter['candidate']*100., len(rest_idx), float(len(rest_idx))/counter['candidate']*100.))

    # dump the picked candidate data
    if use_clusters:
        picked_systems = dpdata.MultiSystems()
        for j in pick_idx:
            sys_name, sys_id = labels[j]
            picked_systems.append(sys_candinate[sys_name][sys_id])
        sys_data_path = os.path.join(work_path, picked_data_name)
        picked_systems.to_deepmd_raw(sys_data_path)
        picked_systems.to_deepmd_npy(sys_data_path, set_size=iter_pick_number)
    else:
        selc_systems = {}
        for j in pick_idx:
            sys_name, sys_id = labels[j]
            selc_systems = _add_system(selc_systems, sys_name,
                                       sys_candinate[sys_name][sys_id])
        sys_data_path = os.path.join(work_path, picked_data_name)
        _dump_system_dict(selc_systems, sys_data_path)

    # dump the rest data (not picked candidate data and failed data)
    if use_clusters:
        rest_systems = dpdata.MultiSystems()
        for j in rest_idx:
            sys_name, sys_id = labels[j]
            rest_systems.append(sys_candinate[sys_name][sys_id])
        rest_systems += sys_failed
        sys_data_path = os.path.join(work_path, rest_data_name)
        rest_systems.to_deepmd_raw(sys_data_path)
        rest_systems.to_deepmd_npy(sys_data_path, set_size=rest_idx.size)
    else:
        selc_systems = {}
        for j in rest_idx:
            sys_name, sys_id = labels[j]
            selc_systems = _add_system(selc_systems, sys_name,
                                       sys_candinate[sys_name][sys_id])
        for kk in sys_failed.keys():
            selc_systems = _add_system(selc_systems, kk, sys_failed[kk])
        sys_data_path = os.path.join(work_path, rest_data_name)
        _dump_system_dict(selc_systems, sys_data_path)

    # dump the accurate data -- to another directory
    if use_clusters:
        sys_data_path = os.path.join(work_path, accurate_data_name)
        sys_accurate.to_deepmd_raw(sys_data_path)
        sys_accurate.to_deepmd_npy(sys_data_path,
                                   set_size=sys_accurate.get_nframes())
    else:
        sys_data_path = os.path.join(work_path, accurate_data_name)
        _dump_system_dict(sys_accurate, sys_data_path)
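
The trust-level branching shared by both versions of post_model_devi can be condensed into a small helper. The threshold values here are arbitrary placeholders, and the branch order mirrors the code above: the candidate window is tested before the failed test.

e_trust_lo, e_trust_hi = 0.005, 0.5    # placeholder energy thresholds
f_trust_lo, f_trust_hi = 0.05, 0.45    # placeholder force thresholds

def classify(e_devi, f_devi):
    # Candidate window is checked first, exactly as in the code above.
    if e_trust_lo <= e_devi < e_trust_hi or f_trust_lo <= f_devi < f_trust_hi:
        return "candidate"
    if e_devi >= e_trust_hi or f_devi >= f_trust_hi:
        return "failed"
    return "accurate"

print(classify(0.001, 0.02))   # -> accurate
print(classify(0.010, 0.02))   # -> candidate
print(classify(0.800, 0.02))   # -> failed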