def _generate_pkl(pot, dbs=None, **args):
    """Generates a pickle file for a single potential and its default
    databases.
    """
    from matdb.plotting.potentials import generate
    from matdb.atoms import AtomsList
    from cPickle import dump

    outdir = path.join(args["folder"], pot.fqn)
    if not path.isdir(outdir):
        mkdir(outdir)

    if dbs is not None:
        configs = AtomsList()
        for db in dbs:
            configs.extend(list(db.iconfigs))
        pdis = generate(args["plots"], pot.calculator, configs, outdir,
                        args["base64"], valkey=args["valkey"])
    else:
        pdis = generate(args["plots"], pot.calculator,
                        pot.configs(args["subset"]), outdir, args["base64"])

    pklname = "{}-{}-plotgen.pkl".format(args["subset"], args["plots"])
    target = path.join(outdir, pklname)
    #Pickle files must be opened in binary mode for `dump`.
    with open(target, 'wb') as f:
        dump(pdis, f)
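# Hypothetical invocation of `_generate_pkl` (the `pot` trainer object and the
# keyword values are assumed, not taken from the source); the keyword names
# mirror the `args` keys consumed inside the function above.
_generate_pkl(pot, folder="plots", plots="EvsV", base64=False,
              valkey="energy", subset="holdout")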
def rset(self):
    """Constructs the Hessian matrix for the *best* convergence parameters
    in this group and its possible sub-sequences.

    Returns:
        list: list of :class:`~matdb.atoms.Atoms`; each atoms object will
          have an `H` matrix in its info dictionary.
    """
    if len(self.sequence) == 0:
        #We are at the bottom of the stack; attach the hessian matrix
        #to the atoms object it corresponds to.
        self.atoms.info["H"] = self.H
        result = AtomsList()
        result.append(self.atoms)
        return result
    else:
        #Check where we are in the stack. If we are just below the database,
        #then we want to return a list of hessian matrices and atoms
        #objects. If we are not, then we have a parameter grid of sequences
        #to select from.
        if isinstance(self.parent, Hessian):
            #We have many dynamical matrices to choose from. We need to
            #decide what "best" means and then return that one.
            bestkey = self._best_bands()
            return self.sequence[bestkey].rset
        else:
            result = AtomsList()
            for p in self.sequence.values():
                result.extend(p.rset)
            return result
def quantities(self, params=None, properties=None, aggregators=None,
               kind="train", **kwargs):
    """Returns datasets derived from the atoms objects that are present in
    this trainer's compiled databases.

    .. note:: If a property is missing from a particular atoms object, it is
      just ignored. That means the arrays returned from this method may not
      all have exactly the same length as the number of entries in the
      database.

    Args:
        params (list): list of `str` parameter names to extract from each
          atoms object.
        properties (list): list of `str` property names to extract from each
          atoms object.
        aggregators (dict): keys are `str` property names; values are `str`
          FQN of importable functions that can be applied to a
          :class:`numpy.ndarray` to produce a single scalar value. These are
          used to reduce an array of property values to a single number for a
          particular configuration. If not specified, the raw arrays are
          returned instead.
        kind (str): one of ['train', 'holdout', 'super', '*']. Specifies
          which of the database sets to use. If '*' is specified, then all of
          them are combined.
        kwargs (dict): additional dummy arguments that aren't needed, but
          allow the `**` syntax to be used.

    Returns:
        dict: keys are either property or parameter names. Values are
        :class:`numpy.ndarray` for parameters; for properties, since the
        arrays may have different sizes, the value will be a list of
        :class:`numpy.ndarray`.
    """
    assert kind in ["train", "holdout", "super", '*']
    if kind == '*':
        db = AtomsList()
        for k in ["train", "holdout", "super"]:
            db.extend(self.configs(k))
    else:
        db = self.configs(kind)

    result = {}
    if params is not None:
        for pname in params:
            result[pname] = np.array(getattr(db, pname))

    if properties is not None:
        for pname in properties:
            value = getattr(db, pname)
            #Only aggregate when a reducer was specified for this property;
            #otherwise return the raw arrays.
            if aggregators is not None and pname in aggregators:
                aggmod, aggfun = import_fqdn(aggregators[pname])
                result[pname] = aggfun(value)
            else:
                result[pname] = value

    return result
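# A minimal usage sketch (the `trainer` instance and the parameter/property
# names are hypothetical; "numpy.max" is only an example of an importable
# aggregator FQN).
data = trainer.quantities(params=["energy"], properties=["force"],
                          aggregators={"force": "numpy.max"},
                          kind="train")
# data["energy"] -> numpy.ndarray of parameter values
# data["force"]  -> reduced by numpy.max as specified in `aggregators`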
def fitting_configs(self):
    """Returns a :class:`matdb.atoms.AtomsList` for all configs in this
    group.
    """
    configs = AtomsList()
    if len(self.sequence) == 0:
        for config in self.config_atoms.values():
            configs.append(config)
    else:
        for seq in self.sequence.values():
            configs.extend(seq.fitting_configs)

    return configs
def fitting_configs(self):
    """Returns a :class:`~matdb.atoms.AtomsList` for all configs in this
    group. This list includes a single *duplicated* configuration for each
    of the eigenvalue/eigenvector combinations of the Hessian matrix.
    """
    if len(self.sequence) == 0:
        if self.ready():
            return self.config_atoms.values()
        else:
            return AtomsList()
    else:
        result = AtomsList()
        for g in self.sequence.values():
            result.extend(g.fitting_configs)
        return result
def h5cat(files, target):
    """Concatenates a list of h5 AtomsList files into a single AtomsList.

    Args:
        files (list): list of `string` file paths to combine.
        target (str): name/path of the output file that will include all of
          the combined files.
    """
    # Local import to prevent cyclic imports
    from matdb.atoms import AtomsList
    result = AtomsList()
    for fname in files:
        ilist = AtomsList(fname)
        result.extend(ilist)
    result.write(target)
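# Usage sketch for `h5cat` (the file names are illustrative): concatenate two
# per-group AtomsList h5 files and read the combined list back.
from matdb.atoms import AtomsList
h5cat(["group-0.h5", "group-1.h5"], "combined.h5")
combined = AtomsList("combined.h5")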
def rset(self):
    """Returns a :class:`matdb.atoms.AtomsList` with one entry for each
    config in the latest result set.
    """
    if len(self.sequence) == 0:
        # Return the configurations from this group; it is at the
        # bottom of the stack
        result = AtomsList()
        for epath in self.atoms_paths():
            result.append(Atoms(path.join(epath, 'pre_comp_atoms.h5')))
        return result
    else:
        result = []
        for e in self.sequence.values():
            result.extend(e.rset())
        return result
def rset(self):
    """Returns the reusable set to the next database group.

    Returns:
        list: list of :class:`~matdb.atoms.Atoms`
    """
    if len(self.sequence) == 0:
        #We are at the bottom of the stack.
        result = AtomsList()
        for config in self.fitting_configs:
            result.append(Atoms(path.join(config, "atoms.h5")))
        return result
    else:
        #Check where we are in the stack. If we are just below the database,
        #then we want to return the atoms objects for all database entries.
        #If we are not, then we have a parameter grid of sequences
        #to select from.
        result = []
        for g in self.sequence.values():
            result.extend(g.rset)
        return AtomsList(result)
def split(atlist, splits, targets, dbdir, ran_seed, dbfile=None, recalc=0,
          nonsplit=None):
    """Splits the :class:`~matdb.atoms.AtomsList` multiple times, once for
    each `split` setting in the database specification.

    Args:
        atlist (AtomsList or list): the list of :class:`matdb.atoms.Atoms`
          objects to be split, or a list of the files containing the atoms
          objects.
        splits (dict): the splits to perform.
        targets (dict): the files to save the splits in; these should contain
          a {} in the name which will be replaced with the split name. The
          dictionary must have the format
          {"train": file_name, "holdout": file_name, "super": file_name}.
        dbdir (str): the root *splits* directory for the database.
        ran_seed (int or float): the random seed for the splits (i.e. the
          controller's random seed).
        dbfile (str): the _dbfile for a legacy database.
        recalc (int): when non-zero, re-split the data and overwrite any
          existing *.h5 files. This parameter decreases as rewrites proceed
          down the stack. To re-calculate lower-level h5 files, increase this
          value.
        nonsplit (AtomsList): a list of atoms to include in the training set
          "as-is" because they cannot be split (they only have meaning
          together).
    """
    from matdb.utility import dbcat

    assert nonsplit is None or isinstance(nonsplit, AtomsList)
    for name, train_perc in splits.items():
        train_file = targets["train"](name)
        holdout_file = targets["holdout"](name)
        super_file = targets["super"](name)
        idfile = path.join(dbdir, "{0}-ids.pkl".format(name))

        if (path.isfile(train_file) and path.isfile(holdout_file)
            and path.isfile(super_file)):
            if recalc <= 0:
                return
            else:
                if path.isfile(idfile):
                    with open(idfile, 'rb') as f:
                        data = load(f)
                for fname in [train_file, holdout_file, super_file]:
                    new_name = fname.replace(
                        name, "{0}_{1}".format(name, data["uuid"]))
                    rename(fname, new_name)
                remove(idfile)

        #Compile a list of all the sub-configurations we can include in the
        #training.
        if not isinstance(atlist, AtomsList):
            subconfs = AtomsList(atlist)
        else:
            subconfs = atlist

        if path.isfile(idfile):
            with open(idfile, 'rb') as f:
                data = load(f)
            subconfs = data["subconfs"]
            ids = data["ids"]
            Ntrain = data["Ntrain"]
            Nhold = data["Nhold"]
            Ntot = data["Ntot"]
            Nsuper = data["Nsuper"]
        else:
            Ntot = len(subconfs)
            Ntrain = int(np.ceil(Ntot * train_perc))
            ids = np.arange(Ntot)
            Nhold = int(np.ceil((Ntot - Ntrain) * train_perc))
            Nsuper = Ntot - Ntrain - Nhold
            np.random.shuffle(ids)

            #We need to save these ids so that we don't mess up the
            #statistics on the training and validation sets.
            data = {
                "uuid": str(uuid4()),
                "subconfs": subconfs,
                "ids": ids,
                "Ntrain": Ntrain,
                "Nhold": Nhold,
                "Ntot": Ntot,
                "Nsuper": Nsuper,
                "ran_seed": ran_seed
            }
            with open(idfile, 'wb') as f:
                dump(data, f)

        #Only write the minimum necessary files. Use dbcat to create the
        #database version and configuration information. There is duplication
        #here because we also store the ids again. We retain the pkl file
        #above so that we can recreate *exactly* the same split again later.
        if not path.isfile(train_file):
            tids = ids[0:Ntrain]
            #Make sure that we have some atoms to write in the first place!
            if len(tids) > 0:
                altrain = subconfs[tids]
            else:
                altrain = AtomsList()

            #Add the unsplittable configurations to the training set as-is.
            Nunsplit = 0
            if nonsplit is not None:
                altrain.extend(nonsplit)
                Nunsplit = len(nonsplit)
            altrain.write(train_file)

            if dbfile is not None:
                dbcat([dbfile], train_file, docat=False, ids=tids,
                      N=Ntrain + Nunsplit)
            else:
                dbcat([], train_file, docat=False, ids=tids,
                      N=Ntrain + Nunsplit)

        if not path.isfile(holdout_file):
            hids = ids[Ntrain:-Nsuper]
            alhold = subconfs[hids]
            alhold.write(holdout_file)
            if dbfile is not None:
                dbcat([dbfile], holdout_file, docat=False, ids=hids, N=Nhold)
            else:
                dbcat([], holdout_file, docat=False, ids=hids, N=Nhold)

        if not path.isfile(super_file):
            sids = ids[-Nsuper:]
            alsuper = subconfs[sids]
            alsuper.write(super_file)
            if dbfile is not None:
                dbcat([dbfile], super_file, docat=False, ids=sids, N=Nsuper)
            else:
                dbcat([], super_file, docat=False, ids=sids, N=Nsuper)
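# A minimal usage sketch for `split` (the paths, split name, and 80% training
# fraction are hypothetical). `targets` maps each set name to a callable that
# builds a file name from the split name, matching how the entries are
# invoked above.
from os import path
from matdb.atoms import AtomsList

atlist = AtomsList("seed-configs.h5")   # assumed existing h5 file
dbdir = "splits"
targets = {
    "train": lambda n: path.join(dbdir, "{}-train.h5".format(n)),
    "holdout": lambda n: path.join(dbdir, "{}-holdout.h5".format(n)),
    "super": lambda n: path.join(dbdir, "{}-super.h5".format(n))
}
split(atlist, {"A": 0.8}, targets, dbdir, ran_seed=42)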
def _create_dbfull(self, folder, pattern, energy, force, virial, config_type):
    """Creates the full combined database.
    """
    from matdb.utility import chdir, dbcat
    from glob import glob
    from tqdm import tqdm
    from os import path

    #NB! There is a subtle bug here: if you try and open a matdb.atoms.Atoms
    #within the context manager of `chdir`, something messes up with the
    #memory sharing in fortran and it dies. This has to be separate.
    with chdir(folder):
        self.dbfiles = glob(pattern)
    rewrites = []

    for dbfile in self.dbfiles:
        #Look at the first configuration in the atoms list to determine if
        #it matches the energy, force, virial and config type parameter
        #names.
        dbpath = path.join(folder, dbfile)
        params, doforce = _atoms_conform(dbpath, energy, force, virial)
        if len(params) > 0 or doforce:
            msg.std("Conforming database file {}.".format(dbpath))
            al = AtomsList(dbpath)
            outpath = path.join(self.root, dbfile.replace(".xyz", ".h5"))

            for ai in tqdm(al):
                for target, source in params.items():
                    if (target == "config_type" and config_type is not None):
                        ai.params[target] = config_type
                    else:
                        ai.add_param(target, ai.params[source])
                        del ai.params[source]
                        if source in ai.info: #pragma: no cover
                            #(If things were done correctly by the atoms
                            #object, this should never be used. It exists
                            #mainly as a safeguard.)
                            msg.warn("The atoms object didn't properly "
                                     "update the parameters of the legacy "
                                     "atoms object.")
                            del ai.info[source]

                if doforce:
                    ai.add_property("ref_force", ai.properties[force])
                    del ai.properties[force]

            al.write(outpath)

            #Mark this db as non-conforming so that we create a new version
            #of it.
            rewrites.append(dbfile)

            dbcat([dbpath], outpath, docat=False, renames=params,
                  doforce=doforce)

    # We want a single file to hold all of the data for all the atoms in the
    # database.
    all_atoms = AtomsList()
    for dbfile in self.dbfiles:
        if dbfile in rewrites:
            infile = dbfile.replace(".xyz", ".h5")
            all_atoms.extend(AtomsList(path.join(self.root, infile)))
        else:
            dbpath = path.join(folder, dbfile)
            all_atoms.extend(AtomsList(dbpath))

    all_atoms.write(self._dbfull)

    #Finally, create the config file.
    from matdb.utility import dbcat
    with chdir(folder):
        dbcat(self.dbfiles, self._dbfull, config_type=self.config_type,
              docat=False)