def train(
    self,
    inputs,
    targets,
    data=None,
    optimizer=(None, None),
    epochs=100,
    regularization=None,
    convergence=None,
    lossfxn=None,
    device="cpu",
    batch_size=None,
    lr_scheduler=None,
    independent_loss=True,
    loss_weights=None,
):
    """Train the models

    Parameters
    ----------
    inputs : dict
        Dictionary with hashed feature space.
    targets : list
        The expected values that the model has to learn aka y.
    data : object
        Data object created from the handler.
    optimizer : tuple
        The optimizer is a tuple with the structure:

            >>> ('adam', {'lr': float, 'weight_decay': float})

    epochs : int
        Number of full training cycles.
    regularization : float
        This is the L2 regularization. It is not the same as weight decay.
    convergence : dict
        Instead of using epochs, users can set a convergence criterion.

            >>> convergence = {"rmse": [0.04, 0.02]}

    lossfxn : obj
        A loss function object.
    device : str
        Calculation can be run on cpu or cuda (gpu).
    batch_size : int
        Number of data points per batch to use for training. Default is
        None.
    lr_scheduler : tuple
        Tuple with structure: scheduler's name and a dictionary with
        keyword arguments.

            >>> lr_scheduler = ('ReduceLROnPlateau', {'mode': 'min', 'patience': 10})

    independent_loss : bool
        Whether or not the models' weights are optimized independently.
    loss_weights : list
        How much the loss of model(i) contributes to the total loss.
    """

    self.epochs = epochs

    # Convergence criterion
    if isinstance(convergence["rmse"], (float, int)):
        convergence["rmse"] = np.array(
            [convergence["rmse"] for model in range(len(self.models))]
        )
    elif isinstance(convergence["rmse"], list):
        if len(convergence["rmse"]) != len(self.models):
            raise ValueError(
                "The convergence list must have the same length as the "
                "number of models."
            )
        convergence["rmse"] = np.array(convergence["rmse"])

    logger.info(" ")
    logging.info("Model Merger")
    logging.info("============")
    now = datetime.datetime.now()
    logger.info(
        "Module accessed on {}.".format(now.strftime("%Y-%m-%d %H:%M:%S"))
    )
    logging.info("Merging the following models:")

    for model in self.models:
        logging.info("    - {}.".format(model.name()))

    logging.info("Loss functions:")

    if loss_weights is None:
        self.loss_weights = [1.0 / len(lossfxn) for l in lossfxn]
    else:
        self.loss_weights = loss_weights

    for index, l in enumerate(lossfxn):
        logging.info(
            "    - Name: {}; Weight: {}.".format(l.__name__, self.loss_weights[index])
        )
    logging.info("Convergence criterion: {}.".format(convergence))

    # If no batch_size is provided, the whole training set is one batch.
    if batch_size is None:
        batch_size = len(inputs.values())

    if isinstance(batch_size, int):
        chunks = []
        for inputs_ in inputs:
            if inspect.ismethod(inputs_):
                chunks.append(inputs_)
            else:
                chunks.append(list(get_chunks(inputs_, batch_size, svm=False)))

        targets = [
            list(get_chunks(target, batch_size, svm=False)) for target in targets
        ]
        atoms_per_image = list(
            get_chunks(data.atoms_per_image, batch_size, svm=False)
        )

    if lossfxn is None:
        self.lossfxn = [None for model in self.models]
    else:
        self.lossfxn = lossfxn

    self.device = device

    # Populate extra attributes needed by the models, and do further data
    # preprocessing.
    for index, loss in enumerate(lossfxn):
        # getfullargspec replaces the deprecated inspect.getargspec.
        _args, _varargs, _keywords, _defaults = inspect.getfullargspec(loss)[:4]
        if "latent" in _args:
            train = dynamic_import(
                "train", "ml4chem.atomistic.models", alt_name="autoencoders"
            )
            self.inputs_chunk_vals = train.get_inputs_chunks(chunks[index])
        else:
            self.inputs_chunk_vals = None

    parameters = []
    for index, model in enumerate(self.models):
        parameters += model.parameters()
        if model.name() == "PytorchPotentials":
            # These models require targets as tensors.
            self.atoms_per_image = torch.tensor(
                atoms_per_image, requires_grad=False, dtype=torch.float
            )
            _targets = [
                torch.tensor(batch, requires_grad=False)
                for batch in targets[index]
            ]
            targets[index] = _targets
            del _targets
        elif model.name() in ModelMerger.autoencoders:
            targets[index] = lod_to_list(targets[index])

    # Data scattering
    client = dask.distributed.get_client()

    # self.targets = [client.scatter(target) for target in targets]
    self.targets = [target for target in targets]

    self.chunks = []
    for i, chunk in enumerate(chunks):
        if inspect.ismethod(chunk) is False:
            self.chunks.append(client.scatter(chunk))
        else:
            # This list comprehension is useful to get the same number of
            # functions as there are chunks without requiring user input.
            chunk = [chunk for _ in range(len(self.targets[i]))]
            self.chunks.append(chunk)

    del chunks

    logger.info(" ")
    logging.info("Batch Information")
    logging.info("-----------------")
    logging.info("Number of batches:")
    for index, c in enumerate(self.chunks):
        logging.info("    - Model {}, {}.".format(index, len(c)))
    logging.info("Batch size: {} elements per batch.\n".format(batch_size))

    # Define optimizer
    self.optimizer_name, self.optimizer = get_optimizer(optimizer, parameters)

    if lr_scheduler is not None:
        self.scheduler = get_lr_scheduler(self.optimizer, lr_scheduler)

    logger.info(" ")
    logger.info("Starting training...")
    logger.info(" ")

    logger.info(
        "{:6s} {:19s} {:12s} {:8s}".format("Epoch", "Time Stamp", "Loss", "RMSE (ave)")
    )
    logger.info(
        "{:6s} {:19s} {:12s} {:8s}".format(
            "------", "-------------------", "------------", "--------------"
        )
    )

    converged = False
    epoch = 0

    if independent_loss is False:
        # Convert list of chunks from [[a, c], [b, d]] to [[a, b], [c, d]]
        self.chunks = list(map(list, zip(*self.chunks)))

    old_state_dict = {}
    for key in self.models[1].state_dict():
        old_state_dict[key] = self.models[1].state_dict()[key].clone()

    from ml4chem.atomistic.models.autoencoders import Annealer

    annealer = Annealer()

    while not converged:
        epoch += 1
        self.annealing = annealer.update(epoch)

        self.optimizer.zero_grad()  # clear previous gradients

        if independent_loss:
            losses = []
            outputs = []
            for model_index, model in enumerate(self.models):
                loss, output = self.closure(
                    model_index, model, independent_loss, name=model.name()
                )
                losses.append(loss)
                outputs.append(output)
        else:
            # Note: ``index`` here is the last model index left over from
            # the loop over self.models above.
            loss, outputs = self.closure(index, self.models, independent_loss)

        rmse = []
        for i, model in enumerate(self.models):
            outputs_ = outputs[i]
            targets_ = self.targets[i]

            if model.name() == "VAE":
                # VAE usually returns a complex output with mus and sigmas,
                # but we only need mus at this stage.
                outputs_ = [sublist[0] for sublist in outputs_]
            rmse.append(compute_rmse(outputs_, targets_))
        rmse = np.array(rmse)
        _rmse = np.average(rmse)

        if self.optimizer_name != "LBFGS":
            self.optimizer.step()
        else:
            options = {"closure": self.closure, "current_loss": loss, "max_ls": 10}
            self.optimizer.step(options)

        ts = time.time()
        ts = datetime.datetime.fromtimestamp(ts).strftime("%Y-%m-%d %H:%M:%S")
        logger.info("{:6d} {} {:8e} {:8f}".format(epoch, ts, loss, _rmse))

        if convergence is None and epoch == self.epochs:
            converged = True
        elif convergence is not None and (rmse <= convergence["rmse"]).all():
            converged = True

            # Debugging aid: report which parameters of the second model
            # changed during training.
            new_state_dict = {}
            for key in self.models[1].state_dict():
                new_state_dict[key] = self.models[1].state_dict()[key].clone()

            for key in old_state_dict:
                if not (old_state_dict[key] == new_state_dict[key]).all():
                    logger.debug("Diff in {}".format(key))
                else:
                    logger.debug("No diff in {}".format(key))
            logger.debug(convergence)
            logger.debug(rmse)

    logger.debug("Final")
    logger.debug(convergence)
    logger.debug(rmse)
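# ---------------------------------------------------------------------------
# A minimal usage sketch for ``ModelMerger.train`` (illustrative only; the
# merged models, feature spaces, targets, data handler, and loss functions
# below are hypothetical placeholders a caller is assumed to have built):
#
# >>> merger = ModelMerger(models=[autoencoder, potentials])
# >>> merger.train(
# ...     inputs=[ae_feature_space, potentials_feature_space],
# ...     targets=[ae_targets, energy_targets],
# ...     data=data_handler,
# ...     optimizer=("adam", {"lr": 1e-3, "weight_decay": 0.0}),
# ...     convergence={"rmse": [0.04, 0.02]},  # one criterion per model
# ...     lossfxn=[ae_loss, energy_loss],
# ...     loss_weights=[1.0, 1.0],
# ...     independent_loss=True,
# ... )
# ---------------------------------------------------------------------------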
def __init__(
    self,
    inputs,
    targets,
    model=None,
    data=None,
    optimizer=(None, None),
    regularization=None,
    epochs=100,
    convergence=None,
    lossfxn=None,
    device="cpu",
    batch_size=None,
    lr_scheduler=None,
):
    self.initial_time = time.time()

    atoms_per_image = data.atoms_per_image

    if batch_size is None:
        batch_size = len(inputs.values())

    if isinstance(batch_size, int):
        # Data batches
        chunks = list(get_chunks(inputs, batch_size, svm=False))
        targets = list(get_chunks(targets, batch_size, svm=False))
        atoms_per_image = list(get_chunks(atoms_per_image, batch_size, svm=False))

    logger.info(" ")
    logging.info("Batch Information")
    logging.info("-----------------")
    logging.info("Number of batches: {}.".format(len(chunks)))
    logging.info("Batch size: {} elements per batch.".format(batch_size))
    logger.info(" ")

    atoms_per_image = torch.tensor(
        atoms_per_image, requires_grad=False, dtype=torch.float
    )
    targets = torch.tensor(targets, requires_grad=False)

    if device == "cuda":
        logger.info("Moving data to CUDA...")
        atoms_per_image = atoms_per_image.cuda()
        targets = targets.cuda()
        _inputs = OrderedDict()

        for hash, f in inputs.items():
            _inputs[hash] = []
            for features in f:
                symbol, vector = features
                _inputs[hash].append((symbol, vector.cuda()))

        inputs = _inputs

        move_time = time.time() - self.initial_time
        h, m, s = convert_elapsed_time(move_time)
        logger.info(
            "Data moved to GPU in {} hours {} minutes {:.2f} seconds.".format(h, m, s)
        )
        logger.info(" ")

    # Define optimizer
    self.optimizer_name, self.optimizer = get_optimizer(
        optimizer, model.parameters()
    )

    if lr_scheduler is not None:
        self.scheduler = get_lr_scheduler(self.optimizer, lr_scheduler)

    logger.info(" ")
    logger.info("Starting training...")
    logger.info(" ")

    logger.info(
        "{:6s} {:19s} {:12s} {:8s} {:8s}".format(
            "Epoch", "Time Stamp", "Loss", "RMSE/img", "RMSE/atom"
        )
    )
    logger.info(
        "{:6s} {:19s} {:12s} {:8s} {:8s}".format(
            "------", "-------------------", "------------", "--------", "---------"
        )
    )

    self.atoms_per_image = atoms_per_image
    self.convergence = convergence
    self.device = device
    self.epochs = epochs
    self.model = model
    self.lr_scheduler = lr_scheduler

    # Data scattering
    client = dask.distributed.get_client()
    self.chunks = [client.scatter(chunk) for chunk in chunks]
    self.targets = [client.scatter(target) for target in targets]

    if lossfxn is None:
        self.lossfxn = AtomicMSELoss
    else:
        self.lossfxn = lossfxn

    # Let the hunger games begin...
    self.trainer()
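# A minimal construction sketch for this trainer (illustrative; ``nn_model``,
# ``feature_space``, ``energies``, and ``data_handler`` are hypothetical
# placeholders, and the convergence key follows the ml4chem examples):
#
# >>> train(
# ...     inputs=feature_space,
# ...     targets=energies,
# ...     model=nn_model,
# ...     data=data_handler,
# ...     optimizer=("adam", {"lr": 1e-3}),
# ...     convergence={"energy": 5e-3},
# ...     batch_size=32,
# ...     lr_scheduler=("ReduceLROnPlateau", {"mode": "min", "patience": 10}),
# ... )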
def __init__(
    self,
    inputs,
    targets,
    model=None,
    data=None,
    optimizer=(None, None),
    regularization=None,
    epochs=100,
    convergence=None,
    lossfxn=None,
    device="cpu",
    batch_size=None,
    lr_scheduler=None,
    **kwargs
):
    supported_keys = ["anneal", "penalize_latent"]

    if len(kwargs.items()) == 0:
        for k in supported_keys:
            setattr(self, k, None)
    else:
        for k, v in kwargs.items():
            if k in supported_keys:
                setattr(self, k, v)

    self.initial_time = time.time()

    if device == "cuda":
        pass
        """
        logger.info('Moving data to CUDA...')

        targets = targets.cuda()
        _inputs = OrderedDict()

        for hash, f in inputs.items():
            _inputs[hash] = []
            for features in f:
                symbol, vector = features
                _inputs[hash].append((symbol, vector.cuda()))

        del inputs
        inputs = _inputs

        move_time = time.time() - initial_time
        h, m, s = convert_elapsed_time(move_time)
        logger.info('Data moved to GPU in {} hours {} minutes {:.2f} seconds.'
                    .format(h, m, s))
        """

    if batch_size is None:
        batch_size = len(inputs.values())

    if isinstance(batch_size, int):
        chunks = list(get_chunks(inputs, batch_size, svm=False))
        targets_ = list(get_chunks(targets, batch_size, svm=False))
        del targets

        # This change is needed because the targets are features or
        # positions and they are built as a dictionary.
        targets = lod_to_list(targets_)

    logging.info("Batch size: {} elements per batch.".format(batch_size))

    if device == "cuda":
        logger.info("Moving data to CUDA...")

        # ``targets`` is a list of tensors here, so each batch is moved
        # individually (calling .cuda() on the list itself would fail).
        targets = [target.cuda() for target in targets]
        _inputs = OrderedDict()

        for hash, f in inputs.items():
            _inputs[hash] = []
            for features in f:
                symbol, vector = features
                _inputs[hash].append((symbol, vector.cuda()))

        inputs = _inputs

        move_time = time.time() - self.initial_time
        h, m, s = convert_elapsed_time(move_time)
        logger.info(
            "Data moved to GPU in {} hours {} minutes {:.2f} seconds.".format(h, m, s)
        )
        logger.info(" ")

    # Define optimizer
    self.optimizer_name, self.optimizer = get_optimizer(
        optimizer, model.parameters()
    )

    if lr_scheduler is not None:
        self.scheduler = get_lr_scheduler(self.optimizer, lr_scheduler)

    if lossfxn is None:
        self.lossfxn = MSELoss
        self.inputs_chunk_vals = None
    else:
        logger.info("Using custom loss function...")
        logger.info("")
        self.lossfxn = lossfxn
        self.inputs_chunk_vals = self.get_inputs_chunks(chunks)

    logger.info(" ")
    logger.info("Starting training...")
    logger.info(" ")

    logger.info(
        "{:6s} {:19s} {:12s} {:9s}".format("Epoch", "Time Stamp", "Loss", "Rec Err")
    )
    logger.info(
        "{:6s} {:19s} {:12s} {:9s}".format(
            "------", "-------------------", "------------", "--------"
        )
    )

    # Data scattering
    client = dask.distributed.get_client()
    self.chunks = [client.scatter(chunk) for chunk in chunks]
    self.targets = [client.scatter(target) for target in targets]
    self.device = device
    self.epochs = epochs
    self.model = model
    self.lr_scheduler = lr_scheduler
    self.convergence = convergence

    # Let the hunger games begin...
    self.trainer()
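# Sketch of constructing the autoencoder trainer with its optional keyword
# arguments (illustrative; all objects are hypothetical placeholders, and
# ``anneal``/``penalize_latent`` are the two keys in ``supported_keys``):
#
# >>> train(
# ...     inputs=feature_space,
# ...     targets=latent_targets,
# ...     model=autoencoder,
# ...     data=data_handler,
# ...     optimizer=("adam", {"lr": 1e-3}),
# ...     convergence={"rmse": 5e-3},
# ...     anneal=True,            # picked up through **kwargs
# ...     penalize_latent=False,  # picked up through **kwargs
# ... )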
def __init__(
    self,
    inputs,
    targets,
    model=None,
    data=None,
    optimizer=(None, None),
    regularization=None,
    epochs=100,
    convergence=None,
    lossfxn=None,
    device="cpu",
    batch_size=None,
    lr_scheduler=None,
    uncertainty=None,
    checkpoint=None,
    test=None,
):
    self.initial_time = time.time()

    if lossfxn is None:
        lossfxn = AtomicMSELoss

    logger.info("")
    logger.info("Training")
    logger.info("========")
    logger.info(f"Convergence criteria: {convergence}")
    logger.info(f"Loss function: {lossfxn.__name__}")
    if uncertainty is not None:
        logger.info("Options:")
        logger.info(f"    - Uncertainty penalization: {pformat(uncertainty)}")
    logger.info("")

    atoms_per_image = data.atoms_per_image

    if batch_size is None:
        batch_size = len(inputs.values())

    if isinstance(batch_size, int):
        # Data batches
        chunks = list(get_chunks(inputs, batch_size, svm=False))
        targets = list(get_chunks(targets, batch_size, svm=False))
        atoms_per_image = list(get_chunks(atoms_per_image, batch_size, svm=False))

        if uncertainty is not None:
            uncertainty = list(get_chunks(uncertainty, batch_size, svm=False))
            uncertainty = [
                torch.tensor(u, requires_grad=False, dtype=torch.float)
                for u in uncertainty
            ]

    logger.info("")
    logging.info("Batch Information")
    logging.info("-----------------")
    logging.info("Number of batches: {}.".format(len(chunks)))
    logging.info("Batch size: {} elements per batch.".format(batch_size))
    logger.info(" ")

    atoms_per_image = [
        torch.tensor(n_atoms, requires_grad=False, dtype=torch.float)
        for n_atoms in atoms_per_image
    ]

    targets = [torch.tensor(t, requires_grad=False) for t in targets]

    if device == "cuda":
        logger.info("Moving data to CUDA...")

        # Both containers are lists of tensors, so each batch is moved
        # individually (calling .cuda() on the list itself would fail).
        atoms_per_image = [n_atoms.cuda() for n_atoms in atoms_per_image]
        targets = [t.cuda() for t in targets]
        _inputs = OrderedDict()

        for hash, f in inputs.items():
            _inputs[hash] = []
            for features in f:
                symbol, vector = features
                _inputs[hash].append((symbol, vector.cuda()))

        inputs = _inputs

        move_time = time.time() - self.initial_time
        h, m, s = convert_elapsed_time(move_time)
        logger.info(
            "Data moved to GPU in {} hours {} minutes {:.2f} seconds.".format(h, m, s)
        )
        logger.info(" ")

    # Define optimizer
    self.optimizer_name, self.optimizer = get_optimizer(
        optimizer, model.parameters()
    )

    if lr_scheduler is not None:
        self.scheduler = get_lr_scheduler(self.optimizer, lr_scheduler)

    self.atoms_per_image = atoms_per_image
    self.convergence = convergence
    self.device = device
    self.epochs = epochs
    self.model = model
    self.lr_scheduler = lr_scheduler
    self.lossfxn = lossfxn
    self.checkpoint = checkpoint
    self.test = test

    # Data scattering
    client = dask.distributed.get_client()
    self.chunks = [client.scatter(chunk) for chunk in chunks]
    self.targets = [client.scatter(target) for target in targets]

    if uncertainty is not None:
        self.uncertainty = [client.scatter(u) for u in uncertainty]
    else:
        self.uncertainty = uncertainty

    # Let the hunger games begin...
    self.trainer()
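# Sketch showing the uncertainty-penalized variant (illustrative; the
# placeholder objects are hypothetical, and ``uncertainties`` is assumed to
# be a list with one float per training point):
#
# >>> train(
# ...     inputs=feature_space,
# ...     targets=energies,
# ...     model=nn_model,
# ...     data=data_handler,
# ...     optimizer=("adam", {"lr": 1e-3}),
# ...     convergence={"energy": 5e-3},
# ...     uncertainty=uncertainties,
# ... )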
def train(
    self,
    inputs,
    targets,
    data=None,
    optimizer=(None, None),
    regularization=None,
    epochs=100,
    convergence=None,
    lossfxn=None,
    device="cpu",
    batch_size=None,
    lr_scheduler=None,
    independent_loss=True,
    loss_weights=None,
):
    logger.info(" ")
    logging.info("Model Merger")
    logging.info("============")
    logging.info("Merging the following models:")

    for model in self.models:
        logging.info("    - {}.".format(model.name()))

    logging.info("Loss functions:")

    if loss_weights is None:
        self.loss_weights = [1.0 / len(lossfxn) for l in lossfxn]
    else:
        self.loss_weights = loss_weights

    for l in lossfxn:
        logging.info("    - {}.".format(l.__name__))

    # If no batch_size is provided, the whole training set is one batch.
    if batch_size is None:
        batch_size = len(inputs.values())

    if isinstance(batch_size, int):
        chunks = []
        for inputs_ in inputs:
            if inspect.ismethod(inputs_):
                chunks.append(inputs_)
            else:
                chunks.append(list(get_chunks(inputs_, batch_size, svm=False)))

        targets = [
            list(get_chunks(target, batch_size, svm=False)) for target in targets
        ]
        atoms_per_image = list(
            get_chunks(data.atoms_per_image, batch_size, svm=False)
        )

    if lossfxn is None:
        self.lossfxn = [None for model in self.models]
    else:
        self.lossfxn = lossfxn

    self.device = device

    # Populate extra attributes needed by the models, and do further data
    # preprocessing.
    for index, loss in enumerate(lossfxn):
        # getfullargspec replaces the deprecated inspect.getargspec.
        _args, _varargs, _keywords, _defaults = inspect.getfullargspec(loss)[:4]
        if "latent" in _args:
            train = dynamic_import("train", "ml4chem.models", alt_name="autoencoders")
            self.inputs_chunk_vals = train.get_inputs_chunks(chunks[index])

    parameters = []
    for index, model in enumerate(self.models):
        parameters += model.parameters()
        if model.name() == "PytorchPotentials":
            # These models require targets as tensors.
            self.atoms_per_image = torch.tensor(
                atoms_per_image, requires_grad=False, dtype=torch.float
            )
            _targets = [
                torch.tensor(batch, requires_grad=False)
                for batch in targets[index]
            ]
            targets[index] = _targets
            del _targets
        elif model.name() == "AutoEncoder":
            targets[index] = lod_to_list(targets[index])

    # Data scattering
    client = dask.distributed.get_client()

    # self.targets = [client.scatter(target) for target in targets]
    self.targets = [target for target in targets]

    self.chunks = []
    for i, chunk in enumerate(chunks):
        if inspect.ismethod(chunk) is False:
            self.chunks.append(client.scatter(chunk))
        else:
            # This list comprehension is useful to get the same number of
            # functions as there are chunks without requiring user input.
            chunk = [chunk for _ in range(len(self.targets[i]))]
            self.chunks.append(chunk)

    del chunks

    logger.info(" ")
    logging.info("Batch Information")
    logging.info("-----------------")
    logging.info("Number of batches:")
    for index, c in enumerate(self.chunks):
        logging.info("    - Model {}, {}.".format(index, len(c)))
    logging.info("Batch size: {} elements per batch.\n".format(batch_size))

    # Define optimizer
    self.optimizer_name, self.optimizer = get_optimizer(optimizer, parameters)

    if lr_scheduler is not None:
        self.scheduler = get_lr_scheduler(self.optimizer, lr_scheduler)

    logger.info(" ")
    logger.info("Starting training...")
    logger.info(" ")

    logger.info(
        "{:6s} {:19s} {:12s} {:8s}".format("Epoch", "Time Stamp", "Loss", "RMSE (ave)")
    )
    logger.info(
        "{:6s} {:19s} {:12s} {:8s}".format(
            "------", "-------------------", "------------", "--------------"
        )
    )

    converged = False
    epoch = 0

    if independent_loss is False:
        # Convert list of chunks from [[a, c], [b, d]] to [[a, b], [c, d]]
        self.chunks = list(map(list, zip(*self.chunks)))

    old_state_dict = {}
    for key in self.models[1].state_dict():
        old_state_dict[key] = self.models[1].state_dict()[key].clone()

    while not converged:
        epoch += 1

        self.optimizer.zero_grad()  # clear previous gradients

        if independent_loss:
            losses = []
            for model_index, model in enumerate(self.models):
                name = model.name()
                loss, outputs = self.closure(
                    model_index, model, independent_loss, name=name
                )
                losses.append(loss)
        else:
            # Note: ``index`` here is the last model index left over from
            # the loop over self.models above.
            loss, outputs = self.closure(index, self.models, independent_loss)

        rmse = []
        for i, model in enumerate(self.models):
            rmse.append(compute_rmse(outputs[i], self.targets[i]))
        _rmse = np.average(rmse)

        if self.optimizer_name != "LBFGS":
            self.optimizer.step()
        else:
            options = {"closure": self.closure, "current_loss": loss, "max_ls": 10}
            self.optimizer.step(options)

        ts = time.time()
        ts = datetime.datetime.fromtimestamp(ts).strftime("%Y-%m-%d %H:%M:%S")
        logger.info("{:6d} {} {:8e} {:8f}".format(epoch, ts, loss, _rmse))

        if convergence is None and epoch == self.epochs:
            converged = True
        elif convergence is not None and all(
            i <= convergence["rmse"] for i in rmse
        ):
            converged = True

            # Debugging aid: report which parameters of the second model
            # changed during training.
            new_state_dict = {}
            for key in self.models[1].state_dict():
                new_state_dict[key] = self.models[1].state_dict()[key].clone()

            for key in old_state_dict:
                if not (old_state_dict[key] == new_state_dict[key]).all():
                    logger.debug("Diff in {}".format(key))
                else:
                    logger.debug("No diff in {}".format(key))
def calculate(self, images=None, purpose="training", data=None, svm=False):
    """Calculate the features per atom in an atoms object

    Parameters
    ----------
    images : dict
        Hashed images using the Data class.
    purpose : str
        The supported purposes are: 'training', 'inference'.
    data : obj
        data object
    svm : bool
        Whether or not these features are going to be used for kernel
        methods.

    Returns
    -------
    feature_space : dict
        A dictionary with key hash and value as a list with the following
        structure: {'hash': [('H', [vector]]}
    reference_space : dict
        A reference space useful for SVM models.
    """

    client = dask.distributed.get_client()
    logger.info(" ")
    logger.info("Featurization")
    logger.info("=============")
    now = datetime.datetime.now()
    logger.info(
        "Module accessed on {}.".format(now.strftime("%Y-%m-%d %H:%M:%S"))
    )
    logger.info(f"Module name: {self.name()}.")

    # FIXME the block below should become a function.
    if os.path.isfile(self.filename) and self.overwrite is False:
        logger.warning(f"Loading features from {self.filename}.")
        logger.info(" ")
        svm_keys = [b"feature_space", b"reference_space"]
        data = load(self.filename)

        data_hashes = list(data.keys())
        image_hashes = list(images.keys())

        if image_hashes == data_hashes:
            # Check if both lists are the same.
            return data
        elif any(i in image_hashes for i in data_hashes):
            # Check if any of the elements are shared between both lists.
            _data = {}
            for hash in image_hashes:
                _data[hash] = data[hash]
            return _data

        if svm_keys == list(data.keys()):
            feature_space = data[svm_keys[0]]
            reference_space = data[svm_keys[1]]
            return feature_space, reference_space

    initial_time = time.time()

    # Verify that we know the unique element symbols
    if data.unique_element_symbols is None:
        logger.info(f"Getting unique element symbols for {purpose}")

        unique_element_symbols = data.get_unique_element_symbols(
            images, purpose=purpose
        )
        unique_element_symbols = unique_element_symbols[purpose]

        logger.info(f"Unique chemical elements: {unique_element_symbols}")

    elif isinstance(data.unique_element_symbols, dict):
        unique_element_symbols = data.unique_element_symbols[purpose]

        logger.info(f"Unique chemical elements: {unique_element_symbols}")

    # we make the features
    self.GP = self.custom.get("GP", None)

    if self.GP is None:
        custom = self.custom.get("user_input", None)
        self.GP = self.make_symmetry_functions(
            unique_element_symbols, custom=custom, angular_type=self.angular_type
        )
        self.custom.update({"GP": self.GP})
    else:
        logger.info("Using parameters from file to create symmetry functions...\n")

    self.print_features_params(self.GP)

    symbol = data.unique_element_symbols[purpose][0]
    sample = np.zeros(len(self.GP[symbol]))
    self.dimension = len(sample)

    preprocessor = Preprocessing(self.preprocessor, purpose=purpose)
    preprocessor.set(purpose=purpose)

    # We start populating computations to get atomic features.
    logger.info("")
    logger.info("Embarrassingly parallel computation of atomic features...")

    stacked_features = []
    atoms_index_map = []  # This list is used to reconstruct images from atoms.

    if self.batch_size is None:
        self.batch_size = data.get_total_number_atoms()

    chunks = get_chunks(images, self.batch_size, svm=svm)

    ini = end = 0
    for chunk in chunks:
        images_ = OrderedDict(chunk)
        intermediate = []

        for image in images_.items():
            _, image = image
            end = ini + len(image)
            atoms_index_map.append(list(range(ini, end)))
            ini = end
            for atom in image:
                index = atom.index
                symbol = atom.symbol

                cutoff_keys = ["radial", "angular"]
                n_symbols, neighborpositions = {}, {}

                if isinstance(self.cutoff, dict):
                    for cutoff_key in cutoff_keys:
                        nl = get_neighborlist(image, cutoff=self.cutoff[cutoff_key])
                        # n_indices: neighbor indices for central atom_i.
                        # n_offsets: neighbor offsets for central atom_i.
                        n_indices, n_offsets = nl[atom.index]

                        n_symbols_ = np.array(image.get_chemical_symbols())[n_indices]
                        n_symbols[cutoff_key] = n_symbols_

                        neighborpositions_ = image.positions[n_indices] + np.dot(
                            n_offsets, image.get_cell()
                        )
                        neighborpositions[cutoff_key] = neighborpositions_
                else:
                    for cutoff_key in cutoff_keys:
                        nl = get_neighborlist(image, cutoff=self.cutoff)
                        # n_indices: neighbor indices for central atom_i.
                        # n_offsets: neighbor offsets for central atom_i.
                        n_indices, n_offsets = nl[atom.index]

                        n_symbols_ = np.array(image.get_chemical_symbols())[n_indices]
                        n_symbols[cutoff_key] = n_symbols_

                        neighborpositions_ = image.positions[n_indices] + np.dot(
                            n_offsets, image.get_cell()
                        )
                        neighborpositions[cutoff_key] = neighborpositions_

                afp = self.get_atomic_features(
                    atom,
                    index,
                    symbol,
                    n_symbols,
                    neighborpositions,
                    image_molecule=image,
                    weighted=self.weighted,
                    n_indices=n_indices,
                )

                intermediate.append(afp)

        intermediate = client.persist(intermediate, scheduler=self.scheduler)
        stacked_features += intermediate
        del intermediate

    scheduler_time = time.time() - initial_time

    dask.distributed.wait(stacked_features)

    h, m, s = convert_elapsed_time(scheduler_time)
    logger.info(
        "... finished in {} hours {} minutes {:.2f} seconds.".format(h, m, s)
    )

    logger.info("")

    if self.preprocessor is not None:
        scaled_feature_space = []

        # To take advantage of dask_ml we need to convert our numpy array
        # into a dask array.
        logger.info("Converting features to dask array...")
        stacked_features = [
            da.from_delayed(lazy, dtype=float, shape=sample.shape)
            for lazy in stacked_features
        ]
        layout = {0: tuple(len(i) for i in atoms_index_map), 1: -1}
        stacked_features = da.stack(stacked_features, axis=0).rechunk(layout)

        logger.info(
            "Shape of array is {} and chunks {}.".format(
                stacked_features.shape, stacked_features.chunks
            )
        )

        # Note that dask_ml by default converts the output of .fit into a
        # concrete value.
        if purpose == "training":
            stacked_features = preprocessor.fit(
                stacked_features, scheduler=self.scheduler
            )
        else:
            stacked_features = preprocessor.transform(stacked_features)

        atoms_index_map = [client.scatter(indices) for indices in atoms_index_map]
        stacked_features = client.scatter(stacked_features, broadcast=True)

        logger.info("Stacking features using atoms index map...")

        for indices in atoms_index_map:
            features = client.submit(
                self.stack_features, *(indices, stacked_features)
            )
            scaled_feature_space.append(features)

    else:
        scaled_feature_space = []
        atoms_index_map = [client.scatter(chunk) for chunk in atoms_index_map]
        stacked_features = client.scatter(stacked_features, broadcast=True)

        for indices in atoms_index_map:
            features = client.submit(
                self.stack_features, *(indices, stacked_features)
            )
            scaled_feature_space.append(features)

    scaled_feature_space = client.gather(scaled_feature_space)

    # Clean
    del stacked_features

    # Restack images
    feature_space = []

    if svm and purpose == "training":
        logger.info("Building array with reference space.")
        reference_space = []

        for i, image in enumerate(images.items()):
            restacked = client.submit(
                self.restack_image, *(i, image, scaled_feature_space, svm)
            )

            # image = (hash, ase_image) -> tuple
            for atom in image[1]:
                restacked_atom = client.submit(
                    self.restack_atom, *(i, atom, scaled_feature_space)
                )
                reference_space.append(restacked_atom)

            feature_space.append(restacked)

        reference_space = client.gather(reference_space)

    elif svm is False and purpose == "training":
        for i, image in enumerate(images.items()):
            restacked = client.submit(
                self.restack_image, *(i, image, scaled_feature_space, svm)
            )
            feature_space.append(restacked)

    else:
        try:
            for i, image in enumerate(images.items()):
                restacked = client.submit(
                    self.restack_image, *(i, image, scaled_feature_space, svm)
                )
                feature_space.append(restacked)

        except UnboundLocalError:
            # scaled_feature_space does not exist.
            for i, image in enumerate(images.items()):
                restacked = client.submit(
                    self.restack_image, *(i, image, feature_space, svm)
                )
                feature_space.append(restacked)

    feature_space = client.gather(feature_space)
    feature_space = OrderedDict(feature_space)

    fp_time = time.time() - initial_time

    h, m, s = convert_elapsed_time(fp_time)
    logger.info(
        "Featurization finished in {} hours {} minutes {:.2f} seconds.".format(h, m, s)
    )

    if svm and purpose == "training":
        client.restart()  # Reclaims memory aggressively
        preprocessor.save_to_file(preprocessor, self.save_preprocessor)

        if self.filename is not None:
            logger.info(f"features saved to {self.filename}.")
            data = {"feature_space": feature_space}
            data.update({"reference_space": reference_space})
            dump(data, filename=self.filename)
            self.feature_space = feature_space
            self.reference_space = reference_space

        return self.feature_space, self.reference_space

    elif svm is False and purpose == "training":
        client.restart()  # Reclaims memory aggressively
        preprocessor.save_to_file(preprocessor, self.save_preprocessor)

        if self.filename is not None:
            logger.info(f"features saved to {self.filename}.")
            dump(feature_space, filename=self.filename)
            self.feature_space = feature_space

        return self.feature_space

    else:
        self.feature_space = feature_space
        return self.feature_space
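# Typical call pattern for this featurizer (illustrative; it assumes this
# method belongs to the Gaussian symmetry-function class, ``hashed_images``
# and ``data_handler`` are hypothetical placeholders produced by the Data
# handler, and the constructor arguments are examples, not defaults):
#
# >>> features = Gaussian(cutoff=6.5, batch_size=100)
# >>> feature_space = features.calculate(
# ...     images=hashed_images, purpose="training", data=data_handler, svm=False
# ... )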
def calculate(self, images=None, purpose="training", data=None, svm=False):
    """Calculate the features per atom in an atoms object

    Parameters
    ----------
    images : dict
        Hashed images using the Data class.
    purpose : str
        The supported purposes are: 'training', 'inference'.
    data : obj
        data object
    svm : bool
        Whether or not these features are going to be used for kernel
        methods.

    Returns
    -------
    feature_space : dict
        A dictionary with key hash and value as a list with the following
        structure: {'hash': [('H', [vector]]}
    reference_space : dict
        A reference space useful for SVM models.
    """

    client = dask.distributed.get_client()
    logger.info(" ")
    logger.info("Featurization")
    logger.info("=============")
    now = datetime.datetime.now()
    logger.info(
        "Module accessed on {}.".format(now.strftime("%Y-%m-%d %H:%M:%S"))
    )

    # FIXME the block below should become a function.
    if os.path.isfile(self.filename) and self.overwrite is False:
        logger.warning("Loading features from {}.".format(self.filename))
        logger.info(" ")
        svm_keys = [b"feature_space", b"reference_space"]
        data = load(self.filename)

        data_hashes = list(data.keys())
        image_hashes = list(images.keys())

        if image_hashes == data_hashes:
            # Check if both lists are the same.
            return data
        elif any(i in image_hashes for i in data_hashes):
            # Check if any of the elements are shared between both lists.
            _data = {}
            for hash in image_hashes:
                _data[hash] = data[hash]
            return _data

        if svm_keys == list(data.keys()):
            feature_space = data[svm_keys[0]]
            reference_space = data[svm_keys[1]]
            return feature_space, reference_space

    initial_time = time.time()

    # Verify that we know the unique element symbols
    if data.unique_element_symbols is None:
        logger.info("Getting unique element symbols for {}".format(purpose))

        unique_element_symbols = data.get_unique_element_symbols(
            images, purpose=purpose
        )
        unique_element_symbols = unique_element_symbols[purpose]

        logger.info("Unique chemical elements: {}".format(unique_element_symbols))

    elif isinstance(data.unique_element_symbols, dict):
        unique_element_symbols = data.unique_element_symbols[purpose]

        logger.info("Unique chemical elements: {}".format(unique_element_symbols))

    # we make the features
    preprocessor = Preprocessing(self.preprocessor, purpose=purpose)
    preprocessor.set(purpose=purpose)

    # We start populating computations to get atomic features.
    logger.info("")
    logger.info("Embarrassingly parallel computation of atomic features...")

    stacked_features = []
    atoms_symbols_map = []  # This list is used to reconstruct images from atoms.

    if self.batch_size is None:
        self.batch_size = data.get_total_number_atoms()

    chunks = get_chunks(images, self.batch_size, svm=svm)

    for chunk in chunks:
        images_ = OrderedDict(chunk)
        intermediate = []

        for image in images_.items():
            key, image = image
            atoms_symbols_map.append(image.get_chemical_symbols())
            # Use the .create() method from dscribe.
            _features = dask.delayed(self.create)(image)
            intermediate.append(_features)

        intermediate = client.compute(intermediate, scheduler=self.scheduler)
        stacked_features += intermediate
        del intermediate

    # scheduler_time = time.time() - initial_time
    # dask.distributed.wait(stacked_features)

    logger.info("")

    if self.preprocessor is not None:
        raise NotImplementedError

    else:
        scaled_feature_space = []
        atoms_symbols_map = [client.scatter(chunk) for chunk in atoms_symbols_map]
        stacked_features = client.scatter(stacked_features, broadcast=True)

        for image_index, symbols in enumerate(atoms_symbols_map):
            features = client.submit(
                self.stack_features, *(symbols, image_index, stacked_features)
            )
            scaled_feature_space.append(features)

    scaled_feature_space = client.gather(scaled_feature_space)

    # Clean
    del stacked_features

    # Restack images
    feature_space = []

    if svm and purpose == "training":
        for i, image in enumerate(images.items()):
            restacked = client.submit(
                self.restack_image, *(i, image, scaled_feature_space, svm)
            )
            feature_space.append(restacked)

    elif svm is False and purpose == "training":
        for i, image in enumerate(images.items()):
            restacked = client.submit(
                self.restack_image, *(i, image, scaled_feature_space, svm)
            )
            feature_space.append(restacked)

    else:
        try:
            for i, image in enumerate(images.items()):
                restacked = client.submit(
                    self.restack_image, *(i, image, scaled_feature_space, svm)
                )
                feature_space.append(restacked)

        except UnboundLocalError:
            # scaled_feature_space does not exist.
            for i, image in enumerate(images.items()):
                restacked = client.submit(
                    self.restack_image, *(i, image, feature_space, svm)
                )
                feature_space.append(restacked)

    feature_space = client.gather(feature_space)

    if svm and purpose == "training":
        # FIXME This might need to be improved
        logger.info("Building array with reference space.")
        hashes, reference_space = list(zip(*feature_space))
        del hashes
        reference_space = list(itertools.chain.from_iterable(reference_space))
        logger.info("Finished reference space.")

    feature_space = OrderedDict(feature_space)

    fp_time = time.time() - initial_time

    h, m, s = convert_elapsed_time(fp_time)
    logger.info(
        "Featurization finished in {} hours {} minutes {:.2f} seconds.".format(h, m, s)
    )

    if svm and purpose == "training":
        client.restart()  # Reclaims memory aggressively
        preprocessor.save_to_file(preprocessor, self.save_preprocessor)

        if self.filename is not None:
            logger.info("features saved to {}.".format(self.filename))
            data = {"feature_space": feature_space}
            data.update({"reference_space": reference_space})
            dump(data, filename=self.filename)
            self.feature_space = feature_space
            self.reference_space = reference_space

        return self.feature_space, self.reference_space

    elif svm is False and purpose == "training":
        client.restart()  # Reclaims memory aggressively
        preprocessor.save_to_file(preprocessor, self.save_preprocessor)

        if self.filename is not None:
            logger.info("features saved to {}.".format(self.filename))
            dump(feature_space, filename=self.filename)
            self.feature_space = feature_space

        return self.feature_space

    else:
        self.feature_space = feature_space
        return self.feature_space
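# The same calling convention applies to this dscribe-backed featurizer
# (illustrative; ``DscribeWrapper`` is a hypothetical stand-in for whichever
# ml4chem class wraps a dscribe descriptor through ``self.create``, and the
# other placeholders are hypothetical as well):
#
# >>> featurizer = DscribeWrapper(batch_size=100)
# >>> feature_space = featurizer.calculate(
# ...     images=hashed_images, purpose="training", data=data_handler, svm=False
# ... )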
def prepare_model(
    self, feature_space, reference_features, data=None, purpose="training"
):
    """Prepare the Kernel Ridge Regression model

    Parameters
    ----------
    feature_space : dict
        A dictionary with hash, fingerprint structure.
    reference_features : dict
        A dictionary with raveled tuples of symbol, atomic fingerprint.
    data : object
        DataSet object created from the handler.
    purpose : str
        Purpose of this model: 'training', 'inference'.

    Notes
    -----
    This method builds the atomic kernel matrices and the LT vectors needed
    to apply the atomic decomposition Ansatz.
    """
    if purpose == "training":
        logger.info("Model Training")
        logger.info("Model name: {}.".format(self.name()))
        logger.info("Kernel parameters:")
        logger.info("    - Kernel function: {}.".format(self.kernel))
        logger.info("    - Sigma: {}.".format(self.sigma))
        logger.info("    - Lambda: {}.".format(self.lamda))

    dim = len(reference_features)

    """
    Atomic kernel matrices
    """
    initial_time = time.time()

    logger.info("Computing Kernel Matrix...")
    # We start populating computations with delayed functions to operate
    # with dask's scheduler.
    logger.warning("    Adding calculations to scheduler...")

    computations = self.get_kernel_matrix(feature_space, reference_features)

    scheduler_time = time.time() - initial_time
    h, m, s = convert_elapsed_time(scheduler_time)
    logger.info(
        "    {} kernel evaluations added in {} hours {} minutes "
        "{:.2f} seconds.".format(len(computations), h, m, s)
    )

    if self.batch_size is not None:
        computations = list(get_chunks(computations, self.batch_size))
        logger.info(
            "    The calculations were batched in groups of {}.".format(
                self.batch_size
            )
        )

    # We compute the calculations with dask and the result is converted
    # to a numpy array.
    logger.info("    Evaluating atomic similarities...")

    if self.batch_size is None:
        kernel_matrix = dask.compute(*computations, scheduler=self.scheduler)
    else:
        kernel_matrix = []
        for i, chunk in enumerate(computations):
            kernel_matrix.append(dask.compute(*chunk, scheduler=self.scheduler))

    self.K = np.array(kernel_matrix).reshape(dim, dim)

    build_time = time.time() - initial_time
    h, m, s = convert_elapsed_time(build_time)
    logger.info(
        "Kernel matrix built in {} hours {} minutes {:.2f} seconds.".format(h, m, s)
    )

    """
    LT Vectors
    """
    # We build the LT matrix needed for the atomic decomposition Ansatz.
    logger.info("Building LT matrix")
    computations = []
    # The loop variable is deliberately unnamed to avoid shadowing
    # ``feature_space`` while iterating over it.
    for index, _ in enumerate(feature_space.items()):
        computations.append(self.get_lt(index))

    self.LT = np.array(dask.compute(*computations, scheduler=self.scheduler))

    lt_time = time.time() - initial_time
    h, m, s = convert_elapsed_time(lt_time)
    logger.info(
        "LT matrix built in {} hours {} minutes {:.2f} seconds.".format(h, m, s)
    )
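# The linear algebra that prepare_model sets up, reduced to a self-contained
# numpy sketch (schematic only; ml4chem's actual solver, shapes, and LT
# construction may differ). Under the atomic decomposition Ansatz, LT[m, a]
# marks which atoms belong to molecule m, so a molecular kernel follows from
# the atomic one as K_mol = LT @ K @ LT.T, and training solves the
# regularized system (K_mol + lamda * I) alpha = y.
import numpy as np

n_molecules, atoms_each = 2, 3
n_atoms = n_molecules * atoms_each

rng = np.random.default_rng(0)
K = rng.random((n_atoms, n_atoms))
K = K @ K.T  # symmetric positive semi-definite atomic kernel

# Membership matrix: molecule 0 owns atoms 0-2, molecule 1 owns atoms 3-5.
LT = np.repeat(np.eye(n_molecules), atoms_each, axis=1)

y = np.array([-1.0, -2.0])  # e.g. total energies
lamda = 1e-5  # regularization, matching the self.lamda attribute above

K_mol = LT @ K @ LT.T
alpha = np.linalg.solve(K_mol + lamda * np.eye(n_molecules), y)
print(K_mol @ alpha)  # reproduces y up to regularization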
def get_kernel_matrix(self, feature_space, reference_features, purpose):
    """Get kernel matrix delayed computations

    Parameters
    ----------
    feature_space : dict, list
        Dictionary with hash and features, or a list.
    reference_features : array
        Array with the reference feature space.
    purpose : str
        Purpose of this kernel matrix. Accepted arguments are 'training'
        and 'inference'.

    Returns
    -------
    kernel_matrix : list
        List with kernel matrix values.

    Notes
    -----
    This class method expects feature_space to be an OrderedDict and
    reference_features to be a list; however, when computing variances,
    feature_space may also be a list.
    """

    call = {"exponential": exponential, "laplacian": laplacian, "rbf": rbf}

    initial_time = time.time()

    if isinstance(reference_features, dict):
        # This is the case when the reference_features are a dictionary,
        # too. If that's true, we have to convert them to a list.
        reference_features = list(reference_features.values())[0]

    chunks = list(get_chunks(feature_space, self.batch_size))

    logger.info(
        "    The calculations are distributed in {} batches of {} atoms.".format(
            len(chunks), self.batch_size
        )
    )

    counter = 0
    kernel_matrix = []

    for c, chunk in enumerate(chunks):
        chunk_initial_time = time.time()
        logger.info("        Computing kernel functions for chunk {}...".format(c))

        intermediates = []

        if isinstance(feature_space, dict) and isinstance(reference_features, list):
            if isinstance(chunk, dict) is False:
                chunk = OrderedDict(chunk)

            reference_length = len(reference_features)

            for hash, _feature_space in chunk.items():
                f_map = []

                for i_symbol, i_afp in _feature_space:
                    i_symbol = decode(i_symbol)
                    f_map.append(1)

                    if purpose == "training":
                        for j in range(counter, reference_length):
                            j_symbol, j_afp = reference_features[j]
                            kernel = call[self.kernel](
                                i_afp, j_afp, i_symbol, j_symbol, self.sigma
                            )
                            intermediates.append(kernel)
                        counter += 1
                    else:
                        for j_symbol, j_afp in reference_features:
                            j_symbol = decode(j_symbol)
                            kernel = call[self.kernel](
                                i_afp, j_afp, i_symbol, j_symbol, self.sigma
                            )
                            intermediates.append(kernel)

                self.fingerprint_map.append(f_map)

        elif isinstance(feature_space, list) and isinstance(reference_features, list):
            for i_symbol, i_afp in chunk:
                for j_symbol, j_afp in reference_features:
                    i_symbol = decode(i_symbol)
                    j_symbol = decode(j_symbol)
                    kernel = call[self.kernel](
                        i_afp, j_afp, i_symbol, j_symbol, self.sigma
                    )
                    intermediates.append(kernel)

        # Compute the intermediates populated above.
        kernel_matrix += dask.compute(intermediates, scheduler=self.scheduler)[0]
        del intermediates

        chunk_final_time = time.time() - chunk_initial_time
        h, m, s = convert_elapsed_time(chunk_final_time)
        logger.info(
            "        ...finished in {} hours {} minutes {:.2f} seconds.".format(
                h, m, s
            )
        )

    del reference_features

    build_time = time.time() - initial_time
    h, m, s = convert_elapsed_time(build_time)
    logger.info(
        "Kernel matrix built in {} hours {} minutes {:.2f} seconds.".format(h, m, s)
    )

    """
    LT Vectors
    """
    # We build the LT matrix needed for the atomic decomposition Ansatz.
    if purpose == "training":
        self.LT = []
        logger.info("Building LT matrix")

        computations = []
        # The loop variable is deliberately unnamed to avoid shadowing
        # ``feature_space`` while iterating over it.
        for index, _ in enumerate(feature_space.items()):
            computations.append(self.get_lt(index))

        computations = list(get_chunks(computations, self.batch_size))

        logger.info(
            "    The calculations are distributed in {} batches of {} "
            "molecules.".format(len(computations), self.batch_size)
        )

        for chunk in computations:
            self.LT += dask.compute(*chunk, scheduler=self.scheduler)

        self.LT = np.array(self.LT)

        del computations
        del chunk

        lt_time = time.time() - initial_time
        h, m, s = convert_elapsed_time(lt_time)
        logger.info(
            "LT matrix built in {} hours {} minutes {:.2f} seconds.".format(h, m, s)
        )

    return kernel_matrix
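# The chunked, delayed evaluation pattern used above, reduced to a
# self-contained dask sketch (illustrative; the real code builds kernel
# closures from symbol-tagged atomic fingerprints and batches them with
# get_chunks, while this toy version uses a plain RBF on random vectors):
import dask
import numpy as np

def rbf_kernel(x, y, sigma=1.0):
    return float(np.exp(-np.linalg.norm(x - y) ** 2 / (2.0 * sigma ** 2)))

features = [np.random.rand(3) for _ in range(8)]

# Populate delayed computations, one per kernel entry.
delayed_kernels = [
    dask.delayed(rbf_kernel)(x, y) for x in features for y in features
]

# Evaluate in batches, as get_kernel_matrix does per chunk.
batch_size = 16
kernel_matrix = []
for start in range(0, len(delayed_kernels), batch_size):
    chunk = delayed_kernels[start:start + batch_size]
    kernel_matrix += dask.compute(*chunk, scheduler="threads")

K = np.array(kernel_matrix).reshape(len(features), len(features))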