def __init__(
    self,
    inputs,
    targets,
    model=None,
    data=None,
    optimizer=(None, None),
    regularization=None,
    epochs=100,
    convergence=None,
    lossfxn=None,
    device="cpu",
    batch_size=None,
    lr_scheduler=None,
):
    self.initial_time = time.time()

    atoms_per_image = data.atoms_per_image

    if batch_size is None:
        batch_size = len(inputs.values())

    if isinstance(batch_size, int):
        # Data batches
        chunks = list(get_chunks(inputs, batch_size, svm=False))
        targets = list(get_chunks(targets, batch_size, svm=False))
        atoms_per_image = list(get_chunks(atoms_per_image, batch_size, svm=False))

    logger.info(" ")
    logging.info("Batch Information")
    logging.info("-----------------")
    logging.info("Number of batches: {}.".format(len(chunks)))
    logging.info("Batch size: {} elements per batch.".format(batch_size))
    logger.info(" ")

    atoms_per_image = torch.tensor(
        atoms_per_image, requires_grad=False, dtype=torch.float
    )
    targets = torch.tensor(targets, requires_grad=False)

    if device == "cuda":
        logger.info("Moving data to CUDA...")

        atoms_per_image = atoms_per_image.cuda()
        targets = targets.cuda()
        _inputs = OrderedDict()

        for hash, f in inputs.items():
            _inputs[hash] = []
            for features in f:
                symbol, vector = features
                _inputs[hash].append((symbol, vector.cuda()))

        inputs = _inputs

        move_time = time.time() - self.initial_time
        h, m, s = convert_elapsed_time(move_time)
        logger.info(
            "Data moved to GPU in {} hours {} minutes {:.2f} seconds.".format(h, m, s)
        )
        logger.info(" ")

    # Define optimizer
    self.optimizer_name, self.optimizer = get_optimizer(optimizer, model.parameters())

    if lr_scheduler is not None:
        self.scheduler = get_lr_scheduler(self.optimizer, lr_scheduler)

    logger.info(" ")
    logger.info("Starting training...")
    logger.info(" ")

    logger.info(
        "{:6s} {:19s} {:12s} {:8s} {:8s}".format(
            "Epoch", "Time Stamp", "Loss", "RMSE/img", "RMSE/atom"
        )
    )
    logger.info(
        "{:6s} {:19s} {:12s} {:8s} {:8s}".format(
            "------", "-------------------", "------------", "--------", "---------"
        )
    )

    self.atoms_per_image = atoms_per_image
    self.convergence = convergence
    self.device = device
    self.epochs = epochs
    self.model = model
    self.lr_scheduler = lr_scheduler

    # Data scattering
    client = dask.distributed.get_client()
    self.chunks = [client.scatter(chunk) for chunk in chunks]
    self.targets = [client.scatter(target) for target in targets]

    if lossfxn is None:
        self.lossfxn = AtomicMSELoss
    else:
        self.lossfxn = lossfxn

    # Let the hunger games begin...
    self.trainer()
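# --- Usage sketch (illustrative, not part of the source). The constructor above
# requires a Dask client to already exist, because it calls
# dask.distributed.get_client() to scatter batches. The names `EnergyTrainer`,
# `feature_space`, `energy_targets`, `nn_model`, and `data_handler` below are
# hypothetical placeholders for the class this __init__ belongs to and for
# objects prepared elsewhere.
from dask.distributed import Client

client = Client()  # start a local scheduler before instantiating the trainer

optimizer = ("adam", {"lr": 1.0e-3, "weight_decay": 0.0})
lr_scheduler = ("ReduceLROnPlateau", {"mode": "min", "patience": 10})

# EnergyTrainer(
#     feature_space,        # dict of hashed feature vectors
#     energy_targets,       # list of target values, one per image
#     model=nn_model,       # torch module exposing .parameters()
#     data=data_handler,    # object providing .atoms_per_image
#     optimizer=optimizer,
#     epochs=100,
#     batch_size=32,
#     lr_scheduler=lr_scheduler,
#     device="cpu",
# )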
def train(
    self,
    inputs,
    targets,
    data=None,
    optimizer=(None, None),
    epochs=100,
    regularization=None,
    convergence=None,
    lossfxn=None,
    device="cpu",
    batch_size=None,
    lr_scheduler=None,
    independent_loss=True,
    loss_weights=None,
):
    """Train the models

    Parameters
    ----------
    inputs : dict
        Dictionary with hashed feature space.
    targets : list
        The expected values that the model has to learn, a.k.a. y.
    data : object
        Data object created from the handler.
    optimizer : tuple
        The optimizer is a tuple with the structure:
            >>> ('adam', {'lr': float, 'weight_decay': float})
    epochs : int
        Number of full training cycles.
    regularization : float
        This is the L2 regularization. It is not the same as weight decay.
    convergence : dict
        Instead of using epochs, users can set a convergence criterion.
            >>> convergence = {"rmse": [0.04, 0.02]}
    lossfxn : obj
        A loss function object.
    device : str
        Calculation can be run on the cpu or cuda (gpu).
    batch_size : int
        Number of data points per batch to use for training. Default is None.
    lr_scheduler : tuple
        Tuple with structure: scheduler's name and a dictionary with keyword
        arguments.
            >>> lr_scheduler = ('ReduceLROnPlateau',
            ...                 {'mode': 'min', 'patience': 10})
    independent_loss : bool
        Whether or not models' weights are optimized independently.
    loss_weights : list
        How much the loss of model(i) contributes to the total loss.
    """

    self.epochs = epochs

    # Convergence criterion
    if isinstance(convergence["rmse"], (float, int)):
        convergence["rmse"] = np.array(
            [convergence["rmse"] for model in range(len(self.models))]
        )
    elif isinstance(convergence["rmse"], list):
        if len(convergence["rmse"]) != len(self.models):
            raise ValueError(
                "Your convergence list does not have the same length as the "
                "number of models."
            )
        convergence["rmse"] = np.array(convergence["rmse"])

    logger.info(" ")
    logging.info("Model Merger")
    logging.info("============")
    now = datetime.datetime.now()
    logger.info("Module accessed on {}.".format(now.strftime("%Y-%m-%d %H:%M:%S")))
    logging.info("Merging the following models:")

    for model in self.models:
        logging.info(" - {}.".format(model.name()))

    logging.info("Loss functions:")

    if loss_weights is None:
        self.loss_weights = [1.0 / len(lossfxn) for l in lossfxn]
    else:
        self.loss_weights = loss_weights

    for index, l in enumerate(lossfxn):
        logging.info(
            " - Name: {}; Weight: {}.".format(l.__name__, self.loss_weights[index])
        )

    logging.info("Convergence criterion: {}.".format(convergence))

    # If no batch_size is provided, the whole training set length is used as the
    # batch size.
    if batch_size is None:
        batch_size = len(inputs.values())

    if isinstance(batch_size, int):
        chunks = []
        for inputs_ in inputs:
            if inspect.ismethod(inputs_):
                chunks.append(inputs_)
            else:
                chunks.append(list(get_chunks(inputs_, batch_size, svm=False)))

        targets = [
            list(get_chunks(target, batch_size, svm=False)) for target in targets
        ]
        atoms_per_image = list(get_chunks(data.atoms_per_image, batch_size, svm=False))

    if lossfxn is None:
        self.lossfxn = [None for model in self.models]
    else:
        self.lossfxn = lossfxn

    self.device = device

    # Population of extra attributes needed by the models, and further data
    # preprocessing.
    for index, loss in enumerate(lossfxn):
        _args, _varargs, _keywords, _defaults = inspect.getargspec(loss)
        if "latent" in _args:
            train = dynamic_import(
                "train", "ml4chem.atomistic.models", alt_name="autoencoders"
            )
            self.inputs_chunk_vals = train.get_inputs_chunks(chunks[index])
        else:
            self.inputs_chunk_vals = None

    parameters = []
    for index, model in enumerate(self.models):
        parameters += model.parameters()

        if model.name() == "PytorchPotentials":
            # These models require targets as tensors.
            self.atoms_per_image = torch.tensor(
                atoms_per_image, requires_grad=False, dtype=torch.float
            )
            _targets = [
                torch.tensor(batch, requires_grad=False) for batch in targets[index]
            ]
            targets[index] = _targets
            del _targets
        elif model.name() in ModelMerger.autoencoders:
            targets[index] = lod_to_list(targets[index])

    # Data scattering
    client = dask.distributed.get_client()

    # self.targets = [client.scatter(target) for target in targets]
    self.targets = [target for target in targets]
    self.chunks = []

    for i, chunk in enumerate(chunks):
        if inspect.ismethod(chunk) is False:
            self.chunks.append(client.scatter(chunk))
        else:
            # This list comprehension is useful to have the same number of
            # functions as chunks without requiring user input.
            chunk = [chunk for _ in range(len(self.targets[i]))]
            self.chunks.append(chunk)

    del chunks

    logger.info(" ")
    logging.info("Batch Information")
    logging.info("-----------------")
    logging.info("Number of batches:")

    for index, c in enumerate(self.chunks):
        logging.info(" - Model {}, {}.".format(index, len(c)))

    logging.info("Batch size: {} elements per batch.\n".format(batch_size))

    # Define optimizer
    self.optimizer_name, self.optimizer = get_optimizer(optimizer, parameters)

    if lr_scheduler is not None:
        self.scheduler = get_lr_scheduler(self.optimizer, lr_scheduler)

    logger.info(" ")
    logger.info("Starting training...")
    logger.info(" ")

    logger.info(
        "{:6s} {:19s} {:12s} {:8s}".format("Epoch", "Time Stamp", "Loss", "RMSE (ave)")
    )
    logger.info(
        "{:6s} {:19s} {:12s} {:8s}".format(
            "------", "-------------------", "------------", "--------------"
        )
    )

    converged = False
    epoch = 0

    if independent_loss is False:
        # Convert list of chunks from [[a, c], [b, d]] to [[a, b], [c, d]]
        self.chunks = list(map(list, zip(*self.chunks)))

    old_state_dict = {}

    for key in self.models[1].state_dict():
        old_state_dict[key] = self.models[1].state_dict()[key].clone()

    from ml4chem.atomistic.models.autoencoders import Annealer

    annealer = Annealer()

    while not converged:
        epoch += 1
        self.annealing = annealer.update(epoch)

        self.optimizer.zero_grad()  # clear previous gradients

        if independent_loss:
            losses = []
            outputs = []
            for model_index, model in enumerate(self.models):
                loss, output = self.closure(
                    model_index, model, independent_loss, name=model.name()
                )
                losses.append(loss)
                outputs.append(output)
        else:
            loss, outputs = self.closure(index, self.models, independent_loss)

        rmse = []
        for i, model in enumerate(self.models):
            outputs_ = outputs[i]
            targets_ = self.targets[i]

            if model.name() == "VAE":
                # VAE usually returns a complex output with mus and sigmas, but
                # we only need mus at this stage.
                outputs_ = [sublist[0] for sublist in outputs_]

            rmse.append(compute_rmse(outputs_, targets_))

        rmse = np.array(rmse)
        _rmse = np.average(rmse)

        if self.optimizer_name != "LBFGS":
            self.optimizer.step()
        else:
            options = {"closure": self.closure, "current_loss": loss, "max_ls": 10}
            self.optimizer.step(options)

        ts = time.time()
        ts = datetime.datetime.fromtimestamp(ts).strftime("%Y-%m-%d %H:%M:%S")
        logger.info("{:6d} {} {:8e} {:8f}".format(epoch, ts, loss, _rmse))

        if convergence is None and epoch == self.epochs:
            converged = True
        elif convergence is not None and (rmse <= convergence["rmse"]).all():
            converged = True

            new_state_dict = {}
            for key in self.models[1].state_dict():
                new_state_dict[key] = self.models[1].state_dict()[key].clone()

            for key in old_state_dict:
                if not (old_state_dict[key] == new_state_dict[key]).all():
                    print("Diff in {}".format(key))
                else:
                    print("No diff in {}".format(key))

            print(convergence)
            print(rmse)

    print("Final")
    print(convergence)
    print(rmse)
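# --- Sketch of the convergence and loss-weight arguments consumed by
# ModelMerger.train above (illustrative values, not from the source). The
# criterion can be a single number, which the float/int branch broadcasts to
# every merged model, or one entry per model; loss_weights scales each model's
# contribution to the total loss.
import numpy as np

convergence = {"rmse": [0.04, 0.02]}  # one RMSE target per merged model
loss_weights = [1.0, 0.5]             # model 0 weighs twice as much as model 1

# Equivalent broadcast form, mirroring the isinstance(..., (float, int)) branch:
n_models = 2
scalar_criterion = {"rmse": 0.04}
broadcast = np.array([scalar_criterion["rmse"] for _ in range(n_models)])
assert broadcast.shape == (n_models,)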
def __init__(
    self,
    inputs,
    targets,
    model=None,
    data=None,
    optimizer=(None, None),
    regularization=None,
    epochs=100,
    convergence=None,
    lossfxn=None,
    device="cpu",
    batch_size=None,
    lr_scheduler=None,
    **kwargs
):
    supported_keys = ["anneal", "penalize_latent"]

    if len(kwargs.items()) == 0:
        for k in supported_keys:
            setattr(self, k, None)
    else:
        for k, v in kwargs.items():
            if k in supported_keys:
                setattr(self, k, v)

    self.initial_time = time.time()

    if device == "cuda":
        pass
        """
        logger.info('Moving data to CUDA...')

        targets = targets.cuda()
        _inputs = OrderedDict()

        for hash, f in inputs.items():
            _inputs[hash] = []
            for features in f:
                symbol, vector = features
                _inputs[hash].append((symbol, vector.cuda()))

        del inputs
        inputs = _inputs

        move_time = time.time() - initial_time
        h, m, s = convert_elapsed_time(move_time)
        logger.info('Data moved to GPU in {} hours {} minutes {:.2f} seconds.'
                    .format(h, m, s))
        """

    if batch_size is None:
        batch_size = len(inputs.values())

    if isinstance(batch_size, int):
        chunks = list(get_chunks(inputs, batch_size, svm=False))
        targets_ = list(get_chunks(targets, batch_size, svm=False))
        del targets

        # This change is needed because the targets are features or positions
        # and they are built as a dictionary.
        targets = lod_to_list(targets_)

    logging.info("Batch size: {} elements per batch.".format(batch_size))

    if device == "cuda":
        logger.info("Moving data to CUDA...")

        # targets is a list of tensors at this point, so each batch is moved to
        # the GPU individually.
        targets = [target.cuda() for target in targets]
        _inputs = OrderedDict()

        for hash, f in inputs.items():
            _inputs[hash] = []
            for features in f:
                symbol, vector = features
                _inputs[hash].append((symbol, vector.cuda()))

        inputs = _inputs

        move_time = time.time() - self.initial_time
        h, m, s = convert_elapsed_time(move_time)
        logger.info(
            "Data moved to GPU in {} hours {} minutes {:.2f} seconds.".format(h, m, s)
        )
        logger.info(" ")

    # Define optimizer
    self.optimizer_name, self.optimizer = get_optimizer(optimizer, model.parameters())

    if lr_scheduler is not None:
        self.scheduler = get_lr_scheduler(self.optimizer, lr_scheduler)

    if lossfxn is None:
        self.lossfxn = MSELoss
        self.inputs_chunk_vals = None
    else:
        logger.info("Using custom loss function...")
        logger.info("")
        self.lossfxn = lossfxn
        self.inputs_chunk_vals = self.get_inputs_chunks(chunks)

    logger.info(" ")
    logger.info("Starting training...")
    logger.info(" ")

    logger.info(
        "{:6s} {:19s} {:12s} {:9s}".format("Epoch", "Time Stamp", "Loss", "Rec Err")
    )
    logger.info(
        "{:6s} {:19s} {:12s} {:9s}".format(
            "------", "-------------------", "------------", "--------"
        )
    )

    # Data scattering
    client = dask.distributed.get_client()
    self.chunks = [client.scatter(chunk) for chunk in chunks]
    self.targets = [client.scatter(target) for target in targets]

    self.device = device
    self.epochs = epochs
    self.model = model
    self.lr_scheduler = lr_scheduler
    self.convergence = convergence

    # Let the hunger games begin...
    self.trainer()
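# --- Illustrative sketch (assumption, not from the source) of how the **kwargs
# handling at the top of the constructor above behaves: only supported keys are
# copied onto the instance, every supported key defaults to None when no keyword
# arguments are given, and unknown keys are silently ignored.
class _KwargsDemo:
    def __init__(self, **kwargs):
        supported_keys = ["anneal", "penalize_latent"]
        if len(kwargs.items()) == 0:
            for k in supported_keys:
                setattr(self, k, None)
        else:
            for k, v in kwargs.items():
                if k in supported_keys:
                    setattr(self, k, v)


demo = _KwargsDemo(anneal=True, unknown_flag=1)
assert demo.anneal is True
assert not hasattr(demo, "unknown_flag")
# Note: when some keyword arguments are passed, supported keys that were not
# supplied (here, penalize_latent) are not set at all.
assert not hasattr(demo, "penalize_latent")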
def __init__(
    self,
    inputs,
    targets,
    model=None,
    data=None,
    optimizer=(None, None),
    regularization=None,
    epochs=100,
    convergence=None,
    lossfxn=None,
    device="cpu",
    batch_size=None,
    lr_scheduler=None,
    uncertainty=None,
    checkpoint=None,
    test=None,
):
    self.initial_time = time.time()

    if lossfxn is None:
        lossfxn = AtomicMSELoss

    logger.info("")
    logger.info("Training")
    logger.info("========")
    logger.info(f"Convergence criteria: {convergence}")
    logger.info(f"Loss function: {lossfxn.__name__}")

    if uncertainty is not None:
        logger.info("Options:")
        logger.info(f" - Uncertainty penalization: {pformat(uncertainty)}")

    logger.info("")

    atoms_per_image = data.atoms_per_image

    if batch_size is None:
        batch_size = len(inputs.values())

    if isinstance(batch_size, int):
        # Data batches
        chunks = list(get_chunks(inputs, batch_size, svm=False))
        targets = list(get_chunks(targets, batch_size, svm=False))
        atoms_per_image = list(get_chunks(atoms_per_image, batch_size, svm=False))

        if uncertainty is not None:
            uncertainty = list(get_chunks(uncertainty, batch_size, svm=False))
            uncertainty = [
                torch.tensor(u, requires_grad=False, dtype=torch.float)
                for u in uncertainty
            ]

    logger.info("")
    logging.info("Batch Information")
    logging.info("-----------------")
    logging.info("Number of batches: {}.".format(len(chunks)))
    logging.info("Batch size: {} elements per batch.".format(batch_size))
    logger.info(" ")

    atoms_per_image = [
        torch.tensor(n_atoms, requires_grad=False, dtype=torch.float)
        for n_atoms in atoms_per_image
    ]

    targets = [torch.tensor(t, requires_grad=False) for t in targets]

    if device == "cuda":
        logger.info("Moving data to CUDA...")

        # atoms_per_image and targets are lists of tensors at this point, so
        # each batch is moved to the GPU individually.
        atoms_per_image = [batch.cuda() for batch in atoms_per_image]
        targets = [batch.cuda() for batch in targets]
        _inputs = OrderedDict()

        for hash, f in inputs.items():
            _inputs[hash] = []
            for features in f:
                symbol, vector = features
                _inputs[hash].append((symbol, vector.cuda()))

        inputs = _inputs

        move_time = time.time() - self.initial_time
        h, m, s = convert_elapsed_time(move_time)
        logger.info(
            "Data moved to GPU in {} hours {} minutes {:.2f} seconds.".format(h, m, s)
        )
        logger.info(" ")

    # Define optimizer
    self.optimizer_name, self.optimizer = get_optimizer(optimizer, model.parameters())

    if lr_scheduler is not None:
        self.scheduler = get_lr_scheduler(self.optimizer, lr_scheduler)

    self.atoms_per_image = atoms_per_image
    self.convergence = convergence
    self.device = device
    self.epochs = epochs
    self.model = model
    self.lr_scheduler = lr_scheduler
    self.lossfxn = lossfxn
    self.checkpoint = checkpoint
    self.test = test

    # Data scattering
    client = dask.distributed.get_client()
    self.chunks = [client.scatter(chunk) for chunk in chunks]
    self.targets = [client.scatter(target) for target in targets]

    if uncertainty is not None:
        self.uncertainty = [client.scatter(u) for u in uncertainty]
    else:
        self.uncertainty = uncertainty

    # Let the hunger games begin...
    self.trainer()
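# --- Small sketch (illustrative only) of the uncertainty preprocessing above:
# per-image uncertainties are split into the same batches as the targets and
# each batch becomes a float tensor. `chunked` is a local stand-in for what
# get_chunks is used for here; it is not the library function.
import torch


def chunked(sequence, size):
    """Yield successive batches of `size` elements."""
    for i in range(0, len(sequence), size):
        yield sequence[i : i + size]


uncertainty = [0.01, 0.02, 0.05, 0.03, 0.04]  # one value per training image
batch_size = 2

uncertainty_batches = [
    torch.tensor(u, requires_grad=False, dtype=torch.float)
    for u in chunked(uncertainty, batch_size)
]
assert [len(u) for u in uncertainty_batches] == [2, 2, 1]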
def train(
    self,
    inputs,
    targets,
    data=None,
    optimizer=(None, None),
    regularization=None,
    epochs=100,
    convergence=None,
    lossfxn=None,
    device="cpu",
    batch_size=None,
    lr_scheduler=None,
    independent_loss=True,
    loss_weights=None,
):
    logger.info(" ")
    logging.info("Model Merger")
    logging.info("============")
    logging.info("Merging the following models:")

    for model in self.models:
        logging.info(" - {}.".format(model.name()))

    logging.info("Loss functions:")

    if loss_weights is None:
        self.loss_weights = [1.0 / len(lossfxn) for l in lossfxn]
    else:
        self.loss_weights = loss_weights

    for l in lossfxn:
        logging.info(" - {}.".format(l.__name__))

    # If no batch_size is provided, the whole training set length is used as the
    # batch size.
    if batch_size is None:
        batch_size = len(inputs.values())

    if isinstance(batch_size, int):
        chunks = []
        for inputs_ in inputs:
            if inspect.ismethod(inputs_):
                chunks.append(inputs_)
            else:
                chunks.append(list(get_chunks(inputs_, batch_size, svm=False)))

        targets = [
            list(get_chunks(target, batch_size, svm=False)) for target in targets
        ]
        atoms_per_image = list(get_chunks(data.atoms_per_image, batch_size, svm=False))

    if lossfxn is None:
        self.lossfxn = [None for model in self.models]
    else:
        self.lossfxn = lossfxn

    self.device = device

    # Population of extra attributes needed by the models, and further data
    # preprocessing.
    for index, loss in enumerate(lossfxn):
        _args, _varargs, _keywords, _defaults = inspect.getargspec(loss)
        if "latent" in _args:
            train = dynamic_import("train", "ml4chem.models", alt_name="autoencoders")
            self.inputs_chunk_vals = train.get_inputs_chunks(chunks[index])

    parameters = []
    for index, model in enumerate(self.models):
        parameters += model.parameters()

        if model.name() == "PytorchPotentials":
            # These models require targets as tensors.
            self.atoms_per_image = torch.tensor(
                atoms_per_image, requires_grad=False, dtype=torch.float
            )
            _targets = [
                torch.tensor(batch, requires_grad=False) for batch in targets[index]
            ]
            targets[index] = _targets
            del _targets
        elif model.name() == "AutoEncoder":
            targets[index] = lod_to_list(targets[index])

    # Data scattering
    client = dask.distributed.get_client()

    # self.targets = [client.scatter(target) for target in targets]
    self.targets = [target for target in targets]
    self.chunks = []

    for i, chunk in enumerate(chunks):
        if inspect.ismethod(chunk) is False:
            self.chunks.append(client.scatter(chunk))
        else:
            # This list comprehension is useful to have the same number of
            # functions as chunks without requiring user input.
            chunk = [chunk for _ in range(len(self.targets[i]))]
            self.chunks.append(chunk)

    del chunks

    logger.info(" ")
    logging.info("Batch Information")
    logging.info("-----------------")
    logging.info("Number of batches:")

    for index, c in enumerate(self.chunks):
        logging.info(' - Model {}, {}.'.format(index, len(c)))

    logging.info("Batch size: {} elements per batch.\n".format(batch_size))

    # Define optimizer
    self.optimizer_name, self.optimizer = get_optimizer(optimizer, parameters)

    if lr_scheduler is not None:
        self.scheduler = get_lr_scheduler(self.optimizer, lr_scheduler)

    logger.info(" ")
    logger.info("Starting training...")
    logger.info(" ")

    logger.info(
        "{:6s} {:19s} {:12s} {:8s}".format("Epoch", "Time Stamp", "Loss", "RMSE (ave)")
    )
    logger.info(
        "{:6s} {:19s} {:12s} {:8s}".format(
            "------", "-------------------", "------------", "--------------"
        )
    )

    converged = False
    epoch = 0

    if independent_loss is False:
        # Convert list of chunks from [[a, c], [b, d]] to [[a, b], [c, d]]
        self.chunks = list(map(list, zip(*self.chunks)))

    old_state_dict = {}

    for key in self.models[1].state_dict():
        old_state_dict[key] = self.models[1].state_dict()[key].clone()

    while not converged:
        epoch += 1

        self.optimizer.zero_grad()  # clear previous gradients

        if independent_loss:
            losses = []
            for model_index, model in enumerate(self.models):
                name = model.name()
                loss, outputs = self.closure(
                    model_index, model, independent_loss, name=name
                )
                losses.append(loss)
        else:
            loss, outputs = self.closure(index, self.models, independent_loss)

        rmse = []
        for i, model in enumerate(self.models):
            rmse.append(compute_rmse(outputs[i], self.targets[i]))
        # print(outputs[1])
        # print(targets[1])
        # print(rmse)

        _rmse = np.average(rmse)

        if self.optimizer_name != "LBFGS":
            self.optimizer.step()
        else:
            options = {"closure": self.closure, "current_loss": loss, "max_ls": 10}
            self.optimizer.step(options)

        ts = time.time()
        ts = datetime.datetime.fromtimestamp(ts).strftime("%Y-%m-%d %H:%M:%S")
        logger.info("{:6d} {} {:8e} {:8f}".format(epoch, ts, loss, _rmse))

        if convergence is None and epoch == self.epochs:
            converged = True
        elif convergence is not None and all(
            i <= convergence["rmse"] for i in rmse
        ):
            converged = True

            new_state_dict = {}
            for key in self.models[1].state_dict():
                new_state_dict[key] = self.models[1].state_dict()[key].clone()

            for key in old_state_dict:
                if not (old_state_dict[key] == new_state_dict[key]).all():
                    print('Diff in {}'.format(key))
                else:
                    print('No diff in {}'.format(key))
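# --- Sketch (illustrative only) of the state_dict comparison used in the loop
# above to check whether a model's parameters actually changed: clone every
# tensor before the step, then compare element-wise afterwards.
import torch

model = torch.nn.Linear(3, 1)
old_state = {k: v.clone() for k, v in model.state_dict().items()}

with torch.no_grad():
    model.weight.add_(0.1)  # stand-in for an optimizer step

for key, old_value in old_state.items():
    changed = not torch.equal(old_value, model.state_dict()[key])
    print("{} in {}".format("Diff" if changed else "No diff", key))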