def predict(self, dataset: Dataset,
            transformers: List[Transformer] = []) -> np.ndarray:
  """Uses self to make predictions on provided Dataset object.

  Parameters
  ----------
  dataset: Dataset
    Dataset to make prediction on
  transformers: List[Transformer]
    Transformers that the input data has been transformed by. The output
    is passed through these transformers to undo the transformations.

  Returns
  -------
  np.ndarray
    A numpy array of predictions the model produces.
  """
  y_preds = []
  for (X_batch, _, _, ids_batch) in dataset.iterbatches(deterministic=True):
    n_samples = len(X_batch)
    y_pred_batch = self.predict_on_batch(X_batch)
    # Discard any padded predictions
    y_pred_batch = y_pred_batch[:n_samples]
    # Undo transformations applied to the training data
    y_pred_batch = undo_transforms(y_pred_batch, transformers)
    y_preds.append(y_pred_batch)
  y_pred = np.concatenate(y_preds)
  return y_pred
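A minimal usage sketch for the method above. The model instance and data are illustrative assumptions, not part of the API shown; `model` stands for any fitted Model subclass:

import deepchem as dc
import numpy as np

X = np.random.rand(20, 5)
y = np.random.rand(20, 1)
dataset = dc.data.NumpyDataset(X, y)
# Normalize y; predict() will undo this, so predictions come back
# in the original units.
transformer = dc.trans.NormalizationTransformer(
    transform_y=True, dataset=dataset)
dataset = transformer.transform(dataset)
y_pred = model.predict(dataset, [transformer])  # `model` is hypothetical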
def fit(self, dataset: Dataset, nb_epoch: int = 10) -> float:
  """Fits a model on data in a Dataset object.

  Parameters
  ----------
  dataset: Dataset
    the Dataset to train on
  nb_epoch: int
    the number of epochs to train for

  Returns
  -------
  float
    The average loss over the most recent epoch.
  """
  for epoch in range(nb_epoch):
    logger.info("Starting epoch %s" % str(epoch + 1))
    losses = []
    for (X_batch, y_batch, w_batch, ids_batch) in dataset.iterbatches():
      losses.append(self.fit_on_batch(X_batch, y_batch, w_batch))
    logger.info("Avg loss for epoch %d: %f" %
                (epoch + 1, np.array(losses).mean()))
  return np.array(losses).mean()
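For reference, a sketch of the call pattern this supports, assuming `model` is any Model subclass implementing fit_on_batch:

avg_loss = model.fit(dataset, nb_epoch=5)
# The return value is the mean of the per-batch losses from the
# final epoch only, not across all five epochs.
print("Average loss over the final epoch: %f" % avg_loss)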
def predict(self, dataset: Dataset,
            transformers: List[Transformer] = []) -> OneOrMany[np.ndarray]:
  """Uses self to make predictions on provided Dataset object.

  Parameters
  ----------
  dataset: dc.data.Dataset
    Dataset to make prediction on
  transformers: list of dc.trans.Transformers
    Transformers that the input data has been transformed by. The output
    is passed through these transformers to undo the transformations.

  Returns
  -------
  OneOrMany[np.ndarray]
    A NumPy array if the model produces a single output, or a list of
    arrays if it produces multiple outputs.
  """
  y_preds = []
  for (X_batch, _, _, ids_batch) in dataset.iterbatches(deterministic=True):
    n_samples = len(X_batch)
    y_pred_batch = self.predict_on_batch(X_batch)
    # Discard any padded predictions
    y_pred_batch = y_pred_batch[:n_samples]
    y_pred_batch = undo_transforms(y_pred_batch, transformers)
    y_preds.append(y_pred_batch)
  y_pred = np.concatenate(y_preds)
  return y_pred
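The untransform step is the key contract here: undo_transforms walks the transformer list in reverse, so y-transformations are undone last-to-first. A standalone sketch of that behavior, for illustration only:

def undo_transforms_sketch(y, transformers):
  # Undo each y-transformation in reverse order of application.
  for transformer in reversed(transformers):
    if transformer.transform_y:
      y = transformer.untransform(y)
  return y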
def fit(self, dataset: Dataset, nb_epoch: int = 10) -> float:
  """Fits a model on data in a Dataset object.

  Parameters
  ----------
  dataset: Dataset
    the Dataset to train on
  nb_epoch: int
    the number of epochs to train for

  Returns
  -------
  float
    the average loss over the most recent epoch
  """
  # TODO(rbharath/enf): We need a structured way to deal with potential GPU
  # memory overflows.
  for epoch in range(nb_epoch):
    logger.info("Starting epoch %s" % str(epoch + 1))
    losses = []
    for (X_batch, y_batch, w_batch, ids_batch) in dataset.iterbatches():
      losses.append(self.fit_on_batch(X_batch, y_batch, w_batch))
    logger.info("Avg loss for epoch %d: %f" %
                (epoch + 1, np.array(losses).mean()))
  return np.array(losses).mean()
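A minimal sketch of the contract fit() assumes of subclasses: implement fit_on_batch to take one training step and return the batch loss. The least-squares details below are illustrative assumptions, not DeepChem API:

class LeastSquaresSketch(Model):
  """Hypothetical subclass used only to illustrate the fit_on_batch contract."""

  def __init__(self, n_features, **kwargs):
    super(LeastSquaresSketch, self).__init__(**kwargs)
    self.coef = np.zeros(n_features)

  def fit_on_batch(self, X, y, w):
    # One SGD step on weighted squared error; fit() averages the
    # returned losses over each epoch.
    residual = X.dot(self.coef) - y.ravel()
    self.coef -= 0.01 * X.T.dot(w.ravel() * residual) / len(X)
    return float(np.mean(w.ravel() * residual ** 2))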
def default_generator(
    self,
    dataset: Dataset,
    epochs: int = 1,
    mode: str = 'fit',
    deterministic: bool = True,
    pad_batches: bool = True) -> Iterable[Tuple[List, List, List]]:
  """Create a generator that iterates batches for a dataset.

  Subclasses may override this method to customize how model inputs are
  generated from the data.

  Parameters
  ----------
  dataset: Dataset
    the data to iterate
  epochs: int
    the number of times to iterate over the full dataset
  mode: str
    allowed values are 'fit' (called during training), 'predict' (called
    during prediction), and 'uncertainty' (called during uncertainty
    prediction)
  deterministic: bool
    whether to iterate over the dataset in order, or randomly shuffle the
    data for each epoch
  pad_batches: bool
    whether to pad each batch up to this model's preferred batch size

  Returns
  -------
  a generator that iterates batches, each represented as a tuple of
  lists: ([inputs], [outputs], [weights])
  """
  for epoch in range(epochs):
    for (X_b, y_b, w_b, ids_b) in dataset.iterbatches(
        batch_size=self.batch_size,
        deterministic=deterministic,
        pad_batches=pad_batches):
      yield ([X_b], [y_b], [w_b])
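A sketch of consuming the generator directly, assuming `model` has batch_size set and `dataset` is a dc.data.Dataset:

for inputs, outputs, weights in model.default_generator(dataset, epochs=1):
  X_b, = inputs  # the base implementation wraps each array in a one-item list
  y_b, = outputs
  w_b, = weights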
def default_generator(
    self,
    dataset: Dataset,
    epochs: int = 1,
    mode: str = 'fit',
    deterministic: bool = True,
    pad_batches: bool = True) -> Iterable[Tuple[List, List, List]]:
  """Convert a dataset into the tensors needed for learning.

  Parameters
  ----------
  dataset: `dc.data.Dataset`
    Dataset to convert
  epochs: int, optional (Default 1)
    Number of times to walk over `dataset`
  mode: str, optional (Default 'fit')
    Ignored in this implementation.
  deterministic: bool, optional (Default True)
    Whether the dataset should be walked in a deterministic fashion
  pad_batches: bool, optional (Default True)
    If True, each returned batch will have size `self.batch_size`.

  Returns
  -------
  Iterator which walks over the batches
  """
  for epoch in range(epochs):
    for (X_b, y_b, w_b, ids_b) in dataset.iterbatches(
        batch_size=self.batch_size,
        deterministic=deterministic,
        pad_batches=pad_batches):
      if y_b is not None and self.mode == 'classification':
        # Expand integer labels to one-hot of shape
        # (batch, n_tasks, n_classes)
        y_b = to_one_hot(y_b.flatten(), self.n_classes).reshape(
            -1, self.n_tasks, self.n_classes)
      inputs = self.compute_features_on_batch(X_b)
      yield (inputs, [y_b], [w_b])
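A numeric sketch of the label reshaping above; the standalone one-hot below matches what to_one_hot produces for integer class labels:

import numpy as np

n_tasks, n_classes = 2, 3
y_b = np.array([[0, 2], [1, 1]])  # shape (batch, n_tasks)
flat = y_b.flatten()  # shape (batch * n_tasks,)
one_hot = np.eye(n_classes)[flat.astype(int)]
one_hot = one_hot.reshape(-1, n_tasks, n_classes)  # (batch, n_tasks, n_classes)
assert one_hot[0, 1, 2] == 1.0  # sample 0, task 1 was class 2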
def default_generator(
    self,
    dataset: Dataset,
    epochs: int = 1,
    mode: str = 'fit',
    deterministic: bool = True,
    pad_batches: bool = True) -> Iterable[Tuple[List, List, List]]:
  """Convert a dataset into the tensors needed for learning.

  Parameters
  ----------
  dataset: `dc.data.Dataset`
    Dataset to convert
  epochs: int, optional (Default 1)
    Number of times to walk over `dataset`
  mode: str, optional (Default 'fit')
    Ignored in this implementation.
  deterministic: bool, optional (Default True)
    Whether the dataset should be walked in a deterministic fashion
  pad_batches: bool, optional (Default True)
    If True, each returned batch will have size `self.batch_size`.

  Returns
  -------
  Iterator which walks over the batches
  """
  for epoch in range(epochs):
    for (X_b, y_b, w_b, ids_b) in dataset.iterbatches(
        batch_size=self.batch_size,
        deterministic=deterministic,
        pad_batches=pad_batches):
      if y_b is not None and self.mode == 'classification':
        # Expand integer labels to one-hot of shape
        # (batch, n_tasks, n_classes)
        y_b = to_one_hot(y_b.flatten(), self.n_classes).reshape(
            -1, self.n_tasks, self.n_classes)
      atom_feat = []
      pair_feat = []
      atom_split = []
      atom_to_pair = []
      pair_split = []
      start = 0
      for im, mol in enumerate(X_b):
        n_atoms = mol.get_num_atoms()
        # Record which molecule each atom belongs to
        atom_split.extend([im] * n_atoms)
        # Global indices of each ordered atom pair in this molecule
        C0, C1 = np.meshgrid(np.arange(n_atoms), np.arange(n_atoms))
        atom_to_pair.append(
            np.transpose(
                np.array([C1.flatten() + start,
                          C0.flatten() + start])))
        # First-atom index for each pair
        pair_split.extend(C1.flatten() + start)
        start = start + n_atoms
        # Atom features
        atom_feat.append(mol.get_atom_features())
        # Pair features, flattened to one row per ordered pair
        pair_feat.append(
            np.reshape(mol.get_pair_features(),
                       (n_atoms * n_atoms, self.n_pair_feat[0])))
      inputs = [
          np.concatenate(atom_feat, axis=0),
          np.concatenate(pair_feat, axis=0),
          np.array(pair_split),
          np.array(atom_split),
          np.concatenate(atom_to_pair, axis=0)
      ]
      yield (inputs, [y_b], [w_b])
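A numeric sketch of the pair-index bookkeeping above, for a single 3-atom molecule at global atom offset start = 0:

import numpy as np

n_atoms, start = 3, 0
C0, C1 = np.meshgrid(np.arange(n_atoms), np.arange(n_atoms))
atom_to_pair = np.transpose(
    np.array([C1.flatten() + start, C0.flatten() + start]))
# atom_to_pair enumerates all n_atoms**2 ordered (first, second) pairs:
# [[0 0] [0 1] [0 2] [1 0] [1 1] [1 2] [2 0] [2 1] [2 2]]
pair_split = C1.flatten() + start
# pair_split tags each pair with its first atom's global index:
# [0 0 0 1 1 1 2 2 2]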