def sparse_shuffle(self):
    """Shuffle the whole dataset in memory using a sparse feature encoding.

    Every shard is loaded, its features are passed through
    ``sparsify_features``, and all shards are stacked in memory. A single
    random permutation is applied to features, labels, weights and ids, and
    the result is written back shard by shard (densified again with
    ``densify_features``) in slices of ``self.get_shard_size()`` rows.

    Only for 1-dimensional feature vectors (does not work for tensorial
    featurizations).
    """
    time1 = time.time()
    shard_size = self.get_shard_size()
    num_shards = self.get_number_shards()
    X_sparses, ys, ws, ids = [], [], [], []
    num_features = None
    for i in range(num_shards):
        X_s, y_s, w_s, ids_s = self.get_shard(i)
        if num_features is None:
            # Remember the dense width so shards can be densified later.
            num_features = X_s.shape[1]
        X_sparses.append(sparsify_features(X_s))
        ys.append(y_s)
        ws.append(w_s)
        # np.atleast_1d keeps a single-sample shard from collapsing to 0-d.
        ids.append(np.atleast_1d(np.squeeze(ids_s)))

    # Get full dataset in memory. Everything here is a NumPy array, so the
    # ids are joined with np.concatenate (torch.cat rejects ndarrays).
    X_sparse = np.vstack(X_sparses)
    y = np.vstack(ys)
    w = np.vstack(ws)
    ids = np.concatenate(ids)

    # Shuffle in memory with one permutation shared by all four arrays.
    num_samples = len(X_sparse)
    permutation = np.random.permutation(num_samples)
    X_sparse = X_sparse[permutation]
    y = y[permutation]
    w = w[permutation]
    ids = ids[permutation]

    # Write shuffled shards out to disk
    for i in range(num_shards):
        start, stop = i * shard_size, (i + 1) * shard_size
        X_s = densify_features(X_sparse[start:stop], num_features)
        self.set_shard(i, X_s, y[start:stop], w[start:stop], ids[start:stop])
    time2 = time.time()
    log("TIMING: sparse_shuffle took %0.3f s" % (time2 - time1), self.verbose)
def shard_generator():
    """Yield one ``(X, y, w, ids)`` tuple per shard from ``self.get_shards``.

    Relies on ``self``, ``input_files`` and ``shard_size`` from the
    enclosing scope. ``y`` and ``w`` are ``None`` when ``self.tasks`` is
    empty.
    """
    raw_shards = self.get_shards(input_files, shard_size)
    for shard_num, shard in enumerate(raw_shards):
        time1 = time.time()
        X, valid_inds = self.featurize_shard(shard)
        # Keep only the ids of rows that featurized successfully.
        ids = shard[self.id_field].values[valid_inds]
        if len(self.tasks) == 0:
            # For prospective data where results are unknown, it makes
            # no sense to have y values or weights.
            y = w = None
            assert len(X) == len(ids)
        else:
            # Featurize task results iff they exist.
            y, w = convert_df_to_numpy(shard, self.tasks, self.id_field)
            # Filter out examples where featurization failed.
            y = y[valid_inds]
            w = w[valid_inds]
            assert len(X) == len(ids) == len(y) == len(w)
        time2 = time.time()
        message = "TIMING: featurizing shard %d took %0.3f s" % (
            shard_num, time2 - time1)
        log(message, self.verbose)
        yield X, y, w, ids
def create_dataset(shard_generator, data_dir=None, tasks=None, verbose=True):
    """Creates a new DiskDataset

    Parameters
    ----------
    shard_generator: Iterable
      An iterable (either a list or generator) that provides tuples of data
      (X, y, w, ids). Each tuple will be written to a separate shard on disk.
    data_dir: str
      Filename for data directory. Creates a temp directory if none specified.
    tasks: list
      List of tasks for this dataset. Defaults to an empty list.
    verbose: bool
      Passed to ``log`` for the timing message and to the returned
      ``DiskDataset``.

    Returns
    -------
    DiskDataset
      A dataset constructed on ``data_dir`` after all shards and the
      metadata have been written.
    """
    # Build a fresh list per call; a literal `[]` default would be a single
    # list object shared by every call that omits `tasks`.
    if tasks is None:
        tasks = []
    if data_dir is None:
        data_dir = tempfile.mkdtemp()
    elif not os.path.exists(data_dir):
        os.makedirs(data_dir)
    metadata_rows = []
    time1 = time.time()
    for shard_num, (X, y, w, ids) in enumerate(shard_generator):
        basename = "shard-%d" % shard_num
        metadata_rows.append(
            DiskDataset.write_data_to_disk(data_dir, basename, tasks, X, y, w,
                                           ids))
    metadata_df = DiskDataset._construct_metadata(metadata_rows)
    save_metadata(tasks, metadata_df, data_dir)
    time2 = time.time()
    log("TIMING: dataset construction took %0.3f s" % (time2 - time1), verbose)
    return DiskDataset(data_dir, verbose=verbose)
def __init__(self, data_dir, verbose=True):
    """Attach to a dataset directory and load its metadata.

    Stores ``data_dir`` and ``verbose`` on the instance, logs a loading
    message, then fills ``self.tasks`` and ``self.metadata_df`` from the
    pair returned by ``self.load_metadata()``.
    """
    self.verbose = verbose
    self.data_dir = data_dir
    log("Loading dataset from disk.", self.verbose)
    loaded = self.load_metadata()
    self.tasks, self.metadata_df = loaded
def featurize(self, input_files, data_dir=None, shard_size=8192):
    """Featurize the given files shard by shard and build a DiskDataset.

    Parameters
    ----------
    input_files: list
      List of input filenames. A single non-list value is wrapped in a list.
    data_dir: str
      (Optional) Directory to store featurized dataset.
    shard_size: int
      (Optional) Number of examples stored in each shard.

    Returns
    -------
    The result of ``DiskDataset.create_dataset`` fed with the featurized
    shards.
    """
    log("Loading raw samples now.", self.verbose)
    log("shard_size: %d" % shard_size, self.verbose)

    files = input_files if isinstance(input_files, list) else [input_files]

    def featurized_shards():
        # Lazily produce (X, y, w, ids) for each raw shard.
        raw_shards = self.get_shards(files, shard_size)
        for shard_num, shard in enumerate(raw_shards):
            time1 = time.time()
            X, valid_inds = self.featurize_shard(shard)
            # Keep only the ids of rows that featurized successfully.
            ids = shard[self.id_field].values[valid_inds]
            if len(self.tasks) == 0:
                # For prospective data where results are unknown, it makes
                # no sense to have y values or weights.
                y = w = None
                assert len(X) == len(ids)
            else:
                # Featurize task results iff they exist.
                y, w = convert_df_to_numpy(shard, self.tasks, self.id_field)
                # Filter out examples where featurization failed.
                y = y[valid_inds]
                w = w[valid_inds]
                assert len(X) == len(ids) == len(y) == len(w)
            time2 = time.time()
            message = "TIMING: featurizing shard %d took %0.3f s" % (
                shard_num, time2 - time1)
            log(message, self.verbose)
            yield X, y, w, ids

    return DiskDataset.create_dataset(
        featurized_shards(), data_dir, self.tasks, verbose=self.verbose)
def get_user_specified_features(df, featurizer, verbose=True):
    """Return the user-supplied feature columns of ``df`` as a NumPy matrix.

    The columns named in ``featurizer.feature_fields`` are converted with
    ``pd.to_numeric`` and written back into ``df`` (the caller's dataframe
    is modified in place), then extracted as an array.

    Parameters
    ----------
    df: pandas.DataFrame
      Dataframe that already contains the feature columns.
    featurizer: object
      Must expose ``feature_fields``, the column labels to extract.
    verbose: bool
      Passed to ``log`` for the timing message.

    Returns
    -------
    numpy.ndarray
      Values of the selected columns, one row per dataframe row.
    """
    time1 = time.time()
    feature_fields = featurizer.feature_fields
    df[feature_fields] = df[feature_fields].apply(pd.to_numeric)
    # DataFrame.as_matrix was removed in pandas 1.0; selecting the columns
    # and taking .values yields the same array on old and new pandas.
    X_shard = df[feature_fields].values
    time2 = time.time()
    log("TIMING: user specified processing took %0.3f s" % (time2 - time1),
        verbose)
    return X_shard
def featurize_mol_df(df, featurizer, field, verbose=True, log_every_N=1000):
    """Featurize each entry of ``df[field]`` with ``featurizer``.

    Each element is passed to ``featurizer.featurize`` as a one-item list.
    Results whose ``size`` is zero are treated as failed featurizations and
    dropped from the returned features.

    Returns
    -------
    tuple
      ``(features, valid_inds)``: the squeezed ``torch.Tensor`` built from
      the surviving results, and a boolean NumPy array with one flag per
      input row marking which rows survived.
    """
    features = []
    for ind, mol in enumerate(df[field].tolist()):
        if ind % log_every_N == 0:
            log("Featurizing sample %d" % ind, verbose)
        features.append(featurizer.featurize([mol]))

    # An empty result marks a sample the featurizer could not handle.
    valid_inds = np.array([elt.size > 0 for elt in features], dtype=bool)
    kept = [elt for elt, keep in zip(features, valid_inds) if keep]
    stacked = torch.Tensor(kept)
    return torch.squeeze(stacked), valid_inds
def featurize_smiles_torch(arr, featurizer, log_every_N=1000, verbose=True):
    """Featurize the SMILES strings in ``arr`` and return a flat tensor.

    Each element of ``arr.tolist()`` is parsed with ``Chem.MolFromSmiles``.
    Parsed molecules have their atoms renumbered into canonical rank order
    before being passed to ``featurizer.featurize`` as a one-item list.
    Results whose ``size`` is zero are dropped.

    Parameters
    ----------
    arr: object exposing ``tolist()``
      Container of SMILES strings.
    featurizer: object
      Must expose ``featurize(list_of_mols)``.
    log_every_N: int
      A progress message is logged every ``log_every_N`` samples.
    verbose: bool
      Passed to ``log``.

    Returns
    -------
    torch.Tensor
      The surviving features, squeezed and reshaped to one dimension.
    """
    features = []
    for ind, elem in enumerate(arr.tolist()):
        mol = Chem.MolFromSmiles(elem)
        if mol:
            # Put atoms in canonical order so equivalent SMILES agree.
            new_order = rdmolfiles.CanonicalRankAtoms(mol)
            mol = rdmolops.RenumberAtoms(mol, new_order)
        if ind % log_every_N == 0:
            log("Featurizing sample %d" % ind, verbose)
        features.append(featurizer.featurize([mol]))
    # Drop failed featurizations directly. The validity mask was only used
    # locally, and torch.Tensor(..., dtype=bool) is not a valid call because
    # the legacy constructor takes no dtype argument.
    features = [elt for elt in features if elt.size > 0]
    features = torch.squeeze(torch.Tensor(features))
    return features.reshape(-1)
def featurize_smiles_df(df, featurizer, field, log_every_N=1000, verbose=True):
    """Featurize the SMILES strings stored in ``df[field]``.

    Each string is parsed with ``Chem.MolFromSmiles``. Parsed molecules have
    their atoms renumbered into canonical rank order before being passed to
    ``featurizer.featurize`` as a one-item list. Results whose ``size`` is
    zero are treated as failed featurizations and dropped.

    Parameters
    ----------
    df: pandas.DataFrame
      Dataframe holding the SMILES column.
    featurizer: object
      Must expose ``featurize(list_of_mols)``.
    field: str
      Name of the column containing SMILES strings.
    log_every_N: int
      A progress message is logged every ``log_every_N`` samples.
    verbose: bool
      Passed to ``log``.

    Returns
    -------
    tuple
      ``(features, valid_inds)``: a ``torch.Tensor`` of the surviving
      features with dimension 1 squeezed out, and a boolean NumPy array
      with one flag per input row marking which rows survived.
    """
    # Local import keeps this function self-contained; it is a no-op if the
    # module already imports numpy.
    import numpy as np

    sample_elems = df[field].tolist()
    features = []
    for ind, elem in enumerate(sample_elems):
        mol = Chem.MolFromSmiles(elem)
        if mol:
            # Put atoms in canonical order so equivalent SMILES agree.
            new_order = rdmolfiles.CanonicalRankAtoms(mol)
            mol = rdmolops.RenumberAtoms(mol, new_order)
        if ind % log_every_N == 0:
            log("Featurizing sample %d" % ind, verbose)
        features.append(featurizer.featurize([mol]))
    # torch.Tensor(...) accepts no dtype argument, so the mask is built as a
    # NumPy bool array, which can also index NumPy id/label arrays directly.
    valid_inds = np.array([elt.size > 0 for elt in features], dtype=bool)
    features = [
        elt for (is_valid, elt) in zip(valid_inds, features) if is_valid
    ]
    if not features:
        # Nothing survived: the empty tensor is 1-D, so squeezing dim 1
        # would raise. Return it as is.
        return torch.Tensor(features), valid_inds
    return torch.squeeze(torch.Tensor(features), dim=1), valid_inds
def featurize_shard(self, shard):
    """Featurizes a shard of an input dataframe.

    Logs the featurizer class name, then delegates to ``featurize_mol_df``
    on the ``self.mol_field`` column and returns its result unchanged.
    """
    log(
        "Currently featurizing feature_type: %s" %
        self.featurizer.__class__.__name__, self.verbose)
    # Forward the instance's verbosity so per-sample progress logging obeys
    # the same setting as the message above.
    return featurize_mol_df(
        shard, self.featurizer, field=self.mol_field, verbose=self.verbose)