def get_grad_statistics(self):
  """Computes and returns gradient statistics of this dataset.

  Assumes the first task of each shard's y holds the energy for an input
  system and the remaining tasks hold the gradient for the system.

  Returns
  -------
  (grad, ydely_means): the energy-weighted gradients and their per-task
  means, or (None, None) when the dataset is empty.

  TODO(rbharath, joegomes): It is unclear whether this should be a Dataset
  function. Might get refactored out.
  TODO(rbharath, joegomes): If y_n were an exposed part of the API, this
  function could be entirely written in userspace.
  """
  if len(self) == 0:
    # Bug fix: the empty case previously returned four values while the
    # normal path returns two; keep the arity consistent for callers.
    return None, None
  df = self.metadata_df
  y = []
  y_n = []
  for _, row in df.iterrows():
    yy = load_from_disk(os.path.join(self.data_dir, row['y']))
    y.append(yy)
    yn = load_from_disk(os.path.join(self.data_dir, row['y_n']))
    y_n.append(np.array(yn))
  y = np.vstack(y)
  y_n = np.sum(y_n, axis=0)
  energy = y[:, 0]
  grad = y[:, 1:]
  # Weight each gradient row by its system's energy (vectorized form of
  # the original per-row loop; also replaces the Python-2-only xrange).
  grad *= energy[:, np.newaxis]
  ydely_means = np.sum(grad, axis=0) / y_n[1:]
  return grad, ydely_means
def __init__(self, feature_dir, dataset_files=None, overwrite=True,
             reload_data=False):
  """Initialize FeaturizedSamples.

  If feature_dir does not exist, must specify dataset_files. Then
  feature_dir is created and populated. If feature_dir exists (created by
  previous call to FeaturizedSamples), then dataset_files cannot be
  specified. If overwrite is set and dataset_files is provided, will
  overwrite old dataset_files with new.
  """
  self.dataset_files = dataset_files
  if not os.path.exists(feature_dir):
    os.makedirs(feature_dir)
  self.feature_dir = feature_dir
  # Reuse the cached compounds dataframe only when reload_data is set and a
  # cached copy exists; otherwise recompute and cache it on disk.
  if os.path.exists(self._get_compounds_filename()) and reload_data:
    compounds_df = load_from_disk(self._get_compounds_filename())
  else:
    compounds_df = self._get_compounds()
    # compounds_df is not altered by any method after initialization, so it's
    # safe to keep a copy in memory and on disk.
    save_to_disk(compounds_df, self._get_compounds_filename())
  _check_validity(compounds_df)
  self.compounds_df = compounds_df
  if os.path.exists(self._get_dataset_paths_filename()):
    if dataset_files is not None:
      if overwrite:
        save_to_disk(dataset_files, self._get_dataset_paths_filename())
      else:
        raise ValueError("Can't change dataset_files already stored on disk")
    # Always adopt whatever path list is now stored on disk.
    self.dataset_files = load_from_disk(self._get_dataset_paths_filename())
  else:
    save_to_disk(dataset_files, self._get_dataset_paths_filename())
def transform_row(self, i, df, data_dir):
  """Normalizes the data (X, y, w, ...) in a single row.

  Parameters
  ----------
  i: int
    Positional index of the row in `df` to transform.
  df: pd.DataFrame
    Metadata dataframe whose rows point at on-disk shards.
  data_dir: str
    Directory holding the shard files.
  """
  row = df.iloc[i]
  if self.transform_X:
    X = load_from_disk(os.path.join(data_dir, row['X-transformed']))
    X = np.nan_to_num((X - self.X_means) / self.X_stds)
    save_to_disk(X, os.path.join(data_dir, row['X-transformed']))
  if self.transform_y:
    y = load_from_disk(os.path.join(data_dir, row['y-transformed']))
    # Transform tasks as normal.
    y = np.nan_to_num((y - self.y_means) / self.y_stds)
    # Add 2nd order correction term to gradients.
    grad_var = 1 / self.y_stds[0] * (
        self.ydely_means - self.y_means[0] * self.y_means[1:])
    # Bug fix: the loop previously reused `i`, shadowing the row-index
    # parameter; use a distinct sample index.
    for n in range(y.shape[0]):
      y[n, 1:] = y[n, 1:] - grad_var * y[n, 0] / self.y_stds[0]
    save_to_disk(y, os.path.join(data_dir, row['y-transformed']))
def _transform_row(i, df, normalize_X, normalize_y, truncate_X, truncate_y,
                   log_X, log_y, X_means, X_stds, y_means, y_stds, trunc):
  """Transforms the data (X, y, w, ...) in a single row.

  Writes X-transformed, y-transformed to disk.
  """
  row = df.iloc[i]
  X = load_from_disk(row['X'])
  if normalize_X or log_X:
    if normalize_X:
      # Turns NaNs to zeros
      X = np.nan_to_num((X - X_means) / X_stds)
      if truncate_X:
        # Clamp into [-trunc, trunc].
        X[X > trunc] = trunc
        X[X < (-1.0*trunc)] = -1.0 * trunc
    if log_X:
      X = np.log(X)
    save_to_disk(X, row['X-transformed'])
  y = load_from_disk(row['y'])
  if normalize_y or log_y:
    if normalize_y:
      y = np.nan_to_num((y - y_means) / y_stds)
      if truncate_y:
        y[y > trunc] = trunc
        y[y < (-1.0*trunc)] = -1.0 * trunc
    if log_y:
      y = np.log(y)
    save_to_disk(y, row['y-transformed'])
def transform_row(self, i, df, data_dir):
  """Logarithmically transforms data in dataset.

  Selects features and tasks of interest for transformation: when
  `self.features` / `self.tasks` is None, every column is transformed;
  otherwise only the listed column indices are.
  """
  row = df.iloc[i]
  if self.transform_X:
    X = load_from_disk(os.path.join(data_dir, row['X-transformed']))
    num_features = len(X[0])
    if self.features is None:
      X = np.log(X + 1)
    else:
      # `range` replaces the Python-2-only `xrange`; the former no-op
      # `else: X[:,j] = X[:,j]` branch is dropped — unlisted columns are
      # simply left untouched.
      for j in range(num_features):
        if j in self.features:
          X[:, j] = np.log(X[:, j] + 1)
    save_to_disk(X, os.path.join(data_dir, row['X-transformed']))
  if self.transform_y:
    y = load_from_disk(os.path.join(data_dir, row['y-transformed']))
    num_tasks = len(y[0])
    if self.tasks is None:
      y = np.log(y + 1)
    else:
      for j in range(num_tasks):
        if j in self.tasks:
          y[:, j] = np.log(y[:, j] + 1)
    save_to_disk(y, os.path.join(data_dir, row['y-transformed']))
def _itershards(self):
  """Iterates over all shards in dataset, yielding (X, y, w, ids)."""
  columns = ('X-transformed', 'y-transformed', 'w', 'ids')
  for _, row in self.metadata_df.iterrows():
    yield tuple(load_from_disk(row[column]) for column in columns)
def iterate(dataset):
  """Yields one (X, y, w, ids) tuple per shard, loaded from disk."""
  for _, row in dataset.metadata_df.iterrows():
    def _arr(column, **kwargs):
      # Resolve the shard file relative to the dataset directory.
      path = os.path.join(dataset.data_dir, row[column])
      return np.array(load_from_disk(path), **kwargs)
    yield (_arr('X-transformed'), _arr('y-transformed'),
           _arr('w-transformed'), _arr('ids', dtype=object))
def get_shard(self, i):
  """Retrieves data for the i-th shard from disk."""
  row = self.metadata_df.iloc[i]
  def _arr(column, **kwargs):
    path = os.path.join(self.data_dir, row[column])
    return np.array(load_from_disk(path), **kwargs)
  return (_arr('X-transformed'), _arr('y-transformed'),
          _arr('w-transformed'), _arr('ids', dtype=object))
def transform_row(i, df):
  """Logarithmically transforms data in dataset.

  NOTE(review): this function reads `self` but does not take it as a
  parameter — presumably it is defined inside a method so `self` comes from
  the enclosing closure; confirm before moving or refactoring it.
  """
  row = df.iloc[i]
  if self.transform_X:
    X = load_from_disk(row['X-transformed'])
    X = np.log(X)
    save_to_disk(X, row['X-transformed'])
  if self.transform_y:
    y = load_from_disk(row['y-transformed'])
    y = np.log(y)
    save_to_disk(y, row['y-transformed'])
def iterate(dataset):
  """Yields (X, y, w, ids) per shard; w defaults to ones when absent."""
  for _, row in dataset.metadata_df.iterrows():
    def _path(column):
      return os.path.join(dataset.data_dir, row[column])
    X = np.array(load_from_disk(_path('X')))
    y = np.array(load_from_disk(_path('y')))
    w_filename = _path('w')
    # A missing weights file means uniform weights.
    w = (np.array(load_from_disk(w_filename))
         if os.path.exists(w_filename) else np.ones(y.shape))
    ids = np.array(load_from_disk(_path('ids')), dtype=object)
    yield (X, y, w, ids)
def transform_row(self, i, df, data_dir):
  """Reweight the labels for this data."""
  row = df.iloc[i]
  y = load_from_disk(os.path.join(data_dir, row['y-transformed']))
  w = load_from_disk(os.path.join(data_dir, row['w-transformed']))
  w_balanced = np.zeros_like(w)
  for ind, task in enumerate(self.dataset.get_task_names()):
    task_y, task_w = y[:, ind], w[:, ind]
    # Only re-weight datapoints whose existing weight is nonzero.
    active = task_w != 0
    neg_weight, pos_weight = self.weights[ind][0], self.weights[ind][1]
    w_balanced[np.logical_and(task_y == 0, active), ind] = neg_weight
    w_balanced[np.logical_and(task_y == 1, active), ind] = pos_weight
  save_to_disk(w_balanced, os.path.join(data_dir, row['w-transformed']))
def transform_row(self, i, df):
  """Clips outliers for the data (X, y, w, ...) in a single row.

  Values are clamped into [-max_val, max_val].
  """
  row = df.iloc[i]
  if self.transform_X:
    X = load_from_disk(row['X-transformed'])
    X[X > self.max_val] = self.max_val
    X[X < (-1.0 * self.max_val)] = -1.0 * self.max_val
    save_to_disk(X, row['X-transformed'])
  if self.transform_y:
    y = load_from_disk(row['y-transformed'])
    # Bug fix: the y branch referenced an undefined name `trunc`
    # (NameError at runtime); clip with the same bound used for X.
    y[y > self.max_val] = self.max_val
    y[y < (-1.0 * self.max_val)] = -1.0 * self.max_val
    save_to_disk(y, row['y-transformed'])
def transform_row(self, i, df):
  """Normalizes the data (X, y, w, ...) in a single row."""
  row = df.iloc[i]
  # Gather (column, means, stds) triples for whichever arrays we transform.
  targets = []
  if self.transform_X:
    targets.append(('X-transformed', self.X_means, self.X_stds))
  if self.transform_y:
    targets.append(('y-transformed', self.y_means, self.y_stds))
  for column, means, stds in targets:
    data = load_from_disk(row[column])
    # NaNs produced by the normalization become zeros.
    data = np.nan_to_num((data - means) / stds)
    save_to_disk(data, row[column])
def get_shard(self, i):
  """Retrieves data for the i-th shard from disk."""
  row = self.metadata_df.iloc[i]
  def _load(column):
    return load_from_disk(os.path.join(self.data_dir, row[column]))
  X = np.array(_load('X'))
  y = np.array(_load('y'))
  w_filename = os.path.join(self.data_dir, row['w'])
  if os.path.exists(w_filename):
    w = np.array(load_from_disk(w_filename))
  else:
    # Missing weights default to uniform ones.
    w = np.ones(y.shape)
  ids = np.array(_load('ids'), dtype=object)
  return (X, y, w, ids)
def get_shard_size(self):
  """Gets size of shards on disk."""
  if len(self.metadata_df) == 0:
    raise ValueError("No data in dataset.")
  # The first shard's label array determines the shard size.
  first_row = next(self.metadata_df.iterrows())[1]
  sample_y = load_from_disk(os.path.join(self.data_dir, first_row['y']))
  return len(sample_y)
def __init__(self, shard_generator=[], data_dir=None, tasks=[],
             reload=False, verbose=True):
  """Turns featurized dataframes into numpy files, writes them & metadata to disk.

  Parameters
  ----------
  shard_generator: iterable
    Yields (X, y, w, ids) tuples, one per shard.
  data_dir: str, optional
    Directory to write shards into; a temp dir is created when None.
  tasks: list
    Task names for this dataset.
  reload: bool
    When True, load pre-existing metadata instead of writing shards.
  verbose: bool
    Controls logging.
  """
  if data_dir is not None:
    if not os.path.exists(data_dir):
      os.makedirs(data_dir)
  else:
    data_dir = tempfile.mkdtemp()
  self.data_dir = data_dir
  self.verbose = verbose
  if reload:
    log("Loading pre-existing dataset.", self.verbose)
    if os.path.exists(self._get_metadata_filename()):
      (self.tasks, self.metadata_df) = load_from_disk(
          self._get_metadata_filename())
    else:
      raise ValueError("No metadata found.")
    return
  metadata_rows = []
  time1 = time.time()
  for shard_num, (X, y, w, ids) in enumerate(shard_generator):
    basename = "shard-%d" % shard_num
    metadata_rows.append(
        DiskDataset.write_data_to_disk(
            self.data_dir, basename, tasks, X, y, w, ids))
  self.tasks = tasks
  self.metadata_df = DiskDataset.construct_metadata(metadata_rows)
  self.save_to_disk()
  time2 = time.time()
  # Bug fix: this was print(msg, self.verbose), which unconditionally
  # printed the verbose flag as a second argument instead of honoring it;
  # route through log() like the rest of this class.
  log("TIMING: dataset construction took %0.3f s" % (time2 - time1),
      self.verbose)
def get_data_shape(self):
  """Gets array shape of datapoints in this dataset."""
  if not len(self.metadata_df):
    raise ValueError("No data in dataset.")
  # Bug fix: iterator.next() is Python-2-only; use the builtin next(),
  # matching the other get_data_shape implementation in this file.
  sample_X = load_from_disk(next(self.metadata_df.iterrows())[1]['X'])[0]
  return np.shape(sample_X)
def __init__(self, data_dir, samples=None, feature_types=None):
  """Turns featurized dataframes into numpy files, writes them & metadata to disk.

  When both `samples` and `feature_types` are given, featurized data is
  written to `data_dir`; otherwise a previously written dataset is reloaded
  from `data_dir`.
  """
  if not os.path.exists(data_dir):
    os.makedirs(data_dir)
  self.data_dir = data_dir
  if samples is not None and feature_types is not None:
    if not isinstance(feature_types, list):
      raise ValueError("feature_types must be a list or None.")
    write_dataset_single_partial = partial(
        write_dataset_single, data_dir=self.data_dir,
        feature_types=feature_types)
    metadata_rows = []
    # TODO(rbharath): Still a bit of information leakage.
    for df_file, df in zip(samples.dataset_files, samples.itersamples()):
      retval = write_dataset_single_partial((df_file, df))
      if retval is not None:
        metadata_rows.append(retval)
    # TODO(rbharath): FeaturizedSamples should not be responsible for
    # X-transform, X_sums, etc. Move that stuff over to Dataset.
    self.metadata_df = pd.DataFrame(
        metadata_rows,
        columns=('df_file', 'task_names', 'ids', 'X', 'X-transformed',
                 'y', 'y-transformed', 'w', 'X_sums', 'X_sum_squares',
                 'X_n', 'y_sums', 'y_sum_squares', 'y_n'))
    save_to_disk(self.metadata_df, self._get_metadata_filename())
    # input/output transforms not specified yet, so
    # self.transforms = (input_transforms, output_transforms) =>
    self.transforms = ([], [])
    save_to_disk(self.transforms, self._get_transforms_filename())
  else:
    # No samples provided: reload a previously written dataset.
    if os.path.exists(self._get_metadata_filename()):
      self.metadata_df = load_from_disk(self._get_metadata_filename())
      self.transforms = load_from_disk(self._get_transforms_filename())
    else:
      raise ValueError("No metadata found.")
def __len__(self):
  """Finds number of elements in dataset."""
  # Sum the length of each shard's label array.
  return sum(
      len(load_from_disk(os.path.join(self.data_dir, row['y-transformed'])))
      for _, row in self.metadata_df.iterrows())
def itershards(self):
  """Iterates over all shards in dataset.

  Datasets are stored in sharded fashion on disk. Each call to next() for
  the generator defined by this function returns the data from a
  particular shard. The order of shards returned is guaranteed to remain
  fixed.
  """
  for _, row in self.metadata_df.iterrows():
    def _arr(column, **kwargs):
      path = os.path.join(self.data_dir, row[column])
      return np.array(load_from_disk(path), **kwargs)
    yield (_arr('X-transformed'), _arr('y-transformed'),
           _arr('w-transformed'), _arr('ids', dtype=object))
def __init__(self, samples_dir, featurizers, dataset_files=None,
             reload=False, verbosity=None):
  """Initialize FeaturizedSamples.

  If samples_dir does not exist, must specify dataset_files. Then
  samples_dir is created and populated. If samples_dir exists (created by
  previous call to FeaturizedSamples), then dataset_files cannot be
  specified. If reload is False and dataset_files is provided, will
  overwrite old dataset_files with new.
  """
  assert verbosity in [None, "low", "high"]
  self.verbosity = verbosity
  self.dataset_files = dataset_files
  # Feature types: user-specified features plus one entry per featurizer
  # class name.
  self.feature_types = (
      ["user-specified-features"] +
      [featurizer.__class__.__name__ for featurizer in featurizers])
  self.featurizers = featurizers
  if not os.path.exists(samples_dir):
    os.makedirs(samples_dir)
  self.samples_dir = samples_dir
  if os.path.exists(self._get_dataset_paths_filename()):
    if dataset_files is not None:
      if not reload:
        save_to_disk(dataset_files, self._get_dataset_paths_filename())
      else:
        raise ValueError("Can't change dataset_files already stored on disk")
  else:
    save_to_disk(dataset_files, self._get_dataset_paths_filename())
  # Adopt whatever path list is now stored on disk.
  self.dataset_files = load_from_disk(self._get_dataset_paths_filename())
  if os.path.exists(self._get_compounds_filename()) and reload:
    compounds_df = load_from_disk(self._get_compounds_filename())
  else:
    compounds_df = self._get_compounds()
    # compounds_df is not altered by any method after initialization, so it's
    # safe to keep a copy in memory and on disk.
    save_to_disk(compounds_df, self._get_compounds_filename())
  _check_validity(compounds_df)
  self.compounds_df = compounds_df
  self.num_samples = len(compounds_df)
def update_mean_and_std(df):
  """Recompute and persist per-shard sums/sum-squares of X and y.

  These on-disk statistics are later combined to derive means and stds.
  (The unused `X_transform`/`y_transform` accumulator lists from the
  original implementation were removed.)
  """
  for _, row in df.iterrows():
    Xt = load_from_disk(row['X-transformed'])
    save_to_disk(np.sum(Xt, axis=0), row['X_sums'])
    save_to_disk(np.sum(np.square(Xt), axis=0), row['X_sum_squares'])
  for _, row in df.iterrows():
    yt = load_from_disk(row['y-transformed'])
    save_to_disk(np.sum(yt, axis=0), row['y_sums'])
    save_to_disk(np.sum(np.square(yt), axis=0), row['y_sum_squares'])
def get_data_shape(self):
  """Gets array shape of datapoints in this dataset."""
  if len(self.metadata_df) == 0:
    raise ValueError("No data in dataset.")
  first_row = next(self.metadata_df.iterrows())[1]
  shard_X = load_from_disk(
      os.path.join(self.data_dir, first_row['X-transformed']))
  # Shape of a single datapoint, not of the whole shard.
  return np.shape(shard_X[0])
def itersamples(self):
  """Iterates over samples in this object, yielding one row at a time."""
  compound_ids = set(self.compounds_df["mol_id"])
  for df_file in self.dataset_files:
    df = load_from_disk(df_file)
    # Yield only rows whose compound is known to this sample set.
    for ind, row in df.iterrows():
      if row["mol_id"] in compound_ids:
        yield df.loc[ind]
def load_muv(base_dir, reload=True): """Load MUV datasets. Does not do train/test split""" # Set some global variables up top reload = True verbosity = "high" model = "logistic" regen = False # Create some directories for analysis # The base_dir holds the results of all analysis if not reload: if os.path.exists(base_dir): shutil.rmtree(base_dir) if not os.path.exists(base_dir): os.makedirs(base_dir) current_dir = os.path.dirname(os.path.realpath(__file__)) #Make directories to store the raw and featurized datasets. data_dir = os.path.join(base_dir, "dataset") # Load MUV dataset print("About to load MUV dataset.") dataset_file = os.path.join( current_dir, "../../datasets/muv.csv.gz") dataset = load_from_disk(dataset_file) print("Columns of dataset: %s" % str(dataset.columns.values)) print("Number of examples in dataset: %s" % str(dataset.shape[0])) # Featurize MUV dataset print("About to featurize MUV dataset.") featurizer = CircularFingerprint(size=1024) all_MUV_tasks = sorted(['MUV-692', 'MUV-689', 'MUV-846', 'MUV-859', 'MUV-644', 'MUV-548', 'MUV-852', 'MUV-600', 'MUV-810', 'MUV-712', 'MUV-737', 'MUV-858', 'MUV-713', 'MUV-733', 'MUV-652', 'MUV-466', 'MUV-832']) loader = DataLoader(tasks=all_MUV_tasks, smiles_field="smiles", featurizer=featurizer, verbosity=verbosity) if not reload or not os.path.exists(data_dir): dataset = loader.featurize(dataset_file, data_dir) regen = True else: dataset = Dataset(data_dir, reload=True) # Initialize transformers transformers = [ BalancingTransformer(transform_w=True, dataset=dataset)] if regen: print("About to transform data") for transformer in transformers: transformer.transform(dataset) return all_MUV_tasks, dataset, transformers
def iterate(dataset):
  """Yields (X, y, w, ids) per shard; y and w may be None when unlabelled."""
  for _, row in dataset.metadata_df.iterrows():
    X = np.array(load_from_disk(os.path.join(dataset.data_dir, row['X'])))
    ids = np.array(
        load_from_disk(os.path.join(dataset.data_dir, row['ids'])),
        dtype=object)
    # These columns may be missing if the dataset is unlabelled.
    if row['y'] is not None:
      y = np.array(load_from_disk(os.path.join(dataset.data_dir, row['y'])))
    else:
      y = None
    if row['w'] is not None:
      w_filename = os.path.join(dataset.data_dir, row['w'])
      if os.path.exists(w_filename):
        w = np.array(load_from_disk(w_filename))
      else:
        # Default to uniform weights when the file is absent.
        w = np.ones(y.shape)
    else:
      w = None
    yield (X, y, w, ids)
def load_tox21(base_dir, reload=True): """Load Tox21 datasets. Does not do train/test split""" # Set some global variables up top reload = True verbosity = "high" model = "logistic" # Create some directories for analysis # The base_dir holds the results of all analysis if not reload: if os.path.exists(base_dir): shutil.rmtree(base_dir) if not os.path.exists(base_dir): os.makedirs(base_dir) current_dir = os.path.dirname(os.path.realpath(__file__)) #Make directories to store the raw and featurized datasets. samples_dir = os.path.join(base_dir, "samples") data_dir = os.path.join(base_dir, "dataset") # Load Tox21 dataset print("About to load Tox21 dataset.") dataset_file = os.path.join( current_dir, "../../datasets/tox21.csv.gz") dataset = load_from_disk(dataset_file) print("Columns of dataset: %s" % str(dataset.columns.values)) print("Number of examples in dataset: %s" % str(dataset.shape[0])) # Featurize Tox21 dataset print("About to featurize Tox21 dataset.") featurizer = CircularFingerprint(size=1024) all_tox21_tasks = ['NR-AR', 'NR-AR-LBD', 'NR-AhR', 'NR-Aromatase', 'NR-ER', 'NR-ER-LBD', 'NR-PPAR-gamma', 'SR-ARE', 'SR-ATAD5', 'SR-HSE', 'SR-MMP', 'SR-p53'] if not reload or not os.path.exists(data_dir): loader = DataLoader(tasks=all_tox21_tasks, smiles_field="smiles", featurizer=featurizer, verbosity=verbosity) dataset = loader.featurize( dataset_file, data_dir, shard_size=8192) else: dataset = Dataset(data_dir, all_tox21_tasks, reload=True) # Initialize transformers transformers = [ BalancingTransformer(transform_w=True, dataset=dataset)] if not reload: print("About to transform data") for transformer in transformers: transformer.transform(dataset) return all_tox21_tasks, dataset, transformers
def _update_mean_and_std(self, df, X_stats, y_stats):
  """Recompute and persist per-shard sums/sum-squares of X and/or y.

  Parameters
  ----------
  df: pd.DataFrame
    Metadata dataframe pointing at the shards on disk.
  X_stats, y_stats: bool
    Which statistics to recompute.

  (The unused `X_transform`/`y_transform` accumulator lists from the
  original implementation were removed.)
  """
  if X_stats:
    for _, row in df.iterrows():
      Xt = load_from_disk(os.path.join(self.data_dir, row['X-transformed']))
      save_to_disk(np.sum(Xt, axis=0),
                   os.path.join(self.data_dir, row['X_sums']))
      save_to_disk(np.sum(np.square(Xt), axis=0),
                   os.path.join(self.data_dir, row['X_sum_squares']))
  if y_stats:
    for _, row in df.iterrows():
      yt = load_from_disk(os.path.join(self.data_dir, row['y-transformed']))
      save_to_disk(np.sum(yt, axis=0),
                   os.path.join(self.data_dir, row['y_sums']))
      save_to_disk(np.sum(np.square(yt), axis=0),
                   os.path.join(self.data_dir, row['y_sum_squares']))
def __init__(self, data_dir, verbose=True):
  """Turns featurized dataframes into numpy files, writes them & metadata to disk."""
  self.data_dir = data_dir
  self.verbose = verbose
  log("Loading dataset from disk.", self.verbose)
  metadata_filename = self._get_metadata_filename()
  if not os.path.exists(metadata_filename):
    raise ValueError("No metadata found on disk.")
  # Metadata stores the task list alongside the shard dataframe.
  (self.tasks, self.metadata_df) = load_from_disk(metadata_filename)
def iterdataframes(self):
  """Provides a bulk iterator over data.

  Each sample from the iterator is a dataframe of samples.
  """
  compound_ids = set(self.compounds_df["mol_id"])
  for df_file in self.dataset_files:
    df = load_from_disk(df_file)
    # Keep only rows whose compound belongs to this sample set.
    keep = [ind for ind, row in df.iterrows()
            if row["mol_id"] in compound_ids]
    yield df.loc[keep]
def iterate(dataset):
  """Yields (X, y, w, ids) per shard; y and w may be None when unlabelled."""
  for _, row in dataset.metadata_df.iterrows():
    X = np.array(load_from_disk(os.path.join(dataset.data_dir, row['X'])))
    ids = np.array(
        load_from_disk(os.path.join(dataset.data_dir, row['ids'])),
        dtype=object)
    # These columns may be missing if the dataset is unlabelled.
    if row['y'] is not None:
      y = np.array(load_from_disk(os.path.join(dataset.data_dir, row['y'])))
    else:
      y = None
    if row['w'] is not None:
      w_filename = os.path.join(dataset.data_dir, row['w'])
      if os.path.exists(w_filename):
        w = np.array(load_from_disk(w_filename))
      else:
        # Default weights: uniform float32 ones shaped to match y.
        if len(y.shape) == 1:
          w = np.ones(y.shape[0], np.float32)
        else:
          w = np.ones((y.shape[0], 1), np.float32)
    else:
      w = None
    yield (X, y, w, ids)
def get_shard(self, i):
  """Retrieves data for the i-th shard from disk."""
  row = self.metadata_df.iloc[i]
  def _load(column):
    return load_from_disk(os.path.join(self.data_dir, row[column]))
  X = np.array(_load('X'))
  # y and w metadata entries may be None for unlabelled datasets.
  y = np.array(_load('y')) if row['y'] is not None else None
  if row['w'] is not None:
    # TODO (ytz): Under what condition does this exist but the file itself doesn't?
    w_filename = os.path.join(self.data_dir, row['w'])
    if os.path.exists(w_filename):
      w = np.array(load_from_disk(w_filename))
    else:
      w = np.ones(y.shape)
  else:
    w = None
  ids = np.array(_load('ids'), dtype=object)
  return (X, y, w, ids)
def itersamples(self):
  """Provides an iterator over samples.

  Each sample from the iterator is a dataframe of samples.
  """
  compound_ids = set(self.compounds_df["mol_id"])
  for df_file in self.dataset_files:
    df = load_from_disk(df_file)
    keep = [ind for ind, row in df.iterrows()
            if row["mol_id"] in compound_ids]
    yield df.loc[keep]
def _run_muv_experiment(self, dataset_file, reload=False, verbosity=None): """Loads or reloads a small version of MUV dataset.""" # Load MUV dataset raw_dataset = load_from_disk(dataset_file) print("Number of examples in dataset: %s" % str(raw_dataset.shape[0])) print("About to featurize compounds") featurizer = CircularFingerprint(size=1024) MUV_tasks = [ 'MUV-692', 'MUV-689', 'MUV-846', 'MUV-859', 'MUV-644', 'MUV-548', 'MUV-852', 'MUV-600', 'MUV-810', 'MUV-712', 'MUV-737', 'MUV-858', 'MUV-713', 'MUV-733', 'MUV-652', 'MUV-466', 'MUV-832' ] loader = DataLoader(tasks=MUV_tasks, smiles_field="smiles", featurizer=featurizer, verbosity=verbosity) dataset = loader.featurize(dataset_file, self.data_dir) assert len(dataset) == len(raw_dataset) print("About to split compounds into train/valid/test") splitter = ScaffoldSplitter(verbosity=verbosity) frac_train, frac_valid, frac_test = .8, .1, .1 train_dataset, valid_dataset, test_dataset = \ splitter.train_valid_test_split( dataset, self.train_dir, self.valid_dir, self.test_dir, log_every_n=1000, frac_train=frac_train, frac_test=frac_test, frac_valid=frac_valid) # Do an approximate comparison since splits are sometimes slightly off from # the exact fraction. assert relative_difference(len(train_dataset), frac_train * len(dataset)) < 1e-3 assert relative_difference(len(valid_dataset), frac_valid * len(dataset)) < 1e-3 assert relative_difference(len(test_dataset), frac_test * len(dataset)) < 1e-3 # TODO(rbharath): Transformers don't play nice with reload! Namely, # reloading will cause the transform to be reapplied. This is undesirable in # almost all cases. Need to understand a method to fix this. transformers = [ BalancingTransformer(transform_w=True, dataset=train_dataset) ] print("Transforming datasets") for dataset in [train_dataset, valid_dataset, test_dataset]: for transformer in transformers: transformer.transform(dataset) return (len(train_dataset), len(valid_dataset), len(test_dataset))
def _get_fields(input_file):
  """Get the names of fields and field_types for input data.

  Returns the column labels of `input_file`; the format is detected from
  the file's extension.
  """
  # If CSV input, assume that first row contains labels
  input_type = _get_input_type(input_file)
  if input_type == "csv":
    with open(input_file, "rb") as inp_file_obj:
      # Bug fix: reader.next() is Python-2-only; the builtin next() works
      # on both Python 2 and 3.
      return next(csv.reader(inp_file_obj))
  elif input_type == "pandas-joblib":
    df = load_from_disk(input_file)
    return df.keys()
  elif input_type == "pandas-pickle":
    df = load_pickle_from_disk(input_file)
    return df.keys()
  else:
    raise ValueError("Unrecognized extension for %s" % input_file)
def transform_row(self, i, df, data_dir):
  """Randomly permute a Coulomb Matrix in a dataset."""
  row = df.iloc[i]
  if self.transform_X:
    path = os.path.join(data_dir, row['X-transformed'])
    X = load_from_disk(path)
    # Rebuild each matrix from its upper triangle, permute, and flatten.
    for idx in range(len(X)):
      cm = self.construct_cm_from_triu(X[idx])
      X[idx] = self.unpad_randomize_and_flatten(cm)
    save_to_disk(X, path)
  if self.transform_y:
    print("y will not be transformed by "
          "CoulombRandomizationTransformer.")
def transform(self, dataset, parallel=False):
  """Binarize, then standardize, all X shards of `dataset` in place.

  Delegates binarization to the parent class, then rescales the binarized
  features to zero mean and unit (global, scalar) std.
  """
  super(CoulombBinarizationTransformer, self).transform(
      dataset, parallel=parallel)
  df = dataset.metadata_df
  Xt = []
  for _, row in df.iterrows():
    X_t = load_from_disk(os.path.join(dataset.data_dir, row['X-transformed']))
    Xt.append(np.array(X_t))
  X = np.vstack(Xt)
  X_means = X.mean(axis=0)
  # NOTE: .std() with no axis is a single scalar over all entries.
  X_stds = (X - X_means).std()
  # Bug fix: use the positional shard index. The previous code indexed the
  # Python list `Xt` with the dataframe's index label from iterrows(),
  # which breaks for any metadata dataframe without a default RangeIndex.
  for pos, (_, row) in enumerate(df.iterrows()):
    X_t = (Xt[pos] - X_means) / X_stds
    save_to_disk(X_t, os.path.join(dataset.data_dir, row['X-transformed']))
def _get_compounds(self):
  """Create dataframe containing metadata about compounds."""
  rows = []
  for dataset_file in self.dataset_files:
    df = load_from_disk(dataset_file)
    mol_ids = list(df["mol_id"])
    smiles = list(df["smiles"])
    # Use the file's split column when present; otherwise mark unknown.
    if "split" in df.keys():
      splits = list(df["split"])
    else:
      splits = [None] * len(smiles)
    rows.extend(list(elt) for elt in zip(mol_ids, smiles, splits))
  return pd.DataFrame(rows, columns=("mol_id", "smiles", "split"))
def load_metadata(self):
  """Load dataset metadata, migrating the obsolete joblib format if found.

  Returns
  -------
  (tasks, metadata_df)

  Raises
  ------
  ValueError
    When no metadata exists on disk in either format.
  """
  try:
    tasks_filename, metadata_filename = self._get_metadata_filename()
    with open(tasks_filename) as fin:
      tasks = json.load(fin)
    metadata_df = pd.read_csv(metadata_filename, compression='gzip')
    metadata_df = metadata_df.where((pd.notnull(metadata_df)), None)
    return tasks, metadata_df
  except Exception:
    # Deliberate best-effort: any failure reading the new format falls
    # through to the legacy joblib loader below. (The unused `as e`
    # binding was removed.)
    pass
  # Load obsolete format -> save in new format
  metadata_filename = os.path.join(self.data_dir, "metadata.joblib")
  if os.path.exists(metadata_filename):
    tasks, metadata_df = load_from_disk(metadata_filename)
    # These columns existed only in the obsolete schema.
    del metadata_df['task_names']
    del metadata_df['basename']
    save_metadata(tasks, metadata_df, self.data_dir)
    return tasks, metadata_df
  raise ValueError("No Metadata Found On Disk")
def transform_row(self, i, df, data_dir):
  """Binarizes data in dataset with sigmoid function."""
  row = df.iloc[i]
  X_bin = []
  if self.update_state:
    self.set_max(df, data_dir)
    self.update_state = False
  if self.transform_X:
    X = load_from_disk(os.path.join(data_dir, row['X-transformed']))
    # Bug fix: the feature loop previously reused `i`, shadowing the
    # row-index parameter; use a distinct feature index.
    for feat in range(X.shape[1]):
      for k in np.arange(0, self.feature_max[feat] + self.theta, self.theta):
        # Soft (tanh) threshold at each cut point k.
        X_bin += [np.tanh((X[:, feat] - k) / self.theta)]
    X_bin = np.array(X_bin).T
    save_to_disk(X_bin, os.path.join(data_dir, row['X-transformed']))
  if self.transform_y:
    print("y will not be transformed by "
          "CoulombBinarizationTransformer.")
def __init__(self, data_dir=None, tasks=[], metadata_rows=None,
             #featurizers=None,
             raw_data=None, verbosity=None, reload=False,
             compute_feature_statistics=True):
  """Turns featurized dataframes into numpy files, writes them & metadata to disk.

  NOTE(review): data_dir defaults to None, but os.path.exists(None) raises —
  presumably callers always pass a path; confirm before relying on the
  default.
  """
  if not os.path.exists(data_dir):
    os.makedirs(data_dir)
  self.data_dir = data_dir
  assert verbosity in [None, "low", "high"]
  self.verbosity = verbosity
  if not reload or not os.path.exists(self._get_metadata_filename()):
    if metadata_rows is not None:
      # Caller supplied pre-built metadata rows directly.
      self.metadata_df = DiskDataset.construct_metadata(metadata_rows)
      self.save_to_disk()
    elif raw_data is not None:
      # Caller supplied raw arrays: write a single shard named "data".
      metadata_rows = []
      ids, X, y, w = raw_data
      metadata_rows.append(
          DiskDataset.write_data_to_disk(
              self.data_dir, "data", tasks, X, y, w, ids,
              compute_feature_statistics=compute_feature_statistics))
      self.metadata_df = DiskDataset.construct_metadata(metadata_rows)
      self.save_to_disk()
    else:
      # Create an empty metadata dataframe to be filled at a later time
      basename = "metadata"
      metadata_rows = [DiskDataset.write_data_to_disk(
          self.data_dir, basename, tasks)]
      self.metadata_df = DiskDataset.construct_metadata(metadata_rows)
      self.save_to_disk()
  else:
    log("Loading pre-existing metadata file.", self.verbosity)
    if os.path.exists(self._get_metadata_filename()):
      self.metadata_df = load_from_disk(self._get_metadata_filename())
    else:
      raise ValueError("No metadata found.")
def reload(self):
  """Loads sklearn model from joblib file on disk."""
  model_filename = Model.get_model_filename(self.model_dir)
  self.model_instance = load_from_disk(model_filename)
# Top-level Tox21 analysis script.
# NOTE(review): this snippet appears truncated — the final
# Dataset.from_numpy(...) call is cut off mid-argument-list; locate the
# continuation before editing.
base_dir = "/scratch/users/rbharath/tox21_analysis"
if not os.path.exists(base_dir):
  os.makedirs(base_dir)
current_dir = os.path.dirname(os.path.realpath(__file__))
#Make directories to store the raw and featurized datasets.
data_dir = os.path.join(base_dir, "dataset")
train_dir = os.path.join(base_dir, "train_dataset")
valid_dir = os.path.join(base_dir, "valid_dataset")
test_dir = os.path.join(base_dir, "test_dataset")
model_dir = os.path.join(base_dir, "model")
# Load Tox21 dataset
print("About to load Tox21 dataset.")
dataset_file = os.path.join(current_dir, "../../datasets/tox21.csv.gz")
dataset = load_from_disk(dataset_file)
print("Columns of dataset: %s" % str(dataset.columns.values))
print("Number of examples in dataset: %s" % str(dataset.shape[0]))
# NOTE(review): `reload` is not defined anywhere above in this snippet —
# presumably set earlier in the full script; confirm.
tox21_tasks, tox21_dataset, transformers = load_tox21(data_dir, reload=reload)
num_train = 7200
X, y, w, ids = tox21_dataset.to_numpy()
X_train, X_valid = X[:num_train], X[num_train:]
y_train, y_valid = y[:num_train], y[num_train:]
w_train, w_valid = w[:num_train], w[num_train:]
ids_train, ids_valid = ids[:num_train], ids[num_train:]
# Not sure if we need to constantly delete these directories...
if os.path.exists(train_dir):
  shutil.rmtree(train_dir)
train_dataset = Dataset.from_numpy(train_dir,
def load_bace(mode="regression", transform=True, split="20-80"):
  """Load BACE-1 dataset as regression/classification problem.

  Returns (tasks, train, valid, test, crystal datasets, output transformers).
  """
  # NOTE(review): `reload` is hard-coded True, so the rmtree-style "fresh
  # start" branches of this family of loaders never run — confirm intent.
  reload = True
  verbosity = "high"
  regen = False
  assert split in ["20-80", "80-20"]
  current_dir = os.path.dirname(os.path.realpath(__file__))
  if split == "20-80":
    dataset_file = os.path.join(current_dir,
                                "../../datasets/desc_canvas_aug30.csv")
  elif split == "80-20":
    dataset_file = os.path.join(current_dir,
                                "../../datasets/rev8020split_desc.csv")
  dataset = load_from_disk(dataset_file)
  num_display = 10
  pretty_columns = ("[" + ",".join(
      ["'%s'" % column for column in dataset.columns.values[:num_display]]) +
      ",...]")
  crystal_dataset_file = os.path.join(
      current_dir, "../../datasets/crystal_desc_canvas_aug30.csv")
  crystal_dataset = load_from_disk(crystal_dataset_file)
  print("Columns of dataset: %s" % pretty_columns)
  print("Number of examples in dataset: %s" % str(dataset.shape[0]))
  print("Number of examples in crystal dataset: %s" %
        str(crystal_dataset.shape[0]))
  #Make directories to store the raw and featurized datasets.
  base_dir = tempfile.mkdtemp()
  data_dir = os.path.join(base_dir, "dataset")
  train_dir = os.path.join(base_dir, "train_dataset")
  valid_dir = os.path.join(base_dir, "valid_dataset")
  test_dir = os.path.join(base_dir, "test_dataset")
  model_dir = os.path.join(base_dir, "model")
  crystal_dir = os.path.join(base_dir, "crystal")
  # Task selection depends on whether we solve regression or classification.
  if mode == "regression":
    bace_tasks = ["pIC50"]
  elif mode == "classification":
    bace_tasks = ["Class"]
  else:
    raise ValueError("Unknown mode %s" % mode)
  featurizer = UserDefinedFeaturizer(user_specified_features)
  loader = DataLoader(tasks=bace_tasks, smiles_field="mol",
                      id_field="CID", featurizer=featurizer)
  if not reload or not os.path.exists(data_dir):
    dataset = loader.featurize(dataset_file, data_dir)
    regen = True
  else:
    dataset = Dataset(data_dir, reload=True)
  if not reload or not os.path.exists(crystal_dir):
    crystal_dataset = loader.featurize(crystal_dataset_file, crystal_dir)
  else:
    crystal_dataset = Dataset(crystal_dir, reload=True)
  if (not reload or not os.path.exists(train_dir)
      or not os.path.exists(valid_dir)
      or not os.path.exists(test_dir)):
    regen = True
    splitter = SpecifiedSplitter(dataset_file, "Model", verbosity=verbosity)
    train_dataset, valid_dataset, test_dataset = \
        splitter.train_valid_test_split(
            dataset, train_dir, valid_dir, test_dir)
  else:
    train_dataset = Dataset(train_dir, reload=True)
    valid_dataset = Dataset(valid_dir, reload=True)
    test_dataset = Dataset(test_dir, reload=True)
  #NOTE THE RENAMING:
  if split == "20-80":
    valid_dataset, test_dataset = test_dataset, valid_dataset
  print("Number of compounds in train set")
  print(len(train_dataset))
  print("Number of compounds in validation set")
  print(len(valid_dataset))
  print("Number of compounds in test set")
  print(len(test_dataset))
  print("Number of compounds in crystal set")
  print(len(crystal_dataset))
  if transform and regen:
    input_transformers = [
        NormalizationTransformer(transform_X=True, dataset=train_dataset),
        ClippingTransformer(transform_X=True, dataset=train_dataset)
    ]
    output_transformers = []
    if mode == "regression":
      output_transformers = [
          NormalizationTransformer(transform_y=True, dataset=train_dataset)
      ]
    else:
      output_transformers = []
  else:
    input_transformers, output_transformers = [], []
  transformers = input_transformers + output_transformers
  # NOTE(review): this loop rebinds the local name `dataset` and assigns
  # the transform result back to the loop variable — the assignment is
  # discarded each iteration; confirm whether transform() mutates in place.
  for dataset in [
      train_dataset, valid_dataset, test_dataset, crystal_dataset
  ]:
    for transformer in transformers:
      dataset = transformer.transform(dataset)
  return (bace_tasks, train_dataset, valid_dataset, test_dataset,
          crystal_dataset, output_transformers)
def load_sweet(base_dir, reload=True, frac_train=.8):
  """Load SWEET datasets and split into train/validation sets.

  The split is a simple head/tail slice at ``frac_train`` (no shuffling),
  keeping only the first 17 tasks.

  Parameters
  ----------
  base_dir: str
    Directory holding the featurized dataset and the two split datasets.
  reload: bool
    NOTE: effectively ignored -- it is overwritten to True immediately
    below, so an existing featurization on disk is always reused.
  frac_train: float
    Fraction of examples placed in the training split.

  Returns
  -------
  (tasks, (train_dataset, valid_dataset), transformers)
  """
  # Set some global variables up top
  reload = True
  verbosity = "high"
  model = "logistic"
  regen = False

  # Create some directories for analysis
  # The base_dir holds the results of all analysis
  if not reload:
    if os.path.exists(base_dir):
      shutil.rmtree(base_dir)
  if not os.path.exists(base_dir):
    os.makedirs(base_dir)
  current_dir = os.path.dirname(os.path.realpath(__file__))

  # Make directories to store the raw and featurized datasets.
  data_dir = os.path.join(base_dir, "dataset")
  train_dir = os.path.join(base_dir, "train_dataset")
  valid_dir = os.path.join(base_dir, "valid_dataset")

  # Load SWEET dataset
  print("About to load SWEET dataset.")
  dataset_file = os.path.join(current_dir, "./sweet.csv.gz")
  dataset = load_from_disk(dataset_file)
  print("Columns of dataset: %s" % str(dataset.columns.values))
  print("Number of examples in dataset: %s" % str(dataset.shape[0]))

  # Featurize SWEET dataset
  print("About to featurize SWEET dataset.")
  featurizer = CircularFingerprint(size=1024)
  SWEET_tasks = dataset.columns.values[1:].tolist()
  loader = DataLoader(tasks=SWEET_tasks,
                      smiles_field="smiles",
                      featurizer=featurizer,
                      verbosity=verbosity)
  if not reload or not os.path.exists(data_dir):
    dataset = loader.featurize(dataset_file, data_dir)
    regen = True
  else:
    dataset = DiskDataset(data_dir, reload=True)

  # Initialize transformers
  transformers = [BalancingTransformer(transform_w=True, dataset=dataset)]
  if regen:
    print("About to transform data")
    for transformer in transformers:
      dataset = transformer.transform(dataset)

  X, y, w, ids = (dataset.X, dataset.y, dataset.w, dataset.ids)
  num_tasks = 17
  # BUGFIX: frac_train * len(dataset) is a float; using it directly as a
  # slice index raises TypeError (and modern NumPy rejects float indices).
  # Truncate to an integer before slicing.
  num_train = int(frac_train * len(dataset))
  SWEET_tasks = SWEET_tasks[:num_tasks]
  print("Using following tasks")
  print(SWEET_tasks)
  X_train, X_valid = X[:num_train], X[num_train:]
  y_train, y_valid = y[:num_train, :num_tasks], y[num_train:, :num_tasks]
  w_train, w_valid = w[:num_train, :num_tasks], w[num_train:, :num_tasks]
  ids_train, ids_valid = ids[:num_train], ids[num_train:]
  train_dataset = DiskDataset.from_numpy(train_dir, X_train, y_train,
                                         w_train, ids_train, SWEET_tasks)
  valid_dataset = DiskDataset.from_numpy(valid_dir, X_valid, y_valid,
                                         w_valid, ids_valid, SWEET_tasks)
  return SWEET_tasks, (train_dataset, valid_dataset), transformers
def load_pcba(base_dir, reload=True):
  """Load PCBA datasets. Does not do train/test split.

  Featurizes the full PCBA csv (128 assay tasks) with 1024-bit circular
  fingerprints, or reuses a featurization already on disk.

  Parameters
  ----------
  base_dir: str
    Directory in which the featurized dataset is stored.
  reload: bool
    NOTE: effectively ignored -- it is overwritten to True immediately
    below, so an existing featurization on disk is always reused.

  Returns
  -------
  (all_PCBA_tasks, dataset, transformers)
  """
  # Set some global variables up top
  reload = True
  verbosity = "high"
  regen = False

  # Create some directories for analysis
  # The base_dir holds the results of all analysis
  if not reload:
    if os.path.exists(base_dir):
      shutil.rmtree(base_dir)
  if not os.path.exists(base_dir):
    os.makedirs(base_dir)
  current_dir = os.path.dirname(os.path.realpath(__file__))

  # Make directories to store the raw and featurized datasets.
  data_dir = os.path.join(base_dir, "dataset")

  # Load PCBA dataset
  print("About to load PCBA dataset.")
  dataset_file = os.path.join(
      current_dir, "../../datasets/pcba.csv.gz")
  dataset = load_from_disk(dataset_file)
  print("Columns of dataset: %s" % str(dataset.columns.values))
  print("Number of examples in dataset: %s" % str(dataset.shape[0]))

  # Featurize PCBA dataset
  print("About to featurize PCBA dataset.")
  featurizer = CircularFingerprint(size=1024)
  # Hard-coded list of the 128 PCBA assay identifiers used as task columns.
  all_PCBA_tasks = [
      'PCBA-1030','PCBA-1379','PCBA-1452','PCBA-1454','PCBA-1457',
      'PCBA-1458','PCBA-1460','PCBA-1461','PCBA-1468','PCBA-1469',
      'PCBA-1471','PCBA-1479','PCBA-1631','PCBA-1634','PCBA-1688',
      'PCBA-1721','PCBA-2100','PCBA-2101','PCBA-2147','PCBA-2242',
      'PCBA-2326','PCBA-2451','PCBA-2517','PCBA-2528','PCBA-2546',
      'PCBA-2549','PCBA-2551','PCBA-2662','PCBA-2675','PCBA-2676',
      'PCBA-411','PCBA-463254','PCBA-485281','PCBA-485290','PCBA-485294',
      'PCBA-485297','PCBA-485313','PCBA-485314','PCBA-485341','PCBA-485349',
      'PCBA-485353','PCBA-485360','PCBA-485364','PCBA-485367','PCBA-492947',
      'PCBA-493208','PCBA-504327','PCBA-504332','PCBA-504333','PCBA-504339',
      'PCBA-504444','PCBA-504466','PCBA-504467','PCBA-504706','PCBA-504842',
      'PCBA-504845','PCBA-504847','PCBA-504891','PCBA-540276','PCBA-540317',
      'PCBA-588342','PCBA-588453','PCBA-588456','PCBA-588579','PCBA-588590',
      'PCBA-588591','PCBA-588795','PCBA-588855','PCBA-602179','PCBA-602233',
      'PCBA-602310','PCBA-602313','PCBA-602332','PCBA-624170','PCBA-624171',
      'PCBA-624173','PCBA-624202','PCBA-624246','PCBA-624287','PCBA-624288',
      'PCBA-624291','PCBA-624296','PCBA-624297','PCBA-624417','PCBA-651635',
      'PCBA-651644','PCBA-651768','PCBA-651965','PCBA-652025','PCBA-652104',
      'PCBA-652105','PCBA-652106','PCBA-686970','PCBA-686978','PCBA-686979',
      'PCBA-720504','PCBA-720532','PCBA-720542','PCBA-720551','PCBA-720553',
      'PCBA-720579','PCBA-720580','PCBA-720707','PCBA-720708','PCBA-720709',
      'PCBA-720711','PCBA-743255','PCBA-743266','PCBA-875','PCBA-881',
      'PCBA-883','PCBA-884','PCBA-885','PCBA-887','PCBA-891','PCBA-899',
      'PCBA-902','PCBA-903','PCBA-904','PCBA-912','PCBA-914','PCBA-915',
      'PCBA-924','PCBA-925','PCBA-926','PCBA-927','PCBA-938','PCBA-995']
  loader = DataLoader(tasks=all_PCBA_tasks,
                      smiles_field="smiles",
                      featurizer=featurizer,
                      verbosity=verbosity)
  if not reload or not os.path.exists(data_dir):
    dataset = loader.featurize(dataset_file, data_dir)
    regen = True
  else:
    dataset = Dataset(data_dir, reload=True)

  # Initialize transformers
  transformers = [
      BalancingTransformer(transform_w=True, dataset=dataset)]
  if regen:
    print("About to transform data")
    for transformer in transformers:
      # NOTE(review): the transform() return value is discarded here, while
      # load_sweet in this file reassigns `dataset = transformer.transform(...)`.
      # If transform() is not in-place, the returned (balanced) dataset is
      # lost -- confirm against the Transformer API in use.
      transformer.transform(dataset)
  return all_PCBA_tasks, dataset, transformers
def set_max(self, df, data_dir): for _, row in df.iterrows(): X = load_from_disk(os.path.join(data_dir, row['X-transformed'])) self.feature_max = np.maximum(self.feature_max, X.max(axis=0))
def load_tox21(base_dir, reload=True, num_train=7200):
  """Load Tox21 datasets. Does not do train/test split"""
  # Global settings; note that `reload` is forced on below, so an existing
  # featurization on disk is always reused.
  reload = True
  verbosity = "high"

  # The base_dir holds all featurized data and the split datasets.
  if not reload and os.path.exists(base_dir):
    shutil.rmtree(base_dir)
  if not os.path.exists(base_dir):
    os.makedirs(base_dir)
  current_dir = os.path.dirname(os.path.realpath(__file__))

  # Directories for the featurized dataset and the two splits.
  data_dir = os.path.join(base_dir, "dataset")
  train_dir = os.path.join(base_dir, "train")
  valid_dir = os.path.join(base_dir, "valid")

  # Load the raw Tox21 csv.
  print("About to load Tox21 dataset.")
  dataset_file = os.path.join(current_dir, "../../datasets/tox21.csv.gz")
  dataset = load_from_disk(dataset_file)
  print("Columns of dataset: %s" % str(dataset.columns.values))
  print("Number of examples in dataset: %s" % str(dataset.shape[0]))

  # Featurize with 1024-bit circular fingerprints across the 12 Tox21 tasks.
  print("About to featurize Tox21 dataset.")
  featurizer = CircularFingerprint(size=1024)
  tox21_tasks = [
      'NR-AR', 'NR-AR-LBD', 'NR-AhR', 'NR-Aromatase', 'NR-ER', 'NR-ER-LBD',
      'NR-PPAR-gamma', 'SR-ARE', 'SR-ATAD5', 'SR-HSE', 'SR-MMP', 'SR-p53'
  ]
  must_featurize = (not reload) or (not os.path.exists(data_dir))
  if must_featurize:
    loader = DataLoader(tasks=tox21_tasks,
                        smiles_field="smiles",
                        featurizer=featurizer,
                        verbosity=verbosity)
    dataset = loader.featurize(dataset_file, data_dir, shard_size=8192)
  else:
    dataset = DiskDataset(data_dir, tox21_tasks, reload=True)

  # Balance example weights across tasks (only applied on a fresh build).
  transformers = [BalancingTransformer(transform_w=True, dataset=dataset)]
  if not reload:
    print("About to transform data")
    for transformer in transformers:
      transformer.transform(dataset)

  # Head/tail split at num_train examples (no shuffling).
  X, y, w, ids = dataset.X, dataset.y, dataset.w, dataset.ids
  head = slice(None, num_train)
  tail = slice(num_train, None)
  train_dataset = DiskDataset.from_numpy(train_dir, X[head], y[head],
                                         w[head], ids[head], tox21_tasks)
  valid_dataset = DiskDataset.from_numpy(valid_dir, X[tail], y[tail],
                                         w[tail], ids[tail], tox21_tasks)
  return tox21_tasks, (train_dataset, valid_dataset), transformers
def test_singletask_matches_multitask_load(self): """Check that singletask load and multitask load of dataset are same.""" # Only for debug! np.random.seed(123) # Set some global variables up top reload = True verbosity = "high" base_dir = tempfile.mkdtemp() current_dir = os.path.dirname(os.path.realpath(__file__)) #Make directories to store the raw and featurized datasets. data_dir = os.path.join(base_dir, "dataset") train_dir = os.path.join(base_dir, "train_dataset") valid_dir = os.path.join(base_dir, "valid_dataset") test_dir = os.path.join(base_dir, "test_dataset") model_dir = os.path.join(base_dir, "model") # Load dataset print("About to load dataset.") dataset_file = os.path.join( current_dir, "../../models/tests/multitask_example.csv") dataset = load_from_disk(dataset_file) print("Columns of dataset: %s" % str(dataset.columns.values)) print("Number of examples in dataset: %s" % str(dataset.shape[0])) # Featurize tox21 dataset print("About to featurize dataset.") featurizer = CircularFingerprint(size=1024) all_tasks = ["task%d" % i for i in range(17)] # For debugging purposes n_tasks = 17 tasks = all_tasks[0:n_tasks] ####### Do multitask load loader = DataLoader(tasks=tasks, smiles_field="smiles", featurizer=featurizer, verbosity=verbosity) dataset = loader.featurize(dataset_file, data_dir) # Do train/valid split. 
X_multi, y_multi, w_multi, ids_multi = (dataset.X, dataset.y, dataset.w, dataset.ids) ####### Do singletask load y_tasks, w_tasks, ids_tasks = [], [], [] for task in tasks: print("Processing task %s" % task) if os.path.exists(data_dir): shutil.rmtree(data_dir) loader = DataLoader(tasks=[task], smiles_field="smiles", featurizer=featurizer, verbosity=verbosity) dataset = loader.featurize(dataset_file, data_dir) X_task, y_task, w_task, ids_task = (dataset.X, dataset.y, dataset.w, dataset.ids) y_tasks.append(y_task) w_tasks.append(w_task) ids_tasks.append(ids_task) ################## Do comparison for ind, task in enumerate(tasks): y_multi_task = y_multi[:, ind] w_multi_task = w_multi[:, ind] y_task = y_tasks[ind] w_task = w_tasks[ind] ids_task = ids_tasks[ind] np.testing.assert_allclose(y_multi_task.flatten(), y_task.flatten()) np.testing.assert_allclose(w_multi_task.flatten(), w_task.flatten()) shutil.rmtree(base_dir)
def test_multiload(self):
  """Check can re-use featurization for multiple task selections.

  TODO(rbharath): This test seems silly after the recent round of
  refactoring. Can it be removed?
  """
  # Only for debug!
  np.random.seed(123)

  # Settings.
  reload = True
  verbosity = "high"
  current_dir = os.path.dirname(os.path.realpath(__file__))

  # Directories to store the raw and featurized datasets.
  data_dir = os.path.join(self.base_dir, "dataset")
  train_dir = os.path.join(self.base_dir, "train_dataset")
  valid_dir = os.path.join(self.base_dir, "valid_dataset")
  test_dir = os.path.join(self.base_dir, "test_dataset")
  model_dir = os.path.join(self.base_dir, "model")

  # Load dataset.
  print("About to load dataset.")
  dataset_file = os.path.join(
      current_dir, "../../models/tests/multitask_example.csv")
  dataset = load_from_disk(dataset_file)
  print("Columns of dataset: %s" % str(dataset.columns.values))
  print("Number of examples in dataset: %s" % str(dataset.shape[0]))

  # Featurize once with every task.
  print("About to featurize dataset.")
  featurizer = CircularFingerprint(size=1024)
  all_tasks = ["task%d" % i for i in range(17)]
  loader = DataLoader(tasks=all_tasks,
                      smiles_field="smiles",
                      featurizer=featurizer,
                      verbosity=verbosity)
  dataset = loader.featurize(dataset_file, data_dir)
  X_multi, y_multi, w_multi, ids_multi = (dataset.X, dataset.y, dataset.w,
                                          dataset.ids)

  # Reload the stored featurization once per task and slice out that column.
  y_tasks, w_tasks = [], []
  for col, task in enumerate(all_tasks):
    print("Processing task %s" % task)
    reloaded = DiskDataset(data_dir, verbosity=verbosity, reload=reload)
    X_task, y_task, w_task, ids_task = (reloaded.X, reloaded.y, reloaded.w,
                                        reloaded.ids)
    y_tasks.append(y_task[:, col])
    w_tasks.append(w_task[:, col])

  # Every per-task column must match the multitask load exactly.
  for col in range(len(all_tasks)):
    np.testing.assert_allclose(y_multi[:, col].flatten(),
                               y_tasks[col].flatten())
    np.testing.assert_allclose(w_multi[:, col].flatten(),
                               w_tasks[col].flatten())
# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #In this program, we analyze the BACE enyzme and build machine learning models for predicting the Ki of ligands to the protein. #We will use the deepchem library to load this data into memory, split into train/test/validation folds, build and cross-validate models, and report statistics. import os import sys import deepchem as dc from deepchem.utils.save import load_from_disk current_dir = os.path.dirname(os.path.realpath("__file__")) dc.utils.download_url( "https://s3-us-west-1.amazonaws.com/deepchem.io/datasets/desc_canvas_aug30.csv", current_dir) dataset_file = "desc_canvas_aug30.csv" dataset = load_from_disk(dataset_file) num_display = 10 pretty_columns = ("[" + ",".join( ["'%s'" % column for column in dataset.columns.values[:num_display]]) + ",...]") dc.utils.download_url( "https://s3-us-west-1.amazonaws.com/deepchem.io/datasets/crystal_desc_canvas_aug30.csv", current_dir) crystal_dataset_file = "crystal_desc_canvas_aug30.csv" crystal_dataset = load_from_disk(crystal_dataset_file) print("Columns of dataset: %s" % pretty_columns) print("Number of examples in dataset: %s" % str(dataset.shape[0])) print("Number of examples in crystal dataset: %s" % str(crystal_dataset.shape[0]))
import pandas as pd import deepchem as dc import numpy as np import tempfile from rdkit import Chem from rdkit.Chem import Draw from itertools import islice import matplotlib.pyplot as plt from sklearn.ensemble import RandomForestRegressor from deepchem.utils.evaluate import Evaluator import numpy.random from deepchem.utils.save import load_from_disk dataset_file = "datasets/delaney-processed.csv" dataset = load_from_disk( dataset_file ) #Compound ID', 'ESOL predicted log solubility in mols per litre', # 'Minimum Degree', 'Molecular Weight', 'Number of H-Bond Donors', # 'Number of Rings', 'Number of Rotatable Bonds', 'Polar Surface Area', # 'measured log solubility in mols per litre', 'smiles']が入っている #//Erorrが出てしまうためコメントアウト def display_images(filenames): imagesList = ''.join([ "<img style='width: 140px; margin: 0px; float: left; border: 1px solid black;' src='%s' />" % str(s) for s in sorted(filenames) ]) display(HTML(imagesList))