Example #1
  def get_grad_statistics(self):
    """Computes and returns statistics of this dataset

    This function assumes that the first task of a dataset holds the energy for
    an input system, and that the remaining tasks hold the gradient for the
    system.

    TODO(rbharath, joegomes): It is unclear whether this should be a Dataset
    function. Might get refactored out.
    TODO(rbharath, joegomes): If y_n were an exposed part of the API, this
    function could be entirely written in userspace.
    """
    if len(self) == 0:
      return None, None
    df = self.metadata_df
    y = []
    y_n = []
    for _, row in df.iterrows():
      yy = load_from_disk(os.path.join(self.data_dir, row['y']))
      y.append(yy)
      yn = load_from_disk(os.path.join(self.data_dir, row['y_n']))
      y_n.append(np.array(yn))

    y = np.vstack(y)
    y_n = np.sum(y_n, axis=0)

    energy = y[:,0]
    grad = y[:,1:]
    for i in range(energy.size):
      grad[i] *= energy[i]

    ydely_means = np.sum(grad, axis=0)/y_n[1:]

    return grad, ydely_means
Example #2
  def __init__(self, feature_dir, dataset_files=None, overwrite=True,
               reload_data=False):
    """
    Initialize FeaturizedSamples

    If feature_dir does not exist, must specify dataset_files. Then feature_dir
    is created and populated. If feature_dir exists (created by previous call to
    FeaturizedSamples), then dataset_files cannot be specified. If overwrite is
    set and dataset_files is provided, will overwrite old dataset_files with
    new.
    """
    self.dataset_files = dataset_files

    if not os.path.exists(feature_dir):
      os.makedirs(feature_dir)
    self.feature_dir = feature_dir
    if os.path.exists(self._get_compounds_filename()) and reload_data:
      compounds_df = load_from_disk(self._get_compounds_filename())
    else:
      compounds_df = self._get_compounds()
      # compounds_df is not altered by any method after initialization, so it's
      # safe to keep a copy in memory and on disk.
      save_to_disk(compounds_df, self._get_compounds_filename())
    _check_validity(compounds_df)
    self.compounds_df = compounds_df

    if os.path.exists(self._get_dataset_paths_filename()):
      if dataset_files is not None:
        if overwrite:
          save_to_disk(dataset_files, self._get_dataset_paths_filename())
        else:
          raise ValueError("Can't change dataset_files already stored on disk")
      self.dataset_files = load_from_disk(self._get_dataset_paths_filename())
    else:
      save_to_disk(dataset_files, self._get_dataset_paths_filename())
Example #3
  def transform_row(self, i, df, data_dir):
    """
    Normalizes the data (X, y, w, ...) in a single row.
    """
    row = df.iloc[i]

    if self.transform_X:
      X = load_from_disk(
          os.path.join(data_dir, row['X-transformed']))
      X = np.nan_to_num((X - self.X_means) / self.X_stds)
      save_to_disk(X, os.path.join(data_dir, row['X-transformed']))

    if self.transform_y:

      y = load_from_disk(os.path.join(data_dir, row['y-transformed']))

      # transform tasks as normal
      y = np.nan_to_num((y - self.y_means) / self.y_stds)

      # add 2nd order correction term to gradients
      grad_var = 1/self.y_stds[0]*(self.ydely_means-self.y_means[0]*self.y_means[1:])
      for i in range(y.shape[0]):
        y[i,1:] = y[i,1:] - grad_var*y[i,0]/self.y_stds[0]

      save_to_disk(y, os.path.join(data_dir, row['y-transformed']))
Example #4
def _transform_row(i, df, normalize_X, normalize_y, truncate_X, truncate_y,
                   log_X, log_y, X_means, X_stds, y_means, y_stds, trunc):
  """
  Transforms the data (X, y, w,...) in a single row.

  Writes X-transformed, y-transformed to disk.
  """
  row = df.iloc[i]
  X = load_from_disk(row['X'])
  if normalize_X or log_X:
    if normalize_X:
      # Turns NaNs to zeros
      X = np.nan_to_num((X - X_means) / X_stds)
      if truncate_X:
        X[X > trunc] = trunc
        X[X < (-1.0*trunc)] = -1.0 * trunc
    if log_X:
      X = np.log(X)
  save_to_disk(X, row['X-transformed'])

  y = load_from_disk(row['y'])
  if normalize_y or log_y:
    if normalize_y:
      y = np.nan_to_num((y - y_means) / y_stds)
      if truncate_y:
        y[y > trunc] = trunc
        y[y < (-1.0*trunc)] = -1.0 * trunc
    if log_y:
      y = np.log(y)
  save_to_disk(y, row['y-transformed'])
Example #5
  def transform_row(self, i, df, data_dir):
    """Logarithmically transforms data in dataset."""
    """Select features and tasks of interest for transformation."""
    row = df.iloc[i]
    if self.transform_X:
      X = load_from_disk(os.path.join(data_dir, row['X-transformed']))
      num_features=len(X[0])
      if self.features is None:
        X = np.log(X+1)
      else:
        for j in range(num_features):
          if j in self.features:
            X[:,j] = np.log(X[:,j]+1)
          else:
            X[:,j] = X[:,j]
      save_to_disk(X, os.path.join(data_dir, row['X-transformed']))

    if self.transform_y:
      y = load_from_disk(os.path.join(data_dir, row['y-transformed']))
      num_tasks=len(y[0])
      if self.tasks is None:
        y = np.log(y+1)
      else:
        for j in range(num_tasks):
          if j in self.tasks:
            y[:,j] = np.log(y[:,j]+1)
          else:
            y[:,j] = y[:,j]
      save_to_disk(y, os.path.join(data_dir, row['y-transformed']))
Example #6
 def _itershards(self):
   """
   Iterates over all shards in dataset.
   """
   for _, row in self.metadata_df.iterrows():
     X = load_from_disk(row['X-transformed'])
     y = load_from_disk(row['y-transformed'])
     w = load_from_disk(row['w'])
     ids = load_from_disk(row['ids'])
     yield (X, y, w, ids)
Example #7
 def iterate(dataset):
   for _, row in dataset.metadata_df.iterrows():
     X = np.array(load_from_disk(
         os.path.join(dataset.data_dir, row['X-transformed'])))
     y = np.array(load_from_disk(
         os.path.join(dataset.data_dir, row['y-transformed'])))
     w = np.array(load_from_disk(
         os.path.join(dataset.data_dir, row['w-transformed'])))
     ids = np.array(load_from_disk(
         os.path.join(dataset.data_dir, row['ids'])), dtype=object)
     yield (X, y, w, ids)
Example #8
 def get_shard(self, i):
   """Retrieves data for the i-th shard from disk."""
   row = self.metadata_df.iloc[i]
   X = np.array(load_from_disk(
       os.path.join(self.data_dir, row['X-transformed'])))
   y = np.array(load_from_disk(
       os.path.join(self.data_dir, row['y-transformed'])))
   w = np.array(load_from_disk(
       os.path.join(self.data_dir, row['w-transformed'])))
   ids = np.array(load_from_disk(
       os.path.join(self.data_dir, row['ids'])), dtype=object)
   return (X, y, w, ids)
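
A hedged usage sketch for get_shard above: random access into the sharded store, assuming `dataset` is any object exposing get_shard(i) as defined here. The caller supplies the shard count, since no shard-count helper appears in these snippets.

def print_shard_shapes(dataset, num_shards):
    """Report the size of each shard without holding them all in memory."""
    for i in range(num_shards):
        X, y, w, ids = dataset.get_shard(i)
        print("shard %d: X %s, y %s, %d ids" % (i, X.shape, y.shape, len(ids)))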
Example #9
  def transform_row(self, i, df):
    """Logarithmically transforms data in dataset."""
    row = df.iloc[i]
    if self.transform_X:
      X = load_from_disk(row['X-transformed'])
      X = np.log(X)
      save_to_disk(X, row['X-transformed'])

    if self.transform_y:
      y = load_from_disk(row['y-transformed'])
      y = np.log(y)
      save_to_disk(y, row['y-transformed'])
Example #10
 def iterate(dataset):
   for _, row in dataset.metadata_df.iterrows():
     X = np.array(load_from_disk(
         os.path.join(dataset.data_dir, row['X'])))
     y = np.array(load_from_disk(
         os.path.join(dataset.data_dir, row['y'])))
     w_filename = os.path.join(dataset.data_dir, row['w'])
     if os.path.exists(w_filename):
         w = np.array(load_from_disk(w_filename))
     else:
         w = np.ones(y.shape)
     ids = np.array(load_from_disk(
         os.path.join(dataset.data_dir, row['ids'])), dtype=object)
     yield (X, y, w, ids)
Example #11
 def transform_row(self, i, df, data_dir):
   """Reweight the labels for this data."""
   row = df.iloc[i]
   y = load_from_disk(os.path.join(data_dir, row['y-transformed']))
   w = load_from_disk(os.path.join(data_dir, row['w-transformed']))
   w_balanced = np.zeros_like(w)
   for ind, task in enumerate(self.dataset.get_task_names()):
     task_y = y[:, ind]
     task_w = w[:, ind]
     zero_indices = np.logical_and(task_y==0, task_w != 0)
     one_indices = np.logical_and(task_y==1, task_w != 0)
     w_balanced[zero_indices, ind] = self.weights[ind][0]
     w_balanced[one_indices, ind] = self.weights[ind][1]
   save_to_disk(w_balanced, os.path.join(data_dir, row['w-transformed']))
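
The self.weights consumed above are not shown; a minimal sketch of how per-task balancing weights could be derived from class counts, assuming binary labels and a nonzero-means-present weight mask (names here are illustrative, not the project's API).

import numpy as np

def compute_balancing_weights(y, w):
    """Return per-task (weight_for_0, weight_for_1) pairs.

    Scaled so that, within each task, the 0 class and the 1 class carry
    equal total weight among the entries marked present by w.
    """
    weights = []
    for ind in range(y.shape[1]):
        present = w[:, ind] != 0
        num_zeros = np.count_nonzero(y[present, ind] == 0)
        num_ones = np.count_nonzero(y[present, ind] == 1)
        total = num_zeros + num_ones
        # Guard against an empty class to avoid division by zero.
        w0 = total / (2.0 * num_zeros) if num_zeros else 0.0
        w1 = total / (2.0 * num_ones) if num_ones else 0.0
        weights.append((w0, w1))
    return weights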
Example #12
 def transform_row(self, i, df):
   """
   Clips outliers for the data (X, y, w, ...) in a single row.
   """
   row = df.iloc[i]
   if self.transform_X:
     X = load_from_disk(row['X-transformed'])
     X[X > self.max_val] = self.max_val
     X[X < (-1.0*self.max_val)] = -1.0 * self.max_val
     save_to_disk(X, row['X-transformed'])
   if self.transform_y:
     y = load_from_disk(row['y-transformed'])
     y[y > self.max_val] = self.max_val
     y[y < (-1.0*self.max_val)] = -1.0 * self.max_val
     save_to_disk(y, row['y-transformed'])
Example #13
  def transform_row(self, i, df):
    """
    Normalizes the data (X, y, w, ...) in a single row.
    """
    row = df.iloc[i]

    if self.transform_X:
      X = load_from_disk(row['X-transformed'])
      X = np.nan_to_num((X - self.X_means) / self.X_stds)
      save_to_disk(X, row['X-transformed'])

    if self.transform_y:
      y = load_from_disk(row['y-transformed'])
      y = np.nan_to_num((y - self.y_means) / self.y_stds)
      save_to_disk(y, row['y-transformed'])
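
The X_means/X_stds and y_means/y_stds used above are computed beforehand from the training data; a minimal sketch of that statistics pass for an in-memory array, assuming the same NaN-tolerant convention.

import numpy as np

def compute_mean_and_std(values):
    """Column-wise means and stds, ignoring NaNs.

    Zero stds are bumped to 1 so the later (x - mean) / std stays finite.
    """
    means = np.nanmean(values, axis=0)
    stds = np.nanstd(values, axis=0)
    stds[stds == 0] = 1.0
    return means, stds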
Example #14
 def get_shard(self, i):
   """Retrieves data for the i-th shard from disk."""
   row = self.metadata_df.iloc[i]
   X = np.array(load_from_disk(
       os.path.join(self.data_dir, row['X'])))
   y = np.array(load_from_disk(
       os.path.join(self.data_dir, row['y'])))
   w_filename = os.path.join(self.data_dir, row['w'])
   if os.path.exists(w_filename):
       w = np.array(load_from_disk(w_filename))
   else:
       w = np.ones(y.shape)
   ids = np.array(load_from_disk(
       os.path.join(self.data_dir, row['ids'])), dtype=object)
   return (X, y, w, ids)
Example #15
 def get_shard_size(self):
   """Gets size of shards on disk."""
   if not len(self.metadata_df):
     raise ValueError("No data in dataset.")
   sample_y = load_from_disk(
       os.path.join(self.data_dir, next(self.metadata_df.iterrows())[1]['y']))
   return len(sample_y)
Example #16
  def __init__(self, shard_generator=[], data_dir=None, tasks=[],
               reload=False, verbose=True):
    """
    Turns featurized dataframes into numpy files, writes them & metadata to disk.
    """
    if data_dir is not None:
      if not os.path.exists(data_dir):
        os.makedirs(data_dir)
    else:
      data_dir = tempfile.mkdtemp()
    self.data_dir = data_dir
    self.verbose = verbose

    if reload:
      log("Loading pre-existing dataset.", self.verbose)
      if os.path.exists(self._get_metadata_filename()):
        (self.tasks, self.metadata_df) = load_from_disk(
            self._get_metadata_filename())
      else:
        raise ValueError("No metadata found.")
      return

    metadata_rows = []
    time1 = time.time()
    for shard_num, (X, y, w, ids) in enumerate(shard_generator):
      basename = "shard-%d" % shard_num 
      metadata_rows.append(
          DiskDataset.write_data_to_disk(
              self.data_dir, basename, tasks, X, y, w, ids))
    self.tasks = tasks
    self.metadata_df = DiskDataset.construct_metadata(metadata_rows)
    self.save_to_disk()
    time2 = time.time()
    print("TIMING: dataset construction took %0.3f s" % (time2-time1),
          self.verbose)
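
A hedged usage sketch for the constructor above: feeding it a generator of (X, y, w, ids) shards built from in-memory arrays. The DiskDataset name and a writable data_dir are assumed from the surrounding class.

import numpy as np

def make_shards(X, y, w, ids, shard_size=1000):
    """Yield (X, y, w, ids) tuples of at most shard_size rows each."""
    for start in range(0, len(X), shard_size):
        stop = start + shard_size
        yield X[start:stop], y[start:stop], w[start:stop], ids[start:stop]

# Sketch only, assuming the DiskDataset class this __init__ belongs to:
# dataset = DiskDataset(shard_generator=make_shards(X, y, w, ids),
#                       data_dir="/tmp/my_dataset", tasks=["task0"])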
Example #17
 def get_data_shape(self):
   """
   Gets array shape of datapoints in this dataset.
   """
   if not len(self.metadata_df):
     raise ValueError("No data in dataset.")
   sample_X = load_from_disk(next(self.metadata_df.iterrows())[1]['X'])[0]
   return np.shape(sample_X)
Example #18
  def __init__(self, data_dir, samples=None, feature_types=None):
    """
    Turns featurized dataframes into numpy files, writes them & metadata to disk.
    """
    if not os.path.exists(data_dir):
      os.makedirs(data_dir)
    self.data_dir = data_dir

    if samples is not None and feature_types is not None:
      if not isinstance(feature_types, list):
        raise ValueError("feature_types must be a list or None.")

      write_dataset_single_partial = partial(
          write_dataset_single, data_dir=self.data_dir,
          feature_types=feature_types)

      metadata_rows = []
      # TODO(rbharath): Still a bit of information leakage.
      for df_file, df in zip(samples.dataset_files, samples.itersamples()):
        retval = write_dataset_single_partial((df_file, df))
        if retval is not None:
          metadata_rows.append(retval)

      # TODO(rbharath): FeaturizedSamples should not be responsible for
      # X-transform, X_sums, etc. Move that stuff over to Dataset.
      self.metadata_df = pd.DataFrame(
          metadata_rows,
          columns=('df_file', 'task_names', 'ids',
                   'X', 'X-transformed', 'y', 'y-transformed',
                   'w',
                   'X_sums', 'X_sum_squares', 'X_n',
                   'y_sums', 'y_sum_squares', 'y_n'))
      save_to_disk(
          self.metadata_df, self._get_metadata_filename())
      # input/output transforms not specified yet, so
      # self.transforms = (input_transforms, output_transforms) =>
      self.transforms = ([], [])
      save_to_disk(
          self.transforms, self._get_transforms_filename())
    else:
      if os.path.exists(self._get_metadata_filename()):
        self.metadata_df = load_from_disk(self._get_metadata_filename())
        self.transforms = load_from_disk(self._get_transforms_filename())
      else:
        raise ValueError("No metadata found.")
Example #19
 def __len__(self):
   """
   Finds number of elements in dataset.
   """
   total = 0
   for _, row in self.metadata_df.iterrows():
     y = load_from_disk(os.path.join(self.data_dir, row['y-transformed']))
     total += len(y)
   return total
Example #20
  def itershards(self):
    """
    Iterates over all shards in dataset.

    Datasets are stored in sharded fashion on disk. Each call to next() for the
    generator defined by this function returns the data from a particular shard.
    The order of shards returned is guaranteed to remain fixed.
    """
    for _, row in self.metadata_df.iterrows():
      X = np.array(load_from_disk(
          os.path.join(self.data_dir, row['X-transformed'])))
      y = np.array(load_from_disk(
          os.path.join(self.data_dir, row['y-transformed'])))
      w = np.array(load_from_disk(
          os.path.join(self.data_dir, row['w-transformed'])))
      ids = np.array(load_from_disk(
          os.path.join(self.data_dir, row['ids'])), dtype=object)
      yield (X, y, w, ids)
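
Because itershards() yields one (X, y, w, ids) tuple per on-disk shard, a consumer can stream the whole dataset without loading it at once; a minimal sketch, assuming a dataset object exposing itershards() and a 2-D weight matrix of shape (n_samples, n_tasks).

def count_weighted_examples(dataset):
    """Stream over shards and count rows with a nonzero weight on any task."""
    total = 0
    for X, y, w, ids in dataset.itershards():
        total += int((w != 0).any(axis=1).sum())
    return total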
Example #21
  def __init__(self, samples_dir, featurizers, dataset_files=None, 
               reload=False, verbosity=None):
    """
    Initialize FeaturizedSamples

    If samples_dir does not exist, must specify dataset_files. Then samples_dir
    is created and populated. If samples_dir exists (created by previous call to
    FeaturizedSamples), then dataset_files cannot be specified. If reload is
    False and dataset_files is provided, will overwrite old dataset_files with
    new.
    """
    assert verbosity in [None, "low", "high"]
    self.verbosity = verbosity
    self.dataset_files = dataset_files
    self.feature_types = (
        ["user-specified-features"] + 
        [featurizer.__class__.__name__ for featurizer in featurizers])

    self.featurizers = featurizers

    if not os.path.exists(samples_dir):
      os.makedirs(samples_dir)
    self.samples_dir = samples_dir

    if os.path.exists(self._get_dataset_paths_filename()):
      if dataset_files is not None:
        if not reload:
          save_to_disk(dataset_files, self._get_dataset_paths_filename())
        else:
          raise ValueError("Can't change dataset_files already stored on disk")
    else:
      save_to_disk(dataset_files, self._get_dataset_paths_filename())
    self.dataset_files = load_from_disk(self._get_dataset_paths_filename())

    if os.path.exists(self._get_compounds_filename()) and reload:
      compounds_df = load_from_disk(self._get_compounds_filename())
    else:
      compounds_df = self._get_compounds()
      # compounds_df is not altered by any method after initialization, so it's
      # safe to keep a copy in memory and on disk.
      save_to_disk(compounds_df, self._get_compounds_filename())
    _check_validity(compounds_df)
    self.compounds_df = compounds_df
    self.num_samples = len(compounds_df)
Example #22
def update_mean_and_std(df):
  """
  Compute means/stds of X/y from sums/sum_squares of tensors.
  """
  X_transform = []
  for _, row in df.iterrows():
    Xt = load_from_disk(row['X-transformed'])
    Xs = np.sum(Xt,axis=0)
    Xss = np.sum(np.square(Xt),axis=0)
    save_to_disk(Xs, row['X_sums'])
    save_to_disk(Xss, row['X_sum_squares'])

  y_transform = []
  for _, row in df.iterrows():
    yt = load_from_disk(row['y-transformed'])
    ys = np.sum(yt,axis=0)
    yss = np.sum(np.square(yt),axis=0)
    save_to_disk(ys, row['y_sums'])
    save_to_disk(yss, row['y_sum_squares'])
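
The per-shard sums and sums of squares written above can be reduced to global statistics with var = E[x^2] - E[x]^2; a minimal sketch, assuming the saved rows hold column-wise sums and the total row count n is known.

import numpy as np

def reduce_sums_to_stats(shard_sums, shard_sum_squares, n):
    """Combine per-shard column sums into overall means and stds.

    shard_sums / shard_sum_squares: one array per shard (column-wise sums).
    n: total number of rows across all shards.
    """
    total = np.sum(shard_sums, axis=0)
    total_sq = np.sum(shard_sum_squares, axis=0)
    means = total / n
    variances = total_sq / n - np.square(means)
    # Numerical noise can push tiny variances slightly negative; clip first.
    stds = np.sqrt(np.clip(variances, 0, None))
    return means, stds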
Example #23
 def get_data_shape(self):
   """
   Gets array shape of datapoints in this dataset.
   """
   if not len(self.metadata_df):
     raise ValueError("No data in dataset.")
   sample_X = load_from_disk(
       os.path.join(
           self.data_dir,
           next(self.metadata_df.iterrows())[1]['X-transformed']))[0]
   return np.shape(sample_X)
Example #24
File: featurize.py  Project: arose/deepchem
 def itersamples(self):
   """Iterates over samples in this object."""
   compound_ids = set(list(self.compounds_df["mol_id"]))
   for df_file in self.dataset_files:
     df = load_from_disk(df_file)
     visible_inds = []
     for ind, row in df.iterrows():
       if row["mol_id"] in compound_ids:
         visible_inds.append(ind)
     for visible_ind in visible_inds:
       yield df.loc[visible_ind]
Example #25
def load_muv(base_dir, reload=True):
  """Load MUV datasets. Does not do train/test split"""
  # Set some global variables up top
  reload = True
  verbosity = "high"
  model = "logistic"
  regen = False

  # Create some directories for analysis
  # The base_dir holds the results of all analysis
  if not reload:
    if os.path.exists(base_dir):
      shutil.rmtree(base_dir)
  if not os.path.exists(base_dir):
    os.makedirs(base_dir)
  current_dir = os.path.dirname(os.path.realpath(__file__))
  #Make directories to store the raw and featurized datasets.
  data_dir = os.path.join(base_dir, "dataset")

  # Load MUV dataset
  print("About to load MUV dataset.")
  dataset_file = os.path.join(
      current_dir, "../../datasets/muv.csv.gz")
  dataset = load_from_disk(dataset_file)
  print("Columns of dataset: %s" % str(dataset.columns.values))
  print("Number of examples in dataset: %s" % str(dataset.shape[0]))

  # Featurize MUV dataset
  print("About to featurize MUV dataset.")
  featurizer = CircularFingerprint(size=1024)
  all_MUV_tasks = sorted(['MUV-692', 'MUV-689', 'MUV-846', 'MUV-859', 'MUV-644',
                          'MUV-548', 'MUV-852', 'MUV-600', 'MUV-810', 'MUV-712',
                          'MUV-737', 'MUV-858', 'MUV-713', 'MUV-733', 'MUV-652',
                          'MUV-466', 'MUV-832'])

  loader = DataLoader(tasks=all_MUV_tasks,
                      smiles_field="smiles",
                      featurizer=featurizer,
                      verbosity=verbosity)
  if not reload or not os.path.exists(data_dir):
    dataset = loader.featurize(dataset_file, data_dir)
    regen = True
  else:
    dataset = Dataset(data_dir, reload=True)

  # Initialize transformers 
  transformers = [
      BalancingTransformer(transform_w=True, dataset=dataset)]
  if regen:
    print("About to transform data")
    for transformer in transformers:
        transformer.transform(dataset)
  
  return all_MUV_tasks, dataset, transformers
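
A hedged usage sketch for the loader above; the base_dir path is illustrative, and the shard iteration assumes the returned Dataset exposes itershards() as shown in other examples here.

# Sketch only: assumes load_muv is importable from the surrounding project.
muv_tasks, muv_dataset, muv_transformers = load_muv("/tmp/muv_analysis")
print("Loaded %d MUV tasks" % len(muv_tasks))
for X, y, w, ids in muv_dataset.itershards():
  print("shard with %d compounds, fingerprint shape %s" % (len(ids), X.shape[1:]))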
Example #26
 def iterate(dataset):
   for _, row in dataset.metadata_df.iterrows():
     X = np.array(load_from_disk(os.path.join(dataset.data_dir, row['X'])))
     ids = np.array(
         load_from_disk(os.path.join(dataset.data_dir, row['ids'])),
         dtype=object)
     # These columns may be missing if the dataset is unlabelled.
     if row['y'] is not None:
       y = np.array(load_from_disk(os.path.join(dataset.data_dir, row['y'])))
     else:
       y = None
     if row['w'] is not None:
       w_filename = os.path.join(dataset.data_dir, row['w'])
       if os.path.exists(w_filename):
         w = np.array(load_from_disk(w_filename))
       else:
         w = np.ones(y.shape)
     else:
       w = None
     yield (X, y, w, ids)
Example #27
def load_tox21(base_dir, reload=True):
  """Load Tox21 datasets. Does not do train/test split"""
  # Set some global variables up top
  reload = True
  verbosity = "high"
  model = "logistic"

  # Create some directories for analysis
  # The base_dir holds the results of all analysis
  if not reload:
    if os.path.exists(base_dir):
      shutil.rmtree(base_dir)
  if not os.path.exists(base_dir):
    os.makedirs(base_dir)
  current_dir = os.path.dirname(os.path.realpath(__file__))
  #Make directories to store the raw and featurized datasets.
  samples_dir = os.path.join(base_dir, "samples")
  data_dir = os.path.join(base_dir, "dataset")

  # Load Tox21 dataset
  print("About to load Tox21 dataset.")
  dataset_file = os.path.join(
      current_dir, "../../datasets/tox21.csv.gz")
  dataset = load_from_disk(dataset_file)
  print("Columns of dataset: %s" % str(dataset.columns.values))
  print("Number of examples in dataset: %s" % str(dataset.shape[0]))

  # Featurize Tox21 dataset
  print("About to featurize Tox21 dataset.")
  featurizer = CircularFingerprint(size=1024)
  all_tox21_tasks = ['NR-AR', 'NR-AR-LBD', 'NR-AhR', 'NR-Aromatase', 'NR-ER',
                     'NR-ER-LBD', 'NR-PPAR-gamma', 'SR-ARE', 'SR-ATAD5',
                     'SR-HSE', 'SR-MMP', 'SR-p53']

  if not reload or not os.path.exists(data_dir):
    loader = DataLoader(tasks=all_tox21_tasks,
                        smiles_field="smiles",
                        featurizer=featurizer,
                        verbosity=verbosity)
    dataset = loader.featurize(
        dataset_file, data_dir, shard_size=8192)
  else:
    dataset = Dataset(data_dir, all_tox21_tasks, reload=True)

  # Initialize transformers 
  transformers = [
      BalancingTransformer(transform_w=True, dataset=dataset)]
  if not reload:
    print("About to transform data")
    for transformer in transformers:
        transformer.transform(dataset)
  
  return all_tox21_tasks, dataset, transformers
Example #28
  def _update_mean_and_std(self, df, X_stats, y_stats):
    """
    Compute means/stds of X/y from sums/sum_squares of tensors.
    """
    if X_stats:
      X_transform = []
      for _, row in df.iterrows():
        Xt = load_from_disk(os.path.join(self.data_dir, row['X-transformed']))
        Xs = np.sum(Xt,axis=0)
        Xss = np.sum(np.square(Xt),axis=0)
        save_to_disk(Xs, os.path.join(self.data_dir, row['X_sums']))
        save_to_disk(Xss, os.path.join(self.data_dir, row['X_sum_squares']))

    if y_stats:
      y_transform = []
      for _, row in df.iterrows():
        yt = load_from_disk(os.path.join(self.data_dir, row['y-transformed']))
        ys = np.sum(yt,axis=0)
        yss = np.sum(np.square(yt),axis=0)
        save_to_disk(ys, os.path.join(self.data_dir, row['y_sums']))
        save_to_disk(yss, os.path.join(self.data_dir, row['y_sum_squares']))
Example #29
  def __init__(self, data_dir, verbose=True):
    """
    Turns featurized dataframes into numpy files, writes them & metadata to disk.
    """
    self.data_dir = data_dir
    self.verbose = verbose

    log("Loading dataset from disk.", self.verbose)
    if os.path.exists(self._get_metadata_filename()):
      (self.tasks,
       self.metadata_df) = load_from_disk(self._get_metadata_filename())
    else:
      raise ValueError("No metadata found on disk.")
Example #30
File: featurize.py  Project: hainm/deepchem
    def iterdataframes(self):
        """
    Provides a bulk iterator over data.

    Each sample from the iterator is a dataframe of samples.
    """
        compound_ids = set(list(self.compounds_df["mol_id"]))
        for df_file in self.dataset_files:
            df = load_from_disk(df_file)
            visible_inds = []
            for ind, row in df.iterrows():
                if row["mol_id"] in compound_ids:
                    visible_inds.append(ind)
            yield df.loc[visible_inds]
Example #31
 def iterate(dataset):
   for _, row in dataset.metadata_df.iterrows():
     X = np.array(load_from_disk(os.path.join(dataset.data_dir, row['X'])))
     ids = np.array(
         load_from_disk(os.path.join(dataset.data_dir, row['ids'])),
         dtype=object)
     # These columns may be missing if the dataset is unlabelled.
     if row['y'] is not None:
       y = np.array(load_from_disk(os.path.join(dataset.data_dir, row['y'])))
     else:
       y = None
     if row['w'] is not None:
       w_filename = os.path.join(dataset.data_dir, row['w'])
       if os.path.exists(w_filename):
         w = np.array(load_from_disk(w_filename))
       else:
         if len(y.shape) == 1:
           w = np.ones(y.shape[0], np.float32)
         else:
           w = np.ones((y.shape[0], 1), np.float32)
     else:
       w = None
     yield (X, y, w, ids)
Example #32
  def get_shard(self, i):
    """Retrieves data for the i-th shard from disk."""
    row = self.metadata_df.iloc[i]
    X = np.array(load_from_disk(os.path.join(self.data_dir, row['X'])))

    if row['y'] is not None:
      y = np.array(load_from_disk(os.path.join(self.data_dir, row['y'])))
    else:
      y = None

    if row['w'] is not None:
      # TODO (ytz): Under what condition does this exist but the file itself doesn't?
      w_filename = os.path.join(self.data_dir, row['w'])
      if os.path.exists(w_filename):
        w = np.array(load_from_disk(w_filename))
      else:
        w = np.ones(y.shape)
    else:
      w = None

    ids = np.array(
        load_from_disk(os.path.join(self.data_dir, row['ids'])), dtype=object)
    return (X, y, w, ids)
Example #33
  def itersamples(self):
    """
    Provides an iterator over samples.

    Each sample from the iterator is a dataframe of samples.
    """
    compound_ids = set(list(self.compounds_df["mol_id"]))
    for df_file in self.dataset_files:
      df = load_from_disk(df_file)
      visible_inds = []
      for ind, row in df.iterrows():
        if row["mol_id"] in compound_ids:
          visible_inds.append(ind)
      yield df.loc[visible_inds]
Example #34
    def _run_muv_experiment(self, dataset_file, reload=False, verbosity=None):
        """Loads or reloads a small version of MUV dataset."""
        # Load MUV dataset
        raw_dataset = load_from_disk(dataset_file)
        print("Number of examples in dataset: %s" % str(raw_dataset.shape[0]))

        print("About to featurize compounds")
        featurizer = CircularFingerprint(size=1024)
        MUV_tasks = [
            'MUV-692', 'MUV-689', 'MUV-846', 'MUV-859', 'MUV-644', 'MUV-548',
            'MUV-852', 'MUV-600', 'MUV-810', 'MUV-712', 'MUV-737', 'MUV-858',
            'MUV-713', 'MUV-733', 'MUV-652', 'MUV-466', 'MUV-832'
        ]
        loader = DataLoader(tasks=MUV_tasks,
                            smiles_field="smiles",
                            featurizer=featurizer,
                            verbosity=verbosity)
        dataset = loader.featurize(dataset_file, self.data_dir)
        assert len(dataset) == len(raw_dataset)

        print("About to split compounds into train/valid/test")
        splitter = ScaffoldSplitter(verbosity=verbosity)
        frac_train, frac_valid, frac_test = .8, .1, .1
        train_dataset, valid_dataset, test_dataset = \
            splitter.train_valid_test_split(
                dataset, self.train_dir, self.valid_dir, self.test_dir,
                log_every_n=1000, frac_train=frac_train,
                frac_test=frac_test, frac_valid=frac_valid)
        # Do an approximate comparison since splits are sometimes slightly off from
        # the exact fraction.
        assert relative_difference(len(train_dataset),
                                   frac_train * len(dataset)) < 1e-3
        assert relative_difference(len(valid_dataset),
                                   frac_valid * len(dataset)) < 1e-3
        assert relative_difference(len(test_dataset),
                                   frac_test * len(dataset)) < 1e-3

        # TODO(rbharath): Transformers don't play nice with reload! Namely,
        # reloading will cause the transform to be reapplied. This is undesirable in
        # almost all cases. Need to understand a method to fix this.
        transformers = [
            BalancingTransformer(transform_w=True, dataset=train_dataset)
        ]
        print("Transforming datasets")
        for dataset in [train_dataset, valid_dataset, test_dataset]:
            for transformer in transformers:
                transformer.transform(dataset)

        return (len(train_dataset), len(valid_dataset), len(test_dataset))
Example #35
def _get_fields(input_file):
    """Get the names of fields and field_types for input data."""
    # If CSV input, assume that first row contains labels
    input_type = _get_input_type(input_file)
    if input_type == "csv":
        with open(input_file, "r") as inp_file_obj:
            return next(csv.reader(inp_file_obj))
    elif input_type == "pandas-joblib":
        df = load_from_disk(input_file)
        return df.keys()
    elif input_type == "pandas-pickle":
        df = load_pickle_from_disk(input_file)
        return df.keys()
    else:
        raise ValueError("Unrecognized extension for %s" % input_file)
Example #36
    def transform_row(self, i, df, data_dir):
        """
    Randomly permute a Coulomb Matrix in a dataset
    """
        row = df.iloc[i]
        if self.transform_X:
            X = load_from_disk(os.path.join(data_dir, row['X-transformed']))
            for j in range(len(X)):
                cm = self.construct_cm_from_triu(X[j])
                X[j] = self.unpad_randomize_and_flatten(cm)
            save_to_disk(X, os.path.join(data_dir, row['X-transformed']))

        if self.transform_y:
            print("y will not be transformed by "
                  "CoulombRandomizationTransformer.")
Example #37
  def transform(self, dataset, parallel=False):

    super(CoulombBinarizationTransformer, self).transform(dataset,
          parallel=parallel)

    df = dataset.metadata_df
    Xt = []

    for _, row in df.iterrows():
      X_t = load_from_disk(os.path.join(dataset.data_dir, row['X-transformed']))
      Xt.append(np.array(X_t))

    X = np.vstack(Xt)
    X_means = X.mean(axis=0)
    X_stds = (X-X_means).std()

    for i, row in df.iterrows():
      X_t = (Xt[i]-X_means)/X_stds
      save_to_disk(X_t, os.path.join(dataset.data_dir, row['X-transformed']))
Example #38
    def _get_compounds(self):
        """
    Create dataframe containing metadata about compounds.
    """
        compound_rows = []
        for dataset_file in self.dataset_files:
            df = load_from_disk(dataset_file)
            compound_ids = list(df["mol_id"])
            smiles = list(df["smiles"])
            if "split" in df.keys():
                splits = list(df["split"])
            else:
                splits = [None] * len(smiles)
            compound_rows += [
                list(elt) for elt in zip(compound_ids, smiles, splits)
            ]

        compounds_df = pd.DataFrame(compound_rows,
                                    columns=("mol_id", "smiles", "split"))
        return compounds_df
Example #39
  def load_metadata(self):
    try:
      tasks_filename, metadata_filename = self._get_metadata_filename()
      with open(tasks_filename) as fin:
        tasks = json.load(fin)
      metadata_df = pd.read_csv(metadata_filename, compression='gzip')
      metadata_df = metadata_df.where((pd.notnull(metadata_df)), None)
      return tasks, metadata_df
    except Exception as e:
      pass

    # Load obsolete format -> save in new format
    metadata_filename = os.path.join(self.data_dir, "metadata.joblib")
    if os.path.exists(metadata_filename):
      tasks, metadata_df = load_from_disk(metadata_filename)
      del metadata_df['task_names']
      del metadata_df['basename']
      save_metadata(tasks, metadata_df, self.data_dir)
      return tasks, metadata_df
    raise ValueError("No Metadata Found On Disk")
Example #40
  def transform_row(self, i, df, data_dir):
    """
    Binarizes data in dataset with sigmoid function
    """
    row = df.iloc[i]
    X_bin = []
    if self.update_state: 
      self.set_max(df, data_dir)
      self.update_state = False
    if self.transform_X:
      X = load_from_disk(os.path.join(data_dir, row['X-transformed']))
      for i in range(X.shape[1]):
        for k in np.arange(0,self.feature_max[i]+self.theta,self.theta):
          X_bin += [np.tanh((X[:,i]-k)/self.theta)]

      X_bin = np.array(X_bin).T
      save_to_disk(X_bin, os.path.join(data_dir, row['X-transformed']))

    if self.transform_y:
      print("y will not be transformed by "
            "CoulombBinarizationTransformer.")
Example #41
  def __init__(self, data_dir=None, tasks=[], metadata_rows=None, #featurizers=None, 
               raw_data=None, verbosity=None, reload=False,
               compute_feature_statistics=True):
    """
    Turns featurized dataframes into numpy files, writes them & metadata to disk.
    """
    if not os.path.exists(data_dir):
      os.makedirs(data_dir)
    self.data_dir = data_dir
    assert verbosity in [None, "low", "high"]
    self.verbosity = verbosity

    if not reload or not os.path.exists(self._get_metadata_filename()):
      if metadata_rows is not None:
        self.metadata_df = DiskDataset.construct_metadata(metadata_rows)
        self.save_to_disk()
      elif raw_data is not None:
        metadata_rows = []
        ids, X, y, w = raw_data
        metadata_rows.append(
            DiskDataset.write_data_to_disk(
                self.data_dir, "data", tasks, X, y, w, ids,
                compute_feature_statistics=compute_feature_statistics))
        self.metadata_df = DiskDataset.construct_metadata(metadata_rows)
        self.save_to_disk()
      else:
        # Create an empty metadata dataframe to be filled at a later time
        basename = "metadata"
        metadata_rows = [DiskDataset.write_data_to_disk(
            self.data_dir, basename, tasks)]
        self.metadata_df = DiskDataset.construct_metadata(metadata_rows)
        self.save_to_disk()

    else:
      log("Loading pre-existing metadata file.", self.verbosity)
      if os.path.exists(self._get_metadata_filename()):
        self.metadata_df = load_from_disk(self._get_metadata_filename())
      else:
        raise ValueError("No metadata found.")
Example #42
 def reload(self):
     """Loads sklearn model from joblib file on disk."""
     self.model_instance = load_from_disk(
         Model.get_model_filename(self.model_dir))
base_dir = "/scratch/users/rbharath/tox21_analysis"
if not os.path.exists(base_dir):
    os.makedirs(base_dir)

current_dir = os.path.dirname(os.path.realpath(__file__))
#Make directories to store the raw and featurized datasets.
data_dir = os.path.join(base_dir, "dataset")
train_dir = os.path.join(base_dir, "train_dataset")
valid_dir = os.path.join(base_dir, "valid_dataset")
test_dir = os.path.join(base_dir, "test_dataset")
model_dir = os.path.join(base_dir, "model")

# Load Tox21 dataset
print("About to load Tox21 dataset.")
dataset_file = os.path.join(current_dir, "../../datasets/tox21.csv.gz")
dataset = load_from_disk(dataset_file)
print("Columns of dataset: %s" % str(dataset.columns.values))
print("Number of examples in dataset: %s" % str(dataset.shape[0]))

tox21_tasks, tox21_dataset, transformers = load_tox21(data_dir, reload=reload)
num_train = 7200
X, y, w, ids = tox21_dataset.to_numpy()
X_train, X_valid = X[:num_train], X[num_train:]
y_train, y_valid = y[:num_train], y[num_train:]
w_train, w_valid = w[:num_train], w[num_train:]
ids_train, ids_valid = ids[:num_train], ids[num_train:]

# Not sure if we need to constantly delete these directories...
if os.path.exists(train_dir):
    shutil.rmtree(train_dir)
train_dataset = Dataset.from_numpy(train_dir,
Example #44
def load_bace(mode="regression", transform=True, split="20-80"):
    """Load BACE-1 dataset as regression/classification problem."""
    reload = True
    verbosity = "high"
    regen = False
    assert split in ["20-80", "80-20"]

    current_dir = os.path.dirname(os.path.realpath(__file__))
    if split == "20-80":
        dataset_file = os.path.join(current_dir,
                                    "../../datasets/desc_canvas_aug30.csv")
    elif split == "80-20":
        dataset_file = os.path.join(current_dir,
                                    "../../datasets/rev8020split_desc.csv")
    dataset = load_from_disk(dataset_file)
    num_display = 10
    pretty_columns = ("[" + ",".join(
        ["'%s'" % column
         for column in dataset.columns.values[:num_display]]) + ",...]")

    crystal_dataset_file = os.path.join(
        current_dir, "../../datasets/crystal_desc_canvas_aug30.csv")
    crystal_dataset = load_from_disk(crystal_dataset_file)

    print("Columns of dataset: %s" % pretty_columns)
    print("Number of examples in dataset: %s" % str(dataset.shape[0]))
    print("Number of examples in crystal dataset: %s" %
          str(crystal_dataset.shape[0]))

    #Make directories to store the raw and featurized datasets.
    base_dir = tempfile.mkdtemp()
    data_dir = os.path.join(base_dir, "dataset")
    train_dir = os.path.join(base_dir, "train_dataset")
    valid_dir = os.path.join(base_dir, "valid_dataset")
    test_dir = os.path.join(base_dir, "test_dataset")
    model_dir = os.path.join(base_dir, "model")
    crystal_dir = os.path.join(base_dir, "crystal")

    if mode == "regression":
        bace_tasks = ["pIC50"]
    elif mode == "classification":
        bace_tasks = ["Class"]
    else:
        raise ValueError("Unknown mode %s" % mode)
    featurizer = UserDefinedFeaturizer(user_specified_features)
    loader = DataLoader(tasks=bace_tasks,
                        smiles_field="mol",
                        id_field="CID",
                        featurizer=featurizer)
    if not reload or not os.path.exists(data_dir):
        dataset = loader.featurize(dataset_file, data_dir)
        regen = True
    else:
        dataset = Dataset(data_dir, reload=True)
    if not reload or not os.path.exists(crystal_dir):
        crystal_dataset = loader.featurize(crystal_dataset_file, crystal_dir)
    else:
        crystal_dataset = Dataset(crystal_dir, reload=True)

    if (not reload or not os.path.exists(train_dir)
            or not os.path.exists(valid_dir) or not os.path.exists(test_dir)):
        regen = True
        splitter = SpecifiedSplitter(dataset_file,
                                     "Model",
                                     verbosity=verbosity)
        train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
            dataset, train_dir, valid_dir, test_dir)
    else:
        train_dataset = Dataset(train_dir, reload=True)
        valid_dataset = Dataset(valid_dir, reload=True)
        test_dataset = Dataset(test_dir, reload=True)

    #NOTE THE RENAMING:
    if split == "20-80":
        valid_dataset, test_dataset = test_dataset, valid_dataset
    print("Number of compounds in train set")
    print(len(train_dataset))
    print("Number of compounds in validation set")
    print(len(valid_dataset))
    print("Number of compounds in test set")
    print(len(test_dataset))
    print("Number of compounds in crystal set")
    print(len(crystal_dataset))

    if transform and regen:
        input_transformers = [
            NormalizationTransformer(transform_X=True, dataset=train_dataset),
            ClippingTransformer(transform_X=True, dataset=train_dataset)
        ]
        output_transformers = []
        if mode == "regression":
            output_transformers = [
                NormalizationTransformer(transform_y=True,
                                         dataset=train_dataset)
            ]
        else:
            output_transformers = []
    else:
        input_transformers, output_transformers = [], []

    transformers = input_transformers + output_transformers
    for dataset in [
            train_dataset, valid_dataset, test_dataset, crystal_dataset
    ]:
        for transformer in transformers:
            dataset = transformer.transform(dataset)

    return (bace_tasks, train_dataset, valid_dataset, test_dataset,
            crystal_dataset, output_transformers)
Example #45
def load_sweet(base_dir, reload=True, frac_train=.8):
    """Load sweet datasets. Does not do train/test split"""
    # Set some global variables up top
    reload = True
    verbosity = "high"
    model = "logistic"
    regen = False

    # Create some directories for analysis
    # The base_dir holds the results of all analysis
    if not reload:
        if os.path.exists(base_dir):
            shutil.rmtree(base_dir)
    if not os.path.exists(base_dir):
        os.makedirs(base_dir)
    current_dir = os.path.dirname(os.path.realpath(__file__))
    #Make directories to store the raw and featurized datasets.
    data_dir = os.path.join(base_dir, "dataset")
    train_dir = os.path.join(base_dir, "train_dataset")
    valid_dir = os.path.join(base_dir, "valid_dataset")

    # Load SWEET dataset
    print("About to load SWEET dataset.")
    dataset_file = os.path.join(current_dir, "./sweet.csv.gz")
    dataset = load_from_disk(dataset_file)
    print("Columns of dataset: %s" % str(dataset.columns.values))
    print("Number of examples in dataset: %s" % str(dataset.shape[0]))

    # Featurize SWEET dataset
    print("About to featurize SWEET dataset.")
    featurizer = CircularFingerprint(size=1024)
    SWEET_tasks = dataset.columns.values[1:].tolist()

    loader = DataLoader(tasks=SWEET_tasks,
                        smiles_field="smiles",
                        featurizer=featurizer,
                        verbosity=verbosity)
    if not reload or not os.path.exists(data_dir):
        dataset = loader.featurize(dataset_file, data_dir)
        regen = True
    else:
        dataset = DiskDataset(data_dir, reload=True)

    # Initialize transformers
    transformers = [BalancingTransformer(transform_w=True, dataset=dataset)]
    if regen:
        print("About to transform data")
        for transformer in transformers:
            dataset = transformer.transform(dataset)

    X, y, w, ids = (dataset.X, dataset.y, dataset.w, dataset.ids)
    num_tasks = 17
    num_train = int(frac_train * len(dataset))
    SWEET_tasks = SWEET_tasks[:num_tasks]
    print("Using following tasks")
    print(SWEET_tasks)
    X_train, X_valid = X[:num_train], X[num_train:]
    y_train, y_valid = y[:num_train, :num_tasks], y[num_train:, :num_tasks]
    w_train, w_valid = w[:num_train, :num_tasks], w[num_train:, :num_tasks]
    ids_train, ids_valid = ids[:num_train], ids[num_train:]

    train_dataset = DiskDataset.from_numpy(train_dir, X_train, y_train,
                                           w_train, ids_train, SWEET_tasks)
    valid_dataset = DiskDataset.from_numpy(valid_dir, X_valid, y_valid,
                                           w_valid, ids_valid, SWEET_tasks)

    return SWEET_tasks, (train_dataset, valid_dataset), transformers
Example #46
def load_pcba(base_dir, reload=True):
  """Load PCBA datasets. Does not do train/test split"""
  # Set some global variables up top
  reload = True
  verbosity = "high"
  regen = False

  # Create some directories for analysis
  # The base_dir holds the results of all analysis
  if not reload:
    if os.path.exists(base_dir):
      shutil.rmtree(base_dir)
  if not os.path.exists(base_dir):
    os.makedirs(base_dir)
  current_dir = os.path.dirname(os.path.realpath(__file__))
  #Make directories to store the raw and featurized datasets.
  data_dir = os.path.join(base_dir, "dataset")

  # Load PCBA dataset
  print("About to load PCBA dataset.")
  dataset_file = os.path.join(
      current_dir, "../../datasets/pcba.csv.gz")
  dataset = load_from_disk(dataset_file)
  print("Columns of dataset: %s" % str(dataset.columns.values))
  print("Number of examples in dataset: %s" % str(dataset.shape[0]))

  # Featurize PCBA dataset
  print("About to featurize PCBA dataset.")
  featurizer = CircularFingerprint(size=1024)
  all_PCBA_tasks = [
      'PCBA-1030','PCBA-1379','PCBA-1452','PCBA-1454','PCBA-1457',
      'PCBA-1458','PCBA-1460','PCBA-1461','PCBA-1468','PCBA-1469',
      'PCBA-1471','PCBA-1479','PCBA-1631','PCBA-1634','PCBA-1688',
      'PCBA-1721','PCBA-2100','PCBA-2101','PCBA-2147','PCBA-2242',
      'PCBA-2326','PCBA-2451','PCBA-2517','PCBA-2528','PCBA-2546',
      'PCBA-2549','PCBA-2551','PCBA-2662','PCBA-2675','PCBA-2676',
      'PCBA-411','PCBA-463254','PCBA-485281','PCBA-485290','PCBA-485294',
      'PCBA-485297','PCBA-485313','PCBA-485314','PCBA-485341','PCBA-485349',
      'PCBA-485353','PCBA-485360','PCBA-485364','PCBA-485367','PCBA-492947',
      'PCBA-493208','PCBA-504327','PCBA-504332','PCBA-504333','PCBA-504339',
      'PCBA-504444','PCBA-504466','PCBA-504467','PCBA-504706','PCBA-504842',
      'PCBA-504845','PCBA-504847','PCBA-504891','PCBA-540276','PCBA-540317',
      'PCBA-588342','PCBA-588453','PCBA-588456','PCBA-588579','PCBA-588590',
      'PCBA-588591','PCBA-588795','PCBA-588855','PCBA-602179','PCBA-602233',
      'PCBA-602310','PCBA-602313','PCBA-602332','PCBA-624170','PCBA-624171',
      'PCBA-624173','PCBA-624202','PCBA-624246','PCBA-624287','PCBA-624288',
      'PCBA-624291','PCBA-624296','PCBA-624297','PCBA-624417','PCBA-651635',
      'PCBA-651644','PCBA-651768','PCBA-651965','PCBA-652025','PCBA-652104',
      'PCBA-652105','PCBA-652106','PCBA-686970','PCBA-686978','PCBA-686979',
      'PCBA-720504','PCBA-720532','PCBA-720542','PCBA-720551','PCBA-720553',
      'PCBA-720579','PCBA-720580','PCBA-720707','PCBA-720708','PCBA-720709',
      'PCBA-720711','PCBA-743255','PCBA-743266','PCBA-875','PCBA-881',
      'PCBA-883','PCBA-884','PCBA-885','PCBA-887','PCBA-891','PCBA-899',
      'PCBA-902','PCBA-903','PCBA-904','PCBA-912','PCBA-914','PCBA-915',
      'PCBA-924','PCBA-925','PCBA-926','PCBA-927','PCBA-938','PCBA-995']

  loader = DataLoader(tasks=all_PCBA_tasks,
                      smiles_field="smiles",
                      featurizer=featurizer,
                      verbosity=verbosity)
  if not reload or not os.path.exists(data_dir):
    dataset = loader.featurize(dataset_file, data_dir)
    regen = True
  else:
    dataset = Dataset(data_dir, reload=True)

  # Initialize transformers 
  transformers = [
      BalancingTransformer(transform_w=True, dataset=dataset)]

  if regen:
    print("About to transform data")
    for transformer in transformers:
        transformer.transform(dataset)
  
  return all_PCBA_tasks, dataset, transformers
Example #47
    def set_max(self, df, data_dir):

        for _, row in df.iterrows():
            X = load_from_disk(os.path.join(data_dir, row['X-transformed']))
            self.feature_max = np.maximum(self.feature_max, X.max(axis=0))
Example #48
def load_tox21(base_dir, reload=True, num_train=7200):
    """Load Tox21 datasets. Does not do train/test split"""
    # Set some global variables up top
    reload = True
    verbosity = "high"

    # Create some directories for analysis
    # The base_dir holds the results of all analysis
    if not reload:
        if os.path.exists(base_dir):
            shutil.rmtree(base_dir)
    if not os.path.exists(base_dir):
        os.makedirs(base_dir)
    current_dir = os.path.dirname(os.path.realpath(__file__))
    #Make directories to store the raw and featurized datasets.
    data_dir = os.path.join(base_dir, "dataset")
    train_dir = os.path.join(base_dir, "train")
    valid_dir = os.path.join(base_dir, "valid")

    # Load Tox21 dataset
    print("About to load Tox21 dataset.")
    dataset_file = os.path.join(current_dir, "../../datasets/tox21.csv.gz")
    dataset = load_from_disk(dataset_file)
    print("Columns of dataset: %s" % str(dataset.columns.values))
    print("Number of examples in dataset: %s" % str(dataset.shape[0]))

    # Featurize Tox21 dataset
    print("About to featurize Tox21 dataset.")
    featurizer = CircularFingerprint(size=1024)
    tox21_tasks = [
        'NR-AR', 'NR-AR-LBD', 'NR-AhR', 'NR-Aromatase', 'NR-ER', 'NR-ER-LBD',
        'NR-PPAR-gamma', 'SR-ARE', 'SR-ATAD5', 'SR-HSE', 'SR-MMP', 'SR-p53'
    ]

    if not reload or not os.path.exists(data_dir):
        loader = DataLoader(tasks=tox21_tasks,
                            smiles_field="smiles",
                            featurizer=featurizer,
                            verbosity=verbosity)
        dataset = loader.featurize(dataset_file, data_dir, shard_size=8192)
    else:
        dataset = DiskDataset(data_dir, tox21_tasks, reload=True)

    # Initialize transformers
    transformers = [BalancingTransformer(transform_w=True, dataset=dataset)]
    if not reload:
        print("About to transform data")
        for transformer in transformers:
            transformer.transform(dataset)

    X, y, w, ids = (dataset.X, dataset.y, dataset.w, dataset.ids)
    X_train, X_valid = X[:num_train], X[num_train:]
    y_train, y_valid = y[:num_train], y[num_train:]
    w_train, w_valid = w[:num_train], w[num_train:]
    ids_train, ids_valid = ids[:num_train], ids[num_train:]

    train_dataset = DiskDataset.from_numpy(train_dir, X_train, y_train,
                                           w_train, ids_train, tox21_tasks)
    valid_dataset = DiskDataset.from_numpy(valid_dir, X_valid, y_valid,
                                           w_valid, ids_valid, tox21_tasks)

    return tox21_tasks, (train_dataset, valid_dataset), transformers
Example #49
    def test_singletask_matches_multitask_load(self):
        """Check that singletask load and multitask load of dataset are same."""
        # Only for debug!
        np.random.seed(123)

        # Set some global variables up top
        reload = True
        verbosity = "high"

        base_dir = tempfile.mkdtemp()

        current_dir = os.path.dirname(os.path.realpath(__file__))
        #Make directories to store the raw and featurized datasets.
        data_dir = os.path.join(base_dir, "dataset")
        train_dir = os.path.join(base_dir, "train_dataset")
        valid_dir = os.path.join(base_dir, "valid_dataset")
        test_dir = os.path.join(base_dir, "test_dataset")
        model_dir = os.path.join(base_dir, "model")

        # Load dataset
        print("About to load dataset.")
        dataset_file = os.path.join(
            current_dir, "../../models/tests/multitask_example.csv")
        dataset = load_from_disk(dataset_file)
        print("Columns of dataset: %s" % str(dataset.columns.values))
        print("Number of examples in dataset: %s" % str(dataset.shape[0]))

        # Featurize tox21 dataset
        print("About to featurize dataset.")
        featurizer = CircularFingerprint(size=1024)
        all_tasks = ["task%d" % i for i in range(17)]
        # For debugging purposes
        n_tasks = 17
        tasks = all_tasks[0:n_tasks]

        ####### Do multitask load
        loader = DataLoader(tasks=tasks,
                            smiles_field="smiles",
                            featurizer=featurizer,
                            verbosity=verbosity)
        dataset = loader.featurize(dataset_file, data_dir)

        # Do train/valid split.
        X_multi, y_multi, w_multi, ids_multi = (dataset.X, dataset.y,
                                                dataset.w, dataset.ids)

        ####### Do singletask load
        y_tasks, w_tasks, ids_tasks = [], [], []
        for task in tasks:
            print("Processing task %s" % task)
            if os.path.exists(data_dir):
                shutil.rmtree(data_dir)
            loader = DataLoader(tasks=[task],
                                smiles_field="smiles",
                                featurizer=featurizer,
                                verbosity=verbosity)
            dataset = loader.featurize(dataset_file, data_dir)

            X_task, y_task, w_task, ids_task = (dataset.X, dataset.y,
                                                dataset.w, dataset.ids)
            y_tasks.append(y_task)
            w_tasks.append(w_task)
            ids_tasks.append(ids_task)

        ################## Do comparison
        for ind, task in enumerate(tasks):
            y_multi_task = y_multi[:, ind]
            w_multi_task = w_multi[:, ind]

            y_task = y_tasks[ind]
            w_task = w_tasks[ind]
            ids_task = ids_tasks[ind]

            np.testing.assert_allclose(y_multi_task.flatten(),
                                       y_task.flatten())
            np.testing.assert_allclose(w_multi_task.flatten(),
                                       w_task.flatten())
        shutil.rmtree(base_dir)
Example #50
    def test_multiload(self):
        """Check can re-use featurization for multiple task selections.

    TODO(rbharath): This test seems silly after the recent round of
                    refactoring. Can it be removed?
    """
        # Only for debug!
        np.random.seed(123)

        # Set some global variables up top
        reload = True
        verbosity = "high"

        current_dir = os.path.dirname(os.path.realpath(__file__))
        #Make directories to store the raw and featurized datasets.
        data_dir = os.path.join(self.base_dir, "dataset")
        train_dir = os.path.join(self.base_dir, "train_dataset")
        valid_dir = os.path.join(self.base_dir, "valid_dataset")
        test_dir = os.path.join(self.base_dir, "test_dataset")
        model_dir = os.path.join(self.base_dir, "model")

        # Load dataset
        print("About to load dataset.")
        dataset_file = os.path.join(
            current_dir, "../../models/tests/multitask_example.csv")
        dataset = load_from_disk(dataset_file)
        print("Columns of dataset: %s" % str(dataset.columns.values))
        print("Number of examples in dataset: %s" % str(dataset.shape[0]))

        # Featurize tox21 dataset
        print("About to featurize dataset.")
        featurizer = CircularFingerprint(size=1024)
        all_tasks = ["task%d" % i for i in range(17)]

        ####### Do featurization
        loader = DataLoader(tasks=all_tasks,
                            smiles_field="smiles",
                            featurizer=featurizer,
                            verbosity=verbosity)
        dataset = loader.featurize(dataset_file, data_dir)

        # Do train/valid split.
        X_multi, y_multi, w_multi, ids_multi = (dataset.X, dataset.y,
                                                dataset.w, dataset.ids)

        ####### Do singletask load
        y_tasks, w_tasks, = [], []
        for ind, task in enumerate(all_tasks):
            print("Processing task %s" % task)
            dataset = DiskDataset(data_dir, verbosity=verbosity, reload=reload)

            X_task, y_task, w_task, ids_task = (dataset.X, dataset.y,
                                                dataset.w, dataset.ids)
            y_tasks.append(y_task[:, ind])
            w_tasks.append(w_task[:, ind])

        ################## Do comparison
        for ind, task in enumerate(all_tasks):
            y_multi_task = y_multi[:, ind]
            w_multi_task = w_multi[:, ind]

            y_task = y_tasks[ind]
            w_task = w_tasks[ind]

            np.testing.assert_allclose(y_multi_task.flatten(),
                                       y_task.flatten())
            np.testing.assert_allclose(w_multi_task.flatten(),
                                       w_task.flatten())
Example #51

# In this program, we analyze the BACE enzyme and build machine learning models for predicting the Ki of ligands to the protein.
# We will use the deepchem library to load this data into memory, split into train/test/validation folds, build and cross-validate models, and report statistics.

import os
import sys
import deepchem as dc
from deepchem.utils.save import load_from_disk

current_dir = os.path.dirname(os.path.realpath("__file__"))
dc.utils.download_url(
    "https://s3-us-west-1.amazonaws.com/deepchem.io/datasets/desc_canvas_aug30.csv",
    current_dir)
dataset_file = "desc_canvas_aug30.csv"
dataset = load_from_disk(dataset_file)
num_display = 10
pretty_columns = ("[" + ",".join(
    ["'%s'" % column
     for column in dataset.columns.values[:num_display]]) + ",...]")

dc.utils.download_url(
    "https://s3-us-west-1.amazonaws.com/deepchem.io/datasets/crystal_desc_canvas_aug30.csv",
    current_dir)
crystal_dataset_file = "crystal_desc_canvas_aug30.csv"
crystal_dataset = load_from_disk(crystal_dataset_file)

print("Columns of dataset: %s" % pretty_columns)
print("Number of examples in dataset: %s" % str(dataset.shape[0]))
print("Number of examples in crystal dataset: %s" %
      str(crystal_dataset.shape[0]))
Example #52
import pandas as pd
import deepchem as dc
import numpy as np
import tempfile
from rdkit import Chem
from rdkit.Chem import Draw
from itertools import islice
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from deepchem.utils.evaluate import Evaluator
import numpy.random
from deepchem.utils.save import load_from_disk

dataset_file = "datasets/delaney-processed.csv"
dataset = load_from_disk(dataset_file)
# The dataframe contains the columns ['Compound ID',
#    'ESOL predicted log solubility in mols per litre', 'Minimum Degree',
#    'Molecular Weight', 'Number of H-Bond Donors', 'Number of Rings',
#    'Number of Rotatable Bonds', 'Polar Surface Area',
#    'measured log solubility in mols per litre', 'smiles'].


# Commented out previously because it raised an error
def display_images(filenames):
    from IPython.display import HTML, display  # assumed: needed for notebook rendering
    imagesList = ''.join([
        "<img style='width: 140px; margin: 0px; float: left; border: 1px solid black;' src='%s' />"
        % str(s) for s in sorted(filenames)
    ])
    display(HTML(imagesList))