Example #1
0
 def test_smiles_to_image_long_molecule(self):
     """A SMILES string longer than max_len produces an empty feature array."""
     feat = SmilesToImage(img_size=80,
                          res=0.5,
                          max_len=250,
                          img_spec="std")
     out = feat.featurize(self.long_molecule_smiles)
     # One input molecule, zero features: the molecule was rejected.
     assert out.shape == (1, 0)
Example #2
0
 def test_smiles_to_image_with_max_len(self):
     """Test SmilesToImage with max_len."""
     # Sanity-check the fixture: one SMILES above and one at the cutoff.
     lengths = [len(smile) for smile in self.smiles]
     assert lengths == [26, 25]
     feat = SmilesToImage(max_len=25)
     out = feat.featurize(self.smiles)
     # The 26-char SMILES is dropped (empty), the 25-char one is imaged.
     assert out[0].shape == (0, )
     assert out[1].shape == (80, 80, 1)
Example #3
0
def get_dataset(mode="classification",
                featurizer="smiles2seq",
                max_seq_len=20,
                data_points=10,
                n_tasks=5):
    """Featurize the small ChEMBL25 CSV and build a test NumpyDataset.

    Parameters
    ----------
    mode: str, default "classification"
        "classification" gives random binary labels and a ROC-AUC metric;
        anything else gives random normal labels and an MAE metric.
    featurizer: str, default "smiles2seq"
        Either "smiles2seq" or "smiles2img".
    max_seq_len: int, default 20
        Number of leading sequence positions kept for "smiles2seq" features.
    data_points: int, default 10
        Number of samples kept from the featurized dataset.
    n_tasks: int, default 5
        Number of (randomly labeled) tasks.

    Returns
    -------
    (dataset, metric, char_to_idx) for "smiles2seq", otherwise
    (dataset, metric).

    Raises
    ------
    ValueError
        If ``featurizer`` is not one of the supported names.
    """
    dataset_file = os.path.join(os.path.dirname(__file__),
                                "chembl_25_small.csv")

    if featurizer == "smiles2seq":
        max_len = 250
        pad_len = 10
        char_to_idx = create_char_to_idx(dataset_file,
                                         max_len=max_len,
                                         smiles_field="smiles")
        feat = SmilesToSeq(char_to_idx=char_to_idx,
                           max_len=max_len,
                           pad_len=pad_len)

    elif featurizer == "smiles2img":
        img_size = 80
        img_spec = "engd"
        res = 0.5
        feat = SmilesToImage(img_size=img_size, img_spec=img_spec, res=res)

    else:
        # Fail fast with a clear message instead of hitting a NameError on
        # `feat` below; mirrors the error raised by load_chembl25.
        raise ValueError(
            "Featurizer of type {} is not supported".format(featurizer))

    loader = dc.data.CSVLoader(tasks=chembl25_tasks,
                               smiles_field='smiles',
                               featurizer=feat)
    dataset = loader.create_dataset(inputs=[dataset_file],
                                    shard_size=10000,
                                    data_dir=tempfile.mkdtemp())

    # Labels/weights are synthetic: tests only need shapes, not real targets.
    w = np.ones(shape=(data_points, n_tasks))

    if mode == 'classification':
        y = np.random.randint(0, 2, size=(data_points, n_tasks))
        metric = dc.metrics.Metric(dc.metrics.roc_auc_score,
                                   np.mean,
                                   mode="classification")
    else:
        y = np.random.normal(size=(data_points, n_tasks))
        metric = dc.metrics.Metric(dc.metrics.mean_absolute_error,
                                   mode="regression")

    # Truncate to `data_points` samples; sequence features are additionally
    # truncated along the sequence axis to `max_seq_len`.
    if featurizer == "smiles2seq":
        dataset = dc.data.NumpyDataset(dataset.X[:data_points, :max_seq_len],
                                       y, w, dataset.ids[:data_points])
    else:
        dataset = dc.data.NumpyDataset(dataset.X[:data_points], y, w,
                                       dataset.ids[:data_points])

    if featurizer == "smiles2seq":
        return dataset, metric, char_to_idx
    else:
        return dataset, metric
Example #4
0
 def test_smiles_to_image_with_img_spec(self):
     """Test SmilesToImage with img_spec."""
     default_features = SmilesToImage().featurize(self.smiles)
     engd_features = SmilesToImage(img_spec='engd').featurize(self.smiles)
     # "engd" images carry 4 channels instead of the default single channel,
     # and their content differs from the default featurization.
     assert engd_features.shape == (2, 80, 80, 4)
     assert not np.allclose(default_features, engd_features)
Example #5
0
 def test_smiles_to_image_with_res(self):
     """Test SmilesToImage with res."""
     default_features = SmilesToImage().featurize(self.smiles)
     coarse_features = SmilesToImage(res=0.6).featurize(self.smiles)
     # Changing resolution keeps the output shape but alters pixel values.
     assert coarse_features.shape == (2, 80, 80, 1)
     assert not np.allclose(default_features, coarse_features)
Example #6
0
 def test_smiles_to_image_with_image_size(self):
     """Test SmilesToImage with image_size."""
     # A custom img_size must be reflected in the spatial dimensions.
     out = SmilesToImage(img_size=100).featurize(self.smiles)
     assert out.shape == (2, 100, 100, 1)
Example #7
0
 def test_smiles_to_image(self):
     """Test default SmilesToImage."""
     # Defaults: 80x80 single-channel image per molecule.
     out = SmilesToImage().featurize(self.smiles)
     assert out.shape == (2, 80, 80, 1)
Example #8
0
def load_chembl25(featurizer="smiles2seq",
                  split="random",
                  data_dir=None,
                  save_dir=None,
                  split_seed=None,
                  reload=True,
                  transformer_type='minmax',
                  **kwargs):
  """Loads the ChEMBL25 dataset, featurizes it, and does a split.

  Parameters
  ----------
  featurizer: str, default smiles2seq
    Featurizer to use: "smiles2seq" or "smiles2img". Any other value
    raises ValueError.
  split: str, default random
    Splitter to use: "index", "random" or "scaffold". If None, the whole
    dataset is transformed and returned un-split.
  data_dir: str, default None
    Directory to download data to, or load dataset from. (TODO: If None, make tmp)
  save_dir: str, default None
    Directory to save the featurized dataset to. (TODO: If None, make tmp)
  split_seed: int, default None
    Seed to be used for splitting the dataset
  reload: bool, default True
    Whether to reload saved dataset
  transformer_type: str, default minmax
    Transformer to use: "minmax" selects MinMaxTransformer; any other
    value selects NormalizationTransformer. Both transform y only.

  Returns
  -------
  tuple
    ``(chembl25_tasks, (train, valid, test), transformers)``; when
    ``split`` is None the middle element is ``(dataset, None, None)``.
  """
  if data_dir is None:
    data_dir = DEFAULT_DIR
  if save_dir is None:
    save_dir = DEFAULT_DIR

  # Featurized data is cached per featurizer (and per img_spec for images)
  # so different configurations don't clobber each other's caches.
  save_folder = os.path.join(save_dir, "chembl_25-featurized", str(featurizer))
  if featurizer == "smiles2img":
    img_spec = kwargs.get("img_spec", "engd")
    save_folder = os.path.join(save_folder, img_spec)

  if reload:
    if not os.path.exists(save_folder):
      logger.warning(
          "{} does not exist. Reconstructing dataset.".format(save_folder))
    else:
      logger.info("{} exists. Restoring dataset.".format(save_folder))
      loaded, dataset, transformers = dc.utils.save.load_dataset_from_disk(
          save_folder)
      if loaded:
        # Cache hit: skip download, featurization, splitting and transforms.
        return chembl25_tasks, dataset, transformers

  dataset_file = os.path.join(data_dir, "chembl_25.csv.gz")

  if not os.path.exists(dataset_file):
    logger.warning("File {} not found. Downloading dataset. (~555 MB)".format(
        dataset_file))
    dc.utils.download_url(url=CHEMBL_URL, dest_dir=data_dir)

  # `featurizer` is rebound from a selector string to the actual featurizer
  # object below.
  if featurizer == "smiles2seq":
    max_len = kwargs.get('max_len', 250)
    pad_len = kwargs.get('pad_len', 10)
    char_to_idx = create_char_to_idx(
        dataset_file, max_len=max_len, smiles_field="smiles")
    featurizer = SmilesToSeq(
        char_to_idx=char_to_idx, max_len=max_len, pad_len=pad_len)

  elif featurizer == "smiles2img":
    img_size = kwargs.get("img_size", 80)
    img_spec = kwargs.get("img_spec", "engd")
    res = kwargs.get("res", 0.5)
    featurizer = SmilesToImage(img_size=img_size, img_spec=img_spec, res=res)

  else:
    raise ValueError(
        "Featurizer of type {} is not supported".format(featurizer))

  loader = dc.data.CSVLoader(
      tasks=chembl25_tasks, smiles_field='smiles', featurizer=featurizer)
  dataset = loader.featurize(
      input_files=[dataset_file], shard_size=10000, data_dir=save_folder)

  if split is None:
    # No split requested: transform the full dataset and return it alone.
    if transformer_type == "minmax":
      transformers = [
          dc.trans.MinMaxTransformer(
              transform_X=False, transform_y=True, dataset=dataset)
      ]
    else:
      transformers = [
          dc.trans.NormalizationTransformer(
              transform_X=False, transform_y=True, dataset=dataset)
      ]

    logger.info("Split is None, about to transform dataset.")
    for transformer in transformers:
      dataset = transformer.transform(dataset)
    return chembl25_tasks, (dataset, None, None), transformers

  splitters = {
      'index': dc.splits.IndexSplitter(),
      'random': dc.splits.RandomSplitter(),
      'scaffold': dc.splits.ScaffoldSplitter(),
  }

  logger.info("About to split data with {} splitter.".format(split))
  # NOTE(review): an unknown `split` raises a bare KeyError here.
  splitter = splitters[split]

  # Default split is 4/6 train, 1/6 valid, 1/6 test; overridable via kwargs.
  frac_train = kwargs.get('frac_train', 4 / 6)
  frac_valid = kwargs.get('frac_valid', 1 / 6)
  frac_test = kwargs.get('frac_test', 1 / 6)

  train, valid, test = splitter.train_valid_test_split(
      dataset,
      seed=split_seed,
      frac_train=frac_train,
      frac_test=frac_test,
      frac_valid=frac_valid)
  # Transformers are fit on the training split only, then applied to all
  # three splits to avoid leaking validation/test statistics.
  if transformer_type == "minmax":
    transformers = [
        dc.trans.MinMaxTransformer(
            transform_X=False, transform_y=True, dataset=train)
    ]
  else:
    transformers = [
        dc.trans.NormalizationTransformer(
            transform_X=False, transform_y=True, dataset=train)
    ]

  for transformer in transformers:
    train = transformer.transform(train)
    valid = transformer.transform(valid)
    test = transformer.transform(test)

  if reload:
    dc.utils.save.save_dataset_to_disk(save_folder, train, valid, test,
                                       transformers)

  return chembl25_tasks, (train, valid, test), transformers