Example #1
  def create_dataset(shard_generator, data_dir=None, tasks=[], verbose=True):
    """Creates a new DiskDataset

    Parameters
    ----------
    shard_generator: Iterable
      An iterable (either a list or generator) that provides tuples of data
      (X, y, w, ids). Each tuple will be written to a separate shard on disk.
    data_dir: str
      Path to the data directory. A temporary directory is created if none
      is specified.
    tasks: list
      List of tasks for this dataset.
    """
    if data_dir is None:
      data_dir = tempfile.mkdtemp()
    elif not os.path.exists(data_dir):
      os.makedirs(data_dir)

    metadata_rows = []
    time1 = time.time()
    for shard_num, (X, y, w, ids) in enumerate(shard_generator):
      basename = "shard-%d" % shard_num
      metadata_rows.append(
          DiskDataset.write_data_to_disk(data_dir, basename, tasks, X, y, w,
                                         ids))
    metadata_df = DiskDataset._construct_metadata(metadata_rows)
    save_metadata(tasks, metadata_df, data_dir)
    time2 = time.time()
    log("TIMING: dataset construction took %0.3f s" % (time2 - time1), verbose)
    return DiskDataset(data_dir, verbose=verbose)
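A minimal usage sketch for the factory above, assuming the public dc.data.DiskDataset entry point; shapes, task names, and ids are illustrative:

import numpy as np
import deepchem as dc

def shards():
  # Two illustrative shards of (X, y, w, ids) tuples.
  for shard in range(2):
    X = np.random.rand(10, 5)
    y = np.random.rand(10, 1)
    w = np.ones((10, 1))
    ids = np.array(["mol-%d-%d" % (shard, i) for i in range(10)])
    yield X, y, w, ids

dataset = dc.data.DiskDataset.create_dataset(shards(), tasks=["task0"])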
Example #2
  def _featurize_compounds(self, df, featurizer, parallel=True):    
    """Featurize individual compounds.

       Given a featurizer that operates on individual chemical compounds 
       or macromolecules, compute & add features for that compound to the 
       features dataframe
    """
    sample_smiles = df["smiles"].tolist()

    if not parallel:
      features = []
      for ind, smiles in enumerate(sample_smiles):
        if ind % self.log_every_n == 0:
          log("Featurizing sample %d" % ind, self.verbose)
        mol = Chem.MolFromSmiles(smiles)
        features.append(featurizer.featurize([mol]))
    else:
      def featurize_wrapper(smiles):
        mol = Chem.MolFromSmiles(smiles)
        return featurizer.featurize([mol])

      features = ProcessingPool(mp.cpu_count()).map(featurize_wrapper, 
                                                    sample_smiles)

    df[featurizer.__class__.__name__] = features
Example #3
    def create_dataset(shard_generator, data_dir=None, tasks=[], verbose=True):
        """Creates a new DiskDataset

    Parameters
    ----------
    shard_generator: Iterable
      An iterable (either a list or generator) that provides tuples of data
      (X, y, w, ids). Each tuple will be written to a separate shard on disk.
    data_dir: str
      Path to the data directory. A temporary directory is created if none
      is specified.
    tasks: list
      List of tasks for this dataset.
    """
        if data_dir is None:
            data_dir = tempfile.mkdtemp()
        elif not os.path.exists(data_dir):
            os.makedirs(data_dir)

        metadata_rows = []
        time1 = time.time()
        for shard_num, (X, y, w, ids) in enumerate(shard_generator):
            basename = "shard-%d" % shard_num
            metadata_rows.append(
                DiskDataset.write_data_to_disk(data_dir, basename, tasks, X, y,
                                               w, ids))
        metadata_df = DiskDataset._construct_metadata(metadata_rows)
        save_metadata(tasks, metadata_df, data_dir)
        time2 = time.time()
        log("TIMING: dataset construction took %0.3f s" % (time2 - time1),
            verbose)
        return DiskDataset(data_dir, verbose=verbose)
Example #4
    def _featurize_complex(self, ligand_ext, ligand_lines, protein_pdb_lines):
        tempdir = tempfile.mkdtemp()

        ############################################################## TIMING
        time1 = time.time()
        ############################################################## TIMING
        ligand_file = os.path.join(tempdir, "ligand.%s" % ligand_ext)
        with open(ligand_file, "w") as mol_f:
            mol_f.writelines(ligand_lines)
        ############################################################## TIMING
        time2 = time.time()
        log("TIMING: Writing ligand took %0.3f s" % (time2 - time1),
            self.verbose)
        ############################################################## TIMING

        ############################################################## TIMING
        time1 = time.time()
        ############################################################## TIMING
        protein_pdb_file = os.path.join(tempdir, "protein.pdb")
        with open(protein_pdb_file, "w") as protein_f:
            protein_f.writelines(protein_pdb_lines)
        ############################################################## TIMING
        time2 = time.time()
        log("TIMING: Writing protein took %0.3f s" % (time2 - time1),
            self.verbose)
        ############################################################## TIMING

        features_dict = self._transform(protein_pdb_file, ligand_file)
        shutil.rmtree(tempdir)
        return features_dict.values()
Example #5
def featurize_smiles_df(df, featurizer, field, log_every_N=1000, verbose=True):
    """Featurize individual compounds in dataframe.

  Given a featurizer that operates on individual chemical compounds
  or macromolecules, compute & add features for that compound to the
  features dataframe
  """
    sample_elems = df[field].tolist()

    features = []
    for ind, elem in enumerate(sample_elems):
        mol = Chem.MolFromSmiles(elem)
        # TODO (ytz) this is a bandage solution to reorder the atoms so
        # that they're always in the same canonical order. Presumably this
        # should be correctly implemented in the future for graph mols.
        if mol:
            new_order = rdmolfiles.CanonicalRankAtoms(mol)
            mol = rdmolops.RenumberAtoms(mol, new_order)
        if ind % log_every_N == 0:
            log("Featurizing sample %d" % ind, verbose)
        features.append(featurizer.featurize([mol]))
    valid_inds = np.array([1 if elt.size > 0 else 0 for elt in features],
                          dtype=bool)
    features = [
        elt for (is_valid, elt) in zip(valid_inds, features) if is_valid
    ]
    return np.squeeze(np.array(features), axis=1), valid_inds
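A hypothetical call to the helper above with a circular-fingerprint featurizer; the bad SMILES is dropped and flagged through valid_inds:

import pandas as pd
import deepchem as dc

df = pd.DataFrame({"smiles": ["CCO", "c1ccccc1", "not-a-smiles"]})
featurizer = dc.feat.CircularFingerprint(size=1024)
features, valid_inds = featurize_smiles_df(df, featurizer, field="smiles")
# features holds one row per valid molecule, e.g. shape (2, 1024).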
Example #6
 def write_dataframe(val, data_dir, featurizer=None, tasks=None,
                     raw_data=None, basename=None, mol_id_field="mol_id",
                     verbosity=None, compute_feature_statistics=None):
   """Writes data from dataframe to disk."""
   if featurizer is not None and tasks is not None:
     feature_type = featurizer.__class__.__name__
     (basename, df) = val
     # TODO(rbharath): This is a hack. clean up.
     if not len(df):
       return None
     if compute_feature_statistics is None:
       if hasattr(featurizer, "dtype"):
         dtype = featurizer.dtype
         compute_feature_statistics = False
       else:
         dtype = float
         compute_feature_statistics = True
     ############################################################## TIMING
     time1 = time.time()
     ############################################################## TIMING
     ids, X, y, w = convert_df_to_numpy(df, feature_type, tasks, mol_id_field,
                                        dtype, verbosity)
     ############################################################## TIMING
     time2 = time.time()
     log("TIMING: convert_df_to_numpy took %0.3f s" % (time2-time1), verbosity)
     ############################################################## TIMING
   else:
     ids, X, y, w = raw_data
     basename = ""
     assert X.shape[0] == y.shape[0]
     assert y.shape == w.shape
     assert len(ids) == X.shape[0]
   return DiskDataset.write_data_to_disk(
       data_dir, basename, tasks, X, y, w, ids,
       compute_feature_statistics=compute_feature_statistics)
Example #7
def featurize_smiles_np(arr, featurizer, log_every_N=1000, verbose=True):
    """Featurize individual compounds in a numpy array.

  Given a featurizer that operates on individual chemical compounds
  or macromolecules, compute & add features for that compound to the
  features array
  """
    features = []
    from rdkit import Chem
    from rdkit.Chem import rdmolfiles
    from rdkit.Chem import rdmolops
    for ind, elem in enumerate(arr.tolist()):
        mol = Chem.MolFromSmiles(elem)
        if mol:
            new_order = rdmolfiles.CanonicalRankAtoms(mol)
            mol = rdmolops.RenumberAtoms(mol, new_order)
        if ind % log_every_N == 0:
            log("Featurizing sample %d" % ind, verbose)
        features.append(featurizer.featurize([mol]))

    valid_inds = np.array([1 if elt.size > 0 else 0 for elt in features],
                          dtype=bool)
    features = [
        elt for (is_valid, elt) in zip(valid_inds, features) if is_valid
    ]
    features = np.squeeze(np.array(features))
    return features.reshape(-1, )
Example #8
        def shard_generator():
            for shard_num, shard in enumerate(
                    self.get_shards(input_files, shard_size)):
                time1 = time.time()
                X, valid_inds = self.featurize_shard(shard)
                ids = shard[self.id_field].values
                ids = ids[valid_inds]
                if len(self.tasks) > 0:
                    # Featurize task results iff they exist.
                    y, w = convert_df_to_numpy(shard, self.tasks,
                                               self.id_field)
                    # Filter out examples where featurization failed.
                    y, w = (y[valid_inds], w[valid_inds])
                    assert len(X) == len(ids) == len(y) == len(w)
                else:
                    # For prospective data where results are unknown, it makes
                    # no sense to have y values or weights.
                    y, w = (None, None)
                    assert len(X) == len(ids)

                time2 = time.time()
                log(
                    "TIMING: featurizing shard %d took %0.3f s" %
                    (shard_num, time2 - time1), self.verbose)
                yield X, y, w, ids
Example #9
  def compute_model_performance(self, metrics, csv_out=None, stats_out=None,
                                threshold=None):
    """
    Computes statistics of model on test data and saves results to csv.
    """
    y = self.dataset.get_labels()
    y = undo_transforms(y, self.transformers)
    w = self.dataset.get_weights()

    if not len(metrics):
      return {}
    else:
      mode = metrics[0].mode
    if mode == "classification":
      y_pred = self.model.predict_proba(self.dataset, self.transformers)
      y_pred_print = self.model.predict(self.dataset, self.transformers).astype(int)
    else:
      y_pred = self.model.predict(self.dataset, self.transformers)
      y_pred_print = y_pred
    multitask_scores = {}

    if csv_out is not None:
      log("Saving predictions to %s" % csv_out, self.verbosity)
      self.output_predictions(y_pred_print, csv_out)

    # Compute multitask metrics
    for metric in metrics:
      multitask_scores[metric.name] = metric.compute_metric(y, y_pred, w)
    
    if stats_out is not None:
      log("Saving stats to %s" % stats_out, self.verbosity)
      self.output_statistics(multitask_scores, stats_out)
  
    return multitask_scores
Example #10
    def featurize_complexes(self,
                            mol_pdbs,
                            protein_pdbs,
                            verbose=True,
                            log_every_n=1000):
        """
    Calculate features for mol/protein complexes.

    Parameters
    ----------
    mol_pdbs: list
      List of PDBs for molecules. Each PDB should be a list of lines of the
      PDB file.
    protein_pdbs: list
      List of PDBs for proteins. Each PDB should be a list of lines of the
      PDB file.
    """
        features = []
        for i, (mol_pdb, protein_pdb) in enumerate(zip(mol_pdbs,
                                                       protein_pdbs)):
            if verbose and i % log_every_n == 0:
                log("Featurizing %d / %d" % (i, len(mol_pdbs)))
            features.append(self._featurize_complex(mol_pdb, protein_pdb))
        features = np.asarray(features)
        return features
Example #11
    def featurize_complexes(self, mol_files, protein_pdbs, log_every_n=1000):
        """
    Calculate features for mol/protein complexes.

    Parameters
    ----------
    mols: list
      List of PDB filenames for molecules.
    protein_pdbs: list
      List of PDB filenames for proteins.
    """
        features = []
        for i, (mol_file,
                protein_pdb) in enumerate(zip(mol_files, protein_pdbs)):
            if i % log_every_n == 0:
                log("Featurizing %d / %d" % (i, len(mol_files)))
            ligand_ext = get_ligand_filetype(mol_file)
            with open(mol_file) as mol_f:
                mol_lines = mol_f.readlines()
            with open(protein_pdb) as protein_file:
                protein_pdb_lines = protein_file.readlines()
            features += self._featurize_complex(ligand_ext, mol_lines,
                                                protein_pdb_lines)
        features = np.asarray(features)
        return features
Example #12
  def _add_user_specified_features(self, df, featurizer):
    """Merge user specified features. 

      Merge features included in dataset provided by user
      into final features dataframe

      Three types of featurization here:

        1) Molecule featurization
          -) Smiles string featurization
          -) Rdkit MOL featurization
        2) Complex featurization
          -) PDB files for interacting molecules.
        3) User specified featurizations.
    """
    ############################################################## TIMING
    time1 = time.time()
    ############################################################## TIMING
    df[featurizer.feature_fields] = df[featurizer.feature_fields].apply(pd.to_numeric)
    # DataFrame.as_matrix was removed in pandas 1.0; use to_numpy() instead.
    X_shard = df[featurizer.feature_fields].to_numpy()
    df[featurizer.__class__.__name__] = [np.array(elt) for elt in X_shard.tolist()]
    ############################################################## TIMING
    time2 = time.time()
    log("TIMING: user specified processing took %0.3f s" % (time2-time1),
        self.verbosity)
Example #13
    def generate_scaffolds(self, dataset, log_every_n=1000):
        """
        Returns all scaffolds from the dataset
        """
        scaffolds = {}
        data_len = len(dataset)

        log("About to generate scaffolds", self.verbose)
        for ind, smiles in enumerate(dataset.ids):
            if ind % log_every_n == 0:
                log(f"Generating scaffold {ind} {data_len}", self.verbose)
            scaffold = generate_scaffold(smiles)
            if scaffold not in scaffolds:
                scaffolds[scaffold] = [ind]
            else:
                scaffolds[scaffold].append(ind)

        # Sort from largest to smallest scaffold sets
        scaffolds = {key: sorted(value) for key, value in scaffolds.items()}
        scaffold_sets = [
            scaffold_set
            for (scaffold,
                 scaffold_set) in sorted(scaffolds.items(),
                                         key=lambda x: (len(x[1]), x[1][0]),
                                         reverse=True)
        ]
        return scaffold_sets
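The generate_scaffold helper called above is assumed to reduce a molecule to its Bemis-Murcko scaffold; a minimal sketch using RDKit:

from rdkit import Chem
from rdkit.Chem.Scaffolds import MurckoScaffold

def generate_scaffold(smiles, include_chirality=False):
    # Return the Bemis-Murcko scaffold of a molecule as a SMILES string.
    mol = Chem.MolFromSmiles(smiles)
    return MurckoScaffold.MurckoScaffoldSmiles(
        mol=mol, includeChirality=include_chirality)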
Example #14
    def split(self,
              dataset,
              frac_train=.8,
              frac_valid=.1,
              frac_test=.1,
              log_every_n=1000):
        """
        Splits internal compounds into train/validation/test by scaffold.
        """
        np.testing.assert_almost_equal(frac_train + frac_valid + frac_test, 1.)
        scaffold_sets = self.generate_scaffolds(dataset)

        train_cutoff = frac_train * len(dataset)
        valid_cutoff = (frac_train + frac_valid) * len(dataset)
        train_inds, valid_inds, test_inds = [], [], []

        log("About to sort in scaffold sets", self.verbose)
        for scaffold_set in scaffold_sets:
            if len(train_inds) + len(scaffold_set) > train_cutoff:
                if len(train_inds) + len(valid_inds) + len(
                        scaffold_set) > valid_cutoff:
                    test_inds += scaffold_set
                else:
                    valid_inds += scaffold_set
            else:
                train_inds += scaffold_set
        return train_inds, valid_inds, test_inds
Example #15
 def _featurize_shard(self, df_shard, write_fn, shard_num, input_type):
     """Featurizes a shard of an input dataframe."""
     field = self.mol_field if input_type == "sdf" else self.smiles_field
     field_type = "mol" if input_type == "sdf" else "smiles"
     log(
         "Currently featurizing feature_type: %s" %
         self.featurizer.__class__.__name__, self.verbosity)
     if isinstance(self.featurizer, UserDefinedFeaturizer):
         self._add_user_specified_features(df_shard, self.featurizer)
     elif isinstance(self.featurizer, Featurizer):
         self._featurize_mol(df_shard,
                             self.featurizer,
                             field=field,
                             field_type=field_type)
     elif isinstance(self.featurizer, ComplexFeaturizer):
         self._featurize_complexes(df_shard, self.featurizer)
     basename = "shard-%d" % shard_num
     ############################################################## TIMING
     time1 = time.time()
     ############################################################## TIMING
     metadata_row = write_fn((basename, df_shard))
     ############################################################## TIMING
     time2 = time.time()
     log("TIMING: writing metadata row took %0.3f s" % (time2 - time1),
         self.verbosity)
     ############################################################## TIMING
     return metadata_row
Example #16
  def featurize(self, input_files, data_dir=None, shard_size=8192):
    """Featurize provided files and write to specified location."""
    log("Loading raw samples now.", self.verbose)
    log("shard_size: %d" % shard_size, self.verbose)

    if not isinstance(input_files, list):
      input_files = [input_files]

    def shard_generator():
      for shard_num, shard in enumerate(
          self.get_shards(input_files, shard_size)):
        time1 = time.time()
        X, valid_inds = self.featurize_shard(shard)
        ids = shard[self.id_field].values
        ids = ids[valid_inds]
        if len(self.tasks) > 0:
          # Featurize task results iff they exist.
          y, w = convert_df_to_numpy(shard, self.tasks, self.id_field)
          # Filter out examples where featurization failed.
          y, w = (y[valid_inds], w[valid_inds])
          assert len(X) == len(ids) == len(y) == len(w)
        else:
          # For prospective data where results are unknown, it makes
          # no sense to have y values or weights.
          y, w = (None, None)
          assert len(X) == len(ids)

        time2 = time.time()
        log("TIMING: featurizing shard %d took %0.3f s" %
            (shard_num, time2 - time1), self.verbose)
        yield X, y, w, ids

    return DiskDataset.create_dataset(
        shard_generator(), data_dir, self.tasks, verbose=self.verbose)
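A hypothetical driver for featurize(), assuming an old-style dc.data.CSVLoader configured with tasks and a SMILES field; the file name is illustrative:

loader = dc.data.CSVLoader(
    tasks=["task0"], smiles_field="smiles",
    featurizer=dc.feat.CircularFingerprint())
dataset = loader.featurize("assays.csv", shard_size=8192)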
Example #17
  def __init__(self, shard_generator=[], data_dir=None, tasks=[],
               reload=False, verbose=True):
    """
    Turns featurized dataframes into numpy files, writes them & metadata to disk.
    """
    if data_dir is not None:
      if not os.path.exists(data_dir):
        os.makedirs(data_dir)
    else:
      data_dir = tempfile.mkdtemp()
    self.data_dir = data_dir
    self.verbose = verbose

    if reload:
      log("Loading pre-existing dataset.", self.verbose)
      if os.path.exists(self._get_metadata_filename()):
        (self.tasks, self.metadata_df) = load_from_disk(
            self._get_metadata_filename())
      else:
        raise ValueError("No metadata found.")
      return

    metadata_rows = []
    time1 = time.time()
    for shard_num, (X, y, w, ids) in enumerate(shard_generator):
      basename = "shard-%d" % shard_num 
      metadata_rows.append(
          DiskDataset.write_data_to_disk(
              self.data_dir, basename, tasks, X, y, w, ids))
    self.tasks = tasks
    self.metadata_df = DiskDataset.construct_metadata(metadata_rows)
    self.save_to_disk()
    time2 = time.time()
    print("TIMING: dataset construction took %0.3f s" % (time2-time1),
          self.verbose)
Example #18
  def _add_user_specified_features(self, df, featurizer):
    """Merge user specified features. 

      Merge features included in dataset provided by user
      into final features dataframe

      Three types of featurization here:

        1) Molecule featurization
          -) Smiles string featurization
          -) Rdkit MOL featurization
        2) Complex featurization
          -) PDB files for interacting molecules.
        3) User specified featurizations.
    """
    ############################################################## TIMING
    time1 = time.time()
    ############################################################## TIMING
    df[featurizer.feature_fields] = df[featurizer.feature_fields].apply(pd.to_numeric)
    # DataFrame.as_matrix was removed in pandas 1.0; use to_numpy() instead.
    X_shard = df[featurizer.feature_fields].to_numpy()
    df[featurizer.__class__.__name__] = [np.array(elt) for elt in X_shard.tolist()]
    ############################################################## TIMING
    time2 = time.time()
    log("TIMING: user specified processing took %0.3f s" % (time2-time1),
        self.verbosity)
Example #19
  def sparse_shuffle(self):
    """Shuffling that exploits data sparsity to shuffle large datasets.

    Only for 1-dimensional feature vectors (does not work for tensorial
    featurizations).
    """
    time1 = time.time()
    shard_size = self.get_shard_size()
    num_shards = self.get_number_shards()
    X_sparses, ys, ws, ids = [], [], [], []
    num_features = None
    for i in range(num_shards):
      (X_s, y_s, w_s, ids_s) = self.get_shard(i)
      if num_features is None:
        num_features = X_s.shape[1]
      X_sparse = sparsify_features(X_s)
      X_sparses, ys, ws, ids = (X_sparses + [X_sparse], ys + [y_s], ws + [w_s],
                                ids + [np.atleast_1d(np.squeeze(ids_s))])
    # Get full dataset in memory
    (X_sparse, y, w, ids) = (np.vstack(X_sparses), np.vstack(ys), np.vstack(ws),
                             np.concatenate(ids))
    # Shuffle in memory
    num_samples = len(X_sparse)
    permutation = np.random.permutation(num_samples)
    X_sparse, y, w, ids = (X_sparse[permutation], y[permutation],
                           w[permutation], ids[permutation])
    # Write shuffled shards out to disk
    for i in range(num_shards):
      start, stop = i * shard_size, (i + 1) * shard_size
      (X_sparse_s, y_s, w_s, ids_s) = (X_sparse[start:stop], y[start:stop],
                                       w[start:stop], ids[start:stop])
      X_s = densify_features(X_sparse_s, num_features)
      self.set_shard(i, X_s, y_s, w_s, ids_s)
    time2 = time.time()
    log("TIMING: sparse_shuffle took %0.3f s" % (time2 - time1), self.verbose)
Example #20
 def featurize(self,
               protein_file,
               pockets,
               pocket_atoms_map,
               pocket_coords,
               verbose=False):
   """
    Calculate atomic coordinates.
   """
   import mdtraj
   protein = mdtraj.load(protein_file)
   n_pockets = len(pockets)
   n_residues = len(BindingPocketFeaturizer.residues)
   res_map = dict(zip(BindingPocketFeaturizer.residues, range(n_residues)))
   all_features = np.zeros((n_pockets, n_residues))
   for pocket_num, (pocket, coords) in enumerate(zip(pockets, pocket_coords)):
     pocket_atoms = pocket_atoms_map[pocket]
     for ind, atom in enumerate(pocket_atoms):
       atom_name = str(protein.top.atom(atom))
       # atom_name is of format RESX-ATOMTYPE
       # where X is a 1 to 4 digit number
       residue = atom_name[:3]
       if residue not in res_map:
         log("Warning: Non-standard residue in PDB file", verbose)
         continue
       atomtype = atom_name.split("-")[1]
       all_features[pocket_num, res_map[residue]] += 1
   return all_features
Example #21
  def _featurize_complex(self, ligand_pdb_lines, protein_pdb_lines):
    tempdir = tempfile.mkdtemp()

    ############################################################## TIMING
    time1 = time.time()
    ############################################################## TIMING
    ligand_pdb_file = os.path.join(tempdir, "ligand.pdb")
    with open(ligand_pdb_file, "w") as mol_f:
      mol_f.writelines(ligand_pdb_lines)
    ############################################################## TIMING
    time2 = time.time()
    log("TIMING: Writing ligand took %0.3f s" % (time2-time1), self.verbose)
    ############################################################## TIMING

    ############################################################## TIMING
    time1 = time.time()
    ############################################################## TIMING
    protein_pdb_file = os.path.join(tempdir, "protein.pdb")
    with open(protein_pdb_file, "w") as protein_f:
      protein_f.writelines(protein_pdb_lines)
    ############################################################## TIMING
    time2 = time.time()
    log("TIMING: Writing protein took %0.3f s" % (time2-time1), self.verbose)
    ############################################################## TIMING

    features_dict = self._transform(protein_pdb_file, ligand_pdb_file)
    shutil.rmtree(tempdir)
    return features_dict.values()
Example #22
  def _featurize_compounds(self, df, featurizer, parallel=True,
                           worker_pool=None):    
    """Featurize individual compounds.

       Given a featurizer that operates on individual chemical compounds 
       or macromolecules, compute & add features for that compound to the 
       features dataframe
    """
    sample_smiles = df["smiles"].tolist()

    if worker_pool is None:
      features = []
      for ind, smiles in enumerate(sample_smiles):
        if ind % self.log_every_n == 0:
          log("Featurizing sample %d" % ind, self.verbosity)
        mol = Chem.MolFromSmiles(smiles)
        features.append(featurizer.featurize([mol], verbosity=self.verbosity))
    else:
      def featurize_wrapper(smiles, dilled_featurizer):
        print("Featurizing %s" % smiles)
        mol = Chem.MolFromSmiles(smiles)
        featurizer = dill.loads(dilled_featurizer)
        feature = featurizer.featurize([mol], verbosity=self.verbosity)
        return feature

      # Bind the serialized featurizer so the pool maps over SMILES alone
      # (assumes dill and functools.partial are imported at module level).
      dilled_featurizer = dill.dumps(featurizer)
      featurize_wrapper = partial(
          featurize_wrapper, dilled_featurizer=dilled_featurizer)
      features = worker_pool.map_sync(featurize_wrapper, sample_smiles)

    df[featurizer.__class__.__name__] = features
Example #23
    def compute_model_performance(self, metrics, csv_out=None, stats_out=None, threshold=None):
        """
    Computes statistics of model on test data and saves results to csv.
    """
        y = self.dataset.get_labels()
        y = undo_transforms(y, self.output_transformers)
        w = self.dataset.get_weights()

        if not len(metrics):
            return {}
        else:
            mode = metrics[0].mode
        if mode == "classification":
            y_pred = self.model.predict_proba(self.dataset, self.output_transformers)
            y_pred_print = self.model.predict(self.dataset, self.output_transformers).astype(int)
        else:
            y_pred = self.model.predict(self.dataset, self.output_transformers)
            y_pred_print = y_pred
        multitask_scores = {}

        if csv_out is not None:
            log("Saving predictions to %s" % csv_out, self.verbosity)
            self.output_predictions(y_pred_print, csv_out)

        # Compute multitask metrics
        for metric in metrics:
            multitask_scores[metric.name] = metric.compute_metric(y, y_pred, w)

        if stats_out is not None:
            log("Saving stats to %s" % stats_out, self.verbosity)
            self.output_statistics(multitask_scores, stats_out)

        return multitask_scores
Example #24
  def train_valid_test_split(self, dataset, train_dir=None,
                             valid_dir=None, test_dir=None, frac_train=.8,
                             frac_valid=.1, frac_test=.1, seed=None,
                             log_every_n=1000):
    """
    Splits self into train/validation/test sets.

    Returns Dataset objects.
    """
    log("Computing train/valid/test indices", self.verbose)
    train_inds, valid_inds, test_inds = self.split(
      dataset,
      frac_train=frac_train, frac_test=frac_test,
      frac_valid=frac_valid, log_every_n=log_every_n)
    if train_dir is None:
      train_dir = tempfile.mkdtemp()
    if valid_dir is None:
      valid_dir = tempfile.mkdtemp()
    if test_dir is None:
      test_dir = tempfile.mkdtemp()
    train_dataset = dataset.select( 
        train_inds, train_dir)
    if frac_valid != 0:
      valid_dataset = dataset.select(
          valid_inds, valid_dir)
    else:
      valid_dataset = None
    test_dataset = dataset.select(
        test_inds, test_dir)

    return train_dataset, valid_dataset, test_dataset
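A minimal sketch of driving the split above; with no directories given, each subset is written to a fresh temporary directory:

import deepchem as dc

splitter = dc.splits.ScaffoldSplitter()
train, valid, test = splitter.train_valid_test_split(
    dataset, frac_train=0.8, frac_valid=0.1, frac_test=0.1)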
Example #25
 def __init__(self, tasks, task_types, model_params, model_dir, model_builder,
              store_in_memory=False, verbosity=None):
   self.tasks = tasks
   self.task_types = task_types
   self.model_params = model_params
   self.models = {}
   self.model_dir = model_dir
   # If models are TF models, they don't use up RAM, so can keep in memory
   self.task_models = {}
   self.task_model_dirs = {}
   self.model_builder = model_builder
   self.verbosity = verbosity
   self.store_in_memory = store_in_memory
   log("About to initialize singletask to multitask model",
       self.verbosity, "high")
   if not os.path.exists(self.model_dir):
     os.makedirs(self.model_dir)
   self.fit_transformers = False
   for task in self.tasks:
     task_type = self.task_types[task]
     task_model_dir = os.path.join(self.model_dir, str(task))
     if not os.path.exists(task_model_dir):
       os.makedirs(task_model_dir)
     log("Initializing model for task %s" % task,
         self.verbosity, "high")
     self.task_model_dirs[task] = task_model_dir
Example #26
 def write_dataframe(val, data_dir, featurizer=None, tasks=None,
                     raw_data=None, basename=None, mol_id_field="mol_id",
                     verbosity=None, compute_feature_statistics=None):
   """Writes data from dataframe to disk."""
   if featurizer is not None and tasks is not None:
     feature_type = featurizer.__class__.__name__
     (basename, df) = val
     # TODO(rbharath): This is a hack. clean up.
     if not len(df):
       return None
     if compute_feature_statistics is None:
       if hasattr(featurizer, "dtype"):
         dtype = featurizer.dtype
         compute_feature_statistics = False
       else:
         dtype = float
         compute_feature_statistics = True
     ############################################################## TIMING
     time1 = time.time()
     ############################################################## TIMING
     ids, X, y, w = convert_df_to_numpy(df, feature_type, tasks, mol_id_field,
                                        dtype, verbosity)
     ############################################################## TIMING
     time2 = time.time()
     log("TIMING: convert_df_to_numpy took %0.3f s" % (time2-time1), verbosity)
     ############################################################## TIMING
   else:
     ids, X, y, w = raw_data
     basename = ""
     assert X.shape[0] == y.shape[0]
     assert y.shape == w.shape
     assert len(ids) == X.shape[0]
   return DiskDataset.write_data_to_disk(
       data_dir, basename, tasks, X, y, w, ids,
       compute_feature_statistics=compute_feature_statistics)
Example #27
    def fit(self,
            dataset,
            nb_epoch=10,
            batch_size=50,
            pad_batches=False,
            **kwargs):
        """
    Fits a model on data in a Dataset object.
    """
        # TODO(rbharath/enf): We need a structured way to deal with potential GPU
        #                     memory overflows.
        for epoch in range(nb_epoch):
            log("Starting epoch %s" % str(epoch + 1), self.verbosity)
            losses = []
            for (X_batch, y_batch, w_batch,
                 ids_batch) in dataset.iterbatches(batch_size,
                                                   pad_batches=pad_batches):
                if self.fit_transformers:
                    X_batch, y_batch, w_batch = self.transform_on_batch(
                        X_batch, y_batch, w_batch)
                if pad_batches:
                    X_batch, y_batch, w_batch, ids_batch = pad_batch(
                        batch_size, X_batch, y_batch, w_batch, ids_batch)

                losses.append(self.fit_on_batch(X_batch, y_batch, w_batch))
            log(
                "Avg loss for epoch %d: %f" %
                (epoch + 1, np.array(losses).mean()), self.verbosity)
Example #28
 def k_fold_split(self, dataset, k, directories=None):
   """Does K-fold split of dataset."""
   log("Computing K-fold split", self.verbose)
   if directories is None:
     directories = [tempfile.mkdtemp() for _ in range(k)]
   else:
     assert len(directories) == k
   fold_datasets = []
   # rem_dataset is remaining portion of dataset
   rem_dataset = dataset
   for fold in range(k):
     # Note starts as 1/k since fold starts at 0. Ends at 1 since fold goes up
     # to k-1.
     frac_fold = 1./(k-fold)
     fold_dir = directories[fold]
     fold_inds, rem_inds, _ = self.split(
         rem_dataset,
         frac_train=frac_fold, frac_valid=1-frac_fold, frac_test=0)
     fold_dataset = rem_dataset.select( 
         fold_inds, fold_dir)
     rem_dir = tempfile.mkdtemp()
     rem_dataset = rem_dataset.select( 
         rem_inds, rem_dir)
     fold_datasets.append(fold_dataset)
   return fold_datasets
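A hypothetical call against the method above, assuming a dc.splits splitter instance and an existing dataset:

import deepchem as dc

splitter = dc.splits.RandomSplitter()
folds = splitter.k_fold_split(dataset, k=5)  # list of k fold datasets
for i, fold in enumerate(folds):
  print("Fold %d holds %d samples" % (i, len(fold)))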
Example #29
def featurize_smiles_np(arr, featurizer, log_every_N=1000, verbose=True):
  """Featurize individual compounds in a numpy array.

  Given a featurizer that operates on individual chemical compounds
  or macromolecules, compute & add features for that compound to the
  features array
  """
  features = []
  from rdkit import Chem
  from rdkit.Chem import rdmolfiles
  from rdkit.Chem import rdmolops
  for ind, elem in enumerate(arr.tolist()):
    mol = Chem.MolFromSmiles(elem)
    if mol:
      new_order = rdmolfiles.CanonicalRankAtoms(mol)
      mol = rdmolops.RenumberAtoms(mol, new_order)
    if ind % log_every_N == 0:
      log("Featurizing sample %d" % ind, verbose)
    features.append(featurizer.featurize([mol]))

  valid_inds = np.array(
      [1 if elt.size > 0 else 0 for elt in features], dtype=bool)
  features = [elt for (is_valid, elt) in zip(valid_inds, features) if is_valid]
  features = np.squeeze(np.array(features))
  return features.reshape(-1,)
Example #30
  def train_valid_test_split(self, dataset, train_dir,
                             valid_dir, test_dir, frac_train=.8,
                             frac_valid=.1, frac_test=.1, seed=None,
                             log_every_n=1000,
                             compute_feature_statistics=True):
    """
    Splits self into train/validation/test sets.

    Returns Dataset objects.
    """
    log("Computing train/valid/test indices", self.verbosity)
    train_inds, valid_inds, test_inds = self.split(
      dataset,
      frac_train=frac_train, frac_test=frac_test,
      frac_valid=frac_valid, log_every_n=log_every_n)
    train_dataset = dataset.select( 
        train_dir, train_inds,
        compute_feature_statistics=compute_feature_statistics)
    if valid_dir is not None:
      valid_dataset = dataset.select(
          valid_dir, valid_inds,
          compute_feature_statistics=compute_feature_statistics)
    else:
      valid_dataset = None
    test_dataset = dataset.select(
        test_dir, test_inds,
        compute_feature_statistics=compute_feature_statistics)

    return train_dataset, valid_dataset, test_dataset
Example #31
def featurize_smiles_df(df, featurizer, field, log_every_N=1000, verbose=True):
  """Featurize individual compounds in dataframe.

  Given a featurizer that operates on individual chemical compounds 
  or macromolecules, compute & add features for that compound to the 
  features dataframe
  """
  sample_elems = df[field].tolist()

  features = []
  for ind, elem in enumerate(sample_elems):
    mol = Chem.MolFromSmiles(elem)
    # TODO (ytz) this is a bandage solution to reorder the atoms so
    # that they're always in the same canonical order. Presumably this
    # should be correctly implemented in the future for graph mols.
    if mol:
      new_order = rdmolfiles.CanonicalRankAtoms(mol)
      mol = rdmolops.RenumberAtoms(mol, new_order)
    if ind % log_every_N == 0:
      log("Featurizing sample %d" % ind, verbose)
    features.append(featurizer.featurize([mol]))
  valid_inds = np.array(
      [1 if elt.size > 0 else 0 for elt in features], dtype=bool)
  features = [elt for (is_valid, elt) in zip(valid_inds, features) if is_valid]
  return np.squeeze(np.array(features), axis=1), valid_inds
Example #32
    def compute_metric(self,
                       y_true,
                       y_pred,
                       w=None,
                       n_classes=2,
                       filter_nans=True):
        """Compute a performance metric for each task.

    Args:
      y_true: A list of arrays containing true values for each task.
      y_pred: A list of arrays containing predicted values for each task.

    Returns:
      A numpy array containing metric values for each task.
    """
        if len(y_true.shape) > 1:
            n_samples, n_tasks = y_true.shape[0], y_true.shape[1]
        else:
            n_samples, n_tasks = y_true.shape[0], 1
        if self.mode == "classification":
            y_pred = np.reshape(y_pred, (n_samples, n_tasks, n_classes))
        else:
            y_pred = np.reshape(y_pred, (n_samples, n_tasks))
        y_true = np.reshape(y_true, (n_samples, n_tasks))
        if w is None:
            w = np.ones_like(y_true)
        assert y_true.shape[0] == y_pred.shape[0] == w.shape[0]
        computed_metrics = []
        for task in range(n_tasks):
            y_task = y_true[:, task]
            if self.mode == "regression":
                y_pred_task = y_pred[:, task]
            else:
                y_pred_task = y_pred[:, task, :]
            w_task = w[:, task]

            metric_value = self.compute_singletask_metric(
                y_task, y_pred_task, w_task)
            computed_metrics.append(metric_value)
        log("computed_metrics: %s" % str(computed_metrics), self.verbosity)
        if n_tasks == 1:
            computed_metrics = computed_metrics[0]
        if not self.is_multitask:
            return computed_metrics
        else:
            if filter_nans:
                computed_metrics = np.array(computed_metrics)
                computed_metrics = computed_metrics[~np.isnan(computed_metrics
                                                              )]
            if self.compute_energy_metric:
                # TODO(rbharath, joegomes): What is this magic number?
                force_error = self.task_averager(
                    computed_metrics[1:]) * 4961.47596096
                print("Force error (metric: np.mean(%s)): %f kJ/mol/A" %
                      (self.name, force_error))
                return computed_metrics[0]
            else:
                return self.task_averager(computed_metrics)
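A hypothetical single-task regression call against compute_metric above, assuming the dc.metrics front end; shapes are illustrative:

import numpy as np
import deepchem as dc

metric = dc.metrics.Metric(dc.metrics.mean_squared_error, mode="regression")
y_true = np.random.rand(50, 1)
y_pred = np.random.rand(50, 1)
score = metric.compute_metric(y_true, y_pred)  # scalar for a single task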
Example #33
  def compute_model_performance(self,
                                metrics,
                                csv_out=None,
                                stats_out=None,
                                per_task_metrics=False):
    """
    Computes statistics of model on test data and saves results to csv.

    Parameters
    ----------
    metrics: list
      List of dc.metrics.Metric objects
    csv_out: str, optional
      Filename to write CSV of model predictions.
    stats_out: str, optional
      Filename to write computed statistics.
    per_task_metrics: bool, optional
      If true, return computed metric for each task on multitask dataset.
    """
    y = self.dataset.y
    y = undo_transforms(y, self.output_transformers)
    w = self.dataset.w

    if not len(metrics):
      return {}
    else:
      mode = metrics[0].mode
    if mode == "classification":
      y_pred = self.model.predict_proba(self.dataset, self.output_transformers)
      y_pred_print = self.model.predict(self.dataset,
                                        self.output_transformers).astype(int)
    else:
      y_pred = self.model.predict(self.dataset, self.output_transformers)
      y_pred_print = y_pred
    multitask_scores = {}
    all_task_scores = {}

    if csv_out is not None:
      log("Saving predictions to %s" % csv_out, self.verbose)
      self.output_predictions(y_pred_print, csv_out)

    # Compute multitask metrics
    for metric in metrics:
      if per_task_metrics:
        multitask_scores[metric.name], computed_metrics = metric.compute_metric(
            y, y_pred, w, per_task_metrics=True)
        all_task_scores[metric.name] = computed_metrics
      else:
        multitask_scores[metric.name] = metric.compute_metric(
            y, y_pred, w, per_task_metrics=False)

    if stats_out is not None:
      log("Saving stats to %s" % stats_out, self.verbose)
      self.output_statistics(multitask_scores, stats_out)

    if not per_task_metrics:
      return multitask_scores
    else:
      return multitask_scores, all_task_scores
Example #34
  def compute_model_performance(self,
                                metrics,
                                csv_out=None,
                                stats_out=None,
                                per_task_metrics=False):
    """
    Computes statistics of model on test data and saves results to csv.

    Parameters
    ----------
    metrics: list
      List of dc.metrics.Metric objects
    csv_out: str, optional
      Filename to write CSV of model predictions.
    stats_out: str, optional
      Filename to write computed statistics.
    per_task_metrics: bool, optional
      If true, return computed metric for each task on multitask dataset.
    """
    y = self.dataset.y
    y = undo_transforms(y, self.output_transformers)
    w = self.dataset.w

    if not len(metrics):
      return {}
    else:
      mode = metrics[0].mode
    if mode == "classification":
      y_pred = self.model.predict_proba(self.dataset, self.output_transformers)
      y_pred_print = self.model.predict(self.dataset,
                                        self.output_transformers).astype(int)
    else:
      y_pred = self.model.predict(self.dataset, self.output_transformers)
      y_pred_print = y_pred
    multitask_scores = {}
    all_task_scores = {}

    if csv_out is not None:
      log("Saving predictions to %s" % csv_out, self.verbose)
      self.output_predictions(y_pred_print, csv_out)

    # Compute multitask metrics
    for metric in metrics:
      if per_task_metrics:
        multitask_scores[metric.name], computed_metrics = metric.compute_metric(
            y, y_pred, w, per_task_metrics=True)
        all_task_scores[metric.name] = computed_metrics
      else:
        multitask_scores[metric.name] = metric.compute_metric(
            y, y_pred, w, per_task_metrics=False)

    if stats_out is not None:
      log("Saving stats to %s" % stats_out, self.verbose)
      self.output_statistics(multitask_scores, stats_out)

    if not per_task_metrics:
      return multitask_scores
    else:
      return multitask_scores, all_task_scores
Example #35
    def featurize(self,
                  mols,
                  parallel=False,
                  client_kwargs=None,
                  view_flags=None,
                  verbosity=None,
                  log_every_n=1000):
        """
    Calculate features for molecules.

    Parameters
    ----------
    mols : iterable
        RDKit Mol objects.
    parallel : bool, optional
        Whether to train subtrainers in parallel using
        IPython.parallel (default False).
    client_kwargs : dict, optional
        Keyword arguments for IPython.parallel Client.
    view_flags : dict, optional
        Flags for IPython.parallel LoadBalancedView.
    """
        if self.conformers and isinstance(mols, types.GeneratorType):
            mols = list(mols)
        assert verbosity in [None, "low", "high"]

        if parallel:
            from IPython.parallel import Client

            if client_kwargs is None:
                client_kwargs = {}
            if view_flags is None:
                view_flags = {}
            client = Client(**client_kwargs)
            client.direct_view().use_dill()  # use dill
            view = client.load_balanced_view()
            view.set_flags(**view_flags)
            call = view.map(self._featurize, mols, block=False)
            features = call.get()

            # get output from engines
            call.display_outputs()

        else:
            features = []
            for i, mol in enumerate(mols):
                if verbosity is not None and i % log_every_n == 0:
                    log("Featurizing %d / %d" % (i, len(mols)))
                if mol is not None:
                    features.append(self._featurize(mol))
                else:
                    features.append(np.array([]))

        if self.conformers:
            features = self.conformer_container(mols, features)
        else:
            features = np.asarray(features)
        return features
Example #36
    def __init__(
            self,
            data_dir=None,
            tasks=[],
            metadata_rows=None,  #featurizers=None, 
            raw_data=None,
            verbosity=None,
            reload=False,
            compute_feature_statistics=True):
        """
    Turns featurized dataframes into numpy files, writes them & metadata to disk.
    """
        if not os.path.exists(data_dir):
            os.makedirs(data_dir)
        self.data_dir = data_dir
        assert verbosity in [None, "low", "high"]
        self.verbosity = verbosity

        if not reload or not os.path.exists(self._get_metadata_filename()):
            if metadata_rows is not None:
                self.metadata_df = DiskDataset.construct_metadata(
                    metadata_rows)
                self.save_to_disk()
            elif raw_data is not None:
                metadata_rows = []
                ids, X, y, w = raw_data
                metadata_rows.append(
                    DiskDataset.write_data_to_disk(
                        self.data_dir,
                        "data",
                        tasks,
                        X,
                        y,
                        w,
                        ids,
                        compute_feature_statistics=compute_feature_statistics))
                self.metadata_df = DiskDataset.construct_metadata(
                    metadata_rows)
                self.save_to_disk()
            else:
                # Create an empty metadata dataframe to be filled at a later time
                basename = "metadata"
                metadata_rows = [
                    DiskDataset.write_data_to_disk(self.data_dir, basename,
                                                   tasks)
                ]
                self.metadata_df = DiskDataset.construct_metadata(
                    metadata_rows)
                self.save_to_disk()

        else:
            log("Loading pre-existing metadata file.", self.verbosity)
            if os.path.exists(self._get_metadata_filename()):
                self.metadata_df = load_from_disk(
                    self._get_metadata_filename())
            else:
                raise ValueError("No metadata found.")
Example #37
  def __init__(self, data_dir, verbose=True):
    """
    Turns featurized dataframes into numpy files, writes them & metadata to disk.
    """
    self.data_dir = data_dir
    self.verbose = verbose

    log("Loading dataset from disk.", self.verbose)
    self.tasks, self.metadata_df = self.load_metadata()
Example #38
  def __init__(self, data_dir, verbose=True):
    """
    Turns featurized dataframes into numpy files, writes them & metadata to disk.
    """
    self.data_dir = data_dir
    self.verbose = verbose

    log("Loading dataset from disk.", self.verbose)
    self.tasks, self.metadata_df = self.load_metadata()
Example #39
    def compute_model_performance(self, csv_out, stats_file):
        """
    Computes statistics of model on test data and saves results to csv.
    """
        pred_y_df = self.model.predict(self.dataset)
        log("Saving predictions to %s" % csv_out, self.verbose)
        pred_y_df.to_csv(csv_out)

        if self.task_type == "classification":
            colnames = [
                "task_name", "roc_auc_score", "matthews_corrcoef",
                "recall_score", "accuracy_score"
            ]
        elif self.task_type == "regression":
            colnames = ["task_name", "r2_score", "rms_error"]
        else:
            raise ValueError("Unrecognized task type: %s" % self.task_type)

        performance_df = pd.DataFrame(columns=colnames)
        # next() replaces the Python 2-only .next() iterator method.
        y_means = next(pred_y_df.iterrows())[1]["y_means"]
        y_stds = next(pred_y_df.iterrows())[1]["y_stds"]

        for i, task_name in enumerate(self.task_names):
            y = pred_y_df[task_name].values
            y_pred = pred_y_df["%s_pred" % task_name].values
            w = pred_y_df["%s_weight" % task_name].values
            y = undo_transform(y, y_means, y_stds, self.output_transforms)
            y_pred = undo_transform(y_pred, y_means, y_stds,
                                    self.output_transforms)

            if self.task_type == "classification":
                y, y_pred = y[w.nonzero()].astype(int), y_pred[
                    w.nonzero()].astype(int)
                # Sometimes all samples have zero weight. In this case, continue.
                if not len(y):
                    continue
                auc = compute_roc_auc_scores(y, y_pred)
                mcc = matthews_corrcoef(y, y_pred)
                recall = recall_score(y, y_pred)
                accuracy = accuracy_score(y, y_pred)
                performance_df.loc[i] = [task_name, auc, mcc, recall, accuracy]

            elif self.task_type == "regression":
                try:
                    r2s = r2_score(y, y_pred)
                    rms = np.sqrt(mean_squared_error(y, y_pred))
                except ValueError:
                    r2s = np.nan
                    rms = np.nan
                performance_df.loc[i] = [task_name, r2s, rms]

        log("Saving model performance scores to %s" % stats_file, self.verbose)
        performance_df.to_csv(stats_file)

        return pred_y_df, performance_df
Example #40
 def __init__(self, tasks, model_builder, model_dir=None, verbose=True):
   super().__init__(self, model_dir=model_dir, verbose=verbose)
   self.tasks = tasks
   self.task_model_dirs = {}
   self.model_builder = model_builder
   log("About to initialize singletask to multitask model", self.verbose)
   for task in self.tasks:
     task_model_dir = os.path.join(self.model_dir, str(task))
     if not os.path.exists(task_model_dir):
       os.makedirs(task_model_dir)
     log("Initializing directory for task %s" % task, self.verbose)
     self.task_model_dirs[task] = task_model_dir
Example #41
def featurize_smiles_df(df, featurizer, field, log_every_N=1000, verbose=True):
    """Featurize individual compounds in dataframe.

    Given a featurizer that operates on individual chemical compounds
    or macromolecules, compute & add features for that compound to the
    features dataframe
    """
    sample_elems = df[field].tolist()

    features = []
    from rdkit import Chem
    from rdkit.Chem import rdmolfiles
    from rdkit.Chem import rdmolops

    if 'Comet' in str(featurizer.__class__.__qualname__):
        mols = preprocess_df(sample_elems, NUM_WORKERS)
        mols_chunks = np.array_split(mols, len(mols) // BATCH_SIZE + 1)
        for chunk in mols_chunks:
            X, A, L = list(zip(*chunk))
            X = np.array(X, dtype=np.uint8)
            A = np.array(A, dtype=np.float32)
            L = np.array(L, dtype=np.uint8)
            max_len = L[-1]
            X = X[:, :max_len, :]
            A = A[:, :max_len, :max_len]
            temp = featurizer._featurize((X, A))
            features += list(temp)

        valid_inds = np.array([1 if elt.size > 0 else 0 for elt in features],
                              dtype=bool)
        features = [
            elt for (is_valid, elt) in zip(valid_inds, features) if is_valid
        ]
        return np.array(features), valid_inds

    else:
        for ind, elem in enumerate(sample_elems):
            mol = Chem.MolFromSmiles(elem)
            # TODO (ytz) this is a bandage solution to reorder the atoms so
            # that they're always in the same canonical order. Presumably this
            # should be correctly implemented in the future for graph mols.
            if mol:
                new_order = rdmolfiles.CanonicalRankAtoms(mol)
                mol = rdmolops.RenumberAtoms(mol, new_order)
            if ind % log_every_N == 0:
                log("Featurizing sample %d" % ind, verbose)
            features.append(featurizer.featurize([mol]))
        valid_inds = np.array([1 if elt.size > 0 else 0 for elt in features],
                              dtype=bool)
        features = [
            elt for (is_valid, elt) in zip(valid_inds, features) if is_valid
        ]
        return np.squeeze(np.array(features), axis=1), valid_inds
Example #42
  def featurize(self, mols, parallel=False, client_kwargs=None,
                view_flags=None, verbosity=None, log_every_n=1000):
    """
    Calculate features for molecules.

    Parameters
    ----------
    mols : iterable
        RDKit Mol objects.
    parallel : bool, optional
        Whether to train subtrainers in parallel using
        IPython.parallel (default False).
    client_kwargs : dict, optional
        Keyword arguments for IPython.parallel Client.
    view_flags : dict, optional
        Flags for IPython.parallel LoadBalancedView.
    """
    if self.conformers and isinstance(mols, types.GeneratorType):
      mols = list(mols)
    assert verbosity in [None, "low", "high"]

    if parallel:
      from IPython.parallel import Client

      if client_kwargs is None:
          client_kwargs = {}
      if view_flags is None:
          view_flags = {}
      client = Client(**client_kwargs)
      client.direct_view().use_dill()  # use dill
      view = client.load_balanced_view()
      view.set_flags(**view_flags)
      call = view.map(self._featurize, mols, block=False)
      features = call.get()

      # get output from engines
      call.display_outputs()

    else:
      features = []
      for i, mol in enumerate(mols):
        if verbosity is not None and i % log_every_n == 0:
          log("Featurizing %d / %d" % (i, len(mols)))
        if mol is not None:
          features.append(self._featurize(mol))
        else:
          features.append(np.array([]))

    if self.conformers:
      features = self.conformer_container(mols, features)
    else:
      features = np.asarray(features)
    return features
Example #43
 def shard_generator():
   for shard_num, shard in enumerate(self.get_shards(input_files, shard_size)):
     time1 = time.time()
     X, valid_inds = self.featurize_shard(shard)
     ids, y, w = convert_df_to_numpy(shard, self.tasks, self.id_field)  
     # Filter out examples where featurization failed.
     ids, y, w = (ids[valid_inds], y[valid_inds], w[valid_inds])
     assert len(X) == len(ids) == len(y) == len(w)
     time2 = time.time()
     log("TIMING: featurizing shard %d took %0.3f s" % (shard_num, time2-time1),
         self.verbose)
     yield X, y, w, ids
Example #44
 def __init__(self, tasks, model_builder, model_dir=None, verbose=True):
   super(SingletaskToMultitask, self).__init__(
       self, model_dir=model_dir, verbose=verbose)
   self.tasks = tasks
   self.task_model_dirs = {}
   self.model_builder = model_builder
   log("About to initialize singletask to multitask model", self.verbose)
   for task in self.tasks:
     task_model_dir = os.path.join(self.model_dir, str(task))
     if not os.path.exists(task_model_dir):
       os.makedirs(task_model_dir)
     log("Initializing directory for task %s" % task, self.verbose)
     self.task_model_dirs[task] = task_model_dir
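The model_builder argument is a factory mapping a directory to a fresh single-task model. A sketch, assuming an SklearnModel-style wrapper; the wrapper name and task list are illustrative, not taken from the snippet:

from sklearn.ensemble import RandomForestRegressor

tasks = ["task0", "task1"]  # assumed task names

def model_builder(model_dir):
    # SklearnModel is assumed to wrap an sklearn estimator plus a save directory.
    return SklearnModel(RandomForestRegressor(n_estimators=100), model_dir)

model = SingletaskToMultitask(tasks, model_builder, model_dir="/tmp/stm")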
Example #45
  def __init__(self, data_dir, verbose=True):
    """
    Turns featurized dataframes into numpy files, writes them & metadata to disk.
    """
    self.data_dir = data_dir
    self.verbose = verbose

    log("Loading dataset from disk.", self.verbose)
    if os.path.exists(self._get_metadata_filename()):
      (self.tasks,
       self.metadata_df) = load_from_disk(self._get_metadata_filename())
    else:
      raise ValueError("No metadata found on disk.")
Example #47
    def _featurize_mol(self,
                       df,
                       featurizer,
                       parallel=True,
                       field_type="mol",
                       field=None,
                       worker_pool=None):
        """Featurize individual compounds.

       Given a featurizer that operates on individual chemical compounds 
       or macromolecules, compute & add features for that compound to the 
       features dataframe

       When featurizing a .sdf file, the 3-D structure should be preserved
       so we use the rdkit "mol" object created from .sdf instead of smiles
       string. Some featurizers such as CoulombMatrix also require a 3-D
       structure.  Featurizing from .sdf is currently the only way to
       perform CM feautization.

      TODO(rbharath): Needs to be merged with _featurize_compounds
    """
        assert field_type in ["mol", "smiles"]
        assert field is not None
        sample_elems = df[field].tolist()

        if worker_pool is None:
            features = []
            for ind, elem in enumerate(sample_elems):
                if field_type == "smiles":
                    mol = Chem.MolFromSmiles(elem)
                else:
                    mol = elem
                if ind % self.log_every_n == 0:
                    log("Featurizing sample %d" % ind, self.verbosity)
                features.append(
                    featurizer.featurize([mol], verbosity=self.verbosity))
        else:
            # Serialize the featurizer once so workers can reconstruct it.
            dilled_featurizer = dill.dumps(featurizer)

            def featurize_wrapper(elem):
                print("Featurizing %s" % elem)
                if field_type == "smiles":
                    mol = Chem.MolFromSmiles(elem)
                else:
                    mol = elem
                featurizer = dill.loads(dilled_featurizer)
                feature = featurizer.featurize([mol], verbosity=self.verbosity)
                return feature

            features = worker_pool.map_sync(featurize_wrapper, sample_elems)

        df[featurizer.__class__.__name__] = features
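The dill round-trip above exists because plain pickle often cannot serialize featurizers or closures for shipment to worker processes; dill can. A minimal sketch of the pattern:

import dill

def make_scaler(factor):
    return lambda x: factor * x  # a closure; plain pickle rejects this

payload = dill.dumps(make_scaler(3))  # serialize on the driver
scaler = dill.loads(payload)          # rebuild inside a worker
print(scaler(7))  # 21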
Example #48
  def compute_model_performance(self, csv_out, stats_file):
    """
    Computes statistics of model on test data and saves results to csv.
    """
    pred_y_df = self.model.predict(self.dataset)
    log("Saving predictions to %s" % csv_out, self.verbose)
    pred_y_df.to_csv(csv_out)

    if self.task_type == "classification":
      colnames = ["task_name", "roc_auc_score", "matthews_corrcoef",
                  "recall_score", "accuracy_score"]
    elif self.task_type == "regression":
      colnames = ["task_name", "r2_score", "rms_error"]
    else:
      raise ValueError("Unrecognized task type: %s" % self.task_type)

    performance_df = pd.DataFrame(columns=colnames)
    y_means = next(pred_y_df.iterrows())[1]["y_means"]
    y_stds = next(pred_y_df.iterrows())[1]["y_stds"]

    for i, task_name in enumerate(self.task_names):
      y = pred_y_df[task_name].values
      y_pred = pred_y_df["%s_pred" % task_name].values
      w = pred_y_df["%s_weight" % task_name].values
      y = undo_transform(y, y_means, y_stds, self.output_transforms)
      y_pred = undo_transform(y_pred, y_means, y_stds, self.output_transforms)

      if self.task_type == "classification":
        y, y_pred = y[w.nonzero()].astype(int), y_pred[w.nonzero()].astype(int)
        # Sometimes all samples have zero weight. In this case, continue.
        if not len(y):
          continue
        auc = compute_roc_auc_scores(y, y_pred)
        mcc = matthews_corrcoef(y, y_pred)
        recall = recall_score(y, y_pred)
        accuracy = accuracy_score(y, y_pred)
        performance_df.loc[i] = [task_name, auc, mcc, recall, accuracy]

      elif self.task_type == "regression":
        try:
          r2s = r2_score(y, y_pred)
          rms = np.sqrt(mean_squared_error(y, y_pred))
        except ValueError:
          r2s = np.nan
          rms = np.nan
        performance_df.loc[i] = [task_name, r2s, rms]

    log("Saving model performance scores to %s" % stats_file, self.verbose)
    performance_df.to_csv(stats_file)

    return pred_y_df, performance_df
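The w.nonzero() indexing above drops zero-weight samples before scoring; in isolation:

import numpy as np

y = np.array([0., 1., 1., 0.])
y_pred = np.array([0., 1., 0., 0.])
w = np.array([1., 1., 0., 1.])  # the third sample carries no weight
y, y_pred = y[w.nonzero()].astype(int), y_pred[w.nonzero()].astype(int)
print(y, y_pred)  # [0 1 0] [0 1 0]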
Example #49
    def featurize(self, input_files, data_dir=None, shard_size=8192):
        """Featurize provided files and write to specified location.
    
    For large datasets, automatically shards into smaller chunks
    for convenience.

    Parameters
    ----------
    input_files: list
      List of input filenames.
    data_dir: str
      (Optional) Directory to store featurized dataset.
    shard_size: int
      (Optional) Number of examples stored in each shard.
    """
        log("Loading raw samples now.", self.verbose)
        log("shard_size: %d" % shard_size, self.verbose)

        if not isinstance(input_files, list):
            input_files = [input_files]

        def shard_generator():
            for shard_num, shard in enumerate(
                    self.get_shards(input_files, shard_size)):
                time1 = time.time()
                X, valid_inds = self.featurize_shard(shard)
                ids = shard[self.id_field].values
                ids = ids[valid_inds]
                if len(self.tasks) > 0:
                    # Featurize task results iff they exist.
                    y, w = convert_df_to_numpy(shard, self.tasks,
                                               self.id_field)
                    # Filter out examples where featurization failed.
                    y, w = (y[valid_inds], w[valid_inds])
                    assert len(X) == len(ids) == len(y) == len(w)
                else:
                    # For prospective data where results are unknown, it makes
                    # no sense to have y values or weights.
                    y, w = (None, None)
                    assert len(X) == len(ids)

                time2 = time.time()
                log(
                    "TIMING: featurizing shard %d took %0.3f s" %
                    (shard_num, time2 - time1), self.verbose)
                yield X, y, w, ids

        return DiskDataset.create_dataset(shard_generator(),
                                          data_dir,
                                          self.tasks,
                                          verbose=self.verbose)
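A hedged end-to-end sketch of calling this method; the loader class, featurizer, and CSV field names below are illustrative assumptions, not taken from the snippet:

featurizer = CircularFingerprint(size=1024)   # assumed featurizer class
loader = DataLoader(tasks=["solubility"],     # assumed loader subclass
                    id_field="smiles",
                    featurizer=featurizer)
dataset = loader.featurize(["compounds.csv"],
                           data_dir="/tmp/featurized",
                           shard_size=8192)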
Example #50
  def compute_metric(self, y_true, y_pred, w=None, n_classes=2, filter_nans=True):
    """Compute a performance metric for each task.

    Args:
      y_true: A list of arrays containing true values for each task.
      y_pred: A list of arrays containing predicted values for each task.
      w: Optional array of sample weights, one column per task.
      n_classes: Number of classes (used in classification mode).
      filter_nans: If True, drop NaN per-task metrics before averaging.

    Returns:
      A numpy array containing metric values for each task.
    """
    if len(y_true.shape) > 1:
      n_samples, n_tasks = y_true.shape[0], y_true.shape[1] 
    else:
      n_samples, n_tasks = y_true.shape[0], 1
    if self.mode == "classification":
      y_pred = np.reshape(y_pred, (n_samples, n_tasks, n_classes))
    else:
      y_pred = np.reshape(y_pred, (n_samples, n_tasks))
    y_true = np.reshape(y_true, (n_samples, n_tasks))
    if w is None:
      w = np.ones_like(y_true)
    assert y_true.shape[0] == y_pred.shape[0] == w.shape[0]
    computed_metrics = []
    for task in range(n_tasks):
      y_task = y_true[:, task]
      if self.mode == "regression":
        y_pred_task = y_pred[:, task]
      else:
        y_pred_task = y_pred[:, task, :]
      w_task = w[:, task]
    
      metric_value = self.compute_singletask_metric(
          y_task, y_pred_task, w_task)
      computed_metrics.append(metric_value)
    log("computed_metrics: %s" % str(computed_metrics), self.verbosity)
    if n_tasks == 1:
      computed_metrics = computed_metrics[0]
    if not self.is_multitask:
      return computed_metrics
    else:
      if filter_nans:
        computed_metrics = np.array(computed_metrics)
        computed_metrics = computed_metrics[~np.isnan(computed_metrics)]
      if self.compute_energy_metric:
        # TODO(rbharath, joegomes): What is this magic number?
        force_error = self.task_averager(computed_metrics[1:])*4961.47596096
        print("Force error (metric: np.mean(%s)): %f kJ/mol/A" % (self.name, force_error))
        return computed_metrics[0]
      else:
        return self.task_averager(computed_metrics)
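The classification reshape above turns a flat prediction vector into (n_samples, n_tasks, n_classes) so each task's class scores can be sliced out:

import numpy as np

n_samples, n_tasks, n_classes = 4, 2, 2
y_pred = np.reshape(np.arange(16, dtype=float), (n_samples, n_tasks, n_classes))
y_pred_task0 = y_pred[:, 0, :]  # class scores for task 0
print(y_pred_task0.shape)  # (4, 2)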
Example #51
 def _create_task_datasets(self, dataset):
   """Make directories to hold data for tasks"""
   task_data_dirs = []
   for task in self.tasks:
     task_data_dir = os.path.join(self.model_dir, str(task) + "_data")
     if os.path.exists(task_data_dir):
       shutil.rmtree(task_data_dir)
     os.makedirs(task_data_dir)
     task_data_dirs.append(task_data_dir)
   task_datasets = self._to_singletask(dataset, task_data_dirs)
   for task, task_dataset in zip(self.tasks, task_datasets):
     log("Dataset for task %s has shape %s"
         % (task, str(task_dataset.get_shape())), self.verbose)
   return task_datasets
Example #53
def featurize_map_function(args):
    ############################################################## TIMING
    time1 = time.time()
    ############################################################## TIMING
    ((loader, shard_size, input_type, data_dir), (shard_num,
                                                  raw_df_shard)) = args
    log(
        "Loading shard %d of size %s from file." %
        (shard_num + 1, str(shard_size)), loader.verbosity)
    log("About to featurize shard.", loader.verbosity)
    write_fn = partial(Dataset.write_dataframe,
                       data_dir=data_dir,
                       featurizer=loader.featurizer,
                       tasks=loader.tasks,
                       mol_id_field=loader.id_field,
                       verbosity=loader.verbosity)
    ############################################################## TIMING
    shard_time1 = time.time()
    ############################################################## TIMING
    metadata_row = loader._featurize_shard(raw_df_shard, write_fn, shard_num,
                                           input_type)
    ############################################################## TIMING
    shard_time2 = time.time()
    log(
        "TIMING: shard featurization took %0.3f s" %
        (shard_time2 - shard_time1), loader.verbosity)
    ############################################################## TIMING
    ############################################################## TIMING
    time2 = time.time()
    log("TIMING: featurization map function took %0.3f s" % (time2 - time1),
        loader.verbosity)
    ############################################################## TIMING
    return metadata_row
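functools.partial, as used for write_fn above, freezes keyword arguments so the shard worker supplies only the remaining positional one; a standalone sketch:

from functools import partial

def write_dataframe(df_shard, data_dir, verbosity):
    print("writing %d rows to %s (verbosity=%s)" % (len(df_shard), data_dir, verbosity))

write_fn = partial(write_dataframe, data_dir="/tmp/feats", verbosity="low")
write_fn([1, 2, 3])  # only the shard itself is passed at call time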
Example #55
 def fit(self, dataset, nb_epoch=10, batch_size=50, **kwargs):
   """
   Fits a model on data in a Dataset object.
   """
   # TODO(rbharath/enf): We need a structured way to deal with potential GPU
   #                     memory overflows.
   for epoch in range(nb_epoch):
     log("Starting epoch %s" % str(epoch + 1), self.verbose)
     losses = []
     for (X_batch, y_batch, w_batch,
          ids_batch) in dataset.iterbatches(batch_size):
       losses.append(self.fit_on_batch(X_batch, y_batch, w_batch))
     log("Avg loss for epoch %d: %f" % (epoch + 1, np.array(losses).mean()),
         self.verbose)
Example #56
  def fit(self, dataset, **kwargs):
    """
    Updates all singletask models with new information.

    Warning: This current implementation is only functional for sklearn models.
    """
    if not isinstance(dataset, DiskDataset):
      raise ValueError('SingletaskToMultitask only works with DiskDatasets')
    log("About to create task-specific datasets", self.verbose)
    task_datasets = self._create_task_datasets(dataset)
    for ind, task in enumerate(self.tasks):
      log("Fitting model for task %s" % task, self.verbose)
      task_model = self.model_builder(self.task_model_dirs[task])
      task_model.fit(task_datasets[ind], **kwargs)
      task_model.save()
Example #59
  def fit(self, dataset, nb_epoch=10, pad_batches=False, shuffle=False,
          max_checkpoints_to_keep=5, log_every_N_batches=50, **kwargs):
    """Fit the model.

    Args:
      dataset: Dataset object that represents data on disk.
      max_checkpoints_to_keep: Integer. Maximum number of checkpoints to keep;
        older checkpoints will be deleted.

    Raises:
      AssertionError: If model is not in training mode.
    """
    ############################################################## TIMING
    time1 = time.time()
    ############################################################## TIMING
    n_datapoints = len(dataset)
    batch_size = self.batch_size
    step_per_epoch = np.ceil(float(n_datapoints)/batch_size)
    log("Training for %d epochs" % nb_epoch, self.verbosity)
    with self.train_graph.graph.as_default():
      train_op = self.get_training_op(
          self.train_graph.graph, self.train_graph.loss)
      with self._get_shared_session(train=True) as sess:
        sess.run(tf.initialize_all_variables())
        saver = tf.train.Saver(max_to_keep=max_checkpoints_to_keep)
        # Save an initial checkpoint.
        saver.save(sess, self._save_path, global_step=0)
        for epoch in range(nb_epoch):
          avg_loss, n_batches = 0., 0
          if shuffle:
            log("About to shuffle dataset before epoch start.", self.verbosity)
            dataset.shuffle()
          # pad_batches=True is hardcoded here to work around TensorFlow
          # limitations with variable batch sizes.
          for ind, (X_b, y_b, w_b, ids_b) in enumerate(
              dataset.iterbatches(batch_size, pad_batches=True)):
            if ind % log_every_N_batches == 0:
              log("On batch %d" % ind, self.verbosity)
            # Run training op.
            feed_dict = self.construct_feed_dict(X_b, y_b, w_b, ids_b)
            fetches = self.train_graph.output + [
                train_op, self.train_graph.loss]
            fetched_values = sess.run(
                fetches,
                feed_dict=feed_dict)
            output = fetched_values[:len(self.train_graph.output)]
            loss = fetched_values[-1]
            avg_loss += loss
            y_pred = np.squeeze(np.array(output))
            y_b = y_b.flatten()
            n_batches += 1
          saver.save(sess, self._save_path, global_step=epoch)
          avg_loss = float(avg_loss)/n_batches
          log('Ending epoch %d: Average loss %g' % (epoch, avg_loss), self.verbosity)
        # Always save a final checkpoint when complete.
        saver.save(sess, self._save_path, global_step=epoch+1)
    ############################################################## TIMING
    time2 = time.time()
    print("TIMING: model fitting took %0.3f s" % (time2-time1),
          self.verbosity)
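The Saver-based checkpointing above follows the TF 1.x-era pattern; tf.initialize_all_variables is the deprecated spelling of global_variables_initializer. A minimal self-contained round trip under that same old API:

import tensorflow as tf  # TF 1.x-era API, matching the snippet above

w = tf.Variable(0.0, name="w")
saver = tf.train.Saver(max_to_keep=5)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    saver.save(sess, "/tmp/model.ckpt", global_step=0)  # writes model.ckpt-0
    saver.restore(sess, "/tmp/model.ckpt-0")            # reloads the variables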