Example #1
    def setUp(self):
        """Downloads the featurized core_grid dataset and loads it for the tests."""
        # With no dest_dir argument, download_url saves into get_data_dir().
        download_url(
            "http://deepchem.io.s3-website-us-west-1.amazonaws.com/featurized_datasets/core_grid.json"
        )
        json_fname = os.path.join(get_data_dir(), 'core_grid.json')
        self.core_dataset = dc.data.NumpyDataset.from_json(json_fname)
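The snippet assumes helper imports from the enclosing DeepChem test module. A minimal sketch of those imports (the module path is an assumption and shifts between DeepChem releases, which expose these helpers from deepchem.utils or deepchem.utils.data_utils):

import os
import deepchem as dc
# Assumed import path; older DeepChem releases use `from deepchem.utils import ...`.
from deepchem.utils.data_utils import download_url, get_data_dir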
Example #2
    def __init__(self,
                 radius: float = 8.0,
                 max_neighbors: int = 8,
                 step: float = 0.2):
        """
        Parameters
        ----------
        radius: float (default 8.0)
          Radius of the sphere used to find neighbors of atoms in the unit cell.
        max_neighbors: int (default 8)
          Maximum number of neighbors to consider when constructing the graph.
        step: float (default 0.2)
          Step size for the Gaussian filter; used when building edge features.
        """

        self.radius = radius
        self.max_neighbors = int(max_neighbors)
        self.step = step

        # Download atom_init.json (per-element initialization vectors) into the
        # data directory; ATOM_INIT_JSON_URL is a module-level constant.
        data_dir = get_data_dir()
        download_url(ATOM_INIT_JSON_URL, data_dir)
        atom_init_json_path = os.path.join(data_dir, 'atom_init.json')
        with open(atom_init_json_path, 'r') as f:
            atom_init_json = json.load(f)

        # Map atomic number -> feature vector and record which atomic numbers
        # have precomputed features.
        self.atom_features = {
            int(key): np.array(value, dtype=np.float32)
            for key, value in atom_init_json.items()
        }
        self.valid_atom_number = set(self.atom_features.keys())
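This constructor matches DeepChem's CGCNN featurizer, which consumes pymatgen crystal structures. A hedged usage sketch, assuming dc.feat.CGCNNFeaturizer and an installed pymatgen (the structure below is an illustrative cubic toy crystal):

import deepchem as dc
from pymatgen.core import Lattice, Structure

# A toy two-atom cubic structure; lattice constant and species are arbitrary.
structure = Structure(Lattice.cubic(4.2), ["Na", "Cl"],
                      [[0, 0, 0], [0.5, 0.5, 0.5]])

featurizer = dc.feat.CGCNNFeaturizer(radius=8.0, max_neighbors=8, step=0.2)
graphs = featurizer.featurize([structure])  # one graph per input structure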
Example #3
def test_lcnn_reload():

  # Download and unpack the featurized dataset into a temporary directory;
  # URL is a module-level constant pointing at the archived features.
  current_dir = tempfile.mkdtemp()
  download_url(url=URL, dest_dir=current_dir)
  untargz_file(path.join(current_dir, 'lcnn_data_feature.tar.gz'), current_dir)
  tasks, datasets, transformers = load_dataset_from_disk(
      path.join(current_dir, 'lcnn_data'))
  train, valid, test = datasets
  model_dir = tempfile.mkdtemp()
  model = LCNNModel(
      mode='regression', batch_size=8, learning_rate=0.001, model_dir=model_dir)
  model.fit(train, nb_epoch=10)

  # check predict shape
  valid_preds = model.predict_on_batch(valid.X)
  assert valid_preds.shape == (65, 1)
  test_preds = model.predict(test)
  assert test_preds.shape == (65, 1)
  # check overfit
  regression_metric = Metric(mae_score)
  scores = model.evaluate(test, [regression_metric], transformers)
  assert scores[regression_metric.name] < 0.6

  # reload
  reloaded_model = LCNNModel(
      mode='regression', batch_size=8, learning_rate=0.001, model_dir=model_dir)
  reloaded_model.restore()

  original_pred = model.predict(test)
  reload_pred = reloaded_model.predict(test)

  assert np.all(np.abs(original_pred - reload_pred) < 1e-7)
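The reload pattern is the important part here: both LCNNModel instances are constructed with the same model_dir, and restore() loads the latest checkpoint that fit() wrote there, so the two prediction arrays should agree to within the 1e-7 tolerance.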
Example #4
    def __init__(self,
                 sixty_four_bits: bool = True,
                 pocket_finder: Optional[BindingPocketFinder] = None):
        """Initializes Vina Pose Generator

    Parameters
    ----------
    sixty_four_bits: bool, optional (default True)
      Specifies whether this is a 64-bit machine. Needed to download
      the correct executable.
    pocket_finder: BindingPocketFinder, optional (default None)
      If specified should be an instance of
      `dc.dock.BindingPocketFinder`.
    """
        data_dir = get_data_dir()
        # Select the Vina build (URL, archive name, install dir) for this platform.
        if platform.system() == 'Linux':
            url = "http://vina.scripps.edu/download/autodock_vina_1_1_2_linux_x86.tgz"
            filename = "autodock_vina_1_1_2_linux_x86.tgz"
            dirname = "autodock_vina_1_1_2_linux_x86"
            self.vina_dir = os.path.join(data_dir, dirname)
            self.vina_cmd = os.path.join(self.vina_dir, "bin/vina")
        elif platform.system() == 'Darwin':
            if sixty_four_bits:
                url = "http://vina.scripps.edu/download/autodock_vina_1_1_2_mac_64bit.tar.gz"
                filename = "autodock_vina_1_1_2_mac_64bit.tar.gz"
                dirname = "autodock_vina_1_1_2_mac_catalina_64bit"
            else:
                url = "http://vina.scripps.edu/download/autodock_vina_1_1_2_mac.tgz"
                filename = "autodock_vina_1_1_2_mac.tgz"
                dirname = "autodock_vina_1_1_2_mac"
            self.vina_dir = os.path.join(data_dir, dirname)
            self.vina_cmd = os.path.join(self.vina_dir, "bin/vina")
        elif platform.system() == 'Windows':
            url = "http://vina.scripps.edu/download/autodock_vina_1_1_2_win32.msi"
            filename = "autodock_vina_1_1_2_win32.msi"
            self.vina_dir = "\\Program Files (x86)\\The Scripps Research Institute\\Vina"
            self.vina_cmd = os.path.join(self.vina_dir, "vina.exe")
        else:
            raise ValueError(
                "Unknown operating system.  Try using a cloud platform to run this code instead."
            )
        self.pocket_finder = pocket_finder
        if not os.path.exists(self.vina_dir):
            logger.info("Vina not available. Downloading")
            download_url(url, data_dir)
            downloaded_file = os.path.join(data_dir, filename)
            logger.info("Downloaded Vina. Extracting")
            if platform.system() == 'Windows':
                msi_cmd = "msiexec /i %s" % downloaded_file
                check_output(msi_cmd.split())
            else:
                with tarfile.open(downloaded_file) as tar:
                    tar.extractall(data_dir)
            logger.info("Cleanup: removing downloaded vina tar.gz")
            os.remove(downloaded_file)
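For context, a hedged sketch of how this pose generator is typically used (the class corresponds to DeepChem's dc.dock.VinaPoseGenerator; the file paths are placeholders and keyword arguments vary across DeepChem versions):

import deepchem as dc

vpg = dc.dock.VinaPoseGenerator()
# generate_poses takes a (protein_file, ligand_file) pair and shells out to
# the Vina executable downloaded by __init__ above.
poses, scores = vpg.generate_poses(("protein.pdb", "ligand.sdf"),
                                   out_dir="/tmp/docking",
                                   generate_scores=True)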
Example #5
    def __init__(self,
                 pretrain_model_path: Optional[str] = None,
                 radius: int = 1,
                 unseen: str = 'UNK',
                 gather_method: str = 'sum'):
        """
    Paremeters
    ----------
    pretrain_file: str, optional
      The path for pretrained model. If this value is None, we use the model which is put on
      github repository (https://github.com/samoturk/mol2vec/tree/master/examples/models).
      The model is trained on 20 million compounds downloaded from ZINC.
    radius: int, optional (default 1)
      The fingerprint radius. The default value was used to train the model which is put on
      github repository.
    unseen: str, optional (default 'UNK')
      The string to used to replace uncommon words/identifiers while training.
    gather_method: str, optional (default 'sum')
      How to aggregate vectors of identifiers are extracted from Mol2vec.
      'sum' or 'mean' is supported.
    """
        try:
            from gensim.models import word2vec
            from mol2vec.features import mol2alt_sentence, sentences2vec
        except ModuleNotFoundError:
            raise ValueError(
                "This class requires mol2vec and gensim to be installed.")

        self.radius = radius
        self.unseen = unseen
        self.gather_method = gather_method
        self.sentences2vec = sentences2vec
        self.mol2alt_sentence = mol2alt_sentence
        if pretrain_model_path is None:
            # Download and unpack the default pretrained model;
            # DEFAULT_PRETRAINED_MODEL_URL is a module-level constant.
            data_dir = get_data_dir()
            pretrain_model_path = path.join(data_dir,
                                            'mol2vec_model_300dim.pkl')
            if not path.exists(pretrain_model_path):
                targz_file = path.join(data_dir, 'mol2vec_model_300dim.tar.gz')
                if not path.exists(targz_file):
                    download_url(DEFAULT_PRETRAINED_MODEL_URL, data_dir)
                untargz_file(targz_file, data_dir)
        # Load the pretrained word2vec model.
        self.model = word2vec.Word2Vec.load(pretrain_model_path)
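A hedged usage sketch (the constructor corresponds to DeepChem's dc.feat.Mol2VecFingerprint; the SMILES input is arbitrary):

import deepchem as dc

# The first call downloads and unpacks the default pretrained model.
featurizer = dc.feat.Mol2VecFingerprint()
features = featurizer.featurize(["CCO"])
print(features.shape)  # (1, 300): one 300-dimensional embedding per molecule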
Example #6
    @staticmethod
    def zinc_encoder():
        """
        Returns
        -------
        obj
          An Encoder with weights that were trained on the ZINC dataset
        """
        current_dir = os.path.dirname(os.path.realpath(__file__))
        weights_filename = "zinc_model.h5"
        weights_file = os.path.join(current_dir, weights_filename)

        if not os.path.exists(weights_file):
            download_url("http://karlleswing.com/misc/keras-molecule/model.h5",
                         current_dir)
            # Rename the downloaded file in place; unlike shelling out to
            # `mv model.h5 ...`, this does not depend on the working directory.
            os.rename(os.path.join(current_dir, "model.h5"), weights_file)
        return TensorflowMoleculeEncoder(model_dir=current_dir,
                                         weights_file=weights_filename)
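A one-line usage sketch, assuming the enclosing TensorflowMoleculeEncoder class from this example's module:

# Downloads the weights on first use and caches them next to the module.
encoder = TensorflowMoleculeEncoder.zinc_encoder()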
Example #7
def create_gene_ontology(feature_mapping,
                         outputs_per_feature=0.3,
                         min_outputs=20,
                         min_node_features=6,
                         omit_redundant_nodes=True,
                         ontology_file=None):
  """Create a tree of OntologyNodes describing the Gene Ontology classification.

  See http://geneontology.org/ for details about the Gene Ontology classification.

  Parameters
  ----------
  feature_mapping: dict
    defines the mapping of features to GO categories.  Each key should be a
    feature ID.  The corresponding value should be a list of strings, giving the
    unique identifiers of all GO categories that feature belongs to.
  outputs_per_feature: float
    the number of outputs for each node is set to this value times the total
    number of features the node contains (including all subnodes)
  min_outputs: int
    the minimum number of outputs for any node
  min_node_features: int
    the minimum number of features corresponding to a node (including all its
    subnodes).  If a category has fewer features than this, no node is created
    for it.  Instead, its features are added directly to its parent node.
  omit_redundant_nodes: bool
    if True, a node will be omitted if it has only one child node and does not
    directly correspond to any features
  ontology_file: str
    the path to a Gene Ontology OBO file defining the ontology.  If this is
    omitted, the most recent version of the ontology is downloaded from the GO
    website.
  """
  # If necessary, download the file defining the ontology.

  if ontology_file is None:
    ontology_file = os.path.join(get_data_dir(), 'go-basic.obo')
    if not os.path.isfile(ontology_file):
      download_url('http://purl.obolibrary.org/obo/go/go-basic.obo')

  # Parse the ontology definition and create a list of terms.

  terms = []
  term = None
  with open(ontology_file) as obo_file:  # avoid shadowing the builtin input()
    for line in obo_file:
      if line.startswith('[Term]'):
        if term is not None:
          terms.append(term)
        term = {'parents': []}
      elif line.startswith('[Typedef]'):
        if term is not None:
          terms.append(term)
        term = None
      elif line.startswith('id:') and term is not None:
        term['id'] = line.split()[1]
      elif line.startswith('name:') and term is not None:
        term['name'] = line[5:].strip()
      elif line.startswith('is_a:') and term is not None:
        term['parents'].append(line.split()[1])
      elif line.startswith('is_obsolete:'):
        if line.split()[1] == 'true':
          term = None
  if term is not None:
    terms.append(term)

  # Create OntologyNode objects for all the terms.

  nodes = {}
  for term in terms:
    nodes[term['id']] = OntologyNode(term['id'], 0, name=term['name'])

  # Assign parent-child relationships between nodes, and identify root nodes.

  roots = []
  for term in terms:
    node = nodes[term['id']]
    for parent in term['parents']:
      nodes[parent].children.append(node)
    if len(term['parents']) == 0:
      roots.append(node)

  # Create a single root node that combines the three GO roots.

  root = OntologyNode('GO', 0, name='Gene Ontology Root Node', children=roots)

  # Assign features to nodes.

  for feature_id in feature_mapping:
    for node_id in feature_mapping[feature_id]:
      nodes[node_id].feature_ids.append(feature_id)

  # Count the number of features within each node.  Eliminate nodes with too few
  # features and set the number of outputs for each one.

  def count_features(node):
    self_features = set(node.feature_ids)
    all_features = set(node.feature_ids)
    for i, child in enumerate(node.children[:]):
      child_features = count_features(child)
      all_features.update(child_features)
      if len(child_features) < min_node_features:
        node.children.remove(child)
        self_features.update(child.feature_ids)
    if omit_redundant_nodes and len(
        node.children) == 1 and len(self_features) == 0:
      self_features = node.children[0].feature_ids
      node.children = node.children[0].children
    n_features = len(self_features)
    if n_features > len(node.feature_ids):
      node.feature_ids = list(self_features)
    node.n_outputs = max(min_outputs,
                         math.ceil(outputs_per_feature * n_features))
    return all_features

  count_features(root)
  return root
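A hedged usage sketch (the feature IDs and GO category IDs are illustrative only, and the printed attribute names are assumptions read off the constructor calls above; the first run downloads go-basic.obo into the data directory):

# Map each feature (e.g. a gene) to the GO categories it belongs to.
feature_mapping = {
    'gene_a': ['GO:0008150'],  # biological_process root
    'gene_b': ['GO:0003674'],  # molecular_function root
}
root = create_gene_ontology(feature_mapping)
print(root.name, root.n_outputs)  # attributes assumed from the code above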