def __init__(self,
             radius: float = 8.0,
             max_neighbors: int = 8,
             step: float = 0.2):
  """
  Parameters
  ----------
  radius: float (default 8.0)
    Radius of sphere for finding neighbors of atoms in unit cell.
  max_neighbors: int (default 8)
    Maximum number of neighbors to consider when constructing graph.
  step: float (default 0.2)
    Step size for Gaussian filter. This value is used when building edge features.
  """
  self.radius = radius
  self.max_neighbors = int(max_neighbors)
  self.step = step

  # load atom_init.json
  data_dir = get_data_dir()
  download_url(ATOM_INIT_JSON_URL, data_dir)
  atom_init_json_path = os.path.join(data_dir, 'atom_init.json')
  with open(atom_init_json_path, 'r') as f:
    atom_init_json = json.load(f)

  self.atom_features = {
      int(key): np.array(value, dtype=np.float32)
      for key, value in atom_init_json.items()
  }
  self.valid_atom_number = set(self.atom_features.keys())
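# Usage sketch (hedged): assuming this is the __init__ of DeepChem's
# CGCNNFeaturizer and that `structures` is a list of pymatgen Structure
# objects, featurization might look like:
#
#   featurizer = CGCNNFeaturizer(radius=8.0, max_neighbors=8, step=0.2)
#   graphs = featurizer.featurize(structures)  # one graph per structure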
def setUp(self):
  """Downloads dataset."""
  download_url(
      "http://deepchem.io.s3-website-us-west-1.amazonaws.com/featurized_datasets/core_grid.json"
  )
  json_fname = os.path.join(get_data_dir(), 'core_grid.json')
  self.core_dataset = dc.data.NumpyDataset.from_json(json_fname)
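# Sketch of a follow-on test method (illustrative only; the method name and
# assertion are assumptions, not part of the original test file):
#
#   def test_dataset_loaded(self):
#     assert self.core_dataset.X.shape[0] == self.core_dataset.y.shape[0]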
def __init__(self,
             sixty_four_bits: bool = True,
             pocket_finder: Optional[BindingPocketFinder] = None):
  """Initializes Vina Pose Generator

  Parameters
  ----------
  sixty_four_bits: bool, optional (default True)
    Specifies whether this is a 64-bit machine. Needed to download the
    correct executable.
  pocket_finder: BindingPocketFinder, optional (default None)
    If specified should be an instance of `dc.dock.BindingPocketFinder`.
  """
  data_dir = get_data_dir()
  if platform.system() == 'Linux':
    url = "http://vina.scripps.edu/download/autodock_vina_1_1_2_linux_x86.tgz"
    filename = "autodock_vina_1_1_2_linux_x86.tgz"
    dirname = "autodock_vina_1_1_2_linux_x86"
    self.vina_dir = os.path.join(data_dir, dirname)
    self.vina_cmd = os.path.join(self.vina_dir, "bin/vina")
  elif platform.system() == 'Darwin':
    if sixty_four_bits:
      url = "http://vina.scripps.edu/download/autodock_vina_1_1_2_mac_64bit.tar.gz"
      filename = "autodock_vina_1_1_2_mac_64bit.tar.gz"
      dirname = "autodock_vina_1_1_2_mac_catalina_64bit"
    else:
      url = "http://vina.scripps.edu/download/autodock_vina_1_1_2_mac.tgz"
      filename = "autodock_vina_1_1_2_mac.tgz"
      dirname = "autodock_vina_1_1_2_mac"
    self.vina_dir = os.path.join(data_dir, dirname)
    self.vina_cmd = os.path.join(self.vina_dir, "bin/vina")
  elif platform.system() == 'Windows':
    url = "http://vina.scripps.edu/download/autodock_vina_1_1_2_win32.msi"
    filename = "autodock_vina_1_1_2_win32.msi"
    self.vina_dir = "\\Program Files (x86)\\The Scripps Research Institute\\Vina"
    self.vina_cmd = os.path.join(self.vina_dir, "vina.exe")
  else:
    raise ValueError(
        "Unknown operating system. Try using a cloud platform to run this code instead."
    )
  self.pocket_finder = pocket_finder

  if not os.path.exists(self.vina_dir):
    logger.info("Vina not available. Downloading")
    download_url(url, data_dir)
    downloaded_file = os.path.join(data_dir, filename)
    logger.info("Downloaded Vina. Extracting")
    if platform.system() == 'Windows':
      msi_cmd = "msiexec /i %s" % downloaded_file
      check_output(msi_cmd.split())
    else:
      with tarfile.open(downloaded_file) as tar:
        tar.extractall(data_dir)
    logger.info("Cleanup: removing downloaded Vina archive")
    os.remove(downloaded_file)
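# Usage sketch (hedged): assuming this is the __init__ of DeepChem's
# VinaPoseGenerator, docking a protein-ligand pair might look like the
# following; the file names are hypothetical placeholders:
#
#   vpg = VinaPoseGenerator()
#   poses, scores = vpg.generate_poses(('protein.pdb', 'ligand.sdf'),
#                                      generate_scores=True)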
def __init__(self,
             pretrain_model_path: Optional[str] = None,
             radius: int = 1,
             unseen: str = 'UNK',
             gather_method: str = 'sum'):
  """
  Parameters
  ----------
  pretrain_model_path: str, optional (default None)
    The path to the pretrained model. If this value is None, we use the
    model from the GitHub repository
    (https://github.com/samoturk/mol2vec/tree/master/examples/models).
    The model is trained on 20 million compounds downloaded from ZINC.
  radius: int, optional (default 1)
    The fingerprint radius. The default value was used to train the model
    from the GitHub repository.
  unseen: str, optional (default 'UNK')
    The string used to replace uncommon words/identifiers while training.
  gather_method: str, optional (default 'sum')
    How to aggregate the vectors of identifiers extracted from Mol2vec.
    'sum' or 'mean' is supported.
  """
  try:
    from gensim.models import word2vec
    from mol2vec.features import mol2alt_sentence, sentences2vec
  except ModuleNotFoundError:
    raise ImportError("This class requires mol2vec to be installed.")

  self.radius = radius
  self.unseen = unseen
  self.gather_method = gather_method
  self.sentences2vec = sentences2vec
  self.mol2alt_sentence = mol2alt_sentence
  if pretrain_model_path is None:
    data_dir = get_data_dir()
    pretrain_model_path = path.join(data_dir, 'mol2vec_model_300dim.pkl')
    if not path.exists(pretrain_model_path):
      targz_file = path.join(data_dir, 'mol2vec_model_300dim.tar.gz')
      if not path.exists(targz_file):
        download_url(DEFAULT_PRETRAINED_MODEL_URL, data_dir)
      untargz_file(
          path.join(data_dir, 'mol2vec_model_300dim.tar.gz'), data_dir)
  # load pretrained model
  self.model = word2vec.Word2Vec.load(pretrain_model_path)
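# Usage sketch (hedged): assuming this is the __init__ of DeepChem's
# Mol2VecFingerprint featurizer, embedding a few SMILES strings might
# look like:
#
#   featurizer = Mol2VecFingerprint(radius=1, gather_method='sum')
#   vectors = featurizer.featurize(['CCO', 'c1ccccc1'])
#   # each row is the 300-dim aggregate of substructure embeddings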
def create_gene_ontology(feature_mapping,
                         outputs_per_feature=0.3,
                         min_outputs=20,
                         min_node_features=6,
                         omit_redundant_nodes=True,
                         ontology_file=None):
  """Create a tree of OntologyNodes describing the Gene Ontology classification.

  See http://geneontology.org/ for details about the Gene Ontology
  classification.

  Parameters
  ----------
  feature_mapping: dict
    defines the mapping of features to GO categories. Each key should be a
    feature ID. The corresponding value should be a list of strings, giving
    the unique identifiers of all GO categories that feature belongs to.
  outputs_per_feature: float
    the number of outputs for each node is set to this value times the total
    number of features the node contains (including all subnodes)
  min_outputs: int
    the minimum number of outputs for any node
  min_node_features: int
    the minimum number of features corresponding to a node (including all its
    subnodes). If a category has fewer features than this, no node is created
    for it. Instead, its features are added directly to its parent node.
  omit_redundant_nodes: bool
    if True, a node will be omitted if it has only one child node and does
    not directly correspond to any features
  ontology_file: str
    the path to a Gene Ontology OBO file defining the ontology. If this is
    omitted, the most recent version of the ontology is downloaded from the
    GO website.
  """
  # If necessary, download the file defining the ontology.
  if ontology_file is None:
    ontology_file = os.path.join(get_data_dir(), 'go-basic.obo')
    if not os.path.isfile(ontology_file):
      download_url('http://purl.obolibrary.org/obo/go/go-basic.obo')

  # Parse the ontology definition and create a list of terms.
  terms = []
  term = None
  with open(ontology_file) as input_file:
    for line in input_file:
      if line.startswith('[Term]'):
        if term is not None:
          terms.append(term)
        term = {'parents': []}
      elif line.startswith('[Typedef]'):
        if term is not None:
          terms.append(term)
        term = None
      elif line.startswith('id:') and term is not None:
        term['id'] = line.split()[1]
      elif line.startswith('name:') and term is not None:
        term['name'] = line[5:].strip()
      elif line.startswith('is_a:') and term is not None:
        term['parents'].append(line.split()[1])
      elif line.startswith('is_obsolete:'):
        if line.split()[1] == 'true':
          term = None
  if term is not None:
    terms.append(term)

  # Create OntologyNode objects for all the terms.
  nodes = {}
  for term in terms:
    nodes[term['id']] = OntologyNode(term['id'], 0, name=term['name'])

  # Assign parent-child relationships between nodes, and identify root nodes.
  roots = []
  for term in terms:
    node = nodes[term['id']]
    for parent in term['parents']:
      nodes[parent].children.append(node)
    if len(term['parents']) == 0:
      roots.append(node)

  # Create a single root node that combines the three GO roots.
  root = OntologyNode('GO', 0, name='Gene Ontology Root Node', children=roots)

  # Assign features to nodes.
  for feature_id in feature_mapping:
    for node_id in feature_mapping[feature_id]:
      nodes[node_id].feature_ids.append(feature_id)

  # Count the number of features within each node. Eliminate nodes with too
  # few features and set the number of outputs for each one.
  def count_features(node):
    """Recursively count features in a subtree, pruning small nodes."""
    self_features = set(node.feature_ids)
    all_features = set(node.feature_ids)
    # Iterate over a copy of the child list, since we may remove children.
    for child in list(node.children):
      child_features = count_features(child)
      all_features.update(child_features)
      if len(child_features) < min_node_features:
        # Too few features for a node of its own: merge it into this node.
        node.children.remove(child)
        self_features.update(child.feature_ids)
    if omit_redundant_nodes and len(
        node.children) == 1 and len(self_features) == 0:
      # This node adds nothing over its single child: splice the child out.
      self_features = node.children[0].feature_ids
      node.children = node.children[0].children
    n_features = len(self_features)
    if n_features > len(node.feature_ids):
      node.feature_ids = list(self_features)
    node.n_outputs = max(min_outputs,
                         math.ceil(outputs_per_feature * n_features))
    return all_features

  count_features(root)
  return root
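# Usage sketch (hedged): the mapping below is hypothetical; in practice the
# keys would be your dataset's feature IDs (e.g. gene identifiers) and the
# values lists of GO category IDs:
#
#   feature_mapping = {
#       'ENSG00000139618': ['GO:0006281', 'GO:0005654'],
#       'ENSG00000141510': ['GO:0006915'],
#   }
#   root = create_gene_ontology(feature_mapping, min_node_features=1)
#   print(root.n_outputs, len(root.children))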