sns.set()
rdBase.DisableLog('rdApp.error')
np.random.seed(0)


def fp2arr(fp):
    """Convert an RDKit fingerprint to a numpy array.

    The array is created as a one-element placeholder and handed to
    ``DataStructs.ConvertToNumpyArray``, which writes the fingerprint into it.
    """
    arr = np.zeros((1,))
    DataStructs.ConvertToNumpyArray(fp, arr)
    return arr


def _load_mols(path, first_token_only=False):
    """Parse one SMILES per line of ``path`` and return the valid molecules.

    Blank lines are skipped: they previously raised ``IndexError`` in the
    first-token loader and were handed to the parser as an empty SMILES in the
    whole-line loader. SMILES that RDKit cannot parse are printed, as before.

    Args:
        path: text file with one record per line.
        first_token_only: when True only the first whitespace-separated field
            of each line is parsed; otherwise the right-stripped line is.
    """
    mols = []
    with open(path, 'r') as f:
        for line in f:
            smiles = line.rstrip()
            if not smiles:
                continue
            if first_token_only:
                smiles = smiles.split()[0]
            mol = Chem.MolFromSmiles(smiles)
            if mol is not None:
                mols.append(mol)
            else:
                print(smiles)
    return mols


mol_zinc = _load_mols('zinc10000.txt')
mol_active = _load_mols('actives_final.ism', first_token_only=True)
def canonicalize(smiles):
    """Return the RDKit canonical form of ``smiles`` when one can be produced.

    The input string is handed back untouched when it is empty, when RDKit
    cannot parse it, or when the parsed molecule has at most one atom.
    """
    parsed = Chem.MolFromSmiles(smiles)
    if smiles == '' or parsed is None:
        return smiles
    if parsed.GetNumAtoms() <= 1:
        return smiles
    return Chem.MolToSmiles(parsed)
def test1(self):
    """Build subshapes for test_data/5ht3ligs.sdf and check alignment counts."""
    # computeCanonicalTransform yields more approximate eigenvalues and
    # eigenvectors when built against the native RDKit PowerEigenSolver, so
    # the expected numbers depend on which build is in use.
    if hasattr(AllChem, 'ComputePrincipalAxesAndMomentsFromGyrationMatrix'):
        expectedSkelPts = 15
        expectedAlgs = [0, 5, 21, 0]
        prunedAlgs = [0, 4, 11, 0]
    else:
        expectedSkelPts = 16
        expectedAlgs = [0, 5, 28, 0]
        prunedAlgs = [0, 4, 12, 0]

    testDir = os.path.dirname(os.path.abspath(__file__))
    suppl = Chem.SDMolSupplier(os.path.join(testDir, 'test_data', '5ht3ligs.sdf'))

    builder = SubshapeBuilder.SubshapeBuilder()
    builder.gridDims = (20., 20., 10)
    builder.gridSpacing = 0.5
    builder.winRad = 4.

    ms, shapes = [], []
    for rawMol in suppl:
        molWithHs = Chem.AddHs(rawMol, addCoords=True)
        AllChem.CanonicalizeConformer(molWithHs.GetConformer())
        ms.append(molWithHs)
        shapes.append(builder(molWithHs, terminalPtsOnly=True))

    self.assertEqual(len(ms), 4)
    self.assertEqual(len(shapes), 4)
    self.assertEqual([len(shape.skelPts) for shape in shapes], [5, 5, 5, 5])

    refShape = builder.GenerateSubshapeShape(ms[0])
    self.assertEqual(len(refShape.skelPts), expectedSkelPts)

    aligner = SubshapeAligner.SubshapeAligner()
    aligner.shapeDistTol = .30

    def alignAll(alignFn):
        # Align every probe against the reference; the reference itself
        # (index 0) and empty shapes contribute an empty list.
        collected = []
        for pos, probeShape in enumerate(shapes):
            if pos and probeShape:
                collected.append(alignFn(ms[0], refShape, ms[pos], probeShape, builder))
            else:
                collected.append([])
        return collected

    # Once through the explicit method ...
    algStore = alignAll(aligner.GetSubshapeAlignments)
    self.assertEqual([len(found) for found in algStore], expectedAlgs)

    # ... and once through the aligner's call interface.
    algStore = alignAll(lambda *callArgs: list(aligner(*callArgs)))
    self.assertEqual([len(found) for found in algStore], expectedAlgs)

    pruned = [
        SubshapeAligner.ClusterAlignments(mi, algStore[pos], builder, neighborTol=0.15)
        for pos, mi in enumerate(ms)
    ]
    self.assertEqual([len(kept) for kept in pruned], prunedAlgs)
print('successfully loaded editor model from %s' % path) if config['proposal'] == 'random': proposal = Proposal_Random(config) elif config['proposal'] == 'editor': proposal = Proposal_Editor(config, editor) elif config['proposal'] == 'mix': proposal = Proposal_Mix(config, editor) else: raise NotImplementedError ### sampler if config['sampler'] == 're': sampler = Sampler_Recursive(config, proposal, estimator) elif config['sampler'] == 'sa': sampler = Sampler_SA(config, proposal, estimator) elif config['sampler'] == 'mh': sampler = Sampler_MH(config, proposal, estimator) else: raise NotImplementedError ### sampling if config['mols_init']: mols = load_mols(config['data_dir'], config['mols_init']) mols = random.choices(mols, k=config['num_mols']) mols_init = mols[:config['num_mols']] else: mols_init = [ Chem.MolFromSmiles('CC') for _ in range(config['num_mols']) ] sampler.sample(run_dir, mols_init)
def mols_to_pngs(mols, basename="test"):
    """Draw each RDKit mol to ``MUV_<basename><index>.png``; return the file names."""
    written = []
    for index, molecule in enumerate(mols):
        png_name = "MUV_%s%d.png" % (basename, index)
        Draw.MolToFile(molecule, png_name)
        written.append(png_name)
    return written


# Render the first few molecules of the dataset.
num_to_display = 12
molecules = [
    Chem.MolFromSmiles(data["smiles"])
    for _, data in islice(dataset.iterrows(), num_to_display)
]
display_images(mols_to_pngs(molecules))

MUV_tasks = [
    'MUV-692', 'MUV-689', 'MUV-846', 'MUV-859', 'MUV-644', 'MUV-548',
    'MUV-852', 'MUV-600', 'MUV-810', 'MUV-712', 'MUV-737', 'MUV-858',
    'MUV-713', 'MUV-733', 'MUV-652', 'MUV-466', 'MUV-832',
]

# Featurize the CSV with 1024-bit circular fingerprints.
featurizer = dc.feat.CircularFingerprint(size=1024)
loader = dc.data.CSVLoader(
    tasks=MUV_tasks,
    smiles_field="smiles",
    featurizer=featurizer,
)
dataset = loader.featurize(dataset_file)
splitter = dc.splits.RandomSplitter(dataset_file)
def test2Issue217(self):
    """Add one conformer to benzene and write the molecule out as a mol block."""
    mol = Chem.MolFromSmiles('c1ccccc1')
    addConf(mol)
    hasSingleConformer = mol.GetNumConformers() == 1
    self.assertTrue(hasSingleConformer)
    # The mol block itself is not inspected; the call just has to succeed.
    Chem.MolToMolBlock(mol)
def predict(self, react, top_cand_bonds, top_cand_scores=None, scores=True,
            top_n=100, atommap=False):
    """Rank candidate products for a reaction.

    Args:
        react: atom mapped reactant smiles.
        top_cand_bonds: list of strings "ai-aj-bo" (1-based atom map numbers
            and a bond order).
        top_cand_scores: optional scores, one per entry of ``top_cand_bonds``;
            when empty or omitted every candidate bond gets a score of 0.0.
        scores: when True each outcome also carries 'score' and 'prob'.
        top_n: maximum number of outcomes returned.
        atommap: forwarded to ``edit_mol``.

    Returns:
        A list of dicts with keys 'rank' and 'smiles' (plus 'score' and 'prob'
        when ``scores`` is True), in the order given by the model's candidates.
    """
    if not top_cand_scores:
        top_cand_scores = [0.0] * len(top_cand_bonds)

    cand_bonds = []
    for i, b in enumerate(top_cand_bonds):
        x, y, t = b.split('-')
        # Atom numbers arrive 1-based (possibly formatted as floats).
        cand_bonds.append((int(float(x)) - 1, int(float(y)) - 1, float(t),
                           float(top_cand_scores[i])))

    # Shrink the core until the number of enumerated candidates fits. The
    # counter must start from the configured core size and be the value that
    # is actually passed on: previously ``ncore`` was decremented without ever
    # being assigned (UnboundLocalError) and the call kept using the
    # unchanged ``core_size``.
    ncore = core_size
    while True:
        src_tuple, conf = smiles2graph(react, None, cand_bonds, None,
                                       core_size=ncore, cutoff=MAX_NCAND,
                                       testing=True)
        if len(conf) <= MAX_NCAND:
            break
        ncore -= 1

    feed_map = dict(zip(self.src_holder, src_tuple))
    cur_scores, cur_probs, candidates = self.session.run(
        self.predict_vars, feed_dict=feed_map)

    bond_types = [
        Chem.rdchem.BondType.SINGLE, Chem.rdchem.BondType.DOUBLE,
        Chem.rdchem.BondType.TRIPLE, Chem.rdchem.BondType.AROMATIC
    ]
    bond_types_as_double = {0.0: 0, 1.0: 1, 2.0: 2, 3.0: 3, 1.5: 4}

    # Don't waste predictions on bond changes that aren't actually changes:
    # record the reactant's bonds keyed by (smaller map num, larger map num).
    rmol = Chem.MolFromSmiles(react)
    rbonds = {}
    for bond in rmol.GetBonds():
        a1 = bond.GetBeginAtom().GetAtomMapNum()
        a2 = bond.GetEndAtom().GetAtomMapNum()
        rbonds[(min(a1, a2), max(a1, a2))] = bond_types.index(bond.GetBondType()) + 1

    cand_smiles = []
    cand_scores = []
    cand_probs = []
    for idx in candidates:
        # Define edits from prediction
        cbonds = []
        for x, y, t, v in conf[idx]:
            x, y = x + 1, y + 1
            is_new_bond = (x, y) not in rbonds and t > 0
            is_changed_bond = (x, y) in rbonds and rbonds[(x, y)] != t
            if is_new_bond or is_changed_bond:
                cbonds.append((x, y, bond_types_as_double[t]))
        cand_smiles.append(edit_mol(rmol, cbonds, atommap=atommap))
        cand_scores.append(cur_scores[idx])
        cand_probs.append(cur_probs[idx])

    outcomes = []
    for i in range(min(len(cand_smiles), top_n)):
        outcome = {
            'rank': i + 1,
            'smiles': '.'.join(cand_smiles[i]),
        }
        if scores:
            outcome['score'] = cand_scores[i]
            outcome['prob'] = cand_probs[i]
        outcomes.append(outcome)
    return outcomes
def _read_smi(file_name): while True: line = file_name.readline() if not line: break yield Chem.MolFromSmiles(line.split('\t')[0])
def generate_corpus(in_file, out_file, r, sentence_type='alt', n_jobs=1):
    """Generates corpus file(s) from a molecule file.

    Parameters
    ----------
    in_file : str
        Input file; supported extensions are sdf, smi, ism and their gzipped
        (``.gz``) variants
    out_file : str
        Output name. The 'alt' corpus is written to exactly this path; the
        per-radius corpora go to ``out_file + '_r0.corpus'`` and
        ``out_file + '_r1.corpus'``
    r : int
        Radius of morgan fingerprint
    sentence_type : str
        Options:    'all' - generates all corpus files for all types of sentences,
                    'alt' - generates a corpus file with only combined alternating sentence,
                    'individual' - generates corpus files for each radius
    n_jobs : int
        Number of cores to use (only 'alt' sentence type is parallelized)

    Raises
    ------
    ValueError
        If the file extension is not supported

    Returns
    -------
    """
    # File type detection (done before any file is opened)
    supported = ('sdf', 'smi', 'ism')
    unsupported_msg = 'File extension not supported (sdf, smi, ism, sdf.gz, smi.gz)'
    in_split = in_file.split('.')
    extension = in_split[-1].lower()
    gzipped = extension == 'gz'
    if gzipped:
        # A name such as 'gz' has no inner extension; treat it as unsupported
        # instead of failing with an IndexError.
        if len(in_split) < 2 or in_split[-2].lower() not in supported:
            raise ValueError(unsupported_msg)
        extension = in_split[-2].lower()
    elif extension not in supported:
        raise ValueError(unsupported_msg)

    file_handles = []
    mols_file = None
    try:
        # write only files which contain corpus
        if sentence_type in ('individual', 'all'):
            f1 = open(out_file+'_r0.corpus', "w")
            file_handles.append(f1)
            f2 = open(out_file+'_r1.corpus', "w")
            file_handles.append(f2)
        if sentence_type in ('alt', 'all'):
            f3 = open(out_file, "w")
            file_handles.append(f3)

        if gzipped:
            import gzip
            if extension == 'sdf':
                mols_file = gzip.open(in_file, mode='r')
                suppl = Chem.ForwardSDMolSupplier(mols_file)
            else:
                mols_file = gzip.open(in_file, mode='rt')
                suppl = _read_smi(mols_file)
        elif extension == 'sdf':
            suppl = Chem.ForwardSDMolSupplier(in_file)
        else:
            mols_file = open(in_file, mode='rt')
            suppl = _read_smi(mols_file)

        if sentence_type == 'alt':  # This can run parallelized
            result = Parallel(n_jobs=n_jobs, verbose=1)(
                delayed(_parallel_job)(mol, r) for mol in suppl)
            n_written = 0
            for n_written, line in enumerate(result, start=1):
                f3.write(str(line) + '\n')
            # The message used a bare '%' and never reported the count.
            print('%d molecules successfully processed.' % n_written)
        else:
            for mol in suppl:
                if mol is None:
                    continue
                # Round-trip through SMILES before building the sentences.
                smiles = Chem.MolToSmiles(mol)
                mol = Chem.MolFromSmiles(smiles)
                identifier_sentences, alternating_sentence = mol2sentence(mol, r)
                identifier_sentence_r0 = " ".join(identifier_sentences[0])
                identifier_sentence_r1 = " ".join(identifier_sentences[1])
                alternating_sentence_r0r1 = " ".join(alternating_sentence)
                if len(smiles) == 0:
                    continue
                if sentence_type in ('individual', 'all'):
                    f1.write(str(identifier_sentence_r0)+'\n')
                    f2.write(str(identifier_sentence_r1)+'\n')
                if sentence_type in ('alt', 'all'):
                    f3.write(str(alternating_sentence_r0r1)+'\n')
    finally:
        # Release every handle even when parsing or writing fails part-way.
        for fh in file_handles:
            fh.close()
        if mols_file is not None:
            mols_file.close()
mymols = make_molecules(cno)

# Sum-over-bonds descriptor
bond_types, bonds_in_molecule = sum_over_bonds(mymols)
np.savetxt("sum_over_bonds.out", bonds_in_molecule, delimiter=" ")

# ----------------------- Estate indices -----------------------
# There are 79 possible Estate descriptors, but only a subset is non-zero for
# the Huang-Massa/Mathieu dataset, so the all-zero columns are dropped with
# scrub_null_columns().
num_smiles = len(smi)
estate_fingers = np.zeros((num_smiles, 79))  # one row per SMILES, 79 descriptors
for icount in range(num_smiles):
    m = Chem.MolFromSmiles(smi[icount])
    counts, sums = FingerprintMol(m)
    # `sums` could be used as the descriptor instead of `counts`.
    estate_fingers[icount, :] = np.transpose(counts)

nz_estate = scrub_null_columns(estate_fingers)
np.savetxt("nz_estate.out", nz_estate, delimiter=" ")
# ------------------- end of Estate generation -------------------

# Morgan fingerprints from Dan's code
dan_prints = make_fingerprints(mymols)
morgan_prints = np.asarray(dan_prints[2].x)
np.savetxt("morgan_prints.out", morgan_prints, delimiter=" ")
def cal_feature_IG(sess, all_data, placeholders, info, config, prediction,
                   ig_modal_target, ig_label_target, *,
                   model=None, logger=None, args=None):
    """Calculate integrated gradients and dump one file per compound and task.

    Args:
        sess: session object
        all_data: dataset; ``num``, ``labels`` and ``['sequences']`` are read
        placeholders: forwarded to ``construct_feed`` and the visualizer
        info: ``info.mol_info["obj_list"]`` is used when "mol_info" is in info
        config: ``config["visualize_path"]`` is the output directory (created
            if missing)
        prediction: prediction score (output of the network)
        ig_modal_target: forwarded to the visualizer and used in the file name
        ig_label_target: "max", "all", "correct", "uncorrect", "label", or a
            value convertible to an int class index
        model: optional; its ``embedding`` method is used when present and
            sequence data exists
        logger: object with an ``info`` method; it is called unconditionally
        args: namespace providing ``visualization_header``,
            ``visualize_resample_num``, ``verbose`` and ``visualize_method``.
            NOTE(review): apart from the header lookup these are read without
            a None check, so ``args`` is effectively required.
    """
    divide_number = 100
    header = "mol"
    if args is not None and args.visualization_header is not None:
        header = args.visualization_header
    outdir = config["visualize_path"]
    os.makedirs(outdir, exist_ok=True)
    mol_obj_list = info.mol_info["obj_list"] if "mol_info" in info else None
    tf_grads = None
    all_count = 0
    correct_count = 0
    visualize_ids = range(all_data.num)
    if args.visualize_resample_num:
        visualize_ids = np.random.choice(visualize_ids,
                                         args.visualize_resample_num,
                                         replace=False)
    for compound_id in visualize_ids:
        s = time.time()
        batch_idx = [compound_id]
        if all_data['sequences'] is not None and hasattr(model, "embedding"):
            _data = all_data['sequences']
            _data = np.expand_dims(_data[compound_id, ...], axis=0)
            _data = model.embedding(sess, _data)
            feed_dict = construct_feed(batch_idx, placeholders, all_data,
                                       batch_size=1, info=info,
                                       embedded_layer=_data)
        else:
            feed_dict = construct_feed(batch_idx, placeholders, all_data,
                                       batch_size=1, info=info)

        out_prediction = sess.run(prediction, feed_dict=feed_dict)
        # Bring every output to 3 dimensions to give consistency with multitask.
        multitask = False
        if len(out_prediction.shape) == 1:
            out_prediction = out_prediction[:, np.newaxis, np.newaxis]
        elif len(out_prediction.shape) == 2:
            out_prediction = np.expand_dims(out_prediction, axis=1)
        elif len(out_prediction.shape) == 3:
            if out_prediction.shape[1] > 1:
                multitask = True
        # out_prediction: #data x #task x #class
        # labels: data x #task/#label
        for idx in range(out_prediction.shape[1]):
            _out_prediction = out_prediction[0, idx, :]
            true_label = np.argmax(
                all_data.labels[compound_id]
            ) if not multitask else all_data.labels[compound_id, idx]
            _prediction = prediction[:, idx, :] if len(
                prediction.shape) == 3 else prediction
            # Pick the output the gradients are taken with respect to.
            if ig_label_target == "max":
                target_index = np.argmax(_out_prediction)
                target_prediction = _prediction[:, target_index]
                target_score = _out_prediction[target_index]
            elif ig_label_target == "all":
                target_prediction = _prediction
                target_index = "all"
                target_score = np.sum(_out_prediction)
            elif ig_label_target == "correct":
                target_index = np.argmax(_out_prediction)
                if not target_index == true_label:
                    continue
                target_prediction = _prediction[:, target_index]
                target_score = _out_prediction[target_index]
            elif ig_label_target == "uncorrect":
                target_index = np.argmax(_out_prediction)
                if target_index == true_label:
                    continue
                target_prediction = _prediction[:, target_index]
                target_score = _out_prediction[target_index]
            elif ig_label_target == "label":
                target_index = true_label
                target_prediction = _prediction[:, target_index]
                target_score = _out_prediction[target_index]
            else:
                target_index = int(ig_label_target)
                target_prediction = _prediction[:, target_index]
                target_score = _out_prediction[target_index]

            # convert a assay string according to a prediction score
            if len(_out_prediction) > 2:  # softmax output
                assay_str = f"class{target_index}"
            elif len(_out_prediction) == 2:  # softmax output
                assay_str = "active" if _out_prediction[1] > 0.5 else "inactive"
            else:
                assay_str = "active" if _out_prediction > 0.5 else "inactive"

            # Best effort: the molecule is optional (mol_obj_list may be None
            # or the entry may not convert), so fall back to no name. Catch
            # Exception rather than everything so Ctrl-C still interrupts.
            try:
                mol_name = Chem.MolToSmiles(mol_obj_list[compound_id])
                mol_obj = mol_obj_list[compound_id]
            except Exception:
                mol_name = None
                mol_obj = None

            if args.verbose:
                print(
                    f"No.{compound_id}, task={idx}: \"{mol_name}\": {assay_str} (score= {_out_prediction}, "
                    f"true_label= {true_label}, target_label= {target_index}, target_score= {target_score})"
                )
            else:
                print(
                    f"No.{compound_id}, task={idx}: \"{mol_name}\": {assay_str}"
                )

            visualizer = CompoundVisualizer(
                sess, outdir, compound_id, info, config, batch_idx,
                placeholders, all_data, target_prediction,
                logger=logger, model=model,
                ig_modal_target=ig_modal_target,
                perturbation_target=ig_modal_target,
                grads=tf_grads)
            # Keep the first visualizer's gradients and pass them to later ones.
            tf_grads = visualizer.grads if tf_grads is None else tf_grads
            visualizer.cal_integrated_gradients(sess, divide_number,
                                                method=args.visualize_method)
            visualizer.check_IG(sess, target_prediction)
            visualizer.dump(
                f"{header}_{compound_id:04d}_task_{idx}_{assay_str}_{ig_modal_target}_scaling.jbl",
                additional_data={
                    "mol": mol_obj,
                    "prediction_score": target_score,
                    "target_label": target_index,
                    "true_label": true_label,
                })
            logger.info(
                f"prediction score: {target_score}\n"
                f"check score: {visualizer.end_score - visualizer.start_score}\n"
                f"sum of IG: {visualizer.sum_of_ig}\n"
                f"time : {time.time() - s}\n")
            all_count += 1
            if np.argmax(_out_prediction) == int(true_label):
                correct_count += 1

    # Nothing may have been visualized (empty data, or every compound skipped
    # by the "correct"/"uncorrect" filters); avoid dividing by zero then.
    if all_count:
        logger.info(f"accuracy(visualized_data) = {correct_count/all_count}")
    else:
        logger.info("accuracy(visualized_data) = n/a (no compound was visualized)")
def sample(mdl, scaffold_smi, num_samples):
    """Generate `num_samples` samples from the model `mdl` based on a given
    scaffold with SMILES `scaffold_smi`.

    Args:
        mdl (DeepScaffold): The scaffold-based molecule generative model
        scaffold_smi (str): The SMILES string of the given scaffold
        num_samples (int): The number of samples to generate

    Returns:
        t.Tuple[t.List[t.Union[str, None]], float, float]:
            The generated SMILES (molecules that do not satisfy the validity
            requirements are returned as `None`), the fraction of samples that
            are valid, and the fraction of valid samples that are distinct.
            Each fraction is 0.0 when its denominator would be zero.
    """
    lg = RDLogger.logger()
    lg.setLevel(RDLogger.CRITICAL)

    # Convert SMILES to molecule
    scaffold = Chem.MolFromSmiles(scaffold_smi)

    # Convert molecule to numpy array
    # shape: 1, ..., 5
    scaffold_array, _ = get_array_from_mol(
        mol=scaffold,
        scaffold_nodes=range(scaffold.GetNumHeavyAtoms()),
        nh_nodes=[],
        np_nodes=[],
        k=1,
        p=1.0)

    # Convert numpy array to torch tensor
    # shape: 1, ..., 5
    scaffold_tensor = torch.from_numpy(scaffold_array).long().cuda()

    # Generate
    with torch.no_grad():
        # Expand the first dimension
        # shape: num_samples, ..., 5
        scaffold_tensor = scaffold_tensor.expand(num_samples, -1, -1)
        # Generate samples
        # shape: [num_samples, -1, 5]
        mol_array = mdl.generate(scaffold_tensor)

    # Move to CPU
    mol_array = mol_array.detach().cpu().numpy()

    # Convert numpy array to Chem.Mol object
    mol_list = get_mol_from_array(mol_array, sanitize=True)

    def _to_smiles(_mol):
        """Return the SMILES of `_mol`, or None if it does not round-trip."""
        if _mol is None:
            return None
        try:
            _smiles = Chem.MolToSmiles(_mol)
        except ValueError:
            # The molecule can not be converted to SMILES
            return None
        if _smiles is None:
            return None
        # Make sure that the SMILES can be converted back to a molecule
        try:
            _mol = Chem.MolFromSmiles(_smiles)
        except ValueError:
            return None
        if _mol is None:
            return None
        return _smiles

    smiles_list = list(map(_to_smiles, mol_list))

    # Validity statistics
    valid_smiles = [_smi for _smi in smiles_list if _smi is not None]
    num_valid = len(valid_smiles)
    percent_valid = float(num_valid) / len(smiles_list) if smiles_list else 0.0

    # Uniqueness statistics: count distinct *valid* SMILES. The previous
    # `len(set(smiles_list)) - 1` assumed a `None` entry was always present
    # and under-counted by one whenever every sample was valid.
    num_unique = len(set(valid_smiles))
    percent_unique = float(num_unique) / num_valid if num_valid else 0.0

    return smiles_list, percent_valid, percent_unique
def perceive_smiles(self, atommap=True):
    """
    Perceive a SMILES (with bond orders) for the current geometry using Open
    Babel and RDKit, store it in self.smiles and return it.

    The steps are: let Open Babel infer the connectivity from the 3D
    coordinates, go through InChI to saturate unphysical multi-radical
    structures, read the InChI with RDKit, and - when atommap is true - match
    the RDKit atoms against the atoms in self so the returned SMILES carries
    the atom map numbers of self.to_rdkit_mol().

    A SanitizationError is raised when RDKit cannot read the InChI or when the
    element counts after adding hydrogens differ from those in self.

    This method requires Open Babel version >=2.4.1
    """
    # Element counts of the true structure, for comparison further down.
    atoms_in_mol_true = {}
    for atom in self:
        anum = atom.get_atomicnum()
        if anum in atoms_in_mol_true:
            atoms_in_mol_true[anum] += 1
        else:
            atoms_in_mol_true[anum] = 1

    # RDKit offers no particularly simple way to read 3D structures, and RMG
    # misses some single bonds, so Open Babel does this part. Connections
    # have probably been set by an earlier to_pybel_mol call, but repeating
    # it should not be too expensive.
    pybel_mol = self.to_pybel_mol()

    # Open Babel often produces single bonds and multi-radical Smiles where
    # double bonds would be expected. InChI does not consider bond orders,
    # so converting to InChI first works around this.
    inchi_text = pybel_mol.write('inchi', opt={'F': None})  # Add fixed H layer
    inchi = inchi_text.strip()

    # Back to an RDKit molecule; RDKit doesn't like some hypervalent atoms.
    mol_sanitized = Chem.MolFromInchi(inchi)
    if mol_sanitized is None:
        raise SanitizationError(
            f'Could not convert \n{self.to_xyz()}\nto Smiles. Unsanitized Smiles: {pybel_mol.write("smi").strip()}'
        )

    # RDKit adds unnecessary hydrogens in some cases. If that happened, the
    # element counts no longer agree: give up and report an error.
    mol_sanitized = Chem.AddHs(mol_sanitized)
    atoms_in_mol_sani = {}
    for rd_atom in mol_sanitized.GetAtoms():
        element = rd_atom.GetAtomicNum()
        atoms_in_mol_sani[element] = atoms_in_mol_sani.get(element, 0) + 1
    if atoms_in_mol_sani != atoms_in_mol_true:
        raise SanitizationError(
            f'Could not convert \n{self.to_xyz()}\nto Smiles. Wrong Smiles: {Chem.MolToSmiles(mol_sanitized)}'
        )

    if atommap:
        # The trip through InChI lost the atom mapping. Restore it by
        # matching against the original molecule; there should only be one
        # unique map.
        mol_with_map = self.to_rdkit_mol()  # This only has single bonds
        mol_sani_sb = Chem.Mol(mol_sanitized)  # Copy, reduced to single bonds
        for bond in mol_sani_sb.GetBonds():
            bond.SetBondType(Chem.rdchem.BondType.SINGLE)
        match = mol_sani_sb.GetSubstructMatch(mol_with_map)  # Isomorphism mapping
        assert mol_with_map.GetNumAtoms() == len(match)  # Make sure we match all atoms
        for mapped_atom in mol_with_map.GetAtoms():
            target_idx = match[mapped_atom.GetIdx()]
            mol_sanitized.GetAtomWithIdx(target_idx).SetAtomMapNum(
                mapped_atom.GetAtomMapNum())

    # With everything above succeeded this is hopefully a sensible Smiles
    # string (with atom mappings for all atoms when atommap is true).
    self.smiles = Chem.MolToSmiles(mol_sanitized)
    return self.smiles
#!/usr/bin/env python # -*- coding:utf-8 -*- import os import numpy as np import pybel from rdkit import Chem from rdkit.Chem import AllChem, GetPeriodicTable _rdkit_periodic_table = GetPeriodicTable() RDKIT_SMILES_PARSER_PARAMS = Chem.SmilesParserParams() def smiles_to_rdkit(smi, gen_3d=True, nconf=100): """ Convert smiles to RDKit molecule. Tries to generate the lowest-energy conformer. """ mol = Chem.MolFromSmiles(smi) mol = Chem.AddHs(mol) if gen_3d: cids = AllChem.EmbedMultipleConfs(mol, nconf, AllChem.ETKDG()) AllChem.MMFFSanitizeMolecule(mol) mmff_props = AllChem.MMFFGetMoleculeProperties(mol) energies = [] for cid in cids: ff = AllChem.MMFFGetMoleculeForceField(mol, mmff_props, confId=cid)
def transform_command(parser, args):
    """Run the transform command: fragment ``args.smiles``, apply the
    dataset's transformations and write the products to ``args.output``.

    Argument problems are reported through ``parser.error``; an
    ``analysis_algorithms.EvalError`` raised by the transform is written to
    stderr and the command exits with status 1. When ``args.times`` is set,
    per-phase timings are written to stderr. A worker pool is created when
    ``args.jobs > 1`` and is shut down before the function returns.
    """
    min_radius = args.min_radius
    assert min_radius in list("012345"), min_radius
    min_radius = int(min_radius)
    min_pairs = int(args.min_pairs)
    min_variable_size = args.min_variable_size
    max_variable_size = args.max_variable_size
    assert max_variable_size > min_variable_size, "max-variable-size must be greater than min-variable-size"
    min_constant_size = args.min_constant_size

    explain = command_support.get_explain(args.explain)

    start_time = time.time()
    dataset = dbutils.open_dataset_from_args_or_exit(args)
    open_time = time.time()

    property_names = command_support.get_property_names_or_error(
        parser, args, dataset)
    # With no property names, empty results are included.
    # should there be a --show-all option to enable this?
    include_empty = not property_names

    if args.substructure:
        substructure_pat = Chem.MolFromSmarts(args.substructure)
        if substructure_pat is None:
            parser.error("Cannot parse --substructure %r" % (args.substructure, ))
    else:
        substructure_pat = None

    # evaluate --where, --score, and --rule-selection-cutoffs.
    rule_selection_function = analysis_algorithms.get_rule_selection_function_from_args(
        parser, args)

    transform_tool = analysis_algorithms.get_transform_tool(
        dataset, rule_selection_function)
    transform_record = transform_tool.fragment_transform_smiles(args.smiles)
    if transform_record.errmsg:
        parser.error("Unable to fragment --smiles %r: %s" %
                     (args.smiles, transform_record.errmsg))

    # Make sure I can open the output file before I start doing heavy work.
    try:
        outfile = fileio.open_output(args.output, args.output)
    except IOError as err:
        parser.error("Cannot open --output file: %s" % (err, ))

    query_prep_time = time.time()

    if args.jobs > 1:
        pool = multiprocessing.Pool(processes=args.jobs)
    else:
        pool = None

    try:
        try:
            result = transform_tool.transform(
                transform_record.fragments,
                property_names,
                min_radius=min_radius,
                min_pairs=min_pairs,
                min_variable_size=min_variable_size,
                max_variable_size=max_variable_size,
                min_constant_size=min_constant_size,
                substructure_pat=substructure_pat,
                pool=pool,
                explain=explain,
            )
        except analysis_algorithms.EvalError as err:
            sys.stderr.write("ERROR: %s\nExiting.\n" % (err, ))
            raise SystemExit(1)

        transform_time = time.time()

        with outfile:
            result.write_products(
                outfile,
                field_names=(
                    # "rule_environment_statistics_id",),
                    "from_smiles", "to_smiles", "radius", "fingerprint",
                    "rule_environment_id", "count", "avg", "std", "kurtosis",
                    "skewness", "min", "q1", "median", "q3", "max",
                    "paired_t", "p_value"),
                #column_aliases = {"from_smiles": "FROM"}, # use this to change the column name for a field
                include_empty=include_empty)

        output_time = time.time()
    finally:
        # The pool was previously never shut down, leaving its worker
        # processes to be reaped only at interpreter exit. It is closed after
        # the output has been written in case writing still relies on it.
        if pool is not None:
            pool.close()
            pool.join()

    if args.times:
        sys.stderr.write("Elapsed time (in seconds):\n")
        format_dt = get_time_delta_formatter(output_time - start_time)
        sys.stderr.write(" open database: %s\n" % format_dt(open_time - start_time))
        sys.stderr.write(" prepare query: %s\n" % format_dt(query_prep_time - open_time))
        sys.stderr.write(" transform: %s\n" % format_dt(transform_time - query_prep_time))
        sys.stderr.write(" write output: %s\n" % format_dt(output_time - transform_time))
        sys.stderr.write(" TOTAL = %s\n" % format_dt(output_time - start_time))
def setUp(self):
    """Parse the two six-membered aromatic rings used by the tests."""
    benzene_smiles = 'c1ccccc1'
    pyridine_smiles = 'c1ccncc1'
    self.mol1 = Chem.MolFromSmiles(benzene_smiles)
    self.mol2 = Chem.MolFromSmiles(pyridine_smiles)
def randomize_smi(smi):
    """Write ``smi`` out as a randomized SMILES and parse that string again.

    NOTE(review): ``smi`` is passed straight to ``Chem.MolToSmiles``, so it is
    presumably an RDKit molecule rather than a SMILES string, and the value
    returned is whatever ``Chem.MolFromSmiles`` gives back - confirm against
    the callers.
    """
    shuffled_smiles = Chem.MolToSmiles(smi, doRandom=True)
    return Chem.MolFromSmiles(shuffled_smiles)
def vectorize_rdkit(smiles, mol=None):
    """Return ``desc[key](mol)`` for every key in ``keys``, in order.

    The molecule is parsed from ``smiles`` only when ``mol`` is not supplied.
    """
    target = Chem.MolFromSmiles(smiles) if mol is None else mol
    values = []
    for descriptor_name in keys:
        values.append(desc[descriptor_name](target))
    return values
def test3Exceptions(self):
    """Asking for a conformer id that does not exist must raise ValueError."""
    mol = Chem.MolFromSmiles('c1ccccc1')
    addConf(mol)
    self.assertTrue(mol.GetNumConformers() == 1)
    with self.assertRaises(ValueError):
        mol.GetConformer(2)
def smiles2sentence(smiles):
    """Parse ``smiles`` and return its ``mol2alt_sentence`` with radius 1."""
    return mol2alt_sentence(Chem.MolFromSmiles(smiles), 1)
Input: mol is a molecule object Output: result is a dict form ################################################################# """ result = {} result.update(CalculateLabuteASA(mol)) result.update(CalculateTPSA(mol)) result.update(CalculateSLOGPVSA(mol, bins=None)) result.update(CalculateSMRVSA(mol, bins=None)) result.update(CalculatePEOEVSA(mol, bins=None)) result.update(CalculateEstateVSA(mol, bins=None)) result.update(CalculateVSAEstate(mol, bins=None)) return result ######################################################################### if __name__ == "__main__": smi5 = [ 'COCCCC', 'CCC(C)CC', 'CC(C)CCC', 'CC(C)C(C)C', 'CCOCCN', 'c1ccccc1N' ] smis = ['CCCC', 'CCCCC', 'CCCCCC', 'CC(N)C(=O)O', 'CC(N)C(=O)[O-].[Na+]'] for index, smi in enumerate(smis): m = Chem.MolFromSmiles(smi) print(index + 1) print(smi) print('\t', GetMOE(m)) print('\t', len(GetMOE(m)))
percentage_cutoff = 0.1  # compute intdiv on top 10% molecules in the samples

### Average pairwise fingerprint distance at different steps of CbAS
tanim_dist_all, tanim_dist_top = [], []
for step in np.arange(1, steps + 1):
    samples = pd.read_csv(f'../cbas/slurm/results/{name}/docking_results/{step}.csv')
    samples = samples.sort_values('score')
    # NOTE(review): N is derived from the row count before unparsable SMILES
    # are dropped - confirm that this is the intended "top" cutoff.
    N = int(samples.shape[0] * percentage_cutoff)

    # Parse every SMILES once (it used to be parsed a second time after
    # filtering) and keep only those RDKit accepts.
    parsed = [(s, Chem.MolFromSmiles(s)) for s in samples.smile]
    smiles = [s for s, m in parsed if m is not None]
    mols = [m for _, m in parsed if m is not None]

    fps = [AllChem.GetMorganFingerprintAsBitVect(m, 3, nBits=2048) for m in mols]
    fps = np.array(fps)
    D = pairwise_distances(fps, metric='jaccard')
    D_top = D[:N, :N]
    tanim_dist_all.append(np.mean(D))
    tanim_dist_top.append(np.mean(D_top))

# Use the configured number of steps for the x axis instead of the loop
# variable left over from the loop above (undefined when steps is 0).
x_steps = np.arange(1, steps + 1)
sns.lineplot(x=x_steps, y=tanim_dist_all, color='b', label='all samples')
sns.lineplot(x=x_steps, y=tanim_dist_top, color='r',
             label=f'top {percentage_cutoff*100:.0f}%')
plt.ylim(0, 1)
plt.ylabel('Average fingerprint pairwise distance')
# -*- encoding: utf-8 -*- from rdkit import Chem m = Chem.MolFromSmiles('Cc1ccccc1') # Smiles: # Mol:摩尔 (物质的量) help(m) print()
def process_hmdb(args):
    """Create HMDB_MAGMa.db and fill its `molecules` table from HMDB structures.

    The SD file `structures.sdf` is read from a zip archive that is either
    downloaded (when args.data_dir is None) or opened from
    args.data_dir + 'structures.zip'. Each record is rebuilt as a
    hydrogen-free mol block, filtered, and inserted; records sharing the
    first 14 characters of the InChIKey are merged into one row whose
    `reference` column lists all HMDB ids.

    The function exits if the table cannot be created (e.g. the database
    already exists). This is Python 2 code (print statements, urllib2,
    StringIO, unicode).

    args: needs `database_dir` and `data_dir`.
    """
    conn = sqlite3.connect(args.database_dir + '/HMDB_MAGMa.db')
    c = conn.cursor()
    # NOTE(review): the bare except also hides errors unrelated to an
    # already-existing table; the message below says as much.
    try:
        c.execute("""CREATE TABLE molecules (id TEXT PRIMARY KEY, mim INTEGER NOT NULL, charge INTEGER NOT NULL, natoms INTEGER NOT NULL, molblock TEXT, inchikey TEXT, smiles TEXT, molform TEXT, name TEXT, reference TEXT, logp INT)""")
        conn.commit()
        print("HMDB_MAGMa.db created")
    except:
        print("HMDB_MAGMa.db already exists (or error creating it)")
        exit()

    if args.data_dir == None:
        zf = urllib2.urlopen(
            'http://www.hmdb.ca/system/downloads/current/structures.zip')
    else:
        # NOTE(review): opened without 'rb' and never closed - confirm this is
        # fine on the target platform.
        zf = open(args.data_dir + 'structures.zip')
    # The whole archive is read into memory before the SD file is opened.
    sdfile = zipfile.ZipFile(StringIO.StringIO(
        zf.read())).open('structures.sdf')

    # inchikey (first 14 chars) -> (id of the stored row, reference string, ionized flag)
    memstore = {}
    line = '$$$$'
    # One iteration per SD record; an empty line from readline() means EOF.
    while line != "":
        record = []  # lines of the rebuilt (hydrogen-free) mol block
        amap = {}  # original 1-based atom number -> number after dropping H
        skip = False
        ionized = 0
        # read heading:
        for x in range(4):
            line = sdfile.readline()
            record.append(line)
        if line == "":
            continue
        # The fourth header line is the counts line: atoms, then bonds.
        natoms = int(record[-1][:3])
        nbonds = int(record[-1][3:6])
        bonds = 0  # bonds kept (between heavy atoms)
        y = 0  # heavy atoms kept
        for x in range(natoms):
            line = sdfile.readline()
            if line[31:33] == 'H ':  # skip hydrogens
                continue
            y += 1
            amap[x + 1] = y
            if line[31:33] not in [
                    'C ', 'N ', 'O ', 'P ', 'S ', 'F ', 'Cl', 'Br', 'I '
            ]:  # filter non-organic compounds
                skip = True
            elif line[50:51] != '0':
                # this flag has something to do with polymeric structures
                # and resulted in deviation between calculated and given inchikeys, skip
                skip = True
            elif line[38:39] == '4':
                # radical, resulted in deviation between calculated and given inchikeys
                skip = True
            # Keep only the first 42 columns of the atom line.
            record.append(line[:42] + '\n')
        for x in range(nbonds):
            line = sdfile.readline()
            a1 = int(line[:3])
            a2 = int(line[3:6])
            # skip bonds involving hydrogens
            if a1 in amap and a2 in amap:
                bonds += 1
                # use bonds with stereoflags set to zero
                record.append('%3i%3i%s 0\n' %
                              (amap[a1], amap[a2], line[6:9]))
        # Copy the rest of the mol block (properties) up to its end marker.
        # NOTE(review): 'M ISO' is 5 characters but is compared with a
        # 6-character slice, so as written it can never match; together with
        # 'M END\n' and the bond format above this suggests double spaces
        # were lost from these literals - verify against the MOL file format.
        while line != 'M END\n' and line != '':
            line = sdfile.readline()
            record.append(line)
            if line[:6] == 'M ISO':
                skip = True
                print 'Skipped isotopically labeled:', record[0][:-1]
        # Read the data items up to the record separator.
        while line != "$$$$\n" and line != "":
            line = sdfile.readline()
            if line == "> <HMDB_ID>\n":
                hmdb_id = str(sdfile.readline()[:-1])
            elif line == "> <GENERIC_NAME>\n":
                molname = str(sdfile.readline()[:-1])
            elif line == "> <INCHI_KEY>\n":
                inchi_key = sdfile.readline()[:-1]
        if line != "" and skip == False:
            # Rewrite the counts line for the hydrogen-free block.
            record[3] = repr(y).rjust(3) + repr(bonds).rjust(3) + record[3][6:]
            molblock = ''.join(record)
            mol = Chem.MolFromMolBlock(molblock)
            if mol == None or mol.GetNumAtoms() == 0:
                continue
            smiles = Chem.MolToSmiles(mol)
            # More than one fragment: not stored.
            if len(Chem.GetMolFrags(mol)) > 1:
                print 'complex:', hmdb_id, smiles
                continue
            conf = mol.GetConformer(0)  # NOTE(review): result is not used below
            # Stored form of the mol block: zlib-compressed, then base64.
            molblock = base64.encodestring(zlib.compress(''.join(record)))
            molform = Chem.rdMolDescriptors.CalcMolFormula(mol)
            mim = Chem.rdMolDescriptors.CalcExactMolWt(mol)
            # Charge is taken from the last character of the formula; a sign
            # anywhere else (e.g. a multiple charge) drops the record.
            charge = 0
            if '-' in molform:
                if molform[-1] == '-':
                    charge = -1
                else:
                    continue
            elif '+' in molform:
                if molform[-1] == '+':
                    charge = 1
                else:
                    continue
            if mim > 1200.0:
                print 'molecule to heavy:', hmdb_id, smiles
                continue
            natoms = mol.GetNumHeavyAtoms()
            logp = Chem.Crippen.MolLogP(mol)
            # Only the first 14 characters of the InChIKey are compared/stored.
            inchikey = Chem.AllChem.InchiToInchiKey(
                AllChem.MolToInchi(mol))[:14]
            if inchikey != inchi_key[:14]:
                print 'given inchikey does not match calculated inchikey, skipped:', hmdb_id, smiles
                continue
            # Flag structures whose SMILES contains one of these charged groups.
            ionized = 0
            for x in ['C(=O)[O-]', '[NH+]', '[NH2+]', '[NH3+]', '[NH4+]']:
                if smiles.find(x) >= 0:
                    ionized = 1
            if inchikey in memstore:
                dbid, reference, dbionized = memstore[inchikey]
                reference = reference + ',' + hmdb_id
                print 'Duplicates:', reference, molname
                if dbionized > ionized:  # prefer non-ionized CID's
                    # Replace the stored ionized entry with this one.
                    # NOTE(review): natoms and inchikey are not part of this UPDATE.
                    c.execute(
                        '''UPDATE molecules SET id=?, mim=?, charge=?, molblock=?, smiles=?, molform=?, name=?, reference=?, logp=? WHERE id == ?''',
                        (hmdb_id, int(mim * 1e6), charge, unicode(molblock),
                         unicode(smiles), unicode(molform),
                         unicode(molname, 'utf-8', 'xmlcharrefreplace'),
                         unicode(reference), int(logp * 10), dbid))
                    memstore[inchikey] = (hmdb_id, reference, ionized)
                else:
                    # Keep the stored entry; only extend its reference list.
                    c.execute('UPDATE molecules SET reference=? WHERE id == ?',
                              (unicode(reference), dbid))
                    memstore[inchikey] = (dbid, reference, dbionized)
            else:
                # mim is stored as an integer (value * 1e6), logp as value * 10.
                c.execute(
                    '''INSERT INTO molecules (id, mim, charge, natoms, molblock, inchikey, smiles,molform,name,reference,logp) VALUES (?,?,?,?,?,?,?,?,?,?,?)''',
                    (hmdb_id, int(mim * 1e6), charge, int(natoms),
                     unicode(molblock), unicode(inchikey), unicode(smiles),
                     unicode(molform),
                     unicode(molname, 'utf-8', 'xmlcharrefreplace'),
                     unicode(hmdb_id), int(logp * 10)))
                memstore[inchikey] = (hmdb_id, hmdb_id, ionized)
    conn.commit()

    print "Creating index ..."
    c.execute('PRAGMA temp_store = 2')
    c.execute(
        'CREATE INDEX idx_cover ON molecules (charge,mim,natoms,reference,molform,inchikey,smiles,name,molblock,logp)'
    )
    conn.commit()
def bond_topologies_from_geom(molecule, bond_lengths, matching_parameters):
  """Enumerate the bond topologies compatible with a molecule's geometry.

  The first entry of `molecule.bond_topologies` is used as the starting
  topology. Hydrogens are attached via `hydrogen_to_nearest_atom`, every
  heavy-atom pair closer than THRESHOLD is scored per bond type, and each
  combination from the resulting search space that yields a bond topology
  is recorded in the returned message.

  Args:
    molecule: object providing `bond_topologies`, `molecule_id`,
      `properties.errors.fate` and `optimized_geometry`.
    bond_lengths: object queried through
      `probability_of_bond_types(atom_i, atom_j, distance)`; it is also
      handed to `hydrogen_to_nearest_atom` (only when
      `matching_parameters.check_hydrogen_dists` is set) and to
      `geometry_score`.
    matching_parameters: options object. The attributes read here are
      `check_hydrogen_dists`, `consider_not_bonded`,
      `ring_atom_count_cannot_decrease` and `smiles_with_h`.

  Returns:
    A `dataset_pb2.TopologyMatches`. Its `bond_topology` field is left empty
    when the starting topology has a single atom, when the number of atom
    positions differs from the number of atoms, when hydrogen attachment
    returns None, or when no heavy-atom pair receives a score.
  """
  reference_bt = molecule.bond_topologies[0]

  matches = dataset_pb2.TopologyMatches()
  matches.starting_smiles = reference_bt.smiles
  matches.molecule_id = molecule.molecule_id
  matches.fate = molecule.properties.errors.fate

  atom_count = len(reference_bt.atoms)
  # Nothing to enumerate for a lone atom, or when the geometry does not
  # carry one position per atom.
  if (atom_count == 1 or
      len(molecule.optimized_geometry.atom_positions) != atom_count):
    return matches

  distances = utilities.distances(molecule.optimized_geometry)

  # Attach every hydrogen to a heavy atom first; the bond lengths are only
  # supplied to that step when hydrogen distances are to be checked.
  hydrogen_bond_lengths = (
      bond_lengths if matching_parameters.check_hydrogen_dists else None)
  minimal_bt = hydrogen_to_nearest_atom(reference_bt, distances,
                                        hydrogen_bond_lengths)
  if minimal_bt is None:
    return matches

  heavy_atoms = [
      index for index, atom_type in enumerate(reference_bt.atoms)
      if atom_type != dataset_pb2.BondTopology.AtomType.ATOM_H
  ]

  # Maps an (atom, atom) index pair to a length-4 array holding one score
  # per bond type.
  bonds_to_scores: Dict[Tuple[int, int], np.ndarray] = {}
  for pair in itertools.combinations(heavy_atoms, 2):
    first, second = pair
    separation = distances[first, second]
    if separation > THRESHOLD:
      continue
    try:
      bond_probabilities = bond_lengths.probability_of_bond_types(
          reference_bt.atoms[first], reference_bt.atoms[second], separation)
    except KeyError:
      # No data for this combination of atom types.
      continue
    if not bond_probabilities:
      continue
    # The bond type value doubles as the array index (BOND_SINGLE == 1 etc.).
    scores = np.zeros(4, np.float32)
    for bond_type, probability in bond_probabilities.items():
      scores[bond_type] = probability
    bonds_to_scores[pair] = scores

  if not bonds_to_scores:
    return matches

  starting_rdkit_mol = smu_utils_lib.bond_topology_to_rdkit_molecule(
      reference_bt)
  initial_ring_atom_count = utilities.ring_atom_count_mol(starting_rdkit_mol)

  topology_mol = topology_molecule.TopologyMolecule(
      minimal_bt, bonds_to_scores, matching_parameters)

  for state in itertools.product(*topology_mol.generate_search_state()):
    candidate = topology_mol.place_bonds(list(state), matching_parameters)
    if not candidate:
      continue

    candidate_mol = smu_utils_lib.bond_topology_to_rdkit_molecule(candidate)
    # With consider_not_bonded set, candidates that fall apart into more
    # than one fragment are discarded.
    if matching_parameters.consider_not_bonded and len(
        Chem.GetMolFrags(candidate_mol)) > 1:
      continue

    utilities.canonicalize_bond_topology(candidate)

    if matching_parameters.ring_atom_count_cannot_decrease:
      ring_atoms = utilities.ring_atom_count_mol(candidate_mol)
      if ring_atoms < initial_ring_atom_count:
        continue
      candidate.ring_atom_count = ring_atoms

    candidate.smiles = smu_utils_lib.compute_smiles_for_rdkit_molecule(
        candidate_mol, include_hs=matching_parameters.smiles_with_h)
    candidate.geometry_score = geometry_score(candidate, distances,
                                              bond_lengths)
    matches.bond_topology.append(candidate)

  if len(matches.bond_topology) > 1:
    matches.bond_topology.sort(key=lambda entry: entry.score, reverse=True)

  # Replace each raw score by the log of its share of the total score.
  score_sum = np.sum([entry.score for entry in matches.bond_topology])
  for entry in matches.bond_topology:
    entry.topology_score = np.log(entry.score / score_sum)
    entry.ClearField("score")

  return matches
def SMILESFromGraph(node_list, adjacency_matrix):
    """Return the SMILES string of the molecule described by a graph.

    The graph is converted to a molecule with ``MolFromGraphs`` and that
    molecule is serialised with ``Chem.MolToSmiles``.

    Args:
        node_list: node description forwarded as the first argument of
            ``MolFromGraphs``.
        adjacency_matrix: connectivity description forwarded as the second
            argument of ``MolFromGraphs``.

    Returns:
        Whatever ``Chem.MolToSmiles`` returns for the built molecule.
    """
    # Use the function's own parameters; the previous body read the
    # unrelated names `nodes` and `a` and ignored both arguments.
    return Chem.MolToSmiles(MolFromGraphs(node_list, adjacency_matrix))
# NOTE(review): this fragment arrived with its line breaks and indentation
# stripped; the nesting below is a best-effort reconstruction and should be
# confirmed against the original file. `hanni`, `i`, `z`, `counter` and
# `kcfco` are defined outside this fragment.

# Only entries of `hanni` whose numeric part exceeds this value are opened.
num = 16546
mol_list = []  # molecules parsed successfully
nCnumber = []  # the `i` value recorded alongside each molecule in mol_list
for p, k in enumerate(hanni[1:]):
    # Each entry is read as one prefix character followed by an integer.
    k2 = int(k[1:])
    if k2 > num:
        # The file suffix is the 1-based position within hanni[1:].
        k3 = str(p+1)
        with open("../../../database/knapsack-kcf/KNApSAck" + k3 + ".kcf")as f2:
            # Records in the file are separated by a line holding "///".
            Clist = f2.read().split("///\n")
        try:
            for C in Clist:
                # The second whitespace-separated token of a record is
                # compared with `i`; a record with fewer than two tokens
                # raises the IndexError handled below.
                if i == C.split()[1]:
                    molblock = kcfco.kcf_to_molblock(C)
                    # print("OK", i)
                    # print(molblock[1])
                    mol = Chem.MolFromMolBlock(molblock[1])
                    if mol is None:
                        # Parsing failed: report it, count it and stop
                        # scanning this file.
                        print("None", i, z, k3)
                        if "#+" in C or "#-" in C:
                            print("Charge in\n")
                        counter += 1
                        break
                    # rdDepictor.Compute2DCoords(mol)
                    mol_list.append(mol)
                    nCnumber.append(i)
                    if "#+" in C or "#-" in C:
                        print(i, z, k3, "Charge in\n")
                    break
        except IndexError:
            counter += 1
            print("DAME", i, z)
def _reward(self):
    """Return the penalized logP of the current state, or 0.0 if unparsable.

    ``self._state`` is parsed with ``Chem.MolFromSmiles``; when parsing
    yields ``None`` the reward is 0.0, otherwise it is the value of
    ``molecules.penalized_logp`` for the parsed molecule.
    """
    parsed = Chem.MolFromSmiles(self._state)
    return 0.0 if parsed is None else molecules.penalized_logp(parsed)
def process_DB(DB):
    """Extract IDs, SMILES and labels from the MUV csv and write them to disk.

    Reads ``data/MUV/muv.csv`` (the first row is skipped as a header). For
    every row, column 18 is taken as the SMILES string and column 17 as the
    molecule ID; rows whose SMILES is empty or cannot be parsed by
    ``Chem.MolFromSmiles`` are dropped.

    The collected data are written under ``data/<DB>/`` as four pickle files
    (``<DB>_dict_ID2SMILES.data``, ``<DB>_list_SMILES.data``,
    ``<DB>_list_y.data``, ``<DB>_list_ID.data``) and four matching ``.tsv``
    files, and summary counts are printed.

    Args:
        DB: ``'MUV'`` to build one multi-label entry per molecule from the
            first 17 columns (via ``get_multi_label``), or a name containing
            ``'MUV'`` followed by ``_<column index>`` (e.g. ``'MUV_3'``) to
            keep only the molecules with a non-empty value in that column
            and use that value, converted to ``int``, as the label. Any
            other value produces empty outputs.

    Raises:
        IndexError: if ``DB`` contains ``'MUV'``, is not exactly ``'MUV'``
            and has no ``_<column index>`` part.
        ValueError: if the ``_<column index>`` part or a selected label is
            not an integer.
    """
    list_ID, list_SMILES, list_y, dict_id2smile = [], [], [], {}

    multi_label = DB == 'MUV'
    single_label = not multi_label and 'MUV' in DB
    if multi_label:
        # Per-task counters threaded through get_multi_label.
        n_None = [0 for _ in range(17)]
    # Parse the target column once instead of on every row.
    label_col = int(DB.split('_')[1]) if single_label else None

    # newline='' is what the csv module expects from its input file.
    with open('data/MUV/muv.csv', newline='') as csv_file:
        reader = csv.reader(csv_file, delimiter=',')
        next(reader, None)  # skip the header row
        for row in reader:
            smile = row[18]
            if smile == '' or Chem.MolFromSmiles(smile) is None:
                continue
            mol_id = row[17]
            if multi_label:
                list_ID.append(mol_id)
                list_SMILES.append(smile)
                y_temp, n_None = get_multi_label(row[:17], n_None)
                list_y.append(y_temp)
                dict_id2smile[mol_id] = smile
            elif single_label and row[label_col] != '':
                list_ID.append(mol_id)
                list_SMILES.append(smile)
                dict_id2smile[mol_id] = smile
                list_y.append(int(row[label_col]))

    prefix = 'data/' + DB + '/' + DB

    # Binary dumps; each handle is closed as soon as its dump finishes.
    pickled = (
        ('_dict_ID2SMILES.data', dict_id2smile),
        ('_list_SMILES.data', list_SMILES),
        ('_list_y.data', list_y),
        ('_list_ID.data', list_ID),
    )
    for suffix, payload in pickled:
        with open(prefix + suffix, 'wb') as out:
            pickle.dump(payload, out)

    # Human-readable copies of the same data.
    with open(prefix + '_dict_ID2SMILES.tsv', 'w') as out:
        for mol_id, smile in dict_id2smile.items():
            out.write(mol_id + '\t' + smile + '\n')

    with open(prefix + '_list_SMILES.tsv', 'w') as out:
        for smile in list_SMILES:
            out.write(smile + '\n')

    with open(prefix + '_list_y.tsv', 'w') as out:
        for label in list_y:
            if isinstance(label, list):
                # Multi-label entry: tab after every value, then newline.
                for value in label:
                    out.write(str(value) + '\t')
                out.write('\n')
            else:
                out.write(str(label) + '\n')

    with open(prefix + '_list_ID.tsv', 'w') as out:
        for mol_id in list_ID:
            out.write(mol_id + '\n')

    print(len(list_SMILES))
    if multi_label:
        print([len(list_SMILES) - n_None[i] for i in range(len(n_None))])
    elif single_label:
        print(collections.Counter(list_y))
import pandas as pd
from rdkit import Chem

# Load the tab-separated table that is cleaned in place below.
df = pd.read_csv("drug_class_test.txt", sep="\t")

# Print the parse result for every SMILES so that failures (None) stand out.
for index, row in df.iterrows():
    print(index, Chem.MolFromSmiles(row['Canonical_Smiles']))

# The entry with 'index' 2708 does not parse (MolFromSmiles gives None), so
# it is removed. The offending record, as noted by the original author:
# 2708 antiinfective/1169 antiinfective F[As-](F)(F)(F)(F)F.c1ccc([I+]c2ccccc2)cc1 6
keep_rows = df['index'] != 2708
df = df[keep_rows]

# Overwrite the input file with the filtered table.
df.to_csv("drug_class_test.txt", sep="\t")