def get_smiles_fp(smiles, ids, filename, num):
    """Fingerprint a batch of SMILES strings and write the results to disk.

    Each SMILES that ``Chem.MolFromSmiles`` can parse is turned into an RDK
    fingerprint of ``vec_dim`` bits and serialised as FPS hex text.  SMILES
    that fail to parse are skipped, along with the entry of ``ids`` at the
    same position.

    Three files are written; each is named ``filename`` followed by ``num``
    zero-padded to three digits:

    * ``OUT/OUT_NPY/<name>.npy``    - NumPy array of the hex fingerprints
    * ``OUT/OUT_SMILES/<name>.smi`` - the SMILES that were kept (``save_file``)
    * ``OUT/OUT_IDS/<name>.txt``    - the ids that were kept (``save_file``)

    Args:
        smiles: iterable of SMILES strings.
        ids: sequence indexed in step with ``smiles``.
        filename: prefix of the output file names.
        num: integer suffix, formatted with ``%03d``.
    """
    kept_fps = []
    kept_ids = []
    kept_smiles = []

    for position, smile in enumerate(smiles):
        mol = Chem.MolFromSmiles(smile)
        if mol is None:
            # Unparseable SMILES: drop it and its id.
            continue
        # RDK fingerprint is used here; the Morgan alternative would be
        # AllChem.GetMorganFingerprintAsBitVect(mol, 2, vec_dim).
        fingerprint = Chem.RDKFingerprint(mol, fpSize=vec_dim)
        kept_fps.append(DataStructs.BitVectToFPSText(fingerprint))
        kept_ids.append(ids[position])
        kept_smiles.append(smile)

    stem = filename + "%03d" % num
    kept_fps = np.array(kept_fps)
    np.save(OUT + '/' + OUT_NPY + '/' + stem + '.npy', kept_fps)
    save_file(kept_smiles, OUT + '/' + OUT_SMILES + '/' + stem + '.smi')
    save_file(kept_ids, OUT + '/' + OUT_IDS + '/' + stem + '.txt')

    # Release the batch explicitly before forcing a collection.
    del kept_fps, kept_smiles, kept_ids
    gc.collect()
def smiles_to_vec(smiles):
    """Convert a SMILES string into the raw bytes of its Morgan fingerprint.

    The SMILES is parsed with ``Chem.MolFromSmiles``, fingerprinted with
    radius 2 and ``VECTOR_DIMENSION`` bits, rendered as FPS hex text and
    finally decoded from hex into a ``bytes`` object.

    Args:
        smiles: SMILES string of the molecule.

    Returns:
        The fingerprint as ``bytes``.
    """
    molecule = Chem.MolFromSmiles(smiles)
    bit_vect = AllChem.GetMorganFingerprintAsBitVect(molecule, 2, VECTOR_DIMENSION)
    return bytes.fromhex(DataStructs.BitVectToFPSText(bit_vect))
def search_ids_smi_list(table_name, topk, ids, smiles):
    """Search a Milvus collection for the nearest neighbours of one molecule.

    When ``ids`` is truthy the query SMILES is looked up with
    ``get_smi_in_pg(table_name, ids)``; otherwise the ``smiles`` argument is
    used as given.  The molecule is converted to an RDK fingerprint of
    ``VECTOR_DIMENSION`` bits, sent to ``MILVUS.search`` as a single-record
    query, and the results are passed to ``save_re_to_file``.  The elapsed
    time of the search and of the save is printed.

    Args:
        table_name: collection to search; also passed to the lookup and
            save helpers.
        topk: number of hits requested from Milvus.
        ids: identifier used to look up the SMILES, or a falsy value.
        smiles: SMILES string used when ``ids`` is falsy.
    """
    if ids:
        smiles = get_smi_in_pg(table_name, ids)

    mol = Chem.MolFromSmiles(smiles)
    fp = Chem.RDKFingerprint(mol, fpSize=VECTOR_DIMENSION)
    # Milvus takes the fingerprint as raw bytes; FPS text is its hex form.
    query_list = [bytes.fromhex(DataStructs.BitVectToFPSText(fp))]
    print("table name:", table_name, "query list:", len(query_list), "topk:", topk)

    # perf_counter is monotonic, so the interval is immune to clock changes.
    search_started = time.perf_counter()
    status, results = MILVUS.search(
        collection_name=table_name,
        query_records=query_list,
        top_k=topk,
        params={},
    )
    print("time_search = ", time.perf_counter() - search_started)
    print(status, results)

    save_started = time.perf_counter()
    save_re_to_file(table_name, results)
    print("time_save = ", time.perf_counter() - save_started)
def test7FPS(self):
    """Check conversion of an ExplicitBitVect to and from FPS hex text."""
    fps_text = "03008280"

    bv = DataStructs.ExplicitBitVect(32)
    for bit in (0, 1, 17, 23, 31):
        bv.SetBit(bit)
    self.assertEqual(DataStructs.BitVectToFPSText(bv), fps_text)

    # Parsing the text back must give an equal bit vector.
    restored = DataStructs.CreateFromFPSText(fps_text)
    self.assertEqual(bv, restored)

    # The same text with one extra character must be rejected.
    with self.assertRaises(ValueError):
        DataStructs.CreateFromFPSText("030082801")

    # Empty text produces a bit vector with no bits.
    empty = DataStructs.CreateFromFPSText("")
    self.assertEqual(empty.GetNumBits(), 0)
def _getFPSStream(f, mols, type='morgan', radius=2, n_bits=2048):
    """Write fingerprints of ``mols`` to ``f`` in FPS text format.

    A three-line header (``#FPS1``, ``#num_bits``, ``#software``) is written
    first, then one ``<hex fingerprint>\\t<identifier>`` line per truthy
    molecule.  Falsy entries of ``mols`` are skipped.

    The identifier is chosen in this order:

    1. the molecule's ``chembl_id`` property, if it has one;
    2. its InChIKey, if InChI support is available and it can be computed;
    3. the molecule's position in ``mols``.

    Args:
        f: writable text stream.
        mols: iterable of RDKit molecules (falsy entries are ignored).
        type: fingerprint kind - ``'morgan'``, ``'pair'`` or ``'maccs'``.
        radius: Morgan radius; only used when ``type == 'morgan'``.
        n_bits: value written to the header and the Morgan bit-vector size.

    Raises:
        ValueError: if ``type`` is not one of the supported kinds and a
            molecule has to be fingerprinted.
    """
    f.write("#FPS1\n#num_bits=%s\n#software=RDKit/%s\n" % (n_bits, rdBase.rdkitVersion))
    for i, mol in enumerate(mols):
        if not mol:
            continue

        idx = i
        if mol.HasProp('chembl_id'):
            idx = mol.GetProp('chembl_id')
        elif Chem.INCHI_AVAILABLE:
            try:
                Chem.SanitizeMol(mol)
                idx = Chem.InchiToInchiKey(Chem.MolToInchi(mol))
            except Exception:
                # Best effort: keep the positional index when the InChIKey
                # cannot be computed, but let KeyboardInterrupt/SystemExit
                # propagate.
                pass

        if type == 'morgan':
            fp = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
        elif type == 'pair':
            fp = Pairs.GetAtomPairFingerprintAsBitVect(mol)
        elif type == 'maccs':
            fp = MACCSkeys.GenMACCSKeys(mol)
        else:
            # Previously this fell through and failed on an unbound `fp`.
            raise ValueError("unsupported fingerprint type: %r" % (type,))

        f.write("%s\t%s\n" % (DataStructs.BitVectToFPSText(fp), idx))