def fp_maccs_std_mp(mol, mol_can, i):
    """Compute MACCS keys for a molecule and for its canonical counterpart.

    Returns the tuple ``(i, fp_mol, fp_can)``. When ``mol`` and ``mol_can``
    are the very same object the fingerprint is computed once and reused
    for both slots.
    """
    fp_mol = MACCSkeys.GenMACCSKeys(mol)
    # Identity (not equality) decides whether the second computation can be
    # skipped.
    fp_can = fp_mol if mol is mol_can else MACCSkeys.GenMACCSKeys(mol_can)
    return (i, fp_mol, fp_can)
def fp_maccs_std(mols):
    """Attach MACCS fingerprints to every entry of ``mols`` in place.

    For each key of ``mols`` the entry's ``"mol"`` and ``"mol_can"`` values
    are fingerprinted and stored under ``"fp"`` and ``"fp_can"``. If both
    values are the same object, the fingerprint is computed only once.
    Nothing is returned.
    """
    for key in mols:
        entry = mols[key]
        mol = entry["mol"]
        fp_mol = MACCSkeys.GenMACCSKeys(mol)
        entry["fp"] = fp_mol
        mol_can = entry["mol_can"]
        # Same object -> share the fingerprint instead of recomputing it.
        entry["fp_can"] = (
            fp_mol if mol is mol_can else MACCSkeys.GenMACCSKeys(mol_can)
        )
def label_switching_encoder(key_smiles, bits, df, nmol_df):
    '''
    Encode a sequence of ASCII codes as a list of "message" molecules.

    The key molecule seeds the random generator; for every code a seeded
    permutation of the key-to-reference similarities defines which cluster
    label of ``df`` stands for that code, and one molecule is drawn at random
    from that cluster.

    :param key_smiles: SMILES of the key molecule
    :param bits: a list of ASCII codes (anything accepted by ``int``); codes
        of 128 or more are replaced by 42 ('*')
    :param df: df where to pick the message molecules; read through its
        ``clusters`` and ``smiles`` columns
    :param nmol_df: df whose ``SMILES`` column provides the pool from which
        the 128 reference molecules are drawn
    :return: list with one SMILES per entry of ``bits``
    '''
    # molecular key
    key_mol = Chem.MolFromSmiles(key_smiles)
    key_fp = MACCSkeys.GenMACCSKeys(key_mol)

    # build root_seed and rotor_seed based on MW and number of atoms of key_mol
    root_seed = int(Chem.Descriptors.ExactMolWt(key_mol))
    rotor_seed = key_mol.GetNumAtoms()

    # Pick the 128 reference molecules
    np.random.seed(root_seed)
    ref_smiles = np.random.choice(nmol_df.SMILES, size=128, replace=False)

    # Similarity of every reference molecule to the key molecule
    dist = [
        DataStructs.FingerprintSimilarity(
            key_fp, MACCSkeys.GenMACCSKeys(Chem.MolFromSmiles(smi)))
        for smi in ref_smiles
    ]

    # labels 0..127
    orig_label = list(range(128))

    message_mol_list = []
    for index, bit in enumerate(bits):
        np.random.seed(root_seed + index * rotor_seed)
        # Seeded shuffle of the similarities for this position.
        step_dist = np.random.choice(dist, size=len(dist), replace=False)

        # dict_rank[pos] is the rank of step_dist[pos] in ascending order.
        dict_rank = [0] * len(step_dist)
        ordered = sorted(range(len(step_dist)), key=lambda y: step_dist[y])
        for rank, pos in enumerate(ordered):
            dict_rank[pos] = rank
        swaper_dict = dict(zip(dict_rank, orig_label))

        # Codes outside 0..127 cannot be represented: use '*' (42) instead.
        code = int(bit)
        if code >= 128:
            code = 42

        candidates = df[df.clusters == swaper_dict.get(code)]['smiles']
        # random.choice indexes its argument with a position. On a filtered
        # Series that position is interpreted as an index *label*, which is
        # usually missing (KeyError); drawing from a plain list avoids that.
        message_mol_list.append(random.choice(list(candidates)))
    return message_mol_list
def getXNN(trainSmilesList, train, predEx, smilesAttrName, nameAttr, X, simType):
    """Summarise the similarity of ``predEx`` to its ``X`` closest training examples.

    Parameters:
        trainSmilesList: items handed directly to the RDKit fingerprint
            function, one per example of ``train`` and in the same order
            (NOTE(review): despite the name these are presumably Mol
            objects, not SMILES strings - confirm against the caller).
        train: iterable of training examples; ``ex[nameAttr].value`` is used
            as the key of each example.
        predEx: example to compare; ``predEx[smilesAttrName].value`` is
            parsed as SMILES.
        smilesAttrName: attribute holding the SMILES of ``predEx``.
        nameAttr: attribute holding the name of a training example.
        X: number of most similar training examples to keep.
        simType: "Topological", "Morgan" (Dice similarity on radius-2 Morgan
            fingerprints) or "MACCS".

    Returns:
        Tuple ``(medSim, stdSim, minSim, maxSim, entropy, entropyClosest)``,
        every value rounded to 3 decimals. The two "entropy" values are what
        ``getRespVar`` returns for the top ``X`` and the top ``X // 2``
        similarities respectively.

    Raises:
        ValueError: if ``simType`` is not one of the supported names.
            Previously this case only printed a message and then failed
            later with a NameError.
    """
    if simType == "Topological":
        make_fp = FingerprintMols.FingerprintMol
        similarity = DataStructs.FingerprintSimilarity
    elif simType == "Morgan":
        def make_fp(mol):
            return AllChem.GetMorganFingerprint(mol, 2)
        similarity = DataStructs.DiceSimilarity
    elif simType == "MACCS":
        make_fp = MACCSkeys.GenMACCSKeys
        similarity = DataStructs.FingerprintSimilarity
    else:
        raise ValueError("This type of sim is not implemented " + str(simType))

    fpsTrain = [make_fp(x) for x in trainSmilesList]
    fp = make_fp(Chem.MolFromSmiles(predEx[smilesAttrName].value))

    simDict = {}
    simList = []
    for idx, ex in enumerate(train):
        sim = similarity(fpsTrain[idx], fp)
        simDict[ex[nameAttr].value] = sim
        simList.append(sim)

    # Keep only the X highest similarities.
    simList.sort(reverse=True)
    simList = simList[0:X]

    medSim = round(numpy.median(simList), 3)
    stdSim = round(numpy.std(simList), 3)
    minSim = round(min(simList), 3)
    maxSim = round(max(simList), 3)
    entropy = round(getRespVar(simList, simDict, train, nameAttr), 3)
    # Floor division keeps the slice bound an integer on Python 2 and 3.
    entropyClosest = round(
        getRespVar(simList[0:X // 2], simDict, train, nameAttr), 3)
    return medSim, stdSim, minSim, maxSim, entropy, entropyClosest
def compute_pca(self):
    """Project the molecules of ``self.Database2`` onto two principal components.

    MACCS keys are computed for every entry of the ``SMILES`` column, the
    full Tanimoto similarity matrix is assembled from its lower triangle and
    a whitened 2-component PCA is fitted on that matrix.

    Side effects:
        ``self.pca_result`` - the result table indexed by ``TIPO``;
        ``self.a`` / ``self.b`` - explained variance of PC1 / PC2 in
        percent, rounded to 2 decimals.

    Returns:
        DataFrame with columns PC1, PC2, LIBRARY, TIPO, SMILES and NAME
        (default integer index, i.e. not indexed by ``TIPO``).
    """
    Database = self.Database2
    mols = [Chem.MolFromSmiles(x) for x in list(Database.SMILES)]
    fps = [MACCSkeys.GenMACCSKeys(x) for x in mols]

    # Only the lower triangle is computed; mirror it to get the full matrix.
    tanimoto_sim_mat_lower_triangle = GetTanimotoSimMat(fps)
    n_mol = len(fps)
    similarity_matrix = np.ones([n_mol, n_mol])
    i_lower = np.tril_indices(n=n_mol, m=n_mol, k=-1)
    i_upper = np.triu_indices(n=n_mol, m=n_mol, k=1)
    similarity_matrix[i_lower] = tanimoto_sim_mat_lower_triangle
    similarity_matrix[i_upper] = similarity_matrix.T[i_upper]

    sklearn_pca = sklearn.decomposition.PCA(
        n_components=2, svd_solver="full", whiten=True)
    sklearn_pca.fit(similarity_matrix)

    pca_result = pd.DataFrame(
        sklearn_pca.transform(similarity_matrix), columns=['PC1', 'PC2'])
    # Assign by position: a Series would be aligned on index labels and
    # produce NaN whenever Database does not carry the default 0..n-1 index.
    pca_result["LIBRARY"] = list(Database.LIBRARY)
    pca_result["TIPO"] = list(Database.LIBRARY)
    pca_result["SMILES"] = list(Database.SMILES)
    pca_result["NAME"] = list(Database.NAME)
    self.pca_result = pca_result.set_index('TIPO')

    variance = list(sklearn_pca.explained_variance_ratio_)
    self.a = round(variance[0] * 100, 2)
    self.b = round(variance[1] * 100, 2)
    return pca_result
def preprocess_dataset(path, data_config, fingerprint, morgan_nbits=None):
    """Calculate representation for each smiles in the dataset.

    Parameters:
        path: location handed to ``load_data_from_df`` (as a one-item list).
        data_config: mapping; ``data_config[csv_section]`` supplies the
            keyword arguments for ``load_data_from_df``
            (``csv_section`` is presumably a module-level constant - confirm).
        fingerprint: ``'morgan'``, ``'maccs'`` or ``'krfp'``.
        morgan_nbits: bit-vector length, required for ``'morgan'``.

    Returns:
        ``(x, y, calculated_smiles)``: array of fingerprints, array of
        labels and the list of SMILES that could be processed. Compounds
        whose fingerprint calculation raises are reported on stdout and
        skipped.

    Raises:
        ValueError: for an unknown ``fingerprint``. Previously this fell
            through to a NameError that was swallowed per compound, so an
            empty dataset was returned silently.
        AssertionError: if ``'morgan'`` is requested without ``morgan_nbits``.
    """
    if fingerprint not in ('morgan', 'maccs', 'krfp'):
        raise ValueError('Unknown fingerprint: {!r}'.format(fingerprint))
    if fingerprint == 'morgan':
        assert morgan_nbits is not None, 'Parameter `morgan_nbits` must be set when using Morgan fingerprint.'

    smiles, labels = load_data_from_df([path,], **data_config[csv_section])
    x = []
    y = []
    calculated_smiles = []

    # we go smiles by smiles because some compounds make rdkit throw errors
    for this_smiles, this_label in zip(smiles, labels):
        try:
            if fingerprint == 'krfp':
                mol = Chem.MolFromSmiles(this_smiles)
                fp = krfp(this_smiles)
            else:
                mol = Chem.MolFromSmiles(this_smiles)
                if fingerprint == 'morgan':
                    fp = AllChem.GetMorganFingerprintAsBitVect(mol, 6, nBits=morgan_nbits)
                    fp = [int(i) for i in fp.ToBitString()]
                else:  # 'maccs'
                    fp = MACCSkeys.GenMACCSKeys(mol)
                    fp = np.array(fp)[1:]  # index 0 is unset
        except Exception as e:
            # Deliberate best effort: report the compound and move on.
            print('exp', e)
        else:
            x.append(fp)
            y.append(this_label)
            calculated_smiles.append(this_smiles)
    return np.array(x), np.array(y), calculated_smiles
def __init__(self):
    """Register the fingerprint generators and create empty accumulators.

    ``binaryfp_names``/``binaryfp`` and ``countfp_names``/``countfp`` are
    parallel lists: the i-th name labels the i-th callable, and every
    callable takes a molecule. The count generators pass ``self.bit_info``
    as ``bitInfo``; the attribute is looked up when a generator is called,
    not when it is created.
    """
    binary_generators = [
        ("MACCSkeys", lambda mol: MACCSkeys.GenMACCSKeys(mol)),
        ("Avalon", lambda mol: pyAvalonTools.GetAvalonFP(mol)),
        ("Morgan2(1024bits)",
         lambda mol: AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024)),
        ("Morgan2F(1024bits)",
         lambda mol: AllChem.GetMorganFingerprintAsBitVect(
             mol, 2, nBits=1024, useFeatures=True)),
        ("Morgan4(2048bits)",
         lambda mol: AllChem.GetMorganFingerprintAsBitVect(mol, 4, nBits=2048)),
        ("Morgan4F(2048bits)",
         lambda mol: AllChem.GetMorganFingerprintAsBitVect(
             mol, 4, nBits=2048, useFeatures=True)),
        # Disabled generators:
        # "AtomPair": Pairs.GetAtomPairFingerprintAsBitVect(mol)  -> crashes
        # "Topological": FingerprintMols.FingerprintMol(mol)  -> produces NaN
        # "TopologicalTortion":
        #     Torsions.GetTopologicalTorsionFingerprintAsIntVect(mol)
        #     -> has no ToBitString
    ]
    self.binaryfp_names = [label for label, _ in binary_generators]
    self.binaryfp = [generator for _, generator in binary_generators]

    def morgan_counts(radius, use_features):
        # self.bit_info is resolved on every call of the returned function.
        return lambda mol: AllChem.GetMorganFingerprint(
            mol, radius=radius, bitInfo=self.bit_info,
            useFeatures=use_features)

    self.countfp_names = [
        "ECFP2", "FCFP2",
        "ECFP4", "FCFP4",
        "ECFP6", "FCFP6",
    ]
    # Same order as the names: radius 1, 2, 3, each without/with features.
    self.countfp = [
        morgan_counts(radius, use_features)
        for radius in (1, 2, 3)
        for use_features in (False, True)
    ]

    self.bit_info = {}
    self.bit_infos = {}
    self.vectors = []
    self.all_bit_info_keys = {}
    self.mols = []
def numpy_atompair(mols):
    """ Calculate fingerprints for a list of molecules and convert them with ``_rdk2numpy``

    NOTE(review): despite the function name, the fingerprints generated here
    are MACCS keys (``MACCSkeys.GenMACCSKeys``), not atom pair fingerprints.

    :param mols: {list} list of molecules (RDKit mols); falsy entries such as
        ``None`` are skipped, so the output can have fewer rows than ``mols``
    :return: whatever ``_rdk2numpy`` returns for the fingerprint list
        (presumably a numpy array with one row per molecule - confirm)
    """
    fingerprints = []
    for mol in mols:
        if not mol:
            continue
        fingerprints.append(MACCSkeys.GenMACCSKeys(mol))
    return _rdk2numpy(fingerprints)
def create_fingerprints(chemical_compounds):
    """
    Create a learning matrix `X` with Morgan and MACCS fingerprints from the
    `chemical_compounds` molecular structures.

    Parameters
    ----------
    chemical_compounds: array [n_chem, 1] or list [n_chem,]
        chemical_compounds[i] is a SMILES string describing the ith chemical
        compound. The input is flattened, so lists, 1-D arrays and column
        vectors are all accepted.

    Return
    ------
    X: array [n_chem, 679]
        For each compound the 512-bit feature-based Morgan fingerprint
        (radius 3) followed by the 167-bit MACCS keys.

    Raises
    ------
    ValueError
        If a SMILES string cannot be parsed.
    """
    n_morgan_bits = 512
    n_maccs_bits = 167

    # Flatten so that a plain list (no .shape) and an [n_chem, 1] array
    # (whose rows are 1-element arrays, not strings) both work.
    smiles = np.asarray(chemical_compounds).ravel()
    n_chem = smiles.shape[0]

    X = np.zeros((n_chem, n_morgan_bits))
    X2 = np.zeros((n_chem, n_maccs_bits))
    for i, smi in enumerate(smiles):
        m = Chem.MolFromSmiles(str(smi))
        if m is None:
            raise ValueError(
                "Invalid SMILES at position {}: {!r}".format(i, str(smi)))
        X[i, :] = AllChem.GetMorganFingerprintAsBitVect(
            m, 3, nBits=n_morgan_bits, useFeatures=True)
        X2[i, :] = MACCSkeys.GenMACCSKeys(m)
    return np.concatenate((X, X2), axis=1)
def _maccsClustering(rdkit_mols):
    """
    Build the matrix of ``TanimotoDistances`` between the MACCS keys of all
    molecules.

    Parameters
    ----------
    rdkit_mols: list
        The list of rdkit.Chem.rdchem.Mol objects

    Returns
    -------
    tanimotomatrix: np.array
        One row per molecule, as returned by ``TanimotoDistances(fp, fps)``
        for that molecule's fingerprint against all fingerprints.
    """
    from rdkit.Chem import MACCSkeys

    # compute the MACCS keys (with a progress bar)
    fps = [MACCSkeys.GenMACCSKeys(mol) for mol in tqdm(rdkit_mols)]

    executor = ParallelExecutor(n_jobs=-1)  # _config['ncpus'])
    run = executor(total=len(fps), desc='MACCS Distance')
    rows = run(delayed(TanimotoDistances)(fp, fps) for fp in fps)
    return np.array(rows)
def get_maccfps(self):
    """Return the MACCS keys for every 'Canonical Smiles' entry of ``self.df``.

    The result is a list with one fingerprint per row, in row order.
    """
    frame = self.df
    # NOTE(review): the result of dropna() is discarded, so no rows are
    # removed here; kept as-is to preserve behaviour. Confirm whether rows
    # without a 'Standard Value' were meant to be dropped.
    frame['Standard Value'].dropna(axis=0)

    fingerprints = []
    for smiles in frame['Canonical Smiles']:
        molecule = Chem.MolFromSmiles(smiles)
        fingerprints.append(MACCSkeys.GenMACCSKeys(molecule))
    return fingerprints
def fingerprint(smiles_or_mol, fp_type='maccs', dtype=None, morgan__r=2,
                morgan__n=1024, *args, **kwargs):
    """
    Generates a fingerprint for a SMILES string or molecule.

    Returns None when ``get_mol`` cannot produce a molecule, otherwise a
    numpy array of fingerprint bits.

    Parameters:
        smiles_or_mol: input passed to ``get_mol`` (together with
            ``*args``/``**kwargs``)
        fp_type: type of fingerprint, case-insensitive: [MACCS|morgan]
        dtype: if not None, specifies the dtype of the returned array
            (``uint8`` otherwise)
        morgan__r: radius argument passed to ``Morgan``
        morgan__n: ``nBits`` argument passed to ``Morgan``

    Raises:
        ValueError: for an unknown ``fp_type`` (only when a molecule was
            obtained).
    """
    fp_type = fp_type.lower()
    molecule = get_mol(smiles_or_mol, *args, **kwargs)
    if molecule is None:
        return None

    if fp_type == 'maccs':
        on_bits = np.array(MACCSkeys.GenMACCSKeys(molecule).GetOnBits())
        bits = np.zeros(166, dtype='uint8')
        if len(on_bits) != 0:
            # We drop 0-th key that is always zero
            bits[on_bits - 1] = 1
    elif fp_type == 'morgan':
        bits = np.asarray(
            Morgan(molecule, morgan__r, nBits=morgan__n), dtype='uint8')
    else:
        raise ValueError("Unknown fingerprint type {}".format(fp_type))

    return bits if dtype is None else bits.astype(dtype)
def make_fingerprints(data, length=512, verbose=False):
    """Build the list of fingerprint wrappers and apply each one to ``data``.

    Every wrapper is created as ``fingerprint(function, name)`` and then
    ``apply_fp(data)`` is called on it. ``length`` is the bit length used by
    the Morgan, Avalon and RDKit generators. With ``verbose`` the name of
    each fingerprint is printed before it is applied.

    Returns the list of wrappers, in the order they were applied.
    """
    generators = [
        (Chem.rdMolDescriptors.GetHashedTopologicalTorsionFingerprintAsBitVect,
         "Torsion "),
        (lambda x: GetMorganFingerprintAsBitVect(x, 2, nBits=length), "Morgan"),
        (FingerprintMol, "Estate (1995)"),
        (lambda x: GetAvalonFP(x, nBits=length), "Avalon bit based (2006)"),
        (lambda x: np.append(GetAvalonFP(x, nBits=length), Descriptors.MolWt(x)),
         "Avalon+mol. weight"),
        (lambda x: GetErGFingerprint(x), "ErG fingerprint (2006)"),
        (lambda x: RDKFingerprint(x, fpSize=length), "RDKit fingerprint"),
        (lambda x: MACCSkeys.GenMACCSKeys(x), "MACCS fingerprint"),
        (lambda x: get_fingerprint(x, fp_type='pubchem'), "PubChem"),
        # Disabled: (lambda x: get_fingerprint(x, fp_type='FP4'), "FP4")
        (lambda x: Generate.Gen2DFingerprint(
            x, Gobbi_Pharm2D.factory, dMat=Chem.Get3DDistanceMatrix(x)),
         "3D pharmacophore"),
    ]
    fp_list = [fingerprint(function, label) for function, label in generators]

    for fp in fp_list:
        if verbose:
            print("doing", fp.name)
        fp.apply_fp(data)
    return fp_list
def CalculateMACCSFingerprint(mol):
    """
    #################################################################
    Calculate MACCS keys (166 bits).

    Usage:
        result=CalculateMACCSFingerprint(mol)

    Input: mol is a molecule object.

    Output: result is a tuple of three items. The first is the number of
    fingerprint bits (166). The second is a dict mapping the position of
    every set bit to 1. The third is the bit vector itself, usable for
    similarity calculations.
    #################################################################
    """
    bv = MACCSkeys.GenMACCSKeys(mol)
    res = {position: 1 for position in tuple(bv.GetOnBits())}
    return 166, res, bv
def computeFP(self, typeFP):
    """Compute the requested fingerprint(s) for ``self.smiclean``.

    ``typeFP`` is one of "Mol", "MACCS", "pairs", "Torsion", "Morgan" or
    "All" (every fingerprint). The molecule parsed from ``self.smiclean`` is
    stored in ``self.mol`` and the fingerprints in the dict ``self.FP``,
    keyed by fingerprint name.

    Returns 0 on success. If the instance has no ``smiclean`` attribute a
    line is appended to ``self.log`` and 1 is returned.
    """
    from rdkit.Chem.Fingerprints import FingerprintMols
    from rdkit.Chem import MACCSkeys
    from rdkit.Chem.AtomPairs import Pairs, Torsions
    from rdkit.Chem import AllChem

    if "smiclean" not in self.__dict__:
        self.log = self.log + "No smiles prepared\n"
        return 1

    self.mol = Chem.MolFromSmiles(self.smiclean)

    # (name, generator) pairs, in the order the fingerprints are stored.
    generators = (
        ("Mol", FingerprintMols.FingerprintMol),
        ("MACCS", MACCSkeys.GenMACCSKeys),
        ("pairs", Pairs.GetAtomPairFingerprint),
        ("Torsion", Torsions.GetTopologicalTorsionFingerprint),
        ("Morgan", lambda mol: AllChem.GetMorganFingerprint(mol, 2)),
    )
    dFP = {}
    for label, generate in generators:
        if typeFP == label or typeFP == "All":
            dFP[label] = generate(self.mol)

    self.FP = dFP
    return 0
def calc_maccs(molecules, name_col='CASRN'):
    """
    Takes in a list of rdkit molecules and returns their MACCS fingerprints

    :param name_col: Name of the molecule property used to index the resulting DataFrame; molecules without
    that property get an empty string as index label
    :param molecules: List of rdkit molecules with no None values
    :return: pandas DataFrame with one row per molecule and one integer column per MACCS bit
    """
    # Checks for appropriate input
    assert isinstance(molecules, list), \
        'The molecules entered are not in the form of a list.'
    assert all(isinstance(mol, Chem.rdchem.Mol) for mol in molecules), \
        'The molecules entered are not rdkit Mol objects.'
    assert None not in molecules, \
        'The list of molecules entered contains None values.'
    assert isinstance(name_col, str), \
        'The input parameter name_col (%s) must be a string.' % name_col

    rows = [
        [int(bit) for bit in MACCSkeys.GenMACCSKeys(mol)]
        for mol in molecules
    ]
    labels = []
    for mol in molecules:
        labels.append(mol.GetProp(name_col) if mol.HasProp(name_col) else '')
    return pd.DataFrame(rows, index=labels)
def compute_tsne(self):
    """Embed the molecules of ``self.Database2`` in two dimensions with t-SNE.

    MACCS keys are computed for the ``SMILES`` column, the full Tanimoto
    similarity matrix is rebuilt from its lower triangle, converted to a
    distance matrix (1 - similarity) and passed to ``TSNE`` using
    ``self.perplexity``. The result (columns PC1, PC2, LIBRARY, SMILES, NAME,
    indexed by TIPO) is stored in ``self.tsne_result``; nothing is returned.
    """
    Database = self.Database2
    mols = [Chem.MolFromSmiles(smi) for smi in list(Database["SMILES"])]
    fps = [MACCSkeys.GenMACCSKeys(mol) for mol in mols]

    lower_triangle = GetTanimotoSimMat(fps)
    n_mol = len(fps)
    similarity = np.ones([n_mol, n_mol])
    similarity[np.tril_indices(n=n_mol, m=n_mol, k=-1)] = lower_triangle
    # Mirror the lower triangle into the upper one.
    upper = np.triu_indices(n=n_mol, m=n_mol, k=1)
    similarity[upper] = similarity.T[upper]
    distance = 1 - similarity

    embedder = TSNE(
        n_components=2,
        init='pca',
        random_state=1992,
        angle=0.3,
        perplexity=self.perplexity)
    tsne_result = pd.DataFrame(
        data=embedder.fit_transform(distance), columns=["PC1", "PC2"])

    metadata = (
        ("LIBRARY", Database.LIBRARY),
        ("TIPO", Database.LIBRARY),
        ("SMILES", Database.SMILES),
        ("NAME", Database.NAME),
    )
    for column, values in metadata:
        tsne_result[column] = list(values)
    self.tsne_result = tsne_result.set_index('TIPO')
def get_maccs_fp(smiles):
    """Return the MACCS keys of a SMILES string as a float array of length 167.

    Positions of set bits hold 1, all others 0.
    """
    keys = MACCSkeys.GenMACCSKeys(Chem.MolFromSmiles(smiles))
    on_bits = list(keys.GetOnBits())
    arr = np.zeros(167)
    arr[on_bits] = 1
    return arr
def smi_to_maccs(smi):
    """Convert a SMILES string to a MACCS array.

    Returns ``np.array`` of the MACCS keys, or an all-zero array of length
    167 when the SMILES cannot be parsed.
    """
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        # Unparsable input: fall back to an empty fingerprint.
        return np.zeros(167)
    return np.array(MACCSkeys.GenMACCSKeys(mol))
def MACCfpDataFrame(chempandas, namecol, smicol):
    """
    Generate the MACCS fingerprints of the compounds stored in a pandas
    DataFrame.

    Keyword arguments:
    chempandas: the compounds stored in a DataFrame, which contains the name
                and SMILES as columns; it may have at most MAXLINES rows.
    namecol: the column number of the compound names.
    smicol: the column number of SMILES in the DataFrame.

    Return:
    a DataFrame indexed by compound name with the fingerprint columns
    followed by 'SMILES' and 'CHEMBL' (a copy of the index).
    Rows whose SMILES cannot be parsed (None) are left out.

    Note:
    The SMILES output by Chem.MolToSmiles might be different from the
    original string.
    """
    assert chempandas.shape[0] <= MAXLINES

    parsed = [Chem.MolFromSmiles(x) for x in chempandas.iloc[:, smicol]]
    molsmi = []
    for row, mol in enumerate(parsed):
        if mol is None:
            continue
        # row is the position in the input, so the name stays aligned even
        # after unparsable entries were skipped.
        mol.SetProp("_Name", chempandas.iloc[row, namecol])
        molsmi.append(mol)

    # MACC Fingerprints.
    fpsmat = np.matrix([MACCSkeys.GenMACCSKeys(mol) for mol in molsmi])
    names = [mol.GetProp("_Name") for mol in molsmi]
    df = DataFrame(fpsmat, index=names)
    df['SMILES'] = [Chem.MolToSmiles(mol) for mol in molsmi]
    df['CHEMBL'] = df.index
    return (df)
def get_ecfp(
        smi_path,
        data_path='./',
):
    """Write the MACCS keys of every SMILES in a ``.smi`` file to CSV.

    NOTE(review): despite the name, the fingerprints are MACCS keys
    (``MACCSkeys.GenMACCSKeys``), not ECFP.

    Parameters:
        smi_path: path (str or Path) of a text file with one SMILES per line.
        data_path: directory receiving ``<stem>_fp.csv``, where ``<stem>`` is
            the file name of ``smi_path`` without its suffix.

    Returns:
        DataFrame with one column per fingerprint bit ('0'/'1' characters)
        followed by ``cindex`` (``<stem>_<line number>``) and ``smiles``.
        The same table is written to the CSV file.
    """
    smi_path = Path(smi_path)

    # Read-only access is enough; the file is read a single time.
    with open(str(smi_path), 'r') as f:
        lines = f.readlines()

    smiles_df = pd.DataFrame({
        'cindex': [smi_path.stem + '_' + str(idx) for idx in range(len(lines))],
        'smiles': [line.strip('\n') for line in lines],
    })

    mols = [Chem.MolFromSmiles(smi) for smi in smiles_df['smiles']]
    fingerprints = [MACCSkeys.GenMACCSKeys(molecule) for molecule in mols]
    fingerprints_bit = [list(fp.ToBitString()) for fp in fingerprints]
    fingerprints_df = pd.DataFrame(fingerprints_bit)

    data = pd.concat([fingerprints_df, smiles_df], axis=1)
    data.to_csv(str(Path(data_path) / (smi_path.stem + '_fp.csv')))
    return data
def label_switching_decoder(key_smiles, bit_list, nmol_df):
    '''
    Decode a list of predicted labels back into text.

    The key molecule seeds the random generator exactly as during encoding;
    for every position the seeded permutation of the key-to-reference
    similarities is rebuilt and the label is mapped to the rank stored at
    that position, which is taken as the ASCII code.

    :param key_smiles: key molecule (SMILES)
    :param bit_list: model predictions; every item is converted with ``int``
    :param nmol_df: df whose ``SMILES`` column provides the pool from which
        the 128 reference molecules are drawn
    :return: str; the decoded characters joined together
    '''
    codes = [int(item) for item in bit_list]  # convert strings to integers

    # labels 0..127
    orig_label = list(range(128))

    key_mol = Chem.MolFromSmiles(key_smiles)
    key_fp = MACCSkeys.GenMACCSKeys(key_mol)

    # rebuild root_seed and rotor_seed based on MW and number of atoms of key_mol
    root_seed = int(Chem.Descriptors.ExactMolWt(key_mol))
    rotor_seed = key_mol.GetNumAtoms()

    # Pick the 128 reference molecules
    np.random.seed(root_seed)
    ref_smiles = np.random.choice(nmol_df.SMILES, size=128, replace=False)

    # Similarity of every reference molecule to the key molecule
    dist = []
    for smi in ref_smiles:
        ref_fp = MACCSkeys.GenMACCSKeys(Chem.MolFromSmiles(smi))
        dist.append(DataStructs.FingerprintSimilarity(key_fp, ref_fp))

    decoded_message = []
    for index, bit in enumerate(codes):
        np.random.seed(root_seed + index * rotor_seed)
        step_dist = np.random.choice(dist, size=len(dist), replace=False)

        # dict_rank[pos] is the rank of step_dist[pos] in ascending order.
        ordered = sorted(range(len(step_dist)), key=lambda y: step_dist[y])
        dict_rank = [0] * len(dist)
        for rank, pos in enumerate(ordered):
            dict_rank[pos] = rank

        swaper_dict = dict(zip(orig_label, dict_rank))
        decoded_message.append(swaper_dict.get(bit))

    return ''.join(chr(code) for code in decoded_message)
def maacs_fingerprint_evaluation(references):
    """
    Generate Similarity via MACCSKeys

    For every molecule in ``references`` the mean Tanimoto similarity (each
    pair rounded to 4 decimals) to all other molecules is computed; entries
    comparing equal to the molecule itself are skipped.

    :param references: list of molecules accepted by MACCSkeys.GenMACCSKeys
    :return: mean of the per-molecule means, rounded to 4 decimals; NaN when
        there are no pairs to compare (e.g. empty input or a single molecule)
    """
    # One fingerprint per molecule, computed once instead of once per pair.
    fingerprints = [MACCSkeys.GenMACCSKeys(mol) for mol in references]

    scores = []
    for reference, reference_maccs in zip(references, fingerprints):
        cur_scores = [
            round(
                DataStructs.TanimotoSimilarity(reference_maccs,
                                               candidate_maccs), 4)
            for candidate, candidate_maccs in zip(references, fingerprints)
            if reference != candidate
        ]
        # No partner to compare with -> NaN, without numpy's empty-mean warning.
        scores.append(np.mean(cur_scores) if cur_scores else float("nan"))

    if not scores:
        return float("nan")
    return round(np.mean(scores), 4)
def _maccs_keys(self, molecules: List, parameters: {}):
    """Return the MACCS keys of every molecule as a numpy array.

    ``parameters`` is accepted but not used. The result is a list with one
    array per molecule, in input order.
    """
    def as_numpy(fp):
        # Buffer handed to RDKit, which fills it in place.
        buffer = np.zeros((1, ), dtype=np.int32)
        DataStructs.ConvertToNumpyArray(fp, buffer)
        return buffer

    bit_vectors = [MACCSkeys.GenMACCSKeys(mol) for mol in molecules]
    return [as_numpy(fp) for fp in bit_vectors]
def smi2fp(smi):
    """Return the MACCS keys of a SMILES string as a list of bits.

    The first bit (position 0) is dropped from the result.
    """
    keys = MACCSkeys.GenMACCSKeys(Chem.MolFromSmiles(smi))
    return list(keys)[1:]
def generate_MACCS(smiles):
    """Compute the MACCS bit strings for a sequence of SMILES.

    Returns ``(data, header)``: ``data`` holds, per SMILES, the list of
    '0'/'1' characters of its MACCS bit string; ``header`` is the list of
    column names 'bit0' .. 'bit166'.
    """
    def bits_of(smi):
        mol = Chem.MolFromSmiles(smi)
        return list(MACCSkeys.GenMACCSKeys(mol).ToBitString())

    header = ['bit' + str(position) for position in range(167)]
    data = [bits_of(smiles[k]) for k in range(len(smiles))]
    return data, header
def transform(self, molecules):
    """Build a table of MACCS fingerprint bits for the given molecules.

    ``molecules`` must provide a ``"molecules"`` column of molecule objects.
    The column names ("rdkit_fingerprintMACS_<j>") are written to
    ``MAC_descriptors.txt`` in the current directory.

    Returns a float DataFrame with one row per molecule and one column per
    fingerprint bit.
    """
    print("\tBuilding MACS Fingerprints")
    mols = molecules["molecules"].tolist()
    bit_strings = [MACCSkeys.GenMACCSKeys(mol).ToBitString() for mol in mols]

    # Build the frame in one go: DataFrame.append (row by row) is quadratic
    # and no longer exists in pandas >= 2.0.
    n_bits = max((len(bits) for bits in bit_strings), default=0)
    columns = ["rdkit_fingerprintMACS_{}".format(j) for j in range(n_bits)]
    df = pd.DataFrame([list(bits) for bits in bit_strings], columns=columns)

    np.savetxt("MAC_descriptors.txt", list(df), fmt="%s")
    return df.astype(float)
def createFingerprint(smiles):
    """Return the MACCS keys for ``smiles``, or None if they cannot be built.

    None is returned both when the SMILES cannot be parsed and when any
    error is raised while parsing or fingerprinting.
    """
    try:
        m = Chem.MolFromSmiles(smiles)
        if m is None:
            return None
        return MACCSkeys.GenMACCSKeys(m)
    except Exception:
        # Deliberate best effort. Unlike a bare ``except`` this lets
        # KeyboardInterrupt and SystemExit propagate.
        return None
def compute_MACCS(self, name):
    """Write the MACCS bits of ``self.mols`` to ``<name without last 4 chars>_MACCS.csv``.

    The CSV has a leading ``smiles`` column (taken from ``self.smiles``)
    followed by the columns 'bit0' .. 'bit166'; no index column is written.
    """
    columns = ['bit' + str(position) for position in range(167)]
    rows = [
        list(MACCSkeys.GenMACCSKeys(self.mols[k]).ToBitString())
        for k in range(len(self.mols))
    ]
    table = pd.DataFrame(rows, columns=columns)
    table.insert(loc=0, column='smiles', value=self.smiles)
    # name[:-4] strips the last four characters (e.g. an extension like ".csv").
    table.to_csv(name[:-4] + '_MACCS.csv', index=False)
def maacs_fingerprint_evaluation(references, candidates):
    """
    Generate Similarity via MACCSKeys

    For every key of ``references`` that is also in ``candidates`` the
    Tanimoto, Dice, Cosine, Sokal and McConnaughey similarities between the
    MACCS keys of both molecules are computed (rounded to 4 decimals); keys
    missing from ``candidates`` count as 0 for every metric. The mean of each
    metric is printed.

    :return: mean Tanimoto similarity, rounded to 4 decimals
    """
    print("Calculating Similarity via MACCS Keys")

    # (report line, metric) pairs; metrics are resolved lazily when called.
    metrics = (
        ("Tanimoto Similarity:{}",
         lambda ref, cand: DataStructs.TanimotoSimilarity(ref, cand)),
        ("Dice Similarity:{}",
         lambda ref, cand: DataStructs.DiceSimilarity(ref, cand)),
        ("Cosine Similarity:{}",
         lambda ref, cand: DataStructs.CosineSimilarity(ref, cand)),
        ("Sokal Similarity:{}",
         lambda ref, cand: DataStructs.SokalSimilarity(ref, cand)),
        ("McConnaughey Similarity:{}",
         lambda ref, cand: DataStructs.McConnaugheySimilarity(ref, cand)),
    )
    similarities = [[] for _ in metrics]

    for img in references:
        if img in candidates:
            candidate_maccs = MACCSkeys.GenMACCSKeys(candidates[img])
            reference_maccs = MACCSkeys.GenMACCSKeys(references[img])
            row = [
                round(metric(reference_maccs, candidate_maccs), 4)
                for _, metric in metrics
            ]
        else:
            # No candidate for this key: every metric scores 0.
            row = [0] * len(metrics)
        for bucket, value in zip(similarities, row):
            bucket.append(value)

    print("Done Calculating Similarity via MACCS Keys")
    print("##########################################")
    for (report_line, _), bucket in zip(metrics, similarities):
        print(report_line.format(round(np.mean(bucket), 4)))
    print("##########################################")
    return round(np.mean(similarities[0]), 4)