def atom_pair_fp(self):
    """Build a de-duplicated ``[fingerprint total, encoded label]`` array.

    Reads the CSV at ``self.csv_path``, parses every entry of its ``Smiles``
    column and reduces each molecule's atom-pair fingerprint to one number
    with ``GetTotalVal()``. Rows whose SMILES cannot be turned into a
    fingerprint are dropped, the ``Label`` column of the remaining rows is
    encoded with ``LabelEncoder``, and rows that share a fingerprint total
    are collapsed to a single row.

    Returns:
        numpy.ndarray: ``float32`` array with two columns (fingerprint
        total, encoded label). Rows are in the order produced by
        ``np.unique`` and, for duplicated totals, the first occurrence is
        the one kept. If no SMILES could be processed an empty array of
        shape ``(0, 2)`` is returned.
    """
    df = pd.read_csv(self.csv_path)

    totals = []
    not_found = []
    for i, smiles in enumerate(tqdm(df['Smiles'].tolist())):
        try:
            mol = Chem.MolFromSmiles(smiles)
            if mol is None:
                # Unparseable SMILES: nothing to fingerprint.
                total = None
            else:
                # Bit vector here will be huge, which is why only the total
                # value of the fingerprint is kept.
                total = Pairs.GetAtomPairFingerprintAsIntVect(mol).GetTotalVal()
        except Exception:
            # Best effort per row (e.g. a non-string cell); the row is
            # reported and dropped below. Unlike a bare ``except`` this
            # still lets KeyboardInterrupt/SystemExit propagate.
            total = None

        if total is None:
            not_found.append(i)
        else:
            totals.append(total)

    # Keep the frame aligned with ``totals`` by removing the failed rows.
    df.drop(not_found, axis=0, inplace=True)
    print('Number of FPs not found: {}'.format(len(not_found)))
    df.reset_index(drop=True, inplace=True)

    if not totals:
        # Nothing usable: return an empty result with the usual two columns
        # instead of failing later while stacking zero rows.
        return np.empty((0, 2), dtype=np.float32)

    labelencoder = LabelEncoder()
    Y = labelencoder.fit_transform(df['Label'].values)
    Y = Y.reshape(Y.shape[0], 1)
    print('Output shape: {}'.format(Y.shape))

    # One column holding the fingerprint total of every surviving row.
    X = np.asarray(totals, dtype=np.float32).reshape(-1, 1)
    print('Typeof X', type(X))
    print(X)
    print('Input shape: {}'.format(X.shape))

    final_array = np.concatenate((X, Y), axis=1)

    # Removing rows, from final_array, where duplicate FPs are present.
    # Only the fingerprint column(s) take part in the comparison; the label
    # in the last column is ignored.
    fp_columns = final_array[:, :-1]
    _, unq_row_indices = np.unique(fp_columns, return_index=True, axis=0)
    final_array_unique = final_array[unq_row_indices]

    print('Number of Duplicate FPs: {}'.format(
        final_array.shape[0] - final_array_unique.shape[0]))
    print('Final Numpy array shape: {}'.format(final_array_unique.shape))
    print('Type of final array: {}'.format(type(final_array_unique)))

    return np.asarray(final_array_unique, dtype=np.float32)
def atom(SMILES):
    """Return one atom-pair fingerprint per entry of ``SMILES``.

    Every entry is first converted with ``Chem.MolFromSmiles``; only after
    all entries have been converted are the results passed, in the same
    order, to ``Pairs.GetAtomPairFingerprintAsIntVect``.

    Args:
        SMILES: iterable of SMILES strings.

    Returns:
        list: the fingerprints, in input order (empty for empty input).
    """
    # Phase 1: parse everything up front.
    molecules = []
    for smiles in SMILES:
        molecules.append(Chem.MolFromSmiles(smiles))

    # Phase 2: fingerprint the parsed molecules.
    fingerprints = []
    for molecule in molecules:
        fingerprints.append(Pairs.GetAtomPairFingerprintAsIntVect(molecule))
    return fingerprints
def BuildAtomPairFP(mol):
    """Return the atom-pair fingerprint of ``mol`` with its total cached.

    The fingerprint comes from ``Pairs.GetAtomPairFingerprintAsIntVect``;
    before it is returned, the result of its ``GetTotalVal()`` is stored on
    the fingerprint object as ``_sumCache``.

    Args:
        mol: molecule to fingerprint (presumably an RDKit ``Mol`` - confirm
            against callers).

    Returns:
        The fingerprint object carrying the extra ``_sumCache`` attribute.
    """
    # Imported lazily, so RDKit is only required when this is called.
    from rdkit.Chem.AtomPairs import Pairs

    pair_fp = Pairs.GetAtomPairFingerprintAsIntVect(mol)
    total = pair_fp.GetTotalVal()
    pair_fp._sumCache = total
    return pair_fp
def Calc_AtomPairs_Int(self):
    """Return the atom-pair fingerprints of every item in ``self.sd``.

    Each item yielded by iterating ``self.sd`` is passed to
    ``Pairs.GetAtomPairFingerprintAsIntVect``; results are collected in
    iteration order.

    Returns:
        list: one fingerprint per item of ``self.sd``.
    """
    # NOTE(review): the original author tagged this function "type error".
    # Presumably some items of self.sd are not valid molecules - confirm
    # against the code that populates self.sd.
    fingerprints = []
    for molecule in self.sd:
        fingerprints.append(Pairs.GetAtomPairFingerprintAsIntVect(molecule))
    return fingerprints