Python DataStructs.FingerprintSimilarity示例，rdkit.Chem.DataStructs.FingerprintSimilarity Python示例

示例#1

0

显示文件

def filter_actions(smiles, valid_actions, target_fps, target_atoms,
                   target_bonds, target_C_envs, radius):
    """Keep only the candidate actions that move closer to the target.

    The current molecule (``smiles``) is fingerprinted as a 1024-bit Morgan
    bit vector and its similarity to ``target_fps`` is the baseline. Each
    candidate in ``valid_actions`` is kept when its similarity to the target
    is strictly greater than that baseline and ``mol_violation`` reports no
    violation for its atoms, bonds and C environments.

    Returns:
        ``(actions, reach)``. If a candidate has similarity exactly 1 to the
        target, iteration stops and the result is ``([candidate], True)``;
        otherwise ``(kept_candidates, False)``.
    """
    current_fp = AllChem.GetMorganFingerprintAsBitVect(
        Chem.MolFromSmiles(smiles), radius=radius, nBits=1024)
    current_score = DataStructs.FingerprintSimilarity(current_fp, target_fps)

    kept = []
    for candidate in valid_actions:
        cand_fp, cand_atoms, cand_bonds, cand_envs = get_mol_infos(
            candidate, radius)
        score = DataStructs.FingerprintSimilarity(cand_fp, target_fps)

        # mol_violation is only consulted for candidates that improve on the
        # current similarity (short-circuit evaluation).
        if score > current_score and not mol_violation(
                cand_atoms, cand_bonds, cand_envs, target_atoms,
                target_bonds, target_C_envs):
            kept.append(candidate)

        # A perfect match overrides everything collected so far.
        if score == 1:
            return [candidate], True

    return kept, False

示例#2

0

显示文件

def rd_kit(dir_sdf = "../data/sdf/"):
    """Plot how similar every SDF file in ``dir_sdf`` is to a baseline file.

    The first file (in sorted order) is the baseline. For the first molecule
    of every file, Morgan, RDKit, Avalon and layered fingerprints are computed
    and compared to the baseline with Tanimoto similarity. Histograms of the
    Morgan and RDKit similarities are shown, and the mean RDKit similarity and
    the number of files are printed.

    Args:
        dir_sdf: directory containing the SDF files.

    Raises:
        IndexError: if the directory contains no (non-hidden) entries.
    """
    # List the directory directly instead of building an ``ls`` shell command
    # from the argument; hidden entries are skipped as ``ls`` does. Names are
    # sorted by code point, which can differ from a locale-aware ``ls`` order.
    sdf_files = sorted(
        name for name in os.listdir(dir_sdf) if not name.startswith("."))

    bit_length = 1024
    # The baseline and the compared molecules must share fingerprint
    # parameters, otherwise even the baseline file does not score 1.0 against
    # itself. The original used 2 for the baseline and 3 inside the loop.
    morgan_radius = 2
    rdk_max_path = 2
    avalon_bits = 128

    def fingerprints(mol):
        # Return the (morgan, rdk, avalon, layered) fingerprints of one molecule.
        return (
            AllChem.GetMorganFingerprintAsBitVect(mol, morgan_radius, nBits=bit_length),
            AllChem.RDKFingerprint(mol, maxPath=rdk_max_path),
            pyAvalonTools.GetAvalonFP(mol, avalon_bits),
            AllChem.LayeredFingerprint(mol),
        )

    baseline = SDMolSupplier(os.path.join(dir_sdf, sdf_files[0]))
    baseline_fps = fingerprints(baseline[0])

    # One similarity list per fingerprint type, in the order of fingerprints().
    similarities = ([], [], [], [])
    for item in sdf_files:
        suppl = SDMolSupplier(os.path.join(dir_sdf, item))
        for sims, base_fp, fp in zip(similarities, baseline_fps, fingerprints(suppl[0])):
            sims.append(
                DataStructs.FingerprintSimilarity(base_fp, fp, metric=DataStructs.TanimotoSimilarity))
    count = len(sdf_files)

    sim_matrix_morgan = np.array(similarities[0])
    sim_matrix_rdk = np.array(similarities[1])
    sim_matrix_aval = np.array(similarities[2])
    sim_matrix_layer = np.array(similarities[3])

    label_morgan = "morgan" + str(bit_length)
    plt.hist(sim_matrix_morgan, label = label_morgan)
    plt.hist(sim_matrix_rdk, label = "rdk2")
    #plt.hist(sim_matrix_aval, label = "avalon128")
    #plt.hist(sim_matrix_layer, label = "layer")
    print(np.mean(sim_matrix_rdk))
    print(count)
    plt.xlabel("Similarity to Baseline")
    plt.ylabel("Counts")
    plt.title("Different Fingerprinting Methods, Similarity to Baseline")
    plt.legend()
    plt.show()

示例#3

0

显示文件

def chemical_random_episode(env, search_dict, target_fps, target_atoms,
                            target_bonds, target_C_envs, radius):
    """Roll out one episode, picking uniformly among the filtered actions.

    ``search_dict`` maps a state to the actions still considered worth trying
    from it and is updated in place: a state is recorded on first visit, a
    state with no remaining action is removed (also from its predecessor's
    list), and the final action of a successful episode is removed from its
    parent's list.

    Returns:
        ``(episode, reach, search_dict)`` where ``episode`` is the list of
        visited states, ``reach`` is the flag last returned by
        ``filter_actions``, and ``search_dict`` is replaced by the string
        ``'terminate'`` when it has shrunk to ``{initial_state: []}``.
    """

    def _similarity_to_target(smi):
        # Morgan bit-vector similarity between ``smi`` and the target.
        mol = Chem.MolFromSmiles(smi)
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius)
        return DataStructs.FingerprintSimilarity(fp, target_fps)

    initial_state = env.reset()
    state = pre_state = initial_state
    episode = [initial_state]
    reach = False

    while True:
        unseen = state not in search_dict
        if not unseen and search_dict == {initial_state: []}:
            # Nothing left to explore from the start state.
            search_dict = 'terminate'
            break

        # First visit: ask the environment; otherwise reuse the stored list.
        # Either way the candidates are (re-)filtered against the target.
        candidates = env._get_valid_actions() if unseen else search_dict[state]
        valid_actions, reach = filter_actions(state, candidates, target_fps,
                                              target_atoms, target_bonds,
                                              target_C_envs, radius)
        if unseen:
            search_dict[state] = valid_actions

        n_actions = len(valid_actions)
        if n_actions == 0:
            # Dead end: forget this state and stop offering it from its
            # predecessor.
            search_dict.pop(state)
            search_dict[pre_state].remove(state)
            dead_end_similarity = _similarity_to_target(state)
            print('No action space, Last action: %s, similarity: %.3f' %
                  (state, dead_end_similarity))
            break

        choice = np.random.randint(n_actions)
        next_state, _reward, _done = env.step(valid_actions, choice)
        episode.append(next_state)

        if reach:
            search_dict[state].remove(next_state)
            final_similarity = _similarity_to_target(next_state)
            print('Reach, last action: %s, similarity: %.3f' %
                  (next_state, final_similarity))
            break

        pre_state, state = state, next_state

    return episode, reach, search_dict

示例#4

0

显示文件

文件： CompoundSimilarity.py 项目： the-ahuja-lab/Machine-Olf-Action

    def measure_similarity(self, db_fps, sim_metric=DataStructs.TanimotoSimilarity, th=0.8):
        """Compare one database fingerprint against every user fingerprint.

        Reads the module-level ``user_ip_fps`` and records matches in the
        module-level ``fps_matches`` as ``{db_cntr: [(user_index, sim), ...]}``
        for every similarity ``>= th``. The module-level ``db_cntr`` is
        incremented only when ``db_fps`` is not None. A failure on one user
        fingerprint is logged and does not stop the remaining comparisons.
        """
        global user_ip_fps
        global db_cntr
        global fps_matches

        if db_cntr % 10000 == 0:
            self.jlogger.info("Completed checking similarity with {} compound of db".format(db_cntr))

        if db_fps is None:
            self.jlogger.debug("DB Finger print is unavailable, skipping this compound {}".format(db_cntr))
            return

        for u_fps_cntr, u_fps in enumerate(user_ip_fps):
            try:
                if u_fps is None:
                    self.jlogger.debug(
                        "User Finger print is unavailable, skipping this compound {}".format(u_fps_cntr))
                    continue

                sim = DataStructs.FingerprintSimilarity(u_fps, db_fps, metric=sim_metric)
                if sim >= th:
                    fps_matches.setdefault(db_cntr, []).append((u_fps_cntr, sim))
            except Exception as e:
                failure = "Error measuring similarity of compound db_cntr {} and u_fps_cntr {}".format(
                    db_cntr, u_fps_cntr)
                logger.exception(failure)
                self.jlogger.debug(failure)

        db_cntr += 1

示例#5

0

显示文件

def TakeInput(filepath, hmdb_filepath, OR_name):
    """Find HMDB molecules that are structurally similar to the given ligands.

    Both CSV files are read with ISO-8859-1 encoding. Every distinct
    (``Smiles``, ``Ligand``) row of ``filepath`` is compared against every
    (``SMILES``, ``NAME``) row of ``hmdb_filepath`` using
    ``FingerprintMols.FingerprintMol`` fingerprints and
    ``DataStructs.FingerprintSimilarity``; pairs scoring ``>= 0.85`` are kept.

    Two files are written to the current directory:
      * ``Final_test_set_<OR_name>.csv`` with all matching pairs;
      * ``Shortlisted_Metabolites<OR_name>.csv`` with one row per matched
        ligand (columns ``Smiles``, ``Ligand``, ``Activation Status``).

    Args:
        filepath: CSV with ``Smiles`` and ``Ligand`` columns.
        hmdb_filepath: CSV with ``SMILES`` and ``NAME`` columns.
        OR_name: suffix used in the output file names.
    """
    # The result of this call was always overwritten by the read_csv below;
    # the call itself is kept in case it has side effects.
    extractPositiveOnes(filepath)
    data_hmdb = pd.read_csv(hmdb_filepath, encoding="ISO-8859-1")
    positive_Cancer = pd.read_csv(filepath, encoding="ISO-8859-1")

    hmdb_data = data_hmdb[['SMILES', 'NAME']].reset_index(drop=True)
    Cancer_clean_data = (positive_Cancer[['Smiles', 'Ligand']]
                         .drop_duplicates()
                         .reset_index(drop=True))

    # Fingerprint each HMDB molecule once instead of once per ligand.
    # Entries that cannot be parsed are stored as None and skipped later.
    hmdb_fps = []
    for hmdb_smiles in hmdb_data['SMILES']:
        try:
            hmdb_fps.append(
                FingerprintMols.FingerprintMol(Chem.MolFromSmiles(hmdb_smiles)))
        except Exception:
            print("WARNING")
            hmdb_fps.append(None)

    threshold = 0.85  # threshold for similarity value
    rows = []
    ligand_pairs = zip(Cancer_clean_data['Smiles'], Cancer_clean_data['Ligand'])
    for i, (ligand_smiles, ligand_name) in enumerate(ligand_pairs):
        try:
            fps1 = FingerprintMols.FingerprintMol(
                Chem.MolFromSmiles(ligand_smiles))
        except Exception:
            # An unparsable ligand used to abort the whole run.
            print("WARNING")
            fps1 = None

        if fps1 is not None:
            for j, fps2 in enumerate(hmdb_fps):
                if fps2 is None:
                    continue
                try:
                    sim_val = DataStructs.FingerprintSimilarity(fps1, fps2)
                except Exception:
                    print("WARNING")
                    continue
                if sim_val >= threshold:
                    rows.append([
                        ligand_name, ligand_smiles, hmdb_data['NAME'][j],
                        hmdb_data['SMILES'][j], sim_val
                    ])
        print("Comparison Done for Ligand :" + str(i))

    df1 = pd.DataFrame(rows, columns=[
        "Cancer_Molecule", "Cancer_SMILES", "HMDB_Molecule", "HMDB_SMILES",
        "TANIMOTO_Similarity_Value"
    ])
    df1.to_csv("Final_test_set_" + OR_name + ".csv")

    # The original looked up columns that do not exist in df1
    # ("Cancer_clean_data_*") and zipped with an empty list, which can only
    # yield an empty table.
    # NOTE(review): 'Activation Status' is left blank here; confirm what it
    # is supposed to contain.
    Shortlisted_Metabolites = df1[["Cancer_SMILES", "Cancer_Molecule"]].rename(
        columns={"Cancer_SMILES": "Smiles", "Cancer_Molecule": "Ligand"})
    Shortlisted_Metabolites['Activation Status'] = None
    Shortlisted_Metabolites = Shortlisted_Metabolites.drop_duplicates(
        subset='Ligand', keep='first')
    Shortlisted_Metabolites.to_csv("Shortlisted_Metabolites" + OR_name +
                                   ".csv")
    print("Shortlisted_Metabolites" + OR_name + ".csv" + " has been saved")
    print("Congrats! Final_test_set_" + OR_name +
          ".csv has been successfully saved!")

示例#6

0

显示文件

文件： MOLREAD_GUI.py 项目： liuyunwu/CryptoChem

def label_switching_decoder(key_smiles, bit_list, nmol_df):
    '''
    Decode a sequence of predicted labels into text using a key molecule.

    :param key_smiles: SMILES of the key molecule
    :param bit_list: model predictions; each item is converted with ``int``
    :param nmol_df: object whose ``SMILES`` attribute holds the pool from
        which the 128 reference ('neighbor') molecules are drawn
    :return: str; the decoded message, one character per item of ``bit_list``
        (an empty string when ``bit_list`` is empty)
    '''
    bit_list = list(map(int, bit_list))  # convert strings to integers

    # labels 0..127
    orig_label = list(range(128))

    key_mol = Chem.MolFromSmiles(key_smiles)
    key_fp = MACCSkeys.GenMACCSKeys(key_mol)
    # rebuild root_seed and rotor_seed from the exact molecular weight and the
    # number of atoms of key_mol
    root_seed = int(Chem.Descriptors.ExactMolWt(key_mol))
    rotor_seed = key_mol.GetNumAtoms()

    # Pick the 128 reference molecules (reproducible through root_seed)
    np.random.seed(root_seed)
    ref_smiles = np.random.choice(nmol_df.SMILES, size=128, replace=False)

    # MACCS-key similarity of every reference molecule to the key molecule
    dist = [
        DataStructs.FingerprintSimilarity(
            key_fp, MACCSkeys.GenMACCSKeys(Chem.MolFromSmiles(smi)))
        for smi in ref_smiles
    ]

    decoded_message = []
    for index, bit in enumerate(bit_list):
        # A different, reproducible shuffle of the similarities per position
        np.random.seed(root_seed + index * rotor_seed)
        step_dist = np.random.choice(dist, size=len(dist), replace=False)

        # dict_rank[x] is the rank of step_dist[x] in ascending order
        dict_rank = [0] * len(dist)
        for i, x in enumerate(
                sorted(range(len(step_dist)), key=lambda y: step_dist[y])):
            dict_rank[x] = i

        # Swap the original labels according to those ranks
        swaper_dict = dict(zip(orig_label, dict_rank))
        decoded_message.append(swaper_dict.get(bit))

    # Build the string once, after the loop, so an empty bit_list yields ''
    # instead of referencing an unbound variable.
    return ''.join(chr(code) for code in decoded_message)

示例#7

0

显示文件

文件： descriptor_test.py 项目： santi921/ML_CO2

def rd_kit_rd(dir_sdf = "../data/sdf/"):
	"""Return RDKit-fingerprint Tanimoto similarities of all SDF files to a baseline.

	The first file of ``dir_sdf`` (in sorted order) is the baseline; the first
	molecule of every file is fingerprinted with ``RDKFingerprint(maxPath=2)``
	and compared to the baseline.

	Args:
		dir_sdf: directory containing the SDF files. It is now honoured when
			opening the files; the original always read from "../data/sdf/".

	Returns:
		numpy array with one similarity per file, in sorted file order.

	Raises:
		IndexError: if the directory contains no (non-hidden) entries.
	"""
	# List the directory directly instead of building an ``ls`` shell command
	# from the argument; hidden entries are skipped as ``ls`` does. Names are
	# sorted by code point, which can differ from a locale-aware ``ls`` order.
	sdf_files = sorted(
		name for name in os.listdir(dir_sdf) if not name.startswith("."))

	baseline = SDMolSupplier(os.path.join(dir_sdf, sdf_files[0]))
	baseline_rdk = AllChem.RDKFingerprint(baseline[0], maxPath=2)

	sim_matrix_rdk = []
	for item in sdf_files:
		suppl = SDMolSupplier(os.path.join(dir_sdf, item))
		fp_rdk = AllChem.RDKFingerprint(suppl[0], maxPath=2)
		sim_matrix_rdk.append(DataStructs.FingerprintSimilarity(baseline_rdk, fp_rdk, metric=DataStructs.TanimotoSimilarity))

	return np.array(sim_matrix_rdk)

示例#8

0

显示文件

    def _searchMorganFP (self, cutoff, numsel, metric):
        """Find, for each row of ``self.X``, the most similar entries of ``self.Xref``.

        Each row of ``self.X`` is turned into a bit-vector fingerprint and
        compared with every item of ``self.Xref`` using Tanimoto similarity.
        Similarities ``<= cutoff`` are discarded and at most ``numsel`` hits
        are kept, ordered from most to least similar.

        ``metric`` is normalised (None or an incompatible value becomes
        'Tanimoto', the latter with a warning), but the similarity itself is
        always Tanimoto.

        Returns:
            ``(True, results)``; ``results`` has one dict per query with a
            'distances' list plus one list per key of ``self.objinforef``.
        """
        if metric is None:
            metric = 'Tanimoto'
        elif metric in ['Substructural']:
            LOG.warning (f'Metric {metric} is not compatible with the descriptors present in this space')
            metric = 'Tanimoto'

        started = time.time()
        results = []

        # for each compound in the search set
        for query_bits in self.X:
            query_fp = DataStructs.cDataStructs.CreateFromBitString(
                "".join(query_bits.astype(str)))

            # (similarity, reference index) pairs, kept sorted best-first
            hits = []
            worst_kept = 0.000

            #TODO Check speed BulkTanimoto
            # for each compound in the space
            for ref_idx, ref_fp in enumerate(self.Xref):
                sim = DataStructs.FingerprintSimilarity(
                    query_fp, ref_fp, metric=DataStructs.TanimotoSimilarity)

                if sim <= cutoff:
                    continue

                if len(hits) < numsel:
                    # the result set is not full yet: just add
                    hits.append((sim, ref_idx))
                    hits.sort(reverse=True)
                    worst_kept = hits[-1][0]
                elif sim > worst_kept:
                    # better than the worst kept hit: replace it
                    hits[-1] = (sim, ref_idx)
                    hits.sort(reverse=True)
                    worst_kept = hits[-1][0]

                    # if the worst kept hit is identical, the search cannot improve
                    if worst_kept == 1.000:
                        break

            # results for this query: distances plus every piece of object
            # information (name, smiles, ID, activity, etc.)
            info = {'distances': []}
            info.update((key, []) for key in self.objinforef)
            for sim, ref_idx in hits:
                info['distances'].append(sim)
                for key in self.objinforef:
                    info[key].append(self.objinforef[key][ref_idx])

            results.append(info)

        LOG.info (f'search completed in time: {time.time()-started:.4f} secs')

        return True, results

示例#9

0

显示文件

文件： descriptor_test.py 项目： santi921/ML_CO2

# Excerpt of a script: `baseline`, `baseline_morgan`, `temp`, `bit_length` and
# the `sim_matrix_*` lists are defined before this point (not visible here).

# Baseline fingerprints, computed from the first molecule of `baseline`.
baseline_rdk = AllChem.RDKFingerprint(baseline[0], maxPath=2)
baseline_aval = pyAvalonTools.GetAvalonFP(baseline[0], 128)
baseline_layer = AllChem.LayeredFingerprint(baseline[0])

# Compare the first molecule of every file listed in `temp` with the baseline.
for item in temp:
    suppl = SDMolSupplier("../data/sdf/" + item)

    # NOTE(review): `fp` is not used anywhere in the visible excerpt.
    fp = AllChem.GetMorganFingerprint(suppl[0], 2)

    # Same four fingerprint types as the baseline.
    fp_bit = AllChem.GetMorganFingerprintAsBitVect(suppl[0], 2, nBits=bit_length)
    fp_rdk = AllChem.RDKFingerprint(suppl[0], maxPath=2)
    fp_aval = pyAvalonTools.GetAvalonFP(suppl[0], 128)
    fp_layer = AllChem.LayeredFingerprint(suppl[0])

    # Tanimoto similarity to the baseline, one list per fingerprint type.
    sim_matrix_morgan.append(
        DataStructs.FingerprintSimilarity(baseline_morgan, fp_bit, metric=DataStructs.TanimotoSimilarity))
    sim_matrix_rdk.append(
        DataStructs.FingerprintSimilarity(baseline_rdk, fp_rdk, metric=DataStructs.TanimotoSimilarity))
    sim_matrix_aval.append(
        DataStructs.FingerprintSimilarity(baseline_aval, fp_aval, metric=DataStructs.TanimotoSimilarity))
    sim_matrix_layer.append(
        DataStructs.FingerprintSimilarity(baseline_layer, fp_layer, metric=DataStructs.TanimotoSimilarity))

# Convert the accumulated similarity lists to numpy arrays.
sim_matrix_morgan = np.array(sim_matrix_morgan)
sim_matrix_rdk = np.array(sim_matrix_rdk)
sim_matrix_aval = np.array(sim_matrix_aval)
sim_matrix_layer = np.array(sim_matrix_layer)

# Histogram the Morgan and RDKit similarities; the Avalon and layered arrays
# are not plotted in this excerpt.
label_morgan = "morgan" + str(bit_length)
plt.hist(sim_matrix_morgan, label = label_morgan)
plt.hist(sim_matrix_rdk, label = "rdk2")

示例#10

0

显示文件

文件： space.py 项目： e7dal/flame

    def search (self, cutoff, numsel, metric):
        ''' Search the reference space for the compounds most similar to each
            compound of the input set (the rows of self.X).

            cutoff: similarities lower than or equal to this value are
                    discarded (None means 0.0)
            numsel: maximum number of hits kept per compound (None means 10)
            metric: 'Tanimoto' or 'Euclidean'; forced to 'Euclidean' when
                    self.isFingerprint is False, and chosen from
                    self.isFingerprint when None

            Returns (True, results): one dictionary per input compound with a
            'distances' list plus one list per key of self.objinfo, ordered
            from most to least similar.
        '''

        # load pickle with reference space
        self.load_space()

        # set defaults
        cutoff = 0.0 if cutoff is None else cutoff
        numsel = 10 if numsel is None else numsel

        # float variables only can be compared using euclidean
        if self.isFingerprint is False:
            metric = 'Euclidean'
        elif metric is None:
            metric = 'Tanimoto' if self.isFingerprint else 'Euclidean'

        results = []

        # for each compound in the search set
        for query in self.X:

            if self.isFingerprint:
                query_fp = DataStructs.cDataStructs.CreateFromBitString(
                    "".join(query.astype(str)))

            # (similarity, reference index) pairs, kept sorted best-first
            hits = []
            worst_kept = 0.000

            # for each compound in the space
            for ref_idx, ref in enumerate(self.Xref):

                if metric == 'Tanimoto':
                    sim = DataStructs.FingerprintSimilarity(
                        query_fp, ref, metric=DataStructs.TanimotoSimilarity)
                elif metric == 'Euclidean':
                    sim = 1.000-(distance.euclidean(query,ref)/self.Dmax)

                if sim <= cutoff:
                    continue

                if len(hits) < numsel:
                    # the result set is not full yet: just add
                    hits.append((sim, ref_idx))
                    hits.sort(reverse=True)
                    worst_kept = hits[-1][0]
                elif sim > worst_kept:
                    # better than the worst kept hit: replace it
                    hits[-1] = (sim, ref_idx)
                    hits.sort(reverse=True)
                    worst_kept = hits[-1][0]

                    # if the worst kept hit is identical, the search cannot improve
                    if worst_kept == 1.000:
                        break

            # results for this compound: distances plus every piece of object
            # information (name, smiles, ID, activity, etc.)
            info = {'distances': []}
            info.update((key, []) for key in self.objinfo)
            for sim, ref_idx in hits:
                info['distances'].append(sim)
                for key in self.objinfo:
                    info[key].append(self.objinfo[key][ref_idx])

            results.append(info)

        return True, results

示例#11

0

显示文件

文件： space.py 项目： ismaelresp/flame

    def search (self, X, cutoff, numsel, metric):
        ''' Search the reference space (self.X) for the compounds most similar
            to each row of the X matrix given as argument.

            X:      matrix with one row of descriptors per query compound
            cutoff: similarities lower than or equal to this value are
                    discarded (None means 0.0)
            numsel: maximum number of hits kept per compound (None means 10)
            metric: 'Tanimoto' or 'Euclidean'. Tanimoto needs fingerprints, so
                    'Euclidean' is always used when the space was not built
                    with the 'morganFP' method; None selects 'Tanimoto' for
                    fingerprints

            Returns (True, results): one dictionary per query compound with
            the keys 'distances', 'names', 'ids' and 'SMILES', ordered from
            most to least similar.
        '''

        # load pickle with reference space
        self.load_space()

        # True for fingerprint MD
        isFingerprint = (self.param.getVal('computeMD_method') == ['morganFP'])

        # set defaults
        if cutoff is None:
            cutoff = 0.0

        if numsel is None:
            #numsel = len(self.X)
            numsel = 10

        # float variables only can be compared using euclidean; asking for
        # Tanimoto here used to fail because no fingerprint is ever built
        if not isFingerprint:
            metric = 'Euclidean'
        elif metric is None:
            metric = 'Tanimoto'

        results = []

        # for each compound in the search set
        for ivector in X:

            if isFingerprint:
                bitestring="".join(ivector.astype(str))
                ifp = DataStructs.cDataStructs.CreateFromBitString(bitestring)

            # (similarity, reference index) pairs, kept sorted best-first
            hits = []
            d_worst = 0.000

            # for each compound in the space
            for j, jvector in enumerate(self.X):

                if metric == 'Tanimoto':
                    d = DataStructs.FingerprintSimilarity(ifp,jvector, metric=DataStructs.TanimotoSimilarity)
                elif metric == 'Euclidean':
                    d = 1.000-(distance.euclidean(ivector,jvector)/self.Dmax)

                if d <= cutoff:
                    continue

                if len(hits) < numsel:
                    # the result set is not full yet: just add
                    hits.append((d, j))
                    hits.sort(reverse=True)
                    d_worst = hits[-1][0]
                elif d > d_worst:
                    # better than the worst kept hit: replace it
                    hits[-1] = (d, j)
                    hits.sort(reverse=True)
                    d_worst = hits[-1][0]

                    # if the worst kept hit is identical, the search cannot improve
                    if d_worst == 1.000:
                        break

            results.append({'distances':[sd for sd, _ in hits],
                            'names':[self.names[si] for _, si in hits],
                            'ids':[self.ids[si] for _, si in hits],
                            'SMILES':[self.SMILES[si] for _, si in hits]
            })

        return True, results

示例#12

0

显示文件

文件： play.py 项目： ismaelresp/flame

)
# Excerpt of a script: `mol`, `fp1` and `xmatrix` are defined before this
# point (not visible here).

# 2048-bit Morgan fingerprint (radius 2) of `mol`, also copied into row 1 of
# `xmatrix` as a numpy array.
fp2 = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048)
DataStructs.ConvertToNumpyArray(fp2, xmatrix[1])

# Persist the numpy representation ...
with open('nfingers.pkl', 'wb') as fo:
    pickle.dump(xmatrix, fo)

# ... and the two fingerprint objects, pickled one after the other in the
# same file.
with open('rfingers.pkl', 'wb') as fo:
    pickle.dump(fp1, fo)
    pickle.dump(fp2, fo)

# xmatrix = np.vstack((xmatrix, fp2))

# Repeat the same fingerprint similarity 1,000,000 times and print the last
# value (presumably a manual timing run; no timer is visible here).
print('start')
for i in range(1000000):
    d = DataStructs.FingerprintSimilarity(
        fp1, fp2, metric=DataStructs.TanimotoSimilarity)
print(d)

# d = DataStructs.FingerprintSimilarity(xmatrix[0],xmatrix[1], metric=DataStructs.TanimotoSimilarity)

# Same comparison on the numpy rows, as 1 - Jaccard distance, repeated only
# 100 times.
# NOTE(review): `distance` is presumably scipy.spatial.distance - confirm.
print('start')
x1 = xmatrix[0]
x2 = xmatrix[1]
for i in range(100):
    d = 1.0 - distance.jaccard(x1, x2)
print(d)

# fp1 = np.array(AllChem.GetMorganFingerprintAsBitVect(mol1, 8), dtype='bool')
# fp2 = np.array(AllChem.GetMorganFingerprintAsBitVect(mol2, 8), dtype='bool')