Exemplo n.º 1
0
def molecular_similarity(best, parent_candidates, all=False):
    """
    Return the similarity score (0-1) of best with the
    closest molecular relative in parent_candidates

    Parameters
    ----------
    best : object
        Chromosome object, the current mutated candidate.
        Only its ``Mol`` attribute is read.
    parent_candidates : array
        parent pool of molecules to compare with best.
        These are represented by SMILES
    all : boolean, optional, default = False
        default behavior is false and the tanimoto
        similarity score is returned. If True
        tanimoto, dice, cosine, sokal, kulczynski,
        and mcconnaughey similarities are returned

    Returns
    ----------
    similarity_score : float
    similarity_index : int
        if all=False the best tanimoto similarity score
        as well as the index of the closest molecular
        relative are returned
        if all=True a list of best scores and a list of indices
        of the closest molecular relative (one entry per metric,
        in the order listed above) are returned

    Raises
    ----------
    ValueError
        if parent_candidates is empty (there is no closest relative)
    """
    if len(parent_candidates) == 0:
        raise ValueError("parent_candidates must not be empty")

    # Fingerprints do not depend on the metric, so build each one once
    # instead of once per (metric, candidate) pair.
    best_fp = FingerprintMols.FingerprintMol(best.Mol)
    candidate_fps = [
        FingerprintMols.FingerprintMol(Chem.MolFromSmiles(smiles))
        for smiles in parent_candidates
    ]

    if not all:
        # No metric given: FingerprintSimilarity falls back to its default.
        scores = [
            DataStructs.FingerprintSimilarity(best_fp, fp)
            for fp in candidate_fps
        ]
        top_score = max(scores)
        return top_score, scores.index(top_score)

    metrics = [
        DataStructs.TanimotoSimilarity, DataStructs.DiceSimilarity,
        DataStructs.CosineSimilarity, DataStructs.SokalSimilarity,
        DataStructs.KulczynskiSimilarity,
        DataStructs.McConnaugheySimilarity
    ]
    scores = []
    indices = []
    for metric in metrics:
        metric_scores = [
            DataStructs.FingerprintSimilarity(best_fp, fp, metric=metric)
            for fp in candidate_fps
        ]
        top_score = max(metric_scores)
        scores.append(top_score)
        # index() returns the first candidate reaching the maximum
        indices.append(metric_scores.index(top_score))
    return scores, indices
def similarityMeasure(fps, neg, mol2):
    """
    Score ``mol2`` against a positive and a negative reference fingerprint.

    A 2D fingerprint is generated for ``mol2`` with the free name
    ``sigFactory`` (not a parameter; it must be defined in an enclosing
    scope) and compared by Tanimoto similarity to ``fps`` and ``neg``.
    The SMILES of ``mol2`` and both similarities are printed on one line.

    Returns
    -------
    tuple
        (similarity to ``fps``, similarity to ``fps`` minus similarity
        to ``neg``)
    """
    tanimoto = DataStructs.TanimotoSimilarity
    candidate_fp = Generate.Gen2DFingerprint(mol2, sigFactory)

    similarityPos = DataStructs.FingerprintSimilarity(
        fps, candidate_fp, metric=tanimoto)
    similarityNeg = DataStructs.FingerprintSimilarity(
        neg, candidate_fp, metric=tanimoto)

    # Single pre-formatted string: same space-separated output line
    print("%s %s %s" % (Chem.MolToSmiles(mol2), similarityPos, similarityNeg))
    margin = similarityPos - similarityNeg
    return similarityPos, margin
Exemplo n.º 3
0
def mols_similarity(ms_smiles=['CCOC', 'CCO', 'COC']):
    """
    Print a small similarity report for the first three SMILES given.

    The fingerprint of the first molecule is printed, followed by the
    FingerprintSimilarity of the pairs (0, 1), (0, 2), (1, 2) and (0, 0).
    At least three SMILES are required because the pairs are hard-coded.
    The default list is only read, never modified.
    """
    from rdkit import Chem
    from rdkit import DataStructs
    from rdkit.Chem.Fingerprints import FingerprintMols

    molecules = [Chem.MolFromSmiles(smiles) for smiles in ms_smiles]
    fps = [FingerprintMols.FingerprintMol(mol) for mol in molecules]

    print(fps[0])
    for left, right in ((0, 1), (0, 2), (1, 2), (0, 0)):
        print(DataStructs.FingerprintSimilarity(fps[left], fps[right]))
Exemplo n.º 4
0
def get_distance_func(name):
    """
    Look up the fingerprint builder and distance function for ``name``.

    Parameters
    ----------
    name : str
        'RDK/T' selects ``Chem.RDKFingerprint``; 'GOBI/T' selects
        ``Generate.Gen2DFingerprint`` with ``Gobbi_Pharm2D.factory``.

    Returns
    -------
    tuple
        ``(make_representation, distf)``. ``make_representation(chem)``
        fingerprints ``chem.mol``; ``distf(x, y)`` returns
        ``1.0 - DataStructs.FingerprintSimilarity(x, y)``.

    Raises
    ------
    ValueError
        if ``name`` is not one of the supported measures.
    """
    if name == 'RDK/T':
        make_representation = lambda chem: Chem.RDKFingerprint(chem.mol)
    elif name == 'GOBI/T':
        make_representation = lambda chem: Generate.Gen2DFingerprint(
            chem.mol, Gobbi_Pharm2D.factory)
    else:
        # Report the argument actually received; the previous message read
        # an undefined ``job`` object and failed with NameError instead.
        raise ValueError('Unknown similarity measure: %s' % (name,))

    # Both measures share the same distance: one minus the similarity.
    distf = lambda x, y: 1.0 - DataStructs.FingerprintSimilarity(x, y)
    return (make_representation, distf)
Exemplo n.º 5
0
def check_mol_similarity():
    """
    Print a fixed similarity demo for 'CCOC', 'CCO' and 'COC'.

    The fingerprint of the first molecule is printed, followed by the
    FingerprintSimilarity of the pairs (0, 1), (0, 2), (1, 2) and (0, 0).
    """
    from rdkit import Chem
    from rdkit import DataStructs
    from rdkit.Chem.Fingerprints import FingerprintMols

    molecules = [Chem.MolFromSmiles(smiles) for smiles in ('CCOC', 'CCO', 'COC')]
    fps = [FingerprintMols.FingerprintMol(mol) for mol in molecules]

    print(fps[0])
    for left, right in ((0, 1), (0, 2), (1, 2), (0, 0)):
        print(DataStructs.FingerprintSimilarity(fps[left], fps[right]))
Exemplo n.º 6
0
def getXNN(trainSmilesList, train, predEx, smilesAttrName, nameAttr, X,
           simType):
    """
    Summarise the similarity of one example to its X most similar
    training examples.

    Parameters
    ----------
    trainSmilesList : list
        Objects handed directly to the fingerprint functions, one per
        entry of ``train`` (presumably RDKit molecules despite the
        name -- TODO confirm against the caller).
    train : iterable
        Training examples; ``ex[nameAttr].value`` is used as the key of
        each example in the similarity dictionary.
    predEx : object
        Example to compare; ``predEx[smilesAttrName].value`` is parsed
        as SMILES.
    smilesAttrName, nameAttr : str
        Attribute names used to index ``predEx`` and the training
        examples.
    X : int
        Number of most similar training examples to keep.
    simType : str
        "Topological" or "MACCS" (FingerprintSimilarity) or "Morgan"
        (radius 2, DiceSimilarity).

    Returns
    -------
    tuple
        (median, std, min, max) of the top-X similarities, followed by
        ``getRespVar`` over the top X and over the top X // 2, each
        rounded to 3 decimals.

    Raises
    ------
    ValueError
        if ``simType`` is not implemented. Previously a message was
        printed and the function then failed on unbound variables.
    """
    if simType == "Topological":
        makeFp = FingerprintMols.FingerprintMol
        simFunc = DataStructs.FingerprintSimilarity
    elif simType == "Morgan":
        makeFp = lambda mol: AllChem.GetMorganFingerprint(mol, 2)
        simFunc = DataStructs.DiceSimilarity
    elif simType == "MACCS":
        makeFp = MACCSkeys.GenMACCSKeys
        simFunc = DataStructs.FingerprintSimilarity
    else:
        raise ValueError(
            "This type of sim is not implemented: %s" % (simType,))

    fpsTrain = [makeFp(x) for x in trainSmilesList]
    fp = makeFp(Chem.MolFromSmiles(predEx[smilesAttrName].value))

    simDict = {}
    simList = []
    for idx, ex in enumerate(train):
        sim = simFunc(fpsTrain[idx], fp)
        simDict[ex[nameAttr].value] = sim
        simList.append(sim)

    # Keep only the X highest similarities
    simList.sort(reverse=True)
    simList = simList[0:X]
    medSim = round(numpy.median(simList), 3)
    stdSim = round(numpy.std(simList), 3)
    minSim = round(min(simList), 3)
    maxSim = round(max(simList), 3)

    entropy = round(getRespVar(simList, simDict, train, nameAttr), 3)
    # Floor division keeps the slice bound an integer under true division
    entropyClosest = round(
        getRespVar(simList[0:X // 2], simDict, train, nameAttr), 3)

    return medSim, stdSim, minSim, maxSim, entropy, entropyClosest
Exemplo n.º 7
0
def pipe_sim_filter(stream,
                    query,
                    cutoff=0.8,
                    summary=None,
                    comp_id="pipe_sim_filter"):
    """Yield the records of `stream` whose similarity to the `query`
    Smiles is greater than or equal to `cutoff`.

    Records without a "mol" entry are skipped. If a record carries the
    field `FP_b64` (e.g. pre-calculated), that fingerprint is decoded and
    used; otherwise the fingerprint is generated on-the-fly from
    `rec["mol"]` (much slower). When `summary` is given, `summary[comp_id]`
    is set to the running number of yielded records each time a record
    passes. If the query Smiles cannot be parsed, an error is printed and
    nothing is yielded.

    NOTE(review): `FP_b64` is unpickled; only feed streams from a trusted
    source."""
    query_mol = Chem.MolFromSmiles(query)
    if not query_mol:
        message = "* {} ERROR: could not generate query from SMILES."
        print(message.format(comp_id))
        return

    n_passed = 0
    query_fp = FingerprintMols.FingerprintMol(query_mol)
    for rec in stream:
        if "mol" not in rec:
            continue

        # prefer the pre-defined fingerprint when the stream provides one
        mol_fp = (pickle.loads(b64.b64decode(rec["FP_b64"]))
                  if "FP_b64" in rec
                  else FingerprintMols.FingerprintMol(rec["mol"]))

        if DataStructs.FingerprintSimilarity(query_fp, mol_fp) >= cutoff:
            n_passed += 1
            if summary is not None:
                summary[comp_id] = n_passed
            yield rec
Exemplo n.º 8
0
 def test90BulkDistances(self):
     """
     Verify that the base similarity (tanimoto) works using an 18 fp
     regression panel of pubchem compounds (chosen to have different
     lengths: 5x2048, 5x1024, 5x512, 3x256).

     Every pairwise similarity (i < j) is compared, in order, against
     the stored reference values in ``pubchem_fps.dm.pkl``.
     """
     from rdkit import DataStructs
     import cPickle
     import os
     from rdkit import RDConfig
     dataDir = os.path.join(RDConfig.RDCodeDir, 'DataStructs', 'test_data')
     # Context managers close the data files; they used to be left open.
     with open(os.path.join(dataDir, 'pubchem_fps.pkl'), 'rb') as inF:
         fps = cPickle.load(inF)
     with open(os.path.join(dataDir, 'pubchem_fps.dm.pkl'), 'rb') as inF:
         dm = cPickle.load(inF)
     dmIdx = 0
     for i in range(len(fps)):
         # each entry is a (name, fingerprint) pair; the name is unused
         _, fpi = fps[i]
         for j in range(i + 1, len(fps)):
             _, fpj = fps[j]
             sim = DataStructs.FingerprintSimilarity(fpi, fpj)
             self.failUnless(feq(sim, dm[dmIdx]))
             dmIdx += 1
Exemplo n.º 9
0
def calc_appdom(training_set, out_model_dir):
    """
    Compute the applicability-domain radius of a training set and
    pickle it together with the training fingerprints.

    Every molecule in ``training_set`` is fingerprinted; the distance
    ``1 - FingerprintSimilarity`` is computed for every unordered pair,
    and the radius is the mean distance plus one standard deviation.

    Two files are written into ``out_model_dir``:
    ``training-FPs_<output_ext>.dat`` and ``AD-radius_<output_ext>.dat``.
    NOTE(review): ``output_ext`` is not defined in this function; it
    must exist in an enclosing scope -- confirm.

    Returns
    -------
    tuple
        (list of fingerprints, radius)
    """
    appdom_fps = [FingerprintMols.FingerprintMol(mol) for mol in training_set]

    pair_distances = []
    for first, fp_first in enumerate(appdom_fps):
        for fp_second in appdom_fps[first + 1:]:
            similarity = DataStructs.FingerprintSimilarity(fp_first, fp_second)
            pair_distances.append(1.0 - similarity)

    pair_distances = np.array(pair_distances)
    appdom_radius = np.mean(pair_distances) + np.std(pair_distances)

    # Persist fingerprints and radius for later prediction runs
    fps_path = out_model_dir + "/training-FPs_%s.dat" % output_ext
    with open(fps_path, 'wb') as handle:
        pickle.dump(appdom_fps, handle)

    radius_path = out_model_dir + "/AD-radius_%s.dat" % output_ext
    with open(radius_path, 'wb') as handle:
        pickle.dump(appdom_radius, handle)

    return appdom_fps, appdom_radius
Exemplo n.º 10
0
def TanimotoDistances(fp1, fps):
    """
    Return the row of distances between a reference fingerprint and a
    list of fingerprints.

    Parameters
    ----------
    fp1: rdkit fingerprint
        The reference fingerprint
    fps: list
        The rdkit fingerprints to compare against ``fp1``

    Returns
    -------
    tanimotorow: list
        One value per entry of ``fps``: ``1 - FingerprintSimilarity``,
        i.e. a distance (0 means identical fingerprints)
    """

    from rdkit import DataStructs  # fingerprint similarity

    return [
        1 - DataStructs.FingerprintSimilarity(fp1, other) for other in fps
    ]
Exemplo n.º 11
0
def tanimoto_similarity_average(generated_synt_valid_set : List[str]) -> float:
    '''
    Calculate the average of the similarity with respect to the RDK Fingerprint
    among all the generated syntactically valid molecules.

    SMILES that cannot be parsed are skipped. Every unordered pair (i, j)
    with i <= j contributes, so each molecule's similarity to itself is
    part of the average.

    Parameters
    ----------
    generated_synt_valid_set : list
        List of syntactically valid molecules generated (SMILES strings).

    Returns
    -------
    float
        Average similarity among all parsed molecules, or 0.0 when no
        molecule could be parsed (including an empty input list).
    '''
    parsed = (Chem.MolFromSmiles(smile) for smile in generated_synt_valid_set)
    fps = [Chem.RDKFingerprint(mol) for mol in parsed if mol]

    similarities = []
    for i, fp_i in enumerate(fps):
        # fps[i:] starts at fp_i itself, keeping the self-comparison
        for fp_j in fps[i:]:
            similarities.append(DataStructs.FingerprintSimilarity(fp_i, fp_j))

    # Nothing to average: return 0.0 instead of dividing by zero
    if not similarities:
        return 0.0
    return sum(similarities) / len(similarities)
Exemplo n.º 12
0
def getSimilarity(item1, item2):
    """
    Compare two (smiles, fingerprint) tuples.

    Returns a tuple of (smiles1, smiles2, similarity), where similarity
    is the FingerprintSimilarity of the two fingerprints.
    """
    smiles_first = item1[0]
    smiles_second = item2[0]
    score = DataStructs.FingerprintSimilarity(item1[1], item2[1])
    return (smiles_first, smiles_second, score)
def test_ligand_data(target, ligand_name, lig):
    """
    Check that a ligand's SMILES, its SDF file and ``lig.get_molecule()``
    describe the same molecule.

    The SMILES molecule (with explicit hydrogens added) and the first
    SDF record must have the same atom count, an RDK fingerprint
    similarity of 1.0, and a maximum common substructure covering all
    atoms and bonds. The SDF molecule converted with
    ``Molecule.from_rdkit`` must be isomorphic to ``lig.get_molecule()``.
    """
    smiles_mol = Chem.AddHs(Chem.MolFromSmiles(lig._data["smiles"][0]))

    sdf_path = os.path.join(
        targets.data_path,
        targets.get_target_dir(target),
        "02_ligands",
        ligand_name,
        "crd",
        f"{ligand_name}.sdf",
    )
    sdf_mol = Chem.SDMolSupplier(sdf_path, removeHs=False)[0]

    assert smiles_mol.GetNumAtoms() == sdf_mol.GetNumAtoms()

    # Coordinates are irrelevant for the comparisons below
    smiles_mol.RemoveAllConformers()
    sdf_mol.RemoveAllConformers()

    fp_similarity = DataStructs.FingerprintSimilarity(
        Chem.RDKFingerprint(smiles_mol), Chem.RDKFingerprint(sdf_mol))
    assert pytest.approx(1.0, 1e-9) == fp_similarity

    mcs = rdFMCS.FindMCS([smiles_mol, sdf_mol])
    assert mcs.numAtoms == smiles_mol.GetNumAtoms()
    assert mcs.numBonds == smiles_mol.GetNumBonds()

    expected = lig.get_molecule()
    converted = Molecule.from_rdkit(sdf_mol)
    assert Molecule.are_isomorphic(converted, expected)
Exemplo n.º 14
0
    def calcTani(i,fin_temp1_ori):
        """
        Build one row of MACCS-key similarities between a query SMILES
        and every SMILES in ``fin1`` (a free name defined outside this
        function).

        Parameters
        ----------
        i : row identifier; printed and returned unchanged
        fin_temp1_ori : str
            Query SMILES.

        Returns
        -------
        ``None`` when the query SMILES is the empty string; otherwise
        ``(i, col)`` where ``col`` has one entry per element of ``fin1``:
        the integer ``0`` when the entry equals the query string, the
        string ``'0'`` when the query or the entry cannot be used, and
        the FingerprintSimilarity otherwise.
        """
        print(i)
        if fin_temp1_ori == '':
            # An empty query produces no row at all
            return None

        molFile = Chem.MolFromSmiles(fin_temp1_ori)
        finger1 = None
        if molFile is not None:
            finger1 = AllChem.GetMACCSKeysFingerprint(molFile)

        col = []
        for fin_temp2_ori in fin1:
            if fin_temp1_ori == fin_temp2_ori:
                # identical strings are deliberately scored 0, not 1
                result = 0
            elif molFile is None or fin_temp2_ori == '':
                result = '0'
            else:
                try:
                    molFile2 = Chem.MolFromSmiles(fin_temp2_ori)
                    finger2 = AllChem.GetMACCSKeysFingerprint(molFile2)
                    result = DataStructs.FingerprintSimilarity(finger1, finger2)
                except Exception:
                    # unparsable entry: keep the placeholder, but let
                    # KeyboardInterrupt/SystemExit propagate
                    result = '0'
            col.append(result)
        return i, col
Exemplo n.º 15
0
def sim_i_j(row_i, row_j):
    """Return the Tanimoto similarity (a float) between the values stored
    under 'Fingerprint' in two dataframe rows."""
    fp_i = row_i['Fingerprint']
    fp_j = row_j['Fingerprint']
    return DataStructs.FingerprintSimilarity(
        fp_i, fp_j, metric=DataStructs.TanimotoSimilarity)
Exemplo n.º 16
0
def add_tanimoto_score(path_found_decoys):
    """
    Add a ``tanimoto`` column to the CSV at ``path_found_decoys``.

    For every row, the fingerprints of ``ligand_smile`` and
    ``zinc_smile`` are built with ``make_fingerprint(..., fp='fp2')`` and
    cached per ``ligand_id`` / ``zinc`` so each is computed once. The
    FingerprintSimilarity of the two is stored in the new column, and the
    CSV is overwritten in place.

    Returns
    -------
    The dataframe including the new column.
    """

    def _cached_fingerprint(cache, row, id_column, smiles_column):
        # Look the fingerprint up by id; build and remember it on a miss.
        try:
            return cache[row[id_column]]
        except KeyError:
            fingerprint = make_fingerprint(row[smiles_column], fp='fp2')
            cache[row[id_column]] = fingerprint
            return fingerprint

    found_decoys = pd.read_csv(path_found_decoys, index_col=0)
    total = len(found_decoys)
    log(f'Total rows: {total}')

    ligand_fps = dict()
    decoy_fps = dict()
    tanimotos = []
    for ix, row in found_decoys.iterrows():
        if ix % 100000 == 0:
            log(f'Added tanimoto scores to {ix} rows')
        ligand_fp = _cached_fingerprint(
            ligand_fps, row, 'ligand_id', 'ligand_smile')
        decoy_fp = _cached_fingerprint(decoy_fps, row, 'zinc', 'zinc_smile')
        tanimotos.append(
            DataStructs.FingerprintSimilarity(ligand_fp, decoy_fp))

    found_decoys = found_decoys.assign(tanimoto=tanimotos)
    found_decoys.to_csv(path_found_decoys)
    return found_decoys
    def pair_similiar_fcfp4(self, valid_smiles):
        """
        Summarise the pairwise similarity of a set of SMILES using
        feature-based Morgan bit vectors (radius 2, 1024 bits).

        Returns
        -------
        tuple
            ``(0, 0)`` (integers) when fewer than two SMILES are given;
            otherwise two strings: the fraction of pairs with similarity
            above 0.75 and the mean pairwise similarity.
        """
        if len(valid_smiles) < 2:
            return 0, 0

        molecules = [Chem.MolFromSmiles(smiles) for smiles in valid_smiles]
        fingerprints = [
            AllChem.GetMorganFingerprintAsBitVect(
                molecule, 2, nBits=1024, useFeatures=True)
            for molecule in molecules
        ]

        # every unordered pair (first < second), in row-major order
        similarities = []
        for first, fp_first in enumerate(fingerprints):
            for fp_second in fingerprints[first + 1:]:
                similarities.append(
                    DataStructs.FingerprintSimilarity(fp_first, fp_second))

        n_pairs = len(similarities)
        as_array = np.array(similarities)
        n_very_similar = as_array[as_array > 0.75].shape[0]
        very_similar_rate = n_very_similar / n_pairs
        mean_similarity = sum(similarities) / n_pairs
        return str(very_similar_rate), str(mean_similarity)
Exemplo n.º 18
0
def similarity_search(fps_db, smile):
    """Return the mean FingerprintSimilarity between the RDK fingerprint
    of ``smile`` and every fingerprint in ``fps_db``."""
    query_fp = RDKFingerprint(MolFromSmiles(smile))
    scores = np.array([
        DataStructs.FingerprintSimilarity(db_fp, query_fp)
        for db_fp in fps_db
    ])
    return scores.mean()
Exemplo n.º 19
0
def similarity(mol1, mol2, metric="Tanimoto"):
    """Compare similarity between ligands.

    Parameters
    ----------
    mol1 : str
        The smile code of a molecule
    mol2 : str
        The smile code of a molecule
    metric : str, default = 'Tanimoto'
        The ligand similarity metric; it is resolved by
        ``metric_fingerprints``. Documented options:
        Tanimoto, Dice, Russel, Cosine.

    Returns
    -------
    float
        The fingerprint similarity, or 0.0 when the similarity
        computation itself raises an exception.

    Examples
    --------
    >>> from deepunion import fingerprints
    >>> fingerprints.similarity("CC(C)OCC", "CCOCC")
    0.666666666666666

    References
    ----------
    https://www.rdkit.org/docs/GettingStartedInPython.html

    """
    ms = [Chem.MolFromSmiles(mol1), Chem.MolFromSmiles(mol2)]

    fps = [FingerprintMols.FingerprintMol(x) for x in ms]

    m = metric_fingerprints(metric)

    try:
        return DataStructs.FingerprintSimilarity(fps[0], fps[1], m)
    except Exception:
        # Best effort: a failed comparison counts as "not similar".
        # Catching Exception (not a bare except) lets KeyboardInterrupt
        # and SystemExit propagate.
        return 0.0
Exemplo n.º 20
0
def check_for_known(all_enz_df, mol_fingerprint, threshold):
    """
    Find rows whose product fingerprint is at least ``threshold``
    similar (Tanimoto) to ``mol_fingerprint``.

    Parameters
    ----------
    all_enz_df : pandas.DataFrame
        Input frame. Any existing 'Known', 'Mol' and 'Fingerprint'
        columns are dropped from it IN PLACE before it is passed to
        ``fingerprint_products``.
    mol_fingerprint : fingerprint
        Query fingerprint.
    threshold : float
        Minimum similarity for a row to count as known.

    Returns
    -------
    pandas.DataFrame or None
        Matching rows sorted by descending 'Known' similarity, or
        ``None`` (after printing a message) when no row qualifies.
    """
    stale_cols = [
        col for col in ('Known', 'Mol', 'Fingerprint')
        if col in all_enz_df.columns
    ]
    if stale_cols:
        all_enz_df.drop(columns=stale_cols, inplace=True)

    # fingerprint the input dataframe; the result must provide a
    # 'Fingerprint' column, which is read below
    input_df = fingerprint_products(all_enz_df)

    # Assign the whole column at once. The previous row-by-row chained
    # assignment (df['Known'].loc[i] = ...) is not guaranteed to write
    # back into the frame.
    input_df['Known'] = [
        DataStructs.FingerprintSimilarity(
            mol_fingerprint,
            product_fp,
            metric=DataStructs.TanimotoSimilarity)
        for product_fp in input_df['Fingerprint']
    ]

    known_df = input_df[input_df['Known'] >= threshold]

    if len(known_df) > 0:
        # sort into a new frame rather than in place on a filtered slice
        return known_df.sort_values(by='Known', ascending=False)

    # call to promiscuous search code here
    print('No known enzymes. Beginning promiscuous search.')
    return None
Exemplo n.º 21
0
def calc_similarity(compound_one, compound_two):
    """
    Return the fingerprint similarity of two compounds, cached
    symmetrically in the ``joint_sim`` dictionary.

    Compound properties are read from ``cg_props`` under the lower-cased
    compound name. Compounds whose "type" entries differ are assigned a
    similarity of 0.0 without computing fingerprints; otherwise the
    FingerprintSimilarity of the fingerprints built from their "smiles"
    entries is used.

    ``joint_sim`` and ``cg_props`` are free names defined outside this
    function.
    """
    sims_one = joint_sim.setdefault(compound_one, dict())
    if compound_two in sims_one:
        return sims_one[compound_two]
    sims_two = joint_sim.setdefault(compound_two, dict())

    props_one = cg_props[compound_one.lower()]
    props_two = cg_props[compound_two.lower()]

    # Compare the two compounds' types; the previous check compared
    # compound_one with itself and therefore never triggered.
    if props_one["type"] != props_two["type"]:
        sims_one[compound_two] = 0.0
        sims_two[compound_one] = 0.0
        return 0.0

    from rdkit import DataStructs
    from rdkit.Chem.Fingerprints import FingerprintMols
    from rdkit import Chem

    mol_one = Chem.MolFromSmiles(str(props_one["smiles"]))
    mol_two = Chem.MolFromSmiles(str(props_two["smiles"]))
    fp_1 = FingerprintMols.FingerprintMol(mol_one)
    fp_2 = FingerprintMols.FingerprintMol(mol_two)
    similarity = DataStructs.FingerprintSimilarity(fp_1, fp_2)
    sims_one[compound_two] = similarity
    sims_two[compound_one] = similarity
    return similarity
Exemplo n.º 22
0
 def tanimoto_similarity(self):
     """Return the FingerprintSimilarity between the molecules parsed
     from ``self.smiles1`` and ``self.smiles2``."""
     first_mol = Chem.MolFromSmiles(self.smiles1)
     second_mol = Chem.MolFromSmiles(self.smiles2)
     first_fp = FingerprintMols.FingerprintMol(first_mol)
     second_fp = FingerprintMols.FingerprintMol(second_mol)
     return DataStructs.FingerprintSimilarity(first_fp, second_fp)
Exemplo n.º 23
0
 def _initTopN(self):
     """Create ``self.topN`` (capacity ``self.numToGet``) and insert every
     object of ``self.dataSource`` with its similarity to ``self.probe``
     under ``self.metric``."""
     self.topN = TopNContainer.TopNContainer(self.numToGet)
     for candidate in self.dataSource:
         candidate_fp = self.fingerprinter(candidate)
         score = DataStructs.FingerprintSimilarity(
             candidate_fp, self.probe, self.metric)
         self.topN.Insert(score, candidate)
Exemplo n.º 24
0
def mols_similarity_base_return(ms_smiles_mid,
                                ms_smiles_base,
                                property_of_base=None):
    """
    Build the similarity matrix A between "mid" and "base" molecules.

    Fingerprints come from ``mols_similarity_base_core``. ``A[mx, bx]``
    is the FingerprintSimilarity between base molecule ``bx`` and mid
    molecule ``mx``; each pair and its value are printed.

    Parameters
    ----------
    ms_smiles_mid, ms_smiles_base : dict
        Mappings whose keys are used as labels in the printed output.
    property_of_base : sequence, optional
        When truthy, ``b[bx] = property_of_base[bx]`` is filled in.

    Returns
    -------
    ``(A, b)`` when ``property_of_base`` is truthy, otherwise only ``A``.
    (For A * w = b, only A and b are produced here; w is not computed.)
    """
    from rdkit import DataStructs

    fps_base, fps_mid = mols_similarity_base_core(ms_smiles_mid,
                                                  ms_smiles_base)

    # Materialise the keys once: dict views cannot be indexed on Python 3
    base_names = list(ms_smiles_base.keys())
    mid_names = list(ms_smiles_mid.keys())

    Nb, Nm = len(fps_base), len(fps_mid)
    A = np.zeros((Nm, Nb))
    b = np.zeros(Nb)

    for (bx, f_b) in enumerate(fps_base):
        for (mx, f_m) in enumerate(fps_mid):
            print("Base:{0}, Target:{1}".format(base_names[bx],
                                                mid_names[mx]))
            A[mx, bx] = DataStructs.FingerprintSimilarity(f_b, f_m)
            print(A[mx, bx])
        if property_of_base:
            b[bx] = property_of_base[bx]
            print(b[bx])

    if property_of_base:
        print("b is obtained.")
        return A, b
    else:
        return A
Exemplo n.º 25
0
    def __init__(self, moli, molj):
        """
        Initialization function

        Stores both molecules, computes their fingerprints and the
        FingerprintSimilarity between them (``self.fps_tan``).

        Parameters
        ----------

        moli : RDKit molecule object
            the first molecule used to perform the fingerprint calculation
        molj : RDKit molecule object
            the second molecule used to perform the fingerprint calculation

        Notes
        -----
        ``options`` is not a parameter: ``options.verbose`` is read from
        an enclosing scope. Unless it equals 'pedantic', the RDKit
        logger level is set to CRITICAL.
        """

        # Configure logging level and format
        logging.basicConfig(level=logging.INFO,
                            format='%(levelname)s:\t%(message)s')

        # Keep references to the passed molecules
        self.moli, self.molj = moli, molj

        if options.verbose != 'pedantic':
            rdkit_logger = RDLogger.logger()
            rdkit_logger.setLevel(RDLogger.CRITICAL)

        fingerprint = FingerprintMols.FingerprintMol
        self.fps_moli = fingerprint(self.moli)
        self.fps_molj = fingerprint(self.molj)
        self.fps_tan = DataStructs.FingerprintSimilarity(
            self.fps_moli, self.fps_molj)
Exemplo n.º 26
0
def evaluate_similarity_method(dataset, resultsdir):
    """
    Write, for every molecule group of ``dataset`` and every fingerprint
    in ``flib.fpdict``, the similarities between the group's first
    molecule and the remaining ones.

    Fingerprints named "ap" or "tt", or starting with "ecfc" / "fcfc",
    are compared with DiceSimilarity; all others with
    FingerprintSimilarity. Results go through
    ``Writer(resultsdir).write_result(name, similarities, is_first)``,
    where ``is_first`` is True only for the first group.

    ``resultsdir`` is created when it does not exist yet.
    """
    # Setup results dir
    if not os.path.isdir(resultsdir):
        os.mkdir(resultsdir)

    writer = Writer(resultsdir)
    for i, d in enumerate(get_rdkitmols(dataset)):
        # .items() works on both Python 2 and 3 (iteritems is py2-only)
        for fpName, fpCalculator in flib.fpdict.items():
            if fpName in ("ap", "tt") or fpName.startswith(("ecfc", "fcfc")):
                similarity = DataStructs.DiceSimilarity
            else:
                similarity = DataStructs.FingerprintSimilarity

            ref_fp = fpCalculator(d[0])
            tanimotos = [
                similarity(ref_fp, fpCalculator(smol)) for smol in d[1:]
            ]
            writer.write_result(fpName, tanimotos, i == 0)
Exemplo n.º 27
0
def build_similarity_matrix():
    """
    Build and save the pairwise Tanimoto similarity matrix of the
    molecules listed (one SMILES per line) in ``all_opt_smiles.txt``.

    The symmetric matrix (diagonal 1.0) is saved to
    ``all_opt_scores.npy`` and its element-wise square to
    ``opt_scores_squared.npy``. All paths are hard-coded.

    Returns
    -------
    list of int
        Indices of the molecules containing at least one atom whose
        atomic number is in ``rarest_elements``.
    """
    with open("/home/jeherr/tensorchem/tmp/all_opt_smiles.txt", "r") as f:
        opt_smiles = [line.strip("\n") for line in f]

    opt_mols = [Chem.MolFromSmiles(smile) for smile in opt_smiles]
    # Treated as atomic numbers (B, Si, P, Se, Br, I)
    rarest_elements = {5, 14, 15, 34, 35, 53}
    # Compare atomic numbers: the previous test checked Atom objects
    # against the integers, which never matches.
    keep_idx = [
        i for i, mol in enumerate(opt_mols)
        if any(atom.GetAtomicNum() in rarest_elements
               for atom in mol.GetAtoms())
    ]
    opt_fps = [Chem.RDKFingerprint(mol) for mol in opt_mols]

    opt_scores = np.ones((len(opt_mols), len(opt_mols)))
    for i, fp1 in enumerate(opt_fps):
        # only the upper triangle is computed; it is mirrored below
        for j, fp2 in enumerate(opt_fps[i + 1:], start=i + 1):
            score = DataStructs.FingerprintSimilarity(
                fp1, fp2, metric=DataStructs.TanimotoSimilarity)
            opt_scores[i, j] = opt_scores[j, i] = score

    np.save("/home/jeherr/tensorchem/tmp/all_opt_scores.npy", opt_scores)
    opt_scores = np.square(opt_scores)
    np.save("/home/jeherr/tensorchem/tmp/opt_scores_squared.npy", opt_scores)
    return keep_idx
Exemplo n.º 28
0
def _mols_similarity_base_r0(ms_smiles_mid, ms_smiles_base):
    """
    Print the FingerprintSimilarity of every (base, mid) molecule pair.

    Input: dictionary type required such as {nick name: smiles code, ...}
    for both arguments. The nick names are used as labels in the output.
    """
    from rdkit import Chem
    from rdkit import DataStructs
    from rdkit.Chem.Fingerprints import FingerprintMols

    # Lists of keys: dict views can neither be concatenated to a string
    # nor indexed.
    mid_names = list(ms_smiles_mid.keys())
    base_names = list(ms_smiles_base.keys())

    # processing for mid
    print("Target: " + ", ".join(str(name) for name in mid_names))
    ms_mid = [Chem.MolFromSmiles(m_sm) for m_sm in ms_smiles_mid.values()]
    fps_mid = [FingerprintMols.FingerprintMol(x) for x in ms_mid]

    # processing for base
    print("Base: " + ", ".join(str(name) for name in base_names))
    ms_base = [Chem.MolFromSmiles(m_sm) for m_sm in ms_smiles_base.values()]
    fps_base = [FingerprintMols.FingerprintMol(x) for x in ms_base]

    for (bx, f_b) in enumerate(fps_base):
        for (dx, f_d) in enumerate(fps_mid):
            print("Base:{0}, Target:{1}".format(base_names[bx],
                                                mid_names[dx]))
            print(DataStructs.FingerprintSimilarity(f_b, f_d))
Exemplo n.º 29
0
def compute_fraggle_similarity_for_subs(inMol,
                                        qMol,
                                        qSmi,
                                        qSubs,
                                        tverskyThresh=0.8):
    """
    Return ``(rdkit_sim, fraggle_sim)`` for an input molecule and a query.

    ``rdkit_sim`` is the Tanimoto similarity of the RDK fingerprints of
    ``qMol`` and ``inMol``. For ``fraggle_sim`` both molecules are passed
    through ``atomContrib(qSubs, mol, tverskyThresh)`` and fingerprinted
    again; the result is the larger of that similarity and ``rdkit_sim``.
    The modified query fingerprint is cached in ``modified_query_fps``
    under the key "<qSubs>_<qSmi>". If fingerprinting the modified input
    molecule fails, a message is written to stderr and ``fraggle_sim``
    is 0.0.
    """
    query_fp = Chem.RDKFingerprint(qMol, **rdkitFpParams)
    input_fp = Chem.RDKFingerprint(inMol, **rdkitFpParams)
    rdkit_sim = DataStructs.TanimotoSimilarity(query_fp, input_fp)

    # Build the modified query fingerprint only on a cache miss
    cache_key = "%s_%s" % (qSubs, qSmi)
    if cache_key not in modified_query_fps:
        modified_query = atomContrib(qSubs, qMol, tverskyThresh)
        modified_query_fps[cache_key] = Chem.RDKFingerprint(
            modified_query, **rdkitFpParams)
    modified_query_fp = modified_query_fps[cache_key]

    modified_input = atomContrib(qSubs, inMol, tverskyThresh)

    try:
        modified_input_fp = Chem.RDKFingerprint(modified_input,
                                                **rdkitFpParams)
        modified_sim = DataStructs.FingerprintSimilarity(
            modified_query_fp, modified_input_fp)
        fraggle_sim = max(modified_sim, rdkit_sim)
    except Exception:  # pragma: nocover
        smiles = Chem.MolToSmiles(modified_input, True)
        sys.stderr.write("Can't generate fp for: %s\n" % (smiles))
        fraggle_sim = 0.0

    return rdkit_sim, fraggle_sim
def morgan2_fp(SMILES, Library):
    """
    Compute the sorted pairwise similarities of a SMILES collection with
    radius-2 Morgan bit-vector fingerprints, plus summary statistics.

    Parameters
    ----------
    SMILES : iterable of str
        SMILES strings of the library members.
    Library : object
        Library label; stored as ``str(Library)`` in the summary table.

    Returns
    -------
    dict
        "sim": ascending list of all pairwise similarities;
        "y": ``np.arange(1, n + 1) / n`` for the n similarities;
        "df": one-row DataFrame with MIN, 1Q, MEDIAN, MEAN, 3Q, MAX, STD
        and Library. All statistics except MAX are rounded to 2 decimals.
    """
    mols = [Chem.MolFromSmiles(smiles) for smiles in SMILES]
    fps_Morgan = [AllChem.GetMorganFingerprintAsBitVect(mol, 2) for mol in mols]
    sim = sorted(
        DataStructs.FingerprintSimilarity(fp_b, fp_a)
        for fp_a, fp_b in it.combinations(fps_Morgan, 2)
    )
    # statistical values
    stat = {
        "MIN": [round(min(sim), 2)],
        # 1Q and MEDIAN were rounded to whole numbers (missing ndigits),
        # collapsing every value in [0, 1] to 0 or 1.
        "1Q": [round(np.percentile(sim, 25), 2)],
        "MEDIAN": [round(st.median(sim), 2)],
        "MEAN": [round(st.mean(sim), 2)],
        "3Q": [round(np.percentile(sim, 75), 2)],
        "MAX": [max(sim)],
        "STD": [round(st.stdev(sim), 2)],
        "Library": [str(Library)],
    }
    df = pd.DataFrame.from_dict(stat)
    fp_result = {"sim": sim, "y": np.arange(1, len(sim) + 1) / len(sim), "df": df}
    return fp_result