예제 #1
0
def test_distance():
    """Check MHFP distances between the module-level ``fp`` and three molecules.

    ``fp`` is a fingerprint defined at module level outside this function; it
    is expected to be identical to ``fp_b`` (distance 0.0).
    """
    def _encode(smi):
        # Shingle the sanitized SMILES, then MinHash the shingling.
        shingles = mhfp_encoder.shingling_from_smiles(smi, sanitize=True)
        return mhfp_encoder.from_molecular_shingling(shingles)

    fp_a = _encode('CCOC1=C(C=C(C=C1)S(=O)(=O)N(C)C)C2=NC(=O)C3=C(N2)C(=NN3C)C(C)(C)C')
    fp_b = _encode('CCCC1=NN(C2=C1NC(=NC2=O)C3=C(C=CC(=C3)S(=O)(=O)N4CCN(CC4)C)OCC)C')
    fp_c = _encode('O=C(OC)C(C1CCCCN1)C2=CC=CC=C2')

    expectations = (
        (fp_b, 0.0),
        (fp_a, 0.45849609375),
        (fp_c, 0.97216796875),
    )
    for other, expected in expectations:
        assert MHFPEncoder.distance(fp, other) == expected
예제 #2
0
def GetMHFP6(mol, nBits=2048, radius=3):
    """
    MHFP6: radius=3

    Encodes ``mol`` with an ``MHFPEncoder`` built with ``n_permutations=nBits``
    (rings and kekulization enabled, minimum radius 1), folds the hash values
    to ``nBits`` positions and returns the folded array cast to ``bool``.
    """
    mhfp = MHFPEncoder(n_permutations=nBits)
    shingling_options = dict(radius=radius,
                             rings=True,
                             kekulize=True,
                             min_radius=1)
    minhash = mhfp.encode_mol(mol, **shingling_options)
    return mhfp.fold(minhash, nBits).astype(bool)
예제 #3
0
파일: map4.py 프로젝트: iwatobipen/map4
    def __init__(self, dimensions=1024, radius=2, is_counted=False, is_folded=False):
        """
        MAP4 calculator class

        Stores ``radius``, ``is_counted`` and ``is_folded`` on the instance and
        builds the encoder: ``MHFPEncoder(dimensions)`` when ``is_folded`` is
        truthy, ``tm.Minhash(dimensions)`` otherwise.
        """
        self.radius, self.is_counted, self.is_folded = radius, is_counted, is_folded

        # Only the selected encoder class is looked up and instantiated.
        encoder_factory = MHFPEncoder if self.is_folded else tm.Minhash
        self.encoder = encoder_factory(dimensions)
예제 #4
0
def CalculateSmilesExtendedConnectivityFingerprint(
        mol: Chem.Mol,
        radius: int = 2,
        rtype: str = 'bitstring',
        bits: int = 2048) -> Tuple[str, dict, Any]:
    """Calculate SMILES extended connectivity fingerprint (SECFP), doi: 10.1186/s13321-018-0321-8.

    :param mol: molecule to fingerprint.
    :param radius: maximum radius of atom-centered substructures.
    :param rtype: Type of output, may either be:
                  bitstring (default), returns a binary string
                  numpy, return the underlying numpy array
                  rdkit, return the native rdkit DataStructs
                  dict, for a dict of bits turned on (``{bit_index: 1}``)
    :param bits: Number of folded bits; used as the fingerprint length for
                 every ``rtype``.
    :return: the fingerprint in the representation selected by ``rtype``.
             NOTE(review): the return annotation reads as a tuple but exactly
             one of these representations is returned; it is kept as-is so the
             signature stays unchanged.
    """
    secfp = MHFPEncoder.secfp_from_mol(mol,
                                       length=bits,
                                       radius=radius,
                                       rings=True,
                                       kekulize=True,
                                       min_radius=1)
    if rtype == 'numpy':
        return secfp

    # Positions of the set bits. The dict output must be keyed by these
    # positions; keying by the element values themselves collapses every set
    # bit onto the same key.
    on_bits = [idx for idx, value in enumerate(secfp.tolist()) if value != 0]
    if rtype == 'dict':
        return {idx: 1 for idx in on_bits}

    bv = DataStructs.ExplicitBitVect(bits)
    bv.SetBitsFromList(on_bits)
    if rtype == 'rdkit':
        return bv
    return bv.ToBitString()
예제 #5
0
def convert(subset):
    """Write three fingerprint files for one ChEMBL SMILES subset.

    Reads the first space-separated column of
    ``/cluster/chembl/chembl.<subset>.smi`` and writes, one comma-separated
    fingerprint per parsable molecule and line:

    * ``.mhfp6``   -- ``MHFPEncoder.encode_mol`` output
    * ``.mhecfp4`` -- MinHash of the non-zero Morgan (radius 2) feature ids
    * ``.ecfp4``   -- 2048-bit Morgan (radius 2) bit vector

    Rows whose SMILES cannot be parsed are skipped.
    """
    prefix = '/cluster/chembl/chembl.' + str(subset)
    actives = pd.read_csv(prefix + '.smi', sep=' ', usecols=[0], header=None)

    mh = MHFPEncoder()

    def _mhfp6(mol):
        return mh.encode_mol(mol)

    def _mhecfp4(mol):
        morgan = AllChem.GetMorganFingerprint(mol, 2)
        return mh.from_sparse_array([*morgan.GetNonzeroElements()])

    def _ecfp4(mol):
        return AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048)

    # The files are produced one after another, each with its own full pass
    # over the input (molecules are parsed afresh for every pass).
    outputs = (('.mhfp6', _mhfp6), ('.mhecfp4', _mhecfp4), ('.ecfp4', _ecfp4))
    for extension, fingerprint in outputs:
        with open(prefix + extension, 'w+') as out:
            for _, record in actives.iterrows():
                mol = AllChem.MolFromSmiles(record[0])
                if not mol:
                    continue
                out.write(','.join(map(str, fingerprint(mol))) + '\n')
예제 #6
0
    def __init__(self,
                 dimensions=1024,
                 radius=2,
                 is_counted=False,
                 is_folded=False,
                 return_strings=False):
        """
        Parameters
        ----------
        dimensions : int
            (default = 1024)
            Number of entries in the output map4 fingerprint.

        radius : int
            (default = 2)
            Number of bonds away from atom centre to consider.

        is_counted : bool
            (default = False)

        is_folded : bool
            (default = False)
            If True an ``MHFPEncoder`` is used as encoder, otherwise a
            ``tm.Minhash``.

        return_strings : bool
            (default = False)
            If True then returns substructure strings rather than hashed fingerprint.
        """
        # Normalise the configuration once so later code can rely on the types.
        self.dimensions = int(dimensions)
        self.radius = int(radius)
        self.is_counted = bool(is_counted)
        self.is_folded = bool(is_folded)
        self.return_strings = bool(return_strings)

        # Build the encoder from the normalised value so that it always agrees
        # with ``self.dimensions`` (e.g. when ``dimensions`` arrives as a float
        # or a numeric string).
        if self.is_folded:
            self.encoder = MHFPEncoder(self.dimensions)
        else:
            self.encoder = tm.Minhash(self.dimensions)
예제 #7
0
def CalculateMinHashFingerprint(mol: Chem.Mol,
                                radius: int = 3,
                                rtype: str = 'bitstring',
                                bits: int = 2048) -> Tuple[str, dict, Any]:
    """Calculate the MinHash Fingerprint (MHFP) of molecule.

    doi: 10.1186/s13321-018-0321-8.
    :param mol: molecule to fingerprint.
    :param radius: maximum radius of atom-centered substructures.
    :param rtype: Type of output, may either be:
                  bitstring (default), returns a binary string
                  numpy, return the underlying numpy array
                  dict, for a dict keyed by the hash values
    :param bits: Number of folded bits (ignored if rtype != 'bitstring')
    """
    encoder = MHFPEncoder()
    minhash = encoder.hash(encoder.shingling_from_mol(mol, radius, True, True, 1))

    if rtype == 'numpy':
        return minhash
    if rtype == 'dict':
        return dict.fromkeys(minhash.tolist(), 1)

    # Any other rtype: fold to ``bits`` positions and join them into a string.
    return ''.join(str(bit) for bit in encoder.fold(minhash, bits))
예제 #8
0
  def _get_knn(query_mhfp, ann, k, data):
    """ Brute-force search for selecting k nearest neighbors from k * kc  approximate nearest neighbors.

    Keyword arguments:
        query_mhfp {numpy.ndarray} -- The query MHFP fingerprint.
        ann {list} -- A list of indices of approximate nearest neighbors of size k * kc to be brute-force searched
        k {int} -- The number of nearest neighbors to be returned from the approximate nearest neighbors
        data {dict} -- The MHFP values indexed with the same key supplied to add()
    """

    dists = []

    for index in ann:
        dists.append((index, 1.0 - MHFPEncoder.distance(query_mhfp, data[index])))
    
    dists.sort(key=itemgetter(1), reverse=True)
    return [x[0] for x in dists[:k]]
예제 #9
0
def similarity_search(chos_fp, db, mol, number_of_hits, lf1, lf2, lf3, findID1,
                      findID2, findID3):
    """returns n hits of the given query ligand

    Arguments:
        chos_fp {string} -- 'MAP4' uses calc_map, any other value uses calc_mhfp
        db {string} -- 'ChEMBL' selects lf1/findID1, 'SwissProt' selects
            lf2/findID2, any other value selects lf3/findID3
        mol -- query molecule, passed to the fingerprint helper
        number_of_hits {integer} -- number of required hits (converted with int())
        lf1, lf2, lf3 -- search indexes providing query_linear_scan
        findID1, findID2, findID3 -- lookup tables indexed by the second
            element of each hit

    Returns:
        list -- one entry per hit:
            [entry[1] split on ';', entry[0], first hit element rounded to
            3 decimals, entry[2]] where entry is the lookup-table record
    """
    if db == 'ChEMBL':
        lf, findID = lf1, findID1
    elif db == 'SwissProt':
        lf, findID = lf2, findID2
    else:
        lf, findID = lf3, findID3

    # The encoder is created up front, whichever fingerprint is chosen.
    mhfp_encoder = MHFPEncoder(512)
    fp = calc_map(mol) if chos_fp == 'MAP4' else calc_mhfp(mhfp_encoder, mol)

    results = []
    for hit in lf.query_linear_scan(fp, int(number_of_hits)):
        entry = findID[hit[1]]
        results.append(
            [entry[1].split(';'), entry[0], round(hit[0], 3), entry[2]])
    return results
예제 #10
0
def LSH_Convert(mols, outpath, num_workers):
    """Fingerprint ``mols``, index them in an LSH forest and save both to ``outpath``.

    The fingerprints are pickled to ``fps.pickle`` and the forest is stored as
    ``lf.dat`` inside ``outpath``. The indexed forest is returned.
    """
    # MinHash fingerprint (mhfp) encoder and the LSH forest that indexes its output.
    encoder = MHFPEncoder(1024)
    forest = tm.LSHForest(1024, 64)

    print("Number of mols to be hashed:", len(mols))
    fps = process_map(encoder.encode_mol,
                      mols,
                      chunksize=100,
                      max_workers=num_workers)

    forest.batch_add([tm.VectorUint(fp) for fp in fps])
    forest.index()

    # Persist the raw fingerprints first, then the forest.
    fingerprint_path = os.path.join(outpath, "fps.pickle")
    forest_path = os.path.join(outpath, "lf.dat")
    with open(fingerprint_path, "wb") as fpfile:
        pickle.dump(fps, fpfile)
    forest.store(forest_path)
    print('LSH data files saved!')
    return forest
예제 #11
0
import pytest
import numpy as np
from scipy.spatial.distance import jaccard
from rdkit.Chem import AllChem
from mhfp.encoder import MHFPEncoder
from mhfp.lsh_forest import LSHForestHelper

# Keeping tests barebone and simple

# Shared, module-level fixtures: one encoder and one LSH forest helper.
mhfp_encoder = MHFPEncoder()
lfh = LSHForestHelper()

# MHFP fingerprints of every parsable molecule in the SMILES file.
drugbank = []

with open('test/drugbank.smi') as f:
    for line in f.readlines():
        # The SMILES string is the first whitespace-separated token of the line.
        mol = AllChem.MolFromSmiles(line.strip().split()[0])
        # Lines that do not yield a molecule are skipped.
        if mol:
            drugbank.append(mhfp_encoder.encode_mol(mol))

# Add every fingerprint to the helper under its position in ``drugbank``.
for i, fp in enumerate(drugbank):
    lfh.add(i, fp)

# Build the index once all fingerprints have been added.
lfh.index()


def test_setup():
    """The module-level fixture must hold 226 fingerprints."""
    fingerprint_count = len(drugbank)
    assert fingerprint_count == 226


def test_add():
예제 #12
0
        type=int,
        help="Number of workers (CPU cores) to use for multiprocessing,\
                        default to the number of available CPU cores minus one",
        default=os.cpu_count() - 1)
    # Size handed to both the fingerprint encoder and the LSH forest below.
    parser.add_argument("-d",
                        "--dim",
                        type=int,
                        help="Fingerprint dimension, default to 1024",
                        default=1024)

    a = parser.parse_args()
    outpath = os.path.abspath(a.output)
    mols = file_to_mols(a.filename)

    # Define a named properties tuple.
    # To pickle a named tuple correctly:
    ## 1) The named tuple object has to be declared under __main__
    ## 2) The variable the named tuple is assigned to has to match
    ##    the type name given in quotation marks.
    Props = namedtuple('Props', ['SMILES', 'MolWt', 'LogP', 'QED', 'SAS'])

    # MinHash fingerprint (mhfp) encoder, a specialized molecular fingerprint scheme.
    enc = MHFPEncoder(a.dim)
    # Locality Sensitive Hashing forest.
    lf = tm.LSHForest(a.dim, 64)

    # NOTE(review): enc and lf are not passed to this call; presumably
    # MolsToLSHForest picks them up as globals -- confirm.
    MolsToLSHForest(mol_list=mols,
                    save_path=outpath,
                    worker=a.worker,
                    batch_size=a.batch)
예제 #13
0
파일: test.py 프로젝트: reymond-group/mhfp
from rdkit.Chem import AllChem

# config = mstmap.LayoutConfiguration()
# # config.merger = mstmap.Merger.Solar
# # print(config)

# # TODO: Fails for disconnected components!
# u = mstmap.VectorUint([0, 1, 2, 3, 4])
# v = mstmap.VectorUint([1, 2, 0, 4, 3])
# w = mstmap.VectorFloat([1.0, 1.0, 1.0, 2.0, 6.0])
# x, y = mstmap.layout(5, u, v, config, w)

# print(x)
# print(y)

# Encoder used for every fingerprint computed below.
enc = MHFPEncoder(512)

# Collected fingerprints; filled by the code that follows.
fps = []

if not os.path.isfile('fps.dat'):
    with open('drugbank.smi', 'r') as f:
        i = 0
        for line in f:
            smiles = line.split()[0].strip()
            mol = AllChem.MolFromSmiles(smiles)
            if mol:
                fps.append(enc.encode_mol(mol))
            i += 1
            if i > 2000: break
    pickle.dump(fps, open('fps.dat', 'wb'))
else:
예제 #14
0
파일: map4.py 프로젝트: iwatobipen/map4
class MAP4Calculator:
    """Calculator for the atom pair minhashed (MAP4) fingerprint of RDKit molecules."""

    def __init__(self, dimensions=1024, radius=2, is_counted=False, is_folded=False):
        """
        MAP4 calculator class

        Arguments:
            dimensions -- size passed to the encoder; also the length folded
                          fingerprints are folded to
            radius -- largest atom environment radius that is shingled
            is_counted -- if True, every shingle gets its running occurrence
                          count appended
            is_folded -- if True, shingles are hashed and folded with
                         MHFPEncoder; otherwise they are minhashed with
                         tm.Minhash
        """
        # Kept so that _fold() can fold to the requested length.
        self.dimensions = dimensions
        self.radius = radius
        self.is_counted = is_counted
        self.is_folded = is_folded

        if self.is_folded:
            self.encoder = MHFPEncoder(dimensions)
        else:
            self.encoder = tm.Minhash(dimensions)

    def calculate(self, mol):
        """Calculates the atom pair minhashed fingerprint

        Arguments:
            mol -- rdkit mol object

        Returns:
            tmap VectorUint -- minhashed fingerprint
            (the folded fingerprint instead when is_folded is set)
        """

        atom_env_pairs = self._calculate(mol)
        if self.is_folded:
            return self._fold(atom_env_pairs)
        return self.encoder.from_string_array(atom_env_pairs)

    def calculate_many(self, mols):
        """ Calculates the atom pair minhashed fingerprint

        Arguments:
            mols -- list of mols

        Returns:
            list of tmap VectorUint -- minhashed fingerprints list
            (a list of folded fingerprints instead when is_folded is set)
        """

        atom_env_pairs_list = [self._calculate(mol) for mol in mols]
        if self.is_folded:
            return [self._fold(pairs) for pairs in atom_env_pairs_list]
        return self.encoder.batch_from_string_array(atom_env_pairs_list)

    def _calculate(self, mol):
        """Return the list of atom-pair shingles of ``mol``."""
        return self._all_pairs(mol, self._get_atom_envs(mol))

    def _fold(self, pairs):
        """Hash the unique shingles and fold them to ``self.dimensions`` positions."""
        fp_hash = self.encoder.hash(set(pairs))
        # Pass the length explicitly; without it the encoder falls back to its
        # own default and the ``dimensions`` argument would be ignored.
        return self.encoder.fold(fp_hash, self.dimensions)

    def _get_atom_envs(self, mol):
        """Map every atom index to its environment SMILES for radius 1..self.radius."""
        atoms_env = {}
        for atom in mol.GetAtoms():
            idx = atom.GetIdx()
            for radius in range(1, self.radius + 1):
                atoms_env.setdefault(idx, []).append(
                    MAP4Calculator._find_env(mol, idx, radius))
        return atoms_env

    @classmethod
    def _find_env(cls, mol, idx, radius):
        """Return the SMILES of the environment of atom ``idx``, rooted at that atom.

        An empty string is returned when the atom is not part of the extracted
        sub-molecule.
        """
        env = rdmolops.FindAtomEnvironmentOfRadiusN(mol, radius, idx)
        atom_map = {}

        submol = Chem.PathToSubmol(mol, env, atomMap=atom_map)
        if idx in atom_map:
            smiles = Chem.MolToSmiles(submol, rootedAtAtom=atom_map[idx], canonical=True, isomericSmiles=False)
            return smiles
        return ''

    def _all_pairs(self, mol, atoms_env):
        """Build the unique ``env|distance|env`` shingles for every atom pair and radius.

        The two environments are sorted so a pair yields the same shingle
        regardless of atom order. Shingles are returned UTF-8 encoded.
        """
        atom_pairs = []
        distance_matrix = GetDistanceMatrix(mol)
        num_atoms = mol.GetNumAtoms()
        shingle_dict = defaultdict(int)
        for idx1, idx2 in itertools.combinations(range(num_atoms), 2):
            dist = str(int(distance_matrix[idx1][idx2]))

            for i in range(self.radius):
                env_a = atoms_env[idx1][i]
                env_b = atoms_env[idx2][i]

                ordered = sorted([env_a, env_b])

                shingle = '{}|{}|{}'.format(ordered[0], dist, ordered[1])

                if self.is_counted:
                    # Count on the plain shingle, then tag it with that count.
                    shingle_dict[shingle] += 1
                    shingle += '|' + str(shingle_dict[shingle])

                atom_pairs.append(shingle.encode('utf-8'))
        return list(set(atom_pairs))
예제 #15
0
def test_secfp():
    """SECFP computed from the module-level ``smiles`` must equal the reference ``secfp``."""
    computed = MHFPEncoder.secfp_from_smiles(smiles)
    assert np.array_equal(computed, secfp)
예제 #16
0
import pytest
import numpy as np
from scipy.spatial.distance import jaccard
from rdkit.Chem import AllChem
from mhfp.encoder import MHFPEncoder

# Keeping tests barebone and simple

# Shared fixtures: the encoder under test, a reference SMILES string and the
# molecule parsed from it.
mhfp_encoder = MHFPEncoder()
smiles = 'CCCC1=NN(C2=C1NC(=NC2=O)C3=C(C=CC(=C3)S(=O)(=O)N4CCN(CC4)C)OCC)C'
mol = AllChem.MolFromSmiles(smiles)
shingling = sorted([
    b'c1cnnc1', b'Cn(nc)c(c)c', b'C(C)Oc', b'c1(OCC)ccccc1-c([nH])n',
    b'S(c)(N)(=O)=O', b'c(nc)([nH]c)-c(c)c', b'CN(C)C', b'n(c(-c)[nH])c(c)=O',
    b'C(C)O', b'N(C)(C)S', b'S(=O)(=O)(c(cc)cc)N(CC)CC', b'CC',
    b'c1(CCC)nn(C)c(c)c1[nH]c', b'c1(S(=O)(=O)N(C)C)cccc(-c)c1', b'N(C)(C)C',
    b'c(cc)c(c)S', b'N1(S(=O)(=O)c(c)c)CCNCC1', b'c(c)(c)[nH]',
    b'O=c(nc)c(c)n', b'N(C)(CC)CC', b'n(c(c)C)n(c)C', b'C1CNCCN1', b'c1ccccc1',
    b'C(C)N', b'n(c)(C)n', b'C(CC)c(c)n', b'c(c(c)-c)c(c)S', b'O(c)C', b'CCO',
    b'CN(CC)CC', b'[nH](c)c', b'n(c)c', b'n1c(-c(c)c)[nH]cc(n)c1=O',
    b'N(CC)(CC)S(c)(=O)=O', b'C(CN)N(C)C', b'S(=O)(=O)(c(c)c)N(C)C',
    b'O(CC)c(cc)c(c)-c', b'C(C)Oc(c)c', b'C(C)Cc', b'C(CN)N(C)S',
    b'c1(-c(nc)[nH]c)cc(S)ccc1OC', b'c(cc)(OC)c(c)-c',
    b'n1c(CC)c([nH])c(c)n1C', b'c(c)(c)-c', b'c([nH]c)(c(C)n)c(c)n',
    b'c(c)(c)O', b'c1cc(O)ccc1S(N)(=O)=O', b'O=S(c)(N)=O',
    b'c12[nH]c(-c)nc(=O)c1n(C)nc2CC', b'c(-c)([nH])n',
    b'c1(=O)nc(-c)[nH]c(c)c1n(C)n', b'c1cc(S)cc(-c)c1OC', b'O(CC)c(c)c',
    b'c(c)(c)n', b'C(C)C', b'n(c)n', b'CCOc', b'c(cc)(-c([nH])n)c(c)O',
    b'c(CC)(nn)c(c)[nH]', b'c(c)(c)S', b'CCC', b'N1(C)CCNCC1',
    b'c(c(n)=O)(c(c)[nH])n(C)n', b'n(C)(nc)c(c)c', b'c(c)(n)=O', b'Cn(c)n',
    b'c(cc)(cc)S(N)(=O)=O', b'O=c(c)n', b'c(c)c', b'n1(C)nc(C)c([nH])c1c(n)=O',
예제 #17
0
class Map4Fingerprint:
    """Calculates the atom pair minhashed fingerprint for a given molecular object.
    Fingerprint is as described by `DOI: 10.1186/1758-2946-5-26` and implemented in the
    [corresponding repository](https://github.com/reymond-group/map4).
    """
    def __init__(self,
                 dimensions=1024,
                 radius=2,
                 is_counted=False,
                 is_folded=False,
                 return_strings=False):
        """
        Parameters
        ----------
        dimensions : int
            (default = 1024)
            Number of entries in the output map4 fingerprint.

        radius : int
            (default = 2)
            Number of bonds away from atom centre to consider.

        is_counted : bool
            (default = False)
            If True every shingle gets its running occurrence count appended.

        is_folded : bool
            (default = False)
            If True the shingles are hashed and folded with ``MHFPEncoder``;
            this takes precedence over ``return_strings``.

        return_strings : bool
            (default = False)
            If True then returns substructure strings rather than hashed fingerprint.
        """
        # Normalise the configuration once so later code can rely on the types.
        self.dimensions = int(dimensions)
        self.radius = int(radius)
        self.is_counted = bool(is_counted)
        self.is_folded = bool(is_folded)
        self.return_strings = bool(return_strings)

        # Build the encoder from the normalised value so that it always agrees
        # with ``self.dimensions`` (e.g. when ``dimensions`` arrives as a float
        # or a numeric string).
        if self.is_folded:
            self.encoder = MHFPEncoder(self.dimensions)
        else:
            self.encoder = tm.Minhash(self.dimensions)

    def __call__(self, mol):
        """Calculates the atom pair minhashed fingerprint for a given molecular object.
        Fingerprint is as described by `DOI: 10.1186/1758-2946-5-26` and implemented in the
        [corresponding repository](https://github.com/reymond-group/map4).

        Parameters
        ----------
        mol : rdkit.Chem.rdchem.Mol
            `rdkit` mol object.

        Returns
        -------
        fp_arr : np.ndarray
            The folded fingerprint when ``is_folded`` is set, otherwise the
            encoded shingle strings when ``return_strings`` is set, otherwise
            the minhashed fingerprint; always wrapped with ``np.asarray``.
        """
        atom_envs = self._get_atom_envs(mol)
        atom_env_pairs = self._all_pairs(mol, atom_envs)
        if self.is_folded:
            fp_arr = self._fold(atom_env_pairs)
        elif self.return_strings:
            fp_arr = atom_env_pairs
        else:
            fp_arr = self.encoder.from_string_array(atom_env_pairs)
        return np.asarray(fp_arr)

    def _fold(self, pairs):
        """Hash the unique shingles and fold them to ``self.dimensions`` positions."""
        fp_hash = self.encoder.hash(set(pairs))
        return self.encoder.fold(fp_hash, self.dimensions)

    def _get_atom_envs(self, mol):
        """Map every atom index to its environment SMILES for radius 1..self.radius."""
        atoms_env = {}
        for atom in mol.GetAtoms():
            idx = atom.GetIdx()
            for radius in range(1, self.radius + 1):
                if idx not in atoms_env:
                    atoms_env[idx] = []
                atoms_env[idx].append(
                    Map4Fingerprint._find_env(mol, idx, radius))
        return atoms_env

    @classmethod
    def _find_env(cls, mol, idx, radius):
        """Return the SMILES of the environment of atom ``idx``, rooted at that atom.

        An empty string is returned when the atom is not part of the extracted
        sub-molecule.
        """
        env = rdmolops.FindAtomEnvironmentOfRadiusN(mol, radius, idx)
        atom_map = {}

        submol = Chem.PathToSubmol(mol, env, atomMap=atom_map)
        if idx in atom_map:
            smiles = Chem.MolToSmiles(submol,
                                      rootedAtAtom=atom_map[idx],
                                      canonical=True,
                                      isomericSmiles=False)
            return smiles
        return ''

    def _all_pairs(self, mol, atoms_env):
        """Build the unique ``env|distance|env`` shingles for every atom pair and radius.

        The two environments are sorted so a pair yields the same shingle
        regardless of atom order. Shingles are returned UTF-8 encoded.
        """
        atom_pairs = []
        distance_matrix = GetDistanceMatrix(mol)
        num_atoms = mol.GetNumAtoms()
        shingle_dict = defaultdict(int)
        for idx1, idx2 in itertools.combinations(range(num_atoms), 2):
            dist = str(int(distance_matrix[idx1][idx2]))

            for i in range(self.radius):
                env_a = atoms_env[idx1][i]
                env_b = atoms_env[idx2][i]

                ordered = sorted([env_a, env_b])

                shingle = '{}|{}|{}'.format(ordered[0], dist, ordered[1])

                if self.is_counted:
                    # Count on the plain shingle, then tag it with that count.
                    shingle_dict[shingle] += 1
                    shingle += '|' + str(shingle_dict[shingle])

                atom_pairs.append(shingle.encode('utf-8'))
        return list(set(atom_pairs))
예제 #18
0
def main():
    """ The main function

    Reads ``drugbank.csv``, fingerprints every usable molecule with MHFP,
    collects a set of descriptors per molecule, lays the fingerprints out with
    tmap and writes an interactive Faerun plot named ``drugbank``.
    """
    df = pd.read_csv("drugbank.csv").dropna(subset=["SMILES"]).reset_index(
        drop=True)
    enc = MHFPEncoder()
    lf = tm.LSHForest(2048, 128)

    fps = []
    labels = []
    groups = []
    tpsa = []
    logp = []
    mw = []
    h_acceptors = []
    h_donors = []
    ring_count = []
    is_lipinski = []
    has_coc = []
    has_sa = []
    has_tz = []

    # Substructure queries behind the "Ethers", "Sulfonamides" and
    # "Tetrazoles" series.
    substruct_coc = AllChem.MolFromSmiles("COC")
    substruct_sa = AllChem.MolFromSmiles("NS(=O)=O")
    substruct_tz = AllChem.MolFromSmiles("N1N=NN=C1")

    total = len(df)
    for i, row in df.iterrows():
        if i % 1000 == 0 and i > 0:
            print(f"{round(100 * (i / total))}% done ...")

        # Columns are addressed by position. ``.iloc`` makes that explicit;
        # plain ``row[6]`` on a label-indexed Series relies on a deprecated
        # positional fallback.
        smiles = row.iloc[6]
        mol = AllChem.MolFromSmiles(smiles)

        # Keep parsable molecules with more than 5 atoms and at most two
        # '.'-separated components.
        if mol and mol.GetNumAtoms() > 5 and smiles.count(".") < 2:
            drugbank_id = row.iloc[0]
            name = row.iloc[1]

            fps.append(tm.VectorUint(enc.encode_mol(mol, min_radius=0)))
            labels.append(
                f'{smiles}__<a href="https://www.drugbank.ca/drugs/{drugbank_id}" target="_blank">{drugbank_id}</a>__{name}'
                .replace("'", ""))
            groups.append(row.iloc[3].split(";")[0])
            tpsa.append(Descriptors.TPSA(mol))
            logp.append(Descriptors.MolLogP(mol))
            mw.append(Descriptors.MolWt(mol))
            h_acceptors.append(Descriptors.NumHAcceptors(mol))
            h_donors.append(Descriptors.NumHDonors(mol))
            ring_count.append(Descriptors.RingCount(mol))
            is_lipinski.append(lipinski_pass(mol))
            has_coc.append(mol.HasSubstructMatch(substruct_coc))
            has_sa.append(mol.HasSubstructMatch(substruct_sa))
            has_tz.append(mol.HasSubstructMatch(substruct_tz))

    # Create the labels and the integer encoded array for the groups,
    # as they're categorical
    labels_groups, groups = Faerun.create_categories(groups)
    # Continuous descriptors are converted to ranks scaled by the sample count.
    tpsa_ranked = ss.rankdata(np.array(tpsa) / max(tpsa)) / len(tpsa)
    logp_ranked = ss.rankdata(np.array(logp) / max(logp)) / len(logp)
    mw_ranked = ss.rankdata(np.array(mw) / max(mw)) / len(mw)
    h_acceptors_ranked = ss.rankdata(
        np.array(h_acceptors) / max(h_acceptors)) / len(h_acceptors)
    h_donors_ranked = ss.rankdata(
        np.array(h_donors) / max(h_donors)) / len(h_donors)
    ring_count_ranked = ss.rankdata(
        np.array(ring_count) / max(ring_count)) / len(ring_count)

    lf.batch_add(fps)
    lf.index()
    cfg = tm.LayoutConfiguration()
    cfg.k = 100
    # cfg.sl_extra_scaling_steps = 1
    cfg.sl_repeats = 2
    cfg.mmm_repeats = 2
    cfg.node_size = 2
    x, y, s, t, _ = tm.layout_from_lsh_forest(lf, config=cfg)

    # Define a colormap highlighting approved vs non-approved
    custom_cmap = ListedColormap(
        [
            "#2ecc71", "#9b59b6", "#ecf0f1", "#e74c3c", "#e67e22", "#f1c40f",
            "#95a5a6"
        ],
        name="custom",
    )

    bin_cmap = ListedColormap(["#e74c3c", "#2ecc71"], name="bin_cmap")

    f = Faerun(
        clear_color="#222222",
        coords=False,
        view="front",
        impress=
        'made with <a href="http://tmap.gdb.tools" target="_blank">tmap</a><br />and <a href="https://github.com/reymond-group/faerun-python" target="_blank">faerun</a><br /><a href="https://gist.github.com/daenuprobst/5cddd0159c0cf4758fb16b4b4acbef89">source</a>',
    )

    # The per-series lists below follow the order of the "c" series:
    # five categorical series first, then six ranked descriptor series.
    f.add_scatter(
        "Drugbank",
        {
            "x":
            x,
            "y":
            y,
            "c": [
                groups,
                is_lipinski,
                has_coc,
                has_sa,
                has_tz,
                tpsa_ranked,
                logp_ranked,
                mw_ranked,
                h_acceptors_ranked,
                h_donors_ranked,
                ring_count_ranked,
            ],
            "labels":
            labels,
        },
        shader="smoothCircle",
        colormap=[
            custom_cmap,
            bin_cmap,
            bin_cmap,
            bin_cmap,
            bin_cmap,
            "viridis",
            "viridis",
            "viridis",
            "viridis",
            "viridis",
            "viridis",
        ],
        point_scale=2.5,
        # NOTE(review): 10 entries for 11 series -- presumably Faerun pads the
        # list for the last series; confirm.
        categorical=[
            True, True, True, True, True, False, False, False, False, False
        ],
        has_legend=True,
        legend_labels=[
            labels_groups,
            [(0, "No"), (1, "Yes")],
            [(0, "No"), (1, "Yes")],
            [(0, "No"), (1, "Yes")],
            [(0, "No"), (1, "Yes")],
        ],
        selected_labels=["SMILES", "Drugbank ID", "Name"],
        series_title=[
            "Group",
            "Lipinski",
            "Ethers",
            "Sulfonamides",
            "Tetrazoles",
            "TPSA",
            "logP",
            "Mol Weight",
            "H Acceptors",
            "H Donors",
            "Ring Count",
        ],
        max_legend_label=[
            None,
            None,
            None,
            None,
            None,
            str(round(max(tpsa))),
            str(round(max(logp))),
            str(round(max(mw))),
            str(round(max(h_acceptors))),
            str(round(max(h_donors))),
            str(round(max(ring_count))),
        ],
        min_legend_label=[
            None,
            None,
            None,
            None,
            None,
            str(round(min(tpsa))),
            str(round(min(logp))),
            str(round(min(mw))),
            str(round(min(h_acceptors))),
            str(round(min(h_donors))),
            str(round(min(ring_count))),
        ],
        title_index=2,
        legend_title="",
    )

    f.add_tree("drugbanktree", {"from": s, "to": t}, point_helper="Drugbank")

    f.plot("drugbank", template="smiles")
예제 #19
0
# Import Modules
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType, StringType
import pyspark.sql.functions as F
from db_config import url, properties
from mhfp.encoder import MHFPEncoder
from pyspark.sql.types import IntegerType, StringType
# System prerequisite, to be run in a shell before this script (it is not
# Python and must not appear as a statement in this file):
#   sudo apt-get install libboost-all-dev
mhfp_encoder = MHFPEncoder()
spark = SparkSession.builder.getOrCreate()

# Defined multiple UDF via PySpark Sql appfunctions.py module
def filename(path):
    """Return ``path`` unchanged; wrapped as the ``sourceFile`` UDF below."""
    return path
# Counts the letter 'c' (case-insensitively) in the string form of the value.
countCarbons = F.udf(lambda x : str(x).lower().count('c'), IntegerType())
sourceFile = F.udf(filename, StringType())
# NOTE(review): the UDF is declared as StringType but returns whatever
# mhfp_encoder.encode() yields -- presumably a numeric array; confirm that the
# result is converted to a string before it reaches Spark.
mhfp_smiles = F.udf(lambda x : mhfp_encoder.encode(x, radius=3, rings=True, kekulize=True, sanitize=True), StringType())
# Created DataFrames here with the new columns that I required and dropped the duplicates
df = spark.read.format('csv').option('delimiter','\t').option('header', 'false')\
    .load('s3a://zincdata/zinc/AA/AAAA.txt')
# NOTE(review): the file is read with header=false, so a column literally
# named 'smiles' may not exist -- verify the column name.
df = df.withColumn('mhfp', mhfp_smiles('smiles'))
df = df.dropDuplicates(['smiles'])
df.show()
# Performed my dataframe write with the help of jdbc
#df.write.jdbc(url='jdbc:%s' % url, table="zincmap", mode='append', properties=properties)