Example #1
    def test_parallelized(self):
        def fn(x):
            return x**2

        results = dm.parallelized(
            fn,
            [{
                "x": i
            } for i in range(10)],
            scheduler="processes",
            n_jobs=None,
            arg_type="kwargs",
            progress=True,
        )
        assert results == [0, 1, 4, 9, 16, 25, 36, 49, 64, 81]

        results = dm.parallelized(
            fn,
            [[i] for i in range(10)],
            scheduler="processes",
            n_jobs=None,
            arg_type="args",
            progress=True,
        )
        assert results == [0, 1, 4, 9, 16, 25, 36, 49, 64, 81]

        results = dm.parallelized(
            fn,
            [i for i in range(10)],
            scheduler="processes",
            n_jobs=None,
            progress=False,
        )
        assert results == [0, 1, 4, 9, 16, 25, 36, 49, 64, 81]
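A hedged usage sketch, not part of the test above: the same `dm.parallelized` API is commonly combined with `functools.partial` to fix keyword arguments of the mapped function, which is the pattern the later examples rely on. The `scale` helper and its arguments are illustrative only.

import functools

import datamol as dm


def scale(x, factor=1):
    return x * factor


# Fix `factor` once, then map the function over the inputs in parallel.
results = dm.parallelized(
    functools.partial(scale, factor=3),
    list(range(5)),
    n_jobs=1,  # 1 = no parallelization; None = use all available cores
    progress=False,
)
assert results == [0, 3, 6, 9, 12]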
Example #2
def pdist(mols: List[Chem.rdchem.Mol],
          n_jobs: Optional[int] = 1,
          **fp_args) -> Tuple[np.ndarray, np.ndarray]:
    """Compute the pairwise tanimoto distance between the fingerprints of all the
    molecules in the input set.

    Args:
        mols: list of molecules
        n_jobs: Number of jobs for parallelization. Set to 1 for no
            parallelization. Set to None to use all available cores.
        **fp_args: Keyword arguments to pass to `to_fp()`.

    Returns:
        distmat, valid_idx: Distance matrix, and the indices of the molecules that
            passed the conversion to fingerprints.
    """

    fps = dm.parallelized(
        functools.partial(dm.to_fp, as_array=False, **fp_args),
        mols,
        n_jobs=n_jobs,
    )

    valid_idx, fps = zip(*[(i, fp) for i, fp in enumerate(fps)
                           if fp is not None])
    fps = list(fps)

    dist = GetTanimotoDistMat(fps)
    dist_mat = np.zeros((len(fps), len(fps)))
    dist_mat[np.triu_indices_from(dist_mat, 1)] = dist
    dist_mat += dist_mat.T

    return dist_mat, np.array(valid_idx)
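A minimal, hedged usage sketch for `pdist` above. It assumes the snippet's own imports (datamol as `dm`, NumPy, `functools`, and RDKit's `GetTanimotoDistMat`) are in scope; the SMILES are illustrative only.

mols = [dm.to_mol(s) for s in ["CCO", "CCN", "c1ccccc1"]]
dist_mat, valid_idx = pdist(mols, n_jobs=1)

# dist_mat is a symmetric (n_valid, n_valid) matrix with zeros on the diagonal;
# valid_idx holds the positions of the molecules that converted to fingerprints.
print(dist_mat.shape, valid_idx)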
Example #3
def ingest_chembl_smi(smi_path, smiles_column, canonical_id_column, activity_column):
    """Convert an SMI file with a SMILES column to a molchunk. It is assumed that
    the SMI file has been cleaned (no header, and other columns have been removed).

    Args:
        smi_path: path to the SMI file.
        smiles_column: column where the SMILES are located (f0 = col 1, f1 = col 2, etc.).
        canonical_id_column: column with the name/ID of the molecule (f0 = col 1, f1 = col 2, etc.).
        activity_column: column where the bioactivity is listed (Ki, EC50, etc.).

    Returns:
        df: pandas DataFrame with the processed molecules.
    """

    
    # Next we set the multithreaded read options that pyarrow allows for.

    opts = pa.csv.ReadOptions(use_threads=True, autogenerate_column_names=True)

    # Then we tell pyarrow how the columns in our file are separated. SMI files
    # are space separated; if they were tab separated we would use '\t' and if
    # they were comma separated we would use ','.
    parse_options = pa.csv.ParseOptions(delimiter=' ')

    # Now we read the CSV into a pyarrow table. This is a columnar dataset. More
    # on this later. Note how we specified the options above.

    table = pa.csv.read_csv(smi_path, opts, parse_options)


    # Now we will use a function that converts the pyarrow table into a pandas 
    # dataframe. We could have done this without arrow, but again -- there are 
    # very powerful tools that arrow will grant us.

    df_new = table.to_pandas()
 
    # NOTE: this overrides the `smiles_column` argument; with autogenerated
    # column names the SMILES are expected in column 'f0'.
    smiles_column = 'f0'
    
    # run initial mapper on smiles column to generate basic information and fingerprint on bits
    df_clean_mapped = dm.parallelized(_preprocess, list(df_new.iterrows()), arg_type='args', progress=True)
    df_clean_mapped = pd.DataFrame(df_clean_mapped)
    
    #rename columns
    df_clean_mapped['smiles'] = df_clean_mapped[smiles_column]
    df_clean_mapped['canonical_id'] = df_clean_mapped[canonical_id_column]
    df_clean_mapped['ki'] = df_clean_mapped[activity_column]
    
    #delete old columns
    del df_clean_mapped['f2']
    del df_clean_mapped['f1']
    del df_clean_mapped['f0']
    
    #remove duplicated standard SMILES and reindex
    duplicateRowsDF2 = df_clean_mapped[df_clean_mapped.duplicated(['standard_smiles'])]
    print("Duplicate Rows based on a single column are:", duplicateRowsDF2, sep='\n')
    df_clean_mapped = df_clean_mapped.drop_duplicates(subset='standard_smiles', keep="first", inplace=False)
    df = df_clean_mapped.reset_index(drop=True)
    
    return df
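The snippet above relies on a module-level `_preprocess` helper that is not shown. Below is a hedged, illustrative sketch of what such a row-wise mapper might look like, not the original implementation: because `df.iterrows()` yields `(index, row)` pairs and `arg_type='args'` unpacks them, the function receives the index and the row, standardizes the SMILES found in the module-level `smiles_column`, and returns the enriched row. The column it writes is an assumption.

import datamol as dm


def _preprocess(i, row):
    # Parse the SMILES stored in the column named by the module-level `smiles_column`.
    mol = dm.to_mol(row[smiles_column])
    if mol is None:
        # Mark rows that RDKit cannot parse so they can be filtered out later.
        row["standard_smiles"] = "dropped"
        return row
    # Standardize the molecule and store its canonical SMILES.
    row["standard_smiles"] = dm.to_smiles(dm.standardize_mol(mol))
    return row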
Example #4
def pick_diverse(
    mols: List[Chem.rdchem.Mol],
    npick: int,
    initial_picks: List[int] = None,
    feature_fn: Callable = None,
    dist_fn: Callable = None,
    seed: int = 42,
    n_jobs: Optional[int] = 1,
):
    r"""Pick a set of diverse molecules based on they fingerprint.

    Args:
        mols: a list of molecules.
        npick: Number of elements to pick from mols, including the preselection.
        initial_picks: Starting list of indices for molecules that should be in the
            set of picked molecules. Defaults to None.
        feature_fn: A feature function that takes a Chem.rdchem.Mol object
            and returns molecular features. By default, `dm.to_fp()` is used.
            Defaults to None.
        dist_fn: A function that takes two indices (i, j) and returns the
            distance between them. You might use partial to set the fingerprints as input.
            By default, the Tanimoto distance is used. Defaults to None.
        seed: seed for reproducibility.
        n_jobs: Number of jobs for parallelization. Set to 1 for no
            parallelization. Set to None to use all available cores.

    Returns:
        picked_inds: indices of the molecules that have been picked
        picked_mols: molecules that have been picked
    """

    if feature_fn is None:
        feature_fn = functools.partial(dm.to_fp, as_array=False)

    features = dm.parallelized(feature_fn, mols, n_jobs=n_jobs)

    def distij(i, j, features=features):
        return 1.0 - DataStructs.TanimotoSimilarity(features[i], features[j])

    if dist_fn is None:
        dist_fn = distij

    picker = MaxMinPicker()
    initial_picks = [] if initial_picks is None else initial_picks
    picked_inds = picker.LazyPick(dist_fn,
                                  len(mols),
                                  npick,
                                  firstPicks=initial_picks,
                                  seed=seed)
    picked_inds = np.array(picked_inds)
    picked_mols = [mols[x] for x in picked_inds]

    return picked_inds, picked_mols
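A hedged usage sketch for `pick_diverse` above, assuming the snippet's imports (datamol as `dm`, NumPy, `functools`, RDKit's `DataStructs` and `MaxMinPicker`) are available; the SMILES are illustrative only.

mols = [dm.to_mol(s) for s in ["CCO", "CCCO", "c1ccccc1", "CC(=O)O", "CCN"]]
picked_inds, picked_mols = pick_diverse(mols, npick=3, seed=42, n_jobs=1)
print(picked_inds)  # indices of the 3 most mutually distant molecules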
Example #5
def assign_to_centroids(
    mols: List[Chem.rdchem.Mol],
    centroids: List[Chem.rdchem.Mol],
    feature_fn: Callable = None,
    dist_fn: Callable = None,
    n_jobs: Optional[int] = 1,
):
    r"""Assign molecules to centroids. Each molecule will be assigned to the closest centroid.

    Args:
        mols: a list of molecules to assign to centroids
        centroids: list of molecules to use as centroid
        feature_fn: A feature function that takes a Chem.rdchem.Mol object
            and returns molecular features. By default, `dm.to_fp()` is used.
            Defaults to None.
        dist_fn: A function that takes two indices (i, j) and returns the
            distance between them. You might use partial to set the fingerprints as input.
            By default, the Tanimoto distance is used. Defaults to None.
        n_jobs: Number of jobs for parallelization. Set to 1 for no
            parallelization. Set to None to use all available cores.

    Returns:
        clusters_map: dict mapping each centroid index to the indices of the molecules
            assigned to that cluster
        clusters_list: list of all molecules in each cluster. The cluster index follows
            the index of the centroid. Note that the centroid molecule is not added to
            the cluster.
    """

    if feature_fn is None:
        feature_fn = functools.partial(dm.to_fp, as_array=False)

    all_mols = [x for x in mols] + [c for c in centroids]
    features = dm.parallelized(feature_fn, all_mols, n_jobs=n_jobs)

    def distij(i, j, features=features):
        return 1.0 - DataStructs.TanimotoSimilarity(features[int(i)],
                                                    features[int(j)])

    if dist_fn is None:
        dist_fn = distij

    clusters_map = ddict(list)
    clusters_list = [[] for _ in centroids]
    query_inds = np.expand_dims(np.arange(len(mols), dtype=int), axis=1)
    centroid_inds = np.expand_dims(np.arange(len(centroids), dtype=int),
                                   axis=1) + len(mols)
    # use the (possibly user-provided) distance function
    dist_mat = distance.cdist(query_inds, centroid_inds, metric=dist_fn)
    closest = np.argmin(dist_mat, axis=1)
    for ind, cluster_ind in enumerate(closest):  # type: ignore
        clusters_map[cluster_ind].append(ind)
        clusters_list[cluster_ind].append(mols[ind])
    return clusters_map, clusters_list
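A hedged usage sketch for `assign_to_centroids` above, assuming the snippet's imports (including `scipy.spatial.distance` and `collections.defaultdict` aliased as `ddict`) are available; the SMILES are illustrative only.

mols = [dm.to_mol(s) for s in ["CCO", "CCCO", "c1ccccc1", "Cc1ccccc1"]]
centroids = [dm.to_mol(s) for s in ["CCO", "c1ccccc1"]]
clusters_map, clusters_list = assign_to_centroids(mols, centroids, n_jobs=1)
# clusters_map[0] lists the indices of `mols` closest to the first centroid.
print(dict(clusters_map))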
Example #6
def cdist(
    mols1: List[Chem.rdchem.Mol],
    mols2: List[Chem.rdchem.Mol],
    n_jobs: Optional[int] = 1,
    **fp_args,
) -> np.ndarray:
    """Compute the pairwise tanimoto distance between the fingerprints of
    each pair of molecules of the two collections of inputs.

    Args:
        mols1: list of molecules.
        mols2: list of molecules.
        n_jobs: Number of jobs for parallelization. Set to 1 for no
            parallelization. Set to None to use all available cores.
        **fp_args: Keyword arguments to pass to `to_fp()`.

    Returns:
        distmat: distance matrix of shape (len(mols1), len(mols2))
    """

    fps1 = dm.parallelized(
        functools.partial(dm.to_fp, as_array=True, **fp_args),
        mols1,
        n_jobs=n_jobs,
    )

    fps2 = dm.parallelized(
        functools.partial(dm.to_fp, as_array=True, **fp_args),
        mols2,
        n_jobs=n_jobs,
    )

    fps1 = np.array(fps1)
    fps2 = np.array(fps2)

    dist_mat = distance.cdist(fps1, fps2, metric="jaccard")

    return dist_mat
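A hedged usage sketch for `cdist` above, assuming the snippet's imports (datamol as `dm`, NumPy, `functools` and `scipy.spatial.distance`) are available; the SMILES are illustrative only.

mols1 = [dm.to_mol(s) for s in ["CCO", "CCN"]]
mols2 = [dm.to_mol(s) for s in ["c1ccccc1", "CC(=O)O", "CCO"]]
dist_mat = cdist(mols1, mols2, n_jobs=1)
print(dist_mat.shape)  # (2, 3): Jaccard/Tanimoto distance for every pair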
Example #7
def cluster_mols(
    mols: List[Chem.rdchem.Mol],
    cutoff: float = 0.2,
    feature_fn: Callable = None,
    n_jobs: Optional[int] = 1,
):
    """Cluster a set of molecules using the butina clustering algorithm and a given threshold.

    Args:
        mols: a list of molecules.
        cutoff: Cutoff for the clustering. Defaults to 0.2.
        feature_fn: A feature function that takes a Chem.rdchem.Mol object
            and returns molecular features. By default, `dm.to_fp()` is used.
            Defaults to None.
        n_jobs: Number of jobs for parallelization. Set to 1 for no
            parallelization. Set to None to use all available cores.

    Returns:
        cluster_indices: tuple of tuples with the indices of the molecules in each cluster
        cluster_mols: list of lists with the molecules in each cluster
    """

    if feature_fn is None:
        feature_fn = functools.partial(dm.to_fp, as_array=False)

    features = dm.parallelized(feature_fn, mols, n_jobs=n_jobs)

    dists = []
    n_mols = len(mols)

    for i in range(1, n_mols):
        dist = DataStructs.BulkTanimotoSimilarity(features[i],
                                                  features[:i],
                                                  returnDistance=True)
        dists.extend([x for x in dist])

    # now cluster the data
    cluster_indices = Butina.ClusterData(dists,
                                         n_mols,
                                         cutoff,
                                         isDistData=True)
    cluster_mols = [
        operator.itemgetter(*cluster)(mols) for cluster in cluster_indices
    ]

    # Make single mol cluster a list
    cluster_mols = [[c] if isinstance(c, Chem.rdchem.Mol) else c
                    for c in cluster_mols]

    return cluster_indices, cluster_mols
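A hedged usage sketch for `cluster_mols` above, assuming the snippet's imports (including RDKit's `Butina` module, `DataStructs` and `operator`) are available; the SMILES and cutoff are illustrative only.

mols = [dm.to_mol(s) for s in ["CCO", "CCCO", "CCCCO", "c1ccccc1", "Cc1ccccc1"]]
cluster_indices, clusters = cluster_mols(mols, cutoff=0.6, n_jobs=1)
for i, cluster in enumerate(cluster_indices):
    print(f"cluster {i}: molecule indices {list(cluster)}")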
Example #8
def prep_parquet_db(df, n_jobs, smiles_col, catalog_id_col, canonical_id_col):
    
    '''Take a cleaned dataframe that contains protonated/tautomerized SMILES,
    the vendor database ID and a canonical ID (the trailing number indicates the
    protomer/tautomer), then 1) enumerate stereoisomers, 2) generate chiral/achiral
    fingerprints, and 3) generate SMARTS and a new canonical ID that references
    each stereoisomer.

    Args:
        df: dataframe to be passed in (pandas DataFrame).
        n_jobs: number of jobs used by joblib (integer).
        smiles_col: name of the SMILES column (string).
        catalog_id_col: name of the column referencing the catalog ID (string).
        canonical_id_col: name of the column referencing the canonical ID, usually
            Z123456789_1 where _1 is the protomer/tautomer number (string).

    Returns:
        Elaborated dataframe (pandas DataFrame).
    '''
    
    smiles_column = smiles_col
    
    # Clean the mols, standardize them, and generate lists of enumerated SMILES and
    # fingerprints (both chiral and achiral at 8k bits)
    df_clean_mapped = dm.parallelized(_preprocess, list(df.iterrows()), arg_type='args', progress=True, n_jobs=n_jobs)
    df_clean_mapped = pd.DataFrame(df_clean_mapped)
    
    #keep only the following columns
    columns_to_keep = ['enumerated_smiles', catalog_id_col, canonical_id_col, 'achiral_fp', 'chiral_fp', 'smarts', 'selfies']
    df2 = df_clean_mapped[columns_to_keep]
    
    # remove dropped SMILES; these fail due to invalid mols in RDKit
    df_dropped = df2[df2.smarts == 'dropped']
    df3 = df2[df2.smarts != 'dropped']
    
    # explode all the lists to generate new rows, then drop duplicated SMILES.
    # NOTE: this step assumes the catalog and canonical ID columns are named
    # 'CatalogID' and 'ID_Index'.
    df4 = df3.set_index(['CatalogID', 'ID_Index', 'smarts', 'selfies']).apply(pd.Series.explode).reset_index()
    df5 = df4.drop_duplicates(subset='enumerated_smiles', keep="first", inplace=False)
    df5 = df5.reset_index(drop=True)
    
    #generate a new indexing system that creates unique names for canonical_id Z123456_1_1 where 
    # Z123456_1 is taut/prot id and the additional _1 is the stereoisomer id
    df6 = df5.set_index('ID_Index')
    df6.index = df6.index + '_' + df6.groupby(level=0).cumcount().add(1).astype(str).replace('0','')
    df7 = df6.reset_index()
    
    #cleanup columns and return
    df7.columns = ['canonical_ID', 'CatalogID', 'smarts', 'selfies', 'enumerated_smiles', 'achiral_fp', 'chiral_fp']
    return df7
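A hedged call sketch for `prep_parquet_db` above. The input columns are assumptions, and the function depends on a `_preprocess` mapper that is not shown here; it must return rows containing the 'enumerated_smiles', 'achiral_fp', 'chiral_fp', 'smarts' and 'selfies' fields used above.

import pandas as pd

df_in = pd.DataFrame({
    "protomer_smiles": ["CCO", "CCN"],
    "CatalogID": ["V-001", "V-002"],
    "ID_Index": ["Z123456_1", "Z123457_1"],
})
df_out = prep_parquet_db(
    df_in,
    n_jobs=1,
    smiles_col="protomer_smiles",
    catalog_id_col="CatalogID",
    canonical_id_col="ID_Index",
)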
Example #9
def pick_centroids(
    mols: List[Chem.rdchem.Mol],
    npick: int = 0,
    initial_picks: List[int] = None,
    threshold: float = 0.5,
    feature_fn: Callable = None,
    dist_fn: Callable = None,
    seed: int = 42,
    method: str = "sphere",
    n_jobs: Optional[int] = 1,
):
    r"""Pick a set of `npick` centroids from a list of molecules.

    Args:
        mols: a list of molecules.
        npick: Number of elements to pick from mols, including the preselection.
        threshold: Minimum distance between centroids for the `maxmin` and sphere exclusion (`sphere`) methods.
        initial_picks: Starting list of indices for molecules that should be in the
            set of picked molecules. Defaults to None.
        feature_fn: A feature function that takes a Chem.rdchem.Mol object
            and returns molecular features. By default, `dm.to_fp()` is used.
            Defaults to None.
        dist_fn: A function that takes two indices (i, j) and returns the
            distance between them. You might use partial to set the fingerprints as input.
            By default, the Tanimoto distance is used. Defaults to None.
        seed: seed for reproducibility.
        method: Picking method to use. One of `sphere`, `maxmin` or any
            supported RDKit hierarchical clustering method such as `centroid`, `clink`, `upgma`.
        n_jobs: Number of jobs for parallelization. Set to 1 for no
            parallelization. Set to None to use all available cores.

    Returns:
        picked_inds: indices of the molecules that have been selected as centroids
        picked_mols: molecules that have been picked
    """

    n_mols = len(mols)
    if feature_fn is None:
        feature_fn = functools.partial(dm.to_fp, as_array=False)

    features = dm.parallelized(feature_fn, mols, n_jobs=n_jobs)

    def distij(i, j, features=features):
        return 1.0 - DataStructs.TanimotoSimilarity(features[i], features[j])

    if dist_fn is None:
        dist_fn = distij

    initial_picks = [] if initial_picks is None else initial_picks

    if method == "maxmin":
        picker = MaxMinPicker()
        picked_inds, _ = picker.LazyPickWithThreshold(
            dist_fn,
            n_mols,
            pickSize=npick,
            threshold=threshold,
            firstPicks=initial_picks,
            seed=seed,
        )

    elif method == "sphere":
        picker = LeaderPicker()
        picked_inds = picker.LazyPick(dist_fn,
                                      n_mols,
                                      threshold=threshold,
                                      pickSize=npick,
                                      firstPicks=initial_picks)

    elif method.upper() in ClusterMethod.names.keys() and npick:
        if initial_picks:
            logger.warning(
                "Initial picks is not supported by hierarchical clustering. You pick has been discarded."
            )

        dist_mat = dm.parallelized(distij,
                                   list(
                                       zip(*np.tril_indices(len(mols), k=-1))),
                                   arg_type="args")
        dist_mat = np.asarray(dist_mat)
        picker = HierarchicalClusterPicker(ClusterMethod.names[method.upper()])
        picked_inds = picker.Pick(dist_mat, n_mols, npick)
    else:
        raise ValueError(
            f"Picking method {method} with {npick} elements to pick is not supported."
        )
    picked_inds = np.array(picked_inds)
    picked_mols = [mols[x] for x in picked_inds]

    return picked_inds, picked_mols
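A hedged usage sketch for `pick_centroids` above, assuming the snippet's imports (including RDKit's `LeaderPicker`, `MaxMinPicker` and hierarchical clustering pickers) are available; the SMILES, threshold and method are illustrative only.

mols = [dm.to_mol(s) for s in ["CCO", "CCCO", "c1ccccc1", "CC(=O)O", "CCN", "CCCN"]]

# Sphere exclusion: centroids are kept at least `threshold` apart, and `npick`
# caps how many are returned.
picked_inds, picked_mols = pick_centroids(
    mols, npick=3, threshold=0.6, method="sphere", n_jobs=1
)
print(picked_inds)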
Example #10
        molid = names[count]
        m.SetProp('_Name', molid)
        
        probe = Chem.Mol(m.ToBinary())
        v.ShowMol(probe, name=molid, showOnly=False)
        
df = pa.feather.read_feather('/data/mol_chunk_tests_cluster/test_2.molchunk')
# df['combined_smiles'] = df[['standard_smiles', 'enumerated_smiles']].values.tolist()
columns_to_keep = ['enumerated_smiles', 'CatalogID', 'ID_Index']
df2 = df[columns_to_keep]
df3 = df2.explode('enumerated_smiles')
df5 = df3.reset_index(drop=True)
smiles_column = 'enumerated_smiles'

# run initial mapper on smiles column to generate basic information and fingerprint on bits
df_clean_mapped = dm.parallelized(_preprocess, list(df5.iterrows()), arg_type='args', progress=True)
df_clean_mapped = pd.DataFrame(df_clean_mapped)



# del df_clean_mapped['combined_smiles']


#remove duplicated standard SMILES and reindex
duplicateRowsDF2 = df_clean_mapped[df_clean_mapped.duplicated(['standard_smiles'])]
#     print("Duplicate Rows based on a single column are:", duplicateRowsDF2, sep='\n')
df_clean_mapped = df_clean_mapped.drop_duplicates(subset='standard_smiles', keep="first", inplace=False)
df6 = df_clean_mapped.reset_index(drop=True)

limit = 13000
results_list = []
Example #11
                             columns = ['names', 'input_pdbqt_path', 'output_docking_pose_paths', 'output_docking_scores'])

    try:
        out_df = pd.merge(df, docked_df, on="names")
        return out_df
    except Exception:
        print("Merging the dataframes failed")
        return None


col_to_dock = 'pdbqt_ambcc'
working_dir = '/data/dockop_glide_d3/dock_test'

smiles_column = 'standard_smiles'
df2 = dm.parallelized(_generate_pdbqt_outfiles_for_docking,
                      list(df2.iterrows()),
                      arg_type='args',
                      progress=True)
df2 = pd.DataFrame(df2)

autodock_gpu = '/home/schrogpu/ADFRsuite-1.0/AutoDock-GPU/bin/autodock_gpu_128wi'
receptor_path = '/home/schrogpu/ADFRsuite-1.0/d3_docking/pocket2fixer/rigidReceptor.maps.fld'
lsmet = 'sw'
num_runs = 50
dev_num = 0

names_to_dock = list(df2['canonical_id'])
filenames = list(df2['pdbqt_out_path'])
batch_list = f'{working_dir}/{col_to_dock}_batch.txt'
with open(batch_list, 'w') as f:
    f.write(f'{receptor_path}\n')
    for i, filepath in enumerate(filenames):
        row["inchi"] = 'dropped'
        row["inchikey"] = 'dropped'
        row["enumerated_smiles"] = list('dropped')
        return row


# Load the dataset from parquet one by one
dataset = ds.dataset(dataset_dir, format="parquet")

# Create a list of fragments that are not loaded into memory
fragments = [file for file in dataset.get_fragments()]

for count, element in enumerate(fragments):
    #cast the fragment as a pandas df
    df_docked = element.to_table().to_pandas()
    #reset the index
    df_docked = df_docked.reset_index(drop=True)

    #now write the nearest neighbor name and smiles to the df
    smiles_column = 'Smile'
    df_add_nn = dm.parallelized(_preprocess,
                                list(df_docked.iterrows()),
                                arg_type='args',
                                progress=True,
                                n_jobs=54)
    df_add_nn = pd.DataFrame(df_add_nn)

    #write the molchunk to disk
    feather.write_feather(df_add_nn,
                          f'{output_dir}/er_enumisomers_{count}.molchunk')
Example #13
    df5 = df5.set_index('ID_Index')
    df5.index = df5.index + '_' + df5.groupby(
        level=0).cumcount().add(1).astype(str).replace('0', '')
    df5 = df5.reset_index()
    df5.columns = ['canonical_id', 'enumerated_smiles', 'CatalogID']

    # columns_to_keep = ['enumerated_smiles', 'CatalogID', 'ID_Index']
    # df2 = df[columns_to_keep]
    # df3 = df2.explode('enumerated_smiles')
    # df5 = df3.reset_index(drop=True)
    smiles_column = 'enumerated_smiles'

    # run initial mapper on smiles column to generate basic information and fingerprint on bits
    df_clean_mapped = dm.parallelized(_preprocess,
                                      list(df5.iterrows()),
                                      arg_type='args',
                                      progress=True,
                                      n_jobs=4)
    df_clean_mapped = pd.DataFrame(df_clean_mapped)

    df_dropped = df_clean_mapped[df_clean_mapped.standard_smiles == 'dropped']
    print(f'The number of dropped entries is: {len(df_dropped)}')
    feather.write_feather(df_dropped,
                          f'{output_dir}/er_d3sim_dropped_{count}.molchunk')

    df_clean_mapped = df_clean_mapped[
        df_clean_mapped.standard_smiles != 'dropped']
    print(f'The number of successful entries is: {len(df_clean_mapped)}')

    # del df_clean_mapped['combined_smiles']
Example #14
        row["mol2_block_am1bcc"] = mol2_block_am1bcc
        row["pdb_am1bcc"] = pdb_am1bcc
        row["pdbqt_am1bcc"] = pdbqt_am1bcc
        row["pdbqt_gast"] = pdbqt_gast
#         print(f'{name} with smiles {smiles} is complete')
        return row
    except:
        smiles = str(row[smiles_column])
        name = row[6]


        row["mol2_block_am1bcc"] = 'dropped'
        row["pdb_am1bcc"] = 'dropped'
        row["pdbqt_am1bcc"] = 'dropped'
        row["pdbqt_gast"] = 'dropped'
#         print(f'{name} with smiles {smiles} is failed!!')
        return row

smiles_column = 'standard_smiles'
df_clean_mapped_3d = dm.parallelized(_preprocess_3d, list(d3_df.iterrows()), arg_type='args', progress=True)
df_clean_mapped_3d_1 = pd.DataFrame(df_clean_mapped_3d)
df2 = df_clean_mapped_3d_1

df2 = df2.set_index('canonical_id')
df2.index = df2.index + df2.groupby(level=0).cumcount().astype(str).replace('0','')
df2 = df2.reset_index()
df2['canonical_id'] = df2['index']
del df2['index']
df2
feather.write_feather(df_clean_mapped_3d_1, '/data/dockop_glide_d3/chembld3.molchunk')
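A small hedged illustration, on toy data, of the indexing trick used above: rows that share the same canonical_id get a running suffix appended so that every entry ends up with a unique name. The toy DataFrame is illustrative only.

import pandas as pd

toy = pd.DataFrame({"canonical_id": ["Z1_1", "Z1_1", "Z2_1"], "val": [1, 2, 3]})
toy = toy.set_index("canonical_id")
# cumcount() numbers duplicates 0, 1, 2, ...; replacing '0' with '' keeps the
# first occurrence unchanged and suffixes the rest.
toy.index = toy.index + toy.groupby(level=0).cumcount().astype(str).replace("0", "")
print(toy.index.tolist())  # ['Z1_1', 'Z1_11', 'Z2_1']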