def test_from_df_pop_mol_column():
    """Check that `from_df` leaves only "smiles" and "dummy" as molecule props."""
    subset = dm.data.freesolv().iloc[:10]  # type: ignore
    molecules = [dm.to_mol(smi) for smi in subset["smiles"]]

    frame: pd.DataFrame = dm.to_df(molecules, mol_column="mol")  # type: ignore
    frame["dummy"] = "hello"

    # The "mol" column itself must not show up among the molecule properties.
    expected_props = {"smiles", "dummy"}

    # Case 1: the mol column is named explicitly.
    restored = dm.from_df(frame.copy(), mol_column="mol")
    assert set(restored[0].GetPropsAsDict().keys()) == expected_props

    # Case 2: the mol column is detected automatically.
    restored = dm.from_df(frame.copy())
    assert set(restored[0].GetPropsAsDict().keys()) == expected_props
def test_to_image():
    """Render a grid of molecules and a single molecule, then check the pixel arrays."""
    cell_size = (200, 200)

    # Work on the first eight molecules of the dataset.
    molecules = dm.from_df(dm.data.freesolv())  # type: ignore
    molecules = molecules[:8]

    # Grid rendering: 8 molecules over 4 columns, one SMILES legend per cell.
    smiles_legends = [dm.to_smiles(m) for m in molecules]
    grid = np.array(
        dm.viz.to_image(molecules, legends=smiles_legends, n_cols=4, mol_size=cell_size)
    )
    assert grid.dtype == np.uint8
    assert grid.shape == (400, 800, 3)
    assert grid.shape[1] == 200 * 4

    # Single-molecule rendering with a single legend string.
    first = molecules[0]
    single = np.array(
        dm.viz.to_image(first, legends=dm.to_smiles(first), mol_size=cell_size)
    )
    assert single.dtype == np.uint8
    assert single.shape == (200, 200, 3)

    # Smoke check: atom indices enabled and an integer `mol_size`.
    dm.viz.to_image(first, indices=True, mol_size=400)
def test_to_df_smiles_warning(datadir, caplog):
    """Check the duplicated "smiles" column and the warning logged by `to_df`."""
    frame = dm.read_csv(datadir / "freesolv.csv")
    molecules = dm.from_df(frame, conserve_smiles=True)
    roundtrip = dm.to_df(molecules)

    # Two columns end up being named "smiles" after the round trip.
    n_smiles_columns = sum(roundtrip.columns == "smiles")
    assert n_smiles_columns == 2

    # The name clash must have been reported as a warning.
    logged = caplog.text
    expected_message = (
        "The SMILES column name provided ('smiles') is already present in the properties of the molecules"
    )
    assert "WARNING" in logged
    assert expected_message in logged
def test_from_df(datadir):
    """Check `from_df` on an SDF-derived dataframe and on an empty dataframe."""
    frame = dm.read_sdf(datadir / "TUBB3-observations.sdf", as_df=True)
    molecules = dm.from_df(frame)

    assert len(molecules) == 10

    first = molecules[0]
    assert isinstance(first, Chem.rdchem.Mol)

    # Every dataframe column is expected back as a molecule property.
    expected_props = {
        "zinc_id",
        "ortholog_name",
        "gene_name",
        "affinity",
        "chembldocid",
        "title",
        "reference.pubmed_id",
        "reference.doi",
        "reference.chembl_id",
        "reference.journal",
        "reference.year",
    }
    assert set(first.GetPropsAsDict().keys()) == expected_props

    # An empty dataframe yields an empty list of molecules.
    assert dm.from_df(pd.DataFrame()) == []
def _write_mols_to_sdf_stream(mols, stream) -> None:
    """Write every molecule in `mols` to `stream` through an RDKit `SDWriter`.

    Args:
        mols: Iterable of molecules to write.
        stream: File-like object handed to `Chem.SDWriter`.
    """
    writer = Chem.SDWriter(stream)
    try:
        for mol in mols:
            writer.write(mol)
    finally:
        # Always close the writer, even when a molecule fails to serialize,
        # so the writer is not left open on the stream.
        writer.close()


def to_sdf(
    mols: Union[Chem.rdchem.Mol, Sequence[Chem.rdchem.Mol], pd.DataFrame],
    urlpath: Union[str, os.PathLike, TextIO],
    smiles_column: Optional[str] = "smiles",
    mol_column: Optional[str] = None,
) -> None:
    """Write molecules to a file.

    Args:
        mols: a dataframe, a molecule or a list of molecule. `None` entries
            are skipped.
        urlpath: Path to a file or a file-like object. Path can be remote or local.
        smiles_column: Column name forwarded to `dm.from_df` when `mols` is a
            dataframe.
        mol_column: Column name to extract the molecule, forwarded to
            `dm.from_df` when `mols` is a dataframe. It takes precedence over
            `smiles_column`.
    """

    # Normalize the input to a list of molecules.
    if isinstance(mols, pd.DataFrame):
        mols = dm.from_df(mols, smiles_column=smiles_column, mol_column=mol_column)
    elif isinstance(mols, Chem.rdchem.Mol):
        mols = [mols]

    # Filter out None values
    mols = [mol for mol in mols if mol is not None]

    # File-like object: write directly into it.
    if isinstance(urlpath, io.IOBase):
        _write_mols_to_sdf_stream(mols, urlpath)
        return

    # Regular local or remote paths: let fsspec open the target in text mode.
    with fsspec.open(urlpath, mode="w") as f:
        _write_mols_to_sdf_stream(mols, f)
def test_from_df_conserve_smiles(datadir):
    """Check that `conserve_smiles=True` keeps "smiles" among the molecule props."""
    frame = dm.read_csv(datadir / "freesolv.csv")
    first_mol = dm.from_df(frame, conserve_smiles=True)[0]

    props = first_mol.GetPropsAsDict()
    assert "smiles" in props