def test_ligand_OneHotSMILESFeaturizer(smiles, solution):
    """
    Check the one-hot encoding of a SMILES string against a known matrix.

    RDKitLigand is used on purpose: OFFTK _will_ add hydrogens to every
    SMILES it ingests and exports a canonicalized string, so the
    representation obtained through it may differ from the one you would
    compute directly from the input SMILES.

    The featurized matrix is compared against the transpose of `solution`,
    both in shape and element by element.
    """
    system = System([RDKitLigand.from_smiles(smiles)])
    one_hot = OneHotSMILESFeaturizer()
    one_hot.featurize(system)

    encoded = system.featurizations[one_hot.name]
    expected = solution.T
    assert encoded.shape == expected.shape
    assert (encoded == expected).all()
def test_ClearFeaturizations_removeall():
    """
    ClearFeaturizations with an empty `keys` tuple and style "keep" must
    leave no featurization behind on any system.
    """
    from kinoml.features.ligand import OneHotSMILESFeaturizer

    systems = tuple(
        System([RDKitLigand.from_smiles(smi)]) for smi in ("C", "CC", "CCC")
    )

    # Populate the featurizations first, so there is something to clear.
    one_hot = OneHotSMILESFeaturizer()
    one_hot.featurize(systems)
    padder = PadFeaturizer()
    padder.featurize(systems)

    # Keeping "nothing" is the same as removing everything.
    cleaner = ClearFeaturizations(keys=(), style="keep")
    cleaner.featurize(systems)

    for system in systems:
        assert not system.featurizations
def test_PadFeaturizer():
    """
    After one-hot encoding SMILES of length 1, 2 and 3 and applying
    PadFeaturizer, every system's "last" featurization must have the
    shape (53, 3).

    Returns the featurized systems.
    """
    from kinoml.features.ligand import OneHotSMILESFeaturizer

    systems = tuple(
        System([RDKitLigand.from_smiles(smi)]) for smi in ("C", "CC", "CCC")
    )

    one_hot = OneHotSMILESFeaturizer()
    one_hot.featurize(systems)
    padder = PadFeaturizer()
    padder.featurize(systems)

    expected_shape = (53, 3)
    for system in systems:
        assert system.featurizations["last"].shape == expected_shape

    # NOTE(review): recent pytest versions warn when a test returns a
    # non-None value; kept because other code may rely on it -- confirm.
    return systems
def test_ClearFeaturizations_keeplast():
    """
    ClearFeaturizations built with its default arguments must leave exactly
    one featurization per system, stored under the "last" key.
    """
    from kinoml.features.ligand import OneHotSMILESFeaturizer

    systems = tuple(
        System([RDKitLigand.from_smiles(smi)]) for smi in ("C", "CC", "CCC")
    )

    # Two featurizers are applied before the cleanup step runs.
    one_hot = OneHotSMILESFeaturizer()
    one_hot.featurize(systems)
    padder = PadFeaturizer()
    padder.featurize(systems)

    cleaner = ClearFeaturizations()
    cleaner.featurize(systems)

    for system in systems:
        remaining = system.featurizations
        assert len(remaining) == 1
        assert "last" in remaining
def test_datasetprovider_exporter_single_tensor_different_shapes():
    """
    Export featurizations of different shapes as a dictionary of arrays.

    When a featurizer returns arrays of different shape for each system,
    one can choose between:

    A. Pad them to the same dimension with PadFeaturizer, and apply
       the Concatenated aggregator.
    B. Keep them separate and export them with `.to_dict_of_arrays()`.
       This creates a dictionary of arrays; this test expects the feature
       keys `X_s0_` and `X_s1_`, one per system index.

    This test exercises option B.
    """
    from kinoml.core.ligands import RDKitLigand
    from kinoml.features.ligand import OneHotSMILESFeaturizer

    conditions = AssayConditions()
    smiles = ("CCCCC", "CCCCCCCC")
    systems = [System([RDKitLigand.from_smiles(smi)]) for smi in smiles]
    measurements = [
        BaseMeasurement(50, conditions=conditions, system=systems[0]),
        BaseMeasurement(30, conditions=conditions, system=systems[1]),
    ]
    dataset = DatasetProvider(measurements=measurements)

    featurizer = OneHotSMILESFeaturizer()
    featurizer.featurize(dataset.systems)
    for system, smi in zip(systems, smiles):
        assert system.featurizations["last"].shape == (53, len(smi))

    arrays = dataset.to_dict_of_arrays()
    # Sort once and reuse the sorted list: the shape checks below pair each
    # key with a SMILES by position, so they must not depend on the order in
    # which the exporter happened to insert the keys.
    X_keys = sorted(k for k in arrays.keys() if k.startswith("X"))
    assert X_keys == ["X_s0_", "X_s1_"]
    for X_key, smi in zip(X_keys, smiles):
        assert arrays[X_key].shape == (53, len(smi))
def test_datasetprovider_awkward_exporter_single_tensor_different_shapes():
    """
    Export differently shaped featurizations through `to_awkward()`.

    Two systems built from SMILES of different length are featurized with
    a OneHotSMILESFeaturizer wrapped in TupleOfArrays, so the "last"
    featurization of each system is indexable and its first item is the
    one-hot array of shape (53, len(smiles)).

    The dataset is then exported with `to_awkward()`. The test checks that
    the exported features have as many entries as there are targets, and
    that each entry keeps its own shape once converted back to NumPy.
    """
    from kinoml.core.ligands import RDKitLigand
    from kinoml.features.ligand import OneHotSMILESFeaturizer
    from kinoml.features.core import TupleOfArrays
    import awkward as ak

    smiles = ("CCCCC", "CCCCCCCC")
    conditions = AssayConditions()
    systems = [System([RDKitLigand.from_smiles(smi)]) for smi in smiles]
    measurements = [
        BaseMeasurement(value, conditions=conditions, system=system)
        for value, system in zip((50, 30), systems)
    ]
    dataset = DatasetProvider(measurements=measurements)

    one_hot = OneHotSMILESFeaturizer()
    pipeline = TupleOfArrays([one_hot])
    pipeline.featurize(dataset.systems)
    for system, smi in zip(systems, smiles):
        per_system = system.featurizations["last"]
        assert per_system[0].shape == (53, len(smi))

    # X is returned as single-item list thanks to TupleOfArrays
    (features,), targets = dataset.to_awkward()
    assert features.type.length == len(targets)
    for index, smi in enumerate(smiles):
        assert ak.to_numpy(features[index]).shape == (53, len(smi))