def test_CallableFeaturizer():
    """
    Run a ``HashFeaturizer`` followed by a ``CallableFeaturizer`` on three
    single-ligand systems.

    The callable reads the ``"last"`` featurization of each system, reshapes
    it to ``(1,)`` and passes it through ``sklearn.preprocessing.scale``. The
    test only checks that the resulting ``"last"`` featurization reports a
    non-empty (truthy) ``shape``.
    """
    from sklearn.preprocessing import scale

    systems = tuple(
        LigandSystem([RDKitLigand.from_smiles(smiles)])
        for smiles in ("C", "CC", "CCC")
    )

    # Step 1: hash the SMILES string of each ligand (no normalization).
    hasher = HashFeaturizer(
        getter=lambda system: system.ligand.to_smiles(),
        normalize=False,
    )
    hasher.featurize(systems)

    # Step 2: post-process whatever the previous step stored under "last".
    rescaler = CallableFeaturizer(
        lambda system: scale(system.featurizations["last"].reshape((1,)))
    )
    rescaler.featurize(systems)

    for system in systems:
        assert system.featurizations["last"].shape
def test_ClearFeaturizations_removeall():
    """
    After one-hot encoding and padding, apply ``ClearFeaturizations`` with an
    empty ``keys`` tuple and ``style="keep"``; no featurization should remain
    on any of the systems.
    """
    from kinoml.features.ligand import OneHotSMILESFeaturizer

    systems = tuple(
        System([RDKitLigand.from_smiles(smiles)]) for smiles in ("C", "CC", "CCC")
    )

    one_hot = OneHotSMILESFeaturizer()
    one_hot.featurize(systems)

    padder = PadFeaturizer()
    padder.featurize(systems)

    # Keeping "nothing" is how this test asks for everything to be removed.
    cleaner = ClearFeaturizations(keys=(), style="keep")
    cleaner.featurize(systems)

    for system in systems:
        assert not system.featurizations
def test_PadFeaturizer():
    """
    One-hot encode three SMILES of lengths 1, 2 and 3, pad them with
    ``PadFeaturizer`` and check that every ``"last"`` featurization ends up
    with shape ``(53, 3)``.

    Returns the featurized systems.
    """
    from kinoml.features.ligand import OneHotSMILESFeaturizer

    expected_shape = (53, 3)
    systems = tuple(
        System([RDKitLigand.from_smiles(smiles)]) for smiles in ("C", "CC", "CCC")
    )

    one_hot = OneHotSMILESFeaturizer()
    one_hot.featurize(systems)

    padder = PadFeaturizer()
    padder.featurize(systems)

    for system in systems:
        assert system.featurizations["last"].shape == expected_shape

    # NOTE(review): pytest discourages tests that return a value; the return is
    # kept in case another caller relies on it -- confirm before removing.
    return systems
def test_ClearFeaturizations_keeplast():
    """
    With its default arguments, ``ClearFeaturizations`` is expected to leave
    exactly one featurization per system, stored under the ``"last"`` key.
    """
    from kinoml.features.ligand import OneHotSMILESFeaturizer

    systems = tuple(
        System([RDKitLigand.from_smiles(smiles)]) for smiles in ("C", "CC", "CCC")
    )

    one_hot = OneHotSMILESFeaturizer()
    one_hot.featurize(systems)

    padder = PadFeaturizer()
    padder.featurize(systems)

    cleaner = ClearFeaturizations()
    cleaner.featurize(systems)

    for system in systems:
        remaining = system.featurizations
        assert len(remaining) == 1
        assert "last" in remaining
def test_SmilesToLigandFeaturizer_fails():
    """
    ``SmilesToLigandFeaturizer(ligand_type="openforcefield")`` is expected to
    raise ``ValueError`` when asked to featurize a system built from an
    ``RDKitLigand``.

    Only the call that must raise is placed inside ``pytest.raises``. Any
    statement following it in the same block can never execute once the
    exception is raised, so the former lookup/type assertion there was dead
    code and has been removed.
    """
    ligand = RDKitLigand.from_smiles("CCCCC")
    system = System([ligand])
    featurizer = SmilesToLigandFeaturizer(ligand_type="openforcefield")

    with pytest.raises(ValueError):
        featurizer.featurize([system])
def test_Concatenated():
    """
    Concatenate two 512-bit Morgan fingerprints with ``Concatenated`` and
    check that the first dimension of the ``"last"`` featurization is 1024.
    """
    from kinoml.features.ligand import MorganFingerprintFeaturizer

    bits_per_fingerprint = 512
    system = System([RDKitLigand.from_smiles("CCCC")])

    first = MorganFingerprintFeaturizer(radius=2, nbits=bits_per_fingerprint)
    second = MorganFingerprintFeaturizer(radius=2, nbits=bits_per_fingerprint)

    pipeline = Concatenated([first, second], axis=1)
    pipeline.featurize([system])

    merged = system.featurizations["last"]
    assert merged.shape[0] == 2 * bits_per_fingerprint
def test_datasetprovider_awkward_exporter_single_tensor_same_shape():
    """
    Simplest export scenario: each system is featurized into a single tensor
    and all those tensors share the same shape across systems.

    Under that assumption this test checks that:

    - ``DatasetProvider.to_numpy()`` returns an ``(X, y)`` pair whose first
      dimensions match.
    - ``DatasetProvider.to_awkward()`` returns a one-item collection of
      feature arrays plus the measurement array, with the equivalent shapes.
    """
    from kinoml.core.ligands import RDKitLigand
    from kinoml.features.ligand import MorganFingerprintFeaturizer
    from kinoml.features.core import Concatenated, TupleOfArrays
    import awkward as ak

    small_bits, large_bits = 512, 1024
    total_bits = large_bits + small_bits

    conditions = AssayConditions()
    systems = [
        System([RDKitLigand.from_smiles(smiles)]) for smiles in ("CCCCC", "CCCCCCCC")
    ]
    measurements = [
        BaseMeasurement(value, conditions=conditions, system=system)
        for value, system in zip((50, 30), systems)
    ]
    dataset = DatasetProvider(measurements=measurements)

    small_fp = MorganFingerprintFeaturizer(radius=2, nbits=small_bits)
    large_fp = MorganFingerprintFeaturizer(radius=2, nbits=large_bits)
    concatenate = Concatenated([small_fp, large_fp], axis=1)
    aggregated = TupleOfArrays([concatenate])
    aggregated.featurize(systems)

    for system in systems:
        assert system.featurizations["last"][0].shape[0] == total_bits

    # One tensor per system: the numpy exporter builds a unified X tensor
    # whose first dimension must agree with y.
    X, y = dataset.to_numpy()
    # The middle dimension (size 1) comes from the TupleOfArrays aggregation.
    assert X.shape[:3] == (2, 1, total_bits)
    assert X.shape[0] == y.shape[0]

    # The awkward exporter yields a single feature array for this setup.
    (X_awkward,), y_awkward = dataset.to_awkward()
    assert ak.to_numpy(X_awkward).shape == (2, total_bits)
    assert ak.to_numpy(y_awkward).shape == (2,)
def test_ligand_MorganFingerprintFeaturizer(smiles, solution):
    """
    Compare the 512-bit, radius-2 Morgan fingerprint of ``smiles`` against
    ``solution``, an iterable of ``int``-convertible items (one per bit).

    OFFTK _will_ add hydrogens to all ingested SMILES and export a
    canonicalized output, so the representation you get might not be the one
    you expect if you compute it directly. The ligand here is built with
    ``RDKitLigand``.
    """
    system = System([RDKitLigand.from_smiles(smiles)])

    featurizer = MorganFingerprintFeaturizer(radius=2, nbits=512)
    featurizer.featurize(system)

    expected = np.array([int(bit) for bit in solution], dtype="uint8")
    observed = system.featurizations[featurizer.name]
    assert (observed == expected).all()
def test_ligand_GraphLigandFeaturizer_RDKit(smiles, solution):
    """
    Featurize ``smiles`` with ``GraphLigandFeaturizer`` and compare the
    resulting ``(connectivity, features)`` pair against ``solution``:
    connectivity must match ``solution[0]`` exactly, features must match
    ``solution[1]`` approximately.

    OFFTK _will_ add hydrogens to all ingested SMILES and export a
    canonicalized output, so the representation you get might not be the one
    you expect if you compute it directly. That's why we use RDKitLigand here.
    """
    system = System([RDKitLigand.from_smiles(smiles)])

    graph_featurizer = GraphLigandFeaturizer()
    graph_featurizer.featurize([system])

    observed_connectivity, observed_features = system.featurizations["last"]
    assert (observed_connectivity == solution[0]).all()
    assert observed_features == pytest.approx(solution[1])
def test_TupleOfArrays():
    """
    Aggregate a 512-bit and a 1024-bit Morgan fingerprint with
    ``TupleOfArrays`` and check that the ``"last"`` featurization holds two
    arrays whose first dimensions are 512 and 1024, in that order.
    """
    from kinoml.features.ligand import MorganFingerprintFeaturizer

    expected_sizes = (512, 1024)
    system = System([RDKitLigand.from_smiles("CCCC")])

    small_fp = MorganFingerprintFeaturizer(radius=2, nbits=expected_sizes[0])
    large_fp = MorganFingerprintFeaturizer(radius=2, nbits=expected_sizes[1])

    aggregated = TupleOfArrays([small_fp, large_fp])
    aggregated.featurize([system])

    arrays = system.featurizations["last"]
    assert len(arrays) == 2
    assert arrays[0].shape[0] == expected_sizes[0]
    assert arrays[1].shape[0] == expected_sizes[1]
def test_ligand_OneHotSMILESFeaturizer(smiles, solution):
    """
    One-hot encode ``smiles`` and compare the resulting matrix, in both shape
    and content, against the transpose of ``solution``.

    OFFTK _will_ add hydrogens to all ingested SMILES and export a
    canonicalized output, so the representation you get might not be the one
    you expect if you compute it directly. That's why we use RDKitLigand here.
    """
    system = System([RDKitLigand.from_smiles(smiles)])

    featurizer = OneHotSMILESFeaturizer()
    featurizer.featurize(system)

    observed = system.featurizations[featurizer.name]
    # The featurizer output is compared against the transposed reference.
    expected = solution.T
    assert observed.shape == expected.shape
    assert (observed == expected).all()
def test_ligand_GraphLigandFeaturizer(smiles, solution):
    """
    Featurize ``smiles`` with ``GraphLigandFeaturizer`` and require an exact
    match of both graph components against ``solution``: item 0 is the
    connectivity, item 1 the features.

    OFFTK _will_ add hydrogens to all ingested SMILES and export a
    canonicalized output, so the representation you get might not be the one
    you expect if you compute it directly. That's why we use RDKitLigand here.
    """
    system = System([RDKitLigand.from_smiles(smiles)])

    featurizer = GraphLigandFeaturizer()
    featurizer.featurize(system)

    graph = system.featurizations[featurizer.name]
    observed_connectivity = graph[0]
    assert (observed_connectivity == solution[0]).all()
    observed_features = graph[1]
    assert (observed_features == solution[1]).all()
def test_datasetprovider_awkward_exporter_multiple_subtensors():
    """
    When an aggregator like ``TupleOfArrays`` is used, each system carries
    several arrays instead of a single tensor. This test featurizes two
    systems with a 512-bit and a 1024-bit Morgan fingerprint and checks that
    ``DatasetProvider.to_awkward()`` returns one feature array per
    sub-featurizer, each with one row per system.
    """
    from kinoml.core.ligands import RDKitLigand
    from kinoml.features.ligand import MorganFingerprintFeaturizer
    from kinoml.features.core import TupleOfArrays
    import awkward as ak

    small_bits, large_bits = 512, 1024

    conditions = AssayConditions()
    systems = [
        System([RDKitLigand.from_smiles(smiles)]) for smiles in ("CCCCC", "CCCCCCCC")
    ]
    measurements = [
        BaseMeasurement(value, conditions=conditions, system=system)
        for value, system in zip((50, 30), systems)
    ]
    dataset = DatasetProvider(measurements=measurements)

    small_fp = MorganFingerprintFeaturizer(radius=2, nbits=small_bits)
    large_fp = MorganFingerprintFeaturizer(radius=2, nbits=large_bits)
    aggregated = TupleOfArrays([small_fp, large_fp])
    aggregated.featurize(dataset.systems)

    for system in systems:
        arrays = system.featurizations["last"]
        assert len(arrays) == 2
        assert arrays[0].shape[0] == small_bits
        assert arrays[1].shape[0] == large_bits

    (x_small, x_large), y = dataset.to_awkward()
    assert len(x_small) == len(x_large) == len(y)
    assert ak.to_numpy(x_small).shape == (2, small_bits)
    assert ak.to_numpy(x_large).shape == (2, large_bits)
def test_datasetprovider_awkward_exporter_single_tensor_different_shapes():
    """
    When a featurizer returns arrays of a different shape for each system
    (here, one-hot encoded SMILES of different lengths), the arrays can be
    kept unpadded and exported with ``DatasetProvider.to_awkward()``.

    The test checks that each exported entry keeps its own
    ``(53, len(smiles))`` shape.
    """
    from kinoml.core.ligands import RDKitLigand
    from kinoml.features.ligand import OneHotSMILESFeaturizer
    from kinoml.features.core import TupleOfArrays
    import awkward as ak

    n_rows = 53

    conditions = AssayConditions()
    smiles = ("CCCCC", "CCCCCCCC")
    systems = [System([RDKitLigand.from_smiles(smi)]) for smi in smiles]
    measurements = [
        BaseMeasurement(value, conditions=conditions, system=system)
        for value, system in zip((50, 30), systems)
    ]
    dataset = DatasetProvider(measurements=measurements)

    one_hot = OneHotSMILESFeaturizer()
    aggregated = TupleOfArrays([one_hot])
    aggregated.featurize(dataset.systems)

    for system, smi in zip(systems, smiles):
        assert system.featurizations["last"][0].shape == (n_rows, len(smi))

    # X is returned as single-item collection thanks to TupleOfArrays
    (X,), y = dataset.to_awkward()
    assert X.type.length == len(y)
    assert ak.to_numpy(X[0]).shape == (n_rows, len(smiles[0]))
    assert ak.to_numpy(X[1]).shape == (n_rows, len(smiles[1]))
def test_datasetprovider_exporter_single_tensor_different_shapes():
    """
    When a featurizer returns arrays of a different shape for each system
    (here, one-hot encoded SMILES of different lengths), the arrays can be
    kept separate and exported with ``DatasetProvider.to_dict_of_arrays()``.

    The test checks that the exported mapping contains the feature keys
    ``"X_s0_"`` and ``"X_s1_"`` and that each one holds an array of shape
    ``(53, len(smiles))`` for the corresponding system.
    """
    from kinoml.core.ligands import RDKitLigand
    from kinoml.features.ligand import OneHotSMILESFeaturizer

    conditions = AssayConditions()
    smiles = ("CCCCC", "CCCCCCCC")
    systems = [System([RDKitLigand.from_smiles(smi)]) for smi in smiles]
    measurements = [
        BaseMeasurement(50, conditions=conditions, system=systems[0]),
        BaseMeasurement(30, conditions=conditions, system=systems[1]),
    ]
    dataset = DatasetProvider(measurements=measurements)

    featurizer = OneHotSMILESFeaturizer()
    featurizer.featurize(dataset.systems)
    for system, smi in zip(systems, smiles):
        assert system.featurizations["last"].shape == (53, len(smi))

    arrays = dataset.to_dict_of_arrays()
    # Sort once and reuse the sorted keys below, so that pairing each key with
    # its SMILES does not depend on the iteration order of the mapping.
    X_keys = sorted(k for k in arrays if k.startswith("X"))
    assert X_keys == ["X_s0_", "X_s1_"]
    for X_key, smi in zip(X_keys, smiles):
        assert arrays[X_key].shape == (53, len(smi))