def test_datasetprovider_awkward_exporter_single_tensor_same_shape():
    """
    Export a dataset in which every system is featurized as one tensor of
    the same shape.

    Two ligand-only systems are featurized with a 512-bit and a 1024-bit
    Morgan fingerprint, joined by ``Concatenated`` and wrapped in a
    single-member ``TupleOfArrays``. The test then checks that:

    - ``DatasetProvider.to_numpy()`` returns an ``(X, y)`` pair whose first
      dimensions match.
    - ``DatasetProvider.to_awkward()`` returns a one-item collection of
      feature arrays together with the measurement array.

    NOTE(review): the earlier notes on this test stated that ``.to_numpy()``
    is only expected to work under this same-shape assumption; that claim
    is not exercised here.
    """
    from kinoml.core.ligands import RDKitLigand
    from kinoml.features.ligand import MorganFingerprintFeaturizer
    from kinoml.features.core import Concatenated, TupleOfArrays
    import awkward as ak

    # Width of the concatenated fingerprint checked throughout the test.
    expected_width = 1024 + 512

    conditions = AssayConditions()
    systems = [
        System([RDKitLigand.from_smiles(smiles_string)])
        for smiles_string in ("CCCCC", "CCCCCCCC")
    ]
    n_systems = len(systems)
    measurements = [
        BaseMeasurement(value, conditions=conditions, system=system)
        for value, system in zip((50, 30), systems)
    ]
    dataset = DatasetProvider(measurements=measurements)

    concatenate = Concatenated(
        [
            MorganFingerprintFeaturizer(radius=2, nbits=512),
            MorganFingerprintFeaturizer(radius=2, nbits=1024),
        ],
        axis=1,
    )
    TupleOfArrays([concatenate]).featurize(systems)

    for system in systems:
        first_array = system.featurizations["last"][0]
        assert first_array.shape[0] == expected_width

    # One tensor per system: the numpy exporter yields a single X tensor.
    X, y = dataset.to_numpy()
    # The singleton axis in the middle is attributed to the TupleOfArrays
    # aggregation.
    assert X.shape[:3] == (n_systems, 1, expected_width)
    # First dimension in X and y must match.
    assert X.shape[0] == y.shape[0]

    # The awkward exporter yields one feature array (unpacked from a
    # one-item collection) plus the measurements.
    feature_arrays, ya = dataset.to_awkward()
    (Xa,) = feature_arrays
    assert ak.to_numpy(Xa).shape == (n_systems, expected_width)
    assert ak.to_numpy(ya).shape == (n_systems,)
def test_measurements():
    """
    Check construction and value-based comparison of ``BaseMeasurement``.

    A measurement built from a value, conditions and a system must be a
    ``BaseMeasurement`` instance, compare equal to another one built from
    the same arguments, and compare unequal when only the value differs.
    """
    # PercentageDisplacementMeasurement is not used below; it is kept so the
    # import itself still fails if the name is missing.
    from kinoml.core.measurements import BaseMeasurement, PercentageDisplacementMeasurement
    from kinoml.core.conditions import AssayConditions
    from kinoml.core.components import MolecularComponent
    from kinoml.core.systems import System

    conditions = AssayConditions()
    system = System([MolecularComponent()])

    def _measure(value):
        # Every measurement in this test shares the same conditions/system.
        return BaseMeasurement(value, conditions=conditions, system=system)

    measurement = _measure(50)

    assert isinstance(measurement, BaseMeasurement)
    # Same value, conditions and system -> equal.
    assert measurement == _measure(50)
    # Different value only -> not equal.
    assert measurement != _measure(10)
def test_datasetprovider_unique_conditions():
    """
    Check that measurements sharing one ``AssayConditions`` object yield a
    provider that exposes exactly that one condition.

    This test used to be called ``test_datasetprovider``, which is also the
    name of a test defined later in this module. The later ``def`` rebinds
    the module-level name, so this body was never collected by pytest. It
    now has a unique name so that both tests run.

    Relies on ``AssayConditions``, ``BaseMeasurement``, ``System``,
    ``MolecularComponent`` and ``DatasetProvider`` being available at
    module level.
    """
    conditions = AssayConditions()
    measurements = [
        BaseMeasurement(50, conditions=conditions, system=System([MolecularComponent()])),
        BaseMeasurement(30, conditions=conditions, system=System([MolecularComponent()])),
    ]
    dataset = DatasetProvider(measurements=measurements)

    # Both measurements share one conditions object, so only one is expected.
    assert len(dataset.conditions) == 1
    assert next(iter(dataset.conditions)) == conditions
def test_datasetprovider():
    """
    Build a ``DatasetProvider`` with an explicit featurizer list and check
    the conditions it reports.

    Two measurements share a single ``AssayConditions`` object, so the
    provider is expected to report exactly one condition, equal to it.
    """
    from kinoml.datasets.core import DatasetProvider
    from kinoml.core.systems import System
    from kinoml.core.components import MolecularComponent
    from kinoml.core.measurements import BaseMeasurement
    from kinoml.core.conditions import AssayConditions
    from kinoml.features.core import BaseFeaturizer

    shared_conditions = AssayConditions()
    # Each measurement gets its own freshly built system.
    measurements = [
        BaseMeasurement(
            value,
            conditions=shared_conditions,
            system=System([MolecularComponent()]),
        )
        for value in (50, 30)
    ]
    provider = DatasetProvider(
        measurements=measurements,
        featurizers=[BaseFeaturizer()],
    )

    assert len(provider.conditions) == 1
    only_condition = next(iter(provider.conditions))
    assert only_condition == shared_conditions
def test_datasetprovider_awkward_exporter_multiple_subtensors():
    """
    Export a dataset in which every system is featurized as several arrays.

    ``TupleOfArrays`` wraps two Morgan fingerprint featurizers (512 and
    1024 bits), so each system ends up with two arrays under the ``"last"``
    featurization key. ``DatasetProvider.to_awkward()`` is then expected to
    return one feature array per member featurizer, plus the measurements,
    all with the same length.
    """
    from kinoml.core.ligands import RDKitLigand
    from kinoml.features.ligand import MorganFingerprintFeaturizer
    from kinoml.features.core import TupleOfArrays
    import awkward as ak

    # Fingerprint sizes, in the order the featurizers are aggregated.
    bit_sizes = (512, 1024)

    conditions = AssayConditions()
    systems = [
        System([RDKitLigand.from_smiles(smiles_string)])
        for smiles_string in ("CCCCC", "CCCCCCCC")
    ]
    measurements = [
        BaseMeasurement(value, conditions=conditions, system=system)
        for value, system in zip((50, 30), systems)
    ]
    dataset = DatasetProvider(measurements=measurements)

    fingerprinters = [
        MorganFingerprintFeaturizer(radius=2, nbits=bits) for bits in bit_sizes
    ]
    TupleOfArrays(fingerprinters).featurize(dataset.systems)

    for system in systems:
        per_system_arrays = system.featurizations["last"]
        assert len(per_system_arrays) == 2
        for position, bits in enumerate(bit_sizes):
            assert per_system_arrays[position].shape[0] == bits

    (x1, x2), y = dataset.to_awkward()
    # One row per system in each exported array.
    assert len(x1) == len(x2) == len(y)
    assert ak.to_numpy(x1).shape == (2, bit_sizes[0])
    assert ak.to_numpy(x2).shape == (2, bit_sizes[1])
def test_datasetprovider_awkward_exporter_single_tensor_different_shapes():
    """
    Export a dataset whose systems featurize to arrays of different shapes.

    ``OneHotSMILESFeaturizer`` (wrapped in a single-member
    ``TupleOfArrays``) yields one array per system whose second dimension
    equals the length of the SMILES string, so the two systems differ in
    shape. The arrays are kept separate and exported with
    ``DatasetProvider.to_awkward()``; padding them to a common shape is an
    alternative that is not exercised here.
    """
    from kinoml.core.ligands import RDKitLigand
    from kinoml.features.ligand import OneHotSMILESFeaturizer
    from kinoml.features.core import TupleOfArrays
    import awkward as ak

    # First dimension of every one-hot array asserted below.
    # NOTE(review): presumably the size of the one-hot alphabet -- confirm.
    n_rows = 53

    conditions = AssayConditions()
    smiles = ("CCCCC", "CCCCCCCC")
    systems = [System([RDKitLigand.from_smiles(smi)]) for smi in smiles]
    measurements = [
        BaseMeasurement(value, conditions=conditions, system=system)
        for value, system in zip((50, 30), systems)
    ]
    dataset = DatasetProvider(measurements=measurements)

    TupleOfArrays([OneHotSMILESFeaturizer()]).featurize(dataset.systems)

    for system, smi in zip(systems, smiles):
        one_hot = system.featurizations["last"][0]
        assert one_hot.shape == (n_rows, len(smi))

    # The features come back as a one-item collection because of the
    # single-member TupleOfArrays wrapper.
    feature_arrays, y = dataset.to_awkward()
    (X,) = feature_arrays
    assert X.type.length == len(y)
    for index, smi in enumerate(smiles):
        assert ak.to_numpy(X[index]).shape == (n_rows, len(smi))
def test_datasetprovider_exporter_single_tensor_different_shapes():
    """
    Export differently-shaped per-system arrays with
    ``DatasetProvider.to_dict_of_arrays()``.

    ``OneHotSMILESFeaturizer`` yields one array per system whose second
    dimension equals the length of the SMILES string, so the two systems
    differ in shape. The arrays are kept separate and exported as a mapping
    whose feature keys are expected to be ``"X_s0_"`` and ``"X_s1_"``.
    Padding the arrays to a common shape is an alternative that is not
    exercised here.

    Relies on ``AssayConditions``, ``BaseMeasurement``, ``System`` and
    ``DatasetProvider`` being available at module level.
    """
    from kinoml.core.ligands import RDKitLigand
    from kinoml.features.ligand import OneHotSMILESFeaturizer

    conditions = AssayConditions()
    smiles = ("CCCCC", "CCCCCCCC")
    systems = [System([RDKitLigand.from_smiles(smi)]) for smi in smiles]
    measurements = [
        BaseMeasurement(50, conditions=conditions, system=systems[0]),
        BaseMeasurement(30, conditions=conditions, system=systems[1]),
    ]
    dataset = DatasetProvider(measurements=measurements)

    featurizer = OneHotSMILESFeaturizer()
    featurizer.featurize(dataset.systems)
    for system, smi in zip(systems, smiles):
        assert system.featurizations["last"].shape == (53, len(smi))

    arrays = dataset.to_dict_of_arrays()
    # Sort once and reuse the sorted list, so that pairing keys with SMILES
    # below does not depend on the mapping's iteration order.
    X_keys = sorted(key for key in arrays if key.startswith("X"))
    assert X_keys == ["X_s0_", "X_s1_"]
    for X_key, smi in zip(X_keys, smiles):
        assert arrays[X_key].shape == (53, len(smi))