def test_RandomForestRegressionLolo():
    """Simple examples.

    Lolo requires at least 8 training points.
    """

    # Constant target: predicted means should equal the constant, and
    # predictive uncertainties should vanish.
    training = smlb.TabularData(
        data=np.arange(-4, 5).reshape(-1, 1),
        labels=np.ones(9, dtype=int),
    )
    queries = smlb.TabularData(data=np.array([[-4], [-2], [0], [3], [4]]))

    model = RandomForestRegressionLolo(num_trees=10)
    dist = model.fit(training).apply(queries)
    assert np.allclose(dist.mean, np.ones(5))
    assert np.allclose(dist.stddev, np.zeros(5))

    # Without jackknife: delta distributions (zero standard deviation).
    model = RandomForestRegressionLolo(num_trees=10, use_jackknife=False)
    dist = model.fit(training).apply(queries)
    assert np.allclose(dist.mean, np.ones(5))
    assert np.allclose(dist.stddev, np.zeros(5))
def test_RandomForestRegressionLolo_2():
    """Non-trivial test case, including standard deviation."""

    num_train, num_valid, extent = 100, 600, 10

    def noisy_linear(k, seed):
        """1-d linear function y = 2x + 1 with unit normal label noise."""
        x = np.linspace(-extent / 2, +extent / 2, k).reshape(k, 1)
        ds = smlb.TabularData(data=x, labels=(x * 2 + 1).flatten())
        return smlb.LabelNoise(noise=smlb.NormalNoise(rng=seed)).fit(ds).apply(ds)

    train_data = noisy_linear(num_train, 0)
    valid_data = noisy_linear(num_valid, 1)

    model = RandomForestRegressionLolo()
    preds = model.fit(train_data).apply(valid_data)
    mae = smlb.MeanAbsoluteError().evaluate(valid_data.labels(), preds)

    # For perfect predictions, the expected MAE is 2/sqrt(pi) ~ 1.1284:
    # the mean absolute difference of two independent unit normal draws.
    assert np.allclose(mae, 1.13, atol=0.25)
    assert np.allclose(np.median(preds.stddev), 1, atol=0.5)
def test_GradientBoostedTreesRegressionSklearn_2():
    """Simple examples: linear 1-d function."""

    # identity function y = x on a regular grid
    xs = np.asarray([[-2], [-1.5], [-1], [-0.5], [0], [0.5], [1], [1.5], [2]])
    train_data = smlb.TabularData(data=xs, labels=xs.flatten())

    model = GradientBoostedTreesRegressionSklearn(rng=1, uncertainties=None)  # "naive"
    model.fit(train_data)
    mean = model.apply(smlb.TabularData(data=np.asarray([[-1], [0], [1]]))).mean
    assert np.allclose(mean, [-1, 0, 1], atol=0.2)

    # stddev = model.apply(smlb.TabularData(data=[[-2], [0], [2]])).stddev
    # assert stddev[0] > stddev[1] < stddev[2]

    # without uncertainties (None is the default)
    model = GradientBoostedTreesRegressionSklearn(rng=1)
    model.fit(train_data)

    preds = model.apply(smlb.TabularData(data=np.asarray([[-1], [0], [1]])))
    assert np.allclose(preds.mean, [-1, 0, 1], atol=0.2)
    assert isinstance(preds, smlb.DeltaPredictiveDistribution)
def test_GradientBoostedTreesRegressionSklearn_4():
    """Simple examples."""

    # Constant function; for constant labels, expected uncertainties are zero.
    train_data = smlb.TabularData(
        data=np.asarray([[i] for i in range(-4, 5)]),
        labels=np.asarray([1] * 9),
    )
    valid_data = smlb.TabularData(data=np.asarray([[-4], [-2], [0], [3], [4]]))

    model = GradientBoostedTreesRegressionSklearn(
        n_estimators=10, uncertainties=None, rng=0)  # "naive"
    preds = model.fit(train_data).apply(valid_data)
    mean, stddev = preds.mean, preds.stddev
    assert np.allclose(mean, np.ones(5))
    # assert np.allclose(stddev, np.zeros(5))

    # delta distributions (zero standard deviation)
    model = GradientBoostedTreesRegressionSklearn(
        n_estimators=10, uncertainties=None, rng=0)
    preds = model.fit(train_data).apply(valid_data)
    mean, stddev = preds.mean, preds.stddev
    assert np.allclose(mean, np.ones(5))
def test_RandomForestRegressionSklearn_2():
    """Simple examples: linear 1-d function."""

    model = RandomForestRegressionSklearn(
        rng=1, uncertainties="naive", correlations="naive")
    xs = np.array([[-2], [-1.5], [-1], [-0.5], [0], [0.5], [1], [1.5], [2]])
    train_data = smlb.TabularData(data=xs, labels=xs.flatten())
    model.fit(train_data)

    mean = model.apply(smlb.TabularData(data=np.array([[-1], [0], [1]]))).mean
    assert np.allclose(mean, [-1, 0, 1], atol=0.2)

    # uncertainty grows towards the edges of the training domain
    stddev = model.apply(smlb.TabularData(data=np.array([[-2], [0], [2]]))).stddev
    assert stddev[0] > stddev[1] < stddev[2]

    corr = model.apply(smlb.TabularData(data=np.array([[-1], [0], [1]]))).corr
    assert corr.shape == (len(mean), len(mean))
    assert np.allclose(
        corr,
        [[1, -0.08, -0.05], [-0.08, 1, -0.023], [-0.05, -0.023, 1]],
        rtol=0.1,
    )

    # without uncertainties (None is the default)
    model = RandomForestRegressionSklearn(rng=1)
    model.fit(train_data)

    preds = model.apply(smlb.TabularData(data=np.array([[-1], [0], [1]])))
    assert np.allclose(preds.mean, [-1, 0, 1], atol=0.2)
    assert isinstance(preds, smlb.DeltaPredictiveDistribution)
def test_GradientBoostedTreesRegressionSklearn_1():
    """Simple example: constant 1-d function."""

    # For constant labels, expected uncertainties are zero.
    train_data = smlb.TabularData(
        data=np.asarray([[i] for i in range(-4, 5)]),
        labels=np.asarray([1] * 9),
    )
    valid_data = smlb.TabularData(data=np.asarray([[-4], [-2], [0], [3], [4]]))

    # The original exercised the identical configuration twice; a loop
    # preserves that behavior.
    for _ in range(2):
        model = GradientBoostedTreesRegressionSklearn(rng=1, uncertainties=None)
        preds = model.fit(train_data).apply(valid_data)
        mean, stddev = preds.mean, preds.stddev
        assert np.allclose(mean, np.ones(5))
        # assert np.allclose(stddev, np.zeros(5))

    assert isinstance(preds, smlb.DeltaPredictiveDistribution)
예제 #7
0
def test_GaussianProcessRegressionSklearn_2():
    """All predictive distributions.

    Linear noise-free function, linear kernel + white noise kernel.
    The optimized noise level is expected to go to its lower bound.
    """

    noise_level = 1e-5  # fixed via identical lower and upper bounds
    kernel = skl.gaussian_process.kernels.DotProduct(
        sigma_0=0, sigma_0_bounds="fixed"
    ) + skl.gaussian_process.kernels.WhiteKernel(
        noise_level=0.1, noise_level_bounds=(noise_level, noise_level)
    )
    gpr = GaussianProcessRegressionSklearn(kernel=kernel, random_state=1)

    n = 100
    train_data = smlb.TabularData(
        data=np.full((n, 1), 2.0), labels=np.full(n, 3.0))
    valid_data = smlb.TabularData(data=train_data.samples())
    preds = gpr.fit(train_data).apply(valid_data)

    assert preds.has_signal_part and preds.has_noise_part
    signal, noise = preds.signal_part, preds.noise_part

    assert np.allclose(signal.mean, train_data.labels())
    assert np.allclose(signal.stddev, np.full(n, np.sqrt(noise_level)), atol=1e-3)

    # total predictive variance = signal variance + noise variance
    assert (preds.mean == signal.mean).all()
    assert np.allclose(preds.stddev, np.sqrt(np.square(signal.stddev) + noise_level))

    assert np.allclose(noise.mean, np.zeros(n))
    assert np.allclose(noise.stddev, np.sqrt(noise_level))
예제 #8
0
    def mixed_labeled_setup(n):
        """Populate globals ds1, ds2 with labeled mixed-dtype datasets.

        ds1 has n rows with a planted duplicate pair; ds2 has 2 * n rows
        containing two of ds1's planted rows.
        """
        global ds1, ds2

        sample_dtype = [("A", float), ("B", "U1"), ("C", float)]
        label_dtype = [("X", "U2"), ("Y", int)]

        # NOTE: np.random calls happen in the same order as the original
        # (data triple, then label pair, per dataset) to keep any seeded
        # random state reproducible.
        def make(k):
            a = np.random.uniform(size=k)
            b = np.random.choice(list(string.ascii_letters), k)
            c = np.random.uniform(size=k)
            data = np.array(list(zip(a, b, c)), dtype=sample_dtype)
            x = np.random.choice(list(string.ascii_letters), k)
            y = np.random.randint(32000, size=k)
            labels = np.array(list(zip(x, y)), dtype=label_dtype)
            return data, labels

        data1, labels1 = make(n)
        data1[int(0.33 * n)] = (1, "b", 3)
        data1[int(0.83 * n)] = data1[int(0.01 * n)] = (4, "c", 6)  # duplicate
        labels1[int(0.33 * n)] = ("xx", 22)
        labels1[int(0.83 * n)] = labels1[int(0.01 * n)] = ("yy", 55)  # duplicate
        ds1 = smlb.TabularData(data=data1, labels=labels1)

        data2, labels2 = make(2 * n)
        data2[int(1.9 * n)] = (1, "b", 3)
        data2[int(0.5 * n)] = (4, "c", 6)
        labels2[int(1.9 * n)] = ("xx", 22)
        labels2[int(0.5 * n)] = ("yy", 55)
        ds2 = smlb.TabularData(data=data2, labels=labels2)
예제 #9
0
def test_MatminerCompositionFeatures_1():
    """Simple examples."""

    def featurize(dataset, **kwargs):
        """Fit-and-apply a featurizer with the given options."""
        return MatminerCompositionFeatures(**kwargs).fit(data=dataset).apply(data=dataset)

    # callable, without labels
    data = smlb.TabularData(data=np.array(["LiF", "Sb2Te3"]))
    feat = featurize(data)
    assert isinstance(feat, smlb.TabularData)
    assert feat.is_finite and not feat.is_labeled
    smlb.params.real_matrix(feat.samples())  # must not raise

    # callable, with labels
    data = smlb.TabularData(data=np.array(["LiF", "Sb2Te3"]), labels=np.array([1.0, 2.0]))
    feat = featurize(data)
    assert isinstance(feat, smlb.TabularData)
    smlb.params.real_matrix(feat.samples())  # must not raise
    smlb.params.real_vector(feat.labels())  # must not raise

    # third example
    data = smlb.TabularData(data=np.array(["Al2O3", "Ni1.8W.05Al0.4"]))
    feat = featurize(data, ionic_fast=True)
    assert isinstance(feat, smlb.TabularData)
    smlb.params.real_matrix(feat.samples())  # must not raise
def test_ExtremelyRandomizedTreesRegressionSklearn_1():
    """Simple examples."""

    # Constant function; for constant labels, expected uncertainties are zero.
    train_data = smlb.TabularData(
        data=np.array([[i] for i in range(-4, 5)]),
        labels=np.array([1] * 9),
    )
    valid_data = smlb.TabularData(data=np.array([[-4], [-2], [0], [3], [4]]))

    # "naive" uncertainties first, then None (delta distributions)
    for unc in ("naive", None):
        model = ExtremelyRandomizedTreesRegressionSklearn(
            n_estimators=10, uncertainties=unc, random_state=0)
        preds = model.fit(train_data).apply(valid_data)
        assert np.allclose(preds.mean, np.ones(5))
        assert np.allclose(preds.stddev, np.zeros(5))

    assert isinstance(preds, smlb.DeltaPredictiveDistribution)
def test_ChemistryDevelopmentKitMoleculeFeatures_1():
    """Simple examples."""

    # Specific descriptors: citric acid has three carboxylic groups.
    citric_acid = "OC(=O)CC(O)(C(=O)O)CC(=O)O"
    data = smlb.TabularData(data=np.array([citric_acid]))
    # The order of `select` deliberately differs from the order in which the
    # descriptors are defined in ChemistryDevelopmentKitMoleculeFeatures, to
    # verify that computation follows the order specified by `select`.
    features = ChemistryDevelopmentKitMoleculeFeatures(
        select=["acidic_group_count", "bond_count", "atom_count"],
    ).fit(data).apply(data)
    assert features.samples()[0][0] == 3
    assert features.samples()[0][1] == 12
    assert features.samples()[0][2] == 21

    # All descriptors: citric acid and benzene.
    data = smlb.TabularData(data=np.array([citric_acid, "c1ccccc1"]))
    features = ChemistryDevelopmentKitMoleculeFeatures(
        select=ChemistryDevelopmentKitMoleculeFeatures.PRESET_ALL,
    ).fit(data).apply(data)
    total_dim = sum(
        v[1] for v in ChemistryDevelopmentKitMoleculeFeatures.DESCRIPTORS.values())
    assert len(features.samples()[0]) == total_dim

    # Pre-sets.
    features = ChemistryDevelopmentKitMoleculeFeatures(
        select=ChemistryDevelopmentKitMoleculeFeatures.PRESET_ROBUST,
    ).fit(data).apply(data)

    # Fragile descriptors.
    data = smlb.TabularData(data=np.array(["CCCCl"]))
    row = ChemistryDevelopmentKitMoleculeFeatures(
        select=["alogp"]).fit(data).apply(data).samples()[0]
    assert np.allclose((row[0], row[2]), (1.719, 20.585), atol=0.01)

    # Raise for unknown descriptors.
    with pytest.raises(smlb.InvalidParameterError):
        ChemistryDevelopmentKitMoleculeFeatures(select=["atoms_counts"])

    # Raise for invalid cdk_path.
    with pytest.raises(smlb.InvalidParameterError):
        ChemistryDevelopmentKitMoleculeFeatures(
            CdkJavaGateway(cdk_jar_path="/nonexisting/path/to/cdk.jar"))

    # todo: temporary fix for problems in the interaction between
    # ChemistryDevelopmentKitMoleculeFeatures and lolopy: if the CDK
    # JavaGateway is not shut down, lolopy hangs on querying the port
    # number of its server (timeout in lolopy.loloserver.get_java_gateway).
    CdkJavaGateway()._shutdown_gateway()
예제 #12
0
    def numeric_unlabeled_setup(n: int):
        """Populate globals ds1 (n rows, with duplicates) and ds2 (2 * n rows)."""
        global ds1, ds2

        def make(k, planted):
            """Random (k, 3) matrix with specific rows overwritten."""
            samples = np.random.uniform(size=(k, 3))
            for idx, row in planted:
                samples[idx] = row
            return samples

        data1 = make(n, [
            (int(0.33 * n), [1, 2, 3]),
            (int(0.83 * n), [4, 5, 6]),  # duplicate pair ...
            (int(0.01 * n), [4, 5, 6]),  # ... with this row
        ])
        ds1 = smlb.TabularData(data=data1)

        data2 = make(2 * n, [
            (int(1.9 * n), [1, 2, 3]),
            (int(0.5 * n), [4, 5, 6]),
        ])
        ds2 = smlb.TabularData(data=data2)
예제 #13
0
def test_GaussianProcessRegressionSklearn_1():
    """Simple examples."""

    # Linear function with a linear (dot-product) kernel; no optimization.
    kernel = skl.gaussian_process.kernels.DotProduct(sigma_0=0, sigma_0_bounds="fixed")
    gpr = GaussianProcessRegressionSklearn(kernel=kernel, optimizer=None, random_state=1)
    training = smlb.TabularData(data=np.array([[-1], [1]]), labels=np.array([-1, 1]))
    queries = smlb.TabularData(data=np.array([[-2], [-1], [0], [1], [2]]))
    preds = gpr.fit(training).apply(queries)

    # mean interpolates/extrapolates linearly;
    # uncertainty grows with distance from the training data
    assert np.allclose(preds.mean, [-2, -1, 0, 1, 2])
    s = preds.stddev
    assert s[0] > s[1] > s[2] < s[3] < s[4]
예제 #14
0
def test_RandomForestRegressionSklearn_1():
    """Simple example: constant 1-d function."""

    # For constant labels, expected uncertainties are zero.
    training = smlb.TabularData(
        data=np.array([[i] for i in range(-4, 5)]),
        labels=np.array([1] * 9),
    )
    queries = smlb.TabularData(data=np.array([[-4], [-2], [0], [3], [4]]))
    model = RandomForestRegressionSklearn(random_state=1, uncertainties="naive")
    preds = model.fit(training).apply(queries)

    assert np.allclose(preds.mean, np.ones(5))
    assert np.allclose(preds.stddev, np.zeros(5))
예제 #15
0
 def _create_ds_ss(size=10, draw=3, labels=False, rng=0):
     """Return a (dataset, sampler) pair for subset-sampling tests.

     The dataset holds `size` consecutive integers; labels, if requested,
     are the data shifted by one. The sampler draws `draw` samples.
     """
     label_values = (np.arange(size) + 1) if labels else None
     dataset = smlb.TabularData(data=np.arange(size), labels=label_values)
     sampler = smlb.RandomSubsetSampler(size=draw, rng=rng)
     return dataset, sampler
예제 #16
0
def test_InverseTransformation():
    """Simple example."""

    class Stringify(smlb.DataTransformation, smlb.InvertibleTransformation):
        """Transforms integers to strings."""

        def fit(self, data):
            return self

        def apply(self, data):
            return [str(row[0]) for row in data.samples()]

        def inverse(self):
            return Destringify()

    class Destringify(smlb.DataValuedTransformation):
        """Transforms strings back to integers."""

        def fit(self, data):
            return self

        def apply(self, data):
            return smlb.TabularData(data=np.array([[int(s)] for s in data]))

    original = smlb.TabularData(np.array([[1], [2], [3], [5], [8]]))
    transformed = Stringify().fit(original).apply(original)
    assert transformed == ["1", "2", "3", "5", "8"]

    # the inverse transformation does not need to be fitted
    recovered = Stringify().fit(original).inverse().apply(transformed)
    assert all(recovered.samples() == original.samples())
예제 #17
0
def test_MatminerCompositionFeatures_2():
    """Test that feature subsets can be applied individually."""

    data = smlb.TabularData(data=np.array(["V4O5", "Ni87.3Al10Cu3.3Co.23"]))

    def featurize(**kwargs):
        """Fit-and-apply a featurizer with the given options."""
        return MatminerCompositionFeatures(**kwargs).fit(data=data).apply(data=data)

    full_a = featurize(select="all", ionic_fast=True)
    full_b = featurize(
        select=("stoichiometry", "elemental", "ionic", "valence"), ionic_fast=True)

    parts = [
        featurize(select="stoichiometry"),
        featurize(select="elemental"),
        featurize(select="ionic", ionic_fast=True),
        featurize(select="valence"),
    ]

    # stacking the individual featurizations recovers the full featurization
    recombined = np.hstack([part.samples() for part in parts])
    assert (recombined == full_a.samples()).all()
    assert (full_a.samples() == full_b.samples()).all()
예제 #18
0
def test_GaussianProcessRegressionSklearn_3():
    """All predictive distributions.

    Linear noisy function, linear kernel + white noise kernel.
    The optimized noise level is expected to go to its true value.
    """

    kernel = skl.gaussian_process.kernels.DotProduct(
        sigma_0=0, sigma_0_bounds="fixed"
    ) + skl.gaussian_process.kernels.WhiteKernel(noise_level=1, noise_level_bounds=(1e-5, 1e5))
    gpr = GaussianProcessRegressionSklearn(kernel=kernel, random_state=1)

    n, noise_sd = 100, 0.5
    data = smlb.TabularData(data=np.full((n, 1), 2.0), labels=np.full(n, 3.0))
    data = smlb.LabelNoise(noise=smlb.NormalNoise(stddev=noise_sd, rng=1)).fit(data).apply(data)
    preds = gpr.fit(data).apply(data)

    assert preds.has_signal_part and preds.has_noise_part
    signal, noise = preds.signal_part, preds.noise_part

    assert np.allclose(signal.mean, np.full(n, 3.0), atol=1e-1)
    assert np.allclose(signal.stddev, np.full(n, noise_sd), atol=1e-1)

    # total predictive variance = signal variance + noise variance
    assert (preds.mean == signal.mean).all()
    assert np.allclose(
        preds.stddev, np.sqrt(np.square(signal.stddev) + np.square(noise_sd)), atol=1e-1)

    assert np.allclose(noise.mean, np.zeros(n))
    assert np.allclose(noise.stddev, noise_sd, atol=1e-1)
예제 #19
0
def test_RandomForestRegressionSklearn_3():
    """Ensure predictions are identical independent of uncertainties method used."""

    xs = np.array([[-2], [-1.5], [-1], [-0.5], [0], [0.5], [1], [1.5], [2]])
    train_data = smlb.TabularData(data=xs, labels=xs.flatten())
    queries = smlb.TabularData(data=np.array([[-3], [-1], [0], [0.5], [1], [2]]))

    means = [
        RandomForestRegressionSklearn(random_state=1, uncertainties=unc)
        .fit(train_data)
        .apply(queries)
        .mean
        for unc in (None, "naive")
    ]
    assert np.allclose(means[0], means[1], atol=1e-6)
def test_ChemistryDevelopmentKitMoleculeFeatures_2():
    """Failures during SMILES parsing."""

    unparsable = "[NH]c1cc[nH]nn1"

    def atom_count_features(failmode):
        """Featurizer computing atom_count with the given failure mode."""
        return ChemistryDevelopmentKitMoleculeFeatures(
            select=["atom_count"], failmode=failmode)

    # "raise": the unparsable SMILES aborts featurization
    data = smlb.TabularData(data=np.array([unparsable]))
    with pytest.raises(smlb.BenchmarkError):
        atom_count_features("raise").fit(data).apply(data)

    # "drop": the failing sample is removed from the result
    data = smlb.TabularData(data=np.array(["N", unparsable, "O"]))
    result = atom_count_features("drop").fit(data).apply(data)
    assert (result.samples() == [[4], [3]]).all()

    # "mask": a boolean mask marks the failing sample
    data = smlb.TabularData(data=np.array(["N", unparsable, "O"]))
    mask = np.empty(3, dtype=bool)
    atom_count_features(("mask", mask)).fit(data).apply(data)
    assert (mask == [False, True, False]).all()

    # "index": indices of failing samples are collected
    data = smlb.TabularData(data=np.array(["N", unparsable, "O"]))
    index = []
    atom_count_features(("index", index)).fit(data).apply(data)
    assert index == [1]

    # todo: temporary fix for problems in the interaction between
    # ChemistryDevelopmentKitMoleculeFeatures and lolopy: if the CDK
    # JavaGateway is not shut down, lolopy hangs on querying the port
    # number of its server (timeout in lolopy.loloserver.get_java_gateway).
    CdkJavaGateway()._shutdown_gateway()
예제 #21
0
    def numeric_labeled_setup(n: int):
        """Populate globals ds1, ds2 with labeled numeric datasets.

        ds1 has n rows with a planted duplicate pair; ds2 has 2 * n rows
        containing two of ds1's planted rows.
        """
        global ds1, ds2

        # NOTE: np.random calls happen in the same order as the original
        # (samples, then labels, per dataset) to keep seeded state reproducible.
        def make(k, planted):
            samples = np.random.uniform(size=(k, 3))
            labels = np.random.uniform(size=(k, 2))
            for idx, row, label in planted:
                samples[idx] = row
                labels[idx] = label
            return samples, labels

        data1, labels1 = make(n, [
            (int(0.33 * n), [1, 2, 3], [11, 22]),
            (int(0.83 * n), [4, 5, 6], [44, 55]),  # duplicate pair ...
            (int(0.01 * n), [4, 5, 6], [44, 55]),  # ... with this row
        ])
        ds1 = smlb.TabularData(data=data1, labels=labels1)

        data2, labels2 = make(2 * n, [
            (int(1.9 * n), [1, 2, 3], [11, 22]),
            (int(0.5 * n), [4, 5, 6], [44, 55]),
        ])
        ds2 = smlb.TabularData(data=data2, labels=labels2)
예제 #22
0
    def mixed_unlabeled_setup(n: int):
        """Populate globals ds1 (n rows, with duplicates) and ds2 (2 * n rows)
        with unlabeled mixed-dtype datasets."""
        global ds1, ds2

        dtype = [("A", float), ("B", "U1"), ("C", float)]

        # NOTE: np.random calls happen in the same order as the original
        # to keep seeded state reproducible.
        def make(k):
            a = np.random.uniform(size=k)
            b = np.random.choice(list(string.ascii_letters), k)
            c = np.random.uniform(size=k)
            return np.array(list(zip(a, b, c)), dtype=dtype)

        data1 = make(n)
        data1[int(0.33 * n)] = (1, "b", 3)
        data1[int(0.83 * n)] = data1[int(0.01 * n)] = (4, "c", 6)  # duplicate
        ds1 = smlb.TabularData(data=data1)

        data2 = make(2 * n)
        data2[int(1.9 * n)] = (1, "b", 3)
        data2[int(0.5 * n)] = (4, "c", 6)
        ds2 = smlb.TabularData(data=data2)
예제 #23
0
def test_DataTransformationFailureMode_no_duplicates():
    """Test that only unique indices are returned."""

    dataset = smlb.TabularData(data=np.arange(10))
    fails = []
    failmode = smlb.DataTransformationFailureMode(("index", fails), dataset.num_samples)
    for i in (1, 5, 6, 5):  # index 5 is reported twice, must appear once
        failmode.handle_failure(i)
    dataset = failmode.finalize(dataset)

    assert dataset.num_samples == 10
    assert fails == [1, 5, 6]
예제 #24
0
def test_MatminerCompositionFeatures_3():
    """Test specific values for each feature group.

    These tests compute all wrapped feature groups
    (e.g., stoichiometry, elemental, ionic, valence)
    for reference systems (e.g., Fe2 O3) and compare
    against reference values provided by the matminer tests.

    Reference values from matminer `test_composition.py`:
    https://github.com/hackingmaterials/matminer/blob/master/matminer/featurizers/tests/test_composition.py

    The tests proceed according to this scheme:
    ```
        # create an (unlabeled) dataset containing one or two chemical sum formulas
        data = smlb.TabularData(data=["compound(s) formula", ...])
        # compute a specific group of matminer features; some accept parameters passed through to matminer
        mmf = MatminerCompositionFeatures(select="group", pass-through parameters)
        # compute the features; mmf is now a dataset that contains feature vectors
        mmf = mmf.fit(data).apply(data)
        # compare the i-th feature of first sample versus matminer reference value
        assert np.allclose(mmf.samples()[0][i], reference_value)
    ```
    The reference values are taken from matminer. They do not have any meaning beyond
    having been computed there. This test only verifies that the smlb wrapper returns
    the same values as the original matminer call for selected test cases.
    """

    # stoichiometry

    # default
    data = smlb.TabularData(data=np.array(["Fe2O3"]))
    mmf = MatminerCompositionFeatures(select="stoichiometry").fit(data).apply(data)
    assert mmf.samples()[0][0] == 2
    assert np.allclose(mmf.samples()[0][-2], 0.604895199)

    # user-defined norms
    mmf = (
        MatminerCompositionFeatures(select="stoichiometry", stoichiometry_p_list=(7, 0))
        .fit(data)
        .apply(data)
    )
    assert np.allclose(mmf.samples()[0][0], 0.604895199)
    assert mmf.samples()[0][1] == 2

    # invariance to amounts
    data = smlb.TabularData(np.array(["FeO", "Fe0.5O0.5", "Fe2O2"]))
    mmf = MatminerCompositionFeatures(select="stoichiometry").fit(data).apply(data)
    assert np.allclose(mmf.samples()[0], mmf.samples()[1])
    assert np.allclose(mmf.samples()[0], mmf.samples()[2])

    # elemental

    # magpie
    data = smlb.TabularData(np.array(["Fe2O3"]))
    mmf = MatminerCompositionFeatures(select="elemental").fit(data).apply(data)
    assert np.allclose(mmf.samples()[0][:6], [8, 26, 18, 15.2, 8.64, 8])

    # ionic

    # default
    data = smlb.TabularData(data=np.array(["Fe2O3"]))
    mmf = MatminerCompositionFeatures(select="ionic").fit(data=data).apply(data=data)
    assert np.allclose(mmf.samples()[0], [1, 0.476922164, 0.114461319])

    # fast
    mmf = (
        MatminerCompositionFeatures(select="ionic", ionic_fast=True)
        .fit(data=data)
        .apply(data=data)
    )
    assert np.allclose(mmf.samples()[0], [1, 0.476922164, 0.114461319])

    # fast with heterovalent compound
    data = smlb.TabularData(data=np.array(["Fe3O4"]))
    mmf1 = MatminerCompositionFeatures(select="ionic", ionic_fast=False).fit(data).apply(data)
    mmf2 = MatminerCompositionFeatures(select="ionic", ionic_fast=True).fit(data).apply(data)
    assert mmf1.samples()[0][0] == 1 and mmf2.samples()[0][0] == 0

    # valence

    # default parameters
    data = smlb.TabularData(np.array(["Fe2O3"]))
    mmf = MatminerCompositionFeatures(select="valence").fit(data).apply(data)
    # bug fix: the original called np.allclose without `assert`, so the
    # comparison results below were silently discarded and never checked
    assert np.allclose(
        mmf.samples()[0], [2.0, 2.4, 2.4, 0.0, 0.294117647, 0.352941176, 0.352941176, 0]
    )

    # user-defined parameters
    data = smlb.TabularData(np.array(["Fe2O3"]))
    mmf = (
        MatminerCompositionFeatures(
            select="valence", valence_orbitals=("s", "p"), valence_props=("avg",)
        )
        .fit(data)
        .apply(data)
    )
    assert np.allclose(mmf.samples()[0], [2.0, 2.4])

    data = smlb.TabularData(np.array(["Fe2O3"]))
    mmf = (
        MatminerCompositionFeatures(
            select="valence", valence_orbitals=("p", "s"), valence_props=("frac", "avg",)
        )
        .fit(data)
        .apply(data)
    )
    assert np.allclose(mmf.samples()[0], [0.352941176, 0.294117647, 2.4, 2.0])
예제 #25
0
 def apply(self, data):
     """Convert each element of `data` to a one-column integer row."""
     rows = [[int(element)] for element in data]
     return smlb.TabularData(data=np.array(rows))
예제 #26
0
    def create_TabularData(sel: int) -> smlb.TabularData:
        """Returns one of a given list of TabularData objects.

        TabularData objects are created anew on each invocation.
        Changes to them are not persistent.

        Parameter:
            sel: which TabularData object to return
                 0   empty set
                10   2 x 3, numerical unlabeled
                11   2 x 3, numerical scalar labels
                12   2 x 3, numerical vector labels
                13   2 x 3, mixed unlabeled
                14   2 x 3, mixed labeled
                20   7 x 3, numerical unlabeled
                21   7 x 3, numerical scalar labels
                22   7 x 3, numerical vector labels
                23   7 x 3, mixed unlabeled
                24   7 x 3, mixed labeled

        Returns:
            Selected TabularData object

        Raises:
            smlb.InvalidParameterError: if `sel` is not one of the listed identifiers

        0: empty set

        10: 2x3, numerical unlabeled
            1 2 3
            4 5 6
        11: 2x3, numerical scalar labels
            1 2 3    10
            4 5 6    20
        12: 2x3, numerical vector labels
            1 2 3    10 11
            4 5 6    20 21
        13: 2x3, mixed unlabeled
            1 'a' 1.1
            2 'b' 2.2
        14: 2x3, mixed labeled
            1 'a' 1.1    'x', 10
            2 'b' 2.2    'y', 20

        20: 7x3, numerical unlabeled
            1 2 3.3    # 0
            4 5 6.6    # 1
            7 8 9.9    # 2
            1 2 3.3    # 3
            3 5 6.6    # 4
            7 8 9.9    # 5
            1 2 3.3    # 6
        21: 7x3, numerical scalar labels
            1 2 3.3    1    # 0
            4 5 6.6    2    # 1
            7 8 9.9    3    # 2
            1 2 3.3    1    # 3
            3 5 6.6    5    # 4
            7 8 9.9    3    # 5
            1 2 3.3    7    # 6
        22: 7x3, numerical vector labels
            1 2 3.3    1 11    # 0
            4 5 6.6    2 22    # 1
            7 8 9.9    3 33    # 2
            1 2 3.3    1 11    # 3
            3 5 6.6    5 55    # 4
            7 8 9.9    3 33    # 5
            1 2 3.3    7 77    # 6
        23: 7x3, mixed unlabeled
            1 'b' 3.3    # 0
            4 'e' 6.6    # 1
            7 'h' 9.9    # 2
            1 'b' 3.3    # 3
            3 'e' 6.6    # 4
            7 'h' 9.9    # 5
            1 'b' 3.3    # 6
        24: 7x3, mixed labeled
            1 'b' 3.3    'a' 11    # 0
            4 'e' 6.6    'b' 22    # 1
            7 'h' 9.9    'c' 33    # 2
            1 'b' 3.3    'a' 11    # 3
            3 'e' 6.6    'e' 55    # 4
            7 'h' 9.9    'c' 33    # 5
            1 'b' 3.3    'g' 77    # 6

        """

        # 2 x 3 fixtures (identifiers 10-14); structured dtypes for the
        # "mixed" variants carry named fields of differing types
        data10 = np.asarray([[1, 2, 3], [4, 5, 6]])
        labels11 = np.asarray([10, 20])
        labels12 = np.asarray([[10, 11], [20, 21]])
        data13 = np.asarray([(1, "a", 1.1), (2, "b", 2.2)],
                            dtype=[("A", int), ("B", "U1"), ("C", float)])
        labels14 = np.asarray([("x", 10), ("y", 20)],
                              dtype=[("X", "U1"), ("Y", int)])

        # 7 x 3 fixtures (identifiers 20-24); rows 0/3/6 and 2/5 repeat
        # deliberately so callers can test duplicate handling
        data20 = np.asarray([
            [1, 2, 3.3],  # 0
            [4, 5, 6.6],  # 1
            [7, 8, 9.9],  # 2
            [1, 2, 3.3],  # 3
            [3, 5, 6.6],  # 4
            [7, 8, 9.9],  # 5
            [1, 2, 3.3],  # 6
        ])
        labels21 = np.array([1, 2, 3, 1, 5, 3, 7])
        labels22 = np.array([[1, 11], [2, 22], [3, 33], [1, 11], [5, 55],
                             [3, 33], [7, 77]])

        data23 = np.array(
            [
                (1, "b", 3.3),  # 0
                (4, "e", 6.6),  # 1
                (7, "h", 9.9),  # 2
                (1, "b", 3.3),  # 3
                (3, "e", 6.6),  # 4
                (7, "h", 9.9),  # 5
                (1, "b", 3.3),  # 6
            ],
            dtype=[("A", int), ("B", "U1"), ("C", float)],
        )
        labels24 = np.array(
            [("a", 11), ("b", 22), ("c", 33), ("a", 11), ("e", 55), ("c", 33),
             ("g", 77)],
            dtype=[("X", "U1"), ("Y", int)],
        )

        # dispatch on the dataset identifier
        if sel == 0:  # empty set
            return smlb.TabularData(data=np.empty(shape=(0, 0)))

        elif sel == 10:  # 2 x 3, unlabeled
            return smlb.TabularData(data=data10)
        elif sel == 11:  # 2 x 3, scalar labels
            return smlb.TabularData(data=data10, labels=labels11)
        elif sel == 12:  # 2 x 3, vector labels
            return smlb.TabularData(data=data10, labels=labels12)
        elif sel == 13:  # 2 x 3, mixed unlabeled
            return smlb.TabularData(data=data13)
        elif sel == 14:  # 2 x 3, mixed labeled
            return smlb.TabularData(data=data13, labels=labels14)
        elif sel == 20:  # 7 x 3, unlabeled, with repetitions
            return smlb.TabularData(data=data20)
        elif sel == 21:  # 7 x 3, scalar labels, with repetitions
            return smlb.TabularData(data=data20, labels=labels21)
        elif sel == 22:  # 7 x 3, vector labels, with repetitions
            return smlb.TabularData(data=data20, labels=labels22)
        elif sel == 23:  # 7 x 3, mixed unlabeled
            return smlb.TabularData(data=data23)
        elif sel == 24:  # 7 x 3, mixed labeled
            return smlb.TabularData(data=data23, labels=labels24)
        else:
            raise smlb.InvalidParameterError("dataset identifier", sel)