Example #1
# Imports required to run this snippet (assumed module locations).
import numpy
import pandas
from numpy import testing
from nose.tools import eq_

from mhcflurry import Class1NeuralNetwork
from mhcflurry.downloads import get_path


def test_class1_neural_network_a0205_training_accuracy():
    # Memorize the dataset.
    hyperparameters = dict(activation="tanh",
                           layer_sizes=[16],
                           max_epochs=500,
                           early_stopping=False,
                           validation_split=0.0,
                           locally_connected_layers=[{
                               "filters": 8,
                               "activation": "tanh",
                               "kernel_size": 3
                           }],
                           dense_layer_l1_regularization=0.0,
                           dropout_probability=0.0)

    # First test a Class1NeuralNetwork, then a Class1AffinityPredictor.
    allele = "HLA-A*02:05"

    # Load the curated training data, then restrict it to quantitative 9-mer
    # measurements from the Kim 2014 dataset for this allele.
    df = pandas.read_csv(
        get_path("data_curated", "curated_training_data.affinity.csv.bz2"))
    df = df.loc[df.allele == allele]
    df = df.loc[df.peptide.str.len() == 9]
    df = df.loc[df.measurement_type == "quantitative"]
    df = df.loc[df.measurement_source == "kim2014"]

    predictor = Class1NeuralNetwork(**hyperparameters)
    predictor.fit(df.peptide.values, df.measurement_value.values)
    ic50_pred = predictor.predict(df.peptide.values)
    ic50_true = df.measurement_value.values
    eq_(len(ic50_pred), len(ic50_true))
    testing.assert_allclose(numpy.log(ic50_pred),
                            numpy.log(ic50_true),
                            rtol=0.2,
                            atol=0.2)

    # Test that a second predictor has the same architecture json.
    # This is important for an optimization we use to re-use predictors of the
    # same architecture at prediction time.
    hyperparameters2 = dict(activation="tanh",
                            layer_sizes=[16],
                            max_epochs=1,
                            early_stopping=False,
                            validation_split=0.0,
                            locally_connected_layers=[{
                                "filters": 8,
                                "activation": "tanh",
                                "kernel_size": 3
                            }],
                            dense_layer_l1_regularization=0.0,
                            dropout_probability=0.0)
    predictor2 = Class1NeuralNetwork(**hyperparameters2)
    predictor2.fit(df.peptide.values, df.measurement_value.values, verbose=0)
    eq_(predictor.network().to_json(), predictor2.network().to_json())
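
The accuracy check above compares predictions and measurements on the log scale, where numpy.testing.assert_allclose enforces abs(actual - desired) <= atol + rtol * abs(desired) elementwise, which amounts to a fold-change bound on the raw nM values. A standalone sketch (synthetic numbers, not part of the test suite) of how that tolerance behaves:

import numpy
from numpy import testing

true_ic50 = numpy.array([10.0, 100.0, 1000.0])         # nM
pred_ic50 = true_ic50 * numpy.array([1.2, 0.9, 1.5])   # modest fold-change errors

# Passes: each absolute log difference is below atol + rtol * abs(log(true)).
testing.assert_allclose(
    numpy.log(pred_ic50), numpy.log(true_ic50), rtol=0.2, atol=0.2)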
Example #2
# Imports required to run this snippet (assumed module locations).
import numpy
import pandas

from mhcflurry import Class1NeuralNetwork
from mhcflurry.common import random_peptides


def test_multi_output():
    hyperparameters = dict(
        loss="custom:mse_with_inequalities_and_multiple_outputs",
        activation="tanh",
        layer_sizes=[16],
        max_epochs=50,
        minibatch_size=250,
        random_negative_rate=0.0,
        random_negative_constant=0.0,
        early_stopping=False,
        validation_split=0.0,
        locally_connected_layers=[
        ],
        dense_layer_l1_regularization=0.0,
        dropout_probability=0.0,
        optimizer="adam",
        num_outputs=3)

    df = pandas.DataFrame()
    df["peptide"] = random_peptides(10000, length=9)
    df["output1"] = df.peptide.map(lambda s: s[4] == 'K').astype(int) * 49000 + 1
    df["output2"] = df.peptide.map(lambda s: s[3] == 'Q').astype(int) * 49000 + 1
    df["output3"] = df.peptide.map(lambda s: s[4] == 'K' or s[3] == 'Q').astype(int) * 49000 + 1

    print("output1 mean", df.output1.mean())
    print("output2 mean", df.output2.mean())

    # Reshape to long format: one row per (peptide, output) pair.
    stacked = df.set_index("peptide").stack().reset_index()
    stacked.columns = ['peptide', 'output_name', 'value']
    stacked["output_index"] = stacked.output_name.map({
        "output1": 0,
        "output2": 1,
        "output3": 2,
    })
    assert not stacked.output_index.isnull().any(), stacked

    fit_kwargs = {
        'verbose': 1,
    }

    predictor = Class1NeuralNetwork(**hyperparameters)
    stacked_train = stacked
    predictor.fit(
        stacked_train.peptide.values,
        stacked_train.value.values,
        output_indices=stacked_train.output_index.values,
        **fit_kwargs)

    result = predictor.predict(df.peptide.values, output_index=None)
    print(df.shape, result.shape)
    print(result)

    df["prediction1"] = result[:,0]
    df["prediction2"] = result[:,1]
    df["prediction3"] = result[:,2]

    df_by_peptide = df.set_index("peptide")

    correlation = pandas.DataFrame(
        numpy.corrcoef(df_by_peptide.T),
        columns=df_by_peptide.columns,
        index=df_by_peptide.columns)
    print(correlation)

    sub_correlation = correlation.loc[
        ["output1", "output2", "output3"],
        ["prediction1", "prediction2", "prediction3"],
    ]
    assert sub_correlation.iloc[0, 0] > 0.99, correlation
    assert sub_correlation.iloc[1, 1] > 0.99, correlation
    assert sub_correlation.iloc[2, 2] > 0.99, correlation
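
The reshaping step above is what lets a single fit() call train all three outputs: each (peptide, output) pair becomes one training row, and output_indices tells the model which output head the row belongs to. A toy, standalone sketch of the same pandas reshape (made-up peptides and values):

import pandas

toy = pandas.DataFrame({
    "peptide": ["SIINFEKLM", "AAAAAAAAA"],
    "output1": [49001, 1],
    "output2": [1, 1],
})
stacked = toy.set_index("peptide").stack().reset_index()
stacked.columns = ["peptide", "output_name", "value"]
stacked["output_index"] = stacked.output_name.map({"output1": 0, "output2": 1})
print(stacked)  # four rows: each peptide appears once per output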
Example #3
# Imports required to run this snippet (assumed module locations).
import numpy
import pandas
from nose.tools import assert_less, assert_greater

from mhcflurry import Class1NeuralNetwork
from mhcflurry.common import random_peptides


def test_inequalities():
    # Memorize the dataset.
    hyperparameters = dict(
        peptide_amino_acid_encoding="one-hot",
        activation="tanh",
        layer_sizes=[64],
        max_epochs=200,
        minibatch_size=32,
        random_negative_rate=0.0,
        random_negative_constant=0,
        early_stopping=False,
        validation_split=0.0,
        locally_connected_layers=[{
            "filters": 8,
            "activation": "tanh",
            "kernel_size": 3
        }],
        dense_layer_l1_regularization=0.0,
        dropout_probability=0.0,
        loss="custom:mse_with_inequalities_and_multiple_outputs")

    dfs = []

    # Weak binders
    df = pandas.DataFrame()
    df["peptide"] = random_peptides(100, length=9)
    df["value"] = 100
    df["inequality1"] = "="
    df["inequality2"] = "<"
    dfs.append(df)

    # Strong binders: the same peptides as above, but with additional, stronger measurements.
    df = pandas.DataFrame()
    df["peptide"] = dfs[-1].peptide.values
    df["value"] = 1
    df["inequality1"] = "="
    df["inequality2"] = "="
    dfs.append(df)

    # Non-binders
    df = pandas.DataFrame()
    df["peptide"] = random_peptides(100, length=10)
    df["value"] = 1000
    df["inequality1"] = ">"
    df["inequality2"] = ">"
    dfs.append(df)

    df = pandas.concat(dfs, ignore_index=True)

    fit_kwargs = {'verbose': 0}

    predictor = Class1NeuralNetwork(**hyperparameters)
    predictor.fit(df.peptide.values,
                  df.value.values,
                  inequalities=df.inequality1.values,
                  **fit_kwargs)
    df["prediction1"] = predictor.predict(df.peptide.values)

    predictor = Class1NeuralNetwork(**hyperparameters)
    predictor.fit(df.peptide.values,
                  df.value.values,
                  inequalities=df.inequality2.values,
                  **fit_kwargs)
    df["prediction2"] = predictor.predict(df.peptide.values)

    # Binders (values below 1000) should be predicted stronger (lower) than non-binders.
    for pred in ["prediction1", "prediction2"]:
        assert_less(df.loc[df.value < 1000, pred].mean(), 500)
        assert_greater(df.loc[df.value >= 1000, pred].mean(), 500)

    # For the binders, the (=) on the weak-binding measurement (100) in
    # inequality1 should make the prediction weaker, whereas for inequality2
    # this measurement is a "<" so it should allow the strong-binder measurement
    # to dominate.
    numpy.testing.assert_allclose(df.loc[df.value == 1].prediction2.values,
                                  1.0,
                                  atol=0.5)
    numpy.testing.assert_array_less(5.0,
                                    df.loc[df.value == 1].prediction1.values)
    print(df.groupby("value")[["prediction1", "prediction2"]].mean())
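
The two fits differ only in which inequality column they see. As rough intuition for what the mse_with_inequalities losses do, a censored mean squared error penalizes a prediction only when it lands on the wrong side of a censored measurement. The sketch below is a conceptual numpy illustration on the raw value scale, not mhcflurry's actual loss (which operates on transformed 0-1 affinity targets):

import numpy

def censored_mse(predictions, values, inequalities):
    predictions = numpy.asarray(predictions, dtype=float)
    values = numpy.asarray(values, dtype=float)
    inequalities = numpy.asarray(inequalities)
    residual = predictions - values
    # "=": penalize any deviation from the measured value.
    # "<": the true value lies below the measurement; penalize only predictions above it.
    # ">": the true value lies above the measurement; penalize only predictions below it.
    penalized = (
        (inequalities == "=")
        | ((inequalities == "<") & (residual > 0))
        | ((inequalities == ">") & (residual < 0)))
    return numpy.mean(numpy.where(penalized, residual, 0.0) ** 2)

# A prediction of 50 is consistent with "< 100" (no penalty) but not with "= 100".
print(censored_mse([50.0, 50.0], [100.0, 100.0], ["=", "<"]))  # 1250.0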
Example #4
# Imports required to run this snippet (assumed module locations).
import pandas
from nose.tools import assert_less, assert_greater, assert_almost_equal

from mhcflurry import Class1NeuralNetwork
from mhcflurry.common import random_peptides


def test_inequalities():
    # Memorize the dataset.
    hyperparameters = dict(
        loss="custom:mse_with_inequalities",
        peptide_amino_acid_encoding="one-hot",
        activation="tanh",
        layer_sizes=[16],
        max_epochs=50,
        minibatch_size=32,
        random_negative_rate=0.0,
        early_stopping=False,
        validation_split=0.0,
        locally_connected_layers=[
            {
                "filters": 8,
                "activation": "tanh",
                "kernel_size": 3
            }
        ],
        dense_layer_l1_regularization=0.0,
        dropout_probability=0.0)

    df = pandas.DataFrame()
    df["peptide"] = random_peptides(1000, length=9)

    # First half are binders
    df["binder"] = df.index < len(df) / 2
    df["value"] = df.binder.map({True: 100, False: 5000})
    df.loc[:10, "value"] = 1.0  # some strong binders
    df["inequality1"] = "="
    df["inequality2"] = df.binder.map({True: "<", False: "="})
    df["inequality3"] = df.binder.map({True: "=", False: ">"})

    # "A" at start of peptide indicates strong binder
    df["peptide"] = [
        ("C" if not row.binder else "A") + row.peptide[1:]
        for _, row in df.iterrows()
    ]

    fit_kwargs = {'verbose': 0}

    # Prediction1 uses no inequalities (i.e. all are (=))
    predictor = Class1NeuralNetwork(**hyperparameters)
    predictor.fit(
        df.peptide.values,
        df.value.values,
        inequalities=df.inequality1.values,
        **fit_kwargs)
    df["prediction1"] = predictor.predict(df.peptide.values)

    # Prediction2 has a (<) inequality on binders and an (=) on non-binders
    predictor = Class1NeuralNetwork(**hyperparameters)
    predictor.fit(
        df.peptide.values,
        df.value.values,
        inequalities=df.inequality2.values,
        **fit_kwargs)
    df["prediction2"] = predictor.predict(df.peptide.values)

    # Prediction3 has an (=) inequality on binders and a (>) on non-binders
    predictor = Class1NeuralNetwork(**hyperparameters)
    predictor.fit(
        df.peptide.values,
        df.value.values,
        inequalities=df.inequality3.values,
        **fit_kwargs)
    df["prediction3"] = predictor.predict(df.peptide.values)

    df_binders = df.loc[df.binder]
    df_nonbinders = df.loc[~df.binder]

    print("***** Binders: *****")
    print(df_binders.head(5))

    print("***** Non-binders: *****")
    print(df_nonbinders.head(5))

    # Binders should always be given tighter predicted affinity than non-binders
    assert_less(df_binders.prediction1.mean(), df_nonbinders.prediction1.mean())
    assert_less(df_binders.prediction2.mean(), df_nonbinders.prediction2.mean())
    assert_less(df_binders.prediction3.mean(), df_nonbinders.prediction3.mean())

    # prediction2 binders should be tighter on average than prediction1
    # binders, since prediction2 has a (<) inequality for binders.
    # Non-binders should be about the same between prediction2 and prediction1
    assert_less(df_binders.prediction2.mean(), df_binders.prediction1.mean())
    assert_almost_equal(
        df_nonbinders.prediction2.mean(),
        df_nonbinders.prediction1.mean(),
        delta=3000)

    # prediction3 non-binders should be weaker on average than prediction2 (or 1)
    # non-binders, since prediction3 has a (>) inequality for these peptides.
    # Binders should be about the same.
    assert_greater(
        df_nonbinders.prediction3.mean(),
        df_nonbinders.prediction2.mean())
    assert_greater(
        df_nonbinders.prediction3.mean(),
        df_nonbinders.prediction1.mean())
    assert_almost_equal(
        df_binders.prediction3.mean(),
        df_binders.prediction1.mean(),
        delta=3000)
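
The assertions in these examples (eq_, assert_less, assert_greater, assert_almost_equal) come from nose.tools. On a project that has dropped nose, minimal stand-ins (an illustrative sketch, not part of the original tests) are plain assert statements:

def eq_(a, b):
    assert a == b, (a, b)

def assert_less(a, b):
    assert a < b, (a, b)

def assert_greater(a, b):
    assert a > b, (a, b)

def assert_almost_equal(a, b, delta):
    # nose's delta form: the two values must differ by no more than delta.
    assert abs(a - b) <= delta, (a, b, delta)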