Example no. 1
def test_dataset_intersection():
    dataset1 = Dataset.from_nested_dictionary({
        "H-2-Kb": {
            "SIINFEKL": 10.0,
            "FEKLSIIN": 20000.0,
            "SIFEKLIN": 50000.0,
        }
    })
    dataset2 = Dataset.from_nested_dictionary({"H-2-Kb": {"SIINFEKL": 30.0}})
    dataset_intersection = dataset1.intersection(dataset2)
    expected_result = Dataset.from_nested_dictionary(
        {"H-2-Kb": {
            "SIINFEKL": 10.0
        }})
    eq_(dataset_intersection, expected_result)
Example no. 2
def test_dataset_difference():
    dataset1 = Dataset.from_nested_dictionary({
        "H-2-Kb": {
            "SIINFEKL": 10.0,
            "FEKLSIIN": 20000.0,
            "SIFEKLIN": 50000.0,
        }
    })
    dataset2 = Dataset.from_nested_dictionary({"H-2-Kb": {"SIINFEKL": 10.0}})
    dataset_diff = dataset1.difference(dataset2)
    expected_result = Dataset.from_nested_dictionary(
        {"H-2-Kb": {
            "FEKLSIIN": 20000.0,
            "SIFEKLIN": 50000.0,
        }})
    eq_(dataset_diff, expected_result)
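
Examples 1 and 2 exercise complementary Dataset set operations; the short sketch below combines them into a single partition check. The import path (assumed here to be mhcflurry's Dataset module) is an assumption; the data and method calls are taken from the two tests above.

from mhcflurry.dataset import Dataset  # assumed import path

dataset1 = Dataset.from_nested_dictionary({
    "H-2-Kb": {
        "SIINFEKL": 10.0,
        "FEKLSIIN": 20000.0,
        "SIFEKLIN": 50000.0,
    }
})
dataset2 = Dataset.from_nested_dictionary({"H-2-Kb": {"SIINFEKL": 30.0}})

# Every (allele, peptide) entry of dataset1 is either shared with dataset2 or
# not, so intersection and difference should partition dataset1.
overlap = dataset1.intersection(dataset2)
remainder = dataset1.difference(dataset2)
assert len(overlap) == 1
assert len(remainder) == 2
assert len(overlap) + len(remainder) == len(dataset1)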
Example no. 3
def test_class1_binding_predictor_A0205_training_accuracy():
    dataset = Dataset.from_csv(get_path(
        "data_combined_iedb_kim2014", "combined_human_class1_dataset.csv"))
    dataset_a0205_all_lengths = dataset.get_allele("HLA-A0205")
    dataset_a0205 = Dataset(
        dataset_a0205_all_lengths._df.loc[
            dataset_a0205_all_lengths._df.peptide.str.len() == 9])

    predictor = Class1BindingPredictor(
        name="A0205",
        embedding_output_dim=32,
        activation="tanh",
        layer_sizes=[64],
        optimizer="adam",
        dropout_probability=0.0)
    predictor.fit_dataset(dataset_a0205, n_training_epochs=1000)
    peptides = dataset_a0205.peptides
    ic50_pred = predictor.predict(peptides)
    ic50_true = dataset_a0205.affinities
    eq_(len(ic50_pred), len(ic50_true))
    testing.assert_allclose(
        np.log(ic50_pred),
        np.log(ic50_true),
        rtol=0.2,
        atol=0.2)
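
The accuracy check above compares affinities in log space, which turns the tolerance into a multiplicative bound; below is a standalone sketch of the same check on toy numpy arrays (the values here are illustrative, not taken from the dataset).

import numpy as np
from numpy import testing

ic50_true = np.array([10.0, 20000.0, 50000.0])
ic50_pred = np.array([11.0, 21000.0, 47000.0])

# Comparing log IC50 values keeps the tolerance relative to orders of
# magnitude, which is what matters for affinities spanning 1 nM to 50,000 nM.
testing.assert_allclose(
    np.log(ic50_pred),
    np.log(ic50_true),
    rtol=0.2,
    atol=0.2)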
Example no. 4
def class1_binding_predictor_A0205_training_accuracy():
    dataset = Dataset.from_csv(CLASS1_DATA_CSV_PATH)
    dataset_a0205 = dataset.get_allele("HLA-A0205")

    predictor = Class1BindingPredictor.from_hyperparameters(name="A0205")
    predictor.fit_dataset(dataset_a0205)
    peptides = dataset_a0205.peptides
    ic50_pred = predictor.predict(peptides)
    ic50_true = dataset_a0205.affinities
    eq_(len(ic50_pred), len(ic50_true))
    assert np.allclose(ic50_pred, ic50_true)
Example no. 5
def test_dataset_random_split():
    dataset = Dataset.from_nested_dictionary({
        "H-2-Kb": {
            "SIINFEKL": 10.0,
            "FEKLSIIN": 20000.0,
            "SIFEKLIN": 50000.0,
        }
    })
    left, right = dataset.random_split(n=2)
    assert len(left) == 2
    assert len(right) == 1
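
random_split is only checked for its sizes here; the sketch below also verifies that the two halves are disjoint and together cover the original dataset, reusing the set operations from Examples 1 and 2. The import path is an assumption, and the split is assumed to sample without replacement, as the size arithmetic in the test implies.

from mhcflurry.dataset import Dataset  # assumed import path

dataset = Dataset.from_nested_dictionary({
    "H-2-Kb": {
        "SIINFEKL": 10.0,
        "FEKLSIIN": 20000.0,
        "SIFEKLIN": 50000.0,
    }
})
left, right = dataset.random_split(n=2)

assert len(left) == 2 and len(right) == 1
assert len(left.intersection(right)) == 0      # the halves are disjoint
assert len(left) + len(right) == len(dataset)  # and together cover the data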
Example no. 6
def test_class1_binding_predictor_A0205_training_accuracy():
    dataset = Dataset.from_csv(
        get_path("data_combined_iedb_kim2014",
                 "combined_human_class1_dataset.csv"))
    dataset_a0205_all_lengths = dataset.get_allele("HLA-A0205")
    dataset_a0205 = Dataset(dataset_a0205_all_lengths._df.loc[
        dataset_a0205_all_lengths._df.peptide.str.len() == 9])

    predictor = Class1BindingPredictor(name="A0205",
                                       embedding_output_dim=32,
                                       activation="tanh",
                                       layer_sizes=[64],
                                       optimizer="adam",
                                       dropout_probability=0.0)
    predictor.fit_dataset(dataset_a0205, n_training_epochs=1000)
    peptides = dataset_a0205.peptides
    ic50_pred = predictor.predict(peptides)
    ic50_true = dataset_a0205.affinities
    eq_(len(ic50_pred), len(ic50_true))
    testing.assert_allclose(np.log(ic50_pred),
                            np.log(ic50_true),
                            rtol=0.2,
                            atol=0.2)
Example no. 7
def test_create_imputed_datasets_two_alleles():
    dataset = Dataset.from_nested_dictionary({
        "HLA-A*02:01": {
            "A" * 9: 20.0,
            "C" * 9: 40000.0,
        },
        "HLA-A*02:05": {
            "S" * 9: 500.0,
            "A" * 9: 25.0,
        },
    })
    imputed_dataset = dataset.impute_missing_values(MICE(n_imputations=25))
    eq_(imputed_dataset.unique_alleles(), {"HLA-A*02:01", "HLA-A*02:05"})
    expected_peptides = {"A" * 9, "C" * 9, "S" * 9}
    for allele_name, allele_data in imputed_dataset.groupby_allele():
        eq_(set(allele_data.peptides), expected_peptides)
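
Example 7 inspects the imputed dataset allele by allele; another way to see what MICE filled in is to compare dense peptide-by-allele matrices before and after imputation, using to_dense_pMHC_affinity_matrix as in the script fragment further down this page. This is only a sketch: the import paths, including fancyimpute as the source of MICE, are assumptions.

import numpy as np
from fancyimpute import MICE           # assumed source of the MICE imputer
from mhcflurry.dataset import Dataset  # assumed import path

dataset = Dataset.from_nested_dictionary({
    "HLA-A*02:01": {"A" * 9: 20.0, "C" * 9: 40000.0},
    "HLA-A*02:05": {"S" * 9: 500.0, "A" * 9: 25.0},
})
imputed_dataset = dataset.impute_missing_values(MICE(n_imputations=25))

X_before, peptide_list, allele_list = dataset.to_dense_pMHC_affinity_matrix(
    min_observations_per_allele=1,
    min_observations_per_peptide=1)
X_after, _, _ = imputed_dataset.to_dense_pMHC_affinity_matrix(
    min_observations_per_allele=1,
    min_observations_per_peptide=1)

print("observed before imputation: %d/%d" % (
    np.isfinite(X_before).sum(), X_before.size))
print("observed after imputation: %d/%d" % (
    np.isfinite(X_after).sum(), X_after.size))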
Example no. 9
def test_create_allele_data_from_single_allele_dict():
    peptide_to_ic50_dict = {
        ("A" * 10): 1.2,
        ("C" * 9): 1000,
    }
    dataset = Dataset.from_single_allele_dictionary(
        allele_name="A0201", peptide_to_affinity_dict=peptide_to_ic50_dict)
    assert isinstance(dataset, Dataset)

    eq_(len(peptide_to_ic50_dict), len(dataset))
    expected_peptides = set([
        "A" * 10,
        "C" * 9,
    ])
    for pi, pj in zip(sorted(expected_peptides), sorted(dataset.peptides)):
        eq_(pi, pj)
    for pi, pj in zip(sorted(expected_peptides),
                      sorted(dataset.unique_peptides())):
        eq_(pi, pj)
Example no. 10
def test_dataset_cross_validation():
    dataset = Dataset.from_nested_dictionary({
        "H-2-Kb": {
            "SIINFEKL": 10.0,
            "FEKLSIIN": 20000.0,
            "SIFEKLIN": 50000.0,
        },
        "HLA-A*02:01": {
            "ASASAS": 1.0,
            "CCC": 0.0
        }
    })

    fold_count = 0
    for train_dataset, test_dataset in dataset.cross_validation_iterator(
            test_allele="HLA-A*02:01", n_folds=2):
        assert train_dataset.unique_alleles() == {"H-2-Kb", "HLA-A*02:01"}
        assert test_dataset.unique_alleles() == {"HLA-A*02:01"}
        assert len(test_dataset) == 1
        fold_count += 1
    assert fold_count == 2
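
Example 10 only checks fold counts and sizes; the sketch below shows how the same iterator could drive a per-fold fit/predict loop with the predictor API used elsewhere on this page. The import paths, the toy 9-mer data, and the hyperparameters are assumptions, not taken from the original tests.

import numpy as np
from mhcflurry.dataset import Dataset                                   # assumed
from mhcflurry.class1_binding_predictor import Class1BindingPredictor   # assumed

dataset = Dataset.from_nested_dictionary({
    "HLA-A*02:01": {
        "SIINFEKLM": 25.0,
        "ASASASASA": 4000.0,
        "CCCCCCCCC": 41000.0,
    }
})

for fold, (train_dataset, test_dataset) in enumerate(
        dataset.cross_validation_iterator(
            test_allele="HLA-A*02:01", n_folds=3)):
    predictor = Class1BindingPredictor(name="cv-fold-%d" % fold)
    predictor.fit_dataset(train_dataset, n_training_epochs=10)
    ic50_pred = predictor.predict(test_dataset.peptides)
    ic50_true = test_dataset.affinities
    print("fold %d: predicted %s nM vs. measured %s nM" % (
        fold, ic50_pred, ic50_true))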
Example no. 11
            orientation="columns",
            verbose=verbose,
            print_interval=knn_print_interval)
    return result_dict


if __name__ == "__main__":
    args = parser.parse_args()
    print(args)
    imputation_methods = create_imputation_methods(
        verbose=args.verbose,
        clip_imputed_values=not (args.normalize_rows or args.normalize_rows),
    )
    print("Imputation methods: %s" % imputation_methods)

    dataset = Dataset.from_csv(args.binding_data_csv)
    X, peptide_list, allele_list = dataset.to_dense_pMHC_affinity_matrix(
        min_observations_per_allele=args.n_folds,
        min_observations_per_peptide=args.min_observations_per_peptide)
    observed_mask = np.isfinite(X)
    print("Loaded binding data, shape: %s, n_observed=%d/%d (%0.2f%%)" % (
        X.shape,
        observed_mask.sum(),
        X.size,
        100.0 * observed_mask.sum() / X.size))
    if args.save_incomplete_affinity_matrix:
        print("Saving incomplete data to %s" % args.save_incomplete_affinity_matrix)
        df = pd.DataFrame(X, columns=allele_list, index=peptide_list)
        df.to_csv(args.save_incomplete_affinity_matrix, index_label="peptide")

    scores = ScoreSet()
Example no. 12
def load_csv(filename):
    return Dataset.from_csv(data_path(filename))
Example no. 13
def test_performance_improves_for_A0205_with_pretraining():
    # test to make sure that imputation improves predictive accuracy after a
    # small number of training iterations (10 epochs)
    dataset = Dataset.from_csv(CLASS1_DATA_CSV_PATH)
    print("Full dataset: %d pMHC entries" % len(dataset))

    limited_alleles = ["HLA-A0205", "HLA-A0201", "HLA-A0101", "HLA-B0702"]

    # restrict to just four alleles
    dataset = dataset.get_alleles(limited_alleles)
    print("After filtering to %s, # entries: %d" % (limited_alleles, len(dataset)))

    a0205_data_without_imputation = dataset.get_allele("HLA-A0205")

    print("Dataset with only A0205, # entries: %d" % len(a0205_data_without_imputation))

    predictor_without_imputation = \
        Class1BindingPredictor.from_hyperparameters(name="A0205-no-impute")

    X_index, ic50_true, sample_weights, _ = \
        a0205_data_without_imputation.kmer_index_encoding()

    assert sample_weights.min() >= 0, sample_weights.min()
    assert sample_weights.max() <= 1, sample_weights.max()
    assert ic50_true.min() >= 0, ic50_true.min()

    predictor_without_imputation.fit_kmer_encoded_arrays(
        X=X_index,
        ic50=ic50_true,
        sample_weights=sample_weights,
        n_training_epochs=10)

    ic50_pred_without_imputation = \
        predictor_without_imputation.predict_ic50_for_kmer_encoded_array(X_index)
    diff_squared = (ic50_true - ic50_pred_without_imputation) ** 2

    ic50_true_label = ic50_true <= 500
    ic50_pred_label_without_imputation = ic50_pred_without_imputation <= 500
    ic50_label_same_without_imputation = (
        ic50_true_label == ic50_pred_label_without_imputation)
    mse_without_imputation = (diff_squared * sample_weights).sum() / sample_weights.sum()
    accuracy_without_imputation = (
        ic50_label_same_without_imputation * sample_weights).sum() / sample_weights.sum()
    imputed_dataset = dataset.impute_missing_values(MICE(n_imputations=25))
    print("After imputation, dataset for %s has %d entries" % (
        limited_alleles, len(imputed_dataset)))
    a0205_data_with_imputation = imputed_dataset.get_allele("HLA-A0205")
    print("Limited to just A0205, # entries: %d" % (len(a0205_data_with_imputation)))

    X_index_imputed, ic50_imputed, sample_weights_imputed, _ = \
        a0205_data_with_imputation.kmer_index_encoding()
    assert sample_weights_imputed.min() >= 0, sample_weights_imputed.min()
    assert sample_weights_imputed.max() <= 1, sample_weights_imputed.max()
    assert ic50_imputed.min() >= 0, ic50_imputed.min()

    predictor_with_imputation = \
        Class1BindingPredictor.from_hyperparameters(name="A0205-impute")

    predictor_with_imputation.fit_kmer_encoded_arrays(
        X=X_index,
        ic50=ic50_true,
        sample_weights=sample_weights,
        X_pretrain=X_index_imputed,
        ic50_pretrain=ic50_imputed,
        sample_weights_pretrain=sample_weights_imputed,
        n_training_epochs=200)

    ic50_pred_with_imputation = \
        predictor_with_imputation.predict_ic50_for_kmer_encoded_array(X_index)
    diff_squared = (ic50_true - ic50_pred_with_imputation) ** 2
    mse_with_imputation = (diff_squared * sample_weights).sum() / sample_weights.sum()

    ic50_pred_label_with_imputation = ic50_pred_with_imputation <= 500
    ic50_label_same_with_imputation = (
        ic50_true_label == ic50_pred_label_with_imputation)
    accuracy_with_imputation = (
        ic50_label_same_with_imputation * sample_weights).sum() / sample_weights.sum()
    print("RMS w/out imputation: %f" % (np.sqrt(mse_without_imputation),))
    print("RMS w/ imputation: %f" % (np.sqrt(mse_with_imputation),))

    assert mse_with_imputation < mse_without_imputation, \
        "Expected MSE with imputation (%f) to be less than (%f) without imputation" % (
            mse_with_imputation, mse_without_imputation)

    print("IC50 <= 500nM accuracy w/out imputation: %f" % (
        accuracy_without_imputation,))
    print("IC50 <= 500nM accuracy w/ imputation: %f" % (
        accuracy_with_imputation,))
    assert accuracy_with_imputation > accuracy_without_imputation
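
The weighted MSE and the IC50 <= 500 nM label accuracy above are computed inline twice, once per predictor; the same arithmetic can be factored into a small helper. This is a sketch using only numpy; the function name, the threshold default, and the toy arrays are illustrative.

import numpy as np

def weighted_mse_and_label_accuracy(
        ic50_true, ic50_pred, sample_weights, threshold=500.0):
    # Weighted mean squared error on raw IC50 values.
    weight_total = sample_weights.sum()
    mse = ((ic50_true - ic50_pred) ** 2 * sample_weights).sum() / weight_total
    # Weighted agreement of binder/non-binder labels at the IC50 cutoff.
    same_label = (ic50_true <= threshold) == (ic50_pred <= threshold)
    accuracy = (same_label * sample_weights).sum() / weight_total
    return mse, accuracy

mse, accuracy = weighted_mse_and_label_accuracy(
    ic50_true=np.array([50.0, 5000.0, 30000.0]),
    ic50_pred=np.array([80.0, 450.0, 28000.0]),
    sample_weights=np.array([1.0, 0.5, 1.0]))
print("RMSE: %f, accuracy: %f" % (np.sqrt(mse), accuracy))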
Example no. 14
    base_filename = \
        ("%s-vs-nsamples-hidden-%s-activation-%s"
         "-impute-%s-epochs-%d-embedding-%d-pretrain-%s") % (
            args.allele,
            args.hidden_layer_size,
            args.activation,
            args.imputation_method,
            args.training_epochs,
            args.embedding_size,
            args.pretraining_weight_decay)
    csv_filename = base_filename + ".csv"

    if args.load_existing_data:
        results_df = pd.read_csv(csv_filename)
    else:
        dataset = Dataset.from_csv(args.training_csv)
        imputer = imputer_from_args(args)

        def make_model():
            return predictor_from_args(allele_name=args.allele, args=args)

        if args.pretraining_weight_decay == "exponential":
            def pretrain_weight_decay_fn(t):
                return np.exp(-t)

        elif args.pretraining_weight_decay == "quadratic":
            def pretrain_weight_decay_fn(t):
                return 1.0 / (t + 1) ** 2.0

        elif args.pretraining_weight_decay == "linear":
            def pretrain_weight_decay_fn(t):
Example no. 15
    default=[],
    nargs="+",
    type=normalize_allele_name)

# add options for neural network hyperparameters
parser = add_arguments_to_parser(parser)

if __name__ == "__main__":
    args = parser.parse_args()
    print(args)

    if not exists(args.output_dir):
        makedirs(args.output_dir)

    dataset = Dataset.from_csv(
        filename=args.binding_data_csv,
        sep=",",
        peptide_column_name="peptide")

    # if user didn't specify alleles then train models for all available alleles
    alleles = args.alleles

    if not alleles:
        alleles = list(sorted(dataset.unique_alleles()))
    else:
        dataset = dataset.get_alleles(alleles)

    imputer = imputer_from_args(args)

    if imputer is None:
        imputed_dataset = Dataset.create_empty()
    else:
Example no. 16
def test_create_imputed_datasets_empty():
    empty_dataset = Dataset.create_empty()
    result = empty_dataset.impute_missing_values(MICE(n_imputations=25))
    eq_(result, empty_dataset)
Example no. 17
def load_csv(filename):
    base_dir = dirname(realpath(__file__))
    data_dir = join(base_dir, "data")
    full_path = join(data_dir, filename)
    return Dataset.from_csv(full_path)
Example no. 19
def test_performance_improves_for_A0205_with_pretraining():
    # test to make sure that imputation improves predictive accuracy after a
    # small number of training iterations (10 epochs)
    dataset = Dataset.from_csv(
        get_path("data_combined_iedb_kim2014",
                 "combined_human_class1_dataset.csv"))
    print("Full dataset: %d pMHC entries" % len(dataset))

    limited_alleles = ["HLA-A0205", "HLA-A0201", "HLA-A0101", "HLA-B0702"]

    # restrict to just four alleles
    dataset = dataset.get_alleles(limited_alleles)
    print("After filtering to %s, # entries: %d" %
          (limited_alleles, len(dataset)))

    a0205_data_without_imputation = dataset.get_allele("HLA-A0205")

    print("Dataset with only A0205, # entries: %d" %
          (len(a0205_data_without_imputation)))

    predictor_without_imputation = Class1BindingPredictor(
        name="A0205-no-impute")

    X_index, ic50_true, sample_weights, _ = (
        a0205_data_without_imputation.kmer_index_encoding())

    assert sample_weights.min() >= 0, sample_weights.min()
    assert sample_weights.max() <= 1, sample_weights.max()
    assert ic50_true.min() >= 0, ic50_true.min()

    predictor_without_imputation.fit_kmer_encoded_arrays(
        X=X_index,
        ic50=ic50_true,
        sample_weights=sample_weights,
        n_training_epochs=10)

    ic50_pred_without_imputation = (
        predictor_without_imputation.predict_ic50_for_kmer_encoded_array(
            X_index))
    diff_squared = (ic50_true - ic50_pred_without_imputation)**2

    ic50_true_label = ic50_true <= 500
    ic50_pred_label_without_imputation = ic50_pred_without_imputation <= 500
    ic50_label_same_without_imputation = (
        ic50_true_label == ic50_pred_label_without_imputation)
    mse_without_imputation = ((diff_squared * sample_weights).sum() /
                              sample_weights.sum())
    accuracy_without_imputation = (
        (ic50_label_same_without_imputation * sample_weights).sum() /
        sample_weights.sum())
    imputed_dataset = dataset.impute_missing_values(MICE(n_imputations=25))
    print("After imputation, dataset for %s has %d entries" %
          (limited_alleles, len(imputed_dataset)))
    a0205_data_with_imputation = imputed_dataset.get_allele("HLA-A0205")
    print("Limited to just A0205, # entries: %d" %
          (len(a0205_data_with_imputation)))

    X_index_imputed, ic50_imputed, sample_weights_imputed, _ = \
        a0205_data_with_imputation.kmer_index_encoding()
    assert sample_weights_imputed.min() >= 0, sample_weights_imputed.min()
    assert sample_weights_imputed.max() <= 1, sample_weights_imputed.max()
    assert ic50_imputed.min() >= 0, ic50_imputed.min()

    predictor_with_imputation = Class1BindingPredictor(name="A0205-impute")

    predictor_with_imputation.fit_kmer_encoded_arrays(
        X=X_index,
        ic50=ic50_true,
        sample_weights=sample_weights,
        X_pretrain=X_index_imputed,
        ic50_pretrain=ic50_imputed,
        sample_weights_pretrain=sample_weights_imputed,
        n_training_epochs=10)

    ic50_pred_with_imputation = \
        predictor_with_imputation.predict_ic50_for_kmer_encoded_array(X_index)
    diff_squared = (ic50_true - ic50_pred_with_imputation)**2
    mse_with_imputation = ((diff_squared * sample_weights).sum() /
                           sample_weights.sum())

    ic50_pred_label_with_imputation = ic50_pred_with_imputation <= 500
    ic50_label_same_with_imputation = (
        ic50_true_label == ic50_pred_label_with_imputation)
    accuracy_with_imputation = (
        (ic50_label_same_with_imputation * sample_weights).sum() /
        sample_weights.sum())
    print("RMS w/out imputation: %f" % (np.sqrt(mse_without_imputation), ))
    print("RMS w/ imputation: %f" % (np.sqrt(mse_with_imputation), ))

    assert mse_with_imputation < mse_without_imputation, \
        "Expected MSE with imputation (%f) to be < w/o imputation (%f)" % (
            mse_with_imputation, mse_without_imputation)

    print("IC50 <= 500nM accuracy w/out imputation: %f" %
          (accuracy_without_imputation, ))
    print("IC50 <= 500nM accuracy w/ imputation: %f" %
          (accuracy_with_imputation, ))
    assert accuracy_with_imputation > accuracy_without_imputation