continue
                else:
                    # Bump the running counter that numbers the log line below.
                    param_count += 1
                # Log the parameter combination that is about to be evaluated.
                param_str =  \
                    "%d: Assay = '%s', ngram %s, alphabet %s, mhc_class %s" % \
                    (param_count, assay, max_ngram, alphabet, mhc_class)
                print param_str
                # Record the combination in the per-column lists held in d.
                d['assay'].append(assay)
                d['alphabet'].append(alphabet)
                d['ngram'].append(max_ngram)
                d['mhc'].append(mhc_class)

                # Load the n-gram features (X), labels (Y) and the fitted
                # transformer for this combination.
                # NOTE(review): mhc_class is passed as the literal 1 here even
                # though the mhc_class variable is what gets logged and stored
                # in d['mhc'] above -- confirm this is intentional.
                X, Y, vectorizer = iedb.load_tcell_ngrams(
                    assay_group = assay,
                    human = True,
                    mhc_class = 1,
                    max_ngram = max_ngram,
                    reduced_alphabet = alphabet_dict,
                    min_count = None,
                    return_transformer = True)
                print "Data shape", X.shape, "n_true", np.sum(Y)
                ensemble = BalancedEnsembleClassifier()

                # 3-fold cross-validation; the mean is what gets stored in
                # d['cv_acc'], the standard deviation is only printed.
                accs = sklearn.cross_validation.cross_val_score(
                    ensemble, X, Y, cv = 3)
                acc = np.mean(accs)
                print "CV accuracy %0.4f (std %0.4f)" % \
                    (acc, np.std(accs))
                d['cv_acc'].append(acc)

                # Second cross-validation pass scored by ROC AUC.
                # NOTE(review): this pass uses 5 folds while the accuracy pass
                # above uses 3 -- confirm the difference is deliberate.
                aucs = sklearn.cross_validation.cross_val_score(
                    ensemble, X, Y, cv = 5, scoring='roc_auc')
Exemplo n.º 2
0
  x_test_true = f.transform(cancer_peptides)
  x_test_false = f.transform(self_peptides)
  x_test = np.vstack([x_test_true, x_test_false])
  y_test = np.ones(x_test.shape[0], dtype='bool')
  y_test[len(x_test_true):] = 0
  eval_dataset.eval_split(x,y,x_test,y_test)

ASSAY = 'cytotoxicity'

print
print "---"
print "aromatic unigram"
X, Y, f = iedb.load_tcell_ngrams(
                 noisy_labels = 'majority', assay_group = ASSAY, subsample_bigger_class = True,
                 human = True,
                 mhc_class = 1,
                 max_ngram = 1,
                 reduced_alphabet= reduced_alphabet.aromatic2,
                 return_transformer = True)

eval_dataset.eval_cv(X, Y)
print "Tumor-specific antigens"
run(X,Y,f)

print
print "---"
print "aromatic bigram"
X, Y, f = iedb.load_tcell_ngrams(
                 noisy_labels = 'majority', assay_group = ASSAY, subsample_bigger_class = True,
                 human = True,
                 mhc_class = 1,
Exemplo n.º 3
0
import sklearn.linear_model

from epitopes import iedb
import eval_dataset

"""
Instead of dropping or keeping the noisy labels, started
trying to just use the majority vote. This is saner and became the default.
"""


print
print "---"
print "Human MHC1"
X_human_mhc1_filter, Y_human_mhc1_filter = iedb.load_tcell_ngrams(
                 noisy_labels = 'majority',
                 human = True,
                 mhc_class = 1)
eval_dataset.eval_cv(X_human_mhc1_filter, Y_human_mhc1_filter)



print
print "---"
print "No HLA-A2"
X_no_hla_a2, Y_no_hla_a2 = iedb.load_tcell_ngrams(
                 noisy_labels = 'majority',
                 human = True,
                 mhc_class = 1,
                 exclude_hla_type = 'HLA-A2$|A-\*02')
eval_dataset.eval_cv(X_no_hla_a2, Y_no_hla_a2)
Exemplo n.º 4
0
from epitopes import iedb, amino_acid, features, reduced_alphabet

import eval_dataset

"""
Do results from a restricted HLA sample (only A2) generalize to all the other HLA types?

(repeated for AA bigrams)
"""
# Regex for HLA-A2: the serotype spelling ending in "A2" or an allele name
# containing "A*02". Raw string so the escaped '*' reaches the regex engine
# without relying on Python leaving the unknown escape '\*' untouched.
A2 = r"A2$|A\*02"

print
print "---"
print "Human MHC1 (keep)"
X_human_mhc1, Y_human_mhc1 = iedb.load_tcell_ngrams(noisy_labels="keep", human=True, max_ngram=2, mhc_class=1)
eval_dataset.eval_cv(X_human_mhc1, Y_human_mhc1)


print
print "---"
print "Human MHC1 (drop)"
X_human_mhc1_filter, Y_human_mhc1_filter = iedb.load_tcell_ngrams(
    noisy_labels="drop", human=True, max_ngram=2, mhc_class=1
)
eval_dataset.eval_cv(X_human_mhc1_filter, Y_human_mhc1_filter)


print
print "---"
print "Human MHC1 noisy = positive"
Exemplo n.º 5
0
  lr.fit(X,Y)
  print "LR coefs", lr.coef_


  n_classifiers = 200

  rf = sklearn.ensemble.RandomForestClassifier(n_classifiers)
  print "RF Accuracy", np.mean(sklearn.cross_validation.cross_val_score(rf, X, Y, cv = 10))

  rf.fit(X,Y)
  print "RF Features", rf.feature_importances_


print "4 letter alphabet:"
X4,Y4 = iedb.load_tcell_ngrams(
  assay_group = 'cytotoxicity',
  reduced_alphabet = reduced_alphabet.gbmr4,
)
run_classifiers(X4, Y4)

print "---"
print
print "12 letter alphabet:"
X12,Y12 = iedb.load_tcell_ngrams(
  assay_group = 'cytotoxicity',
  reduced_alphabet = reduced_alphabet.sdm12,
)

run_classifiers(X12, Y12)

print "---"
print
Exemplo n.º 6
0
import sklearn.linear_model

from epitopes import iedb, amino_acid, features, reduced_alphabet

import eval_dataset

"""
Do results from a restricted HLA sample (only A2) generalize to all the other HLA types?
"""
# Regex for HLA-A2: the serotype spelling ending in "A2" or an allele name
# containing "A*02". Raw string so the escaped '*' reaches the regex engine
# without relying on Python leaving the unknown escape '\*' untouched.
A2 = r'A2$|A\*02'

print
print "---"
print "Human MHC1 (keep)"
X_human_mhc1, Y_human_mhc1 = iedb.load_tcell_ngrams(
                 noisy_labels = 'keep',
                 human = True,
                 mhc_class = 1)
eval_dataset.eval_cv(X_human_mhc1, Y_human_mhc1)


print
print "---"
print "Human MHC1 (drop)"
X_human_mhc1_filter, Y_human_mhc1_filter = iedb.load_tcell_ngrams(
                 noisy_labels = 'drop',
                 human = True,
                 mhc_class = 1)
eval_dataset.eval_cv(X_human_mhc1_filter, Y_human_mhc1_filter)


print