# NOTE(review): this excerpt starts mid-script; `dir_results`,
# `distances_list`, `names`, `DTCRU` and `Assess_Performance_KNN` come from
# code above this point.
# exist_ok=True keeps a re-run from failing with FileExistsError when the
# results directory is already there (and is harmless if an existence check
# precedes this line).
os.makedirs(dir_results, exist_ok=True)

# Re-label the KNN metric columns into the Class/AUC/Method/Type layout.
df_metrics = Assess_Performance_KNN(
    distances_list, names, DTCRU.class_id, dir_results, metrics=['AUC'])
df_u = pd.DataFrame()
df_u['Class'] = df_metrics['Classes']
df_u['AUC'] = df_metrics['Value']
df_u['Method'] = df_metrics['Algorithm']
df_u['Type'] = 'Unsupervised'

# Run Supervised Sequence Classifier
DTCRS = DeepTCR_SS('Sequence_C')
DTCRS.Get_Data(
    directory='../../Data/Murine_Antigens',
    Load_Prev_Data=True,
    aggregate_by_aa=True,
    aa_column_beta=0,
    count_column=1,
    v_beta_column=2,
    j_beta_column=3,
)

# Presumably filled inside the loop below, whose body continues past this
# excerpt -- confirm against the full script.
AUC = []
Class = []
Method = []
for i in range(10):
    DTCRS.Get_Train_Valid_Test()
    DTCRS.Train(use_only_seq=True)
import pandas as pd
from DeepTCR.DeepTCR import DeepTCR_SS
import numpy as np

epitope = 'ELAGIGILTV'

# Column names read from the McPAS-TCR table.
cdr3_beta_col = 'CDR3.beta.aa'
cdr3_alpha_col = 'CDR3.alpha.aa'
epitope_col = 'Epitope.peptide'

df = pd.read_csv('../../../Data/10x_Data/Data_Regression.csv')
DTCRS = DeepTCR_SS('reg_mart1', device=2)

# Check performance on McPAS sequences that do not occur in the 10x data:
# build an "alpha_beta" identifier for every 10x pair first.
df_train_pep = pd.DataFrame()
df_train_pep['alpha'] = np.asarray(df['alpha'].tolist())
df_train_pep['beta'] = np.asarray(df['beta'].tolist())
df_train_pep['seq_id'] = df_train_pep['alpha'] + '_' + df_train_pep['beta']

# Load McPAS, keep rows with both chains, and collapse duplicate alpha/beta
# pairs to their first listed epitope.
df_tcr = pd.read_csv('../../../Data/McPAS-TCR.csv')
df_tcr.dropna(subset=[cdr3_alpha_col, cdr3_beta_col], inplace=True)
df_tcr = (
    df_tcr
    .groupby([cdr3_alpha_col, cdr3_beta_col])
    .agg({epitope_col: 'first'})
    .reset_index()
)
df_tcr['seq_id'] = df_tcr[cdr3_alpha_col] + '_' + df_tcr[cdr3_beta_col]

# Drop every pair that is also present in the 10x data.
df_tcr = df_tcr[~df_tcr['seq_id'].isin(df_train_pep['seq_id'])]

# Drop rows whose alpha or beta CDR3 contains punctuation, a space or a digit.
remove = ["""[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ 1234567890]"""]
for chain_col in (cdr3_alpha_col, cdr3_beta_col):
    has_invalid = df_tcr[chain_col].str.contains('|'.join(remove), regex=True)
    df_tcr = df_tcr[~has_invalid]
from DeepTCR.DeepTCR import DeepTCR_SS
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy.spatial.distance import pdist, squareform
from scipy.stats import spearmanr
import seaborn as sns
import pickle
import os
import matplotlib

# Use Arial for all figure text.
matplotlib.rc('font', family='Arial')

# Instantiate training object
DTCRU = DeepTCR_SS('Murine_Sup')

# Load Data (restricted to the five Db-* classes listed here)
DTCRU.Get_Data(
    directory='../../Data/Murine_Antigens',
    Load_Prev_Data=False,
    aa_column_beta=0,
    count_column=1,
    v_beta_column=2,
    j_beta_column=3,
    classes=['Db-F2', 'Db-M45', 'Db-NP', 'Db-PA', 'Db-PB1'],
)
import pandas as pd
from DeepTCR.DeepTCR import DeepTCR_SS
import numpy as np
from multiprocessing import Pool
import os
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde
import shutil
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
# NOTE(review): Bio.Alphabet is not available in recent Biopython releases
# (removed in 1.78, as far as I know) -- confirm the pinned version.
from Bio.Alphabet import IUPAC

df = pd.read_csv('../../Data/10x_Data/Data_Regression.csv')
DTCRS = DeepTCR_SS('reg', device=2)
p = Pool(40)

# Get alpha/beta sequences
alpha = np.asarray(df['alpha'].tolist())
beta = np.asarray(df['beta'].tolist())

# Regression target: log2(value + 1) of the column named after the antigen.
antigen = 'A0201_GILGFVFTL_Flu-MP_Influenza'
i = np.where(df.columns == antigen)[0][0]
sel = df.iloc[:, i]
Y = np.log2(np.asarray(sel.tolist()) + 1)

DTCRS.Load_Data(alpha_sequences=alpha, beta_sequences=beta, Y=Y, p=p)
DTCRS.K_Fold_CrossVal(split_by_sample=False, folds=5)
DTCRS.Representative_Sequences(
    top_seq=100, motif_seq=10, color_scheme='hydrophobicity')

# NOTE(review): `dir` shadows the builtin; the name is kept because code past
# this excerpt may read it.
dir = 'Reg_Rep_Sequences'
import numpy as np
import pandas as pd
from DeepTCR.DeepTCR import DeepTCR_SS
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rc('font', family='Arial')
from sklearn.metrics import roc_auc_score, roc_curve

DTCRS = DeepTCR_SS('reg_flu', device=2)

# The alpha/beta pair under study, with one 0/1 flag per residue
# (presumably marking contact positions -- confirm against the source data).
alpha = 'CAGAGSQGNLIF'
beta = 'CASSSRSSYEQYF'
contacts_alpha = [0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0]
contacts_beta = [0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0]

# The same sequence is passed twice in each input array.
input_alpha = np.array([alpha, alpha])
input_beta = np.array([beta, beta])

fig_rsl, ax_rsl = DTCRS.Residue_Sensitivity_Logo(
    input_alpha,
    input_beta,
    background_color='black',
    Load_Prev_Data=False,
)

# Per-residue table for the alpha chain: residue letter, the value read from
# DTCRS.mag_alpha, and the 0/1 flag.
df_alpha = pd.DataFrame()
df_alpha['seq'] = list(alpha)
df_alpha['mag'] = DTCRS.mag_alpha
df_alpha['label'] = contacts_alpha

# Same table for the beta chain (construction continues past this excerpt).
df_beta = pd.DataFrame()
df_beta['seq'] = list(beta)
df_beta['mag'] = DTCRS.mag_beta
"""Run regression on the 10x dataset.

Alpha/beta TCRs are regressed against the quantitative evaluation of
antigen-specificity via dCODE Dextramer reagents.
"""
import pandas as pd
from DeepTCR.DeepTCR import DeepTCR_SS
import numpy as np
from multiprocessing import Pool
import os
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde

df = pd.read_csv('../../Data/10x_Data/Data_Regression.csv')
DTCRS = DeepTCR_SS('reg', device='/gpu:2')
p = Pool(40)

# Get alpha/beta sequences
alpha = np.asarray(df['alpha'].tolist())
beta = np.asarray(df['beta'].tolist())

y_pred = []
y_test = []
antigen = []

# Iterate through all antigens: every column from index 2 onward is used as
# a regression target.
for i in range(2, len(df.columns)):
    sel = df.iloc[:, i]
    print(sel.name)
    Y = np.log2(np.asarray(sel.tolist()) + 1)
    DTCRS.Load_Data(alpha_sequences=alpha, beta_sequences=beta, Y=Y, p=p)
    DTCRS.K_Fold_CrossVal(split_by_sample=False, folds=5)
    y_pred.append(DTCRS.y_pred)
# NOTE(review): this excerpt starts mid-script; `dir_results`,
# `distances_list`, `names`, `DTCRU` and `Assess_Performance_KNN` come from
# code above this point.
# exist_ok=True keeps a re-run from failing with FileExistsError when the
# results directory is already there (and is harmless if an existence check
# precedes this line).
os.makedirs(dir_results, exist_ok=True)

# Re-label the KNN metric columns into the Class/AUC/Method/Type layout.
df_metrics = Assess_Performance_KNN(
    distances_list, names, DTCRU.class_id, dir_results, metrics=['AUC'])
df_u = pd.DataFrame()
df_u['Class'] = df_metrics['Classes']
df_u['AUC'] = df_metrics['Value']
df_u['Method'] = df_metrics['Algorithm']
df_u['Type'] = 'Unsupervised'

# Run Supervised Sequence Classifier
DTCRS = DeepTCR_SS('Sequence_C', device=1)
DTCRS.Get_Data(
    directory='../../Data/Murine_Antigens',
    Load_Prev_Data=True,
    aggregate_by_aa=True,
    aa_column_beta=0,
    count_column=1,
    v_beta_column=2,
    j_beta_column=3,
)

# Presumably filled inside the loop below, whose body continues past this
# excerpt -- confirm against the full script.
AUC = []
Class = []
Method = []

# One fixed NumPy seed per fold (0 .. folds - 1).
folds = 100
seeds = np.array(range(folds))
for i in range(folds):
    np.random.seed(seeds[i])
import numpy as np
import pandas as pd
from DeepTCR.DeepTCR import DeepTCR_SS
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib

# Use Arial for all figure text.
matplotlib.rc('font', family='Arial')

DTCRS = DeepTCR_SS('reg_mart1', device=2)

# The alpha/beta pair under study; the same sequence is passed twice in each
# input array.
alpha = 'CAVNFGGGKLIF'
beta = 'CASSWSFGTEAFF'
input_alpha = np.array([alpha, alpha])
input_beta = np.array([beta, beta])

pred = DTCRS.Sequence_Inference(input_alpha, input_beta)

# Residue sensitivity logo, saved on a black background.
fig_rsl, ax_rsl = DTCRS.Residue_Sensitivity_Logo(
    input_alpha,
    input_beta,
    background_color='black',
    Load_Prev_Data=False,
)
fig_rsl.savefig('mart1_rsl.png', dpi=1200, facecolor='black')

# Two side-by-side panels; the left one shows the alpha-chain swarm plot.
fig, ax = plt.subplots(1, 2, figsize=(10, 5))
sns.swarmplot(data=DTCRS.df_alpha_list[0], x='pos', y='high', ax=ax[0])

# Style panel `i`: no axis titles, residue letters as x tick labels, and no
# right/top border.
i = 0
ax[i].set_xlabel('')
ax[i].set_ylabel('')
ax[i].set_xticklabels(list(alpha), size=24)
ax[i].tick_params(axis='y', labelsize=18)
for side in ('right', 'top'):
    ax[i].spines[side].set_visible(False)
from DeepTCR.DeepTCR import DeepTCR_SS
from multiprocessing import Pool
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, roc_curve
import os

# NOTE(review): the pool is never closed within this excerpt; add
# p.close()/p.join() at the end of the script if nothing below still uses it.
p = Pool(80)

# Create the results directory in one idempotent call instead of checking for
# it first: no window between the check and the creation, and safe on re-run.
dir_results = 'alpha_v_beta_results'
os.makedirs(dir_results, exist_ok=True)

antigens = [
    'GANAB-S5F',
    'ATP6AP1-KLG_G3W',
    'CMV-MLN',
    'GNL3L-R4C',
    'MART1-A2L',
    'YFV-LLW',
]

# One model per antigen: load that antigen's folder (alpha CDR3 in column 0,
# beta CDR3 in column 1), cross-validate, then extract representative
# sequences.
for a in antigens:
    DTCR = DeepTCR_SS(a + 'Rep')
    DTCR.Get_Data(
        directory='../../Data/Zhang/' + a,
        aa_column_alpha=0,
        aa_column_beta=1,
        p=p,
    )
    DTCR.Monte_Carlo_CrossVal(folds=50, weight_by_class=True)
    DTCR.Representative_Sequences()
from DeepTCR.DeepTCR import DeepTCR_SS, DeepTCR_WF

folds = 100
LOO = 4
epochs_min = 100

# Both classifiers load the same directory with the same column layout, so
# the keyword set is written once and shared.
rudqvist_data = dict(
    directory='../../Data/Rudqvist',
    Load_Prev_Data=False,
    aa_column_beta=1,
    count_column=2,
    v_beta_column=7,
    d_beta_column=14,
    j_beta_column=21,
)

# Train Sequence Classifier
DTCR_SS = DeepTCR_SS('Rudqvist_SS')
DTCR_SS.Get_Data(**rudqvist_data)
DTCR_SS.Monte_Carlo_CrossVal(folds=folds, test_size=0.25)
DTCR_SS.AUC_Curve(filename='AUC.eps')

# Train Repertoire Classifier without on-graph clustering
DTCR_WF = DeepTCR_WF('Rudqvist_WF')
DTCR_WF.Get_Data(**rudqvist_data)
DTCR_WF.Monte_Carlo_CrossVal(folds=folds, LOO=LOO, epochs_min=epochs_min)
DTCR_WF.AUC_Curve(filename='Rep_AUC.eps')
from DeepTCR.DeepTCR import DeepTCR_SS, DeepTCR_WF

# Both classifiers load the same directory with the same column layout, so
# the keyword set is written once and shared.
rudqvist_data = dict(
    directory='../../Data/Rudqvist',
    Load_Prev_Data=False,
    aggregate_by_aa=True,
    aa_column_beta=1,
    count_column=2,
    v_beta_column=7,
    d_beta_column=14,
    j_beta_column=21,
)

# NOTE(review): both objects are created with the same name 'Rudqvist';
# verify that their outputs do not overwrite each other.

# Train Sequence Classifier
DTCR_SS = DeepTCR_SS('Rudqvist')
DTCR_SS.Get_Data(**rudqvist_data)
DTCR_SS.Monte_Carlo_CrossVal(folds=100, test_size=0.25)
DTCR_SS.AUC_Curve()

# Train Repertoire Classifier without on-graph clustering
DTCR_WF = DeepTCR_WF('Rudqvist')
DTCR_WF.Get_Data(**rudqvist_data)
DTCR_WF.Monte_Carlo_CrossVal(folds=100, LOO=4, epochs_min=50)
DTCR_WF.AUC_Curve()

# Train Repertoire Classifier with on-graph clustering
""" """This script was used to train the supervised TCR sequence classifier and generate the top representative sequences for each class and derive the motifs that were learned by the network.""" from DeepTCR.DeepTCR import DeepTCR_SS from Bio import SeqIO from Bio.SeqRecord import SeqRecord from Bio.Seq import Seq from Bio.Alphabet import IUPAC import numpy as np import os import shutil #Run Supervised Sequence Classifier DTCRS = DeepTCR_SS('Sequence_C', device=6) DTCRS.Get_Data(directory='../../Data/Murine_Antigens', Load_Prev_Data=True, aggregate_by_aa=True, aa_column_beta=0, count_column=1, v_beta_column=2, j_beta_column=3) folds = 100 seeds = np.array(range(folds)) graph_seed = 0 DTCRS.Monte_Carlo_CrossVal(folds=folds, graph_seed=graph_seed, seeds=seeds) DTCRS.Representative_Sequences(top_seq=25, motif_seq=10, color_scheme='hydrophobicity')
# NOTE(review): this excerpt starts mid-script; `sequences`,
# `seq_class_labels`, `predicted`, `counts`, `gpu`, `folds`, `graph_seed` and
# `seeds` come from code above this point.

# Per input group, keep only the entries whose predicted value exceeds
# `thresh`, then flatten everything into single training arrays.
thresh = 0.99
seq_train = []
label_train = []
count_train = []
for s, seq_cl, p, c in zip(sequences, seq_class_labels, predicted, counts):
    sel_idx = p > thresh
    seq_train.append(s[sel_idx])
    label_train.append(seq_cl[sel_idx])
    count_train.append(c[sel_idx])
seq_train = np.hstack(seq_train)
label_train = np.hstack(label_train)
count_train = np.hstack(count_train)

# Train Sequence Classifier
DTCR = DeepTCR_SS('tw10_seq', device=gpu)
DTCR.Load_Data(beta_sequences=seq_train, class_labels=label_train)
DTCR.Monte_Carlo_CrossVal(
    folds=folds,
    graph_seed=graph_seed,
    seeds=seeds,
    convergence='training',
)
y_pred = DTCR.predicted
y_test = DTCR.Y

# ROC figure: fixed axis limits and labels, then one curve/score per class
# column (the loop body continues past this excerpt).
plt.figure(figsize=(6, 5))
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
for ii, cl in enumerate(DTCR.lb.classes_):
    fpr, tpr, _ = roc_curve(y_test[:, ii], y_pred[:, ii])
    roc_score = roc_auc_score(y_test[:, ii], y_pred[:, ii])
from DeepTCR.DeepTCR import DeepTCR_SS
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy.spatial.distance import pdist, squareform
from scipy.stats import spearmanr
import seaborn as sns
import pickle
import os
import matplotlib

# Use Arial for all figure text.
matplotlib.rc('font', family='Arial')

# Instantiate training object
DTCRU = DeepTCR_SS('Murine_Sup')

# Load Data (disabled in this version of the script)
# DTCRU.Get_Data(directory='../../Data/Murine_Antigens',Load_Prev_Data=False,
#                aa_column_beta=0,count_column=1,v_beta_column=2,j_beta_column=3,
#                classes=['Db-F2', 'Db-M45', 'Db-NP', 'Db-PA', 'Db-PB1'])
# DTCRU.Monte_Carlo_CrossVal(folds=5)

# Second object that only loads the four Kb-* classes; its beta sequences are
# pulled out for use further down the script.
DTCR_inf = DeepTCR_SS('load')
DTCR_inf.Get_Data(
    directory='../../Data/Murine_Antigens',
    Load_Prev_Data=False,
    aa_column_beta=0,
    count_column=1,
    v_beta_column=2,
    j_beta_column=3,
    classes=['Kb-M38', 'Kb-SIY', 'Kb-TRP2', 'Kb-m139'],
)
beta_sequences = DTCR_inf.beta_sequences
import pandas as pd
from DeepTCR.DeepTCR import DeepTCR_SS
import numpy as np
from scipy.stats import gaussian_kde
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rc('font', family='Arial')
import pickle

df = pd.read_csv('../../../Data/10x_Data/Data_Regression.csv')
antigen = 'A0201_ELAGIGILTV_MART-1_Cancer'
DTCRS = DeepTCR_SS('reg_mart1', device=2)

# Get alpha/beta sequences
alpha = np.asarray(df['alpha'].tolist())
beta = np.asarray(df['beta'].tolist())

# Regression target: log2(value + 1) of the column named after the antigen.
i = np.where(df.columns == antigen)[0][0]
sel = df.iloc[:, i]
Y = np.log2(np.asarray(sel.tolist()) + 1)
DTCRS.Load_Data(alpha_sequences=alpha, beta_sequences=beta, Y=Y)

# Fixed seeds (0 .. folds - 1 per fold, 0 for the graph) are passed to the
# cross-validation call.
folds = 5
seeds = np.arange(folds)
graph_seed = 0
DTCRS.K_Fold_CrossVal(
    split_by_sample=False,
    folds=folds,
    seeds=seeds,
    graph_seed=graph_seed,
)

# Persist [antigen name, squeezed predictions, targets] for later use.
with open('mart1_preds.pkl', 'wb') as f:
    pickle.dump([antigen, np.squeeze(DTCRS.predicted), Y], f, protocol=4)
"""Fig 2C.

Train the supervised TCR sequence classifier, generate the top
representative sequences for each class, and derive the motifs that were
learned by the network.
"""
from DeepTCR.DeepTCR import DeepTCR_SS, DeepTCR_U
import numpy as np
import seaborn as sns

# Run Supervised Sequence Classifier
DTCRS = DeepTCR_SS('Sequence_C')
DTCRS.Get_Data(
    directory='../../Data/Murine_Antigens',
    Load_Prev_Data=False,
    aggregate_by_aa=True,
    aa_column_beta=0,
    count_column=1,
    v_beta_column=2,
    j_beta_column=3,
)
DTCRS.Monte_Carlo_CrossVal(folds=10, stop_criterion=0.01)
DTCRS.Representative_Sequences(top_seq=10, unique=True)

# Imported only after training, as in the original script order.
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC

# Stop on the first key of Rep_Seq so `item` names it, then fetch its entry.
for item in DTCRS.Rep_Seq:
    break
t = DTCRS.Rep_Seq[item]
"""Figure 2B.

Create the ROC curves used to assess how well the supervised sequence
classifier predicts the antigen-specificity of the 9 murine antigens in the
manuscript.
"""
from DeepTCR.DeepTCR import DeepTCR_SS

# Run Supervised Sequence Classifier
DTCRS = DeepTCR_SS('Sequence_C')
DTCRS.Get_Data(
    directory='../../Data/Murine_Antigens',
    Load_Prev_Data=False,
    aggregate_by_aa=True,
    aa_column_beta=0,
    count_column=1,
    v_beta_column=2,
    j_beta_column=3,
)
DTCRS.Monte_Carlo_CrossVal(folds=10)
DTCRS.AUC_Curve()
Supplementary Figure 17 """ """This script is used to benchmark DeepTCR's Sequence Classifier against an SVM and RF where the inputs for those latter machine learning algorithms are the outputs of a K-mer search""" import numpy as np import pandas as pd from DeepTCR.DeepTCR import DeepTCR_SS from NN_Assessment_utils import * from sklearn.svm import SVC from sklearn.metrics import roc_auc_score, roc_curve import seaborn as sns from sklearn.ensemble import RandomForestClassifier DTCRS = DeepTCR_SS('Sequence_C') DTCRS.Get_Data(directory='../../Data/Murine_Antigens', Load_Prev_Data=False, aggregate_by_aa=True, aa_column_beta=0, count_column=1, v_beta_column=2, j_beta_column=3) kmer_features = kmer_search(DTCRS.beta_sequences) clf_svm = SVC(probability=True) clf_rf = RandomForestClassifier(n_estimators=100) y_test_list = [] y_pred_list_dtcr = [] y_pred_list_svm = [] y_pred_list_rf = []
"""Figure 3B.

Train both the sequence and repertoire classifier on the Rudqvist_2017
dataset and compare their performances.
"""
from DeepTCR.DeepTCR import DeepTCR_SS, DeepTCR_WF
from sklearn.metrics import roc_curve, roc_auc_score
import numpy as np
from matplotlib import pyplot as plt

# Train Sequence Classifier
DTCR_SS = DeepTCR_SS('Rudqvist_SS', device='/device:GPU:0')
DTCR_SS.Get_Data(
    directory='../../Data/Rudqvist',
    Load_Prev_Data=False,
    aa_column_beta=1,
    count_column=2,
    v_beta_column=7,
    d_beta_column=14,
    j_beta_column=21,
)
DTCR_SS.K_Fold_CrossVal(folds=5)

# Train Repertoire Classifier: settings consumed further down the script.
folds = 100
LOO = 4
epochs_min = 10
size_of_net = 'small'
num_concepts = 64
hinge_loss_t = 0.1
train_loss_min = 0.1
seeds = np.arange(folds)
graph_seed = 0
"""Figure 2B""" """This script is used to create the ROC curves for assessing the ability of supervised sequence classifier to correctly predict the antigen-specificity of the 9 murine antigens in the manuscript..""" from DeepTCR.DeepTCR import DeepTCR_SS import numpy as np import matplotlib.pyplot as plt import matplotlib matplotlib.rc('font', family='Arial') #Run Supervised Sequence Classifier DTCRS = DeepTCR_SS('Sequence_C', device=2) DTCRS.Get_Data(directory='../../../Data/Murine_Antigens', Load_Prev_Data=False, aggregate_by_aa=True, aa_column_beta=0, count_column=1, v_beta_column=2, j_beta_column=3) folds = 10 seeds = np.array(range(folds)) graph_seed = 0 DTCRS.Monte_Carlo_CrossVal(folds=folds, seeds=seeds, graph_seed=graph_seed) DTCRS.AUC_Curve(xlabel_size=24, ylabel_size=24, xtick_size=18, ytick_size=18, legend_font_size=14,
"""Figure 2D""" """This script is used to benchmark the performance of the Supervised Sequence Classifier with either the alpha chain, beta chain, or both provided to the model.""" from DeepTCR.DeepTCR import DeepTCR_SS from multiprocessing import Pool import matplotlib.pyplot as plt from sklearn.metrics import roc_auc_score, roc_curve import os p = Pool(80) dir_results = 'alpha_v_beta_results' if not os.path.exists(dir_results): os.makedirs(dir_results) DTCR = DeepTCR_SS('alpha_v_beta_SS') antigens = ['ATP6AP1-KLG_G3W', 'GNL3L-R4C', 'MART1-A2L', 'YFV-LLW'] opt = ['alpha', 'beta', 'alpha_beta'] for a in antigens: y_pred_list = [] y_test_list = [] for o in opt: if o == 'alpha': DTCR = DeepTCR_SS('alpha_v_beta_SS') DTCR.Get_Data(directory='../../Data/Zhang/' + a, aa_column_alpha=0, p=p) elif o == 'beta':
"""Figure 2E.

Run regression on the 10x dataset: alpha/beta TCRs are regressed against the
quantitative evaluation of antigen-specificity via dCODE Dextramer reagents.
"""
import pandas as pd
from DeepTCR.DeepTCR import DeepTCR_SS
import numpy as np
from multiprocessing import Pool
import os
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde

df = pd.read_csv('../../Data/10x_Data/Data_Regression.csv')
DTCRS = DeepTCR_SS('reg', device=2)
p = Pool(40)

# Get alpha/beta sequences
alpha = np.asarray(df['alpha'].tolist())
beta = np.asarray(df['beta'].tolist())

y_pred = []
y_test = []
antigen = []

# Seeds defined for the cross-validation further down the script.
folds = 5
seeds = np.arange(folds)
graph_seed = 0

# Iterate through all antigens: every column from index 2 onward is visited
# (the loop body continues past this excerpt).
for i in range(2, len(df.columns)):
    sel = df.iloc[:, i]
    print(sel.name)