Exemplo n.º 1
0
"""Figure 2D"""
"""This script is used to benchmark the performance of the Supervised Sequence Classifier
with either the alpha chain, beta chain, or both provided to the model."""

from DeepTCR.DeepTCR import DeepTCR_SS
from multiprocessing import Pool
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, roc_curve
import os

# Worker pool; handed to Get_Data through its `p` argument further down.
p = Pool(80)

# Directory for this benchmark's outputs. exist_ok=True replaces the separate
# os.path.exists() check, so there is no gap between checking and creating.
dir_results = 'alpha_v_beta_results'
os.makedirs(dir_results, exist_ok=True)

DTCR = DeepTCR_SS('alpha_v_beta_SS')

# Dataset folder names; each is appended to '../../Data/Zhang/' when loading.
antigens = ['ATP6AP1-KLG_G3W', 'GNL3L-R4C', 'MART1-A2L', 'YFV-LLW']

# Chain configurations compared for every antigen.
opt = ['alpha', 'beta', 'alpha_beta']

for a in antigens:
    y_pred_list = []
    y_test_list = []
    for o in opt:
        if o == 'alpha':
            DTCR = DeepTCR_SS('alpha_v_beta_SS')
            DTCR.Get_Data(directory='../../Data/Zhang/' + a,
                          aa_column_alpha=0,
                          p=p)
        elif o == 'beta':
Exemplo n.º 2
0
"""Figure 3B"""
"""This script is used to train both the sequence and repertoire classifier on the
Rudqvist_2017 dataset and compare their performances."""

from DeepTCR.DeepTCR import DeepTCR_SS, DeepTCR_WF
from sklearn.metrics import roc_curve, roc_auc_score
import numpy as np
from matplotlib import pyplot as plt

# Sequence classifier: build the object, load the Rudqvist data, cross-validate.
DTCR_SS = DeepTCR_SS('Rudqvist_SS', device='/device:GPU:0')

# Column indices handed to Get_Data for the Rudqvist files.
rudqvist_columns = dict(
    aa_column_beta=1,
    count_column=2,
    v_beta_column=7,
    d_beta_column=14,
    j_beta_column=21,
)
DTCR_SS.Get_Data(
    directory='../../Data/Rudqvist',
    Load_Prev_Data=False,
    **rudqvist_columns,
)

DTCR_SS.K_Fold_CrossVal(folds=5)

# Settings for the repertoire classifier.
folds, LOO, epochs_min = 100, 4, 10
size_of_net = 'small'
num_concepts = 64
hinge_loss_t = 0.1
train_loss_min = 0.1
graph_seed = 0
# One integer seed per fold: 0 .. folds - 1.
seeds = np.array(range(folds))
Exemplo n.º 3
0
    os.makedirs(dir_results)

# Run Assess_Performance_KNN over distances_list, requesting only the AUC metric.
df_metrics = Assess_Performance_KNN(
    distances_list, names, DTCRU.class_id, dir_results, metrics=['AUC'])

# Copy the metric columns under new names and tag every row as 'Unsupervised'.
df_u = pd.DataFrame()
for new_col, old_col in (('Class', 'Classes'),
                         ('AUC', 'Value'),
                         ('Method', 'Algorithm')):
    df_u[new_col] = df_metrics[old_col]
df_u['Type'] = 'Unsupervised'

# Supervised sequence classifier on the Murine_Antigens data.
DTCRS = DeepTCR_SS('Sequence_C', device=1)

# Column indices handed to Get_Data.
murine_columns = dict(aa_column_beta=0,
                      count_column=1,
                      v_beta_column=2,
                      j_beta_column=3)
DTCRS.Get_Data(directory='../../Data/Murine_Antigens',
               Load_Prev_Data=True,
               aggregate_by_aa=True,
               **murine_columns)

# Result accumulators and per-fold seeds (0 .. folds - 1) for the loop below.
AUC, Class, Method = [], [], []
folds = 100
seeds = np.array(range(folds))
for i in range(folds):
    np.random.seed(seeds[i])
Exemplo n.º 4
0
"""Figure 2B"""
"""This script is used to create the ROC curves for assessing the ability
of supervised sequence classifier to correctly predict the antigen-specificity of 
the 9 murine antigens in the manuscript."""

from DeepTCR.DeepTCR import DeepTCR_SS

# Supervised sequence classifier: load the data, cross-validate, draw AUC curves.
DTCRS = DeepTCR_SS('Sequence_C')

get_data_kwargs = {
    'directory': '../../Data/Murine_Antigens',
    'Load_Prev_Data': False,
    'aggregate_by_aa': True,
    # Column indices of the input files.
    'aa_column_beta': 0,
    'count_column': 1,
    'v_beta_column': 2,
    'j_beta_column': 3,
}
DTCRS.Get_Data(**get_data_kwargs)

DTCRS.Monte_Carlo_CrossVal(folds=10)
DTCRS.AUC_Curve()
Exemplo n.º 5
0
"""This script runs regression for the 10x Dataset, where alpha/beta TCRs are
regressed against the quantitative evaluation of antigen-specificity via
dCODE Dextramer reagents."""

import pandas as pd
from DeepTCR.DeepTCR import DeepTCR_SS
import numpy as np
from multiprocessing import Pool
import os
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde

# Regression table, model object and worker pool (created in this order).
df = pd.read_csv('../../Data/10x_Data/Data_Regression.csv')
DTCRS = DeepTCR_SS('reg',device='/gpu:2')
p = Pool(40)

# 'alpha' and 'beta' columns converted to NumPy arrays.
alpha, beta = (np.asarray(df[chain].tolist()) for chain in ('alpha', 'beta'))

# Accumulators filled by the per-antigen loop that follows.
y_pred, y_test, antigen = [], [], []
#Iterate through all antigens
for i in range(2,df.columns.shape[0]):
    print(df.iloc[:,i].name)
    sel = df.iloc[:,i]
    Y = np.log2(np.asarray(sel.tolist()) + 1)
    DTCRS.Load_Data(alpha_sequences=alpha, beta_sequences=beta, Y=Y,p=p)
    DTCRS.K_Fold_CrossVal(split_by_sample=False,folds=5)
    y_pred.append(DTCRS.y_pred)
Exemplo n.º 6
0
from DeepTCR.DeepTCR import DeepTCR_SS
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy.spatial.distance import pdist, squareform
from scipy.stats import spearmanr
import seaborn as sns
import pickle
import os
import matplotlib
matplotlib.rc('font', family='Arial')

# Create the training object.
DTCRU = DeepTCR_SS('Murine_Sup')

# Load only the listed classes from the Murine_Antigens directory.
murine_classes = ['Db-F2', 'Db-M45', 'Db-NP', 'Db-PA', 'Db-PB1']
DTCRU.Get_Data(
    directory='../../Data/Murine_Antigens',
    Load_Prev_Data=False,
    aa_column_beta=0,
    count_column=1,
    v_beta_column=2,
    j_beta_column=3,
    classes=murine_classes,
)
Exemplo n.º 7
0
import pandas as pd
from DeepTCR.DeepTCR import DeepTCR_SS
import numpy as np
from scipy.stats import gaussian_kde
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rc('font', family='Arial')
import pickle

df = pd.read_csv('../../../Data/10x_Data/Data_Regression.csv')
antigen = 'A0201_ELAGIGILTV_MART-1_Cancer'

DTCRS = DeepTCR_SS('reg_mart1', device=2)

# 'alpha' and 'beta' columns converted to NumPy arrays.
alpha, beta = (np.asarray(df[chain].tolist()) for chain in ('alpha', 'beta'))

# Regression target: log2(value + 1) of the first column named `antigen`.
i = np.where(df.columns == antigen)[0][0]
sel = df.iloc[:, i]
Y = np.log2(np.asarray(sel.tolist()) + 1)
DTCRS.Load_Data(alpha_sequences=alpha, beta_sequences=beta, Y=Y)

# Cross-validation settings: one seed per fold plus a fixed graph seed.
folds = 5
seeds = np.array(range(folds))
graph_seed = 0
cv_settings = dict(split_by_sample=False,
                   folds=folds,
                   seeds=seeds,
                   graph_seed=graph_seed)
DTCRS.K_Fold_CrossVal(**cv_settings)

# Save [antigen name, squeezed predictions, targets] for later use.
with open('mart1_preds.pkl', 'wb') as f:
    pickle.dump([antigen, np.squeeze(DTCRS.predicted), Y], f, protocol=4)
Exemplo n.º 8
0
import pandas as pd
from DeepTCR.DeepTCR import DeepTCR_SS
import numpy as np
from scipy.stats import gaussian_kde
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rc('font', family='Arial')
import pickle

df = pd.read_csv('../../../Data/10x_Data/Data_Regression.csv')
antigen = 'A0201_GLCTLVAML_BMLF1_EBV'

DTCRS = DeepTCR_SS('reg_ebv',device=2)

# 'alpha' and 'beta' columns converted to NumPy arrays.
alpha, beta = (np.asarray(df[chain].tolist()) for chain in ('alpha', 'beta'))

# Regression target: log2(value + 1) of the first column named `antigen`.
i = np.where(df.columns==antigen)[0][0]
sel = df.iloc[:, i]
Y = np.log2(np.asarray(sel.tolist()) + 1)
DTCRS.Load_Data(alpha_sequences=alpha, beta_sequences=beta, Y=Y)

# Cross-validation settings: one seed per fold plus a fixed graph seed.
folds = 5
seeds = np.array(range(folds))
graph_seed = 0
DTCRS.K_Fold_CrossVal(
    split_by_sample=False,
    folds=folds,
    seeds=seeds,
    graph_seed=graph_seed,
)

# Save [antigen name, squeezed predictions, targets] for later use.
with open('ebv_preds.pkl','wb') as f:
    pickle.dump([antigen,np.squeeze(DTCRS.predicted),Y],f,protocol=4)
Exemplo n.º 9
0
import numpy as np
import pandas as pd
from DeepTCR.DeepTCR import DeepTCR_SS
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rc('font', family='Arial')
from sklearn.metrics import roc_auc_score, roc_curve

DTCRS = DeepTCR_SS('reg_flu', device=2)

# Query sequences and their per-residue 0/1 labels (one entry per residue).
alpha, beta = 'CAGAGSQGNLIF', 'CASSSRSSYEQYF'
contacts_alpha = [0] * 4 + [1] * 4 + [0] * 4
contacts_beta = [0] * 4 + [1] * 4 + [0] * 5

# Each sequence is supplied twice to Residue_Sensitivity_Logo.
input_alpha = np.array([alpha] * 2)
input_beta = np.array([beta] * 2)
fig_rsl, ax_rsl = DTCRS.Residue_Sensitivity_Logo(
    input_alpha,
    input_beta,
    background_color='black',
    Load_Prev_Data=False,
)

# Per-residue table for the alpha chain: residue letter, the value stored in
# DTCRS.mag_alpha, and the 0/1 label from contacts_alpha.
# NOTE(review): assumes DTCRS.mag_alpha has one entry per residue of `alpha`
# (set by the Residue_Sensitivity_Logo call) - confirm.
df_alpha = pd.DataFrame()
df_alpha['seq'] = list(alpha)
df_alpha['mag'] = DTCRS.mag_alpha
df_alpha['label'] = contacts_alpha

# Same layout for the beta chain, using DTCRS.mag_beta.
df_beta = pd.DataFrame()
df_beta['seq'] = list(beta)
df_beta['mag'] = DTCRS.mag_beta
Exemplo n.º 10
0
df_tcr = pd.read_csv('../../Data/McPAS-TCR.csv')

# df['beta'] does not change between epitopes, so convert it to an array once
# instead of rebuilding it on every loop iteration.
beta_query = np.asarray(df['beta'].tolist())

# For each epitope, count how many entries of df['beta'] appear among the
# McPAS 'CDR3.beta.aa' values recorded for that epitope.
mcpas_counts = []
for e in epitopes:
    temp = df_tcr[df_tcr['Epitope.peptide'] == e]
    # One row per distinct CDR3 beta sequence for this epitope.
    temp = temp.groupby(['CDR3.beta.aa']).agg({'Epitope.peptide': 'first'}).reset_index()
    c = np.sum(np.isin(beta_query, temp['CDR3.beta.aa']))
    mcpas_counts.append(c)

# Antigen / epitope / count table, ordered from most to fewest matches.
df_epitope_counts = pd.DataFrame()
df_epitope_counts['antigen'] = antigens
df_epitope_counts['epitope'] = epitopes
df_epitope_counts['counts'] = mcpas_counts
df_epitope_counts.sort_values(by='counts',inplace=True,ascending=False)

DTCRS = DeepTCR_SS('reg_bm',device=2)

# Pick row z of the sorted counts table (z = 0 is its first row).
z=0
top_row = df_epitope_counts.iloc[z]
antigen = top_row['antigen']
epitope = top_row['epitope']

# 'alpha' and 'beta' columns converted to NumPy arrays.
alpha, beta = (np.asarray(df[chain].tolist()) for chain in ('alpha', 'beta'))

# Regression target: log2(value + 1) of the first column named `antigen`.
i = np.where(df.columns==antigen)[0][0]
sel = df.iloc[:, i]
Y = np.log2(np.asarray(sel.tolist()) + 1)
DTCRS.Load_Data(alpha_sequences=alpha, beta_sequences=beta, Y=Y)

# Cross-validation settings: one seed per fold plus a fixed graph seed.
folds = 5
seeds = np.array(range(folds))
graph_seed = 0
Exemplo n.º 11
0
from DeepTCR.DeepTCR import DeepTCR_SS
from multiprocessing import Pool
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, roc_curve
import os

# Results directory; exist_ok=True removes the separate existence check.
dir_results = 'alpha_v_beta_results'
os.makedirs(dir_results, exist_ok=True)

# Dataset folder names; each is appended to '../../Data/Zhang/' when loading.
antigens = [
    'GANAB-S5F', 'ATP6AP1-KLG_G3W', 'CMV-MLN', 'GNL3L-R4C', 'MART1-A2L',
    'YFV-LLW'
]

# The pool is used as a context manager so its worker processes are released
# once every antigen has been handled, or as soon as an error is raised.
# Previously the pool was never closed or terminated.
with Pool(80) as p:
    for a in antigens:
        # One model per antigen, named '<antigen>Rep'.
        DTCR = DeepTCR_SS(a + 'Rep')
        DTCR.Get_Data(directory='../../Data/Zhang/' + a,
                      aa_column_alpha=0,
                      aa_column_beta=1,
                      p=p)
        DTCR.Monte_Carlo_CrossVal(folds=50, weight_by_class=True)
        DTCR.Representative_Sequences()
Exemplo n.º 12
0
from DeepTCR.DeepTCR import DeepTCR_SS, DeepTCR_WF

# Both classifiers load the Rudqvist data with identical Get_Data arguments.
rudqvist_data = dict(
    directory='../../Data/Rudqvist',
    Load_Prev_Data=False,
    aggregate_by_aa=True,
    aa_column_beta=1,
    count_column=2,
    v_beta_column=7,
    d_beta_column=14,
    j_beta_column=21,
)

# Sequence classifier
# NOTE(review): this object and the repertoire classifier below are both
# created with the name 'Rudqvist' - confirm they are not meant to use
# separate names / output locations.
DTCR_SS = DeepTCR_SS('Rudqvist')
DTCR_SS.Get_Data(**rudqvist_data)
DTCR_SS.Monte_Carlo_CrossVal(folds=100, test_size=0.25)
DTCR_SS.AUC_Curve()

# Repertoire classifier without on-graph clustering
DTCR_WF = DeepTCR_WF('Rudqvist')
DTCR_WF.Get_Data(**rudqvist_data)
DTCR_WF.Monte_Carlo_CrossVal(folds=100, LOO=4, epochs_min=50)
DTCR_WF.AUC_Curve()

#Train Repertoire Classifier with on-graph clustering
from DeepTCR.DeepTCR import DeepTCR_SS, DeepTCR_WF

# Cross-validation settings shared by the two classifiers below.
folds, LOO, epochs_min = 100, 4, 100

# Both classifiers load the Rudqvist data with identical Get_Data arguments.
rudqvist_data = dict(
    directory='../../Data/Rudqvist',
    Load_Prev_Data=False,
    aa_column_beta=1,
    count_column=2,
    v_beta_column=7,
    d_beta_column=14,
    j_beta_column=21,
)

# Sequence classifier; AUC_Curve is called with filename 'AUC.eps'.
DTCR_SS = DeepTCR_SS('Rudqvist_SS')
DTCR_SS.Get_Data(**rudqvist_data)
DTCR_SS.Monte_Carlo_CrossVal(folds=folds, test_size=0.25)
DTCR_SS.AUC_Curve(filename='AUC.eps')

# Repertoire classifier without on-graph clustering; AUC_Curve is called
# with filename 'Rep_AUC.eps'.
DTCR_WF = DeepTCR_WF('Rudqvist_WF')
DTCR_WF.Get_Data(**rudqvist_data)
DTCR_WF.Monte_Carlo_CrossVal(folds=folds, LOO=LOO, epochs_min=epochs_min)
DTCR_WF.AUC_Curve(filename='Rep_AUC.eps')
Exemplo n.º 14
0
# Keep only the entries whose value in `predicted` is above the threshold.
thresh = 0.99
seq_train, label_train, count_train = [], [], []
for s, seq_cl, p, c in zip(sequences, seq_class_labels, predicted, counts):
    sel_idx = p > thresh
    # Apply the same mask to the sequences, their labels and their counts.
    for kept, values in ((seq_train, s),
                         (label_train, seq_cl),
                         (count_train, c)):
        kept.append(values[sel_idx])

# Join the per-item selections into one array each.
seq_train = np.hstack(seq_train)
label_train = np.hstack(label_train)
count_train = np.hstack(count_train)

# Sequence classifier trained on the thresholded sequences and labels.
DTCR = DeepTCR_SS('tw10_seq', device=gpu)
DTCR.Load_Data(beta_sequences=seq_train, class_labels=label_train)
DTCR.Monte_Carlo_CrossVal(
    folds=folds,
    graph_seed=graph_seed,
    seeds=seeds,
    convergence='training',
)
y_pred, y_test = DTCR.predicted, DTCR.Y

# Empty ROC axes; the per-class curves are added by the loop that follows.
plt.figure(figsize=(6, 5))
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
for ii, cl in enumerate(DTCR.lb.classes_, 0):
    fpr, tpr, _ = roc_curve(y_test[:, ii], y_pred[:, ii])
    roc_score = roc_auc_score(y_test[:, ii], y_pred[:, ii])
Exemplo n.º 15
0
from DeepTCR.DeepTCR import DeepTCR_SS
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy.spatial.distance import pdist, squareform
from scipy.stats import spearmanr
import seaborn as sns
import pickle
import os
import matplotlib

matplotlib.rc('font', family='Arial')

# Create the training object. The data-loading and cross-validation calls for
# it are currently commented out.
DTCRU = DeepTCR_SS('Murine_Sup')
# DTCRU.Get_Data(directory='../../Data/Murine_Antigens',Load_Prev_Data=False,
#                aa_column_beta=0,count_column=1,v_beta_column=2,j_beta_column=3,
#                classes=['Db-F2', 'Db-M45', 'Db-NP', 'Db-PA', 'Db-PB1'])
# DTCRU.Monte_Carlo_CrossVal(folds=5)

# Second object, loaded with the four Kb-* classes only.
DTCR_inf = DeepTCR_SS('load')
kb_classes = ['Kb-M38', 'Kb-SIY', 'Kb-TRP2', 'Kb-m139']
DTCR_inf.Get_Data(
    directory='../../Data/Murine_Antigens',
    Load_Prev_Data=False,
    aa_column_beta=0,
    count_column=1,
    v_beta_column=2,
    j_beta_column=3,
    classes=kb_classes,
)

beta_sequences = DTCR_inf.beta_sequences
Exemplo n.º 16
0
import pandas as pd
from DeepTCR.DeepTCR import DeepTCR_SS
import numpy as np
from multiprocessing import Pool
import os
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde
import shutil
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC

# Regression table, model object and worker pool (created in this order).
df = pd.read_csv('../../Data/10x_Data/Data_Regression.csv')
DTCRS = DeepTCR_SS('reg',device=2)
p = Pool(40)

# 'alpha' and 'beta' columns converted to NumPy arrays.
alpha, beta = (np.asarray(df[chain].tolist()) for chain in ('alpha', 'beta'))

# Regression target: log2(value + 1) of the first column named `antigen`.
antigen = 'A0201_GILGFVFTL_Flu-MP_Influenza'
i = np.where(df.columns==antigen)[0][0]
sel = df.iloc[:, i]
Y = np.log2(np.asarray(sel.tolist()) + 1)

DTCRS.Load_Data(alpha_sequences=alpha, beta_sequences=beta, Y=Y, p=p)
DTCRS.K_Fold_CrossVal(split_by_sample=False, folds=5)
DTCRS.Representative_Sequences(
    top_seq=100,
    motif_seq=10,
    color_scheme='hydrophobicity',
)

# NOTE(review): `dir` shadows the builtin of the same name; left unchanged
# because code after this point may refer to it.
dir = 'Reg_Rep_Sequences'