def main_anti_anchor(global_args):
    '''
    Test whether a certain residue is detrimental to binding.
    A set of random peptides is modified in two different ways:
    (1) Replacing the original residue at the position in question with the
        anti-anchor residue.
    (2) Replacing the original residue at the same position with a random
        residue.
    Check whether the first type of modification leads to lower predicted
    scores.
    Args:
        1. global arguments
    Return values:
        1. Writes the results to a file:
           (1) average/median score after the two kinds of modification.
           (2) p-value of the Kruskal-Wallis test.
    '''
    [blosum_matrix, aa, main_dir, output_path] = global_args
    allele = "B*35:01"
    anti_anchor_aa = "K"  #The amino acid type to be tested
    anti_anchor_pos = 7  #The position of the anti-anchor
    foutput(
        "anti-anchor: " + allele + " " + anti_anchor_aa + " " +
        str(anti_anchor_pos), output_path)
    path_train = main_dir + "binding_data/binding_data.txt"
    path_seq = main_dir + "HLA_A_B.txt"
    seq_dict = allele_seq(path_seq)
    pseq_dict = pseudo_seq(seq_dict, global_args)

    #Load the models trained previously
    models = []
    for i in range(25):
        json_f = open(main_dir + "models/model_" + str(i) + ".json", 'r')
        loaded_model_json = json_f.read()
        json_f.close()
        loaded_model = model_from_json(loaded_model_json)
        loaded_model.load_weights(main_dir + "models/model_" + str(i) + ".h5")
        models.append(loaded_model)

    #Randomly sample peptides from the proteome
    proteome_path = main_dir + "Homo_sapiens.GRCh38.pep.all.fa"
    proteome = read_proteome(proteome_path)
    peptides = protein_scanning(proteome, global_args)

    #Modify the peptides in two ways
    peptides_1 = peptides
    peptides_2 = copy.deepcopy(peptides)
    #Replace the original residue with the anti-anchor residue
    for pep in peptides_1:
        pep[anti_anchor_pos - 1] = blosum_matrix[aa[anti_anchor_aa]]
    #Replace the original residue with a random residue
    for pep in peptides_2:
        pep[anti_anchor_pos - 1] = blosum_matrix[random.randint(0, 19)]
    scores_1 = scoring(models, [np.array(peptides_1), \
        np.array([pseq_dict[allele] for i in range(len(peptides_1))])])
    scores_2 = scoring(models, [np.array(peptides_2), \
        np.array([pseq_dict[allele] for i in range(len(peptides_2))])])
    foutput("anti-anchor: " + str(np.median(scores_1)) +
            " random replacement: " + str(np.median(scores_2)) + "\n" +
            str(ss.kruskal(scores_1, scores_2)), output_path)
    return
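#A minimal sketch of the behaviour the scoring() calls above assume; the
#actual scoring() is defined elsewhere in this repository and may differ.
#scoring_sketch is a hypothetical name: it averages the predictions of the
#ensemble of loaded Keras models over each (peptide, MHC) input pair.
def scoring_sketch(models, model_inputs):
    #Each model outputs one score per sample; the ensemble score is the
    #mean across all models.
    predictions = [np.squeeze(model.predict(model_inputs)) for model in models]
    return np.mean(predictions, axis=0)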
def main_cross_validation_without_attention(global_args):
    #Read sequence data and peptide-MHC binding data
    [blosum_matrix, aa, main_dir, output_path] = global_args
    path_seq = main_dir + "HLA_A_B.txt"
    seq_dict = allele_seq(path_seq)
    pseq_dict = pseudo_seq(seq_dict, global_args)
    path_train = main_dir + "binding_data/binding_data_train.txt"
    data_dict = read_binding_data(path_train, pseq_dict, global_args)
    data_dict = redundancy_removal(data_dict)
    path_val = main_dir + "binding_data/binding_data_val.txt"
    validation_data, validation_target = read_validation_data(
        path_val, pseq_dict, global_args)

    #Data partition for cross-validation
    n_splits = 5
    training_data, test_dicts = preparing_data(data_dict, n_splits, test_len=10)
    print "Finished data loading"
    print "shape of training data", np.shape(training_data)

    #Cross-validation
    performance_dicts = []
    for split in range(n_splits):
        performance_dict = cross_validation_training_without_attention(
            np.array(training_data[split]), test_dicts[split],
            validation_data, validation_target, global_args)
        performance_dicts.append(performance_dict)

    #Output the results
    for allele in sorted(performance_dicts[1].keys()):
        try:
            performances = [perf_dict[allele] for perf_dict in performance_dicts]
            mean_performance = [np.mean(metric) for metric in zip(*performances)]
            foutput(allele + "\t" + str(mean_performance[0]) + "\t" +
                    str(mean_performance[1]) + "\t" + str(mean_performance[2]),
                    output_path)
        except KeyError:
            pass
def main_binding_prediction(global_args):
    [blosum_matrix, aa, main_dir, output_path] = global_args
    path_train = main_dir + "binding_data/binding_data.txt"
    path_seq = main_dir + "HLA_A_B.txt"
    seq_dict = allele_seq(path_seq)
    pseq_dict = pseudo_seq(seq_dict, global_args)

    #Load the models trained previously
    models = []
    for i in range(25):
        json_f = open(main_dir + "models/model_" + str(i) + ".json", 'r')
        loaded_model_json = json_f.read()
        json_f.close()
        loaded_model = model_from_json(loaded_model_json)
        loaded_model.load_weights(main_dir + "models/model_" + str(i) + ".h5")
        models.append(loaded_model)

    #Read the MHC alleles and peptide sequences
    [peptides, alleles] = read_prediction_input(
        main_dir + "binding_prediction/prediction_input.txt")

    #Encode the peptides
    input_pep = []
    for pep in peptides:
        pep_blosum = []  #Encoded peptide sequence
        for residue_index in range(12):
            #Encode the peptide sequence in columns 1-12, with the N-terminus
            #aligned to the left end. If the peptide is shorter than 12
            #residues, the remaining positions on the right are zero-padded.
            if residue_index < len(pep):
                pep_blosum.append(blosum_matrix[aa[pep[residue_index]]])
            else:
                pep_blosum.append(np.zeros(20))
        for residue_index in range(12):
            #Encode the peptide sequence in columns 13-24, with the C-terminus
            #aligned to the right end. If the peptide is shorter than 12
            #residues, the remaining positions on the left are zero-padded.
            if 12 - residue_index > len(pep):
                pep_blosum.append(np.zeros(20))
            else:
                pep_blosum.append(
                    blosum_matrix[aa[pep[len(pep) - 12 + residue_index]]])
        input_pep.append(pep_blosum)

    #Encode the MHC alleles
    input_mhc = [pseq_dict[allele] for allele in alleles]

    #Make predictions
    scores = scoring(models, [np.array(input_pep), np.array(input_mhc)])

    #Output
    for i in range(len(scores)):
        foutput(peptides[i] + "\t" + alleles[i] + "\t" + str(scores[i]),
                output_path)
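#The dual-alignment encoding above recurs in several entry points of this
#file. Below is a minimal standalone sketch of the same scheme;
#encode_peptide is a hypothetical helper, not a function defined in this
#repository. For a 9-mer, rows 0-8 hold the N-aligned residues (rows 9-11
#are zero) and rows 15-23 hold the C-aligned residues (rows 12-14 are zero),
#yielding a 24 x 20 matrix per peptide.
def encode_peptide(pep, blosum_matrix, aa, max_len=12):
    n_aligned = [blosum_matrix[aa[pep[i]]] if i < len(pep) else np.zeros(20)
                 for i in range(max_len)]
    c_aligned = [np.zeros(20) if max_len - i > len(pep)
                 else blosum_matrix[aa[pep[len(pep) - max_len + i]]]
                 for i in range(max_len)]
    return np.array(n_aligned + c_aligned)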
def main_external_testing(global_args):
    [blosum_matrix, aa, main_dir, output_path] = global_args
    path_train = main_dir + "binding_data/binding_data.txt"
    path_seq = main_dir + "HLA_A_B.txt"
    seq_dict = allele_seq(path_seq)
    pseq_dict = pseudo_seq(seq_dict, global_args)

    #Load the models trained previously
    models = []
    for i in range(25):
        json_f = open(main_dir + "models/model_" + str(i) + ".json", 'r')
        loaded_model_json = json_f.read()
        json_f.close()
        loaded_model = model_from_json(loaded_model_json)
        loaded_model.load_weights(main_dir + "models/model_" + str(i) + ".h5")
        models.append(loaded_model)

    dataset_dates = [
        "20180511", "20170901", "20170323", "20161209", "20160503",
        "20160219", "20150807", "20150731", "20150717", "20150619",
        "20150515"
    ]
    foutput("dataset_date\tdataset\tallele\tlen\tSRCC\tAUC", output_path)
    for dataset_date in dataset_dates:
        #Read the external dataset
        path_external = main_dir + "IEDB_benchmarking_datasets/" + dataset_date + ".txt"
        external_dict = read_external_test(path_external, pseq_dict, global_args)
        #Test the model on these datasets
        for dataset in external_dict.keys():
            for allele in external_dict[dataset].keys():
                #Peptides of different lengths are sorted into different datasets
                for len_pep in external_dict[dataset][allele].keys():
                    external_dataset = external_dict[dataset][allele][len_pep]
                    #Exclude datasets with fewer than five samples
                    if len(external_dataset[0]) < 5:
                        continue
                    #Use the model to predict scores for the peptides
                    val_scores = scoring(models, [
                        np.array(external_dataset[1]),
                        np.array(external_dataset[2])
                    ])
                    print np.shape(val_scores), np.shape(external_dataset[3])
                    #Use the prediction scores and experimental measurements
                    #to calculate AUROC and SRCC. Peptides with IC50 < 500 nM
                    #(normalized affinity above 1 - log(500)/log(50000)) are
                    #labeled as binders.
                    test_label = [
                        1 if aff > 1 - log(500) / log(50000) else 0
                        for aff in external_dataset[3]
                    ]
                    fpr, tpr, thresholds = roc_curve(test_label, val_scores)
                    roc_auc = auc(fpr, tpr)
                    foutput(dataset_date + "\t" + dataset + "\t" + allele +
                            "\t" + str(len_pep) + "\t" +
                            str(ss.spearmanr(val_scores, external_dataset[3])[0]) +
                            "\t" + str(roc_auc), output_path)
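#The binder cutoff above follows the standard NetMHCpan-style affinity
#transform, in which measured IC50 values (nM) are mapped to [0, 1] as
#1 - log(IC50)/log(50000). A minimal sketch, assuming `log` is math.log as
#in the surrounding code; ic50_to_affinity is a hypothetical helper.
def ic50_to_affinity(ic50_nM):
    return 1.0 - log(ic50_nM) / log(50000)
#Example: ic50_to_affinity(500) is roughly 0.426, the classification
#threshold used in main_external_testing above.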
def main_MHC_clustering(global_args):
    #Read MHC sequence data and MHC-peptide binding data
    [blosum_matrix, aa, main_dir, output_path] = global_args
    path_train = main_dir + "binding_data/binding_data.txt"
    path_seq = main_dir + "HLA_A_B.txt"
    seq_dict = allele_seq(path_seq)
    pseq_dict = pseudo_seq(seq_dict, global_args)
    data_dict = read_binding_data(path_train, pseq_dict, global_args)
    path_external = main_dir + "binding_data/external_training_set.txt"
    #This is the dataset used in NetMHCpan4, published in "MHC class I
    #associated peptides derive from selective regions of the human genome"
    external_dict = read_external_train(path_external, pseq_dict, global_args)

    #Remove the redundant data between the training sets
    for allele in sorted(data_dict.keys()):
        if allele in external_dict.keys():
            print allele
            data_dict[allele] = data_dict[allele] + external_dict[allele]
            unique_seq = []
            unique_data = []
            for dt in data_dict[allele]:
                if dt[4] not in unique_seq:
                    unique_data.append(dt)
                    unique_seq.append(dt[4])
            print "unique", len(unique_data)
            data_dict[allele] = unique_data

    #Count the number of samples for each allele
    dataset_size = [len(data_dict[allele]) for allele in sorted(data_dict.keys())]
    alleles = sorted(data_dict.keys())
    n_alleles = len(alleles)
    similarity_matrix = np.zeros((n_alleles, n_alleles))
    for i in range(n_alleles):
        for j in range(n_alleles):
            similarity_matrix[i][j] = distance.levenshtein(
                pseq_dict[alleles[i]], pseq_dict[alleles[j]])

    #Measure how much information one allele can obtain from the data of
    #other alleles. We assume the information obtained is proportional to
    #the size of the dataset and inversely proportional to the square of the
    #edit distance between the MHC pseudo-sequences of the two alleles. To
    #avoid division by zero, we add a smoothing term of 1 to the distance.
    reference_info = []
    for i in range(n_alleles):
        sum_info = 0
        for j in range(n_alleles):
            if i != j:  #Exclude the allele in question itself
                sum_info += dataset_size[j] * pow(similarity_matrix[i][j] + 1, -2)
        reference_info.append(sum_info)
    for i in range(n_alleles):
        foutput(alleles[i] + "\t" + str(reference_info[i]), output_path)
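#A worked example of the reference-information score above, with
#hypothetical numbers: suppose allele i has two neighbours, one with 5000
#samples at edit distance 2 and one with 1000 samples at edit distance 10.
#    sum_info = 5000 * (2 + 1)**-2 + 1000 * (10 + 1)**-2
#             = 5000 / 9.0 + 1000 / 121.0
#             ~= 555.6 + 8.3 ~= 563.8
#so nearby, data-rich alleles dominate the score, as intended.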
def main_model_training(global_args):
    [blosum_matrix, aa, main_dir, output_path] = global_args
    path_train = main_dir + "binding_data/binding_data.txt"
    path_seq = main_dir + "HLA_A_B.txt"
    seq_dict = allele_seq(path_seq)
    pseq_dict = pseudo_seq(seq_dict, global_args)
    data_dict = read_binding_data(path_train, pseq_dict, global_args)
    n_splits = 5
    path_external = main_dir + "binding_data/external_training_set.txt"
    #This is the dataset used in NetMHCpan4, published in "MHC class I
    #associated peptides derive from selective regions of the human genome"
    external_dict = read_external_train(path_external, pseq_dict, global_args)
    path_val = main_dir + "binding_data/binding_data_val.txt"
    validation_data, validation_target = read_validation_data(
        path_val, pseq_dict, global_args)

    #Remove the redundant data between the training sets
    for allele in sorted(data_dict.keys()):
        if allele in external_dict.keys():
            print allele
            data_dict[allele] = data_dict[allele] + external_dict[allele]
            unique_seq = []
            unique_data = []
            for dt in data_dict[allele]:
                if dt[4] not in unique_seq:
                    unique_data.append(dt)
                    unique_seq.append(dt[4])
            print "unique", len(unique_data)
            data_dict[allele] = unique_data

    #Train the models: 5 rounds of 5-fold partitioning with one estimator
    #per partition, giving the 25 models loaded by the other entry points
    models = []
    for cross_validation in range(5):
        training_data, test_dicts = preparing_data(data_dict, n_splits)
        for partition in range(5):
            training_data_partition = training_data[partition]
            models.extend(model_training(
                np.array(training_data_partition), np.array(validation_data),
                np.array(validation_target), global_args, n_estimators=1))

    #Save the models and weights to file
    for i in range(len(models)):
        model = models[i]
        model_json = model.to_json()
        with open(main_dir + "models/model_" + str(i) + ".json", "w") as json_file:
            json_file.write(model_json)
        model.save_weights(main_dir + "models/model_" + str(i) + ".h5")
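#The sequence-level deduplication above tests membership in a list, which
#is quadratic in the dataset size. Below is a minimal linear-time sketch of
#the same logic using a set; dedup_by_sequence is a hypothetical helper, not
#part of this repository, and assumes the peptide string sits at index 4 of
#each record, as in the loops above.
def dedup_by_sequence(records, seq_index=4):
    seen = set()
    unique_records = []
    for record in records:
        if record[seq_index] not in seen:
            unique_records.append(record)
            seen.add(record[seq_index])
    return unique_records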
def main_pearson_benchmark_redundancy(global_args):
    [blosum_matrix, aa, main_dir, output_path] = global_args
    path_train = main_dir + "binding_data/binding_data.txt"
    path_seq = main_dir + "HLA_A_B.txt"
    seq_dict = allele_seq(path_seq)
    pseq_dict = pseudo_seq(seq_dict, global_args)
    path_external = main_dir + "binding_data/external_training_set.txt"
    #This is the dataset used in NetMHCpan4
    Pearson_dict = read_external_train(path_external, pseq_dict, global_args)
    for allele in Pearson_dict.keys():
        #Extract the sequences in the form of strings
        Pearson_dict[allele] = [dt[4] for dt in Pearson_dict[allele]]
    dataset_dates = [
        "20170901", "20170323", "20161209", "20160503", "20160219",
        "20150807", "20150731", "20150717", "20150619", "20150515"
    ]
    print("dataset_date \t dataset \t allele \t")
    for dataset_date in dataset_dates:
        #Read the external dataset
        path_external = main_dir + "IEDB_benchmarking_datasets/" + dataset_date + ".txt"
        external_dict = read_external_test(path_external, pseq_dict, global_args)
        #Count the overlap between the benchmark datasets and the training set
        for dataset in external_dict.keys():
            for allele in external_dict[dataset].keys():
                print(allele)
                for len_pep in external_dict[dataset][allele].keys():
                    all_pep = 0
                    overlap_pep = 0
                    for pep in external_dict[dataset][allele][len_pep][0]:
                        all_pep += 1
                        if allele in Pearson_dict.keys() and pep in Pearson_dict[allele]:
                            overlap_pep += 1
                    print(all_pep, overlap_pep)
def main_motif(global_args, mode):
    #Read the alleles and their pseudo-sequences
    [blosum_matrix, aa, main_dir, output_path] = global_args
    path_train = main_dir + "binding_data/binding_data.txt"
    path_seq = main_dir + "HLA_A_B.txt"
    seq_dict = allele_seq(path_seq)
    pseq_dict = pseudo_seq(seq_dict, global_args)

    #Load the models trained previously
    models = []
    for i in range(25):
        json_f = open(main_dir + "models/model_" + str(i) + ".json", 'r')
        loaded_model_json = json_f.read()
        json_f.close()
        loaded_model = model_from_json(loaded_model_json)
        loaded_model.load_weights(main_dir + "models/model_" + str(i) + ".h5")
        models.append(loaded_model)

    #Randomly sample peptides from the proteome
    proteome_path = main_dir + "Homo_sapiens.GRCh38.pep.all.fa"
    proteome = read_proteome(proteome_path)
    peptides = protein_scanning(proteome, global_args)

    #Output the motifs as a dictionary
    foutput("heatmap_dict = {\n", output_path)
    alleles = allele_list(path_train)
    for allele in alleles:
        #First select the 1000 peptides with the highest (or lowest,
        #depending on the chosen mode) predicted binding affinities.
        if allele not in pseq_dict.keys():
            continue
        pseqs = [pseq_dict[allele] for i in range(len(peptides))]
        #Predict the binding affinities of the peptides.
        scores = scoring(models, [np.array(peptides), np.array(pseqs)])
        #Select the ones with the highest/lowest affinities.
        upper_threshold = sorted(scores)[-1000]
        lower_threshold = sorted(scores)[1000]
        positives = [i for i in range(len(scores)) if scores[i] >= upper_threshold]
        negatives = [i for i in range(len(scores)) if scores[i] <= lower_threshold]
        if mode == 'binder':
            selected_peptides = [peptides[j] for j in positives]
        else:
            selected_peptides = [peptides[j] for j in negatives]
        #Pseudo-sequences of the corresponding peptides.
        selected_pseqs = [pseq_dict[allele] for i in range(len(selected_peptides))]

        #Use the models to assign attention scores to the residues in each
        #peptide. We have an ensemble of models; the attention scores given
        #by the different models are averaged to get the final score.
        attentions_of_models = []
        model_inputs = [np.array(selected_peptides), np.array(selected_pseqs)]
        for model in models:
            attentions = get_attentions(model, model_inputs,
                                        print_shape_only=False, layer_name=None)
            attentions_of_models.append(attentions)
        heatmap = np.zeros((20, 9))
        for i, pep in enumerate(selected_peptides):
            #Take the mean of the attention given by the different models
            pep_attention = [np.mean(residue_attentions) for residue_attentions
                             in list(zip(*[attentions_of_models[model_index][i]
                                           for model_index in range(len(models))]))]
            #In the encoded matrix, each residue corresponds to 2 rows
            #(N-aligned and C-aligned). The attention scores of these 2
            #positions are summed to get the attention of the residue.
            pep_attention = [(pep_attention[k] + pep_attention[k + 12 + 3])
                             for k in range(9)]
            #Add the newly assigned attention scores to the matrix.
            for position in range(9):
                for residue in range(20):
                    if blosum_matrix[residue] == list(pep[position]):
                        heatmap[residue][position] += pep_attention[position]
        #Output the result
        out = "\"" + allele + "\":["
        for i in range(20):
            out += str(list(heatmap[i])) + ","
        out += "],\n"
        foutput(out, output_path)
    foutput("}", output_path)
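#Index arithmetic behind `k + 12 + 3` above, for the 9-mer peptides used
#here: rows 0-11 of the encoding are N-aligned (a 9-mer occupies rows 0-8)
#and rows 12-23 are C-aligned (a 9-mer occupies rows 12 + (12 - 9) = 15
#through 23). Residue k therefore sits at row k and at row k + 15, i.e.
#k + 12 + 3. A hypothetical helper for the general case:
def c_aligned_row(k, pep_len, max_len=12):
    #Row of the C-aligned copy of residue k for a peptide of length pep_len
    return max_len + (max_len - pep_len) + k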
def main_test_attention(global_args):
    #Read the alleles and their pseudo-sequences
    [blosum_matrix, aa, main_dir, output_path] = global_args
    path_train = main_dir + "binding_data/binding_data.txt"
    path_seq = main_dir + "HLA_A_B.txt"
    seq_dict = allele_seq(path_seq)
    pseq_dict = pseudo_seq(seq_dict, global_args)

    #Load the models trained previously
    models = []
    for i in range(25):
        json_f = open(main_dir + "models/model_" + str(i) + ".json", 'r')
        loaded_model_json = json_f.read()
        json_f.close()
        loaded_model = model_from_json(loaded_model_json)
        loaded_model.load_weights(main_dir + "models/model_" + str(i) + ".h5")
        models.append(loaded_model)

    #Randomly sample peptides from the proteome
    proteome_path = main_dir + "Homo_sapiens.GRCh38.pep.all.fa"
    proteome = read_proteome(proteome_path)
    peptides = protein_scanning(proteome, global_args)

    alleles = allele_list(path_train)
    D1 = []
    D2 = []
    for allele in alleles:
        #First select the 1000 peptides with the highest predicted binding
        #affinities.
        if allele not in pseq_dict.keys():
            continue
        pseqs = [pseq_dict[allele] for i in range(len(peptides))]
        #Predict the binding affinities of the peptides.
        scores = scoring(models, [np.array(peptides), np.array(pseqs)])
        #Select the ones with the highest affinities.
        upper_threshold = sorted(scores)[-1000]
        positives = [i for i in range(len(scores)) if scores[i] > upper_threshold]
        selected_peptides = [peptides[j] for j in positives]
        #Record the predicted affinities before masking
        Ao = [scores[k] for k in positives]
        #Pseudo-sequences of the corresponding peptides.
        positive_pseqs = [pseqs[j] for j in positives]
        #Copy the peptides so that certain positions can be masked before
        #making predictions again later
        pep_mask_highest = copy.deepcopy(selected_peptides)
        pep_mask_lowest = copy.deepcopy(selected_peptides)

        #Use the models to assign attention scores to the residues in each
        #peptide. We have an ensemble of models; the attention scores given
        #by the different models are averaged to get the final score.
        attentions_of_models = []
        model_inputs = [np.array(selected_peptides), np.array(positive_pseqs)]
        for model in models:
            attentions = get_attentions(model, model_inputs,
                                        print_shape_only=False, layer_name=None)
            attentions_of_models.append(attentions)
        for i, pep in enumerate(selected_peptides):
            #Take the mean of the attention given by the different models
            pep_attention = [np.mean(residue_attentions) for residue_attentions
                             in list(zip(*[attentions_of_models[model_index][i]
                                           for model_index in range(len(models))]))]
            #In the encoded matrix, each residue corresponds to 2 rows
            #(N-aligned and C-aligned). The attention scores of these 2
            #positions are summed to get the attention of the residue.
            pep_attention = [(pep_attention[k] + pep_attention[k + 12 + 3])
                             for k in range(9)]
            #Use zeros to mask the position with the highest attention value
            pep_mask_highest[i][np.argmax(pep_attention)] = np.zeros(20)
            pep_mask_highest[i][np.argmax(pep_attention) + 12 + 3] = np.zeros(20)
            #Use zeros to mask the position with the lowest attention value
            pep_mask_lowest[i][np.argmin(pep_attention)] = np.zeros(20)
            pep_mask_lowest[i][np.argmin(pep_attention) + 12 + 3] = np.zeros(20)
        #Score the masked peptides; the pseudo-sequences must match the
        #selected peptides, hence positive_pseqs rather than the full pseqs
        Amh = scoring(models, [np.array(pep_mask_highest), np.array(positive_pseqs)])
        Aml = scoring(models, [np.array(pep_mask_lowest), np.array(positive_pseqs)])
        new_D1 = abs(np.subtract(Amh, Ao))
        new_D2 = abs(np.subtract(Aml, Ao))
        D1.extend(new_D1)
        D2.extend(new_D2)
        foutput(allele + " " + str(np.median(new_D1)) + " " + str(np.median(new_D2))
                + " " + str(ss.kruskal(new_D1, new_D2)), output_path)
    foutput("median D1 is " + str(np.median(D1)), output_path)
    foutput("median D2 is " + str(np.median(D2)), output_path)
    foutput("Kruskal-Wallis test " + str(ss.kruskal(D1, D2)), output_path)
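#Interpretation of the masking experiment above: D1 collects the prediction
#shifts caused by masking the highest-attention residue of each peptide, and
#D2 the shifts from masking the lowest-attention residue. If the attention
#scores are informative, masking high-attention positions should perturb the
#predictions more, i.e. median(D1) > median(D2), with the Kruskal-Wallis
#test assessing whether the two distributions of shifts differ.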
def main_leave_one_out(global_args):
    [blosum_matrix, aa, main_dir, output_path_1, output_path_2] = global_args
    global_args = [blosum_matrix, aa, main_dir, output_path_2]
    path_train = main_dir + "binding_data/binding_data.txt"
    path_seq = main_dir + "HLA_A_B.txt"
    seq_dict = allele_seq(path_seq)
    pseq_dict = pseudo_seq(seq_dict, global_args)
    data_dict = read_binding_data(path_train, pseq_dict, global_args)
    n_splits = 5
    path_external = main_dir + "binding_data/external_training_set.txt"
    #This is the dataset used in NetMHCpan4, published in "MHC class I
    #associated peptides derive from selective regions of the human genome"
    external_dict = read_external_train(path_external, pseq_dict, global_args)
    path_val = main_dir + "binding_data/binding_data_val.txt"
    validation_data, validation_target = read_validation_data(
        path_val, pseq_dict, global_args)
    alleles = ["A*02:01"]

    #Remove the redundant data between the training sets
    for allele in alleles:
        if allele in external_dict.keys():
            print allele
            data_dict[allele] = data_dict[allele] + external_dict[allele]
            unique_seq = []
            unique_data = []
            for dt in data_dict[allele]:
                if dt[4] not in unique_seq:
                    unique_data.append(dt)
                    unique_seq.append(dt[4])
            print "unique", len(unique_data)
            data_dict[allele] = unique_data
    for allele in sorted(data_dict.keys()):
        if len(data_dict[allele]) > 100:
            print allele, len(data_dict[allele])

    #Leave-one-out training
    foutput("allele\tPCC\tAUROC\tACC", output_path_1)
    for left_out_allele in ["B*48:01"]:
        foutput(left_out_allele, output_path_1)
        #Training data and test data
        temp_data_dict = copy.deepcopy(data_dict)
        [test_pep, test_mhc, test_target] = [
            [i[j] for i in data_dict[left_out_allele]] for j in range(3)
        ]
        print(np.shape(test_pep), np.shape(test_mhc), np.shape(test_target))
        #Remove the data of the left-out allele
        temp_data_dict.pop(left_out_allele, None)
        #Model training
        models = []
        for cross_validation in range(1):
            training_data, test_dicts = preparing_data(temp_data_dict, n_splits)
            for partition in range(5):
                training_data_partition = training_data[partition]
                models.extend(model_training(
                    np.array(training_data_partition), np.array(validation_data),
                    np.array(validation_target), global_args, n_estimators=5))
        #Test the performance of the models
        pcc, roc_auc, max_acc = model_eval(
            models, [np.array(test_pep), np.array(test_mhc)],
            np.array(test_target))
        foutput(left_out_allele + "\t" + str(pcc) + "\t" + str(roc_auc) +
                "\t" + str(max_acc), output_path_1)