Example #1
def main_anti_anchor(global_args):
    '''
    Test whether a given residue is detrimental to binding.
    A set of random peptides is modified in two different ways:
        (1) Replacing the original residue at the position in question with the anti-anchor residue.
        (2) Replacing the original residue at the same position with a random residue.
    Check whether the first type of modification leads to lower predicted scores.
    Args:
        1. Global arguments
    Return values:
        1. Write the results to a file:
            (1) Average/median score after the two kinds of modification.
            (2) p-value of the Kruskal-Wallis test.
    '''
    [blosum_matrix, aa, main_dir, output_path] = global_args
    allele = "B*35:01"
    anti_anchor_aa = "K"  #The amino acid type to be tested
    anti_anchor_pos = 7  #The position of the anti-anchor
    foutput(
        "anti-anchor: " + allele + " " + anti_anchor_aa + " " +
        str(anti_anchor_pos), output_path)
    path_train = main_dir + "binding_data/binding_data.txt"
    path_seq = main_dir + "HLA_A_B.txt"
    seq_dict = allele_seq(path_seq)
    pseq_dict = pseudo_seq(seq_dict, global_args)

    #Load the models trained previously
    models = []
    for i in range(25):
        json_f = open(main_dir + "models/model_" + str(i) + ".json", 'r')
        loaded_model_json = json_f.read()
        json_f.close()
        loaded_model = model_from_json(loaded_model_json)
        loaded_model.load_weights(
            (main_dir + "models/model_" + str(i) + ".h5"))
        models.append(loaded_model)

    #Randomly sample peptides from the proteome
    proteome_path = main_dir + "Homo_sapiens.GRCh38.pep.all.fa"
    proteome = read_proteome(proteome_path)
    peptides = protein_scanning(proteome, global_args)

    #Modify the peptides in two ways
    peptides_1 = peptides
    peptides_2 = copy.deepcopy(peptides)
    #Replacing the original residue with the anti-anchor residue
    for pep in peptides_1:
        pep[anti_anchor_pos - 1] = blosum_matrix[aa[anti_anchor_aa]]
    #Replacing the original residue with a random residue
    for pep in peptides_2:
        pep[anti_anchor_pos - 1] = blosum_matrix[random.randint(0, 19)]

    scores_1 = scoring(models, [np.array(peptides_1), \
                                np.array([pseq_dict[allele] for i in range(len(peptides_1))])])
    scores_2 = scoring(models, [np.array(peptides_2), \
                                np.array([pseq_dict[allele] for i in range(len(peptides_2))])])

    foutput("anti-anchor: " + str(np.median(scores_1)) + " random replacement: " + str(np.median(scores_2)) +"\n"\
            + str(ss.kruskal(scores_1, scores_2)), output_path)
    return
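Note: scoring() and foutput() are helper functions from the ACME codebase that are not reproduced in these examples. The sketch below is only an assumed reading of their behaviour (an ensemble-averaged prediction and a simple append-to-file logger), not the project's actual implementation.

import numpy as np

def scoring(models, model_input):
    #model_input is [encoded_peptides, encoded_MHC_pseudo_sequences]; average the ensemble.
    predictions = [model.predict(model_input).flatten() for model in models]
    return np.mean(predictions, axis=0)

def foutput(line, output_path):
    #Append one line of results to the output file.
    with open(output_path, "a") as f:
        f.write(line + "\n")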
Example #2
def main_cross_validation_without_attention(global_args):
    #Reading sequence data and peptide-MHC binding data
    [blosum_matrix, aa, main_dir, output_path] = global_args
    path_seq = main_dir+ "HLA_A_B.txt"
    seq_dict = allele_seq(path_seq)
    pseq_dict = pseudo_seq(seq_dict, global_args)
    path_train = main_dir+ "binding_data/binding_data_train.txt"
    data_dict = read_binding_data(path_train,pseq_dict,global_args)    
    data_dict = redundancy_removal(data_dict)
    path_val = main_dir+ "binding_data/binding_data_val.txt"
    validation_data, validation_target = read_validation_data(path_val,pseq_dict,global_args)
    
    #Data partition for cross-validation
    n_splits = 5
    training_data, test_dicts = preparing_data(data_dict, n_splits, test_len = 10)  
    print "Finished data loading"
    print "shape of training data", np.shape(training_data)       

    #Cross-validation
    performance_dicts = []
    for split in range(n_splits):
        performance_dict = cross_validation_training_without_attention(np.array(training_data[split]), test_dicts[split], 
                                          validation_data, validation_target, global_args)
        performance_dicts.append(performance_dict)
        
    #Output the results
    for allele in sorted(performance_dicts[1].keys()):
        try:
            performances = [perf_dict[allele] for perf_dict in performance_dicts]
            mean_performance = [np.mean(metric) for metric in zip(*performances)]
            foutput(allele+"\t"+str(mean_performance[0])+"\t"+str(mean_performance[1])+"\t"+str(mean_performance[2]), output_path)
        except KeyError:
            pass
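Note: allele_seq() and pseudo_seq() also come from the ACME codebase and are not shown here. The sketch below is only a rough guess at their behaviour: a whitespace-separated allele/sequence file, and a BLOSUM encoding of a fixed set of peptide-contacting positions. Both the assumed file format and the contact-position list are illustrative, not the project's actual definitions.

def allele_seq(path_seq):
    #Assumed format: one "allele sequence" pair per line.
    seq_dict = {}
    with open(path_seq) as f:
        for line in f:
            fields = line.split()
            if len(fields) >= 2:
                seq_dict[fields[0]] = fields[1]
    return seq_dict

def pseudo_seq(seq_dict, global_args):
    [blosum_matrix, aa, main_dir, output_path] = global_args
    contact_positions = [6, 8, 23, 44, 58, 61, 62, 65, 68]  #hypothetical 0-based positions
    pseq_dict = {}
    for allele, seq in seq_dict.items():
        #Encode each contact residue with its BLOSUM row, mirroring the peptide encoding.
        pseq_dict[allele] = [blosum_matrix[aa[seq[pos]]] for pos in contact_positions]
    return pseq_dict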
Example #3
def main_binding_prediction(global_args):
    [blosum_matrix, aa, main_dir, output_path] = global_args
    path_train = main_dir + "binding_data/binding_data.txt"
    path_seq = main_dir + "HLA_A_B.txt"
    seq_dict = allele_seq(path_seq)
    pseq_dict = pseudo_seq(seq_dict, global_args)

    #Load the models trained previously
    models = []
    for i in range(25):
        json_f = open(main_dir + "models/model_" + str(i) + ".json", 'r')
        loaded_model_json = json_f.read()
        json_f.close()
        loaded_model = model_from_json(loaded_model_json)
        loaded_model.load_weights(
            (main_dir + "models/model_" + str(i) + ".h5"))
        models.append(loaded_model)

    #Read the MHC alleles and peptide sequences
    [peptides, alleles
     ] = read_prediction_input(main_dir +
                               "binding_prediction/prediction_input.txt")

    #Encode the peptides
    input_pep = []
    for pep in peptides:
        pep_blosum = []  #Encoded peptide sequence
        for residue_index in range(12):
            #Encode the peptide sequence in columns 1-12, with the N-terminus aligned to the left end
            #If the peptide is shorter than 12 residues, the remaining positions on
            #the right are filled with zero padding
            if residue_index < len(pep):
                pep_blosum.append(blosum_matrix[aa[pep[residue_index]]])
            else:
                pep_blosum.append(np.zeros(20))
        for residue_index in range(12):
            #Encode the peptide sequence in columns 13-24, with the C-terminus aligned to the right end
            #If the peptide is shorter than 12 residues, the remaining positions on
            #the left are filled with zero padding
            if 12 - residue_index > len(pep):
                pep_blosum.append(np.zeros(20))
            else:
                pep_blosum.append(blosum_matrix[aa[pep[len(pep) - 12 +
                                                       residue_index]]])
        input_pep.append(pep_blosum)

    #Encode the MHC alleles
    input_mhc = [pseq_dict[allele] for allele in alleles]

    #Making predictions
    scores = scoring(models, [np.array(input_pep), np.array(input_mhc)])

    #Output
    for i in range(len(scores)):
        foutput(peptides[i] + "\t" + alleles[i] + "\t" + str(scores[i]),
                output_path)
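Note: the encoding loop above can be read as a small standalone helper. The sketch below mirrors the same scheme (24 BLOSUM rows per peptide: 12 aligned to the N-terminus on the left, 12 aligned to the C-terminus on the right, zero-padded for peptides shorter than 12 residues); the function name is ours, not ACME's.

import numpy as np

def encode_peptide(pep, blosum_matrix, aa, max_len=12):
    rows = []
    #N-terminal half: left-aligned, zero-padded on the right.
    for i in range(max_len):
        rows.append(blosum_matrix[aa[pep[i]]] if i < len(pep) else np.zeros(20))
    #C-terminal half: right-aligned, zero-padded on the left.
    for i in range(max_len):
        if max_len - i > len(pep):
            rows.append(np.zeros(20))
        else:
            rows.append(blosum_matrix[aa[pep[len(pep) - max_len + i]]])
    return rows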
Example #4
def main_external_testing(global_args):
    [blosum_matrix, aa, main_dir, output_path] = global_args
    path_train = main_dir + "binding_data/binding_data.txt"
    path_seq = main_dir + "HLA_A_B.txt"
    seq_dict = allele_seq(path_seq)
    pseq_dict = pseudo_seq(seq_dict, global_args)

    #Load the models trained previously
    models = []
    for i in range(25):
        json_f = open(main_dir + "models/model_" + str(i) + ".json", 'r')
        loaded_model_json = json_f.read()
        json_f.close()
        loaded_model = model_from_json(loaded_model_json)
        loaded_model.load_weights(
            (main_dir + "models/model_" + str(i) + ".h5"))
        models.append(loaded_model)

    dataset_dates = [
        "20180511", "20170901", "20170323", "20161209", "20160503", "20160219",
        "20150807", "20150731", "20150717", "20150619", "20150515"
    ]

    foutput("dataset_date \t dataset \t allele \t len \t SRCC \t AUC\t",
            output_path)
    for dataset_date in dataset_dates:
        #Read the data of external dataset
        path_external = main_dir + "IEDB_benchmarking_datasets/" + dataset_date + ".txt"
        external_dict = read_external_test(path_external, pseq_dict,
                                           global_args)
        #Test the model on these datasets
        for dataset in external_dict.keys():
            for allele in external_dict[dataset].keys():
                #Peptides of different lengths are sorted into different datasets
                for len_pep in external_dict[dataset][allele].keys():
                    external_dataset = external_dict[dataset][allele][len_pep]
                    #Exclude the datasets with less than five samples
                    if len(external_dataset[0]) < 5:
                        continue
                    #Use the model to make prediction scores for the peptides
                    val_scores = scoring(models, [
                        np.array(external_dataset[1]),
                        np.array(external_dataset[2])
                    ])
                    print np.shape(val_scores), np.shape(external_dataset[3])
                    #Use the prediction scores and experimental measurements to calculate AUROC and SRCC
                    test_label = [
                        1 if aff > 1 - log(500) / log(50000) else 0
                        for aff in external_dataset[3]
                    ]
                    fpr, tpr, thresholds = roc_curve(test_label, val_scores)
                    roc_auc = auc(fpr, tpr)
                    foutput(dataset_date+"\t"+dataset+"\t"+allele+"\t"+str(len_pep)+"\t"\
                                   +str(ss.spearmanr(val_scores,external_dataset[3])[0])+"\t"+str(roc_auc),
                                   output_path)
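Note: the binder label above uses the common IC50-to-score transform, score = 1 - log(IC50) / log(50000), so an IC50 of 500 nM corresponds to the cutoff in the list comprehension. A small standalone version for reference (the function names are ours):

from math import log

def ic50_to_score(ic50_nM):
    return 1 - log(ic50_nM) / log(50000)

def score_to_ic50(score):
    return 50000 ** (1 - score)

#ic50_to_score(500) is approximately 0.426, the binder/non-binder cutoff used above.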
Example #5
def main_MHC_clustering(global_args):
    #Read MHC sequence data and MHC-pep binding data
    [blosum_matrix, aa, main_dir, output_path] = global_args
    path_train = main_dir + "binding_data/binding_data.txt"
    path_seq = main_dir + "HLA_A_B.txt"
    seq_dict = allele_seq(path_seq)
    pseq_dict = pseudo_seq(seq_dict, global_args)
    data_dict = read_binding_data(path_train, pseq_dict, global_args)
    path_external = main_dir + "binding_data/external_training_set.txt"  #This is the one used in NetMHCpan4
    #Published in "MHC class I associated peptides derive from selective regions of the human genome"
    external_dict = read_external_train(path_external, pseq_dict, global_args)

    #Remove the redundant data between the training sets
    for allele in sorted(data_dict.keys()):
        if allele in external_dict.keys():
            print allele
            data_dict[allele] = data_dict[allele] + external_dict[allele]
            unique_seq = []
            unique_data = []
            for dt in data_dict[allele]:
                if dt[4] not in unique_seq:
                    unique_data.append(dt)
                    unique_seq.append(dt[4])
            print "unique", len(unique_data)
            data_dict[allele] = unique_data

    #Count the number of samples for each allele
    dataset_size = [
        len(data_dict[allele]) for allele in sorted(data_dict.keys())
    ]
    alleles = sorted(data_dict.keys())
    n_alleles = len(alleles)
    similarity_matrix = np.zeros((n_alleles, n_alleles))
    for i in range(n_alleles):
        for j in range(n_alleles):
            similarity_matrix[i][j] = distance.levenshtein(pseq_dict[alleles[i]],\
                             pseq_dict[alleles[j]])

    #Measure how much information one allele can obtain from the data of the other alleles
    #We assume the information obtained is proportional to the size of the dataset and
    #inversely proportional to the edit distance between the MHC sequences of the two alleles
    #To avoid division by zero, we add a smoothing term
    reference_info = []
    for i in range(n_alleles):
        sum_info = 0
        for j in range(n_alleles):
            if i != j:  #Exclude the allele in question itself
                sum_info += dataset_size[j] * pow(
                    (similarity_matrix[i][j]) + 1, -2)
        reference_info.append(sum_info)
    for i in range(n_alleles):
        foutput(alleles[i] + "\t" + str(reference_info[i]), output_path)
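Note: as a quick sanity check on the weighting used above (info_i is the sum over the other alleles j of dataset_size[j] / (edit_distance(i, j) + 1)**2), here is a toy calculation with made-up numbers:

toy_dataset_sizes = [5000, 2000]
toy_edit_distances = [1, 6]
toy_info = sum(size * pow(dist + 1, -2)
               for size, dist in zip(toy_dataset_sizes, toy_edit_distances))
#toy_info = 5000/4 + 2000/49, roughly 1290.8: the close, well-sampled neighbour dominates.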
Example #6
def main_model_training(global_args):
    [blosum_matrix, aa, main_dir, output_path] = global_args
    path_train = main_dir + "binding_data/binding_data.txt"
    path_seq = main_dir + "HLA_A_B.txt"
    seq_dict = allele_seq(path_seq)
    pseq_dict = pseudo_seq(seq_dict, global_args)
    data_dict = read_binding_data(path_train, pseq_dict, global_args)
    n_splits = 5
    path_external = main_dir + "binding_data/external_training_set.txt"  #This is the one used in NetMHCpan4
    #Published in "MHC class I associated peptides derive from selective regions of the human genome"
    external_dict = read_external_train(path_external, pseq_dict, global_args)
    path_val = main_dir + "binding_data/binding_data_val.txt"
    validation_data, validation_target = read_validation_data(
        path_val, pseq_dict, global_args)

    #Remove the redundant data between the training sets
    for allele in sorted(data_dict.keys()):
        if allele in external_dict.keys():
            print allele
            data_dict[allele] = data_dict[allele] + external_dict[allele]
            unique_seq = []
            unique_data = []
            for dt in data_dict[allele]:
                if dt[4] not in unique_seq:
                    unique_data.append(dt)
                    unique_seq.append(dt[4])
            print "unique", len(unique_data)
            data_dict[allele] = unique_data

    #Train the models
    models = []
    for cross_validation in range(5):
        training_data, test_dicts = preparing_data(data_dict, n_splits)
        for partition in range(5):
            training_data_partition = training_data[partition]
            models.extend(model_training(np.array(training_data_partition), np.array(validation_data),\
                                         np.array(validation_target),global_args, n_estimators = 1))

    #Save model and weights to file
    for i in range(len(models)):
        model = models[i]
        model_json = model.to_json()
        with open(main_dir + "models/model_" + str(i) + ".json",
                  "w") as json_file:
            json_file.write(model_json)
        model.save_weights(main_dir + "models/model_" + str(i) + ".h5")
Example #7
def main_pearson_benchmark_redundancy(global_args):
    [blosum_matrix, aa, main_dir, output_path] = global_args
    path_train = main_dir + "binding_data/binding_data.txt"
    path_seq = main_dir + "HLA_A_B.txt"
    seq_dict = allele_seq(path_seq)
    pseq_dict = pseudo_seq(seq_dict, global_args)
    path_external = main_dir + "binding_data/external_training_set.txt"  #This is the one used in NetMHCpan4
    Pearson_dict = read_external_train(path_external, pseq_dict, global_args)
    for allele in Pearson_dict.keys():
        #Extract the sequences in the form of strings
        Pearson_dict[allele] = [dt[4] for dt in Pearson_dict[allele]]
    dataset_dates = [
        "20170901", "20170323", "20161209", "20160503", "20160219", "20150807",
        "20150731", "20150717", "20150619", "20150515"
    ]

    print("dataset_date \t dataset \t allele \t ")
    for dataset_date in dataset_dates:
        #Read the data of external dataset
        path_external = main_dir + "IEDB_benchmarking_datasets/" + dataset_date + ".txt"
        external_dict = read_external_test(path_external, pseq_dict,
                                           global_args)
        #Test the model on these datasets
        for dataset in external_dict.keys():
            for allele in external_dict[dataset].keys():
                print(allele)
                for len_pep in external_dict[dataset][allele].keys():
                    all_pep = 0
                    overlap_pep = 0
                    for pep in external_dict[dataset][allele][len_pep][0]:
                        #print(pep)
                        all_pep += 1
                        if allele in Pearson_dict.keys(
                        ) and pep in Pearson_dict[allele]:
                            overlap_pep += 1
                    print(all_pep, overlap_pep)
Example #8
File: main_motif.py  Project: wxyz/ACME
def main_motif(global_args, mode):
    #Read the alleles and their pseudo-sequences
    [blosum_matrix, aa, main_dir, output_path] = global_args
    path_train = main_dir + "binding_data/binding_data.txt"
    path_seq = main_dir + "HLA_A_B.txt"
    seq_dict = allele_seq(path_seq)
    pseq_dict = pseudo_seq(seq_dict, global_args)

    #Load the models trained previously
    models = []
    for i in range(25):
        json_f = open(main_dir + "models/model_" + str(i) + ".json", 'r')
        loaded_model_json = json_f.read()
        json_f.close()
        loaded_model = model_from_json(loaded_model_json)
        loaded_model.load_weights(
            (main_dir + "models/model_" + str(i) + ".h5"))
        models.append(loaded_model)

    #Randomly sample peptides from the proteome
    proteome_path = main_dir + "Homo_sapiens.GRCh38.pep.all.fa"
    proteome = read_proteome(proteome_path)
    peptides = protein_scanning(proteome, global_args)

    #Start to output the motifs
    #Output is a dictionary
    foutput("heatmap_dict = {\n", output_path)
    alleles = allele_list(path_train)
    for allele in alleles:
        #First select 1000 peptides with the highest (or lowest, depending on which mode you choose) binding affinities.
        if allele not in pseq_dict.keys():
            continue
        pseqs = [pseq_dict[allele] for i in range(len(peptides))]
        #Predict the binding affinities of the peptides.
        scores = scoring(models, [np.array(peptides), np.array(pseqs)])
        #Select the ones with the highest/lowest affinities.
        upper_threshold = sorted(scores)[-1000]
        lower_threshold = sorted(scores)[1000]
        positives = [
            i for i in range(len(scores)) if scores[i] >= upper_threshold
        ]
        negatives = [
            i for i in range(len(scores)) if scores[i] <= lower_threshold
        ]
        if mode == 'binder':
            selected_peptides = [peptides[j] for j in positives]
        else:
            selected_peptides = [peptides[j] for j in negatives]
        #Pseudo-sequences of the corresponding peptides.
        selected_pseqs = [
            pseq_dict[allele] for i in range(len(selected_peptides))
        ]
        #Use the model to assign attention scores to the residues in each peptide.
        attentions_of_models = []
        model_inputs = [np.array(selected_peptides), np.array(selected_pseqs)]
        #We have an ensemble of models. Use each of them to assign attention scores.
        #The attention scores given by the different models are averaged to get the final attention score.
        for model in models:
            attentions = get_attentions(model,
                                        model_inputs,
                                        print_shape_only=False,
                                        layer_name=None)
            attentions_of_models.append(attentions)
        heatmap = np.zeros((20, 9))
        for i, pep in enumerate(selected_peptides):
            #Take the mean of the attention values given by the different models
            pep_attention = [np.mean(residue_attentions) for residue_attentions \
                            in list(zip(*[attentions_of_models[model_index][i] for model_index in range(len(models))]))]
            #In the encoded matrix, each residue corresponds to two rows. The attention scores of these
            #two positions are summed to get the attention of this residue.
            pep_attention = [(pep_attention[k] + pep_attention[k + 12 + 3])
                             for k in range(9)]
            #Add the newly assigned attention score to the matrix.
            for position in range(9):
                for residue in range(20):
                    if blosum_matrix[residue] == list(pep[position]):
                        heatmap[residue][position] += pep_attention[position]
        #Output the result
        out = "\"" + allele + "\":["
        for i in range(20):
            out += str(list(heatmap[i])) + ","
        out += "],\n"
        foutput(out, output_path)
    foutput("}", output_path)
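Note: get_attentions() belongs to the ACME codebase and is not reproduced here. A common way to read out an intermediate (attention) layer in this generation of Keras is a backend function; the sketch below assumes the attention layer can be looked up by name, and the default layer name is only a placeholder.

import numpy as np
from keras import backend as K

def get_attentions(model, model_inputs, print_shape_only=False, layer_name=None):
    #Pick the attention layer by name; "attention_weights" is a hypothetical default.
    target_name = layer_name if layer_name is not None else "attention_weights"
    layer_output = [layer.output for layer in model.layers if layer.name == target_name][0]
    #Backend function mapping the model inputs to the attention layer output.
    attention_fn = K.function(model.inputs + [K.learning_phase()], [layer_output])
    attentions = attention_fn(model_inputs + [0])[0]  #0 = test phase (dropout off)
    if print_shape_only:
        print(np.shape(attentions))
    return attentions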
Example #9
def main_test_attention(global_args):
    #Read the alleles and their pseudo-sequences
    [blosum_matrix, aa, main_dir, output_path] = global_args
    path_train = main_dir + "binding_data/binding_data.txt"
    path_seq = main_dir + "HLA_A_B.txt"
    seq_dict = allele_seq(path_seq)
    pseq_dict = pseudo_seq(seq_dict, global_args)

    #Load the models trained previously
    models = []
    for i in range(25):
        json_f = open(main_dir + "models/model_" + str(i) + ".json", 'r')
        loaded_model_json = json_f.read()
        json_f.close()
        loaded_model = model_from_json(loaded_model_json)
        loaded_model.load_weights(
            (main_dir + "models/model_" + str(i) + ".h5"))
        models.append(loaded_model)

    #Randomly sample peptides from the proteome
    proteome_path = main_dir + "Homo_sapiens.GRCh38.pep.all.fa"
    proteome = read_proteome(proteome_path)
    peptides = protein_scanning(proteome, global_args)

    #Start to output the motifs
    #Output is a dictionary
    foutput("heatmap_dict = {\n", output_path)
    alleles = allele_list(path_train)
    D1 = []
    D2 = []
    for allele in alleles:
        #First select the 1000 peptides with the highest predicted binding affinities.
        if allele not in pseq_dict.keys():
            continue
        pseqs = [pseq_dict[allele] for i in range(len(peptides))]
        #Predict the binding affinities of the peptides.
        scores = scoring(models, [np.array(peptides), np.array(pseqs)])
        #Select the ones with the highest/lowest affinities.
        upper_threshold = sorted(scores)[-1000]
        positives = [
            i for i in range(len(scores)) if scores[i] > upper_threshold
        ]
        selected_peptides = [peptides[j] for j in positives]
        #Record the predicted affinities before masking
        Ao = [scores[k] for k in positives]
        #Pseudo-sequences of the corresponding peptides.
        positive_pseqs = [pseqs[j] for j in positives]
        #Copy the peptides so that we can mask certain positions and make predictions later
        pep_mask_highest = copy.deepcopy(selected_peptides)
        pep_mask_lowest = copy.deepcopy(selected_peptides)
        #Use the model to assign attention scores to the residues in each peptide.
        attentions_of_models = []
        model_inputs = [np.array(selected_peptides), np.array(positive_pseqs)]
        #We have an ensemble of models. Use each of them to assign attention scores.
        #The attention scores given by the different models are averaged to get the final attention score.
        for model in models:
            attentions = get_attentions(model,
                                        model_inputs,
                                        print_shape_only=False,
                                        layer_name=None)
            attentions_of_models.append(attentions)
        heatmap = np.zeros((20, 9))
        for i, pep in enumerate(selected_peptides):
            #Take the mean of the attention values given by the different models
            pep_attention = [np.mean(residue_attentions) for residue_attentions \
                            in list(zip(*[attentions_of_models[model_index][i] for model_index in range(len(models))]))]
            #In the encoded matrix, each residue corresponds to two rows. The attention scores of these
            #two positions are summed to get the attention of this residue.
            pep_attention = [(pep_attention[k] + pep_attention[k + 12 + 3])
                             for k in range(9)]
            #Using zeros to mask the positions with the highest attention value
            pep_mask_highest[i][np.argmax(pep_attention)] = np.zeros(20)
            pep_mask_highest[i][np.argmax(pep_attention) + 12 +
                                3] = np.zeros(20)
            #Using zeros to mask the positions with the lowest attention value
            pep_mask_lowest[i][np.argmin(pep_attention)] = np.zeros(20)
            pep_mask_lowest[i][np.argmin(pep_attention) + 12 +
                               3] = np.zeros(20)
        Amh = scoring(models, [np.array(pep_mask_highest), np.array(positive_pseqs)])
        Aml = scoring(models, [np.array(pep_mask_lowest), np.array(positive_pseqs)])
        new_D1 = abs(np.subtract(Amh, Ao))
        new_D2 = abs(np.subtract(Aml, Ao))
        D1.extend(new_D1)
        D2.extend(new_D2)
        foutput(allele + " " + str(np.median(new_D1)) + " " + str(np.median(new_D2))\
                + " " + str(ss.kruskal(new_D1, new_D2)), output_path)
    foutput("median D1 is " + str(np.median(D1)), output_path)
    foutput("median D2 is " + str(np.median(D2)), output_path)
    foutput("Kruskal-Wallis test " + str(ss.kruskal(D1, D2)), output_path)
Example #10
def main_leave_one_out(global_args):
    [blosum_matrix, aa, main_dir, output_path_1, output_path_2] = global_args
    global_args = [blosum_matrix, aa, main_dir, output_path_2]
    path_train = main_dir + "binding_data/binding_data.txt"
    path_seq = main_dir + "HLA_A_B.txt"
    seq_dict = allele_seq(path_seq)
    pseq_dict = pseudo_seq(seq_dict, global_args)
    data_dict = read_binding_data(path_train, pseq_dict, global_args)
    n_splits = 5
    path_external = main_dir + "binding_data/external_training_set.txt"  #This is the one used in NetMHCpan4
    #Published in "MHC class I associated peptides derive from selective regions of the human genome"
    external_dict = read_external_train(path_external, pseq_dict, global_args)
    path_val = main_dir + "binding_data/binding_data_val.txt"
    validation_data, validation_target = read_validation_data(
        path_val, pseq_dict, global_args)

    alleles = ["A*02:01"]

    #Remove the redundant data between the training sets
    for allele in alleles:
        if allele in external_dict.keys():
            print allele
            data_dict[allele] = data_dict[allele] + external_dict[allele]
            unique_seq = []
            unique_data = []
            for dt in data_dict[allele]:
                if dt[4] not in unique_seq:
                    unique_data.append(dt)
                    unique_seq.append(dt[4])
            print "unique", len(unique_data)
            data_dict[allele] = unique_data

    for allele in sorted(data_dict.keys()):
        if len(data_dict[allele]) > 100:
            print allele, len(data_dict[allele])

    #Leave-one-out training
    foutput("allele\tPCC\tAUROC\tACC", output_path_1)
    for left_out_allele in ["B*48:01"]:
        foutput(left_out_allele, output_path_1)

        #Training data and test data
        temp_data_dict = copy.deepcopy(data_dict)
        [test_pep, test_mhc,
         test_target] = [[i[j] for i in data_dict[left_out_allele]]
                         for j in range(3)]
        print(np.shape(test_pep), np.shape(test_mhc), np.shape(test_target))
        #Remove the data of the left out allele
        temp_data_dict.pop(left_out_allele, None)

        #Model training
        models = []
        for cross_validation in range(1):
            training_data, test_dicts = preparing_data(temp_data_dict,
                                                       n_splits)
            for partition in range(5):
                training_data_partition = training_data[partition]
                models.extend(model_training(np.array(training_data_partition), \
                        np.array(validation_data),np.array(validation_target),global_args, n_estimators = 5))
        #Test the performance of the models
        pcc, roc_auc, max_acc = model_eval(
            models,
            [np.array(test_pep), np.array(test_mhc)], np.array(test_target))
        foutput(
            left_out_allele + "\t" + str(pcc) + "\t" + str(roc_auc) + "\t" +
            str(max_acc), output_path_1)
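Note: model_eval() is defined elsewhere in ACME. A minimal sketch of the three metrics it appears to return, reusing the scoring() helper sketched earlier (the 500 nM cutoff mirrors the external-testing example; everything else is an assumption):

import numpy as np
from math import log
from scipy import stats as ss
from sklearn.metrics import roc_curve, auc

def model_eval(models, test_input, test_target):
    scores = scoring(models, test_input)  #ensemble-averaged predictions
    pcc = ss.pearsonr(scores, test_target)[0]
    #Binarise the measured affinities at the 500 nM cutoff, as in the external testing.
    labels = [1 if t > 1 - log(500) / log(50000) else 0 for t in test_target]
    fpr, tpr, thresholds = roc_curve(labels, scores)
    roc_auc = auc(fpr, tpr)
    #Best classification accuracy over the ROC thresholds.
    max_acc = max(np.mean([(1 if s >= th else 0) == l for s, l in zip(scores, labels)])
                  for th in thresholds)
    return pcc, roc_auc, max_acc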