def prediction_whole_seq(protein_seqs, alleles, result_format="filtered",
                         comparison_quantity="affinity", filter_value=500):
    """prediction_whole_seq wraps the Class1PresentationPredictor from MHCflurry 2.0 for customizability.

    Args:
        protein_seqs (Dict): dictionary that maps a name like "protein_x" to an amino acid sequence
        alleles (Dict): dictionary that maps a name like "sample_1" to its allele name(s)
        result_format (str, optional): how to report results; one of "filtered", "best", or "all"
        comparison_quantity (str, optional): quantity used for ranking/filtering, e.g. "affinity" or "presentation_score"
        filter_value (int, optional): threshold applied to comparison_quantity when result_format is "filtered"
    """
    predictor = Class1PresentationPredictor.load()
    prediction_df = predictor.predict_sequences(
        sequences=protein_seqs,
        alleles=alleles,
        result=result_format,
        comparison_quantity=comparison_quantity,
        filter_value=filter_value,
        verbose=1)
    return prediction_df
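# Minimal usage sketch for prediction_whole_seq. The protein sequence, sample
# name, and alleles below are made up for illustration; the printed columns
# follow MHCflurry's predict_sequences output.
# protein_seqs = {"protein_x": "MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS"}
# alleles = {"sample_1": ["HLA-A*02:01", "HLA-B*07:02"]}
# df = prediction_whole_seq(protein_seqs, alleles,
#                           result_format="filtered",
#                           comparison_quantity="affinity",
#                           filter_value=500)   # keep peptides predicted below 500 nM
# print(df[["sequence_name", "peptide", "affinity", "presentation_score"]].head())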
def obtain_allele_list(preselected=None):
    if preselected is None:
        predictor = Class1PresentationPredictor.load()
        # snippet for obtaining supported alleles directly
        return predictor.supported_alleles
    else:
        with open(preselected, 'r') as f:
            all_lines = f.readlines()
        all_lines = list(map(lambda x: x.rstrip(), all_lines))
        return all_lines
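# Usage sketch for obtain_allele_list. The file name below is hypothetical; a
# preselected file is assumed to contain one allele name per line.
# all_supported = obtain_allele_list()                        # every allele MHCflurry supports
# subset = obtain_allele_list(preselected="my_alleles.txt")   # restrict to a curated list
# print(len(all_supported), len(subset))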
def file_process(is_checked, upload="./uploaded/multiple_query.txt", download="./app/download/result.txt"):
    table_scaled = wrapper_read_scaling()   # [21,553]
    after_pca = pca_apply_reduction(table_scaled)   # [21,12]
    hla = pd.read_csv('hla2paratopeTable_aligned.txt', sep='\t')
    hla_dic = hla_df_to_dic(hla)
    inventory = list(hla_dic.keys())
    dic_inventory = dict_inventory(inventory)
    cnn_model = seperateCNN()
    cnn_model.load_weights('cnn_model_331_3_7/')

    ori_score = pd.read_csv(upload, sep=',', header=None)
    ori_score.columns = ['peptide', 'HLA']
    ori_score['immunogenicity'] = ['0'] * ori_score.shape[0]
    print('************************ 1 *************************')   # may need to remove
    dataset_score, hla_type = construct_aaindex(ori_score, hla_dic, after_pca, dic_inventory)
    input1_score = pull_peptide_aaindex(dataset_score)
    input2_score = pull_hla_aaindex(dataset_score)
    label_score = pull_label_aaindex(dataset_score)
    scoring = cnn_model.predict(x=[input1_score, input2_score])
    ori_score['immunogenicity'] = scoring

    if is_checked == 'True':
        # see the computing_m function for why we set up the alleles dict like this
        m = ori_score['HLA'].values.tolist()       # a list of HLA
        p = ori_score['peptide'].values.tolist()   # a list of peptides
        tmp_dic_for_alleles = {}
        for index, mhc_ in enumerate(m):
            tmp_dic_for_alleles['sample{}'.format(index)] = [mhc_]
        predictor = Class1PresentationPredictor.load()
        result = predictor.predict(peptides=p, alleles=tmp_dic_for_alleles, verbose=0)
        # mhcflurry returns the full peptide x sample cross-product, so for
        # sample i we keep only the row for peptide i (the diagonal)
        final = [None] * len(p)
        for sample, chunk in result.groupby(by='sample_name'):
            index = int(sample.replace('sample', ''))
            final[index] = chunk.iloc[index, :]['presentation_score']
        ori_score['binding(mhcflurry)'] = final
    ori_score.to_csv(download, sep='\t', index=None)
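# The upload file is read as a headerless, comma-separated table with one
# "peptide,HLA" pair per line (see the pd.read_csv call in file_process). A
# minimal example of ./uploaded/multiple_query.txt (values are illustrative):
#
#   NLVPMVATV,HLA-A*0201
#   SIINFEKL,HLA-B*0702
#
# file_process(is_checked='True') would then score both peptides with the CNN
# and, because is_checked is 'True', also attach MHCflurry presentation scores
# before writing the tab-separated result file.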
def score_with_mhcflurry(peptides: np.ndarray, alleles: np.ndarray):
    predictor = Class1PresentationPredictor.load()
    predictor_scores = predictor.predict(peptides=peptides, alleles=alleles, verbose=0)
    return predictor_scores
def binding_score_from_mhcflurry_s(peptide, mhc):
    predictor = Class1PresentationPredictor.load()
    result = predictor.predict(peptides=[peptide], alleles=[mhc], verbose=0)
    binding = result.iloc[0]['presentation_score']
    return float(binding)
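# Single-query sketch for binding_score_from_mhcflurry_s; the peptide/allele
# pair below is illustrative.
# score = binding_score_from_mhcflurry_s("NLVPMVATV", "HLA-A*02:01")
# print(score)   # presentation score in [0, 1]; higher means more likely presented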
def computing_m(peptide, mhc, is_checked):   # multiple MHC query
    table_scaled = wrapper_read_scaling()   # [21,553]
    after_pca = pca_apply_reduction(table_scaled)   # [21,12]
    hla = pd.read_csv('hla2paratopeTable_aligned.txt', sep='\t')
    hla_dic = hla_df_to_dic(hla)
    inventory = list(hla_dic.keys())
    dic_inventory = dict_inventory(inventory)
    cnn_model = seperateCNN()
    cnn_model.load_weights('cnn_model_331_3_7/')

    hla_score = [
        'HLA-A*0101', 'HLA-A*0201', 'HLA-A*0202', 'HLA-A*0301', 'HLA-A*1101',
        'HLA-A*2402', 'HLA-A*6802', 'HLA-B*0702', 'HLA-B*0801', 'HLA-B*3501',
        'HLA-B*4402'
    ]
    peptide_score = [peptide] * len(hla_score)
    immuno_score = ['0'] * len(hla_score)
    ori_score = pd.DataFrame({
        'peptide': peptide_score,
        'HLA': hla_score,
        'immunogenicity': immuno_score
    })

    dataset_score, hla_type = construct_aaindex(ori_score, hla_dic, after_pca, dic_inventory)
    input1_score = pull_peptide_aaindex(dataset_score)
    input2_score = pull_hla_aaindex(dataset_score)
    label_score = pull_label_aaindex(dataset_score)
    scoring = cnn_model.predict(x=[input1_score, input2_score])
    ori_score['immunogenicity'] = scoring
    ori_score.sort_values(by=['immunogenicity'], ascending=False, inplace=True)
    top5 = ori_score.iloc[0:5]
    p = top5['peptide'].tolist()
    m = top5['HLA'].tolist()
    i = [item for item in top5['immunogenicity']]

    # for these 5 complexes, compute binding affinity
    if is_checked == 'True':
        '''
        Strange input requirement: when you have 5 peptides and 5 MHC alleles,
        you have to construct the call like this:

        a = predictor.predict(
            peptides=["NLVPMVATV", "AAAAAAAAA", "TTTTTTTT", "PPPPPPPP", "QQQQQQQQ"],
            alleles={'sample0': ['HLA-C*0517'],
                     'sample1': ['HLA-C*0602'],
                     'sample2': ['HLA-C*0401'],
                     'sample3': ['HLA-B*4403'],
                     'sample4': ['HLA-B*5101']},
            verbose=0)

        Since mhcflurry computes a cross-product of peptides and samples, I then
        pick only the value I need from the returned result.
        '''
        tmp_dic_for_alleles = {}
        for index, mhc_ in enumerate(m):
            tmp_dic_for_alleles['sample{}'.format(index)] = [mhc_]
        predictor = Class1PresentationPredictor.load()
        result = predictor.predict(peptides=p, alleles=tmp_dic_for_alleles, verbose=0)
        final = []
        for sample, chunk in result.groupby(by='sample_name'):
            index = int(sample[-1:])
            final.append(chunk.iloc[index, :]['presentation_score'])
    else:
        final = ['NA', 'NA', 'NA', 'NA', 'NA']
    return p, m, i, final
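# Standalone sketch of the cross-product behavior described in computing_m
# (peptides and alleles below are illustrative; assumes the module-level
# mhcflurry import). With 2 peptides and 2 single-allele "samples",
# predictor.predict returns 2 x 2 = 4 rows; to pair peptide i with sample i
# we keep only the diagonal, exactly as computing_m does.
# predictor = Class1PresentationPredictor.load()
# demo = predictor.predict(
#     peptides=["NLVPMVATV", "SIINFEKL"],
#     alleles={'sample0': ['HLA-A*02:01'], 'sample1': ['HLA-B*07:02']},
#     verbose=0)
# diagonal = [chunk.iloc[int(name[-1:])]['presentation_score']
#             for name, chunk in demo.groupby('sample_name')]
# print(diagonal)   # [NLVPMVATV scored on sample0, SIINFEKL scored on sample1]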
from mhcflurry import Class1PresentationPredictor

predictor = Class1PresentationPredictor.load()
print(len(predictor.supported_alleles))
for item in predictor.supported_alleles:
    print(item)

# alleles = predictor.supported_alleles

# df = predictor.predict(
#     peptides=["SIINFEKL", "NLVPMVATV"],
#     alleles=["HLA-A0201", "HLA-A0301"],
#     verbose=0)
#
# print(df)
#
#
# def test_affinity():
#     from mhcflurry import Class1AffinityPredictor
#     affinity_predictor = Class1AffinityPredictor.load()
#     df = affinity_predictor.predict_to_dataframe(alleles="HLA-A0201", peptides=["SIINFEKL", "SIINFEQL"])
#     print(df)
def MHCflurry(df, alleles_list):
    predictor = Class1PresentationPredictor.load()

    # create dicts of index and nmer
    mut_dict = {}
    wt_dict = {}
    for i, r in df.loc[((df['Mut nmer'] != '-') & (df['Mut nmer'].str.len() >= 8))].iterrows():
        mut_dict[i] = r['Mut nmer']
    for i, r in df.loc[((df['Wt nmer'] != '-') & (df['Wt nmer'].str.len() >= 8))].iterrows():
        wt_dict[i] = r['Wt nmer']

    ## For whatever reason result='all' isn't outputting everything when all alleles
    ## are passed at once. Looping through each HLA works fine, and since MHCflurry
    ## is quick this shouldn't be much of a problem.
    MHCFlurryMut = pd.DataFrame()
    MHCFlurryWt = pd.DataFrame()
    for a in alleles_list:
        MHCFlurryMut = MHCFlurryMut.append(predictor.predict_sequences(
            sequences=mut_dict,
            result='all',
            alleles=[a],
            peptide_lengths=(8, 9, 10, 11, 12),
            use_flanks=True,
            verbose=0), ignore_index=True)
        MHCFlurryWt = MHCFlurryWt.append(predictor.predict_sequences(
            sequences=wt_dict,
            result='all',
            alleles=[a],
            peptide_lengths=(8, 9, 10, 11, 12),
            use_flanks=True,
            verbose=0), ignore_index=True)

    ## add lengths in
    MHCFlurryMut['peptide length'] = MHCFlurryMut['peptide'].str.len()
    MHCFlurryWt['peptide length'] = MHCFlurryWt['peptide'].str.len()

    ## rename some columns
    MHCFlurryMut.rename(columns={'peptide': 'Mutant peptide',
                                 'affinity': 'MHCflurry mutant affinity',
                                 'best_allele': 'HLA',
                                 'affinity_percentile': 'MHCflurry affinity percentile rank mutant',
                                 'processing_score': 'MHCflurry processing score mutant',
                                 'presentation_score': 'MHCflurry presentation score mutant'},
                        inplace=True)
    MHCFlurryWt.rename(columns={'peptide': 'Wild type peptide',
                                'affinity': 'MHCflurry wild type affinity',
                                'best_allele': 'HLA',
                                'affinity_percentile': 'MHCflurry affinity percentile rank wild type',
                                'processing_score': 'MHCflurry processing score wild type',
                                'presentation_score': 'MHCflurry presentation score wild type'},
                       inplace=True)

    ## merge mutant and wild type and remove wild type peptides in the mutant peptide column
    merged = MHCFlurryMut.merge(MHCFlurryWt, how='left',
                                left_on=['sequence_name', 'pos', 'HLA', 'peptide length'],
                                right_on=['sequence_name', 'pos', 'HLA', 'peptide length'])
    merged = merged.loc[~(merged['Mutant peptide'] == merged['Wild type peptide'])][
        ['sequence_name', 'HLA', 'peptide length', 'Mutant peptide', 'Wild type peptide',
         'MHCflurry mutant affinity', 'MHCflurry wild type affinity',
         'MHCflurry affinity percentile rank mutant', 'MHCflurry affinity percentile rank wild type',
         'MHCflurry processing score mutant', 'MHCflurry processing score wild type',
         'MHCflurry presentation score mutant', 'MHCflurry presentation score wild type']]

    ## merge with original dataframe
    out = df.merge(merged, how='left', left_index=True, right_on='sequence_name').drop(columns='sequence_name')
    out['MHCFlurry Wt:Mut rank'] = out.apply(
        lambda x: WT2MutpercentileRank(x['MHCflurry affinity percentile rank wild type'],
                                       x['MHCflurry affinity percentile rank mutant']),
        axis=1)

    ## return out
    return out
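# Usage sketch for MHCflurry(); the nmer sequences and alleles below are
# illustrative. The input DataFrame needs 'Mut nmer' and 'Wt nmer' columns
# (with '-' marking missing sequences), and alleles_list is a list of HLA
# allele names to score against.
# example = pd.DataFrame({
#     'Mut nmer': ['KSDAFLHSAYQMPLTYRAGTE', '-'],
#     'Wt nmer':  ['KSDAFLHSACQMPLTYRAGTE', '-'],
# })
# annotated = MHCflurry(example, alleles_list=['HLA-A*02:01', 'HLA-B*07:02'])
# print(annotated[['Mutant peptide', 'Wild type peptide', 'HLA',
#                  'MHCflurry mutant affinity', 'MHCflurry wild type affinity']].head())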
def obtain_allele_list():
    predictor = Class1PresentationPredictor.load()
    # snippet for obtaining supported alleles directly
    return predictor.supported_alleles