for ic50 in ic50_data:
    float_ic50_data = float(ic50)
    regression_ic50.append(dataset.map_ic50_for_regression(float_ic50_data))
    binarized_ic50.append(dataset.binarize_ic50(float_ic50_data))

# count how many peptides were binarized as non-binders
count = 0
for item in binarized_ic50:
    if item == 0:
        count += 1
#print (regression_ic50)
#print (count)

"""
What does mask_peptides do?
"""
mask_len = MHCI_MASK_LEN
with open(data, 'r') as csvfile:
    reader = csv.DictReader(csvfile)
    peptides = [row['peptide'] for row in reader]
masked_peptides = dataset.mask_peptides(peptides, max_len=mask_len)
#print (masked_peptides)

"""
What does cut_pad_peptides do?
"""
padded_peptides = dataset.cut_pad_peptides(peptides)
#print(padded_peptides)

"""
What does tensorize_keras do?
"""
tensorized = dataset.tensorize_keras(padded_peptides, embed_type="onehot")
print(tensorized)
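# A minimal sketch of the IC50 transforms these helpers likely perform.
# This is an assumption based on the log-scaling commonly used by MHC binding
# predictors (max_ic50=50000 and ic50_threshold=500 match the defaults in the
# predict() below), not the actual MHCnuggets implementation.
import math

MAX_IC50 = 50000  # assumed cap on measurable IC50 (nM)

def map_ic50_for_regression_sketch(ic50, max_ic50=MAX_IC50):
    """Map an IC50 in nM to a regression target in [0, 1]; higher = stronger binder."""
    return max(0.0, min(1.0, 1.0 - math.log(ic50) / math.log(max_ic50)))

def map_proba_to_ic50_sketch(proba, max_ic50=MAX_IC50):
    """Invert the transform: recover an IC50 (nM) from a model output in [0, 1]."""
    return max_ic50 ** (1.0 - proba)

def binarize_ic50_sketch(ic50, threshold=500):
    """1 for binders (IC50 <= threshold nM), 0 otherwise."""
    return 1 if ic50 <= threshold else 0

# e.g. map_ic50_for_regression_sketch(500) ~= 0.426 and
#      map_proba_to_ic50_sketch(0.426) ~= 500, so the two functions round-trip.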
def predict(class_, peptides_path, mhc, model='lstm',
            weights_path=None, output=None):
    '''
    Prediction protocol
    '''

    # read peptides
    peptides = [p.strip() for p in open(peptides_path)]

    # set the length
    if class_.upper() == 'I':
        mask_len = MHCI_MASK_LEN
    elif class_.upper() == 'II':
        mask_len = MHCII_MASK_LEN

    print('Predicting for %d peptides' % (len(peptides)))

    # apply cut/pad or mask to same length
    if 'lstm' in model or 'gru' in model:
        normed_peptides = mask_peptides(peptides, max_len=mask_len)
    else:
        normed_peptides = cut_pad_peptides(peptides)

    # get tensorized values for prediction
    peptides_tensor = tensorize_keras(normed_peptides, embed_type='softhot')

    # make model
    print('Building model')

    # define model
    if model == 'lstm':
        model = mhcnuggets_lstm(input_size=(mask_len, NUM_AAS))

    if weights_path:
        model.load_weights(weights_path)
    else:
        if class_.upper() == 'I':
            predictor_mhc = closest_mhcI(mhc)
        elif class_.upper() == 'II':
            predictor_mhc = closest_mhcII(mhc)
        print("Closest allele found", predictor_mhc)
        model.load_weights(os.path.join(MHCNUGGETS_HOME, "saves", "production",
                                        predictor_mhc + '.h5'))

    model.compile(loss='mse', optimizer=Adam(lr=0.001))

    # test model
    preds_continuous, preds_binary = get_predictions(peptides_tensor, model)
    ic50s = [map_proba_to_ic50(p[0]) for p in preds_continuous]

    # write out results
    if output:
        filehandle = open(output, 'w')
    else:
        filehandle = sys.stdout

    print(','.join(('peptide', 'ic50')), file=filehandle)
    for i, peptide in enumerate(peptides):
        print(','.join((peptide, str(ic50s[i]))), file=filehandle)
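# Hypothetical usage of the predict() above. The file paths are placeholders,
# and the allele string follows the HLA-A02:01 style used by MHCnuggets; if
# weights_path is omitted, the closest trained allele's production weights are
# loaded instead.
predict(class_='I',
        peptides_path='peptides.txt',   # one peptide per line (hypothetical file)
        mhc='HLA-A02:01',               # query allele
        model='lstm',
        output='predictions.csv')       # omit to write the peptide,ic50 CSV to stdout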
def predict(class_, peptides_path, mhc,
            pickle_path='data/production/examples_per_allele.pkl',
            model='lstm', model_weights_path="saves/production/",
            output=None, mass_spec=False, ic50_threshold=500,
            max_ic50=50000, embed_peptides=False,
            binary_preds=False, ba_models=False):
    '''
    Prediction protocol
    '''

    # read peptides
    peptides = [p.strip() for p in open(peptides_path)]

    # set the length
    if class_.upper() == 'I':
        mask_len = MHCI_MASK_LEN
        input_size = (MHCI_MASK_LEN, NUM_AAS)
    elif class_.upper() == 'II':
        mask_len = MHCII_MASK_LEN
        input_size = (MHCII_MASK_LEN, NUM_AAS)

    print('Predicting for %d peptides' % (len(peptides)))

    # apply cut/pad or mask to same length
    normed_peptides, original_peptides = mask_peptides(peptides, max_len=mask_len)

    # get tensorized values for prediction
    peptides_tensor = tensorize_keras(normed_peptides, embed_type='softhot')

    # make model
    print('Building model')
    model = mhcnuggets_lstm(input_size)

    if class_.upper() == 'I':
        predictor_mhc = closest_mhcI(mhc, pickle_path)
    elif class_.upper() == 'II':
        predictor_mhc = closest_mhcII(mhc, pickle_path)
    print("Closest allele found", predictor_mhc)

    if model_weights_path != "saves/production/":
        print('Predicting with user-specified model: ' + model_weights_path)
        model.load_weights(model_weights_path)
    elif ba_models:
        print('Predicting with only binding affinity trained models')
        model.load_weights(os.path.join(MHCNUGGETS_HOME, model_weights_path,
                                        predictor_mhc + '_BA.h5'))
    elif os.path.isfile(os.path.join(MHCNUGGETS_HOME, model_weights_path,
                                     predictor_mhc + '_BA_to_HLAp.h5')):
        print('BA_to_HLAp model found, predicting with BA_to_HLAp model...')
        model.load_weights(os.path.join(MHCNUGGETS_HOME, model_weights_path,
                                        predictor_mhc + '_BA_to_HLAp.h5'))
    else:
        print('No BA_to_HLAp model found, predicting with BA model...')
        model.load_weights(os.path.join(MHCNUGGETS_HOME, model_weights_path,
                                        predictor_mhc + '_BA.h5'))

    if mass_spec:
        model.compile(loss='binary_crossentropy', optimizer=Adam(lr=0.001))
    else:
        model.compile(loss='mse', optimizer=Adam(lr=0.001))

    # test model
    preds_continuous, preds_binary = get_predictions(peptides_tensor, model,
                                                     binary_preds, embed_peptides,
                                                     ic50_threshold, max_ic50)
    ic50s = [map_proba_to_ic50(p[0], max_ic50) for p in preds_continuous]

    # write out results
    if output:
        filehandle = open(output, 'w')
    else:
        filehandle = sys.stdout

    print(','.join(('peptide', 'ic50')), file=filehandle)
    for i, peptide in enumerate(original_peptides):
        print(','.join((peptide, str(round(ic50s[i], 2)))), file=filehandle)
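# Hypothetical call of the newer predict(); file paths are placeholders.
# Compared with the earlier version, mass_spec switches the loss to binary
# cross-entropy, ba_models forces the binding-affinity-only (*_BA.h5) weights,
# and ic50_threshold/max_ic50 control binarization and the IC50 back-transform.
predict(class_='I',
        peptides_path='peptides.txt',   # hypothetical input file, one peptide per line
        mhc='HLA-A02:01',
        output='predictions.csv',
        mass_spec=False,                # True -> mass-spec style model with BCE loss
        ba_models=False,                # True -> load only the *_BA.h5 weights
        ic50_threshold=500,
        max_ic50=50000)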