Example #1
import csv  # used below for DictReader

# ic50_data, dataset, data and MHCI_MASK_LEN are assumed to be defined/imported
# earlier in the source file this example was taken from
regression_ic50 = []
binarized_ic50 = []
for ic50 in ic50_data:
    float_ic50_data = float(ic50)
    regression_ic50.append(dataset.map_ic50_for_regression(float_ic50_data))
    binarized_ic50.append(dataset.binarize_ic50(float_ic50_data))

# count entries binarized to 0 (conventionally the non-binders)
count = 0
for item in binarized_ic50:
    if item == 0:
        count += 1
# print(regression_ic50)
# print(count)
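# A minimal sketch of what the two mappings above are assumed to do (the usual
# MHC-binding convention; not verified against this library's source):
# regression targets compress IC50 to [0, 1] via 1 - log(ic50)/log(max_ic50),
# and binarization applies a 500 nM binder threshold.
import math

def map_ic50_for_regression_sketch(ic50, max_ic50=50000):
    # hypothetical illustration only
    return 1.0 - math.log(min(ic50, max_ic50), max_ic50)

def binarize_ic50_sketch(ic50, threshold=500):
    # 1 = binder, 0 = non-binder (hypothetical illustration only)
    return 1 if ic50 <= threshold else 0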
"""
mask_peptides干啥的
"""
mask_len = MHCI_MASK_LEN
# read the 'peptide' column from the CSV file referenced by `data`
with open(data, 'r') as csvfile:
    reader = csv.DictReader(csvfile)
    peptides = [row['peptide'] for row in reader]
masked_peptides = dataset.mask_peptides(peptides, max_len=mask_len)
# print(masked_peptides)
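# mask_peptides is expected to bring every peptide to a common length (max_len)
# by appending a mask/padding character and to skip peptides longer than
# max_len -- an assumption inferred from how the result is used, not a quote of
# the library's docs. A hypothetical one-liner of the same idea:
def mask_to_len_sketch(peptide, max_len, mask_char='Z'):
    return peptide + mask_char * (max_len - len(peptide)) if len(peptide) <= max_len else None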
"""
cut_pad_peptides干啥的
"""
padded_peptides = dataset.cut_pad_peptides(peptides)
# print(padded_peptides)
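# cut_pad_peptides, by contrast, is expected to force a fixed length by cutting
# peptides that are too long and padding those that are too short (an assumption
# based on the name; the library's exact pad character and length are not shown here).
def cut_pad_sketch(peptide, target_len, pad_char='Z'):
    # hypothetical illustration only
    return peptide[:target_len].ljust(target_len, pad_char)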
"""
tensorize_keras干啥的
"""
tensorized = dataset.tensorize_keras(padded_peptides, embed_type="onehot")
print(tensorized)
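# tensorize_keras should yield one encoded matrix per peptide, i.e. an
# array-like of shape (num_peptides, peptide_len, NUM_AAS) for the "onehot"
# embedding. Shape check, assuming a numpy array is returned:
print(getattr(tensorized, 'shape', None))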
def predict(class_,
            peptides_path,
            mhc,
            model='lstm',
            weights_path=None,
            output=None):
    '''
    Prediction protocol
    '''

    # read peptides
    peptides = [p.strip() for p in open(peptides_path)]

    # set the length
    if class_.upper() == 'I':
        mask_len = MHCI_MASK_LEN
    elif class_.upper() == 'II':
        mask_len = MHCII_MASK_LEN

    print('Predicting for %d peptides' % (len(peptides)))

    # apply cut/pad or mask to same length
    if 'lstm' in model or 'gru' in model:
        normed_peptides = mask_peptides(peptides, max_len=mask_len)
    else:
        normed_peptides = cut_pad_peptides(peptides)

    # get tensorized values for prediction
    peptides_tensor = tensorize_keras(normed_peptides, embed_type='softhot')

    # make model
    print('Building model')
    # define model
    if model == 'lstm':
        model = mhcnuggets_lstm(input_size=(mask_len, NUM_AAS))

    if weights_path:
        model.load_weights(weights_path)
    else:
        if class_.upper() == 'I':
            predictor_mhc = closest_mhcI(mhc)
        elif class_.upper() == 'II':
            predictor_mhc = closest_mhcII(mhc)

        print("Closest allele found", predictor_mhc)
        model.load_weights(
            os.path.join(MHCNUGGETS_HOME, "saves", "production",
                         predictor_mhc + '.h5'))

    model.compile(loss='mse', optimizer=Adam(lr=0.001))

    # test model
    preds_continuous, preds_binary = get_predictions(peptides_tensor, model)
    ic50s = [map_proba_to_ic50(p[0]) for p in preds_continuous]

    # write out results
    if output:
        filehandle = open(output, 'w')
    else:
        filehandle = sys.stdout

    print(','.join(('peptide', 'ic50')), file=filehandle)
    for i, peptide in enumerate(peptides):
        print(','.join((peptide, str(ic50s[i]))), file=filehandle)
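A minimal call sketch for this version of predict, assuming a plain-text input with one peptide per line; the file names and allele below are placeholders, not values taken from the example:

predict(class_='I',
        peptides_path='peptides.txt',
        mhc='HLA-A02:01',
        model='lstm',
        output='predictions.csv')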
Example #3
def predict(class_,
            peptides_path,
            mhc,
            pickle_path='data/production/examples_per_allele.pkl',
            model='lstm',
            model_weights_path="saves/production/",
            output=None,
            mass_spec=False,
            ic50_threshold=500,
            max_ic50=50000,
            embed_peptides=False,
            binary_preds=False,
            ba_models=False):
    '''
    Prediction protocol
    '''
    # read peptides
    peptides = [p.strip() for p in open(peptides_path)]

    # set the length
    if class_.upper() == 'I':
        mask_len = MHCI_MASK_LEN
        input_size = (MHCI_MASK_LEN, NUM_AAS)
    elif class_.upper() == 'II':
        mask_len = MHCII_MASK_LEN
        input_size = (MHCII_MASK_LEN, NUM_AAS)

    print('Predicting for %d peptides' % (len(peptides)))

    # apply cut/pad or mask to same length
    normed_peptides, original_peptides = mask_peptides(peptides,
                                                       max_len=mask_len)
    # get tensorized values for prediction
    peptides_tensor = tensorize_keras(normed_peptides, embed_type='softhot')

    # make model
    print('Building model')
    model = mhcnuggets_lstm(input_size)
    if class_.upper() == 'I':
        predictor_mhc = closest_mhcI(mhc, pickle_path)
    elif class_.upper() == 'II':
        predictor_mhc = closest_mhcII(mhc, pickle_path)
    print("Closest allele found", predictor_mhc)

    if model_weights_path != "saves/production/":
        print('Predicting with user-specified model: ' + model_weights_path)
        model.load_weights(model_weights_path)
    elif ba_models:
        print('Predicting with only binding affinity trained models')
        model.load_weights(
            os.path.join(MHCNUGGETS_HOME, model_weights_path,
                         predictor_mhc + '_BA.h5'))
    elif os.path.isfile(
            os.path.join(MHCNUGGETS_HOME, model_weights_path,
                         predictor_mhc + '_BA_to_HLAp.h5')):
        print('BA_to_HLAp model found, predicting with BA_to_HLAp model...')
        model.load_weights(
            os.path.join(MHCNUGGETS_HOME, model_weights_path,
                         predictor_mhc + '_BA_to_HLAp.h5'))
    else:
        print('No BA_to_HLAp model found, predicting with BA model...')
        model.load_weights(
            os.path.join(MHCNUGGETS_HOME, model_weights_path,
                         predictor_mhc + '_BA.h5'))

    if mass_spec:
        model.compile(loss='binary_crossentropy', optimizer=Adam(lr=0.001))
    else:
        model.compile(loss='mse', optimizer=Adam(lr=0.001))

    # test model
    preds_continuous, preds_binary = get_predictions(peptides_tensor, model,
                                                     binary_preds,
                                                     embed_peptides,
                                                     ic50_threshold, max_ic50)
    ic50s = [map_proba_to_ic50(p[0], max_ic50) for p in preds_continuous]

    # write out results
    if output:
        filehandle = open(output, 'w')
    else:
        filehandle = sys.stdout

    print(','.join(('peptide', 'ic50')), file=filehandle)
    for i, peptide in enumerate(original_peptides):
        print(','.join((peptide, str(round(ic50s[i], 2)))), file=filehandle)
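The Example #3 signature can be driven the same way; the extra keyword arguments below simply restate its defaults, and the paths and allele are again placeholders:

predict(class_='I',
        peptides_path='peptides.txt',
        mhc='HLA-A02:01',
        ic50_threshold=500,
        max_ic50=50000,
        output='predictions.csv')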