def test(class_, data, mhc, model_path, model='lstm'):
    '''
    Evaluation protocol
    '''

    # print out options
    print('Testing\nMHC: %s\nData: %s\nModel: %s\nSave path: %s' %
          (mhc, data, model, model_path))

    # load test data
    test_data = Dataset.from_csv(filename=data,
                                 sep=',',
                                 allele_column_name='mhc',
                                 peptide_column_name='peptide',
                                 affinity_column_name='IC50(nM)')

    # set the length
    if class_.upper() == 'I':
        mask_len = MHCI_MASK_LEN
    elif class_.upper() == 'II':
        mask_len = MHCII_MASK_LEN

    # apply cut/pad or mask to same length
    if 'lstm' in model or 'gru' in model or 'attn' in model:
        test_data.mask_peptides(max_len=mask_len)
    else:
        test_data.cut_pad_peptides()

    # get the allele specific data
    mhc_test = test_data.get_allele(mhc)

    # define model
    if model == 'lstm':
        model = mhcnuggets_lstm(input_size=(mask_len, NUM_AAS))

    # load weights and compile model
    model.load_weights(model_path)
    model.compile(loss='mse', optimizer=Adam(lr=0.001))

    # get tensorized values for testing
    test_peptides, test_continuous, test_binary = mhc_test.tensorize_keras(
        embed_type='softhot')

    # test
    preds_continuous, preds_binary = get_predictions(test_peptides, model)
    test_auc = roc_auc_score(test_binary, preds_continuous)
    test_f1 = f1_score(test_binary, preds_binary)
    test_ktau = kendalltau(test_continuous, preds_continuous)[0]
    print('Test AUC: %.4f, F1: %.4f, KTAU: %.4f' %
          (test_auc, test_f1, test_ktau))
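
# Hedged illustrative sketch (not the actual mhcnuggets implementation):
# get_predictions() above is assumed to return the model's continuous scores
# together with thresholded binary calls, which is what the AUC / F1 /
# Kendall-tau block needs. The helper name and the 0.5 cutoff are assumptions.
def get_predictions_sketch(peptides_tensor, model, threshold=0.5):
    """Return (continuous scores, binary calls) from a compiled Keras model."""
    preds_continuous = model.predict(peptides_tensor)
    preds_binary = [1 if p[0] >= threshold else 0 for p in preds_continuous]
    return preds_continuous, preds_binary
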
def predict(class_,
            peptides_path,
            mhc,
            model='lstm',
            weights_path=None,
            output=None):
    '''
    Prediction protocol
    '''

    # read peptides
    with open(peptides_path) as peptides_file:
        peptides = [p.strip() for p in peptides_file]

    # set the length
    if class_.upper() == 'I':
        mask_len = MHCI_MASK_LEN
    elif class_.upper() == 'II':
        mask_len = MHCII_MASK_LEN

    print('Predicting for %d peptides' % (len(peptides)))

    # apply cut/pad or mask to same length
    if 'lstm' in model or 'gru' in model:
        normed_peptides = mask_peptides(peptides, max_len=mask_len)
    else:
        normed_peptides = cut_pad_peptides(peptides)

    # get tensorized values for prediction
    peptides_tensor = tensorize_keras(normed_peptides, embed_type='softhot')

    # define model
    print('Building model')
    if model == 'lstm':
        model = mhcnuggets_lstm(input_size=(mask_len, NUM_AAS))

    if weights_path:
        model.load_weights(weights_path)
    else:
        if class_.upper() == 'I':
            predictor_mhc = closest_mhcI(mhc)
        elif class_.upper() == 'II':
            predictor_mhc = closest_mhcII(mhc)

        print("Closest allele found", predictor_mhc)
        model.load_weights(
            os.path.join(MHCNUGGETS_HOME, "saves", "production",
                         predictor_mhc + '.h5'))

    model.compile(loss='mse', optimizer=Adam(lr=0.001))

    # test model
    preds_continuous, preds_binary = get_predictions(peptides_tensor, model)
    ic50s = [map_proba_to_ic50(p[0]) for p in preds_continuous]

    # write out results
    if output:
        filehandle = open(output, 'w')
    else:
        filehandle = sys.stdout

    print(','.join(('peptide', 'ic50')), file=filehandle)
    for i, peptide in enumerate(peptides):
        print(','.join((peptide, str(ic50s[i]))), file=filehandle)

    # close the output file if we opened one
    if output:
        filehandle.close()
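
# Hedged sketch of the IC50 mapping used above: map_proba_to_ic50() is assumed
# to invert the usual 1 - log_max_ic50(IC50) transform, so a prediction of 1.0
# maps to ~1 nM and 0.0 maps to max_ic50. The exact formula in mhcnuggets may
# differ; the helper name and default are illustrative assumptions only.
def map_proba_to_ic50_sketch(proba, max_ic50=50000):
    """Convert a [0, 1] network output back to an IC50 value in nM."""
    proba = min(max(proba, 0.0), 1.0)  # clamp to the valid prediction range
    return max_ic50 ** (1.0 - proba)
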
Example #3
def calculate_relation(mhc,
                       data,
                       model,
                       weights_dir,
                       mass_spec,
                       rand_negs,
                       ic50_threshold,
                       max_ic50,
                       binary=False,
                       embed_peptides=False):
    '''
    Tuning MHC calculation protocol: evaluate every other allele's trained
    model on this allele's data to find the best allele for transfer learning
    '''

    print('Calculating tuning MHC for %s' % mhc)

    relations_dict = {}

    # get the allele specific data
    mhc_data, num_positives, num_random_negatives, num_real_negatives = data.get_allele(
        mhc, mass_spec, rand_negs, ic50_threshold)
    train_peptides, train_continuous, train_binary = mhc_data.tensorize_keras(
        embed_type='softhot')
    # track the best tuning allele under each metric; initialize every name the
    # final return statement needs so none of them can be unbound
    best_auc_mhc = best_f1_mhc = best_ppv_top_mhc = ''
    best_auc = 0
    best_f1 = 0
    best_ppv_top = 0
    num_tuning_mhc = 0

    num_mhc = len(mhc_data.peptides)

    for tuning_mhc in sorted(set(data.alleles)):
        # don't want to tune with ourselves
        if mhc == tuning_mhc:
            continue

        # define the path to save weights
        try:
            model_path = os.path.join(weights_dir, tuning_mhc + '.h5')
            model.load_weights(model_path)
        except IOError:
            continue
        preds_continuous, preds_binary = get_predictions(
            train_peptides, model, binary, embed_peptides, ic50_threshold,
            max_ic50)

        try:
            auc = roc_auc_score(train_binary, preds_continuous)
            f1 = f1_score(train_binary, preds_binary)
            # make preds_continuous, train_binary and preds_binary into a matrix,
            # sort by preds_continuous, and compute precision on the top
            # num_positives rows only
            raveled_preds_continuous = np.array(preds_continuous,
                                                dtype='float32').ravel()
            np_lists = np.array(
                [raveled_preds_continuous, preds_binary, train_binary])
            columns = ['pred_cont', 'pred_bin', 'true_bin']
            dframe = pd.DataFrame(np_lists.T, columns=columns)
            dframe.sort_values('pred_cont', inplace=True, ascending=False)
            dframe_head = dframe.head(num_positives)
            sorted_pred_cont = dframe_head['pred_cont'].tolist()
            sorted_pred_bin = dframe_head['pred_bin'].tolist()
            sorted_true_bin = dframe_head['true_bin'].tolist()
            ppv_top = precision_score(sorted_true_bin,
                                      sorted_pred_bin,
                                      pos_label=1)

            #print ('MHC: %s, AUC: %.4f, F1: %.4f, KTAU: %.4f' % (tuning_mhc,
            #                                                     auc,
            #                                                     f1,
            #                                                     ktau))
            if auc > best_auc:
                best_auc_mhc = tuning_mhc
                best_auc = auc
            if f1 > best_f1:
                best_f1_mhc = tuning_mhc
                best_f1 = f1
            if ppv_top > best_ppv_top:
                best_ppv_top_mhc = tuning_mhc
                best_ppv_top = ppv_top

            adata, num_pos, num_rand_neg, num_real_neg = data.get_allele(
                tuning_mhc, mass_spec, rand_negs, ic50_threshold)
            num_tuning_mhc = len(adata.peptides)

        except ValueError:
            continue

    return best_auc_mhc, best_auc, best_f1_mhc, best_f1, best_ppv_top_mhc, best_ppv_top, num_mhc, num_tuning_mhc
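
# The "precision on the top-n ranked predictions" block above reappears in the
# training and testing code below; this is a hedged standalone sketch of the
# same idea without pandas. The helper name is mine, not part of mhcnuggets.
import numpy as np


def ppv_top_n_sketch(preds_continuous, preds_binary, true_binary, n_positives):
    """Sort by continuous score, keep the top n_positives rows, return PPV."""
    scores = np.asarray(preds_continuous, dtype='float32').ravel()
    order = np.argsort(scores)[::-1][:n_positives]  # highest scores first
    top_pred = np.asarray(preds_binary)[order]
    top_true = np.asarray(true_binary)[order]
    called_positive = top_pred == 1
    if not called_positive.any():
        return 0.0  # no predicted positives in the top-n slice
    return float((top_true[called_positive] == 1).mean())
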
def train(class_, data, mhc, save_path, n_epoch,
          model='lstm', lr=0.001, transfer_path=None):
    '''
    Training protocol
    '''

    # store model name
    model_name = model

    # print out options
    print('Training\nMHC: %s\nData: %s\nModel: %s\nSave path: %s\nTransfer: %s' %
          (mhc, data, model, save_path, transfer_path))

    # load training data
    train_data = Dataset.from_csv(filename=data,
                                  sep=',',
                                  allele_column_name='mhc',
                                  peptide_column_name='peptide',
                                  affinity_column_name='IC50(nM)')

    # set the length
    if class_.upper() == 'I':
        mask_len = MHCI_MASK_LEN
    elif class_.upper() == 'II':
        mask_len = MHCII_MASK_LEN

    # apply cut/pad or mask to same length
    if 'lstm' in model or 'gru' in model or 'attn' in model:
        train_data.mask_peptides(max_len=mask_len)
    else:
        train_data.cut_pad_peptides()

    # get the allele specific data
    mhc_train = train_data.get_allele(mhc)

    print('Training on %d peptides' % len(mhc_train.peptides))

    # define model
    if model == 'lstm':
        model = mhcnuggets_lstm(input_size=(mask_len, NUM_AAS))

    # check if we need to do transfer learning
    if transfer_path:
        model.load_weights(transfer_path)

    # compile model with the requested learning rate
    model.compile(loss='mse', optimizer=Adam(lr=lr))

    # get tensorized values for training
    train_peptides, train_continuous, train_binary = mhc_train.tensorize_keras(embed_type='softhot')

    # convergence criterion
    highest_f1 = -1

    for epoch in range(n_epoch):

        # train
        model.fit(train_peptides, train_continuous, epochs=1, verbose=0)
        # test model on training data
        train_preds_cont, train_preds_bin = get_predictions(train_peptides, model)
        train_auc = roc_auc_score(train_binary, train_preds_cont)
        train_f1 = f1_score(train_binary, train_preds_bin)
        train_ktau = kendalltau(train_continuous, train_preds_cont)[0]
        print('epoch %d / %d' % (epoch, n_epoch))
        print('Train AUC: %.4f, F1: %.4f, KTAU: %.4f' %
              (train_auc, train_f1, train_ktau))

        # convergence
        if train_f1 > highest_f1:

            highest_f1 = train_f1
            best_epoch = epoch
            model.save_weights(save_path)

    print('Done!')
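
# Hedged usage sketch for the training protocol above; the CSV path, allele
# name and save path are placeholders, not files shipped with mhcnuggets.
if __name__ == '__main__':
    train(class_='I',
          data='data/production/curated_training_data.csv',  # hypothetical path
          mhc='HLA-A02:01',
          save_path='saves/HLA-A02:01.h5',
          n_epoch=100,
          model='lstm',
          lr=0.001)
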
Example #5
def train(class_, data, mhc, save_path, n_epoch, model='lstm',
          lr=0.001, transfer_path=None, mass_spec=False, ic50_threshold=500, max_ic50=50000):
    '''
    Training protocol
    '''
    # store model name
    model_name = model

    # print out options
    print('Training\nMHC: %s\nData: %s\nModel: %s\nSave path: %s\nTransfer: %s\nMassSpec: %s' %
          (mhc, data, model, save_path, transfer_path, mass_spec))

    # load training
    train_data = Dataset.from_csv(filename=data, ic50_threshold=ic50_threshold, max_ic50=max_ic50,
                                  sep=',', 
                                  allele_column_name='mhc',
                                  peptide_column_name='peptide',
                                  affinity_column_name='IC50(nM)',
                                  type_column_name='measurement_type',
                                  source_column_name='measurement_source'
                                  )

    # set the length
    if class_.upper() == 'I':
        mask_len = MHCI_MASK_LEN
    elif class_.upper() == 'II':
        mask_len = MHCII_MASK_LEN

    train_data.mask_peptides(max_len=mask_len)

    # get the allele specific data 
    mhc_train, n_pos, n_rand_neg, n_real_neg = train_data.get_allele(mhc, mass_spec, ic50_threshold)

    """
    #calculate the composition of the actual training set that will be used
    print('Training on %d peptides' % len(mhc_train.peptides))

    print(str(n_pos) + ' positives ')
    print(str(n_real_neg)  + ' real_negatives ')
    if n_real_neg != 0:
        real_skew = math.fabs(math.log((float(n_pos) / float(n_real_neg))))
    else:
        real_skew = "ND"
    print(str(real_skew) + ' real skew')
    print(str(n_rand_neg) + ' random negatives ')
    n_all_neg = n_real_neg + n_rand_neg
    if n_real_neg + n_rand_neg != 0:
        total_skew = math.fabs(math.log((float(n_pos) / float(n_all_neg))))    #including random negs
    else:
        total_skew = "ND"
    print(str(total_skew) + 'total skew after random negs added')
    """

    # define model
    input_size = (mask_len, NUM_AAS)
    model = mhcnuggets_lstm(input_size)

    # check if we need to do transfer learning
    if transfer_path:
        model.load_weights(transfer_path)

    # select the appropriate loss function: binary cross-entropy for mass spec
    # data (binary labels), MSE for binding affinity data (continuous labels)
    if mass_spec:
        model.compile(loss='binary_crossentropy', optimizer=Adam(lr=lr))
    else:
        model.compile(loss='mse', optimizer=Adam(lr=lr))

    # convergence criterion
#    highest_f1 = -1
    highest_ppv_top = -1

    # get tensorized values of the whole dataset for epoch training and for testing
    train_peptides, train_continuous, train_binary = mhc_train.tensorize_keras(embed_type='softhot')

    for epoch in range(n_epoch):
        # train
        model.fit(train_peptides, train_continuous, epochs=1, verbose=0)
        # test model on training data
        train_preds_cont, train_preds_bin = get_predictions(train_peptides, model)
        train_auc = roc_auc_score(train_binary, train_preds_cont)
        train_f1 = f1_score(train_binary, train_preds_bin)
        train_ktau = kendalltau(train_continuous, train_preds_cont)[0]
        raveled_train_preds_cont = np.array(train_preds_cont, dtype='float32').ravel()
        train_pearsonr = pearsonr(train_continuous, raveled_train_preds_cont)[0]
        train_ppv = precision_score(train_binary, train_preds_bin, pos_label=1)
        # make train_preds_cont, train_binary and train_preds_bin into a matrix,
        # sort by train_preds_cont, and compute precision on the top n_pos rows only
        np_lists = np.array([raveled_train_preds_cont, train_preds_bin, train_binary])
        columns = ['pred_cont', 'pred_bin', 'true_bin']
        dframe = pd.DataFrame(np_lists.T, columns=columns)
        dframe.sort_values('pred_cont', inplace=True, ascending=False)
        dframe_head = dframe.head(n_pos)
        sorted_pred_cont = dframe_head['pred_cont'].tolist()
        sorted_pred_bin = dframe_head['pred_bin'].tolist()
        sorted_true_bin = dframe_head['true_bin'].tolist()
        train_ppv_top = precision_score(sorted_true_bin, sorted_pred_bin, pos_label=1)

        print('epoch %d / %d' % (epoch, n_epoch))

        print('Num pos: %d\nTrain AUC: %.4f, F1: %.4f, KTAU: %.4f, PCC: %.4f, PPV: %.4f, PPVtop: %.4f' %
              (n_pos, train_auc, train_f1, train_ktau, train_pearsonr, train_ppv, train_ppv_top))

        # convergence
        if train_ppv_top > highest_ppv_top:
            highest_ppv_top = train_ppv_top
            best_epoch = epoch
            model.save_weights(save_path)

    print('Done!')
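
# Hedged usage sketch for the mass-spec variant of train() above: with
# mass_spec=True the model is compiled with binary cross-entropy instead of
# MSE and convergence is tracked via PPV on the top-scoring peptides. The
# paths and allele name are placeholders.
if __name__ == '__main__':
    train(class_='I',
          data='data/production/mass_spec_training_data.csv',  # hypothetical path
          mhc='HLA-B07:02',
          save_path='saves/HLA-B07:02_HLAp.h5',
          n_epoch=100,
          mass_spec=True,
          ic50_threshold=500,
          max_ic50=50000)
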
Example #6
def predict(class_,
            peptides_path,
            mhc,
            pickle_path='data/production/examples_per_allele.pkl',
            model='lstm',
            model_weights_path="saves/production/",
            output=None,
            mass_spec=False,
            ic50_threshold=500,
            max_ic50=50000,
            embed_peptides=False,
            binary_preds=False,
            ba_models=False):
    '''
    Prediction protocol
    '''
    # read peptides
    with open(peptides_path) as peptides_file:
        peptides = [p.strip() for p in peptides_file]

    # set the length
    if class_.upper() == 'I':
        mask_len = MHCI_MASK_LEN
        input_size = (MHCI_MASK_LEN, NUM_AAS)
    elif class_.upper() == 'II':
        mask_len = MHCII_MASK_LEN
        input_size = (MHCII_MASK_LEN, NUM_AAS)

    print('Predicting for %d peptides' % (len(peptides)))

    # apply cut/pad or mask to same length
    normed_peptides, original_peptides = mask_peptides(peptides,
                                                       max_len=mask_len)
    # get tensorized values for prediction
    peptides_tensor = tensorize_keras(normed_peptides, embed_type='softhot')

    # make model
    print('Building model')
    model = mhcnuggets_lstm(input_size)
    if class_.upper() == 'I':
        predictor_mhc = closest_mhcI(mhc, pickle_path)
    elif class_.upper() == 'II':
        predictor_mhc = closest_mhcII(mhc, pickle_path)
    print("Closest allele found", predictor_mhc)

    if model_weights_path != "saves/production/":
        print('Predicting with user-specified model: ' + model_weights_path)
        model.load_weights(model_weights_path)
    elif ba_models:
        print('Predicting with only binding affinity trained models')
        model.load_weights(
            os.path.join(MHCNUGGETS_HOME, model_weights_path,
                         predictor_mhc + '_BA.h5'))
    elif os.path.isfile(
            os.path.join(MHCNUGGETS_HOME, model_weights_path,
                         predictor_mhc + '_BA_to_HLAp.h5')):
        print('BA_to_HLAp model found, predicting with BA_to_HLAp model...')
        model.load_weights(
            os.path.join(MHCNUGGETS_HOME, model_weights_path,
                         predictor_mhc + '_BA_to_HLAp.h5'))
    else:
        print('No BA_to_HLAp model found, predicting with BA model...')
        model.load_weights(
            os.path.join(MHCNUGGETS_HOME, model_weights_path,
                         predictor_mhc + '_BA.h5'))

    if mass_spec:
        model.compile(loss='binary_crossentropy', optimizer=Adam(lr=0.001))
    else:
        model.compile(loss='mse', optimizer=Adam(lr=0.001))

    # test model
    preds_continuous, preds_binary = get_predictions(peptides_tensor, model,
                                                     binary_preds,
                                                     embed_peptides,
                                                     ic50_threshold, max_ic50)
    ic50s = [map_proba_to_ic50(p[0], max_ic50) for p in preds_continuous]

    # write out results
    if output:
        filehandle = open(output, 'w')
    else:
        filehandle = sys.stdout

    print(','.join(('peptide', 'ic50')), file=filehandle)
    for i, peptide in enumerate(original_peptides):
        print(','.join((peptide, str(round(ic50s[i], 2)))), file=filehandle)

    # close the output file if we opened one
    if output:
        filehandle.close()
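
# Hedged sketch of the peptide normalization step used above: mask_peptides()
# is assumed to pad each peptide to max_len with a mask character and to skip
# peptides that are too long or contain non-standard residues. The 'Z' mask
# character and the filtering rule are illustrative assumptions, not the
# mhcnuggets implementation.
def mask_peptides_sketch(peptides, max_len, mask_char='Z'):
    """Return (padded peptides, the original peptides that were kept)."""
    valid = set('ACDEFGHIKLMNPQRSTVWY')
    normed, originals = [], []
    for pep in peptides:
        if len(pep) > max_len or not set(pep.upper()) <= valid:
            continue  # skip peptides the model cannot handle
        normed.append(pep.upper() + mask_char * (max_len - len(pep)))
        originals.append(pep)
    return normed, originals
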
Example #7
def test(class_,
         data,
         mhc,
         model_path,
         model='lstm',
         mass_spec=False,
         ic50_threshold=500,
         max_ic50=50000):
    '''
    Evaluation protocol
    '''

    # print out options
    print(
        'Testing\nMHC: %s\nData: %s\nModel: %s\nSave path: %s\nMass spec: %s\nIC50 threshold: %s\nMax IC50: %s\n'
        % (mhc, data, model, model_path, mass_spec, ic50_threshold, max_ic50))

    # load test data
    test_data = Dataset.from_csv(filename=data,
                                 ic50_threshold=ic50_threshold,
                                 max_ic50=max_ic50,
                                 sep=',',
                                 allele_column_name='mhc',
                                 peptide_column_name='peptide',
                                 affinity_column_name='IC50(nM)',
                                 type_column_name='measurement_type',
                                 source_column_name='measurement_source')

    # set the length and input size
    if class_.upper() == 'I':
        mask_len = MHCI_MASK_LEN
        input_size = (MHCI_MASK_LEN, NUM_AAS)
    elif class_.upper() == 'II':
        mask_len = MHCII_MASK_LEN
        input_size = (MHCII_MASK_LEN, NUM_AAS)

    # define model and mask peptides to the same length
    model = mhcnuggets_lstm(input_size)
    test_data.mask_peptides(max_len=mask_len)

    # get the allele specific data
    mhc_test, npos, nrandneg, nrealneg = test_data.get_allele(mhc,
                                                              mass_spec,
                                                              ic50_threshold,
                                                              length=None)

    # load weights and compile model
    model.load_weights(model_path)
    if mass_spec:
        model.compile(loss='binary_crossentropy', optimizer=Adam(lr=0.001))
    else:
        model.compile(loss='mse', optimizer=Adam(lr=0.001))

    # get tensorized values for testing
    test_peptides, test_continuous, test_binary = mhc_test.tensorize_keras(
        embed_type='softhot')

    # test
    preds_continuous, preds_binary = get_predictions(
        test_peptides, model, ic50_threshold=ic50_threshold, max_ic50=max_ic50)
    test_auc = roc_auc_score(test_binary, preds_continuous)
    test_f1 = f1_score(test_binary, preds_binary)
    test_ktau = kendalltau(test_continuous, preds_continuous)[0]
    raveled_preds_continuous = np.array(preds_continuous,
                                        dtype='float32').ravel()
    test_pearsonr = pearsonr(test_continuous, raveled_preds_continuous)[0]
    test_ppv = precision_score(test_binary, preds_binary, pos_label=1)
    # make preds_continuous, test_binary and preds_binary into a matrix,
    # sort by preds_continuous, and compute precision on the top npos rows only
    np_lists = np.array([raveled_preds_continuous, preds_binary, test_binary])
    columns = ['pred_cont', 'pred_bin', 'true_bin']
    dframe = pd.DataFrame(np_lists.T, columns=columns)
    dframe.sort_values('pred_cont', inplace=True, ascending=False)
    dframe_head = dframe.head(npos)
    sorted_pred_cont = dframe_head['pred_cont'].tolist()
    sorted_pred_bin = dframe_head['pred_bin'].tolist()
    sorted_true_bin = dframe_head['true_bin'].tolist()
    test_ppv_top = precision_score(sorted_true_bin,
                                   sorted_pred_bin,
                                   pos_label=1)

    print(
        'Test AUC: %.4f, F1: %.4f, KTAU: %.4f, PCC: %.4f, PPV: %.4f, PPVtop: %.4f'
        %
        (test_auc, test_f1, test_ktau, test_pearsonr, test_ppv, test_ppv_top))
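
# Hedged usage sketch for the evaluation protocol above; the data path, allele
# and weights path are placeholders rather than real mhcnuggets artifacts.
if __name__ == '__main__':
    test(class_='I',
         data='data/production/curated_test_data.csv',  # hypothetical path
         mhc='HLA-A02:01',
         model_path='saves/production/HLA-A02:01_BA_to_HLAp.h5',
         mass_spec=False,
         ic50_threshold=500,
         max_ic50=50000)
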