Example #1
def main():
    parser = argparse.ArgumentParser(description="OpenProtein version 0.1")
    parser.add_argument('--silent',
                        dest='silent',
                        action='store_true',
                        help="Don't print verbose debug statements.")
    parser.add_argument('--hide-ui',
                        dest='hide_ui',
                        action='store_true',
                        default=False,
                        help='Hide loss graph and '
                        'visualization UI while training goes on.')
    parser.add_argument('--evaluate-on-test',
                        dest='evaluate_on_test',
                        action='store_true',
                        default=False,
                        help='Run model on test data.')
    parser.add_argument('--use-gpu',
                        dest='use_gpu',
                        action='store_true',
                        default=False,
                        help='Use GPU.')
    parser.add_argument(
        '--eval-interval',
        dest='eval_interval',
        type=int,
        default=10,
        help='Evaluate model on validation set every n minibatches.')
    parser.add_argument('--min-updates',
                        dest='minimum_updates',
                        type=int,
                        default=100,
                        help='Minimum number of minibatch iterations.')
    parser.add_argument('--minibatch-size',
                        dest='minibatch_size',
                        type=int,
                        default=8,
                        help='Size of each minibatch.')
    parser.add_argument('--experiment-id',
                        dest='experiment_id',
                        type=str,
                        default="example",
                        help='Which experiment to run.')
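    # parse_known_args (rather than parse_args) is used, presumably so that the
    # experiment module imported below can define extra flags on the same parser.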
    args, _ = parser.parse_known_args()

    if args.hide_ui:
        write_out("Live plot deactivated, see output folder for plot.")

    use_gpu = args.use_gpu

    if use_gpu and not torch.cuda.is_available():
        write_out("Error: --use-gpu was set, but no GPU is available.")
        sys.exit(1)

    if not args.hide_ui:
        # start web server
        start_dashboard_server()

    experiment = importlib.import_module("experiments." + args.experiment_id)
    experiment.run_experiment(parser, use_gpu)
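
Presumably this script is executed directly; a minimal entry-point guard, not part of the original excerpt, would be:

if __name__ == "__main__":
    main()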
Example #2
parser.add_argument('--minibatch-size', dest='minibatch_size', type=int,
                    default=1, help='Size of each minibatch.')
parser.add_argument('--learning-rate', dest='learning_rate', type=float,
                    default=0.01, help='Learning rate to use during training.')
args, unknown = parser.parse_known_args()

if args.hide_ui:
    write_out("Live plot deactivated, see output folder for plot.")

use_gpu = False
if torch.cuda.is_available():
    write_out("CUDA is available, using GPU")
    use_gpu = True

# start web server
start_dashboard_server()

process_raw_data(use_gpu, force_pre_processing_overwrite=False)
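# process_raw_data presumably converts the raw data files into the preprocessed
# HDF5 files used below; force_pre_processing_overwrite=False keeps existing output.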

training_file = "data/preprocessed/sample.txt.hdf5"
validation_file = "data/preprocessed/sample.txt.hdf5"
testing_file = "data/preprocessed/testing.hdf5"

model = ExampleModel(21, args.minibatch_size, use_gpu=use_gpu)  # embed size = 21
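# The embedding size of 21 covers the 20 amino acids plus one padding value
# (cf. VOCAB_SIZE in Example #4).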

train_loader = contruct_dataloader_from_disk(training_file, args.minibatch_size)
validation_loader = contruct_dataloader_from_disk(validation_file, args.minibatch_size)

train_model_path = train_model(data_set_identifier="TRAIN",
                               model=model,
                               train_loader=train_loader,
                               validation_loader=validation_loader)
# (The original excerpt was truncated here; validation_loader and the closing
# parenthesis are assumed in order to make the call syntactically complete.)
Example #3
def run():
    print('Starting server...')
    start_dashboard_server()
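    # Give the dashboard server a moment to start up before using it.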
    time.sleep(5)

    data_file = "data/preprocessed/validation_100.hdf5"

    protein_loader = contruct_dataloader_from_disk(data_file, 1, with_id=True)
    dataset_size = len(protein_loader.dataset)

    use_gpu = False
    if torch.cuda.is_available():
        write_out("CUDA is available, using GPU")
        use_gpu = True

    drmsd_total = 0
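    # parse_tertiary presumably builds a map from protein id to the predicted
    # tertiary coordinates read from disk.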
    p_id_to_structure = parse_tertiary()
    for minibatch_id, training_minibatch in enumerate(protein_loader, 0):

        print("Predicting next protein")

        primary_sequence, tertiary_positions, mask, p_ids = training_minibatch

        p_id = str(p_ids[0][0], 'utf-8')
        predicted_pos = p_id_to_structure[p_id]
        predicted_pos = torch.Tensor(predicted_pos)
        predicted_pos = predicted_pos / 100  # to angstrom units
        pos = tertiary_positions[0]
        mask = mask[0][:len(predicted_pos)]
        pos = apply_mask(pos, mask)
        predicted_pos = predicted_pos[mask.nonzero()].squeeze(dim=1)
        primary_sequence = torch.masked_select(primary_sequence[0], mask)
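        # Derive backbone dihedral angles from both the actual and the predicted
        # coordinates; the PDB structures written below are built from these.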
        angles = calculate_dihedral_angels(pos, use_gpu)
        angles_pred = calculate_dihedral_angels(predicted_pos, use_gpu)

        # predicted_structure = get_structure_from_angles(primary_sequence[0], angles_pred)
        # actual_structure = get_structure_from_angles(primary_sequence[0], angles)
        write_to_pdb(get_structure_from_angles(primary_sequence, angles),
                     "actual")
        write_to_pdb(get_structure_from_angles(primary_sequence, angles_pred),
                     "predicted")

        pos = pos.contiguous().view(-1, 3)
        predicted_pos = predicted_pos.contiguous().view(-1, 3)

        drmsd = calc_drmsd(pos, predicted_pos, use_gpu).item()

        drmsd_total += drmsd
        print("DRMSD:", drmsd)

        # Build the JSON payload for the dashboard's /graph endpoint; the
        # training-related fields are zero-filled since no training runs here.
        data = {}
        with open("output/protein_predicted.pdb", "r") as f:
            data["pdb_data_pred"] = f.read()
        with open("output/protein_actual.pdb", "r") as f:
            data["pdb_data_true"] = f.read()
        data["phi_actual"] = [math.degrees(float(v)) for v in angles[1:, 1]]
        data["psi_actual"] = [math.degrees(float(v)) for v in angles[:-1, 2]]
        data["phi_predicted"] = [math.degrees(float(v)) for v in angles_pred[1:, 1]]
        data["psi_predicted"] = [math.degrees(float(v)) for v in angles_pred[:-1, 2]]
        data["validation_dataset_size"] = dataset_size
        data["sample_num"] = [0]
        data["train_loss_values"] = [0]
        data["validation_loss_values"] = [0]
        data["drmsd_avg"] = [drmsd]
        data["rmsd_avg"] = [0]
        res = requests.post('http://localhost:5000/graph', json=data)
        if res.ok:
            print(res.json())
        input("Press Enter to continue...")

    print("Average dRMSD:", drmsd_total / dataset_size)
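
The apply_mask helper is not shown in these excerpts. Judging from its call sites, it presumably keeps only the residues whose mask entry is nonzero; a minimal sketch under that assumption (the real implementation may differ):

def apply_mask(tensor, mask, size=9):
    # Hypothetical sketch: 'size' appears at the call sites (e.g. size=3 for
    # dihedral angles in Example #5) but is not needed for plain row selection.
    return tensor[mask.nonzero().squeeze(dim=1)]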
Example #4
def main():

    hide_ui = True
    if not hide_ui:
        from dashboard import start_dashboard_server
        start_dashboard_server()

    mem_pin = False
    BATCH_SIZE = 32
    epochs = 15000
    curr_ep = 1  # can't be 0, otherwise a division by zero occurs later on!
    learning_rate = 0.001
    use_DRMSD = False
    clip = 15
    encoder_scheduler_on = False

    # Load an existing model?
    load_model = True
    save_name = 'FullTrainexperiment'
    load_name = 'EvenTighterLatentexperiment'  # alternative: 'LRexperiment'

    # NOTE: wrong file for training for now!
    variant = '_trimmed'
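    # The files below are presumably ProteinNet-style HDF5 sets produced by a
    # preprocessing step; '_trimmed' marks a reduced variant.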
    training_file = "data/preprocessed/training_90" + variant + ".hdf5"
    validation_file = "data/preprocessed/validation" + variant + ".hdf5"
    testing_file = "data/preprocessed/testing" + variant + ".hdf5"

    ENCODING_LSTM_OUTPUT = 600
    META_ENCODING_LSTM_OUTPUT = 600
    CODE_LAYER_SIZE = 250
    DECODING_LSTM_OUTPUT = 600
    VOCAB_SIZE = 21  # 20 amino acids and then the padding value too
    ENCODER_LSTM_NUM_LAYERS = 1
    DECODER_LSTM_NUM_LAYERS = 1
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    readout = False
    allow_teacher_force = False
    teaching_strategy = 'epoch'  # can also be 'accuracy'
    want_preds_printed = False

    # This could be made more efficient by preventing padding from being
    # predicted and by rewriting the loss function for the sequences.
    encoder_net = EncoderNet(
        device,
        ENCODING_LSTM_OUTPUT=ENCODING_LSTM_OUTPUT,
        META_ENCODING_LSTM_OUTPUT=META_ENCODING_LSTM_OUTPUT,
        CODE_LAYER_SIZE=CODE_LAYER_SIZE,
        VOCAB_SIZE=VOCAB_SIZE,
        ENCODER_LSTM_NUM_LAYERS=ENCODER_LSTM_NUM_LAYERS).to(device)
    decoder_net = DecoderNet(
        device,
        DECODING_LSTM_OUTPUT=DECODING_LSTM_OUTPUT,
        CODE_LAYER_SIZE=CODE_LAYER_SIZE,
        VOCAB_SIZE=VOCAB_SIZE,
        DECODER_LSTM_NUM_LAYERS=DECODER_LSTM_NUM_LAYERS).to(device)

    # RAdam (Rectified Adam) optimizers for the encoder and decoder halves.
    encoder_optimizer = RAdam(encoder_net.parameters(), lr=learning_rate)
    decoder_optimizer = RAdam(decoder_net.parameters(),
                              lr=learning_rate)  # alternative: optim.Adam

    # Only used if encoder_scheduler_on is True.
    # Watch out for 'min' vs. 'max': with 'max', the LR is reduced when the
    # monitored value stops increasing.
    encoder_scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        encoder_optimizer,
        'min',
        factor=0.9,
        patience=5,
        verbose=True,
        threshold=0.001,
        threshold_mode='abs')
    decoder_scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        decoder_optimizer,
        'min',
        factor=0.9,
        patience=5,
        verbose=True,
        threshold=0.001,
        threshold_mode='abs')

    # Alternative optimizers:
    # encoder_optimizer = optim.SGD(encoder_net.parameters(), lr=learning_rate, momentum=0.9)
    # decoder_optimizer = optim.SGD(decoder_net.parameters(), lr=learning_rate, momentum=0.9)

    # Optional weight initialization / saving, currently disabled:
    # nn.init.xavier_uniform_(w, gain=nn.init.calculate_gain('relu'))
    # for net in [encoder_net, decoder_net]:
    #     net.apply(init_weights)
    #     save_dict = net.apply(save_weights)
    print('All models for this run will be saved under:', save_name)
    if load_model:
        print("LOADING IN A MODEL, load_model=True")
        encoder_net, decoder_net, encoder_optimizer, decoder_optimizer, loss, curr_ep, best_eval_acc = loadModel(
            encoder_net, decoder_net, encoder_optimizer, decoder_optimizer,
            load_name)

    encoder_net.train()
    decoder_net.train()

    fitModel(encoder_net,
             decoder_net,
             encoder_optimizer,
             decoder_optimizer,
             BATCH_SIZE,
             epochs,
             curr_ep,
             learning_rate,
             mem_pin,
             device,
             save_name,
             load_name,
             readout,
             allow_teacher_force,
             teaching_strategy,
             clip,
             want_preds_printed,
             encoder_scheduler,
             decoder_scheduler,
             training_file,
             validation_file,
             testing_file,
             hide_ui,
             encoder_scheduler_on=encoder_scheduler_on,
             use_DRMSD=use_DRMSD)
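
The init_weights helper referenced in the commented-out initialization block is not part of this excerpt; a minimal sketch of what such a helper could look like, following the Xavier hint in the comments (an assumption, not the original implementation):

def init_weights(module):
    # Hypothetical sketch: Xavier-initialize linear layers, as suggested by the
    # commented-out nn.init.xavier_uniform_ call in the example above.
    if isinstance(module, torch.nn.Linear):
        torch.nn.init.xavier_uniform_(module.weight)
        if module.bias is not None:
            torch.nn.init.zeros_(module.bias)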
Example #5
def predict(model_name, prediction_file, use_gpu=False, batch_size=32,
            show_ui=True, loss_atoms="c_alpha"):

    if show_ui:
        print('Starting server...')
        start_dashboard_server()
        time.sleep(5)

    data_file = "data/preprocessed/"+prediction_file+".hdf5"
    model_path = "output/models/" + model_name + ".model"



    portein_loader = contruct_dataloader_from_disk(data_file, batch_size)

    if torch.cuda.is_available():
        write_out("CUDA is available, using GPU")
        model = torch.load(model_path)
        use_gpu = True  # note: overrides the use_gpu argument when CUDA is present

    else:
        model = torch.load(model_path, map_location='cpu')

    model.use_gpu = use_gpu
    model.model.use_gpu = use_gpu

    model = model.eval()

    dataset_size = len(protein_loader.dataset)

    # Initialize all counters needed for the statistics.
    drmsd_total = drmsd_fm = drmsd_tbm = \
        tm_counter = fm_counter = tbm_counter = \
        tm_total = tm_fm = tm_tbm = 0

    use_tm = False
    for minibatch_id, training_minibatch in enumerate(protein_loader, 0):

        print("Predicting next minibatch of size:", batch_size)

        primary_sequence, tertiary_positions, mask, p_id, evolutionary = training_minibatch

        # One-hot encode the amino-acid string; the PSSM values
        # ('evolutionary') are concatenated onto it below.
        input_sequence, batch_sizes = one_hot_encode(primary_sequence, 21, use_gpu)

        
        # Pad the packed PSSM profiles so they align with the one-hot encoding.
        evolutionary, batch_sizes = torch.nn.utils.rnn.pad_packed_sequence(
            torch.nn.utils.rnn.pack_sequence(evolutionary))
        if use_gpu:
            evolutionary = evolutionary.cuda()

        # len(batch_sizes) equals the number of sequences in the minibatch.
        input_sequence = torch.cat(
            (input_sequence, evolutionary.view(-1, len(batch_sizes), 21)), 2)

        predicted_dihedral_angles, predicted_backbone_atoms, batch_sizes = model((input_sequence, batch_sizes))

        # Move the batch dimension first and detach for CPU-side post-processing.
        cpu_predicted_angles = predicted_dihedral_angles.transpose(0, 1).cpu().detach()
        cpu_predicted_backbone_atoms = predicted_backbone_atoms.transpose(0, 1).cpu().detach()

        batch_data = list(zip(primary_sequence, p_id, tertiary_positions,
                              cpu_predicted_backbone_atoms, cpu_predicted_angles,
                              mask))

        for primary_sequence, pid, pos, predicted_pos, predicted_angles, mask in batch_data:
            pos = apply_mask(pos, mask)
            predicted_pos = apply_mask(predicted_pos[:len(primary_sequence)], mask)

            pid = str(pid[0], 'utf-8')
            angles = calculate_dihedral_angels(pos, use_gpu)

            # size=3: three predicted dihedral angles per residue.
            angles_pred = apply_mask(predicted_angles[:len(primary_sequence)], mask, size=3)

            primary_sequence = torch.masked_select(primary_sequence, mask)
            #angles_pred = calculate_dihedral_angels(predicted_pos, use_gpu)
            if show_ui:
                write_to_pdb(get_structure_from_angles(primary_sequence, angles), "actual")
                write_to_pdb(get_structure_from_angles(primary_sequence, angles_pred), "predicted")

            if (not torch.isnan(angles).any()) and use_tm:
                print('TM scores:')
                tmscore = TMscore('tmscore/./TMscore')
                tmscore("output/protein_actual.pdb", "output/protein_predicted.pdb")
                tmscore.print_info()
                print('--- End scores ---')
                tm_counter += 1
                tm_total += tmscore.get_tm_score()
            # predicted_structure = get_structure_from_angles(primary_sequence, angles_pred)
            # actual_structure = get_structure_from_angles(primary_sequence, angles)


            predicted_pos = predicted_pos.contiguous().view(-1, 3)
            pos = pos.contiguous().view(-1, 3)

            drmsd = calc_drmsd(predicted_pos, pos, loss_atoms, use_gpu).item()

            if pid.startswith('FM'):
                print('Free modeling prediction, dRMSD:', drmsd)
                drmsd_fm += drmsd
                fm_counter += 1
                if use_tm:
                    tm_fm += tmscore.get_tm_score()
            elif pid.startswith('TBM'):
                print('Template Based Model prediction, dRMSD:', drmsd)
                drmsd_tbm += drmsd
                tbm_counter += 1
                if use_tm:
                    tm_tbm += tmscore.get_tm_score()
            else:
                print("DRMSD:", drmsd )

            if show_ui:
                # Build the JSON payload for the dashboard's /graph endpoint;
                # the training-related fields are zero-filled here.
                data = {}
                with open("output/protein_predicted.pdb", "r") as f:
                    data["pdb_data_pred"] = f.read()
                with open("output/protein_actual.pdb", "r") as f:
                    data["pdb_data_true"] = f.read()
                data["phi_actual"] = [math.degrees(float(v)) for v in angles[1:, 1]]
                data["psi_actual"] = [math.degrees(float(v)) for v in angles[:-1, 2]]
                data["phi_predicted"] = [math.degrees(float(v)) for v in angles_pred[1:, 0]]
                data["psi_predicted"] = [math.degrees(float(v)) for v in angles_pred[:-1, 1]]
                data["validation_dataset_size"] = dataset_size
                data["sample_num"] = [0]
                data["train_loss_values"] = [0]
                data["validation_loss_values"] = [0]
                data["drmsd_avg"] = [drmsd]
                data["rmsd_avg"] = [0]

                res = requests.post('http://localhost:5000/graph', json=data)
                if res.ok:
                    print(res.json())
                input("Press Enter to continue...")

            drmsd_total += drmsd
            print("Evaluating next prediction")


    if fm_counter > 0:
        print('---Results from free-modeling---')
        print('Average dRMSD:', drmsd_fm / fm_counter)
        if use_tm:
            print("Average TM-score:", tm_fm/ fm_counter)
    
    
    if tbm_counter > 0:
        print('---Results from TBM---')
        print('Average dRMSD:', drmsd_tbm / tbm_counter)
        if use_tm:
            print("Average TM-score:", tm_tbm/ tbm_counter)
    
        
    
    print('---Overall results---')
    print("Average drmsd:", drmsd_total / dataset_size)
    if use_tm:
        print("Average TM-score:", tm_total/ tm_counter)
    
    print("No more proteins in file")