def main():
    """Command-line entry point: parse options, then launch the selected experiment.

    Exits with status 1 if --use-gpu is requested but CUDA is unavailable.
    Unknown arguments are left for the experiment module to consume via the
    parser object passed to run_experiment().
    """
    arg_parser = argparse.ArgumentParser(description="OpenProtein version 0.1")
    arg_parser.add_argument('--silent', dest='silent', action='store_true',
                            help='Dont print verbose debug statements.')
    arg_parser.add_argument('--hide-ui', dest='hide_ui', action='store_true',
                            default=False,
                            help='Hide loss graph and '
                                 'visualization UI while training goes on.')
    arg_parser.add_argument('--evaluate-on-test', dest='evaluate_on_test',
                            action='store_true', default=False,
                            help='Run model of test data.')
    arg_parser.add_argument('--use-gpu', dest='use_gpu', action='store_true',
                            default=False, help='Use GPU.')
    arg_parser.add_argument('--eval-interval', dest='eval_interval', type=int,
                            default=10,
                            help='Evaluate model on validation set every n minibatches.')
    arg_parser.add_argument('--min-updates', dest='minimum_updates', type=int,
                            default=100,
                            help='Minimum number of minibatch iterations.')
    arg_parser.add_argument('--minibatch-size', dest='minibatch_size', type=int,
                            default=8, help='Size of each minibatch.')
    arg_parser.add_argument('--experiment-id', dest='experiment_id', type=str,
                            default="example", help='Which experiment to run.')

    # parse_known_args: experiment modules may define extra flags of their own.
    args, _ = arg_parser.parse_known_args()

    if args.hide_ui:
        write_out("Live plot deactivated, see output folder for plot.")

    wants_gpu = args.use_gpu
    if wants_gpu and not torch.cuda.is_available():
        write_out("Error: --use-gpu was set, but no GPU is available.")
        sys.exit(1)

    if not args.hide_ui:
        # Launch the live-plot web dashboard before training starts.
        start_dashboard_server()

    # Experiments live in the `experiments` package, selected by id.
    experiment = importlib.import_module("experiments." + args.experiment_id)
    experiment.run_experiment(arg_parser, wants_gpu)
parser.add_argument('--minibatch-size', dest = 'minibatch_size', type=int, default=1, help='Size of each minibatch.') parser.add_argument('--learning-rate', dest = 'learning_rate', type=float, default=0.01, help='Learning rate to use during training.') args, unknown = parser.parse_known_args() if args.hide_ui: write_out("Live plot deactivated, see output folder for plot.") use_gpu = False if torch.cuda.is_available(): write_out("CUDA is available, using GPU") use_gpu = True # start web server start_dashboard_server() process_raw_data(use_gpu, force_pre_processing_overwrite=False) training_file = "data/preprocessed/sample.txt.hdf5" validation_file = "data/preprocessed/sample.txt.hdf5" testing_file = "data/preprocessed/testing.hdf5" model = ExampleModel(21, args.minibatch_size, use_gpu=use_gpu) # embed size = 21 train_loader = contruct_dataloader_from_disk(training_file, args.minibatch_size) validation_loader = contruct_dataloader_from_disk(validation_file, args.minibatch_size) train_model_path = train_model(data_set_identifier="TRAIN", model=model, train_loader=train_loader,
def run():
    """Evaluate pre-computed tertiary structures against ground truth.

    Starts the dashboard server, then for each protein in the validation set
    looks up its externally predicted structure (from parse_tertiary()),
    computes dihedral angles and dRMSD against the true positions, writes
    both structures as PDB files, and pushes per-protein results to the
    local dashboard. Finally prints the mean dRMSD over the dataset.
    """
    print('Starting server...')
    start_dashboard_server()
    time.sleep(5)  # give the dashboard server time to come up

    data_file = "data/preprocessed/validation_100.hdf5"
    protein_loader = contruct_dataloader_from_disk(data_file, 1, with_id=True)
    dataset_size = len(protein_loader.dataset)

    use_gpu = False
    if torch.cuda.is_available():
        write_out("CUDA is available, using GPU")
        use_gpu = True

    drmsd_total = 0
    p_id_to_structure = parse_tertiary()
    for minibatch_id, training_minibatch in enumerate(protein_loader, 0):
        print("Predicting next protein")
        primary_sequence, tertiary_positions, mask, p_ids = training_minibatch
        p_id = str(p_ids[0][0], 'utf-8')

        predicted_pos = torch.Tensor(p_id_to_structure[p_id])
        predicted_pos = predicted_pos / 100  # to angstrom units

        # Restrict everything to the (unpadded) length of the prediction and
        # drop masked-out residues so actual/predicted line up residue-wise.
        pos = tertiary_positions[0]
        mask = mask[0][:len(predicted_pos)]
        pos = apply_mask(pos, mask)
        predicted_pos = predicted_pos[mask.nonzero()].squeeze(dim=1)
        primary_sequence = torch.masked_select(primary_sequence[0], mask)

        angles = calculate_dihedral_angels(pos, use_gpu)
        angles_pred = calculate_dihedral_angels(predicted_pos, use_gpu)
        write_to_pdb(get_structure_from_angles(primary_sequence, angles), "actual")
        write_to_pdb(get_structure_from_angles(primary_sequence, angles_pred), "predicted")

        pos = pos.contiguous().view(-1, 3)
        predicted_pos = predicted_pos.contiguous().view(-1, 3)
        drmsd = calc_drmsd(pos, predicted_pos, use_gpu).item()
        drmsd_total += drmsd
        print("DRMSD:", drmsd)

        # Fix: close the PDB files we just wrote instead of leaking handles.
        with open("output/protein_predicted.pdb", "r") as pdb_file:
            pdb_data_pred = pdb_file.read()
        with open("output/protein_actual.pdb", "r") as pdb_file:
            pdb_data_true = pdb_file.read()

        # Push this protein's results to the dashboard for visualization.
        # Angle tensor columns appear to be 1 = phi, 2 = psi
        # (per calculate_dihedral_angels) -- TODO confirm.
        data = {
            "pdb_data_pred": pdb_data_pred,
            "pdb_data_true": pdb_data_true,
            "phi_actual": [math.degrees(float(v)) for v in angles[1:, 1]],
            "psi_actual": [math.degrees(float(v)) for v in angles[:-1, 2]],
            "phi_predicted": [math.degrees(float(v)) for v in angles_pred[1:, 1]],
            "psi_predicted": [math.degrees(float(v)) for v in angles_pred[:-1, 2]],
            "validation_dataset_size": dataset_size,
            "sample_num": [0],
            "train_loss_values": [0],
            "validation_loss_values": [0],
            "drmsd_avg": [drmsd],
            "rmsd_avg": [0],
        }
        res = requests.post('http://localhost:5000/graph', json=data)
        if res.ok:
            print(res.json())
        input("Press Enter to continue...")
    print(drmsd_total / dataset_size)
def main():
    """Configure and train the encoder/decoder autoencoder.

    Builds the networks, optimizers and LR schedulers from the hyperparameters
    below, optionally resumes from a previously saved model, then hands
    everything to fitModel() for the training loop.
    """
    hide_ui = True
    if not hide_ui:
        from dashboard import start_dashboard_server
        start_dashboard_server()

    # --- Training hyperparameters -----------------------------------------
    mem_pin = False
    BATCH_SIZE = 32
    epochs = 15000
    curr_ep = 1  # must start at 1: a value of 0 causes a division by zero later
    learning_rate = 0.001
    use_DRMSD = False
    clip = 15  # gradient clipping threshold
    encoder_scheduler_on = False  # enable the ReduceLROnPlateau schedulers below

    # --- Checkpointing ----------------------------------------------------
    load_model = True  # resume from an existing checkpoint?
    save_name = 'FullTrainexperiment'
    load_name = 'EvenTighterLatentexperiment'

    # --- Data files (NOTE: trimmed variant is in use for now) -------------
    variant = '_trimmed'
    training_file = f"data/preprocessed/training_90{variant}.hdf5"
    validation_file = f"data/preprocessed/validation{variant}.hdf5"
    testing_file = f"data/preprocessed/testing{variant}.hdf5"

    # --- Model architecture -----------------------------------------------
    ENCODING_LSTM_OUTPUT = 600
    META_ENCODING_LSTM_OUTPUT = 600
    CODE_LAYER_SIZE = 250
    DECODING_LSTM_OUTPUT = 600
    VOCAB_SIZE = 21  # 20 amino acids plus the padding value
    ENCODER_LSTM_NUM_LAYERS = 1
    DECODER_LSTM_NUM_LAYERS = 1

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # --- Training behavior flags ------------------------------------------
    readout = False
    allow_teacher_force = False
    teaching_strategy = 'epoch'  # can also be 'accuracy'
    want_preds_printed = False

    encoder_net = EncoderNet(
        device,
        ENCODING_LSTM_OUTPUT=ENCODING_LSTM_OUTPUT,
        META_ENCODING_LSTM_OUTPUT=META_ENCODING_LSTM_OUTPUT,
        CODE_LAYER_SIZE=CODE_LAYER_SIZE,
        VOCAB_SIZE=VOCAB_SIZE,
        ENCODER_LSTM_NUM_LAYERS=ENCODER_LSTM_NUM_LAYERS).to(device)
    decoder_net = DecoderNet(
        device,
        DECODING_LSTM_OUTPUT=DECODING_LSTM_OUTPUT,
        CODE_LAYER_SIZE=CODE_LAYER_SIZE,
        VOCAB_SIZE=VOCAB_SIZE,
        DECODER_LSTM_NUM_LAYERS=DECODER_LSTM_NUM_LAYERS).to(device)

    encoder_optimizer = RAdam(encoder_net.parameters(), lr=learning_rate)
    decoder_optimizer = RAdam(decoder_net.parameters(), lr=learning_rate)

    # Only used when encoder_scheduler_on=True. Mode 'min' reduces the LR
    # when the monitored value stops DECREASING (beware 'min' vs 'max').
    encoder_scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        encoder_optimizer, 'min', factor=0.9, patience=5, verbose=True,
        threshold=0.001, threshold_mode='abs')
    decoder_scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        decoder_optimizer, 'min', factor=0.9, patience=5, verbose=True,
        threshold=0.001, threshold_mode='abs')

    print(f'All models for this run will be saved under: {save_name}')

    if load_model:
        print("LOADING IN A MODEL, load_model=True")
        # _loss and _best_eval_acc are returned by the checkpoint but not
        # needed here; training state resumes from curr_ep.
        (encoder_net, decoder_net, encoder_optimizer, decoder_optimizer,
         _loss, curr_ep, _best_eval_acc) = loadModel(
            encoder_net, decoder_net, encoder_optimizer, decoder_optimizer,
            load_name)
        encoder_net.train()
        decoder_net.train()

    fitModel(encoder_net, decoder_net, encoder_optimizer, decoder_optimizer,
             BATCH_SIZE, epochs, curr_ep, learning_rate, mem_pin, device,
             save_name, load_name, readout, allow_teacher_force,
             teaching_strategy, clip, want_preds_printed, encoder_scheduler,
             decoder_scheduler, training_file, validation_file, testing_file,
             hide_ui, encoder_scheduler_on=encoder_scheduler_on,
             use_DRMSD=use_DRMSD)
def predict(model_name, prediction_file, use_gpu=False, batch_size=32,
            show_ui=True, loss_atoms="c_alpha"):
    """Run a saved model over a preprocessed dataset and report dRMSD stats.

    Results are accumulated separately for CASP free-modeling ("FM...") and
    template-based-modeling ("TBM...") targets, identified by protein-id
    prefix, and averages are printed at the end.

    Args:
        model_name: model file name under output/models/ (without ".model").
        prediction_file: dataset name under data/preprocessed/ (without ".hdf5").
        use_gpu: NOTE -- overridden to True whenever CUDA is available,
            regardless of the value passed in.
        batch_size: minibatch size for the data loader.
        show_ui: start the dashboard server and push per-protein results to it,
            pausing for Enter between proteins.
        loss_atoms: atom selection passed to calc_drmsd (e.g. "c_alpha").
    """
    if show_ui:
        print('Starting server...')
        start_dashboard_server()
        time.sleep(5)  # give the dashboard server time to come up

    data_file = "data/preprocessed/" + prediction_file + ".hdf5"
    model_path = "output/models/" + model_name + ".model"
    # Fix: variable renamed from the original "portein_loader" typo.
    protein_loader = contruct_dataloader_from_disk(data_file, batch_size)

    if torch.cuda.is_available():
        write_out("CUDA is available, using GPU")
        model = torch.load(model_path)
        use_gpu = True
    else:
        model = torch.load(model_path, map_location='cpu')
    model.use_gpu = use_gpu
    model.model.use_gpu = use_gpu
    model = model.eval()

    dataset_size = len(protein_loader.dataset)

    # Accumulators, split by target category (FM = free modeling,
    # TBM = template-based modeling).
    drmsd_total = drmsd_fm = drmsd_tbm = \
        tm_counter = fm_counter = tbm_counter = \
        tm_total = tm_fm = tm_tbm = 0
    use_tm = False  # set True to also score with the external TMscore binary

    for minibatch_id, training_minibatch in enumerate(protein_loader, 0):
        print("Predicting next minibatch of size:", batch_size)
        primary_sequence, tertiary_positions, mask, p_id, evolutionary = training_minibatch

        # One-hot encode the amino acid string and concatenate PSSM values.
        input_sequence, batch_sizes = one_hot_encode(primary_sequence, 21, use_gpu)
        evolutionary, batch_sizes = torch.nn.utils.rnn.pad_packed_sequence(
            torch.nn.utils.rnn.pack_sequence(evolutionary))
        if use_gpu:
            evolutionary = evolutionary.cuda()
        input_sequence = torch.cat(
            (input_sequence, evolutionary.view(-1, len(batch_sizes), 21)), 2)

        predicted_dihedral_angles, predicted_backbone_atoms, batch_sizes = \
            model((input_sequence, batch_sizes))
        cpu_predicted_angles = predicted_dihedral_angles.transpose(0, 1).cpu().detach()
        cpu_predicted_backbone_atoms = predicted_backbone_atoms.transpose(0, 1).cpu().detach()

        batch_data = list(zip(primary_sequence, p_id, tertiary_positions,
                              cpu_predicted_backbone_atoms, cpu_predicted_angles,
                              mask))
        # Fix: loop variables renamed (sequence, seq_mask) so they no longer
        # shadow the minibatch-level tensors unpacked above.
        for sequence, pid, pos, predicted_pos, predicted_angles, seq_mask in batch_data:
            # Drop padding and masked-out residues from both structures.
            pos = apply_mask(pos, seq_mask)
            predicted_pos = apply_mask(predicted_pos[:len(sequence)], seq_mask)
            pid = str(pid[0], 'utf-8')
            angles = calculate_dihedral_angels(pos, use_gpu)
            angles_pred = apply_mask(predicted_angles[:len(sequence)], seq_mask, size=3)
            sequence = torch.masked_select(sequence, seq_mask)

            if show_ui:
                write_to_pdb(get_structure_from_angles(sequence, angles), "actual")
                write_to_pdb(get_structure_from_angles(sequence, angles_pred), "predicted")

            if (not torch.isnan(angles).any()) and use_tm:
                print('TM scores:')
                tmscore = TMscore('tmscore/./TMscore')
                tmscore("output/protein_actual.pdb", "output/protein_predicted.pdb")
                tmscore.print_info()
                print('--- End scores ---')
                tm_counter += 1
                tm_total += tmscore.get_tm_score()

            predicted_pos = predicted_pos.contiguous().view(-1, 3)
            pos = pos.contiguous().view(-1, 3)
            drmsd = calc_drmsd(predicted_pos, pos, loss_atoms, use_gpu).item()

            if pid.startswith('FM'):
                # Fix: this string literal was split across two lines in the
                # original source (a syntax error); reconstructed here.
                print('Free modeling prediction, dRMSD:', drmsd)
                drmsd_fm += drmsd
                fm_counter += 1
                if use_tm:
                    tm_fm += tmscore.get_tm_score()
            elif pid.startswith('TBM'):
                print('Template Based Model prediction, dRMSD:', drmsd)
                drmsd_tbm += drmsd
                tbm_counter += 1
                if use_tm:
                    tm_tbm += tmscore.get_tm_score()
            else:
                print("DRMSD:", drmsd)

            if show_ui:
                # Fix: close the PDB files instead of leaking handles.
                with open("output/protein_predicted.pdb", "r") as pdb_file:
                    pdb_data_pred = pdb_file.read()
                with open("output/protein_actual.pdb", "r") as pdb_file:
                    pdb_data_true = pdb_file.read()
                # Predicted angles are read from columns 0/1 while actual
                # angles use columns 1/2; presumably the model's angle output
                # has a different column layout -- TODO confirm.
                data = {
                    "pdb_data_pred": pdb_data_pred,
                    "pdb_data_true": pdb_data_true,
                    "phi_actual": [math.degrees(float(v)) for v in angles[1:, 1]],
                    "psi_actual": [math.degrees(float(v)) for v in angles[:-1, 2]],
                    "phi_predicted": [math.degrees(float(v)) for v in angles_pred[1:, 0]],
                    "psi_predicted": [math.degrees(float(v)) for v in angles_pred[:-1, 1]],
                    "validation_dataset_size": dataset_size,
                    "sample_num": [0],
                    "train_loss_values": [0],
                    "validation_loss_values": [0],
                    "drmsd_avg": [drmsd],
                    "rmsd_avg": [0],
                }
                res = requests.post('http://localhost:5000/graph', json=data)
                if res.ok:
                    print(res.json())
                input("Press Enter to continue...")

            drmsd_total += drmsd
            print("Evaluating next prediction")

    if fm_counter > 0:
        print('---Results from free-modeling---')
        print('Average dRMSD:', drmsd_fm / fm_counter)
        if use_tm:
            print("Average TM-score:", tm_fm / fm_counter)
    if tbm_counter > 0:
        print('---Results from TBM---')
        print('Average dRMSD:', drmsd_tbm / tbm_counter)
        if use_tm:
            print("Average TM-score:", tm_tbm / tbm_counter)
    print('---Overall results---')
    print("Average drmsd:", drmsd_total / dataset_size)
    if use_tm:
        print("Average TM-score:", tm_total / tm_counter)
    print("No more proteins in file")