def run_experiment(parser, use_gpu):
    """Train an RGN or UTGN model for each entry in ``data_paths``.

    Args:
        parser: argparse parser shared by the experiment framework; expects
            ``skenario``, ``minibatch_size``, ``eval_interval``, ``hide_ui``,
            ``optimizer_type``, ``restart`` and ``minimum_updates`` to be
            registered elsewhere before parsing.
        use_gpu: whether training should run on the GPU.
    """
    # parse experiment specific command line arguments
    parser.add_argument('--learning-rate',
                        dest='learning_rate',
                        type=float,
                        default=0.001,
                        help='Learning rate to use during training.')
    args, _unknown = parser.parse_known_args()

    # pre-process data
    process_raw_data(use_gpu, force_pre_processing_overwrite=False)

    start_compute_grad = time.time()
    # NOTE(review): `data_paths` must be defined at module level — confirm.
    for path in data_paths:
        # run experiment
        training_file = "data/preprocessed/training_" + path + ".hdf5"
        validation_file = "data/preprocessed/validation.hdf5"

        # BUG FIX: the original used `is` (identity) to compare integers,
        # e.g. `args.skenario is 1`. That only works because CPython caches
        # small ints and is flagged as a SyntaxWarning on modern Pythons;
        # `==` is the correct equality test.
        if args.skenario == 1:
            model = RGN(embedding_size=42,
                        use_gpu=use_gpu,
                        minibatch_size=args.minibatch_size,
                        pretraining=-1)
        elif args.skenario == 2:
            model = UTGN(embedding_size=42,
                         use_gpu=use_gpu,
                         batch_size=args.minibatch_size,
                         pretraining=-1)
        elif args.skenario == 3:
            model = RGN(embedding_size=768 + 21,
                        use_gpu=use_gpu,
                        minibatch_size=args.minibatch_size,
                        use_pssm=False,
                        use_token=True)
        elif args.skenario == 4:
            model = UTGN(embedding_size=768 + 21,
                         use_gpu=use_gpu,
                         batch_size=args.minibatch_size,
                         use_pssm=False,
                         use_token=True)
        elif args.skenario == 5:
            model = RGN(embedding_size=21,
                        use_gpu=use_gpu,
                        minibatch_size=args.minibatch_size,
                        pretraining=-1,
                        use_pssm=False)
        elif args.skenario == 6:
            model = UTGN(embedding_size=21,
                         use_gpu=use_gpu,
                         batch_size=args.minibatch_size,
                         pretraining=-1,
                         use_pssm=False)
        elif args.skenario == 7:
            model = RGN(embedding_size=768,
                        use_gpu=use_gpu,
                        minibatch_size=args.minibatch_size,
                        use_aa=False,
                        use_pssm=False,
                        use_token=True)
        elif args.skenario == 8:
            model = UTGN(embedding_size=768,
                         use_gpu=use_gpu,
                         batch_size=args.minibatch_size,
                         use_aa=False,
                         use_pssm=False,
                         use_token=True)
        else:
            # Robustness: previously an unknown scenario left `model` unbound
            # and crashed later with UnboundLocalError; fail fast instead.
            raise ValueError("Unknown skenario: {0}".format(args.skenario))

        train_loader = contruct_dataloader_from_disk(training_file,
                                                     args.minibatch_size)
        validation_loader = contruct_dataloader_from_disk(validation_file,
                                                          args.minibatch_size)

        identifier = "skenario{0}".format(args.skenario)
        train_model_path = train_model(
            data_set_identifier=identifier,
            model=model,
            train_loader=train_loader,
            validation_loader=validation_loader,
            learning_rate=args.learning_rate,
            minibatch_size=args.minibatch_size,
            eval_interval=args.eval_interval,
            hide_ui=args.hide_ui,
            use_gpu=use_gpu,
            optimizer_type=args.optimizer_type,
            restart=args.restart,
            minimum_updates=args.minimum_updates)

        print(train_model_path)
        end = time.time()
        print("Training time:", end - start_compute_grad)
def process_new_data(newData: pd.DataFrame) -> pd.DataFrame:
    """Run raw preprocessing on *newData* and attach subject/state labels.

    Returns the processed frame with a per-row ``alg_state`` computed by
    the OpisenseAlg state machine.
    """
    # NOTE(review): dropping the 'time' column was disabled in the original
    # (`#newData.pop('time')`); kept disabled here.
    processed = pp.process_raw_data(newData.to_numpy())
    processed['subject'] = 'kenji'
    processed['state'] = 'rest'
    processed['alg_state'] = processed.apply(OpisenseAlg().determine_state,
                                             axis=1)
    return processed
def run_experiment(parser, use_gpu):
    """Train the ExampleModel on the bundled single-protein dataset.

    Args:
        parser: argparse parser shared by the framework; ``eval_interval``
            and ``hide_ui`` are expected to be registered elsewhere.
        use_gpu: whether training should run on the GPU.
    """
    # parse experiment specific command line arguments
    parser.add_argument('--learning-rate',
                        dest='learning_rate',
                        type=float,
                        default=0.01,
                        help='Learning rate to use during training.')
    parser.add_argument('--min-updates',
                        dest='minimum_updates',
                        type=int,
                        default=1000,
                        help='Minimum number of minibatch iterations.')
    parser.add_argument('--minibatch-size',
                        dest='minibatch_size',
                        type=int,
                        default=1,
                        help='Size of each minibatch.')
    args, _unknown = parser.parse_known_args()

    # pre-process data
    process_raw_data(use_gpu, force_pre_processing_overwrite=False)

    # run experiment — the same file serves as training and validation set
    dataset_path = "data/preprocessed/single_protein.txt.hdf5"
    training_file = dataset_path
    validation_file = dataset_path

    model = ExampleModel(21, args.minibatch_size, use_gpu=use_gpu)  # embed size = 21

    train_loader = contruct_dataloader_from_disk(training_file,
                                                 args.minibatch_size)
    validation_loader = contruct_dataloader_from_disk(validation_file,
                                                      args.minibatch_size)

    train_model_path = train_model(data_set_identifier="TRAIN",
                                   model=model,
                                   train_loader=train_loader,
                                   validation_loader=validation_loader,
                                   learning_rate=args.learning_rate,
                                   minibatch_size=args.minibatch_size,
                                   eval_interval=args.eval_interval,
                                   hide_ui=args.hide_ui,
                                   use_gpu=use_gpu,
                                   minimum_updates=args.minimum_updates)

    print("Completed training, trained model stored at:")
    print(train_model_path)
def test():
    """End-to-end smoke test: preprocess, train via main(), export to ONNX."""
    process_raw_data(False,
                     raw_data_path="tests/data/raw/*",
                     force_pre_processing_overwrite=True)

    # NOTE(review): the original kept a large disabled block here (as a bare
    # triple-quoted string, a no-op) that compared original vs transformed
    # coordinates: it resized `pos`/`tertiary` tensors into (N*3, 3) arrays,
    # wrote them as PDBs, ran Bio.PDB's SVDSuperimposer to get an RMS, and
    # rebuilt structures from dihedral angles via calculate_dihedral_angles /
    # get_structure_from_angles. It is intentionally not executed.

    # Drive main() as if invoked from the command line with a tiny run.
    sys.argv = [
        "__main__.py",
        "--min-updates", "1",
        "--eval-interval", "1",
        "--experiment-id", "rrn",
        "--hide-ui",
        "--file", "data/preprocessed/testfile.txt.hdf5",
    ]
    main()

    # Ensure a stale export does not mask a failed run.
    path_to_onnx_file = './tests/output/openprotein.onnx'
    if os.path.exists(path_to_onnx_file):
        os.remove(path_to_onnx_file)

    exporter = subprocess.Popen(
        ["pipenv", "run", "python", "./tests/onnx_export.py"])
    # NOTE(review): Popen was created without stdout/stderr pipes, so
    # communicate() yields (None, None); the print below mirrors the original.
    stdout, stderr = exporter.communicate()
    print(stdout, stderr)

    assert exporter.returncode == 0
    assert os.path.exists(path_to_onnx_file)
def main():
    """Entry point: parse CLI flags and run raw-data preprocessing."""
    parser = argparse.ArgumentParser(description="OpenProtein version 0.1")
    # BUG FIX: the original used dest='no_force_pre_processing_overwrite'
    # but later read args.force_pre_processing_overwrite, which raised
    # AttributeError on every run. The flag name is unchanged; with
    # store_false + default=True, passing the flag disables the overwrite.
    parser.add_argument('--no_force_pre_processing_overwrite',
                        dest='force_pre_processing_overwrite',
                        action='store_false',
                        help='Force overwrite existing preprocessed files',
                        default=True)
    args, _unknown = parser.parse_known_args()

    use_gpu = False  # original misspelled this local as 'uge_gpu'
    if torch.cuda.is_available():
        write_out("CUDA is available, using GPU")
        use_gpu = True

    process_raw_data(
        use_gpu,
        force_pre_processing_overwrite=args.force_pre_processing_overwrite)
def run_experiment(parser, use_gpu):
    """Train MyModel on a user-selected preprocessed input file.

    Args:
        parser: argparse parser shared by the framework; ``minibatch_size``,
            ``eval_interval``, ``hide_ui`` and ``minimum_updates`` are
            expected to be registered elsewhere.
        use_gpu: whether training should run on the GPU.
    """
    # parse experiment specific command line arguments
    parser.add_argument('--learning-rate',
                        dest='learning_rate',
                        type=float,
                        default=0.01,
                        help='Learning rate to use during training.')
    parser.add_argument(
        '--input-file',
        dest='input_file',
        type=str,
        default='data/preprocessed/protein_net_testfile.txt.hdf5')
    args, _unknown = parser.parse_known_args()

    # pre-process data
    process_raw_data(use_gpu, force_pre_processing_overwrite=False)

    # run experiment — one file doubles as training and validation set
    training_file = args.input_file
    validation_file = args.input_file

    model = MyModel(21, use_gpu=use_gpu)  # embed size = 21

    train_loader = contruct_dataloader_from_disk(training_file,
                                                 args.minibatch_size)
    validation_loader = contruct_dataloader_from_disk(validation_file,
                                                      args.minibatch_size)

    train_model_path = train_model(data_set_identifier="TRAIN",
                                   model=model,
                                   train_loader=train_loader,
                                   validation_loader=validation_loader,
                                   learning_rate=args.learning_rate,
                                   minibatch_size=args.minibatch_size,
                                   eval_interval=args.eval_interval,
                                   hide_ui=args.hide_ui,
                                   use_gpu=use_gpu,
                                   minimum_updates=args.minimum_updates)

    print("Completed training, trained model stored at:")
    print(train_model_path)
def initialize(self):
    """Split the raw data into train/test portions and fit the count vectorizer.

    Sets ``self.training_data``, ``self.test_data_files``, ``self.counts``
    and flips ``self.initialized`` on success; prints a message and returns
    early when ``starting_pos``/``train_data_percentage`` are inconsistent.
    """
    max_training_datapoints = int(
        self.total_entries * (self.train_data_percentage / 100))

    # Guard clause: the requested start offset must leave room for the
    # training slice inside the available entries.
    if self.total_entries - max_training_datapoints < self.starting_pos:
        print('incorrect parameters provided check starting_pos and training_percentage')
        return

    split = preprocessing.process_raw_data(
        max_training_datapoints=max_training_datapoints,
        starting_pos=self.starting_pos)
    self.training_data = split[0]
    self.test_data_files = split[1]

    self.counts = self.count_vectorizer.fit_transform(
        self.training_data['text'].values)
    self.initialized = True
def run_experiment(parser, use_gpu):
    """Train the ExampleModel on the bundled train/test sample files.

    Args:
        parser: argparse parser shared by the framework; ``minibatch_size``,
            ``eval_interval``, ``hide_ui`` and ``minimum_updates`` are
            expected to be registered elsewhere.
        use_gpu: whether training should run on the GPU.
    """
    # parse experiment specific command line arguments
    parser.add_argument('--learning-rate',
                        dest='learning_rate',
                        type=float,
                        default=0.01,
                        help='Learning rate to use during training.')
    args, _unknown = parser.parse_known_args()

    # pre-process data
    process_raw_data(use_gpu,
                     raw_data_root="data/raw/*",
                     force_pre_processing_overwrite=False)

    # run experiment
    training_file = "data/preprocessed/train_sample.txt.hdf5"
    validation_file = "data/preprocessed/test_sample.txt.hdf5"

    model = ExampleModel(21, args.minibatch_size, use_gpu=use_gpu)  # embed size = 21

    train_loader = contruct_dataloader_from_disk(training_file,
                                                 args.minibatch_size)
    validation_loader = contruct_dataloader_from_disk(validation_file,
                                                      args.minibatch_size)

    train_model_path = train_model(data_set_identifier="TRAIN",
                                   model=model,
                                   train_loader=train_loader,
                                   validation_loader=validation_loader,
                                   learning_rate=args.learning_rate,
                                   minibatch_size=args.minibatch_size,
                                   eval_interval=args.eval_interval,
                                   hide_ui=args.hide_ui,
                                   use_gpu=use_gpu,
                                   minimum_updates=args.minimum_updates)

    print(train_model_path)
# NOTE(review): this chunk is the tail of a script's main section; `parser`,
# `write_out`, `torch`, `start_dashboard_server`, `process_raw_data`,
# `ExampleModel`, `contruct_dataloader_from_disk` and `train_model` must be
# defined/imported earlier in the file.
parser.add_argument('--learning-rate', dest = 'learning_rate', type=float, default=0.01, help='Learning rate to use during training.')
args, unknown = parser.parse_known_args()
if args.hide_ui:
    write_out("Live plot deactivated, see output folder for plot.")
# Enable GPU training only when CUDA is actually available.
use_gpu = False
if torch.cuda.is_available():
    write_out("CUDA is available, using GPU")
    use_gpu = True
# start web server
start_dashboard_server()
process_raw_data(use_gpu, force_pre_processing_overwrite=False)
# The same sample file doubles as training and validation data here.
training_file = "data/preprocessed/sample.txt.hdf5"
validation_file = "data/preprocessed/sample.txt.hdf5"
testing_file = "data/preprocessed/testing.hdf5"
model = ExampleModel(21, args.minibatch_size, use_gpu=use_gpu)  # embed size = 21
train_loader = contruct_dataloader_from_disk(training_file, args.minibatch_size)
validation_loader = contruct_dataloader_from_disk(validation_file, args.minibatch_size)
# NOTE(review): the train_model(...) call below is truncated in this view —
# its remaining keyword arguments continue beyond this chunk.
train_model_path = train_model(data_set_identifier="TRAIN", model=model, train_loader=train_loader, validation_loader=validation_loader, learning_rate=args.learning_rate,
# NOTE(review): this chunk is a mid-script fragment; `args`, `use_gpu`,
# `write_out`, `RunConfig`, `start_dashboard_server`, `process_raw_data`,
# `set_experiment_id` and `contruct_dataloader_from_disk` must be defined or
# imported earlier in the file.
config_file = "configurations/" + args.config + ".config"
write_out('Using config: %s' % config_file)
configs = RunConfig(config_file)
if configs.run_params["hide_ui"]:
    write_out("Live plot deactivated, see output folder for plot.")
max_seq_length, use_evolutionary, n_proteins = configs.run_params["max_sequence_length"], configs.run_params["use_evolutionary"], configs.run_params["n_proteins"]
# start web server
if not configs.run_params["hide_ui"]:
    start_dashboard_server()
process_raw_data(use_gpu, n_proteins=n_proteins, max_sequence_length=max_seq_length, force_pre_processing_overwrite=False)
# Preprocessed file names embed the max sequence length used above.
datafolder = "data/preprocessed/"
training_file = datafolder + configs.run_params["training_file"] + "_" + str(max_seq_length) + ".hdf5"
validation_file = datafolder + configs.run_params["validation_file"] + "_" + str(max_seq_length) + ".hdf5"
testing_file = datafolder + configs.run_params["testing_file"] + "_" + str(max_seq_length) + ".hdf5"

def train_model(data_set_identifier, train_file, val_file, learning_rate, minibatch_size, name):
    # Register this run's identifier/hyperparameters with the experiment tracker.
    set_experiment_id(data_set_identifier, learning_rate, minibatch_size, name)
    train_loader = contruct_dataloader_from_disk(train_file, minibatch_size, use_evolutionary=True)
    validation_loader = contruct_dataloader_from_disk(val_file, minibatch_size, use_evolutionary=True)
    validation_dataset_size = validation_loader.dataset.__len__()
    # NOTE(review): the body of train_model is truncated here — the training
    # loop presumably continues beyond this chunk.
    train_dataset_size = train_loader.dataset.__len__()
# NOTE(review): mid-script fragment; `parser`, `write_out`, `torch`, `cmd`,
# `process_raw_data`, `set_experiment_id` and `contruct_dataloader_from_disk`
# must be defined or imported earlier in the file.
parser.add_argument('--learning-rate', dest='learning_rate', type=float, default=0.01, help='Learning rate to use during training.')
args, unknown = parser.parse_known_args()
if not args.live_plot:
    write_out("Live plot deactivated, see output folder for plot.")
# Enable GPU training only when CUDA is actually available.
use_gpu = False
if torch.cuda.is_available():
    write_out("CUDA is available, using GPU")
    use_gpu = True
process_raw_data(force_pre_processing_overwrite=False)
# The same single-protein file doubles as training and validation data.
training_file = "data/preprocessed/single.txt.hdf5"
validation_file = "data/preprocessed/single.txt.hdf5"
testing_file = "data/preprocessed/testing.hdf5"
# presumably a PyMOL rocking-camera command — verify against surrounding code
cmd.rock()

def train_model(data_set_identifier, train_file, val_file, learning_rate, minibatch_size):
    # Register this run's identifier/hyperparameters with the experiment tracker.
    set_experiment_id(data_set_identifier, learning_rate, minibatch_size)
    train_loader = contruct_dataloader_from_disk(train_file, minibatch_size)
    validation_loader = contruct_dataloader_from_disk(val_file, minibatch_size)
    # NOTE(review): the body of train_model is truncated here — it presumably
    # continues beyond this chunk.
    validation_dataset_size = validation_loader.dataset.__len__()
import os
import sys

sys.path.append(r'/home/jupyter/openprotein')
from preprocessing import process_raw_data

IN_DIR = '/home/jupyter/data/casp12/'
OUT_DIR = '/home/jupyter/data/casp12-preprocessed/'

# Hoisted: the original recomputed os.path.join(IN_DIR, sys.argv[1]) three
# times; compute the input path once and reuse it.
input_path = os.path.join(IN_DIR, sys.argv[1])
if os.path.exists(input_path):
    print("Preprocessing:", input_path)
    process_raw_data(use_gpu=False,
                     raw_data_path=input_path,
                     force_pre_processing_overwrite=True,
                     output_path=OUT_DIR)
else:
    print("Path doesn't exist:", input_path)