def make_batch_script(trainer_params, model_params, script_params):
    """Assemble an LBANN batch script for a training run.

    Builds the trainer/model/data-reader/optimizer objects, serializes
    them to a prototext file under ``script_params['work_dir']``, and
    returns a launcher batch script that runs LBANN on that file.

    Args:
        trainer_params: Object exposing a ``mini_batch_size`` attribute.
        model_params (dict): Keyword arguments forwarded to ``make_model``.
        script_params (dict): Launcher arguments; must contain ``'work_dir'``.

    Returns:
        The batch script object (not submitted by this function).
    """
    work_dir = script_params['work_dir']

    # Core LBANN experiment objects
    trainer = lbann.Trainer(mini_batch_size=trainer_params.mini_batch_size)
    model = make_model(**model_params)
    reader = make_data_reader()

    # Optimizer with learning rate schedule
    # Note: Rough approximation of
    #   embed_dim^-0.5 * min(step^-0.5, step*warmup^-1.5)
    # with embed_dim=512 and warmup=4000.
    opt = lbann.Adam(learn_rate=0.0001, beta1=0.9, beta2=0.98, eps=1e-9)
    model.callbacks.extend([
        # Learning-rate warmup/decay approximation (see note above)
        lbann.CallbackDropFixedLearningRate(drop_epoch=[1], amt=2),
        lbann.CallbackDropFixedLearningRate(drop_epoch=[2, 4, 8, 12], amt=0.75),
        # Dump weights after every epoch
        lbann.CallbackDumpWeights(
            basename=os.path.join(work_dir, 'weights'),
            epoch_interval=1,
        ),
    ])

    # Checkpoint after every epoch
    trainer.callbacks.append(
        lbann.CallbackCheckpoint(
            checkpoint_dir=os.path.join(work_dir, 'checkpoint'),
            checkpoint_epochs=1,
        ))

    # Serialize the experiment to a Protobuf text file
    protobuf_file = os.path.join(work_dir, 'experiment.prototext')
    lbann.proto.save_prototext(
        protobuf_file,
        trainer=trainer,
        model=model,
        data_reader=reader,
        optimizer=opt,
    )

    # Batch script: run the LBANN executable on the prototext file and
    # propagate its exit status out of the job.
    script = lbann.contrib.launcher.make_batch_script(**script_params)
    script.add_command('echo "Started training at $(date)"')
    script.add_parallel_command([
        lbann.lbann_exe(),
        f'--prototext={protobuf_file}',
    ])
    for shell_line in ('status=$?',
                       'echo "Finished training at $(date)"',
                       'exit ${status}'):
        script.add_command(shell_line)
    return script
def setup_experiment(lbann):
    """Construct the LBANN experiment for the gated-graph model.

    args:
        lbann (module): Module for LBANN Python frontend

    Returns:
        tuple: ``(trainer, model, reader, optimizer)`` ready for launch.
    """
    # Monitoring callbacks: per-epoch printout, timing, and GPU memory usage
    monitoring = [
        lbann.CallbackPrint(),
        lbann.CallbackTimer(),
        lbann.CallbackGPUMemoryUsage(),
    ]
    trainer = lbann.Trainer(mini_batch_size=mini_batch_size)
    model = Sparse_Graph_Trainer.make_model(
        kernel_type='GatedGraph',
        num_epochs=num_epochs,
        callbacks=monitoring,
    )
    reader = data.PROTEINS.make_data_reader()
    # No validation set
    optimizer = lbann.Adam(learn_rate=0.01, beta1=0.9, beta2=0.99, eps=1e-8)
    return trainer, model, reader, optimizer
def setup_experiment(lbann):
    """Construct the LBANN experiment for the LSC-PPQM4M graph model.

    args:
        lbann (module): Module for LBANN Python frontend

    Returns:
        tuple: ``(trainer, model, reader, optimizer)`` ready for launch.
    """
    trainer = lbann.Trainer(mini_batch_size=mini_batch_size)
    optimizer = lbann.Adam(learn_rate=0.01, beta1=0.9, beta2=0.99, eps=1e-8)
    # Model sized by the module-level dataset/embedding constants
    model = make_model(
        NUM_NODES,
        NUM_EDGES,
        NUM_NODES_FEATURES,
        NUM_EDGE_FEATURES,
        EMBEDDING_DIM,
        EDGE_EMBEDDING_DIM,
        NUM_OUT_FEATURES,
        num_epochs,
    )
    # 100K-sample subset; no validation split
    reader = LSC_PPQM4M.make_data_reader("LSC_100K", validation_percent=0)
    return trainer, model, reader, optimizer
# Working-directory tag: timestamp + batch size + user-supplied suffix
fldr_name = now.strftime('%Y%m%d_%H%M%S')  ## time format
data_pct, val_ratio = 1.0, 0.1  # Percentage of data to use, % of data for validation
batchsize = args.batchsize
step_interval = args.step_interval
print('Step interval', step_interval)
work_dir = "/global/cscratch1/sd/vpa/proj/cosmogan/results_dir/512square/{0}_bsize{1}_{2}".format(
    fldr_name, batchsize, args.suffix)

#####################
### Run lbann
ckpt_callback = lbann.CallbackCheckpoint(checkpoint_dir='chkpt',
                                         checkpoint_epochs=10)
# checkpoint_steps=step_interval))
trainer = lbann.Trainer(mini_batch_size=batchsize,
                        random_seed=random_seed,
                        callbacks=ckpt_callback)

spectral_loss = args.spec_loss
print("Spectral loss: ", spectral_loss)
#'step_interval*val_ratio' is the step interval for validation set.
model = construct_model(num_epochs, mcr,
                        spectral_loss=spectral_loss,
                        save_batch_interval=int(step_interval))

# Setup optimizer
opt = lbann.Adam(learn_rate=args.learn_rate, beta1=0.5, beta2=0.99, eps=1e-8)

# Load data reader from prototext
data_reader = construct_data_reader(data_pct, val_ratio)

# Launch under slurm; 1440 minutes = 24 h wall-clock limit
status = lbann.run(trainer, model, data_reader, opt,
                   nodes=num_nodes, procs_per_node=num_procs,
                   work_dir=work_dir, scheduler='slurm',
                   time_limit=1440, setup_only=False)
print(status)
# Model: MSE objective over the traversed layer graph, with
# print/timer monitoring callbacks
model = lbann.Model(
    args.num_epochs,
    layers=list(lbann.traverse_layer_graph([images, responses])),
    metrics=metrics,
    objective_function=mse,
    callbacks=[lbann.CallbackPrint(), lbann.CallbackTimer()],
)

# Load data reader from prototext
data_reader_proto = lbann.lbann_pb2.LbannPB()
with open(data_reader_prototext, 'r') as f:
    txtf.Merge(f.read(), data_reader_proto)
data_reader_proto = data_reader_proto.data_reader

# Trainer and optimizer
trainer = lbann.Trainer(mini_batch_size=args.mini_batch_size)
opt = lbann.Adam(learn_rate=0.0002, beta1=0.9, beta2=0.99, eps=1e-8)

# Run experiment through the scheduler-aware launcher
kwargs = lbann.contrib.args.get_scheduler_kwargs(args)
lbann.contrib.launcher.run(
    trainer, model, data_reader_proto, opt,
    lbann_args=" --use_data_store --preload_data_store",
    job_name=args.job_name,
    **kwargs,
)
def main():
    """Configure and launch an LBANN training job via the LC launcher."""
    run_args = construct_lc_launcher_args()

    # add data_config data
    # and do not overwrite args if data_reader_prototext is enabled
    if os.path.isfile(
            run_args.data_config) and not run_args.data_reader_prototext:
        with open(run_args.data_config, "r") as f:
            config = json.load(f)
        # Merge every key from the JSON config onto the parsed args namespace
        for k, v in config.items():
            setattr(run_args, k, v)

    trainer = lbann.Trainer(
        run_args.batch_size,
        #name=None,
    )

    # define data_reader: either parse a user-supplied prototext file or
    # build one programmatically from the run arguments
    if run_args.data_reader_prototext:
        print("Using data_reader_prototext")
        # Both fields are required to interpret the prototext reader
        assert run_args.sequence_length is not None
        assert run_args.vocab is not None
        data_reader_proto = lbann.lbann_pb2.LbannPB()
        with open(run_args.data_reader_prototext, "r") as f:
            txtf.Merge(f.read(), data_reader_proto)
        data_reader = data_reader_proto.data_reader
    else:
        data_reader = construct_data_reader(run_args)

    # Experiment directory: $LBANN_EXPERIMENT_DIR (or cwd) / <timestamp>_<job>
    if "LBANN_EXPERIMENT_DIR" in os.environ:
        work_dir = os.environ["LBANN_EXPERIMENT_DIR"]
    else:
        work_dir = os.path.join(os.getcwd())
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    experiment_dir = os.path.join(work_dir,
                                  "{}_{}".format(timestamp, run_args.job_name))
    if not os.path.exists(experiment_dir):
        os.makedirs(experiment_dir)

    # model and optimizer
    model = construct_model(run_args)
    opt = lbann.Adam(learn_rate=run_args.lr, beta1=0.9, beta2=0.99, eps=1e-8)

    # dump the config to the experiment_dir so that it can be used to load the model in pytorch (moses codebase)
    # NOTE(review): ppn default of 4 on LSF vs 2 elsewhere — presumably
    # machine-specific; confirm against the target clusters.
    ppn = 4 if run_args.scheduler == "lsf" else 2
    print("args:\n" + str(run_args))
    if (run_args.scheduler == 'slurm'):
        import torch
        torch.save(run_args,
                   "{}/{}_config.pt".format(experiment_dir, run_args.job_name))

    # Extra command-line flags handed to the LBANN executable
    m_lbann_args = f"--load_model_weights_dir_is_complete --load_model_weights_dir={run_args.dump_model_dir} --vocab={run_args.vocab} --num_samples={run_args.num_samples} --sequence_length={run_args.sequence_length} --num_io_threads={run_args.num_io_threads} --no_header={run_args.no_header} --delimiter={run_args.delimiter}"
    if (run_args.data_reader_prototext):
        # NOTE(review): this literal was reconstructed across a mangled line
        # break in the source — confirm the exact spacing is intended.
        m_lbann_args = " ".join(
            (m_lbann_args, " --use_data_store --preload_data_store "))
    if (run_args.procs_per_trainer):
        m_lbann_args = " ".join(
            (m_lbann_args, f"--procs_per_trainer={run_args.procs_per_trainer}"))

    # Submit the batch job.  `lbann_exe` is presumably a module-level path to
    # the LBANN executable — defined outside this view; verify.
    status = lbann.contrib.launcher.run(
        trainer,
        model,
        data_reader,
        opt,
        lbann_exe,
        partition=run_args.partition,
        scheduler=run_args.scheduler,
        account=run_args.account,
        time_limit=run_args.time_limit,
        nodes=run_args.nodes,
        procs_per_node=ppn,
        batch_job=True,
        #setup_only = True,
        job_name=run_args.job_name,
        experiment_dir=experiment_dir,
        lbann_args=m_lbann_args,
        #turn on for tensor core
        environment={
            'LBANN_USE_CUBLAS_TENSOR_OPS': 1,
            'LBANN_USE_CUDNN_TENSOR_OPS': 1,
        },
    )
    print("LBANN launcher status:\n" + str(status))
def create_unet3d_optimizer(learn_rate):
    """Create the Adam optimizer for the 3D U-Net model.

    TODO: This is a temporary optimizer copied from CosmoFlow.

    Args:
        learn_rate: Base learning rate passed to Adam.

    Returns:
        An ``lbann.Adam`` optimizer instance.
    """
    return lbann.Adam(learn_rate=learn_rate,
                      beta1=0.9,
                      beta2=0.999,
                      eps=1e-8)
objective_function=obj,
    metrics=metrics,
    callbacks=[
        lbann.CallbackPrint(),
        lbann.CallbackTimer(),
    ],
)
# NOTE(review): the lines above are the tail of an lbann.Model(...) call
# whose opening is outside this chunk.

# Setup trainer, optimizer, data_reader
trainer = lbann.Trainer(
    mini_batch_size=lbann_params.mini_batch_size,
    num_parallel_readers=1,
)
optimizer = lbann.Adam(
    learn_rate=0.01,
    beta1=0.9,
    beta2=0.99,
    eps=1e-8,
)
data_reader = make_data_reader()

# Launch LBANN
kwargs = lbann.contrib.args.get_scheduler_kwargs(lbann_params)
# Start from an empty environment dict for the launcher
kwargs["environment"] = {}
# NOTE(review): this launcher call continues past the end of this chunk.
lbann.contrib.launcher.run(
    trainer,
    model,
    data_reader,
    optimizer,
    work_dir=lbann_params.work_dir,
    job_name=lbann_params.job_name,
    lbann_args=["--num_io_threads=1"],
checkpoint_dir='chkpt', # checkpoint_epochs=10)) checkpoint_steps=gdict['step_interval'])) spectral_loss = gdict['lambda_spec'] if spectral_loss: print("Using Spectral loss with coupling", spectral_loss) model = construct_model( num_epochs, gdict['mcr'], spectral_loss=spectral_loss, save_batch_interval=int(gdict['step_interval']) ) #'step_interval*val_ratio' is the step interval for validation set. # Setup optimizer opt = lbann.Adam(learn_rate=gdict['learn_rate'], beta1=gdict['beta1'], beta2=gdict['beta2'], eps=float(gdict['eps'])) # Load data reader from prototext data_reader = construct_data_reader(data_pct, val_ratio) status = lbann.run(trainer, model, data_reader, opt, nodes=num_nodes, procs_per_node=num_procs, work_dir=work_dir, scheduler='slurm', time_limit=1440, setup_only=False)