def make_features_labels(m_module, args):
    """Pick the dataset names used for the input features and the labels.

    A model module that defines ``get_features()`` and/or ``get_labels()``
    takes precedence; otherwise the values carried by ``args`` are used.

    Args:
        m_module: Imported model module, or ``None`` when none was loaded.
        args: Parsed options exposing ``features_name`` and ``labels_name``.

    Returns:
        The tuple ``(features_name, labels_name)``.
    """
    if m_module is not None and hasattr(m_module, "get_features"):
        features_name = m_module.get_features()
    else:
        features_name = args.features_name

    if m_module is not None and hasattr(m_module, "get_labels"):
        labels_name = m_module.get_labels()
    else:
        labels_name = args.labels_name

    return (features_name, labels_name)


if __name__ == '__main__':
    # Parse the command line and configure logging before anything else.
    parser = make_train_parser()
    args = parser.parse_args()
    initialize_logger(
        filename=args.log_file,
        file_level=args.log_level,
        stream_level=args.log_level,
    )

    # A model whose name mentions 'torch' overrides the requested backend.
    a_backend = args.backend
    if 'torch' in args.model:
        a_backend = 'torch'

    # Only a model given as a python file is imported as a module.
    if '.py' in args.model:
        m_module = __import__(
            args.model.replace('.py', '').replace('/', '.'),
            fromlist=[None],
        )
    else:
        m_module = None

    (features_name, labels_name) = make_features_labels(m_module, args)
    (train_list, val_list) = make_train_val_lists(m_module, args)

    comm = MPI.COMM_WORLD.Dup()

    if args.timeline:
        Timeline.enable()

    use_tf = a_backend == 'keras'
def main():
    """Command-line entry point: distributed GAN training over MPI.

    Builds the argument parser, reads the training/validation file lists,
    selects a device, configures the Keras/TensorFlow session, creates the
    ``GANModelBuilder`` and hands everything to ``MPIManager``. Rank 0 runs
    the training loop and records the trial history to a JSON file.

    Only the TensorFlow path is wired up in this driver (the backend name and
    the model builder are defined for ``--tf`` only), so the function exits
    through ``parser.error`` when ``--tf`` is not given instead of failing
    later with a ``NameError``.
    """
    from TrainingDriver import add_loader_options

    parser = argparse.ArgumentParser()
    parser.add_argument('--verbose', help='display metrics for each training batch', action='store_true')
    parser.add_argument('--profile', help='profile theano code', action='store_true')
    parser.add_argument('--monitor', help='Monitor cpu and gpu utilization', action='store_true')
    parser.add_argument('--tf', help='use tensorflow backend', action='store_true')

    # model arguments
    parser.add_argument('model_json', help='JSON file containing model architecture')
    parser.add_argument('--trial-name', help='descriptive name for trial',
                        default='train', dest='trial_name')

    # training data arguments
    parser.add_argument('train_data', help='text file listing data inputs for training')
    parser.add_argument('val_data', help='text file listing data inputs for validation')
    parser.add_argument('--features-name', help='name of HDF5 dataset with input features',
                        default='features', dest='features_name')
    parser.add_argument('--labels-name', help='name of HDF5 dataset with output labels',
                        default='labels', dest='labels_name')
    parser.add_argument('--batch', help='batch size', default=100, type=int)
    add_loader_options(parser)

    # configuration of network topology
    parser.add_argument('--masters', help='number of master processes', default=1, type=int)
    parser.add_argument('--n-processes', dest='processes', help='number of processes per worker',
                        default=1, type=int)
    parser.add_argument('--max-gpus', dest='max_gpus', help='max GPUs to use',
                        type=int, default=-1)
    parser.add_argument('--master-gpu', help='master process should get a gpu',
                        action='store_true', dest='master_gpu')
    parser.add_argument('--synchronous', help='run in synchronous mode', action='store_true')

    # configuration of training process
    parser.add_argument('--epochs', help='number of training epochs', default=1, type=int)
    parser.add_argument('--optimizer', help='optimizer for master to use', default='adam')
    parser.add_argument('--loss', help='loss function', default='binary_crossentropy')
    parser.add_argument('--early-stopping', default=None, dest='early_stopping',
                        help='Configuration for early stopping')
    parser.add_argument('--target-metric', default=None, dest='target_metric',
                        help='Passing configuration for a target metric')
    parser.add_argument('--worker-optimizer', help='optimizer for workers to use',
                        dest='worker_optimizer', default='sgd')
    parser.add_argument('--worker-optimizer-params',
                        help='worker optimizer parameters (string representation of a dict)',
                        dest='worker_optimizer_params', default='{}')
    parser.add_argument('--sync-every', help='how often to sync weights with master',
                        default=1, type=int, dest='sync_every')
    parser.add_argument('--mode', help='Mode of operation.'
                        'One of "downpour" (Downpour), "easgd" (Elastic Averaging SGD) or "gem" (Gradient Energy Matching)',
                        default='downpour', choices=['downpour', 'easgd', 'gem'])
    parser.add_argument('--elastic-force', help='beta parameter for EASGD', type=float, default=0.9)
    parser.add_argument('--elastic-lr', help='worker SGD learning rate for EASGD',
                        type=float, default=1.0, dest='elastic_lr')
    parser.add_argument('--elastic-momentum', help='worker SGD momentum for EASGD',
                        type=float, default=0, dest='elastic_momentum')
    parser.add_argument('--restore', help='pass a file to retore the variables from', default=None)
    parser.add_argument('--log-file', default=None, dest='log_file',
                        help='log file to write, in additon to output stream')
    parser.add_argument('--log-level', default='info', dest='log_level',
                        help='log level (debug, info, warn, error)')
    parser.add_argument('--checkpoint',
                        help='Base name of the checkpointing file. If omitted no checkpointing will be done',
                        default=None)
    parser.add_argument('--checkpoint-interval', help='Number of epochs between checkpoints',
                        default=5, type=int, dest='checkpoint_interval')

    args = parser.parse_args()

    # The backend name and the model builder below exist only for the
    # TensorFlow path; without --tf the run would die with a NameError after
    # MPI and device setup. Fail fast with a usable message instead.
    if not args.tf:
        parser.error('this driver only supports the tensorflow backend; pass --tf')

    model_name = os.path.basename(args.model_json).replace('.json', '')
    initialize_logger(filename=args.log_file, file_level=args.log_level,
                      stream_level=args.log_level)

    # Each non-stripped line of the list files names one input data file.
    with open(args.train_data) as train_list_file:
        train_list = [line.strip() for line in train_list_file]
    with open(args.val_data) as val_list_file:
        val_list = [line.strip() for line in val_list_file]

    comm = MPI.COMM_WORLD.Dup()

    use_tf = args.tf
    use_torch = not use_tf

    from TrainingDriver import make_model_weight, make_algo, make_loader
    model_weights = make_model_weight(args, use_torch)

    device = get_device(comm, args.masters, gpu_limit=args.max_gpus,
                        gpu_for_master=args.master_gpu)

    backend = 'tensorflow'
    # Optimizer names are suffixed with 'tf' for the tensorflow backend.
    if not args.optimizer.endswith("tf"):
        args.optimizer = args.optimizer + 'tf'
    # Expose only the selected GPU (last character of the device string) to
    # this process, or no GPU at all.
    os.environ['CUDA_VISIBLE_DEVICES'] = device[-1] if 'gpu' in device else ''
    logging.info('set to device %s %s',
                 os.environ['CUDA_VISIBLE_DEVICES'], socket.gethostname())
    # KERAS_BACKEND has to be set before keras is imported.
    os.environ['KERAS_BACKEND'] = backend
    logging.info(backend)

    import_keras()
    import keras.backend as K
    gpu_options = K.tf.GPUOptions(
        per_process_gpu_memory_fraction=0.0,
        allow_growth=True,
    )
    K.set_session(K.tf.Session(config=K.tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=False,
        gpu_options=gpu_options,
    )))

    from nnlo.train.GanModel import GANModelBuilder
    model_builder = GANModelBuilder(comm, tf=True, weights=model_weights)

    data = make_loader(args, args.features_name, args.labels_name, train_list)
    algo = make_algo(args, use_tf, comm,
                     validate_every=int(data.count_data() / args.batch))
    if args.restore:
        algo.load(args.restore)

    # Creating the MPIManager object causes all needed worker and master
    # nodes to be created
    manager = MPIManager(comm=comm, data=data, algo=algo, model_builder=model_builder,
                         num_epochs=args.epochs, train_list=train_list, val_list=val_list,
                         num_masters=args.masters, num_processes=args.processes,
                         synchronous=args.synchronous,
                         verbose=args.verbose, monitor=args.monitor,
                         early_stopping=args.early_stopping,
                         target_metric=args.target_metric,
                         checkpoint=args.checkpoint,
                         checkpoint_interval=args.checkpoint_interval)

    # Process 0 launches the training procedure
    if comm.Get_rank() == 0:
        logging.info(algo)

        t_0 = time()
        manager.process.train()
        delta_t = time() - t_0
        manager.free_comms()
        logging.info("Training finished in {0:.3f} seconds".format(delta_t))

        json_name = '_'.join([model_name, args.trial_name, "history.json"])
        manager.process.record_details(json_name, meta={"args": vars(args)})
        logging.info("Wrote trial information to {0}".format(json_name))

    comm.Barrier()
    logging.info("Terminating")
def main():
    """Command-line entry point: distributed training over MPI.

    Parses the training options, resolves one of the built-in example models
    (``mnist``, ``mnist_torch``, ``cifar10``), selects a device, creates the
    PyTorch or TensorFlow model builder and hands everything to
    ``MPIManager``. Rank 0 runs the training loop and records the trial
    history; the optional timeline is collected at the end.
    """
    parser = make_train_parser()
    args = parser.parse_args()
    initialize_logger(filename=args.log_file, file_level=args.log_level,
                      stream_level=args.log_level)

    # A model whose name mentions 'torch' overrides the requested backend.
    a_backend = args.backend
    if 'torch' in args.model:
        a_backend = 'torch'

    # Built-in examples: model name -> (module to import, source path).
    builtin_models = {
        'mnist': ('nnlo.models.model_mnist_tf', 'models/model_mnist_tf.py'),
        'mnist_torch': ('nnlo.models.model_mnist_torch', 'models/model_mnist_torch.py'),
        'cifar10': ('nnlo.models.model_cifar10_tf', 'models/model_cifar10_tf.py'),
    }
    m_module, model_source = None, None
    try:
        if args.model in builtin_models:
            module_name, source_path = builtin_models[args.model]
            m_module = importlib.import_module(module_name)
            model_source = source_path
    except Exception as e:
        # An import failure is logged and the run continues without a model
        # module (both names stay None), as before.
        logging.fatal(e)

    (features_name, labels_name) = make_features_labels(m_module, args)
    (train_list, val_list) = make_train_val_lists(m_module, args)
    comm = MPI.COMM_WORLD.Dup()

    if args.timeline:
        Timeline.enable()

    use_tf = a_backend == 'keras'
    use_torch = not use_tf

    model_weights = make_model_weight(args, use_torch)

    # The visible GPU has to be chosen before the backend is imported.
    device = get_device(comm, args.n_masters, gpu_limit=args.max_gpus,
                        gpu_for_master=args.master_gpu)
    os.environ['CUDA_VISIBLE_DEVICES'] = device[-1] if 'gpu' in device else ''
    logging.debug('set to device %s', os.environ['CUDA_VISIBLE_DEVICES'])

    if use_torch:
        logging.debug("Using pytorch")
        model_builder = ModelPytorch(comm, source=model_source, weights=model_weights,
                                     gpus=1 if 'gpu' in device else 0)
    else:
        logging.debug("Using TensorFlow")
        os.environ['KERAS_BACKEND'] = 'tensorflow'

        import tensorflow as tf
        import_keras()
        # Let TensorFlow grow GPU memory on demand. The loop variable is
        # deliberately not called `device`, so the device string selected
        # above is not overwritten.
        gpu_devices = tf.config.experimental.list_physical_devices('GPU')
        for gpu_device in gpu_devices:
            tf.config.experimental.set_memory_growth(gpu_device, True)

        model_builder = ModelTensorFlow(comm, source=model_source, weights=model_weights)

    data = make_loader(args, features_name, labels_name, train_list)

    # Some input arguments may be ignored depending on chosen algorithm
    algo = make_algo(args, use_tf, comm,
                     validate_every=int(data.count_data() / args.batch))
    if args.restore:
        algo.load(args.restore)

    # Creating the MPIManager object causes all needed worker and master nodes to be created
    manager = MPIManager(comm=comm, data=data, algo=algo, model_builder=model_builder,
                         num_epochs=args.epochs, train_list=train_list, val_list=val_list,
                         num_masters=args.n_masters, num_processes=args.n_processes,
                         synchronous=args.synchronous,
                         verbose=args.verbose, monitor=args.monitor,
                         early_stopping=args.early_stopping,
                         target_metric=args.target_metric,
                         thread_validation=args.thread_validation,
                         checkpoint=args.checkpoint,
                         checkpoint_interval=args.checkpoint_interval)

    if m_module:
        model_name = m_module.get_name()
    else:
        model_name = os.path.basename(args.model).replace('.json', '')

    # os.path.join keeps the paths relative when the output directory is
    # empty and avoids a doubled separator when it already ends with one.
    json_name = os.path.join(
        args.output, '_'.join([model_name, args.trial_name, "history.json"]))
    tl_json_name = os.path.join(
        args.output, '_'.join([model_name, args.trial_name, "timeline.json"]))

    # Process 0 launches the training procedure
    if comm.Get_rank() == 0:
        logging.debug('Training configuration: %s', algo.get_config())

        t_0 = time()
        manager.process.train()
        delta_t = time() - t_0
        logging.info("Training finished in {0:.3f} seconds".format(delta_t))

        manager.process.record_details(json_name, meta={"args": vars(args)})
        logging.info("Wrote trial information to {0}".format(json_name))

    # NOTE(review): block nesting was reconstructed from flattened source;
    # confirm that close() is meant to run on every rank, not only rank 0.
    manager.close()
    comm.barrier()
    logging.info("Terminating")
    if args.timeline:
        Timeline.collect(clean=True, file_name=tl_json_name)
def main():
    """Command-line entry point: hyper-parameter optimization over MPI.

    The world communicator is split into blocks of ``args.block_size``
    processes. Block 0 runs the ``Coordinator`` that drives the optimization;
    every other block builds a data loader and an algorithm and runs a
    ``ProcessBlock`` that trains the model for the parameters it is given.

    The model comes either from the python file passed as ``args.model`` or
    from one of the built-in examples selected by ``args.example``
    (``topclass``, ``mnist``, ``cifar10``, ``gan``).
    """
    # This import binds `socket` as a local name for the whole function, so
    # it must run before the first use below; placed later it makes the
    # first statement raise UnboundLocalError.
    import socket

    logging.info("Process is on {}".format(socket.gethostname()))
    parser = make_opt_parser()
    args = parser.parse_args()
    check_sanity(args)

    initialize_logger(filename=args.log_file, file_level=args.log_level,
                      stream_level=args.log_level)

    host = os.environ.get('HOST', os.environ.get('HOSTNAME', socket.gethostname()))

    test = args.example
    model_source = args.model
    # A model whose name mentions 'torch' overrides the requested backend.
    a_backend = args.backend
    if args.model and 'torch' in args.model:
        a_backend = 'torch'

    use_tf = a_backend == 'keras'
    use_torch = not use_tf

    ## starting the configuration of the processes
    logging.info("Initializing...")
    comm_world = MPI.COMM_WORLD.Dup()

    ## consistency check to make sure everything is appropriate
    # One process is kept aside for the coordinator, hence the "- 1".
    num_blocks, left_over = divmod(comm_world.Get_size() - 1, args.block_size)
    if left_over:
        logging.warning("The last block is going to be made of {} nodes, make inconsistent block size {}".format(
            left_over, args.block_size))
        num_blocks += 1  ## to account for the last block
        if left_over < 2:
            logging.warning("The last block is going to be too small for mpi_learn, with no workers")
        # NOTE(review): block nesting was reconstructed from flattened
        # source; confirm whether the abort is meant for any uneven split
        # (as written here) or only when the last block is too small.
        MPI.COMM_WORLD.Abort()

    block_num = get_block_num(comm_world, args.block_size)
    device = get_device(comm_world, num_blocks, gpu_limit=args.max_gpus)
    logging.info("Process {} using device {}".format(comm_world.Get_rank(), device))

    # Expose only the selected GPU (last character of the device string) to
    # this process, or no GPU at all.
    os.environ['CUDA_VISIBLE_DEVICES'] = device[-1] if 'gpu' in device else ''
    logging.info('set to device %s', os.environ['CUDA_VISIBLE_DEVICES'])

    if use_tf:
        import keras.backend as K
        gpu_options = K.tf.GPUOptions(
            per_process_gpu_memory_fraction=0.0,
            allow_growth=True,
        )
        K.set_session(K.tf.Session(config=K.tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=True,
            gpu_options=gpu_options,
        )))

    if model_source is not None:
        ## provide the model details here
        module = __import__(args.model.replace('.py', '').replace('/', '.'), fromlist=[None])
        if use_tf:
            model_provider = BuilderFromFunction(model_fn=module.get_model)
        else:
            model_provider = TorchBuilderFromFunction(model_fn=module.get_model)
        (train_list, val_list) = make_train_val_lists(module, args)
        (features_name, labels_name) = make_features_labels(module, args)
    elif test == 'topclass':
        ### topclass example
        # NOTE(review): this branch and the mnist one read `args.torch`,
        # while the rest of the function derives `use_torch` from
        # `args.backend`; confirm the parser still defines `torch`.
        if not args.torch:
            model_provider = BuilderFromFunction(model_fn=models.make_topclass_model)
        else:
            model_provider = TorchBuilderFromFunction(model_fn=models.make_topclass_torch_model)
        if 'daint' in host:
            train_list = glob.glob('/scratch/snx3000/vlimant/data/LCDJets_Remake/train/*.h5')
            val_list = glob.glob('/scratch/snx3000/vlimant/data/LCDJets_Remake/val/*.h5')
        elif 'titan' in host:
            train_list = glob.glob('/ccs/proj/csc291/DATA/LCDJets_Abstract_IsoLep_lt_20/train/*.h5')
            val_list = glob.glob('/ccs/proj/csc291/DATA/LCDJets_Abstract_IsoLep_lt_20/val/*.h5')
        else:
            train_list = glob.glob('/bigdata/shared/LCDJets_Abstract_IsoLep_lt_20/train/0*.h5')
            val_list = glob.glob('/bigdata/shared/LCDJets_Abstract_IsoLep_lt_20/val/0*.h5')
        features_name = 'Images'
        labels_name = 'Labels'
    elif test == 'mnist':
        ### mnist example
        if args.torch:
            model_provider = TorchBuilderFromFunction(model_fn=models.make_mnist_torch_model)
        else:
            model_provider = BuilderFromFunction(model_fn=models.make_mnist_model)
        if 'daint' in host:
            all_list = glob.glob('/scratch/snx3000/vlimant/data/mnist/*.h5')
        elif 'titan' in host:
            all_list = glob.glob('/ccs/proj/csc291/DATA/mnist/*.h5')
        else:
            all_list = glob.glob('/bigdata/shared/mnist/*.h5')
        # First 70% of the files for training, the remainder for validation.
        n_train = int(len(all_list) * 0.70)
        train_list = all_list[:n_train]
        val_list = all_list[n_train:]
        features_name = 'features'
        labels_name = 'labels'
    elif test == 'cifar10':
        ### cifar10 example
        model_provider = BuilderFromFunction(model_fn=models.make_cifar10_model)
        if 'daint' in host:
            all_list = []
        elif 'titan' in host:
            all_list = glob.glob('/ccs/proj/csc291/DATA/cifar10/*.h5')
        else:
            all_list = glob.glob('/bigdata/shared/cifar10/*.h5')
        # First 70% of the files for training, the remainder for validation.
        n_train = int(len(all_list) * 0.70)
        train_list = all_list[:n_train]
        val_list = all_list[n_train:]
        features_name = 'features'
        labels_name = 'labels'
    elif test == 'gan':
        from nnlo.train.GanModel import GANBuilder
        ### the gan example
        model_provider = GANBuilder(parameters=[
            Integer(50, 400, name='latent_size'),
            Real(0.0, 1.0, name='discr_drop_out'),
            Categorical([1, 2, 5, 6, 8], name='gen_weight'),
            Categorical([0.1, 0.2, 1, 2, 10], name='aux_weight'),
            Categorical([0.1, 0.2, 1, 2, 10], name='ecal_weight'),
        ])
        ## only this mode functions
        setattr(args, "mode", 'easgd')
        args.worker_optimizer = 'rmsprop'
        if 'daint' in host:
            all_list = glob.glob('/scratch/snx3000/vlimant/data/3DGAN/*.h5')
        elif 'titan' in host:
            all_list = glob.glob('/ccs/proj/csc291/DATA/3DGAN/*.h5')
        else:
            all_list = glob.glob('/data/shared/3DGAN/*.h5')
        # As many training files as there are MPI processes; the last file
        # is used for validation.
        N = MPI.COMM_WORLD.Get_size()
        train_list = all_list[:N]
        val_list = all_list[-1:]
        features_name = 'X'
        labels_name = 'y'

    if use_torch:
        if 'gpu' in device:
            model_provider.gpus = 1

    comm_block = comm_world.Split(block_num)
    logging.debug("Process {} sees {} blocks, has block number {}, and rank {} in that block".format(
        comm_world.Get_rank(), num_blocks, block_num, comm_block.Get_rank()))

    if args.n_processes > 1:
        t_b_processes = []
        if block_num != 0:
            _, _, b_processes = get_groups(comm_block, args.n_masters, args.n_processes)
            ## collect all block=>world rank translation
            r2r = (comm_block.Get_rank(), comm_world.Get_rank())
            all_r2r = comm_block.allgather(r2r)
            translate = dict(all_r2r)  # key is the rank in block, value is rank in world
            t_b_processes = [[translate[p] for p in pr] for pr in b_processes]

        # need to collect all the processes lists; every rank of the world
        # communicator takes part in this allgather, block 0 included
        all_t_b_processes = comm_world.allgather(t_b_processes)
        w_processes = set()
        for gb in all_t_b_processes:
            if gb:
                w_processes.update(map(tuple, gb))
        if block_num == 0:
            logging.info("all collect processes {}".format(w_processes))
        ## now you have the ranks that needs to be initialized in rings.

    # Block 0 coordinates the optimization procedure; the other blocks train.
    if block_num == 0:
        opt_coordinator = Coordinator(comm_world, num_blocks,
                                      model_provider.parameters,
                                      (args.hyper_opt == 'genetic'), args.population,
                                      checkpointing=args.checkpoint,
                                      label=args.trial_name)
        if args.opt_restore:
            opt_coordinator.load()
        if args.target_objective:
            opt_coordinator.target_fom = args.target_objective
        opt_coordinator.run(num_iterations=args.num_iterations)
        opt_coordinator.record_details()
    else:
        logging.debug("Process {} on block {}, rank {}, create a process block".format(
            comm_world.Get_rank(), block_num, comm_block.Get_rank()))
        data = make_loader(args, features_name, labels_name, train_list)
        from TrainingDriver import make_algo
        algo = make_algo(args, use_tf, comm_block,
                         validate_every=int(data.count_data() / args.batch))
        block = ProcessBlock(comm_world, comm_block, algo, data, device, model_provider,
                             args.epochs, train_list, val_list,
                             folds=args.n_fold,
                             num_masters=args.n_masters,
                             num_process=args.n_processes,
                             verbose=args.verbose,
                             early_stopping=args.early_stopping,
                             target_metric=args.target_metric,
                             monitor=args.monitor,
                             label=args.trial_name,
                             restore=args.opt_restore,
                             checkpoint=args.checkpoint,
                             checkpoint_interval=args.checkpoint_interval)
        block.run()