# Resolve which checkpoint (if any) to restore from and, when a matching
# '.model' file exists on disk, the path of the saved model weights.
model_weights = None
if args.restore:
    # Accept the checkpoint prefix with or without a trailing '.algo'.
    args.restore = re.sub(r'\.algo$', '', args.restore)
    latest_path = args.restore + '.latest'
    if os.path.isfile(latest_path):
        # The last line of '<prefix>.latest' replaces the restore prefix.
        with open(latest_path, 'r') as latest:
            checkpoints = latest.read().splitlines()
        # An empty '.latest' file used to raise IndexError; keep the prefix
        # given on the command line in that case.
        if checkpoints:
            args.restore = checkpoints[-1]
    if not args.tf and os.path.isfile(args.restore + '.model'):
        model_weights = args.restore + '.model'
    # Only append the torch suffix when a weights file was actually selected;
    # previously this raised TypeError (None += str) when none was found.
    if args.torch and model_weights is not None:
        model_weights += '_w'

# Theano is the default backend; use tensorflow if --tf is specified.
# In the theano case it is necessary to specify the device before importing.
device = get_device(comm, args.masters, gpu_limit=args.max_gpus,
                    gpu_for_master=args.master_gpu)
hide_device = True
if args.torch:
    logging.debug("Using pytorch")
    if not args.optimizer.endswith("torch"):
        args.optimizer = args.optimizer + 'torch'
    import torch

    # Take the whole trailing number of the device name rather than only its
    # last character, so that two-digit GPU indices are not truncated.
    # Falls back to the last character when the name has no trailing digits.
    gpu_index = ''
    if 'gpu' in device:
        trailing_digits = re.search(r'\d+$', device)
        gpu_index = trailing_digits.group() if trailing_digits else device[-1]

    if hide_device:
        # An empty value is used when the device name does not contain 'gpu'.
        os.environ['CUDA_VISIBLE_DEVICES'] = gpu_index
        logging.debug('set to device %s', os.environ['CUDA_VISIBLE_DEVICES'])
    elif 'gpu' in device:
        torch.cuda.set_device(int(gpu_index))
# Entry script: look up an archived MPI_KerasTrial by hashcode and execute it
# over MPI, after pinning this process to the device chosen by get_device.
args = parser.parse_args()
archive_dir = args.archive_dir
hashcode = args.hashcode
masters = args.masters
max_gpus = args.max_gpus
print(archive_dir, hashcode, masters, max_gpus)

# Work on a duplicate of the world communicator.
comm = MPI.COMM_WORLD.Dup()

# We have to assign GPUs to processes before importing Theano.
device = get_device(comm, masters, gpu_limit=max_gpus, gpu_for_master=True)
print("Process", comm.Get_rank(), "using device", device)
os.environ['THEANO_FLAGS'] = "device=%s,floatX=float32" % device

# Deliberately imported only after THEANO_FLAGS has been set (see above).
from CMS_Deep_Learning.storage.MPIArchiving import MPI_KerasTrial

trial = MPI_KerasTrial.find_by_hashcode(archive_dir, hashcode)
# Identity check: '== None' would go through any custom __eq__ on the trial.
if trial is None:
    raise ValueError("hashcode does not exist")
if not isinstance(trial, MPI_KerasTrial):
    raise TypeError("Trial is not MPI_KerasTrial, got type %r" % type(trial))
trial._execute_MPI(comm=comm)
## consistency check to make sure everything is appropriate num_blocks, left_over = divmod((comm_world.Get_size() - 1), args.block_size) if left_over: print( "The last block is going to be made of {} nodes, make inconsistent block size {}" .format(left_over, args.block_size)) num_blocks += 1 ## to accoun for the last block if left_over < 2: print( "The last block is going to be too small for mpi_learn, with no workers" ) sys.exit(1) block_num = get_block_num(comm_world, args.block_size) device = mm.get_device(comm_world, num_blocks) backend = 'tensorflow' hide_device = True if hide_device: os.environ[ 'CUDA_VISIBLE_DEVICES'] = device[-1] if 'gpu' in device else '' print('set to device', os.environ['CUDA_VISIBLE_DEVICES']) if not args.torch: import keras.backend as K gpu_options = K.tf.GPUOptions( per_process_gpu_memory_fraction=0.1, allow_growth=True, visible_device_list=device[-1] if 'gpu' in device else '') if hide_device: gpu_options = K.tf.GPUOptions(