Example #1
    model_weights = None

    if args.restore:
        args.restore = re.sub(r'\.algo$', '', args.restore)
        if os.path.isfile(args.restore + '.latest'):
            with open(args.restore + '.latest', 'r') as latest:
                args.restore = latest.read().splitlines()[-1]
        if not args.tf and os.path.isfile(args.restore + '.model'):
            model_weights = args.restore + '.model'
    if args.torch and model_weights is not None:
        # PyTorch runs keep their weights in a separate file with a '_w' suffix.
        model_weights += '_w'

    # Theano is the default backend; TensorFlow is used when --tf is specified.
    # With Theano the device has to be chosen before the framework is imported.
    device = get_device(comm,
                        args.masters,
                        gpu_limit=args.max_gpus,
                        gpu_for_master=args.master_gpu)
    hide_device = True
    if args.torch:
        logging.debug("Using pytorch")
        if not args.optimizer.endswith("torch"):
            args.optimizer = args.optimizer + 'torch'
        import torch
        if hide_device:
            # Expose only the assigned GPU (or nothing, if running on CPU).
            os.environ['CUDA_VISIBLE_DEVICES'] = device[-1] if 'gpu' in device else ''
            logging.debug('set to device %s', os.environ['CUDA_VISIBLE_DEVICES'])
        else:
            if 'gpu' in device:
                torch.cuda.set_device(int(device[-1]))
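The snippet above hides unassigned GPUs through CUDA_VISIBLE_DEVICES rather than asking the framework to select a device. A minimal sketch of that pattern, kept separate from the example (hide_all_but is a hypothetical helper; the 'gpuN'/'cpu' strings mirror what get_device returns above):

import os

def hide_all_but(device):
    """Mask every GPU except the assigned one.

    Must run before the deep-learning framework initializes CUDA, so the
    framework sees the chosen GPU as device 0 (or sees no GPU at all on CPU).
    """
    os.environ['CUDA_VISIBLE_DEVICES'] = device[-1] if 'gpu' in device else ''

hide_all_but('gpu1')  # the process now sees only physical GPU 1 (on a multi-GPU node)
import torch          # imported after masking, so it only sees the one visible device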
Example #2
args = parser.parse_args()

archive_dir = args.archive_dir
hashcode = args.hashcode
masters = args.masters
max_gpus = args.max_gpus
# print("Mooop")
# if(len(sys.argv) != 3):
#     raise ValueError("MPIKerasTrail_execute.py -- Incorrect number of arguments.")

# if(len(sys.argv)) >
# numProcesses = sys.argv[3]

print(archive_dir, hashcode, masters, max_gpus)

comm = MPI.COMM_WORLD.Dup()
# We have to assign GPUs to processes before importing Theano.
device = get_device(comm, masters, gpu_limit=max_gpus, gpu_for_master=True)
print("Process", comm.Get_rank(), "using device", device)
os.environ['THEANO_FLAGS'] = "device=%s,floatX=float32" % (device)
from CMS_Deep_Learning.storage.MPIArchiving import MPI_KerasTrial

trial = MPI_KerasTrial.find_by_hashcode(archive_dir, hashcode)
if trial is None:
    raise ValueError("No trial found for hashcode %s in %s" % (hashcode, archive_dir))
if not isinstance(trial, MPI_KerasTrial):
    raise TypeError("Trial is not MPI_KerasTrial, got type %r" % type(trial))
trial._execute_MPI(comm=comm)
# print(sys.argv[0])
# print(sys.argv[1])
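Example #2 works only because THEANO_FLAGS is exported before anything that imports Theano (which is why MPI_KerasTrial is imported after the device is picked). A rough sketch of the idea, with a hypothetical round-robin rank-to-GPU mapping standing in for get_device and an assumed two-GPU node:

import os
from mpi4py import MPI

comm = MPI.COMM_WORLD.Dup()
max_gpus = 2  # assumed GPUs per node, for illustration only
rank = comm.Get_rank()
device = 'gpu%d' % (rank % max_gpus) if max_gpus > 0 else 'cpu'

# THEANO_FLAGS is read on the first Theano import, so it has to be set first.
os.environ['THEANO_FLAGS'] = "device=%s,floatX=float32" % device
import theano  # picks up the flags set above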
Example #3
    ## Consistency check: the worker count should divide evenly into blocks of the requested size.
    num_blocks, left_over = divmod((comm_world.Get_size() - 1),
                                   args.block_size)
    if left_over:
        print(
            "The last block will only have {} nodes, which is inconsistent with the block size {}"
            .format(left_over, args.block_size))
        num_blocks += 1  ## to account for the last block
        if left_over < 2:
            print(
                "The last block would be too small for mpi_learn: it has no workers"
            )
            sys.exit(1)
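    ## Worked example of the check above (illustrative numbers only): with 18 MPI
    ## processes and block_size 4, divmod(18 - 1, 4) -> num_blocks=4, left_over=1,
    ## so the trailing block would hold a single node with no workers and the run
    ## aborts; with 19 processes left_over is 2 and the smaller extra block is kept.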

    block_num = get_block_num(comm_world, args.block_size)
    device = mm.get_device(comm_world, num_blocks)
    backend = 'tensorflow'
    hide_device = True
    if hide_device:
        os.environ['CUDA_VISIBLE_DEVICES'] = device[-1] if 'gpu' in device else ''
        print('set to device', os.environ['CUDA_VISIBLE_DEVICES'])

    if not args.torch:
        import keras.backend as K
        gpu_options = K.tf.GPUOptions(
            per_process_gpu_memory_fraction=0.1,
            allow_growth=True,
            visible_device_list=device[-1] if 'gpu' in device else '')
        if hide_device:
            gpu_options = K.tf.GPUOptions(