def run_experiment(args, model, data_reader_proto, optimizer):
    """Launch the experiment through the LC launcher.

    Nothing happens when ``args.disable_run`` is truthy. Otherwise the
    scheduler options that are set on ``args`` are forwarded to
    ``lbann.contrib.lc.launcher.run`` and, when ``args.imagenet_classes``
    is set, the ImageNet train/val paths for that class count are passed
    through the ``lbann_args`` command-line string.

    Args:
        args: Parsed options. Reads ``disable_run``, ``nodes``,
            ``procs_per_node``, ``partition``, ``account``,
            ``time_limit`` and ``imagenet_classes``.
        model: Model handed to the launcher.
        data_reader_proto: Data reader message handed to the launcher.
        optimizer: Optimizer handed to the launcher.
    """
    if args.disable_run:
        return

    # Deferred imports: only required when a job is actually launched
    from lbann.contrib.lc.paths import imagenet_dir, imagenet_labels
    import lbann.contrib.lc.launcher

    # Forward only the scheduler options that were actually provided
    launch_kwargs = {}
    for option in ('nodes', 'procs_per_node', 'partition',
                   'account', 'time_limit'):
        value = getattr(args, option)
        if value:
            launch_kwargs[option] = value

    # Point the data readers at the requested ImageNet subset
    num_classes = args.imagenet_classes
    if num_classes:
        # Order matters: train dir, train labels, val dir, val labels
        data_paths = [
            locate(data_set=data_set, num_classes=num_classes)
            for data_set in ('train', 'val')
            for locate in (imagenet_dir, imagenet_labels)
        ]
        launch_kwargs['lbann_args'] = (
            '--data_filedir_train={} --data_filename_train={} '
            '--data_filedir_test={} --data_filename_test={}'
        ).format(*data_paths)

    lbann.contrib.lc.launcher.run(
        model,
        data_reader_proto,
        optimizer,
        job_name='lbann_densenet',
        **launch_kwargs,
    )
def make_data_reader(num_classes=1000):
    """Build the ImageNet data reader message.

    The message is parsed from ``data_reader.prototext`` located next to
    this file, then its first reader is pointed at the training data and
    its second reader at the validation data.

    Args:
        num_classes: Number of ImageNet classes. Only forwarded to the
            path helpers on the 'lc' compute center.

    Returns:
        The ``data_reader`` field of the parsed message, with data paths
        filled in.

    Raises:
        RuntimeError: The compute center is neither 'lc' nor 'nersc'.
        FileNotFoundError: A data directory or label file is not
            accessible.
    """
    # Parse the prototext that sits alongside this script
    script_dir = os.path.dirname(os.path.realpath(__file__))
    with open(os.path.join(script_dir, 'data_reader.prototext'), 'r') as f:
        prototext = f.read()
    parsed = lbann.lbann_pb2.LbannPB()
    google.protobuf.text_format.Merge(prototext, parsed)
    message = parsed.data_reader

    # ImageNet locations are only known for some compute centers; the
    # branches differ in the helper module and in whether the class count
    # is passed along.
    compute_center = lbann.contrib.launcher.compute_center()
    if compute_center == 'lc':
        from lbann.contrib.lc.paths import imagenet_dir, imagenet_labels
        path_options = {'num_classes': num_classes}
    elif compute_center == 'nersc':
        from lbann.contrib.nersc.paths import imagenet_dir, imagenet_labels
        path_options = {}
    else:
        raise RuntimeError(
            f'ImageNet data paths are unknown for current compute center ({compute_center})'
        )
    train_data_dir = imagenet_dir(data_set='train', **path_options)
    train_label_file = imagenet_labels(data_set='train', **path_options)
    test_data_dir = imagenet_dir(data_set='val', **path_options)
    test_label_file = imagenet_labels(data_set='val', **path_options)

    # Fail early if any location is not accessible
    accessibility_checks = (
        (os.path.isdir, train_data_dir),
        (os.path.isfile, train_label_file),
        (os.path.isdir, test_data_dir),
        (os.path.isfile, test_label_file),
    )
    for is_accessible, path in accessibility_checks:
        if not is_accessible(path):
            raise FileNotFoundError('could not access {}'.format(path))

    # Reader 0 gets the training data, reader 1 the validation data
    reader_paths = (
        (train_data_dir, train_label_file),
        (test_data_dir, test_label_file),
    )
    for index, (data_dir, label_file) in enumerate(reader_paths):
        message.reader[index].data_filedir = data_dir
        message.reader[index].data_filename = label_file
    return message
# Parse the data reader prototext and keep only its data_reader field
data_reader_proto = lbann.lbann_pb2.LbannPB()
with open(args.data_reader, 'r') as f:
    txtf.Merge(f.read(), data_reader_proto)
data_reader_proto = data_reader_proto.data_reader

if args.prototext:
    # Export the experiment description instead of launching it
    lbann.proto.save_prototext(
        args.prototext,
        model=model,
        optimizer=opt,
        data_reader=data_reader_proto,
    )
else:
    # Launch the experiment through the LC launcher
    from lbann.contrib.lc.paths import imagenet_dir, imagenet_labels
    import lbann.contrib.lc.launcher

    kwargs = lbann.contrib.args.get_scheduler_kwargs(args)
    classes = args.num_labels
    # Path order matches the placeholders: train dir, train labels,
    # val dir, val labels
    kwargs['lbann_args'] = (
        '--data_filedir_train={} --data_filename_train={} '
        '--data_filedir_test={} --data_filename_test={}'
    ).format(*[
        locate(data_set=data_set, num_classes=classes)
        for data_set in ('train', 'val')
        for locate in (imagenet_dir, imagenet_labels)
    ])
    lbann.contrib.lc.launcher.run(
        model,
        data_reader_proto,
        opt,
        job_name='lbann_alexnet',
        **kwargs,
    )