# ES stuff
params.no_copy = args.no_copy

# set number of gpus
params.ngpu = comm_size

# Set up the experiment directory (rank 0 creates it; all ranks use it)
baseDir = './expts/'
expDir = os.path.join(baseDir, args.config + '/' + str(run_num) + '/')
if comm_rank == 0:
    if not os.path.isdir(expDir):
        os.makedirs(expDir, exist_ok=True)
        os.makedirs(expDir + 'training_checkpoints/', exist_ok=True)
    logging_utils.log_to_file(logger_name=None, log_filename=os.path.join(expDir, 'out.log'))
    params.log()
    #args.tboard_writer = SummaryWriter(log_dir=os.path.join(expDir, 'logs/'))

params.experiment_dir = os.path.abspath(expDir)
params.checkpoint_path = os.path.join(params.experiment_dir, 'training_checkpoints/ckpt.tar')

# Resume automatically if a checkpoint already exists
if os.path.isfile(params.checkpoint_path):
    args.resuming = True

train(params, args, comm_rank, comm_local_rank)

#if comm_rank == 0:
#    args.tboard_writer.flush()
#    args.tboard_writer.close()

logging.info('DONE ---- rank %d', comm_rank)
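
# The driver above assumes comm_rank, comm_size, and comm_local_rank are set
# up earlier in the file. A minimal sketch of how they could be derived with
# mpi4py (an assumption; the actual launcher/communicator setup may differ):
def _get_comm_info():
    """Hypothetical helper: return (rank, world_size, local_rank) from MPI."""
    from mpi4py import MPI
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()
    # Local rank within the node; OMPI_COMM_WORLD_LOCAL_RANK is set by the
    # Open MPI launcher, so falling back to the global rank is only a guess.
    local_rank = int(os.environ.get('OMPI_COMM_WORLD_LOCAL_RANK', rank))
    return rank, size, local_rank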
if __name__ == '__main__':
    torch.backends.cudnn.benchmark = True

    if len(sys.argv) != 3:
        # logging.error takes a printf-style format string, not print-style args
        logging.error("Usage: %s configuration_YAML_file configuration", sys.argv[0])
        sys.exit(1)

    params = YParams(os.path.abspath(sys.argv[1]), sys.argv[2])

    if not os.path.exists(params.experiment_dir):
        os.makedirs(os.path.abspath(params.experiment_dir))

    logging_utils.log_to_file(logger_name=None,
                              log_filename=os.path.join(params.experiment_dir, 'out.log'))
    params.log()

    tboard_writer = SummaryWriter(log_dir=os.path.join(params.experiment_dir, 'logs/'))

    params.experiment_dir = os.path.abspath(params.experiment_dir)
    params.checkpoint_file = os.path.join(params.experiment_dir, 'checkpt.tar')

    # Seed RNGs for reproducibility when a seed is given in the config
    if params.seed:
        random.seed(params.seed)
        torch.manual_seed(params.seed)

    train(params, tboard_writer)

    tboard_writer.flush()
    tboard_writer.close()
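
# Example invocation, assuming this script is saved as train.py; the YAML
# path and the configuration name 'default' below are hypothetical:
#
#   python train.py ./config/experiment.yaml default
#
# Note: torch.manual_seed seeds the RNG on all devices (CPU and CUDA), so a
# separate torch.cuda.manual_seed call is not needed here.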