def run_LY99_batch(test_without_mpi=False):
  params, options = parse_input()
  log_by_rank = bool(int(os.environ.get("LOG_BY_RANK", 0)))
  rank_profile = bool(int(os.environ.get("RANK_PROFILE", 1)))
  if log_by_rank:
    import io, sys
  if rank_profile:
    import cProfile
    pr = cProfile.Profile()
    pr.enable()

  if test_without_mpi:
    from LS49.adse13_196.mock_mpi import mpiEmulator
    MPI = mpiEmulator()
  else:
    from libtbx.mpi4py import MPI

  comm = MPI.COMM_WORLD
  rank = comm.Get_rank()
  size = comm.Get_size()
  import omptbx
  workaround_nt = int(os.environ.get("OMP_NUM_THREADS", 1))
  omptbx.omp_set_num_threads(workaround_nt)
  N_total = int(os.environ["N_SIM"])  # number of items to simulate
  N_stride = size  # total number of worker tasks
  print("hello from rank %d of %d" % (rank, size), "with omp_threads=", omp_get_num_procs())
  import datetime
  start_comp = time()

  # now inside the Python imports, begin energy channel calculation
  wavelength_A = 1.74  # general ballpark X-ray wavelength in Angstroms
  wavlen = flex.double([12398.425 / (7070.5 + w) for w in range(100)])
  direct_algo_res_limit = 1.7

  local_data = data()  # later put this through broadcast
  GF = gen_fmodel(resolution=direct_algo_res_limit,
                  pdb_text=local_data.get("pdb_lines"),
                  algorithm="fft", wavelength=wavelength_A)
  GF.set_k_sol(0.435)
  GF.make_P1_primitive()

  # Generating sf for my wavelengths
  sfall_channels = {}
  for x in range(len(wavlen)):
    if rank > len(wavlen): break
    if x % size != rank: continue

    GF.reset_wavelength(wavlen[x])
    GF.reset_specific_at_wavelength(label_has="FE1",
                                    tables=local_data.get("Fe_oxidized_model"),
                                    newvalue=wavlen[x])
    GF.reset_specific_at_wavelength(label_has="FE2",
                                    tables=local_data.get("Fe_reduced_model"),
                                    newvalue=wavlen[x])
    sfall_channels[x] = GF.get_amplitudes()

  reports = comm.gather(sfall_channels, root=0)
  if rank == 0:
    sfall_channels = {}
    for report in reports:
      sfall_channels.update(report)
  comm.barrier()

  print(rank, time(),
        "finished with the calculation of channels, now construct single broadcast")

  if rank == 0:
    print("Rank 0 time", datetime.datetime.now())
    from LS49.spectra.generate_spectra import spectra_simulation
    from LS49.adse13_196.revapi.LY99_pad import microcrystal
    print("hello2 from rank %d of %d" % (rank, size))
    SS = spectra_simulation()
    C = microcrystal(Deff_A=4000, length_um=4.,
                     beam_diameter_um=1.0)  # assume smaller than 10 um crystals
    from LS49 import legacy_random_orientations
    random_orientations = legacy_random_orientations(N_total)
    transmitted_info = dict(spectra=SS,
                            crystal=C,
                            sfall_info=sfall_channels,
                            random_orientations=random_orientations)
  else:
    transmitted_info = None
  transmitted_info = comm.bcast(transmitted_info, root=0)
  comm.barrier()
  parcels = list(range(rank, N_total, N_stride))

  print(rank, time(), "finished with single broadcast, now set up the rank logger")

  if log_by_rank:
    expand_dir = os.path.expandvars(params.logger.outdir)
    log_path = os.path.join(expand_dir, "rank_%d.log" % rank)
    error_path = os.path.join(expand_dir, "rank_%d.err" % rank)
    #print("Rank %d redirecting stdout/stderr to"%rank, log_path, error_path)
    sys.stdout = io.TextIOWrapper(open(log_path, 'ab', 0), write_through=True)
    sys.stderr = io.TextIOWrapper(open(error_path, 'ab', 0), write_through=True)

  print(rank, time(),
        "finished with the rank logger, now construct the GPU cache container")

  import random
  gpu_instance = get_exascale("gpu_instance", params.context)
  gpu_energy_channels = get_exascale("gpu_energy_channels", params.context)

  gpu_run = gpu_instance(deviceId=rank % int(os.environ.get("DEVICES_PER_NODE", 1)))
  gpu_channels_singleton = gpu_energy_channels(deviceId=gpu_run.get_deviceID())
  # singleton will instantiate, regardless of gpu, device count, or exascale API

  comm.barrier()
  while len(parcels) > 0:
    idx = random.choice(parcels)
    cache_time = time()
    print("idx------start-------->", idx, "rank", rank, time())
    # if rank==0: os.system("nvidia-smi")
    tst_one(image=idx,
            spectra=transmitted_info["spectra"],
            crystal=transmitted_info["crystal"],
            random_orientation=transmitted_info["random_orientations"][idx],
            sfall_channels=transmitted_info["sfall_info"],
            gpu_channels_singleton=gpu_channels_singleton,
            rank=rank, params=params)
    parcels.remove(idx)
    print("idx------finis-------->", idx, "rank", rank, time(),
          "elapsed", time() - cache_time)
  comm.barrier()
  del gpu_channels_singleton
  # avoid Kokkos allocation "device_Fhkl" being deallocated after Kokkos::finalize was called
  print("Overall rank", rank, "at", datetime.datetime.now(),
        "seconds elapsed after srun startup %.3f" % (time() - start_elapse))
  print("Overall rank", rank, "at", datetime.datetime.now(),
        "seconds elapsed after Python imports %.3f" % (time() - start_comp))
  if rank_profile:
    pr.disable()
    pr.dump_stats("cpu_%d.prof" % rank)
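# Illustrative sketch (not part of the production entry points in this file):
# the structure-factor channel loop in run_LY99_batch uses a static round-robin
# pattern -- each rank computes the channels x with x % size == rank, rank 0
# gathers the per-rank dicts, and the merged result is broadcast back to every
# rank. The helper below reproduces that pattern with plain mpi4py; the
# compute_channel() stand-in and the fake payload are assumptions for
# illustration only, not LS49 API.

def _roundrobin_gather_bcast_sketch(n_channels=100):
  from mpi4py import MPI  # assumes an mpi4py installation; run under mpirun/srun
  comm = MPI.COMM_WORLD
  rank = comm.Get_rank()
  size = comm.Get_size()

  def compute_channel(x):  # stand-in for GF.get_amplitudes() at wavelength x
    return {"channel": x, "computed_by": rank}

  partial = {}
  for x in range(n_channels):
    if x % size != rank:   # static round-robin assignment, as in run_LY99_batch
      continue
    partial[x] = compute_channel(x)

  reports = comm.gather(partial, root=0)  # list of per-rank dicts on rank 0
  if rank == 0:
    merged = {}
    for report in reports:
      merged.update(report)
  else:
    merged = None
  merged = comm.bcast(merged, root=0)     # every rank now holds all channels
  comm.barrier()
  return merged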
def run_batch_job(test_without_mpi=False):
  params, options = parse_input()
  if params.log.by_rank:
    import io, sys
  if params.log.rank_profile:
    import cProfile
    pr = cProfile.Profile()
    pr.enable()

  # workaround for getting master nexus
  os.environ["NXMX_LOCAL_DATA"] = params.nxmx_local_data

  if test_without_mpi or params.test_without_mpi:
    from LS49.adse13_196.mock_mpi import mpiEmulator
    MPI = mpiEmulator()
  else:
    from libtbx.mpi4py import MPI

  comm = MPI.COMM_WORLD
  rank = comm.Get_rank()
  size = comm.Get_size()
  import omptbx
  workaround_nt = int(os.environ.get("OMP_NUM_THREADS", 1))
  omptbx.omp_set_num_threads(workaround_nt)
  N_stride = size  # total number of worker tasks
  print("hello from rank %d of %d" % (rank, size), "with omp_threads=", omp_get_num_procs())
  import datetime
  start_comp = time()

  print(rank, time(),
        "finished with the calculation of channels, now construct single broadcast")

  if rank == 0:
    print("Rank 0 time", datetime.datetime.now())
    spectrum_dict = {}
    from iotbx.reflection_file_reader import any_reflection_file
    from LS49 import ls49_big_data
    merge_file = os.path.join(ls49_big_data, "adse13_228", "cyto_init_merge.mtz")
    Fmerge = any_reflection_file(merge_file).as_miller_arrays()[0].as_amplitude_array()
    print("Fmerge min/max = %f / %f" % (min(Fmerge.data()), max(Fmerge.data())))
    transmitted_info = dict(spectra=spectrum_dict,
                            amplitudes=Fmerge)
  else:
    transmitted_info = None
  transmitted_info = comm.bcast(transmitted_info, root=0)
  comm.barrier()
  parcels = list(range(rank, params.N_total, N_stride))

  print(rank, time(), "finished with single broadcast, now set up the rank logger")

  if params.log.by_rank:
    expand_dir = os.path.expandvars(params.log.outdir)
    os.makedirs(expand_dir, exist_ok=True)
    log_path = os.path.join(expand_dir, "rank_%d.log" % rank)
    error_path = os.path.join(expand_dir, "rank_%d.err" % rank)
    #print("Rank %d redirecting stdout/stderr to"%rank, log_path, error_path)
    sys.stdout = io.TextIOWrapper(open(log_path, 'ab', 0), write_through=True)
    sys.stderr = io.TextIOWrapper(open(error_path, 'ab', 0), write_through=True)

  print(rank, time(),
        "finished with the rank logger, now construct the GPU cache container")

  try:
    from simtbx.gpu import gpu_energy_channels
    gpu_channels_singleton = gpu_energy_channels(deviceId=rank % params.devices_per_node)
    # singleton will instantiate, regardless of cuda, device count, or exascale API
  except ImportError:
    gpu_channels_singleton = None

  comm.barrier()
  import random
  while len(parcels) > 0:
    idx = random.choice(parcels)
    cache_time = time()
    print("idx------start-------->", idx, "rank", rank, time())
    # if rank==0: os.system("nvidia-smi")
    tst_one(i_exp=idx,
            spectra=transmitted_info["spectra"],
            Fmerge=transmitted_info["amplitudes"],
            gpu_channels_singleton=gpu_channels_singleton,
            rank=rank, params=params)
    parcels.remove(idx)
    print("idx------finis-------->", idx, "rank", rank, time(),
          "elapsed", time() - cache_time)
  comm.barrier()
  print("Overall rank", rank, "at", datetime.datetime.now(),
        "seconds elapsed after srun startup %.3f" % (time() - start_elapse))
  print("Overall rank", rank, "at", datetime.datetime.now(),
        "seconds elapsed after Python imports %.3f" % (time() - start_comp))
  if params.log.rank_profile:
    pr.disable()
    pr.dump_stats("cpu_%d.prof" % rank)
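# Illustrative sketch: both run_batch_job variants redirect each rank's
# stdout/stderr to per-rank files through an unbuffered binary append stream
# wrapped in a TextIOWrapper, so output from concurrent MPI ranks stays
# separated and is flushed promptly. The helper below isolates that idiom with
# only the standard library; the output directory and file-name pattern are
# assumptions for illustration.

def _redirect_rank_output_sketch(rank, outdir="."):
  import io, os, sys
  os.makedirs(outdir, exist_ok=True)
  log_path = os.path.join(outdir, "rank_%d.log" % rank)
  err_path = os.path.join(outdir, "rank_%d.err" % rank)
  # buffering=0 is only valid for binary files; write_through=True pushes every
  # write straight to the unbuffered raw stream, so lines appear immediately.
  sys.stdout = io.TextIOWrapper(open(log_path, 'ab', 0), write_through=True)
  sys.stderr = io.TextIOWrapper(open(err_path, 'ab', 0), write_through=True)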
def run_batch_job(test_without_mpi=False):
  from LS49.adse13_187.cyto_batch import parse_input as cyto_batch_parse_input
  params, options = cyto_batch_parse_input()
  if params.log.by_rank:
    import io, sys
  if params.log.rank_profile:
    import cProfile
    pr = cProfile.Profile()
    pr.enable()

  if test_without_mpi or params.test_without_mpi:
    from LS49.adse13_196.mock_mpi import mpiEmulator
    MPI = mpiEmulator()
  else:
    from libtbx.mpi4py import MPI

  comm = MPI.COMM_WORLD
  rank = comm.Get_rank()
  size = comm.Get_size()
  import omptbx
  workaround_nt = int(os.environ.get("OMP_NUM_THREADS", 1))
  omptbx.omp_set_num_threads(workaround_nt)
  N_stride = size  # total number of worker tasks
  print("hello from rank %d of %d" % (rank, size), "with omp_threads=", omp_get_num_procs())
  import datetime
  start_comp = time()

  if params.log.by_rank:
    expand_dir = os.path.expandvars(params.log.outdir)
    os.makedirs(expand_dir, exist_ok=True)
    log_path = os.path.join(expand_dir, "rank_%d.log" % rank)
    error_path = os.path.join(expand_dir, "rank_%d.err" % rank)
    #print("Rank %d redirecting stdout/stderr to"%rank, log_path, error_path)
    sys.stdout = io.TextIOWrapper(open(log_path, 'ab', 0), write_through=True)
    sys.stderr = io.TextIOWrapper(open(error_path, 'ab', 0), write_through=True)

  print(rank, time(), "finished with the rank logger, now delegate parcels")
  os.environ["CCTBX_RECOMMEND_DEVICE"] = "%d" % (
      rank % int(os.environ.get("CCTBX_DEVICE_PER_NODE", 1)))
  print("rank", rank, "device", os.environ["CCTBX_RECOMMEND_DEVICE"])
  N_start = int(os.environ.get("N_START", 0))

  comm.barrier()
  if rank == 0:
    os.system("nvidia-smi")

    # client process (requests all the work)
    import random
    parcels = list(range(N_start, N_start + params.N_total))
    while len(parcels) > 0:
      idx = parcels[0]  # random.choice(parcels)
      rankreq = comm.recv(source=MPI.ANY_SOURCE)
      print("Sending parcel", idx, "to rank", rankreq)
      comm.send(idx, dest=rankreq)
      parcels.remove(idx)
    # finally send a stop command to each process
    for rankreq in range(size - 1):
      rankreq = comm.recv(source=MPI.ANY_SOURCE)
      comm.send('endrun', dest=rankreq)

  else:
    # server process (does all the work)
    while True:
      # inform the client this worker is ready for an event
      comm.send(rank, dest=0)
      idx = comm.recv(source=0)
      if idx == 'endrun':
        break
      cache_time = time()
      print("idx------start-------->", idx, "rank", rank, time())
      thin_ds1(idx, frame_params=params)
      print("idx------finis-------->", idx,
            "rank", rank, time(), "elapsed %.3fs" % (time() - cache_time))

  comm.barrier()
  print("Overall rank", rank, "at", datetime.datetime.now(),
        "seconds elapsed after srun startup %.3f" % (time() - start_elapse))
  print("Overall rank", rank, "at", datetime.datetime.now(),
        "seconds elapsed after Python imports %.3f" % (time() - start_comp))
  if params.log.rank_profile:
    pr.disable()
    pr.dump_stats("cpu_%d.prof" % rank)
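# Illustrative sketch: the adse13_187-style run_batch_job above replaces the
# static parcel list with dynamic load balancing -- rank 0 acts as a client
# that hands out one work index per request, and every other rank loops as a
# server: announce readiness, receive an index, process it, repeat until the
# 'endrun' sentinel arrives. The sketch below shows the same handshake with
# plain mpi4py; process_item() is a stand-in for the per-event work (thin_ds1)
# and is an assumption for illustration only.

def _client_server_dispatch_sketch(n_items=8):
  from mpi4py import MPI  # assumes mpi4py; run with at least 2 ranks
  comm = MPI.COMM_WORLD
  rank = comm.Get_rank()
  size = comm.Get_size()

  def process_item(idx):  # stand-in for the real per-event workload
    return idx * idx

  if rank == 0:
    # client: serve indices first-come, first-served, then stop each worker
    for idx in range(n_items):
      rankreq = comm.recv(source=MPI.ANY_SOURCE)
      comm.send(idx, dest=rankreq)
    for _ in range(size - 1):
      rankreq = comm.recv(source=MPI.ANY_SOURCE)
      comm.send('endrun', dest=rankreq)
  else:
    # server: request work until the sentinel is received
    while True:
      comm.send(rank, dest=0)
      idx = comm.recv(source=0)
      if idx == 'endrun':
        break
      process_item(idx)
  comm.barrier()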