tmp_r += 1 return rank_in_pdb, size_in_pdb, ranks_in_pdb if __name__=="__main__": from mpi4py import MPI comm = MPI.COMM_WORLD rank = comm.Get_rank() size = comm.Get_size() workaround_nt = int(os.environ.get("OMP_NUM_THREADS",1)) omptbx.omp_set_num_threads(workaround_nt) print("## hello from rank %d of %d"%(rank,size),"with omp_threads=",omp_get_num_procs()) ## assign jobs rank_in_pdb, size_in_pdb, ranks_in_pdb = jobAssign(size=size, num_pdb=simparams.num_pdbs, num_img=simparams.num_img[0]) import datetime start_elapse = time.time() if rank == 0: print("Rank 0 time", datetime.datetime.now()) from LS49.spectra.generate_spectra import spectra_simulation SS = spectra_simulation() C = microcrystal(Deff_A = simparams.Deff_A, length_um = simparams.length_um, beam_diameter_um = simparams.beam_diameter_um)
quick=quick, rank=rank) if __name__ == "__main__": from mpi4py import MPI comm = MPI.COMM_WORLD rank = comm.Get_rank() size = comm.Get_size() import os, omptbx workaround_nt = int(os.environ.get("OMP_NUM_THREADS", 1)) omptbx.omp_set_num_threads(workaround_nt) N_total = 100000 # number of items to simulate N_stride = size # total number of worker tasks print("hello from rank %d of %d" % (rank, size), "with omp_threads=", omp_get_num_procs()) import datetime start_elapse = time() if rank == 0: print("Rank 0 time", datetime.datetime.now()) from LS49.spectra.generate_spectra import spectra_simulation from LS49.sim.step5_pad import microcrystal print("hello2 from rank %d of %d" % (rank, size)) SS = spectra_simulation() C = microcrystal( Deff_A=4000, length_um=4., beam_diameter_um=1.0) # assume smaller than 10 um crystals mt = flex.mersenne_twister(seed=0) random_orientations = [] for iteration in range(N_total): random_orientations.append(mt.random_double_r3_rotation_matrix())
def run_batch_job(test_without_mpi=False):
    """Run this rank's share of the ``tst_one`` jobs under MPI.

    Rank 0 reads the merged amplitudes from ``cyto_init_merge.mtz`` and
    broadcasts them, together with an empty spectra dict, to all ranks.
    Every rank then takes the indices ``rank, rank + size, ...`` below
    ``params.N_total`` and hands them to ``tst_one`` one at a time, picked
    in random order.

    Args:
        test_without_mpi: when true (or when ``params.test_without_mpi`` is
            set), ``mpiEmulator`` is used instead of ``libtbx.mpi4py.MPI``.

    Side effects: sets ``NXMX_LOCAL_DATA`` in the environment, may redirect
    ``sys.stdout``/``sys.stderr`` to per-rank files, and may write a
    ``cpu_<rank>.prof`` profile.  ``start_elapse`` is read but not assigned
    here, so it must already exist at module scope.
    """
    params, _options = parse_input()
    if params.log.by_rank:
        import io
        import sys
    if params.log.rank_profile:
        import cProfile
        profiler = cProfile.Profile()
        profiler.enable()

    # workaround for getting master nexus
    os.environ["NXMX_LOCAL_DATA"] = params.nxmx_local_data

    use_emulator = test_without_mpi or params.test_without_mpi
    if use_emulator:
        from LS49.adse13_196.mock_mpi import mpiEmulator
        MPI = mpiEmulator()
    else:
        from libtbx.mpi4py import MPI

    comm = MPI.COMM_WORLD
    rank, size = comm.Get_rank(), comm.Get_size()

    import omptbx
    omptbx.omp_set_num_threads(int(os.environ.get("OMP_NUM_THREADS",1)))
    print("hello from rank %d of %d"%(rank,size),"with omp_threads=",omp_get_num_procs())

    import datetime
    start_comp = time()
    print(rank, time(), "finished with the calculation of channels, now construct single broadcast")

    # Only rank 0 builds the payload; everyone else receives it via bcast.
    payload = None
    if rank == 0:
        print("Rank 0 time", datetime.datetime.now())
        from iotbx.reflection_file_reader import any_reflection_file
        from LS49 import ls49_big_data
        merge_file = os.path.join(ls49_big_data,"adse13_228","cyto_init_merge.mtz")
        miller_arrays = any_reflection_file(merge_file).as_miller_arrays()
        Fmerge = miller_arrays[0].as_amplitude_array()
        print("Fmerge min/max = %f / %f" % (min(Fmerge.data()), max(Fmerge.data())))
        payload = dict(spectra={}, amplitudes=Fmerge)
    transmitted_info = comm.bcast(payload, root=0)
    comm.barrier()

    # One worker task per rank: stride through the job indices by ``size``.
    parcels = list(range(rank, params.N_total, size))

    print(rank, time(), "finished with single broadcast, now set up the rank logger")

    if params.log.by_rank:
        log_dir = os.path.expandvars(params.log.outdir)
        os.makedirs(log_dir, exist_ok=True)
        log_path = os.path.join(log_dir,"rank_%d.log"%rank)
        error_path = os.path.join(log_dir,"rank_%d.err"%rank)
        # Unbuffered append-mode files wrapped as write-through text streams.
        sys.stdout = io.TextIOWrapper(open(log_path,'ab', 0), write_through=True)
        sys.stderr = io.TextIOWrapper(open(error_path,'ab', 0), write_through=True)

    print(rank, time(), "finished with the rank logger, now construct the GPU cache container")

    try:
        from simtbx.gpu import gpu_energy_channels
        # singleton will instantiate, regardless of cuda, device count, or exascale API
        gpu_channels_singleton = gpu_energy_channels(
            deviceId=rank % params.devices_per_node)
    except ImportError:
        gpu_channels_singleton = None
    comm.barrier()

    import random
    while parcels:
        idx = random.choice(parcels)
        cache_time = time()
        print("idx------start-------->",idx,"rank",rank,time())
        # if rank==0: os.system("nvidia-smi")
        tst_one(
            i_exp=idx,
            spectra=transmitted_info["spectra"],
            Fmerge=transmitted_info["amplitudes"],
            gpu_channels_singleton=gpu_channels_singleton,
            rank=rank,
            params=params,
        )
        parcels.remove(idx)
        print("idx------finis-------->",idx,"rank",rank,time(),"elapsed",time()-cache_time)
    comm.barrier()

    now = datetime.datetime.now
    print("Overall rank",rank,"at",now(),
          "seconds elapsed after srun startup %.3f"%(time()-start_elapse))
    print("Overall rank",rank,"at",now(),
          "seconds elapsed after Python imports %.3f"%(time()-start_comp))

    if params.log.rank_profile:
        profiler.disable()
        profiler.dump_stats("cpu_%d.prof"%rank)
def run_LY99_batch(test_without_mpi=False):
    """Compute structure-factor channels across ranks, then simulate images.

    Steps, in order:
      1. Each rank computes ``GF.get_amplitudes()`` for the wavelength
         channels ``rank, rank + size, ...`` out of 100; rank 0 gathers and
         merges the per-rank dicts.
      2. Rank 0 builds the spectra, crystal and orientation objects and
         broadcasts them with the merged channels to all ranks.
      3. Each rank calls ``tst_one`` for the image indices
         ``rank, rank + size, ...`` below ``N_SIM``, picked in random order.

    Args:
        test_without_mpi: when true, ``mpiEmulator`` is used instead of
            ``libtbx.mpi4py.MPI``.

    Environment variables read here: ``LOG_BY_RANK`` (default 0),
    ``RANK_PROFILE`` (default 1), ``OMP_NUM_THREADS`` (default 1),
    ``N_SIM`` (required) and ``DEVICES_PER_NODE`` (default 1).

    Raises:
        KeyError: if ``N_SIM`` is not set in the environment.

    ``start_elapse`` is read but not assigned here, so it must already
    exist at module scope.
    """
    params, options = parse_input()
    log_by_rank = bool(int(os.environ.get("LOG_BY_RANK", 0)))
    rank_profile = bool(int(os.environ.get("RANK_PROFILE", 1)))
    if log_by_rank:
        import io
        import sys
    if rank_profile:
        import cProfile
        pr = cProfile.Profile()
        pr.enable()

    if test_without_mpi:
        from LS49.adse13_196.mock_mpi import mpiEmulator
        MPI = mpiEmulator()
    else:
        from libtbx.mpi4py import MPI

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()

    import omptbx
    workaround_nt = int(os.environ.get("OMP_NUM_THREADS", 1))
    omptbx.omp_set_num_threads(workaround_nt)
    N_total = int(os.environ["N_SIM"])  # number of items to simulate
    print("hello from rank %d of %d" % (rank, size), "with omp_threads=", omp_get_num_procs())
    import datetime
    start_comp = time()

    # now inside the Python imports, begin energy channel calculation
    wavelength_A = 1.74  # general ballpark X-ray wavelength in Angstroms
    wavlen = flex.double([12398.425 / (7070.5 + w) for w in range(100)])
    direct_algo_res_limit = 1.7

    local_data = data()  # later put this through broadcast

    GF = gen_fmodel(resolution=direct_algo_res_limit,
                    pdb_text=local_data.get("pdb_lines"),
                    algorithm="fft", wavelength=wavelength_A)
    GF.set_k_sol(0.435)
    GF.make_P1_primitive()

    # Generating sf for my wavelengths.  Striding from ``rank`` by ``size``
    # visits exactly the channels with ``x % size == rank`` and is empty for
    # ranks at or beyond the channel count.
    sfall_channels = {}
    for x in range(rank, len(wavlen), size):
        GF.reset_wavelength(wavlen[x])
        GF.reset_specific_at_wavelength(
            label_has="FE1", tables=local_data.get("Fe_oxidized_model"),
            newvalue=wavlen[x])
        GF.reset_specific_at_wavelength(
            label_has="FE2", tables=local_data.get("Fe_reduced_model"),
            newvalue=wavlen[x])
        sfall_channels[x] = GF.get_amplitudes()

    reports = comm.gather(sfall_channels, root=0)
    if rank == 0:
        # Merge every rank's partial channel dict into one.
        sfall_channels = {}
        for report in reports:
            sfall_channels.update(report)
    comm.barrier()

    print(rank, time(),
          "finished with the calculation of channels, now construct single broadcast")

    if rank == 0:
        print("Rank 0 time", datetime.datetime.now())
        from LS49.spectra.generate_spectra import spectra_simulation
        from LS49.adse13_196.revapi.LY99_pad import microcrystal
        print("hello2 from rank %d of %d" % (rank, size))
        SS = spectra_simulation()
        C = microcrystal(Deff_A=4000, length_um=4.,
                         beam_diameter_um=1.0)  # assume smaller than 10 um crystals
        from LS49 import legacy_random_orientations
        random_orientations = legacy_random_orientations(N_total)
        transmitted_info = dict(spectra=SS,
                                crystal=C,
                                sfall_info=sfall_channels,
                                random_orientations=random_orientations)
    else:
        transmitted_info = None
    transmitted_info = comm.bcast(transmitted_info, root=0)
    comm.barrier()

    # One worker task per rank: stride through the image indices by ``size``.
    parcels = list(range(rank, N_total, size))

    print(rank, time(), "finished with single broadcast, now set up the rank logger")

    if log_by_rank:
        expand_dir = os.path.expandvars(params.logger.outdir)
        # Create the log directory first so open() below cannot fail on a
        # missing path; exist_ok tolerates other ranks creating it at the
        # same time.  An empty path means the current directory.
        if expand_dir:
            os.makedirs(expand_dir, exist_ok=True)
        log_path = os.path.join(expand_dir, "rank_%d.log" % rank)
        error_path = os.path.join(expand_dir, "rank_%d.err" % rank)
        #print("Rank %d redirecting stdout/stderr to"%rank, log_path, error_path)
        sys.stdout = io.TextIOWrapper(open(log_path, 'ab', 0), write_through=True)
        sys.stderr = io.TextIOWrapper(open(error_path, 'ab', 0), write_through=True)

    print(rank, time(),
          "finished with the rank logger, now construct the GPU cache container")

    import random
    gpu_instance = get_exascale("gpu_instance", params.context)
    gpu_energy_channels = get_exascale("gpu_energy_channels", params.context)

    gpu_run = gpu_instance(deviceId=rank % int(os.environ.get("DEVICES_PER_NODE", 1)))
    gpu_channels_singleton = gpu_energy_channels(deviceId=gpu_run.get_deviceID())
    # singleton will instantiate, regardless of gpu, device count, or exascale API

    comm.barrier()
    while parcels:
        idx = random.choice(parcels)
        cache_time = time()
        print("idx------start-------->", idx, "rank", rank, time())
        # if rank==0: os.system("nvidia-smi")
        tst_one(
            image=idx,
            spectra=transmitted_info["spectra"],
            crystal=transmitted_info["crystal"],
            random_orientation=transmitted_info["random_orientations"][idx],
            sfall_channels=transmitted_info["sfall_info"],
            gpu_channels_singleton=gpu_channels_singleton,
            rank=rank,
            params=params)
        parcels.remove(idx)
        print("idx------finis-------->", idx, "rank", rank, time(),
              "elapsed", time() - cache_time)
    comm.barrier()
    # avoid Kokkos allocation "device_Fhkl" being deallocated after
    # Kokkos::finalize was called
    del gpu_channels_singleton
    print("Overall rank", rank, "at", datetime.datetime.now(),
          "seconds elapsed after srun startup %.3f" % (time() - start_elapse))
    print("Overall rank", rank, "at", datetime.datetime.now(),
          "seconds elapsed after Python imports %.3f" % (time() - start_comp))
    if rank_profile:
        pr.disable()
        pr.dump_stats("cpu_%d.prof" % rank)
def find_peaks_clean(self):
    """Find candidate peak sites with ``clean_3d`` and store them on ``self``.

    A sampling-volume map is filled for the scan's angle ranges, and the
    squared real part of its 3D FFT is used as the "dirty beam".
    ``clean_3d`` is run with that beam against a copy of ``self.grid_real``.
    The returned peaks go through ``self.optimise_peaks`` and are then
    divided component-wise by ``self.gridding``.

    Reads ``self.params`` (``nproc``, ``fft3d.reciprocal_space_grid.d_min``,
    ``scan_range``, ``b_iso``, ``debug``), ``self.gridding``,
    ``self.reflections``, ``self.reflections_used_for_indexing``,
    ``self.imagesets`` (first entry only) and ``self.grid_real``.

    Sets:
        self.sites: the peaks as fractions of the grid dimensions.
        self.volumes: placeholder values ``n, n-1, ..., 1`` in the order the
            peaks were returned.
    """
    import omptbx
    # doesn't seem to be any benefit to using more than say 8 threads
    num_threads = min(8, omptbx.omp_get_num_procs(), self.params.nproc)
    omptbx.omp_set_num_threads(num_threads)

    gridding = self.gridding
    d_min = self.params.fft3d.reciprocal_space_grid.d_min
    rlgrid = 2 / (d_min * gridding[0])

    imageset = self.imagesets[0]  # XXX what about multiple imagesets?
    array_range = imageset.get_array_range()
    frame_number = self.reflections['xyzobs.px.value'].parts()[2]
    # Clamp the observed frame span to the imageset's array range.
    scan_range_min = max(int(math.floor(flex.min(frame_number))), array_range[0])
    scan_range_max = min(int(math.ceil(flex.max(frame_number))), array_range[1])
    scan_range = self.params.scan_range
    if not len(scan_range):
        scan_range = [[scan_range_min, scan_range_max]]

    scan = imageset.get_scan()
    angle_ranges = [
        [scan.get_angle_from_array_index(i, deg=False) for i in range_]
        for range_ in scan_range]

    grid = flex.double(flex.grid(gridding), 0)
    sampling_volume_map(grid, flex.vec2_double(angle_ranges),
                        imageset.get_beam().get_s0(),
                        imageset.get_goniometer().get_rotation_axis(),
                        rlgrid, d_min, self.params.b_iso)

    fft = fftpack.complex_to_complex_3d(gridding)
    grid_complex = flex.complex_double(
        reals=grid, imags=flex.double(grid.size(), 0))
    grid_transformed = fft.forward(grid_complex)
    grid_real = flex.pow2(flex.real(grid_transformed))

    gamma = 1
    n_peaks = 100  # XXX how many do we need?
    dirty_beam = grid_real
    # Work on a copy; it is written out afterwards as "clean.map" in debug
    # mode (presumably clean_3d modifies it in place - TODO confirm).
    dirty_map = self.grid_real.deep_copy()
    peaks = clean_3d(dirty_beam, dirty_map, n_peaks, gamma=gamma)

    reciprocal_lattice_points = self.reflections['rlp'].select(
        self.reflections_used_for_indexing)
    peaks = self.optimise_peaks(peaks, reciprocal_lattice_points)

    peaks_frac = flex.vec3_double()
    for p in peaks:
        peaks_frac.append((p[0] / gridding[0],
                           p[1] / gridding[1],
                           p[2] / gridding[2]))

    if self.params.debug:
        self.debug_write_ccp4_map(grid, "sampling_volume.map")
        self.debug_write_ccp4_map(grid_real, "sampling_volume_FFT.map")
        self.debug_write_ccp4_map(dirty_map, "clean.map")

    self.sites = peaks_frac
    # we don't really know the "volume" of the peaks, but this method should
    # find the peaks in order of their intensity (strongest first)
    self.volumes = flex.double(range(len(self.sites), 0, -1))
def run_batch_job(test_without_mpi=False):
    """Distribute ``thin_ds1`` jobs from rank 0 to the other MPI ranks.

    Rank 0 does no computation.  It waits for a rank to announce itself,
    replies with the next index from ``N_START .. N_START + N_total - 1``,
    and once the indices run out answers ``size - 1`` further requests with
    ``'endrun'``.  Every other rank loops: announce, receive, stop on
    ``'endrun'``, otherwise call ``thin_ds1``.

    Args:
        test_without_mpi: when true (or when ``params.test_without_mpi`` is
            set), ``mpiEmulator`` is used instead of ``libtbx.mpi4py.MPI``.

    Side effects: sets ``CCTBX_RECOMMEND_DEVICE`` in the environment, may
    redirect ``sys.stdout``/``sys.stderr`` to per-rank files, runs
    ``nvidia-smi`` on rank 0, and may write a ``cpu_<rank>.prof`` profile.
    ``start_elapse`` is read but not assigned here, so it must already exist
    at module scope.

    NOTE(review): rank 0 waits on ``recv`` for every parcel even when there
    are no other ranks - confirm how a single-rank run is meant to behave.
    """
    from LS49.adse13_187.cyto_batch import parse_input as cyto_batch_parse_input
    params, _options = cyto_batch_parse_input()
    if params.log.by_rank:
        import io
        import sys
    if params.log.rank_profile:
        import cProfile
        profiler = cProfile.Profile()
        profiler.enable()

    use_emulator = test_without_mpi or params.test_without_mpi
    if use_emulator:
        from LS49.adse13_196.mock_mpi import mpiEmulator
        MPI = mpiEmulator()
    else:
        from libtbx.mpi4py import MPI

    comm = MPI.COMM_WORLD
    rank, size = comm.Get_rank(), comm.Get_size()

    import omptbx
    omptbx.omp_set_num_threads(int(os.environ.get("OMP_NUM_THREADS",1)))
    N_stride = size  # total number of worker tasks
    print("hello from rank %d of %d"%(rank,size),"with omp_threads=",omp_get_num_procs())
    import datetime
    start_comp = time()

    if params.log.by_rank:
        log_dir = os.path.expandvars(params.log.outdir)
        os.makedirs(log_dir, exist_ok=True)
        log_path = os.path.join(log_dir,"rank_%d.log"%rank)
        error_path = os.path.join(log_dir,"rank_%d.err"%rank)
        # Unbuffered append-mode files wrapped as write-through text streams.
        sys.stdout = io.TextIOWrapper(open(log_path,'ab', 0), write_through=True)
        sys.stderr = io.TextIOWrapper(open(error_path,'ab', 0), write_through=True)

    print(rank, time(), "finished with the rank logger, now delgate parcels")

    devices_per_node = int(os.environ.get("CCTBX_DEVICE_PER_NODE",1))
    os.environ["CCTBX_RECOMMEND_DEVICE"] = "%d"%(rank % devices_per_node)
    print("rank", rank, "device", os.environ["CCTBX_RECOMMEND_DEVICE"])
    N_start = int(os.environ.get("N_START",0))

    comm.barrier()
    if rank != 0:
        # server process (does all the work)
        while True:
            # inform the client this worker is ready for an event
            comm.send(rank,dest=0)
            idx = comm.recv(source=0)
            if idx == 'endrun':
                break
            cache_time = time()
            print("idx------start-------->",idx,"rank",rank,time())
            thin_ds1(idx,frame_params=params)
            print("idx------finis-------->",idx,
                  "rank",rank,time(),"elapsed %.3fs"%(time()-cache_time))
    else:
        os.system("nvidia-smi")
        # client process (requests all the work)
        import random
        # Parcels are handed out in ascending order, one per incoming request.
        for idx in list(range(N_start,N_start + params.N_total)):
            requester = comm.recv(source=MPI.ANY_SOURCE)
            print("Sending parcel",idx,"to rank",requester)
            comm.send(idx,dest=requester)
        # finally send a stop command to each process
        for _ in range(size-1):
            requester = comm.recv(source=MPI.ANY_SOURCE)
            comm.send('endrun',dest=requester)
    comm.barrier()

    now = datetime.datetime.now
    print("Overall rank",rank,"at",now(),
          "seconds elapsed after srun startup %.3f"%(time()-start_elapse))
    print("Overall rank",rank,"at",now(),
          "seconds elapsed after Python imports %.3f"%(time()-start_comp))

    if params.log.rank_profile:
        profiler.disable()
        profiler.dump_stats("cpu_%d.prof"%rank)