class ContinuumFitContainerFiles(ContinuumFitContainer):
    # noinspection PyMissingConstructor
    def __init__(self, create_new=False, num_spectra=-1):
        # note: do NOT call super(ContinuumFitContainer, self);
        # we don't want to initialize a very large object in memory.
        if create_new:
            self.num_spectra = num_spectra
            self.np_spectrum = NpSpectrumContainer(readonly=False, num_spectra=num_spectra,
                                                   filename=settings.get_continuum_fit_npy())
            self.continuum_fit_metadata = table.Table()
            self.continuum_fit_metadata.add_columns(
                [table.Column(name='index', dtype='i8', unit=None, length=num_spectra),
                 table.Column(name='is_good_fit', dtype='b', unit=None, length=num_spectra),
                 table.Column(name='goodness_of_fit', dtype='f8', unit=None, length=num_spectra),
                 table.Column(name='snr', dtype='f8', unit=None, length=num_spectra)])
            # initialize file
            self.np_spectrum.zero()
        else:
            self.np_spectrum = NpSpectrumContainer(readonly=True,
                                                   filename=settings.get_continuum_fit_npy())
            self.continuum_fit_metadata = np.load(settings.get_continuum_fit_metadata_npy())
            self.num_spectra = self.np_spectrum.num_spectra

    def save(self):
        np.save(settings.get_continuum_fit_metadata_npy(), self.continuum_fit_metadata)
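A minimal usage sketch for the file-backed container, assuming the paths returned by settings.get_continuum_fit_npy() and settings.get_continuum_fit_metadata_npy() are writable. The accessor methods come from the base class ContinuumFitContainer, defined later in this section; ar_wavelength and ar_fit_flux are hypothetical input arrays.

# create a new file-backed container for 100 spectra and persist it
fit_container = ContinuumFitContainerFiles(create_new=True, num_spectra=100)
fit_container.set_wavelength(0, ar_wavelength)  # hypothetical input array
fit_container.set_flux(0, ar_fit_flux)          # hypothetical input array
fit_container.set_metadata(0, is_good_fit=True, goodness_of_fit=0.95, snr=4.2)
fit_container.save()

# later (possibly in another process), reopen the same data read-only
readonly_container = ContinuumFitContainerFiles(create_new=False)
if readonly_container.get_is_good_fit(0):
    ar_fit = readonly_container.get_flux(0)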
class ISMTransmittanceAccumulator:
    """
    Modify existing delta transmittance file.
    Replace forest with ISM spectra.
    It is intended to be used as a helper object called by mpi_accumulate.accumulate_over_spectra
    """

    def __init__(self, num_spectra):
        self.num_spectra = num_spectra
        self.forest_ism_file = NpSpectrumContainer(False, num_spectra=self.num_spectra,
                                                   filename=settings.get_forest_ism_npy(),
                                                   max_wavelength_count=1000)
        self.n = 0
        # initialize file
        self.forest_ism_file.zero()

    def accumulate(self, result_enum, ar_qso_indices_list, object_results):
        # unused parameter:
        del object_results
        for ar_chunk, ar_qso_indices in zip(result_enum, ar_qso_indices_list):
            forest_chunk = NpSpectrumContainer.from_np_array(ar_chunk, readonly=True)
            # j is a single spectrum view; n is its global QSO index
            for j, n in zip(NpSpectrumIterator(forest_chunk), ar_qso_indices):
                # if self.n >= self.num_spectra:
                #     break
                self.forest_ism_file.set_wavelength(n, j.get_wavelength())
                self.forest_ism_file.set_flux(n, j.get_flux())
                self.forest_ism_file.set_ivar(n, j.get_ivar())
                self.n += 1
            l_print_no_barrier("n =", self.n)
        l_print_no_barrier("n =", self.n)
        return self.return_result()

    def return_result(self):
        return self.n, None

    def finalize(self):
        pass
class DeltaTransmittanceAccumulator:
    """
    Add delta transmittance data to a single memory mapped file.
    It is intended to be used as a helper object called by mpi_accumulate.accumulate_over_spectra
    """

    def __init__(self, num_spectra):
        self.num_spectra = num_spectra
        self.delta_t_file = NpSpectrumContainer(False, num_spectra=self.num_spectra,
                                                filename=settings.get_delta_t_npy(),
                                                max_wavelength_count=1000)
        self.n = 0
        # initialize file
        self.delta_t_file.zero()

    def accumulate(self, result_enum, ar_qso_indices_list, object_results):
        del object_results
        for ar_delta_t, ar_qso_indices in zip(result_enum, ar_qso_indices_list):
            delta_t = NpSpectrumContainer.from_np_array(ar_delta_t, readonly=True)
            # j is a single spectrum view; n is its global QSO index
            for j, n in zip(NpSpectrumIterator(delta_t), ar_qso_indices):
                # if self.n >= self.num_spectra:
                #     break
                self.delta_t_file.set_wavelength(n, j.get_wavelength())
                self.delta_t_file.set_flux(n, j.get_flux())
                self.delta_t_file.set_ivar(n, j.get_ivar())
                self.n += 1
            l_print_no_barrier("n =", self.n)
        l_print_no_barrier("n =", self.n)
        return self.return_result()

    def return_result(self):
        return self.n, None

    def finalize(self):
        pass
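Both accumulators above implement the same implicit protocol: repeated accumulate() calls followed by return_result(), with finalize() as a no-op hook. The driver, mpi_accumulate.accumulate_over_spectra, is not shown in this section, so the following schematic loop is an assumption for illustration only; gathered_chunks stands in for whatever the MPI gather step actually yields.

# schematic driver loop (hypothetical; the real driver lives in mpi_accumulate)
accumulator = DeltaTransmittanceAccumulator(num_spectra=total_spectra)
for result_enum, ar_qso_indices_list, object_results in gathered_chunks:
    accumulator.accumulate(result_enum, ar_qso_indices_list, object_results)
accumulator.finalize()
num_written, _ = accumulator.return_result()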
def delta_transmittance_chunk(qso_record_table):
    start_offset = qso_record_table[0]['index']
    spectra = read_spectrum_hdf5.SpectraWithMetadata(qso_record_table, settings.get_qso_spectra_hdf5())
    continuum_fit_file = ContinuumFitContainerFiles(False)

    num_spectra = len(qso_record_table)
    delta_t = NpSpectrumContainer(False, num_spectra=num_spectra)
    # warning: np.ndarray is not initialized by default. zeroing manually.
    delta_t.zero()
    m = mean_transmittance.MeanTransmittance.from_file(settings.get_mean_transmittance_npy())
    # for debugging with a small data set:
    # m = median_transmittance.MedianTransmittance.from_file(settings.get_median_transmittance_npy())
    # ignore values with less than 20 sample points
    ar_z_mean_transmittance, ar_mean_transmittance = m.get_weighted_mean_with_minimum_count(20)
    # ar_z_mean_transmittance, ar_mean_transmittance = m.get_weighted_median_with_minimum_count(20, weighted=True)
    remove_dla = RemoveDlaSimple()

    pixel_weight = pixel_weight_coefficients.PixelWeight(pixel_weight_coefficients.DEFAULT_WEIGHT_Z_RANGE)
    for n in range(len(qso_record_table)):
        qso_spec_obj = spectra.return_spectrum(n)
        index = qso_spec_obj.qso_rec.index

        if not continuum_fit_file.get_is_good_fit(index):
            local_delta_stats['bad_fit'] += 1
            l_print_no_barrier("skipped QSO (bad fit): ", qso_spec_obj.qso_rec)
            continue

        ar_fit_spectrum = continuum_fit_file.get_flux(index)
        # we assume the fit spectrum uses the same wavelengths.

        lya_forest_transmittance = qso_transmittance(qso_spec_obj, ar_fit_spectrum, local_delta_stats,
                                                     downsample_factor=settings.get_forest_downsample_factor())
        ar_z = lya_forest_transmittance.ar_z
        if ar_z.size:
            # prepare the mean transmittance for the z range of this QSO
            ar_mean_flux_for_z_range = np.asarray(np.interp(ar_z, ar_z_mean_transmittance,
                                                            ar_mean_transmittance))

            # delta transmittance is the change in relative transmittance vs the mean;
            # therefore, subtract 1.
            ar_delta_t = lya_forest_transmittance.ar_transmittance / ar_mean_flux_for_z_range - 1

            # finish the error estimation, and save it
            ar_delta_t_ivar = pixel_weight.eval(lya_forest_transmittance.ar_ivar,
                                                ar_mean_flux_for_z_range * lya_forest_transmittance.ar_fit,
                                                ar_z)

            # simple DLA removal (without using a catalog)
            if settings.get_enable_simple_dla_removal():
                # remove DLA regions by setting the ivar of nearby pixels to 0
                ar_dla_mask = remove_dla.get_mask(ar_delta_t)
                if np.any(ar_dla_mask):
                    l_print_no_barrier("DLA(s) removed from QSO: ", qso_spec_obj.qso_rec)
                ar_delta_t_ivar[ar_dla_mask] = 0

            # ignore nan or infinite values (in case m_mean has incomplete data because of a low sample size)
            # Note: using the wavelength field to store redshift
            finite_mask = np.logical_and(np.isfinite(ar_delta_t), np.isfinite(ar_delta_t_ivar))
            finite_z = ar_z[finite_mask]
            finite_delta_t = ar_delta_t[finite_mask]
            finite_ivar = ar_delta_t_ivar[finite_mask]

            # detrend forests with a large enough range in comoving coordinates
            # (guard against an empty array in case all pixels are non-finite):
            finite_distances = cd.fast_comoving_distance(finite_z)
            if finite_distances.size and finite_distances[-1] - finite_distances[0] > 500:
                delta_t_boxcar = nu_boxcar(finite_distances, finite_delta_t,
                                           lambda c: c - 300, lambda c: c + 300,
                                           weights=finite_ivar)
                finite_delta_t = finite_delta_t - delta_t_boxcar

            delta_t.set_wavelength(n, finite_z)
            delta_t.set_flux(n, finite_delta_t)
            delta_t.set_ivar(n, finite_ivar)
        else:
            # empty record
            pass
        delta_transmittance_chunk.num_spec += 1

    l_print_no_barrier("finished chunk, num spectra:", delta_transmittance_chunk.num_spec,
                       " offset: ", start_offset)
    return delta_t.as_np_array(), None
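The core transformation in this chunk is interpolating the mean transmittance onto each QSO's redshift grid and then taking the relative deviation, delta_t = F / <F>(z) - 1. A self-contained numeric illustration of just that step, with toy values:

import numpy as np

# toy mean-transmittance curve on a coarse redshift grid
ar_z_mean = np.array([2.0, 2.5, 3.0])
ar_mean = np.array([0.85, 0.75, 0.62])

# pixels of one QSO's forest
ar_z = np.array([2.1, 2.4, 2.8])
ar_transmittance = np.array([0.90, 0.60, 0.70])

# interpolate the mean onto the QSO grid, then take the relative deviation
ar_mean_for_z = np.interp(ar_z, ar_z_mean, ar_mean)
ar_delta_t = ar_transmittance / ar_mean_for_z - 1
# ar_delta_t is positive where the pixel is more transparent than the mean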
def ism_transmittance_chunk(qso_record_table):
    start_offset = qso_record_table[0]['index']
    # spectra = read_spectrum_hdf5.SpectraWithMetadata(qso_record_table, settings.get_qso_spectra_hdf5())
    # continuum_fit_file = NpSpectrumContainer(True, filename=settings.get_continuum_fit_npy())
    delta_transmittance_file = NpSpectrumContainer(readonly=True,
                                                   filename=settings.get_delta_t_npy(),
                                                   max_wavelength_count=1000)

    num_spectra = len(qso_record_table)
    ism_delta_t = NpSpectrumContainer(False, num_spectra=num_spectra)
    # warning: np.ndarray is not initialized by default. zeroing manually.
    ism_delta_t.zero()
    n = 0
    for i in range(len(qso_record_table)):
        qso_rec = QSORecord.from_row(qso_record_table[i])
        index = qso_rec.index

        # read original delta transmittance
        ar_redshift = delta_transmittance_file.get_wavelength(index)
        # ar_flux = delta_transmittance_file.get_flux(index)
        ar_ivar = delta_transmittance_file.get_ivar(index)

        # get correction to ISM
        # ar_flux_new, ar_ivar_new, is_corrected = pre_process_spectrum.mw_lines.apply_correction(
        #     ar_wavelength, np.ones_like(ar_flux), ar_ivar, qso_rec.ra, qso_rec.dec)

        ar_wavelength = (ar_redshift + 1) * lya_center  # type: np.ndarray
        # limit maximum bin number because higher extinction bins are not reliable
        max_extinction_bin = min(20, ar_extinction_levels.size)

        if np.isfinite(qso_rec.extinction_g):
            extinction_bin = int(np.round(np.interp(qso_rec.extinction_g,
                                                    ar_extinction_levels[:max_extinction_bin],
                                                    np.arange(max_extinction_bin))))
        else:
            extinction_bin = 0

        l_print_no_barrier("extinction_bin = ", extinction_bin)
        ar_ism_resampled = np.interp(ar_wavelength,
                                     extinction_spectra_list[extinction_bin][0],
                                     extinction_spectra_list[extinction_bin][1],
                                     left=np.nan, right=np.nan)
        extinction = ar_extinction_levels[extinction_bin]
        # rescale according to QSO extinction
        l_print_no_barrier(qso_rec.extinction_g, extinction)
        ism_scale_factor = 1.
        ar_flux_new = (ar_ism_resampled - 1) * ism_scale_factor * qso_rec.extinction_g / extinction

        mask = np.logical_and(np.isfinite(ar_flux_new), ar_ivar)

        ism_delta_t.set_wavelength(i, ar_redshift[mask])
        # use reciprocal to get absorption spectrum, then subtract 1 to get the delta
        ism_delta_t.set_flux(i, ar_flux_new[mask])
        # ism_delta_t.set_flux(i, np.ones_like(ar_flux) * qso_rec.extinction_g)
        # use original ivar because we are not correcting an existing spectrum
        ism_delta_t.set_ivar(i, ar_ivar[mask])

        n += 1
    l_print_no_barrier("chunk n =", n, "offset =", start_offset)
    return ism_delta_t.as_np_array(), None
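The extinction-bin selection above maps a QSO's extinction value to the index of the nearest tabulated extinction level by interpolating against the index array and rounding. A small standalone illustration, assuming ar_extinction_levels is sorted in ascending order (the example values are made up):

import numpy as np

ar_extinction_levels = np.array([0.01, 0.03, 0.06, 0.10])  # assumed sorted ascending
max_extinction_bin = min(20, ar_extinction_levels.size)

extinction_g = 0.05
extinction_bin = int(np.round(np.interp(extinction_g,
                                        ar_extinction_levels[:max_extinction_bin],
                                        np.arange(max_extinction_bin))))
# 0.05 falls between levels 0.03 (index 1) and 0.06 (index 2), closer to 0.06,
# so extinction_bin == 2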
def profile_main():
    # initialize data sources
    qso_record_table = table.Table(np.load(settings.get_qso_metadata_npy()))
    if settings.get_ism_only_mode():
        delta_t_filename = settings.get_forest_ism_npy()
    else:
        delta_t_filename = settings.get_delta_t_npy()
    delta_t_file = NpSpectrumContainer(True, num_spectra=len(qso_record_table),
                                       filename=delta_t_filename, max_wavelength_count=1000)

    # prepare data for quicker access
    qso_record_list = [QSORecord.from_row(i) for i in qso_record_table]
    ar_ra = np.array([i.ra for i in qso_record_list])
    ar_dec = np.array([i.dec for i in qso_record_list])
    ar_z = np.array([i.z for i in qso_record_list])
    ar_distance = cd.fast_comoving_distance(ar_z)
    mpi_helper.r_print('QSO table size:', len(ar_distance))

    # TODO: find a more precise value instead of z=1.9
    # set maximum QSO angular separation to 200 Mpc/h (in comoving coordinates)
    # the article assumes h is measured in units of 100 km/s/Mpc
    radius_quantity = (200. * (100. * u.km / (u.Mpc * u.s)) / cd.H0)  # type: u.Quantity
    max_transverse_separation = radius_quantity.value
    max_parallel_separation = radius_quantity.value
    max_angular_separation = max_transverse_separation / (cd.comoving_distance(1.9) / u.radian)
    mpi_helper.r_print('maximum separation of QSOs:',
                       Angle(max_angular_separation).to_string(unit=u.degree))

    coord_set = coord.SkyCoord(ra=ar_ra * u.degree, dec=ar_dec * u.degree,
                               distance=ar_distance * u.Mpc)

    data_state = None
    computation_state = None

    # either initialize variables or load them to resume
    if settings.get_resume():
        if comm.rank == 0:
            # resume an existing state
            with open(settings.get_restartable_data_state_p(), 'rb') as f:
                data_state = pickle.load(f)  # type: DataState
            with open(settings.get_restartable_computation_state_p(), 'rb') as f:
                computation_state = pickle.load(f)  # type: ComputationState
    else:
        if comm.rank == 0:
            # initialize a new state
            # create a random permutation of the coordinate set
            # (this is done to balance the load on the nodes)
            new_coord_permutation = np.random.permutation(len(coord_set))
            # data_state should hold everything required to reproduce the exact same computation,
            # so that it is possible to restart it from the last completed bundle.
            # NOTE: currently there is no plan to check for consistency on load.
            # changing the input data before restarting will produce undefined results.
            data_state = DataState(mpi_comm_size=comm.size,
                                   coord_permutation=new_coord_permutation,
                                   max_angular_separation=max_angular_separation)
            computation_state = ComputationState(bundle_index=0, sub_chunk_index=0)
            with open(settings.get_restartable_data_state_p(), 'wb') as f:
                pickle.dump(data_state, f)

    # send state to all nodes:
    data_state = comm.bcast(data_state)
    computation_state = comm.bcast(computation_state)  # type: ComputationState

    if max_angular_separation != data_state.max_angular_separation:
        raise Exception("Cannot resume, angular separation has changed ({}->{})".format(
            data_state.max_angular_separation, max_angular_separation))
    if comm.size != data_state.mpi_comm_size:
        raise Exception("Cannot resume, MPI COMM size must be {}".format(
            data_state.mpi_comm_size))

    coord_permutation = data_state.coord_permutation
    first_sub_chunk_index = computation_state.sub_chunk_index

    # find all QSO pairs
    chunk_sizes, chunk_offsets = mpi_helper.get_chunks(len(coord_set), comm.size)

    local_start_index = chunk_offsets[comm.rank]
    local_end_index = local_start_index + chunk_sizes[comm.rank]

    if settings.get_enable_weighted_median_estimator():
        accumulator_type = calc_pixel_pairs.accumulator_types.histogram
        assert not settings.get_enable_weighted_mean_estimator(), \
            "Median and mean estimators are mutually exclusive."
        assert not settings.get_enable_estimator_subsamples(), \
            "Subsamples not supported for histogram."
    elif settings.get_enable_weighted_mean_estimator():
        if settings.get_enable_estimator_subsamples():
            accumulator_type = calc_pixel_pairs.accumulator_types.mean_subsample
        else:
            accumulator_type = calc_pixel_pairs.accumulator_types.mean
    else:
        assert False, "Either median or mean estimators must be specified."

    pixel_pairs_object = calc_pixel_pairs.PixelPairs(
        cd, max_transverse_separation, max_parallel_separation,
        accumulator_type=accumulator_type)
    # divide the work into sub chunks
    # Warning: the number of sub chunks must be identical for all nodes,
    # because gather is called after each sub chunk.
    # NOTE: we no longer divide by comm.size to make the sub chunk size independent
    # of the number of nodes, because pairs are generated in bundles,
    # instead of once at the beginning.
    num_sub_chunks_per_node = settings.get_mpi_num_sub_chunks()

    sub_chunk_helper = SubChunkHelper(pixel_pairs_object, settings.get_resume())
    for bundle_index, local_qso_pair_angles, local_qso_pairs in generate_pairs(
            ar_dec, ar_ra, coord_permutation, coord_set,
            local_end_index, local_start_index, max_angular_separation,
            bundle_start_index=computation_state.bundle_index):

        pixel_pair_sub_chunks = mpi_helper.get_chunks(local_qso_pairs.shape[0],
                                                      num_sub_chunks_per_node)
        sub_chunk_iterator = islice(
            enumerate(zip(pixel_pair_sub_chunks[0], pixel_pair_sub_chunks[1])),
            first_sub_chunk_index, None)

        # if resuming from a previous run, use the value in first_sub_chunk_index only once:
        first_sub_chunk_index = 0

        for sub_chunk_index, (i, j) in sub_chunk_iterator:
            # save computation state to allow restarting
            if comm.rank == 0:
                save_computation_state(bundle_index=bundle_index,
                                       sub_chunk_index=sub_chunk_index)

            sub_chunk_start = j
            sub_chunk_end = j + i

            mpi_helper.l_print("sub_chunk: size", i, ", starting at", j, ",",
                               sub_chunk_index, "out of", len(pixel_pair_sub_chunks[0]))
            sub_chunk_helper.add_pairs_in_sub_chunk(delta_t_file, local_qso_pair_angles,
                                                    local_qso_pairs[sub_chunk_start:sub_chunk_end],
                                                    pixel_pairs_object)

    # done. update computation state one last time with a very large bundle index
    if comm.rank == 0:
        save_computation_state(bundle_index=sys.maxsize, sub_chunk_index=sys.maxsize)
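save_computation_state is called above but not defined in this section. A plausible sketch of what it presumably does, given that ComputationState is constructed with bundle_index and sub_chunk_index and that the resume path unpickles it from settings.get_restartable_computation_state_p(); this is an assumption for illustration, not the project's actual implementation:

import pickle

def save_computation_state(bundle_index, sub_chunk_index):
    # hypothetical sketch: persist the resume point so a restarted run
    # can skip already-completed bundles and sub chunks
    computation_state = ComputationState(bundle_index=bundle_index,
                                         sub_chunk_index=sub_chunk_index)
    with open(settings.get_restartable_computation_state_p(), 'wb') as f:
        pickle.dump(computation_state, f)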
class ContinuumFitContainer(object):
    def __init__(self, num_spectra=-1):
        self.num_spectra = num_spectra
        self.np_spectrum = NpSpectrumContainer(readonly=False, num_spectra=num_spectra)
        self.continuum_fit_metadata = table.Table()
        self.continuum_fit_metadata.add_columns(
            [table.Column(name='index', dtype='i8', unit=None, length=num_spectra),
             table.Column(name='is_good_fit', dtype='b', unit=None, length=num_spectra),
             table.Column(name='goodness_of_fit', dtype='f8', unit=None, length=num_spectra),
             table.Column(name='snr', dtype='f8', unit=None, length=num_spectra)])
        # initialize array
        self.np_spectrum.zero()

    def get_wavelength(self, n):
        return self.np_spectrum.get_wavelength(n)

    def get_flux(self, n):
        return self.np_spectrum.get_flux(n)

    def set_wavelength(self, n, data):
        self.np_spectrum.set_wavelength(n, data)

    def set_flux(self, n, data):
        self.np_spectrum.set_flux(n, data)

    def set_metadata(self, n, is_good_fit, goodness_of_fit, snr):
        self.continuum_fit_metadata[n] = [n, is_good_fit, goodness_of_fit, snr]

    def copy_metadata(self, n, metadata):
        self.continuum_fit_metadata[n] = metadata

    def get_metadata(self, n):
        return self.continuum_fit_metadata[n]

    def get_is_good_fit(self, n):
        return self.get_metadata(n)['is_good_fit']

    def get_goodness_of_fit(self, n):
        return self.get_metadata(n)['goodness_of_fit']

    def get_snr(self, n):
        return self.get_metadata(n)['snr']

    @classmethod
    def from_np_array_and_object(cls, np_array, obj):
        # TODO: consider refactoring.
        np_spectrum = NpSpectrumContainer.from_np_array(np_array, readonly=True)
        new_instance = cls(num_spectra=np_spectrum.num_spectra)
        # replace spectrum container with existing data
        new_instance.np_spectrum = np_spectrum
        # replace metadata with existing metadata object
        assert type(new_instance.continuum_fit_metadata) == type(obj)
        new_instance.continuum_fit_metadata = obj
        return new_instance

    def as_object(self):
        return self.continuum_fit_metadata

    def as_np_array(self):
        return self.np_spectrum.as_np_array()
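from_np_array_and_object exists so a container can be reassembled after its spectrum array and metadata travel separately, for example through an MPI gather of as_np_array() and as_object(). A round-trip sketch, given a populated ContinuumFitContainer named fit_container (hypothetical; the transport step is elided):

# serialize for transport
np_array = fit_container.as_np_array()
metadata_obj = fit_container.as_object()

# ... send np_array and metadata_obj to another rank ...

# reassemble on the receiving side
rebuilt = ContinuumFitContainer.from_np_array_and_object(np_array, metadata_obj)
assert rebuilt.get_snr(0) == fit_container.get_snr(0)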
    mask = ar_ivar_total_ != 0
    return ar_z_[mask], ar_delta_t_weighted_[mask] / ar_ivar_total_[mask]


if __name__ == '__main__':
    # execute only on rank 0, since this is a simple IO-bound operation.
    comm.Barrier()
    if comm.rank != 0:
        exit()

    qso_record_table = table.Table(np.load(settings.get_qso_metadata_npy()))
    if settings.get_ism_only_mode():
        delta_t_filename = settings.get_forest_ism_npy()
    else:
        delta_t_filename = settings.get_delta_t_npy()
    delta_t_file = NpSpectrumContainer(readonly=False, create_new=False,
                                       num_spectra=len(qso_record_table),
                                       filename=delta_t_filename,
                                       max_wavelength_count=1000)
    ar_delta_t_weighted, ar_ivar_total, ar_z, n, ar_delta_t_median = update_mean(delta_t_file)
    if settings.get_enable_weighted_mean_estimator():
        remove_mean(delta_t_file, ar_delta_t_weighted, ar_ivar_total, ar_z)
    else:
        remove_median(delta_t_file, ar_delta_t_median, ar_z)
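The fragment at the top of this excerpt finishes an inverse-variance-weighted average: delta * ivar and ivar are co-added separately per redshift bin, and the ratio is taken only where the total weight is nonzero. A toy illustration of that masking step, with made-up values:

import numpy as np

ar_delta_t_weighted_ = np.array([0.5, 0.0, -0.3])  # sum of delta * ivar per z bin
ar_ivar_total_ = np.array([2.0, 0.0, 1.5])         # sum of ivar per z bin
ar_z_ = np.array([2.0, 2.5, 3.0])

mask = ar_ivar_total_ != 0
ar_z_masked = ar_z_[mask]
ar_weighted_mean = ar_delta_t_weighted_[mask] / ar_ivar_total_[mask]
# bins with zero total weight (here z=2.5) are dropped rather than dividing by zero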
def test_find_nearby_pixels(self):
    radius_quantity = (200. * (100. * u.km / (u.Mpc * u.s)) / cd.H0)  # type: u.Quantity
    radius = radius_quantity.value
    delta_t_file = NpSpectrumContainer(readonly=False, create_new=True,
                                       num_spectra=2, filename=None)
    ar_z0 = np.arange(1.95, 3.56, 0.002)
    delta_t_file.set_wavelength(0, ar_z0)
    delta_t_file.set_flux(0, np.sin(ar_z0 * 50))
    delta_t_file.set_ivar(0, ar_z0)
    ar_z1 = np.arange(1.94, 3.4, 0.002)
    delta_t_file.set_wavelength(1, ar_z1)
    delta_t_file.set_flux(1, np.sin(ar_z1 * 50))
    delta_t_file.set_ivar(1, ar_z1)
    pixel_pairs = calc_pixel_pairs.PixelPairs(cd, radius, radius,
                                              calc_pixel_pairs.accumulator_types.mean)
    qso_angle = 0.04
    bin_dims = np.array([NUM_BINS_X, NUM_BINS_Y, 1])
    bin_ranges = np.array([[0, 0, pixel_pairs.min_distance],
                           [pixel_pairs.max_parallel_separation,
                            pixel_pairs.max_transverse_separation,
                            pixel_pairs.max_distance]])
    pair_separation_bins_1 = bins_3d.Bins3D(dims=bin_dims, ranges=bin_ranges)
    pair_separation_bins_2 = bins_3d.Bins3D(dims=bin_dims, ranges=bin_ranges)
    pixel_pairs.find_nearby_pixels(accumulator=pair_separation_bins_1, qso_angle=qso_angle,
                                   spec1_index=0, spec2_index=1, delta_t_file=delta_t_file)
    pixel_pairs.find_nearby_pixels2(accumulator=pair_separation_bins_2, qso_angle=qso_angle,
                                    spec1_index=0, spec2_index=1, delta_t_file=delta_t_file)
    print(pair_separation_bins_1.ar_flux.sum(), pair_separation_bins_2.ar_flux.sum())
    print(pair_separation_bins_1.ar_count.sum(), pair_separation_bins_2.ar_count.sum())
    # the two implementations should accumulate (almost) identical results
    self.assertAlmostEqual(
        (pair_separation_bins_1.ar_flux - pair_separation_bins_2.ar_flux).sum(), 0, 6)
    self.assertAlmostEqual(
        (pair_separation_bins_1.ar_count - pair_separation_bins_2.ar_count).sum(), 0, 6)
    self.assertAlmostEqual(
        (pair_separation_bins_1.ar_weights - pair_separation_bins_2.ar_weights).sum(), 0, 6)
    plot = True
    if plot:
        # plt.set_cmap('gray')
        with np.errstate(divide='ignore', invalid='ignore'):
            ar_est = (np.sum(pair_separation_bins_1.ar_flux, axis=2) /
                      np.sum(pair_separation_bins_1.ar_weights, axis=2))
        plt.imshow(ar_est, interpolation='nearest')
        plt.show()