Example No. 1
def profile_main():
    galaxy_metadata_file_npy = settings.get_galaxy_metadata_npy()
    histogram_output_npz = settings.get_ism_histogram_npz()

    galaxy_record_table = table.Table(np.load(galaxy_metadata_file_npy))

    num_extinction_bins = settings.get_num_extinction_bins()

    extinction_field_name = settings.get_extinction_source()

    ism_object_classes = settings.get_ism_object_classes()

    galaxy_table_mask = np.array(
        [i in ism_object_classes for i in galaxy_record_table['class']])
    galaxy_record_table = galaxy_record_table[galaxy_table_mask]

    # group results into extinction bins with roughly equal number of spectra.
    galaxy_record_table.sort([extinction_field_name])

    # remove objects with unknown extinction
    galaxy_record_table = galaxy_record_table[np.where(
        np.isfinite(galaxy_record_table[extinction_field_name]))]

    chunk_sizes, chunk_offsets = get_chunks(len(galaxy_record_table),
                                            num_extinction_bins)
    for i in range(num_extinction_bins):
        extinction_bin_start = chunk_offsets[i]
        extinction_bin_end = extinction_bin_start + chunk_sizes[i]

        extinction_bin_record_table = galaxy_record_table[
            extinction_bin_start:extinction_bin_end]

        # this should be done before plate sort
        group_parameters = {
            'extinction_bin_number':
            i,
            'extinction_minimum':
            extinction_bin_record_table[extinction_field_name][0],
            'extinction_maximum':
            extinction_bin_record_table[extinction_field_name][-1],
            'extinction_average':
            np.mean(extinction_bin_record_table[extinction_field_name]),
            'extinction_median':
            np.median(extinction_bin_record_table[extinction_field_name]),
        }

        # sort by plate to avoid constant switching of fits files (which are per plate).
        extinction_bin_record_table.sort(['plate', 'mjd', 'fiberID'])

        base_filename, file_extension = splitext(histogram_output_npz)
        histogram_output_filename = '{}_{:02d}{}'.format(
            base_filename, i, file_extension)

        r_print('Starting extinction bin {}'.format(i))
        calc_median_spectrum(extinction_bin_record_table,
                             histogram_output_filename,
                             group_parameters=group_parameters)
        r_print('Finished extinction bin {}'.format(i))
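
The get_chunks helper used above (and, as mpi_helper.get_chunks, in the later examples) is not shown on this page. Based only on how it is called, a minimal sketch of the assumed behaviour: split N records into roughly equal pieces and return the chunk sizes together with their start offsets. The real implementation is project specific.

# Hypothetical sketch of get_chunks; illustration only.
import numpy as np

def get_chunks_sketch(num_items, num_chunks):
    # chunk i covers indices [offsets[i], offsets[i] + sizes[i])
    sizes = np.full(num_chunks, num_items // num_chunks, dtype=int)
    sizes[:num_items % num_chunks] += 1  # spread the remainder over the first chunks
    offsets = np.concatenate(([0], np.cumsum(sizes)[:-1]))
    return sizes, offsets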
Example No. 2
def accumulate_over_spectra(func, accumulator):
    qso_record_table = table.Table(np.load(
        settings.get_qso_metadata_npy()))  # type: table.Table
    qso_record_count = len(qso_record_table)

    chunk_sizes, chunk_offsets = mpi_helper.get_chunks(qso_record_count,
                                                       comm.size)

    local_start_index = chunk_offsets[comm.rank]
    local_size = chunk_sizes[comm.rank]
    local_end_index = local_start_index + local_size
    if comm.rank == 0:
        global_acc = accumulator(qso_record_count)

    local_qso_record_table = itertools.islice(
        qso_record_table, int(local_start_index),
        int(local_end_index))  # type: Iterable[table.Row]
    l_print_no_barrier("-----", qso_record_count, local_start_index,
                       local_end_index, local_size)
    slice_size = settings.get_file_chunk_size()
    qso_chunks_iterable = enumerate(
        split_seq(slice_size, local_qso_record_table))
    for slice_number, qso_record_table_chunk in qso_chunks_iterable:
        local_result = func(qso_record_table_chunk)
        # all large data is stored in an array as the first tuple element.
        ar_local_result = local_result[0]
        # generic objects (slower) can be stored as the second tuple element.
        object_local_result = local_result[1]

        assert isinstance(ar_local_result, np.ndarray)
        ar_all_results = np.zeros(shape=(comm.size, ) +
                                  tuple(ar_local_result.shape))
        comm.Gatherv(ar_local_result, ar_all_results, root=0)
        ar_qso_indices = np.zeros(shape=(comm.size, slice_size), dtype=int)
        # noinspection PyTypeChecker
        comm.Gatherv(np.array([x['index'] for x in qso_record_table_chunk]),
                     ar_qso_indices)

        # metadata, or anything else that is small but may have complex data types, is transferred as objects:
        object_all_results = comm.gather(object_local_result)

        # "reduce" results
        if comm.rank == 0:
            global_acc.accumulate(ar_all_results, ar_qso_indices,
                                  object_all_results)
            global_acc.finalize()

    l_print_no_barrier("------------------------------")
    if comm.rank == 0:
        return global_acc.return_result()
    else:
        return None, None
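
The accumulator argument in this example is duck-typed: accumulate_over_spectra only calls accumulate, finalize and return_result on it. A minimal sketch of that interface, assuming (purely for illustration) that each per-QSO result is a single scalar; the project's real accumulator classes are more involved.

import numpy as np

class MeanAccumulatorSketch:
    # Hypothetical accumulator matching the interface used above; illustration only.
    def __init__(self, qso_record_count):
        self.ar_sum = np.zeros(qso_record_count)
        self.ar_count = np.zeros(qso_record_count)

    def accumulate(self, ar_all_results, ar_qso_indices, object_all_results):
        # one result block and one QSO-index block per MPI rank
        for ar_block, ar_indices in zip(ar_all_results, ar_qso_indices):
            self.ar_sum[ar_indices] += ar_block
            self.ar_count[ar_indices] += 1

    def finalize(self):
        pass

    def return_result(self):
        with np.errstate(divide='ignore', invalid='ignore'):
            return self.ar_sum / self.ar_count, self.ar_count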
Example No. 3
def profile_main():
    # initialize data sources
    qso_record_table = table.Table(np.load(settings.get_qso_metadata_npy()))
    if settings.get_ism_only_mode():
        delta_t_filename = settings.get_forest_ism_npy()
    else:
        delta_t_filename = settings.get_delta_t_npy()

    delta_t_file = NpSpectrumContainer(True,
                                       num_spectra=len(qso_record_table),
                                       filename=delta_t_filename,
                                       max_wavelength_count=1000)

    # prepare data for quicker access
    qso_record_list = [QSORecord.from_row(i) for i in qso_record_table]
    ar_ra = np.array([i.ra for i in qso_record_list])
    ar_dec = np.array([i.dec for i in qso_record_list])
    ar_z = np.array([i.z for i in qso_record_list])
    ar_distance = cd.fast_comoving_distance(ar_z)
    mpi_helper.r_print('QSO table size:', len(ar_distance))

    # TODO: find a more precise value instead of z=1.9
    # set maximum QSO angular separation to 200 Mpc/h (in co-moving coordinates)
    # the article assumes h is measured in units of 100 km/s/Mpc
    radius_quantity = (200. * (100. * u.km / (u.Mpc * u.s)) / cd.H0
                       )  # type: u.Quantity
    max_transverse_separation = radius_quantity.value
    max_parallel_separation = radius_quantity.value
    max_angular_separation = max_transverse_separation / (
        cd.comoving_distance(1.9) / u.radian)
    mpi_helper.r_print('maximum separation of QSOs:',
                       Angle(max_angular_separation).to_string(unit=u.degree))

    # print(ar_list)
    coord_set = coord.SkyCoord(ra=ar_ra * u.degree,
                               dec=ar_dec * u.degree,
                               distance=ar_distance * u.Mpc)

    data_state = None
    computation_state = None

    # either initialize variables or load them to resume
    if settings.get_resume():
        if comm.rank == 0:
            # resume an existing state

            data_state = pickle.load(
                open(settings.get_restartable_data_state_p(),
                     'rb'))  # type: DataState
            computation_state = pickle.load(
                open(settings.get_restartable_computation_state_p(),
                     'rb'))  # type: ComputationState
    else:
        if comm.rank == 0:
            # initialize a new state

            # create a random permutation of the coordinate set
            # (this is done to balance the load on the nodes)
            new_coord_permutation = np.random.permutation(len(coord_set))
            # data_state should hold everything required to reproduce the exact same computation,
            # so that it is possible to restart it from the last completed bundle.
            # NOTE: currently there is no plan to check for consistency on load.
            # changing the input data before restarting will produce undefined results.
            data_state = DataState(
                mpi_comm_size=comm.size,
                coord_permutation=new_coord_permutation,
                max_angular_separation=max_angular_separation)
            computation_state = ComputationState(bundle_index=0,
                                                 sub_chunk_index=0)

            pickle.dump(data_state,
                        open(settings.get_restartable_data_state_p(), 'wb'))

    # send state to all nodes:
    data_state = comm.bcast(data_state)
    computation_state = comm.bcast(computation_state)  # type: ComputationState

    if max_angular_separation != data_state.max_angular_separation:
        raise Exception(
            "Cannot resume, angular separation has changed ({}->{})".format(
                data_state.max_angular_separation, max_angular_separation))
    if comm.size != data_state.mpi_comm_size:
        raise Exception("Cannot resume, MPI COMM size must be {}".format(
            data_state.mpi_comm_size))

    coord_permutation = data_state.coord_permutation
    first_sub_chunk_index = computation_state.sub_chunk_index

    # find all QSO pairs
    chunk_sizes, chunk_offsets = mpi_helper.get_chunks(len(coord_set),
                                                       comm.size)

    local_start_index = chunk_offsets[comm.rank]
    local_end_index = local_start_index + chunk_sizes[comm.rank]

    if settings.get_enable_weighted_median_estimator():
        accumulator_type = calc_pixel_pairs.accumulator_types.histogram
        assert not settings.get_enable_weighted_mean_estimator(), \
            "Median and mean estimators are mutually exclusive."
        assert not settings.get_enable_estimator_subsamples(), \
            "Subsamples not supported for histogram."
    elif settings.get_enable_weighted_mean_estimator():
        if settings.get_enable_estimator_subsamples():
            accumulator_type = calc_pixel_pairs.accumulator_types.mean_subsample
        else:
            accumulator_type = calc_pixel_pairs.accumulator_types.mean
    else:
        assert False, "Either median or mean estimators must be specified."

    pixel_pairs_object = calc_pixel_pairs.PixelPairs(
        cd,
        max_transverse_separation,
        max_parallel_separation,
        accumulator_type=accumulator_type)
    # divide the work into sub chunks
    # Warning: the number of sub chunks must be identical for all nodes because gather is called after each sub chunk.
    # NOTE: we no longer divide by comm.size to make sub chunk size independent of number of nodes,
    #       because pairs are generated in bundles, instead of once at the beginning.
    num_sub_chunks_per_node = settings.get_mpi_num_sub_chunks()

    sub_chunk_helper = SubChunkHelper(pixel_pairs_object,
                                      settings.get_resume())
    for bundle_index, local_qso_pair_angles, local_qso_pairs in generate_pairs(
            ar_dec,
            ar_ra,
            coord_permutation,
            coord_set,
            local_end_index,
            local_start_index,
            max_angular_separation,
            bundle_start_index=computation_state.bundle_index):

        pixel_pair_sub_chunks = mpi_helper.get_chunks(local_qso_pairs.shape[0],
                                                      num_sub_chunks_per_node)
        sub_chunk_iterator = islice(
            enumerate(zip(pixel_pair_sub_chunks[0], pixel_pair_sub_chunks[1])),
            first_sub_chunk_index, None)

        # if resuming from a previous run, use the value in first_sub_chunk_index only once:
        first_sub_chunk_index = 0

        for sub_chunk_index, (i, j) in sub_chunk_iterator:
            # save computation state to allow restarting
            if comm.rank == 0:
                save_computation_state(bundle_index=bundle_index,
                                       sub_chunk_index=sub_chunk_index)

            sub_chunk_start = j
            sub_chunk_end = j + i
            mpi_helper.l_print("sub_chunk: size", i, ", starting at", j, ",",
                               sub_chunk_index, "out of",
                               len(pixel_pair_sub_chunks[0]))
            sub_chunk_helper.add_pairs_in_sub_chunk(
                delta_t_file, local_qso_pair_angles,
                local_qso_pairs[sub_chunk_start:sub_chunk_end],
                pixel_pairs_object)

        # done. update computation state one last time with a very large bundle index
        if comm.rank == 0:
            save_computation_state(bundle_index=sys.maxsize,
                                   sub_chunk_index=sys.maxsize)
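
The 200 Mpc/h cap on the transverse separation is converted to a maximum angle by dividing by the comoving distance at z = 1.9. A standalone check of that arithmetic using astropy's built-in Planck13 cosmology (the example above uses the project's own cd module and H0, so the exact figure is only indicative):

import astropy.units as u
from astropy.coordinates import Angle
from astropy.cosmology import Planck13

r_max = 200. * u.Mpc / Planck13.h            # 200 Mpc/h expressed in Mpc (~295 Mpc)
d_c = Planck13.comoving_distance(1.9)        # comoving distance to z = 1.9
max_angle = Angle((r_max / d_c).decompose() * u.rad)
print(max_angle.to_string(unit=u.degree))    # roughly 3 degrees with these parameters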
Example No. 4
def profile_main():
    # x = coord.SkyCoord(ra=10.68458*u.deg, dec=41.26917*u.deg, frame='icrs')
    # min_distance = cd.comoving_distance_transverse(2.1, **fidcosmo)
    # print('minimum distance', min_distance, 'Mpc/rad')

    # initialize data sources
    qso_record_table = table.Table(np.load(settings.get_qso_metadata_npy()))

    # prepare data for quicker access
    qso_record_list = [QSORecord.from_row(i) for i in qso_record_table]
    ar_ra = np.array([i.ra for i in qso_record_list])
    ar_dec = np.array([i.dec for i in qso_record_list])
    ar_z = np.array([i.z for i in qso_record_list])
    ar_extinction = np.array([i.extinction_g for i in qso_record_list])
    ar_distance = cd.fast_comoving_distance(ar_z)
    mpi_helper.r_print('QSO table size:', len(ar_distance))

    # TODO: find a more precise value instead of z=1.9
    # set maximum QSO angular separation to 200 Mpc/h (in co-moving coordinates)
    # the article assumes h is measured in units of 100 km/s/Mpc
    radius_quantity = (200. * (100. * u.km / (u.Mpc * u.s)) / cd.H0
                       )  # type: u.Quantity
    radius = radius_quantity.value
    max_angular_separation = radius / (cd.comoving_distance(1.9) / u.radian)
    mpi_helper.r_print('maximum separation of QSOs:',
                       Angle(max_angular_separation).to_string(unit=u.degree))

    # print(ar_list)
    coord_set = coord.SkyCoord(ra=ar_ra * u.degree,
                               dec=ar_dec * u.degree,
                               distance=ar_distance * u.Mpc)
    # print(coord_set)

    # find all QSO pairs
    chunk_sizes, chunk_offsets = mpi_helper.get_chunks(len(coord_set),
                                                       comm.size)

    local_start_index = chunk_offsets[comm.rank]
    local_end_index = local_start_index + chunk_sizes[comm.rank]
    mpi_helper.l_print('matching objects in range:', local_start_index, 'to',
                       local_end_index)
    # each node matches a range of objects against the full list.
    count = matching.search_around_sky(
        coord_set[local_start_index:local_end_index], coord_set,
        max_angular_separation)

    # search_around_sky returns indices into the input lists.
    # each node adds its offset to count[0] to get the QSO index in the original list;
    # count[1] (qso2) already holds the unmodified index into the full QSO list.
    # count[2] holds the pair separations, kept below as the angles vector.
    local_qso_index_1 = count[0] + local_start_index
    local_qso_index_2 = count[1]

    # find the mean ra,dec for each pair
    local_qso_ra_pairs = np.vstack(
        (ar_ra[local_qso_index_1], ar_ra[local_qso_index_2]))
    local_qso_dec_pairs = np.vstack(
        (ar_dec[local_qso_index_1], ar_dec[local_qso_index_2]))
    # we can safely assume the separations are small enough that the mean does not suffer
    # catastrophic cancellation, so checking the unit-radius value is not required
    local_pair_means_ra, local_pair_means_dec, _ = find_spherical_mean_deg(
        local_qso_ra_pairs, local_qso_dec_pairs, axis=0)

    sky_groups = SkyGroups(nside=settings.get_healpix_nside())
    group_id = sky_groups.get_group_ids(local_pair_means_ra,
                                        local_pair_means_dec)

    local_qso_pairs_with_unity = np.vstack(
        (local_qso_index_1, local_qso_index_2, group_id,
         np.arange(count[0].size)))

    local_qso_pair_angles = count[2].to(u.rad).value
    mpi_helper.l_print('number of QSO pairs (including identity pairs):',
                       count[0].size)
    mpi_helper.l_print('angle vector size:', local_qso_pair_angles.size)

    # remove pairs of the same QSO.
    # local_qso_pairs = local_qso_pairs_with_unity.T[local_qso_pairs_with_unity[1] != local_qso_pairs_with_unity[0]]

    # remove pairs of the same physical QSO (which may have different [plate, mjd, fiber]).
    # assume that QSOs within roughly 10 arc-seconds (5e-5 rad) are the same object.
    local_qso_pairs = local_qso_pairs_with_unity.T[
        local_qso_pair_angles > 5e-5]

    mpi_helper.l_print(
        'total number of redundant objects removed:',
        local_qso_pairs_with_unity.shape[1] - local_qso_pairs.shape[0] -
        chunk_sizes[comm.rank])

    # l_print(pairs)
    mpi_helper.l_print('number of QSO pairs:', local_qso_pairs.shape[0])
    # l_print('angle vector:', x[2])

    # divide the work into sub chunks
    # Warning: the number of sub chunks must be identical for all nodes because gather is called after each sub chunk.
    # divide by comm.size to make sub chunk size independent of number of nodes.
    num_sub_chunks_per_node = settings.get_mpi_num_sub_chunks() // comm.size
    pixel_pair_sub_chunks = mpi_helper.get_chunks(local_qso_pairs.shape[0],
                                                  num_sub_chunks_per_node)
    sub_chunk_helper = SubChunkHelper(ar_extinction)
    for i, j, k in zip(pixel_pair_sub_chunks[0], pixel_pair_sub_chunks[1],
                       itertools.count()):
        sub_chunk_start = j
        sub_chunk_end = j + i
        mpi_helper.l_print("sub_chunk: size", i, ", starting at", j, ",", k,
                           "out of", len(pixel_pair_sub_chunks[0]))
        sub_chunk_helper.add_pairs_in_sub_chunk(
            local_qso_pair_angles,
            local_qso_pairs[sub_chunk_start:sub_chunk_end])
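
find_spherical_mean_deg is project code; given how it is called here (RA/Dec pair arrays in degrees, an axis argument, and a third return value that the comment above relates to the unit-radius check), a plausible sketch is the usual unit-vector mean:

import numpy as np

def find_spherical_mean_deg_sketch(ar_ra_deg, ar_dec_deg, axis=0):
    # Hypothetical stand-in: average the unit vectors of each pair and convert
    # the mean vector back to (ra, dec); the mean-vector norm is returned as well.
    ra = np.radians(ar_ra_deg)
    dec = np.radians(ar_dec_deg)
    x = np.mean(np.cos(dec) * np.cos(ra), axis=axis)
    y = np.mean(np.cos(dec) * np.sin(ra), axis=axis)
    z = np.mean(np.sin(dec), axis=axis)
    norm = np.sqrt(x * x + y * y + z * z)
    mean_ra = np.degrees(np.arctan2(y, x)) % 360.
    mean_dec = np.degrees(np.arcsin(z / norm))
    return mean_ra, mean_dec, norm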
Example No. 5
def profile_main():
    galaxy_metadata_file_npy = settings.get_galaxy_metadata_npy()
    histogram_output_npz = settings.get_ism_real_median_npz()

    galaxy_record_table = table.Table(np.load(galaxy_metadata_file_npy))

    num_extinction_bins = settings.get_num_extinction_bins()

    extinction_field_name = settings.get_extinction_source()

    # group results into extinction bins with roughly equal number of spectra.
    galaxy_record_table.sort([extinction_field_name])

    # remove objects with unknown extinction
    galaxy_record_table = galaxy_record_table[np.where(
        np.isfinite(galaxy_record_table[extinction_field_name]))]

    # if comm.size > num_extinction_bins:
    #     raise Exception('too many MPI nodes')

    # split the work into 'jobs' for each MPI node.
    # a job is defined as a single extinction bin.
    # the index of every extinction bin is its job number.

    job_sizes, job_offsets = get_chunks(num_extinction_bins, comm.size)
    job_start = job_offsets[comm.rank]
    job_end = job_start + job_sizes[comm.rank]

    chunk_sizes, chunk_offsets = get_chunks(len(galaxy_record_table),
                                            num_extinction_bins)

    for i in range(job_start, job_end):
        extinction_bin_start = chunk_offsets[i]
        extinction_bin_end = extinction_bin_start + chunk_sizes[i]

        extinction_bin_record_table = galaxy_record_table[
            extinction_bin_start:extinction_bin_end]

        # this should be done before plate sort
        group_parameters = {
            'extinction_bin_number':
            i,
            'extinction_minimum':
            extinction_bin_record_table[extinction_field_name][0],
            'extinction_maximum':
            extinction_bin_record_table[extinction_field_name][-1],
            'extinction_mean':
            np.mean(extinction_bin_record_table[extinction_field_name]),
            'extinction_median':
            np.median(extinction_bin_record_table[extinction_field_name]),
        }

        # sort by plate to avoid constant switching of fits files (which are per plate).
        extinction_bin_record_table.sort(['plate', 'mjd', 'fiberID'])

        base_filename, file_extension = splitext(histogram_output_npz)
        output_filename = '{}_{:02d}{}'.format(base_filename, i,
                                               file_extension)

        l_print_no_barrier('Starting extinction bin {}'.format(i))
        calc_median_spectrum(extinction_bin_record_table,
                             output_filename,
                             group_parameters=group_parameters)
        l_print_no_barrier('Finished extinction bin {}'.format(i))

    for _ in barrier_sleep(comm, use_yield=True):
        l_print_no_barrier("waiting")
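
barrier_sleep lets a rank that finished early keep logging while it waits for the other ranks, instead of blocking silently in a plain Barrier. A hypothetical sketch using mpi4py's non-blocking barrier; the real helper may be implemented differently:

import time
from mpi4py import MPI

def barrier_sleep_sketch(comm, use_yield=False, interval=1.0):
    # Hypothetical stand-in: poll a non-blocking barrier, optionally yielding
    # control to the caller between polls so it can print status messages.
    request = comm.Ibarrier()
    while not request.Test():
        if use_yield:
            yield
        time.sleep(interval)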
Example No. 6
def calc_median_spectrum(galaxy_record_table, histogram_output_npz,
                         group_parameters):
    histogram = np.zeros(shape=(num_bins, spec_size))
    global_histogram = np.zeros(shape=(num_bins, spec_size))
    chunk_sizes, chunk_offsets = get_chunks(len(galaxy_record_table),
                                            comm.size)
    local_start_index = chunk_offsets[comm.rank]
    local_end_index = local_start_index + chunk_sizes[comm.rank]
    update_gather_mask = get_update_mask(num_update_gather,
                                         chunk_sizes[comm.rank])
    spectrum_iterator = enum_spectra(qso_record_table=galaxy_record_table[
        local_start_index:local_end_index],
                                     pre_sort=False,
                                     and_mask=np.uint32(0),
                                     or_mask=np.uint32(0))
    for n, spectrum in enumerate(spectrum_iterator):  # type: int,QSOData
        ar_flux = np.interp(ar_wavelength,
                            spectrum.ar_wavelength,
                            spectrum.ar_flux,
                            left=np.nan,
                            right=np.nan)
        ar_ivar = np.interp(ar_wavelength,
                            spectrum.ar_wavelength,
                            spectrum.ar_ivar,
                            left=np.nan,
                            right=np.nan)

        ar_trend = savgol_filter(ar_flux, detrend_window, polyorder=2)

        # de-trend the spectrum
        ar_flux /= ar_trend

        ar_flux_int = np.empty(shape=spec_size, dtype=int)
        ar_flux_int[:] = ((ar_flux - flux_min) * num_bins /
                          flux_range).astype(int)
        ar_flux_int[ar_flux_int >= num_bins] = num_bins - 1
        ar_flux_int[ar_flux_int < 0] = 0

        # noinspection PyArgumentList
        mask = np.logical_and.reduce(
            (np.isfinite(ar_flux), ar_ivar > 0, ar_trend > 2.))

        x = ar_flux_int[mask]
        y = np.arange(spec_size)[mask]
        # c = np.ones_like(y)
        c = ar_trend[mask]

        histogram[x, y] += c

        if update_gather_mask[n]:
            reduce_and_save(output_file=histogram_output_npz,
                            global_histogram=global_histogram,
                            histogram=histogram,
                            group_parameters=group_parameters)
            # l_print_no_barrier(n)
            list_n = comm.gather(n)
            if comm.rank == 0:
                r_print(sum(list_n))
    r_print('------------')
    reduce_and_save(output_file=histogram_output_npz,
                    global_histogram=global_histogram,
                    histogram=histogram,
                    group_parameters=group_parameters)
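
reduce_and_save is called both at the checkpoints selected by update_gather_mask and once at the end, so partial results reach the disk while the run is still in progress. A minimal sketch of what it presumably does, namely sum the per-rank histograms onto rank 0 and write a snapshot; the actual output layout is project specific:

import numpy as np
from mpi4py import MPI

def reduce_and_save_sketch(output_file, global_histogram, histogram,
                           group_parameters, comm=MPI.COMM_WORLD):
    # Hypothetical stand-in: element-wise sum of the per-rank histograms into
    # global_histogram on rank 0, then an intermediate snapshot to disk.
    comm.Reduce(histogram, global_histogram, op=MPI.SUM, root=0)
    if comm.rank == 0:
        np.savez(output_file,
                 histogram=global_histogram,
                 group_parameters=np.array(group_parameters, dtype=object))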