Example #1
def process(self, model_path):
    """Process a single sightline with the trained model.

    Follows the same algorithm as process_catalog, applied to one sightline.

    :param model_path: path prefix of the saved model; hyperparameters are
        read from model_path + "_hyperparams.json"
    :return: None; results are attached to self (prediction, dlas, etc.)
    """
    import json
    from dla_cnn.data_loader import scan_flux_sample
    from dla_cnn.localize_model import predictions_ann as predictions_ann_c2
    from dla_cnn.data_loader import compute_peaks, get_lam_data
    from dla_cnn.absorption import add_abs_to_sightline
    from dla_cnn.data_model.Prediction import Prediction

    # Fluxes: scan the spectrum into fixed-width samples for the model
    fluxes = scan_flux_sample(self.flux,
                              self.loglam,
                              self.z_qso,
                              -1,
                              stride=1)[0]
    # Model: load hyperparameters and run the localization network
    with open(model_path + "_hyperparams.json", 'r') as f:
        hyperparameters = json.load(f)
    loc_pred, loc_conf, offsets, density_data_flat = predictions_ann_c2(
        hyperparameters, fluxes, model_path)
    self.prediction = Prediction(loc_pred=loc_pred,
                                 loc_conf=loc_conf,
                                 offsets=offsets,
                                 density_data=density_data_flat)
    # Peaks: locate candidate absorbers in the confidence curve
    _ = compute_peaks(self)
    # Absorbers: classify peaks (DLA / sub-DLA / Ly-b) and attach them to self
    add_abs_to_sightline(self)
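
A minimal usage sketch for this method. The Id_DR12 import path and constructor
arguments are assumptions for illustration; process() itself only needs the
sightline's flux, loglam and z_qso attributes plus a model path prefix (to
which "_hyperparams.json" is appended).

# Hypothetical single-sightline usage
from dla_cnn.data_loader import read_sightline
from dla_cnn.data_model.Id_DR12 import Id_DR12  # assumed Id implementation

sightline = read_sightline(Id_DR12(4216, 55477, 302))  # placeholder plate/mjd/fiber
sightline.process("../models/localize_model")          # placeholder model prefix
print("DLAs found:", len(sightline.dlas))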
Example #2
import itertools
import json
import multiprocessing
import os
import timeit
from multiprocessing import Pool
from operator import methodcaller

import numpy as np

# These names come from the surrounding dla_cnn package; the exact import
# paths below are inferred from Example #1 and may need adjusting.
from dla_cnn.data_loader import (REST_RANGE, compute_peaks, read_sightline,
                                 scan_flux_sample)
from dla_cnn.data_model.Prediction import Prediction
from dla_cnn.localize_model import predictions_ann as predictions_ann_c2
from dla_cnn.Timer import Timer  # assumed location of the Timer context manager


def process_catalog(ids,
                    kernel_size,
                    model_path="",
                    debug=False,
                    CHUNK_SIZE=1000,
                    output_dir="../tmp/visuals/",
                    make_pdf=False,
                    num_cores=None):
    from dla_cnn.plots import generate_pdf
    from dla_cnn.absorption import add_abs_to_sightline
    if num_cores is None:
        num_cores = multiprocessing.cpu_count() - 1
    if debug:
        num_cores = 1
        p = None  # run everything serially in debug mode
    else:
        p = Pool(num_cores)  # a process pool we'll reuse across batches
    sightlines_processed_count = 0

    # One dict per sightline: classification-level info plus the DLAs found
    sightline_results = []

    ids.sort(key=methodcaller('id_string'))

    # We'll handle the full process in batches so as to not exceed memory constraints
    done = False
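    # np.arange(CHUNK_SIZE, len(ids), CHUNK_SIZE) gives the split points, so
    # np.array_split yields batches of at most CHUNK_SIZE sightlines: e.g.
    # CHUNK_SIZE=1000 with 2500 ids splits at [1000, 2000] into 1000/1000/500.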
    for ids_batch in np.array_split(
            ids, np.arange(CHUNK_SIZE, len(ids), CHUNK_SIZE)):
        num_sightlines = len(ids_batch)
        if done:
            break
        # # Workaround for segfaults occurring in matplotlib: kill the
        # # multiprocessing pool every iteration
        # if p is not None:
        #     p.close()
        #     p.join()
        #     time.sleep(5)

        report_timer = timeit.default_timer()

        # Batch read files
        process_timer = timeit.default_timer()
        print("Reading {:d} sightlines with {:d} cores".format(
            num_sightlines, num_cores))
        if debug:
            sightlines_batch = []
            for iid in ids_batch:
                sightlines_batch.append(read_sightline(iid))
        else:
            sightlines_batch = p.map(read_sightline, ids_batch)
        print("Spectrum/Fits read done in {:0.1f}".format(
            timeit.default_timer() - process_timer))

        ##################################################################
        # Process model
        ##################################################################
        print("Model predictions begin")
        fluxes = np.vstack([
            scan_flux_sample(s.flux, s.loglam, s.z_qso, -1, stride=1)[0]
            for s in sightlines_batch
        ])
        with open(model_path + "_hyperparams.json", 'r') as f:
            hyperparameters = json.load(f)
        loc_pred, loc_conf, offsets, density_data_flat = predictions_ann_c2(
            hyperparameters, fluxes, model_path)

        # Attach each sightline's slice of the predictions to the data model
        # for easier processing later. The model returns flat arrays covering
        # the whole batch, so split each into num_sightlines equal chunks.
        for sl, lp, lc, of, dd in zip(
                sightlines_batch, np.split(loc_pred, num_sightlines),
                np.split(loc_conf, num_sightlines),
                np.split(offsets, num_sightlines),
                np.split(density_data_flat, num_sightlines)):
            sl.prediction = Prediction(loc_pred=lp,
                                       loc_conf=lc,
                                       offsets=of,
                                       density_data=dd)

        with Timer(disp="Compute peaks"):
            sightlines_batch = map(compute_peaks, sightlines_batch)
            sightlines_batch.sort(key=lambda s: s.id.id_string())

        ##################################################################
        # Process output for each sightline
        ##################################################################
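        # Sanity check: the model emits one flat row per scanned position, and
        # REST_RANGE[2] is the number of scanned positions per sightline.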
        assert num_sightlines * REST_RANGE[2] == density_data_flat.shape[0]
        for sightline in sightlines_batch:
            # Smoothed localization confidence (computed here in the source;
            # not used further in this excerpt)
            smoothed_sample = sightline.prediction.smoothed_loc_conf()

            # Add absorbers
            add_abs_to_sightline(sightline)

            # Store classification level data in results
            sightline_json = {
                'id': sightline.id.id_string(),
                'ra': float(sightline.id.ra),
                'dec': float(sightline.id.dec),
                'z_qso': float(sightline.z_qso),
                'num_dlas': len(sightline.dlas),
                'num_subdlas': len(sightline.subdlas),
                'num_lyb': len(sightline.lybs),
                'dlas': sightline.dlas,
                'subdlas': sightline.subdlas,
                'lyb': sightline.lybs
            }

            sightline_results.append(sightline_json)

        ##################################################################
        # Process pdfs for each sightline
        ##################################################################
        os.makedirs(output_dir, exist_ok=True)

        # print "Processing PDFs"
        if make_pdf:
            p.map(generate_pdf,
                  zip(sightlines_batch, itertools.repeat(output_dir)))  # TODO

        print(
            "Processed {:d} sightlines for reporting on {:d} cores in {:0.2f}s"
            .format(num_sightlines, num_cores,
                    timeit.default_timer() - report_timer))

        runtime = timeit.default_timer() - process_timer
        print(
            "Processed {:d} of {:d} in {:0.0f}s - {:0.2f}s per sample".format(
                sightlines_processed_count + num_sightlines, len(ids), runtime,
                runtime / num_sightlines))
        sightlines_processed_count += num_sightlines
        if debug:
            done = True

    # Write the accumulated results to a JSON file
    with open(os.path.join(output_dir, "predictions.json"), 'w') as outfile:
        json.dump(sightline_results, outfile, indent=4)
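
A minimal driver sketch for Example #2. The Id_DR12 construction, kernel_size
value, and model path prefix below are illustrative assumptions, not a
confirmed dla_cnn API; process_catalog only needs a list of Id objects that
expose id_string().

# Hypothetical driver for process_catalog
from dla_cnn.data_model.Id_DR12 import Id_DR12  # assumed Id implementation

ids = [Id_DR12(4216, 55477, 302),  # plate, mjd, fiber: placeholder values
       Id_DR12(4216, 55477, 578)]
process_catalog(ids,
                kernel_size=400,                       # illustrative value
                model_path="../models/localize_model",  # prefix; "_hyperparams.json" is appended
                output_dir="../tmp/visuals/",
                num_cores=4)
# Classification results land in ../tmp/visuals/predictions.json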