def process(self, model_path):
    """
    Run the DLA model on this single sightline; mirrors the algorithm in process_catalog.

    :param model_path: path prefix of the trained model; expects model_path + "_hyperparams.json" alongside it
    :return: None; results are stored in self.prediction and, via add_abs_to_sightline, in the absorber lists
    """
    import json
    from dla_cnn.data_loader import scan_flux_sample
    from dla_cnn.localize_model import predictions_ann as predictions_ann_c2
    from dla_cnn.data_loader import compute_peaks, get_lam_data
    # from dla_cnn.data_loader import add_abs_to_sightline
    from dla_cnn.absorption import add_abs_to_sightline
    from dla_cnn.data_model.Prediction import Prediction

    # Fluxes: sliding-window samples scanned across the sightline
    fluxes = scan_flux_sample(self.flux, self.loglam, self.z_qso, -1, stride=1)[0]

    # Model: load hyperparameters and run the localization network
    with open(model_path + "_hyperparams.json", 'r') as f:
        hyperparameters = json.load(f)
    loc_pred, loc_conf, offsets, density_data_flat = predictions_ann_c2(
        hyperparameters, fluxes, model_path)
    self.prediction = Prediction(loc_pred=loc_pred, loc_conf=loc_conf,
                                 offsets=offsets, density_data=density_data_flat)

    # Peaks
    _ = compute_peaks(self)

    # Absorbers
    add_abs_to_sightline(self)
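# Usage sketch (assumed, not part of the original module): process() is meant to be
# called on a sightline object that already carries flux, loglam and z_qso, e.g. one
# returned by read_sightline() as in process_catalog below. The id variable and the
# model path here are placeholders, not values shipped with this repository.
#
#   sightline = read_sightline(some_id)                  # some_id: any supported Id object
#   sightline.process(model_path="../models/my_model")   # prefix of "<model_path>_hyperparams.json"
#   print(len(sightline.dlas), "DLAs,", len(sightline.subdlas), "sub-DLAs")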
def process_catalog(ids, kernel_size, model_path="", debug=False,
                    CHUNK_SIZE=1000, output_dir="../tmp/visuals/",
                    make_pdf=False, num_cores=None):
    from dla_cnn.plots import generate_pdf
    from dla_cnn.absorption import add_abs_to_sightline

    if num_cores is None:
        num_cores = multiprocessing.cpu_count() - 1
    # num_cores = 24
    # p = None
    p = Pool(num_cores)  # a process pool we'll reuse across batches
    if debug:
        num_cores = 1
        p = None

    sightlines_processed_count = 0
    sightline_results = []  # list of dicts holding the per-sightline classification and its detected absorbers
    ids.sort(key=methodcaller('id_string'))

    # Handle the full catalog in batches so as to not exceed memory constraints
    done = False
    for sss, ids_batch in enumerate(
            np.array_split(ids, np.arange(CHUNK_SIZE, len(ids), CHUNK_SIZE))):
        num_sightlines = len(ids_batch)
        # if sss < 46:   # debugging
        #     sightlines_processed_count += num_sightlines
        #     continue
        if done:
            break

        # # Workaround for segfaults occurring in matplotlib: kill the multiprocessing pool every iteration
        # if p is not None:
        #     p.close()
        #     p.join()
        #     time.sleep(5)

        report_timer = timeit.default_timer()

        # Batch read files
        process_timer = timeit.default_timer()
        print("Reading {:d} sightlines with {:d} cores".format(num_sightlines, num_cores))
        if debug:
            sightlines_batch = []
            for iid in ids_batch:
                sightlines_batch.append(read_sightline(iid))
        else:
            sightlines_batch = p.map(read_sightline, ids_batch)
        print("Spectrum/FITS read done in {:0.1f}s".format(
            timeit.default_timer() - process_timer))

        ##################################################################
        # Process model
        ##################################################################
        print("Model predictions begin")
        fluxes = np.vstack([
            scan_flux_sample(s.flux, s.loglam, s.z_qso, -1, stride=1)[0]
            for s in sightlines_batch
        ])
        # fluxes = np.vstack([scan_flux_sample(s.flux, s.loglam, s.z_qso, -1, stride=1, testing=s)[0] for s in sightlines_batch])

        with open(model_path + "_hyperparams.json", 'r') as f:
            hyperparameters = json.load(f)
        loc_pred, loc_conf, offsets, density_data_flat = predictions_ann_c2(
            hyperparameters, fluxes, model_path)

        # Add results from predictions and peaks_data to the data model for easier processing later.
        # Split the flat prediction arrays back into equal per-sightline chunks and attach them
        for sl, lp, lc, of, dd in zip(sightlines_batch,
                                      np.split(loc_pred, num_sightlines),
                                      np.split(loc_conf, num_sightlines),
                                      np.split(offsets, num_sightlines),
                                      np.split(density_data_flat, num_sightlines)):
            sl.prediction = Prediction(loc_pred=lp, loc_conf=lc, offsets=of, density_data=dd)

        with Timer(disp="Compute peaks"):
            # map() returns an iterator in Python 3; materialize it so the list can be sorted in place
            sightlines_batch = list(map(compute_peaks, sightlines_batch))
            sightlines_batch.sort(key=lambda s: s.id.id_string())

        ##################################################################
        # Process output for each sightline
        ##################################################################
        assert num_sightlines * REST_RANGE[2] == density_data_flat.shape[0]
        for sightline in sightlines_batch:
            smoothed_sample = sightline.prediction.smoothed_loc_conf()

            # Add absorbers
            add_abs_to_sightline(sightline)

            # Store classification-level data in results
            sightline_json = {
                'id':          sightline.id.id_string(),
                'ra':          float(sightline.id.ra),
                'dec':         float(sightline.id.dec),
                'z_qso':       float(sightline.z_qso),
                'num_dlas':    len(sightline.dlas),
                'num_subdlas': len(sightline.subdlas),
                'num_lyb':     len(sightline.lybs),
                'dlas':        sightline.dlas,
                'subdlas':     sightline.subdlas,
                'lyb':         sightline.lybs
            }
            sightline_results.append(sightline_json)

        ##################################################################
        # Process PDFs for each sightline
        ##################################################################
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        # print("Processing PDFs")
        if make_pdf:
            p.map(generate_pdf, zip(sightlines_batch, itertools.repeat(output_dir)))  # TODO

        print("Processed {:d} sightlines for reporting on {:d} cores in {:0.2f}s".format(
            num_sightlines, num_cores, timeit.default_timer() - report_timer))

        runtime = timeit.default_timer() - process_timer
        print("Processed {:d} of {:d} in {:0.0f}s - {:0.2f}s per sample".format(
            sightlines_processed_count + num_sightlines, len(ids),
            runtime, runtime / num_sightlines))
        sightlines_processed_count += num_sightlines

        if debug:
            done = True

    # Write the accumulated per-sightline results as a JSON file
    with open(output_dir + "/predictions.json", 'w') as outfile:
        json.dump(sightline_results, outfile, indent=4)
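# Usage sketch (assumed, not part of the original module): a minimal driver for
# process_catalog(). The helper that builds `ids` is hypothetical; any list of Id
# objects accepted by read_sightline() and exposing id_string() will do, and the
# kernel_size, model_path and output_dir values below are placeholders.
#
#   ids = load_ids_from_catalog("../data/catalog.fits")    # hypothetical helper
#   process_catalog(ids, kernel_size=400,
#                   model_path="../models/my_model",        # prefix of "<model_path>_hyperparams.json"
#                   output_dir="../tmp/visuals/", make_pdf=False)
#   # per-sightline results are written to ../tmp/visuals/predictions.json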