def gen_gamma_params(path, save_path):
    """Generate the dataset of basic (1D) features of gamma events.

    Reads every ``gamma/*.xcd`` file below ``path``, collects eight ``rec.*``
    and three ``SimEvent.*`` fields per event, log-transforms three of the
    columns, shuffles the rows in place and writes the result to
    ``save_path/gamma_data.npy``.

    :param path: data directory that contains the ``gamma`` subdirectory
    :param save_path: directory the array is written to (created if missing)
    :raises ValueError: if no event was read (no matching file, or every event
        was filtered out) or the collected data is not ``(N, 11)``
    """
    # Normal 1D (no conditional) uses only the first eight fields:
    # params = "rec.logNPE, rec.nHit, rec.nTankHit, rec.zenithAngle, rec.azimuthAngle, rec.coreX, rec.coreY, rec.CxPE40"
    # For conditional (adjacent literals keep the separator a plain ", ",
    # like every other field in the list)
    params = ("rec.logNPE, rec.nHit, rec.nTankHit, rec.zenithAngle, "
              "rec.azimuthAngle, rec.coreX, rec.coreY, rec.CxPE40, "
              "SimEvent.energyTrue, SimEvent.thetaTrue, SimEvent.phiTrue")
    n_columns = 11  # 8 expected for no condition, 11 expected for conditional

    path = os.path.join(path, 'gamma/*.xcd')
    print(path)
    files = glob(path)
    print(files)

    total_data = []
    for xcdf_file in files:
        print(xcdf_file)
        xf = XCDFFile(xcdf_file)
        # Column 3 is rec.zenithAngle: drop events whose value is within
        # 0.01 of pi.
        total_data.extend(param for param in xf.fields(params)
                          if abs(param[3] - np.pi) > .01)
    total_data = np.array(total_data, dtype=np.float32)

    # Validate before touching columns: an empty event list yields a 1-D
    # array, which would otherwise fail below with an opaque IndexError.
    if total_data.ndim != 2 or total_data.shape[1] != n_columns:
        raise ValueError(
            "expected a non-empty (N, %d) array of events from %r, got shape %r"
            % (n_columns, path, total_data.shape))

    # Simple data augmentation
    total_data[:, 1] = np.log(total_data[:, 1])  # Take the log of rec.nHit
    total_data[:, 7] = np.log(total_data[:, 7] + .01)  # Take the log of rec.CxPE40
    # For conditional only (Very important to have): log of SimEvent.energyTrue
    total_data[:, 8] = np.log(total_data[:, 8])

    print("shuffling")
    np.random.shuffle(total_data)
    print(total_data[:15, :])

    # Create the output directory like the other generators in this file do.
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    np.save(osp.join(save_path, "gamma_data"), total_data)
def get_layout(path, save_path='data/', sub='gamma/'):
    """Get the x, y, and z coordinates of every PMT to visualize data later.

    Scans the ``*.xcd`` files under ``path/sub`` and stores, for every hit,
    the PMT position in row ``gridId - 1`` of a ``(1200, 3)`` array. The array
    is written to ``save_path/layout.npy``; ``save_path`` is created if it
    does not exist.

    :param path: path to the data directory
    :param save_path: directory the layout array is saved to
    :param sub: subdirectory of ``path`` holding the XCDF files
    :raises AssertionError: if a coordinate that is already stored (non-zero)
        differs from the value reported by a later hit of the same PMT
    """
    files = glob(os.path.join(path, sub, '*.xcd'))
    pmt_locs = np.zeros((1200, 3))
    params = "event.hit.xPMT, event.hit.yPMT, event.hit.zPMT, event.hit.gridId"

    for xcdf_file in files:
        xf = XCDFFile(xcdf_file)
        print(xcdf_file)
        # Iterate the events lazily; there is no need to hold a whole file's
        # worth of hits in memory.
        for xs, ys, zs, ids in xf.fields(params):
            for x, y, z, grid_id in zip(xs, ys, zs, ids):
                pmt = grid_id - 1  # change id to be 0 indexed
                coords = (x, y, z)
                # make sure all pmts with the same ID have the same coords;
                # a stored value of ~0 is treated as "not set yet"
                for axis, value in enumerate(coords):
                    if not np.isclose(pmt_locs[pmt][axis], 0):
                        assert pmt_locs[pmt][axis] == value
                # save coord locations
                pmt_locs[pmt] = coords

        # NOTE(review): checked once per file; the original indentation of
        # this check was lost, confirm it was not meant to run per event.
        filled = np.count_nonzero(pmt_locs)
        print("{0} out of {1} pixels have data".format(filled, 1200 * 3))
        # assuming 3378 PMTs have value for early termination
        if filled == 3378:
            break

    if not os.path.exists(save_path):
        os.makedirs(save_path)
    np.save(osp.join(save_path, "layout"), pmt_locs)
# Tallies, all starting at zero. Each label counter has a `_w` twin
# (presumably the weighted count -- confirm against the code that updates them).
label2 = 0
label2_w = 0
label3 = 0
label3_w = 0
true_label_0 = 0
true_label_0_w = 0
true_label_1 = 0
true_label_1_w = 0
true_label_2 = 0
true_label_2_w = 0
true_label_3 = 0
true_label_3_w = 0
true01_label0 = 0
true23_label0 = 0
true123_label0 = 0

# delAngle threshold (in radian) for separating good and bad events for 6 fHit bins
delAngle_threshold_g = np.array(
    [0.0341, 0.0237, 0.0182, 0.0131, 0.0091, 0.0072])
delAngle_threshold_h = np.array(
    [0.0536, 0.0376, 0.032, 0.0273, 0.0226, 0.016])

xcdf = XCDFFile(args.input)

# event loop -----------------------------------
# The field list is written as adjacent string literals; the concatenation is
# the same single comma-separated string, in the same order as the unpacking
# below (55 fields).
for record in xcdf.fields(
        "rec.eventID, rec.nChTot, rec.nChAvail, rec.nTankHit, rec.nHit, "
        "rec.nHitSP10, rec.nHitSP20, rec.zenithAngle, rec.azimuthAngle, "
        "rec.dec, rec.ra, rec.CxPE20, rec.CxPE30, rec.CxPE40, rec.CxPE50, "
        "rec.CxPE40SPTime, rec.PINC, rec.angleFitStatus, rec.coreFitStatus, "
        "rec.coreFiduScale, rec.coreX, rec.coreY, rec.coreFitUnc, "
        "rec.SFCFChi2, rec.planeChi2, rec.fAnnulusCharge0, "
        "rec.fAnnulusCharge1, rec.fAnnulusCharge2, rec.fAnnulusCharge3, "
        "rec.GamCoreAge, rec.GamCoreAmp, rec.GamCoreChi2, rec.GamCorePackInt, "
        "rec.mPFnHits, rec.mPFnPlanes, rec.mPFp0nAssign, rec.mPFp0Weight, "
        "rec.mPFp0toangleFit, rec.mPFp1nAssign, rec.mPFp1Weight, "
        "rec.mPFp1toangleFit, rec.disMax, mc.zenithAngle, mc.azimuthAngle, "
        "mc.corsikaParticleId, mc.delAngle, mc.delCore, mc.logEnergy, "
        "mc.coreX, mc.coreY, sweets.IWgt, sweets.TWgt, rec.logNNEnergy, "
        "mc.coreFiduScale, rec.classLabel"):
    (eventID, nChTot, nChAvail, nTankHit, nHit,
     nHitSP10, nHitSP20, zenithAngle, azimuthAngle,
     dec, ra, CxPE20, CxPE30, CxPE40, CxPE50,
     CxPE40SPTime, PINC, angleFitStatus, coreFitStatus,
     coreFiduScale, coreX, coreY, coreFitUnc,
     SFCFChi2, planeChi2, fAnnulusCharge0,
     fAnnulusCharge1, fAnnulusCharge2, fAnnulusCharge3,
     GamCoreAge, GamCoreAmp, GamCoreChi2, GamCorePackInt,
     mPFnHits, mPFnPlanes, mPFp0nAssign, mPFp0Weight,
     mPFp0toangleFit, mPFp1nAssign, mPFp1Weight,
     mPFp1toangleFit, disMax, zenithAngle_true, azimuthAngle_true,
     corsikaParticleId, delAngle, delCore, logEnergy,
     coreX_true, coreY_true, IWgt, TWgt, logNNEnergy,
     true_coreFiduScale, classLabel) = record

    # Convert selected fields to built-in float / int.
    nChTot = float(nChTot)
    nChAvail = float(nChAvail)
    nHit = float(nHit)
    nHitSP20 = float(nHitSP20)
    zenithAngle = float(zenithAngle)
    angleFitStatus = float(angleFitStatus)
    coreFitStatus = float(coreFitStatus)
    coreFiduScale = float(coreFiduScale)
    classLabel = int(classLabel)
from xgboost import XGBClassifier
from xcdf import XCDFFile
import ntpath


def path_leaf(path):
    """Return the last component of *path*.

    ``ntpath`` accepts both ``/`` and ``\\`` separators; when the path ends
    with a separator the tail is empty, so the basename of the head is
    returned instead.
    """
    head, tail = ntpath.split(path)
    return tail or ntpath.basename(head)


# deg = np.pi/180.
parser = argparse.ArgumentParser()
parser.add_argument('file', nargs='+', help='path to the file')
parser.add_argument('out', nargs='+', help='path to the outfile')
args_namespace = parser.parse_args()
# Only the first value of each positional argument is used.
args = vars(args_namespace)['file'][0]
outdir = vars(args_namespace)['out'][0]
xf = XCDFFile(args)
name = path_leaf(args)
# xf = XCDFFile("reco_run006657_00001.xcd")
# `args` is already the path string, so print it whole: `args[0]` was just its
# first character. The call form also parses on both Python 2 and Python 3.
print(args)

# for record in xf.fields("rec.nHit,rec.CxPE40,rec.PINC,rec.logNNEnergy,rec.disMax,rec.LDFAge,rec.LDFAmp,rec.LDFChi2,rec.nChAvail,rec.nHitSP20"):
# One row per event, columns in the order of the field list.
allset = np.array(list(xf.fields(
    "rec.nHit,rec.CxPE40,rec.PINC,rec.disMax,rec.LDFAge,rec.LDFAmp,rec.LDFChi2")))
# allset=allset[allset[:,1]!=0,:]

# Replace column 0 (rec.nHit) with log10(rec.CxPE40 / rec.nHit). Rows with
# nHit == 0 keep a ratio of 0 instead of dividing by zero.
allset[:, 0] = np.log10(
    np.divide(allset[:, 1], allset[:, 0],
              out=np.zeros_like(allset[:, 0]),
              where=allset[:, 0] != 0))
# log10(0) gives -inf; map every -inf in the matrix to 0.
allset[allset == -np.inf] = 0
# Column 1 (rec.CxPE40) is now folded into column 0, so drop it.
allset = np.delete(allset, 1, 1)
def gen_images_mapping(path="./HAWC/", sub="gamma/", display=False, log=True,
                       train_split=0.8, two_dims=False, small=False,
                       normalize=False, pixel_range=True, save_path="data/"):
    """
    Generate 40x40 images from XCDF events and save a train / test split.

    Each pixel is either 0 or mapped to a PMT using the mapping of tanks to
    pixels (``squaremapping.sqmap``). Note that some pixels do not have a
    corresponding tank so they are always 0. The label of an image is the
    event's ``[rec.zenithAngle, rec.azimuthAngle]``.

    Four arrays are written to ``save_path`` (created if missing):
    ``gamma_image_mapping_data``, ``gamma_labels``,
    ``gamma_test_image_mapping_data`` and ``gamma_test_labels``, each with a
    ``_2`` suffix when ``two_dims`` is set.

    :param path: path to data directory
    :param sub: subdirectory / type of event (currently only tried gamma)
    :param display: whether to show matplotlib visualizations of parsed data
    :param log: take log of charge data
    :param train_split: ratio of train to test data
    :param two_dims: whether to include a second channel (time)
    :param small: only use one XCDF file to run sanity check
    :param normalize: rescale every channel independently; the target range
        is chosen by ``pixel_range``
    :param pixel_range: only used when ``normalize`` is set: rescale to
        [0, 255], the valid range of pixel values, instead of [-1, 1]
    :param save_path: directory the arrays are saved to
    :raises AssertionError: if a normalized channel falls outside the
        selected target range
    """
    from squaremapping import sqmap
    # sqmap is a mapping of {PMT Grid ID: index into 40x40 image}

    files = glob(os.path.join(path, sub, '*.xcd'))
    if small:
        files = files[0:1]

    params = ("event.hit.charge, event.hit.time, event.hit.gridId, "
              "rec.zenithAngle, rec.azimuthAngle")
    n_channels = 2 if two_dims else 1
    log_floor = np.log(0.1)  # loop invariant: log of the smallest charge

    total_data = []
    labels = []
    for xcdf_file in files:
        print(xcdf_file)
        xf = XCDFFile(xcdf_file)
        for charges, times, gridids, zen, azi in xf.fields(params):
            # first dim = charge, second dim = time
            # We generate a 40x40 grid for each event, from a mapping of
            # gid -> x, y positions on grid
            grid = np.zeros((40, 40, n_channels))
            if two_dims:
                # fill the grid with the smallest possible value, -500
                grid[:, :, 1] = -500.

            # load in data to the created array
            for c, gid, t in zip(charges, gridids, times):
                # sqmap is currently only defined for PMTs with ID > 8
                if not gid > 8:
                    continue
                coorsq = sqmap[int(gid)]
                # smallest charge is 0.1, so we set 0.1 and smaller to 0 to
                # avoid log(0) issues
                if log:
                    c = max(np.log(c + 1e-8) - log_floor, 0.)
                grid[coorsq[0], coorsq[1], 0] = c
                if two_dims:
                    # NOTE(review): hits with zero charge get +500. while the
                    # empty-pixel fill above is -500.; confirm the sign.
                    grid[coorsq[0], coorsq[1], 1] = \
                        t if not np.isclose(c, 0) else 500.

            total_data.append(grid)
            labels.append([zen, azi])

    total_data = np.array(total_data, dtype=np.float32)
    labels = np.array(labels)
    # shape should be (N, 40, 40, 1), or (N, 40, 40, 2) with two_dims
    print('data shape', total_data.shape, 'labels shape', labels.shape)

    # We can normalize values to [0, 255] to put data in same domain as
    # images, or [-1, 1]
    if normalize:
        # The accepted range follows the chosen target range (pixel_range),
        # not the channel count. The bounds are parenthesised so the
        # comparison is never swallowed by the conditional expression.
        lower, upper = (0.0, 255.01) if pixel_range else (-1.01, 1.01)

        dims = []
        # normalize each channel independently
        for i in range(total_data.shape[3]):
            channel = total_data[:, :, :, i]
            min_val, max_val = np.amin(channel), np.amax(channel)
            print('Dimension %i, min: %f, max: %f' % (i, min_val, max_val))

            # Math to normalize channels to between [-1, 1]
            # Forward: -1. + 2. * (x - min) / (max - min)
            # inverse: (y + 1) / 2 * (max - min) + min
            if pixel_range:
                scaled = 255. * (channel - min_val) / (max_val - min_val)
            else:
                scaled = -1. + 2. * (channel - min_val) / (max_val - min_val)

            # sanity check for normalization (a constant channel divides by
            # zero above and the resulting NaN fails here)
            assert np.amax(scaled) <= upper, np.amax(scaled)
            assert np.amin(scaled) >= lower, np.amin(scaled)
            dims.append(scaled)

        total_data = np.stack(dims, axis=3)

    # shuffle data and fix seed
    np.random.seed(0)
    p = np.random.permutation(len(total_data))
    total_data, labels = total_data[p], labels[p]

    if display:
        plot.plot_hists(total_data)
        plot.plot_40x40(total_data[:16],
                        'ground truth - gamma - log - 40x40 grid')
        plot.plot_pmts(total_data[:16], 'ground truth - gamma - log - pmts',
                       sparse=True, layout_path=save_path)
        for i in range(5, 8):
            plot.plot_pmts(total_data[:16],
                           'ground truth - gamma - log - pmts - single',
                           sparse=True, single=i, layout_path=save_path)

    # make the train / test split
    split = int(train_split * len(total_data))
    print("split size:", split)
    train_data, test_data = total_data[:split], total_data[split:]
    train_labels, test_labels = labels[:split], labels[split:]

    # save data according to save_path
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    # create both training and testing data
    suffix = "_2" if two_dims else ""
    np.save(osp.join(save_path, "gamma_image_mapping_data" + suffix),
            train_data)
    np.save(osp.join(save_path, "gamma_labels" + suffix), train_labels)
    np.save(osp.join(save_path, "gamma_test_image_mapping_data" + suffix),
            test_data)
    np.save(osp.join(save_path, "gamma_test_labels" + suffix), test_labels)
def path_leaf(path): head, tail = ntpath.split(path) return tail or ntpath.basename(head) import os for prefix in ('OMP', 'MKL', 'NUMEXPR'): os.environ['%s_NUM_THREADS' % prefix] = '1' #deg = np.pi/180. parser = argparse.ArgumentParser() parser.add_argument('file', nargs='+', help='path to the file') parser.add_argument('out', nargs='+', help='path to the outfile') args_namespace = parser.parse_args() args = vars(args_namespace)['file'][0] outdir = vars(args_namespace)['out'][0] xf = XCDFFile(args) name = path_leaf(args) #xf = XCDFFile("reco_run006657_00001.xcd") print args[0] allset = [] #for record in xf.fields("rec.nHit,rec.CxPE40,rec.PINC,rec.logNNEnergy,rec.disMax,rec.LDFAge,rec.LDFAmp,rec.LDFChi2,rec.nChAvail,rec.nHitSP20"): #for record in xf.fields("rec.nHit,rec.CxPE40,rec.PINC,rec.disMax,rec.LDFAge,rec.LDFAmp,rec.LDFChi2"): for record in xf.fields( "rec.nHit,rec.CxPE40,rec.PINC,rec.logNNEnergyV2,rec.disMax,rec.LDFAmp,rec.LDFChi2,rec.nChAvail,rec.nHitSP20" ): allset.append(record) allset = np.array(allset) #allset=allset[allset[:,1]!=0,:] allset[:, 0] = np.log10( np.divide(allset[:, 1],