示例#1
0
def test_read_write_hdf(tmpdir, input_arr):
    """Round-trip a jagged array through an HDF5 file and compare the contents.

    tmpdir: directory object supporting ``/`` to build the file path.
    input_arr: iterable handed to ``awkward0.JaggedArray.fromiter``.
    """
    h5_path = str(tmpdir / "example.h5")

    # Store the jagged array under the key "example".
    with h5py.File(h5_path, "w") as out_file:
        written = awkward0.JaggedArray.fromiter(input_arr)
        writer = awkward0.hdf5(out_file)
        writer["example"] = written

    # Re-open the file read-only and fetch the same key.
    with h5py.File(h5_path, "r") as in_file:
        reader = awkward0.hdf5(in_file)
        restored = reader["example"]

    assert written.tolist() == restored.tolist()
示例#2
0
def collect_truth(*files, pvs=True):
    """
    Gather the truth vertex information from one or more HDF5 files as
    awkward arrays (JaggedArrays). Pass the same files as for collect_data.

    files: Paths of the HDF5 files to read
    pvs: Collect PVs or SVs (default True: PVs)

    Returns a VertexInfo built from the concatenated x, y, z locations,
    track counts and categories (in that order).
    """

    prefix = "p" if pvs else "s"

    # Key suffixes, in the positional order VertexInfo receives them.
    suffixes = ("v_loc_x", "v_loc_y", "v_loc", "v_ntracks", "v_cat")
    chunks = {suffix: [] for suffix in suffixes}

    for XY_file in files:
        msg = f"Loaded {XY_file} in {{time:.4}} s"
        with Timer(msg), h5py.File(XY_file, mode="r") as XY:
            afile = awkward.hdf5(XY)
            for suffix in suffixes:
                chunks[suffix].append(afile[f"{prefix}{suffix}"])

    return VertexInfo(*(concatenate(chunks[suffix]) for suffix in suffixes))
示例#3
0
def save_data_hdf5(hf, od, filelist=None, compression="lzf"):
    """
    Write the contents of ``od`` into the HDF5 container ``hf``.

    hf: Object providing ``create_dataset`` (it is also handed to
        ``awkward.hdf5`` for the variable-length arrays).
    od: Object exposing ``X``, ``Y`` (indexed 0..3), ``Xmax``, ``Ymax`` and
        the ``pv_*`` / ``sv_*`` attributes written below.
    filelist: Optional iterable of objects with a ``.stem`` attribute; when
        non-empty, the stems are stored comma-separated in the "files"
        attribute of the "kernel" dataset.
    compression: Compression setting forwarded to every ``create_dataset``.

    Returns the "kernel" dataset.
    """
    dset = hf.create_dataset("kernel", data=od.X, compression=compression)
    if filelist:
        # np.bytes_ is the same type np.string_ used to alias; the
        # np.string_ name itself was removed in NumPy 2.0.
        dset.attrs["files"] = np.bytes_(",".join(
            str(s.stem) for s in filelist))

    # Dataset name -> index into od.Y (note: "sv" is Y[2], "pv_other" is Y[1]).
    for name, index in (("pv", 0), ("sv", 2), ("pv_other", 1), ("sv_other", 3)):
        hf.create_dataset(name, data=od.Y[index], compression=compression)

    for name in ("Xmax", "Ymax"):
        hf.create_dataset(name, data=getattr(od, name), compression=compression)

    # Variable-length arrays go through the awkward HDF5 wrapper; each key
    # matches the attribute name on od.
    akdh5 = awkward.hdf5(hf)
    for name in (
        "pv_loc_x",
        "pv_loc_y",
        "pv_loc",
        "pv_ntracks",
        "pv_cat",
        "sv_loc_x",
        "sv_loc_y",
        "sv_loc",
        "sv_ntracks",
        "sv_cat",
    ):
        akdh5[name] = getattr(od, name)

    return dset
def save_data_hdf5(hf, od, filelist=None, compression="lzf"):
    """
    Write the contents of ``od`` (including the POCA KDEs and POCA ellipsoid
    arrays) into the HDF5 container ``hf``.

    hf: Object providing ``create_dataset`` (it is also handed to
        ``awkward.hdf5`` for the variable-length arrays).
    od: Object exposing ``X``, ``Y`` (indexed 0..3) and one attribute per
        dataset / key name written below.
    filelist: Optional iterable of objects with a ``.stem`` attribute; when
        non-empty, the stems are stored comma-separated in the "files"
        attribute of the "kernel" dataset.
    compression: Compression setting forwarded to every ``create_dataset``.

    Returns the "kernel" dataset.
    """
    dset = hf.create_dataset("kernel", data=od.X, compression=compression)
    if filelist:
        # np.bytes_ is the same type np.string_ used to alias; the
        # np.string_ name itself was removed in NumPy 2.0.
        dset.attrs["files"] = np.bytes_(",".join(str(s.stem) for s in filelist))

    # Dataset name -> index into od.Y (note: "sv" is Y[2], "pv_other" is Y[1]).
    for name, index in (("pv", 0), ("sv", 2), ("pv_other", 1), ("sv_other", 3)):
        hf.create_dataset(name, data=od.Y[index], compression=compression)

    # Fixed-shape datasets whose name equals the attribute name on od.
    # The poca_KDE_* entries were added 200922.
    for name in (
        "Xmax",
        "Ymax",
        "poca_KDE_A",
        "poca_KDE_A_xMax",
        "poca_KDE_A_yMax",
        "poca_KDE_B",
        "poca_KDE_B_xMax",
        "poca_KDE_B_yMax",
    ):
        hf.create_dataset(name, data=getattr(od, name), compression=compression)

    # Variable-length arrays go through the awkward HDF5 wrapper; each key
    # matches the attribute name on od.
    # recon_pocax / recon_pocay / recon_pocaz / recon_sigmapocaxy are
    # deliberately NOT written (disabled by mds in the original code).
    # The poca_* and *_axis* entries were added 200922.
    akdh5 = awkward.hdf5(hf)
    for name in (
        "pv_loc_x",
        "pv_loc_y",
        "pv_loc",
        "pv_ntracks",
        "pv_cat",
        "sv_loc_x",
        "sv_loc_y",
        "sv_loc",
        "sv_ntracks",
        "sv_cat",
        "recon_x",
        "recon_y",
        "recon_z",
        "recon_tx",
        "recon_ty",
        "poca_x",
        "poca_y",
        "poca_z",
        "major_axis_x",
        "major_axis_y",
        "major_axis_z",
        "minor_axis1_x",
        "minor_axis1_y",
        "minor_axis1_z",
        "minor_axis2_x",
        "minor_axis2_y",
        "minor_axis2_z",
    ):
        akdh5[name] = getattr(od, name)

    return dset
示例#5
0
def collect_t2kde_data(
    *files,
    batch_size=1,
    dtype=np.float32,
    device=None,
    slice=None,
    **kargs,
):
    """
    This function collects data. It does not split it up. You can pass in multiple files.
    Example: collect_data('a.h5', 'b.h5')

    For every file the target Y is built from the "poca_KDE_B" dataset (plus
    a small epsilon) followed by "poca_KDE_B_xMax" and "poca_KDE_B_yMax"
    (each multiplied by 2500).  The feature array X stacks nine per-track
    channels along axis 1: poca z (multiplied by 0.001), poca x, poca y and
    the six values returned by six_ellipsoid_parameters for the major and
    the two minor ellipsoid axes.  Every channel is truncated / padded to
    600 entries per event, with -99 marking unused slots.

    files: The HDF5 files to read
    batch_size: The number of events per batch
    dtype: Not referenced in this function body.  NOTE(review): all casts use
           the names dtype_X / dtype_Y, which are not defined here --
           presumably module-level settings; confirm whether dtype was meant
           to control them.
    slice: Allow just a slice of data to be loaded (applied to the first axis
           of X and Y when truthy)
    device: The device to load onto (CPU by default)
    **kargs: Any other keyword arguments will be passed on to torch's DataLoader

    Returns the torch DataLoader wrapping a TensorDataset of (X, Y).
    """

    ## these unit vectors are used to convert the elements of
    ## the ellipsoid major and minor axis vectors into vectors
    xhat = np.array([1, 0, 0])
    yhat = np.array([0, 1, 0])
    zhat = np.array([0, 0, 1])

    ## mds 200729  the KDE targets have many zeros. Learning zeros using a
    ## mds         ratio of predicted to target is difficult, even with an
    ## mds         epsilon-like parameter in the cost function, so epsilon is
    ## mds         added explicitly to the targets here.  Doing it here also
    ## mds         makes plotting easy.
    epsilon = 0.001

    ## for safety:  600 >> 481, which is what was seen for 100 evts
    maxLen = 600

    def read_jagged(afile, key):
        # Fetch one variable-length array and expose it as a numpy array.
        return np.asarray(afile[key].astype(dtype_Y))

    def as_vectors(comp_x, comp_y, comp_z, n_events):
        # Combine per-event x/y/z components into one array of 3-vectors.
        return (
            np.multiply(comp_x.reshape(n_events, 1), xhat)
            + np.multiply(comp_y.reshape(n_events, 1), yhat)
            + np.multiply(comp_z.reshape(n_events, 1), zhat)
        )

    Xlist = []
    Ylist = []

    print("Loading data...")

    for XY_file in files:
        msg = f"Loaded {XY_file} in {{time:.4}} s"
        with Timer(msg), h5py.File(XY_file, mode="r") as f:
            ## Here we read in the poca KDE (not the original "kernel") plus
            ## the values of x and y where the KDE is maximal for each bin
            ## of z. It appears that in the test file the original KDE values
            ## .AND. the values of Xmax and Ymax have been divided by 2500.
            ## This should have been done only for the KDE values, so Xmax
            ## and Ymax are re-scaled to better use the dynamic range
            ## available using np.float16
            kernel = np.asarray(f["poca_KDE_B"]) + epsilon
            Xmax = 2500.*np.asarray(f["poca_KDE_B_xMax"])
            Ymax = 2500.*np.asarray(f["poca_KDE_B_yMax"])

            Y = ja.concatenate((kernel, Xmax, Ymax), axis=1).astype(dtype_Y)

            ## now build the feature set from the poca ellipsoid parameters
            ## (201018: these replaced the "track parameters"); "afile" is
            ## needed to account for the variable length structure of the
            ## awkward arrays
            afile = awkward.hdf5(f)

            pocaz = np.asarray(0.001*afile["poca_z"].astype(dtype_Y))
            pocax = read_jagged(afile, "poca_x")
            pocay = read_jagged(afile, "poca_y")
            pocaMx = read_jagged(afile, "major_axis_x")
            print("pocaMx.shape = ", pocaMx.shape)
            pocaMy = read_jagged(afile, "major_axis_y")
            pocaMz = read_jagged(afile, "major_axis_z")

            nEvts = len(pocaz)
            print("nEvts = ", nEvts)

            ## diagnostics for the first few events; bounded by nEvts so that
            ## files with fewer than five events do not raise IndexError
            for iEvt in range(min(5, nEvts)):
                print(f"len(pocaMx[{iEvt}]) = ", len(pocaMx[iEvt]))

            majorAxis = as_vectors(pocaMx, pocaMy, pocaMz, nEvts)
            print("majorAxis.shape = ",majorAxis.shape)

            poca_m1x = read_jagged(afile, "minor_axis1_x")
            poca_m1y = read_jagged(afile, "minor_axis1_y")
            poca_m1z = read_jagged(afile, "minor_axis1_z")
            minorAxis_1 = as_vectors(poca_m1x, poca_m1y, poca_m1z, nEvts)
            print("minorAxis_1.shape = ",minorAxis_1.shape)

            poca_m2x = read_jagged(afile, "minor_axis2_x")
            poca_m2y = read_jagged(afile, "minor_axis2_y")
            poca_m2z = read_jagged(afile, "minor_axis2_z")
            minorAxis_2 = as_vectors(poca_m2x, poca_m2y, poca_m2z, nEvts)
            ## fixed: this line used to print minorAxis_1.shape
            print("minorAxis_2.shape = ",minorAxis_2.shape)

            A, B, C, D, E, F = six_ellipsoid_parameters(majorAxis,minorAxis_1,minorAxis_2)

            print("A.shape = ",A.shape)
            ## dump the first entry of every axis component for up to three
            ## events (the label text keeps the historical "iTrk" wording)
            for iTrk in range(min(3, nEvts)):
                for label, axis in (
                    ("majorAxis", majorAxis),
                    ("minorAxis_1", minorAxis_1),
                    ("minorAxis_2", minorAxis_2),
                ):
                    for comp in range(3):
                        print(f"{label}[iTrk][{comp}][0] = ", axis[iTrk][comp][0])
                print("  ")

            ##  mark non-track data with -99 as a flag
            channels = (pocaz, pocax, pocay, A, B, C, D, E, F)
            padded = [np.zeros((nEvts, maxLen)) - 99. for _ in channels]

            for i, e in enumerate(pocaz):
                ## the number of entries copied is set by pocaz for all channels
                fillingLength = min(len(e), maxLen)
                for target, channel in zip(padded, channels):
                    target[i, :fillingLength] = channel[i][:fillingLength].astype(dtype_Y)

            ## [:,np.newaxis,:] makes each channel (a x b) --> (a x 1 x b)
            ## (axis 0, axis 1, axis 2) so they can be stacked along axis 1;
            ## a is *probably* the number of events and b the padded length,
            ## check with .shape
            X = ja.concatenate(
                tuple(target[:, np.newaxis, :] for target in padded),
                axis=1,
            ).astype(dtype_X)

            print("len(X) = ",len(X))
            Xlist.append(X)
            Ylist.append(Y)
            print("len(Xlist) = ",len(Xlist))

    X = np.concatenate(Xlist, axis=0)
    Y = np.concatenate(Ylist, axis=0)
    print("outer loop X.shape = ", X.shape)

    if slice:
        X = X[slice, :]
        Y = Y[slice, :]

    with Timer(start=f"Constructing {X.shape[0]} event dataset"):
        x_t = torch.tensor(X)
        y_t = torch.tensor(Y)

        if device is not None:
            x_t = x_t.to(device)
            y_t = y_t.to(device)

        dataset = TensorDataset(x_t, y_t)

    loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, **kargs)
    print("x_t.shape = ",x_t.shape)
    print("x_t.shape[0] = ", x_t.shape[0])
    print("x_t.shape[1] = ", x_t.shape[1])
    ## A former `x_t.view(x_t.shape[0], 6, -1)` call was removed here: its
    ## result was discarded, so it never changed x_t or the loader.
    print("x_t.shape = ",x_t.shape)

    return loader
示例#6
0
def collect_t2kde_data(
    *files,
    batch_size=1,
    dtype=np.float32,
    device=None,
    slice=None,
    **kargs,
):
    """
    This function collects data. It does not split it up. You can pass in multiple files.
    Example: collect_data('a.h5', 'b.h5')

    For every file the target Y is built from the "kernel" dataset followed
    by "Xmax" and "Ymax" (each multiplied by 2500).  The feature array X
    stacks six per-track channels along axis 1: recon_pocaz (multiplied by
    0.001), recon_pocax, recon_pocay, recon_tx, recon_ty and
    recon_sigmapocaxy.  Every channel is truncated / padded to 600 entries
    per event, with -99 marking unused slots.

    files: The HDF5 files to read
    batch_size: The number of events per batch
    dtype: Not referenced in this function body.  NOTE(review): all casts use
           the names dtype_X / dtype_Y, which are not defined here --
           presumably module-level settings; confirm whether dtype was meant
           to control them.
    slice: Allow just a slice of data to be loaded (applied to the first axis
           of X and Y when truthy)
    device: The device to load onto (CPU by default)
    **kargs: Any other keyword arguments will be passed on to torch's DataLoader

    Returns the torch DataLoader wrapping a TensorDataset of (X, Y).
    """

    ## for safety:  600 >> 481, which is what was seen for 100 evts
    maxLen = 600

    def read_jagged(afile, key):
        # Fetch one variable-length array and expose it as a numpy array.
        return np.asarray(afile[key].astype(dtype_Y))

    Xlist = []
    Ylist = []

    print("Loading data...")

    for XY_file in files:
        msg = f"Loaded {XY_file} in {{time:.4}} s"
        with Timer(msg), h5py.File(XY_file, mode="r") as f:
            ## Here we read in the KDE itself plus the values of x and y where the KDE is maximal for
            ## each bin of z. It appears that in the test file the original KDE values .AND. the values
            ## of Xmax and Ymax have been divided by 2500. This should have been done only for the
            ## KDE values, so Xmax and Ymax are re-scaled to better use the dynamic range available
            ## using np.float16

            kernel = np.asarray(f["kernel"])
            Xmax = 2500. * np.asarray(f["Xmax"])
            Ymax = 2500. * np.asarray(f["Ymax"])

            Y = ja.concatenate((kernel, Xmax, Ymax), axis=1).astype(dtype_Y)

            ## now build the feature set from the relevant tracks' parameters
            ## we need to use "afile" to account for the variable length
            ## structure of the awkward arrays

            afile = awkward.hdf5(f)

            pocaz = np.asarray(0.001 * afile["recon_pocaz"].astype(dtype_Y))
            pocax = read_jagged(afile, "recon_pocax")
            pocay = read_jagged(afile, "recon_pocay")
            pocaTx = read_jagged(afile, "recon_tx")
            pocaTy = read_jagged(afile, "recon_ty")
            pocaSigmapocaxy = read_jagged(afile, "recon_sigmapocaxy")
            nEvts = len(pocaz)

            ##  mark non-track data with -99 as a flag
            channels = (pocaz, pocax, pocay, pocaTx, pocaTy, pocaSigmapocaxy)
            padded = [np.zeros((nEvts, maxLen)) - 99. for _ in channels]

            for i, e in enumerate(pocaz):
                ## the number of entries copied is set by pocaz for all channels
                fillingLength = min(len(e), maxLen)
                for target, channel in zip(padded, channels):
                    target[i, :fillingLength] = channel[
                        i][:fillingLength].astype(dtype_Y)

            ## [:,np.newaxis,:] makes each channel (a x b) --> (a x 1 x b)
            ## (axis 0, axis 1, axis 2) so they can be stacked along axis 1;
            ## a is *probably* the number of events and b the padded length,
            ## check with .shape
            X = ja.concatenate(
                tuple(target[:, np.newaxis, :] for target in padded),
                axis=1,
            ).astype(dtype_X)

            print("len(X) = ", len(X))
            Xlist.append(X)
            Ylist.append(Y)
            print("len(Xlist) = ", len(Xlist))

    X = np.concatenate(Xlist, axis=0)
    Y = np.concatenate(Ylist, axis=0)
    print("outer loop X.shape = ", X.shape)

    if slice:
        X = X[slice, :]
        Y = Y[slice, :]

    with Timer(start=f"Constructing {X.shape[0]} event dataset"):
        x_t = torch.tensor(X)
        y_t = torch.tensor(Y)

        if device is not None:
            x_t = x_t.to(device)
            y_t = y_t.to(device)

        dataset = TensorDataset(x_t, y_t)

    loader = torch.utils.data.DataLoader(dataset,
                                         batch_size=batch_size,
                                         **kargs)
    print("x_t.shape = ", x_t.shape)
    print("x_t.shape[0] = ", x_t.shape[0])
    print("x_t.shape[1] = ", x_t.shape[1])
    ## A former `x_t.view(x_t.shape[0], 6, -1)` call was removed here: its
    ## result was discarded, so it never changed x_t or the loader.
    print("x_t.shape = ", x_t.shape)

    return loader
示例#7
0
def collect_poca(*files):
    """
    Collect the POCA centers and ellipsoid axes from one or more HDF5 files.

    files: Paths of the HDF5 files to read.  The keys of every file are
        printed as it is opened.

    Returns a dict keyed by component ("x", "y", "z"); each value is a dict
    with the entries "poca", "major_axis", "minor_axis1" and "minor_axis2",
    holding the per-file arrays joined with ``concatenate``.
    """
    components = ("x", "y", "z")
    quantities = ("poca", "major_axis", "minor_axis1", "minor_axis2")

    # One accumulator list per (quantity, component); insertion order fixes
    # the order in which the keys are read from each file.
    collected = {
        (quantity, component): []
        for quantity in quantities
        for component in components
    }

    # iterate through all files
    for XY_file in files:
        with h5py.File(XY_file, mode="r") as XY:

            # print keys in current hdf5 file
            print(XY.keys())

            afile = awkward.hdf5(XY)

            # file keys follow the pattern "<quantity>_<component>",
            # e.g. "poca_x" or "minor_axis2_z"
            for (quantity, component), chunks in collected.items():
                chunks.append(afile[f"{quantity}_{component}"])

    # construct pocas dictionary
    pocas = {
        component: {
            quantity: concatenate(collected[quantity, component])
            for quantity in quantities
        }
        for component in components
    }

    return pocas