Example #1
import numpy as np
from hep_ml.reweight import BinsReweighter

def getweights(flav, filename, ptmin, ptmax):
    # Load the pT column for the requested flavour (getvar is a project helper)
    pt = getvar('pt', flav, filename, ptmin=ptmin, ptmax=ptmax, train=1)

    # Target: a flat pT distribution over [ptmin, ptmax]
    flatdist = np.random.uniform(ptmin, ptmax, pt.shape[0])

    # Fit a histogram-based reweighter and return per-sample flat-pT weights
    reweighter = BinsReweighter()
    reweighter.fit(original=pt, target=flatdist)
    return reweighter.predict_weights(pt)
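Since getvar is specific to the surrounding project, here is a self-contained sketch of the same flat-pT pattern, with a synthetic falling spectrum standing in for the real input (the exponential shape, range, and sample size are illustrative assumptions):

import numpy as np
from hep_ml.reweight import BinsReweighter

# Illustrative input: a steeply falling pT-like spectrum on [250, 1000]
pt = 250.0 + np.random.exponential(scale=200.0, size=100000)
pt = pt[pt < 1000.0]

# Target: flat over the same range
flat = np.random.uniform(250.0, 1000.0, pt.shape[0])

reweighter = BinsReweighter()
reweighter.fit(original=pt, target=flat)
weights = reweighter.predict_weights(pt)

# The weighted histogram of pt should now be approximately flat
print(np.histogram(pt, bins=10, weights=weights)[0])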
Example #2
from hep_ml.reweight import BinsReweighter, FoldingReweighter

def test_folding_bins_reweighter():
    # Wrap the bins reweighter in 3-fold cross-validation, so each sample's
    # weight is predicted by a model that was not fitted on that sample
    reweighter = FoldingReweighter(BinsReweighter(n_bins=20, n_neighs=2),
                                   n_folds=3)
    check_reweighter(n_dimensions=2,
                     n_samples=1000000,
                     reweighter=reweighter,
                     folding=True)
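check_reweighter is a fixture from hep_ml's own test suite. As a rough, self-contained illustration of what the folding wrapper does outside that harness (the synthetic 1-D samples and sizes below are assumptions):

import numpy as np
from hep_ml.reweight import BinsReweighter, FoldingReweighter

original = np.random.exponential(scale=1.0, size=50000)
target = np.random.uniform(0.0, 3.0, size=50000)

reweighter = FoldingReweighter(BinsReweighter(n_bins=20, n_neighs=2), n_folds=3)
reweighter.fit(original, target)

# Each sample's weight is predicted by the fold model that did not see it during fitting
weights = reweighter.predict_weights(original)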
Example #3
import glob
import multiprocessing
import numpy as np
from numpy.lib.recfunctions import append_fields
from hep_ml.reweight import BinsReweighter

# Profile, run_batched, FileLoader, save_hdf5, and the command-line `parser`
# are assumed to be defined elsewhere in the surrounding project.

def main():

    # For reproducibility
    np.random.seed(21)

    # Parse command-line arguments
    args = parser.parse_args()

    # Modify directory name to conform to convention
    if not args.dir.endswith('/'):
        args.dir += '/'

    print "Reading, reweighting, and splitting files in:\n  {}".format(args.dir)

    # paths = sorted(glob.glob(args.dir + '*/*_slim.h5'))
    paths = sorted(glob.glob("./extractedHbbTopDatasets/*.h5"))


    print "Found {} files.".format(len(paths))

    # Reading input HDF5 file(s)
    data = None
    with Profile("Reading input HDF5 file(s)"):

        # Run batched conversion in parallel
        queue = multiprocessing.Queue()
        parts = run_batched(FileLoader, list(enumerate(paths)),
                            queue=queue, max_processes=args.max_processes)

        # Stack the per-file arrays in sorted order, for reproducibility
        data = np.lib.recfunctions.stack_arrays(
            zip(*sorted(parts, key=lambda t: t[0]))[1],
            autoconvert=True, usemask=False)
        # data = np.concatenate(zip(*sorted(parts, key=lambda t: t[0]))[1])
        pass
    
    print "Found {} samples.".format(data.shape[0])

    # Subsample
    with Profile("Subsample"):
        for sig in [0,1]:

            # Select samples belonging to current category
            if sig == 0:
                msk = (data['signal'] == 0) & (data['dsid'] > 360000)
            else:
                msk = (data['signal'] == 1)

            # Mask of samples belonging to the other category
            other = ~msk

            # Subsample current category
            num_sample = int((args.train + args.test) * 1E+06)
            if num_sample <= msk.sum():
                idx = np.random.choice(np.where(msk)[0], num_sample, replace=False)
                sample = np.zeros_like(msk).astype(bool)
                sample[idx] = True
            else:
                print "[WARNING] Requested {:.1e} samples, but only {:.1e} are availabe in current mask. Using all available samples.".format(num_sample, msk.sum())
                sample = np.ones_like(msk).astype(bool)
                pass

            # Select subsample, and all samples from other categories
            data = data[sample | other]
            pass
        pass


    # Re-weighting
    with Profile("Re-weighting"):

        # Add new data columns (usemask=False returns a plain ndarray rather than a masked array)
        data = append_fields(data, 'weight_train', np.ones_like(data['weight_test']), usemask=False)
        data = append_fields(data, 'weight_adv',   np.ones_like(data['weight_test']), usemask=False)

        # Reweight signal and background separately
        for sig in [0,1]:

            # Prepare data arrays
            msk = data['signal'] == sig

            # Flat pT
            # ------------------------------------------------------------------
            original = data['pt'][msk]
            xmin, xmax = original.min(), original.max()
            target = np.random.rand(original.size) * (xmax - xmin) + xmin

            # Fit bins-reweighter
            reweighter = BinsReweighter(n_bins=100, n_neighs=1)
            reweighter.fit(original, target=target)
            
            # Predict new, flat-pT weight
            data['weight_train'][msk] = reweighter.predict_weights(original)


            # (Flat-pT, physical-m) reweighted
            # ------------------------------------------------------------------
            original        = data['pt'][msk]
            original_weight = data['weight_test'][msk]

            ptmin, ptmax = data['pt'].min(), data['pt'].max()
            target = np.random.rand(msk.sum()) * (ptmax - ptmin) + ptmin

            # Fit bins-reweighter
            reweighter = BinsReweighter(n_bins=100, n_neighs=1)
            reweighter.fit(original, original_weight=original_weight, target=target)

            # Compute new weights
            data['weight_adv'][msk] = reweighter.predict_weights(original, original_weight=original_weight)

            # Standardise weight variables
            # ------------------------------------------------------------------
            weight_variables = filter(lambda name: name.startswith('weight_'), data.dtype.names)
            for var in weight_variables:
                print "  Ensuring unit mean for {}".format(var)
                data[var][msk] /= data[var][msk].mean()
                pass

            pass
        pass


    # Train/test split
    with Profile("Performing train/test split"):
        msk_sig = data['signal'] == 1
        num_sig = msk_sig.sum()
        num_bkg = (~msk_sig).sum()
        num_train = int(args.train * 1E+06)
        print "Found {:.1e} signal and {:.1e} background samples.".format(num_sig, num_bkg)
        print "Using {:.1e} samples for training for each class, leaving {:.1e} signal and {:.1e} background samples for testing.".format(num_train, num_sig - num_train, num_bkg - num_train)

        idx_sig = np.where( msk_sig)[0]
        idx_bkg = np.where(~msk_sig)[0]
        idx_sig_train = np.random.choice(idx_sig, num_train, replace=False)
        idx_bkg_train = np.random.choice(idx_bkg, num_train, replace=False)

        data = append_fields(data, 'train', np.zeros_like(data['signal']).astype(int), usemask=False)
        data['train'][idx_sig_train] = 1
        data['train'][idx_bkg_train] = 1
        pass


    # Shuffle
    with Profile("Shuffling samples"):
        idx = np.arange(data.shape[0])
        np.random.shuffle(idx)
        data = data[idx]
        pass


    # Writing output HDF5 file
    with Profile("Writing output HDF5 file"):
        save_hdf5(data, './reweightDatasets/extractedData.h5')
        pass

    return
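
The script presumably invokes main() when run directly; the standard guard (not shown in this excerpt) would be:

if __name__ == '__main__':
    main()
    pass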
Example #4
import numpy as np
import h5py

# `data` is assumed to have been loaded earlier (e.g. from an extracted HDF5 file);
# collapse it to a 2-D (samples, features) array
data = np.reshape(data, (data.shape[0], data.shape[1]))

# Select a single jet category via its one-hot label column
#data = data[data[:, 0] == 1]  # Dijets
#data = data[data[:, 1] == 1]  # Hbb
data = data[data[:, 2] == 1]  # Top

pt = data[:, 8]
weight = data[:, 3]

#Reweight to flat pt
from hep_ml.reweight import BinsReweighter

original = pt
xmin, xmax = original.min(), original.max()
target = np.random.rand(original.size) * (xmax - xmin) + xmin
n_bins = 500
n_neighs = 1
reweighter = BinsReweighter(n_bins=n_bins, n_neighs=n_neighs)
reweighter.fit(original, target=target)
weight1 = reweighter.predict_weights(original)

# Normalise so the weights sum to the number of training samples
weight1 = weight1 * 4000000 / np.sum(weight1)
#weight1 = weight1 * 2000000 / np.sum(weight1)

# Normalisation for validation samples
#weight1 = weight1 * 1000000 / np.sum(weight1)
#weight1 = weight1 * 500000 / np.sum(weight1)

# Write the derived weights; the dataset name must match the category selected above
with h5py.File("weight.h5", 'a') as new_hdf5:
    #new_hdf5.create_dataset("trainDijets", data=weight1)
    #new_hdf5.create_dataset("trainHbb", data=weight1)
    new_hdf5.create_dataset("trainTop", data=weight1)
Example #5
from hep_ml.reweight import BinsReweighter

def test_reweighter_2d():
    # check_reweighter is a helper from hep_ml's test suite
    reweighter = BinsReweighter(n_bins=20, n_neighs=2)
    check_reweighter(n_dimensions=2, n_samples=1000000, reweighter=reweighter)