Example #1
    def test_load_dataset(self):
        """ Tests the load_data function """

        # if the dataset doesn't exist
        with self.assertRaises(IOError):
            ds.load_dataset("this one definitely doesn't exist either",
                         "")
Example #2
def augment(dataset, format, shift_size):
    """

    Parameters
    ----------
    dataset :
    format :
    shift_size :

    Returns
    -------

    """
    shift_size *= (math.pi/180.0)
    num_shifts = int(2*math.pi / shift_size)
    x_train, y_train, x_test, y_test = ds.load_dataset(dataset, format)
    augmented_x = np.zeros((x_train.shape[0]*num_shifts, x_train.shape[1]))
    augmented_y = np.zeros((y_train.shape[0]*num_shifts, y_train.shape[1]))
    for ix, line in enumerate(x_train):
        if (ix+1)%1000 == 0: print ix+1
        for s in xrange(num_shifts):
            shift = s * shift_size
            augmented_x[ix*num_shifts+s] = [verify_angle(val+shift) if index%4==2 else val for index,val in enumerate(line)]
            augmented_y[ix*num_shifts+s] = y_train[ix]
    tr.shuffle_in_unison(augmented_x, augmented_y)

    output_path = os.path.join(ds.get_path_to_dataset(dataset), "augmented_{}.npz".format(format))
    np.savez(output_path, x_train=augmented_x, x_test=x_test, y_train=augmented_y, y_test=y_test)
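A minimal usage sketch for augment (the dataset name and format are assumptions, reused from the archives referenced in the other examples). A 30-degree step gives int(2*pi / (30*pi/180)) = 12 shifted copies of every training event:

# Hypothetical call: writes "augmented_Unsorted.npz" with 12 copies per event.
augment("ttHLep", "Unsorted", 30)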
Example #3
def _save_by_jet_num(dataset, num_jets):
    data, format = dataset.split('/')
    x_train, y_train, x_test, y_test = ds.load_dataset(data, format)
    if num_jets.endswith("+"):
        val = lambda x: x >= int(num_jets[:-1])
    elif num_jets.endswith("-"):
        val = lambda x: x <= int(num_jets[:-1])
    else:
        val = lambda x: x == int(num_jets)
    all_x = np.concatenate((x_train, x_test), axis=0)
    all_y = np.concatenate((y_train, y_test), axis=0)
    # Mark "null" jets: groups of four consecutive features that are all zero.
    nulls = np.zeros((all_x.shape[0], all_x.shape[1]/4), dtype=np.bool)
    for y in xrange(all_x.shape[1]/4):
        for ix, row in enumerate((all_x > 0)[:, y*4:(y+1)*4]):
            nulls[ix, y] = all(row == 0)
    # Count the non-null groups per event (the last two groups are skipped)
    # and select the events whose jet count satisfies the requested condition.
    events_with_x_jets = val(np.array([row[:-2].sum() for row in ~nulls]))

    all_x, all_y = all_x[events_with_x_jets], all_y[events_with_x_jets]
    tr.shuffle_in_unison(all_x, all_y)

    cutoff = int(all_x.shape[0] * 0.8)  # 80% training 20% testing
    train_x = all_x[:cutoff]
    train_y = all_y[:cutoff]
    test_x = all_x[cutoff:]
    test_y = all_y[cutoff:]

    output_path = os.path.join(ds.get_path_to_dataset(data), "{}jets_{}.npz".format(num_jets, format))
    np.savez(output_path, x_train=train_x, x_test=test_x, y_train=train_y, y_test=test_y)
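A minimal usage sketch (the dataset name is an assumption). A trailing "+" keeps events with at least that many jets, a trailing "-" keeps events with at most that many, and a bare number keeps exact matches:

# Hypothetical call: keep events with three or more jets and write them to
# "3+jets_Unsorted.npz" in the dataset's directory.
_save_by_jet_num("ttHLep/Unsorted", "3+")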
Example #4
def augment(dataset, format, shift_size):
    shift_size *= (math.pi/180.0)
    num_shifts = int(2*math.pi / shift_size)
    x_train, y_train, x_test, y_test = ds.load_dataset(dataset, format)
    augmented_x = np.zeros((x_train.shape[0]*num_shifts, x_train.shape[1]))
    augmented_y = np.zeros((y_train.shape[0]*num_shifts, y_train.shape[1]))
    for ix, line in enumerate(x_train):
        if (ix+1)%1000 == 0: print ix+1
        for s in xrange(num_shifts):
            shift = s * shift_size
            augmented_x[ix*num_shifts+s] = [verify_angle(val+shift) if index%4==2 else val for index,val in enumerate(line)]
            augmented_y[ix*num_shifts+s] = y_train[ix]
    tr.shuffle_in_unison(augmented_x, augmented_y)

    output_path = os.path.join(ds.get_path_to_dataset(dataset), "augmented_{}.npz".format(format))
    np.savez(output_path, x_train=augmented_x, x_test=x_test, y_train=augmented_y, y_test=y_test)
Example #5
def permutate_individual_sorted(dataset):
    """ Only use this for sorted data! Also, this takes up a significant amount of RAM """
    data, format = dataset.split('/')
    x_train, y_train, x_test, y_test = ds.load_dataset(data, format)

    # Generate permutations, transforms, and alter the dataset
    perms = list(gen_permutations(2, 7, 2))
    num_perms = len(perms)

    aperms = np.array(perms)
    labels = np.zeros(aperms.shape)
    r = np.arange(11)
    for i,p in enumerate(aperms):
        labels[i] = (p == r).astype('int32')

    transforms = np.zeros((44, 44 * num_perms))
    for i, p in enumerate(perms):
        transforms[:, i * 44:(i + 1) * 44] = E(p)

    # For the training data
    sorted_train_x = np.zeros((x_train.shape[0] * num_perms, x_train.shape[1]))
    sorted_train_y = np.zeros((sorted_train_x.shape[0], 2))

    for i, batch in enumerate(x_train):
        event = np.dot(batch, transforms).reshape((num_perms, x_train.shape[1]))
        arange = np.arange(num_perms)
        np.random.shuffle(arange)
        sorted_train_x[i * num_perms:(i + 1) * num_perms] = event[arange]
        sorted_train_y[i * num_perms:(i + 1) * num_perms] = labels[arange]

    # For the testing data
    sorted_test_x = np.zeros((x_test.shape[0] * num_perms, x_test.shape[1]))
    sorted_test_y = np.zeros((sorted_test_x.shape[0], 2))

    for i, batch in enumerate(x_test):
        event = np.dot(batch, transforms).reshape((num_perms, x_test.shape[1]))
        arange = np.arange(num_perms)
        np.random.shuffle(arange)
        sorted_test_x[i * num_perms:(i + 1) * num_perms] = event[arange]
        sorted_test_y[i * num_perms:(i + 1) * num_perms] = labels[arange]

    output_path = os.path.join(ds.get_path_to_dataset(data), "{}_{}.npz".format(format, "Permuted"))
    np.savez(output_path, x_train=sorted_train_x, x_test=sorted_test_x, y_train=sorted_train_y, y_test=sorted_test_y)
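The per-event loop above relies on stacking one 44x44 transform per permutation side by side, so a single dot product followed by a reshape yields every permuted copy of an event at once. A toy illustration of that trick (standalone numpy, not part of the original module):

import numpy as np

vec = np.arange(4.0)                       # one "event" with 4 features
perms = [np.eye(4), np.eye(4)[:, ::-1]]    # identity and full reversal
stacked = np.hstack(perms)                 # shape (4, 8): one block per permutation
copies = vec.dot(stacked).reshape(len(perms), 4)
print(copies)                              # row 0: original order, row 1: reversed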
Example #6
        cutoff = 1-i*(1/datapoints)
        e_b[i], e_s[i] = efficiencies(model, data, cutoff)[:,1]
        if experiment_epoch:
            point = experiment_epoch.curve.add()
            point.signal = e_s[i]
            point.background = e_b[i]
            point.cutoff = cutoff
    if save:
        plt.plot(e_b, e_s)
        plt.title("Efficiency Curve")
        plt.ylabel("Signal Efficiency")
        plt.xlabel("Background Inefficiency")
        plt.savefig(save, format="png")
    return trapz(e_s,e_b)
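The value returned here is the trapezoidal area under the signal-efficiency curve as a function of background efficiency. A quick sanity check of that integral (numpy only; the values are made up for illustration):

import numpy as np

# A perfect classifier holds signal efficiency at 1.0 across the whole
# background-efficiency range, so the area comes out to 1.0.
e_b = np.linspace(0.0, 1.0, 5)
e_s = np.ones_like(e_b)
print(np.trapz(e_s, e_b))  # 1.0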

# ""
def confusion_matrix(model, data, offset='', **kwargs):
    eff = efficiencies(model, data, **kwargs)
    return MATRIX.format(offset, *(eff*100).flatten())


if __name__ == "__main__":
    from deep_learning.trainNN import load_model
    model = load_model("ttHLep/U_Optimal")
    x_train, y_train, x_test, y_test = ds.load_dataset("ttHLep", "Unsorted")
    x_train, x_test = tr.transform(x_train, x_test)
    data = (x_train, y_train, x_test, y_test)
    print significance(model, data)
    print AUC(model, data)
    print confusion_matrix(model, data)
    print confusion_matrix(model, data, over_rows=False)
Example #7
def save_ratios(dataset, ratios, buffer=1000):
    """
    Divides a certain dataset into subsets of data with certain ratios of backgrond to signal. For a ratio list of
    length n, the counting index, i, for the background starts from index 0, and the counting index, j, for the signal
    starts at n-1. The ratio for each iteration is then i to j (i/j). Generates a temporary file to accomplish this.

    Parameters
    ----------
    dataset <string> : the name of the dataset (/-separated)
    ratios <list> : a list of integers that define ratios of background to signal.
    buffer <int> : an integer defining the number of data points to load into memory at a time.

    """
    ratios = [ratios] if type(ratios) is str else ratios
    ratios = map(lambda x: map(float, x.split(':')), ratios)
    data = dataset.split('/')[0]
    format = '/'.join(dataset.split('/')[1:])
    main_file, (x_train, y_train, x_test, y_test) = ds.load_dataset(data, format, mode='a')

    bkg_test, sig_test = sum_cols(y_test)
    bkg_train, sig_train = sum_cols(y_train)

    TEST_UPPER_LIMIT = int(1.5 * bkg_test) if bkg_test < sig_test else int(1.5 * sig_test)
    TRAIN_UPPER_LIMIT = int(1.5 * bkg_train) if bkg_train < sig_train else int(1.5 * sig_train)

    temp_h_file, temp_h_data = add_group_hdf5(".deep_learning.temp.h5", "Temp",
                                    [(bkg_train, x_train.shape[1]),
                                     (bkg_train, y_train.shape[1]),
                                     (sig_train, x_train.shape[1]),
                                     (sig_train, y_train.shape[1]),
                                     (bkg_test, x_test.shape[1]),
                                     (bkg_test, y_test.shape[1]),
                                     (sig_test, x_test.shape[1]),
                                     (sig_test, y_test.shape[1])],
                                    names=["train_bkg_x",
                                           "train_bkg_y",
                                           "train_sig_x",
                                           "train_sig_y",
                                           "test_bkg_x",
                                           "test_bkg_y",
                                           "test_sig_x",
                                           "test_sig_y"])

    print "Generating temporary files..."
    for i in xrange(int(math.ceil(x_train.shape[0] / buffer))):
        # index should be same shape and need to reshape the result :/
        train_bkg_index = np.array([[False]*x_train.shape[1]]*x_train.shape[0])
        train_sig_index = np.array([[False]*x_train.shape[1]]*x_train.shape[0])
        test_bkg_index = np.array([[False]*x_test.shape[1]]*x_test.shape[0])
        test_sig_index = np.array([[False]*x_test.shape[1]]*x_test.shape[0])

        for j in xrange(x_train.shape[1]):
            train_bkg_index[i * buffer:(i + 1) * buffer, j] = y_train[i * buffer:(i + 1) * buffer, 0] == 1
            train_sig_index[i * buffer:(i + 1) * buffer, j] = y_train[i * buffer:(i + 1) * buffer, 1] == 1
        for j in xrange(x_test.shape[1]):
            test_bkg_index[i * buffer:(i + 1) * buffer, j] = y_test[i * buffer:(i + 1) * buffer, 0] == 1
            test_sig_index[i * buffer:(i + 1) * buffer, j] = y_test[i * buffer:(i + 1) * buffer, 1] == 1

        selection = x_train[train_bkg_index]
        temp_h_data[0].append(selection.reshape((selection.size/x_train.shape[1], x_train.shape[1])))

        selection = y_train[train_bkg_index[:, :y_train.shape[1]]]
        temp_h_data[1].append(selection.reshape((selection.size/y_train.shape[1], y_train.shape[1])))

        selection = x_train[train_sig_index]
        temp_h_data[2].append(selection.reshape((selection.size/x_train.shape[1], x_train.shape[1])))

        selection = y_train[train_sig_index[:, :y_train.shape[1]]]
        temp_h_data[3].append(selection.reshape((selection.size/y_train.shape[1], y_train.shape[1])))

        selection = x_test[test_bkg_index]
        temp_h_data[4].append(selection.reshape((selection.size/x_test.shape[1], x_test.shape[1])))

        selection = y_test[test_bkg_index[:, :y_test.shape[1]]]
        temp_h_data[5].append(selection.reshape((selection.size/y_test.shape[1], y_test.shape[1])))

        selection = x_test[test_sig_index]
        temp_h_data[6].append(selection.reshape((selection.size/x_test.shape[1], x_test.shape[1])))

        selection = y_test[test_sig_index[:, :y_test.shape[1]]]
        temp_h_data[7].append(selection.reshape((selection.size/y_test.shape[1], y_test.shape[1])))

    # Perform all of this in archive so that you write to file every iteration
    buffer_reset = buffer
    for rat in ratios:

        print "Creating ratio {:d}/{:d} ...".format(*map(int, rat))

        h_file, h_data = add_group_hdf5(ds.get_path_to_dataset(data)+os.sep+data+".h5",
                                        "{}to{}".format(*map(int, rat)),
                                        [(TRAIN_UPPER_LIMIT, x_train.shape[1]),
                                         (TRAIN_UPPER_LIMIT, y_train.shape[1]),
                                         (TEST_UPPER_LIMIT, x_test.shape[1]),
                                         (TEST_UPPER_LIMIT, y_test.shape[1])],
                                        where='/{}'.format(format))

        test_bkg_indices = np.arange(bkg_test)
        test_sig_indices = np.arange(sig_test)
        train_bkg_indices = np.arange(bkg_train)
        train_sig_indices = np.arange(sig_train)

        train_count = 0
        buffer = buffer_reset
        while train_count < TRAIN_UPPER_LIMIT:
            if TRAIN_UPPER_LIMIT - train_count < buffer:
                buffer = TRAIN_UPPER_LIMIT - train_count

            # Indices to NOT include
            train_bkg_ix = np.random.choice(train_bkg_indices,
                                            train_bkg_indices.size - (rat[0] * buffer / sum(rat)),
                                            replace=False)
            train_sig_ix = np.random.choice(train_sig_indices,
                                            train_sig_indices.size - (rat[1] * buffer / sum(rat)),
                                            replace=False)

            # Indices to keep
            k_train_bkg = np.setdiff1d(train_bkg_indices, train_bkg_ix)
            k_train_sig = np.setdiff1d(train_sig_indices, train_sig_ix)

            train_small_x_sig = temp_h_data[2][k_train_sig]
            train_small_y_sig = temp_h_data[3][k_train_sig]
            train_small_x_bkg = temp_h_data[0][k_train_bkg]
            train_small_y_bkg = temp_h_data[1][k_train_bkg]

            train_x = np.concatenate((train_small_x_bkg, train_small_x_sig))
            train_y = np.concatenate((train_small_y_bkg, train_small_y_sig))

            tr.shuffle_in_unison(train_x, train_y)

            h_data[0].append(train_x)
            h_data[1].append(train_y)

            train_count += k_train_bkg.size + k_train_sig.size

            train_bkg_indices = train_bkg_ix
            train_sig_indices = train_sig_ix

        test_count = 0
        buffer = buffer_reset
        while test_count < TEST_UPPER_LIMIT:
            if TEST_UPPER_LIMIT - test_count < buffer:
                buffer = TEST_UPPER_LIMIT - test_count

            # Indices to NOT include
            test_bkg_ix = np.random.choice(test_bkg_indices, test_bkg_indices.size - (rat[0] * buffer / sum(rat)),
                                           replace=False)
            test_sig_ix = np.random.choice(test_sig_indices, test_sig_indices.size - (rat[1] * buffer / sum(rat)),
                                           replace=False)

            # Indices to keep
            k_test_bkg = np.setdiff1d(test_bkg_indices, test_bkg_ix)
            k_test_sig = np.setdiff1d(test_sig_indices, test_sig_ix)

            test_small_x_sig = temp_h_data[6][k_test_sig]
            test_small_y_sig = temp_h_data[7][k_test_sig]
            test_small_x_bkg = temp_h_data[4][k_test_bkg]
            test_small_y_bkg = temp_h_data[5][k_test_bkg]

            test_x = np.concatenate((test_small_x_bkg, test_small_x_sig))
            test_y = np.concatenate((test_small_y_bkg, test_small_y_sig))

            tr.shuffle_in_unison(test_x, test_y)

            h_data[2].append(test_x)
            h_data[3].append(test_y)

            test_count += k_test_bkg.size + k_test_sig.size

            test_bkg_indices = test_bkg_ix
            test_sig_indices = test_sig_ix

        print "Created Group: {}/{}to{}".format(format, *map(int, rat))

        h_file.flush()
        h_file.close()

    main_file.close()
    temp_h_file.close()
    os.remove(".deep_learning.temp.h5")
Example #8
def run(model, exp, terms, save_freq=5, data=None):

    exp_dir = ds.get_path_to_dataset(pb.Experiment.Dataset.Name(exp.dataset))
    save_dir = os.path.join(exp_dir, exp.description)

    ##
    # Load data from .npz archive created by invoking
    # deep_learning/utils/archive.py
    ##

    if data:
        x_train, y_train, x_test, y_test = data
        x_train, x_test = tr.transform(x_train, x_test)
    else:
        h_file, (x_train, y_train, x_test, y_test) = ds.load_dataset(
            pb.Experiment.Dataset.Name(exp.dataset),
            exp.coordinates + '/transformed')
        data = x_train, y_train, x_test, y_test

    exp_file_name = exp.description + '.exp'

    # Start training

    train_length = x_train.shape[0]
    num_batches = int(ceil(train_length / exp.batch_size))

    valid = Validator(exp, terms)

    eTimes = np.array([])
    valid._clock = clock()
    model.summary()
    while valid.check():
        t = clock()
        if valid._num_epochs:
            print("Epoch {}/{}".format(valid.epochs + 1, valid._num_epochs))
        else:
            print("Epoch {}".format(valid.epochs + 1))
        bETA = 0
        bTimes = np.array([])
        #print("\t Training: ")
        for b in xrange(num_batches):
            bt = clock()
            # Update progress bar
            progress(b, num_batches, exp.batch_size, bETA)
            # Train on a batch
            x_batch = x_train[b * exp.batch_size:b * exp.batch_size +
                              exp.batch_size, :]
            y_batch = y_train[b * exp.batch_size:b * exp.batch_size +
                              exp.batch_size, :]
            model.train_on_batch(x_batch, y_batch)
            bTimes = np.append(bTimes, clock() - bt)
            bETA = np.median(bTimes) * (num_batches - b - 1)
        # Finish progress bar
        progress(num_batches,
                 num_batches,
                 exp.batch_size,
                 0,
                 end='\n',
                 time=clock() - t)
        # Calculate stats and add the epoch results to the experiment object
        epoch = exp.results.add()
        timer = clock()
        print("Evaluating Train")
        epoch.train_loss, epoch.train_accuracy = model.evaluate_generator(
            ((x_train[i * exp.batch_size:(i + 1) * exp.batch_size],
              y_train[i * exp.batch_size:(i + 1) * exp.batch_size])
             for i in xrange(num_batches)),
            num_batches,
            max_q_size=min((num_batches // 2, 10)))
        #print("Finished {:.2f}s".format(clock()-timer))
        timer = clock()
        print("Evaluating Test")
        epoch.test_loss, epoch.test_accuracy = model.evaluate_generator(
            ((x_test[i * exp.batch_size:(i + 1) * exp.batch_size],
              y_test[i * exp.batch_size:(i + 1) * exp.batch_size])
             for i in xrange(int(ceil(x_test.shape[0] / exp.batch_size)))),
            int(ceil(x_test.shape[0] / exp.batch_size)),
            max_q_size=min(
                (int(ceil(x_test.shape[0] / exp.batch_size)) // 2, 10)))
        #print("Finished {:.2f}s".format(clock() - timer))
        timer = clock()
        print("Calculating Sig")
        epoch.s_b = st.significance(model, data)
        #print("Finished {:.2f}".format(clock() - timer))
        #timer = clock()
        #print("Calculating AUC {:.2f}".format(clock()))
        #epoch.auc = st.AUC(model, data, experiment_epoch=epoch)
        #print("Finished {:.2f}".format(clock() - timer))
        timer = clock()
        for r in st.num_of_each_cell(model, data):
            epoch.matrix.add().columns.extend(r)
        print("Making CFM")
        matrix = st.confusion_matrix(model, data, offset='\t ')
        #print("Finished {:.2f}".format(clock() - timer))
        epoch.num_seconds = clock() - t
        timer = clock()
        print("Getting output")
        output = st.get_output_distro(model, data)
        epoch.output.background.extend(output["background"])
        epoch.output.signal.extend(output["signal"])
        #print("Finished {:.2f}".format(clock() - timer))
        # Print statistics
        print("\t Train Accuracy: {:.3f}\tTest Accuracy: {:.3f}".format(
            epoch.train_accuracy, epoch.test_accuracy))
        if valid.update_w():
            print("\t Slope: {:.5f} (test_accuracy / second)".format(
                valid.slope))
        print("\t Time this epoch: {:.2f}s".format(epoch.num_seconds), end='')
        if valid._num_epochs:
            eTimes = np.append(eTimes, epoch.num_seconds)
            print("\tFinal ETA: {}".format(
                convert_seconds(
                    np.median(eTimes) * (valid._num_epochs - valid.epochs))))
        else:
            print()
        print("\t Significance (S/sqrt(B)): {:.2f}".format(epoch.s_b))
        print("\t Area Under the Curve (efficiency): {:.3f}".format(epoch.auc))
        print(matrix)

        # Saves the model
        if (len(exp.results) % save_freq) == 0:
            save(model, exp, save_dir, exp_file_name)
            print("\t ", end='')
        sys.stdout.flush()

    exp.end_date_time = str(datetime.datetime.now())
    exp.total_time = valid.time

    print("\n" + valid.failed)
    print("Total Time: {}".format(convert_seconds(valid.time)))

    save(model, exp, save_dir, exp_file_name)
    print("\t ", end='')
    h_file.close()
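The progress-bar ETA in the batch loop above is just the median per-batch time multiplied by the number of batches left. A stripped-down version of that bookkeeping (standalone and illustrative only, with time.time standing in for the clock used above):

import time
import numpy as np

num_batches = 5
batch_times = np.array([])
for b in range(num_batches):
    start = time.time()
    time.sleep(0.01)                      # stand-in for model.train_on_batch(...)
    batch_times = np.append(batch_times, time.time() - start)
    eta = np.median(batch_times) * (num_batches - b - 1)
    print("batch {}/{} ETA {:.2f}s".format(b + 1, num_batches, eta))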
Example #9
def run(model, exp, terms, save_freq=5, data=None):

    exp_dir = ds.get_path_to_dataset(pb.Experiment.Dataset.Name(exp.dataset))
    save_dir = os.path.join(exp_dir, exp.description)

    ##
    # Load data from .npz archive created by invoking
    # deep_learning/utils/archive.py
    ##

    if data:
        x_train, y_train, x_test, y_test = data
        x_train, x_test = tr.transform(x_train, x_test)
    else:
        h_file, (x_train, y_train, x_test, y_test) = ds.load_dataset(pb.Experiment.Dataset.Name(exp.dataset), exp.coordinates)
        x_train, x_test = tr.transform(x_train, x_test)
        data = x_train, y_train, x_test, y_test

    exp_file_name = exp.description + '.exp'

    train_length = x_train.shape[0]
    num_batches = int(ceil(train_length / exp.batch_size))

    valid = Validator(exp, terms)

    eTimes = np.array([])
    valid._clock = clock()
    model.summary()
    while valid.check():
        t = clock()
        if valid._num_epochs:
            print("Epoch {}/{}".format(valid.epochs+1, valid._num_epochs))
        else:
            print("Epoch {}".format(valid.epochs+1))
        bETA = 0
        bTimes = np.array([])
        for b in xrange(num_batches):
            bt = clock()
            # Update progress bar
            progress(b, num_batches, exp.batch_size, bETA)
            # Train on a batch
            model.train_on_batch(x_train[b*exp.batch_size:b*exp.batch_size+exp.batch_size, :],
                                 y_train[b*exp.batch_size:b*exp.batch_size+exp.batch_size, :])
            bTimes = np.append(bTimes, clock()-bt)
            bETA = np.median(bTimes)*(num_batches-b-1)
        # Finish progress bar
        progress(num_batches, num_batches, exp.batch_size, 0, end='\n')
        # Calculate stats and add the epoch results to the experiment object
        epoch = exp.results.add()
        # Evaluate each split over generators sized to that split's own data.
        epoch.train_loss, epoch.train_accuracy = model.evaluate_generator(((x_train[i*exp.batch_size:(i+1)*exp.batch_size],
                                                                           y_train[i*exp.batch_size:(i+1)*exp.batch_size]) for i in xrange(num_batches)),
                                                                          num_batches)
        epoch.test_loss, epoch.test_accuracy = model.evaluate_generator(((x_test[i*exp.batch_size:(i+1)*exp.batch_size],
                                                                           y_test[i*exp.batch_size:(i+1)*exp.batch_size]) for i in xrange(int(ceil(x_test.shape[0]/exp.batch_size)))),
                                                              int(ceil(x_test.shape[0]/exp.batch_size)))
        epoch.s_b = st.significance(model, data)
        epoch.auc = st.AUC(model, data, experiment_epoch=epoch)
        for r in st.num_of_each_cell(model, data):
            epoch.matrix.add().columns.extend(r)
        matrix = st.confusion_matrix(model, data, offset='\t ')
        epoch.num_seconds = clock() - t
        # Print statistics
        print("\t Train Accuracy: {:.3f}\tTest Accuracy: {:.3f}".format(epoch.train_accuracy, epoch.test_accuracy))
        if valid.update_w():
            print("\t Slope: {:.5f} (test_accuracy / second)".format(valid.slope))
        print("\t Time this epoch: {:.2f}s".format(epoch.num_seconds), end='')
        if valid._num_epochs:
            eTimes = np.append(eTimes, epoch.num_seconds)
            print("\tFinal ETA: {}".format(convert_seconds(np.median(eTimes) * (valid._num_epochs - valid.epochs))))
        else:
            print()
        print("\t Significance (S/sqrt(B)): {:.2f}".format(epoch.s_b))
        print("\t Area Under the Curve (efficiency): {:.3f}".format(epoch.auc))
        print(matrix)

        if (len(exp.results) % save_freq) == 0:
            save(model, exp, save_dir, exp_file_name)
            print("\t Saved the model\n")
        sys.stdout.flush()

    exp.end_date_time = str(datetime.datetime.now())
    exp.total_time = valid.time

    print("\n"+valid.failed)
    print("Total Time: {}".format(convert_seconds(valid.time)))

    save(model, exp, save_dir, exp_file_name, graph=True)
    h_file.close()
Example #10
def save_ratios(dataset, ratios, buffer=1000):

    ratios = [ratios] if type(ratios) is str else ratios
    ratios = map(lambda x: map(float, x.split(':')), ratios)
    data, format = dataset.split('/')
    main_file, (x_train, y_train, x_test, y_test) = ds.load_dataset(data, format, mode='a')

    bkg_test, sig_test = sum_cols(y_test)
    bkg_train, sig_train = sum_cols(y_train)

    TEST_UPPER_LIMIT = int(1.5 * bkg_test) if bkg_test < sig_test else int(1.5 * sig_test)
    TRAIN_UPPER_LIMIT = int(1.5 * bkg_train) if bkg_train < sig_train else int(1.5 * sig_train)

    temp_h_file, temp_h_data = add_group_hdf5(".deep_learning.temp.hdf5", "Temp",
                                    [(bkg_train, x_train.shape[1]),
                                     (bkg_train, y_train.shape[1]),
                                     (sig_train, x_train.shape[1]),
                                     (sig_train, y_train.shape[1]),
                                     (bkg_test, x_test.shape[1]),
                                     (bkg_test, y_test.shape[1]),
                                     (sig_test, x_test.shape[1]),
                                     (sig_test, y_test.shape[1])],
                                    names=["train_bkg_x",
                                           "train_bkg_y",
                                           "train_sig_x",
                                           "train_sig_y",
                                           "test_bkg_x",
                                           "test_bkg_y",
                                           "test_sig_x",
                                           "test_sig_y"])

    print "Generating temporary files..."
    for i in xrange(int(math.ceil(x_train.shape[0] / buffer))):
        # index should be same shape and need to reshape the result :/
        train_bkg_index = np.array([[False]*x_train.shape[1]]*x_train.shape[0])
        train_sig_index = np.array([[False]*x_train.shape[1]]*x_train.shape[0])
        test_bkg_index = np.array([[False]*x_test.shape[1]]*x_test.shape[0])
        test_sig_index = np.array([[False]*x_test.shape[1]]*x_test.shape[0])

        for j in xrange(x_train.shape[1]):
            train_bkg_index[i * buffer:(i + 1) * buffer, j] = y_train[i * buffer:(i + 1) * buffer, 0] == 1
            train_sig_index[i * buffer:(i + 1) * buffer, j] = y_train[i * buffer:(i + 1) * buffer, 1] == 1
        for j in xrange(x_test.shape[1]):
            test_bkg_index[i * buffer:(i + 1) * buffer, j] = y_test[i * buffer:(i + 1) * buffer, 0] == 1
            test_sig_index[i * buffer:(i + 1) * buffer, j] = y_test[i * buffer:(i + 1) * buffer, 1] == 1

        selection = x_train[train_bkg_index]
        temp_h_data[0].append(selection.reshape((selection.size/x_train.shape[1], x_train.shape[1])))

        selection = y_train[train_bkg_index[:, :y_train.shape[1]]]
        temp_h_data[1].append(selection.reshape((selection.size/y_train.shape[1], y_train.shape[1])))

        selection = x_train[train_sig_index]
        temp_h_data[2].append(selection.reshape((selection.size/x_train.shape[1], x_train.shape[1])))

        selection = y_train[train_sig_index[:, :y_train.shape[1]]]
        temp_h_data[3].append(selection.reshape((selection.size/y_train.shape[1], y_train.shape[1])))

        selection = x_test[test_bkg_index]
        temp_h_data[4].append(selection.reshape((selection.size/x_test.shape[1], x_test.shape[1])))

        selection = y_test[test_bkg_index[:, :y_test.shape[1]]]
        temp_h_data[5].append(selection.reshape((selection.size/y_test.shape[1], y_test.shape[1])))

        selection = x_test[test_sig_index]
        temp_h_data[6].append(selection.reshape((selection.size/x_test.shape[1], x_test.shape[1])))

        selection = y_test[test_sig_index[:, :y_test.shape[1]]]
        temp_h_data[7].append(selection.reshape((selection.size/y_test.shape[1], y_test.shape[1])))

    # Perform all of this in archive so that you write to file every iteration
    buffer_reset = buffer
    for rat in ratios:

        print "Creating ratio {:d}/{:d} ...".format(*map(int, rat))

        h_file, h_data = add_group_hdf5(ds.get_path_to_dataset(data)+os.sep+data+".hdf5",
                                        "{}to{}".format(*map(int, rat)),
                                        [(TRAIN_UPPER_LIMIT, x_train.shape[1]),
                                         (TRAIN_UPPER_LIMIT, y_train.shape[1]),
                                         (TEST_UPPER_LIMIT, x_test.shape[1]),
                                         (TEST_UPPER_LIMIT, y_test.shape[1])],
                                        where='/{}'.format(format))

        test_bkg_indices = np.arange(bkg_test)
        test_sig_indices = np.arange(sig_test)
        train_bkg_indices = np.arange(bkg_train)
        train_sig_indices = np.arange(sig_train)

        train_count = 0
        buffer = buffer_reset
        while train_count < TRAIN_UPPER_LIMIT:
            if TRAIN_UPPER_LIMIT - train_count < buffer:
                buffer = TRAIN_UPPER_LIMIT - train_count

            # Indices to NOT include
            train_bkg_ix = np.random.choice(train_bkg_indices,
                                            train_bkg_indices.size - (rat[0] * buffer / sum(rat)),
                                            replace=False)
            train_sig_ix = np.random.choice(train_sig_indices,
                                            train_sig_indices.size - (rat[1] * buffer / sum(rat)),
                                            replace=False)

            # Indices to keep
            k_train_bkg = np.setdiff1d(train_bkg_indices, train_bkg_ix)
            k_train_sig = np.setdiff1d(train_sig_indices, train_sig_ix)

            train_small_x_sig = temp_h_data[2][k_train_sig]
            train_small_y_sig = temp_h_data[3][k_train_sig]
            train_small_x_bkg = temp_h_data[0][k_train_bkg]
            train_small_y_bkg = temp_h_data[1][k_train_bkg]

            train_x = np.concatenate((train_small_x_bkg, train_small_x_sig))
            train_y = np.concatenate((train_small_y_bkg, train_small_y_sig))

            tr.shuffle_in_unison(train_x, train_y)

            h_data[0].append(train_x)
            h_data[1].append(train_y)

            train_count += k_train_bkg.size + k_train_sig.size

            train_bkg_indices = train_bkg_ix
            train_sig_indices = train_sig_ix

        test_count = 0
        buffer = buffer_reset
        while test_count < TEST_UPPER_LIMIT:
            if TEST_UPPER_LIMIT - test_count < buffer:
                buffer = TEST_UPPER_LIMIT - test_count

            # Indices to NOT include
            test_bkg_ix = np.random.choice(test_bkg_indices, test_bkg_indices.size - (rat[0] * buffer / sum(rat)),
                                           replace=False)
            test_sig_ix = np.random.choice(test_sig_indices, test_sig_indices.size - (rat[1] * buffer / sum(rat)),
                                           replace=False)

            # Indices to keep
            k_test_bkg = np.setdiff1d(test_bkg_indices, test_bkg_ix)
            k_test_sig = np.setdiff1d(test_sig_indices, test_sig_ix)

            test_small_x_sig = temp_h_data[6][k_test_sig]
            test_small_y_sig = temp_h_data[7][k_test_sig]
            test_small_x_bkg = temp_h_data[4][k_test_bkg]
            test_small_y_bkg = temp_h_data[5][k_test_bkg]

            test_x = np.concatenate((test_small_x_bkg, test_small_x_sig))
            test_y = np.concatenate((test_small_y_bkg, test_small_y_sig))

            tr.shuffle_in_unison(test_x, test_y)

            h_data[2].append(test_x)
            h_data[3].append(test_y)

            test_count += k_test_bkg.size + k_test_sig.size

            test_bkg_indices = test_bkg_ix
            test_sig_indices = test_sig_ix

        print "Created Group: {}/{}to{}".format(format, *map(int, rat))

        h_file.flush()
        h_file.close()

    main_file.close()
    temp_h_file.close()
    os.remove(".deep_learning.temp.hdf5")