def num_of_each_cell(model, data, cutoff=0.5):
    """
    Builds a 2x2 table of counts from the test portion of a dataset.

    Parameters
    ----------
    model : passed unchanged to count_filter, which produces the predictions
    data <sequence> : exactly four items (x_train, y_train, x_test, y_test);
                      only x_test and y_test are used
    cutoff <float> : a prediction entry counts as positive when it is
                     greater than this value

    Returns
    -------
    np.array([[bb, bs], [sb, ss]]), all derived from column 1 of the counts:
        ss : entries with a label > 0 whose prediction is above cutoff
        bs : entries predicted above cutoff, minus ss
        sb : column-1 label total, minus ss
        bb : column-0 label total, minus bs
    """
    _, _, x_test, y_test = data
    test_pair = (x_test, y_test)

    def is_hit(p, y):
        # Where y == 1, `p - y > cutoff - 1` is the test `p > cutoff` shifted
        # down by one, so a hit is a positive label scored above the cutoff.
        return (y > 0) == (p - y > cutoff - 1)

    def is_positive(p, y):
        return p > cutoff

    hits = count_filter(model, is_hit, test_pair)
    positives = count_filter(model, is_positive, test_pair)
    label_totals = ut.sum_cols(y_test)

    ss = hits[1]
    bs = positives[1] - ss
    sb = label_totals[1] - ss
    bb = label_totals[0] - bs
    return np.array([[bb, bs],
                     [sb, ss]])
# Text template for a 2x2 confusion matrix. {0} is a prefix repeated in front
# of every segment; {1}..{4} are the four cell values.
# NOTE(review): the template is a single line here; the R/e/a/l markers
# suggest it was meant to span several rows -- confirm the intended layout.
MATRIX = """ {0} Predicted {0} ================== {0}R || {1:02.1f} || {2:02.1f} || TTBar {0}e ================== {0}a || {3:02.1f} || {4:02.1f} || TTHiggs {0}l ================== {0} TTBar TTHiggs """


def count_filter(model, criteria, test_data, batch_size=64, **kwargs):
    """
    Counts, per output column, how many test entries satisfy a criterion.

    Predictions are produced batch by batch. Every sample is visited,
    including a final batch that is shorter than batch_size (it used to be
    skipped, which made the counts disagree with totals taken over all of
    y_test).

    Parameters
    ----------
    model : object with a predict(list_of_inputs, **kwargs) method
    criteria <callable> : criteria(predictions, labels) -> boolean array
    test_data <tuple> : the pair (x_test, y_test), passed positionally
                        exactly as before
    batch_size <int> : number of samples handed to the model at a time
    kwargs : forwarded to model.predict

    Returns
    -------
    A tuple with one summed count per column of the per-batch sums (an empty
    tuple when y_test has no rows).
    """
    x_test, y_test = test_data
    batch_sums = []
    for start in range(0, y_test.shape[0], batch_size):
        stop = start + batch_size
        predictions = model.predict([x_test[start:stop]], **kwargs)
        flags = criteria(predictions, y_test[start:stop])
        # NOTE(review): the last batch may hold fewer than batch_size rows;
        # assumes ut.sum_cols accepts that -- confirm.
        batch_sums.append(ut.sum_cols(flags, batch_size))
    # Transpose so each element is one column's per-batch sums.
    return tuple([c.sum() for c in np.array(batch_sums).T])


def num_of_each_cell(model, data, cutoff=0.5):
    """
    Builds a 2x2 table of counts from the test portion of a dataset.

    Parameters
    ----------
    model : passed unchanged to count_filter
    data <sequence> : exactly four items (x_train, y_train, x_test, y_test);
                      only x_test and y_test are used
    cutoff <float> : a prediction entry counts as positive when it is
                     greater than this value

    Returns
    -------
    np.array([[bb, bs], [sb, ss]]), all derived from column 1 of the counts:
        ss : entries with a label > 0 whose prediction is above cutoff
        bs : entries predicted above cutoff, minus ss
        sb : column-1 label total, minus ss
        bb : column-0 label total, minus bs
    """
    _, _, x_test, y_test = data
    # Where y == 1, `p - y > cutoff - 1` is `p > cutoff` shifted down by one.
    num_correct = count_filter(
        model,
        lambda p, y: (y > 0) == (p - y > cutoff - 1),
        (x_test, y_test))
    num_predicted = count_filter(
        model,
        lambda p, y: p > cutoff,
        (x_test, y_test))
    test_totals = ut.sum_cols(y_test)

    ss = num_correct[1]
    bs = num_predicted[1] - ss
    sb = test_totals[1] - ss
    bb = test_totals[0] - bs
    # Need to generalize for more categories
    return np.array([[bb, bs], [sb, ss]])
def _split_rows_by_class(x_data, y_data, bkg_out, sig_out, buffer):
    """Append the rows of x_data/y_data to per-class arrays, buffer rows at a time.

    A row goes to bkg_out when column 0 of its label equals 1 and to sig_out
    when column 1 equals 1. bkg_out and sig_out are (x_array, y_array) pairs
    of objects exposing append().
    """
    for start in range(0, x_data.shape[0], buffer):
        stop = start + buffer
        x_chunk = np.asarray(x_data[start:stop])
        y_chunk = np.asarray(y_data[start:stop])
        for column, (x_out, y_out) in ((0, bkg_out), (1, sig_out)):
            # A 1-D row mask on the in-memory chunk keeps whole rows, so no
            # full-size mask or reshape is needed.
            rows = y_chunk[:, column] == 1
            x_out.append(x_chunk[rows])
            y_out.append(y_chunk[rows])


def _write_ratio_subset(rat, limit, buffer, pool_sizes, bkg_src, sig_src, out_x, out_y):
    """Draw rows without replacement at ratio rat[0]:rat[1] until limit rows are written.

    pool_sizes is (number of background rows, number of signal rows);
    bkg_src and sig_src are (x_array, y_array) pairs indexable by an integer
    index array; out_x and out_y expose append().
    """
    bkg_pool = np.arange(pool_sizes[0])
    sig_pool = np.arange(pool_sizes[1])
    total = sum(rat)
    written = 0
    chunk = buffer
    while written < limit:
        # Never ask for more rows than are still missing.
        chunk = min(chunk, limit - written)
        # Indices that stay in the pool for later rounds. The ratio values are
        # floats, so the size is converted with int(): NumPy releases that
        # reject a float `size` would otherwise raise here.
        bkg_rest = np.random.choice(
            bkg_pool, int(bkg_pool.size - rat[0] * chunk / total), replace=False)
        sig_rest = np.random.choice(
            sig_pool, int(sig_pool.size - rat[1] * chunk / total), replace=False)
        # Indices used in this round.
        bkg_keep = np.setdiff1d(bkg_pool, bkg_rest)
        sig_keep = np.setdiff1d(sig_pool, sig_rest)

        part_x = np.concatenate((bkg_src[0][bkg_keep], sig_src[0][sig_keep]))
        part_y = np.concatenate((bkg_src[1][bkg_keep], sig_src[1][sig_keep]))
        tr.shuffle_in_unison(part_x, part_y)
        out_x.append(part_x)
        out_y.append(part_y)

        written += bkg_keep.size + sig_keep.size
        bkg_pool, sig_pool = bkg_rest, sig_rest


def save_ratios(dataset, ratios, buffer=1000):
    """
    Divides a dataset into subsets with given ratios of background to signal.

    The rows of the training and test sets are first separated by class into
    a temporary file (".deep_learning.temp.hdf5" in the working directory).
    For every ratio a new group named "<b>to<s>" is then created through
    add_group_hdf5 and filled with randomly drawn rows, buffer rows at a
    time. The temporary file is closed and removed when the function exits,
    also when an error occurs.

    Parameters
    ----------
    dataset <string> : "<name>/<format>"; everything after the first "/" is
                       used as the format
    ratios <string or list of strings> : each of the form "b:s" (background
                                         to signal)
    buffer <int> : the number of data points handled at a time
    """
    temp_path = ".deep_learning.temp.hdf5"
    ratios = [ratios] if isinstance(ratios, str) else ratios
    ratios = [[float(part) for part in ratio.split(':')] for ratio in ratios]
    # Split on the first "/" only, so a format may itself contain "/".
    data, fmt = dataset.split('/', 1)

    main_file, (x_train, y_train, x_test, y_test) = ds.load_dataset(data, fmt, mode='a')
    temp_h_file = None
    try:
        bkg_test, sig_test = sum_cols(y_test)
        bkg_train, sig_train = sum_cols(y_train)

        # Each subset holds 1.5 times the size of the smaller class.
        test_limit = int(1.5 * min(bkg_test, sig_test))
        train_limit = int(1.5 * min(bkg_train, sig_train))

        temp_h_file, temp_h_data = add_group_hdf5(
            temp_path, "Temp",
            [(bkg_train, x_train.shape[1]), (bkg_train, y_train.shape[1]),
             (sig_train, x_train.shape[1]), (sig_train, y_train.shape[1]),
             (bkg_test, x_test.shape[1]), (bkg_test, y_test.shape[1]),
             (sig_test, x_test.shape[1]), (sig_test, y_test.shape[1])],
            names=["train_bkg_x", "train_bkg_y", "train_sig_x", "train_sig_y",
                   "test_bkg_x", "test_bkg_y", "test_sig_x", "test_sig_y"])
        train_bkg = (temp_h_data[0], temp_h_data[1])
        train_sig = (temp_h_data[2], temp_h_data[3])
        test_bkg = (temp_h_data[4], temp_h_data[5])
        test_sig = (temp_h_data[6], temp_h_data[7])

        print("Generating temporary files...")
        # Each set is walked over its own length, so a trailing partial chunk
        # (or a test set longer than the training set) is not dropped.
        _split_rows_by_class(x_train, y_train, train_bkg, train_sig, buffer)
        _split_rows_by_class(x_test, y_test, test_bkg, test_sig, buffer)

        for rat in ratios:
            print("Creating ratio {:d}/{:d} ...".format(*map(int, rat)))
            h_file, h_data = add_group_hdf5(
                ds.get_path_to_dataset(data) + os.sep + data + ".hdf5",
                "{}to{}".format(*map(int, rat)),
                [(train_limit, x_train.shape[1]), (train_limit, y_train.shape[1]),
                 (test_limit, x_test.shape[1]), (test_limit, y_test.shape[1])],
                where='/{}'.format(fmt))
            try:
                _write_ratio_subset(rat, train_limit, buffer, (bkg_train, sig_train),
                                    train_bkg, train_sig, h_data[0], h_data[1])
                _write_ratio_subset(rat, test_limit, buffer, (bkg_test, sig_test),
                                    test_bkg, test_sig, h_data[2], h_data[3])
                print("Created Group: {}/{}to{}".format(fmt, *map(int, rat)))
                h_file.flush()
            finally:
                h_file.close()
    finally:
        try:
            main_file.close()
        finally:
            if temp_h_file is not None:
                temp_h_file.close()
                os.remove(temp_path)
def _split_rows_by_class(x_data, y_data, bkg_out, sig_out, buffer):
    """Append the rows of x_data/y_data to per-class arrays, buffer rows at a time.

    A row goes to bkg_out when column 0 of its label equals 1 and to sig_out
    when column 1 equals 1. bkg_out and sig_out are (x_array, y_array) pairs
    of objects exposing append().
    """
    for start in range(0, x_data.shape[0], buffer):
        stop = start + buffer
        x_chunk = np.asarray(x_data[start:stop])
        y_chunk = np.asarray(y_data[start:stop])
        for column, (x_out, y_out) in ((0, bkg_out), (1, sig_out)):
            # A 1-D row mask on the in-memory chunk keeps whole rows, so no
            # full-size mask or reshape is needed.
            rows = y_chunk[:, column] == 1
            x_out.append(x_chunk[rows])
            y_out.append(y_chunk[rows])


def _write_ratio_subset(rat, limit, buffer, pool_sizes, bkg_src, sig_src, out_x, out_y):
    """Draw rows without replacement at ratio rat[0]:rat[1] until limit rows are written.

    pool_sizes is (number of background rows, number of signal rows);
    bkg_src and sig_src are (x_array, y_array) pairs indexable by an integer
    index array; out_x and out_y expose append().
    """
    bkg_pool = np.arange(pool_sizes[0])
    sig_pool = np.arange(pool_sizes[1])
    total = sum(rat)
    written = 0
    chunk = buffer
    while written < limit:
        # Never ask for more rows than are still missing.
        chunk = min(chunk, limit - written)
        # Indices that stay in the pool for later rounds. The ratio values are
        # floats, so the size is converted with int(): NumPy releases that
        # reject a float `size` would otherwise raise here.
        bkg_rest = np.random.choice(
            bkg_pool, int(bkg_pool.size - rat[0] * chunk / total), replace=False)
        sig_rest = np.random.choice(
            sig_pool, int(sig_pool.size - rat[1] * chunk / total), replace=False)
        # Indices used in this round.
        bkg_keep = np.setdiff1d(bkg_pool, bkg_rest)
        sig_keep = np.setdiff1d(sig_pool, sig_rest)

        part_x = np.concatenate((bkg_src[0][bkg_keep], sig_src[0][sig_keep]))
        part_y = np.concatenate((bkg_src[1][bkg_keep], sig_src[1][sig_keep]))
        tr.shuffle_in_unison(part_x, part_y)
        out_x.append(part_x)
        out_y.append(part_y)

        written += bkg_keep.size + sig_keep.size
        bkg_pool, sig_pool = bkg_rest, sig_rest


def save_ratios(dataset, ratios, buffer=1000):
    """
    Divides a certain dataset into subsets of data with certain ratios of
    background to signal.

    The rows of the training and test sets are first separated by class into
    a temporary file (".deep_learning.temp.h5" in the working directory). For
    every ratio a new group named "<b>to<s>" is then created through
    add_group_hdf5 and filled with randomly drawn rows, buffer rows at a
    time. The temporary file is closed and removed when the function exits,
    also when an error occurs.

    Parameters
    ----------
    dataset <string> : the name of the dataset (/-separated); the part before
                       the first "/" is the dataset name, the rest the format
    ratios <string or list of strings> : each of the form "b:s", the ratio of
                                         background to signal
    buffer <int> : an integer defining the number of data points to load into
                   memory at a time.
    """
    temp_path = ".deep_learning.temp.h5"
    ratios = [ratios] if isinstance(ratios, str) else ratios
    ratios = [[float(part) for part in ratio.split(':')] for ratio in ratios]
    # Name before the first "/", format after it (may itself contain "/").
    data, _, fmt = dataset.partition('/')

    main_file, (x_train, y_train, x_test, y_test) = ds.load_dataset(data, fmt, mode='a')
    temp_h_file = None
    try:
        bkg_test, sig_test = sum_cols(y_test)
        bkg_train, sig_train = sum_cols(y_train)

        # Each subset holds 1.5 times the size of the smaller class.
        test_limit = int(1.5 * min(bkg_test, sig_test))
        train_limit = int(1.5 * min(bkg_train, sig_train))

        temp_h_file, temp_h_data = add_group_hdf5(
            temp_path, "Temp",
            [(bkg_train, x_train.shape[1]), (bkg_train, y_train.shape[1]),
             (sig_train, x_train.shape[1]), (sig_train, y_train.shape[1]),
             (bkg_test, x_test.shape[1]), (bkg_test, y_test.shape[1]),
             (sig_test, x_test.shape[1]), (sig_test, y_test.shape[1])],
            names=["train_bkg_x", "train_bkg_y", "train_sig_x", "train_sig_y",
                   "test_bkg_x", "test_bkg_y", "test_sig_x", "test_sig_y"])
        train_bkg = (temp_h_data[0], temp_h_data[1])
        train_sig = (temp_h_data[2], temp_h_data[3])
        test_bkg = (temp_h_data[4], temp_h_data[5])
        test_sig = (temp_h_data[6], temp_h_data[7])

        print("Generating temporary files...")
        # Each set is walked over its own length, so a trailing partial chunk
        # (or a test set longer than the training set) is not dropped.
        _split_rows_by_class(x_train, y_train, train_bkg, train_sig, buffer)
        _split_rows_by_class(x_test, y_test, test_bkg, test_sig, buffer)

        for rat in ratios:
            print("Creating ratio {:d}/{:d} ...".format(*map(int, rat)))
            h_file, h_data = add_group_hdf5(
                ds.get_path_to_dataset(data) + os.sep + data + ".h5",
                "{}to{}".format(*map(int, rat)),
                [(train_limit, x_train.shape[1]), (train_limit, y_train.shape[1]),
                 (test_limit, x_test.shape[1]), (test_limit, y_test.shape[1])],
                where='/{}'.format(fmt))
            try:
                _write_ratio_subset(rat, train_limit, buffer, (bkg_train, sig_train),
                                    train_bkg, train_sig, h_data[0], h_data[1])
                _write_ratio_subset(rat, test_limit, buffer, (bkg_test, sig_test),
                                    test_bkg, test_sig, h_data[2], h_data[3])
                print("Created Group: {}/{}to{}".format(fmt, *map(int, rat)))
                h_file.flush()
            finally:
                h_file.close()
    finally:
        try:
            main_file.close()
        finally:
            if temp_h_file is not None:
                temp_h_file.close()
                os.remove(temp_path)
for i in xrange(int(ceil(y_test.shape[0] / batch_size))): predictions = model.predict( [x_test[i * batch_size:(i + 1) * batch_size]], **kwargs) if type(index) is np.ndarray: if index[i * batch_size:(i + 1) * batch_size].any(): predictions = predictions[index[i * batch_size:(i + 1) * batch_size]] y_temp = y_test[i * batch_size:(i + 1) * batch_size][index[i * batch_size:(i + 1) * batch_size]] else: continue else: y_temp = y_test[i * batch_size:(i + 1) * batch_size] bArray = criteria(predictions, y_temp) rval.append(ut.sum_cols(bArray, batch_size)) return tuple([c.sum() for c in np.array(rval).T]) def num_of_each_cell(model, data): x_train, y_train, x_test, y_test = data matrix = [] for i in xrange(y_test.shape[1]): index = (y_test[:, i] > 0).flatten() num_predicted = count_filter( model, lambda p, y: np.vstack(map(lambda x: x == max(x), p)), (x_test, y_test), index=index) matrix.append(num_predicted) return np.array(matrix)