def eval_one_epoch(sess, ops, test_writer, is_full_training):
    """ ops: dict mapping from string to tf ops """
    global EPOCH_CNT
    is_training = False
    test_idxs = np.arange(0, len(TEST_FILES))
    # Test on all data: last batch might be smaller than BATCH_SIZE
    loss_sum = acc = 0
    acc_seg = 0

    for fn in range(len(TEST_FILES)):
        #log_string('----' + str(fn) + '-----')
        current_file = os.path.join(H5_DIR, TEST_FILES[test_idxs[fn]])
        if RD:
            current_data, current_cluster, current_label = provider.load_h5_data_label_seg(
                current_file)
        else:
            current_data, current_label = provider.load_h5(current_file, 'seg')
        adds = provider.load_add(current_file, ['global'])
        if NUM_GLOB < adds['global'].shape[1]:
            log_string("Using less global variables than possible")
            adds['global'] = adds['global'][:, :NUM_GLOB]

        current_label = np.squeeze(current_label)
        file_size = current_data.shape[0]
        num_batches = file_size // BATCH_SIZE

        for batch_idx in range(num_batches):
            start_idx = batch_idx * BATCH_SIZE
            end_idx = (batch_idx + 1) * BATCH_SIZE
            batch_data, batch_label, batch_global = get_batch(
                current_data, current_label, adds['global'], start_idx, end_idx)
            cur_batch_size = end_idx - start_idx

            feed_dict = {
                ops['pointclouds_pl']: batch_data,
                ops['is_training_pl']: is_training,
                ops['global_pl']: batch_global,
                ops['labels_pl']: batch_label,
                ops['alpha']: 10 * (EPOCH_CNT - MAX_PRETRAIN + 1),
            }

            if is_full_training:
                summary, step, loss_val, pred_val, max_pool, dist = sess.run(
                    [
                        ops['merged'], ops['step'], ops['kmeans_loss'],
                        ops['pred'], ops['max_pool'], ops['stack_dist'],
                        #ops['pi']
                    ],
                    feed_dict=feed_dict)

                # Assign every sample in the batch to its closest cluster centroid
                cluster_assign = np.zeros((cur_batch_size), dtype=int)
                for i in range(cur_batch_size):
                    index_closest_cluster = np.argmin(dist[:, i])
                    cluster_assign[i] = index_closest_cluster

                if RD:
                    batch_cluster = current_cluster[start_idx:end_idx]
                    if batch_cluster.size == cluster_assign.size:
                        acc += cluster_acc(batch_cluster, cluster_assign)
            else:
                summary, step, loss_val, pred_val, max_pool = sess.run(
                    [
                        ops['merged'], ops['step'], ops['classify_loss'],
                        ops['pred'], ops['max_pool'],
                    ],
                    feed_dict=feed_dict)

            test_writer.add_summary(summary, step)
            loss_sum += np.mean(loss_val)

    total_loss = loss_sum * 1.0 / float(num_batches)
    log_string('mean loss: %f' % (total_loss))
    log_string('testing clustering accuracy: %f' % (acc / float(num_batches)))

    EPOCH_CNT += 1
    if FLAGS.min == 'acc':
        # Mean clustering accuracy over the evaluated batches
        return acc / float(num_batches)
    else:
        return total_loss
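# cluster_acc() is called above but not defined in this section. It is assumed to
# compute the standard unsupervised clustering accuracy: the fraction of samples
# that are correct under the best one-to-one mapping between cluster indices and
# true labels (Hungarian algorithm). A minimal sketch under that assumption; the
# repository's own helper may differ in details:
import numpy as np
from scipy.optimize import linear_sum_assignment

def cluster_acc(y_true, y_pred):
    """Clustering accuracy under the optimal cluster-to-label assignment."""
    y_true = np.asarray(y_true).astype(int).ravel()
    y_pred = np.asarray(y_pred).astype(int).ravel()
    n_classes = max(y_pred.max(), y_true.max()) + 1
    # Contingency table: rows are predicted clusters, columns are true labels
    w = np.zeros((n_classes, n_classes), dtype=int)
    for i in range(y_pred.size):
        w[y_pred[i], y_true[i]] += 1
    # The Hungarian algorithm maximizes the matched counts (minimize -w)
    row_ind, col_ind = linear_sum_assignment(-w)
    return w[row_ind, col_ind].sum() / float(y_pred.size)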
def train_one_epoch(sess, ops, train_writer, is_full_training):
    """ ops: dict mapping from string to tf ops """
    is_training = True
    train_idxs = np.arange(0, len(TRAIN_FILES))
    acc = loss_sum = 0
    y_pool = []
    y_assign = []

    for fn in range(len(TRAIN_FILES)):
        #log_string('----' + str(fn) + '-----')
        current_file = os.path.join(H5_DIR, TRAIN_FILES[train_idxs[fn]])
        if RD:
            current_data, current_cluster, current_label = provider.load_h5_data_label_seg(
                current_file)
        else:
            current_data, current_label = provider.load_h5(current_file, 'seg')
        adds = provider.load_add(current_file, ['global'])
        if NUM_GLOB < adds['global'].shape[1]:
            log_string("Using less global variables than possible")
            adds['global'] = adds['global'][:, :NUM_GLOB]

        current_label = np.squeeze(current_label)
        file_size = current_data.shape[0]
        num_batches = file_size // BATCH_SIZE
        if FLAGS.nbatches > 0:
            num_batches = FLAGS.nbatches

        log_string(str(datetime.now()))
        for batch_idx in range(num_batches):
            start_idx = batch_idx * BATCH_SIZE
            end_idx = (batch_idx + 1) * BATCH_SIZE
            batch_data, batch_label, batch_global = get_batch(
                current_data, current_label, adds['global'], start_idx, end_idx)
            cur_batch_size = end_idx - start_idx
            #print(batch_weight)

            feed_dict = {
                ops['pointclouds_pl']: batch_data,
                ops['labels_pl']: batch_label,
                ops['global_pl']: batch_global,
                ops['is_training_pl']: is_training,
                ops['alpha']: 10 * (EPOCH_CNT - MAX_PRETRAIN + 1),
            }

            if is_full_training:
                summary, step, _, loss_val, pred_val, max_pool, dist = sess.run(
                    [
                        ops['merged'], ops['step'], ops['train_op_full'],
                        ops['kmeans_loss'], ops['pred'], ops['max_pool'],
                        ops['stack_dist']
                    ],
                    feed_dict=feed_dict)

                cluster_assign = np.zeros((cur_batch_size), dtype=int)
                for i in range(cur_batch_size):
                    index_closest_cluster = np.argmin(dist[:, i])
                    cluster_assign[i] = index_closest_cluster

                if RD:
                    batch_cluster = current_cluster[start_idx:end_idx]
                    if batch_cluster.size == cluster_assign.size:
                        acc += cluster_acc(batch_cluster, cluster_assign)
            else:
                summary, step, _, loss_val, pred_val, max_pool = sess.run(
                    [
                        ops['merged'], ops['step'], ops['train_op'],
                        ops['classify_loss'], ops['pred'], ops['max_pool']
                    ],
                    feed_dict=feed_dict)

            loss_sum += np.mean(loss_val)
            if len(y_pool) == 0:
                y_pool = np.squeeze(max_pool)
            else:
                y_pool = np.concatenate((y_pool, np.squeeze(max_pool)), axis=0)
            train_writer.add_summary(summary, step)

    log_string('mean loss: %f' % (loss_sum / float(num_batches)))
    log_string('train clustering accuracy: %f' % (acc / float(num_batches)))
    return y_pool
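# get_batch() is not shown in this section. For the three-array call used in
# train_one_epoch above it is assumed to simply slice the per-event arrays; a
# minimal sketch under that assumption (the repository's version may shuffle,
# augment or pad instead):
def get_batch(data, label, global_feats, start_idx, end_idx):
    """Return the [start_idx, end_idx) slice of point clouds, labels and global features."""
    return (data[start_idx:end_idx],
            label[start_idx:end_idx],
            global_feats[start_idx:end_idx])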
def eval_one_epoch(sess, ops):
    is_training = False
    eval_idxs = np.arange(0, len(EVALUATE_FILES))
    y_val = []

    for fn in range(len(EVALUATE_FILES)):
        current_file = os.path.join(H5_DIR, EVALUATE_FILES[eval_idxs[fn]])
        current_data, current_label, current_cluster = provider.load_h5_data_label_seg(
            current_file)
        adds = provider.load_add(current_file, ['masses'])

        current_label = np.squeeze(current_label)
        file_size = current_data.shape[0]
        num_batches = file_size // BATCH_SIZE
        num_batches = 5  # NOTE: evaluation is capped at 5 batches per file

        for batch_idx in range(num_batches):
            start_idx = batch_idx * BATCH_SIZE
            end_idx = (batch_idx + 1) * BATCH_SIZE
            batch_data, batch_label = get_batch(current_data, current_label,
                                                start_idx, end_idx)
            cur_batch_size = end_idx - start_idx

            feed_dict = {
                ops['pointclouds_pl']: batch_data,
                ops['labels_pl']: batch_label,
                ops['alpha']: 1,  # No impact on evaluation
                ops['is_training_pl']: is_training,
            }
            loss, dist, max_pool = sess.run(
                [ops['kmeans_loss'], ops['stack_dist'], ops['max_pool']],
                feed_dict=feed_dict)

            # Assign every sample in the batch to its closest cluster centroid
            cluster_assign = np.zeros((cur_batch_size), dtype=int)
            for i in range(cur_batch_size):
                index_closest_cluster = np.argmin(dist[:, i])
                cluster_assign[i] = index_closest_cluster

            # Convert the one-hot cluster labels to class indices
            batch_cluster = np.array([
                np.where(r == 1)[0][0]
                for r in current_cluster[start_idx:end_idx]
            ])

            if len(y_val) == 0:
                y_val = batch_cluster
                y_assign = cluster_assign
                y_pool = np.squeeze(max_pool)
                y_mass = adds['masses'][start_idx:end_idx]
            else:
                y_val = np.concatenate((y_val, batch_cluster), axis=0)
                y_assign = np.concatenate((y_assign, cluster_assign), axis=0)
                y_pool = np.concatenate((y_pool, np.squeeze(max_pool)), axis=0)
                y_mass = np.concatenate(
                    (y_mass, adds['masses'][start_idx:end_idx]), axis=0)

    with h5py.File(os.path.join(H5_OUT, '{0}.h5'.format(FLAGS.name)), "w") as fh5:
        dset = fh5.create_dataset("pid", data=y_val)  # Real jet categories
        dset = fh5.create_dataset("label", data=y_assign)  # Cluster labeling
        dset = fh5.create_dataset("max_pool", data=y_pool)
        dset = fh5.create_dataset("masses", data=y_mass)
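# The HDF5 file written above can be read back directly; a small usage sketch,
# with load_cluster_output as an illustrative (hypothetical) helper name and the
# dataset names taken from the writes in eval_one_epoch:
import h5py

def load_cluster_output(path):
    """Read back the per-jet arrays produced by eval_one_epoch."""
    with h5py.File(path, 'r') as fh5:
        pid = fh5['pid'][:]            # real jet categories
        label = fh5['label'][:]        # cluster assignments
        max_pool = fh5['max_pool'][:]  # max-pooled latent features
        masses = fh5['masses'][:]      # jet masses
    return pid, label, max_pool, masses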
def eval_one_epoch(sess, ops):
    is_training = False
    eval_idxs = np.arange(0, len(EVALUATE_FILES))
    y_assign = []
    y_glob = []
    acc = 0

    for fn in range(len(EVALUATE_FILES)):
        current_file = os.path.join(H5_DIR, EVALUATE_FILES[eval_idxs[fn]])
        if RD:
            current_data, current_cluster, current_label = provider.load_h5_data_label_seg(
                current_file)
        else:
            current_data, current_label = provider.load_h5(current_file, 'seg')
        adds = provider.load_add(current_file, ['global', 'masses'])
        if NUM_GLOB < adds['global'].shape[1]:
            print("Using less global variables than possible")
            current_glob = adds['global'][:, :NUM_GLOB]
        else:
            current_glob = adds['global']

        current_label = np.squeeze(current_label)
        file_size = current_data.shape[0]
        num_batches = file_size // BATCH_SIZE

        for batch_idx in range(num_batches):
            start_idx = batch_idx * BATCH_SIZE
            end_idx = (batch_idx + 1) * BATCH_SIZE
            batch_data, batch_label, batch_global = get_batch(
                current_data, current_label, current_glob, start_idx, end_idx)
            cur_batch_size = end_idx - start_idx

            feed_dict = {
                ops['pointclouds_pl']: batch_data,
                ops['global_pl']: batch_global,
                ops['labels_pl']: batch_label,
                ops['alpha']: 1,  # No impact during evaluation
                ops['is_training_pl']: is_training,
            }
            dist, mu, max_pool = sess.run(
                [ops['stack_dist'], ops['mu'], ops['max_pool']],
                feed_dict=feed_dict)

            cluster_assign = np.zeros((cur_batch_size), dtype=int)
            if RD:
                batch_cluster = current_cluster[start_idx:end_idx]

            for i in range(cur_batch_size):
                index_closest_cluster = np.argmin(dist[:, i])
                cluster_assign[i] = index_closest_cluster

            if RD:
                acc += cluster_acc(batch_cluster, cluster_assign)

            if len(y_assign) == 0:
                if RD:
                    y_val = batch_cluster
                y_assign = cluster_assign
                y_pool = np.squeeze(max_pool)
            else:
                y_assign = np.concatenate((y_assign, cluster_assign), axis=0)
                y_pool = np.concatenate((y_pool, np.squeeze(max_pool)), axis=0)
                if RD:
                    y_val = np.concatenate((y_val, batch_cluster), axis=0)

        if len(y_glob) == 0:
            y_glob = adds['global'][:num_batches * BATCH_SIZE]
            y_mass = adds['masses'][:num_batches * BATCH_SIZE]
        else:
            y_glob = np.concatenate(
                (y_glob, adds['global'][:num_batches * BATCH_SIZE]), axis=0)
            y_mass = np.concatenate(
                (y_mass, adds['masses'][:num_batches * BATCH_SIZE]), axis=0)

    with h5py.File(os.path.join(H5_OUT, '{0}.h5'.format(FLAGS.name)), "w") as fh5:
        if RD:
            dset = fh5.create_dataset("label", data=y_val)
        dset = fh5.create_dataset("pid", data=y_assign)
        dset = fh5.create_dataset("max_pool", data=y_pool)
        dset = fh5.create_dataset("global", data=y_glob)
        dset = fh5.create_dataset("masses", data=y_mass)
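# The per-sample argmin loops used in the functions above can be expressed as a
# single vectorized call over the distance stack of shape [n_clusters, batch_size].
# A small equivalent sketch (assign_clusters is an illustrative helper, not part
# of the repository):
import numpy as np

def assign_clusters(dist):
    """Index of the closest cluster centroid for every sample in the batch."""
    return np.argmin(dist, axis=0).astype(int)

# Example check against the explicit loop:
# dist = np.random.rand(4, 16)
# loop = np.array([np.argmin(dist[:, i]) for i in range(dist.shape[1])])
# assert np.array_equal(assign_clusters(dist), loop)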
def eval_one_epoch(sess, ops):
    is_training = False
    total_correct = total_sig = total_correct_ones = total_seen = total_seen_ones = loss_sum = 0
    eval_idxs = np.arange(0, len(EVALUATE_FILES))
    y_pred = []

    for fn in range(len(EVALUATE_FILES)):
        current_file = os.path.join(H5_DIR, EVALUATE_FILES[eval_idxs[fn]])
        current_data, current_label = provider.load_h5(current_file, 'seg')
        full_data = current_data
        if current_data.shape[2] > NFEATURES:
            print('puppi not used')
            current_data = current_data[:, :, :NFEATURES]
        if current_data.shape[1] > NUM_POINT:
            print('Using less points')
            current_data = current_data[:, :NUM_POINT]
            current_label = current_label[:, :NUM_POINT]

        add_list = [
            'PFNoPU',
            'puppiPU',
            'chs',
            'NPU',
            'CHS_MET',
            'PUPPI_MET',
            #'puppiNoPU',
        ]
        adds = provider.load_add(current_file, add_list)

        if not FLAGS.is_data:
            current_truth = adds['PFNoPU']
            current_truth = preprocessing(current_data, current_truth)
        else:
            add_list.append('nLeptons')
            current_truth = np.zeros((current_data.shape))

        current_label = np.squeeze(current_label)
        file_size = current_data.shape[0]
        num_batches = file_size // BATCH_SIZE
        #num_batches = 1
        # if FLAGS.is_data:
        #     num_batches = 600

        for batch_idx in range(num_batches):
            scores = np.zeros(NUM_POINT)
            true = np.zeros(NUM_POINT)
            start_idx = batch_idx * BATCH_SIZE
            end_idx = (batch_idx + 1) * BATCH_SIZE
            batch_data, batch_label, batch_truth = get_batch(
                current_data, current_label, current_truth, start_idx, end_idx)
            cur_batch_size = end_idx - start_idx

            feed_dict = {
                ops['pointclouds_pl']: batch_data,
                ops['truth_pl']: batch_truth,
                ops['labels_pl']: batch_label,
                ops['is_training_pl']: is_training,
            }
            #,beforemax
            loss, pred = sess.run([ops['loss'], ops['pred']],
                                  feed_dict=feed_dict)
            pred_val = np.argmax(pred, 2)
            # pred_val * batch_label == 4 only where both prediction and label are class 2 (signal)
            correct_ones = pred_val * batch_label
            total_sig += np.sum(batch_label == 2)
            total_correct_ones += np.sum(correct_ones == 4)
            loss_sum += np.mean(loss)

            if len(y_pred) == 0:
                y_pred = pred[:, :, 2]
                y_data = full_data[start_idx:end_idx]
                y_lab = batch_label
                y_add = {}
                for add in adds:
                    y_add[add] = adds[add][start_idx:end_idx]
            else:
                y_pred = np.concatenate((y_pred, pred[:, :, 2]), axis=0)
                y_data = np.concatenate((y_data, full_data[start_idx:end_idx]),
                                        axis=0)
                y_lab = np.concatenate((y_lab, batch_label), axis=0)
                for add in adds:
                    y_add[add] = np.concatenate(
                        (y_add[add], adds[add][start_idx:end_idx]), axis=0)

    if not FLAGS.is_data:
        print('The signal accuracy is {0}'.format(total_correct_ones /
                                                  float(total_sig)))

    flat_pred = y_pred.flatten()
    flat_lab = y_lab.flatten()
    flat_lab = flat_lab == 2
    # metrics.roc_curve returns (fpr, tpr, thresholds)
    results = metrics.roc_curve(flat_lab, flat_pred)
    threshs = [0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.90, 0.95]
    with open(os.path.join(MODEL_PATH, 'cut_eff.txt'), 'w') as f:
        for thresh in threshs:
            # First working point where the signal efficiency exceeds the target
            idx = np.argmax(results[1] > thresh)
            cut = results[2][idx]
            f.write('eff: {}, fpr: {}, cut: {} \n'.format(
                results[1][idx], results[0][idx], cut))

    with h5py.File(os.path.join(H5_OUT, '{0}.h5'.format(FLAGS.name)), "w") as fh5:
        dset = fh5.create_dataset("DNN", data=y_pred)
        dset = fh5.create_dataset("data", data=y_data)
        dset = fh5.create_dataset("pid", data=y_lab)
        for add in adds:
            dset = fh5.create_dataset(add, data=y_add[add])
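# metrics.roc_curve returns (fpr, tpr, thresholds), so results[1] above is the
# signal efficiency and results[0] the false-positive rate at each score cut. A
# self-contained sketch of the working-point extraction written to cut_eff.txt
# (working_point is an illustrative helper, not part of the repository):
import numpy as np
from sklearn import metrics

def working_point(labels, scores, target_eff):
    """Efficiency, false-positive rate and score cut at the first point passing target_eff."""
    fpr, tpr, thresholds = metrics.roc_curve(labels, scores)
    idx = np.argmax(tpr > target_eff)  # first threshold with tpr above the target
    return tpr[idx], fpr[idx], thresholds[idx]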